-
Notifications
You must be signed in to change notification settings - Fork 36
Expand file tree
/
Copy pathpybind.hpp
More file actions
343 lines (302 loc) · 12.8 KB
/
Copy pathpybind.hpp
File metadata and controls
343 lines (302 loc) · 12.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
#pragma once
#include <utility> // `std::pair`
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <pybind11/numpy.h>
#include <arrow/python/pyarrow.h>
#include "ustore/ustore.hpp"
namespace unum::ustore {
namespace py = pybind11;
// Forward declarations of the Python-facing wrapper types.
// `py_db_t` and `py_transaction_t` are defined further down in this header;
// `py_collection_t` is only declared here (no definition in the visible part of this file).
struct py_db_t;
struct py_transaction_t;
struct py_collection_t;
/// Graph flavors used to parameterize `py_graph_gt` and `wrap_networkx`.
/// The numeric values are assigned explicitly and must stay stable.
enum graph_type_t {
    graph_k = 0,
    digraph_k = 1,
    multigraph_k = 2,
    multidigraph_k = 3,
};
// Declaration of the graph wrapper template, parameterized by graph flavor;
// its definition follows later in this header.
template <graph_type_t>
struct py_graph_gt;
// Defined later in this header.
struct py_table_collection_t;
// Declared only; no definition in the visible part of this header.
struct py_task_ctx_t;
/**
 * @brief Wrapper for `ustore::database_t`.
 * Assumes that the Python client won't use more than one
 * concurrent session, as multithreading in Python is
 * prohibitively expensive.
 * We need to preserve the `config`, to allow re-opening.
 */
struct py_db_t : public std::enable_shared_from_this<py_db_t> {
    /// The native database object owned by this wrapper.
    database_t native;
    /// Configuration passed at construction; kept to allow re-opening.
    std::string config;

    /**
     * @brief Some clients may prefer to receive extracted values
     * as native Python types when possible. By default, we export
     * into Apache Arrow arrays.
     */
    bool export_into_arrow {true};

    /// Takes over an existing native database and stores a copy of its configuration.
    py_db_t(database_t&& n, std::string const& c) : native(std::move(n)), config(c) {}

    /// Move-constructs the wrapper, carrying over every field. This includes
    /// `export_into_arrow`, so a moved wrapper keeps the export preference
    /// of its source instead of silently reverting to the default.
    py_db_t(py_db_t&& other) noexcept
        : native(std::move(other.native)), config(std::move(other.config)),
          export_into_arrow(other.export_into_arrow) {}

    py_db_t(py_db_t const&) = delete;
};
/**
 * @brief Wrapper for `ustore::transaction_t`.
 * Only adds reference counting to the native C++ interface.
 */
struct py_transaction_t : public std::enable_shared_from_this<py_transaction_t> {
    /// The native transaction object owned by this wrapper.
    transaction_t native;
    /// Shared ownership of the database wrapper passed at construction.
    std::shared_ptr<py_db_t> py_db_ptr;
    /// When set, `py_collection_gt::options()` adds `ustore_option_transaction_dont_watch_k`.
    bool dont_watch {false};
    /// When set, `py_collection_gt::options()` adds `ustore_option_write_flush_k`.
    bool flush_writes {false};

    /// Takes over the native transaction. The database pointer is received by
    /// value and moved into place, so no extra reference-count update happens.
    py_transaction_t(transaction_t&& t, std::shared_ptr<py_db_t> py_db_ptr) noexcept
        : native(std::move(t)), py_db_ptr(std::move(py_db_ptr)) {}

    /// Move-constructs the wrapper. Ownership of the database reference is
    /// transferred, so the moved-from object no longer holds the database.
    py_transaction_t(py_transaction_t&& other) noexcept
        : native(std::move(other.native)), py_db_ptr(std::move(other.py_db_ptr)), dont_watch(other.dont_watch),
          flush_writes(other.flush_writes) {}

    py_transaction_t(py_transaction_t const&) = delete;
};
/**
 * @brief Wrapper around a native collection handle of type `collection_at`,
 * such as `ustore::blobs_collection_t` or `ustore::docs_collection_t`.
 * The `name` is retained so the collection can be upserted again after
 * `clear` removes it. The transaction pointer is retained to persist
 * the context of operation.
 */
template <typename collection_at>
struct py_collection_gt {
    collection_at native;
    std::shared_ptr<py_db_t> py_db_ptr;
    std::shared_ptr<py_transaction_t> py_txn_ptr;
    std::string name;
    bool in_txn {false};

    /// Forwards to `native.member_ptr()`.
    ustore_collection_t* member_collection() noexcept { return native.member_ptr(); }

    /// Forwards to `native.member_arena()`.
    ustore_arena_t* member_arena() noexcept { return native.member_arena(); }

    /**
     * @brief Options to pass along with operations on this collection.
     * Without a transaction pointer these are the defaults; otherwise the
     * transaction's `dont_watch` and `flush_writes` flags are OR-ed in.
     */
    ustore_options_t options() noexcept {
        auto const defaults = ustore_options_default_k;
        if (!py_txn_ptr)
            return defaults;

        auto const watch_bits = py_txn_ptr->dont_watch ? ustore_option_transaction_dont_watch_k : defaults;
        auto const flush_bits = py_txn_ptr->flush_writes ? ustore_option_write_flush_k : defaults;
        return static_cast<ustore_options_t>(defaults | watch_bits | flush_bits);
    }

    /// Forwards to `native.db()`.
    ustore_database_t db() noexcept(false) { return native.db(); }

    /**
     * @brief Native transaction handle when `in_txn` is set, a null handle otherwise.
     * `py_txn_ptr` is dereferenced without a check, so it must be non-null
     * whenever `in_txn` is set.
     */
    ustore_transaction_t txn() noexcept(false) {
        if (in_txn)
            return ustore_transaction_t(py_txn_ptr->native);
        return ustore_transaction_t(nullptr);
    }

    /**
     * @brief Some clients may prefer to receive extracted values
     * as native Python types when possible. By default, we export
     * into Apache Arrow arrays. Reads the flag from the database
     * wrapper, so `py_db_ptr` must be non-null.
     */
    bool export_into_arrow() const noexcept { return py_db_ptr->export_into_arrow; }
};
/// `py_collection_gt` instantiated for the native `blobs_collection_t`.
using py_blobs_collection_t = py_collection_gt<blobs_collection_t>;
/// `py_collection_gt` instantiated for the native `docs_collection_t`.
using py_docs_collection_t = py_collection_gt<docs_collection_t>;
/**
 * @brief A `Py_buffer` descriptor bundled with inline storage for its
 * `shape` and `strides` arrays (4 entries each).
 *
 * NOTE(review): nothing in this struct initializes the fields or points
 * `raw.shape` / `raw.strides` at the arrays below; presumably the code
 * that fills the buffer does that - confirm at the use sites.
 */
struct py_buffer_memory_t {
Py_buffer raw;
/// The memory that `raw.shape` points to.
Py_ssize_t shape[4];
/// The memory that `raw.strides` points to.
Py_ssize_t strides[4];
};
/**
 * @brief State behind a Python graph object of flavor `type_ak`.
 * Holds shared ownership of the database and transaction wrappers, one
 * binary collection (`index`) and two document collections for attributes.
 * Instances are neither copyable nor movable.
 */
template <graph_type_t type_ak>
struct py_graph_gt : public std::enable_shared_from_this<py_graph_gt<type_ak>> {
    std::shared_ptr<py_db_t> py_db_ptr;
    std::shared_ptr<py_transaction_t> py_txn_ptr;
    blobs_collection_t index;
    docs_collection_t vertices_attrs;
    docs_collection_t relations_attrs;
    /// Next value handed out by `get_key()`.
    ustore_key_t key_ = 0;
    bool in_txn {false};
    /// Runtime copy of the compile-time graph flavor.
    graph_type_t type {type_ak};
    /// Value-initialized so the `Py_buffer` and its shape/stride storage are
    /// never indeterminate before the first time they are filled.
    py_buffer_memory_t last_buffer {};

    py_graph_gt() {}
    py_graph_gt(py_graph_gt&&) = delete;
    py_graph_gt(py_graph_gt const&) = delete;
    ~py_graph_gt() {}

    /// Builds a `graph_collection_t` from the database, transaction,
    /// snapshot and arena of `index`.
    graph_collection_t ref() {
        return graph_collection_t(index.db(), index, index.txn(), index.snap(), index.member_arena());
    }

    /// Returns the current counter value and then increments it.
    ustore_key_t get_key() { return key_++; }
};
/// Pair of key bounds used to select table rows.
/// By default it spans the whole representable key domain.
struct py_table_keys_range_t {
    ustore_key_t min = std::numeric_limits<ustore_key_t>::min();
    ustore_key_t max = std::numeric_limits<ustore_key_t>::max();
};
/**
 * @brief DataFrame representation, capable of viewing joined contents
 * of multiple collections. When materialized, exports Apache Arrow objects.
 *
 * Each selector below is a variant whose `std::monostate` alternative
 * means that nothing has been specified for it yet.
 */
struct py_table_collection_t : public std::enable_shared_from_this<py_table_collection_t> {
    blobs_collection_t binary;

    /// Column names: unspecified, or an explicit list.
    std::variant<std::monostate, std::vector<ustore_str_view_t>> columns_names;
    /// Column types: unspecified, one type for all columns, or one type per column.
    std::variant<std::monostate, ustore_doc_field_type_t, std::vector<ustore_doc_field_type_t>> columns_types;
    /// Row selection: unspecified, a key range, or an explicit list of keys.
    std::variant<std::monostate, py_table_keys_range_t, std::vector<ustore_key_t>> rows_keys;

    /// Row limits; the maximum `std::size_t` value is the initial state.
    std::size_t head = std::numeric_limits<std::size_t>::max();
    std::size_t tail = std::numeric_limits<std::size_t>::max();
    /// NOTE(review): presumably records whether `head` or `tail` was requested most recently - confirm.
    bool head_was_defined_last = true;

    py_table_collection_t() = default;
    py_table_collection_t(py_table_collection_t&&) = delete;
    py_table_collection_t(py_table_collection_t const&) = delete;

    // Compatibility with Arrow Tables (not enabled):
    //
    // std::shared_ptr<ar::ChunkedArray> column(int i) const override;
    // std::vector<std::shared_ptr<ar::ChunkedArray>> const& columns() const override;
    // std::shared_ptr<ar::Table> Slice(int64_t offset, int64_t length) const override;
    // ar::Result<std::shared_ptr<ar::Table>> RemoveColumn(int i) const override;
    // ar::Result<std::shared_ptr<ar::Table>> AddColumn( //
    //     int i,
    //     std::shared_ptr<ar::Field> field_arg,
    //     std::shared_ptr<ar::ChunkedArray> column) const override;
    // ar::Result<std::shared_ptr<ar::Table>> SetColumn( //
    //     int i,
    //     std::shared_ptr<ar::Field> field_arg,
    //     std::shared_ptr<ar::ChunkedArray> column) const override;
    // std::shared_ptr<ar::Table> ReplaceSchemaMetadata(
    //     std::shared_ptr<ar::KeyValueMetadata const> const&) const override;
    // ar::Result<std::shared_ptr<ar::Table>> Flatten(ar::MemoryPool* = ar::default_memory_pool()) const override;
    // ar::Status Validate() const override;
    // ar::Status ValidateFull() const override;
};
/**
 * @brief A native stream of type `native_at` bundled with a terminal key
 * and a stop flag.
 *
 * NOTE(review): the comment previously attached here described a proxy for
 * binary collections (serialization of Python objects, field-level lookups,
 * `.patch(...)` / `.merge(...)`, DataFrame exports). That does not match
 * this struct and looks like a leftover from a removed type.
 * Presumably iteration ends once `terminal` is reached or `stop` is set -
 * confirm against the code that consumes the stream.
 */
template <typename native_at>
struct py_stream_with_ending_gt {
    native_at native;
    ustore_key_t terminal = ustore_key_unknown_k;
    bool stop = false;
};
/**
 * @brief Binds DBMS to Python, as if it was `dict[str, dict[int, bytes]]`.
 *
 * ## Interface
 *
 * DataBase Methods:
 *      - main ~ Accesses the default collection
 *      - __getitem__(collection: str) ~ Accesses a named collection
 *      - clear() ~ Clears all the data from DB
 *      - transact() ~ Starts a new transaction (supports context managers)
 *
 * Collection Methods:
 *      - `key in collection` (Python's `__contains__`), has_key(...) ~ Single & Batch Contains
 *      - __getitem__(key: int), get(...) ~ Value Lookup
 *      - __setitem__(key: int, value), set(...) ~ Value Upserts
 *      - __delitem__(key), pop(...) ~ Removes a key
 * All those CRUD operations can be submitted in batches in the form of
 * Python `tuple`s, `list`s, NumPy arrays, or anything that supports the buffer
 * protocol. Remaining collection methods include:
 *      - update(mapping: dict) ~ Batch Insert/Put
 *      - clear() ~ Removes all items in collection
 *      - get_column(keys) ~ Will extract/receive binary values as Apache Arrow collections
 *      - get_matrix(keys, max_length: int, padding: byte)
 *
 * All in all, collections mimic the Python @c dict API, but some functions were skipped:
 *      - __len__() ~ It's hard to consistently estimate the collection size.
 *      - popitem() ~ We can't guarantee Last-In First-Out semantics.
 *      - setdefault(key[, default]) ~ As default values are useless in DBs.
 * To access typed collections, the following computable properties are provided:
 *      - docs ~ Unpacks objects into `dict`/`list`s and supports field-level ops
 *      - table ~ Accesses Docs in a Pandas-like fashion
 *      - graph ~ Accesses relations/links in NetworkX fashion
 *      - media ~ Unpacks and converts to Tensors on lookups
 *
 * ## Python classes vs Arrow Arrays
 *
 * Both kinds of arguments/results are supported with these bindings.
 * By default, we export native Python objects in single-entry operations,
 * but for batches - we use Arrow. Namely, in Batch-Reads and Range-Selects.
 *
 * https://python-reference.readthedocs.io/en/latest/docs/dict/
 * https://docs.python.org/3/library/stdtypes.html#mapping-types-dict
 */
void wrap_database(py::module&);
/**
 * @brief Python bindings for a Graph index, that mimics NetworkX.
 * Unlike C++ @c graph_collection_t this may include as many as 4 collections
 * seen as one heavily attributed relational index.
 * Is similar in its purpose to a pure-Python project - NetworkXum:
 * https://github.com/unum-cloud/NetworkXum
 *
 * The template parameter selects the graph flavor; the string argument is
 * presumably the name under which the class is registered - confirm in the
 * definition.
 *
 * ## Supported Graph Types
 *
 * We support all the NetworkX graph kinds and more:
 * https://networkx.org/documentation/stable/reference/classes/index.html#which-graph-class-should-i-use
 *
 *      | Class          | Type       | Self-loops | Parallel edges |
 *      | Graph          | undirected | Yes        | No             |
 *      | DiGraph        | directed   | Yes        | No             |
 *      | MultiGraph     | undirected | Yes        | Yes            |
 *      | MultiDiGraph   | directed   | Yes        | Yes            |
 *
 * Aside from those, you can instantiate the most generic `ustore.Network`,
 * controlling whether graph should be directed, allow loops, or have
 * attrs in source/target vertices or edges.
 * Beyond that, source and target vertices can belong to different collections.
 * To sum up, we differentiate the following graph types:
 *      - U: Undirected
 *      - D: Directed
 *      - J: Joining
 *
 * Example for simple non-attributed undirected graphs:
 *      - relations_name: ".graph"
 *      - attrs_name: ""
 *      - sources_name: ""
 *      - targets_name: ""
 *
 * Example for recommender systems:
 *      - relations_name: "views.graph"
 *      - attrs_name: "views.docs"
 *      - sources_name: "people.docs"
 *      - targets_name: "movies.docs"
 *
 * ## Interface
 *
 * Primary single element methods:
 *      - add_edge(first, second, key?, attrs?)
 *      - remove_edge(first, second, key?, attrs?)
 * Additional batch methods:
 *      - add_edges_from(firsts, seconds, keys?, attrs?)
 *      - remove_edges_from(firsts, seconds, keys?, attrs?)
 * Intentionally not implemented:
 *      - __len__() ~ It's hard to consistently estimate the collection size.
 *
 * TODO:
 *      - Implement basic algorithms: PageRank, Louvain, WCC and Force-based Layout
 *      - Implement subgraph selection
 *      - Implement attributes
 */
template <graph_type_t>
void wrap_networkx(py::module&, std::string const&);
/**
 * @brief Python bindings for a Document Store, that mimics Pandas.
 * Is designed to export results in the form of Apache Arrow Tables.
 *
 * ## Usage
 *
 * - Take first 5 rows starting with ID #100:
 *      db.main.table.astype('int32').loc[100:].head(5).df
 *   Note that, contrary to usual Python slices, both the start and the stop are included.
 * - Take rows with IDs #100, #101:
 *      db.main.table.loc[[100, 101]].astype('float').df
 * - Take specific columns from a range of rows:
 *      db.main.table.loc[100:101].astype({'age':'float', 'name':'str'}).df
 *
 * ## Interface
 *
 * Choosing a subsample of rows:
 *      - tbl.loc[100:] ~ Starting from a certain ID
 *      - tbl.loc[[...]] ~ Specific list of IDs
 *      - tbl.head(5) ~ First rows of the table
 *      - tbl.tail(5) ~ Last rows of the table
 * Defining columns:
 *      - tbl.astype('int32') ~ All columns
 *      - tbl[names].astype('int32') ~ Specific columns
 *      - tbl.astype({'age':'float', 'name':'str'})
 *
 * In the worst-case scenario, the lookup will contain 3 steps:
 *      1. iteration, to collect the IDs of documents forming a range.
 *      2. gist, to detect the names of fields in present documents.
 *      3. gather, to export into a table.
 *
 * https://stackoverflow.com/a/57907044/2766161
 * https://arrow.apache.org/docs/python/integration/extending.html
 */
void wrap_pandas(py::module&);
/**
 * @brief Registers document-related bindings on the given Python module.
 * NOTE(review): this declaration had no documentation. Presumably it exposes
 * the `docs` view mentioned in the `wrap_database` notes - confirm against
 * the definition.
 */
void wrap_document(py::module&);
} // namespace unum::ustore