Spaces:
Sleeping
Sleeping
Add IO Plugin example
Browse files- polars/03_loading_data.py +113 -4
polars/03_loading_data.py
CHANGED
|
@@ -13,7 +13,7 @@
|
|
| 13 |
|
| 14 |
import marimo
|
| 15 |
|
| 16 |
-
__generated_with = "0.
|
| 17 |
app = marimo.App(width="medium")
|
| 18 |
|
| 19 |
|
|
@@ -244,15 +244,95 @@ def _(mo):
|
|
| 244 |
|
| 245 |
You can also write [IO Plugins](https://docs.pola.rs/user-guide/plugins/io_plugins/) for Polars in order to support any format you need.
|
| 246 |
|
| 247 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 248 |
|
| 249 |
-
- Consider whenever we want to include a full example
|
| 250 |
-
- Link an example of a real production-grade plugin
|
| 251 |
"""
|
| 252 |
)
|
| 253 |
return
|
| 254 |
|
| 255 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 256 |
@app.cell(hide_code=True)
|
| 257 |
def _(mo):
|
| 258 |
mo.md(
|
|
@@ -389,6 +469,15 @@ def _():
|
|
| 389 |
return (folder,)
|
| 390 |
|
| 391 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 392 |
@app.cell
|
| 393 |
def _():
|
| 394 |
import polars as pl
|
|
@@ -396,5 +485,25 @@ def _():
|
|
| 396 |
return pd, pl
|
| 397 |
|
| 398 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 399 |
if __name__ == "__main__":
|
| 400 |
app.run()
|
|
|
|
| 13 |
|
| 14 |
import marimo
|
| 15 |
|
| 16 |
+
__generated_with = "0.14.7"
|
| 17 |
app = marimo.App(width="medium")
|
| 18 |
|
| 19 |
|
|
|
|
| 244 |
|
| 245 |
You can also write [IO Plugins](https://docs.pola.rs/user-guide/plugins/io_plugins/) for Polars in order to support any format you need.
|
| 246 |
|
| 247 |
+
Efficiently parsing the filter expressions is out of the scope for this notebook, but the simplest form of plugins are essentially generators that yield DataFrames. Even just this can help in many cases as it allows for polars to optimize the query and request data in batches as opposed to always loading everything in memory.
|
| 248 |
+
"""
|
| 249 |
+
)
|
| 250 |
+
return
|
| 251 |
+
|
| 252 |
+
|
| 253 |
+
@app.cell(hide_code=True)
|
| 254 |
+
def _(mo):
|
| 255 |
+
mo.md(
|
| 256 |
+
r"""
|
| 257 |
+
Below is a example plugin which just takes the product between multiple iterables, some highlights are that:
|
| 258 |
+
|
| 259 |
+
- You must use `register_io_source` for polars to create the LazyFrame which will consume the Generator
|
| 260 |
+
- You are expected to provide a Schema before the Generator starts
|
| 261 |
+
- - For many use cases the Plugin may be able to infer it, but you could also pass it explicitly to the plugin function
|
| 262 |
+
- Ideally you should parse some of the filters and column selectors to avoid unnecessary work, but it is possible to delegate that to polars after loading the data in order to keep it simpler (at the cost of efficiency)
|
| 263 |
|
|
|
|
|
|
|
| 264 |
"""
|
| 265 |
)
|
| 266 |
return
|
| 267 |
|
| 268 |
|
| 269 |
+
@app.cell
|
| 270 |
+
def _(my_custom_input_plugin):
|
| 271 |
+
my_custom_input_plugin(int, range(3), range(5))
|
| 272 |
+
return
|
| 273 |
+
|
| 274 |
+
|
| 275 |
+
@app.cell
|
| 276 |
+
def _(my_custom_input_plugin, pl):
|
| 277 |
+
my_custom_input_plugin(bool, [True, False], [True, False]).with_columns(
|
| 278 |
+
(pl.col("A") & pl.col("B")).alias("AND"),
|
| 279 |
+
(pl.col("A") & pl.col("B")).not_().alias("NAND"),
|
| 280 |
+
(pl.col("A") | pl.col("B")).alias("OR"),
|
| 281 |
+
(pl.col("A") ^ pl.col("B")).alias("XOR"),
|
| 282 |
+
).collect()
|
| 283 |
+
return
|
| 284 |
+
|
| 285 |
+
|
| 286 |
+
@app.cell
|
| 287 |
+
def _(Iterator, get_positional_names, itertools, pl, register_io_source):
|
| 288 |
+
def my_custom_input_plugin(dtype, *iterables) -> pl.LazyFrame:
|
| 289 |
+
schema = pl.Schema({key: dtype for key in get_positional_names(len(iterables))})
|
| 290 |
+
|
| 291 |
+
def source_generator(
|
| 292 |
+
with_columns: list[str] | None,
|
| 293 |
+
predicate: pl.Expr | None,
|
| 294 |
+
n_rows: int | None,
|
| 295 |
+
batch_size: int | None,
|
| 296 |
+
) -> Iterator[pl.DataFrame]:
|
| 297 |
+
"""
|
| 298 |
+
Generator function that creates the source.
|
| 299 |
+
This function will be registered as IO source.
|
| 300 |
+
"""
|
| 301 |
+
if batch_size is None:
|
| 302 |
+
batch_size = 100
|
| 303 |
+
if n_rows is not None:
|
| 304 |
+
batch_size = min(batch_size, n_rows)
|
| 305 |
+
|
| 306 |
+
generator = itertools.product(*iterables)
|
| 307 |
+
while n_rows is None or n_rows > 0:
|
| 308 |
+
rows = []
|
| 309 |
+
try:
|
| 310 |
+
while len(rows) < batch_size:
|
| 311 |
+
rows.append(next(generator))
|
| 312 |
+
except StopIteration:
|
| 313 |
+
n_rows = -1
|
| 314 |
+
|
| 315 |
+
df = pl.from_records(rows, schema=schema, orient="row")
|
| 316 |
+
if n_rows is not None:
|
| 317 |
+
n_rows -= df.height
|
| 318 |
+
batch_size = min(batch_size, n_rows)
|
| 319 |
+
|
| 320 |
+
# If we would make a performant reader, we would not read these
|
| 321 |
+
# columns at all.
|
| 322 |
+
if with_columns is not None:
|
| 323 |
+
df = df.select(with_columns)
|
| 324 |
+
|
| 325 |
+
# If the source supports predicate pushdown, the expression can be parsed
|
| 326 |
+
# to skip rows/groups.
|
| 327 |
+
if predicate is not None:
|
| 328 |
+
df = df.filter(predicate)
|
| 329 |
+
|
| 330 |
+
yield df
|
| 331 |
+
|
| 332 |
+
return register_io_source(io_source=source_generator, schema=schema)
|
| 333 |
+
return (my_custom_input_plugin,)
|
| 334 |
+
|
| 335 |
+
|
| 336 |
@app.cell(hide_code=True)
|
| 337 |
def _(mo):
|
| 338 |
mo.md(
|
|
|
|
| 469 |
return (folder,)
|
| 470 |
|
| 471 |
|
| 472 |
+
@app.cell
|
| 473 |
+
def _():
|
| 474 |
+
import math
|
| 475 |
+
import string
|
| 476 |
+
import itertools
|
| 477 |
+
from typing import Iterator
|
| 478 |
+
return Iterator, itertools, string
|
| 479 |
+
|
| 480 |
+
|
| 481 |
@app.cell
|
| 482 |
def _():
|
| 483 |
import polars as pl
|
|
|
|
| 485 |
return pd, pl
|
| 486 |
|
| 487 |
|
| 488 |
+
@app.cell
|
| 489 |
+
def _():
|
| 490 |
+
from polars.io.plugins import register_io_source
|
| 491 |
+
return (register_io_source,)
|
| 492 |
+
|
| 493 |
+
|
| 494 |
+
@app.cell
|
| 495 |
+
def _(itertools, string):
|
| 496 |
+
def get_positional_names(count: int) -> list[str]:
|
| 497 |
+
out = []
|
| 498 |
+
size = 0
|
| 499 |
+
while True:
|
| 500 |
+
size += 1 # number of characters in each column name
|
| 501 |
+
for column in itertools.product(*itertools.repeat(string.ascii_uppercase, size)):
|
| 502 |
+
if len(out) >= count:
|
| 503 |
+
return out
|
| 504 |
+
out.append("".join(column))
|
| 505 |
+
return (get_positional_names,)
|
| 506 |
+
|
| 507 |
+
|
| 508 |
if __name__ == "__main__":
|
| 509 |
app.run()
|