etrotta commited on
Commit
4984617
·
1 Parent(s): e91df43

Add IO Plugin example

Browse files
Files changed (1) hide show
  1. polars/03_loading_data.py +113 -4
polars/03_loading_data.py CHANGED
@@ -13,7 +13,7 @@
13
 
14
  import marimo
15
 
16
- __generated_with = "0.13.15"
17
  app = marimo.App(width="medium")
18
 
19
 
@@ -244,15 +244,95 @@ def _(mo):
244
 
245
  You can also write [IO Plugins](https://docs.pola.rs/user-guide/plugins/io_plugins/) for Polars in order to support any format you need.
246
 
247
- TODO UPDATE THIS SECTION
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
248
 
249
- - Consider whenever we want to include a full example
250
- - Link an example of a real production-grade plugin
251
  """
252
  )
253
  return
254
 
255
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
256
  @app.cell(hide_code=True)
257
  def _(mo):
258
  mo.md(
@@ -389,6 +469,15 @@ def _():
389
  return (folder,)
390
 
391
 
 
 
 
 
 
 
 
 
 
392
  @app.cell
393
  def _():
394
  import polars as pl
@@ -396,5 +485,25 @@ def _():
396
  return pd, pl
397
 
398
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
399
  if __name__ == "__main__":
400
  app.run()
 
13
 
14
  import marimo
15
 
16
+ __generated_with = "0.14.7"
17
  app = marimo.App(width="medium")
18
 
19
 
 
244
 
245
  You can also write [IO Plugins](https://docs.pola.rs/user-guide/plugins/io_plugins/) for Polars in order to support any format you need.
246
 
247
+ Efficiently parsing the filter expressions is out of the scope for this notebook, but the simplest form of plugins are essentially generators that yield DataFrames. Even just this can help in many cases as it allows for polars to optimize the query and request data in batches as opposed to always loading everything in memory.
248
+ """
249
+ )
250
+ return
251
+
252
+
253
+ @app.cell(hide_code=True)
254
+ def _(mo):
255
+ mo.md(
256
+ r"""
257
+ Below is a example plugin which just takes the product between multiple iterables, some highlights are that:
258
+
259
+ - You must use `register_io_source` for polars to create the LazyFrame which will consume the Generator
260
+ - You are expected to provide a Schema before the Generator starts
261
+ - - For many use cases the Plugin may be able to infer it, but you could also pass it explicitly to the plugin function
262
+ - Ideally you should parse some of the filters and column selectors to avoid unnecessary work, but it is possible to delegate that to polars after loading the data in order to keep it simpler (at the cost of efficiency)
263
 
 
 
264
  """
265
  )
266
  return
267
 
268
 
269
+ @app.cell
270
+ def _(my_custom_input_plugin):
271
+ my_custom_input_plugin(int, range(3), range(5))
272
+ return
273
+
274
+
275
+ @app.cell
276
+ def _(my_custom_input_plugin, pl):
277
+ my_custom_input_plugin(bool, [True, False], [True, False]).with_columns(
278
+ (pl.col("A") & pl.col("B")).alias("AND"),
279
+ (pl.col("A") & pl.col("B")).not_().alias("NAND"),
280
+ (pl.col("A") | pl.col("B")).alias("OR"),
281
+ (pl.col("A") ^ pl.col("B")).alias("XOR"),
282
+ ).collect()
283
+ return
284
+
285
+
286
+ @app.cell
287
+ def _(Iterator, get_positional_names, itertools, pl, register_io_source):
288
+ def my_custom_input_plugin(dtype, *iterables) -> pl.LazyFrame:
289
+ schema = pl.Schema({key: dtype for key in get_positional_names(len(iterables))})
290
+
291
+ def source_generator(
292
+ with_columns: list[str] | None,
293
+ predicate: pl.Expr | None,
294
+ n_rows: int | None,
295
+ batch_size: int | None,
296
+ ) -> Iterator[pl.DataFrame]:
297
+ """
298
+ Generator function that creates the source.
299
+ This function will be registered as IO source.
300
+ """
301
+ if batch_size is None:
302
+ batch_size = 100
303
+ if n_rows is not None:
304
+ batch_size = min(batch_size, n_rows)
305
+
306
+ generator = itertools.product(*iterables)
307
+ while n_rows is None or n_rows > 0:
308
+ rows = []
309
+ try:
310
+ while len(rows) < batch_size:
311
+ rows.append(next(generator))
312
+ except StopIteration:
313
+ n_rows = -1
314
+
315
+ df = pl.from_records(rows, schema=schema, orient="row")
316
+ if n_rows is not None:
317
+ n_rows -= df.height
318
+ batch_size = min(batch_size, n_rows)
319
+
320
+ # If we would make a performant reader, we would not read these
321
+ # columns at all.
322
+ if with_columns is not None:
323
+ df = df.select(with_columns)
324
+
325
+ # If the source supports predicate pushdown, the expression can be parsed
326
+ # to skip rows/groups.
327
+ if predicate is not None:
328
+ df = df.filter(predicate)
329
+
330
+ yield df
331
+
332
+ return register_io_source(io_source=source_generator, schema=schema)
333
+ return (my_custom_input_plugin,)
334
+
335
+
336
  @app.cell(hide_code=True)
337
  def _(mo):
338
  mo.md(
 
469
  return (folder,)
470
 
471
 
472
+ @app.cell
473
+ def _():
474
+ import math
475
+ import string
476
+ import itertools
477
+ from typing import Iterator
478
+ return Iterator, itertools, string
479
+
480
+
481
  @app.cell
482
  def _():
483
  import polars as pl
 
485
  return pd, pl
486
 
487
 
488
+ @app.cell
489
+ def _():
490
+ from polars.io.plugins import register_io_source
491
+ return (register_io_source,)
492
+
493
+
494
+ @app.cell
495
+ def _(itertools, string):
496
+ def get_positional_names(count: int) -> list[str]:
497
+ out = []
498
+ size = 0
499
+ while True:
500
+ size += 1 # number of characters in each column name
501
+ for column in itertools.product(*itertools.repeat(string.ascii_uppercase, size)):
502
+ if len(out) >= count:
503
+ return out
504
+ out.append("".join(column))
505
+ return (get_positional_names,)
506
+
507
+
508
  if __name__ == "__main__":
509
  app.run()