etrotta commited on
Commit
97f8c36
·
1 Parent(s): 7bcccc1

Update DuckDB polars support

Browse files
Files changed (1) hide show
  1. polars/03_loading_data.py +35 -18
polars/03_loading_data.py CHANGED
@@ -1,20 +1,20 @@
1
  # /// script
2
  # requires-python = ">=3.12"
3
  # dependencies = [
4
- # "adbc-driver-sqlite==1.6.0",
5
- # "duckdb==1.3.1",
6
  # "lxml==6.0.0",
7
  # "marimo",
8
- # "pandas==2.3.0",
9
- # "polars==1.31.0",
10
- # "pyarrow==20.0.0",
11
- # "sqlalchemy==2.0.41",
12
  # ]
13
  # ///
14
 
15
  import marimo
16
 
17
- __generated_with = "0.14.8"
18
  app = marimo.App(width="medium")
19
 
20
 
@@ -167,7 +167,9 @@ def _(mo):
167
  r"""
168
  ## Databases
169
 
170
- Polars doesn't supports any databases _directly_, but rather uses other libraries as Engines. Reading and writing to databases does not supports Lazy execution, but you may pass an SQL Query for the database to pre-filter the data before reaches polars. See the [User Guide](https://docs.pola.rs/user-guide/io/database) for more details.
 
 
171
 
172
  Using the Arrow Database Connectivity SQLite support as an example:
173
  """
@@ -243,24 +245,38 @@ def _(mo):
243
  r"""
244
  ## Plugin Support
245
 
246
- You can also write [IO Plugins](https://docs.pola.rs/user-guide/plugins/io_plugins/) for Polars in order to support any format you need.
247
-
248
- Efficiently parsing the filter expressions is out of the scope for this notebook, but the simplest form of plugins are essentially generators that yield DataFrames. Even just this can help in many cases as it allows for polars to optimize the query and request data in batches as opposed to always loading everything in memory.
249
  """
250
  )
251
  return
252
 
253
 
 
 
 
 
 
 
 
 
254
  @app.cell(hide_code=True)
255
  def _(mo):
256
  mo.md(
257
  r"""
 
 
 
 
 
 
258
  Below is a example plugin which just takes the product between multiple iterables, some highlights are that:
259
 
260
  - You must use `register_io_source` for polars to create the LazyFrame which will consume the Generator
261
  - You are expected to provide a Schema before the Generator starts
262
  - - For many use cases the Plugin may be able to infer it, but you could also pass it explicitly to the plugin function
263
  - Ideally you should parse some of the filters and column selectors to avoid unnecessary work, but it is possible to delegate that to polars after loading the data in order to keep it simpler (at the cost of efficiency)
 
 
264
  """
265
  )
266
  return
@@ -339,7 +355,7 @@ def _(mo):
339
  r"""
340
  ### DuckDB
341
 
342
- In addition to Arrow interoperability support, [DuckDB](https://duckdb.org/) has also added support for loading query results into a polars LazyFrame
343
 
344
  You can read more about polars and duckdb integrations in
345
 
@@ -369,14 +385,14 @@ def _():
369
  @app.cell
370
  def _(duckdb_conn, duckdb_query):
371
  # Eager (default):
372
- duckdb_conn.execute(duckdb_query).pl()
373
  return
374
 
375
 
376
- @app.cell(disabled=True)
377
  def _(duckdb_conn, duckdb_query):
378
- # Lazy (merged but not yet released as of the time I am writing this, requires > 1.3.1):
379
- duckdb_conn.execute(duckdb_query).pl(lazy=True)
380
  return
381
 
382
 
@@ -580,10 +596,11 @@ def _(duckdb):
580
  duckdb_conn = duckdb.connect(":memory:")
581
 
582
  # Install and load the spatial extension for geometry support
 
583
  duckdb_conn.load_extension("spatial")
584
 
585
  # Create a table with geometry column
586
- duckdb_conn.execute("""
587
  CREATE TABLE locations (
588
  id INTEGER,
589
  name VARCHAR,
@@ -592,7 +609,7 @@ def _(duckdb):
592
  """)
593
 
594
  # Insert some sample data with geometry points
595
- duckdb_conn.execute("""
596
  INSERT INTO locations VALUES
597
  (1, 'New York', ST_Point(-74.0059, 40.7128)),
598
  (2, 'Los Angeles', ST_Point(-118.2437, 34.0522)),
 
1
  # /// script
2
  # requires-python = ">=3.12"
3
  # dependencies = [
4
+ # "adbc-driver-sqlite==1.7.0",
5
+ # "duckdb==1.4.0.dev2673",
6
  # "lxml==6.0.0",
7
  # "marimo",
8
+ # "pandas==2.3.2",
9
+ # "polars==1.32.3",
10
+ # "pyarrow==21.0.0",
11
+ # "sqlalchemy==2.0.43",
12
  # ]
13
  # ///
14
 
15
  import marimo
16
 
17
+ __generated_with = "0.15.0"
18
  app = marimo.App(width="medium")
19
 
20
 
 
167
  r"""
168
  ## Databases
169
 
170
+ Polars doesn't support any databases _directly_, but rather uses other libraries as Engines. Reading and writing to databases using polars methods does not support Lazy execution, but you may pass an SQL Query for the database to pre-filter the data before it reaches polars. See the [User Guide](https://docs.pola.rs/user-guide/io/database) for more details.
171
+
172
+ You can also read from databases using other libraries that offer [arrow support](#arrow-support) or [polars plugins](#plugin-support), some of which support lazy reading, and then load the results into polars.
173
 
174
  Using the Arrow Database Connectivity SQLite support as an example:
175
  """
 
245
  r"""
246
  ## Plugin Support
247
 
248
+ You can also write [IO Plugins](https://docs.pola.rs/user-guide/plugins/io_plugins/) for Polars in order to support any format you need, or use other libraries, such as DuckDB, that support polars via their own plugins.
 
 
249
  """
250
  )
251
  return
252
 
253
 
254
+ @app.cell
255
+ def _(duckdb, folder):
256
+ # Requires duckdb >= 1.4.0
257
+ conn = duckdb.connect(folder / "db.sqlite")
258
+ conn.sql("SELECT * FROM quick_reference").pl(lazy=True)
259
+ return
260
+
261
+
262
  @app.cell(hide_code=True)
263
  def _(mo):
264
  mo.md(
265
  r"""
266
+ ### Creating your own Plugin
267
+
268
+ The simplest form of plugins are essentially generators that yield DataFrames.
269
+
270
+ Without parsing filters you will be missing out on performance improvements, but even just this can help improve your performance in many cases, as it allows polars to optimize the query and request data in batches as opposed to always loading everything in memory.
271
+
272
  Below is a example plugin which just takes the product between multiple iterables, some highlights are that:
273
 
274
  - You must use `register_io_source` for polars to create the LazyFrame which will consume the Generator
275
  - You are expected to provide a Schema before the Generator starts
276
  - - For many use cases the Plugin may be able to infer it, but you could also pass it explicitly to the plugin function
277
  - Ideally you should parse some of the filters and column selectors to avoid unnecessary work, but it is possible to delegate that to polars after loading the data in order to keep it simpler (at the cost of efficiency)
278
+
279
+ Efficiently parsing the filter expressions is out of the scope for this notebook.
280
  """
281
  )
282
  return
 
355
  r"""
356
  ### DuckDB
357
 
358
+ As demonstrated above, in addition to Arrow interoperability support, [DuckDB](https://duckdb.org/) has also added support for loading query results into a polars DataFrame or LazyFrame via a polars plugin.
359
 
360
  You can read more about polars and duckdb integrations in
361
 
 
385
  @app.cell
386
  def _(duckdb_conn, duckdb_query):
387
  # Eager (default):
388
+ duckdb_conn.sql(duckdb_query).pl()
389
  return
390
 
391
 
392
+ @app.cell
393
  def _(duckdb_conn, duckdb_query):
394
+ # Lazy (requires duckdb >= 1.4.0):
395
+ duckdb_conn.sql(duckdb_query).pl(lazy=True)
396
  return
397
 
398
 
 
596
  duckdb_conn = duckdb.connect(":memory:")
597
 
598
  # Install and load the spatial extension for geometry support
599
+ duckdb_conn.install_extension("spatial")
600
  duckdb_conn.load_extension("spatial")
601
 
602
  # Create a table with geometry column
603
+ duckdb_conn.sql("""
604
  CREATE TABLE locations (
605
  id INTEGER,
606
  name VARCHAR,
 
609
  """)
610
 
611
  # Insert some sample data with geometry points
612
+ duckdb_conn.sql("""
613
  INSERT INTO locations VALUES
614
  (1, 'New York', ST_Point(-74.0059, 40.7128)),
615
  (2, 'Los Angeles', ST_Point(-118.2437, 34.0522)),