Spaces:
Running
Running
Update DuckDB polars support
Browse files- polars/03_loading_data.py +35 -18
polars/03_loading_data.py
CHANGED
|
@@ -1,20 +1,20 @@
|
|
| 1 |
# /// script
|
| 2 |
# requires-python = ">=3.12"
|
| 3 |
# dependencies = [
|
| 4 |
-
# "adbc-driver-sqlite==1.
|
| 5 |
-
# "duckdb==1.
|
| 6 |
# "lxml==6.0.0",
|
| 7 |
# "marimo",
|
| 8 |
-
# "pandas==2.3.
|
| 9 |
-
# "polars==1.
|
| 10 |
-
# "pyarrow==
|
| 11 |
-
# "sqlalchemy==2.0.
|
| 12 |
# ]
|
| 13 |
# ///
|
| 14 |
|
| 15 |
import marimo
|
| 16 |
|
| 17 |
-
__generated_with = "0.
|
| 18 |
app = marimo.App(width="medium")
|
| 19 |
|
| 20 |
|
|
@@ -167,7 +167,9 @@ def _(mo):
|
|
| 167 |
r"""
|
| 168 |
## Databases
|
| 169 |
|
| 170 |
-
Polars doesn't support any databases _directly_, but rather uses other libraries as Engines. Reading and writing to databases does not support Lazy execution, but you may pass an SQL Query for the database to pre-filter the data before it reaches polars. See the [User Guide](https://docs.pola.rs/user-guide/io/database) for more details.
|
|
|
|
|
|
|
| 171 |
|
| 172 |
Using the Arrow Database Connectivity SQLite support as an example:
|
| 173 |
"""
|
|
@@ -243,24 +245,38 @@ def _(mo):
|
|
| 243 |
r"""
|
| 244 |
## Plugin Support
|
| 245 |
|
| 246 |
-
You can also write [IO Plugins](https://docs.pola.rs/user-guide/plugins/io_plugins/) for Polars in order to support any format you need.
|
| 247 |
-
|
| 248 |
-
Efficiently parsing the filter expressions is out of the scope for this notebook, but the simplest form of plugins are essentially generators that yield DataFrames. Even just this can help in many cases as it allows for polars to optimize the query and request data in batches as opposed to always loading everything in memory.
|
| 249 |
"""
|
| 250 |
)
|
| 251 |
return
|
| 252 |
|
| 253 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 254 |
@app.cell(hide_code=True)
|
| 255 |
def _(mo):
|
| 256 |
mo.md(
|
| 257 |
r"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 258 |
Below is an example plugin which just takes the product between multiple iterables; some highlights are that:
|
| 259 |
|
| 260 |
- You must use `register_io_source` for polars to create the LazyFrame which will consume the Generator
|
| 261 |
- You are expected to provide a Schema before the Generator starts
|
| 262 |
- - For many use cases the Plugin may be able to infer it, but you could also pass it explicitly to the plugin function
|
| 263 |
- Ideally you should parse some of the filters and column selectors to avoid unnecessary work, but it is possible to delegate that to polars after loading the data in order to keep it simpler (at the cost of efficiency)
|
|
|
|
|
|
|
| 264 |
"""
|
| 265 |
)
|
| 266 |
return
|
|
@@ -339,7 +355,7 @@ def _(mo):
|
|
| 339 |
r"""
|
| 340 |
### DuckDB
|
| 341 |
|
| 342 |
-
|
| 343 |
|
| 344 |
You can read more about polars and duckdb integrations in
|
| 345 |
|
|
@@ -369,14 +385,14 @@ def _():
|
|
| 369 |
@app.cell
|
| 370 |
def _(duckdb_conn, duckdb_query):
|
| 371 |
# Eager (default):
|
| 372 |
-
duckdb_conn.
|
| 373 |
return
|
| 374 |
|
| 375 |
|
| 376 |
-
@app.cell
|
| 377 |
def _(duckdb_conn, duckdb_query):
|
| 378 |
-
# Lazy (
|
| 379 |
-
duckdb_conn.
|
| 380 |
return
|
| 381 |
|
| 382 |
|
|
@@ -580,10 +596,11 @@ def _(duckdb):
|
|
| 580 |
duckdb_conn = duckdb.connect(":memory:")
|
| 581 |
|
| 582 |
# Install and load the spatial extension for geometry support
|
|
|
|
| 583 |
duckdb_conn.load_extension("spatial")
|
| 584 |
|
| 585 |
# Create a table with geometry column
|
| 586 |
-
duckdb_conn.
|
| 587 |
CREATE TABLE locations (
|
| 588 |
id INTEGER,
|
| 589 |
name VARCHAR,
|
|
@@ -592,7 +609,7 @@ def _(duckdb):
|
|
| 592 |
""")
|
| 593 |
|
| 594 |
# Insert some sample data with geometry points
|
| 595 |
-
duckdb_conn.
|
| 596 |
INSERT INTO locations VALUES
|
| 597 |
(1, 'New York', ST_Point(-74.0059, 40.7128)),
|
| 598 |
(2, 'Los Angeles', ST_Point(-118.2437, 34.0522)),
|
|
|
|
| 1 |
# /// script
|
| 2 |
# requires-python = ">=3.12"
|
| 3 |
# dependencies = [
|
| 4 |
+
# "adbc-driver-sqlite==1.7.0",
|
| 5 |
+
# "duckdb==1.4.0.dev2673",
|
| 6 |
# "lxml==6.0.0",
|
| 7 |
# "marimo",
|
| 8 |
+
# "pandas==2.3.2",
|
| 9 |
+
# "polars==1.32.3",
|
| 10 |
+
# "pyarrow==21.0.0",
|
| 11 |
+
# "sqlalchemy==2.0.43",
|
| 12 |
# ]
|
| 13 |
# ///
|
| 14 |
|
| 15 |
import marimo
|
| 16 |
|
| 17 |
+
__generated_with = "0.15.0"
|
| 18 |
app = marimo.App(width="medium")
|
| 19 |
|
| 20 |
|
|
|
|
| 167 |
r"""
|
| 168 |
## Databases
|
| 169 |
|
| 170 |
+
Polars doesn't support any databases _directly_, but rather uses other libraries as Engines. Reading and writing to databases using polars methods does not support Lazy execution, but you may pass an SQL Query for the database to pre-filter the data before it reaches polars. See the [User Guide](https://docs.pola.rs/user-guide/io/database) for more details.
|
| 171 |
+
|
| 172 |
+
You can also use other libraries with [arrow support](#arrow-support) or [polars plugins](#plugin-support) to read from databases before loading into polars, some of which support lazy reading.
|
| 173 |
|
| 174 |
Using the Arrow Database Connectivity SQLite support as an example:
|
| 175 |
"""
|
|
|
|
| 245 |
r"""
|
| 246 |
## Plugin Support
|
| 247 |
|
| 248 |
+
You can also write [IO Plugins](https://docs.pola.rs/user-guide/plugins/io_plugins/) for Polars in order to support any format you need, or use other libraries that support polars via their own plugins such as DuckDB.
|
|
|
|
|
|
|
| 249 |
"""
|
| 250 |
)
|
| 251 |
return
|
| 252 |
|
| 253 |
|
| 254 |
+
@app.cell
|
| 255 |
+
def _(duckdb, folder):
|
| 256 |
+
# Requires duckdb >= 1.4.0
|
| 257 |
+
conn = duckdb.connect(folder / "db.sqlite")
|
| 258 |
+
conn.sql("SELECT * FROM quick_reference").pl(lazy=True)
|
| 259 |
+
return
|
| 260 |
+
|
| 261 |
+
|
| 262 |
@app.cell(hide_code=True)
|
| 263 |
def _(mo):
|
| 264 |
mo.md(
|
| 265 |
r"""
|
| 266 |
+
### Creating your own Plugin
|
| 267 |
+
|
| 268 |
+
The simplest form of plugins are essentially generators that yield DataFrames.
|
| 269 |
+
|
| 270 |
+
Without parsing filters you will be missing out on performance improvements, but even just this can help improve your performance in many cases as it allows polars to optimize the query and request data in batches as opposed to always loading everything in memory.
|
| 271 |
+
|
| 272 |
Below is an example plugin which just takes the product between multiple iterables; some highlights are that:
|
| 273 |
|
| 274 |
- You must use `register_io_source` for polars to create the LazyFrame which will consume the Generator
|
| 275 |
- You are expected to provide a Schema before the Generator starts
|
| 276 |
- - For many use cases the Plugin may be able to infer it, but you could also pass it explicitly to the plugin function
|
| 277 |
- Ideally you should parse some of the filters and column selectors to avoid unnecessary work, but it is possible to delegate that to polars after loading the data in order to keep it simpler (at the cost of efficiency)
|
| 278 |
+
|
| 279 |
+
Efficiently parsing the filter expressions is out of the scope for this notebook.
|
| 280 |
"""
|
| 281 |
)
|
| 282 |
return
|
|
|
|
| 355 |
r"""
|
| 356 |
### DuckDB
|
| 357 |
|
| 358 |
+
As demonstrated above, in addition to Arrow interoperability support, [DuckDB](https://duckdb.org/) has also added support for loading query results into a polars DataFrame or LazyFrame via a polars plugin.
|
| 359 |
|
| 360 |
You can read more about polars and duckdb integrations in
|
| 361 |
|
|
|
|
| 385 |
@app.cell
|
| 386 |
def _(duckdb_conn, duckdb_query):
|
| 387 |
# Eager (default):
|
| 388 |
+
duckdb_conn.sql(duckdb_query).pl()
|
| 389 |
return
|
| 390 |
|
| 391 |
|
| 392 |
+
@app.cell
|
| 393 |
def _(duckdb_conn, duckdb_query):
|
| 394 |
+
# Lazy (requires >= 1.4.0):
|
| 395 |
+
duckdb_conn.sql(duckdb_query).pl(lazy=True)
|
| 396 |
return
|
| 397 |
|
| 398 |
|
|
|
|
| 596 |
duckdb_conn = duckdb.connect(":memory:")
|
| 597 |
|
| 598 |
# Install and load the spatial extension for geometry support
|
| 599 |
+
duckdb_conn.install_extension("spatial")
|
| 600 |
duckdb_conn.load_extension("spatial")
|
| 601 |
|
| 602 |
# Create a table with geometry column
|
| 603 |
+
duckdb_conn.sql("""
|
| 604 |
CREATE TABLE locations (
|
| 605 |
id INTEGER,
|
| 606 |
name VARCHAR,
|
|
|
|
| 609 |
""")
|
| 610 |
|
| 611 |
# Insert some sample data with geometry points
|
| 612 |
+
duckdb_conn.sql("""
|
| 613 |
INSERT INTO locations VALUES
|
| 614 |
(1, 'New York', ST_Point(-74.0059, 40.7128)),
|
| 615 |
(2, 'Los Angeles', ST_Point(-118.2437, 34.0522)),
|