| """Export tables from a local Postgres dev DB to parquet under ``data/dev/``. |
| |
| The agent reads DuckDB-on-Parquet at runtime, but during development the |
| source of truth is whatever Postgres instance the developer has running |
| locally (e.g. ``postgres-mealgen``). This script bridges the two: it |
| ATTACHes the Postgres database via DuckDB's ``postgres`` extension and |
| writes one parquet file per table into ``data/dev/``. |
| |
| Re-run after any schema or row change upstream. |
| |
| Usage: |
| uv run python scripts/build_dev_parquet.py |
| |
| Required env vars (typically set in ``.env``): |
| DEV_PG_DSN libpq-style DSN, e.g. |
| ``postgresql://user:pass@localhost:5432/mealgen`` |
| or ``host=localhost port=5432 dbname=mealgen user=...`` |
| |
| Tables exported are listed in ``DEV_TABLES`` below. Add to that tuple |
| when a new table needs to be available to the agent. |
| """ |
|
|
| from __future__ import annotations |
|
|
| import logging |
| import os |
| import sys |
| from pathlib import Path |
|
|
| import duckdb |
| from dotenv import load_dotenv |
|
|
| logger = logging.getLogger(__name__) |
|
|
| REPO_ROOT: Path = Path(__file__).resolve().parent.parent |
| DEV_DIR: Path = REPO_ROOT / "data" / "dev" |
|
|
| DEV_TABLES: tuple[str, ...] = ("items", "item_nutrition") |
|
|
|
|
| def export(dsn: str, out_dir: Path, tables: tuple[str, ...]) -> dict[str, int]: |
| """ATTACH a Postgres DSN via DuckDB and write one parquet per table. |
| |
| Returns a mapping of table name to row count written. |
| """ |
| out_dir.mkdir(parents=True, exist_ok=True) |
|
|
| conn = duckdb.connect(":memory:") |
| conn.execute("INSTALL postgres") |
| conn.execute("LOAD postgres") |
|
|
| |
| |
| |
| conn.execute(f"ATTACH '{_sql_quote(dsn)}' AS pg (TYPE POSTGRES, READ_ONLY)") |
|
|
| counts: dict[str, int] = {} |
| for table in tables: |
| out_path = out_dir / f"{table}.parquet" |
| logger.info("Exporting public.%s -> %s", table, out_path) |
| conn.execute( |
| f"COPY (SELECT * FROM pg.public.{table}) " |
| f"TO '{_sql_quote(str(out_path))}' (FORMAT PARQUET)" |
| ) |
| row = conn.execute( |
| "SELECT COUNT(*) FROM read_parquet(?)", |
| [str(out_path)], |
| ).fetchone() |
| assert row is not None |
| counts[table] = int(row[0]) |
| return counts |
|
|
|
|
| def _sql_quote(value: str) -> str: |
| """Escape single quotes for embedding in a SQL string literal.""" |
| return value.replace("'", "''") |
|
|
|
|
| def main() -> int: |
| logging.basicConfig(level=logging.INFO, format="%(asctime)s %(message)s") |
| load_dotenv() |
|
|
| dsn = os.getenv("DEV_PG_DSN") |
| if not dsn: |
| print( |
| "DEV_PG_DSN is not set. Add it to .env (see .env.example).", |
| file=sys.stderr, |
| ) |
| return 1 |
|
|
| counts = export(dsn, DEV_DIR, DEV_TABLES) |
| for table, n in counts.items(): |
| rel = (DEV_DIR / f"{table}.parquet").relative_to(REPO_ROOT) |
| print(f" {table}: {n:>7,} rows -> {rel}") |
| return 0 |
|
|
|
|
| if __name__ == "__main__": |
| sys.exit(main()) |
|
|