File size: 3,150 Bytes
5d30bdc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
"""Export tables from a local Postgres dev DB to parquet under ``data/dev/``.

The agent reads DuckDB-on-Parquet at runtime, but during development the
source of truth is whatever Postgres instance the developer has running
locally (e.g. ``postgres-mealgen``). This script bridges the two: it
ATTACHes the Postgres database via DuckDB's ``postgres`` extension and
writes one parquet file per table into ``data/dev/``.

Re-run after any schema or row change upstream.

Usage:
    uv run python scripts/build_dev_parquet.py

Required env vars (typically set in ``.env``):
    DEV_PG_DSN   libpq-style DSN, e.g.
                 ``postgresql://user:pass@localhost:5432/mealgen``
                 or ``host=localhost port=5432 dbname=mealgen user=...``

Tables exported are listed in ``DEV_TABLES`` below. Add to that tuple
when a new table needs to be available to the agent.
"""

from __future__ import annotations

import logging
import os
import sys
from pathlib import Path

import duckdb
from dotenv import load_dotenv

logger = logging.getLogger(__name__)

REPO_ROOT: Path = Path(__file__).resolve().parent.parent
DEV_DIR: Path = REPO_ROOT / "data" / "dev"

DEV_TABLES: tuple[str, ...] = ("items", "item_nutrition")


def export(dsn: str, out_dir: Path, tables: tuple[str, ...]) -> dict[str, int]:
    """ATTACH a Postgres DSN via DuckDB and write one parquet per table.

    Returns a mapping of table name to row count written.
    """
    out_dir.mkdir(parents=True, exist_ok=True)

    conn = duckdb.connect(":memory:")
    conn.execute("INSTALL postgres")
    conn.execute("LOAD postgres")

    # ATTACH and COPY don't bind parameters; inline the literals with the
    # standard SQL single-quote escape. DSN and out_path are
    # developer-controlled (env var + repo-local path), not external input.
    conn.execute(f"ATTACH '{_sql_quote(dsn)}' AS pg (TYPE POSTGRES, READ_ONLY)")

    counts: dict[str, int] = {}
    for table in tables:
        out_path = out_dir / f"{table}.parquet"
        logger.info("Exporting public.%s -> %s", table, out_path)
        conn.execute(
            f"COPY (SELECT * FROM pg.public.{table}) "
            f"TO '{_sql_quote(str(out_path))}' (FORMAT PARQUET)"
        )
        row = conn.execute(
            "SELECT COUNT(*) FROM read_parquet(?)",
            [str(out_path)],
        ).fetchone()
        assert row is not None
        counts[table] = int(row[0])
    return counts


def _sql_quote(value: str) -> str:
    """Escape single quotes for embedding in a SQL string literal."""
    return value.replace("'", "''")


def main() -> int:
    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(message)s")
    load_dotenv()

    dsn = os.getenv("DEV_PG_DSN")
    if not dsn:
        print(
            "DEV_PG_DSN is not set. Add it to .env (see .env.example).",
            file=sys.stderr,
        )
        return 1

    counts = export(dsn, DEV_DIR, DEV_TABLES)
    for table, n in counts.items():
        rel = (DEV_DIR / f"{table}.parquet").relative_to(REPO_ROOT)
        print(f"  {table}: {n:>7,} rows -> {rel}")
    return 0


if __name__ == "__main__":
    sys.exit(main())