Spaces:
Running
Running
| """ | |
| Build entity inventory from divisions_area and natural_earth parquet files. | |
| This script creates compact inventory tables containing only the fields needed | |
| for candidate sampling and distractor generation. | |
| Output: | |
| - intermediate/divisions_area_inventory.parquet | |
| - intermediate/natural_earth_inventory.parquet | |
| """ | |
| import duckdb | |
| import pandas as pd | |
| from pathlib import Path | |
| from gazet.config import DIVISIONS_AREA_PATH, NATURAL_EARTH_PATH | |
def build_divisions_area_inventory(con: duckdb.DuckDBPyConnection) -> pd.DataFrame:
    """Read the divisions_area parquet file and return a slim inventory table.

    Only rows with a non-blank primary name and a non-null geometry are kept.
    Each row carries a constant ``source`` tag, the identifying/classification
    columns, the geometry's area (``area_sq_deg``) and its bounding box
    (``xmin``/``ymin``/``xmax``/``ymax``).

    Args:
        con: Open DuckDB connection. The query uses ``ST_*`` functions, so the
            spatial extension must already be loaded on it.

    Returns:
        DataFrame with one row per retained entity.
    """
    sql = """
        SELECT
            'divisions_area' AS source,
            id,
            names."primary" AS name,
            subtype,
            country,
            region,
            admin_level,
            class,
            is_land,
            is_territorial,
            division_id,
            ST_Area(geometry) AS area_sq_deg,
            ST_XMin(geometry) AS xmin,
            ST_YMin(geometry) AS ymin,
            ST_XMax(geometry) AS xmax,
            ST_YMax(geometry) AS ymax
        FROM read_parquet(?)
        WHERE names."primary" IS NOT NULL
          AND trim(names."primary") != ''
          AND geometry IS NOT NULL
    """
    # The parquet path is bound as a query parameter, not interpolated.
    result = con.execute(sql, [DIVISIONS_AREA_PATH])
    inventory = result.fetchdf()

    # Short summary on stdout so a build log shows what was extracted.
    entity_count = len(inventory)
    subtype_counts = inventory["subtype"].value_counts().to_dict()
    country_count = inventory["country"].nunique()
    print(f"Divisions area inventory: {entity_count} entities")
    print(f"Subtypes: {subtype_counts}")
    print(f"Countries: {country_count} unique")

    return inventory
def build_natural_earth_inventory(con: duckdb.DuckDBPyConnection) -> pd.DataFrame:
    """Read the natural_earth parquet file and return a slim inventory table.

    Only rows with a non-blank primary name and a non-null geometry are kept.
    Each row carries a constant ``source`` tag, the identifying/classification
    columns, the geometry's area (``area_sq_deg``) and its bounding box
    (``xmin``/``ymin``/``xmax``/``ymax``).

    Args:
        con: Open DuckDB connection. The query uses ``ST_*`` functions, so the
            spatial extension must already be loaded on it.

    Returns:
        DataFrame with one row per retained entity.
    """
    sql = """
        SELECT
            'natural_earth' AS source,
            id,
            names."primary" AS name,
            subtype,
            country,
            region,
            admin_level,
            class,
            is_land,
            is_territorial,
            ST_Area(geometry) AS area_sq_deg,
            ST_XMin(geometry) AS xmin,
            ST_YMin(geometry) AS ymin,
            ST_XMax(geometry) AS xmax,
            ST_YMax(geometry) AS ymax
        FROM read_parquet(?)
        WHERE names."primary" IS NOT NULL
          AND trim(names."primary") != ''
          AND geometry IS NOT NULL
    """
    # The parquet path is bound as a query parameter, not interpolated.
    result = con.execute(sql, [NATURAL_EARTH_PATH])
    inventory = result.fetchdf()

    # Short summary on stdout so a build log shows what was extracted.
    entity_count = len(inventory)
    subtype_counts = inventory["subtype"].value_counts().to_dict()
    print(f"\nNatural earth inventory: {entity_count} entities")
    print(f"Subtypes: {subtype_counts}")

    return inventory
def build_inventory_to_dir(output_dir: Path) -> dict:
    """Build and save all inventory tables to output_dir.

    Reusable entry point for both local CLI and Modal.

    Creates ``output_dir`` (including parents) if needed, opens a DuckDB
    connection with the spatial extension loaded, and writes:

    - ``divisions_area_inventory.parquet``
    - ``natural_earth_inventory.parquet``

    Args:
        output_dir: Directory that receives the parquet files. A plain string
            path is accepted as well.

    Returns:
        Dict with counts: {"divisions_area": int, "natural_earth": int}
    """
    output_dir = Path(output_dir)
    output_dir.mkdir(exist_ok=True, parents=True)

    con = duckdb.connect()
    try:
        # The inventory queries rely on ST_* functions from the spatial extension.
        con.execute("INSTALL spatial")
        con.execute("LOAD spatial")

        print("Building divisions_area inventory...")
        divisions_df = build_divisions_area_inventory(con)
        divisions_path = output_dir / "divisions_area_inventory.parquet"
        divisions_df.to_parquet(divisions_path, index=False)
        print(f"Saved to {divisions_path}")

        print("\nBuilding natural_earth inventory...")
        natural_earth_df = build_natural_earth_inventory(con)
        natural_earth_path = output_dir / "natural_earth_inventory.parquet"
        natural_earth_df.to_parquet(natural_earth_path, index=False)
        print(f"Saved to {natural_earth_path}")
    finally:
        # Release the connection even when a query or a parquet write fails.
        con.close()

    counts = {
        "divisions_area": len(divisions_df),
        "natural_earth": len(natural_earth_df),
    }
    total = counts["divisions_area"] + counts["natural_earth"]
    print("\nInventory build complete")
    print(f" Total entities: {total}")
    return counts
def main():
    """Build the inventory tables into the default ``intermediate`` directory.

    The directory is resolved relative to this file: it is the ``intermediate``
    folder next to the directory that contains this script.
    """
    script_dir = Path(__file__).parent
    base_dir = script_dir.parent
    build_inventory_to_dir(base_dir / "intermediate")


# Run the build only when executed as a script, not when imported.
if __name__ == "__main__":
    main()