Spaces:
Sleeping
Sleeping
File size: 3,743 Bytes
"""
Build entity inventory from divisions_area and natural_earth parquet files.

This script creates compact inventory tables containing only the fields needed
for candidate sampling and distractor generation.

Output:
- intermediate/divisions_area_inventory.parquet
- intermediate/natural_earth_inventory.parquet
"""
import duckdb
import pandas as pd
from pathlib import Path
from gazet.config import DIVISIONS_AREA_PATH, NATURAL_EARTH_PATH
def build_divisions_area_inventory(con: duckdb.DuckDBPyConnection) -> pd.DataFrame:
    """Read the divisions_area parquet file into a compact inventory frame.

    Runs a single SELECT over ``DIVISIONS_AREA_PATH`` that keeps the
    identifying and classification columns, exposes ``names."primary"`` as
    ``name``, and adds ``ST_Area`` plus the ``ST_XMin``/``ST_YMin``/
    ``ST_XMax``/``ST_YMax`` bounds of each geometry. Rows whose primary name
    is NULL or empty after trimming, or whose geometry is NULL, are dropped.
    Entity, subtype and country summaries are printed before returning.

    Args:
        con: DuckDB connection used to run the query. The query calls
            ``ST_*`` functions, so presumably the spatial extension must
            already be loaded on this connection.

    Returns:
        One row per retained entity, with ``source`` set to
        ``'divisions_area'``.
    """
    # The query text is kept verbatim; the parquet path is bound through the
    # ``?`` placeholder instead of being formatted into the SQL.
    sql = """
SELECT
'divisions_area' AS source,
id,
names."primary" AS name,
subtype,
country,
region,
admin_level,
class,
is_land,
is_territorial,
division_id,
ST_Area(geometry) AS area_sq_deg,
ST_XMin(geometry) AS xmin,
ST_YMin(geometry) AS ymin,
ST_XMax(geometry) AS xmax,
ST_YMax(geometry) AS ymax
FROM read_parquet(?)
WHERE names."primary" IS NOT NULL
AND trim(names."primary") != ''
AND geometry IS NOT NULL
"""
    result = con.execute(sql, [DIVISIONS_AREA_PATH])
    inventory = result.fetchdf()

    # Each summary is computed immediately before it is printed.
    entity_count = len(inventory)
    print(f"Divisions area inventory: {entity_count} entities")
    subtype_counts = inventory['subtype'].value_counts().to_dict()
    print(f"Subtypes: {subtype_counts}")
    country_count = inventory['country'].nunique()
    print(f"Countries: {country_count} unique")
    return inventory
def build_natural_earth_inventory(con: duckdb.DuckDBPyConnection) -> pd.DataFrame:
    """Read the natural_earth parquet file into a compact inventory frame.

    Runs a single SELECT over ``NATURAL_EARTH_PATH`` that keeps the
    identifying and classification columns, exposes ``names."primary"`` as
    ``name``, and adds ``ST_Area`` plus the ``ST_XMin``/``ST_YMin``/
    ``ST_XMax``/``ST_YMax`` bounds of each geometry. Rows whose primary name
    is NULL or empty after trimming, or whose geometry is NULL, are dropped.
    Entity and subtype summaries are printed before returning.

    Args:
        con: DuckDB connection used to run the query. The query calls
            ``ST_*`` functions, so presumably the spatial extension must
            already be loaded on this connection.

    Returns:
        One row per retained entity, with ``source`` set to
        ``'natural_earth'``.
    """
    # The query text is kept verbatim; the parquet path is bound through the
    # ``?`` placeholder instead of being formatted into the SQL.
    sql = """
SELECT
'natural_earth' AS source,
id,
names."primary" AS name,
subtype,
country,
region,
admin_level,
class,
is_land,
is_territorial,
ST_Area(geometry) AS area_sq_deg,
ST_XMin(geometry) AS xmin,
ST_YMin(geometry) AS ymin,
ST_XMax(geometry) AS xmax,
ST_YMax(geometry) AS ymax
FROM read_parquet(?)
WHERE names."primary" IS NOT NULL
AND trim(names."primary") != ''
AND geometry IS NOT NULL
"""
    result = con.execute(sql, [NATURAL_EARTH_PATH])
    inventory = result.fetchdf()

    # Each summary is computed immediately before it is printed.
    entity_count = len(inventory)
    print(f"\nNatural earth inventory: {entity_count} entities")
    subtype_counts = inventory['subtype'].value_counts().to_dict()
    print(f"Subtypes: {subtype_counts}")
    return inventory
def build_inventory_to_dir(output_dir: Path) -> dict:
    """Build and save all inventory tables to output_dir.

    Reusable entry point for both local CLI and Modal.

    Creates ``output_dir`` (including parents) if needed, opens an in-memory
    DuckDB connection with the spatial extension installed and loaded, builds
    both inventories and writes each one as a parquet file without the
    DataFrame index.

    Args:
        output_dir: Directory that receives
            ``divisions_area_inventory.parquet`` and
            ``natural_earth_inventory.parquet``.

    Returns:
        Dict with counts: {"divisions_area": int, "natural_earth": int}
    """
    output_dir.mkdir(exist_ok=True, parents=True)

    con = duckdb.connect()
    # try/finally guarantees the connection is released even when installing
    # the extension, running a query, or writing a parquet file raises.
    try:
        con.execute("INSTALL spatial")
        con.execute("LOAD spatial")

        print("Building divisions_area inventory...")
        divisions_df = build_divisions_area_inventory(con)
        divisions_path = output_dir / "divisions_area_inventory.parquet"
        divisions_df.to_parquet(divisions_path, index=False)
        print(f"Saved to {divisions_path}")

        print("\nBuilding natural_earth inventory...")
        natural_earth_df = build_natural_earth_inventory(con)
        natural_earth_path = output_dir / "natural_earth_inventory.parquet"
        natural_earth_df.to_parquet(natural_earth_path, index=False)
        print(f"Saved to {natural_earth_path}")
    finally:
        con.close()

    counts = {
        "divisions_area": len(divisions_df),
        "natural_earth": len(natural_earth_df),
    }
    total = counts["divisions_area"] + counts["natural_earth"]
    print("\nInventory build complete")
    print(f" Total entities: {total}")
    return counts
def main():
    """Write the inventory tables to the default ``intermediate`` directory.

    The directory is resolved relative to this file: two ``parent`` steps up
    from the script path, then ``intermediate``.
    """
    script_path = Path(__file__)
    base_dir = script_path.parent.parent
    build_inventory_to_dir(base_dir / "intermediate")
# Run the inventory build only when this file is executed as a script, so
# importing the module has no side effects.
if __name__ == "__main__":
    main()
|