# gazet/dataset/scripts/build_inventory.py
# Commit dfb9466 (srmsoumya): "Fix: No pairs are created for mixed queries"
"""
Build entity inventory from divisions_area and natural_earth parquet files.
This script creates compact inventory tables containing only the fields needed
for candidate sampling and distractor generation.
Output:
- intermediate/divisions_area_inventory.parquet
- intermediate/natural_earth_inventory.parquet
"""
import duckdb
import pandas as pd
from pathlib import Path
from gazet.config import DIVISIONS_AREA_PATH, NATURAL_EARTH_PATH
def build_divisions_area_inventory(con: duckdb.DuckDBPyConnection) -> pd.DataFrame:
    """Load the trimmed-down entity table for the divisions_area source.

    Runs a single SELECT over the parquet file at ``DIVISIONS_AREA_PATH``.
    Identifier and classification columns are passed through, while
    ``area_sq_deg`` and the bounding box (``xmin``, ``ymin``, ``xmax``,
    ``ymax``) are derived from ``geometry``. Rows whose primary name is NULL
    or blank after trimming, or whose geometry is NULL, are excluded.
    A short summary of the result is printed to stdout.

    Args:
        con: DuckDB connection the query is executed on. The query calls
            ``ST_*`` functions, so those must be available on the connection
            (``build_inventory_to_dir`` loads the ``spatial`` extension
            before calling this).

    Returns:
        The query result as a pandas DataFrame.
    """
    sql = """
        SELECT
        'divisions_area' AS source,
        id,
        names."primary" AS name,
        subtype,
        country,
        region,
        admin_level,
        class,
        is_land,
        is_territorial,
        division_id,
        ST_Area(geometry) AS area_sq_deg,
        ST_XMin(geometry) AS xmin,
        ST_YMin(geometry) AS ymin,
        ST_XMax(geometry) AS xmax,
        ST_YMax(geometry) AS ymax
        FROM read_parquet(?)
        WHERE names."primary" IS NOT NULL
        AND trim(names."primary") != ''
        AND geometry IS NOT NULL
    """
    # The parquet path is bound to the single `?` placeholder.
    inventory = con.execute(sql, [DIVISIONS_AREA_PATH]).fetchdf()

    # Console summary: row count, per-subtype counts, distinct countries.
    entity_count = len(inventory)
    print(f"Divisions area inventory: {entity_count} entities")
    subtype_counts = inventory["subtype"].value_counts().to_dict()
    print(f"Subtypes: {subtype_counts}")
    unique_countries = inventory["country"].nunique()
    print(f"Countries: {unique_countries} unique")

    return inventory
def build_natural_earth_inventory(con: duckdb.DuckDBPyConnection) -> pd.DataFrame:
    """Load the trimmed-down entity table for the natural_earth source.

    Reads the parquet file at ``NATURAL_EARTH_PATH`` with one SELECT that
    tags every row with ``source = 'natural_earth'``, passes through the
    identifier and classification columns, and derives ``area_sq_deg`` and
    the bounding box (``xmin``, ``ymin``, ``xmax``, ``ymax``) from
    ``geometry``. Rows with a NULL or blank primary name, or a NULL geometry,
    are filtered out. The row count and subtype counts are printed to stdout.

    Args:
        con: DuckDB connection the query is executed on; it must be able to
            resolve the ``ST_*`` functions used in the query.

    Returns:
        The query result as a pandas DataFrame.
    """
    sql = """
        SELECT
        'natural_earth' AS source,
        id,
        names."primary" AS name,
        subtype,
        country,
        region,
        admin_level,
        class,
        is_land,
        is_territorial,
        ST_Area(geometry) AS area_sq_deg,
        ST_XMin(geometry) AS xmin,
        ST_YMin(geometry) AS ymin,
        ST_XMax(geometry) AS xmax,
        ST_YMax(geometry) AS ymax
        FROM read_parquet(?)
        WHERE names."primary" IS NOT NULL
        AND trim(names."primary") != ''
        AND geometry IS NOT NULL
    """
    # The parquet path is bound to the single `?` placeholder.
    result = con.execute(sql, [NATURAL_EARTH_PATH])
    inventory = result.fetchdf()

    # Console summary; the leading newline separates it from earlier output.
    entity_count = len(inventory)
    print(f"\nNatural earth inventory: {entity_count} entities")
    subtype_counts = inventory["subtype"].value_counts().to_dict()
    print(f"Subtypes: {subtype_counts}")

    return inventory
def build_inventory_to_dir(output_dir: Path) -> dict:
    """Build and save all inventory tables to output_dir.

    Reusable entry point for both local CLI and Modal.

    Opens an in-memory DuckDB connection, installs and loads the ``spatial``
    extension, builds the divisions_area and natural_earth inventories, and
    writes each one to ``output_dir`` as a parquet file
    (``divisions_area_inventory.parquet`` and
    ``natural_earth_inventory.parquet``). Progress is printed to stdout.

    Args:
        output_dir: Directory the parquet files are written to. It is
            created, including missing parents, if it does not exist.

    Returns:
        Dict with counts: {"divisions_area": int, "natural_earth": int}
    """
    output_dir.mkdir(exist_ok=True, parents=True)

    con = duckdb.connect()
    try:
        # The inventory queries call ST_* functions, which the spatial
        # extension provides.
        con.execute("INSTALL spatial")
        con.execute("LOAD spatial")

        print("Building divisions_area inventory...")
        divisions_df = build_divisions_area_inventory(con)
        divisions_path = output_dir / "divisions_area_inventory.parquet"
        divisions_df.to_parquet(divisions_path, index=False)
        print(f"Saved to {divisions_path}")

        print("\nBuilding natural_earth inventory...")
        natural_earth_df = build_natural_earth_inventory(con)
        natural_earth_path = output_dir / "natural_earth_inventory.parquet"
        natural_earth_df.to_parquet(natural_earth_path, index=False)
        print(f"Saved to {natural_earth_path}")
    finally:
        # Close the connection even when the extension setup, a query, or a
        # parquet write raises; previously it was only closed on success.
        con.close()

    counts = {
        "divisions_area": len(divisions_df),
        "natural_earth": len(natural_earth_df),
    }
    total = counts["divisions_area"] + counts["natural_earth"]
    print("\nInventory build complete")
    print(f" Total entities: {total}")
    return counts
def main():
    """Run the inventory build from the command line.

    Output goes to the ``intermediate`` directory located one level above
    the directory that contains this script.
    """
    script_dir = Path(__file__).parent
    intermediate_dir = script_dir.parent / "intermediate"
    build_inventory_to_dir(intermediate_dir)


# Only run the build when executed as a script, not when imported.
if __name__ == "__main__":
    main()