File size: 3,743 Bytes
d8d4856
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dfb9466
d8d4856
dfb9466
d8d4856
dfb9466
d8d4856
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dfb9466
d8d4856
dfb9466
d8d4856
dfb9466
d8d4856
 
 
 
 
 
 
c88725f
 
 
 
 
 
 
 
d8d4856
c88725f
d8d4856
 
 
c88725f
d8d4856
 
 
 
 
c88725f
d8d4856
 
 
 
 
c88725f
d8d4856
c88725f
 
 
 
 
 
 
 
 
 
 
d8d4856
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
"""
Build entity inventory from divisions_area and natural_earth parquet files.

This script creates compact inventory tables containing only the fields needed
for candidate sampling and distractor generation.

Output:
- intermediate/divisions_area_inventory.parquet
- intermediate/natural_earth_inventory.parquet
"""

import duckdb
import pandas as pd
from pathlib import Path

from gazet.config import DIVISIONS_AREA_PATH, NATURAL_EARTH_PATH


def build_divisions_area_inventory(con: duckdb.DuckDBPyConnection) -> pd.DataFrame:
    """Query the divisions_area parquet file for its compact inventory rows.

    A row is kept only when its primary name is non-NULL and not blank after
    trimming, and its geometry is non-NULL. Each row carries a constant
    ``source`` tag of ``'divisions_area'``, the identifying and
    classification columns, the geometry area (aliased ``area_sq_deg``) and
    the geometry's min/max X and Y extents.

    A short summary (row count, per-subtype counts, number of distinct
    countries) is printed to stdout.

    Args:
        con: DuckDB connection used to run the query. The query calls
            ``ST_*`` functions, so the connection presumably needs the
            spatial extension loaded already -- confirm against callers.

    Returns:
        The query result as a pandas DataFrame.
    """
    sql = """
    SELECT 
        'divisions_area' AS source,
        id,
        names."primary" AS name,
        subtype,
        country,
        region,
        admin_level,
        class,
        is_land,
        is_territorial,
        division_id,
        ST_Area(geometry) AS area_sq_deg,
        ST_XMin(geometry) AS xmin,
        ST_YMin(geometry) AS ymin,
        ST_XMax(geometry) AS xmax,
        ST_YMax(geometry) AS ymax
    FROM read_parquet(?)
    WHERE names."primary" IS NOT NULL
      AND trim(names."primary") != ''
      AND geometry IS NOT NULL
    """

    # The parquet location is bound as a query parameter, not interpolated.
    result = con.execute(sql, [DIVISIONS_AREA_PATH])
    inventory = result.fetchdf()

    # Console summary of what was extracted.
    print(f"Divisions area inventory: {len(inventory)} entities")
    subtype_counts = inventory["subtype"].value_counts().to_dict()
    print(f"Subtypes: {subtype_counts}")
    distinct_countries = inventory["country"].nunique()
    print(f"Countries: {distinct_countries} unique")

    return inventory


def build_natural_earth_inventory(con: duckdb.DuckDBPyConnection) -> pd.DataFrame:
    """Query the natural_earth parquet file for its compact inventory rows.

    A row is kept only when its primary name is non-NULL and not blank after
    trimming, and its geometry is non-NULL. Each row carries a constant
    ``source`` tag of ``'natural_earth'``, the identifying and
    classification columns, the geometry area (aliased ``area_sq_deg``) and
    the geometry's min/max X and Y extents. No ``division_id`` column is
    selected here.

    A short summary (row count and per-subtype counts) is printed to stdout.

    Args:
        con: DuckDB connection used to run the query. The query calls
            ``ST_*`` functions, so the connection presumably needs the
            spatial extension loaded already -- confirm against callers.

    Returns:
        The query result as a pandas DataFrame.
    """
    sql = """
    SELECT 
        'natural_earth' AS source,
        id,
        names."primary" AS name,
        subtype,
        country,
        region,
        admin_level,
        class,
        is_land,
        is_territorial,
        ST_Area(geometry) AS area_sq_deg,
        ST_XMin(geometry) AS xmin,
        ST_YMin(geometry) AS ymin,
        ST_XMax(geometry) AS xmax,
        ST_YMax(geometry) AS ymax
    FROM read_parquet(?)
    WHERE names."primary" IS NOT NULL
      AND trim(names."primary") != ''
      AND geometry IS NOT NULL
    """

    # The parquet location is bound as a query parameter, not interpolated.
    result = con.execute(sql, [NATURAL_EARTH_PATH])
    inventory = result.fetchdf()

    # Console summary of what was extracted.
    print(f"\nNatural earth inventory: {len(inventory)} entities")
    subtype_counts = inventory["subtype"].value_counts().to_dict()
    print(f"Subtypes: {subtype_counts}")

    return inventory


def build_inventory_to_dir(output_dir: Path) -> dict:
    """Build and save all inventory tables to output_dir.

    Reusable entry point for both local CLI and Modal.

    Creates ``output_dir`` (and any missing parents), opens an in-memory
    DuckDB connection with the spatial extension installed and loaded, then
    writes ``divisions_area_inventory.parquet`` and
    ``natural_earth_inventory.parquet`` into ``output_dir``. Progress is
    printed to stdout.

    Args:
        output_dir: Directory to write the inventory files to. A plain
            string path is accepted as well as a ``Path``.

    Returns:
        Dict with counts: {"divisions_area": int, "natural_earth": int}
    """
    # Coerce so callers passing a str path work too; a Path is left as-is.
    output_dir = Path(output_dir)
    output_dir.mkdir(exist_ok=True, parents=True)

    con = duckdb.connect()
    try:
        con.execute("INSTALL spatial")
        con.execute("LOAD spatial")

        print("Building divisions_area inventory...")
        divisions_df = build_divisions_area_inventory(con)
        divisions_path = output_dir / "divisions_area_inventory.parquet"
        divisions_df.to_parquet(divisions_path, index=False)
        print(f"Saved to {divisions_path}")

        print("\nBuilding natural_earth inventory...")
        natural_earth_df = build_natural_earth_inventory(con)
        natural_earth_path = output_dir / "natural_earth_inventory.parquet"
        natural_earth_df.to_parquet(natural_earth_path, index=False)
        print(f"Saved to {natural_earth_path}")
    finally:
        # Close the connection even when a query or a parquet write fails,
        # so an error does not leak it.
        con.close()

    counts = {
        "divisions_area": len(divisions_df),
        "natural_earth": len(natural_earth_df),
    }
    total = sum(counts.values())
    print("\nInventory build complete")
    print(f"  Total entities: {total}")
    return counts


def main():
    """Run the inventory build with the default output location.

    The output goes to the ``intermediate`` directory located two directory
    levels above this file.
    """
    base_dir = Path(__file__).parent.parent
    build_inventory_to_dir(base_dir / "intermediate")


# Run the inventory build only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()