File size: 2,866 Bytes
dfb9466
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3ba9557
f9e38fd
dfb9466
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
"""Normalize source GeoParquet files to a shared CRS-neutral geometry encoding.

The training pipeline mixes Overture divisions_area and Natural Earth geometry.
Across environments these sources can advertise different CRS metadata labels
(`EPSG:4326` vs `OGC:CRS84`), which causes DuckDB spatial joins to fail even
when coordinates are already compatible lon/lat values.

This script rewrites both datasets into normalized copies whose geometry column
is rebuilt from WKB. That preserves coordinates while dropping conflicting CRS
metadata, so downstream joins behave consistently locally and on Modal.

Output layout under data/ by default:
    overture_normalized/divisions_area/part-000.parquet
    natural_earth_normalized/ne_geography.parquet
"""

from pathlib import Path

import duckdb

from gazet.config import _DATA_DIR


def normalize_geodata(output_root: Path | None = None) -> dict[str, str]:
    """Write normalized copies of both source datasets.

    Args:
        output_root: Base directory to write normalized datasets into.
            Defaults to the project data dir.

    Returns:
        Mapping of dataset name to written path/glob.
    """
    root = output_root or _DATA_DIR
    overture_dir = root / "overture_normalized" / "divisions_area"
    natural_earth_dir = root / "natural_earth_normalized"
    overture_dir.mkdir(parents=True, exist_ok=True)
    natural_earth_dir.mkdir(parents=True, exist_ok=True)

    overture_path = overture_dir / "part-000.parquet"
    natural_earth_path = natural_earth_dir / "ne_geography.parquet"

    con = duckdb.connect()
    con.execute("INSTALL spatial")
    con.execute("LOAD spatial")

    # Rebuild geometry from WKB so conflicting CRS metadata is dropped.
    con.execute(
        f"""
        COPY (
            SELECT * REPLACE (
                ST_GeomFromWKB(ST_AsWKB(geometry)) AS geometry
            )
            FROM read_parquet('{root / 'overture/divisions_area/*.parquet'}')
            WHERE geometry IS NOT NULL
              AND subtype IN ('country', 'region', 'county')
              AND is_land = true
        ) TO '{overture_path}' (FORMAT PARQUET)
        """
    )

    con.execute(
        f"""
        COPY (
            SELECT * REPLACE (
                ST_GeomFromWKB(ST_AsWKB(geometry)) AS geometry
            )
            FROM read_parquet('{root / 'natural_earth_geoparquet/ne_geography.parquet'}')
            WHERE geometry IS NOT NULL
        ) TO '{natural_earth_path}' (FORMAT PARQUET)
        """
    )
    con.close()

    return {
        "divisions_area": str(overture_dir / "*.parquet"),
        "natural_earth": str(natural_earth_path),
    }


def main() -> None:
    result = normalize_geodata()
    print("Normalized datasets written:")
    for name, path in result.items():
        print(f"  {name}: {path}")


if __name__ == "__main__":
    main()