"""Normalize source GeoParquet files to a shared CRS-neutral geometry encoding. The training pipeline mixes Overture divisions_area and Natural Earth geometry. Across environments these sources can advertise different CRS metadata labels (`EPSG:4326` vs `OGC:CRS84`), which causes DuckDB spatial joins to fail even when coordinates are already compatible lon/lat values. This script rewrites both datasets into normalized copies whose geometry column is rebuilt from WKB. That preserves coordinates while dropping conflicting CRS metadata, so downstream joins behave consistently locally and on Modal. Output layout under data/ by default: overture_normalized/divisions_area/part-000.parquet natural_earth_normalized/ne_geography.parquet """ from pathlib import Path import duckdb from gazet.config import _DATA_DIR def normalize_geodata(output_root: Path | None = None) -> dict[str, str]: """Write normalized copies of both source datasets. Args: output_root: Base directory to write normalized datasets into. Defaults to the project data dir. Returns: Mapping of dataset name to written path/glob. """ root = output_root or _DATA_DIR overture_dir = root / "overture_normalized" / "divisions_area" natural_earth_dir = root / "natural_earth_normalized" overture_dir.mkdir(parents=True, exist_ok=True) natural_earth_dir.mkdir(parents=True, exist_ok=True) overture_path = overture_dir / "part-000.parquet" natural_earth_path = natural_earth_dir / "ne_geography.parquet" con = duckdb.connect() con.execute("INSTALL spatial") con.execute("LOAD spatial") # Rebuild geometry from WKB so conflicting CRS metadata is dropped. con.execute( f""" COPY ( SELECT * REPLACE ( ST_GeomFromWKB(ST_AsWKB(geometry)) AS geometry ) FROM read_parquet('{root / 'overture/divisions_area/*.parquet'}') WHERE geometry IS NOT NULL AND subtype IN ('country', 'region', 'county') AND is_land = true ) TO '{overture_path}' (FORMAT PARQUET) """ ) con.execute( f""" COPY ( SELECT * REPLACE ( ST_GeomFromWKB(ST_AsWKB(geometry)) AS geometry ) FROM read_parquet('{root / 'natural_earth_geoparquet/ne_geography.parquet'}') WHERE geometry IS NOT NULL ) TO '{natural_earth_path}' (FORMAT PARQUET) """ ) con.close() return { "divisions_area": str(overture_dir / "*.parquet"), "natural_earth": str(natural_earth_path), } def main() -> None: result = normalize_geodata() print("Normalized datasets written:") for name, path in result.items(): print(f" {name}: {path}") if __name__ == "__main__": main()