# gazet/dataset/scripts/normalize_geodata.py
# Last commit f9e38fd (srmsoumya): fix: filter only inland geometry for overture
"""Normalize source GeoParquet files to a shared CRS-neutral geometry encoding.
The training pipeline mixes Overture divisions_area and Natural Earth geometry.
Across environments these sources can advertise different CRS metadata labels
(`EPSG:4326` vs `OGC:CRS84`), which causes DuckDB spatial joins to fail even
when coordinates are already compatible lon/lat values.
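This script rewrites both datasets into normalized copies whose geometry column
is rebuilt from WKB. That preserves coordinates while dropping conflicting CRS
metadata, so downstream joins behave consistently locally and on Modal.
Rows with a NULL geometry are dropped from both copies, and the Overture copy
keeps only land features whose subtype is country, region, or county.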
Output layout under data/ by default:
overture_normalized/divisions_area/part-000.parquet
natural_earth_normalized/ne_geography.parquet
"""
from pathlib import Path
import duckdb
from gazet.config import _DATA_DIR
def _sql_string_literal(value: object) -> str:
    """Return *value* as a single-quoted SQL string literal, doubling embedded quotes."""
    return "'" + str(value).replace("'", "''") + "'"


def _copy_with_rebuilt_geometry(con, source, destination, extra_filter: str = "") -> None:
    """Copy non-NULL-geometry rows from *source* to *destination*, rebuilding geometry from WKB."""
    con.execute(
        f"""
        COPY (
            SELECT * REPLACE (
                ST_GeomFromWKB(ST_AsWKB(geometry)) AS geometry
            )
            FROM read_parquet({_sql_string_literal(source)})
            WHERE geometry IS NOT NULL
            {extra_filter}
        ) TO {_sql_string_literal(destination)} (FORMAT PARQUET)
        """
    )


def normalize_geodata(output_root: Path | None = None) -> dict[str, str]:
    """Write normalized copies of both source datasets.

    Each source is read with DuckDB, its ``geometry`` column is round-tripped
    through WKB, and the rows are written to a new Parquet file. Rows whose
    geometry is NULL are dropped. The Overture copy additionally keeps only
    rows with ``is_land = true`` and a subtype of ``country``, ``region`` or
    ``county``.

    Args:
        output_root: Base directory to write normalized datasets into.
            Defaults to the project data dir. The source files are read from
            beneath this same directory
            (``overture/divisions_area/*.parquet`` and
            ``natural_earth_geoparquet/ne_geography.parquet``).

    Returns:
        Mapping of dataset name to written path/glob.
    """
    root = output_root or _DATA_DIR

    overture_dir = root / "overture_normalized" / "divisions_area"
    natural_earth_dir = root / "natural_earth_normalized"
    overture_dir.mkdir(parents=True, exist_ok=True)
    natural_earth_dir.mkdir(parents=True, exist_ok=True)

    overture_path = overture_dir / "part-000.parquet"
    natural_earth_path = natural_earth_dir / "ne_geography.parquet"

    con = duckdb.connect()
    try:
        con.execute("INSTALL spatial")
        con.execute("LOAD spatial")

        # Rebuild geometry from WKB so conflicting CRS metadata is dropped.
        _copy_with_rebuilt_geometry(
            con,
            root / "overture/divisions_area/*.parquet",
            overture_path,
            extra_filter=(
                "AND subtype IN ('country', 'region', 'county') "
                "AND is_land = true"
            ),
        )
        _copy_with_rebuilt_geometry(
            con,
            root / "natural_earth_geoparquet/ne_geography.parquet",
            natural_earth_path,
        )
    finally:
        # Release the connection even when INSTALL/LOAD/COPY raises.
        con.close()

    return {
        "divisions_area": str(overture_dir / "*.parquet"),
        "natural_earth": str(natural_earth_path),
    }
def main() -> None:
    """Normalize the datasets with default settings and print where each one was written."""
    written = normalize_geodata()
    # Assemble the whole report first, then emit it in one call.
    report = ["Normalized datasets written:"]
    report.extend(
        f" {label}: {location}" for label, location in written.items()
    )
    print("\n".join(report))
# Only run the normalization when executed as a script, not when imported.
if __name__ == "__main__":
    main()