"""
Fetch ERA5-Land daily data for the primary-city zones via the CDS API.

ERA5-Land: 9km resolution (0.1°) — 6x finer than NASA POWER (0.5°/55km).
At 9km, neighborhood-scale zones typically resolve to 2-3 distinct grid cells.

Fetches one month at a time to stay within CDS cost limits.
Caches monthly NetCDF files in data/era5land_cache/.

Usage:
    python3 scripts/fetch_era5land.py                    # uses config.PRIMARY_CITY
    python3 scripts/fetch_era5land.py --city "Kampala"   # different city

Output: data/era5land_{city_slug}.json
"""

import argparse
import json
import math
import sys
import time
from datetime import datetime, timedelta
from pathlib import Path

import numpy as np

# Make project root importable so config.py resolves
sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
from config import ZONES as CONFIG_ZONES, PRIMARY_CITY, CITIES, slug_for

# A real monthly NetCDF file is far larger than this; anything at or below the
# threshold is treated as a failed/empty download and ignored.
MIN_CACHE_BYTES = 1000

SECONDS_PER_DAY = 86400

# Short NetCDF variable names read from every monthly file.
_NC_FIELDS = ("t2m", "d2m", "u10", "v10", "ssrd", "tp")


def _month_cache_path(cache_dir, cache_prefix, year, month):
    """Return the cache file path for one (year, month)."""
    return cache_dir / f"{cache_prefix}_{year}_{month:02d}.nc"


def _is_cached(path):
    """Return True when *path* exists and is large enough to be a real download."""
    return path.exists() and path.stat().st_size > MIN_CACHE_BYTES


def bbox_for_zones(zones, margin: float = 0.05):
    """Compute (N, S, W, E) bounding box from zone lat/lon with a small margin.

    Default margin ≈0.5 ERA5-Land grid cells (9km), ensuring every zone has
    grid neighbors for nearest-neighbor interpolation.

    Args:
        zones: Iterable of objects exposing ``latitude`` and ``longitude``.
        margin: Padding, in degrees, added on every side.

    Returns:
        Tuple ``(north, south, west, east)`` in degrees.

    Raises:
        ValueError: If ``zones`` is empty.
    """
    # Materialize once so one-shot iterables (generators) work too.
    zones = list(zones)
    if not zones:
        raise ValueError("bbox_for_zones() requires at least one zone")
    lats = [z.latitude for z in zones]
    lons = [z.longitude for z in zones]
    north = max(lats) + margin
    south = min(lats) - margin
    west = min(lons) - margin
    east = max(lons) + margin
    return north, south, west, east


def fetch_month(client, year, month, area, cache_prefix, cache_dir):
    """Fetch one month of ERA5-Land data. Returns path or None.

    The download goes to a ``.part`` file that is renamed into place only after
    ``client.retrieve`` returns, so an interrupted run (Ctrl-C, crash) can never
    leave a truncated file that later passes the cache check.

    Args:
        client: Object with a cdsapi-style ``retrieve(name, request, target)``.
        year: Calendar year to fetch.
        month: Calendar month (1-12) to fetch.
        area: Bounding box in CDS order ``(N, W, S, E)``.
        cache_prefix: File name prefix for cached NetCDF files.
        cache_dir: Directory (``Path``) holding the cache.

    Returns:
        Path of the cached NetCDF file, or ``None`` if the request failed.
    """
    path = _month_cache_path(cache_dir, cache_prefix, year, month)
    if _is_cached(path):
        return path

    print(f" {year}-{month:02d}: requesting...", end="", flush=True)
    partial = path.with_name(path.name + ".part")
    request = {
        "variable": [
            "2m_temperature",
            "2m_dewpoint_temperature",
            "10m_u_component_of_wind",
            "10m_v_component_of_wind",
            "surface_solar_radiation_downwards",
            "total_precipitation",
        ],
        "year": str(year),
        "month": f"{month:02d}",
        "day": [f"{d:02d}" for d in range(1, 32)],
        "time": ["06:00", "12:00", "18:00"],
        "area": list(area),  # [N, W, S, E]
        "data_format": "netcdf",
    }
    try:
        client.retrieve("reanalysis-era5-land", request, str(partial))
        partial.replace(path)
    except Exception as e:
        # Best effort: report, and let the caller record the month as failed.
        print(f" FAILED: {e}")
        return None
    finally:
        # Also runs on KeyboardInterrupt, so no partial download is left behind.
        if partial.exists():
            partial.unlink()

    size_kb = path.stat().st_size / 1024
    print(f" ok ({size_kb:.0f} KB)")
    return path


def _as_float_array(values):
    """Return *values* as a plain float ndarray with masked entries set to NaN."""
    return np.ma.asarray(values).astype(float).filled(np.nan)


def extract_all_zones(nc_path, zones):
    """Extract daily records for all zones from a single NetCDF file.

    Opens the file once and reads every variable in full; per-zone records are
    then computed from in-memory slices. For a 15-zone month this is 15x
    cheaper than opening once per zone.

    Each zone is mapped to the nearest grid cell that actually holds data.
    ERA5-Land is masked over sea, so for coastal zones the geometrically
    nearest cell may be entirely missing; such cells are skipped. If no cell
    has data, the plain nearest cell is used.

    Args:
        nc_path: Path to a monthly NetCDF file whose variables are laid out as
            (time, latitude, longitude).
        zones: Iterable of objects with ``zone_id``, ``latitude``, ``longitude``.

    Returns:
        Dict mapping ``zone_id`` to the list of daily records produced by
        ``_daily_records``.
    """
    import netCDF4 as nc

    # The context manager closes the dataset even if a read raises.
    with nc.Dataset(str(nc_path), "r") as ds:
        lats = _as_float_array(ds.variables["latitude"][:])
        lons = _as_float_array(ds.variables["longitude"][:])
        time_name = "valid_time" if "valid_time" in ds.variables else "time"
        time_var = ds.variables[time_name]
        times = nc.num2date(
            time_var[:],
            time_var.units,
            getattr(time_var, "calendar", "standard"),
        )
        fields = {name: _as_float_array(ds.variables[name][:]) for name in _NC_FIELDS}

    # A cell "has data" if temperature is finite for at least one time step.
    has_data = np.isfinite(fields["t2m"]).any(axis=0)
    any_data = bool(has_data.any())

    result = {}
    for zone in zones:
        dist_sq = (lats[:, None] - zone.latitude) ** 2 + (lons[None, :] - zone.longitude) ** 2
        if any_data:
            dist_sq = np.where(has_data, dist_sq, np.inf)
        lat_idx, lon_idx = np.unravel_index(np.argmin(dist_sq), dist_sq.shape)
        result[zone.zone_id] = _daily_records(
            times,
            *(fields[name][:, lat_idx, lon_idx] for name in _NC_FIELDS),
        )
    return result


def _coerce_time(stamp):
    """Return a datetime-like object exposing ``strftime`` and ``hour``.

    Timestamps that are already datetime-like are passed through; anything
    else is parsed from the ISO prefix of its string form.
    """
    if hasattr(stamp, "strftime") and hasattr(stamp, "hour"):
        return stamp
    return datetime.fromisoformat(str(stamp)[:19])


def _daily_records(times, t2m, d2m, u10, v10, ssrd, tp):
    """Aggregate sub-daily samples into per-day summary records for one grid point.

    Temperature, dewpoint and wind are instantaneous and are summarized over
    the samples of each calendar day.

    ``ssrd`` (J/m²) and ``tp`` (m) are handled as ERA5-Land accumulations, i.e.
    running totals since 00 UTC of the day (see the ECMWF ERA5-Land data
    documentation). They are therefore NOT summed or averaged per sample; the
    sample with the longest elapsed time since 00 UTC is taken as the total:

    * ``precip_mm``     = that total, converted to mm.
    * ``solar_rad_wm2`` = that total divided by the elapsed seconds, i.e. the
      mean flux from 00 UTC up to the last sample of the day.

    A sample stamped exactly 00:00 carries the full 24 h total of the previous
    day and is attributed to that day. Samples with any non-finite value
    (masked/missing data) are skipped.

    Args:
        times: Sequence of datetime-like timestamps, one per sample.
        t2m: 2 m temperature samples in kelvin.
        d2m: 2 m dewpoint samples in kelvin.
        u10: 10 m eastward wind samples in m/s.
        v10: 10 m northward wind samples in m/s.
        ssrd: Accumulated surface solar radiation downwards in J/m².
        tp: Accumulated total precipitation in metres.

    Returns:
        List of dicts sorted by date. ``solar_rad_wm2`` and ``precip_mm`` are
        ``None`` for a day that has no usable accumulation sample.
    """
    instantaneous = {}  # day -> {"temps": [...], "dews": [...], "winds": [...]}
    accumulated = {}  # day -> (elapsed_seconds, ssrd_joules, tp_metres)

    for i, raw_time in enumerate(times):
        values = [float(series[i]) for series in (t2m, d2m, u10, v10, ssrd, tp)]
        if not all(math.isfinite(v) for v in values):
            continue
        temp_k, dew_k, u, v, ssrd_j, tp_m = values

        stamp = _coerce_time(raw_time)
        day_str = stamp.strftime("%Y-%m-%d")

        bucket = instantaneous.setdefault(day_str, {"temps": [], "dews": [], "winds": []})
        bucket["temps"].append(temp_k - 273.15)
        bucket["dews"].append(dew_k - 273.15)
        bucket["winds"].append(math.sqrt(u**2 + v**2))

        elapsed = stamp.hour * 3600 + stamp.minute * 60 + stamp.second
        if elapsed == 0:
            # 00:00 closes the previous day's 24 h accumulation window.
            acc_day = (stamp - timedelta(seconds=1)).strftime("%Y-%m-%d")
            elapsed = SECONDS_PER_DAY
        else:
            acc_day = day_str
        latest = accumulated.get(acc_day)
        if latest is None or elapsed > latest[0]:
            accumulated[acc_day] = (elapsed, ssrd_j, tp_m)

    records = []
    for day_str in sorted(instantaneous):
        day = instantaneous[day_str]
        temps_c = day["temps"]
        dewpoints_c = day["dews"]
        wind_speeds = day["winds"]

        temp_mean = sum(temps_c) / len(temps_c)
        mean_dew = sum(dewpoints_c) / len(dewpoints_c)
        # Relative humidity from the Magnus approximation (vapor pressure in hPa).
        es = 6.112 * math.exp(17.67 * temp_mean / (temp_mean + 243.5))
        ea = 6.112 * math.exp(17.67 * mean_dew / (mean_dew + 243.5))
        humidity = min(100.0, max(0.0, (ea / es) * 100.0)) if es > 0 else 50.0

        window = accumulated.get(day_str)
        if window is None:
            solar_wm2 = None
            precip_mm = None
        else:
            elapsed, ssrd_j, tp_m = window
            # Clamp tiny negatives that come from NetCDF packing noise.
            solar_wm2 = round(max(0.0, ssrd_j / elapsed), 1)
            precip_mm = round(max(0.0, tp_m * 1000.0), 2)

        records.append({
            "date": day_str,
            "temp_max_c": round(max(temps_c), 2),
            "temp_min_c": round(min(temps_c), 2),
            "temp_mean_c": round(temp_mean, 2),
            "humidity_pct": round(humidity, 1),
            "wind_speed_ms": round(sum(wind_speeds) / len(wind_speeds), 2),
            "solar_rad_wm2": solar_wm2,
            "precip_mm": precip_mm,
        })
    return records


def _parse_args(default_city):
    """Parse and validate command-line arguments."""
    parser = argparse.ArgumentParser(
        description="Fetch ERA5-Land data for the configured primary city's zones."
    )
    parser.add_argument(
        "--city",
        default=default_city,
        help=f"City to fetch (default: {default_city} from config.PRIMARY_CITY)",
    )
    parser.add_argument("--start-year", type=int, default=2005, help="First year to fetch")
    parser.add_argument("--end-year", type=int, default=2024, help="Last year to fetch")
    args = parser.parse_args()
    if args.start_year > args.end_year:
        parser.error("--start-year must not be later than --end-year")
    return args


def _print_spatial_check(all_data, city_zones):
    """Print per-zone summaries and compare the first two zones' daily maxima."""
    print("\n=== Spatial differentiation check ===")
    zone_name = {z.zone_id: z.name for z in city_zones}
    for zid, records in all_data.items():
        if not records:
            print(f" {zid}: NO DATA")
            continue
        temps = [r["temp_max_c"] for r in records]
        humids = [r["humidity_pct"] for r in records]
        print(
            f" {zid} ({zone_name[zid]:12s}): {len(records)} days, "
            f"temp_max mean={sum(temps)/len(temps):.2f}°C, "
            f"humidity mean={sum(humids)/len(humids):.1f}%"
        )

    # Check if grid points differ
    if len(all_data) < 2:
        return
    z1, z2 = list(all_data)[:2]
    r1, r2 = all_data[z1], all_data[z2]
    if r1 and r2 and len(r1) == len(r2):
        diffs = [abs(a["temp_max_c"] - b["temp_max_c"]) for a, b in zip(r1, r2)]
        mean_diff = sum(diffs) / len(diffs)
        print(f"\n Mean temp difference {z1} vs {z2}: {mean_diff:.3f}°C")
        if mean_diff > 0.01:
            print(" ERA5-Land is resolving different grid cells!")
        else:
            print(" Same grid cell (zones too close for 9km resolution)")


def main():
    """Download missing months, extract per-zone daily records, write the JSON file."""
    default_city = PRIMARY_CITY or (CITIES[0] if CITIES else "Dar es Salaam")
    args = _parse_args(default_city)

    city = args.city
    slug = slug_for(city)

    # Filter zones for the target city
    city_zones = [z for z in CONFIG_ZONES if z.city == city]
    if not city_zones:
        print(f"ERROR: No zones found for city '{city}'")
        print(f" Available cities: {sorted(set(z.city for z in CONFIG_ZONES))}")
        sys.exit(1)

    data_dir = Path(__file__).resolve().parents[1] / "data"
    cache_dir = data_dir / "era5land_cache"
    output_file = data_dir / f"era5land_{slug}.json"
    cache_prefix = f"era5land_{slug}"
    data_dir.mkdir(parents=True, exist_ok=True)
    cache_dir.mkdir(exist_ok=True)

    north, south, west, east = bbox_for_zones(city_zones)
    # CDS API "area" is [N, W, S, E]
    area = (north, west, south, east)

    years = range(args.start_year, args.end_year + 1)
    total_months = len(years) * 12
    # Count only cache files that belong to the requested period.
    cached = sum(
        1
        for year in years
        for month in range(1, 13)
        if _is_cached(_month_cache_path(cache_dir, cache_prefix, year, month))
    )

    print(f"ERA5-Land fetch for {city}")
    print(f" Zones: {len(city_zones)} ({', '.join(z.name for z in city_zones)})")
    print(f" Period: {args.start_year}-{args.end_year} ({total_months} months)")
    print(f" Cached: {cached} months")
    print(f" Bounding box: N={north:.2f}, S={south:.2f}, W={west:.2f}, E={east:.2f}")
    print(f" Output: {output_file}")
    print()

    # Fetch all months
    client = None
    failed = []
    for year in years:
        print(f"{year}:")
        for month in range(1, 13):
            if _is_cached(_month_cache_path(cache_dir, cache_prefix, year, month)):
                continue  # nothing to request, so no rate-limit pause either
            if client is None:
                # Created lazily so a fully cached re-run needs neither the
                # cdsapi package nor CDS credentials.
                import cdsapi
                client = cdsapi.Client()
            result = fetch_month(client, year, month, area, cache_prefix, cache_dir)
            if result is None:
                failed.append(f"{year}-{month:02d}")
            time.sleep(1)  # rate limiting

    if failed:
        print(f"\nWARNING: {len(failed)} months failed: {failed[:10]}")

    # Extract per-zone data
    print("\nExtracting per-zone daily data...")
    all_data = {z.zone_id: [] for z in city_zones}
    for year in years:
        for month in range(1, 13):
            nc_path = _month_cache_path(cache_dir, cache_prefix, year, month)
            if not _is_cached(nc_path):
                continue
            per_zone = extract_all_zones(nc_path, city_zones)
            for zid, records in per_zone.items():
                all_data[zid].extend(records)

    # Save
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(all_data, f)
    size_mb = output_file.stat().st_size / 1e6
    print(f"\nSaved to {output_file} ({size_mb:.1f} MB)")

    # Check if zones got different data (the whole point of ERA5-Land)
    _print_spatial_check(all_data, city_zones)


if __name__ == "__main__":
    main()