# ClimAI / build_dataset.py — uploaded via huggingface_hub (rev 1bacc8b, verified)
"""
build_dataset.py - ClimAI Historical Dataset Builder
Run ONCE to build data/ folder. Re-run monthly to refresh.
Output: data/weather_history.json, data/earthquake_history.json,
data/aqi_history.json, data/dataset_meta.json
"""
import json
import os
from collections import defaultdict
from datetime import datetime, timedelta, timezone

import requests
# Chennai, India coordinates — used for all location-based API queries below.
LAT, LON = 13.0827, 80.2707
# Output directory for every generated JSON dataset file.
DATA_DIR = "data"
os.makedirs(DATA_DIR, exist_ok=True)
def log(msg):
    """Print *msg* to stdout prefixed with the current wall-clock time (HH:MM:SS)."""
    timestamp = datetime.now().strftime("%H:%M:%S")
    print(f"[{timestamp}] {msg}")
# ── 1. WEATHER HISTORY ──────────────────────────────────────────────────────
def fetch_weather_history():
    """Fetch ~5 years of daily weather for Chennai and save summaries.

    Downloads daily max/min temperature, precipitation, wind and rain from
    the Open-Meteo archive API, derives monthly and yearly summaries, and
    writes everything to data/weather_history.json.

    Returns:
        The dict that was written to disk (daily records plus monthly and
        yearly summaries).

    Raises:
        requests.HTTPError: if the archive API responds with an error status.
    """
    log("Fetching 5-year weather history from Open-Meteo Archive...")
    # The archive lags real time, so stop a week short of today.
    end = datetime.now() - timedelta(days=7)
    start = end - timedelta(days=365 * 5)
    r = requests.get("https://archive-api.open-meteo.com/v1/archive", params={
        "latitude": LAT, "longitude": LON,
        "start_date": start.strftime("%Y-%m-%d"),
        "end_date": end.strftime("%Y-%m-%d"),
        "daily": "temperature_2m_max,temperature_2m_min,precipitation_sum,wind_speed_10m_max,rain_sum",
        "timezone": "Asia/Kolkata",
    }, timeout=60)
    r.raise_for_status()
    daily = r.json().get("daily", {})
    times = daily.get("time", [])

    def col(name, i):
        # Safe positional lookup: the API normally returns aligned arrays,
        # but guard against a short or missing series (returns None then).
        values = daily.get(name, [])
        return values[i] if i < len(values) else None

    records = [{"date": d,
                "temp_max": col("temperature_2m_max", i),
                "temp_min": col("temperature_2m_min", i),
                "precip": col("precipitation_sum", i),
                "wind": col("wind_speed_10m_max", i),
                "rain": col("rain_sum", i)}
               for i, d in enumerate(times)]

    def avg(lst):
        return round(sum(lst) / len(lst), 1) if lst else None

    # Monthly summaries keyed by "YYYY-MM".
    monthly = defaultdict(lambda: {"temps_max": [], "temps_min": [], "precip": [], "wind": []})
    for rec in records:
        month_key = rec["date"][:7]
        for field, bucket in (("temp_max", "temps_max"), ("temp_min", "temps_min"),
                              ("precip", "precip"), ("wind", "wind")):
            if rec[field] is not None:
                monthly[month_key][bucket].append(rec[field])
    monthly_summary = [{"month": m,
                        "avg_temp_max": avg(v["temps_max"]),
                        "avg_temp_min": avg(v["temps_min"]),
                        "total_precip": round(sum(v["precip"]), 1) if v["precip"] else 0,
                        "avg_wind": avg(v["wind"])}
                       for m, v in sorted(monthly.items())]

    # Yearly summaries keyed by "YYYY".
    yearly = defaultdict(lambda: {"temps_max": [], "precip": []})
    for rec in records:
        yr = rec["date"][:4]
        if rec["temp_max"] is not None:
            yearly[yr]["temps_max"].append(rec["temp_max"])
        if rec["precip"] is not None:
            yearly[yr]["precip"].append(rec["precip"])
    yearly_summary = {yr: {"avg_temp_max": avg(v["temps_max"]),
                           "total_precip_mm": round(sum(v["precip"]), 1) if v["precip"] else 0}
                      for yr, v in sorted(yearly.items())}

    result = {
        "location": "Chennai, India", "lat": LAT, "lon": LON,
        "period_start": start.strftime("%Y-%m-%d"),
        "period_end": end.strftime("%Y-%m-%d"),
        "total_days": len(records),
        "daily": records,
        "monthly": monthly_summary,
        "yearly": yearly_summary,
    }
    path = os.path.join(DATA_DIR, "weather_history.json")
    with open(path, "w", encoding="utf-8") as f:
        json.dump(result, f)
    log(f"Saved {len(records)} days -> {path}")
    return result
# ── 2. EARTHQUAKE HISTORY ───────────────────────────────────────────────────
def fetch_earthquake_history():
    """Fetch 5 years of M4.0+ earthquakes around South India / Bay of Bengal.

    Queries the USGS FDSN event API for a bounding box roughly 1000 km
    around Chennai and writes the flattened event list to
    data/earthquake_history.json.

    Returns:
        The dict that was written to disk.

    Raises:
        requests.HTTPError: if the USGS API responds with an error status.
    """
    log("Fetching earthquake history from USGS...")
    end = datetime.now()
    start = end - timedelta(days=365 * 5)
    r = requests.get("https://earthquake.usgs.gov/fdsnws/event/1/query", params={
        "format": "geojson",
        "starttime": start.strftime("%Y-%m-%d"),
        "endtime": end.strftime("%Y-%m-%d"),
        # Bounding box covering South India and the Bay of Bengal.
        "minlatitude": 3.0, "maxlatitude": 23.0,
        "minlongitude": 70.0, "maxlongitude": 90.0,
        "minmagnitude": 4.0,
        "orderby": "time", "limit": 1000,
    }, timeout=60)
    r.raise_for_status()
    events = []
    for feat in r.json().get("features", []):
        props = feat.get("properties", {})
        coords = feat.get("geometry", {}).get("coordinates", [None, None, None])
        # USGS event times are epoch milliseconds (UTC).
        # datetime.utcfromtimestamp is deprecated since Python 3.12; use an
        # explicitly UTC-aware conversion producing the same "...Z" string.
        ts_ms = props.get("time")
        when = (datetime.fromtimestamp(ts_ms / 1000, tz=timezone.utc)
                .strftime("%Y-%m-%dT%H:%M:%SZ") if ts_ms else None)
        events.append({
            "id": feat.get("id"),
            "time": when,
            "magnitude": props.get("mag"),
            "place": props.get("place"),
            "longitude": coords[0], "latitude": coords[1], "depth_km": coords[2],
        })
    result = {"region": "Bay of Bengal / South India",
              "period_start": start.strftime("%Y-%m-%d"),
              "period_end": end.strftime("%Y-%m-%d"),
              "total_events": len(events), "events": events}
    path = os.path.join(DATA_DIR, "earthquake_history.json")
    with open(path, "w", encoding="utf-8") as f:
        json.dump(result, f)
    log(f"Saved {len(events)} earthquake events -> {path}")
    return result
# ── 3. AQI HISTORY ──────────────────────────────────────────────────────────
def fetch_aqi_history():
    """Fetch one year of hourly air-quality data and save daily averages.

    Downloads PM2.5/PM10/NO2/O3 and the European AQI from the Open-Meteo
    air-quality API, aggregates hourly values into per-day averages, and
    writes the result to data/aqi_history.json. On any failure an error
    stub is saved instead, so downstream consumers always find a file.

    Returns:
        The dict that was written to disk (either real data or the error stub).
    """
    log("Fetching AQI history from Open-Meteo Air Quality...")
    # The AQI feed lags slightly behind real time; stop two days short.
    end = datetime.now() - timedelta(days=2)
    start = end - timedelta(days=365)
    try:
        r = requests.get("https://air-quality-api.open-meteo.com/v1/air-quality", params={
            "latitude": LAT, "longitude": LON,
            "hourly": "pm10,pm2_5,nitrogen_dioxide,ozone,european_aqi",
            "start_date": start.strftime("%Y-%m-%d"),
            "end_date": end.strftime("%Y-%m-%d"),
            "timezone": "Asia/Kolkata",
        }, timeout=60)
        r.raise_for_status()
        hourly = r.json().get("hourly", {})
        times = hourly.get("time", [])
        # Map each short output key to its hourly series from the API.
        series = {"pm25": hourly.get("pm2_5", []),
                  "pm10": hourly.get("pm10", []),
                  "aqi": hourly.get("european_aqi", []),
                  "no2": hourly.get("nitrogen_dioxide", []),
                  "o3": hourly.get("ozone", [])}
        daily = defaultdict(lambda: {key: [] for key in series})
        for i, ts in enumerate(times):
            day = ts[:10]  # "YYYY-MM-DD" prefix of the hourly timestamp
            for key, values in series.items():
                # Guard against short series and null readings.
                if i < len(values) and values[i] is not None:
                    daily[day][key].append(values[i])

        def avg(lst):
            return round(sum(lst) / len(lst), 1) if lst else None

        daily_records = [{"date": day,
                          "avg_pm25": avg(v["pm25"]), "avg_pm10": avg(v["pm10"]),
                          "avg_aqi": avg(v["aqi"]), "avg_no2": avg(v["no2"]),
                          "avg_o3": avg(v["o3"])}
                         for day, v in sorted(daily.items())]
        result = {"location": "Chennai, India",
                  "period_start": start.strftime("%Y-%m-%d"),
                  "period_end": end.strftime("%Y-%m-%d"),
                  "total_days": len(daily_records), "daily": daily_records}
    except Exception as e:
        # Deliberate best-effort: AQI data is optional, so record the
        # failure in the output file instead of aborting the whole build.
        log(f"AQI fetch failed: {e}")
        result = {"error": str(e), "daily": [], "total_days": 0}
    path = os.path.join(DATA_DIR, "aqi_history.json")
    with open(path, "w", encoding="utf-8") as f:
        json.dump(result, f)
    log(f"Saved AQI history ({result.get('total_days',0)} days) -> {path}")
    return result
# ── 4. DATASET META ──────────────────────────────────────────────────────────
def build_meta(weather, earthquakes, aqi):
    """Derive summary statistics and static climate context from the datasets.

    Args:
        weather: dict returned by fetch_weather_history().
        earthquakes: dict returned by fetch_earthquake_history().
        aqi: dict returned by fetch_aqi_history().

    Writes the combined metadata to data/dataset_meta.json.

    Returns:
        The metadata dict that was written to disk.
    """
    log("Building dataset metadata + statistics summary...")

    def avg(lst):
        return round(sum(lst) / len(lst), 1) if lst else None

    def column(rows, field):
        # Extract the non-null values of one field from a list of record dicts.
        return [row[field] for row in rows if row.get(field) is not None]

    daily_temps = column(weather.get("daily", []), "temp_max")
    daily_precip = column(weather.get("daily", []), "precip")
    daily_wind = column(weather.get("daily", []), "wind")
    eq_mags = column(earthquakes.get("events", []), "magnitude")
    aqi_vals = column(aqi.get("daily", []), "avg_aqi")

    # Total precipitation per calendar month (1-12), summed across all years.
    monthly_precip = defaultdict(float)
    for d in weather.get("daily", []):
        if d.get("precip") is not None:
            monthly_precip[int(d["date"][5:7])] += d["precip"]

    meta = {
        "built_at": datetime.now().isoformat(),
        "location": "Chennai, India (13.08N, 80.27E)",
        "data_sources": ["Open-Meteo Archive", "USGS Earthquake API", "Open-Meteo AQI"],
        "weather": {
            "period": f"{weather.get('period_start')} to {weather.get('period_end')}",
            "total_days": len(daily_temps),
            "avg_temp_max_c": avg(daily_temps),
            "max_temp_ever_c": max(daily_temps) if daily_temps else None,
            "min_temp_ever_c": min(daily_temps) if daily_temps else None,
            "avg_daily_precip_mm": round(sum(daily_precip)/len(daily_precip),2) if daily_precip else None,
            "max_single_day_rain": max(daily_precip) if daily_precip else None,
            "avg_wind_kmh": avg(daily_wind),
            "max_wind_kmh": max(daily_wind) if daily_wind else None,
            "monthly_total_precip_mm": {str(k): round(v,1) for k,v in sorted(monthly_precip.items())},
            "yearly_summary": weather.get("yearly", {}),
            "peak_monsoon_months": "June-November (Northeast monsoon dominant Oct-Dec)",
            "heatwave_season": "March-June (temps frequently exceed 38C)",
            "cyclone_peak_season": "October-December (Bay of Bengal cyclogenesis)",
        },
        "earthquakes": {
            "total_events_5yr": earthquakes.get("total_events", 0),
            "avg_magnitude": round(sum(eq_mags)/len(eq_mags),2) if eq_mags else None,
            "max_magnitude": max(eq_mags) if eq_mags else None,
            "region": "Bay of Bengal / South India (~1000km radius from Chennai)",
            "risk_note": "Chennai on stable Deccan shield. Major risk from Bay of Bengal subduction zone.",
        },
        "aqi": {
            "total_days": aqi.get("total_days", 0),
            "avg_aqi": avg(aqi_vals),
            "max_aqi": max(aqi_vals) if aqi_vals else None,
            "context": "Typically moderate (50-100). Winter spikes (Nov-Jan) from crop burning and low wind.",
        },
        "notable_events": [
            {"event": "Cyclone Michaung", "date": "December 2023", "impact": "Category 1 landfall, severe Chennai flooding, 2500mm rain in 48hr"},
            {"event": "2015 Chennai Floods", "date": "November-December 2015", "impact": "Worst flooding in 100 years, 500+ deaths, $3B damage"},
            {"event": "2004 Indian Ocean Tsunami", "date": "December 26 2004", "impact": "Mag 9.1 Sumatra earthquake, Chennai coastline devastated"},
            {"event": "Cyclone Vardah", "date": "December 2016", "impact": "Direct hit Chennai, 100+ km/h winds, widespread damage"},
            {"event": "Cyclone Nivar", "date": "November 2020", "impact": "Very severe cyclone, landfall near Puducherry, heavy Chennai rain"},
        ],
        "climate_patterns": {
            "type": "Tropical wet and dry (Koppen Aw)",
            "annual_rainfall_mm": "~1400mm avg, 60% from Northeast monsoon Oct-Dec",
            "sea_surface_temp": "Bay of Bengal 28-30C peak Jun-Oct (cyclone fuel)",
            "urban_heat_island": "UHI adds +1.5 to 2.5C vs rural",
            "warming_trend": "Max temps rising ~0.3C per decade since 1980",
            "flood_risk_zones": "Adyar, Cooum, Buckingham Canal flood plains highest risk",
        },
    }
    path = os.path.join(DATA_DIR, "dataset_meta.json")
    with open(path, "w", encoding="utf-8") as f:
        json.dump(meta, f, indent=2)
    log(f"Saved dataset metadata -> {path}")
    return meta
# ── MAIN ────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Entry point: build each dataset in turn, then the combined metadata.
    log("=== ClimAI Dataset Builder ===")
    weather_data = fetch_weather_history()
    quake_data = fetch_earthquake_history()
    aqi_data = fetch_aqi_history()
    meta = build_meta(weather_data, quake_data, aqi_data)
    log("=== Build complete ===")
    log(f"Weather: {weather_data.get('total_days',0)} days | EQ: {quake_data.get('total_events',0)} events | AQI: {aqi_data.get('total_days',0)} days")
    log(f"Avg max temp: {meta['weather'].get('avg_temp_max_c')}C | Max rain: {meta['weather'].get('max_single_day_rain')}mm")