cascade_risk / src /data /emdat_parser.py
Lucasoppem's picture
Sync from GitHub main (part 2)
36f9d47 verified
Raw
History Blame Contribute Delete
3.55 kB
"""Parse EMDAT Excel data and extract European flood events."""
from __future__ import annotations
import json
from datetime import date
from pathlib import Path
import pandas as pd
from src.llm.client import load_config
from src.models.schemas import FloodEvent
def parse_emdat(config: dict | None = None) -> list[FloodEvent]:
    """Load the EM-DAT workbook and convert the matching rows to FloodEvent objects.

    Rows are kept when their "Disaster Type" and "Region" columns equal the
    configured values and their "Start Year" is one of the configured train
    or test years.

    Args:
        config: Settings mapping. Reads ``config["paths"]["emdat_file"]`` and,
            from ``config["data"]``, the keys ``disaster_type``, ``region``,
            ``train_years`` and ``test_years``. When ``None``, the mapping is
            obtained from ``load_config()``.

    Returns:
        One FloodEvent per matching row, in the order the rows appear in the
        workbook.
    """
    cfg = load_config() if config is None else config
    data_settings = cfg["data"]
    path_settings = cfg["paths"]

    table = pd.read_excel(path_settings["emdat_file"], engine="openpyxl")

    # Row selection: disaster type, region, and start year must all match.
    target_years = data_settings["train_years"] + data_settings["test_years"]
    type_matches = table["Disaster Type"] == data_settings["disaster_type"]
    region_matches = table["Region"] == data_settings["region"]
    year_matches = table["Start Year"].isin(target_years)
    selected = table[type_matches & region_matches & year_matches].copy()

    def to_event(record) -> FloodEvent:
        # Dates are assembled first, then the remaining columns are coerced.
        began = _build_date(
            record["Start Year"], record["Start Month"], record["Start Day"]
        )
        ended = _build_date(
            record["End Year"], record["End Month"], record["End Day"]
        )
        return FloodEvent(
            event_id=str(record["DisNo."]),
            country=str(record["Country"]),
            iso=str(record["ISO"]),
            region=str(record["Region"]),
            location=_safe_str(record.get("Location")),
            latitude=_safe_float(record.get("Latitude")),
            longitude=_safe_float(record.get("Longitude")),
            start_date=began,
            end_date=ended,
            disaster_subtype=_safe_str(record.get("Disaster Subtype")),
            origin=_safe_str(record.get("Origin")),
            magnitude=_safe_float(record.get("Magnitude")),
            magnitude_scale=_safe_str(record.get("Magnitude Scale")),
            total_deaths=_safe_int(record.get("Total Deaths")),
            total_affected=_safe_int(record.get("Total Affected")),
            total_damage_k_usd=_safe_float(record.get("Total Damage ('000 US$)")),
        )

    return [to_event(record) for _, record in selected.iterrows()]
def split_events(
    events: list[FloodEvent], config: dict | None = None
) -> tuple[list[FloodEvent], list[FloodEvent]]:
    """Partition events into (train, test) lists by the year of ``start_date``.

    Args:
        events: Events to partition; each must expose ``start_date.year``.
        config: Settings mapping providing ``config["data"]["train_years"]``
            and ``config["data"]["test_years"]``. When ``None``, the mapping
            is obtained from ``load_config()``.

    Returns:
        A ``(train, test)`` tuple. Input order is preserved within each list.
        An event whose year is in neither set is dropped; one whose year is in
        both sets appears in both lists.
    """
    cfg = load_config() if config is None else config
    years_for_training = set(cfg["data"]["train_years"])
    years_for_testing = set(cfg["data"]["test_years"])

    train_bucket: list[FloodEvent] = []
    test_bucket: list[FloodEvent] = []
    for item in events:
        # Two independent checks: membership in one set does not exclude the other.
        if item.start_date.year in years_for_training:
            train_bucket.append(item)
        if item.start_date.year in years_for_testing:
            test_bucket.append(item)
    return train_bucket, test_bucket
def save_events(events: list[FloodEvent], output_path: str) -> None:
    """Write the events to ``output_path`` as pretty-printed UTF-8 JSON.

    Each event is serialized with ``model_dump(mode="json")`` and the result
    is written as a JSON array indented by two spaces.

    Args:
        events: Events to serialize.
        output_path: Destination file. Missing parent directories are created;
            an existing file is overwritten.
    """
    path = Path(output_path)
    path.parent.mkdir(parents=True, exist_ok=True)
    data = [e.model_dump(mode="json") for e in events]
    # ensure_ascii=False emits non-ASCII characters verbatim, so the file
    # encoding is pinned to UTF-8 instead of depending on the platform locale
    # (which can mis-encode the text or raise UnicodeEncodeError).
    path.write_text(
        json.dumps(data, indent=2, ensure_ascii=False), encoding="utf-8"
    )
def _build_date(year, month, day) -> date:
y = int(year) if pd.notna(year) else 2000
m = int(month) if pd.notna(month) else 1
d = int(day) if pd.notna(day) else 1
return date(y, m, d)
def _safe_str(val) -> str | None:
if pd.isna(val):
return None
return str(val)
def _safe_float(val) -> float | None:
if pd.isna(val):
return None
try:
return float(val)
except (ValueError, TypeError):
return None
def _safe_int(val) -> int | None:
if pd.isna(val):
return None
try:
return int(float(val))
except (ValueError, TypeError):
return None