File size: 2,059 Bytes
6eff894
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
#!/usr/bin/env python3
import json, os, argparse
from pathlib import Path
from datetime import datetime, timedelta
import pandas as pd, numpy as np, subprocess

# Metadata JSON; main() reads "horizon_hours" and "event_mm" from it.
META  = Path("models/rain_xgb_tuned_meta.json")
# Directory that contains the prediction log.
LOGS  = Path("logs")
# CSV of logged predictions; main() loads it and rewrites it after backfilling y_true.
PRED_LOG = LOGS / "predictions.csv"

def ensure_hourly(lat, lon, past_days=120):
    """Regenerate and load the hourly weather table for one location.

    Runs ``scripts/fetch_weather.sh`` followed by ``scripts/export_hourly.py``,
    handing both the location and look-back window through the ``LAT``,
    ``LON`` and ``PAST_DAYS`` environment variables (on top of the current
    environment).

    Args:
        lat: Latitude, stringified into ``LAT``.
        lon: Longitude, stringified into ``LON``.
        past_days: Look-back window, stringified into ``PAST_DAYS``.

    Returns:
        The contents of ``results/hourly.csv`` as a DataFrame with the
        ``time`` column parsed as datetimes.

    Raises:
        subprocess.CalledProcessError: If either script exits non-zero.
    """
    child_env = {
        **os.environ,
        "LAT": str(lat),
        "LON": str(lon),
        "PAST_DAYS": str(past_days),
    }
    pipeline = (
        ["bash", "scripts/fetch_weather.sh"],
        ["python3", "scripts/export_hourly.py"],
    )
    # Order matters: the export step runs only after the fetch step succeeded.
    for command in pipeline:
        subprocess.run(command, check=True, env=child_env)
    return pd.read_csv("results/hourly.csv", parse_dates=["time"])

def label_from_df(df, ts_pred, horizon_h, event_mm):
    """Derive the observed rain label for a prediction made at ``ts_pred``.

    The row whose ``time`` is nearest to ``ts_pred`` is used as the anchor
    (it must be within one hour), and ``precip_mm`` is summed over the
    ``horizon_h`` rows that follow it. NaN precipitation values count as 0.

    All row arithmetic is positional, so the result does not depend on the
    DataFrame's index labels (filtered or re-indexed frames work).
    Assumes one row per hour in chronological order -- TODO confirm against
    the exporter that produces the hourly table.

    Args:
        df: Hourly table with a datetime ``time`` column and a numeric
            ``precip_mm`` column.
        ts_pred: Timestamp at which the prediction was made.
        horizon_h: Number of rows (hours) after the anchor to accumulate.
        event_mm: Precipitation threshold that defines a rain event.

    Returns:
        1 if the accumulated precipitation reaches ``event_mm``; 0 if the
        full horizon is covered and it does not; None when the timestamp
        cannot be aligned (empty frame, no valid times, nearest row more
        than an hour away) or when the table ends before the horizon does
        and the rows seen so far do not already reach the threshold.
    """
    if df.empty:
        return None  # nothing to align against

    # Absolute distance of every row from the prediction time, in seconds.
    seconds = (df["time"] - ts_pred).abs().dt.total_seconds().to_numpy()
    if np.isnan(seconds).all():
        return None  # no usable timestamps (all NaT)

    pos = int(np.nanargmin(seconds))  # first nearest row, by position
    if seconds[pos] > 3600:
        return None  # can't align within one hour

    horizon_h = int(horizon_h)
    precip = df["precip_mm"].to_numpy(dtype=float)
    window = precip[pos + 1 : pos + 1 + horizon_h]
    total = float(np.nansum(window))

    if total >= event_mm:
        # Threshold already reached; missing trailing hours cannot undo it.
        return 1
    if len(window) < horizon_h:
        # Horizon runs past the end of the data: a "no event" label would be
        # a guess, so report it as not yet labelable.
        return None
    return 0

def main():
    """Backfill missing ``y_true`` labels in the prediction log.

    Loads ``PRED_LOG``, and for every row whose ``y_true`` is empty and whose
    forecast horizon (``horizon_hours`` from ``META``) has already elapsed,
    fetches the hourly observations for that row's coordinates and stores
    the label computed by ``label_from_df``. The log is then written back
    to ``PRED_LOG`` and the number of filled rows is printed.

    Returns early, after printing a notice, when the log file does not exist.
    """
    if not PRED_LOG.exists():
        print("No predictions.csv found.")
        return

    meta = json.loads(Path(META).read_text())
    horizon_h = int(meta["horizon_hours"])
    event_mm = float(meta["event_mm"])

    df = pd.read_csv(PRED_LOG, parse_dates=["ts_pred", "logged_at"])
    pending = df[df["y_true"].isna() | (df["y_true"] == "")]

    # Each ensure_hourly call runs a fetch script and an export script, so
    # the resulting table is reused for every row with the same coordinates.
    hourly_by_location = {}
    updated = 0
    for i, row in pending.iterrows():
        ts_pred = row["ts_pred"]
        if pd.isna(ts_pred):
            continue  # unparseable timestamp: nothing to align against
        if datetime.now() < ts_pred + timedelta(hours=horizon_h):
            continue  # horizon not passed yet

        location = (row["lat"], row["lon"])
        if location not in hourly_by_location:
            # fetch enough history to cover that timestamp
            hourly_by_location[location] = ensure_hourly(
                location[0], location[1], past_days=120
            )

        y = label_from_df(hourly_by_location[location], ts_pred, horizon_h, event_mm)
        if y is not None:
            df.at[i, "y_true"] = int(y)
            updated += 1

    df.to_csv(PRED_LOG, index=False)
    print(f"Backfilled {updated} rows into {PRED_LOG}")

# Run the backfill only when executed as a script, not when imported.
if __name__ == "__main__":
    main()