# weather-data-fetcher-api / scripts / train_xgb_tuned_final.py
# (theelvace — "Deployable Gradio build", commit 6eff894)
import json, warnings
from pathlib import Path
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import precision_recall_fscore_support, precision_recall_curve, roc_auc_score
import joblib
warnings.filterwarnings("ignore")  # suppress all library warnings to keep console output clean

# Prediction task: does >= EVENT_MM of precipitation fall in the next H hours?
H = 12
EVENT_MM = 1.0

# Input artifacts (hourly weather data, tuned hyper-parameters) and outputs
# (fitted model + JSON metadata with features/thresholds/CV scores).
HOURLY = Path("results/hourly.csv")
TUNED = Path("models/xgb_tuned.json")
OUT_MODEL = Path("models/rain_xgb_tuned.joblib")
OUT_META = Path("models/rain_xgb_tuned_meta.json")
# -----------------------------
# Label: >= EVENT_MM in next H hours
# -----------------------------
def make_labels(df: pd.DataFrame, horizon=H, event_mm=EVENT_MM) -> pd.Series:
    """Binary event label: 1 iff >= `event_mm` of precipitation falls in the
    next `horizon` hours.

    For row i the window is rows i+1 .. i+horizon (strictly future, the
    current hour is excluded). The final `horizon` rows have no complete
    future window and are dropped from the result. NaN precipitation counts
    as 0 (matching np.nansum in the original loop formulation).

    Parameters
    ----------
    df : DataFrame with a "precip_mm" column.
    horizon : number of future hours in the event window.
    event_mm : precipitation threshold in millimetres.

    Returns
    -------
    pd.Series of 0/1 ints, indexed like `df` minus its last `horizon` rows.
    """
    prec = np.nan_to_num(df["precip_mm"].to_numpy(dtype=float), nan=0.0)
    n = len(prec)
    # Prefix sums give every future-window sum in O(1):
    # sum(prec[i+1 : i+1+horizon]) == csum[i+1+horizon] - csum[i+1]
    csum = np.concatenate(([0.0], np.cumsum(prec)))
    m = max(n - horizon, 0)  # number of rows with a complete future window
    future = csum[horizon + 1 : horizon + 1 + m] - csum[1 : 1 + m]
    y = (future >= event_mm).astype(int)
    index = df.index[:-horizon] if horizon > 0 else df.index
    return pd.Series(y, index=index)
# Feature builder (MATCH CLI/TRAINER)
def build_features(df: pd.DataFrame) -> pd.DataFrame:
    """Build the feature matrix (MUST stay in sync with the CLI/trainer).

    Works on a copy, so the caller's frame is never mutated. Rows lacking
    enough history for the longest rolling window (24 h) are dropped by the
    final dropna, so the returned frame keeps the original index minus its
    warm-up rows.

    Raises
    ------
    ValueError if any required hourly column is missing.
    """
    required = {"time","temp_c","humidity","cloudcover","pressure","wind_speed","precip_mm","rain_mm"}
    miss = required - set(df.columns)
    if miss:
        raise ValueError(f"Hourly data missing columns: {sorted(miss)}")
    df = df.copy()  # never mutate the caller's frame

    base = ["temp_c","humidity","cloudcover","pressure","wind_speed","precip_mm","rain_mm"]
    # 1-hour deltas and 3-hour moving averages of every base signal.
    for c in base:
        df[f"d_{c}"] = df[c].diff()
        df[f"ma3_{c}"] = df[c].rolling(3).mean()
    # 3-hour tendencies for the slower-moving signals.
    for c in ["pressure","humidity","cloudcover","temp_c"]:
        df[f"d3_{c}"] = df[c] - df[c].shift(3)

    # Crude dew-point proxy (temp minus humidity/5) and its dynamics.
    df["dew_proxy"] = df["temp_c"] - (df["humidity"] / 5.0)
    df["d_dew_proxy"] = df["dew_proxy"].diff()
    df["ma3_dew_proxy"] = df["dew_proxy"].rolling(3).mean()

    # Recent-precipitation accumulations and peaks.
    df["rain_sum_3h"] = df["precip_mm"].rolling(3).sum()
    df["rain_sum_6h"] = df["precip_mm"].rolling(6).sum()
    df["rain_sum_12h"] = df["precip_mm"].rolling(12).sum()
    df["rain_sum_24h"] = df["precip_mm"].rolling(24).sum()
    df["rain_max_6h"] = df["precip_mm"].rolling(6).max()
    df["rain_max_12h"] = df["precip_mm"].rolling(12).max()

    def _run_length(flag: pd.Series) -> pd.Series:
        # Length (in rows) of the current run of 1s; 0 wherever flag == 0.
        runs = (flag != flag.shift()).cumsum()
        return (flag.groupby(runs).cumcount() + 1) * flag

    is_rain = (df["precip_mm"] > 0).astype(int)
    df["dry_streak_h"] = _run_length(1 - is_rain)
    df["wet_streak_h"] = _run_length(is_rain)

    # Calendar position, encoded cyclically so hour 23 sits next to hour 0.
    df["hour"] = df["time"].dt.hour
    df["dow"] = df["time"].dt.dayofweek
    df["doy"] = df["time"].dt.dayofyear
    df["hoy"] = (df["doy"] - 1) * 24 + df["hour"]
    df["hour_sin"] = np.sin(2*np.pi*df["hour"]/24.0)
    df["hour_cos"] = np.cos(2*np.pi*df["hour"]/24.0)
    df["dow_sin"] = np.sin(2*np.pi*df["dow"]/7.0)
    df["dow_cos"] = np.cos(2*np.pi*df["dow"]/7.0)
    df["hoy_sin"] = np.sin(2*np.pi*df["hoy"]/(365.25*24))
    df["hoy_cos"] = np.cos(2*np.pi*df["hoy"]/(365.25*24))

    # Interactions and pressure-fall signals (falling pressure -> positive drop).
    df["hum_x_cloud"] = df["humidity"] * df["cloudcover"]
    df["wind_x_cloud"] = df["wind_speed"] * df["cloudcover"]
    df["press_drop_3h"] = -df["d3_pressure"]
    df["press_drop_6h"] = df["pressure"].shift(6) - df["pressure"]

    feats = (
        base +
        [f"d_{c}" for c in base] +
        [f"ma3_{c}" for c in base] +
        [f"d3_{c}" for c in ["pressure","humidity","cloudcover","temp_c"]] +
        ["dew_proxy","d_dew_proxy","ma3_dew_proxy",
         "rain_sum_3h","rain_sum_6h","rain_sum_12h","rain_sum_24h","rain_max_6h","rain_max_12h",
         "dry_streak_h","wet_streak_h",
         "hour_sin","hour_cos","dow_sin","dow_cos","hoy_sin","hoy_cos",
         "hum_x_cloud","wind_x_cloud","press_drop_3h","press_drop_6h"]
    )
    # Drop warm-up rows that any diff/shift/rolling window left as NaN.
    return df[feats].dropna()
# -----------------------------
# Threshold pickers
# -----------------------------
def _eval_at_threshold(y_true, p, t):
from sklearn.metrics import precision_recall_fscore_support
pred = (p >= t).astype(int)
P, R, F1, _ = precision_recall_fscore_support(y_true, pred, average="binary", zero_division=0)
pos_rate = float(pred.mean())
return P, R, F1, pos_rate
def pick_by_best_f1(y_true, p):
    """Pick the DEFAULT (balanced) decision threshold.

    Scans every candidate threshold from the PR curve and keeps the one with
    the best F1 subject to:
      - precision >= 0.70
      - recall    >= 0.55
      - 0.15 <= positive rate <= 0.60
    The winner is floored at 0.15. If no candidate satisfies the constraints,
    falls back to the 70th percentile of the scores (same floor).
    """
    # precision_recall_curve is imported at module level; only its candidate
    # thresholds are needed here (precision/recall arrays are unused).
    _, _, thr = precision_recall_curve(y_true, p)
    best = None
    for t in thr:
        P, R, F1, pos_rate = _eval_at_threshold(y_true, p, t)
        if P >= 0.70 and R >= 0.55 and 0.15 <= pos_rate <= 0.60:
            if best is None or F1 > best[1]:
                best = (t, F1)
    if best is not None:
        return float(max(best[0], 0.15))
    # Fallback: conservative quantile of the predicted probabilities.
    return float(max(np.quantile(p, 0.70), 0.15))
def pick_high_recall(y_true, p):
    """Pick the RECALL threshold: warn more, but not silly.

    Among PR-curve candidate thresholds, prefers the highest (recall, F1)
    pair subject to:
      - recall    >= 0.88
      - precision >= 0.60
      - positive rate <= 0.70
    The winner is floored at 0.10. If no candidate qualifies, falls back to
    the 60th percentile of the scores (same floor).
    """
    # Module-level precision_recall_curve; only the thresholds are used.
    _, _, thr = precision_recall_curve(y_true, p)
    best = None
    for t in thr:
        P, R, F1, pos_rate = _eval_at_threshold(y_true, p, t)
        if R >= 0.88 and P >= 0.60 and pos_rate <= 0.70:
            # Lexicographic tie-break: maximize recall first, then F1.
            if best is None or (R, F1) > (best[1], best[2]):
                best = (t, R, F1)
    if best is not None:
        return float(max(best[0], 0.10))
    # Fallback: lower-but-not-silly quantile.
    return float(max(np.quantile(p, 0.60), 0.10))
def pick_high_precision(y_true, p):
    """Pick the PRECISION threshold: be picky.

    Among PR-curve candidate thresholds, prefers the highest (precision, F1)
    pair subject to:
      - precision >= 0.85
      - recall    >= 0.40
      - positive rate <= 0.50
    The winner is floored at 0.60. If no candidate qualifies, falls back to
    the 90th percentile of the scores (same floor).
    """
    # Module-level precision_recall_curve; only the thresholds are used.
    _, _, thr = precision_recall_curve(y_true, p)
    best = None
    for t in thr:
        P, R, F1, pos_rate = _eval_at_threshold(y_true, p, t)
        if P >= 0.85 and R >= 0.40 and pos_rate <= 0.50:
            # Lexicographic tie-break: maximize precision first, then F1.
            if best is None or (P, F1) > (best[1], best[2]):
                best = (t, P, F1)
    if best is not None:
        return float(max(best[0], 0.60))
    return float(max(np.quantile(p, 0.90), 0.60))
# -----------------------------
# Per-fold evaluation with time-aware val slice
# -----------------------------
def eval_timeseries_cv(model, X, y, n_splits=5, val_frac=0.15):
    """Walk-forward cross-validation with a time-aware threshold pick.

    For each TimeSeriesSplit fold, the tail `val_frac` of the training slice
    is held out to choose the decision threshold (pick_by_best_f1); the
    thresholded predictions are then scored on the untouched test slice.

    Parameters
    ----------
    model : classifier with fit/predict_proba (refit in place per fold).
    X, y : numpy arrays in chronological order.
    n_splits : number of expanding-window folds.
    val_frac : fraction of each training slice reserved for thresholding.

    Returns
    -------
    (list of per-fold dicts with P/R/F1/AUC/thr, dict of mean P/R/F1/AUC)
    """
    tscv = TimeSeriesSplit(n_splits=n_splits)
    scores = []
    for fold, (tr, te) in enumerate(tscv.split(X)):
        X_tr, X_te = X[tr], X[te]
        y_tr, y_te = y[tr], y[te]
        # Hold out the last val_frac of the training window (at least 1 row)
        # so the threshold is picked on data the model did not fit on.
        v = max(int(len(X_tr) * val_frac), 1)
        X_fit, y_fit = X_tr[:-v], y_tr[:-v]
        X_val, y_val = X_tr[-v:], y_tr[-v:]
        model.fit(X_fit, y_fit)
        p_val = model.predict_proba(X_val)[:, 1]
        p_te = model.predict_proba(X_te)[:, 1]
        thr = pick_by_best_f1(y_val, p_val)
        pred = (p_te >= thr).astype(int)
        P, R, F1, _ = precision_recall_fscore_support(y_te, pred, average="binary", zero_division=0)
        try:
            auc = roc_auc_score(y_te, p_te)
        except ValueError:
            # Single-class test slice: AUC is undefined.
            auc = float("nan")
        scores.append(dict(P=P, R=R, F1=F1, AUC=auc, thr=float(thr)))
        print(f"Fold {fold+1} → P={P:.3f} R={R:.3f} F1={F1:.3f} thr={thr:.3f}")
    mean = {k: float(np.mean([s[k] for s in scores])) for k in ["P", "R", "F1", "AUC"]}
    print(f"Mean → P={mean['P']:.3f} R={mean['R']:.3f} F1={mean['F1']:.3f} AUC={mean['AUC']:.3f}")
    return scores, mean
# -----------------------------
# Main
# -----------------------------
def main():
    """Train the tuned XGBoost rain model, pick thresholds, persist artifacts."""
    # Preconditions: hourly data and tuned hyper-parameters must already exist.
    if not HOURLY.exists():
        raise FileNotFoundError("results/hourly.csv not found. Run: make hourly PAST_DAYS=90")
    if not TUNED.exists():
        raise FileNotFoundError("models/xgb_tuned.json not found. Run: python scripts/xgb_tune_timeseries.py")

    hourly = pd.read_csv(HOURLY, parse_dates=["time"])

    # Labels come from the full frame; features use every row except the
    # final H, which have no complete future window.
    labels = make_labels(hourly, H, EVENT_MM)
    feat_frame = build_features(hourly.iloc[:-H].copy())  # keeps index
    aligned = labels.loc[feat_frame.index]                # align by index
    X, y = feat_frame.values, aligned.values

    # Rebuild the classifier from the tuned hyper-parameters on disk.
    params = json.loads(TUNED.read_text())["params"]
    model = XGBClassifier(
        **params,
        objective="binary:logistic",
        eval_metric="aucpr",
        tree_method="hist",
        random_state=42,
    )

    print("⚙️ Cross-validating tuned XGB…")
    cv_scores, cv_mean = eval_timeseries_cv(model, X, y, n_splits=5, val_frac=0.15)

    # Final fit on everything except a 20% tail, which is kept as a
    # validation slice for picking the three operating thresholds.
    tail = max(int(len(X) * 0.20), 1)
    model.fit(X[:-tail], y[:-tail])
    p_val = model.predict_proba(X[-tail:])[:, 1]
    y_val = y[-tail:]
    thr_default = pick_by_best_f1(y_val, p_val)
    thr_recall = pick_high_recall(y_val, p_val)
    thr_prec = pick_high_precision(y_val, p_val)

    # Persist the model plus everything inference needs to reproduce it.
    Path("models").mkdir(exist_ok=True)
    joblib.dump(model, OUT_MODEL)
    meta = {
        "features": list(feat_frame.columns),
        "horizon_hours": H,
        "event_mm": EVENT_MM,
        "model": {"type": "xgboost", "params": params},
        "thresholds": {
            "default": float(thr_default),
            "high_recall": float(thr_recall),
            "high_precision": float(thr_prec),
        },
        "cv_mean": cv_mean,
        "cv_folds": cv_scores,
    }
    OUT_META.write_text(json.dumps(meta, indent=2))
    print("\n💾 Saved", OUT_MODEL, "and", OUT_META)
    print(f"Chosen thresholds → default={thr_default:.3f} recall={thr_recall:.3f} precision={thr_prec:.3f}")


if __name__ == "__main__":
    main()