"""Train and evaluate an hourly "rain within the next H hours" classifier.

Pipeline, in order:

1. Read ``results/hourly.csv`` (the ``time`` column is parsed as dates).
2. Build the binary target ``rain_next6h``: 1 when any of the following
   ``H`` rows has ``precip_mm > 0``.
3. Split chronologically (no shuffling), fit a scaled, class-balanced
   logistic regression and report metrics at several probability thresholds.
4. Compare against two baselines (always "no rain", and persistence).
5. Save the fitted pipeline and a JSON metadata file under ``models/``.
"""
import json
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    precision_recall_fscore_support,
    roc_auc_score,
)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


def _future_rain_labels(precip, horizon):
    """Return 1/0 labels: does any of the next ``horizon`` samples have rain?

    Args:
        precip: 1-D sequence of precipitation amounts, one per row.
        horizon: number of rows to look ahead (the current row is excluded).

    Returns:
        Integer array of length ``max(len(precip) - horizon, 0)``. Only rows
        with a complete look-ahead window get a label; the trailing
        ``horizon`` rows are omitted because their future is unknown.
    """
    # NaN compares False against 0, so missing readings count as "dry".
    wet = (np.asarray(precip) > 0).astype(int)
    n_labelled = len(wet) - horizon
    if n_labelled <= 0:
        return np.zeros(0, dtype=int)
    # csum[k] = number of wet samples among the first k rows, so the count in
    # rows i+1 .. i+horizon is csum[i + 1 + horizon] - csum[i + 1].
    csum = np.concatenate(([0], np.cumsum(wet)))
    wet_ahead = csum[horizon + 1:] - csum[1:n_labelled + 1]
    return (wet_ahead > 0).astype(int)


def _binary_prf(y_true, y_pred):
    """Return (precision, recall, f1) for the positive class; 0 on empty denominators."""
    p, r, f, _ = precision_recall_fscore_support(
        y_true, y_pred, average="binary", zero_division=0
    )
    return p, r, f


def _fmt_prf(p, r, f):
    """Format a precision/recall/F1 triple the way every report line prints it."""
    return f"Precision: {p:.3f} Recall: {r:.3f} F1: {f:.3f}"


# --------------------------------------------------------------------------
# Data and target
# --------------------------------------------------------------------------
df = pd.read_csv("results/hourly.csv", parse_dates=["time"])

H = 6  # look-ahead horizon, in rows of the hourly table

rain_next = _future_rain_labels(df["precip_mm"].values, H)
if rain_next.size == 0:
    raise ValueError(
        f"results/hourly.csv has {len(df)} rows; need more than {H} "
        "to build look-ahead labels"
    )

# Keep only rows whose full look-ahead window exists. The trailing H rows
# would otherwise carry a made-up "no rain" label and pollute the test split.
df = df.iloc[:len(rain_next)].copy()
df["rain_next6h"] = rain_next

features = [
    "temp_c", "humidity", "cloudcover", "pressure", "wind_speed",
    "precip_mm", "rain_mm",
]

X = df[features].values
y = df["rain_next6h"].values

print("Class balance (0=no-rain, 1=rain-in-next6h):", np.bincount(y))

# --------------------------------------------------------------------------
# Model
# --------------------------------------------------------------------------
# shuffle=False keeps row order: the first 70% of rows train the model and
# the last 30% evaluate it.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, shuffle=False
)

clf = Pipeline([
    ("scaler", StandardScaler()),
    ("logreg", LogisticRegression(max_iter=500, class_weight="balanced")),
])
clf.fit(X_train, y_train)

proba = clf.predict_proba(X_test)[:, 1]
pred_050 = (proba >= 0.50).astype(int)

cm = confusion_matrix(y_test, pred_050)
print("\nšŸ“Š Confusion Matrix (thr=0.50)")
print(cm)

p_050, r_050, f1_050 = _binary_prf(y_test, pred_050)

try:
    auc = roc_auc_score(y_test, proba)
except ValueError:
    # roc_auc_score rejected the inputs; report NaN instead of aborting.
    auc = float("nan")

print(f"{_fmt_prf(p_050, r_050, f1_050)} ROC-AUC: {auc:.3f}")
print("\nDetailed report:")
print(classification_report(y_test, pred_050, digits=3, zero_division=0))

# --------------------------------------------------------------------------
# Baselines
# --------------------------------------------------------------------------
always_no = np.zeros_like(y_test)
p0, r0, f10 = _binary_prf(y_test, always_no)
print("\n🧠 Baseline — always 'no rain'")
print(_fmt_prf(p0, r0, f10))

# Persistence baseline: predict rain when any of the H rows *before* the
# current one had precipitation (shift(1) excludes the current row).
recent_rain = (
    df["precip_mm"]
    .rolling(window=H, min_periods=1)
    .sum()
    .shift(1)
    .fillna(0) > 0
).astype(int).values

# The test split is the tail of df, so the tail of recent_rain lines up with it.
prev6_test = recent_rain[-len(y_test):]
pp, rp, f1p = _binary_prf(y_test, prev6_test)
print("\n🧠 Baseline — persistence (prev 6h)")
print(_fmt_prf(pp, rp, f1p))

# --------------------------------------------------------------------------
# Alternative thresholds
# --------------------------------------------------------------------------
thr_recall = 0.35      # lower cut-off: flags more hours as rain
thr_precision = 0.65   # higher cut-off: flags fewer hours as rain

pred_recall = (proba >= thr_recall).astype(int)
pred_precision = (proba >= thr_precision).astype(int)

pr_recall, rc_recall, f1_recall = _binary_prf(y_test, pred_recall)
pr_precision, rc_precision, f1_precision = _binary_prf(y_test, pred_precision)

print(f"\nšŸŽ›ļø Threshold {thr_recall:.2f} → {_fmt_prf(pr_recall, rc_recall, f1_recall)}")
print(f"šŸŽ›ļø Threshold {thr_precision:.2f} → {_fmt_prf(pr_precision, rc_precision, f1_precision)}")

# --------------------------------------------------------------------------
# Persist model and metadata
# --------------------------------------------------------------------------
os.makedirs("models", exist_ok=True)
joblib.dump(clf, "models/rain_classifier_hourly.joblib")
print("\nšŸ’¾ Saved: models/rain_classifier_hourly.joblib")

meta = {
    "horizon_hours": H,
    "features": features,
    "thresholds": {
        "default": 0.50,
        "high_recall": thr_recall,
        "high_precision": thr_precision,
    },
    "metrics": {
        "default": {
            "precision": float(p_050),
            "recall": float(r_050),
            "f1": float(f1_050),
        },
        "high_recall": {
            "precision": float(pr_recall),
            "recall": float(rc_recall),
            "f1": float(f1_recall),
        },
        "high_precision": {
            "precision": float(pr_precision),
            "recall": float(rc_precision),
            "f1": float(f1_precision),
        },
        "baseline_persistence": {
            "precision": float(pp),
            "recall": float(rp),
            "f1": float(f1p),
        },
    },
}

with open("models/rain_model_meta.json", "w", encoding="utf-8") as fh:
    json.dump(meta, fh, indent=2)
print("šŸ“ Saved: models/rain_model_meta.json")