# weather-data-fetcher-api / scripts / train_xgb_tuned_final.py
# (theelvace — "Deployable Gradio build", commit 6eff894)
import json, warnings
from pathlib import Path
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import precision_recall_fscore_support, precision_recall_curve, roc_auc_score
import joblib
warnings.filterwarnings("ignore")  # suppress all library warnings to keep console output clean

# Prediction task: does >= EVENT_MM of precipitation fall in the next H hours?
H = 12
EVENT_MM = 1.0

# Input artifacts (hourly weather data, tuned hyper-parameters) and outputs
# (fitted model + JSON metadata with features/thresholds/CV scores).
HOURLY = Path("results/hourly.csv")
TUNED = Path("models/xgb_tuned.json")
OUT_MODEL = Path("models/rain_xgb_tuned.joblib")
OUT_META = Path("models/rain_xgb_tuned_meta.json")
# -----------------------------
# Label: >= EVENT_MM in next H hours
# -----------------------------
def make_labels(df: pd.DataFrame, horizon=H, event_mm=EVENT_MM) -> pd.Series:
    """Binary event label: 1 iff >= `event_mm` of precipitation falls in the
    next `horizon` hours.

    For row i the window is rows i+1 .. i+horizon (strictly future, the
    current hour is excluded). The final `horizon` rows have no complete
    future window and are dropped from the result. NaN precipitation counts
    as 0 (matching np.nansum in the original loop formulation).

    Parameters
    ----------
    df : DataFrame with a "precip_mm" column.
    horizon : number of future hours in the event window.
    event_mm : precipitation threshold in millimetres.

    Returns
    -------
    pd.Series of 0/1 ints, indexed like `df` minus its last `horizon` rows.
    """
    prec = np.nan_to_num(df["precip_mm"].to_numpy(dtype=float), nan=0.0)
    n = len(prec)
    # Prefix sums give every future-window sum in O(1):
    # sum(prec[i+1 : i+1+horizon]) == csum[i+1+horizon] - csum[i+1]
    csum = np.concatenate(([0.0], np.cumsum(prec)))
    m = max(n - horizon, 0)  # number of rows with a complete future window
    future = csum[horizon + 1 : horizon + 1 + m] - csum[1 : 1 + m]
    y = (future >= event_mm).astype(int)
    index = df.index[:-horizon] if horizon > 0 else df.index
    return pd.Series(y, index=index)
# Feature builder (MATCH CLI/TRAINER)
def build_features(df: pd.DataFrame) -> pd.DataFrame:
    """Build the feature matrix (MUST stay in sync with the CLI/trainer).

    Works on a copy, so the caller's frame is never mutated. Rows lacking
    enough history for the longest rolling window (24 h) are dropped by the
    final dropna, so the returned frame keeps the original index minus its
    warm-up rows.

    Raises
    ------
    ValueError if any required hourly column is missing.
    """
    required = {"time","temp_c","humidity","cloudcover","pressure","wind_speed","precip_mm","rain_mm"}
    miss = required - set(df.columns)
    if miss:
        raise ValueError(f"Hourly data missing columns: {sorted(miss)}")
    df = df.copy()  # never mutate the caller's frame

    base = ["temp_c","humidity","cloudcover","pressure","wind_speed","precip_mm","rain_mm"]
    # 1-hour deltas and 3-hour moving averages of every base signal.
    for c in base:
        df[f"d_{c}"] = df[c].diff()
        df[f"ma3_{c}"] = df[c].rolling(3).mean()
    # 3-hour tendencies for the slower-moving signals.
    for c in ["pressure","humidity","cloudcover","temp_c"]:
        df[f"d3_{c}"] = df[c] - df[c].shift(3)

    # Crude dew-point proxy (temp minus humidity/5) and its dynamics.
    df["dew_proxy"] = df["temp_c"] - (df["humidity"] / 5.0)
    df["d_dew_proxy"] = df["dew_proxy"].diff()
    df["ma3_dew_proxy"] = df["dew_proxy"].rolling(3).mean()

    # Recent-precipitation accumulations and peaks.
    df["rain_sum_3h"] = df["precip_mm"].rolling(3).sum()
    df["rain_sum_6h"] = df["precip_mm"].rolling(6).sum()
    df["rain_sum_12h"] = df["precip_mm"].rolling(12).sum()
    df["rain_sum_24h"] = df["precip_mm"].rolling(24).sum()
    df["rain_max_6h"] = df["precip_mm"].rolling(6).max()
    df["rain_max_12h"] = df["precip_mm"].rolling(12).max()

    def _run_length(flag: pd.Series) -> pd.Series:
        # Length (in rows) of the current run of 1s; 0 wherever flag == 0.
        runs = (flag != flag.shift()).cumsum()
        return (flag.groupby(runs).cumcount() + 1) * flag

    is_rain = (df["precip_mm"] > 0).astype(int)
    df["dry_streak_h"] = _run_length(1 - is_rain)
    df["wet_streak_h"] = _run_length(is_rain)

    # Calendar position, encoded cyclically so hour 23 sits next to hour 0.
    df["hour"] = df["time"].dt.hour
    df["dow"] = df["time"].dt.dayofweek
    df["doy"] = df["time"].dt.dayofyear
    df["hoy"] = (df["doy"] - 1) * 24 + df["hour"]
    df["hour_sin"] = np.sin(2*np.pi*df["hour"]/24.0)
    df["hour_cos"] = np.cos(2*np.pi*df["hour"]/24.0)
    df["dow_sin"] = np.sin(2*np.pi*df["dow"]/7.0)
    df["dow_cos"] = np.cos(2*np.pi*df["dow"]/7.0)
    df["hoy_sin"] = np.sin(2*np.pi*df["hoy"]/(365.25*24))
    df["hoy_cos"] = np.cos(2*np.pi*df["hoy"]/(365.25*24))

    # Interactions and pressure-fall signals (falling pressure -> positive drop).
    df["hum_x_cloud"] = df["humidity"] * df["cloudcover"]
    df["wind_x_cloud"] = df["wind_speed"] * df["cloudcover"]
    df["press_drop_3h"] = -df["d3_pressure"]
    df["press_drop_6h"] = df["pressure"].shift(6) - df["pressure"]

    feats = (
        base +
        [f"d_{c}" for c in base] +
        [f"ma3_{c}" for c in base] +
        [f"d3_{c}" for c in ["pressure","humidity","cloudcover","temp_c"]] +
        ["dew_proxy","d_dew_proxy","ma3_dew_proxy",
         "rain_sum_3h","rain_sum_6h","rain_sum_12h","rain_sum_24h","rain_max_6h","rain_max_12h",
         "dry_streak_h","wet_streak_h",
         "hour_sin","hour_cos","dow_sin","dow_cos","hoy_sin","hoy_cos",
         "hum_x_cloud","wind_x_cloud","press_drop_3h","press_drop_6h"]
    )
    # Drop warm-up rows that any diff/shift/rolling window left as NaN.
    return df[feats].dropna()
# -----------------------------
# Threshold pickers
# -----------------------------
def _eval_at_threshold(y_true, p, t):
from sklearn.metrics import precision_recall_fscore_support
pred = (p >= t).astype(int)
P, R, F1, _ = precision_recall_fscore_support(y_true, pred, average="binary", zero_division=0)
pos_rate = float(pred.mean())
return P, R, F1, pos_rate
def pick_by_best_f1(y_true, p):
    """Pick the DEFAULT (balanced) decision threshold.

    Scans every candidate threshold from the PR curve and keeps the one with
    the best F1 subject to:
      - precision >= 0.70
      - recall    >= 0.55
      - 0.15 <= positive rate <= 0.60
    The winner is floored at 0.15. If no candidate satisfies the constraints,
    falls back to the 70th percentile of the scores (same floor).
    """
    # precision_recall_curve is imported at module level; only its candidate
    # thresholds are needed here (precision/recall arrays are unused).
    _, _, thr = precision_recall_curve(y_true, p)
    best = None
    for t in thr:
        P, R, F1, pos_rate = _eval_at_threshold(y_true, p, t)
        if P >= 0.70 and R >= 0.55 and 0.15 <= pos_rate <= 0.60:
            if best is None or F1 > best[1]:
                best = (t, F1)
    if best is not None:
        return float(max(best[0], 0.15))
    # Fallback: conservative quantile of the predicted probabilities.
    return float(max(np.quantile(p, 0.70), 0.15))
def pick_high_recall(y_true, p):
    """Pick the RECALL threshold: warn more, but not silly.

    Among PR-curve candidate thresholds, prefers the highest (recall, F1)
    pair subject to:
      - recall    >= 0.88
      - precision >= 0.60
      - positive rate <= 0.70
    The winner is floored at 0.10. If no candidate qualifies, falls back to
    the 60th percentile of the scores (same floor).
    """
    # Module-level precision_recall_curve; only the thresholds are used.
    _, _, thr = precision_recall_curve(y_true, p)
    best = None
    for t in thr:
        P, R, F1, pos_rate = _eval_at_threshold(y_true, p, t)
        if R >= 0.88 and P >= 0.60 and pos_rate <= 0.70:
            # Lexicographic tie-break: maximize recall first, then F1.
            if best is None or (R, F1) > (best[1], best[2]):
                best = (t, R, F1)
    if best is not None:
        return float(max(best[0], 0.10))
    # Fallback: lower-but-not-silly quantile.
    return float(max(np.quantile(p, 0.60), 0.10))
def pick_high_precision(y_true, p):
    """Pick the PRECISION threshold: be picky.

    Among PR-curve candidate thresholds, prefers the highest (precision, F1)
    pair subject to:
      - precision >= 0.85
      - recall    >= 0.40
      - positive rate <= 0.50
    The winner is floored at 0.60. If no candidate qualifies, falls back to
    the 90th percentile of the scores (same floor).
    """
    # Module-level precision_recall_curve; only the thresholds are used.
    _, _, thr = precision_recall_curve(y_true, p)
    best = None
    for t in thr:
        P, R, F1, pos_rate = _eval_at_threshold(y_true, p, t)
        if P >= 0.85 and R >= 0.40 and pos_rate <= 0.50:
            # Lexicographic tie-break: maximize precision first, then F1.
            if best is None or (P, F1) > (best[1], best[2]):
                best = (t, P, F1)
    if best is not None:
        return float(max(best[0], 0.60))
    return float(max(np.quantile(p, 0.90), 0.60))
# -----------------------------
# Per-fold evaluation with time-aware val slice
# -----------------------------
def eval_timeseries_cv(model, X, y, n_splits=5, val_frac=0.15):
    """Walk-forward cross-validation with a time-aware threshold pick.

    For each TimeSeriesSplit fold, the tail `val_frac` of the training slice
    is held out to choose the decision threshold (pick_by_best_f1); the
    thresholded predictions are then scored on the untouched test slice.

    Parameters
    ----------
    model : classifier with fit/predict_proba (refit in place per fold).
    X, y : numpy arrays in chronological order.
    n_splits : number of expanding-window folds.
    val_frac : fraction of each training slice reserved for thresholding.

    Returns
    -------
    (list of per-fold dicts with P/R/F1/AUC/thr, dict of mean P/R/F1/AUC)
    """
    tscv = TimeSeriesSplit(n_splits=n_splits)
    scores = []
    for fold, (tr, te) in enumerate(tscv.split(X)):
        X_tr, X_te = X[tr], X[te]
        y_tr, y_te = y[tr], y[te]
        # Hold out the last val_frac of the training window (at least 1 row)
        # so the threshold is picked on data the model did not fit on.
        v = max(int(len(X_tr) * val_frac), 1)
        X_fit, y_fit = X_tr[:-v], y_tr[:-v]
        X_val, y_val = X_tr[-v:], y_tr[-v:]
        model.fit(X_fit, y_fit)
        p_val = model.predict_proba(X_val)[:, 1]
        p_te = model.predict_proba(X_te)[:, 1]
        thr = pick_by_best_f1(y_val, p_val)
        pred = (p_te >= thr).astype(int)
        P, R, F1, _ = precision_recall_fscore_support(y_te, pred, average="binary", zero_division=0)
        try:
            auc = roc_auc_score(y_te, p_te)
        except ValueError:
            # Single-class test slice: AUC is undefined.
            auc = float("nan")
        scores.append(dict(P=P, R=R, F1=F1, AUC=auc, thr=float(thr)))
        print(f"Fold {fold+1} → P={P:.3f} R={R:.3f} F1={F1:.3f} thr={thr:.3f}")
    mean = {k: float(np.mean([s[k] for s in scores])) for k in ["P", "R", "F1", "AUC"]}
    print(f"Mean → P={mean['P']:.3f} R={mean['R']:.3f} F1={mean['F1']:.3f} AUC={mean['AUC']:.3f}")
    return scores, mean
# -----------------------------
# Main
# -----------------------------
def main():
    """Train the tuned XGBoost rain model, pick thresholds, persist artifacts."""
    # Preconditions: hourly data and tuned hyper-parameters must already exist.
    if not HOURLY.exists():
        raise FileNotFoundError("results/hourly.csv not found. Run: make hourly PAST_DAYS=90")
    if not TUNED.exists():
        raise FileNotFoundError("models/xgb_tuned.json not found. Run: python scripts/xgb_tune_timeseries.py")

    hourly = pd.read_csv(HOURLY, parse_dates=["time"])

    # Labels come from the full frame; features use every row except the
    # final H, which have no complete future window.
    labels = make_labels(hourly, H, EVENT_MM)
    feat_frame = build_features(hourly.iloc[:-H].copy())  # keeps index
    aligned = labels.loc[feat_frame.index]                # align by index
    X, y = feat_frame.values, aligned.values

    # Rebuild the classifier from the tuned hyper-parameters on disk.
    params = json.loads(TUNED.read_text())["params"]
    model = XGBClassifier(
        **params,
        objective="binary:logistic",
        eval_metric="aucpr",
        tree_method="hist",
        random_state=42,
    )

    print("⚙️ Cross-validating tuned XGB…")
    cv_scores, cv_mean = eval_timeseries_cv(model, X, y, n_splits=5, val_frac=0.15)

    # Final fit on everything except a 20% tail, which is kept as a
    # validation slice for picking the three operating thresholds.
    tail = max(int(len(X) * 0.20), 1)
    model.fit(X[:-tail], y[:-tail])
    p_val = model.predict_proba(X[-tail:])[:, 1]
    y_val = y[-tail:]
    thr_default = pick_by_best_f1(y_val, p_val)
    thr_recall = pick_high_recall(y_val, p_val)
    thr_prec = pick_high_precision(y_val, p_val)

    # Persist the model plus everything inference needs to reproduce it.
    Path("models").mkdir(exist_ok=True)
    joblib.dump(model, OUT_MODEL)
    meta = {
        "features": list(feat_frame.columns),
        "horizon_hours": H,
        "event_mm": EVENT_MM,
        "model": {"type": "xgboost", "params": params},
        "thresholds": {
            "default": float(thr_default),
            "high_recall": float(thr_recall),
            "high_precision": float(thr_prec),
        },
        "cv_mean": cv_mean,
        "cv_folds": cv_scores,
    }
    OUT_META.write_text(json.dumps(meta, indent=2))
    print("\n💾 Saved", OUT_MODEL, "and", OUT_META)
    print(f"Chosen thresholds → default={thr_default:.3f} recall={thr_recall:.3f} precision={thr_prec:.3f}")


if __name__ == "__main__":
    main()