Spaces:

theelvace
/

weather-data-fetcher-api

Runtime error

App Files Files Community

theelvace committed on Nov 6, 2025

Commit

6eff894

0 Parent(s):

Deployable Gradio build

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.DS_Store +0 -0
.dockerignore +14 -0
.gitattributes +1 -0
.gitignore +7 -0
Dockerfile +16 -0
LICENSE +21 -0
Makefile +94 -0
README.md +179 -0
app.py +156 -0
app/main.py +293 -0
assets/cover.png +0 -0
assets/feature_importance.png +0 -0
assets/pr_curve.png +0 -0
assets/precip.png +0 -0
assets/roc_curve.png +0 -0
assets/temps.png +0 -0
models/rain_model_meta.json +84 -0
models/rain_xgb_cal_meta.json +117 -0
models/rain_xgb_meta.json +94 -0
models/rain_xgb_tuned_meta.json +111 -0
models/xgb_tuned.json +11 -0
pyproject.toml +26 -0
render.yaml +8 -0
requirements.txt +32 -0
scripts/analyze_weather.py +11 -0
scripts/backfill_labels.py +53 -0
scripts/coef_rain.py +32 -0
scripts/cron_predict.sh +48 -0
scripts/cv_benchmark.py +243 -0
scripts/download_models.py +41 -0
scripts/eval_operating_points.py +49 -0
scripts/explain_shap.py +62 -0
scripts/explain_shap_interaction.py +105 -0
scripts/export_daily.py +18 -0
scripts/export_hourly.py +33 -0
scripts/feature_importance_rain.py +79 -0
scripts/fetch_weather.sh +21 -0
scripts/intro_ml.py +28 -0
scripts/log_predict.py +67 -0
scripts/make_cover.py +59 -0
scripts/monitor_weekly.py +60 -0
scripts/plot_pr_roc.py +65 -0
scripts/plot_weather.py +72 -0
scripts/predict_rain.py +29 -0
scripts/process_weather.py +56 -0
scripts/rain_cli.py +34 -0
scripts/start_services.sh +28 -0
scripts/time_series_cv_demo.py +38 -0
scripts/train_classify_rain.py +93 -0
scripts/train_classify_rain_hourly.py +143 -0

.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

.dockerignore ADDED Viewed

	@@ -0,0 +1,14 @@

+.git
+.github
+.venv
+__pycache__
+*.pyc
+*.pyo
+*.pyd
+*.log
+*.csv
+data/
+results/
+build/
+dist/
+node_modules/

.gitattributes ADDED Viewed

	@@ -0,0 +1 @@


1	+ models/*.joblib filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,7 @@

+data/
+results/
+logs/
+__pycache__/
+*.zip
+.env
+models/*.joblib

Dockerfile ADDED Viewed

	@@ -0,0 +1,16 @@

+# Container image for the FastAPI + Gradio rain-nowcast service (app/main.py).
+FROM python:3.11-slim
+WORKDIR /app
+# Compiler toolchain for any Python dependency that has to build from source;
+# the apt lists are removed in the same layer to keep the image small.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential && \
+    rm -rf /var/lib/apt/lists/*
+# Install dependencies before copying the sources so this layer stays cached
+# when only application code changes.
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+COPY . .
+# Default listening port; a hosting platform may override PORT at runtime.
+ENV PORT=7860
+# The exec-form CMD does not expand variables, so the previous hard-coded
+# "--port 7860" silently ignored PORT.  Run through a shell to expand it
+# (falling back to 7860) and `exec` so uvicorn replaces the shell and receives
+# stop signals directly.
+CMD ["sh", "-c", "exec uvicorn app.main:app --host 0.0.0.0 --port ${PORT:-7860}"]

LICENSE ADDED Viewed

	@@ -0,0 +1,21 @@

+MIT License
+Copyright (c) 2025 Elvis Anselm
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

Makefile ADDED Viewed

	@@ -0,0 +1,94 @@

+.PHONY: all check download process zip clean coords plot setup viz cli \
+        install uninstall rain rain6 rain-train rain-predict rain-now eval-plots \
+        hourly
+all: check download process zip
+	@echo "🏁 Weather pipeline complete."
+# Preflight: verify every external tool and script that `all` relies on, so a
+# missing prerequisite is reported up front rather than halfway through the
+# pipeline.  `zip` is included because the `zip` step of `all` invokes it.
+check:
+	@command -v curl >/dev/null || (echo "curl missing"; exit 1)
+	@command -v python3 >/dev/null || (echo "python3 missing"; exit 1)
+	@command -v zip >/dev/null || (echo "zip missing"; exit 1)
+	@[ -f scripts/fetch_weather.sh ] || (echo "missing scripts/fetch_weather.sh"; exit 1)
+	@[ -f scripts/process_weather.py ] || (echo "missing scripts/process_weather.py"; exit 1)
+download:
+	@bash scripts/fetch_weather.sh
+process:
+	@python3 scripts/process_weather.py
+zip:
+	@zip -j results/results.zip results/summary.txt results/summary.csv
+clean:
+	@rm -rf data results logs
+	@mkdir -p data results logs
+coords:
+	@LAT="$(LAT)" LON="$(LON)" bash scripts/fetch_weather.sh
+	@python3 scripts/process_weather.py
+	@zip -j results/results.zip results/summary.txt
+plot:
+	@python3 scripts/plot_weather.py
+viz: all plot
+	@echo "📊 Charts generated."
+setup:
+	@python3 -m venv .venv
+	@. .venv/bin/activate && pip install -r requirements.txt
+cli:
+	@python3 scripts/weather_cli.py --city Lagos --lat 6.5244 --lon 3.3792
+install:
+	@. .venv/bin/activate && pip install -e .
+uninstall:
+	@. .venv/bin/activate && pip uninstall -y weather-data-fetcher
+rain:
+	@python3 scripts/train_classify_rain.py
+rain6:
+	@python3 scripts/train_classify_rain_hourly.py
+rain-train:
+	@python3 scripts/train_rain_dual_thresholds.py
+rain-predict:
+	@python3 scripts/predict_rain.py
+rain-now:
+	@weather-cli rain --mode recall
+eval-plots:
+	@python3 scripts/plot_pr_roc.py
+hourly:
+	@LAT="$(LAT)" LON="$(LON)" PAST_DAYS="$(PAST_DAYS)" bash scripts/fetch_weather.sh
+	@python3 scripts/export_hourly.py
+.PHONY: xgb-train
+xgb-train:
+	@python3 scripts/train_xgb_12h.py
+.PHONY: xgb-train-cal
+xgb-train-cal:
+	@python3 scripts/train_xgb_12h_calibrated.py
+.PHONY: predict-log backfill monitor
+predict-log:
+	@python3 scripts/log_predict.py --city "Lagos" --lat 6.5244 --lon 3.3792 --mode default
+backfill:
+	@python3 scripts/backfill_labels.py
+monitor:
+	@python3 scripts/monitor_weekly.py
+.PHONY: cron-test
+# Run the cron wrapper once by hand, appending its output to logs/cron.log and
+# showing the last few lines.  logs/ is git-ignored, so it does not exist on a
+# fresh clone; create it first or the `>>` redirection fails before the script
+# even starts.
+cron-test:
+	@mkdir -p logs
+	@./scripts/cron_predict.sh default "Lagos" 6.5244 3.3792 90 >> logs/cron.log 2>&1 && tail -n 5 logs/cron.log

README.md ADDED Viewed

	@@ -0,0 +1,179 @@

+---
+title: Weather Data Fetcher
+emoji: 🌧️
+colorFrom: blue
+colorTo: gray
+sdk: docker
+hub: registry.hf.space/theelvace/weather-data-fetcher-api:latest
+pinned: false
+---
+# Weather Data Fetcher — Automated Data Pipeline
+Fetch daily Lagos (or any city) weather data using **Open-Meteo API**, process it with **Python**, and automate the full workflow via **Bash + Makefile**.
+---
+## Project Overview
+This project demonstrates a clean, reproducible workflow for data automation — the same principles used in ML and DevOps pipelines.
+**Pipeline Steps**
+1. Download daily weather JSON from Open-Meteo
+2. Parse, validate, and summarize data in Python
+3. Generate text + CSV summaries (and optional plots)
+4. Automate everything via a single `make all` command
+---
+## Charts
+## 🌧️ Rain Warning (next 6 hours)
+Predict **whether it will rain in the next 6 hours** from hourly observations (temperature, humidity, pressure, wind, cloud cover, precipitation).
+| Mode           | Threshold | Precision | Recall | When to use            |
+| -------------- | --------- | --------- | ------ | ---------------------- |
+| Default        | 0.50      | 0.71      | 0.70   | Balanced alerts        |
+| High recall    | 0.35      | 0.68      | 0.84   | Better safe than sorry |
+| High precision | 0.65      | 0.79      | 0.50   | Only warn if confident |
+### Train once
+```bash
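+make hourly      # fetch hourly data and export it (fetch_weather.sh + export_hourly.py)
+make rain-train  # train the classifier (runs scripts/train_rain_dual_thresholds.py)
+make rain-now    # predict from the latest hour in recall mode
+python scripts/train_rain_dual_thresholds.py  # same as `make rain-train`; only needed if you are not using make
+python scripts/plot_pr_roc.py  # refresh PR/ROC charts (same as `make eval-plots`)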
+```
+This produces:
+- `models/rain_classifier_hourly.joblib`
+- `models/rain_model_meta.json`
+- `results/pr_curve.png`, `results/roc_curve.png`
+### Predict from the latest hour
+```bash
+weather-cli rain --mode recall     # warn more often
+weather-cli rain --mode precision  # fewer false alarms
+```
+Example output:
+```
+2025-10-26 23:00:00 | P(rain ≤6h)=0.492 | mode=recall    thr=0.35 → RAIN
+2025-10-26 23:00:00 | P(rain ≤6h)=0.492 | mode=precision thr=0.65 → No rain
+```
+### How thresholds are chosen
+Training sweeps precision–recall trade-offs and stores two operating points:
+| Threshold type | Purpose                        |
+| -------------- | ------------------------------ |
+| High recall    | Catch >80 % of rain events     |
+| High precision | Warn only when ≥90 % confident |
+![PR Curve](results/pr_curve.png)
+![ROC Curve](results/roc_curve.png)
+### Model Interpretability
+ML is not useful unless we can understand what it learned. This section explains why the classifier predicts rain, and not just whether it predicts rain.
+- **Feature Coefficients (standardized):** which signals push toward rain vs no-rain
+  ```bash
+  python scripts/coef_rain.py  # writes top weights
+  ```
+  Output → `results/coef_top15.txt`
+- **Permutation importance:** which features matter most to F1 on the test set.
+  This tells us which variables the model relies on the most when making real predictions.
+  ```bash
+  python scripts/feature_importance_rain.py
+  ```
+  Output → `results/feature_importance.png`
+The model uses both the raw signals and engineered short-term features (hour-to-hour deltas and rolling means). Positive coefficients push toward “RAIN”, negative toward “No rain”.
+### What the model actually learned (top signals)
+| Feature      | Meaning                                                              |
+| ------------ | -------------------------------------------------------------------- |
+| `precip_mm`  | Existing rainfall strongly predicts more rain (tropical persistence) |
+| `temp_c`     | Warmer air holds more moisture → higher chance of near-term rain     |
+| `humidity`   | High saturation = cloud condensation is likely                       |
+| `pressure`   | Falling pressure indicates unstable atmosphere / storm formation     |
+| `cloudcover` | More clouds = conditions building toward rainfall                    |
+| `wind_speed` | Negative weight — stronger winds can disperse moisture               |
+The classifier isn’t guessing; it’s surfacing familiar meteorological patterns.
+### What drives the rain predictions?
+Using SHAP explainability, I found that the model mainly relies on **humidity** and **temperature** when deciding if it will rain in the next 12 hours.
+- High humidity pushes the model strongly toward predicting rain.
+- Lower temperatures slightly increase rain probability.
+- The interaction between humidity and temperature mimics real-world weather dynamics — humid, cool conditions tend to precede rainfall.
+This means the model isn’t just memorizing data — it has captured meaningful relationships that align with atmospheric science.
+![Humidity vs Temperature SHAP interaction](results/shap_interaction.png)
+> Generated via `python scripts/explain_shap_interaction.py`, which also writes `results/shap_interaction_rev.png` for the reverse view.
+## 🌧️ Rain Events (≥1.0 mm in next 12h)
+**Label:** “Rain event if cumulative precipitation ≥ **1.0 mm** within the next **12 hours**.”
+**Policy:** Default to **Early Warning** (recall-leaning) for Lagos conditions. Offer a stricter **Cautious Alert** mode.
+**Train / thresholds / predict**
+```bash
+# (data) pull 90 days of hourly data
+make hourly PAST_DAYS=90
+# (model) train XGBoost + Isotonic calibration
+python scripts/train_xgb_12h_calibrated.py
+# (CLI) two operating modes
+weather-cli rain --mode recall     # Early Warning (higher recall)
+weather-cli rain --mode precision  # Cautious Alert (stricter)
+weather-cli rain                   # Balanced (best F1)
+```
+### 🌧️ Rain Warning (next 12h)
+Train tuned model + set guarded thresholds:
+```bash
+python scripts/xgb_tune_timeseries.py
+python scripts/train_xgb_tuned_final.py
+cp models/rain_xgb_tuned.joblib    models/rain_classifier_hourly.joblib
+cp models/rain_xgb_tuned_meta.json models/rain_model_meta.json
+```
+## Run Locally
+Clone and run:
+```bash
+make all
+```
+## CLI
+Install (editable):
+```bash
+python3 -m venv .venv && source .venv/bin/activate
+pip install -e .
+```
+Once installed, run `weather-cli --help` for all commands (including the rain mode above).

app.py ADDED Viewed

	@@ -0,0 +1,156 @@

+import json
+from pathlib import Path
+import numpy as np
+import pandas as pd
+import streamlit as st
+import joblib
+import subprocess
+import os
+from datetime import datetime, timedelta
+# Settings
+MODEL_PATH = Path("models/rain_xgb_tuned.joblib")
+META_PATH  = Path("models/rain_xgb_tuned_meta.json")
+HOURLY_CSV = Path("results/hourly.csv")
+# Load model + meta
+@st.cache_resource
+def load_model():
+    if not (MODEL_PATH.exists() and META_PATH.exists()):
+        st.error("Trained model not found. Run: python scripts/xgb_tune_timeseries.py && python scripts/train_xgb_tuned_final.py")
+        st.stop()
+    clf = joblib.load(MODEL_PATH)
+    meta = json.loads(META_PATH.read_text())
+    return clf, meta
+def build_features_like_training(df: pd.DataFrame, features: list) -> pd.DataFrame:
+    from scripts.train_xgb_tuned_final import build_features  # reuse your code
+    Xdf = build_features(df)
+    return Xdf[features]
+def ensure_hourly(lat: float, lon: float, past_days: int = 90) -> pd.DataFrame:
+    env = os.environ.copy()
+    env["LAT"] = str(lat)
+    env["LON"] = str(lon)
+    env["PAST_DAYS"] = str(past_days)
+    # If file is missing or stale (>12h), refresh
+    needs_refresh = True
+    if HOURLY_CSV.exists():
+        age_hours = (datetime.now() - datetime.fromtimestamp(HOURLY_CSV.stat().st_mtime)).total_seconds() / 3600.0
+        needs_refresh = age_hours > 12
+    if (not HOURLY_CSV.exists()) or needs_refresh:
+        st.info("Fetching fresh hourly weather…")
+        subprocess.run(["bash", "scripts/fetch_weather.sh"], check=True, env=env)
+        subprocess.run(["python3", "scripts/export_hourly.py"], check=True, env=env)
+    return pd.read_csv(HOURLY_CSV, parse_dates=["time"])
+# UI
+st.set_page_config(page_title="Rain Nowcast (12h)", page_icon="🌧️", layout="centered")
+st.title("🌧️ Rain Nowcast — next 12 hours")
+clf, meta = load_model()
+features = meta["features"]
+thr = meta["thresholds"]
+horizon_h = meta["horizon_hours"]
+# Presets for cities
+CITY_PRESETS = {
+    "Lagos 🇳🇬":   (6.5244, 3.3792),
+    "Accra 🇬🇭":   (5.6037, -0.1870),
+    "Nairobi 🇰🇪": (-1.2864, 36.8172),
+    "Kampala 🇺🇬": (0.3476, 32.5825),
+    "Addis 🇪🇹":   (8.9806, 38.7578),
+}
+col1, col2 = st.columns(2)
+with col1:
+    city = st.selectbox("City", list(CITY_PRESETS.keys()), index=0)
+with col2:
+    mode = st.selectbox("Decision mode", ["default", "recall", "precision"], index=0)
+lat, lon = CITY_PRESETS[city]
+st.caption(f"Lat/Lon: **{lat:.4f}, {lon:.4f}** • Horizon: **{horizon_h}h** • Mode: **{mode}**")
+df = ensure_hourly(lat, lon, past_days=90)
+Xdf = build_features_like_training(df.copy(), features)
+if Xdf.empty:
+    st.error("Not enough data to build features. Try again after fetch.")
+    st.stop()
+x_last = Xdf.iloc[[-1]].values
+p = float(clf.predict_proba(x_last)[0, 1])
+thr_map = {
+    "default":   float(thr["default"]),
+    "recall":    float(thr["high_recall"]),
+    "precision": float(thr["high_precision"]),
+}
+t = thr_map[mode]
+decision = "RAIN" if p >= t else "No rain"
+st.subheader("Prediction")
+st.metric(
+    label=f"P(rain ≤ {horizon_h}h)",
+    value=f"{p:.3f}",
+    delta=f"threshold={t:.2f}",
+    delta_color="inverse" if p < t else "normal"
+)
+st.markdown(
+    f"**Decision:** {'🌧️ RAIN' if decision=='RAIN' else '✅ No rain'}  "
+    f"(mode **{mode}**, threshold **{t:.2f}**)"
+)
+st.subheader("Last 48h — context")
+last48 = df.tail(48).copy()
+c1, c2 = st.columns(2)
+with c1:
+    st.line_chart(data=last48.set_index("time")[["temp_c", "humidity"]])
+with c2:
+    st.line_chart(data=last48.set_index("time")[["precip_mm", "rain_mm"]])
+# --- Probability sparkline over last 48h ---
+st.subheader("Last 48h — rain probability")
+# Recompute probabilities for all available rows, then show last 48 aligned to time
+probas_all = clf.predict_proba(Xdf.values)[:, 1]
+proba_series = pd.Series(probas_all, index=Xdf.index, name="p_rain")
+# Align times (Xdf is derived from df; both share row order except dropped NaNs at head)
+times_aligned = df.loc[Xdf.index, "time"]
+last48_p = pd.DataFrame({"time": times_aligned, "p_rain": proba_series}).tail(48).set_index("time")
+st.line_chart(last48_p)
+# --- Download buttons ---
+st.subheader("Downloads")
+st.download_button(
+    label="⬇️ Download hourly.csv",
+    data=df.to_csv(index=False).encode("utf-8"),
+    file_name="hourly.csv",
+    mime="text/csv",
+)
+latest_frame = pd.DataFrame({
+    "time": [df.loc[Xdf.index, "time"].iloc[-1]],
+    "p_rain_next_12h": [p],
+    "mode": [mode],
+    "threshold": [t],
+    "decision": [decision],
+})
+st.download_button(
+    label="⬇️ Download latest_prediction.csv",
+    data=latest_frame.to_csv(index=False).encode("utf-8"),
+    file_name="latest_prediction.csv",
+    mime="text/csv",
+)
+# Explain thresholds
+with st.expander("What do these modes mean?"):
+    st.write("""
+- **default**: balanced (good everyday choice)
+- **recall**: warn more (catches more rain, may over-warn)
+- **precision**: be picky (alerts are rare but confident)
+""")
+st.caption("Model: XGBoost (tuned) • Features rebuilt exactly like training • Data: Open-Meteo hourly")

app/main.py ADDED Viewed

	@@ -0,0 +1,293 @@

+"""
+FastAPI application exposing the rain nowcast API and a Gradio UI.
+The previous Streamlit proxy was difficult to keep alive on Spaces due to
+websocket restrictions.  This module provides the same REST endpoints while
+mounting a lightweight Gradio front-end so the UI works without websocket
+tunnelling.
+"""
+from __future__ import annotations
+import os
+import json
+import subprocess
+from datetime import datetime
+from pathlib import Path
+from typing import Dict, Tuple
+import joblib
+import pandas as pd
+import gradio as gr
+from fastapi import FastAPI, HTTPException, Query
+from pydantic import BaseModel, Field
+from xgboost import XGBClassifier
+# --------- Paths ---------
+ROOT = Path(__file__).resolve().parents[1]
+MODELS = ROOT / "models"
+RESULTS = ROOT / "results"
+SCRIPTS = ROOT / "scripts"
+MODEL_PATH = MODELS / "rain_xgb_tuned.joblib"
+META_PATH = MODELS / "rain_xgb_tuned_meta.json"
+MODEL_JSON_PATH = MODELS / "xgb_tuned.json"
+HOURLY_CSV = RESULTS / "hourly.csv"
+# Make training utilities importable.
+import sys
+sys.path.insert(0, str(ROOT))
+from scripts.train_xgb_tuned_final import build_features  # type: ignore
+# --------- Load model + meta at startup ---------
+if not META_PATH.exists():
+    raise RuntimeError(
+        "Model metadata missing. Run `python scripts/train_xgb_tuned_final.py` "
+        "or copy models/rain_xgb_tuned_meta.json into place."
+    )
+meta = json.loads(META_PATH.read_text())
+FEATURES = meta["features"]
+THRESH = meta["thresholds"]
+HORIZON_H = int(meta["horizon_hours"])
+def _load_model() -> XGBClassifier:
+    if MODEL_PATH.exists():
+        return joblib.load(MODEL_PATH)
+    if MODEL_JSON_PATH.exists():
+        params = meta.get("model", {}).get("params", {})
+        booster = XGBClassifier(**params)
+        booster.load_model(MODEL_JSON_PATH)
+        return booster
+    raise RuntimeError(
+        "Model artifact missing. Run `python scripts/train_xgb_tuned_final.py` "
+        "to generate models/rain_xgb_tuned.joblib (or xgb_tuned.json), "
+        "or copy the trained file into the models/ directory."
+    )
+model = _load_model()
+# --------- Helpers ---------
+def ensure_hourly(lat: float, lon: float, past_days: int = 90) -> pd.DataFrame:
+    """Refresh the cached hourly CSV when it is missing or stale."""
+    env = os.environ.copy()
+    env["LAT"] = str(lat)
+    env["LON"] = str(lon)
+    env["PAST_DAYS"] = str(past_days)
+    needs_refresh = True
+    if HOURLY_CSV.exists():
+        age_hours = (datetime.now().timestamp() - HOURLY_CSV.stat().st_mtime) / 3600
+        needs_refresh = age_hours > 6
+    if (not HOURLY_CSV.exists()) or needs_refresh:
+        try:
+            subprocess.run(["bash", str(SCRIPTS / "fetch_weather.sh")], check=True, env=env)
+            subprocess.run(["python3", str(SCRIPTS / "export_hourly.py")], check=True, env=env)
+        except subprocess.CalledProcessError as exc:
+            raise HTTPException(status_code=502, detail=f"Data refresh failed: {exc}") from exc
+    return pd.read_csv(HOURLY_CSV, parse_dates=["time"])
+def predict_latest(df: pd.DataFrame, mode: str) -> Dict[str, object]:
+    """Build features, score the latest hour, and return a structured response."""
+    Xdf = build_features(df.copy())
+    if Xdf.empty:
+        raise HTTPException(status_code=422, detail="Not enough rows to build features.")
+    try:
+        Xdf = Xdf[FEATURES]
+    except KeyError as exc:
+        raise HTTPException(status_code=500, detail=f"Feature mismatch: {exc}") from exc
+    x = Xdf.iloc[[-1]].values
+    probability = float(model.predict_proba(x)[0, 1])
+    thresholds = {
+        "default": float(THRESH["default"]),
+        "recall": float(THRESH["high_recall"]),
+        "precision": float(THRESH["high_precision"]),
+    }
+    if mode not in thresholds:
+        raise HTTPException(status_code=400, detail=f"Unsupported mode '{mode}'.")
+    threshold = thresholds[mode]
+    decision = "RAIN" if probability >= threshold else "No rain"
+    ts = df.loc[Xdf.index, "time"].iloc[-1]
+    return {
+        "timestamp": ts.isoformat(),
+        "probability": probability,
+        "threshold": threshold,
+        "mode": mode,
+        "decision": decision,
+        "horizon_hours": HORIZON_H,
+    }
+def format_prediction(result: Dict[str, object]) -> str:
+    """Generate a concise markdown summary for the UI."""
+    emoji = "🌧️" if result["decision"] == "RAIN" else "✅"
+    probability = result["probability"]
+    threshold = result["threshold"]
+    mode = result["mode"]
+    timestamp = result["timestamp"]
+    return (
+        f"{emoji} **Decision:** {result['decision']} (mode **{mode}**)\n\n"
+        f"- Probability of rain ≤ {HORIZON_H}h: **{probability:.3f}**\n"
+        f"- Threshold: **{threshold:.2f}**\n"
+        f"- Issued for hour ending **{timestamp}**"
+    )
+class PredictBody(BaseModel):
+    lat: float = Field(6.5244, description="Latitude")
+    lon: float = Field(3.3792, description="Longitude")
+    mode: str = Field("default", description="default | recall | precision")
+    past_days: int = Field(90, ge=14, le=180, description="How much history to fetch (days)")
+app = FastAPI(title="Rain Nowcast API", version="1.1.0")
+@app.get("/health")
+def health() -> Dict[str, object]:
+    return {
+        "status": "ok",
+        "model_file": MODEL_PATH.name,
+        "horizon_hours": HORIZON_H,
+        "thresholds": THRESH,
+        "features": FEATURES,
+    }
+@app.post("/predict")
+def predict(body: PredictBody) -> Dict[str, object]:
+    df = ensure_hourly(body.lat, body.lon, body.past_days)
+    out = predict_latest(df, body.mode)
+    return {"ok": True, "result": out}
+@app.get("/predict")
+def predict_get(
+    lat: float = Query(6.5244),
+    lon: float = Query(3.3792),
+    mode: str = Query("default"),
+    past_days: int = Query(90, ge=14, le=180),
+) -> Dict[str, object]:
+    df = ensure_hourly(lat, lon, past_days)
+    out = predict_latest(df, mode)
+    return {"ok": True, "result": out}
+# --------- Gradio UI ---------
+CITY_PRESETS: Dict[str, Tuple[float, float]] = {
+    "Lagos 🇳🇬": (6.5244, 3.3792),
+    "Accra 🇬🇭": (5.6037, -0.1870),
+    "Nairobi 🇰🇪": (-1.2864, 36.8172),
+    "Kampala 🇺🇬": (0.3476, 32.5825),
+    "Addis Ababa 🇪🇹": (8.9806, 38.7578),
+    "Custom": (0.0, 0.0),
+}
+def _resolve_location(city: str, lat: float, lon: float) -> Tuple[float, float, str]:
+    if city in CITY_PRESETS and city != "Custom":
+        chosen_lat, chosen_lon = CITY_PRESETS[city]
+        label = city
+    else:
+        chosen_lat, chosen_lon = lat, lon
+        label = f"Custom ({lat:.3f}, {lon:.3f})"
+    return chosen_lat, chosen_lon, label
+def gradio_predict(
+    city: str,
+    lat: float,
+    lon: float,
+    mode: str,
+    past_days: int,
+) -> Tuple[str, pd.DataFrame, pd.DataFrame]:
+    chosen_lat, chosen_lon, label = _resolve_location(city, lat, lon)
+    df = ensure_hourly(chosen_lat, chosen_lon, past_days)
+    result = predict_latest(df, mode)
+    summary = format_prediction(result)
+    last48 = df.tail(48).copy()
+    last48.set_index("time", inplace=True)
+    chart = last48[["temp_c", "humidity", "precip_mm", "rain_mm"]]
+    latest = pd.DataFrame(
+        {
+            "location": [label],
+            "timestamp": [result["timestamp"]],
+            "mode": [result["mode"]],
+            "probability": [result["probability"]],
+            "threshold": [result["threshold"]],
+            "decision": [result["decision"]],
+        }
+    )
+    return summary, latest, chart
+with gr.Blocks(css=".gradio-container {max-width: 900px;}") as demo:
+    gr.Markdown("# 🌧️ Rain Nowcast\nPredict the probability of rain in the next "
+                f"{HORIZON_H} hours using the tuned XGBoost model.")
+    with gr.Row():
+        city_input = gr.Dropdown(
+            label="City preset",
+            choices=list(CITY_PRESETS.keys()),
+            value="Lagos 🇳🇬",
+        )
+        mode_input = gr.Radio(
+            label="Decision mode",
+            choices=["default", "recall", "precision"],
+            value="default",
+            info="default=balanced, recall=warn more, precision=extra picky",
+        )
+    with gr.Row():
+        lat_input = gr.Number(label="Latitude (used if city is Custom)", value=6.5244)
+        lon_input = gr.Number(label="Longitude (used if city is Custom)", value=3.3792)
+        past_days_input = gr.Slider(
+            label="History window (days)",
+            minimum=14,
+            maximum=180,
+            value=90,
+            step=1,
+        )
+    submit = gr.Button("Run prediction", variant="primary")
+    summary_md = gr.Markdown()
+    latest_df = gr.Dataframe(label="Latest prediction", wrap=True)
+    chart_df = gr.LinePlot(
+        label="Last 48h weather (hourly)",
+        x="time",
+        y=["temp_c", "humidity", "precip_mm", "rain_mm"],
+        overlay_point=True,
+        width="100%",
+        height=350,
+    )
+    submit.click(
+        gradio_predict,
+        inputs=[city_input, lat_input, lon_input, mode_input, past_days_input],
+        outputs=[summary_md, latest_df, chart_df],
+    )
+    gr.Markdown(
+        "Model features match the training pipeline "
+        "(see `scripts/train_xgb_tuned_final.py`). Data fetched from Open-Meteo."
+    )
+app = gr.mount_gradio_app(app, demo, path="/")

assets/cover.png ADDED Viewed

assets/feature_importance.png ADDED Viewed

assets/pr_curve.png ADDED Viewed

assets/precip.png ADDED Viewed

assets/roc_curve.png ADDED Viewed

assets/temps.png ADDED Viewed

models/rain_model_meta.json ADDED Viewed

	@@ -0,0 +1,84 @@

+{
+  "features": [
+    "temp_c",
+    "humidity",
+    "cloudcover",
+    "pressure",
+    "wind_speed",
+    "precip_mm",
+    "rain_mm",
+    "d_temp_c",
+    "d_humidity",
+    "d_cloudcover",
+    "d_pressure",
+    "d_wind_speed",
+    "d_precip_mm",
+    "d_rain_mm",
+    "ma3_temp_c",
+    "ma3_humidity",
+    "ma3_cloudcover",
+    "ma3_pressure",
+    "ma3_wind_speed",
+    "ma3_precip_mm",
+    "ma3_rain_mm"
+  ],
+  "horizon_hours": 12,
+  "thresholds": {
+    "default": 0.22239363491823944,
+    "high_recall": 0.22239363491823944,
+    "high_precision": 0.745196376322855
+  },
+  "metrics": {
+    "default": {
+      "threshold": 0.22239363491823944,
+      "precision": 0.8433098591549296,
+      "recall": 0.98559670781893,
+      "f1": 0.9089184060721063,
+      "auc": 0.7839506172839507,
+      "cm": [
+        [
+          13,
+          89
+        ],
+        [
+          7,
+          479
+        ]
+      ]
+    },
+    "high_recall": {
+      "threshold": 0.22239363491823944,
+      "precision": 0.8433098591549296,
+      "recall": 0.98559670781893,
+      "f1": 0.9089184060721063,
+      "auc": 0.7839506172839507,
+      "cm": [
+        [
+          13,
+          89
+        ],
+        [
+          7,
+          479
+        ]
+      ]
+    },
+    "high_precision": {
+      "threshold": 0.745196376322855,
+      "precision": 0.9033018867924528,
+      "recall": 0.7880658436213992,
+      "f1": 0.8417582417582418,
+      "auc": 0.7839506172839507,
+      "cm": [
+        [
+          61,
+          41
+        ],
+        [
+          103,
+          383
+        ]
+      ]
+    }
+  }
+}

models/rain_xgb_cal_meta.json ADDED Viewed

	@@ -0,0 +1,117 @@

+{
+  "model_type": "xgboost+isotonic",
+  "features": [
+    "temp_c",
+    "humidity",
+    "cloudcover",
+    "pressure",
+    "wind_speed",
+    "precip_mm",
+    "rain_mm",
+    "d_temp_c",
+    "d_humidity",
+    "d_cloudcover",
+    "d_pressure",
+    "d_wind_speed",
+    "d_precip_mm",
+    "d_rain_mm",
+    "ma3_temp_c",
+    "ma3_humidity",
+    "ma3_cloudcover",
+    "ma3_pressure",
+    "ma3_wind_speed",
+    "ma3_precip_mm",
+    "ma3_rain_mm",
+    "d3_pressure",
+    "d3_humidity",
+    "d3_cloudcover",
+    "d3_temp_c",
+    "dew_proxy",
+    "d_dew_proxy",
+    "ma3_dew_proxy",
+    "rain_sum_3h",
+    "rain_sum_6h",
+    "rain_sum_12h",
+    "rain_sum_24h",
+    "rain_max_6h",
+    "rain_max_12h",
+    "dry_streak_h",
+    "wet_streak_h",
+    "hour_sin",
+    "hour_cos",
+    "dow_sin",
+    "dow_cos",
+    "hum_x_cloud",
+    "wind_x_cloud",
+    "press_drop_3h"
+  ],
+  "horizon_hours": 12,
+  "event_mm": 1.0,
+  "label_desc": "Rain event if cumulative precip \u2265 1.0 mm in next 12h",
+  "thresholds": {
+    "default": 0.5123772621154785,
+    "high_recall": 0.26928117871284485,
+    "high_precision": 0.6026621460914612
+  },
+  "metrics": {
+    "default": {
+      "threshold": 0.5123772621154785,
+      "precision": 0.5376344086021505,
+      "recall": 0.8438818565400844,
+      "f1": 0.6568144499178982,
+      "auc": 0.7253714914694552,
+      "cm": [
+        [
+          173,
+          172
+        ],
+        [
+          37,
+          200
+        ]
+      ],
+      "pos_rate": 0.6391752577319587
+    },
+    "high_recall": {
+      "threshold": 0.26928117871284485,
+      "precision": 0.4976190476190476,
+      "recall": 0.8818565400843882,
+      "f1": 0.6362252663622526,
+      "auc": 0.7253714914694552,
+      "cm": [
+        [
+          134,
+          211
+        ],
+        [
+          28,
+          209
+        ]
+      ],
+      "pos_rate": 0.7216494845360825
+    },
+    "high_precision": {
+      "threshold": 0.6026621460914612,
+      "precision": 0.6490384615384616,
+      "recall": 0.569620253164557,
+      "f1": 0.6067415730337079,
+      "auc": 0.7253714914694552,
+      "cm": [
+        [
+          272,
+          73
+        ],
+        [
+          102,
+          135
+        ]
+      ],
+      "pos_rate": 0.35738831615120276
+    }
+  },
+  "policy": {
+    "default": "best F1 (balanced, early-warning baseline)",
+    "high_recall": "recall\u22650.88 & precision\u22650.55 & pos_rate\u22640.80",
+    "high_precision": "precision\u22650.80 & recall\u22650.45 (Moderate)"
+  }
+}

models/rain_xgb_meta.json ADDED Viewed

	@@ -0,0 +1,94 @@

+{
+  "features": [
+    "temp_c",
+    "humidity",
+    "cloudcover",
+    "pressure",
+    "wind_speed",
+    "precip_mm",
+    "rain_mm",
+    "d_temp_c",
+    "d_humidity",
+    "d_cloudcover",
+    "d_pressure",
+    "d_wind_speed",
+    "d_precip_mm",
+    "d_rain_mm",
+    "ma3_temp_c",
+    "ma3_humidity",
+    "ma3_cloudcover",
+    "ma3_pressure",
+    "ma3_wind_speed",
+    "ma3_precip_mm",
+    "ma3_rain_mm",
+    "pressure_d3h",
+    "humidity_d3h",
+    "cloudcover_d3h",
+    "dew_proxy",
+    "d_dew_proxy",
+    "ma3_dew_proxy"
+  ],
+  "horizon_hours": 12,
+  "thresholds": {
+    "default": 0.4454699456691742,
+    "high_recall": 0.4454699456691742,
+    "high_precision": 0.8384796977043152
+  },
+  "metrics": {
+    "default": {
+      "threshold": 0.4454699456691742,
+      "precision": 0.8151093439363817,
+      "recall": 0.9403669724770642,
+      "f1": 0.873269435569755,
+      "auc": 0.7489787718475859,
+      "cm": [
+        [
+          44,
+          93
+        ],
+        [
+          26,
+          410
+        ]
+      ],
+      "pos_rate": 0.8778359511343804
+    },
+    "high_recall": {
+      "threshold": 0.4454699456691742,
+      "precision": 0.8151093439363817,
+      "recall": 0.9403669724770642,
+      "f1": 0.873269435569755,
+      "auc": 0.7489787718475859,
+      "cm": [
+        [
+          44,
+          93
+        ],
+        [
+          26,
+          410
+        ]
+      ],
+      "pos_rate": 0.8778359511343804
+    },
+    "high_precision": {
+      "threshold": 0.8384796977043152,
+      "precision": 0.9012345679012346,
+      "recall": 0.5022935779816514,
+      "f1": 0.6450662739322534,
+      "auc": 0.7489787718475859,
+      "cm": [
+        [
+          113,
+          24
+        ],
+        [
+          217,
+          219
+        ]
+      ],
+      "pos_rate": 0.42408376963350786
+    }
+  },
+  "model_type": "xgboost"
+}

models/rain_xgb_tuned_meta.json ADDED Viewed

	@@ -0,0 +1,111 @@

+{
+  "features": [
+    "temp_c",
+    "humidity",
+    "cloudcover",
+    "pressure",
+    "wind_speed",
+    "precip_mm",
+    "rain_mm",
+    "d_temp_c",
+    "d_humidity",
+    "d_cloudcover",
+    "d_pressure",
+    "d_wind_speed",
+    "d_precip_mm",
+    "d_rain_mm",
+    "ma3_temp_c",
+    "ma3_humidity",
+    "ma3_cloudcover",
+    "ma3_pressure",
+    "ma3_wind_speed",
+    "ma3_precip_mm",
+    "ma3_rain_mm",
+    "d3_pressure",
+    "d3_humidity",
+    "d3_cloudcover",
+    "d3_temp_c",
+    "dew_proxy",
+    "d_dew_proxy",
+    "ma3_dew_proxy",
+    "rain_sum_3h",
+    "rain_sum_6h",
+    "rain_sum_12h",
+    "rain_sum_24h",
+    "rain_max_6h",
+    "rain_max_12h",
+    "dry_streak_h",
+    "wet_streak_h",
+    "hour_sin",
+    "hour_cos",
+    "dow_sin",
+    "dow_cos",
+    "hoy_sin",
+    "hoy_cos",
+    "hum_x_cloud",
+    "wind_x_cloud",
+    "press_drop_3h",
+    "press_drop_6h"
+  ],
+  "horizon_hours": 12,
+  "event_mm": 1.0,
+  "model": {
+    "type": "xgboost",
+    "params": {
+      "learning_rate": 0.05,
+      "max_depth": 3,
+      "n_estimators": 500,
+      "subsample": 0.8,
+      "colsample_bytree": 0.8,
+      "min_child_weight": 3
+    }
+  },
+  "thresholds": {
+    "default": 0.15,
+    "high_recall": 0.1,
+    "high_precision": 0.6
+  },
+  "cv_mean": {
+    "P": 0.6167141877942365,
+    "R": 0.40142749648205356,
+    "F1": 0.4687631522470538,
+    "AUC": 0.6838816207078178
+  },
+  "cv_folds": [
+    {
+      "P": 0.44,
+      "R": 0.4782608695652174,
+      "F1": 0.4583333333333333,
+      "AUC": 0.6755671077504725,
+      "thr": 0.15
+    },
+    {
+      "P": 0.7757009345794392,
+      "R": 0.4088669950738916,
+      "F1": 0.535483870967742,
+      "AUC": 0.7078279587697148,
+      "thr": 0.4764537811279297
+    },
+    {
+      "P": 0.82,
+      "R": 0.4270833333333333,
+      "F1": 0.5616438356164384,
+      "AUC": 0.6740785256410257,
+      "thr": 0.9872803688049316
+    },
+    {
+      "P": 0.32323232323232326,
+      "R": 0.3764705882352941,
+      "F1": 0.34782608695652173,
+      "AUC": 0.6218912881608338,
+      "thr": 0.9168330430984497
+    },
+    {
+      "P": 0.7246376811594203,
+      "R": 0.31645569620253167,
+      "F1": 0.44052863436123346,
+      "AUC": 0.7400432232170422,
+      "thr": 0.8837475776672363
+    }
+  ]
+}

models/xgb_tuned.json ADDED Viewed

	@@ -0,0 +1,11 @@

+{
+  "params": {
+    "learning_rate": 0.05,
+    "max_depth": 3,
+    "n_estimators": 500,
+    "subsample": 0.8,
+    "colsample_bytree": 0.8,
+    "min_child_weight": 3
+  },
+  "mean_f1": 0.5780780663579321
+}

pyproject.toml ADDED Viewed

	@@ -0,0 +1,26 @@

+[project]
+name = "weather-data-fetcher"
+version = "0.2.0"
+description = "Fetch, process, and visualize daily weather from Open-Meteo."
+readme = "README.md"
+requires-python = ">=3.10"
+authors = [{ name = "Elvis Anselm" }]
+# Plain-string SPDX expression (PEP 639 form); see the setuptools floor in
+# [build-system] below, which must be new enough to accept it.
+license = "MIT"
+dependencies = [
+  "requests",
+  "pandas",
+  "matplotlib",
+  "python-dotenv",
+  "pillow"
+]
+[tool.setuptools.packages.find]
+include = ["weather_cli*"]
+exclude = ["data*", "logs*", "results*", "assets*"]
+[project.scripts]
+weather-cli = "weather_cli.cli:main"
+[build-system]
+# setuptools releases before 77 only accept `license` as a {text=...} or
+# {file=...} table and fail validation on the string form used above, so the
+# minimum version has to be 77 or newer.
+requires = ["setuptools>=77.0.3", "wheel"]
+build-backend = "setuptools.build_meta"

render.yaml ADDED Viewed

	@@ -0,0 +1,8 @@

+services:
+  - type: web
+    name: rain-nowcast-api
+    env: docker
+    autoDeploy: true
+    plan: free
+    dockerCommand: null
+    healthCheckPath: /health

requirements.txt ADDED Viewed

	@@ -0,0 +1,32 @@

+certifi==2025.10.5
+charset-normalizer==3.4.4
+contourpy==1.3.3
+cycler==0.12.1
+fonttools==4.60.1
+idna==3.11
+joblib==1.5.2
+kiwisolver==1.4.9
+matplotlib==3.10.7
+numpy==2.3.4
+packaging==25.0
+pandas==2.3.3
+pillow==12.0.0
+pyparsing==3.2.5
+python-dateutil==2.9.0.post0
+python-dotenv==1.1.1
+pytz==2025.2
+requests==2.32.5
+scikit-learn==1.7.2
+scipy==1.16.2
+six==1.17.0
+threadpoolctl==3.6.0
+tzdata==2025.2
+urllib3==2.5.0
+-e git+https://github.com/Elvaceishim/weather_data_fetcher.git@ac53d9c31c4be6eda7988f97e1768f998c7a9f0a#egg=weather_data_fetcher
+fastapi
+uvicorn[standard]
+pydantic
+xgboost
+huggingface_hub
+streamlit
+gradio

scripts/analyze_weather.py ADDED Viewed

	@@ -0,0 +1,11 @@

+import pandas as pd
+df = pd.read_csv("results/summary.csv")
+print("\n=== HEAD ===")
+print(df.head())
+print("\n=== DESCRIBE ===")
+print(df.describe())
+print("\n=== COLUMNS ===")
+print(df.columns)
+print("\n=== MISSING VALUES ===")
+print(df.isna().sum())

scripts/backfill_labels.py ADDED Viewed

	@@ -0,0 +1,53 @@

+#!/usr/bin/env python3
+import json, os, argparse
+from pathlib import Path
+from datetime import datetime, timedelta
+import pandas as pd, numpy as np, subprocess
+META  = Path("models/rain_xgb_tuned_meta.json")
+LOGS  = Path("logs")
+PRED_LOG = LOGS / "predictions.csv"
+def ensure_hourly(lat, lon, past_days=120):
+    env = os.environ.copy()
+    env["LAT"], env["LON"], env["PAST_DAYS"] = str(lat), str(lon), str(past_days)
+    subprocess.run(["bash", "scripts/fetch_weather.sh"], check=True, env=env)
+    subprocess.run(["python3", "scripts/export_hourly.py"], check=True, env=env)
+    return pd.read_csv("results/hourly.csv", parse_dates=["time"])
+def label_from_df(df, ts_pred, horizon_h, event_mm):
+    # find the row with time == ts_pred, then sum next H hours of precip_mm
+    # allow slight mismatch by nearest timestamp within 1 hour
+    idx = (df["time"] - ts_pred).abs().idxmin()
+    if abs((df.loc[idx, "time"] - ts_pred).total_seconds()) > 3600:
+        return None  # can't align
+    end_idx = min(idx + horizon_h, len(df)-1)
+    total = float(np.nansum(df.loc[idx+1:end_idx, "precip_mm"]))
+    return 1 if total >= event_mm else 0
+def main():
+    if not PRED_LOG.exists():
+        print("No predictions.csv found.")
+        return
+    meta = json.loads(Path(META).read_text())
+    H = int(meta["horizon_hours"]); event_mm = float(meta["event_mm"])
+    df = pd.read_csv(PRED_LOG, parse_dates=["ts_pred","logged_at"])
+    updated = 0
+    for i, row in df[df["y_true"].isna() | (df["y_true"]=="")].iterrows():
+        ts_pred = row["ts_pred"]
+        if datetime.now() < ts_pred + timedelta(hours=H):
+            continue  # horizon not passed yet
+        # fetch enough history to cover that timestamp
+        hdf = ensure_hourly(row["lat"], row["lon"], past_days=120)
+        y = label_from_df(hdf, ts_pred, H, event_mm)
+        if y is not None:
+            df.at[i, "y_true"] = int(y)
+            updated += 1
+    df.to_csv(PRED_LOG, index=False)
+    print(f"Backfilled {updated} rows into {PRED_LOG}")
+if __name__ == "__main__":
+    main()

scripts/coef_rain.py ADDED Viewed

	@@ -0,0 +1,32 @@

+import json, joblib, pandas as pd
+from sklearn.model_selection import train_test_split
+meta = json.load(open("models/rain_model_meta.json"))
+clf = joblib.load("models/rain_classifier_hourly.joblib")
+df = pd.read_csv("results/hourly.csv", parse_dates=["time"])
+base = ["temp_c","humidity","cloudcover","pressure","wind_speed","precip_mm","rain_mm"]
+for c in base:
+    df[f"d_{c}"] = df[c].diff()
+    df[f"ma3_{c}"] = df[c].rolling(3).mean()
+df = df.dropna().reset_index(drop=True)
+X = df[meta["features"]].values
+y = None
+logreg = clf.named_steps["logreg"]
+coefs = logreg.coef_[0]
+features = meta["features"]
+rank = sorted(zip(features, coefs), key=lambda x: abs(x[1]), reverse=True)
+out_lines = ["Feature coefficients (standardized space):"]
+for name, w in rank[:15]:
+    out_lines.append(f"{name:20s} {w:+.3f}")
+print("\n".join(out_lines))
+with open("results/coef_top15.txt", "w") as f:
+    f.write("\n".join(out_lines))
+print("✅ Wrote results/coef_top15.txt")

scripts/cron_predict.sh ADDED Viewed

	@@ -0,0 +1,48 @@

+#!/usr/bin/env bash
+# Cron entry point: take a lock, run one logged rain prediction, rotate the log.
+# Usage: cron_predict.sh [MODE] [CITY] [LAT] [LON] [PAST_DAYS]
+set -euo pipefail
+# --- Resolve repo root ---
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
+cd "$REPO_ROOT"
+# --- Lock to avoid overlapping runs (portable; no flock needed) ---
+# mkdir is atomic: it fails if the directory already exists, which is the lock test.
+mkdir -p logs
+LOCKDIR="logs/.predict.lock"
+if ! mkdir "$LOCKDIR" 2>/dev/null; then
+  echo "[$(date '+%F %T')] Another run is in progress. Skipping."
+  exit 0
+fi
+# Release the lock on any exit path of this script.
+trap 'rmdir "$LOCKDIR" 2>/dev/null || true' EXIT
+# --- Args & defaults ---
+MODE="${1:-default}"
+CITY="${2:-Lagos}"
+LAT="${3:-6.5244}"
+LON="${4:-3.3792}"
+PAST_DAYS="${5:-90}"
+# --- Activate venv if present ---
+if [[ -f ".venv/bin/activate" ]]; then
+  # shellcheck disable=SC1091
+  source .venv/bin/activate
+fi
+# --- Environment for fetch scripts ---
+export LAT="$LAT" LON="$LON" PAST_DAYS="$PAST_DAYS"
+# --- Run one logged prediction ---
+echo "[$(date '+%F %T')] cron_predict: city=$CITY lat=$LAT lon=$LON mode=$MODE days=$PAST_DAYS"
+python3 scripts/log_predict.py --city "$CITY" --lat "$LAT" --lon "$LON" --mode "$MODE" || {
+  echo "[$(date '+%F %T')] ERROR: log_predict failed"
+  exit 1
+}
+# --- (Optional) basic log rotation (keep log under ~1MB) ---
+# The size is read with `wc -c` because `stat -f%z` only works with BSD/macOS
+# stat; GNU stat on Linux interprets -f as "file system" mode instead.
+LOGFILE="logs/cron.log"
+if [[ -f "$LOGFILE" ]] && (( $(wc -c < "$LOGFILE") > 1048576 )); then
+  mv "$LOGFILE" "logs/cron_$(date +%Y%m%d_%H%M%S).log" || true
+fi
+echo "[$(date '+%F %T')] cron_predict: done."

scripts/cv_benchmark.py ADDED Viewed

	@@ -0,0 +1,243 @@

+import json, warnings
+from pathlib import Path
+import numpy as np
+import pandas as pd
+from sklearn.model_selection import TimeSeriesSplit
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import StandardScaler, RobustScaler
+from sklearn.linear_model import LogisticRegression
+from sklearn.metrics import precision_recall_fscore_support, precision_recall_curve
+warnings.filterwarnings("ignore")
+H = 12
+EVENT_MM = 1.0
+HOURLY = Path("results/hourly.csv")
+META   = Path("models/rain_model_meta.json")
+# -----------------------------
+# Feature builder (same as CLI/trainer)
+# -----------------------------
+def rebuild_features_like_training(df: pd.DataFrame, features_from_meta: list) -> pd.DataFrame:
+    required = {"time","temp_c","humidity","cloudcover","pressure","wind_speed","precip_mm","rain_mm"}
+    missing = required - set(df.columns)
+    if missing:
+        raise ValueError(f"Hourly data missing columns: {sorted(missing)}")
+    base = ["temp_c","humidity","cloudcover","pressure","wind_speed","precip_mm","rain_mm"]
+    for c in base:
+        df[f"d_{c}"]   = df[c].diff()
+        df[f"ma3_{c}"] = df[c].rolling(3).mean()
+    for c in ["pressure","humidity","cloudcover","temp_c"]:
+        df[f"d3_{c}"] = df[c] - df[c].shift(3)
+    df["dew_proxy"]     = df["temp_c"] - (df["humidity"] / 5.0)
+    df["d_dew_proxy"]   = df["dew_proxy"].diff()
+    df["ma3_dew_proxy"] = df["dew_proxy"].rolling(3).mean()
+    df["rain_sum_3h"]   = df["precip_mm"].rolling(3).sum()
+    df["rain_sum_6h"]   = df["precip_mm"].rolling(6).sum()
+    df["rain_sum_12h"]  = df["precip_mm"].rolling(12).sum()
+    df["rain_sum_24h"]  = df["precip_mm"].rolling(24).sum()
+    df["rain_max_6h"]   = df["precip_mm"].rolling(6).max()
+    df["rain_max_12h"]  = df["precip_mm"].rolling(12).max()
+    is_raining = (df["precip_mm"] > 0).astype(int)
+    dry = (~(is_raining.astype(bool))).astype(int)
+    df["dry_streak_h"] = (dry.groupby((dry != dry.shift()).cumsum()).cumcount() + 1) * dry
+    df["dry_streak_h"] = df["dry_streak_h"].where(dry == 1, 0)
+    wet = is_raining
+    df["wet_streak_h"] = (wet.groupby((wet != wet.shift()).cumsum()).cumcount() + 1) * wet
+    df["wet_streak_h"] = df["wet_streak_h"].where(wet == 1, 0)
+    df["hour"] = df["time"].dt.hour
+    df["dow"]  = df["time"].dt.dayofweek
+    df["doy"]  = df["time"].dt.dayofyear
+    df["hoy"]  = (df["doy"] - 1) * 24 + df["hour"]
+    df["hour_sin"] = np.sin(2*np.pi*df["hour"]/24.0)
+    df["hour_cos"] = np.cos(2*np.pi*df["hour"]/24.0)
+    df["dow_sin"]  = np.sin(2*np.pi*df["dow"]/7.0)
+    df["dow_cos"]  = np.cos(2*np.pi*df["dow"]/7.0)
+    df["hoy_sin"]  = np.sin(2*np.pi*df["hoy"]/(365.25*24))
+    df["hoy_cos"]  = np.cos(2*np.pi*df["hoy"]/(365.25*24))
+    df["hum_x_cloud"]   = df["humidity"] * df["cloudcover"]
+    df["wind_x_cloud"]  = df["wind_speed"] * df["cloudcover"]
+    df["press_drop_3h"] = -df["d3_pressure"]
+    df["press_drop_6h"] = df["pressure"].shift(6) - df["pressure"]
+    df = df.dropna().reset_index(drop=True)
+    if features_from_meta:
+        missing_feats = [c for c in features_from_meta if c not in df.columns]
+        if missing_feats:
+            raise ValueError(f"Missing features expected by model: {missing_feats}")
+        return df[features_from_meta]
+    feat = (
+        base +
+        [f"d_{c}" for c in base] +
+        [f"ma3_{c}" for c in base] +
+        [f"d3_{c}" for c in ["pressure","humidity","cloudcover","temp_c"]] +
+        ["dew_proxy","d_dew_proxy","ma3_dew_proxy",
+         "rain_sum_3h","rain_sum_6h","rain_sum_12h","rain_sum_24h","rain_max_6h","rain_max_12h",
+         "dry_streak_h","wet_streak_h",
+         "hour_sin","hour_cos","dow_sin","dow_cos","hoy_sin","hoy_cos",
+         "hum_x_cloud","wind_x_cloud","press_drop_3h","press_drop_6h"]
+    )
+    return df[feat]
+# -----------------------------
+# Label builder: ≥ EVENT_MM in next H hours
+# -----------------------------
+def make_labels(df: pd.DataFrame, horizon=H, event_mm=EVENT_MM):
+    prec = df["precip_mm"].values
+    y = np.zeros(len(df), dtype=int)
+    for i in range(len(prec) - horizon):
+        y[i] = 1 if np.nansum(prec[i+1:i+1+horizon]) >= event_mm else 0
+    y = y[:-horizon]
+    return y
+# -----------------------------
+# Models to compare
+# -----------------------------
+def build_models():
+    models = {}
+    # Logistic + StandardScaler
+    models["logreg_standard"] = Pipeline([
+        ("scaler", StandardScaler()),
+        ("clf", LogisticRegression(max_iter=2000, class_weight="balanced"))
+    ])
+    # Logistic + RobustScaler (outlier-robust)
+    models["logreg_robust"] = Pipeline([
+        ("scaler", RobustScaler()),
+        ("clf", LogisticRegression(max_iter=2000, class_weight="balanced"))
+    ])
+    try:
+        from xgboost import XGBClassifier
+        models["xgb"] = XGBClassifier(
+            n_estimators=800,
+            learning_rate=0.05,
+            max_depth=5,
+            min_child_weight=3.0,
+            subsample=0.8,
+            colsample_bytree=0.8,
+            reg_lambda=2.0,
+            objective="binary:logistic",
+            eval_metric="aucpr",
+            tree_method="hist",
+            random_state=42,
+        )
+    except Exception as e:
+        print(f"[warn] XGBoost unavailable: {e}")
+    return models
+def evaluate_fold(model, X_train, y_train, X_test, y_test, val_frac=0.15):
+    n = len(X_train)
+    v = max(int(n * val_frac), 1)
+    X_tr, y_tr = X_train[:-v], y_train[:-v]
+    X_val, y_val = X_train[-v:], y_train[-v:]
+    model.fit(X_tr, y_tr)
+    # Probability on val to pick threshold
+    if hasattr(model, "predict_proba"):
+        p_val = model.predict_proba(X_val)[:, 1]
+        p_test = model.predict_proba(X_test)[:, 1]
+    else:
+        if hasattr(model, "decision_function"):
+            from sklearn.preprocessing import MinMaxScaler
+            z_val = model.decision_function(X_val).reshape(-1, 1)
+            z_test = model.decision_function(X_test).reshape(-1, 1)
+            mm = MinMaxScaler()
+            p_val  = mm.fit_transform(z_val).ravel()
+            p_test = mm.transform(z_test).ravel()
+        else:
+            # fallback: hard predictions at 0.5
+            pred = model.predict(X_test)
+            P, R, F1, _ = precision_recall_fscore_support(y_test, pred, average="binary", zero_division=0)
+            return dict(P=P, R=R, F1=F1, thr=0.5)
+    prec, rec, thr = precision_recall_curve(y_val, p_val)
+    # Avoid degenerate thresholds: thr has length len(prec)-1
+    candidates = []
+    for t in thr:
+        pred_v = (p_val >= t).astype(int)
+        P, R, F1, _ = precision_recall_fscore_support(y_val, pred_v, average="binary", zero_division=0)
+        candidates.append((t, P, R, F1))
+    if not candidates:
+        t_star = 0.5
+    else:
+        # choose by best F1 on validation
+        t_star = max(candidates, key=lambda x: x[3])[0]
+    pred = (p_test >= t_star).astype(int)
+    P, R, F1, _ = precision_recall_fscore_support(y_test, pred, average="binary", zero_division=0)
+    return dict(P=P, R=R, F1=F1, thr=float(t_star))
+# -----------------------------
+# Main
+# -----------------------------
+def main():
+    if not HOURLY.exists():
+        raise FileNotFoundError("results/hourly.csv not found. Run: make hourly PAST_DAYS=90")
+    df = pd.read_csv(HOURLY, parse_dates=["time"])
+    y_all = make_labels(df, H, EVENT_MM)
+    dfX = df.iloc[:-H].copy()
+    # Use features from meta if present
+    features_from_meta = None
+    if META.exists():
+        meta = json.loads(META.read_text())
+        features_from_meta = meta.get("features", None)
+    Xdf = rebuild_features_like_training(dfX, features_from_meta)
+    n = len(Xdf)
+    if len(y_all) < n:
+        raise ValueError("Labels shorter than feature matrix; check preprocessing alignment.")
+    y = y_all[-n:]
+    X = Xdf.values[-n:]
+    assert len(X) == len(y), "Feature matrix and labels misaligned."
+    tscv = TimeSeriesSplit(n_splits=5)
+    models = build_models()
+    results = {name: [] for name in models}
+    for name, model in models.items():
+        print(f"\n=== {name} ===")
+        fold_id = 1
+        per_fold = []
+        for tr_idx, te_idx in tscv.split(X):
+            X_tr, X_te = X[tr_idx], X[te_idx]
+            y_tr, y_te = y[tr_idx], y[te_idx]
+            metrics = evaluate_fold(model, X_tr, y_tr, X_te, y_te)
+            per_fold.append(metrics)
+            print(f"Fold {fold_id} → P={metrics['P']:.3f}  R={metrics['R']:.3f}  F1={metrics['F1']:.3f}  thr={metrics['thr']:.3f}")
+            fold_id += 1
+        # Aggregate
+        Pm = np.mean([m["P"] for m in per_fold])
+        Rm = np.mean([m["R"] for m in per_fold])
+        Fm = np.mean([m["F1"] for m in per_fold])
+        print(f"Mean  → P={Pm:.3f}  R={Rm:.3f}  F1={Fm:.3f}")
+        results[name] = dict(P=Pm, R=Rm, F1=Fm)
+    print("\n=== SUMMARY (higher F1 is better) ===")
+    for name, m in sorted(results.items(), key=lambda kv: kv[1]["F1"], reverse=True):
+        print(f"{name:18s}  F1={m['F1']:.3f}  P={m['P']:.3f}  R={m['R']:.3f}")
+if __name__ == "__main__":
+    main()

scripts/download_models.py ADDED Viewed

	@@ -0,0 +1,41 @@

+from __future__ import annotations
+import os
+from pathlib import Path
+from huggingface_hub import hf_hub_download
+from huggingface_hub.errors import EntryNotFoundError
+def main() -> None:
+    repo_id = os.environ.get("MODEL_REPO_ID", "theelvace/weather-data-fetcher-models")
+    files_env = os.environ.get(
+        "MODEL_FILES",
+        "rain_xgb_tuned.joblib rain_xgb_tuned_meta.json",
+    )
+    target_dir = Path(os.environ.get("MODEL_DIR", "models"))
+    target_dir.mkdir(parents=True, exist_ok=True)
+    filenames = [name.strip() for name in files_env.split() if name.strip()]
+    if not filenames:
+        print("MODEL_FILES is empty; nothing to download.")
+        return
+    for filename in filenames:
+        print(f"Downloading {filename} from {repo_id} ...")
+        try:
+            local_path = hf_hub_download(
+                repo_id=repo_id,
+                filename=filename,
+                local_dir=target_dir,
+                local_dir_use_symlinks=False,
+            )
+        except EntryNotFoundError:
+            print(f" • Skipping {filename}: not found in {repo_id}.")
+            continue
+        print(f"Saved to {local_path}")
+if __name__ == "__main__":
+    main()

scripts/eval_operating_points.py ADDED Viewed

	@@ -0,0 +1,49 @@

+import json
+from pathlib import Path
+import numpy as np, pandas as pd
+from sklearn.metrics import precision_recall_fscore_support, confusion_matrix
+meta = json.loads(Path("models/rain_xgb_tuned_meta.json").read_text())
+thr = meta["thresholds"]
+H = meta["horizon_hours"]
+df = pd.read_csv("results/hourly.csv", parse_dates=["time"])
+# make labels (>=1.0mm in next H hours)
+prec = df["precip_mm"].values
+y = np.zeros(len(df), dtype=int)
+for i in range(len(prec) - H):
+    y[i] = 1 if np.nansum(prec[i+1:i+1+H]) >= meta["event_mm"] else 0
+y = y[:-H]
+dfX = df.iloc[:-H].copy()
+# rebuild features exactly like training
+# local import
+import importlib.util
+import types
+def load_build_features():
+    spec = importlib.util.spec_from_file_location("train_xgb_tuned_final", Path("scripts/train_xgb_tuned_final.py"))
+    module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)  # type: ignore
+    return module.build_features
+build_features = load_build_features()
+Xdf = build_features(dfX)
+X = Xdf.values
+y = y[-len(X):]  # align
+import joblib
+clf = joblib.load("models/rain_xgb_tuned.joblib")
+p = clf.predict_proba(X)[:,1]
+def report(name, t):
+    pred = (p >= t).astype(int)
+    P, R, F1, _ = precision_recall_fscore_support(y, pred, average="binary", zero_division=0)
+    cm = confusion_matrix(y, pred).tolist()
+    rate = float(pred.mean())
+    print(f"{name:<10} thr={t:.3f} | P={P:.3f} R={R:.3f} F1={F1:.3f} | alerts={rate:.2%} | cm={cm}")
+report("default",   thr["default"])
+report("recall",    thr["high_recall"])
+report("precision", thr["high_precision"])

scripts/explain_shap.py ADDED Viewed

	@@ -0,0 +1,62 @@

+#!/usr/bin/env python3
+import shap
+import joblib
+import pandas as pd
+import matplotlib.pyplot as plt
+from pathlib import Path
+import json
+import numpy as np
+import os
+# ensure matplotlib cache lives inside repo
+RESULTS_DIR = Path("results")
+RESULTS_DIR.mkdir(exist_ok=True)
+os.environ.setdefault("MPLCONFIGDIR", str(RESULTS_DIR / ".matplotlib"))
+Path(os.environ["MPLCONFIGDIR"]).mkdir(parents=True, exist_ok=True)
+# === Load model + metadata ===
+model = joblib.load("models/rain_xgb_tuned.joblib")
+meta = json.load(open("models/rain_xgb_tuned_meta.json"))
+features = meta["features"]
+# === Load data ===
+df = pd.read_csv("results/hourly.csv", parse_dates=["time"])
+# Rebuild features exactly like training
+import importlib.util
+spec = importlib.util.spec_from_file_location(
+    "train_xgb_tuned_final", Path("scripts/train_xgb_tuned_final.py")
+)
+module = importlib.util.module_from_spec(spec)
+spec.loader.exec_module(module)
+build_features = module.build_features
+Xdf = build_features(df)
+X = Xdf.values.astype(np.float32)
+# Use last 500 samples for analysis (avoid overkill)
+X_sample = X[-200:]
+# === SHAP Explainer ===
+explainer = shap.Explainer(model.predict_proba, X_sample, algorithm="permutation")
+shap_values = explainer(X_sample)
+# === Global importance ===
+Path("results").mkdir(exist_ok=True)
+plt.figure()
+shap.summary_plot(shap_values, X_sample,
+feature_names=features, show=False)
+plt.tight_layout()
+plt.savefig("results/shap_summary.png", dpi=300)
+plt.close()
+# === Bar chart version ===
+plt.figure()
+shap.summary_plot(shap_values, X_sample,
+feature_names=features, plot_type="bar", show=False)
+plt.tight_layout()
+plt.savefig("results/shap_top.png", dpi=300)
+plt.close()
+print("✅ SHAP visualisations saved: results/shap_summary.png and results/shap_top.png")

scripts/explain_shap_interaction.py ADDED Viewed

	@@ -0,0 +1,105 @@

+#!/usr/bin/env python3
+"""
+Generates a SHAP dependence plot showing how HUMIDITY and
+TEMPERATURE
+jointly influence rain predictions. Outputs:
+  - results/shap_interaction.png
+"""
+import json
+from pathlib import Path
+import joblib
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+import shap
+import importlib.util
+import os
+# Keep matplotlib caches inside repo to avoid home directory issues
+RESULTS_DIR = Path("results")
+RESULTS_DIR.mkdir(exist_ok=True)
+os.environ.setdefault("MPLCONFIGDIR", str(RESULTS_DIR / ".matplotlib"))
+Path(os.environ["MPLCONFIGDIR"]).mkdir(parents=True, exist_ok=True)
+# Load model + meta
+model = joblib.load("models/rain_xgb_tuned.joblib")
+booster = model.get_booster()
+config = json.loads(booster.save_config())
+base_score = config.get("learner", {}).get("learner_model_param", {}).get("base_score")
+if base_score:
+    cleaned = base_score.strip("[]")
+    try:
+        float(cleaned)
+    except ValueError:
+        cleaned = "0.5"
+    config["learner"]["learner_model_param"]["base_score"] = cleaned
+    booster.load_config(json.dumps(config))
+meta = json.loads(Path("models/rain_xgb_tuned_meta.json").read_text())
+features = meta["features"]
+# Load data and rebuild features exactly like training
+df = pd.read_csv("results/hourly.csv", parse_dates=["time"])
+spec = importlib.util.spec_from_file_location(
+    "train_xgb_tuned_final", "scripts/train_xgb_tuned_final.py"
+)
+module = importlib.util.module_from_spec(spec)
+spec.loader.exec_module(module)
+build_features = module.build_features
+Xdf = build_features(df)              # same order as training
+X = Xdf.values
+X_sample = X[-120:] if len(X) > 120 else X
+X_sample_df = pd.DataFrame(X_sample, columns=features)
+X_sample_df = pd.DataFrame(X_sample, columns=features)
+# Prefer TreeExplainer for XGBoost; fallback to generic Explainer if needed
+try:
+    explainer = shap.TreeExplainer(booster, data=X_sample)
+    shap_result = explainer(X_sample)
+except Exception:
+    explainer = shap.Explainer(model.predict_proba, X_sample, algorithm="permutation")
+    shap_result = explainer(X_sample)
+# Normalize SHAP output to a 2D array aligned with feature columns
+if hasattr(shap_result, "values"):
+    values = shap_result.values
+    if values.ndim == 3:  # multi-class, take positive class (index 1)
+        values = values[:, :, 1]
+    shap_values = values
+else:
+    shap_values = np.array(shap_result)
+# Ensure sample frame matches SHAP output rows
+X_plot = X_sample_df.iloc[-shap_values.shape[0]:]
+Path("results").mkdir(exist_ok=True)
+# 1) Dependence plot: humidity colored by temp_c (classic interaction view)
+plt.figure()
+shap.dependence_plot(
+    "humidity",
+    shap_values,
+    X_plot,
+    interaction_index="temp_c",
+    show=False
+)
+plt.tight_layout()
+plt.savefig("results/shap_interaction.png", dpi=300)
+plt.close()
+# 2) (Optional) Reverse view: temp_c colored by humidity
+plt.figure()
+shap.dependence_plot(
+    "temp_c",
+    shap_values,
+    X_plot,
+    interaction_index="humidity",
+    show=False
+)
+plt.tight_layout()
+plt.savefig("results/shap_interaction_rev.png", dpi=300)
+plt.close()
+print("✅ Saved results/shap_interaction.png and results/shap_interaction_rev.png")

scripts/export_daily.py ADDED Viewed

	@@ -0,0 +1,18 @@

+import json, pandas as pd, os
+os.makedirs("results", exist_ok=True)
+data = json.load(open("data/weather.json"))
+df = pd.DataFrame({
+    "date": data["daily"]["time"],
+    "temp_min_c": data["daily"]["temperature_2m_min"],
+    "temp_max_c": data["daily"]["temperature_2m_max"],
+    "precip_mm": data["daily"]["precipitation_sum"],
+    "cloudcover": data["daily"]["cloudcover_mean"],
+    "wind_speed": data["daily"]["wind_speed_10m_max"],
+    "humidity_max": data["daily"]["relative_humidity_2m_max"],
+    "humidity_min": data["daily"]["relative_humidity_2m_min"],
+})
+df.to_csv("results/daily.csv", index=False)
+print(f"✅ Wrote results/daily.csv with {len(df)} rows")

scripts/export_hourly.py ADDED Viewed

	@@ -0,0 +1,33 @@

+import os
+import sys
+import json
+import pandas as pd
+os.makedirs("results", exist_ok=True)
+with open("data/weather.json") as handle:
+    data = json.load(handle)
+if "hourly" not in data:
+    print(
+        "data/weather.json missing 'hourly'. Re-run the fetch step with hourly "
+        "parameters enabled (see scripts/fetch_weather.sh).",
+        file=sys.stderr,
+    )
+    sys.exit(1)
+H = data["hourly"]
+df = pd.DataFrame({
+    "time": H["time"],
+    "temp_c": H["temperature_2m"],
+    "humidity": H["relative_humidity_2m"],
+    "cloudcover": H["cloudcover"],
+    "pressure": H["pressure_msl"],
+    "wind_speed": H["wind_speed_10m"],
+    "precip_mm": H["precipitation"],
+    "rain_mm": H["rain"],
+})
+df["time"] = pd.to_datetime(df["time"])
+df = df.sort_values("time").reset_index(drop=True)
+df.to_csv("results/hourly.csv", index=False)
+print(f"✅ Wrote results/hourly.csv with {len(df)} rows")

scripts/feature_importance_rain.py ADDED Viewed

	@@ -0,0 +1,79 @@

+import json
+import os
+from pathlib import Path
+import joblib
+import numpy as np
+import pandas as pd
+from sklearn.inspection import permutation_importance
+from sklearn.model_selection import train_test_split
+RESULTS_DIR = "results"
+os.environ.setdefault("MPLCONFIGDIR", os.path.join(RESULTS_DIR, ".matplotlib"))
+os.environ.setdefault("XDG_CACHE_HOME", os.path.join(RESULTS_DIR, ".cache"))
+Path(os.environ["MPLCONFIGDIR"]).mkdir(parents=True, exist_ok=True)
+Path(os.environ["XDG_CACHE_HOME"]).mkdir(parents=True, exist_ok=True)
+import matplotlib
+matplotlib.use("Agg")
+import matplotlib.pyplot as plt
+def build_dataset(meta: dict) -> tuple[np.ndarray, np.ndarray]:
+    df = pd.read_csv("results/hourly.csv", parse_dates=["time"])
+    horizon = meta["horizon_hours"]
+    precip = df["precip_mm"].values
+    rain_future = np.zeros(len(df), dtype=int)
+    for i in range(len(precip) - horizon):
+        rain_future[i] = 1 if np.any(precip[i + 1 : i + 1 + horizon] > 0) else 0
+    df = df.iloc[: len(precip) - horizon].copy()
+    labels = rain_future[: len(df)]
+    features = df[meta["features"]].values
+    return features, labels
+def plot_importance(feature_names: list[str], importances: np.ndarray, std: np.ndarray) -> None:
+    order = np.argsort(importances)[::-1]
+    feature_names = np.array(feature_names)[order]
+    importances = importances[order]
+    plt.figure(figsize=(8, 5))
+    y_pos = np.arange(len(feature_names))
+    plt.barh(y_pos, importances, align="center")
+    plt.yticks(y_pos, feature_names)
+    plt.gca().invert_yaxis()
+    plt.xlabel("Permutation importance (F1 drop)")
+    plt.title("Rain classifier — feature importances")
+    plt.tight_layout()
+    Path(RESULTS_DIR).mkdir(exist_ok=True)
+    plt.savefig(os.path.join(RESULTS_DIR, "feature_importance.png"))
+    plt.close()
+def main() -> None:
+    meta = json.load(open("models/rain_model_meta.json"))
+    model = joblib.load("models/rain_classifier_hourly.joblib")
+    X, y = build_dataset(meta)
+    _, X_test, _, y_test = train_test_split(X, y, test_size=0.3, shuffle=False)
+    result = permutation_importance(
+        model,
+        X_test,
+        y_test,
+        n_repeats=25,
+        random_state=42,
+        scoring="f1",
+    )
+    plot_importance(meta["features"], result.importances_mean, result.importances_std)
+    print("✅ Wrote results/feature_importance.png")
+if __name__ == "__main__":
+    main()

scripts/fetch_weather.sh ADDED Viewed

	@@ -0,0 +1,21 @@

+#!/bin/bash
+set -euo pipefail
+mkdir -p data logs
+source .env 2>/dev/null || true
+: "${LAT:=6.5244}"
+: "${LON:=3.3792}"
+: "${CITY:=Lagos}"
+: "${PAST_DAYS:=30}"
+STAMP="$(date +%Y-%m-%d_%H-%M-%S)"
+LOG_FILE=${LOG_FILE:-logs/app.log}
+echo "[${STAMP}] Fetching ${PAST_DAYS} past days for ${CITY} (${LAT}, ${LON})"
+URL="https://api.open-meteo.com/v1/forecast?latitude=${LAT}&longitude=${LON}&hourly=temperature_2m,relative_humidity_2m,cloudcover,pressure_msl,wind_speed_10m,precipitation,rain&timezone=Africa%2FLagos&past_days=${PAST_DAYS}"
+{
+  curl -sfL "$URL" -o data/weather.json
+  echo "[$STAMP] Saved to data/weather.json"
+} | tee -a "$LOG_FILE"

scripts/intro_ml.py ADDED Viewed

	@@ -0,0 +1,28 @@

+from sklearn.linear_model import LinearRegression
+import numpy as np
+# Imagine 5 days of temperatures (°C)
+# `x` is the input feature (temperature), reshaped to a column vector
+x = np.array([25, 27, 30, 32, 35]).reshape(-1, 1)
+# `y` is the output label (humidity percentage)
+y = np.array([50, 55, 63, 70, 74])
+model = LinearRegression()
+model.fit(x, y)
+pred = model.predict([[28]])
+print(f"Predicted humidity for 28°C: {pred[0]:.2f}%")
+import matplotlib.pyplot as plt
+plt.scatter(x, y, color='blue', label='data')
+plt.plot(x, model.predict(x), color='red', label='model')
+plt.xlabel('Temperature (°C)')
+plt.ylabel('Humidity (%)')
+plt.legend()
+plt.tight_layout()
+plt.savefig("results/intro_regression.png")
+print("✅ Saved results/intro_regression.png")
+print("slope:", model.coef_)
+print("intercept:", model.intercept_)

scripts/log_predict.py ADDED Viewed

	@@ -0,0 +1,67 @@

+#!/usr/bin/env python3
+import argparse, os, json
+from pathlib import Path
+from datetime import datetime
+import joblib, pandas as pd, numpy as np, subprocess
+# Artifacts read by this script: the tuned model, its metadata, and the hourly weather table.
+MODEL = Path("models/rain_xgb_tuned.joblib")
+META  = Path("models/rain_xgb_tuned_meta.json")
+HOURLY = Path("results/hourly.csv")
+# The logs directory is created at import time; predictions are appended to PRED_LOG.
+LOGS = Path("logs"); LOGS.mkdir(exist_ok=True)
+PRED_LOG = LOGS / "predictions.csv"
+def ensure_hourly(lat, lon, past_days=90):
+    env = os.environ.copy()
+    env["LAT"], env["LON"], env["PAST_DAYS"] = str(lat), str(lon), str(past_days)
+    if (not HOURLY.exists()):
+        subprocess.run(["bash", "scripts/fetch_weather.sh"], check=True, env=env)
+        subprocess.run(["python3", "scripts/export_hourly.py"], check=True, env=env)
+    return pd.read_csv(HOURLY, parse_dates=["time"])
+def build_features_like_training(df, features):
+    import importlib.util
+    spec = importlib.util.spec_from_file_location("train_xgb_tuned_final", "scripts/train_xgb_tuned_final.py")
+    module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)
+    build_features = module.build_features
+    Xdf = build_features(df)
+    return Xdf[features]
+def main():
+    ap = argparse.ArgumentParser()
+    ap.add_argument("--city", default="Lagos")
+    ap.add_argument("--lat", type=float, default=6.5244)
+    ap.add_argument("--lon", type=float, default=3.3792)
+    ap.add_argument("--mode", choices=["default","recall","precision"], default="default")
+    args = ap.parse_args()
+    meta = json.loads(META.read_text())
+    thr = meta["thresholds"]; feats = meta["features"]; H = meta["horizon_hours"]; event_mm = meta["event_mm"]
+    df = ensure_hourly(args.lat, args.lon, 90)
+    Xdf = build_features_like_training(df.copy(), feats)
+    if Xdf.empty: raise SystemExit("Not enough rows to build features")
+    clf = joblib.load(MODEL)
+    p = float(clf.predict_proba(Xdf.iloc[[-1]].values)[0,1])
+    tmap = {"default":thr["default"], "recall":thr["high_recall"], "precision":thr["high_precision"]}
+    t = float(tmap[args.mode])
+    decision = "RAIN" if p >= t else "No rain"
+    row = {
+        "ts_pred": df.loc[Xdf.index, "time"].iloc[-1].strftime("%Y-%m-%d %H:%M:%S"),
+        "logged_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
+        "city": args.city, "lat": args.lat, "lon": args.lon,
+        "mode": args.mode, "horizon_h": H, "event_mm": event_mm,
+        "p": p, "threshold": t, "decision": decision,
+        "y_true": "",  # to be filled by backfill
+    }
+    if not PRED_LOG.exists():
+        pd.DataFrame([row]).to_csv(PRED_LOG, index=False)
+    else:
+        pd.DataFrame([row]).to_csv(PRED_LOG, mode="a", header=False, index=False)
+    print(f"Logged: {row}")
+if __name__ == "__main__":
+    main()

scripts/make_cover.py ADDED Viewed

	@@ -0,0 +1,59 @@

+from PIL import Image, ImageDraw, ImageFont
+import os
+RESULTS = "results"
+IMG1 = os.path.join(RESULTS, "temps.png")
+IMG2 = os.path.join(RESULTS, "precip.png")
+OUT  = os.path.join(RESULTS, "cover.png")
+ASSETS_DIR = "assets"
+ASSET_OUT = os.path.join(ASSETS_DIR, "cover.png")
+def fail(msg):
+    print(f"❌ {msg}")
+    raise SystemExit(1)
+if not os.path.exists(IMG1):
+    fail(f"Missing {IMG1}. Run `make viz` first.")
+if not os.path.exists(IMG2):
+    fail(f"Missing {IMG2}. Run `make viz` first.")
+img1 = Image.open(IMG1).convert("RGB")
+img2 = Image.open(IMG2).convert("RGB")
+w = min(img1.width, img2.width)
+def resize_to_width(im, target_w):
+    new_h = int(im.height * target_w / im.width)
+    return im.resize((target_w, new_h))
+img1 = resize_to_width(img1, w)
+img2 = resize_to_width(img2, w)
+pad = 16
+title_h = 48
+H = img1.height + img2.height + title_h + pad * 4
+W = w + pad * 2
+canvas = Image.new("RGB", (W, H), "white")
+y = pad
+canvas.paste(img1, (pad, y)); y += img1.height + pad
+canvas.paste(img2, (pad, y)); y += img2.height + pad
+draw = ImageDraw.Draw(canvas)
+title = "Weather Data Fetcher — Automated Pipeline"
+try:
+    font = ImageFont.load_default()
+except Exception:
+    font = None
+tw, th = draw.textbbox((0,0), title, font=font)[2:]
+tx = (W - tw) // 2
+ty = y
+draw.text((tx, ty), title, fill="black", font=font)
+os.makedirs(RESULTS, exist_ok=True)
+canvas.save(OUT, optimize=True)
+if ASSETS_DIR:
+    os.makedirs(ASSETS_DIR, exist_ok=True)
+    canvas.save(ASSET_OUT, optimize=True)
+print(f"✅ Created {OUT}")

scripts/monitor_weekly.py ADDED Viewed

	@@ -0,0 +1,60 @@

+#!/usr/bin/env python3
+from pathlib import Path
+import pandas as pd, numpy as np
+import matplotlib.pyplot as plt
+from sklearn.metrics import precision_recall_fscore_support, brier_score_loss
+# Prediction log read by main().
+LOG = Path("logs/predictions.csv")
+# Output directory for the weekly report and calibration plot; created at import time.
+OUT = Path("results"); OUT.mkdir(exist_ok=True)
+def week_key(ts):  # ISO year-week
+    iso = ts.isocalendar()
+    return f"{iso.year}-W{iso.week:02d}"
+def calibration_plot(p, y, bins=10, out_png="results/calibration.png"):
+    df = pd.DataFrame({"p":p, "y":y}).dropna()
+    df["bin"] = pd.qcut(df["p"], q=bins, duplicates="drop")
+    g = df.groupby("bin").agg(avg_p=("p","mean"), frac_pos=("y","mean"), n=("y","size")).reset_index(drop=True)
+    plt.figure()
+    plt.plot([0,1],[0,1], linestyle="--")
+    plt.plot(g["avg_p"], g["frac_pos"], marker="o")
+    plt.xlabel("Predicted probability")
+    plt.ylabel("Observed frequency")
+    plt.title("Calibration")
+    for i, n in enumerate(g["n"]):
+        plt.annotate(str(int(n)), (g["avg_p"].iloc[i], g["frac_pos"].iloc[i]))
+    plt.tight_layout()
+    plt.savefig(out_png, dpi=300); plt.close()
+def main():
+    if not LOG.exists():
+        print("No logs yet.")
+        return
+    df = pd.read_csv(LOG, parse_dates=["ts_pred","logged_at"])
+    df = df[df["y_true"].astype(str).isin(["0","1"])].copy()
+    if df.empty:
+        print("No rows with y_true yet.")
+        return
+    df["y_true"] = df["y_true"].astype(int)
+    df["week"] = df["ts_pred"].apply(week_key)
+    # Weekly metrics per mode
+    rows = []
+    for (wk, mode), grp in df.groupby(["week","mode"]):
+        y = grp["y_true"].values
+        # decision at time of logging
+        yhat = (grp["p"].values >= grp["threshold"].values).astype(int)
+        P,R,F1,_ = precision_recall_fscore_support(y, yhat, average="binary", zero_division=0)
+        alerts = float(yhat.mean())
+        brier = brier_score_loss(y, grp["p"].values)
+        rows.append({"week":wk,"mode":mode,"n":len(grp),"precision":P,"recall":R,"f1":F1,"alert_rate":alerts,"brier":brier})
+    rep = pd.DataFrame(rows).sort_values(["week","mode"])
+    rep.to_csv(OUT/"weekly_report.csv", index=False)
+    print(rep)
+    # Overall calibration (all modes combined)
+    calibration_plot(df["p"].values, df["y_true"].values, bins=12, out_png=str(OUT/"calibration.png"))
+    print("Saved:", OUT/"weekly_report.csv", "and", OUT/"calibration.png")
+if __name__ == "__main__":
+    main()

scripts/plot_pr_roc.py ADDED Viewed

	@@ -0,0 +1,65 @@

+import json
+import os
+import joblib
+import pandas as pd
+import numpy as np
+from sklearn.metrics import precision_recall_curve, roc_curve, auc
+from sklearn.model_selection import train_test_split
+RESULTS_DIR = "results"
+os.environ.setdefault("MPLCONFIGDIR", os.path.join(RESULTS_DIR, ".matplotlib"))
+os.environ.setdefault("XDG_CACHE_HOME", os.path.join(RESULTS_DIR, ".cache"))
+from pathlib import Path
+Path(os.environ["MPLCONFIGDIR"]).mkdir(parents=True, exist_ok=True)
+Path(os.environ["XDG_CACHE_HOME"]).mkdir(parents=True, exist_ok=True)
+import matplotlib
+matplotlib.use("Agg")
+import matplotlib.pyplot as plt
+meta = json.load(open("models/rain_model_meta.json"))
+clf = joblib.load("models/rain_classifier_hourly.joblib")
+df = pd.read_csv("results/hourly.csv", parse_dates=["time"])
+H = meta["horizon_hours"]
+features = meta["features"]
+precip_next = np.zeros(len(df), dtype=int)
+prec = df["precip_mm"].values
+for i in range(len(prec) - H):
+    precip_next[i] = 1 if np.any(prec[i + 1 : i + 1 + H] > 0) else 0
+df = df.iloc[: len(precip_next)].copy()
+df["rain_next6h"] = precip_next[: len(df)]
+X = df[features].values
+y = df["rain_next6h"].values
+_, X_test, _, y_test = train_test_split(X, y, test_size=0.3, shuffle=False)
+proba = clf.predict_proba(X_test)[:, 1]
+precision, recall, _ = precision_recall_curve(y_test, proba)
+fpr, tpr, _ = roc_curve(y_test, proba)
+plt.figure()
+plt.plot(recall, precision)
+plt.xlabel("Recall")
+plt.ylabel("Precision")
+plt.title("Precision–Recall")
+plt.tight_layout()
+plt.savefig("results/pr_curve.png")
+plt.close()
+plt.figure()
+plt.plot(fpr, tpr)
+plt.xlabel("FPR")
+plt.ylabel("TPR")
+plt.title(f"ROC (AUC={auc(fpr, tpr):.2f})")
+plt.tight_layout()
+plt.savefig("results/roc_curve.png")
+plt.close()
+print("✅ Wrote results/pr_curve.png and results/roc_curve.png")

scripts/plot_weather.py ADDED Viewed

	@@ -0,0 +1,72 @@

+import json
+import os
+from shutil import copyfile
+# Charts are written under RESULTS_DIR and then copied to the matching path under ASSETS_DIR.
+RESULTS_DIR = "results"
+ASSETS_DIR = "assets"
+TEMPS_RESULTS = os.path.join(RESULTS_DIR, "temps.png")
+PRECIP_RESULTS = os.path.join(RESULTS_DIR, "precip.png")
+TEMPS_ASSET = os.path.join(ASSETS_DIR, "temps.png")
+PRECIP_ASSET = os.path.join(ASSETS_DIR, "precip.png")
+# setdefault keeps any MPLCONFIGDIR / XDG_CACHE_HOME already present in the environment.
+os.environ.setdefault("MPLCONFIGDIR", os.path.join(RESULTS_DIR, ".matplotlib"))
+os.environ.setdefault("XDG_CACHE_HOME", os.path.join(RESULTS_DIR, ".cache"))
+import matplotlib
+# Select the Agg backend before pyplot is imported; this script only saves figures to files.
+matplotlib.use("Agg")
+import matplotlib.pyplot as plt
+import pandas as pd
+def ensure_dirs():
+    os.makedirs(RESULTS_DIR, exist_ok=True)
+    os.makedirs(ASSETS_DIR, exist_ok=True)
+    os.makedirs(os.environ["MPLCONFIGDIR"], exist_ok=True)
+    os.makedirs(os.environ["XDG_CACHE_HOME"], exist_ok=True)
+def mirror_asset(src: str, dest: str) -> None:
+    copyfile(src, dest)
+def main():
+    with open("data/weather.json") as handle:
+        data = json.load(handle)
+    days = pd.to_datetime(data["daily"]["time"])
+    tmax = pd.Series(data["daily"]["temperature_2m_max"])
+    tmin = pd.Series(data["daily"]["temperature_2m_min"])
+    prec = pd.Series(data["daily"].get("precipitation_sum", [0] * len(days)))
+    ensure_dirs()
+    plt.figure()
+    plt.plot(days, tmax, marker="o", label="Max °C")
+    plt.plot(days, tmin, marker="o", label="Min °C")
+    plt.xticks(rotation=45, ha="right")
+    plt.title("Daily Temperatures (°C)")
+    plt.legend()
+    plt.tight_layout()
+    plt.savefig(TEMPS_RESULTS)
+    plt.close()
+    mirror_asset(TEMPS_RESULTS, TEMPS_ASSET)
+    # Precipitation bar chart
+    plt.figure()
+    plt.bar(days, prec)
+    plt.xticks(rotation=45, ha="right")
+    plt.title("Daily Precipitation (mm)")
+    plt.tight_layout()
+    plt.savefig(PRECIP_RESULTS)
+    plt.close()
+    mirror_asset(PRECIP_RESULTS, PRECIP_ASSET)
+    print(f"✅ Wrote {TEMPS_RESULTS} / {PRECIP_RESULTS}")
+    print(f"✅ Updated assets at {TEMPS_ASSET} / {PRECIP_ASSET}")
+if __name__ == "__main__":
+    main()

scripts/predict_rain.py ADDED Viewed

	@@ -0,0 +1,29 @@

+import sys, json, joblib
+import numpy as np
+import pandas as pd
+# Load latest hour from results/hourly.csv, predict next 6h rain
+df = pd.read_csv("results/hourly.csv", parse_dates=["time"])
+row = df.iloc[-1:].copy()
+for col in ["temp_c","humidity","cloudcover","pressure","wind_speed","precip_mm","rain_mm"]:
+    row[f"d_{col}"] = df[col].diff().iloc[-1]
+    row[f"ma3_{col}"] = df[col].rolling(3).mean().iloc[-1]
+features = ["temp_c","humidity","cloudcover","pressure","wind_speed","precip_mm","rain_mm"]
+features += [f"d_{c}" for c in features]
+features += [f"ma3_{c}" for c in ["temp_c","humidity","cloudcover","pressure","wind_speed","precip_mm","rain_mm"]]
+# Load model + meta
+clf = joblib.load("models/rain_classifier_hourly.joblib")
+meta = json.load(open("models/rain_model_meta.json"))
+X = row[meta["features"]].values
+proba = float(clf.predict_proba(X)[0,1])
+thr_r = meta["thresholds"]["high_recall"]
+thr_p = meta["thresholds"]["high_precision"]
+print(f"Latest hour: {row['time'].iloc[0]}")
+print(f"P(rain next {meta['horizon_hours']}h) = {proba:.3f}")
+print(f"High-Recall mode:   {'RAIN' if proba>=thr_r else 'No rain'} (thr={thr_r:.2f})")
+print(f"High-Precision mode:{'RAIN' if proba>=thr_p else 'No rain'} (thr={thr_p:.2f})")

scripts/process_weather.py ADDED Viewed

	@@ -0,0 +1,56 @@

+from dotenv import load_dotenv
+import os, json, sys, logging
+load_dotenv()
+LAT = os.getenv("LAT", "6.5244")
+LON = os.getenv("LON", "3.3792")
+CITY = os.getenv("CITY", "Lagos")
+LOG_FILE = os.getenv("LOG_FILE", "logs/app.log")
+os.makedirs("logs", exist_ok=True)
+logging.basicConfig(
+    filename=LOG_FILE,
+    level=logging.INFO,
+    format="%(asctime)s [%(levelname)s] %(message)s",
+)
+logging.info(f"Processing weather for {CITY} ({LAT}, {LON})")
+IN, OUT_DIR = "data/weather.json", "results"
+OUT = os.path.join(OUT_DIR, "summary.txt")
+os.makedirs(OUT_DIR, exist_ok=True)
+logging.info("Reading weather.json")
+try:
+    with open(IN) as f:
+        data = json.load(f)
+except FileNotFoundError:
+    print("weather.json not found. Run `make download`.", file=sys.stderr); sys.exit(1)
+try:
+    daily = data["daily"]
+    days = daily["time"]
+    tmax = daily["temperature_2m_max"]
+    tmin = daily["temperature_2m_min"]
+    prec = daily.get("precipitation_sum", [0]*len(days))
+except Exception as e:
+    print(f"Unexpected JSON structure: {e}", file=sys.stderr); sys.exit(2)
+with open(OUT, "w") as f:
+    f.write("Lagos (Africa/Lagos) – Daily summary\n")
+    f.write("-----------------------------------\n")
+    for d, lo, hi, p in zip(days, tmin, tmax, prec):
+        f.write(f"{d}: {lo}°C – {hi}°C | precip: {p} mm\n")
+logging.info(f"Wrote summary to {OUT}")
+print(f"✅ Wrote {OUT}")
+import csv
+with open(os.path.join(OUT_DIR, "summary.csv"), "w", newline="") as f:
+    w = csv.writer(f)
+    w.writerow(["date", "temp_min_c", "temp_max_c", "precip_mm"])
+    for d, lo, hi, p in zip(days, tmin, tmax, prec):
+        w.writerow([d, lo, hi, p])
+print("✅ Wrote results/summary.csv")

scripts/rain_cli.py ADDED Viewed

	@@ -0,0 +1,34 @@

+import argparse, json, joblib, pandas as pd
+def main():
+    ap = argparse.ArgumentParser(description="Rain warning in next 6h")
+    ap.add_argument("--mode", choices=["recall","precision","default"], default="recall")
+    args = ap.parse_args()
+    meta = json.load(open("models/rain_model_meta.json"))
+    clf = joblib.load("models/rain_classifier_hourly.joblib")
+    df = pd.read_csv("results/hourly.csv", parse_dates=["time"])
+    row = df.iloc[-1:].copy()
+    # rebuild features like training
+    base = ["temp_c","humidity","cloudcover","pressure","wind_speed","precip_mm","rain_mm"]
+    for col in base:
+        row[f"d_{col}"] = df[col].diff().iloc[-1]
+        row[f"ma3_{col}"] = df[col].rolling(3).mean().iloc[-1]
+    X = row[meta["features"]].values
+    p = float(clf.predict_proba(X)[0,1])
+    thr = {
+        "default": meta["thresholds"]["default"],
+        "recall": meta["thresholds"]["high_recall"],
+        "precision": meta["thresholds"]["high_precision"],
+    }[args.mode]
+    decision = "RAIN" if p >= thr else "No rain"
+    print(f"{row['time'].iloc[0]}  |  P(rain ≤{meta['horizon_hours']}h)={p:.3f}  |  mode={args.mode} thr={thr:.2f}  →  {decision}")
+if __name__ == "__main__":
+    main()

scripts/start_services.sh ADDED Viewed

	@@ -0,0 +1,28 @@

+#!/bin/bash
+set -euo pipefail
+STREAMLIT_PORT="${STREAMLIT_PORT:-8501}"
+UVICORN_PORT="${UVICORN_PORT:-${PORT:-8000}}"
+HOST="0.0.0.0"
+echo "Environment: PORT=${PORT:-<unset>} STREAMLIT_PORT=${STREAMLIT_PORT} UVICORN_PORT=${UVICORN_PORT}"
+export STREAMLIT_SERVER_HEADLESS=true
+export STREAMLIT_SERVER_PORT="${STREAMLIT_PORT}"
+export STREAMLIT_SERVER_ADDRESS="${HOST}"
+echo "🌐 Starting Streamlit on port ${STREAMLIT_PORT}"
+streamlit run streamlit_app.py --server.port "${STREAMLIT_PORT}" --server.address "${HOST}" &
+STREAMLIT_PID=$!
+cleanup() {
+  echo "🛑 Shutting down services..."
+  if kill -0 "${STREAMLIT_PID}" 2>/dev/null; then
+    kill "${STREAMLIT_PID}" 2>/dev/null || true
+    wait "${STREAMLIT_PID}" 2>/dev/null || true
+  fi
+}
+trap cleanup EXIT INT TERM
+echo "🚀 Starting FastAPI (uvicorn) on port ${UVICORN_PORT}"
+exec python -m uvicorn app.main:app --host "${HOST}" --port "${UVICORN_PORT}" --proxy-headers

scripts/time_series_cv_demo.py ADDED Viewed

	@@ -0,0 +1,38 @@

+import numpy as np
+import pandas as pd
+from sklearn.model_selection import TimeSeriesSplit
+from sklearn.pipeline import Pipeline
+from sklearn.preprocessing import StandardScaler
+from sklearn.linear_model import LogisticRegression
+from sklearn.metrics import f1_score
+# Load your hourly data
+df = pd.read_csv("results/hourly.csv")
+df = df.dropna().reset_index(drop=True)
+# Features and target
+features = ["temp_c","humidity","cloudcover","pressure","wind_speed","precip_mm","rain_mm"]
+X = df[features].values
+y = (df["precip_mm"].shift(-6) > 0).astype(int)  # rain in next 6h
+y = y[:-6]
+X = X[:-6]
+# Time-series CV setup
+tscv = TimeSeriesSplit(n_splits=5)
+f1_scores = []
+for fold, (train_idx, test_idx) in enumerate(tscv.split(X)):
+    X_train, X_test = X[train_idx], X[test_idx]
+    y_train, y_test = y[train_idx], y[test_idx]
+    clf = Pipeline([
+        ("scaler", StandardScaler()),
+        ("model", LogisticRegression(max_iter=1000, class_weight="balanced"))
+    ])
+    clf.fit(X_train, y_train)
+    preds = clf.predict(X_test)
+    score = f1_score(y_test, preds)
+    f1_scores.append(score)
+    print(f"Fold {fold+1} F1: {score:.3f}")
+print("\nAverage F1 across folds:", np.mean(f1_scores).round(3))

scripts/train_classify_rain.py ADDED Viewed

	@@ -0,0 +1,93 @@

+import os
+import pandas as pd
+import numpy as np
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import StandardScaler
+from sklearn.linear_model import LogisticRegression
+from sklearn.pipeline import Pipeline
+from sklearn.metrics import (
+    confusion_matrix, classification_report,
+    roc_auc_score, roc_curve, precision_recall_fscore_support
+)
+df = pd.read_csv("results/daily.csv")
+df["precip_tomorrow"] = df["precip_mm"].shift(-1)
+df = df.dropna()  # drop last row without tomorrow
+df["rain_tomorrow"] = (df["precip_tomorrow"] > 0).astype(int)
+features = [
+    "temp_max_c",
+    "temp_min_c",
+    "cloudcover",
+    "wind_speed",
+    "humidity_max",
+    "humidity_min",
+    "precip_mm",        # rain today often implies rain persists
+]
+X = df[features].values
+y = df["rain_tomorrow"].values
+X_train, X_test, y_train, y_test = train_test_split(
+    X, y, test_size=0.3, shuffle=False
+)
+clf = Pipeline([
+    ("scaler", StandardScaler()),
+    ("logreg", LogisticRegression(max_iter=200,
+class_weight="balanced"))
+])
+clf.fit(X_train, y_train)
+proba = clf.predict_proba(X_test)[:, 1]       # P(rain)
+pred_default = (proba >= 0.5).astype(int)     # default threshold
+labels = [0, 1]
+cm = confusion_matrix(y_test, pred_default, labels=labels)
+tn, fp, fn, tp = cm.ravel()
+prec, rec, f1, _ = precision_recall_fscore_support(
+    y_test, pred_default, average="binary", zero_division=0
+)
+auc = (
+    roc_auc_score(y_test, proba)
+    if len(np.unique(y_test)) > 1 and len(np.unique(proba)) > 1
+    else float("nan")
+)
+print("📊 Confusion Matrix (threshold=0.50)")
+print(cm)
+auc_str = f"{auc:.3f}" if np.isfinite(auc) else "n/a"
+print(f"\nPrecision: {prec:.3f}  Recall: {rec:.3f}  F1: {f1:.3f}  ROC-AUC: {auc_str}")
+print("\nDetailed report:")
+print(classification_report(y_test, pred_default, digits=3, zero_division=0, labels=labels))
+always_no = np.zeros_like(y_test)
+prec0, rec0, f10, _ = precision_recall_fscore_support(
+    y_test, always_no, average="binary", zero_division=0
+)
+print("\n⚠️ Baseline — always 'no rain'")
+print(f"Precision: {prec0:.3f}  Recall: {rec0:.3f}  F1: {f10:.3f}")
+today_rain = (df["precip_mm"].values[-len(y_test)-1:-1] > 0).astype(int)
+precp, recp, f1p, _ = precision_recall_fscore_support(
+    y_test, today_rain, average="binary", zero_division=0
+)
+print("\n🧠 Baseline — 'tomorrow rain = today rain'")
+print(f"Precision: {precp:.3f}  Recall: {recp:.3f}  F1: {f1p:.3f}")
+thr = 0.35
+pred_tuned = (proba >= thr).astype(int)
+prec_t, rec_t, f1_t, _ = precision_recall_fscore_support(
+    y_test, pred_tuned, average="binary", zero_division=0
+)
+print(f"\n🎛️  Threshold {thr:.2f} → Precision: {prec_t:.3f}  Recall: {rec_t:.3f}  F1: {f1_t:.3f}")
+import joblib
+os.makedirs("models", exist_ok=True)
+joblib.dump(clf, "models/rain_classifier.joblib")
+print("\n💾 Saved: models/rain_classifier.joblib")

scripts/train_classify_rain_hourly.py ADDED Viewed

	@@ -0,0 +1,143 @@

+import os
+import numpy as np
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import StandardScaler
+from sklearn.linear_model import LogisticRegression
+from sklearn.pipeline import Pipeline
+from sklearn.metrics import (
+    confusion_matrix, classification_report, roc_auc_score,
+    precision_recall_fscore_support
+)
+df = pd.read_csv("results/hourly.csv", parse_dates=["time"])
+H = 6
+precip_next = np.zeros(len(df), dtype=int)
+prec = df["precip_mm"].values
+for i in range(len(prec) - H):
+    precip_next[i] = 1 if np.any(prec[i+1:i+1+H] > 0) else 0
+df = df.iloc[:len(precip_next) - (0)].copy()
+df["rain_next6h"] = precip_next[:len(df)]
+features = [
+    "temp_c","humidity","cloudcover","pressure","wind_speed",
+    "precip_mm","rain_mm"
+]
+X = df[features].values
+y = df["rain_next6h"].values
+print("Class balance (0=no-rain, 1=rain-in-next6h):", np.bincount(y))
+X_train, X_test, y_train, y_test = train_test_split(
+    X, y, test_size=0.3, shuffle=False
+)
+clf = Pipeline([
+    ("scaler", StandardScaler()),
+    ("logreg", LogisticRegression(max_iter=500, class_weight="balanced"))
+])
+clf.fit(X_train, y_train)
+proba = clf.predict_proba(X_test)[:, 1]
+pred_050 = (proba >= 0.50).astype(int)
+cm = confusion_matrix(y_test, pred_050)
+print("\n📊 Confusion Matrix (thr=0.50)")
+print(cm)
+prec, rec, f1, _ = precision_recall_fscore_support(
+    y_test, pred_050, average="binary", zero_division=0
+)
+try:
+    auc = roc_auc_score(y_test, proba)
+except ValueError:
+    auc = float("nan")
+print(f"Precision: {prec:.3f}  Recall: {rec:.3f}  F1: {f1:.3f}  ROC-AUC: {auc:.3f}")
+print("\nDetailed report:")
+print(classification_report(y_test, pred_050, digits=3, zero_division=0))
+# Baselines
+always_no = np.zeros_like(y_test)
+p0, r0, f10, _ = precision_recall_fscore_support(
+    y_test, always_no, average="binary", zero_division=0
+)
+print("\n🧠 Baseline — always 'no rain'")
+print(f"Precision: {p0:.3f}  Recall: {r0:.3f}  F1: {f10:.3f}")
+# Persistence baseline
+recent_rain = (
+    pd.Series(df["precip_mm"])
+    .rolling(window=H, min_periods=1)
+    .sum()
+    .shift(1)
+    .fillna(0)
+    > 0
+).astype(int).values
+prev6_test = recent_rain[-len(y_test):]
+pp, rp, f1p, _ = precision_recall_fscore_support(y_test, prev6_test, average="binary", zero_division=0)
+print("\n🧠 Baseline — persistence (prev 6h)")
+print(f"Precision: {pp:.3f}  Recall: {rp:.3f}  F1: {f1p:.3f}")
+# Threshold tuning
+thr_recall = 0.35
+thr_precision = 0.65
+pred_recall = (proba >= thr_recall).astype(int)
+pred_precision = (proba >= thr_precision).astype(int)
+pr_recall, rc_recall, f1_recall, _ = precision_recall_fscore_support(
+    y_test, pred_recall, average="binary", zero_division=0
+)
+pr_precision, rc_precision, f1_precision, _ = precision_recall_fscore_support(
+    y_test, pred_precision, average="binary", zero_division=0
+)
+print(f"\n🎛️ Threshold {thr_recall:.2f} → Precision: {pr_recall:.3f}  Recall: {rc_recall:.3f}  F1: {f1_recall:.3f}")
+print(f"🎛️ Threshold {thr_precision:.2f} → Precision: {pr_precision:.3f}  Recall: {rc_precision:.3f}  F1: {f1_precision:.3f}")
+import joblib
+os.makedirs("models", exist_ok=True)
+joblib.dump(clf, "models/rain_classifier_hourly.joblib")
+print("\n💾 Saved: models/rain_classifier_hourly.joblib")
+meta = {
+    "horizon_hours": H,
+    "features": features,
+    "thresholds": {
+        "default": 0.50,
+        "high_recall": thr_recall,
+        "high_precision": thr_precision,
+    },
+    "metrics": {
+        "default": {"precision": float(prec), "recall": float(rec), "f1": float(f1)},
+        "high_recall": {
+            "precision": float(pr_recall),
+            "recall": float(rc_recall),
+            "f1": float(f1_recall),
+        },
+        "high_precision": {
+            "precision": float(pr_precision),
+            "recall": float(rc_precision),
+            "f1": float(f1_precision),
+        },
+        "baseline_persistence": {
+            "precision": float(pp),
+            "recall": float(rp),
+            "f1": float(f1p),
+        },
+    },
+}
+with open("models/rain_model_meta.json", "w") as fh:
+    import json
+    json.dump(meta, fh, indent=2)
+print("📝 Saved: models/rain_model_meta.json")