theelvace committed on
Commit
6eff894
·
0 Parent(s):

Deployable Gradio build

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
.DS_Store ADDED
Binary file (6.15 kB). View file
 
.dockerignore ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .git
2
+ .github
3
+ .venv
4
+ __pycache__
5
+ *.pyc
6
+ *.pyo
7
+ *.pyd
8
+ *.log
9
+ *.csv
10
+ data/
11
+ results/
12
+ build/
13
+ dist/
14
+ node_modules/
.gitattributes ADDED
@@ -0,0 +1 @@
 
 
1
+ models/*.joblib filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ data/
2
+ results/
3
+ logs/
4
+ __pycache__/
5
+ *.zip
6
+ .env
7
+ models/*.joblib
Dockerfile ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.11-slim

WORKDIR /app

# build-essential: compile wheels that lack prebuilt binaries.
# curl + ca-certificates: the Makefile's `check` target lists curl as a
# prerequisite of scripts/fetch_weather.sh, and the slim base image does not
# ship it. results/ is excluded by .dockerignore, so the very first request
# has to run the fetch step inside the container.
RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        curl && \
    rm -rf /var/lib/apt/lists/*

# Install dependencies before copying the sources so this layer stays cached
# when only application code changes.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY . .

ENV PORT=7860
# Go through a shell so the PORT variable is honoured (defaulting to 7860);
# `exec` keeps uvicorn as PID 1 so it receives container stop signals.
CMD ["sh", "-c", "exec uvicorn app.main:app --host 0.0.0.0 --port \"${PORT:-7860}\""]
16
+
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Elvis Anselm
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
Makefile ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ .PHONY: all check download process zip clean coords plot setup viz cli \
3
+ install uninstall rain rain6 rain-train rain-predict rain-now eval-plots \
4
+ hourly
5
+
6
+ all: check download process zip
7
+ @echo "🏁 Weather pipeline complete."
8
+
9
+ check:
10
+ @command -v curl >/dev/null || (echo "curl missing"; exit 1)
11
+ @command -v python3 >/dev/null || (echo "python3 missing"; exit 1)
12
+ @[ -f scripts/fetch_weather.sh ] || (echo "missing scripts/fetch_weather.sh"; exit 1)
13
+ @[ -f scripts/process_weather.py ] || (echo "missing scripts/process_weather.py"; exit 1)
14
+
15
+ download:
16
+ @bash scripts/fetch_weather.sh
17
+
18
+ process:
19
+ @python3 scripts/process_weather.py
20
+
21
+ zip:
22
+ @zip -j results/results.zip results/summary.txt results/summary.csv
23
+
24
+ clean:
25
+ @rm -rf data results logs
26
+ @mkdir -p data results logs
27
+
28
+ coords:
29
+ @LAT="$(LAT)" LON="$(LON)" bash scripts/fetch_weather.sh
30
+ @python3 scripts/process_weather.py
31
+ @zip -j results/results.zip results/summary.txt
32
+
33
+ plot:
34
+ @python3 scripts/plot_weather.py
35
+
36
+ viz: all plot
37
+ @echo "📊 Charts generated."
38
+
39
+ setup:
40
+ @python3 -m venv .venv
41
+ @. .venv/bin/activate && pip install -r requirements.txt
42
+
43
+ cli:
44
+ @python3 scripts/weather_cli.py --city Lagos --lat 6.5244 --lon 3.3792
45
+
46
+ install:
47
+ @. .venv/bin/activate && pip install -e .
48
+
49
+ uninstall:
50
+ @. .venv/bin/activate && pip uninstall -y weather-data-fetcher
51
+
52
+ rain:
53
+ @python3 scripts/train_classify_rain.py
54
+
55
+ rain6:
56
+ @python3 scripts/train_classify_rain_hourly.py
57
+
58
+ rain-train:
59
+ @python3 scripts/train_rain_dual_thresholds.py
60
+
61
+ rain-predict:
62
+ @python3 scripts/predict_rain.py
63
+
64
+ rain-now:
65
+ @weather-cli rain --mode recall
66
+
67
+ eval-plots:
68
+ @python3 scripts/plot_pr_roc.py
69
+
70
+ hourly:
71
+ @LAT="$(LAT)" LON="$(LON)" PAST_DAYS="$(PAST_DAYS)" bash scripts/fetch_weather.sh
72
+ @python3 scripts/export_hourly.py
73
+
74
+ .PHONY: xgb-train
75
+ xgb-train:
76
+ @python3 scripts/train_xgb_12h.py
77
+
78
+ .PHONY: xgb-train-cal
79
+ xgb-train-cal:
80
+ @python3 scripts/train_xgb_12h_calibrated.py
81
+
82
+ .PHONY: predict-log backfill monitor
83
+ predict-log:
84
+ @python3 scripts/log_predict.py --city "Lagos" --lat 6.5244 --lon 3.3792 --mode default
85
+
86
+ backfill:
87
+ @python3 scripts/backfill_labels.py
88
+
89
+ monitor:
90
+ @python3 scripts/monitor_weekly.py
91
+
92
+ .PHONY: cron-test
93
+ cron-test:
94
+ @./scripts/cron_predict.sh default "Lagos" 6.5244 3.3792 90 >> logs/cron.log 2>&1 && tail -n 5 logs/cron.log
README.md ADDED
@@ -0,0 +1,179 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Weather Data Fetcher
3
+ emoji: 🌧️
4
+ colorFrom: blue
5
+ colorTo: gray
6
+ sdk: docker
7
+ hub: registry.hf.space/theelvace/weather-data-fetcher-api:latest
8
+ pinned: false
9
+ ---
10
+
11
+ # Weather Data Fetcher — Automated Data Pipeline
12
+
13
+ Fetch daily Lagos (or any city) weather data using **Open-Meteo API**, process it with **Python**, and automate the full workflow via **Bash + Makefile**.
14
+
15
+ ---
16
+
17
+ ## Project Overview
18
+
19
+ This project demonstrates a clean, reproducible workflow for data automation — the same principles used in ML and DevOps pipelines.
20
+
21
+ **Pipeline Steps**
22
+
23
+ 1. Download daily weather JSON from Open-Meteo
24
+ 2. Parse, validate, and summarize data in Python
25
+ 3. Generate text + CSV summaries (and optional plots)
26
+ 4. Automate everything via a single `make all` command
27
+
28
+ ---
29
+
30
+ ## Charts
31
+
32
+ ## 🌧️ Rain Warning (next 6 hours)
33
+
34
+ Predict **whether it will rain in the next 6 hours** from hourly observations (temperature, humidity, pressure, wind, cloud cover, precipitation).
35
+
36
+ | Mode | Threshold | Precision | Recall | When to use |
37
+ | -------------- | --------- | --------- | ------ | ---------------------- |
38
+ | Default | 0.50 | 0.71 | 0.70 | Balanced alerts |
39
+ | High recall | 0.35 | 0.68 | 0.84 | Better safe than sorry |
40
+ | High precision | 0.65 | 0.79 | 0.50 | Only warn if confident |
41
+
42
+ ### Train once
43
+
44
+ ```bash
45
+ make hourly
46
+ make rain-train
47
+ make rain-now
48
+ python scripts/train_rain_dual_thresholds.py
49
+ python scripts/plot_pr_roc.py # refresh PR/ROC charts
50
+ ```
51
+
52
+ This produces:
53
+
54
+ - `models/rain_classifier_hourly.joblib`
55
+ - `models/rain_model_meta.json`
56
+ - `results/pr_curve.png`, `results/roc_curve.png`
57
+
58
+ ### Predict from the latest hour
59
+
60
+ ```bash
61
+ weather-cli rain --mode recall # warn more often
62
+ weather-cli rain --mode precision # fewer false alarms
63
+ ```
64
+
65
+ Example output:
66
+
67
+ ```
68
+ 2025-10-26 23:00:00 | P(rain ≤6h)=0.492 | mode=recall thr=0.35 → RAIN
69
+ 2025-10-26 23:00:00 | P(rain ≤6h)=0.492 | mode=precision thr=0.65 → No rain
70
+ ```
71
+
72
+ ### How thresholds are chosen
73
+
74
+ Training sweeps precision–recall trade-offs and stores two operating points:
75
+
76
+ | Threshold type | Purpose |
77
+ | -------------- | ------------------------------ |
78
+ | High recall | Catch >80 % of rain events |
79
+ | High precision | Warn only when ≥90 % confident |
80
+
81
+ ![PR Curve](results/pr_curve.png)
82
+ ![ROC Curve](results/roc_curve.png)
83
+
84
+ ### Model Interpretability
85
+
86
+ ML is not useful unless we can understand what it learned. This section explains why the classifier predicts rain, and not just whether it predicts rain.
87
+
88
+ - **Feature Coefficients (standardized):** which signals push toward rain vs no-rain
89
+
90
+ ```bash
91
+ python scripts/coef_rain.py # writes top weights
92
+ ```
93
+
94
+ Output → `results/coef_top15.txt`
95
+
96
+ - **Permutation importance:** which features matter most to F1 on the test set.
97
+ This tells us which variables the model relies on the most when making real predictions.
98
+ ```bash
99
+ python scripts/feature_importance_rain.py
100
+ ```
101
+ Output → `results/feature_importance.png`
102
+
103
+ The feature pipeline feeds the model both raw signals and short-term engineered features (hour-over-hour deltas and 3-hour rolling means). Positive coefficients push toward “RAIN”, negative ones toward “No rain”.
104
+
105
+ ### What the model actually learned (top signals)
106
+
107
+ | Feature | Meaning |
108
+ | ------------ | -------------------------------------------------------------------- |
109
+ | `precip_mm` | Existing rainfall strongly predicts more rain (tropical persistence) |
110
+ | `temp_c` | Warmer air holds more moisture → higher chance of near-term rain |
111
+ | `humidity` | High saturation = cloud condensation is likely |
112
+ | `pressure` | Falling pressure indicates unstable atmosphere / storm formation |
113
+ | `cloudcover` | More clouds = conditions building toward rainfall |
114
+ | `wind_speed` | Negative weight — stronger winds can disperse moisture |
115
+
116
+ The classifier isn’t guessing; it’s surfacing familiar meteorological patterns.
117
+
118
+ ### What drives the rain predictions?
119
+
120
+ Using SHAP explainability, I found that the model mainly relies on **humidity** and **temperature** when deciding if it will rain in the next 12 hours.
121
+
122
+ - High humidity pushes the model strongly toward predicting rain.
123
+ - Lower temperatures slightly increase rain probability.
124
+ - The interaction between humidity and temperature mimics real-world weather dynamics — humid, cool conditions tend to precede rainfall.
125
+
126
+ This means the model isn’t just memorizing data — it has captured meaningful relationships that align with atmospheric science.
127
+
128
+ ![Humidity vs Temperature SHAP interaction](results/shap_interaction.png)
129
+
130
+ > Generated via `python scripts/explain_shap_interaction.py`, which also writes `results/shap_interaction_rev.png` for the reverse view.
131
+
132
+ ## 🌧️ Rain Events (≥1.0 mm in next 12h)
133
+
134
+ **Label:** “Rain event if cumulative precipitation ≥ **1.0 mm** within the next **12 hours**.”
135
+ **Policy:** Default to **Early Warning** (recall-leaning) for Lagos conditions. Offer a stricter **Cautious Alert** mode.
136
+
137
+ **Train / thresholds / predict**
138
+
139
+ ```bash
140
+ # (data) pull 90 days of hourly data
141
+ make hourly PAST_DAYS=90
142
+
143
+ # (model) train XGBoost + Isotonic calibration
144
+ python scripts/train_xgb_12h_calibrated.py
145
+
146
+ # (CLI) two operating modes
147
+ weather-cli rain --mode recall # Early Warning (higher recall)
148
+ weather-cli rain --mode precision # Cautious Alert (stricter)
149
+ weather-cli rain # Balanced (best F1)
150
+ ```
151
+
152
+ ### 🌧️ Rain Warning (next 12h)
153
+ Train tuned model + set guarded thresholds:
154
+
155
+ ```bash
156
+ python scripts/xgb_tune_timeseries.py
157
+ python scripts/train_xgb_tuned_final.py
158
+ cp models/rain_xgb_tuned.joblib models/rain_classifier_hourly.joblib
159
+ cp models/rain_xgb_tuned_meta.json models/rain_model_meta.json
160
+ ```
161
+
162
+ ## Run Locally
163
+
164
+ Clone and run:
165
+
166
+ ```bash
167
+ make all
168
+ ```
169
+
170
+ ## CLI
171
+
172
+ Install (editable):
173
+
174
+ ```bash
175
+ python3 -m venv .venv && source .venv/bin/activate
176
+ pip install -e .
177
+ ```
178
+
179
+ Once installed, run `weather-cli --help` for all commands (including the rain mode above).
app.py ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import json
3
+ from pathlib import Path
4
+ import numpy as np
5
+ import pandas as pd
6
+ import streamlit as st
7
+ import joblib
8
+ import subprocess
9
+ import os
10
+ from datetime import datetime, timedelta
11
+
12
+ # Settings
13
+ MODEL_PATH = Path("models/rain_xgb_tuned.joblib")
14
+ META_PATH = Path("models/rain_xgb_tuned_meta.json")
15
+ HOURLY_CSV = Path("results/hourly.csv")
16
+
17
+ # Load model + meta
18
@st.cache_resource
def load_model():
    """Load the tuned classifier and its metadata, cached across Streamlit reruns.

    Shows an error and halts the script run when either artifact is missing.

    Returns:
        A ``(classifier, metadata)`` tuple: the object unpickled from
        ``MODEL_PATH`` and the dict parsed from ``META_PATH``.
    """
    artifacts_present = MODEL_PATH.exists() and META_PATH.exists()
    if not artifacts_present:
        st.error("Trained model not found. Run: python scripts/xgb_tune_timeseries.py && python scripts/train_xgb_tuned_final.py")
        st.stop()

    # NOTE(review): joblib.load unpickles the file; only load trusted artifacts.
    classifier = joblib.load(MODEL_PATH)
    metadata = json.loads(META_PATH.read_text())
    return classifier, metadata
26
+
27
def build_features_like_training(df: pd.DataFrame, features: list) -> pd.DataFrame:
    """Rebuild features with the training script's own builder and select *features*.

    Args:
        df: Hourly weather frame to derive features from.
        features: Column names to keep, in the order the model expects.

    Returns:
        The engineered frame restricted to ``features``.

    Raises:
        KeyError: If a requested feature is not produced by the builder.
    """
    # Imported lazily so the training module is only loaded when features are built.
    from scripts.train_xgb_tuned_final import build_features as training_builder

    return training_builder(df)[features]
31
+
32
def ensure_hourly(lat: float, lon: float, past_days: int = 90) -> pd.DataFrame:
    """Return the hourly weather frame for a location, refreshing the cache if needed.

    The cached CSV is reused only when it is at most 12 hours old AND was
    fetched for the same ``lat``/``lon``/``past_days``. Previously only the
    file age was checked, so switching city reused the previous city's data.

    Args:
        lat: Latitude passed to the fetch scripts via the ``LAT`` env variable.
        lon: Longitude passed via the ``LON`` env variable.
        past_days: History window passed via the ``PAST_DAYS`` env variable.

    Returns:
        The contents of ``HOURLY_CSV`` with the ``time`` column parsed as dates.

    Raises:
        subprocess.CalledProcessError: If either refresh script exits non-zero.
    """
    env = os.environ.copy()
    env["LAT"] = str(lat)
    env["LON"] = str(lon)
    env["PAST_DAYS"] = str(past_days)

    # Sidecar file recording which request produced the cached CSV.
    request_path = HOURLY_CSV.with_suffix(".request.json")
    request = {"lat": float(lat), "lon": float(lon), "past_days": int(past_days)}

    # If file is missing, stale (>12h), or belongs to another request, refresh
    needs_refresh = True
    if HOURLY_CSV.exists():
        modified = datetime.fromtimestamp(HOURLY_CSV.stat().st_mtime)
        age_hours = (datetime.now() - modified).total_seconds() / 3600.0
        try:
            cached_request = json.loads(request_path.read_text())
        except (OSError, ValueError):
            # Missing or unreadable sidecar: cannot prove the cache matches.
            cached_request = None
        needs_refresh = age_hours > 12 or cached_request != request

    if needs_refresh:
        st.info("Fetching fresh hourly weather…")
        subprocess.run(["bash", "scripts/fetch_weather.sh"], check=True, env=env)
        subprocess.run(["python3", "scripts/export_hourly.py"], check=True, env=env)
        # Record the request only after both steps succeeded.
        request_path.parent.mkdir(parents=True, exist_ok=True)
        request_path.write_text(json.dumps(request))

    return pd.read_csv(HOURLY_CSV, parse_dates=["time"])
50
+
51
+ # UI
52
+ st.set_page_config(page_title="Rain Nowcast (12h)", page_icon="🌧️", layout="centered")
53
+ st.title("🌧️ Rain Nowcast — next 12 hours")
54
+
55
+ clf, meta = load_model()
56
+ features = meta["features"]
57
+ thr = meta["thresholds"]
58
+ horizon_h = meta["horizon_hours"]
59
+
60
+ # Presets for cities
61
+ CITY_PRESETS = {
62
+ "Lagos 🇳🇬": (6.5244, 3.3792),
63
+ "Accra 🇬🇭": (5.6037, -0.1870),
64
+ "Nairobi 🇰🇪": (-1.2864, 36.8172),
65
+ "Kampala 🇺🇬": (0.3476, 32.5825),
66
+ "Addis 🇪🇹": (8.9806, 38.7578),
67
+ }
68
+
69
+ col1, col2 = st.columns(2)
70
+ with col1:
71
+ city = st.selectbox("City", list(CITY_PRESETS.keys()), index=0)
72
+ with col2:
73
+ mode = st.selectbox("Decision mode", ["default", "recall", "precision"], index=0)
74
+
75
+ lat, lon = CITY_PRESETS[city]
76
+ st.caption(f"Lat/Lon: **{lat:.4f}, {lon:.4f}** • Horizon: **{horizon_h}h** • Mode: **{mode}**")
77
+
78
+ df = ensure_hourly(lat, lon, past_days=90)
79
+
80
+ Xdf = build_features_like_training(df.copy(), features)
81
+ if Xdf.empty:
82
+ st.error("Not enough data to build features. Try again after fetch.")
83
+ st.stop()
84
+
85
+ x_last = Xdf.iloc[[-1]].values
86
+ p = float(clf.predict_proba(x_last)[0, 1])
87
+ thr_map = {
88
+ "default": float(thr["default"]),
89
+ "recall": float(thr["high_recall"]),
90
+ "precision": float(thr["high_precision"]),
91
+ }
92
+ t = thr_map[mode]
93
+ decision = "RAIN" if p >= t else "No rain"
94
+
95
+ st.subheader("Prediction")
96
+ st.metric(
97
+ label=f"P(rain ≤ {horizon_h}h)",
98
+ value=f"{p:.3f}",
99
+ delta=f"threshold={t:.2f}",
100
+ delta_color="inverse" if p < t else "normal"
101
+ )
102
+ st.markdown(
103
+ f"**Decision:** {'🌧️ RAIN' if decision=='RAIN' else '✅ No rain'} "
104
+ f"(mode **{mode}**, threshold **{t:.2f}**)"
105
+ )
106
+
107
+ st.subheader("Last 48h — context")
108
+ last48 = df.tail(48).copy()
109
+ c1, c2 = st.columns(2)
110
+ with c1:
111
+ st.line_chart(data=last48.set_index("time")[["temp_c", "humidity"]])
112
+ with c2:
113
+ st.line_chart(data=last48.set_index("time")[["precip_mm", "rain_mm"]])
114
+
115
+ # --- Probability sparkline over last 48h ---
116
+ st.subheader("Last 48h — rain probability")
117
+ # Recompute probabilities for all available rows, then show last 48 aligned to time
118
+ probas_all = clf.predict_proba(Xdf.values)[:, 1]
119
+ proba_series = pd.Series(probas_all, index=Xdf.index, name="p_rain")
120
+ # Align times (Xdf is derived from df; both share row order except dropped NaNs at head)
121
+ times_aligned = df.loc[Xdf.index, "time"]
122
+ last48_p = pd.DataFrame({"time": times_aligned, "p_rain": proba_series}).tail(48).set_index("time")
123
+ st.line_chart(last48_p)
124
+
125
+ # --- Download buttons ---
126
+ st.subheader("Downloads")
127
+ st.download_button(
128
+ label="⬇️ Download hourly.csv",
129
+ data=df.to_csv(index=False).encode("utf-8"),
130
+ file_name="hourly.csv",
131
+ mime="text/csv",
132
+ )
133
+
134
+ latest_frame = pd.DataFrame({
135
+ "time": [df.loc[Xdf.index, "time"].iloc[-1]],
136
+ "p_rain_next_12h": [p],
137
+ "mode": [mode],
138
+ "threshold": [t],
139
+ "decision": [decision],
140
+ })
141
+ st.download_button(
142
+ label="⬇️ Download latest_prediction.csv",
143
+ data=latest_frame.to_csv(index=False).encode("utf-8"),
144
+ file_name="latest_prediction.csv",
145
+ mime="text/csv",
146
+ )
147
+
148
+ # Explain thresholds
149
+ with st.expander("What do these modes mean?"):
150
+ st.write("""
151
+ - **default**: balanced (good everyday choice)
152
+ - **recall**: warn more (catches more rain, may over-warn)
153
+ - **precision**: be picky (alerts are rare but confident)
154
+ """)
155
+
156
+ st.caption("Model: XGBoost (tuned) • Features rebuilt exactly like training • Data: Open-Meteo hourly")
app/main.py ADDED
@@ -0,0 +1,293 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ FastAPI application exposing the rain nowcast API and a Gradio UI.
3
+
4
+ The previous Streamlit proxy was difficult to keep alive on Spaces due to
5
+ websocket restrictions. This module provides the same REST endpoints while
6
+ mounting a lightweight Gradio front-end so the UI works without websocket
7
+ tunnelling.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import os
13
+ import json
14
+ import subprocess
15
+ from datetime import datetime
16
+ from pathlib import Path
17
+ from typing import Dict, Tuple
18
+
19
+ import joblib
20
+ import pandas as pd
21
+ import gradio as gr
22
+ from fastapi import FastAPI, HTTPException, Query
23
+ from pydantic import BaseModel, Field
24
+ from xgboost import XGBClassifier
25
+
26
+ # --------- Paths ---------
27
+ ROOT = Path(__file__).resolve().parents[1]
28
+ MODELS = ROOT / "models"
29
+ RESULTS = ROOT / "results"
30
+ SCRIPTS = ROOT / "scripts"
31
+
32
+ MODEL_PATH = MODELS / "rain_xgb_tuned.joblib"
33
+ META_PATH = MODELS / "rain_xgb_tuned_meta.json"
34
+ MODEL_JSON_PATH = MODELS / "xgb_tuned.json"
35
+ HOURLY_CSV = RESULTS / "hourly.csv"
36
+
37
+ # Make training utilities importable.
38
+ import sys
39
+
40
+ sys.path.insert(0, str(ROOT))
41
+ from scripts.train_xgb_tuned_final import build_features # type: ignore
42
+
43
+ # --------- Load model + meta at startup ---------
44
+ if not META_PATH.exists():
45
+ raise RuntimeError(
46
+ "Model metadata missing. Run `python scripts/train_xgb_tuned_final.py` "
47
+ "or copy models/rain_xgb_tuned_meta.json into place."
48
+ )
49
+
50
+ meta = json.loads(META_PATH.read_text())
51
+ FEATURES = meta["features"]
52
+ THRESH = meta["thresholds"]
53
+ HORIZON_H = int(meta["horizon_hours"])
54
+
55
+
56
def _load_model() -> XGBClassifier:
    """Load the classifier, preferring the joblib artifact over the JSON booster.

    Returns:
        The object stored in ``MODEL_PATH`` when that file exists; otherwise an
        ``XGBClassifier`` built from the params recorded in the metadata and
        populated from ``MODEL_JSON_PATH``.

    Raises:
        RuntimeError: If neither model artifact exists.
    """
    if MODEL_PATH.exists():
        # NOTE(review): joblib.load unpickles the file; only load trusted artifacts.
        return joblib.load(MODEL_PATH)

    if not MODEL_JSON_PATH.exists():
        raise RuntimeError(
            "Model artifact missing. Run `python scripts/train_xgb_tuned_final.py` "
            "to generate models/rain_xgb_tuned.joblib (or xgb_tuned.json), "
            "or copy the trained file into the models/ directory."
        )

    saved_params = meta.get("model", {}).get("params", {})
    classifier = XGBClassifier(**saved_params)
    classifier.load_model(MODEL_JSON_PATH)
    return classifier
71
+
72
+
73
+ model = _load_model()
74
+
75
+ # --------- Helpers ---------
76
def ensure_hourly(lat: float, lon: float, past_days: int = 90) -> pd.DataFrame:
    """Return the hourly weather frame for a location, refreshing the cache if needed.

    The cached CSV is reused only when it is at most 6 hours old AND was
    fetched for the same ``lat``/``lon``/``past_days``. Previously only the
    file age was checked, so a request for a different location within the
    window was scored against the previous location's data.

    Args:
        lat: Latitude passed to the fetch scripts via the ``LAT`` env variable.
        lon: Longitude passed via the ``LON`` env variable.
        past_days: History window passed via the ``PAST_DAYS`` env variable.

    Returns:
        The contents of ``HOURLY_CSV`` with the ``time`` column parsed as dates.

    Raises:
        HTTPException: 502 when either refresh script exits non-zero.
    """
    env = os.environ.copy()
    env["LAT"] = str(lat)
    env["LON"] = str(lon)
    env["PAST_DAYS"] = str(past_days)

    # Sidecar file recording which request produced the cached CSV.
    request_path = HOURLY_CSV.with_suffix(".request.json")
    request = {"lat": float(lat), "lon": float(lon), "past_days": int(past_days)}

    needs_refresh = True
    if HOURLY_CSV.exists():
        age_hours = (datetime.now().timestamp() - HOURLY_CSV.stat().st_mtime) / 3600
        try:
            cached_request = json.loads(request_path.read_text())
        except (OSError, ValueError):
            # Missing or unreadable sidecar: cannot prove the cache matches.
            cached_request = None
        needs_refresh = age_hours > 6 or cached_request != request

    if needs_refresh:
        try:
            subprocess.run(["bash", str(SCRIPTS / "fetch_weather.sh")], check=True, env=env)
            subprocess.run(["python3", str(SCRIPTS / "export_hourly.py")], check=True, env=env)
        except subprocess.CalledProcessError as exc:
            raise HTTPException(status_code=502, detail=f"Data refresh failed: {exc}") from exc
        # Record the request only after both steps succeeded.
        request_path.parent.mkdir(parents=True, exist_ok=True)
        request_path.write_text(json.dumps(request))

    return pd.read_csv(HOURLY_CSV, parse_dates=["time"])
96
+
97
+
98
def predict_latest(df: pd.DataFrame, mode: str) -> Dict[str, object]:
    """Build features, score the latest hour, and return a structured response.

    Args:
        df: Hourly weather frame containing a ``time`` column.
        mode: One of ``"default"``, ``"recall"`` or ``"precision"``.

    Returns:
        A dict with ``timestamp`` (ISO string), ``probability``, ``threshold``,
        ``mode``, ``decision`` (``"RAIN"`` or ``"No rain"``) and
        ``horizon_hours``.

    Raises:
        HTTPException: 400 for an unsupported mode, 422 when no feature rows
            can be built, 500 when the built features lack a model column.
    """
    thresholds = {
        "default": float(THRESH["default"]),
        "recall": float(THRESH["high_recall"]),
        "precision": float(THRESH["high_precision"]),
    }
    # Reject a bad mode first: it is a client error and should not be masked
    # by (or pay for) feature building and inference.
    if mode not in thresholds:
        raise HTTPException(status_code=400, detail=f"Unsupported mode '{mode}'.")
    threshold = thresholds[mode]

    Xdf = build_features(df.copy())
    if Xdf.empty:
        raise HTTPException(status_code=422, detail="Not enough rows to build features.")

    try:
        Xdf = Xdf[FEATURES]
    except KeyError as exc:
        raise HTTPException(status_code=500, detail=f"Feature mismatch: {exc}") from exc

    # Score only the most recent feature row.
    x = Xdf.iloc[[-1]].values
    probability = float(model.predict_proba(x)[0, 1])

    decision = "RAIN" if probability >= threshold else "No rain"
    # Looks up the timestamp through the feature frame's index labels;
    # assumes build_features keeps the input frame's labels — TODO confirm.
    ts = df.loc[Xdf.index, "time"].iloc[-1]

    return {
        "timestamp": ts.isoformat(),
        "probability": probability,
        "threshold": threshold,
        "mode": mode,
        "decision": decision,
        "horizon_hours": HORIZON_H,
    }
132
+
133
+
134
def format_prediction(result: Dict[str, object]) -> str:
    """Render a prediction dict as a short markdown summary for the UI.

    Args:
        result: Mapping with ``decision``, ``mode``, ``probability``,
            ``threshold`` and ``timestamp`` keys.

    Returns:
        A markdown string: a headline with the decision, then a bullet list.
    """
    decision = result["decision"]
    icon = "🌧️" if decision == "RAIN" else "✅"
    lines = [
        # Trailing newline plus the join below leaves a blank line before the bullets.
        f"{icon} **Decision:** {decision} (mode **{result['mode']}**)\n",
        f"- Probability of rain ≤ {HORIZON_H}h: **{result['probability']:.3f}**",
        f"- Threshold: **{result['threshold']:.2f}**",
        f"- Issued for hour ending **{result['timestamp']}**",
    ]
    return "\n".join(lines)
147
+
148
+
149
class PredictBody(BaseModel):
    """Request body for ``POST /predict``.

    Coordinates are range-checked so that out-of-range values are rejected at
    the API boundary instead of being handed to the fetch scripts.
    """

    # Geographic latitude in degrees; valid range is [-90, 90].
    lat: float = Field(6.5244, ge=-90, le=90, description="Latitude")
    # Geographic longitude in degrees; valid range is [-180, 180].
    lon: float = Field(3.3792, ge=-180, le=180, description="Longitude")
    mode: str = Field("default", description="default | recall | precision")
    past_days: int = Field(90, ge=14, le=180, description="How much history to fetch (days)")
154
+
155
+
156
+ app = FastAPI(title="Rain Nowcast API", version="1.1.0")
157
+
158
+
159
@app.get("/health")
def health() -> Dict[str, object]:
    """Report service status together with the loaded model's configuration.

    Returns:
        A dict with ``status``, ``model_file``, ``horizon_hours``,
        ``thresholds`` and ``features``.
    """
    # _load_model prefers the joblib artifact and falls back to the JSON
    # booster, so mirror that precedence instead of always naming the joblib.
    artifact = MODEL_PATH if MODEL_PATH.exists() else MODEL_JSON_PATH
    return {
        "status": "ok",
        "model_file": artifact.name,
        "horizon_hours": HORIZON_H,
        "thresholds": THRESH,
        "features": FEATURES,
    }
168
+
169
+
170
@app.post("/predict")
def predict(body: PredictBody) -> Dict[str, object]:
    """Score the latest hour for the location and mode given in the JSON body.

    Returns:
        ``{"ok": True, "result": <prediction dict>}``.
    """
    hourly = ensure_hourly(body.lat, body.lon, body.past_days)
    return {"ok": True, "result": predict_latest(hourly, body.mode)}
175
+
176
+
177
@app.get("/predict")
def predict_get(
    lat: float = Query(6.5244, ge=-90, le=90),
    lon: float = Query(3.3792, ge=-180, le=180),
    mode: str = Query("default"),
    past_days: int = Query(90, ge=14, le=180),
) -> Dict[str, object]:
    """Score the latest hour for a location supplied as query parameters.

    Coordinates are range-checked so that out-of-range values are rejected at
    the API boundary instead of being handed to the fetch scripts.

    Returns:
        ``{"ok": True, "result": <prediction dict>}``.
    """
    df = ensure_hourly(lat, lon, past_days)
    out = predict_latest(df, mode)
    return {"ok": True, "result": out}
187
+
188
+
189
+ # --------- Gradio UI ---------
190
+ CITY_PRESETS: Dict[str, Tuple[float, float]] = {
191
+ "Lagos 🇳🇬": (6.5244, 3.3792),
192
+ "Accra 🇬🇭": (5.6037, -0.1870),
193
+ "Nairobi 🇰🇪": (-1.2864, 36.8172),
194
+ "Kampala 🇺🇬": (0.3476, 32.5825),
195
+ "Addis Ababa 🇪🇹": (8.9806, 38.7578),
196
+ "Custom": (0.0, 0.0),
197
+ }
198
+
199
+
200
def _resolve_location(city: str, lat: float, lon: float) -> Tuple[float, float, str]:
    """Pick the coordinates and display label for a UI selection.

    Args:
        city: Selected preset name, or ``"Custom"``.
        lat: Latitude used when no preset applies.
        lon: Longitude used when no preset applies.

    Returns:
        ``(latitude, longitude, label)``. A known preset other than
        ``"Custom"`` yields its stored coordinates and its own name; anything
        else yields the supplied coordinates with a ``Custom (...)`` label.
    """
    preset = None if city == "Custom" else CITY_PRESETS.get(city)
    if preset is None:
        return lat, lon, f"Custom ({lat:.3f}, {lon:.3f})"

    preset_lat, preset_lon = preset
    return preset_lat, preset_lon, city
208
+
209
+
210
def gradio_predict(
    city: str,
    lat: float,
    lon: float,
    mode: str,
    past_days: int,
) -> Tuple[str, pd.DataFrame, pd.DataFrame]:
    """Run a prediction for the UI and prepare its three outputs.

    Args:
        city: Selected preset name, or ``"Custom"``.
        lat: Latitude used when the city is ``"Custom"``.
        lon: Longitude used when the city is ``"Custom"``.
        mode: Decision mode (``default`` / ``recall`` / ``precision``).
        past_days: History window in days.

    Returns:
        ``(summary_markdown, latest_prediction_frame, chart_frame)``. The chart
        frame is in long format with columns ``time``, ``variable`` and
        ``value``, matching the ``LinePlot`` configured below.

    Raises:
        gr.Error: When data refresh or scoring fails, carrying the detail
            message so the UI can show it.
    """
    chosen_lat, chosen_lon, label = _resolve_location(city, lat, lon)
    try:
        # int(): the value is stringified into PAST_DAYS for the fetch script;
        # presumably the slider can deliver a float — TODO confirm.
        df = ensure_hourly(chosen_lat, chosen_lon, int(past_days))
        result = predict_latest(df, mode)
    except HTTPException as exc:
        # HTTPException is meant for the REST routes; re-raise as gr.Error so
        # the message reaches the Gradio front-end.
        raise gr.Error(str(exc.detail)) from exc

    summary = format_prediction(result)

    # Keep `time` as a real column and melt to long format: the plot addresses
    # columns by name (x="time") and draws one line per `variable`.
    chart = df.tail(48).melt(
        id_vars="time",
        value_vars=["temp_c", "humidity", "precip_mm", "rain_mm"],
        var_name="variable",
        value_name="value",
    )

    latest = pd.DataFrame(
        {
            "location": [label],
            "timestamp": [result["timestamp"]],
            "mode": [result["mode"]],
            "probability": [result["probability"]],
            "threshold": [result["threshold"]],
            "decision": [result["decision"]],
        }
    )
    return summary, latest, chart


with gr.Blocks(css=".gradio-container {max-width: 900px;}") as demo:
    gr.Markdown("# 🌧️ Rain Nowcast\nPredict the probability of rain in the next "
                f"{HORIZON_H} hours using the tuned XGBoost model.")

    with gr.Row():
        city_input = gr.Dropdown(
            label="City preset",
            choices=list(CITY_PRESETS.keys()),
            value="Lagos 🇳🇬",
        )
        mode_input = gr.Radio(
            label="Decision mode",
            choices=["default", "recall", "precision"],
            value="default",
            info="default=balanced, recall=warn more, precision=extra picky",
        )

    with gr.Row():
        lat_input = gr.Number(label="Latitude (used if city is Custom)", value=6.5244)
        lon_input = gr.Number(label="Longitude (used if city is Custom)", value=3.3792)
        past_days_input = gr.Slider(
            label="History window (days)",
            minimum=14,
            maximum=180,
            value=90,
            step=1,
        )

    submit = gr.Button("Run prediction", variant="primary")

    summary_md = gr.Markdown()
    latest_df = gr.Dataframe(label="Latest prediction", wrap=True)
    # One y column plus a colour dimension: the series names live in the
    # `variable` column produced by gradio_predict.
    chart_df = gr.LinePlot(
        label="Last 48h weather (hourly)",
        x="time",
        y="value",
        color="variable",
        overlay_point=True,
        width="100%",
        height=350,
    )

    submit.click(
        gradio_predict,
        inputs=[city_input, lat_input, lon_input, mode_input, past_days_input],
        outputs=[summary_md, latest_df, chart_df],
    )

    gr.Markdown(
        "Model features match the training pipeline "
        "(see `scripts/train_xgb_tuned_final.py`). Data fetched from Open-Meteo."
    )


# Mounted last so the REST routes registered above keep precedence over the UI at "/".
app = gr.mount_gradio_app(app, demo, path="/")
assets/cover.png ADDED
assets/feature_importance.png ADDED
assets/pr_curve.png ADDED
assets/precip.png ADDED
assets/roc_curve.png ADDED
assets/temps.png ADDED
models/rain_model_meta.json ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "features": [
3
+ "temp_c",
4
+ "humidity",
5
+ "cloudcover",
6
+ "pressure",
7
+ "wind_speed",
8
+ "precip_mm",
9
+ "rain_mm",
10
+ "d_temp_c",
11
+ "d_humidity",
12
+ "d_cloudcover",
13
+ "d_pressure",
14
+ "d_wind_speed",
15
+ "d_precip_mm",
16
+ "d_rain_mm",
17
+ "ma3_temp_c",
18
+ "ma3_humidity",
19
+ "ma3_cloudcover",
20
+ "ma3_pressure",
21
+ "ma3_wind_speed",
22
+ "ma3_precip_mm",
23
+ "ma3_rain_mm"
24
+ ],
25
+ "horizon_hours": 12,
26
+ "thresholds": {
27
+ "default": 0.22239363491823944,
28
+ "high_recall": 0.22239363491823944,
29
+ "high_precision": 0.745196376322855
30
+ },
31
+ "metrics": {
32
+ "default": {
33
+ "threshold": 0.22239363491823944,
34
+ "precision": 0.8433098591549296,
35
+ "recall": 0.98559670781893,
36
+ "f1": 0.9089184060721063,
37
+ "auc": 0.7839506172839507,
38
+ "cm": [
39
+ [
40
+ 13,
41
+ 89
42
+ ],
43
+ [
44
+ 7,
45
+ 479
46
+ ]
47
+ ]
48
+ },
49
+ "high_recall": {
50
+ "threshold": 0.22239363491823944,
51
+ "precision": 0.8433098591549296,
52
+ "recall": 0.98559670781893,
53
+ "f1": 0.9089184060721063,
54
+ "auc": 0.7839506172839507,
55
+ "cm": [
56
+ [
57
+ 13,
58
+ 89
59
+ ],
60
+ [
61
+ 7,
62
+ 479
63
+ ]
64
+ ]
65
+ },
66
+ "high_precision": {
67
+ "threshold": 0.745196376322855,
68
+ "precision": 0.9033018867924528,
69
+ "recall": 0.7880658436213992,
70
+ "f1": 0.8417582417582418,
71
+ "auc": 0.7839506172839507,
72
+ "cm": [
73
+ [
74
+ 61,
75
+ 41
76
+ ],
77
+ [
78
+ 103,
79
+ 383
80
+ ]
81
+ ]
82
+ }
83
+ }
84
+ }
models/rain_xgb_cal_meta.json ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "xgboost+isotonic",
3
+ "features": [
4
+ "temp_c",
5
+ "humidity",
6
+ "cloudcover",
7
+ "pressure",
8
+ "wind_speed",
9
+ "precip_mm",
10
+ "rain_mm",
11
+ "d_temp_c",
12
+ "d_humidity",
13
+ "d_cloudcover",
14
+ "d_pressure",
15
+ "d_wind_speed",
16
+ "d_precip_mm",
17
+ "d_rain_mm",
18
+ "ma3_temp_c",
19
+ "ma3_humidity",
20
+ "ma3_cloudcover",
21
+ "ma3_pressure",
22
+ "ma3_wind_speed",
23
+ "ma3_precip_mm",
24
+ "ma3_rain_mm",
25
+ "d3_pressure",
26
+ "d3_humidity",
27
+ "d3_cloudcover",
28
+ "d3_temp_c",
29
+ "dew_proxy",
30
+ "d_dew_proxy",
31
+ "ma3_dew_proxy",
32
+ "rain_sum_3h",
33
+ "rain_sum_6h",
34
+ "rain_sum_12h",
35
+ "rain_sum_24h",
36
+ "rain_max_6h",
37
+ "rain_max_12h",
38
+ "dry_streak_h",
39
+ "wet_streak_h",
40
+ "hour_sin",
41
+ "hour_cos",
42
+ "dow_sin",
43
+ "dow_cos",
44
+ "hum_x_cloud",
45
+ "wind_x_cloud",
46
+ "press_drop_3h"
47
+ ],
48
+ "horizon_hours": 12,
49
+ "event_mm": 1.0,
50
+ "label_desc": "Rain event if cumulative precip \u2265 1.0 mm in next 12h",
51
+ "thresholds": {
52
+ "default": 0.5123772621154785,
53
+ "high_recall": 0.26928117871284485,
54
+ "high_precision": 0.6026621460914612
55
+ },
56
+ "metrics": {
57
+ "default": {
58
+ "threshold": 0.5123772621154785,
59
+ "precision": 0.5376344086021505,
60
+ "recall": 0.8438818565400844,
61
+ "f1": 0.6568144499178982,
62
+ "auc": 0.7253714914694552,
63
+ "cm": [
64
+ [
65
+ 173,
66
+ 172
67
+ ],
68
+ [
69
+ 37,
70
+ 200
71
+ ]
72
+ ],
73
+ "pos_rate": 0.6391752577319587
74
+ },
75
+ "high_recall": {
76
+ "threshold": 0.26928117871284485,
77
+ "precision": 0.4976190476190476,
78
+ "recall": 0.8818565400843882,
79
+ "f1": 0.6362252663622526,
80
+ "auc": 0.7253714914694552,
81
+ "cm": [
82
+ [
83
+ 134,
84
+ 211
85
+ ],
86
+ [
87
+ 28,
88
+ 209
89
+ ]
90
+ ],
91
+ "pos_rate": 0.7216494845360825
92
+ },
93
+ "high_precision": {
94
+ "threshold": 0.6026621460914612,
95
+ "precision": 0.6490384615384616,
96
+ "recall": 0.569620253164557,
97
+ "f1": 0.6067415730337079,
98
+ "auc": 0.7253714914694552,
99
+ "cm": [
100
+ [
101
+ 272,
102
+ 73
103
+ ],
104
+ [
105
+ 102,
106
+ 135
107
+ ]
108
+ ],
109
+ "pos_rate": 0.35738831615120276
110
+ }
111
+ },
112
+ "policy": {
113
+ "default": "best F1 (balanced, early-warning baseline)",
114
+ "high_recall": "recall\u22650.88 & precision\u22650.55 & pos_rate\u22640.80",
115
+ "high_precision": "precision\u22650.80 & recall\u22650.45 (Moderate)"
116
+ }
117
+ }
models/rain_xgb_meta.json ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "features": [
3
+ "temp_c",
4
+ "humidity",
5
+ "cloudcover",
6
+ "pressure",
7
+ "wind_speed",
8
+ "precip_mm",
9
+ "rain_mm",
10
+ "d_temp_c",
11
+ "d_humidity",
12
+ "d_cloudcover",
13
+ "d_pressure",
14
+ "d_wind_speed",
15
+ "d_precip_mm",
16
+ "d_rain_mm",
17
+ "ma3_temp_c",
18
+ "ma3_humidity",
19
+ "ma3_cloudcover",
20
+ "ma3_pressure",
21
+ "ma3_wind_speed",
22
+ "ma3_precip_mm",
23
+ "ma3_rain_mm",
24
+ "pressure_d3h",
25
+ "humidity_d3h",
26
+ "cloudcover_d3h",
27
+ "dew_proxy",
28
+ "d_dew_proxy",
29
+ "ma3_dew_proxy"
30
+ ],
31
+ "horizon_hours": 12,
32
+ "thresholds": {
33
+ "default": 0.4454699456691742,
34
+ "high_recall": 0.4454699456691742,
35
+ "high_precision": 0.8384796977043152
36
+ },
37
+ "metrics": {
38
+ "default": {
39
+ "threshold": 0.4454699456691742,
40
+ "precision": 0.8151093439363817,
41
+ "recall": 0.9403669724770642,
42
+ "f1": 0.873269435569755,
43
+ "auc": 0.7489787718475859,
44
+ "cm": [
45
+ [
46
+ 44,
47
+ 93
48
+ ],
49
+ [
50
+ 26,
51
+ 410
52
+ ]
53
+ ],
54
+ "pos_rate": 0.8778359511343804
55
+ },
56
+ "high_recall": {
57
+ "threshold": 0.4454699456691742,
58
+ "precision": 0.8151093439363817,
59
+ "recall": 0.9403669724770642,
60
+ "f1": 0.873269435569755,
61
+ "auc": 0.7489787718475859,
62
+ "cm": [
63
+ [
64
+ 44,
65
+ 93
66
+ ],
67
+ [
68
+ 26,
69
+ 410
70
+ ]
71
+ ],
72
+ "pos_rate": 0.8778359511343804
73
+ },
74
+ "high_precision": {
75
+ "threshold": 0.8384796977043152,
76
+ "precision": 0.9012345679012346,
77
+ "recall": 0.5022935779816514,
78
+ "f1": 0.6450662739322534,
79
+ "auc": 0.7489787718475859,
80
+ "cm": [
81
+ [
82
+ 113,
83
+ 24
84
+ ],
85
+ [
86
+ 217,
87
+ 219
88
+ ]
89
+ ],
90
+ "pos_rate": 0.42408376963350786
91
+ }
92
+ },
93
+ "model_type": "xgboost"
94
+ }
models/rain_xgb_tuned_meta.json ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "features": [
3
+ "temp_c",
4
+ "humidity",
5
+ "cloudcover",
6
+ "pressure",
7
+ "wind_speed",
8
+ "precip_mm",
9
+ "rain_mm",
10
+ "d_temp_c",
11
+ "d_humidity",
12
+ "d_cloudcover",
13
+ "d_pressure",
14
+ "d_wind_speed",
15
+ "d_precip_mm",
16
+ "d_rain_mm",
17
+ "ma3_temp_c",
18
+ "ma3_humidity",
19
+ "ma3_cloudcover",
20
+ "ma3_pressure",
21
+ "ma3_wind_speed",
22
+ "ma3_precip_mm",
23
+ "ma3_rain_mm",
24
+ "d3_pressure",
25
+ "d3_humidity",
26
+ "d3_cloudcover",
27
+ "d3_temp_c",
28
+ "dew_proxy",
29
+ "d_dew_proxy",
30
+ "ma3_dew_proxy",
31
+ "rain_sum_3h",
32
+ "rain_sum_6h",
33
+ "rain_sum_12h",
34
+ "rain_sum_24h",
35
+ "rain_max_6h",
36
+ "rain_max_12h",
37
+ "dry_streak_h",
38
+ "wet_streak_h",
39
+ "hour_sin",
40
+ "hour_cos",
41
+ "dow_sin",
42
+ "dow_cos",
43
+ "hoy_sin",
44
+ "hoy_cos",
45
+ "hum_x_cloud",
46
+ "wind_x_cloud",
47
+ "press_drop_3h",
48
+ "press_drop_6h"
49
+ ],
50
+ "horizon_hours": 12,
51
+ "event_mm": 1.0,
52
+ "model": {
53
+ "type": "xgboost",
54
+ "params": {
55
+ "learning_rate": 0.05,
56
+ "max_depth": 3,
57
+ "n_estimators": 500,
58
+ "subsample": 0.8,
59
+ "colsample_bytree": 0.8,
60
+ "min_child_weight": 3
61
+ }
62
+ },
63
+ "thresholds": {
64
+ "default": 0.15,
65
+ "high_recall": 0.1,
66
+ "high_precision": 0.6
67
+ },
68
+ "cv_mean": {
69
+ "P": 0.6167141877942365,
70
+ "R": 0.40142749648205356,
71
+ "F1": 0.4687631522470538,
72
+ "AUC": 0.6838816207078178
73
+ },
74
+ "cv_folds": [
75
+ {
76
+ "P": 0.44,
77
+ "R": 0.4782608695652174,
78
+ "F1": 0.4583333333333333,
79
+ "AUC": 0.6755671077504725,
80
+ "thr": 0.15
81
+ },
82
+ {
83
+ "P": 0.7757009345794392,
84
+ "R": 0.4088669950738916,
85
+ "F1": 0.535483870967742,
86
+ "AUC": 0.7078279587697148,
87
+ "thr": 0.4764537811279297
88
+ },
89
+ {
90
+ "P": 0.82,
91
+ "R": 0.4270833333333333,
92
+ "F1": 0.5616438356164384,
93
+ "AUC": 0.6740785256410257,
94
+ "thr": 0.9872803688049316
95
+ },
96
+ {
97
+ "P": 0.32323232323232326,
98
+ "R": 0.3764705882352941,
99
+ "F1": 0.34782608695652173,
100
+ "AUC": 0.6218912881608338,
101
+ "thr": 0.9168330430984497
102
+ },
103
+ {
104
+ "P": 0.7246376811594203,
105
+ "R": 0.31645569620253167,
106
+ "F1": 0.44052863436123346,
107
+ "AUC": 0.7400432232170422,
108
+ "thr": 0.8837475776672363
109
+ }
110
+ ]
111
+ }
models/xgb_tuned.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "params": {
3
+ "learning_rate": 0.05,
4
+ "max_depth": 3,
5
+ "n_estimators": 500,
6
+ "subsample": 0.8,
7
+ "colsample_bytree": 0.8,
8
+ "min_child_weight": 3
9
+ },
10
+ "mean_f1": 0.5780780663579321
11
+ }
pyproject.toml ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [project]
2
+ name = "weather-data-fetcher"
3
+ version = "0.2.0"
4
+ description = "Fetch, process, and visualize daily weather from Open-Meteo."
5
+ readme = "README.md"
6
+ requires-python = ">=3.10"
7
+ authors = [{ name = "Elvis Anselm" }]
8
+ license = "MIT"
9
+ dependencies = [
10
+ "requests",
11
+ "pandas",
12
+ "matplotlib",
13
+ "python-dotenv",
14
+ "pillow"
15
+ ]
16
+
17
+ [tool.setuptools.packages.find]
18
+ include = ["weather_cli*"]
19
+ exclude = ["data*", "logs*", "results*", "assets*"]
20
+
21
+ [project.scripts]
22
+ weather-cli = "weather_cli.cli:main"
23
+
24
+ [build-system]
25
+ requires = ["setuptools>=68", "wheel"]
26
+ build-backend = "setuptools.build_meta"
render.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ services:
2
+ - type: web
3
+ name: rain-nowcast-api
4
+ env: docker
5
+ autoDeploy: true
6
+ plan: free
7
+ dockerCommand: null
8
+ healthCheckPath: /health
requirements.txt ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ certifi==2025.10.5
2
+ charset-normalizer==3.4.4
3
+ contourpy==1.3.3
4
+ cycler==0.12.1
5
+ fonttools==4.60.1
6
+ idna==3.11
7
+ joblib==1.5.2
8
+ kiwisolver==1.4.9
9
+ matplotlib==3.10.7
10
+ numpy==2.3.4
11
+ packaging==25.0
12
+ pandas==2.3.3
13
+ pillow==12.0.0
14
+ pyparsing==3.2.5
15
+ python-dateutil==2.9.0.post0
16
+ python-dotenv==1.1.1
17
+ pytz==2025.2
18
+ requests==2.32.5
19
+ scikit-learn==1.7.2
20
+ scipy==1.16.2
21
+ six==1.17.0
22
+ threadpoolctl==3.6.0
23
+ tzdata==2025.2
24
+ urllib3==2.5.0
25
+ -e git+https://github.com/Elvaceishim/weather_data_fetcher.git@ac53d9c31c4be6eda7988f97e1768f998c7a9f0a#egg=weather_data_fetcher
26
+ fastapi
27
+ uvicorn[standard]
28
+ pydantic
29
+ xgboost
30
+ huggingface_hub
31
+ streamlit
32
+ gradio
scripts/analyze_weather.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+
3
+ df = pd.read_csv("results/summary.csv")
4
+ print("\n=== HEAD ===")
5
+ print(df.head())
6
+ print("\n=== DESCRIBE ===")
7
+ print(df.describe())
8
+ print("\n=== COLUMNS ===")
9
+ print(df.columns)
10
+ print("\n=== MISSING VALUES ===")
11
+ print(df.isna().sum())
scripts/backfill_labels.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ import json, os, argparse
3
+ from pathlib import Path
4
+ from datetime import datetime, timedelta
5
+ import pandas as pd, numpy as np, subprocess
6
+
7
+ META = Path("models/rain_xgb_tuned_meta.json")
8
+ LOGS = Path("logs")
9
+ PRED_LOG = LOGS / "predictions.csv"
10
+
11
def ensure_hourly(lat, lon, past_days=120):
    """Refresh the hourly weather export for one location and load it.

    Runs ``scripts/fetch_weather.sh`` followed by ``scripts/export_hourly.py``
    as child processes. The location and look-back window are handed to them
    through the ``LAT``, ``LON`` and ``PAST_DAYS`` environment variables.

    Returns:
        The contents of ``results/hourly.csv`` as a DataFrame, with the
        ``time`` column parsed as datetimes.

    Raises:
        subprocess.CalledProcessError: if either child process exits non-zero.
    """
    child_env = os.environ.copy()
    child_env.update(
        {"LAT": str(lat), "LON": str(lon), "PAST_DAYS": str(past_days)}
    )
    pipeline = (
        ["bash", "scripts/fetch_weather.sh"],
        ["python3", "scripts/export_hourly.py"],
    )
    for command in pipeline:
        subprocess.run(command, check=True, env=child_env)
    return pd.read_csv("results/hourly.csv", parse_dates=["time"])
17
+
18
def label_from_df(df, ts_pred, horizon_h, event_mm):
    """Derive the observed rain label for a prediction issued at ``ts_pred``.

    The row of ``df`` whose ``time`` is nearest to ``ts_pred`` is used as the
    anchor (it must lie within one hour), and ``precip_mm`` is summed over the
    ``horizon_h`` rows that follow it (NaNs count as zero).

    Args:
        df: Hourly observations with ``time`` and ``precip_mm`` columns,
            ordered by time.
        ts_pred: Timestamp at which the prediction was made.
        horizon_h: Number of rows after the anchor to accumulate.
        event_mm: Precipitation total at or above which the label is 1.

    Returns:
        1 if the accumulated precipitation reaches ``event_mm``;
        0 if the whole horizon is present in ``df`` and the total stays below;
        None if the timestamp cannot be aligned, or if the data ends before
        the horizon does and the partial total has not reached ``event_mm``
        (a partial window cannot prove that no event occurred).
    """
    if len(df) == 0 or pd.isna(ts_pred):
        return None

    # Work positionally so the result does not depend on df having a RangeIndex.
    times = df["time"].reset_index(drop=True)
    precip = df["precip_mm"].to_numpy(dtype=float)

    offsets = (times - ts_pred).abs()
    if offsets.isna().all():
        return None
    pos = int(offsets.idxmin())
    if offsets.iloc[pos].total_seconds() > 3600:
        return None  # can't align

    window = precip[pos + 1 : pos + 1 + horizon_h]
    total = float(np.nansum(window))
    if total >= event_mm:
        return 1
    if len(window) < horizon_h:
        # Horizon extends past the available data: the outcome is still unknown.
        return None
    return 0
27
+
28
def main():
    """Backfill missing ``y_true`` labels in the prediction log.

    Reads ``PRED_LOG``, and for every row whose ``y_true`` is empty and whose
    forecast horizon (``horizon_hours`` from the model metadata) has already
    elapsed, derives the observed label from hourly weather data and writes
    it back. The log is rewritten in place at the end.

    Hourly data is fetched once per distinct (lat, lon) pair rather than once
    per row, since every fetch covers the same 120-day window.
    """
    if not PRED_LOG.exists():
        print("No predictions.csv found.")
        return

    meta = json.loads(Path(META).read_text())
    H = int(meta["horizon_hours"])
    event_mm = float(meta["event_mm"])

    df = pd.read_csv(PRED_LOG, parse_dates=["ts_pred", "logged_at"])
    pending = df[df["y_true"].isna() | (df["y_true"] == "")]

    now = datetime.now()
    hourly_by_location = {}  # (lat, lon) -> hourly DataFrame
    updated = 0
    for i, row in pending.iterrows():
        ts_pred = row["ts_pred"]
        if now < ts_pred + timedelta(hours=H):
            continue  # horizon not passed yet

        # Fetch enough history to cover that timestamp, reusing earlier fetches.
        location = (row["lat"], row["lon"])
        if location not in hourly_by_location:
            hourly_by_location[location] = ensure_hourly(
                row["lat"], row["lon"], past_days=120
            )

        y = label_from_df(hourly_by_location[location], ts_pred, H, event_mm)
        if y is not None:
            df.at[i, "y_true"] = int(y)
            updated += 1

    df.to_csv(PRED_LOG, index=False)
    print(f"Backfilled {updated} rows into {PRED_LOG}")
51
+
52
+ if __name__ == "__main__":
53
+ main()
scripts/coef_rain.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import json, joblib, pandas as pd
3
+ from sklearn.model_selection import train_test_split
4
+
5
# Load the feature list saved at training time. The context manager closes the
# file handle that a bare open() call would leave dangling.
with open("models/rain_model_meta.json") as meta_file:
    meta = json.load(meta_file)
clf = joblib.load("models/rain_classifier_hourly.joblib")

# The coefficients come straight from the fitted pipeline, so no weather data
# needs to be loaded to rank them.
features = meta["features"]
logreg = clf.named_steps["logreg"]
coefs = logreg.coef_[0]

# zip() would silently truncate on a mismatch and mislabel the weights.
if len(features) != len(coefs):
    raise ValueError(
        f"Meta lists {len(features)} features but the model has {len(coefs)} coefficients"
    )

# Rank by absolute weight, largest first.
rank = sorted(zip(features, coefs), key=lambda x: abs(x[1]), reverse=True)

out_lines = ["Feature coefficients (standardized space):"]
for name, w in rank[:15]:
    out_lines.append(f"{name:20s} {w:+.3f}")

print("\n".join(out_lines))
with open("results/coef_top15.txt", "w") as f:
    f.write("\n".join(out_lines))

print("✅ Wrote results/coef_top15.txt")
scripts/cron_predict.sh ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+
3
+ set -euo pipefail
4
+
5
+ # --- Resolve repo root ---
6
+ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
7
+ REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
8
+ cd "$REPO_ROOT"
9
+
10
+ # --- Lock to avoid overlapping runs (portable; no flock needed) ---
11
+ mkdir -p logs
12
+ LOCKDIR="logs/.predict.lock"
13
+ if ! mkdir "$LOCKDIR" 2>/dev/null; then
14
+ echo "[$(date '+%F %T')] Another run is in progress. Skipping."
15
+ exit 0
16
+ fi
17
+ trap 'rmdir "$LOCKDIR" 2>/dev/null || true' EXIT
18
+
19
+ # --- Args & defaults ---
20
+ MODE="${1:-default}"
21
+ CITY="${2:-Lagos}"
22
+ LAT="${3:-6.5244}"
23
+ LON="${4:-3.3792}"
24
+ PAST_DAYS="${5:-90}"
25
+
26
+ # --- Activate venv if present ---
27
+ if [[ -f ".venv/bin/activate" ]]; then
28
+ # shellcheck disable=SC1091
29
+ source .venv/bin/activate
30
+ fi
31
+
32
+ # --- Environment for fetch scripts ---
33
+ export LAT="$LAT" LON="$LON" PAST_DAYS="$PAST_DAYS"
34
+
35
+ # --- Run one logged prediction ---
36
+ echo "[$(date '+%F %T')] cron_predict: city=$CITY lat=$LAT lon=$LON mode=$MODE days=$PAST_DAYS"
37
+ python3 scripts/log_predict.py --city "$CITY" --lat "$LAT" --lon "$LON" --mode "$MODE" || {
38
+ echo "[$(date '+%F %T')] ERROR: log_predict failed"
39
+ exit 1
40
+ }
41
+
42
# --- (Optional) basic log rotation (keep log under ~1MB) ---
# `stat -f%z` is BSD/macOS syntax; under GNU stat (Linux, Docker) `-f` means
# --file-system and the size test fails. `wc -c` is POSIX and works on both.
LOGFILE="logs/cron.log"
if [[ -f "$LOGFILE" ]]; then
  LOGSIZE="$(wc -c < "$LOGFILE" | tr -d '[:space:]')"
  if [[ "$LOGSIZE" -gt 1048576 ]]; then
    mv "$LOGFILE" "logs/cron_$(date +%Y%m%d_%H%M%S).log" || true
  fi
fi
47
+
48
+ echo "[$(date '+%F %T')] cron_predict: done."
scripts/cv_benchmark.py ADDED
@@ -0,0 +1,243 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import json, warnings
3
+ from pathlib import Path
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+
8
+ from sklearn.model_selection import TimeSeriesSplit
9
+ from sklearn.pipeline import Pipeline
10
+ from sklearn.preprocessing import StandardScaler, RobustScaler
11
+ from sklearn.linear_model import LogisticRegression
12
+ from sklearn.metrics import precision_recall_fscore_support, precision_recall_curve
13
+
14
+ warnings.filterwarnings("ignore")
15
+
16
+ H = 12
17
+ EVENT_MM = 1.0
18
+
19
+ HOURLY = Path("results/hourly.csv")
20
+ META = Path("models/rain_model_meta.json")
21
+
22
+ # -----------------------------
23
+ # Feature builder (same as CLI/trainer)
24
+ # -----------------------------
25
def rebuild_features_like_training(df: pd.DataFrame, features_from_meta: list) -> pd.DataFrame:
    """Build the model feature matrix from raw hourly observations.

    Derived columns: first differences and 3-row moving averages of the base
    variables, 3-row deltas, a dew-point proxy, rolling precipitation sums and
    maxima, dry/wet streak lengths, cyclical time encodings, two interaction
    terms and pressure drops. Rows containing any NaN (the warm-up rows of the
    rolling windows, among others) are dropped and the index is reset.

    The input frame is left untouched; all work happens on a copy.

    Args:
        df: Hourly data with a datetime ``time`` column plus ``temp_c``,
            ``humidity``, ``cloudcover``, ``pressure``, ``wind_speed``,
            ``precip_mm`` and ``rain_mm``.
        features_from_meta: Ordered feature names to return. When empty or
            None, the full built-in feature list is returned instead.

    Returns:
        A DataFrame holding only the selected feature columns, in order.

    Raises:
        ValueError: if ``df`` lacks a required column, or if
            ``features_from_meta`` names a column that was not built.
    """
    base = ["temp_c", "humidity", "cloudcover", "pressure", "wind_speed", "precip_mm", "rain_mm"]
    d3_cols = ["pressure", "humidity", "cloudcover", "temp_c"]

    missing = ({"time"} | set(base)) - set(df.columns)
    if missing:
        raise ValueError(f"Hourly data missing columns: {sorted(missing)}")

    # Never write derived columns into the caller's frame.
    df = df.copy()

    for c in base:
        df[f"d_{c}"] = df[c].diff()
        df[f"ma3_{c}"] = df[c].rolling(3).mean()

    for c in d3_cols:
        df[f"d3_{c}"] = df[c] - df[c].shift(3)

    df["dew_proxy"] = df["temp_c"] - (df["humidity"] / 5.0)
    df["d_dew_proxy"] = df["dew_proxy"].diff()
    df["ma3_dew_proxy"] = df["dew_proxy"].rolling(3).mean()

    for hours in (3, 6, 12, 24):
        df[f"rain_sum_{hours}h"] = df["precip_mm"].rolling(hours).sum()
    for hours in (6, 12):
        df[f"rain_max_{hours}h"] = df["precip_mm"].rolling(hours).max()

    def _streak(flag: pd.Series) -> pd.Series:
        # Length of the current run of 1s in a 0/1 series; 0 wherever flag is 0.
        run_id = (flag != flag.shift()).cumsum()
        return (flag.groupby(run_id).cumcount() + 1) * flag

    wet = (df["precip_mm"] > 0).astype(int)
    dry = 1 - wet
    df["dry_streak_h"] = _streak(dry)
    df["wet_streak_h"] = _streak(wet)

    df["hour"] = df["time"].dt.hour
    df["dow"] = df["time"].dt.dayofweek
    df["doy"] = df["time"].dt.dayofyear
    df["hoy"] = (df["doy"] - 1) * 24 + df["hour"]

    # Sine/cosine pairs keep the ends of each cycle adjacent.
    df["hour_sin"] = np.sin(2 * np.pi * df["hour"] / 24.0)
    df["hour_cos"] = np.cos(2 * np.pi * df["hour"] / 24.0)
    df["dow_sin"] = np.sin(2 * np.pi * df["dow"] / 7.0)
    df["dow_cos"] = np.cos(2 * np.pi * df["dow"] / 7.0)
    df["hoy_sin"] = np.sin(2 * np.pi * df["hoy"] / (365.25 * 24))
    df["hoy_cos"] = np.cos(2 * np.pi * df["hoy"] / (365.25 * 24))

    df["hum_x_cloud"] = df["humidity"] * df["cloudcover"]
    df["wind_x_cloud"] = df["wind_speed"] * df["cloudcover"]
    df["press_drop_3h"] = -df["d3_pressure"]
    df["press_drop_6h"] = df["pressure"].shift(6) - df["pressure"]

    df = df.dropna().reset_index(drop=True)

    if features_from_meta:
        missing_feats = [c for c in features_from_meta if c not in df.columns]
        if missing_feats:
            raise ValueError(f"Missing features expected by model: {missing_feats}")
        return df[features_from_meta]

    feat = (
        base
        + [f"d_{c}" for c in base]
        + [f"ma3_{c}" for c in base]
        + [f"d3_{c}" for c in d3_cols]
        + ["dew_proxy", "d_dew_proxy", "ma3_dew_proxy",
           "rain_sum_3h", "rain_sum_6h", "rain_sum_12h", "rain_sum_24h",
           "rain_max_6h", "rain_max_12h",
           "dry_streak_h", "wet_streak_h",
           "hour_sin", "hour_cos", "dow_sin", "dow_cos", "hoy_sin", "hoy_cos",
           "hum_x_cloud", "wind_x_cloud", "press_drop_3h", "press_drop_6h"]
    )
    return df[feat]
96
+
97
+ # -----------------------------
98
+ # Label builder: ≥ EVENT_MM in next H hours
99
+ # -----------------------------
100
def make_labels(df: pd.DataFrame, horizon=H, event_mm=EVENT_MM):
    """Label each row by whether a rain event follows within ``horizon`` rows.

    Row ``i`` gets 1 when the NaN-ignoring sum of ``precip_mm`` over rows
    ``i+1 .. i+horizon`` is at least ``event_mm``, else 0. The final
    ``horizon`` rows have no complete look-ahead window and are cut off, so
    the result is ``horizon`` entries shorter than ``df``.
    """
    precip = df["precip_mm"].values
    labels = np.zeros(len(df), dtype=int)
    last_start = len(precip) - horizon
    start = 0
    while start < last_start:
        upcoming = precip[start + 1 : start + 1 + horizon]
        labels[start] = int(np.nansum(upcoming) >= event_mm)
        start += 1
    return labels[:-horizon]
107
+
108
+ # -----------------------------
109
+ # Models to compare
110
+ # -----------------------------
111
def build_models():
    """Return the candidate classifiers to benchmark, keyed by display name.

    Always includes two class-balanced logistic regressions, one behind a
    StandardScaler and one behind an outlier-robust RobustScaler. An XGBoost
    classifier is added when it can be imported and constructed; otherwise a
    warning is printed and it is left out.
    """
    def _balanced_logreg(scaler):
        return Pipeline([
            ("scaler", scaler),
            ("clf", LogisticRegression(max_iter=2000, class_weight="balanced")),
        ])

    candidates = {
        "logreg_standard": _balanced_logreg(StandardScaler()),
        "logreg_robust": _balanced_logreg(RobustScaler()),
    }

    try:
        from xgboost import XGBClassifier

        xgb_params = {
            "n_estimators": 800,
            "learning_rate": 0.05,
            "max_depth": 5,
            "min_child_weight": 3.0,
            "subsample": 0.8,
            "colsample_bytree": 0.8,
            "reg_lambda": 2.0,
            "objective": "binary:logistic",
            "eval_metric": "aucpr",
            "tree_method": "hist",
            "random_state": 42,
        }
        candidates["xgb"] = XGBClassifier(**xgb_params)
    except Exception as e:
        print(f"[warn] XGBoost unavailable: {e}")
    return candidates
144
+
145
def evaluate_fold(model, X_train, y_train, X_test, y_test, val_frac=0.15):
    """Fit ``model`` on one CV fold and score it at a validation-tuned threshold.

    The last ``val_frac`` of the training rows (at least one) is held out as a
    validation slice. The model is fitted on the remainder, the decision
    threshold with the best validation F1 is selected, and the test slice is
    scored at that threshold.

    Scores come from ``predict_proba`` when available, otherwise from
    ``decision_function`` min-max scaled on the validation slice. A model with
    neither is scored on its hard ``predict`` output and reported with a
    nominal threshold of 0.5.

    Returns:
        dict with test-set ``P``, ``R``, ``F1`` and the chosen ``thr``.
    """
    n = len(X_train)
    v = max(int(n * val_frac), 1)
    X_tr, y_tr = X_train[:-v], y_train[:-v]
    X_val, y_val = X_train[-v:], y_train[-v:]

    model.fit(X_tr, y_tr)

    # Scores on the validation slice (to pick a threshold) and on the test slice.
    if hasattr(model, "predict_proba"):
        p_val = model.predict_proba(X_val)[:, 1]
        p_test = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, "decision_function"):
        from sklearn.preprocessing import MinMaxScaler
        z_val = model.decision_function(X_val).reshape(-1, 1)
        z_test = model.decision_function(X_test).reshape(-1, 1)
        mm = MinMaxScaler()
        p_val = mm.fit_transform(z_val).ravel()
        p_test = mm.transform(z_test).ravel()
    else:
        # fallback: hard predictions at 0.5
        pred = model.predict(X_test)
        P, R, F1, _ = precision_recall_fscore_support(y_test, pred, average="binary", zero_division=0)
        return dict(P=P, R=R, F1=F1, thr=0.5)

    # precision_recall_curve already yields precision/recall for predictions
    # `score >= thresholds[i]`, so F1 per candidate follows directly without
    # re-scoring the validation slice once per threshold.
    precision, recall, thresholds = precision_recall_curve(y_val, p_val)
    if len(thresholds) == 0:
        t_star = 0.5
    else:
        # The arrays carry one extra trailing point with no threshold; drop it.
        p, r = precision[:-1], recall[:-1]
        denom = p + r
        f1 = np.divide(
            2.0 * p * r,
            denom,
            out=np.zeros_like(denom, dtype=float),
            where=denom > 0,
        )
        # argmax keeps the first (lowest) threshold among ties.
        t_star = thresholds[int(np.argmax(f1))]

    pred = (p_test >= t_star).astype(int)
    P, R, F1, _ = precision_recall_fscore_support(y_test, pred, average="binary", zero_division=0)
    return dict(P=P, R=R, F1=F1, thr=float(t_star))
187
+
188
+ # -----------------------------
189
+ # Main
190
+ # -----------------------------
191
def main():
    """Benchmark every model from ``build_models`` with 5-fold time-series CV.

    Loads the hourly CSV, builds labels and features (using the feature list
    from the model metadata file when that file exists), aligns the two by
    keeping the trailing rows, then prints per-fold and mean precision /
    recall / F1 for each model, followed by a summary sorted by mean F1.

    Raises:
        FileNotFoundError: if the hourly CSV does not exist.
        ValueError: if there are fewer labels than feature rows.
    """
    if not HOURLY.exists():
        raise FileNotFoundError("results/hourly.csv not found. Run: make hourly PAST_DAYS=90")

    hourly = pd.read_csv(HOURLY, parse_dates=["time"])
    y_all = make_labels(hourly, H, EVENT_MM)
    feature_input = hourly.iloc[:-H].copy()

    # Use features from meta if present
    if META.exists():
        features_from_meta = json.loads(META.read_text()).get("features", None)
    else:
        features_from_meta = None

    Xdf = rebuild_features_like_training(feature_input, features_from_meta)
    n = len(Xdf)
    if len(y_all) < n:
        raise ValueError("Labels shorter than feature matrix; check preprocessing alignment.")
    # Feature building drops warm-up rows, so keep the trailing n labels.
    y = y_all[-n:]
    X = Xdf.values[-n:]
    assert len(X) == len(y), "Feature matrix and labels misaligned."

    splitter = TimeSeriesSplit(n_splits=5)
    results = {}

    for name, model in build_models().items():
        print(f"\n=== {name} ===")
        per_fold = []
        for fold_id, (tr_idx, te_idx) in enumerate(splitter.split(X), start=1):
            metrics = evaluate_fold(model, X[tr_idx], y[tr_idx], X[te_idx], y[te_idx])
            per_fold.append(metrics)
            print(f"Fold {fold_id} → P={metrics['P']:.3f} R={metrics['R']:.3f} F1={metrics['F1']:.3f} thr={metrics['thr']:.3f}")

        # Aggregate
        Pm, Rm, Fm = (np.mean([m[key] for m in per_fold]) for key in ("P", "R", "F1"))
        print(f"Mean → P={Pm:.3f} R={Rm:.3f} F1={Fm:.3f}")
        results[name] = dict(P=Pm, R=Rm, F1=Fm)

    print("\n=== SUMMARY (higher F1 is better) ===")
    ranked = sorted(results.items(), key=lambda kv: kv[1]["F1"], reverse=True)
    for name, m in ranked:
        print(f"{name:18s} F1={m['F1']:.3f} P={m['P']:.3f} R={m['R']:.3f}")
241
+
242
+ if __name__ == "__main__":
243
+ main()
scripts/download_models.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ from pathlib import Path
5
+
6
+ from huggingface_hub import hf_hub_download
7
+ from huggingface_hub.errors import EntryNotFoundError
8
+
9
+
10
def main() -> None:
    """Download model artifacts from the Hugging Face Hub into a local folder.

    Configuration comes from environment variables: ``MODEL_REPO_ID`` (source
    repo), ``MODEL_FILES`` (whitespace-separated file names) and ``MODEL_DIR``
    (destination directory, created if needed). Files missing from the repo
    are reported and skipped rather than treated as fatal.
    """
    env = os.environ
    repo_id = env.get("MODEL_REPO_ID", "theelvace/weather-data-fetcher-models")
    requested = env.get(
        "MODEL_FILES",
        "rain_xgb_tuned.joblib rain_xgb_tuned_meta.json",
    )
    target_dir = Path(env.get("MODEL_DIR", "models"))
    target_dir.mkdir(parents=True, exist_ok=True)

    # str.split() with no argument already discards empty/whitespace tokens.
    filenames = requested.split()
    if not filenames:
        print("MODEL_FILES is empty; nothing to download.")
        return

    for filename in filenames:
        print(f"Downloading {filename} from {repo_id} ...")
        try:
            local_path = hf_hub_download(
                repo_id=repo_id,
                filename=filename,
                local_dir=target_dir,
                local_dir_use_symlinks=False,
            )
        except EntryNotFoundError:
            print(f" • Skipping {filename}: not found in {repo_id}.")
        else:
            print(f"Saved to {local_path}")
38
+
39
+
40
+ if __name__ == "__main__":
41
+ main()
scripts/eval_operating_points.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from pathlib import Path
3
+ import numpy as np, pandas as pd
4
+ from sklearn.metrics import precision_recall_fscore_support, confusion_matrix
5
+
6
+ meta = json.loads(Path("models/rain_xgb_tuned_meta.json").read_text())
7
+ thr = meta["thresholds"]
8
+ H = meta["horizon_hours"]
9
+
10
+ df = pd.read_csv("results/hourly.csv", parse_dates=["time"])
11
+
12
+ # make labels (>=1.0mm in next H hours)
13
+ prec = df["precip_mm"].values
14
+ y = np.zeros(len(df), dtype=int)
15
+ for i in range(len(prec) - H):
16
+ y[i] = 1 if np.nansum(prec[i+1:i+1+H]) >= meta["event_mm"] else 0
17
+ y = y[:-H]
18
+ dfX = df.iloc[:-H].copy()
19
+
20
+ # rebuild features exactly like training
21
+ # local import
22
+ import importlib.util
23
+ import types
24
+
25
def load_build_features():
    """Import the training script by file path and return its ``build_features``.

    The script is loaded from ``scripts/train_xgb_tuned_final.py`` under the
    module name ``train_xgb_tuned_final`` and executed in full.
    """
    script_path = Path("scripts/train_xgb_tuned_final.py")
    module_spec = importlib.util.spec_from_file_location("train_xgb_tuned_final", script_path)
    trainer = importlib.util.module_from_spec(module_spec)
    module_spec.loader.exec_module(trainer)  # type: ignore
    return trainer.build_features
30
+
31
+ build_features = load_build_features()
32
+ Xdf = build_features(dfX)
33
+ X = Xdf.values
34
+ y = y[-len(X):] # align
35
+
36
+ import joblib
37
+ clf = joblib.load("models/rain_xgb_tuned.joblib")
38
+ p = clf.predict_proba(X)[:,1]
39
+
40
def report(name, t):
    """Print precision, recall, F1, alert rate and confusion matrix at threshold ``t``.

    Uses the module-level probabilities ``p`` and labels ``y``; a row counts
    as an alert when its probability is at least ``t``.
    """
    alerts = (p >= t).astype(int)
    P, R, F1, _ = precision_recall_fscore_support(y, alerts, average="binary", zero_division=0)
    matrix = confusion_matrix(y, alerts).tolist()
    alert_rate = float(alerts.mean())
    print(f"{name:<10} thr={t:.3f} | P={P:.3f} R={R:.3f} F1={F1:.3f} | alerts={alert_rate:.2%} | cm={matrix}")
46
+
47
+ report("default", thr["default"])
48
+ report("recall", thr["high_recall"])
49
+ report("precision", thr["high_precision"])
scripts/explain_shap.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ import shap
3
+ import joblib
4
+ import pandas as pd
5
+ import matplotlib.pyplot as plt
6
+ from pathlib import Path
7
+ import json
8
+ import numpy as np
9
+ import os
10
+
11
+ # ensure matplotlib cache lives inside repo
12
+ RESULTS_DIR = Path("results")
13
+ RESULTS_DIR.mkdir(exist_ok=True)
14
+ os.environ.setdefault("MPLCONFIGDIR", str(RESULTS_DIR / ".matplotlib"))
15
+ Path(os.environ["MPLCONFIGDIR"]).mkdir(parents=True, exist_ok=True)
16
+
17
+ # === Load model + metadata ===
18
+ model = joblib.load("models/rain_xgb_tuned.joblib")
19
+ meta = json.load(open("models/rain_xgb_tuned_meta.json"))
20
+ features = meta["features"]
21
+
22
+ # === Load data ===
23
+ df = pd.read_csv("results/hourly.csv", parse_dates=["time"])
24
+
25
+ # Rebuild features exactly like training
26
+ import importlib.util
27
+
28
+ spec = importlib.util.spec_from_file_location(
29
+ "train_xgb_tuned_final", Path("scripts/train_xgb_tuned_final.py")
30
+ )
31
+ module = importlib.util.module_from_spec(spec)
32
+ spec.loader.exec_module(module)
33
+ build_features = module.build_features
34
+
35
+ Xdf = build_features(df)
36
+ X = Xdf.values.astype(np.float32)
37
+
38
+ # Use last 500 samples for analysis (avoid overkill)
39
+ X_sample = X[-200:]
40
+
41
+ # === SHAP Explainer ===
42
+ explainer = shap.Explainer(model.predict_proba, X_sample, algorithm="permutation")
43
+ shap_values = explainer(X_sample)
44
+
45
+ # === Global importance ===
46
+ Path("results").mkdir(exist_ok=True)
47
+ plt.figure()
48
+ shap.summary_plot(shap_values, X_sample,
49
+ feature_names=features, show=False)
50
+ plt.tight_layout()
51
+ plt.savefig("results/shap_summary.png", dpi=300)
52
+ plt.close()
53
+
54
+ # === Bar chart version ===
55
+ plt.figure()
56
+ shap.summary_plot(shap_values, X_sample,
57
+ feature_names=features, plot_type="bar", show=False)
58
+ plt.tight_layout()
59
+ plt.savefig("results/shap_top.png", dpi=300)
60
+ plt.close()
61
+
62
+ print("✅ SHAP visualisations saved: results/shap_summary.png and results/shap_top.png")
scripts/explain_shap_interaction.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Generates a SHAP dependence plot showing how HUMIDITY and
4
+ TEMPERATURE
5
+ jointly influence rain predictions. Outputs:
6
+ - results/shap_interaction.png
7
+ """
8
+ import json
9
+ from pathlib import Path
10
+
11
+ import joblib
12
+ import numpy as np
13
+ import pandas as pd
14
+ import matplotlib.pyplot as plt
15
+ import shap
16
+ import importlib.util
17
+ import os
18
+
19
+ # Keep matplotlib caches inside repo to avoid home directory issues
20
+ RESULTS_DIR = Path("results")
21
+ RESULTS_DIR.mkdir(exist_ok=True)
22
+ os.environ.setdefault("MPLCONFIGDIR", str(RESULTS_DIR / ".matplotlib"))
23
+ Path(os.environ["MPLCONFIGDIR"]).mkdir(parents=True, exist_ok=True)
24
+
25
+ # Load model + meta
26
+ model = joblib.load("models/rain_xgb_tuned.joblib")
27
+ booster = model.get_booster()
28
+ config = json.loads(booster.save_config())
29
+ base_score = config.get("learner", {}).get("learner_model_param", {}).get("base_score")
30
+ if base_score:
31
+ cleaned = base_score.strip("[]")
32
+ try:
33
+ float(cleaned)
34
+ except ValueError:
35
+ cleaned = "0.5"
36
+ config["learner"]["learner_model_param"]["base_score"] = cleaned
37
+ booster.load_config(json.dumps(config))
38
+
39
+ meta = json.loads(Path("models/rain_xgb_tuned_meta.json").read_text())
40
+ features = meta["features"]
41
+
42
+ # Load data and rebuild features exactly like training
43
+ df = pd.read_csv("results/hourly.csv", parse_dates=["time"])
44
+
45
+ spec = importlib.util.spec_from_file_location(
46
+ "train_xgb_tuned_final", "scripts/train_xgb_tuned_final.py"
47
+ )
48
+ module = importlib.util.module_from_spec(spec)
49
+ spec.loader.exec_module(module)
50
+ build_features = module.build_features
51
+ Xdf = build_features(df) # same order as training
52
+ X = Xdf.values
53
+ X_sample = X[-120:] if len(X) > 120 else X
54
+ X_sample_df = pd.DataFrame(X_sample, columns=features)
55
+ X_sample_df = pd.DataFrame(X_sample, columns=features)
56
+
57
+ # Prefer TreeExplainer for XGBoost; fallback to generic Explainer if needed
58
+ try:
59
+ explainer = shap.TreeExplainer(booster, data=X_sample)
60
+ shap_result = explainer(X_sample)
61
+ except Exception:
62
+ explainer = shap.Explainer(model.predict_proba, X_sample, algorithm="permutation")
63
+ shap_result = explainer(X_sample)
64
+
65
+ # Normalize SHAP output to a 2D array aligned with feature columns
66
+ if hasattr(shap_result, "values"):
67
+ values = shap_result.values
68
+ if values.ndim == 3: # multi-class, take positive class (index 1)
69
+ values = values[:, :, 1]
70
+ shap_values = values
71
+ else:
72
+ shap_values = np.array(shap_result)
73
+
74
+ # Ensure sample frame matches SHAP output rows
75
+ X_plot = X_sample_df.iloc[-shap_values.shape[0]:]
76
+
77
+ Path("results").mkdir(exist_ok=True)
78
+
79
+ # 1) Dependence plot: humidity colored by temp_c (classic interaction view)
80
+ plt.figure()
81
+ shap.dependence_plot(
82
+ "humidity",
83
+ shap_values,
84
+ X_plot,
85
+ interaction_index="temp_c",
86
+ show=False
87
+ )
88
+ plt.tight_layout()
89
+ plt.savefig("results/shap_interaction.png", dpi=300)
90
+ plt.close()
91
+
92
+ # 2) (Optional) Reverse view: temp_c colored by humidity
93
+ plt.figure()
94
+ shap.dependence_plot(
95
+ "temp_c",
96
+ shap_values,
97
+ X_plot,
98
+ interaction_index="humidity",
99
+ show=False
100
+ )
101
+ plt.tight_layout()
102
+ plt.savefig("results/shap_interaction_rev.png", dpi=300)
103
+ plt.close()
104
+
105
+ print("✅ Saved results/shap_interaction.png and results/shap_interaction_rev.png")
scripts/export_daily.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import json, pandas as pd, os
3
+
4
os.makedirs("results", exist_ok=True)

# Read the fetched payload; the context manager closes the file handle.
with open("data/weather.json") as handle:
    data = json.load(handle)

# Fail with a readable message (exit status 1) instead of a bare KeyError.
if "daily" not in data:
    raise SystemExit(
        "data/weather.json missing 'daily'. Re-run the fetch step with daily "
        "parameters enabled (see scripts/fetch_weather.sh)."
    )

daily = data["daily"]

# Output column name -> key in the payload's "daily" section.
column_sources = {
    "date": "time",
    "temp_min_c": "temperature_2m_min",
    "temp_max_c": "temperature_2m_max",
    "precip_mm": "precipitation_sum",
    "cloudcover": "cloudcover_mean",
    "wind_speed": "wind_speed_10m_max",
    "humidity_max": "relative_humidity_2m_max",
    "humidity_min": "relative_humidity_2m_min",
}
df = pd.DataFrame({column: daily[key] for column, key in column_sources.items()})
df.to_csv("results/daily.csv", index=False)
print(f"✅ Wrote results/daily.csv with {len(df)} rows")
scripts/export_hourly.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import json
4
+ import pandas as pd
5
+
6
# Flatten the "hourly" block of data/weather.json into results/hourly.csv,
# sorted chronologically.
os.makedirs("results", exist_ok=True)
with open("data/weather.json") as source:
    payload = json.load(source)

if "hourly" not in payload:
    print(
        "data/weather.json missing 'hourly'. Re-run the fetch step with hourly "
        "parameters enabled (see scripts/fetch_weather.sh).",
        file=sys.stderr,
    )
    sys.exit(1)

hourly = payload["hourly"]

# Output column name -> key in the hourly payload (order defines CSV columns).
column_sources = {
    "time": "time",
    "temp_c": "temperature_2m",
    "humidity": "relative_humidity_2m",
    "cloudcover": "cloudcover",
    "pressure": "pressure_msl",
    "wind_speed": "wind_speed_10m",
    "precip_mm": "precipitation",
    "rain_mm": "rain",
}
frame = pd.DataFrame({column: hourly[key] for column, key in column_sources.items()})

frame["time"] = pd.to_datetime(frame["time"])
frame = frame.sort_values("time").reset_index(drop=True)
frame.to_csv("results/hourly.csv", index=False)
print(f"✅ Wrote results/hourly.csv with {len(frame)} rows")
scripts/feature_importance_rain.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ from pathlib import Path
4
+
5
+ import joblib
6
+ import numpy as np
7
+ import pandas as pd
8
+ from sklearn.inspection import permutation_importance
9
+ from sklearn.model_selection import train_test_split
10
+
11
+ RESULTS_DIR = "results"
12
+ os.environ.setdefault("MPLCONFIGDIR", os.path.join(RESULTS_DIR, ".matplotlib"))
13
+ os.environ.setdefault("XDG_CACHE_HOME", os.path.join(RESULTS_DIR, ".cache"))
14
+
15
+ Path(os.environ["MPLCONFIGDIR"]).mkdir(parents=True, exist_ok=True)
16
+ Path(os.environ["XDG_CACHE_HOME"]).mkdir(parents=True, exist_ok=True)
17
+
18
+ import matplotlib
19
+
20
+ matplotlib.use("Agg")
21
+ import matplotlib.pyplot as plt
22
+
23
+
24
def build_dataset(meta: dict) -> tuple[np.ndarray, np.ndarray]:
    """Build the feature matrix and rain labels from results/hourly.csv.

    A row is labelled 1 when any of the following ``meta["horizon_hours"]``
    hours has ``precip_mm > 0``. The trailing rows that lack a complete future
    window are dropped.

    Args:
        meta: Model metadata; reads ``horizon_hours`` and ``features``.

    Returns:
        ``(features, labels)`` with one entry per labelled row.
    """
    df = pd.read_csv("results/hourly.csv", parse_dates=["time"])
    horizon = meta["horizon_hours"]

    precip = df["precip_mm"].values
    # Clamp at zero: with fewer rows than the horizon a negative slice bound
    # would keep rows that have no valid label.
    usable = max(len(precip) - horizon, 0)

    labels = np.zeros(usable, dtype=int)
    for i in range(usable):
        labels[i] = 1 if np.any(precip[i + 1 : i + 1 + horizon] > 0) else 0

    features = df.iloc[:usable][meta["features"]].values
    return features, labels
37
+
38
+
39
def plot_importance(feature_names: list[str], importances: np.ndarray, std: np.ndarray) -> None:
    """Save a horizontal bar chart of permutation importances.

    Bars are sorted from most to least important (top to bottom) and written
    to ``feature_importance.png`` inside ``RESULTS_DIR``.

    Args:
        feature_names: Feature labels, aligned with ``importances``.
        importances: Mean importance per feature.
        std: Standard deviation per feature, drawn as error bars.
    """
    order = np.argsort(importances)[::-1]
    feature_names = np.array(feature_names)[order]
    importances = importances[order]
    # Previously accepted but ignored; reorder it alongside the bars.
    std = np.asarray(std)[order]

    plt.figure(figsize=(8, 5))
    y_pos = np.arange(len(feature_names))
    plt.barh(y_pos, importances, xerr=std, align="center")
    plt.yticks(y_pos, feature_names)
    plt.gca().invert_yaxis()  # largest importance at the top
    plt.xlabel("Permutation importance (F1 drop)")
    plt.title("Rain classifier — feature importances")
    plt.tight_layout()

    Path(RESULTS_DIR).mkdir(exist_ok=True)
    plt.savefig(os.path.join(RESULTS_DIR, "feature_importance.png"))
    plt.close()
56
+
57
+
58
def main() -> None:
    """Compute and plot permutation importances for the hourly rain classifier.

    Scores the saved model on the last 30% of the dataset (chronological
    split, no shuffling) using F1.
    """
    # Context manager so the metadata file handle is closed.
    with open("models/rain_model_meta.json") as handle:
        meta = json.load(handle)
    model = joblib.load("models/rain_classifier_hourly.joblib")

    X, y = build_dataset(meta)
    _, X_test, _, y_test = train_test_split(X, y, test_size=0.3, shuffle=False)

    result = permutation_importance(
        model,
        X_test,
        y_test,
        n_repeats=25,
        random_state=42,
        scoring="f1",
    )

    plot_importance(meta["features"], result.importances_mean, result.importances_std)
    print("✅ Wrote results/feature_importance.png")
76
+
77
+
78
+ if __name__ == "__main__":
79
+ main()
scripts/fetch_weather.sh ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Fetch hourly Open-Meteo data for LAT/LON into data/weather.json.
# LAT, LON, CITY, PAST_DAYS and LOG_FILE may come from the environment or .env.
set -euo pipefail

source .env 2>/dev/null || true
: "${LAT:=6.5244}"
: "${LON:=3.3792}"
: "${CITY:=Lagos}"
: "${PAST_DAYS:=30}"

STAMP="$(date +%Y-%m-%d_%H-%M-%S)"
LOG_FILE=${LOG_FILE:-logs/app.log}

# Create directories only after LOG_FILE is resolved so a custom log location
# outside logs/ also exists before tee appends to it.
mkdir -p data logs "$(dirname "$LOG_FILE")"

echo "[${STAMP}] Fetching ${PAST_DAYS} past days for ${CITY} (${LAT}, ${LON})"

URL="https://api.open-meteo.com/v1/forecast?latitude=${LAT}&longitude=${LON}&hourly=temperature_2m,relative_humidity_2m,cloudcover,pressure_msl,wind_speed_10m,precipitation,rain&timezone=Africa%2FLagos&past_days=${PAST_DAYS}"

# `&&` ties the success message to curl succeeding; with pipefail a failed
# download makes the whole pipeline (and the script) fail.
{
  curl -sfL "$URL" -o data/weather.json &&
    echo "[$STAMP] Saved to data/weather.json"
} | tee -a "$LOG_FILE"
scripts/intro_ml.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os

import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression

# Imagine 5 days of temperatures (°C)
# `x` is the input feature (temperature), reshaped to a column vector
x = np.array([25, 27, 30, 32, 35]).reshape(-1, 1)
# `y` is the output label (humidity percentage)
y = np.array([50, 55, 63, 70, 74])

model = LinearRegression()
model.fit(x, y)

pred = model.predict([[28]])
print(f"Predicted humidity for 28°C: {pred[0]:.2f}%")

plt.scatter(x, y, color='blue', label='data')
plt.plot(x, model.predict(x), color='red', label='model')
plt.xlabel('Temperature (°C)')
plt.ylabel('Humidity (%)')
plt.legend()
plt.tight_layout()
# savefig does not create missing directories; make sure results/ exists.
os.makedirs("results", exist_ok=True)
plt.savefig("results/intro_regression.png")
print("✅ Saved results/intro_regression.png")

print("slope:", model.coef_)
print("intercept:", model.intercept_)
scripts/log_predict.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ import argparse, os, json
3
+ from pathlib import Path
4
+ from datetime import datetime
5
+ import joblib, pandas as pd, numpy as np, subprocess
6
+
7
+ MODEL = Path("models/rain_xgb_tuned.joblib")
8
+ META = Path("models/rain_xgb_tuned_meta.json")
9
+ HOURLY = Path("results/hourly.csv")
10
+ LOGS = Path("logs"); LOGS.mkdir(exist_ok=True)
11
+ PRED_LOG = LOGS / "predictions.csv"
12
+
13
def ensure_hourly(lat, lon, past_days=90):
    """Return the hourly frame, fetching and exporting it first if missing.

    When ``results/hourly.csv`` does not exist, runs the fetch and export
    scripts with LAT/LON/PAST_DAYS set in their environment. An existing file
    is reused as-is, whatever location it was fetched for.

    Raises:
        subprocess.CalledProcessError: if either helper script fails.
    """
    import sys  # local import: the module header does not import sys

    if not HOURLY.exists():
        env = os.environ.copy()
        env.update(LAT=str(lat), LON=str(lon), PAST_DAYS=str(past_days))
        subprocess.run(["bash", "scripts/fetch_weather.sh"], check=True, env=env)
        # Use the running interpreter so the export sees the same
        # virtualenv/packages; fall back to python3 if it is unknown.
        subprocess.run(
            [sys.executable or "python3", "scripts/export_hourly.py"],
            check=True,
            env=env,
        )
    return pd.read_csv(HOURLY, parse_dates=["time"])
20
+
21
def build_features_like_training(df, features):
    """Build features with the training script's own ``build_features``.

    Loads ``scripts/train_xgb_tuned_final.py`` from its file path, calls its
    ``build_features`` on ``df`` and returns the ``features`` selection of the
    result.
    """
    from importlib import util as importlib_util

    trainer_spec = importlib_util.spec_from_file_location(
        "train_xgb_tuned_final", "scripts/train_xgb_tuned_final.py"
    )
    trainer = importlib_util.module_from_spec(trainer_spec)
    trainer_spec.loader.exec_module(trainer)
    return trainer.build_features(df)[features]
29
+
30
def main():
    """Score the latest hour with the tuned model and append it to the log.

    Reads thresholds, features, horizon and event size from the model
    metadata, picks the threshold for ``--mode``, and appends one row (with an
    empty ``y_true`` to be backfilled) to ``logs/predictions.csv``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--city", default="Lagos")
    parser.add_argument("--lat", type=float, default=6.5244)
    parser.add_argument("--lon", type=float, default=3.3792)
    parser.add_argument("--mode", choices=["default","recall","precision"], default="default")
    args = parser.parse_args()

    meta = json.loads(META.read_text())
    thresholds = meta["thresholds"]
    feature_names = meta["features"]
    horizon = meta["horizon_hours"]
    event_mm = meta["event_mm"]

    hourly = ensure_hourly(args.lat, args.lon, 90)
    feature_frame = build_features_like_training(hourly.copy(), feature_names)
    if feature_frame.empty:
        raise SystemExit("Not enough rows to build features")

    model = joblib.load(MODEL)
    latest_features = feature_frame.iloc[[-1]].values
    probability = float(model.predict_proba(latest_features)[0,1])

    mode_thresholds = {
        "default": thresholds["default"],
        "recall": thresholds["high_recall"],
        "precision": thresholds["high_precision"],
    }
    threshold = float(mode_thresholds[args.mode])
    if probability >= threshold:
        decision = "RAIN"
    else:
        decision = "No rain"

    # Timestamp of the newest hour that survived feature building.
    predicted_for = hourly.loc[feature_frame.index, "time"].iloc[-1]
    row = {
        "ts_pred": predicted_for.strftime("%Y-%m-%d %H:%M:%S"),
        "logged_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "city": args.city, "lat": args.lat, "lon": args.lon,
        "mode": args.mode, "horizon_h": horizon, "event_mm": event_mm,
        "p": probability, "threshold": threshold, "decision": decision,
        "y_true": "",  # to be filled by backfill
    }

    # First write creates the file with a header; later writes append rows only.
    first_write = not PRED_LOG.exists()
    pd.DataFrame([row]).to_csv(
        PRED_LOG,
        mode="w" if first_write else "a",
        header=first_write,
        index=False,
    )

    print(f"Logged: {row}")
65
+
66
+ if __name__ == "__main__":
67
+ main()
scripts/make_cover.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from PIL import Image, ImageDraw, ImageFont
2
+ import os
3
+
4
# Stack results/temps.png over results/precip.png with a title line and save
# the composite to results/cover.png and assets/cover.png.
RESULTS = "results"
IMG1 = os.path.join(RESULTS, "temps.png")
IMG2 = os.path.join(RESULTS, "precip.png")
OUT = os.path.join(RESULTS, "cover.png")
ASSETS_DIR = "assets"
ASSET_OUT = os.path.join(ASSETS_DIR, "cover.png")


def fail(msg):
    """Print an error message and exit with status 1."""
    print(f"❌ {msg}")
    raise SystemExit(1)


def resize_to_width(im, target_w):
    """Return ``im`` scaled to ``target_w`` pixels wide, keeping aspect ratio."""
    scaled_h = int(im.height * target_w / im.width)
    return im.resize((target_w, scaled_h))


for required in (IMG1, IMG2):
    if not os.path.exists(required):
        fail(f"Missing {required}. Run `make viz` first.")

img1 = Image.open(IMG1).convert("RGB")
img2 = Image.open(IMG2).convert("RGB")

# Both panels share the narrower of the two widths.
w = min(img1.width, img2.width)
img1 = resize_to_width(img1, w)
img2 = resize_to_width(img2, w)

pad = 16
title_h = 48
H = img1.height + img2.height + title_h + pad * 4
W = w + pad * 2

canvas = Image.new("RGB", (W, H), "white")

# Paste the panels top to bottom; y ends where the title strip starts.
y = pad
for panel in (img1, img2):
    canvas.paste(panel, (pad, y))
    y += panel.height + pad

draw = ImageDraw.Draw(canvas)
title = "Weather Data Fetcher — Automated Pipeline"
try:
    font = ImageFont.load_default()
except Exception:
    font = None

# textbbox at the origin yields (0, 0, width, height); only width is needed
# to centre the title horizontally.
tw, th = draw.textbbox((0,0), title, font=font)[2:]
tx = (W - tw) // 2
ty = y
draw.text((tx, ty), title, fill="black", font=font)

os.makedirs(RESULTS, exist_ok=True)
canvas.save(OUT, optimize=True)
if ASSETS_DIR:
    os.makedirs(ASSETS_DIR, exist_ok=True)
    canvas.save(ASSET_OUT, optimize=True)
print(f"✅ Created {OUT}")
scripts/monitor_weekly.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ from pathlib import Path
3
+ import pandas as pd, numpy as np
4
+ import matplotlib.pyplot as plt
5
+ from sklearn.metrics import precision_recall_fscore_support, brier_score_loss
6
+
7
+ LOG = Path("logs/predictions.csv")
8
+ OUT = Path("results"); OUT.mkdir(exist_ok=True)
9
+
10
def week_key(ts):  # ISO year-week
    """Return the ISO year/week label for ``ts``, e.g. ``2024-W05``."""
    iso_year, iso_week = ts.isocalendar()[:2]
    return f"{iso_year}-W{iso_week:02d}"
13
+
14
def calibration_plot(p, y, bins=10, out_png="results/calibration.png"):
    """Save a calibration (reliability) plot of probabilities vs. outcomes.

    Probabilities are grouped into up to ``bins`` quantile bins (duplicate bin
    edges are dropped). Each bin is plotted as mean predicted probability
    against observed positive fraction and annotated with its sample count.

    Args:
        p: Predicted probabilities.
        y: Observed binary outcomes, aligned with ``p``.
        bins: Requested number of quantile bins.
        out_png: Output image path.
    """
    df = pd.DataFrame({"p":p, "y":y}).dropna()
    df["bin"] = pd.qcut(df["p"], q=bins, duplicates="drop")
    # observed=True: the key is categorical; this avoids the pandas
    # FutureWarning about the changing default and never emits empty bins.
    g = (
        df.groupby("bin", observed=True)
        .agg(avg_p=("p","mean"), frac_pos=("y","mean"), n=("y","size"))
        .reset_index(drop=True)
    )
    plt.figure()
    plt.plot([0,1],[0,1], linestyle="--")  # perfect-calibration diagonal
    plt.plot(g["avg_p"], g["frac_pos"], marker="o")
    plt.xlabel("Predicted probability")
    plt.ylabel("Observed frequency")
    plt.title("Calibration")
    for avg_p, frac_pos, n in zip(g["avg_p"], g["frac_pos"], g["n"]):
        plt.annotate(str(int(n)), (avg_p, frac_pos))
    plt.tight_layout()
    plt.savefig(out_png, dpi=300)
    plt.close()
28
+
29
def main():
    """Write weekly per-mode metrics and an overall calibration plot.

    Reads ``logs/predictions.csv``, keeps rows whose ``y_true`` has been
    backfilled with 0 or 1, and writes ``weekly_report.csv`` and
    ``calibration.png`` into the results directory.
    """
    if not LOG.exists():
        print("No logs yet.")
        return
    df = pd.read_csv(LOG, parse_dates=["ts_pred","logged_at"])

    # Compare numerically: when blanks and labels are mixed pandas reads the
    # column as float, so the strings are "0.0"/"1.0" and a match against
    # "0"/"1" would discard every labelled row.
    labels = pd.to_numeric(df["y_true"], errors="coerce")
    labelled = labels.isin([0, 1])
    df = df[labelled].copy()
    if df.empty:
        print("No rows with y_true yet.")
        return
    df["y_true"] = labels[labelled].astype(int)
    df["week"] = df["ts_pred"].apply(week_key)

    # Weekly metrics per mode
    rows = []
    for (wk, mode), grp in df.groupby(["week","mode"]):
        y = grp["y_true"].values
        # decision at time of logging
        yhat = (grp["p"].values >= grp["threshold"].values).astype(int)
        P,R,F1,_ = precision_recall_fscore_support(y, yhat, average="binary", zero_division=0)
        alerts = float(yhat.mean())
        brier = brier_score_loss(y, grp["p"].values)
        rows.append({"week":wk,"mode":mode,"n":len(grp),"precision":P,"recall":R,"f1":F1,"alert_rate":alerts,"brier":brier})
    rep = pd.DataFrame(rows).sort_values(["week","mode"])
    rep.to_csv(OUT/"weekly_report.csv", index=False)
    print(rep)

    # Overall calibration (all modes combined)
    calibration_plot(df["p"].values, df["y_true"].values, bins=12, out_png=str(OUT/"calibration.png"))
    print("Saved:", OUT/"weekly_report.csv", "and", OUT/"calibration.png")
58
+
59
+ if __name__ == "__main__":
60
+ main()
scripts/plot_pr_roc.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import joblib
4
+ import pandas as pd
5
+ import numpy as np
6
+ from sklearn.metrics import precision_recall_curve, roc_curve, auc
7
+ from sklearn.model_selection import train_test_split
8
+
9
+ RESULTS_DIR = "results"
10
+ os.environ.setdefault("MPLCONFIGDIR", os.path.join(RESULTS_DIR, ".matplotlib"))
11
+ os.environ.setdefault("XDG_CACHE_HOME", os.path.join(RESULTS_DIR, ".cache"))
12
+
13
+ from pathlib import Path
14
+
15
+ Path(os.environ["MPLCONFIGDIR"]).mkdir(parents=True, exist_ok=True)
16
+ Path(os.environ["XDG_CACHE_HOME"]).mkdir(parents=True, exist_ok=True)
17
+
18
+ import matplotlib
19
+
20
+ matplotlib.use("Agg")
21
+ import matplotlib.pyplot as plt
22
+
23
# Evaluate the saved hourly rain classifier on the chronological hold-out
# (last 30% of rows) and save precision-recall and ROC curves.
with open("models/rain_model_meta.json") as handle:
    meta = json.load(handle)
clf = joblib.load("models/rain_classifier_hourly.joblib")
df = pd.read_csv("results/hourly.csv", parse_dates=["time"])

H = meta["horizon_hours"]
features = meta["features"]

# Label = any precipitation in the next H hours. The final H rows have no
# complete future window, so they are dropped instead of being kept with a
# default label of 0 (same treatment as build_dataset in
# feature_importance_rain.py).
prec = df["precip_mm"].values
n_labelled = max(len(prec) - H, 0)
precip_next = np.zeros(n_labelled, dtype=int)
for i in range(n_labelled):
    precip_next[i] = 1 if np.any(prec[i + 1 : i + 1 + H] > 0) else 0

df = df.iloc[:n_labelled].copy()
df["rain_next6h"] = precip_next

X = df[features].values
y = df["rain_next6h"].values

_, X_test, _, y_test = train_test_split(X, y, test_size=0.3, shuffle=False)
proba = clf.predict_proba(X_test)[:, 1]

precision, recall, _ = precision_recall_curve(y_test, proba)
fpr, tpr, _ = roc_curve(y_test, proba)

plt.figure()
plt.plot(recall, precision)
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Precision–Recall")
plt.tight_layout()
plt.savefig("results/pr_curve.png")
plt.close()

plt.figure()
plt.plot(fpr, tpr)
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title(f"ROC (AUC={auc(fpr, tpr):.2f})")
plt.tight_layout()
plt.savefig("results/roc_curve.png")
plt.close()

print("✅ Wrote results/pr_curve.png and results/roc_curve.png")
scripts/plot_weather.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ from shutil import copyfile
4
+
5
+
6
+ RESULTS_DIR = "results"
7
+ ASSETS_DIR = "assets"
8
+ TEMPS_RESULTS = os.path.join(RESULTS_DIR, "temps.png")
9
+ PRECIP_RESULTS = os.path.join(RESULTS_DIR, "precip.png")
10
+ TEMPS_ASSET = os.path.join(ASSETS_DIR, "temps.png")
11
+ PRECIP_ASSET = os.path.join(ASSETS_DIR, "precip.png")
12
+
13
+ os.environ.setdefault("MPLCONFIGDIR", os.path.join(RESULTS_DIR, ".matplotlib"))
14
+ os.environ.setdefault("XDG_CACHE_HOME", os.path.join(RESULTS_DIR, ".cache"))
15
+
16
+ import matplotlib
17
+
18
+ matplotlib.use("Agg")
19
+ import matplotlib.pyplot as plt
20
+ import pandas as pd
21
+
22
+
23
def ensure_dirs():
    """Create the results, assets and matplotlib/XDG cache directories."""
    for directory in (RESULTS_DIR, ASSETS_DIR):
        os.makedirs(directory, exist_ok=True)
    for variable in ("MPLCONFIGDIR", "XDG_CACHE_HOME"):
        os.makedirs(os.environ[variable], exist_ok=True)
28
+
29
+
30
def mirror_asset(src: str, dest: str) -> None:
    """Copy the file at ``src`` to ``dest``, replacing any existing file."""
    copyfile(src=src, dst=dest)
32
+
33
+
34
def main():
    """Plot daily temperatures and precipitation from data/weather.json.

    Writes temps.png and precip.png into the results directory and mirrors
    each one into the assets directory.
    """
    with open("data/weather.json") as handle:
        daily = json.load(handle)["daily"]

    days = pd.to_datetime(daily["time"])
    tmax = pd.Series(daily["temperature_2m_max"])
    tmin = pd.Series(daily["temperature_2m_min"])
    # Missing precipitation falls back to zeros, one per day.
    prec = pd.Series(daily.get("precipitation_sum", [0] * len(days)))

    ensure_dirs()

    # Temperature line chart
    plt.figure()
    for series, label in ((tmax, "Max °C"), (tmin, "Min °C")):
        plt.plot(days, series, marker="o", label=label)
    plt.xticks(rotation=45, ha="right")
    plt.title("Daily Temperatures (°C)")
    plt.legend()
    plt.tight_layout()
    plt.savefig(TEMPS_RESULTS)
    plt.close()

    mirror_asset(TEMPS_RESULTS, TEMPS_ASSET)

    # Precipitation bar chart
    plt.figure()
    plt.bar(days, prec)
    plt.xticks(rotation=45, ha="right")
    plt.title("Daily Precipitation (mm)")
    plt.tight_layout()
    plt.savefig(PRECIP_RESULTS)
    plt.close()

    mirror_asset(PRECIP_RESULTS, PRECIP_ASSET)

    print(f"✅ Wrote {TEMPS_RESULTS} / {PRECIP_RESULTS}")
    print(f"✅ Updated assets at {TEMPS_ASSET} / {PRECIP_ASSET}")
70
+
71
+ if __name__ == "__main__":
72
+ main()
scripts/predict_rain.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys, json, joblib
2
+ import numpy as np
3
+ import pandas as pd
4
+
5
# Load latest hour from results/hourly.csv, predict next 6h rain
df = pd.read_csv("results/hourly.csv", parse_dates=["time"])
row = df.iloc[-1:].copy()

# Engineered features for the latest hour: one-step difference (d_*) and
# 3-hour moving average (ma3_*) of every base column.
base = ["temp_c","humidity","cloudcover","pressure","wind_speed","precip_mm","rain_mm"]
for col in base:
    row[f"d_{col}"] = df[col].diff().iloc[-1]
    row[f"ma3_{col}"] = df[col].rolling(3).mean().iloc[-1]

# Load model + meta. The feature order comes from the metadata, so the
# hand-built feature list that used to live here was dead code.
clf = joblib.load("models/rain_classifier_hourly.joblib")
with open("models/rain_model_meta.json") as handle:
    meta = json.load(handle)
X = row[meta["features"]].values

proba = float(clf.predict_proba(X)[0,1])
thr_r = meta["thresholds"]["high_recall"]
thr_p = meta["thresholds"]["high_precision"]

print(f"Latest hour: {row['time'].iloc[0]}")
print(f"P(rain next {meta['horizon_hours']}h) = {proba:.3f}")
print(f"High-Recall mode: {'RAIN' if proba>=thr_r else 'No rain'} (thr={thr_r:.2f})")
print(f"High-Precision mode:{'RAIN' if proba>=thr_p else 'No rain'} (thr={thr_p:.2f})")
scripts/process_weather.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from dotenv import load_dotenv
import csv, os, json, sys, logging

# Summarise the daily block of data/weather.json as results/summary.txt and
# results/summary.csv. LAT, LON, CITY and LOG_FILE come from the environment
# or .env.
load_dotenv()
LAT = os.getenv("LAT", "6.5244")
LON = os.getenv("LON", "3.3792")
CITY = os.getenv("CITY", "Lagos")
LOG_FILE = os.getenv("LOG_FILE", "logs/app.log")

# Create the directory LOG_FILE actually lives in (not just "logs"), otherwise
# a custom LOG_FILE makes logging.basicConfig fail.
os.makedirs(os.path.dirname(LOG_FILE) or ".", exist_ok=True)
logging.basicConfig(
    filename=LOG_FILE,
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
)

logging.info(f"Processing weather for {CITY} ({LAT}, {LON})")

IN, OUT_DIR = "data/weather.json", "results"
OUT = os.path.join(OUT_DIR, "summary.txt")
os.makedirs(OUT_DIR, exist_ok=True)

logging.info("Reading weather.json")

try:
    with open(IN) as f:
        data = json.load(f)
except FileNotFoundError:
    print("weather.json not found. Run `make download`.", file=sys.stderr); sys.exit(1)

try:
    daily = data["daily"]
    days = daily["time"]
    tmax = daily["temperature_2m_max"]
    tmin = daily["temperature_2m_min"]
    # Missing precipitation falls back to zeros, one per day.
    prec = daily.get("precipitation_sum", [0]*len(days))
except Exception as e:
    print(f"Unexpected JSON structure: {e}", file=sys.stderr); sys.exit(2)

with open(OUT, "w") as f:
    # Use the configured city instead of a hard-coded "Lagos"; the default
    # configuration produces the same header as before.
    f.write(f"{CITY} (Africa/Lagos) – Daily summary\n")
    f.write("-----------------------------------\n")
    for d, lo, hi, p in zip(days, tmin, tmax, prec):
        f.write(f"{d}: {lo}°C – {hi}°C | precip: {p} mm\n")

logging.info(f"Wrote summary to {OUT}")

print(f"✅ Wrote {OUT}")

with open(os.path.join(OUT_DIR, "summary.csv"), "w", newline="") as f:
    w = csv.writer(f)
    w.writerow(["date", "temp_min_c", "temp_max_c", "precip_mm"])
    for d, lo, hi, p in zip(days, tmin, tmax, prec):
        w.writerow([d, lo, hi, p])
print("✅ Wrote results/summary.csv")
scripts/rain_cli.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import argparse, json, joblib, pandas as pd
3
+
4
def main():
    """Print a rain / no-rain decision for the latest hour in results/hourly.csv.

    ``--mode`` selects which threshold from the model metadata is applied to
    the predicted probability (default: ``recall``).
    """
    ap = argparse.ArgumentParser(description="Rain warning in next 6h")
    ap.add_argument("--mode", choices=["recall","precision","default"], default="recall")
    args = ap.parse_args()

    # Context manager so the metadata file handle is closed.
    with open("models/rain_model_meta.json") as handle:
        meta = json.load(handle)
    clf = joblib.load("models/rain_classifier_hourly.joblib")

    df = pd.read_csv("results/hourly.csv", parse_dates=["time"])
    row = df.iloc[-1:].copy()

    # rebuild features like training: one-step difference and 3-hour mean
    base = ["temp_c","humidity","cloudcover","pressure","wind_speed","precip_mm","rain_mm"]
    for col in base:
        row[f"d_{col}"] = df[col].diff().iloc[-1]
        row[f"ma3_{col}"] = df[col].rolling(3).mean().iloc[-1]

    X = row[meta["features"]].values
    p = float(clf.predict_proba(X)[0,1])

    thr = {
        "default": meta["thresholds"]["default"],
        "recall": meta["thresholds"]["high_recall"],
        "precision": meta["thresholds"]["high_precision"],
    }[args.mode]

    decision = "RAIN" if p >= thr else "No rain"
    print(f"{row['time'].iloc[0]} | P(rain ≤{meta['horizon_hours']}h)={p:.3f} | mode={args.mode} thr={thr:.2f} → {decision}")
32
+
33
+ if __name__ == "__main__":
34
+ main()
scripts/start_services.sh ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Launch the Streamlit UI and the FastAPI (uvicorn) app side by side and stop
# both when either the script is signalled or uvicorn exits.
set -euo pipefail

STREAMLIT_PORT="${STREAMLIT_PORT:-8501}"
UVICORN_PORT="${UVICORN_PORT:-${PORT:-8000}}"
HOST="0.0.0.0"

echo "Environment: PORT=${PORT:-<unset>} STREAMLIT_PORT=${STREAMLIT_PORT} UVICORN_PORT=${UVICORN_PORT}"

export STREAMLIT_SERVER_HEADLESS=true
export STREAMLIT_SERVER_PORT="${STREAMLIT_PORT}"
export STREAMLIT_SERVER_ADDRESS="${HOST}"

STREAMLIT_PID=""
UVICORN_PID=""

# Stop whichever children are still running. Runs from the EXIT trap, which
# the INT/TERM handlers reach by calling `exit`.
cleanup() {
  trap - EXIT
  echo "🛑 Shutting down services..."
  local pid
  for pid in "${UVICORN_PID}" "${STREAMLIT_PID}"; do
    if [[ -n "${pid}" ]] && kill -0 "${pid}" 2>/dev/null; then
      kill "${pid}" 2>/dev/null || true
      wait "${pid}" 2>/dev/null || true
    fi
  done
}
trap cleanup EXIT
trap 'exit 130' INT
trap 'exit 143' TERM

echo "🌐 Starting Streamlit on port ${STREAMLIT_PORT}"
streamlit run streamlit_app.py --server.port "${STREAMLIT_PORT}" --server.address "${HOST}" &
STREAMLIT_PID=$!

echo "🚀 Starting FastAPI (uvicorn) on port ${UVICORN_PORT}"
# Do not `exec` here: exec replaces this shell, so its traps would never run
# and Streamlit would be left orphaned. Run uvicorn as a child and wait, so
# signals interrupt the wait and trigger cleanup.
python -m uvicorn app.main:app --host "${HOST}" --port "${UVICORN_PORT}" --proxy-headers &
UVICORN_PID=$!

STATUS=0
wait "${UVICORN_PID}" || STATUS=$?
exit "${STATUS}"
scripts/time_series_cv_demo.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import pandas as pd
3
+ from sklearn.model_selection import TimeSeriesSplit
4
+ from sklearn.pipeline import Pipeline
5
+ from sklearn.preprocessing import StandardScaler
6
+ from sklearn.linear_model import LogisticRegression
7
+ from sklearn.metrics import f1_score
8
+
9
# Load your hourly data
df = pd.read_csv("results/hourly.csv")
df = df.dropna().reset_index(drop=True)

HORIZON = 6  # hours to look ahead

# Features and target
features = ["temp_c","humidity","cloudcover","pressure","wind_speed","precip_mm","rain_mm"]
precip = df["precip_mm"].to_numpy()

# Rain in the next 6h: label row i with 1 when ANY of hours i+1 .. i+HORIZON
# has precipitation. The previous shift(-6) label only looked at the single
# hour exactly 6h ahead, which disagrees with the other training scripts.
# The last HORIZON rows have no complete future window and are dropped.
n_rows = max(len(df) - HORIZON, 0)
y = np.array(
    [int(np.any(precip[i + 1 : i + 1 + HORIZON] > 0)) for i in range(n_rows)],
    dtype=int,
)
X = df[features].values[:n_rows]

# Time-series CV setup
tscv = TimeSeriesSplit(n_splits=5)

f1_scores = []
for fold, (train_idx, test_idx) in enumerate(tscv.split(X)):
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    clf = Pipeline([
        ("scaler", StandardScaler()),
        ("model", LogisticRegression(max_iter=1000, class_weight="balanced"))
    ])
    clf.fit(X_train, y_train)
    preds = clf.predict(X_test)
    score = f1_score(y_test, preds)
    f1_scores.append(score)
    print(f"Fold {fold+1} F1: {score:.3f}")

print("\nAverage F1 across folds:", np.mean(f1_scores).round(3))
scripts/train_classify_rain.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pandas as pd
3
+ import numpy as np
4
+
5
+ from sklearn.model_selection import train_test_split
6
+ from sklearn.preprocessing import StandardScaler
7
+ from sklearn.linear_model import LogisticRegression
8
+ from sklearn.pipeline import Pipeline
9
+ from sklearn.metrics import (
10
+ confusion_matrix, classification_report,
11
+ roc_auc_score, roc_curve, precision_recall_fscore_support
12
+ )
13
+
14
# Train a daily "rain tomorrow" logistic-regression classifier from
# results/daily.csv, report it against two baselines, and save the model.
df = pd.read_csv("results/daily.csv")

df["precip_tomorrow"] = df["precip_mm"].shift(-1)
df = df.dropna()  # drop last row without tomorrow

df["rain_tomorrow"] = (df["precip_tomorrow"] > 0).astype(int)

features = [
    "temp_max_c",
    "temp_min_c",
    "cloudcover",
    "wind_speed",
    "humidity_max",
    "humidity_min",
    "precip_mm",  # rain today often implies rain persists
]
X = df[features].values
y = df["rain_tomorrow"].values

# Chronological split: the last 30% of days form the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, shuffle=False
)

clf = Pipeline([
    ("scaler", StandardScaler()),
    ("logreg", LogisticRegression(max_iter=200,
                                  class_weight="balanced"))
])

clf.fit(X_train, y_train)

proba = clf.predict_proba(X_test)[:, 1]  # P(rain)
pred_default = (proba >= 0.5).astype(int)  # default threshold

labels = [0, 1]
cm = confusion_matrix(y_test, pred_default, labels=labels)

prec, rec, f1, _ = precision_recall_fscore_support(
    y_test, pred_default, average="binary", zero_division=0
)
# ROC-AUC is undefined with a single class or a constant score.
auc = (
    roc_auc_score(y_test, proba)
    if len(np.unique(y_test)) > 1 and len(np.unique(proba)) > 1
    else float("nan")
)

print("📊 Confusion Matrix (threshold=0.50)")
print(cm)
auc_str = f"{auc:.3f}" if np.isfinite(auc) else "n/a"
print(f"\nPrecision: {prec:.3f} Recall: {rec:.3f} F1: {f1:.3f} ROC-AUC: {auc_str}")

print("\nDetailed report:")
print(classification_report(y_test, pred_default, digits=3, zero_division=0, labels=labels))

always_no = np.zeros_like(y_test)
prec0, rec0, f10, _ = precision_recall_fscore_support(
    y_test, always_no, average="binary", zero_division=0
)
print("\n⚠️ Baseline — always 'no rain'")
print(f"Precision: {prec0:.3f} Recall: {rec0:.3f} F1: {f10:.3f}")

# "Today" for each test row is that row's own precip_mm. The previous slice
# values[-len(y_test)-1:-1] was shifted one day back, so it scored
# "tomorrow = yesterday" instead.
today_rain = (X_test[:, features.index("precip_mm")] > 0).astype(int)
precp, recp, f1p, _ = precision_recall_fscore_support(
    y_test, today_rain, average="binary", zero_division=0
)
print("\n🧠 Baseline — 'tomorrow rain = today rain'")
print(f"Precision: {precp:.3f} Recall: {recp:.3f} F1: {f1p:.3f}")

thr = 0.35
pred_tuned = (proba >= thr).astype(int)
prec_t, rec_t, f1_t, _ = precision_recall_fscore_support(
    y_test, pred_tuned, average="binary", zero_division=0
)
print(f"\n🎛️ Threshold {thr:.2f} → Precision: {prec_t:.3f} Recall: {rec_t:.3f} F1: {f1_t:.3f}")

import joblib
os.makedirs("models", exist_ok=True)
joblib.dump(clf, "models/rain_classifier.joblib")
print("\n💾 Saved: models/rain_classifier.joblib")
scripts/train_classify_rain_hourly.py ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import numpy as np
3
+ import pandas as pd
4
+
5
+ from sklearn.model_selection import train_test_split
6
+ from sklearn.preprocessing import StandardScaler
7
+ from sklearn.linear_model import LogisticRegression
8
+ from sklearn.pipeline import Pipeline
9
+ from sklearn.metrics import (
10
+ confusion_matrix, classification_report, roc_auc_score,
11
+ precision_recall_fscore_support
12
+ )
13
+
14
# Forecast horizon in hours: the label answers "any precipitation in the next H hours?".
H = 6

# Input columns fed to the classifier; also recorded in the metadata JSON.
features = [
    "temp_c", "humidity", "cloudcover", "pressure", "wind_speed",
    "precip_mm", "rain_mm",
]


def label_rain_ahead(precip, horizon):
    """Build 0/1 look-ahead labels from a precipitation series.

    Label ``i`` is 1 when any of ``precip[i + 1 : i + 1 + horizon]`` is > 0,
    i.e. the current row itself is excluded from its own window.

    Only rows that have a *complete* window of ``horizon`` future values get a
    label, so the result has ``max(len(precip) - horizon, 0)`` entries. The
    trailing ``horizon`` rows are dropped instead of being labelled 0: their
    future is unknown, and treating them as "no rain" would put wrong targets
    into the data.

    Args:
        precip: 1-D sequence of precipitation amounts, one per row.
        horizon: Number of future rows to look at; must be >= 1.

    Returns:
        Integer numpy array of 0/1 labels aligned with the first
        ``len(precip) - horizon`` rows of ``precip``.

    Raises:
        ValueError: If ``horizon`` is smaller than 1.
    """
    if horizon < 1:
        raise ValueError("horizon must be >= 1")

    amounts = np.asarray(precip, dtype=float)
    n_labeled = len(amounts) - horizon
    if n_labeled <= 0:
        return np.zeros(0, dtype=int)

    # Prefix sums of "wet" flags: wet_before[k] = number of wet rows in amounts[:k],
    # so the count inside [i + 1, i + horizon] is wet_before[i + horizon + 1] - wet_before[i + 1].
    wet_before = np.concatenate(([0], np.cumsum(amounts > 0)))
    start = np.arange(n_labeled) + 1
    return (wet_before[start + horizon] - wet_before[start] > 0).astype(int)


def _binary_prf(y_true, y_pred):
    """Return (precision, recall, f1) for the positive class, 0 when undefined."""
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average="binary", zero_division=0
    )
    return precision, recall, f1


def main():
    """Train the rain-in-next-H-hours classifier, report metrics, save artifacts.

    Reads ``results/hourly.csv``, fits a scaled logistic regression on the
    first 70% of rows, evaluates on the remaining 30%, compares against two
    baselines and writes the model plus a metadata JSON into ``models/``.
    """
    # Kept local, as in the original script: only needed for persistence.
    import json
    import joblib

    # NOTE(review): assumes rows are in chronological order, one per hour — confirm.
    df = pd.read_csv("results/hourly.csv", parse_dates=["time"])

    targets = label_rain_ahead(df["precip_mm"].values, H)
    if targets.size == 0:
        raise SystemExit(
            f"results/hourly.csv needs more than {H} rows to build look-ahead labels"
        )

    # Drop the trailing H rows that have no complete look-ahead window.
    df = df.iloc[:len(targets)].copy()
    df["rain_next6h"] = targets

    X = df[features].values
    y = df["rain_next6h"].values

    print("Class balance (0=no-rain, 1=rain-in-next6h):", np.bincount(y))

    # shuffle=False keeps row order, so the test set is the final 30% of rows.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, shuffle=False
    )

    clf = Pipeline([
        ("scaler", StandardScaler()),
        ("logreg", LogisticRegression(max_iter=500, class_weight="balanced")),
    ])
    clf.fit(X_train, y_train)

    proba = clf.predict_proba(X_test)[:, 1]
    pred_050 = (proba >= 0.50).astype(int)

    # Explicit labels keep the matrix 2x2 even if one class is absent.
    class_labels = [0, 1]
    cm = confusion_matrix(y_test, pred_050, labels=class_labels)
    print("\n📊 Confusion Matrix (thr=0.50)")
    print(cm)

    precision, recall, f1 = _binary_prf(y_test, pred_050)
    try:
        auc = roc_auc_score(y_test, proba)
    except ValueError:
        # Raised when y_test contains a single class; AUC is undefined then.
        auc = float("nan")

    print(f"Precision: {precision:.3f} Recall: {recall:.3f} F1: {f1:.3f} ROC-AUC: {auc:.3f}")

    print("\nDetailed report:")
    print(classification_report(
        y_test, pred_050, digits=3, zero_division=0, labels=class_labels
    ))

    # Baselines
    always_no = np.zeros_like(y_test)
    p0, r0, f10 = _binary_prf(y_test, always_no)
    print("\n🧠 Baseline — always 'no rain'")
    print(f"Precision: {p0:.3f} Recall: {r0:.3f} F1: {f10:.3f}")

    # Persistence baseline: predict rain if any precipitation fell in the
    # H rows before the current one (shift(1) excludes the current row).
    recent_rain = (
        df["precip_mm"]
        .rolling(window=H, min_periods=1)
        .sum()
        .shift(1)
        .fillna(0)
        > 0
    ).astype(int).values
    # The test set is the tail of df, so the tail of recent_rain lines up with it.
    prev6_test = recent_rain[-len(y_test):]
    pp, rp, f1p = _binary_prf(y_test, prev6_test)
    print("\n🧠 Baseline — persistence (prev 6h)")
    print(f"Precision: {pp:.3f} Recall: {rp:.3f} F1: {f1p:.3f}")

    # Threshold tuning: a lower cut-off favours recall, a higher one precision.
    thr_recall = 0.35
    thr_precision = 0.65

    pred_recall = (proba >= thr_recall).astype(int)
    pred_precision = (proba >= thr_precision).astype(int)

    pr_recall, rc_recall, f1_recall = _binary_prf(y_test, pred_recall)
    pr_precision, rc_precision, f1_precision = _binary_prf(y_test, pred_precision)

    print(f"\n🎛️ Threshold {thr_recall:.2f} → Precision: {pr_recall:.3f} Recall: {rc_recall:.3f} F1: {f1_recall:.3f}")
    print(f"🎛️ Threshold {thr_precision:.2f} → Precision: {pr_precision:.3f} Recall: {rc_precision:.3f} F1: {f1_precision:.3f}")

    os.makedirs("models", exist_ok=True)
    joblib.dump(clf, "models/rain_classifier_hourly.joblib")
    print("\n💾 Saved: models/rain_classifier_hourly.joblib")

    meta = {
        "horizon_hours": H,
        "features": features,
        "thresholds": {
            "default": 0.50,
            "high_recall": thr_recall,
            "high_precision": thr_precision,
        },
        "metrics": {
            "default": {
                "precision": float(precision),
                "recall": float(recall),
                "f1": float(f1),
            },
            "high_recall": {
                "precision": float(pr_recall),
                "recall": float(rc_recall),
                "f1": float(f1_recall),
            },
            "high_precision": {
                "precision": float(pr_precision),
                "recall": float(rc_precision),
                "f1": float(f1_precision),
            },
            "baseline_persistence": {
                "precision": float(pp),
                "recall": float(rp),
                "f1": float(f1p),
            },
        },
    }

    with open("models/rain_model_meta.json", "w", encoding="utf-8") as fh:
        json.dump(meta, fh, indent=2)

    print("📝 Saved: models/rain_model_meta.json")


if __name__ == "__main__":
    main()