Spaces:
Runtime error
Runtime error
Commit ·
6eff894
0
Parent(s):
Deployable Gradio build
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- .DS_Store +0 -0
- .dockerignore +14 -0
- .gitattributes +1 -0
- .gitignore +7 -0
- Dockerfile +16 -0
- LICENSE +21 -0
- Makefile +94 -0
- README.md +179 -0
- app.py +156 -0
- app/main.py +293 -0
- assets/cover.png +0 -0
- assets/feature_importance.png +0 -0
- assets/pr_curve.png +0 -0
- assets/precip.png +0 -0
- assets/roc_curve.png +0 -0
- assets/temps.png +0 -0
- models/rain_model_meta.json +84 -0
- models/rain_xgb_cal_meta.json +117 -0
- models/rain_xgb_meta.json +94 -0
- models/rain_xgb_tuned_meta.json +111 -0
- models/xgb_tuned.json +11 -0
- pyproject.toml +26 -0
- render.yaml +8 -0
- requirements.txt +32 -0
- scripts/analyze_weather.py +11 -0
- scripts/backfill_labels.py +53 -0
- scripts/coef_rain.py +32 -0
- scripts/cron_predict.sh +48 -0
- scripts/cv_benchmark.py +243 -0
- scripts/download_models.py +41 -0
- scripts/eval_operating_points.py +49 -0
- scripts/explain_shap.py +62 -0
- scripts/explain_shap_interaction.py +105 -0
- scripts/export_daily.py +18 -0
- scripts/export_hourly.py +33 -0
- scripts/feature_importance_rain.py +79 -0
- scripts/fetch_weather.sh +21 -0
- scripts/intro_ml.py +28 -0
- scripts/log_predict.py +67 -0
- scripts/make_cover.py +59 -0
- scripts/monitor_weekly.py +60 -0
- scripts/plot_pr_roc.py +65 -0
- scripts/plot_weather.py +72 -0
- scripts/predict_rain.py +29 -0
- scripts/process_weather.py +56 -0
- scripts/rain_cli.py +34 -0
- scripts/start_services.sh +28 -0
- scripts/time_series_cv_demo.py +38 -0
- scripts/train_classify_rain.py +93 -0
- scripts/train_classify_rain_hourly.py +143 -0
.DS_Store
ADDED
|
Binary file (6.15 kB). View file
|
|
|
.dockerignore
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.git
|
| 2 |
+
.github
|
| 3 |
+
.venv
|
| 4 |
+
__pycache__
|
| 5 |
+
*.pyc
|
| 6 |
+
*.pyo
|
| 7 |
+
*.pyd
|
| 8 |
+
*.log
|
| 9 |
+
*.csv
|
| 10 |
+
data/
|
| 11 |
+
results/
|
| 12 |
+
build/
|
| 13 |
+
dist/
|
| 14 |
+
node_modules/
|
.gitattributes
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
models/*.joblib filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
data/
|
| 2 |
+
results/
|
| 3 |
+
logs/
|
| 4 |
+
__pycache__/
|
| 5 |
+
*.zip
|
| 6 |
+
.env
|
| 7 |
+
models/*.joblib
|
Dockerfile
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.11-slim
|
| 2 |
+
|
| 3 |
+
WORKDIR /app
|
| 4 |
+
|
| 5 |
+
RUN apt-get update && apt-get install -y --no-install-recommends \
|
| 6 |
+
build-essential && \
|
| 7 |
+
rm -rf /var/lib/apt/lists/*
|
| 8 |
+
|
| 9 |
+
COPY requirements.txt .
|
| 10 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 11 |
+
|
| 12 |
+
COPY . .
|
| 13 |
+
|
| 14 |
+
ENV PORT=7860
|
| 15 |
+
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]
|
| 16 |
+
|
LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
MIT License
|
| 2 |
+
|
| 3 |
+
Copyright (c) 2025 Elvis Anselm
|
| 4 |
+
|
| 5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 6 |
+
of this software and associated documentation files (the "Software"), to deal
|
| 7 |
+
in the Software without restriction, including without limitation the rights
|
| 8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 9 |
+
copies of the Software, and to permit persons to whom the Software is
|
| 10 |
+
furnished to do so, subject to the following conditions:
|
| 11 |
+
|
| 12 |
+
The above copyright notice and this permission notice shall be included in all
|
| 13 |
+
copies or substantial portions of the Software.
|
| 14 |
+
|
| 15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
| 18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
| 19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
| 20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
| 21 |
+
SOFTWARE.
|
Makefile
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
.PHONY: all check download process zip clean coords plot setup viz cli \
|
| 3 |
+
install uninstall rain rain6 rain-train rain-predict rain-now eval-plots \
|
| 4 |
+
hourly
|
| 5 |
+
|
| 6 |
+
all: check download process zip
|
| 7 |
+
@echo "🏁 Weather pipeline complete."
|
| 8 |
+
|
| 9 |
+
check:
|
| 10 |
+
@command -v curl >/dev/null || (echo "curl missing"; exit 1)
|
| 11 |
+
@command -v python3 >/dev/null || (echo "python3 missing"; exit 1)
|
| 12 |
+
@[ -f scripts/fetch_weather.sh ] || (echo "missing scripts/fetch_weather.sh"; exit 1)
|
| 13 |
+
@[ -f scripts/process_weather.py ] || (echo "missing scripts/process_weather.py"; exit 1)
|
| 14 |
+
|
| 15 |
+
download:
|
| 16 |
+
@bash scripts/fetch_weather.sh
|
| 17 |
+
|
| 18 |
+
process:
|
| 19 |
+
@python3 scripts/process_weather.py
|
| 20 |
+
|
| 21 |
+
zip:
|
| 22 |
+
@zip -j results/results.zip results/summary.txt results/summary.csv
|
| 23 |
+
|
| 24 |
+
clean:
|
| 25 |
+
@rm -rf data results logs
|
| 26 |
+
@mkdir -p data results logs
|
| 27 |
+
|
| 28 |
+
coords:
|
| 29 |
+
@LAT="$(LAT)" LON="$(LON)" bash scripts/fetch_weather.sh
|
| 30 |
+
@python3 scripts/process_weather.py
|
| 31 |
+
@zip -j results/results.zip results/summary.txt
|
| 32 |
+
|
| 33 |
+
plot:
|
| 34 |
+
@python3 scripts/plot_weather.py
|
| 35 |
+
|
| 36 |
+
viz: all plot
|
| 37 |
+
@echo "📊 Charts generated."
|
| 38 |
+
|
| 39 |
+
setup:
|
| 40 |
+
@python3 -m venv .venv
|
| 41 |
+
@. .venv/bin/activate && pip install -r requirements.txt
|
| 42 |
+
|
| 43 |
+
cli:
|
| 44 |
+
@python3 scripts/weather_cli.py --city Lagos --lat 6.5244 --lon 3.3792
|
| 45 |
+
|
| 46 |
+
install:
|
| 47 |
+
@. .venv/bin/activate && pip install -e .
|
| 48 |
+
|
| 49 |
+
uninstall:
|
| 50 |
+
@. .venv/bin/activate && pip uninstall -y weather-data-fetcher
|
| 51 |
+
|
| 52 |
+
rain:
|
| 53 |
+
@python3 scripts/train_classify_rain.py
|
| 54 |
+
|
| 55 |
+
rain6:
|
| 56 |
+
@python3 scripts/train_classify_rain_hourly.py
|
| 57 |
+
|
| 58 |
+
rain-train:
|
| 59 |
+
@python3 scripts/train_rain_dual_thresholds.py
|
| 60 |
+
|
| 61 |
+
rain-predict:
|
| 62 |
+
@python3 scripts/predict_rain.py
|
| 63 |
+
|
| 64 |
+
rain-now:
|
| 65 |
+
@weather-cli rain --mode recall
|
| 66 |
+
|
| 67 |
+
eval-plots:
|
| 68 |
+
@python3 scripts/plot_pr_roc.py
|
| 69 |
+
|
| 70 |
+
hourly:
|
| 71 |
+
@LAT="$(LAT)" LON="$(LON)" PAST_DAYS="$(PAST_DAYS)" bash scripts/fetch_weather.sh
|
| 72 |
+
@python3 scripts/export_hourly.py
|
| 73 |
+
|
| 74 |
+
.PHONY: xgb-train
|
| 75 |
+
xgb-train:
|
| 76 |
+
@python3 scripts/train_xgb_12h.py
|
| 77 |
+
|
| 78 |
+
.PHONY: xgb-train-cal
|
| 79 |
+
xgb-train-cal:
|
| 80 |
+
@python3 scripts/train_xgb_12h_calibrated.py
|
| 81 |
+
|
| 82 |
+
.PHONY: predict-log backfill monitor
|
| 83 |
+
predict-log:
|
| 84 |
+
@python3 scripts/log_predict.py --city "Lagos" --lat 6.5244 --lon 3.3792 --mode default
|
| 85 |
+
|
| 86 |
+
backfill:
|
| 87 |
+
@python3 scripts/backfill_labels.py
|
| 88 |
+
|
| 89 |
+
monitor:
|
| 90 |
+
@python3 scripts/monitor_weekly.py
|
| 91 |
+
|
| 92 |
+
.PHONY: cron-test
|
| 93 |
+
cron-test:
|
| 94 |
+
@./scripts/cron_predict.sh default "Lagos" 6.5244 3.3792 90 >> logs/cron.log 2>&1 && tail -n 5 logs/cron.log
|
README.md
ADDED
|
@@ -0,0 +1,179 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Weather Data Fetcher
|
| 3 |
+
emoji: 🌧️
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: gray
|
| 6 |
+
sdk: docker
|
| 7 |
+
hub: registry.hf.space/theelvace/weather-data-fetcher-api:latest
|
| 8 |
+
pinned: false
|
| 9 |
+
---
|
| 10 |
+
|
| 11 |
+
# Weather Data Fetcher — Automated Data Pipeline
|
| 12 |
+
|
| 13 |
+
Fetch daily Lagos (or any city) weather data using **Open-Meteo API**, process it with **Python**, and automate the full workflow via **Bash + Makefile**.
|
| 14 |
+
|
| 15 |
+
---
|
| 16 |
+
|
| 17 |
+
## Project Overview
|
| 18 |
+
|
| 19 |
+
This project demonstrates a clean, reproducible workflow for data automation — the same principles used in ML and DevOps pipelines.
|
| 20 |
+
|
| 21 |
+
**Pipeline Steps**
|
| 22 |
+
|
| 23 |
+
1. Download daily weather JSON from Open-Meteo
|
| 24 |
+
2. Parse, validate, and summarize data in Python
|
| 25 |
+
3. Generate text + CSV summaries (and optional plots)
|
| 26 |
+
4. Automate everything via a single `make all` command
|
| 27 |
+
|
| 28 |
+
---
|
| 29 |
+
|
| 30 |
+
## Charts
|
| 31 |
+
|
| 32 |
+
## 🌧️ Rain Warning (next 6 hours)
|
| 33 |
+
|
| 34 |
+
Predict **whether it will rain in the next 6 hours** from hourly observations (temperature, humidity, pressure, wind, cloud cover, precipitation).
|
| 35 |
+
|
| 36 |
+
| Mode | Threshold | Precision | Recall | When to use |
|
| 37 |
+
| -------------- | --------- | --------- | ------ | ---------------------- |
|
| 38 |
+
| Default | 0.50 | 0.71 | 0.70 | Balanced alerts |
|
| 39 |
+
| High recall | 0.35 | 0.68 | 0.84 | Better safe than sorry |
|
| 40 |
+
| High precision | 0.65 | 0.79 | 0.50 | Only warn if confident |
|
| 41 |
+
|
| 42 |
+
### Train once
|
| 43 |
+
|
| 44 |
+
```bash
|
| 45 |
+
make hourly
|
| 46 |
+
make rain-train
|
| 47 |
+
make rain-now
|
| 48 |
+
python scripts/train_rain_dual_thresholds.py
|
| 49 |
+
python scripts/plot_pr_roc.py # refresh PR/ROC charts
|
| 50 |
+
```
|
| 51 |
+
|
| 52 |
+
This produces:
|
| 53 |
+
|
| 54 |
+
- `models/rain_classifier_hourly.joblib`
|
| 55 |
+
- `models/rain_model_meta.json`
|
| 56 |
+
- `results/pr_curve.png`, `results/roc_curve.png`
|
| 57 |
+
|
| 58 |
+
### Predict from the latest hour
|
| 59 |
+
|
| 60 |
+
```bash
|
| 61 |
+
weather-cli rain --mode recall # warn more often
|
| 62 |
+
weather-cli rain --mode precision # fewer false alarms
|
| 63 |
+
```
|
| 64 |
+
|
| 65 |
+
Example output:
|
| 66 |
+
|
| 67 |
+
```
|
| 68 |
+
2025-10-26 23:00:00 | P(rain ≤6h)=0.492 | mode=recall thr=0.35 → RAIN
|
| 69 |
+
2025-10-26 23:00:00 | P(rain ≤6h)=0.492 | mode=precision thr=0.65 → No rain
|
| 70 |
+
```
|
| 71 |
+
|
| 72 |
+
### How thresholds are chosen
|
| 73 |
+
|
| 74 |
+
Training sweeps precision–recall trade-offs and stores two operating points:
|
| 75 |
+
|
| 76 |
+
| Threshold type | Purpose |
|
| 77 |
+
| -------------- | ------------------------------ |
|
| 78 |
+
| High recall | Catch >80 % of rain events |
|
| 79 |
+
| High precision | Warn only when ≥90 % confident |
|
| 80 |
+
|
| 81 |
+

|
| 82 |
+

|
| 83 |
+
|
| 84 |
+
### Model Interpretability
|
| 85 |
+
|
| 86 |
+
ML is not useful unless we can understand what it learned. This section explains why the classifier predicts rain, and not just whether it predicts rain.
|
| 87 |
+
|
| 88 |
+
- **Feature Coefficients (standardized):** which signals push toward rain vs no-rain
|
| 89 |
+
|
| 90 |
+
```bash
|
| 91 |
+
python scripts/coef_rain.py # writes top weights
|
| 92 |
+
```
|
| 93 |
+
|
| 94 |
+
Output → `results/coef_top15.txt`
|
| 95 |
+
|
| 96 |
+
- **Permutation importance:** which features matter most to F1 on the test set.
|
| 97 |
+
This tells us which variables the model relies on the most when making real predictions.
|
| 98 |
+
```bash
|
| 99 |
+
python scripts/feature_importance_rain.py
|
| 100 |
+
```
|
| 101 |
+
Output → `results/feature_importance.png`
|
| 102 |
+
|
| 103 |
+
It engineers both raw signals and short-term deltas/rolling means. Positive coefficients push toward “RAIN”, negative toward “No rain”.
|
| 104 |
+
|
| 105 |
+
### What the model actually learned (top signals)
|
| 106 |
+
|
| 107 |
+
| Feature | Meaning |
|
| 108 |
+
| ------------ | -------------------------------------------------------------------- |
|
| 109 |
+
| `precip_mm` | Existing rainfall strongly predicts more rain (tropical persistence) |
|
| 110 |
+
| `temp_c` | Warmer air holds more moisture → higher chance of near-term rain |
|
| 111 |
+
| `humidity` | High saturation = cloud condensation is likely |
|
| 112 |
+
| `pressure` | Falling pressure indicates unstable atmosphere / storm formation |
|
| 113 |
+
| `cloudcover` | More clouds = conditions building toward rainfall |
|
| 114 |
+
| `wind_speed` | Negative weight — stronger winds can disperse moisture |
|
| 115 |
+
|
| 116 |
+
The classifier isn’t guessing; it’s surfacing familiar meteorological patterns.
|
| 117 |
+
|
| 118 |
+
### What drives the rain predictions?
|
| 119 |
+
|
| 120 |
+
Using SHAP explainability, I found that the model mainly relies on **humidity** and **temperature** when deciding if it will rain in the next 12 hours.
|
| 121 |
+
|
| 122 |
+
- High humidity pushes the model strongly toward predicting rain.
|
| 123 |
+
- Lower temperatures slightly increase rain probability.
|
| 124 |
+
- The interaction between humidity and temperature mimics real-world weather dynamics — humid, cool conditions tend to precede rainfall.
|
| 125 |
+
|
| 126 |
+
This means the model isn’t just memorizing data — it has captured meaningful relationships that align with atmospheric science.
|
| 127 |
+
|
| 128 |
+

|
| 129 |
+
|
| 130 |
+
> Generated via `python scripts/explain_shap_interaction.py`, which also writes `results/shap_interaction_rev.png` for the reverse view.
|
| 131 |
+
|
| 132 |
+
## 🌧️ Rain Events (≥1.0 mm in next 12h)
|
| 133 |
+
|
| 134 |
+
**Label:** “Rain event if cumulative precipitation ≥ **1.0 mm** within the next **12 hours**.”
|
| 135 |
+
**Policy:** Default to **Early Warning** (recall-leaning) for Lagos conditions. Offer a stricter **Cautious Alert** mode.
|
| 136 |
+
|
| 137 |
+
**Train / thresholds / predict**
|
| 138 |
+
|
| 139 |
+
```bash
|
| 140 |
+
# (data) pull 90 days of hourly data
|
| 141 |
+
make hourly PAST_DAYS=90
|
| 142 |
+
|
| 143 |
+
# (model) train XGBoost + Isotonic calibration
|
| 144 |
+
python scripts/train_xgb_12h_calibrated.py
|
| 145 |
+
|
| 146 |
+
# (CLI) two operating modes
|
| 147 |
+
weather-cli rain --mode recall # Early Warning (higher recall)
|
| 148 |
+
weather-cli rain --mode precision # Cautious Alert (stricter)
|
| 149 |
+
weather-cli rain # Balanced (best F1)
|
| 150 |
+
```
|
| 151 |
+
|
| 152 |
+
### 🌧️ Rain Warning (next 12h)
|
| 153 |
+
Train tuned model + set guarded thresholds:
|
| 154 |
+
|
| 155 |
+
```bash
|
| 156 |
+
python scripts/xgb_tune_timeseries.py
|
| 157 |
+
python scripts/train_xgb_tuned_final.py
|
| 158 |
+
cp models/rain_xgb_tuned.joblib models/rain_classifier_hourly.joblib
|
| 159 |
+
cp models/rain_xgb_tuned_meta.json models/rain_model_meta.json
|
| 160 |
+
```
|
| 161 |
+
|
| 162 |
+
## Run Locally
|
| 163 |
+
|
| 164 |
+
Clone and run:
|
| 165 |
+
|
| 166 |
+
```bash
|
| 167 |
+
make all
|
| 168 |
+
```
|
| 169 |
+
|
| 170 |
+
## CLI
|
| 171 |
+
|
| 172 |
+
Install (editable):
|
| 173 |
+
|
| 174 |
+
```bash
|
| 175 |
+
python3 -m venv .venv && source .venv/bin/activate
|
| 176 |
+
pip install -e .
|
| 177 |
+
```
|
| 178 |
+
|
| 179 |
+
Once installed, run `weather-cli --help` for all commands (including the rain mode above).
|
app.py
ADDED
|
@@ -0,0 +1,156 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import json
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
import numpy as np
|
| 5 |
+
import pandas as pd
|
| 6 |
+
import streamlit as st
|
| 7 |
+
import joblib
|
| 8 |
+
import subprocess
|
| 9 |
+
import os
|
| 10 |
+
from datetime import datetime, timedelta
|
| 11 |
+
|
| 12 |
+
# Settings
|
| 13 |
+
MODEL_PATH = Path("models/rain_xgb_tuned.joblib")
|
| 14 |
+
META_PATH = Path("models/rain_xgb_tuned_meta.json")
|
| 15 |
+
HOURLY_CSV = Path("results/hourly.csv")
|
| 16 |
+
|
| 17 |
+
# Load model + meta
|
| 18 |
+
@st.cache_resource
|
| 19 |
+
def load_model():
|
| 20 |
+
if not (MODEL_PATH.exists() and META_PATH.exists()):
|
| 21 |
+
st.error("Trained model not found. Run: python scripts/xgb_tune_timeseries.py && python scripts/train_xgb_tuned_final.py")
|
| 22 |
+
st.stop()
|
| 23 |
+
clf = joblib.load(MODEL_PATH)
|
| 24 |
+
meta = json.loads(META_PATH.read_text())
|
| 25 |
+
return clf, meta
|
| 26 |
+
|
| 27 |
+
def build_features_like_training(df: pd.DataFrame, features: list) -> pd.DataFrame:
|
| 28 |
+
from scripts.train_xgb_tuned_final import build_features # reuse your code
|
| 29 |
+
Xdf = build_features(df)
|
| 30 |
+
return Xdf[features]
|
| 31 |
+
|
| 32 |
+
def ensure_hourly(lat: float, lon: float, past_days: int = 90) -> pd.DataFrame:
|
| 33 |
+
env = os.environ.copy()
|
| 34 |
+
env["LAT"] = str(lat)
|
| 35 |
+
env["LON"] = str(lon)
|
| 36 |
+
env["PAST_DAYS"] = str(past_days)
|
| 37 |
+
|
| 38 |
+
# If file is missing or stale (>12h), refresh
|
| 39 |
+
needs_refresh = True
|
| 40 |
+
if HOURLY_CSV.exists():
|
| 41 |
+
age_hours = (datetime.now() - datetime.fromtimestamp(HOURLY_CSV.stat().st_mtime)).total_seconds() / 3600.0
|
| 42 |
+
needs_refresh = age_hours > 12
|
| 43 |
+
|
| 44 |
+
if (not HOURLY_CSV.exists()) or needs_refresh:
|
| 45 |
+
st.info("Fetching fresh hourly weather…")
|
| 46 |
+
subprocess.run(["bash", "scripts/fetch_weather.sh"], check=True, env=env)
|
| 47 |
+
subprocess.run(["python3", "scripts/export_hourly.py"], check=True, env=env)
|
| 48 |
+
|
| 49 |
+
return pd.read_csv(HOURLY_CSV, parse_dates=["time"])
|
| 50 |
+
|
| 51 |
+
# UI
|
| 52 |
+
st.set_page_config(page_title="Rain Nowcast (12h)", page_icon="🌧️", layout="centered")
|
| 53 |
+
st.title("🌧️ Rain Nowcast — next 12 hours")
|
| 54 |
+
|
| 55 |
+
clf, meta = load_model()
|
| 56 |
+
features = meta["features"]
|
| 57 |
+
thr = meta["thresholds"]
|
| 58 |
+
horizon_h = meta["horizon_hours"]
|
| 59 |
+
|
| 60 |
+
# Presets for cities
|
| 61 |
+
CITY_PRESETS = {
|
| 62 |
+
"Lagos 🇳🇬": (6.5244, 3.3792),
|
| 63 |
+
"Accra 🇬🇭": (5.6037, -0.1870),
|
| 64 |
+
"Nairobi 🇰🇪": (-1.2864, 36.8172),
|
| 65 |
+
"Kampala 🇺🇬": (0.3476, 32.5825),
|
| 66 |
+
"Addis 🇪🇹": (8.9806, 38.7578),
|
| 67 |
+
}
|
| 68 |
+
|
| 69 |
+
col1, col2 = st.columns(2)
|
| 70 |
+
with col1:
|
| 71 |
+
city = st.selectbox("City", list(CITY_PRESETS.keys()), index=0)
|
| 72 |
+
with col2:
|
| 73 |
+
mode = st.selectbox("Decision mode", ["default", "recall", "precision"], index=0)
|
| 74 |
+
|
| 75 |
+
lat, lon = CITY_PRESETS[city]
|
| 76 |
+
st.caption(f"Lat/Lon: **{lat:.4f}, {lon:.4f}** • Horizon: **{horizon_h}h** • Mode: **{mode}**")
|
| 77 |
+
|
| 78 |
+
df = ensure_hourly(lat, lon, past_days=90)
|
| 79 |
+
|
| 80 |
+
Xdf = build_features_like_training(df.copy(), features)
|
| 81 |
+
if Xdf.empty:
|
| 82 |
+
st.error("Not enough data to build features. Try again after fetch.")
|
| 83 |
+
st.stop()
|
| 84 |
+
|
| 85 |
+
x_last = Xdf.iloc[[-1]].values
|
| 86 |
+
p = float(clf.predict_proba(x_last)[0, 1])
|
| 87 |
+
thr_map = {
|
| 88 |
+
"default": float(thr["default"]),
|
| 89 |
+
"recall": float(thr["high_recall"]),
|
| 90 |
+
"precision": float(thr["high_precision"]),
|
| 91 |
+
}
|
| 92 |
+
t = thr_map[mode]
|
| 93 |
+
decision = "RAIN" if p >= t else "No rain"
|
| 94 |
+
|
| 95 |
+
st.subheader("Prediction")
|
| 96 |
+
st.metric(
|
| 97 |
+
label=f"P(rain ≤ {horizon_h}h)",
|
| 98 |
+
value=f"{p:.3f}",
|
| 99 |
+
delta=f"threshold={t:.2f}",
|
| 100 |
+
delta_color="inverse" if p < t else "normal"
|
| 101 |
+
)
|
| 102 |
+
st.markdown(
|
| 103 |
+
f"**Decision:** {'🌧️ RAIN' if decision=='RAIN' else '✅ No rain'} "
|
| 104 |
+
f"(mode **{mode}**, threshold **{t:.2f}**)"
|
| 105 |
+
)
|
| 106 |
+
|
| 107 |
+
st.subheader("Last 48h — context")
|
| 108 |
+
last48 = df.tail(48).copy()
|
| 109 |
+
c1, c2 = st.columns(2)
|
| 110 |
+
with c1:
|
| 111 |
+
st.line_chart(data=last48.set_index("time")[["temp_c", "humidity"]])
|
| 112 |
+
with c2:
|
| 113 |
+
st.line_chart(data=last48.set_index("time")[["precip_mm", "rain_mm"]])
|
| 114 |
+
|
| 115 |
+
# --- Probability sparkline over last 48h ---
|
| 116 |
+
st.subheader("Last 48h — rain probability")
|
| 117 |
+
# Recompute probabilities for all available rows, then show last 48 aligned to time
|
| 118 |
+
probas_all = clf.predict_proba(Xdf.values)[:, 1]
|
| 119 |
+
proba_series = pd.Series(probas_all, index=Xdf.index, name="p_rain")
|
| 120 |
+
# Align times (Xdf is derived from df; both share row order except dropped NaNs at head)
|
| 121 |
+
times_aligned = df.loc[Xdf.index, "time"]
|
| 122 |
+
last48_p = pd.DataFrame({"time": times_aligned, "p_rain": proba_series}).tail(48).set_index("time")
|
| 123 |
+
st.line_chart(last48_p)
|
| 124 |
+
|
| 125 |
+
# --- Download buttons ---
|
| 126 |
+
st.subheader("Downloads")
|
| 127 |
+
st.download_button(
|
| 128 |
+
label="⬇️ Download hourly.csv",
|
| 129 |
+
data=df.to_csv(index=False).encode("utf-8"),
|
| 130 |
+
file_name="hourly.csv",
|
| 131 |
+
mime="text/csv",
|
| 132 |
+
)
|
| 133 |
+
|
| 134 |
+
latest_frame = pd.DataFrame({
|
| 135 |
+
"time": [df.loc[Xdf.index, "time"].iloc[-1]],
|
| 136 |
+
"p_rain_next_12h": [p],
|
| 137 |
+
"mode": [mode],
|
| 138 |
+
"threshold": [t],
|
| 139 |
+
"decision": [decision],
|
| 140 |
+
})
|
| 141 |
+
st.download_button(
|
| 142 |
+
label="⬇️ Download latest_prediction.csv",
|
| 143 |
+
data=latest_frame.to_csv(index=False).encode("utf-8"),
|
| 144 |
+
file_name="latest_prediction.csv",
|
| 145 |
+
mime="text/csv",
|
| 146 |
+
)
|
| 147 |
+
|
| 148 |
+
# Explain thresholds
|
| 149 |
+
with st.expander("What do these modes mean?"):
|
| 150 |
+
st.write("""
|
| 151 |
+
- **default**: balanced (good everyday choice)
|
| 152 |
+
- **recall**: warn more (catches more rain, may over-warn)
|
| 153 |
+
- **precision**: be picky (alerts are rare but confident)
|
| 154 |
+
""")
|
| 155 |
+
|
| 156 |
+
st.caption("Model: XGBoost (tuned) • Features rebuilt exactly like training • Data: Open-Meteo hourly")
|
app/main.py
ADDED
|
@@ -0,0 +1,293 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
FastAPI application exposing the rain nowcast API and a Gradio UI.
|
| 3 |
+
|
| 4 |
+
The previous Streamlit proxy was difficult to keep alive on Spaces due to
|
| 5 |
+
websocket restrictions. This module provides the same REST endpoints while
|
| 6 |
+
mounting a lightweight Gradio front-end so the UI works without websocket
|
| 7 |
+
tunnelling.
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
from __future__ import annotations
|
| 11 |
+
|
| 12 |
+
import os
|
| 13 |
+
import json
|
| 14 |
+
import subprocess
|
| 15 |
+
from datetime import datetime
|
| 16 |
+
from pathlib import Path
|
| 17 |
+
from typing import Dict, Tuple
|
| 18 |
+
|
| 19 |
+
import joblib
|
| 20 |
+
import pandas as pd
|
| 21 |
+
import gradio as gr
|
| 22 |
+
from fastapi import FastAPI, HTTPException, Query
|
| 23 |
+
from pydantic import BaseModel, Field
|
| 24 |
+
from xgboost import XGBClassifier
|
| 25 |
+
|
| 26 |
+
# --------- Paths ---------
|
| 27 |
+
ROOT = Path(__file__).resolve().parents[1]
|
| 28 |
+
MODELS = ROOT / "models"
|
| 29 |
+
RESULTS = ROOT / "results"
|
| 30 |
+
SCRIPTS = ROOT / "scripts"
|
| 31 |
+
|
| 32 |
+
MODEL_PATH = MODELS / "rain_xgb_tuned.joblib"
|
| 33 |
+
META_PATH = MODELS / "rain_xgb_tuned_meta.json"
|
| 34 |
+
MODEL_JSON_PATH = MODELS / "xgb_tuned.json"
|
| 35 |
+
HOURLY_CSV = RESULTS / "hourly.csv"
|
| 36 |
+
|
| 37 |
+
# Make training utilities importable.
|
| 38 |
+
import sys
|
| 39 |
+
|
| 40 |
+
sys.path.insert(0, str(ROOT))
|
| 41 |
+
from scripts.train_xgb_tuned_final import build_features # type: ignore
|
| 42 |
+
|
| 43 |
+
# --------- Load model + meta at startup ---------
|
| 44 |
+
if not META_PATH.exists():
|
| 45 |
+
raise RuntimeError(
|
| 46 |
+
"Model metadata missing. Run `python scripts/train_xgb_tuned_final.py` "
|
| 47 |
+
"or copy models/rain_xgb_tuned_meta.json into place."
|
| 48 |
+
)
|
| 49 |
+
|
| 50 |
+
meta = json.loads(META_PATH.read_text())
|
| 51 |
+
FEATURES = meta["features"]
|
| 52 |
+
THRESH = meta["thresholds"]
|
| 53 |
+
HORIZON_H = int(meta["horizon_hours"])
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def _load_model() -> XGBClassifier:
|
| 57 |
+
if MODEL_PATH.exists():
|
| 58 |
+
return joblib.load(MODEL_PATH)
|
| 59 |
+
|
| 60 |
+
if MODEL_JSON_PATH.exists():
|
| 61 |
+
params = meta.get("model", {}).get("params", {})
|
| 62 |
+
booster = XGBClassifier(**params)
|
| 63 |
+
booster.load_model(MODEL_JSON_PATH)
|
| 64 |
+
return booster
|
| 65 |
+
|
| 66 |
+
raise RuntimeError(
|
| 67 |
+
"Model artifact missing. Run `python scripts/train_xgb_tuned_final.py` "
|
| 68 |
+
"to generate models/rain_xgb_tuned.joblib (or xgb_tuned.json), "
|
| 69 |
+
"or copy the trained file into the models/ directory."
|
| 70 |
+
)
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
model = _load_model()
|
| 74 |
+
|
| 75 |
+
# --------- Helpers ---------
|
| 76 |
+
def ensure_hourly(lat: float, lon: float, past_days: int = 90) -> pd.DataFrame:
|
| 77 |
+
"""Refresh the cached hourly CSV when it is missing or stale."""
|
| 78 |
+
env = os.environ.copy()
|
| 79 |
+
env["LAT"] = str(lat)
|
| 80 |
+
env["LON"] = str(lon)
|
| 81 |
+
env["PAST_DAYS"] = str(past_days)
|
| 82 |
+
|
| 83 |
+
needs_refresh = True
|
| 84 |
+
if HOURLY_CSV.exists():
|
| 85 |
+
age_hours = (datetime.now().timestamp() - HOURLY_CSV.stat().st_mtime) / 3600
|
| 86 |
+
needs_refresh = age_hours > 6
|
| 87 |
+
|
| 88 |
+
if (not HOURLY_CSV.exists()) or needs_refresh:
|
| 89 |
+
try:
|
| 90 |
+
subprocess.run(["bash", str(SCRIPTS / "fetch_weather.sh")], check=True, env=env)
|
| 91 |
+
subprocess.run(["python3", str(SCRIPTS / "export_hourly.py")], check=True, env=env)
|
| 92 |
+
except subprocess.CalledProcessError as exc:
|
| 93 |
+
raise HTTPException(status_code=502, detail=f"Data refresh failed: {exc}") from exc
|
| 94 |
+
|
| 95 |
+
return pd.read_csv(HOURLY_CSV, parse_dates=["time"])
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
def predict_latest(df: pd.DataFrame, mode: str) -> Dict[str, object]:
|
| 99 |
+
"""Build features, score the latest hour, and return a structured response."""
|
| 100 |
+
Xdf = build_features(df.copy())
|
| 101 |
+
if Xdf.empty:
|
| 102 |
+
raise HTTPException(status_code=422, detail="Not enough rows to build features.")
|
| 103 |
+
|
| 104 |
+
try:
|
| 105 |
+
Xdf = Xdf[FEATURES]
|
| 106 |
+
except KeyError as exc:
|
| 107 |
+
raise HTTPException(status_code=500, detail=f"Feature mismatch: {exc}") from exc
|
| 108 |
+
|
| 109 |
+
x = Xdf.iloc[[-1]].values
|
| 110 |
+
probability = float(model.predict_proba(x)[0, 1])
|
| 111 |
+
|
| 112 |
+
thresholds = {
|
| 113 |
+
"default": float(THRESH["default"]),
|
| 114 |
+
"recall": float(THRESH["high_recall"]),
|
| 115 |
+
"precision": float(THRESH["high_precision"]),
|
| 116 |
+
}
|
| 117 |
+
if mode not in thresholds:
|
| 118 |
+
raise HTTPException(status_code=400, detail=f"Unsupported mode '{mode}'.")
|
| 119 |
+
|
| 120 |
+
threshold = thresholds[mode]
|
| 121 |
+
decision = "RAIN" if probability >= threshold else "No rain"
|
| 122 |
+
ts = df.loc[Xdf.index, "time"].iloc[-1]
|
| 123 |
+
|
| 124 |
+
return {
|
| 125 |
+
"timestamp": ts.isoformat(),
|
| 126 |
+
"probability": probability,
|
| 127 |
+
"threshold": threshold,
|
| 128 |
+
"mode": mode,
|
| 129 |
+
"decision": decision,
|
| 130 |
+
"horizon_hours": HORIZON_H,
|
| 131 |
+
}
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
def format_prediction(result: Dict[str, object]) -> str:
    """Render a short markdown summary of a prediction result for the UI."""
    icon = "🌧️" if result["decision"] == "RAIN" else "✅"
    parts = [
        f"{icon} **Decision:** {result['decision']} (mode **{result['mode']}**)\n",
        f"- Probability of rain ≤ {HORIZON_H}h: **{result['probability']:.3f}**",
        f"- Threshold: **{result['threshold']:.2f}**",
        f"- Issued for hour ending **{result['timestamp']}**",
    ]
    return "\n".join(parts)
|
| 147 |
+
|
| 148 |
+
|
| 149 |
+
class PredictBody(BaseModel):
    """Request payload for POST /predict (location, decision mode, history window)."""

    lat: float = Field(6.5244, description="Latitude")
    lon: float = Field(3.3792, description="Longitude")
    mode: str = Field("default", description="default | recall | precision")
    past_days: int = Field(90, ge=14, le=180, description="How much history to fetch (days)")
|
| 154 |
+
|
| 155 |
+
|
| 156 |
+
app = FastAPI(title="Rain Nowcast API", version="1.1.0")
|
| 157 |
+
|
| 158 |
+
|
| 159 |
+
@app.get("/health")
|
| 160 |
+
def health() -> Dict[str, object]:
|
| 161 |
+
return {
|
| 162 |
+
"status": "ok",
|
| 163 |
+
"model_file": MODEL_PATH.name,
|
| 164 |
+
"horizon_hours": HORIZON_H,
|
| 165 |
+
"thresholds": THRESH,
|
| 166 |
+
"features": FEATURES,
|
| 167 |
+
}
|
| 168 |
+
|
| 169 |
+
|
| 170 |
+
@app.post("/predict")
|
| 171 |
+
def predict(body: PredictBody) -> Dict[str, object]:
|
| 172 |
+
df = ensure_hourly(body.lat, body.lon, body.past_days)
|
| 173 |
+
out = predict_latest(df, body.mode)
|
| 174 |
+
return {"ok": True, "result": out}
|
| 175 |
+
|
| 176 |
+
|
| 177 |
+
@app.get("/predict")
|
| 178 |
+
def predict_get(
|
| 179 |
+
lat: float = Query(6.5244),
|
| 180 |
+
lon: float = Query(3.3792),
|
| 181 |
+
mode: str = Query("default"),
|
| 182 |
+
past_days: int = Query(90, ge=14, le=180),
|
| 183 |
+
) -> Dict[str, object]:
|
| 184 |
+
df = ensure_hourly(lat, lon, past_days)
|
| 185 |
+
out = predict_latest(df, mode)
|
| 186 |
+
return {"ok": True, "result": out}
|
| 187 |
+
|
| 188 |
+
|
| 189 |
+
# --------- Gradio UI ---------
|
| 190 |
+
CITY_PRESETS: Dict[str, Tuple[float, float]] = {
|
| 191 |
+
"Lagos 🇳🇬": (6.5244, 3.3792),
|
| 192 |
+
"Accra 🇬🇭": (5.6037, -0.1870),
|
| 193 |
+
"Nairobi 🇰🇪": (-1.2864, 36.8172),
|
| 194 |
+
"Kampala 🇺🇬": (0.3476, 32.5825),
|
| 195 |
+
"Addis Ababa 🇪🇹": (8.9806, 38.7578),
|
| 196 |
+
"Custom": (0.0, 0.0),
|
| 197 |
+
}
|
| 198 |
+
|
| 199 |
+
|
| 200 |
+
def _resolve_location(city: str, lat: float, lon: float) -> Tuple[float, float, str]:
    """Map a preset city name (or 'Custom' plus coordinates) to (lat, lon, label)."""
    use_preset = city in CITY_PRESETS and city != "Custom"
    if use_preset:
        preset_lat, preset_lon = CITY_PRESETS[city]
        return preset_lat, preset_lon, city
    return lat, lon, f"Custom ({lat:.3f}, {lon:.3f})"
|
| 208 |
+
|
| 209 |
+
|
| 210 |
+
def gradio_predict(
    city: str,
    lat: float,
    lon: float,
    mode: str,
    past_days: int,
) -> Tuple[str, pd.DataFrame, pd.DataFrame]:
    """Gradio callback: fetch data, score the latest hour, and prepare outputs.

    Returns a markdown summary, a one-row table describing the prediction,
    and a dataframe with the last 48 hours of weather for the line plot.
    """
    chosen_lat, chosen_lon, label = _resolve_location(city, lat, lon)
    df = ensure_hourly(chosen_lat, chosen_lon, past_days)
    result = predict_latest(df, mode)

    summary = format_prediction(result)

    # BUG FIX: the LinePlot component is configured with x="time", so "time"
    # must stay a regular column. The previous set_index("time") removed it,
    # which broke the plot at render time.
    chart = df.tail(48)[["time", "temp_c", "humidity", "precip_mm", "rain_mm"]].copy()

    latest = pd.DataFrame(
        {
            "location": [label],
            "timestamp": [result["timestamp"]],
            "mode": [result["mode"]],
            "probability": [result["probability"]],
            "threshold": [result["threshold"]],
            "decision": [result["decision"]],
        }
    )
    return summary, latest, chart
|
| 238 |
+
|
| 239 |
+
|
| 240 |
+
with gr.Blocks(css=".gradio-container {max-width: 900px;}") as demo:
|
| 241 |
+
gr.Markdown("# 🌧️ Rain Nowcast\nPredict the probability of rain in the next "
|
| 242 |
+
f"{HORIZON_H} hours using the tuned XGBoost model.")
|
| 243 |
+
|
| 244 |
+
with gr.Row():
|
| 245 |
+
city_input = gr.Dropdown(
|
| 246 |
+
label="City preset",
|
| 247 |
+
choices=list(CITY_PRESETS.keys()),
|
| 248 |
+
value="Lagos 🇳🇬",
|
| 249 |
+
)
|
| 250 |
+
mode_input = gr.Radio(
|
| 251 |
+
label="Decision mode",
|
| 252 |
+
choices=["default", "recall", "precision"],
|
| 253 |
+
value="default",
|
| 254 |
+
info="default=balanced, recall=warn more, precision=extra picky",
|
| 255 |
+
)
|
| 256 |
+
|
| 257 |
+
with gr.Row():
|
| 258 |
+
lat_input = gr.Number(label="Latitude (used if city is Custom)", value=6.5244)
|
| 259 |
+
lon_input = gr.Number(label="Longitude (used if city is Custom)", value=3.3792)
|
| 260 |
+
past_days_input = gr.Slider(
|
| 261 |
+
label="History window (days)",
|
| 262 |
+
minimum=14,
|
| 263 |
+
maximum=180,
|
| 264 |
+
value=90,
|
| 265 |
+
step=1,
|
| 266 |
+
)
|
| 267 |
+
|
| 268 |
+
submit = gr.Button("Run prediction", variant="primary")
|
| 269 |
+
|
| 270 |
+
summary_md = gr.Markdown()
|
| 271 |
+
latest_df = gr.Dataframe(label="Latest prediction", wrap=True)
|
| 272 |
+
chart_df = gr.LinePlot(
|
| 273 |
+
label="Last 48h weather (hourly)",
|
| 274 |
+
x="time",
|
| 275 |
+
y=["temp_c", "humidity", "precip_mm", "rain_mm"],
|
| 276 |
+
overlay_point=True,
|
| 277 |
+
width="100%",
|
| 278 |
+
height=350,
|
| 279 |
+
)
|
| 280 |
+
|
| 281 |
+
submit.click(
|
| 282 |
+
gradio_predict,
|
| 283 |
+
inputs=[city_input, lat_input, lon_input, mode_input, past_days_input],
|
| 284 |
+
outputs=[summary_md, latest_df, chart_df],
|
| 285 |
+
)
|
| 286 |
+
|
| 287 |
+
gr.Markdown(
|
| 288 |
+
"Model features match the training pipeline "
|
| 289 |
+
"(see `scripts/train_xgb_tuned_final.py`). Data fetched from Open-Meteo."
|
| 290 |
+
)
|
| 291 |
+
|
| 292 |
+
|
| 293 |
+
app = gr.mount_gradio_app(app, demo, path="/")
|
assets/cover.png
ADDED
|
assets/feature_importance.png
ADDED
|
assets/pr_curve.png
ADDED
|
assets/precip.png
ADDED
|
assets/roc_curve.png
ADDED
|
assets/temps.png
ADDED
|
models/rain_model_meta.json
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"features": [
|
| 3 |
+
"temp_c",
|
| 4 |
+
"humidity",
|
| 5 |
+
"cloudcover",
|
| 6 |
+
"pressure",
|
| 7 |
+
"wind_speed",
|
| 8 |
+
"precip_mm",
|
| 9 |
+
"rain_mm",
|
| 10 |
+
"d_temp_c",
|
| 11 |
+
"d_humidity",
|
| 12 |
+
"d_cloudcover",
|
| 13 |
+
"d_pressure",
|
| 14 |
+
"d_wind_speed",
|
| 15 |
+
"d_precip_mm",
|
| 16 |
+
"d_rain_mm",
|
| 17 |
+
"ma3_temp_c",
|
| 18 |
+
"ma3_humidity",
|
| 19 |
+
"ma3_cloudcover",
|
| 20 |
+
"ma3_pressure",
|
| 21 |
+
"ma3_wind_speed",
|
| 22 |
+
"ma3_precip_mm",
|
| 23 |
+
"ma3_rain_mm"
|
| 24 |
+
],
|
| 25 |
+
"horizon_hours": 12,
|
| 26 |
+
"thresholds": {
|
| 27 |
+
"default": 0.22239363491823944,
|
| 28 |
+
"high_recall": 0.22239363491823944,
|
| 29 |
+
"high_precision": 0.745196376322855
|
| 30 |
+
},
|
| 31 |
+
"metrics": {
|
| 32 |
+
"default": {
|
| 33 |
+
"threshold": 0.22239363491823944,
|
| 34 |
+
"precision": 0.8433098591549296,
|
| 35 |
+
"recall": 0.98559670781893,
|
| 36 |
+
"f1": 0.9089184060721063,
|
| 37 |
+
"auc": 0.7839506172839507,
|
| 38 |
+
"cm": [
|
| 39 |
+
[
|
| 40 |
+
13,
|
| 41 |
+
89
|
| 42 |
+
],
|
| 43 |
+
[
|
| 44 |
+
7,
|
| 45 |
+
479
|
| 46 |
+
]
|
| 47 |
+
]
|
| 48 |
+
},
|
| 49 |
+
"high_recall": {
|
| 50 |
+
"threshold": 0.22239363491823944,
|
| 51 |
+
"precision": 0.8433098591549296,
|
| 52 |
+
"recall": 0.98559670781893,
|
| 53 |
+
"f1": 0.9089184060721063,
|
| 54 |
+
"auc": 0.7839506172839507,
|
| 55 |
+
"cm": [
|
| 56 |
+
[
|
| 57 |
+
13,
|
| 58 |
+
89
|
| 59 |
+
],
|
| 60 |
+
[
|
| 61 |
+
7,
|
| 62 |
+
479
|
| 63 |
+
]
|
| 64 |
+
]
|
| 65 |
+
},
|
| 66 |
+
"high_precision": {
|
| 67 |
+
"threshold": 0.745196376322855,
|
| 68 |
+
"precision": 0.9033018867924528,
|
| 69 |
+
"recall": 0.7880658436213992,
|
| 70 |
+
"f1": 0.8417582417582418,
|
| 71 |
+
"auc": 0.7839506172839507,
|
| 72 |
+
"cm": [
|
| 73 |
+
[
|
| 74 |
+
61,
|
| 75 |
+
41
|
| 76 |
+
],
|
| 77 |
+
[
|
| 78 |
+
103,
|
| 79 |
+
383
|
| 80 |
+
]
|
| 81 |
+
]
|
| 82 |
+
}
|
| 83 |
+
}
|
| 84 |
+
}
|
models/rain_xgb_cal_meta.json
ADDED
|
@@ -0,0 +1,117 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_type": "xgboost+isotonic",
|
| 3 |
+
"features": [
|
| 4 |
+
"temp_c",
|
| 5 |
+
"humidity",
|
| 6 |
+
"cloudcover",
|
| 7 |
+
"pressure",
|
| 8 |
+
"wind_speed",
|
| 9 |
+
"precip_mm",
|
| 10 |
+
"rain_mm",
|
| 11 |
+
"d_temp_c",
|
| 12 |
+
"d_humidity",
|
| 13 |
+
"d_cloudcover",
|
| 14 |
+
"d_pressure",
|
| 15 |
+
"d_wind_speed",
|
| 16 |
+
"d_precip_mm",
|
| 17 |
+
"d_rain_mm",
|
| 18 |
+
"ma3_temp_c",
|
| 19 |
+
"ma3_humidity",
|
| 20 |
+
"ma3_cloudcover",
|
| 21 |
+
"ma3_pressure",
|
| 22 |
+
"ma3_wind_speed",
|
| 23 |
+
"ma3_precip_mm",
|
| 24 |
+
"ma3_rain_mm",
|
| 25 |
+
"d3_pressure",
|
| 26 |
+
"d3_humidity",
|
| 27 |
+
"d3_cloudcover",
|
| 28 |
+
"d3_temp_c",
|
| 29 |
+
"dew_proxy",
|
| 30 |
+
"d_dew_proxy",
|
| 31 |
+
"ma3_dew_proxy",
|
| 32 |
+
"rain_sum_3h",
|
| 33 |
+
"rain_sum_6h",
|
| 34 |
+
"rain_sum_12h",
|
| 35 |
+
"rain_sum_24h",
|
| 36 |
+
"rain_max_6h",
|
| 37 |
+
"rain_max_12h",
|
| 38 |
+
"dry_streak_h",
|
| 39 |
+
"wet_streak_h",
|
| 40 |
+
"hour_sin",
|
| 41 |
+
"hour_cos",
|
| 42 |
+
"dow_sin",
|
| 43 |
+
"dow_cos",
|
| 44 |
+
"hum_x_cloud",
|
| 45 |
+
"wind_x_cloud",
|
| 46 |
+
"press_drop_3h"
|
| 47 |
+
],
|
| 48 |
+
"horizon_hours": 12,
|
| 49 |
+
"event_mm": 1.0,
|
| 50 |
+
"label_desc": "Rain event if cumulative precip \u2265 1.0 mm in next 12h",
|
| 51 |
+
"thresholds": {
|
| 52 |
+
"default": 0.5123772621154785,
|
| 53 |
+
"high_recall": 0.26928117871284485,
|
| 54 |
+
"high_precision": 0.6026621460914612
|
| 55 |
+
},
|
| 56 |
+
"metrics": {
|
| 57 |
+
"default": {
|
| 58 |
+
"threshold": 0.5123772621154785,
|
| 59 |
+
"precision": 0.5376344086021505,
|
| 60 |
+
"recall": 0.8438818565400844,
|
| 61 |
+
"f1": 0.6568144499178982,
|
| 62 |
+
"auc": 0.7253714914694552,
|
| 63 |
+
"cm": [
|
| 64 |
+
[
|
| 65 |
+
173,
|
| 66 |
+
172
|
| 67 |
+
],
|
| 68 |
+
[
|
| 69 |
+
37,
|
| 70 |
+
200
|
| 71 |
+
]
|
| 72 |
+
],
|
| 73 |
+
"pos_rate": 0.6391752577319587
|
| 74 |
+
},
|
| 75 |
+
"high_recall": {
|
| 76 |
+
"threshold": 0.26928117871284485,
|
| 77 |
+
"precision": 0.4976190476190476,
|
| 78 |
+
"recall": 0.8818565400843882,
|
| 79 |
+
"f1": 0.6362252663622526,
|
| 80 |
+
"auc": 0.7253714914694552,
|
| 81 |
+
"cm": [
|
| 82 |
+
[
|
| 83 |
+
134,
|
| 84 |
+
211
|
| 85 |
+
],
|
| 86 |
+
[
|
| 87 |
+
28,
|
| 88 |
+
209
|
| 89 |
+
]
|
| 90 |
+
],
|
| 91 |
+
"pos_rate": 0.7216494845360825
|
| 92 |
+
},
|
| 93 |
+
"high_precision": {
|
| 94 |
+
"threshold": 0.6026621460914612,
|
| 95 |
+
"precision": 0.6490384615384616,
|
| 96 |
+
"recall": 0.569620253164557,
|
| 97 |
+
"f1": 0.6067415730337079,
|
| 98 |
+
"auc": 0.7253714914694552,
|
| 99 |
+
"cm": [
|
| 100 |
+
[
|
| 101 |
+
272,
|
| 102 |
+
73
|
| 103 |
+
],
|
| 104 |
+
[
|
| 105 |
+
102,
|
| 106 |
+
135
|
| 107 |
+
]
|
| 108 |
+
],
|
| 109 |
+
"pos_rate": 0.35738831615120276
|
| 110 |
+
}
|
| 111 |
+
},
|
| 112 |
+
"policy": {
|
| 113 |
+
"default": "best F1 (balanced, early-warning baseline)",
|
| 114 |
+
"high_recall": "recall\u22650.88 & precision\u22650.55 & pos_rate\u22640.80",
|
| 115 |
+
"high_precision": "precision\u22650.80 & recall\u22650.45 (Moderate)"
|
| 116 |
+
}
|
| 117 |
+
}
|
models/rain_xgb_meta.json
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"features": [
|
| 3 |
+
"temp_c",
|
| 4 |
+
"humidity",
|
| 5 |
+
"cloudcover",
|
| 6 |
+
"pressure",
|
| 7 |
+
"wind_speed",
|
| 8 |
+
"precip_mm",
|
| 9 |
+
"rain_mm",
|
| 10 |
+
"d_temp_c",
|
| 11 |
+
"d_humidity",
|
| 12 |
+
"d_cloudcover",
|
| 13 |
+
"d_pressure",
|
| 14 |
+
"d_wind_speed",
|
| 15 |
+
"d_precip_mm",
|
| 16 |
+
"d_rain_mm",
|
| 17 |
+
"ma3_temp_c",
|
| 18 |
+
"ma3_humidity",
|
| 19 |
+
"ma3_cloudcover",
|
| 20 |
+
"ma3_pressure",
|
| 21 |
+
"ma3_wind_speed",
|
| 22 |
+
"ma3_precip_mm",
|
| 23 |
+
"ma3_rain_mm",
|
| 24 |
+
"pressure_d3h",
|
| 25 |
+
"humidity_d3h",
|
| 26 |
+
"cloudcover_d3h",
|
| 27 |
+
"dew_proxy",
|
| 28 |
+
"d_dew_proxy",
|
| 29 |
+
"ma3_dew_proxy"
|
| 30 |
+
],
|
| 31 |
+
"horizon_hours": 12,
|
| 32 |
+
"thresholds": {
|
| 33 |
+
"default": 0.4454699456691742,
|
| 34 |
+
"high_recall": 0.4454699456691742,
|
| 35 |
+
"high_precision": 0.8384796977043152
|
| 36 |
+
},
|
| 37 |
+
"metrics": {
|
| 38 |
+
"default": {
|
| 39 |
+
"threshold": 0.4454699456691742,
|
| 40 |
+
"precision": 0.8151093439363817,
|
| 41 |
+
"recall": 0.9403669724770642,
|
| 42 |
+
"f1": 0.873269435569755,
|
| 43 |
+
"auc": 0.7489787718475859,
|
| 44 |
+
"cm": [
|
| 45 |
+
[
|
| 46 |
+
44,
|
| 47 |
+
93
|
| 48 |
+
],
|
| 49 |
+
[
|
| 50 |
+
26,
|
| 51 |
+
410
|
| 52 |
+
]
|
| 53 |
+
],
|
| 54 |
+
"pos_rate": 0.8778359511343804
|
| 55 |
+
},
|
| 56 |
+
"high_recall": {
|
| 57 |
+
"threshold": 0.4454699456691742,
|
| 58 |
+
"precision": 0.8151093439363817,
|
| 59 |
+
"recall": 0.9403669724770642,
|
| 60 |
+
"f1": 0.873269435569755,
|
| 61 |
+
"auc": 0.7489787718475859,
|
| 62 |
+
"cm": [
|
| 63 |
+
[
|
| 64 |
+
44,
|
| 65 |
+
93
|
| 66 |
+
],
|
| 67 |
+
[
|
| 68 |
+
26,
|
| 69 |
+
410
|
| 70 |
+
]
|
| 71 |
+
],
|
| 72 |
+
"pos_rate": 0.8778359511343804
|
| 73 |
+
},
|
| 74 |
+
"high_precision": {
|
| 75 |
+
"threshold": 0.8384796977043152,
|
| 76 |
+
"precision": 0.9012345679012346,
|
| 77 |
+
"recall": 0.5022935779816514,
|
| 78 |
+
"f1": 0.6450662739322534,
|
| 79 |
+
"auc": 0.7489787718475859,
|
| 80 |
+
"cm": [
|
| 81 |
+
[
|
| 82 |
+
113,
|
| 83 |
+
24
|
| 84 |
+
],
|
| 85 |
+
[
|
| 86 |
+
217,
|
| 87 |
+
219
|
| 88 |
+
]
|
| 89 |
+
],
|
| 90 |
+
"pos_rate": 0.42408376963350786
|
| 91 |
+
}
|
| 92 |
+
},
|
| 93 |
+
"model_type": "xgboost"
|
| 94 |
+
}
|
models/rain_xgb_tuned_meta.json
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"features": [
|
| 3 |
+
"temp_c",
|
| 4 |
+
"humidity",
|
| 5 |
+
"cloudcover",
|
| 6 |
+
"pressure",
|
| 7 |
+
"wind_speed",
|
| 8 |
+
"precip_mm",
|
| 9 |
+
"rain_mm",
|
| 10 |
+
"d_temp_c",
|
| 11 |
+
"d_humidity",
|
| 12 |
+
"d_cloudcover",
|
| 13 |
+
"d_pressure",
|
| 14 |
+
"d_wind_speed",
|
| 15 |
+
"d_precip_mm",
|
| 16 |
+
"d_rain_mm",
|
| 17 |
+
"ma3_temp_c",
|
| 18 |
+
"ma3_humidity",
|
| 19 |
+
"ma3_cloudcover",
|
| 20 |
+
"ma3_pressure",
|
| 21 |
+
"ma3_wind_speed",
|
| 22 |
+
"ma3_precip_mm",
|
| 23 |
+
"ma3_rain_mm",
|
| 24 |
+
"d3_pressure",
|
| 25 |
+
"d3_humidity",
|
| 26 |
+
"d3_cloudcover",
|
| 27 |
+
"d3_temp_c",
|
| 28 |
+
"dew_proxy",
|
| 29 |
+
"d_dew_proxy",
|
| 30 |
+
"ma3_dew_proxy",
|
| 31 |
+
"rain_sum_3h",
|
| 32 |
+
"rain_sum_6h",
|
| 33 |
+
"rain_sum_12h",
|
| 34 |
+
"rain_sum_24h",
|
| 35 |
+
"rain_max_6h",
|
| 36 |
+
"rain_max_12h",
|
| 37 |
+
"dry_streak_h",
|
| 38 |
+
"wet_streak_h",
|
| 39 |
+
"hour_sin",
|
| 40 |
+
"hour_cos",
|
| 41 |
+
"dow_sin",
|
| 42 |
+
"dow_cos",
|
| 43 |
+
"hoy_sin",
|
| 44 |
+
"hoy_cos",
|
| 45 |
+
"hum_x_cloud",
|
| 46 |
+
"wind_x_cloud",
|
| 47 |
+
"press_drop_3h",
|
| 48 |
+
"press_drop_6h"
|
| 49 |
+
],
|
| 50 |
+
"horizon_hours": 12,
|
| 51 |
+
"event_mm": 1.0,
|
| 52 |
+
"model": {
|
| 53 |
+
"type": "xgboost",
|
| 54 |
+
"params": {
|
| 55 |
+
"learning_rate": 0.05,
|
| 56 |
+
"max_depth": 3,
|
| 57 |
+
"n_estimators": 500,
|
| 58 |
+
"subsample": 0.8,
|
| 59 |
+
"colsample_bytree": 0.8,
|
| 60 |
+
"min_child_weight": 3
|
| 61 |
+
}
|
| 62 |
+
},
|
| 63 |
+
"thresholds": {
|
| 64 |
+
"default": 0.15,
|
| 65 |
+
"high_recall": 0.1,
|
| 66 |
+
"high_precision": 0.6
|
| 67 |
+
},
|
| 68 |
+
"cv_mean": {
|
| 69 |
+
"P": 0.6167141877942365,
|
| 70 |
+
"R": 0.40142749648205356,
|
| 71 |
+
"F1": 0.4687631522470538,
|
| 72 |
+
"AUC": 0.6838816207078178
|
| 73 |
+
},
|
| 74 |
+
"cv_folds": [
|
| 75 |
+
{
|
| 76 |
+
"P": 0.44,
|
| 77 |
+
"R": 0.4782608695652174,
|
| 78 |
+
"F1": 0.4583333333333333,
|
| 79 |
+
"AUC": 0.6755671077504725,
|
| 80 |
+
"thr": 0.15
|
| 81 |
+
},
|
| 82 |
+
{
|
| 83 |
+
"P": 0.7757009345794392,
|
| 84 |
+
"R": 0.4088669950738916,
|
| 85 |
+
"F1": 0.535483870967742,
|
| 86 |
+
"AUC": 0.7078279587697148,
|
| 87 |
+
"thr": 0.4764537811279297
|
| 88 |
+
},
|
| 89 |
+
{
|
| 90 |
+
"P": 0.82,
|
| 91 |
+
"R": 0.4270833333333333,
|
| 92 |
+
"F1": 0.5616438356164384,
|
| 93 |
+
"AUC": 0.6740785256410257,
|
| 94 |
+
"thr": 0.9872803688049316
|
| 95 |
+
},
|
| 96 |
+
{
|
| 97 |
+
"P": 0.32323232323232326,
|
| 98 |
+
"R": 0.3764705882352941,
|
| 99 |
+
"F1": 0.34782608695652173,
|
| 100 |
+
"AUC": 0.6218912881608338,
|
| 101 |
+
"thr": 0.9168330430984497
|
| 102 |
+
},
|
| 103 |
+
{
|
| 104 |
+
"P": 0.7246376811594203,
|
| 105 |
+
"R": 0.31645569620253167,
|
| 106 |
+
"F1": 0.44052863436123346,
|
| 107 |
+
"AUC": 0.7400432232170422,
|
| 108 |
+
"thr": 0.8837475776672363
|
| 109 |
+
}
|
| 110 |
+
]
|
| 111 |
+
}
|
models/xgb_tuned.json
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"params": {
|
| 3 |
+
"learning_rate": 0.05,
|
| 4 |
+
"max_depth": 3,
|
| 5 |
+
"n_estimators": 500,
|
| 6 |
+
"subsample": 0.8,
|
| 7 |
+
"colsample_bytree": 0.8,
|
| 8 |
+
"min_child_weight": 3
|
| 9 |
+
},
|
| 10 |
+
"mean_f1": 0.5780780663579321
|
| 11 |
+
}
|
pyproject.toml
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[project]
|
| 2 |
+
name = "weather-data-fetcher"
|
| 3 |
+
version = "0.2.0"
|
| 4 |
+
description = "Fetch, process, and visualize daily weather from Open-Meteo."
|
| 5 |
+
readme = "README.md"
|
| 6 |
+
requires-python = ">=3.10"
|
| 7 |
+
authors = [{ name = "Elvis Anselm" }]
|
| 8 |
+
license = "MIT"
|
| 9 |
+
dependencies = [
|
| 10 |
+
"requests",
|
| 11 |
+
"pandas",
|
| 12 |
+
"matplotlib",
|
| 13 |
+
"python-dotenv",
|
| 14 |
+
"pillow"
|
| 15 |
+
]
|
| 16 |
+
|
| 17 |
+
[tool.setuptools.packages.find]
|
| 18 |
+
include = ["weather_cli*"]
|
| 19 |
+
exclude = ["data*", "logs*", "results*", "assets*"]
|
| 20 |
+
|
| 21 |
+
[project.scripts]
|
| 22 |
+
weather-cli = "weather_cli.cli:main"
|
| 23 |
+
|
| 24 |
+
[build-system]
|
| 25 |
+
requires = ["setuptools>=68", "wheel"]
|
| 26 |
+
build-backend = "setuptools.build_meta"
|
render.yaml
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
services:
|
| 2 |
+
- type: web
|
| 3 |
+
name: rain-nowcast-api
|
| 4 |
+
env: docker
|
| 5 |
+
autoDeploy: true
|
| 6 |
+
plan: free
|
| 7 |
+
dockerCommand: null
|
| 8 |
+
healthCheckPath: /health
|
requirements.txt
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
certifi==2025.10.5
|
| 2 |
+
charset-normalizer==3.4.4
|
| 3 |
+
contourpy==1.3.3
|
| 4 |
+
cycler==0.12.1
|
| 5 |
+
fonttools==4.60.1
|
| 6 |
+
idna==3.11
|
| 7 |
+
joblib==1.5.2
|
| 8 |
+
kiwisolver==1.4.9
|
| 9 |
+
matplotlib==3.10.7
|
| 10 |
+
numpy==2.3.4
|
| 11 |
+
packaging==25.0
|
| 12 |
+
pandas==2.3.3
|
| 13 |
+
pillow==12.0.0
|
| 14 |
+
pyparsing==3.2.5
|
| 15 |
+
python-dateutil==2.9.0.post0
|
| 16 |
+
python-dotenv==1.1.1
|
| 17 |
+
pytz==2025.2
|
| 18 |
+
requests==2.32.5
|
| 19 |
+
scikit-learn==1.7.2
|
| 20 |
+
scipy==1.16.2
|
| 21 |
+
six==1.17.0
|
| 22 |
+
threadpoolctl==3.6.0
|
| 23 |
+
tzdata==2025.2
|
| 24 |
+
urllib3==2.5.0
|
| 25 |
+
-e git+https://github.com/Elvaceishim/weather_data_fetcher.git@ac53d9c31c4be6eda7988f97e1768f998c7a9f0a#egg=weather_data_fetcher
|
| 26 |
+
fastapi
|
| 27 |
+
uvicorn[standard]
|
| 28 |
+
pydantic
|
| 29 |
+
xgboost
|
| 30 |
+
huggingface_hub
|
| 31 |
+
streamlit
|
| 32 |
+
gradio
|
scripts/analyze_weather.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
|
| 3 |
+
df = pd.read_csv("results/summary.csv")
|
| 4 |
+
print("\n=== HEAD ===")
|
| 5 |
+
print(df.head())
|
| 6 |
+
print("\n=== DESCRIBE ===")
|
| 7 |
+
print(df.describe())
|
| 8 |
+
print("\n=== COLUMNS ===")
|
| 9 |
+
print(df.columns)
|
| 10 |
+
print("\n=== MISSING VALUES ===")
|
| 11 |
+
print(df.isna().sum())
|
scripts/backfill_labels.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
import json, os, argparse
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
from datetime import datetime, timedelta
|
| 5 |
+
import pandas as pd, numpy as np, subprocess
|
| 6 |
+
|
| 7 |
+
META = Path("models/rain_xgb_tuned_meta.json")
|
| 8 |
+
LOGS = Path("logs")
|
| 9 |
+
PRED_LOG = LOGS / "predictions.csv"
|
| 10 |
+
|
| 11 |
+
def ensure_hourly(lat, lon, past_days=120):
    """Fetch fresh weather for (lat, lon) and return the exported hourly frame.

    Delegates to the repo's fetch/export scripts via subprocess, passing the
    location and history window through environment variables.
    """
    run_env = os.environ.copy()
    run_env["LAT"] = str(lat)
    run_env["LON"] = str(lon)
    run_env["PAST_DAYS"] = str(past_days)
    subprocess.run(["bash", "scripts/fetch_weather.sh"], check=True, env=run_env)
    subprocess.run(["python3", "scripts/export_hourly.py"], check=True, env=run_env)
    return pd.read_csv("results/hourly.csv", parse_dates=["time"])
|
| 17 |
+
|
| 18 |
+
def label_from_df(df, ts_pred, horizon_h, event_mm):
    """Return the 1/0 rain label for *ts_pred*, or None if it cannot be aligned.

    Locates the hourly row nearest to *ts_pred* (tolerating up to one hour of
    mismatch), sums precipitation over the following *horizon_h* hours, and
    labels the event positive when the total reaches *event_mm*.
    """
    # Nearest timestamp; refuse alignment gaps larger than one hour.
    nearest = (df["time"] - ts_pred).abs().idxmin()
    gap_s = abs((df.loc[nearest, "time"] - ts_pred).total_seconds())
    if gap_s > 3600:
        return None
    last = min(nearest + horizon_h, len(df) - 1)
    # .loc slicing is inclusive on both ends; start after the prediction hour.
    future_precip = df.loc[nearest + 1 : last, "precip_mm"]
    total = float(np.nansum(future_precip))
    return 1 if total >= event_mm else 0
|
| 27 |
+
|
| 28 |
+
def main():
    """Fill in ground-truth labels for logged predictions whose horizon has passed."""
    if not PRED_LOG.exists():
        print("No predictions.csv found.")
        return

    meta = json.loads(Path(META).read_text())
    horizon = int(meta["horizon_hours"])
    event_mm = float(meta["event_mm"])

    preds = pd.read_csv(PRED_LOG, parse_dates=["ts_pred", "logged_at"])
    pending = preds["y_true"].isna() | (preds["y_true"] == "")
    updated = 0
    for i, row in preds[pending].iterrows():
        ts_pred = row["ts_pred"]
        # Skip rows whose outcome window has not fully elapsed yet.
        if datetime.now() < ts_pred + timedelta(hours=horizon):
            continue
        # Fetch enough history to cover that timestamp.
        hourly = ensure_hourly(row["lat"], row["lon"], past_days=120)
        label = label_from_df(hourly, ts_pred, horizon, event_mm)
        if label is not None:
            preds.at[i, "y_true"] = int(label)
            updated += 1

    preds.to_csv(PRED_LOG, index=False)
    print(f"Backfilled {updated} rows into {PRED_LOG}")
|
| 51 |
+
|
| 52 |
+
if __name__ == "__main__":
|
| 53 |
+
main()
|
scripts/coef_rain.py
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import json, joblib, pandas as pd
|
| 3 |
+
from sklearn.model_selection import train_test_split
|
| 4 |
+
|
| 5 |
+
meta = json.load(open("models/rain_model_meta.json"))
|
| 6 |
+
clf = joblib.load("models/rain_classifier_hourly.joblib")
|
| 7 |
+
|
| 8 |
+
df = pd.read_csv("results/hourly.csv", parse_dates=["time"])
|
| 9 |
+
base = ["temp_c","humidity","cloudcover","pressure","wind_speed","precip_mm","rain_mm"]
|
| 10 |
+
for c in base:
|
| 11 |
+
df[f"d_{c}"] = df[c].diff()
|
| 12 |
+
df[f"ma3_{c}"] = df[c].rolling(3).mean()
|
| 13 |
+
df = df.dropna().reset_index(drop=True)
|
| 14 |
+
|
| 15 |
+
X = df[meta["features"]].values
|
| 16 |
+
y = None
|
| 17 |
+
|
| 18 |
+
logreg = clf.named_steps["logreg"]
|
| 19 |
+
coefs = logreg.coef_[0]
|
| 20 |
+
features = meta["features"]
|
| 21 |
+
|
| 22 |
+
rank = sorted(zip(features, coefs), key=lambda x: abs(x[1]), reverse=True)
|
| 23 |
+
|
| 24 |
+
out_lines = ["Feature coefficients (standardized space):"]
|
| 25 |
+
for name, w in rank[:15]:
|
| 26 |
+
out_lines.append(f"{name:20s} {w:+.3f}")
|
| 27 |
+
|
| 28 |
+
print("\n".join(out_lines))
|
| 29 |
+
with open("results/coef_top15.txt", "w") as f:
|
| 30 |
+
f.write("\n".join(out_lines))
|
| 31 |
+
|
| 32 |
+
print("✅ Wrote results/coef_top15.txt")
|
scripts/cron_predict.sh
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
|
| 3 |
+
set -euo pipefail
|
| 4 |
+
|
| 5 |
+
# --- Resolve repo root ---
|
| 6 |
+
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
| 7 |
+
REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
|
| 8 |
+
cd "$REPO_ROOT"
|
| 9 |
+
|
| 10 |
+
# --- Lock to avoid overlapping runs (portable; no flock needed) ---
|
| 11 |
+
mkdir -p logs
|
| 12 |
+
LOCKDIR="logs/.predict.lock"
|
| 13 |
+
if ! mkdir "$LOCKDIR" 2>/dev/null; then
|
| 14 |
+
echo "[$(date '+%F %T')] Another run is in progress. Skipping."
|
| 15 |
+
exit 0
|
| 16 |
+
fi
|
| 17 |
+
trap 'rmdir "$LOCKDIR" 2>/dev/null || true' EXIT
|
| 18 |
+
|
| 19 |
+
# --- Args & defaults ---
|
| 20 |
+
MODE="${1:-default}"
|
| 21 |
+
CITY="${2:-Lagos}"
|
| 22 |
+
LAT="${3:-6.5244}"
|
| 23 |
+
LON="${4:-3.3792}"
|
| 24 |
+
PAST_DAYS="${5:-90}"
|
| 25 |
+
|
| 26 |
+
# --- Activate venv if present ---
|
| 27 |
+
if [[ -f ".venv/bin/activate" ]]; then
|
| 28 |
+
# shellcheck disable=SC1091
|
| 29 |
+
source .venv/bin/activate
|
| 30 |
+
fi
|
| 31 |
+
|
| 32 |
+
# --- Environment for fetch scripts ---
|
| 33 |
+
export LAT="$LAT" LON="$LON" PAST_DAYS="$PAST_DAYS"
|
| 34 |
+
|
| 35 |
+
# --- Run one logged prediction ---
|
| 36 |
+
echo "[$(date '+%F %T')] cron_predict: city=$CITY lat=$LAT lon=$LON mode=$MODE days=$PAST_DAYS"
|
| 37 |
+
python3 scripts/log_predict.py --city "$CITY" --lat "$LAT" --lon "$LON" --mode "$MODE" || {
|
| 38 |
+
echo "[$(date '+%F %T')] ERROR: log_predict failed"
|
| 39 |
+
exit 1
|
| 40 |
+
}
|
| 41 |
+
|
| 42 |
+
# --- (Optional) basic log rotation (keep log under ~1MB) ---
|
| 43 |
+
LOGFILE="logs/cron.log"
|
| 44 |
+
if [[ -f "$LOGFILE" ]] && [[ $(stat -f%z "$LOGFILE") -gt 1048576 ]]; then
|
| 45 |
+
mv "$LOGFILE" "logs/cron_$(date +%Y%m%d_%H%M%S).log" || true
|
| 46 |
+
fi
|
| 47 |
+
|
| 48 |
+
echo "[$(date '+%F %T')] cron_predict: done."
|
scripts/cv_benchmark.py
ADDED
|
@@ -0,0 +1,243 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import json, warnings
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
|
| 5 |
+
import numpy as np
|
| 6 |
+
import pandas as pd
|
| 7 |
+
|
| 8 |
+
from sklearn.model_selection import TimeSeriesSplit
|
| 9 |
+
from sklearn.pipeline import Pipeline
|
| 10 |
+
from sklearn.preprocessing import StandardScaler, RobustScaler
|
| 11 |
+
from sklearn.linear_model import LogisticRegression
|
| 12 |
+
from sklearn.metrics import precision_recall_fscore_support, precision_recall_curve
|
| 13 |
+
|
| 14 |
+
warnings.filterwarnings("ignore")
|
| 15 |
+
|
| 16 |
+
H = 12
|
| 17 |
+
EVENT_MM = 1.0
|
| 18 |
+
|
| 19 |
+
HOURLY = Path("results/hourly.csv")
|
| 20 |
+
META = Path("models/rain_model_meta.json")
|
| 21 |
+
|
| 22 |
+
# -----------------------------
|
| 23 |
+
# Feature builder (same as CLI/trainer)
|
| 24 |
+
# -----------------------------
|
| 25 |
+
def rebuild_features_like_training(df: pd.DataFrame, features_from_meta: list) -> pd.DataFrame:
    """Recreate the engineered feature matrix exactly as the trainer/CLI built it.

    Mutates *df* in place (adds derived columns, then drops rolling-window
    warm-up rows). Returns the columns listed in *features_from_meta* when
    provided, otherwise the full default feature list in canonical order.

    Raises:
        ValueError: if required raw columns or meta-listed features are missing.
    """
    required = {"time","temp_c","humidity","cloudcover","pressure","wind_speed","precip_mm","rain_mm"}
    missing = required - set(df.columns)
    if missing:
        raise ValueError(f"Hourly data missing columns: {sorted(missing)}")

    base = ["temp_c","humidity","cloudcover","pressure","wind_speed","precip_mm","rain_mm"]
    # 1-hour deltas and 3-hour moving averages for every base signal.
    for c in base:
        df[f"d_{c}"] = df[c].diff()
        df[f"ma3_{c}"] = df[c].rolling(3).mean()

    # 3-hour tendency (value now minus value three hours ago) for slow-moving signals.
    for c in ["pressure","humidity","cloudcover","temp_c"]:
        df[f"d3_{c}"] = df[c] - df[c].shift(3)

    # Crude dew-point proxy: a monotone combination of temperature and
    # humidity, not a physical formula.
    df["dew_proxy"] = df["temp_c"] - (df["humidity"] / 5.0)
    df["d_dew_proxy"] = df["dew_proxy"].diff()
    df["ma3_dew_proxy"] = df["dew_proxy"].rolling(3).mean()

    # Recent precipitation accumulations and peaks over several windows.
    df["rain_sum_3h"] = df["precip_mm"].rolling(3).sum()
    df["rain_sum_6h"] = df["precip_mm"].rolling(6).sum()
    df["rain_sum_12h"] = df["precip_mm"].rolling(12).sum()
    df["rain_sum_24h"] = df["precip_mm"].rolling(24).sum()
    df["rain_max_6h"] = df["precip_mm"].rolling(6).max()
    df["rain_max_12h"] = df["precip_mm"].rolling(12).max()

    # Consecutive dry hours: run length of the current dry spell, 0 while raining.
    is_raining = (df["precip_mm"] > 0).astype(int)
    dry = (~(is_raining.astype(bool))).astype(int)
    df["dry_streak_h"] = (dry.groupby((dry != dry.shift()).cumsum()).cumcount() + 1) * dry
    df["dry_streak_h"] = df["dry_streak_h"].where(dry == 1, 0)

    # Consecutive wet hours, symmetric to the dry streak.
    wet = is_raining
    df["wet_streak_h"] = (wet.groupby((wet != wet.shift()).cumsum()).cumcount() + 1) * wet
    df["wet_streak_h"] = df["wet_streak_h"].where(wet == 1, 0)

    # Calendar components feeding the cyclic encodings below.
    df["hour"] = df["time"].dt.hour
    df["dow"] = df["time"].dt.dayofweek
    df["doy"] = df["time"].dt.dayofyear
    df["hoy"] = (df["doy"] - 1) * 24 + df["hour"]

    # Sine/cosine encodings so midnight and year boundaries wrap smoothly.
    df["hour_sin"] = np.sin(2*np.pi*df["hour"]/24.0)
    df["hour_cos"] = np.cos(2*np.pi*df["hour"]/24.0)
    df["dow_sin"] = np.sin(2*np.pi*df["dow"]/7.0)
    df["dow_cos"] = np.cos(2*np.pi*df["dow"]/7.0)
    df["hoy_sin"] = np.sin(2*np.pi*df["hoy"]/(365.25*24))
    df["hoy_cos"] = np.cos(2*np.pi*df["hoy"]/(365.25*24))

    # Simple pairwise interactions plus pressure-drop signals
    # (a falling barometer is a classic rain precursor).
    df["hum_x_cloud"] = df["humidity"] * df["cloudcover"]
    df["wind_x_cloud"] = df["wind_speed"] * df["cloudcover"]
    df["press_drop_3h"] = -df["d3_pressure"]
    df["press_drop_6h"] = df["pressure"].shift(6) - df["pressure"]

    # Rolling windows and diffs leave NaNs at the start; drop those warm-up rows.
    df = df.dropna().reset_index(drop=True)

    if features_from_meta:
        missing_feats = [c for c in features_from_meta if c not in df.columns]
        if missing_feats:
            raise ValueError(f"Missing features expected by model: {missing_feats}")
        return df[features_from_meta]

    feat = (
        base +
        [f"d_{c}" for c in base] +
        [f"ma3_{c}" for c in base] +
        [f"d3_{c}" for c in ["pressure","humidity","cloudcover","temp_c"]] +
        ["dew_proxy","d_dew_proxy","ma3_dew_proxy",
         "rain_sum_3h","rain_sum_6h","rain_sum_12h","rain_sum_24h","rain_max_6h","rain_max_12h",
         "dry_streak_h","wet_streak_h",
         "hour_sin","hour_cos","dow_sin","dow_cos","hoy_sin","hoy_cos",
         "hum_x_cloud","wind_x_cloud","press_drop_3h","press_drop_6h"]
    )
    return df[feat]
|
| 96 |
+
|
| 97 |
+
# -----------------------------
|
| 98 |
+
# Label builder: ≥ EVENT_MM in next H hours
|
| 99 |
+
# -----------------------------
|
| 100 |
+
def make_labels(df: pd.DataFrame, horizon=H, event_mm=EVENT_MM):
    """Binary event labels: 1 when precipitation over the next *horizon*
    hours sums to at least *event_mm* millimetres.

    The final *horizon* rows are dropped because their future window is
    not fully observed.
    """
    precip = df["precip_mm"].values
    labelled_span = len(precip) - horizon
    flags = [
        int(np.nansum(precip[idx + 1 : idx + 1 + horizon]) >= event_mm)
        for idx in range(labelled_span)
    ]
    labels = np.zeros(len(df), dtype=int)
    labels[: len(flags)] = flags
    return labels[:-horizon]
|
| 107 |
+
|
| 108 |
+
# -----------------------------
|
| 109 |
+
# Models to compare
|
| 110 |
+
# -----------------------------
|
| 111 |
+
def build_models():
    """Assemble the candidate classifiers for the CV benchmark, keyed by
    short name.

    Always includes two class-balanced logistic regressions (standard vs
    outlier-robust scaling); adds a histogram-based XGBoost classifier
    when the package can be imported.
    """
    def _balanced_logreg(scaler):
        # Shared recipe: scale, then logistic regression with balanced classes.
        return Pipeline([
            ("scaler", scaler),
            ("clf", LogisticRegression(max_iter=2000, class_weight="balanced")),
        ])

    candidates = {
        "logreg_standard": _balanced_logreg(StandardScaler()),
        "logreg_robust": _balanced_logreg(RobustScaler()),
    }

    try:
        from xgboost import XGBClassifier
        candidates["xgb"] = XGBClassifier(
            n_estimators=800,
            learning_rate=0.05,
            max_depth=5,
            min_child_weight=3.0,
            subsample=0.8,
            colsample_bytree=0.8,
            reg_lambda=2.0,
            objective="binary:logistic",
            eval_metric="aucpr",
            tree_method="hist",
            random_state=42,
        )
    except Exception as err:
        # XGBoost is optional: warn and continue with the linear baselines.
        print(f"[warn] XGBoost unavailable: {err}")
    return candidates
|
| 144 |
+
|
| 145 |
+
def evaluate_fold(model, X_train, y_train, X_test, y_test, val_frac=0.15):
    """Fit on the head of the training fold, tune a decision threshold on its
    chronological tail, then score binary P/R/F1 on the test fold.

    Returns:
        dict with keys P, R, F1 and the chosen threshold ``thr``.
    """
    n = len(X_train)
    # Reserve the last val_frac of the training fold as a time-ordered
    # validation slice (at least one sample).
    v = max(int(n * val_frac), 1)
    X_tr, y_tr = X_train[:-v], y_train[:-v]
    X_val, y_val = X_train[-v:], y_train[-v:]

    model.fit(X_tr, y_tr)

    # Probability on val to pick threshold
    if hasattr(model, "predict_proba"):
        p_val = model.predict_proba(X_val)[:, 1]
        p_test = model.predict_proba(X_test)[:, 1]
    else:
        if hasattr(model, "decision_function"):
            from sklearn.preprocessing import MinMaxScaler
            # Squash raw decision scores into [0, 1] so thresholds from the
            # PR curve are comparable across models.
            z_val = model.decision_function(X_val).reshape(-1, 1)
            z_test = model.decision_function(X_test).reshape(-1, 1)
            mm = MinMaxScaler()
            p_val = mm.fit_transform(z_val).ravel()
            # NOTE(review): transform on test can fall outside [0, 1] when
            # test scores exceed the validation range — acceptable here
            # because only the ordering vs. the threshold matters.
            p_test = mm.transform(z_test).ravel()
        else:
            # fallback: hard predictions at 0.5
            pred = model.predict(X_test)
            P, R, F1, _ = precision_recall_fscore_support(y_test, pred, average="binary", zero_division=0)
            return dict(P=P, R=R, F1=F1, thr=0.5)

    prec, rec, thr = precision_recall_curve(y_val, p_val)
    # Avoid degenerate thresholds: thr has length len(prec)-1
    candidates = []
    for t in thr:
        pred_v = (p_val >= t).astype(int)
        P, R, F1, _ = precision_recall_fscore_support(y_val, pred_v, average="binary", zero_division=0)
        candidates.append((t, P, R, F1))
    if not candidates:
        t_star = 0.5
    else:
        # choose by best F1 on validation
        t_star = max(candidates, key=lambda x: x[3])[0]

    # Final evaluation on the untouched test fold at the tuned threshold.
    pred = (p_test >= t_star).astype(int)
    P, R, F1, _ = precision_recall_fscore_support(y_test, pred, average="binary", zero_division=0)
    return dict(P=P, R=R, F1=F1, thr=float(t_star))
|
| 187 |
+
|
| 188 |
+
# -----------------------------
|
| 189 |
+
# Main
|
| 190 |
+
# -----------------------------
|
| 191 |
+
def main():
    """Run 5-fold time-series CV over all candidate models on the hourly
    dataset and print per-fold and mean precision/recall/F1 (threshold
    tuned per fold on a validation tail)."""
    if not HOURLY.exists():
        raise FileNotFoundError("results/hourly.csv not found. Run: make hourly PAST_DAYS=90")

    df = pd.read_csv(HOURLY, parse_dates=["time"])
    y_all = make_labels(df, H, EVENT_MM)
    # Drop the last H rows: their labels would require unobserved future precipitation.
    dfX = df.iloc[:-H].copy()

    # Use features from meta if present
    features_from_meta = None
    if META.exists():
        meta = json.loads(META.read_text())
        features_from_meta = meta.get("features", None)

    Xdf = rebuild_features_like_training(dfX, features_from_meta)
    n = len(Xdf)
    if len(y_all) < n:
        raise ValueError("Labels shorter than feature matrix; check preprocessing alignment.")
    # Feature engineering drops warm-up rows at the start, so keep only the
    # trailing n labels to stay aligned with the feature rows.
    y = y_all[-n:]
    X = Xdf.values[-n:]
    assert len(X) == len(y), "Feature matrix and labels misaligned."

    # Expanding-window splits: each fold trains on the past, tests on the future.
    tscv = TimeSeriesSplit(n_splits=5)

    models = build_models()
    results = {name: [] for name in models}

    for name, model in models.items():
        print(f"\n=== {name} ===")
        fold_id = 1
        per_fold = []
        for tr_idx, te_idx in tscv.split(X):
            X_tr, X_te = X[tr_idx], X[te_idx]
            y_tr, y_te = y[tr_idx], y[te_idx]

            metrics = evaluate_fold(model, X_tr, y_tr, X_te, y_te)
            per_fold.append(metrics)
            print(f"Fold {fold_id} → P={metrics['P']:.3f} R={metrics['R']:.3f} F1={metrics['F1']:.3f} thr={metrics['thr']:.3f}")
            fold_id += 1

        # Aggregate
        Pm = np.mean([m["P"] for m in per_fold])
        Rm = np.mean([m["R"] for m in per_fold])
        Fm = np.mean([m["F1"] for m in per_fold])
        print(f"Mean → P={Pm:.3f} R={Rm:.3f} F1={Fm:.3f}")
        results[name] = dict(P=Pm, R=Rm, F1=Fm)

    print("\n=== SUMMARY (higher F1 is better) ===")
    for name, m in sorted(results.items(), key=lambda kv: kv[1]["F1"], reverse=True):
        print(f"{name:18s} F1={m['F1']:.3f} P={m['P']:.3f} R={m['R']:.3f}")

if __name__ == "__main__":
    main()
|
scripts/download_models.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
|
| 3 |
+
import os
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
|
| 6 |
+
from huggingface_hub import hf_hub_download
|
| 7 |
+
from huggingface_hub.errors import EntryNotFoundError
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def main() -> None:
    """Download model artifacts from a Hugging Face Hub repo into MODEL_DIR.

    Environment:
        MODEL_REPO_ID: source repo (default: theelvace/weather-data-fetcher-models)
        MODEL_FILES:   whitespace-separated filenames to fetch
        MODEL_DIR:     destination directory (default: models)

    Files missing from the repo are skipped with a notice instead of aborting,
    so optional artifacts do not break deployment.
    """
    repo_id = os.environ.get("MODEL_REPO_ID", "theelvace/weather-data-fetcher-models")
    files_env = os.environ.get(
        "MODEL_FILES",
        "rain_xgb_tuned.joblib rain_xgb_tuned_meta.json",
    )
    target_dir = Path(os.environ.get("MODEL_DIR", "models"))

    target_dir.mkdir(parents=True, exist_ok=True)

    filenames = [name.strip() for name in files_env.split() if name.strip()]
    if not filenames:
        print("MODEL_FILES is empty; nothing to download.")
        return

    for filename in filenames:
        # BUG FIX: both log messages printed a literal "(unknown)" placeholder
        # instead of the file actually being processed.
        print(f"Downloading {filename} from {repo_id} ...")
        try:
            local_path = hf_hub_download(
                repo_id=repo_id,
                filename=filename,
                local_dir=target_dir,
                local_dir_use_symlinks=False,
            )
        except EntryNotFoundError:
            print(f" • Skipping {filename}: not found in {repo_id}.")
            continue
        print(f"Saved to {local_path}")


if __name__ == "__main__":
    main()
|
scripts/eval_operating_points.py
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Evaluate the tuned rain model at its three saved operating points
(default / high-recall / high-precision thresholds) on the full hourly set."""
import json
from pathlib import Path
import numpy as np, pandas as pd
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix

meta = json.loads(Path("models/rain_xgb_tuned_meta.json").read_text())
thr = meta["thresholds"]
H = meta["horizon_hours"]

df = pd.read_csv("results/hourly.csv", parse_dates=["time"])

# make labels (>=1.0mm in next H hours)
prec = df["precip_mm"].values
y = np.zeros(len(df), dtype=int)
for i in range(len(prec) - H):
    y[i] = 1 if np.nansum(prec[i+1:i+1+H]) >= meta["event_mm"] else 0
# Drop the last H rows: their future window is not fully observed.
y = y[:-H]
dfX = df.iloc[:-H].copy()

# rebuild features exactly like training
# local import
import importlib.util
import types

def load_build_features():
    """Load build_features directly from the trainer script so the feature
    engineering cannot drift from what the model was trained on."""
    spec = importlib.util.spec_from_file_location("train_xgb_tuned_final", Path("scripts/train_xgb_tuned_final.py"))
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)  # type: ignore
    return module.build_features

build_features = load_build_features()
Xdf = build_features(dfX)
X = Xdf.values
# Feature building drops warm-up rows at the front; keep trailing labels.
y = y[-len(X):]  # align

import joblib
clf = joblib.load("models/rain_xgb_tuned.joblib")
p = clf.predict_proba(X)[:,1]

def report(name, t):
    """Print P/R/F1, alert rate, and confusion matrix at threshold *t*."""
    pred = (p >= t).astype(int)
    P, R, F1, _ = precision_recall_fscore_support(y, pred, average="binary", zero_division=0)
    cm = confusion_matrix(y, pred).tolist()
    rate = float(pred.mean())
    print(f"{name:<10} thr={t:.3f} | P={P:.3f} R={R:.3f} F1={F1:.3f} | alerts={rate:.2%} | cm={cm}")

report("default", thr["default"])
report("recall", thr["high_recall"])
report("precision", thr["high_precision"])
|
scripts/explain_shap.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
"""Compute SHAP values for the tuned rain classifier and save global
importance plots (results/shap_summary.png, results/shap_top.png)."""
import shap
import joblib
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
import json
import numpy as np
import os

# ensure matplotlib cache lives inside repo
RESULTS_DIR = Path("results")
RESULTS_DIR.mkdir(exist_ok=True)
os.environ.setdefault("MPLCONFIGDIR", str(RESULTS_DIR / ".matplotlib"))
Path(os.environ["MPLCONFIGDIR"]).mkdir(parents=True, exist_ok=True)

# === Load model + metadata ===
model = joblib.load("models/rain_xgb_tuned.joblib")
meta = json.load(open("models/rain_xgb_tuned_meta.json"))
features = meta["features"]

# === Load data ===
df = pd.read_csv("results/hourly.csv", parse_dates=["time"])

# Rebuild features exactly like training
import importlib.util

spec = importlib.util.spec_from_file_location(
    "train_xgb_tuned_final", Path("scripts/train_xgb_tuned_final.py")
)
module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(module)
build_features = module.build_features

Xdf = build_features(df)
X = Xdf.values.astype(np.float32)

# Use last 500 samples for analysis (avoid overkill)
# NOTE(review): comment says 500 but the slice keeps the last 200 rows — confirm intent.
X_sample = X[-200:]

# === SHAP Explainer ===
# Permutation explainer over predict_proba: model-agnostic, slower than TreeExplainer.
explainer = shap.Explainer(model.predict_proba, X_sample, algorithm="permutation")
shap_values = explainer(X_sample)

# === Global importance ===
Path("results").mkdir(exist_ok=True)
plt.figure()
shap.summary_plot(shap_values, X_sample,
                  feature_names=features, show=False)
plt.tight_layout()
plt.savefig("results/shap_summary.png", dpi=300)
plt.close()

# === Bar chart version ===
plt.figure()
shap.summary_plot(shap_values, X_sample,
                  feature_names=features, plot_type="bar", show=False)
plt.tight_layout()
plt.savefig("results/shap_top.png", dpi=300)
plt.close()

print("✅ SHAP visualisations saved: results/shap_summary.png and results/shap_top.png")
|
scripts/explain_shap_interaction.py
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
"""
Generates a SHAP dependence plot showing how HUMIDITY and
TEMPERATURE
jointly influence rain predictions. Outputs:
 - results/shap_interaction.png
"""
import json
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import shap
import importlib.util
import os

# Keep matplotlib caches inside repo to avoid home directory issues
RESULTS_DIR = Path("results")
RESULTS_DIR.mkdir(exist_ok=True)
os.environ.setdefault("MPLCONFIGDIR", str(RESULTS_DIR / ".matplotlib"))
Path(os.environ["MPLCONFIGDIR"]).mkdir(parents=True, exist_ok=True)

# Load model + meta
model = joblib.load("models/rain_xgb_tuned.joblib")
booster = model.get_booster()
# Some serialized boosters carry a bracketed base_score (e.g. "[0.5]") that
# cannot be parsed as a float; sanitize it before SHAP reads the config.
config = json.loads(booster.save_config())
base_score = config.get("learner", {}).get("learner_model_param", {}).get("base_score")
if base_score:
    cleaned = base_score.strip("[]")
    try:
        float(cleaned)
    except ValueError:
        cleaned = "0.5"
    config["learner"]["learner_model_param"]["base_score"] = cleaned
    booster.load_config(json.dumps(config))

meta = json.loads(Path("models/rain_xgb_tuned_meta.json").read_text())
features = meta["features"]

# Load data and rebuild features exactly like training
df = pd.read_csv("results/hourly.csv", parse_dates=["time"])

spec = importlib.util.spec_from_file_location(
    "train_xgb_tuned_final", "scripts/train_xgb_tuned_final.py"
)
module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(module)
build_features = module.build_features
Xdf = build_features(df)  # same order as training
X = Xdf.values
X_sample = X[-120:] if len(X) > 120 else X
# BUG FIX: this frame was constructed twice in a row; keep a single build.
X_sample_df = pd.DataFrame(X_sample, columns=features)

# Prefer TreeExplainer for XGBoost; fallback to generic Explainer if needed
try:
    explainer = shap.TreeExplainer(booster, data=X_sample)
    shap_result = explainer(X_sample)
except Exception:
    explainer = shap.Explainer(model.predict_proba, X_sample, algorithm="permutation")
    shap_result = explainer(X_sample)

# Normalize SHAP output to a 2D array aligned with feature columns
if hasattr(shap_result, "values"):
    values = shap_result.values
    if values.ndim == 3:  # multi-class, take positive class (index 1)
        values = values[:, :, 1]
    shap_values = values
else:
    shap_values = np.array(shap_result)

# Ensure sample frame matches SHAP output rows
X_plot = X_sample_df.iloc[-shap_values.shape[0]:]

Path("results").mkdir(exist_ok=True)

# 1) Dependence plot: humidity colored by temp_c (classic interaction view)
plt.figure()
shap.dependence_plot(
    "humidity",
    shap_values,
    X_plot,
    interaction_index="temp_c",
    show=False
)
plt.tight_layout()
plt.savefig("results/shap_interaction.png", dpi=300)
plt.close()

# 2) (Optional) Reverse view: temp_c colored by humidity
plt.figure()
shap.dependence_plot(
    "temp_c",
    shap_values,
    X_plot,
    interaction_index="humidity",
    show=False
)
plt.tight_layout()
plt.savefig("results/shap_interaction_rev.png", dpi=300)
plt.close()

print("✅ Saved results/shap_interaction.png and results/shap_interaction_rev.png")
|
scripts/export_daily.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Flatten the daily section of data/weather.json into results/daily.csv."""
import json, pandas as pd, os

os.makedirs("results", exist_ok=True)
with open("data/weather.json") as handle:
    payload = json.load(handle)

daily = payload["daily"]
# Output column name -> Open-Meteo daily key (insertion order = CSV order).
column_map = {
    "date": "time",
    "temp_min_c": "temperature_2m_min",
    "temp_max_c": "temperature_2m_max",
    "precip_mm": "precipitation_sum",
    "cloudcover": "cloudcover_mean",
    "wind_speed": "wind_speed_10m_max",
    "humidity_max": "relative_humidity_2m_max",
    "humidity_min": "relative_humidity_2m_min",
}
df = pd.DataFrame({out_name: daily[src_key] for out_name, src_key in column_map.items()})
df.to_csv("results/daily.csv", index=False)
print(f"✅ Wrote results/daily.csv with {len(df)} rows")
|
scripts/export_hourly.py
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Export the hourly block of data/weather.json to results/hourly.csv."""
import os
import sys
import json
import pandas as pd

os.makedirs("results", exist_ok=True)
with open("data/weather.json") as handle:
    payload = json.load(handle)

if "hourly" not in payload:
    # The fetch step must request hourly variables; otherwise nothing to export.
    print(
        "data/weather.json missing 'hourly'. Re-run the fetch step with hourly "
        "parameters enabled (see scripts/fetch_weather.sh).",
        file=sys.stderr,
    )
    sys.exit(1)

hourly = payload["hourly"]
# Output column name -> Open-Meteo hourly key (insertion order = CSV order).
column_map = {
    "time": "time",
    "temp_c": "temperature_2m",
    "humidity": "relative_humidity_2m",
    "cloudcover": "cloudcover",
    "pressure": "pressure_msl",
    "wind_speed": "wind_speed_10m",
    "precip_mm": "precipitation",
    "rain_mm": "rain",
}
frame = pd.DataFrame({out_name: hourly[src_key] for out_name, src_key in column_map.items()})

frame["time"] = pd.to_datetime(frame["time"])
frame = frame.sort_values("time").reset_index(drop=True)
frame.to_csv("results/hourly.csv", index=False)
print(f"✅ Wrote results/hourly.csv with {len(frame)} rows")
|
scripts/feature_importance_rain.py
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import os
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
|
| 5 |
+
import joblib
|
| 6 |
+
import numpy as np
|
| 7 |
+
import pandas as pd
|
| 8 |
+
from sklearn.inspection import permutation_importance
|
| 9 |
+
from sklearn.model_selection import train_test_split
|
| 10 |
+
|
| 11 |
+
RESULTS_DIR = "results"
|
| 12 |
+
os.environ.setdefault("MPLCONFIGDIR", os.path.join(RESULTS_DIR, ".matplotlib"))
|
| 13 |
+
os.environ.setdefault("XDG_CACHE_HOME", os.path.join(RESULTS_DIR, ".cache"))
|
| 14 |
+
|
| 15 |
+
Path(os.environ["MPLCONFIGDIR"]).mkdir(parents=True, exist_ok=True)
|
| 16 |
+
Path(os.environ["XDG_CACHE_HOME"]).mkdir(parents=True, exist_ok=True)
|
| 17 |
+
|
| 18 |
+
import matplotlib
|
| 19 |
+
|
| 20 |
+
matplotlib.use("Agg")
|
| 21 |
+
import matplotlib.pyplot as plt
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def build_dataset(meta: dict) -> tuple[np.ndarray, np.ndarray]:
    """Load results/hourly.csv and build (X, y), where y marks whether any
    precipitation falls within the next ``meta["horizon_hours"]`` hours."""
    frame = pd.read_csv("results/hourly.csv", parse_dates=["time"])
    horizon = meta["horizon_hours"]
    precip = frame["precip_mm"].values

    # Only rows whose full future window is observed can be labelled.
    usable = len(precip) - horizon
    rain_ahead = np.zeros(len(frame), dtype=int)
    for idx in range(usable):
        window = precip[idx + 1 : idx + 1 + horizon]
        if np.any(window > 0):
            rain_ahead[idx] = 1

    trimmed = frame.iloc[:usable].copy()
    return trimmed[meta["features"]].values, rain_ahead[: len(trimmed)]
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def plot_importance(feature_names: list[str], importances: np.ndarray, std: np.ndarray) -> None:
    """Save a horizontal bar chart of permutation importances (descending).

    Args:
        feature_names: labels aligned with *importances*.
        importances: mean F1 drop per permuted feature.
        std: standard deviation of the F1 drop across permutation repeats.

    Writes results/feature_importance.png.
    """
    order = np.argsort(importances)[::-1]
    feature_names = np.array(feature_names)[order]
    importances = importances[order]
    # BUG FIX: *std* was accepted but never used; reorder it alongside the
    # importances and render it as error bars.
    std = np.asarray(std)[order]

    plt.figure(figsize=(8, 5))
    y_pos = np.arange(len(feature_names))
    plt.barh(y_pos, importances, xerr=std, align="center")
    plt.yticks(y_pos, feature_names)
    plt.gca().invert_yaxis()  # largest importance on top
    plt.xlabel("Permutation importance (F1 drop)")
    plt.title("Rain classifier — feature importances")
    plt.tight_layout()

    Path(RESULTS_DIR).mkdir(exist_ok=True)
    plt.savefig(os.path.join(RESULTS_DIR, "feature_importance.png"))
    plt.close()
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def main() -> None:
    """Compute permutation feature importances for the hourly rain classifier
    and save them as a bar chart."""
    meta = json.load(open("models/rain_model_meta.json"))
    model = joblib.load("models/rain_classifier_hourly.joblib")

    X, y = build_dataset(meta)
    # Chronological split (shuffle=False): evaluate importances on the most
    # recent 30% of the data only.
    _, X_test, _, y_test = train_test_split(X, y, test_size=0.3, shuffle=False)

    result = permutation_importance(
        model,
        X_test,
        y_test,
        n_repeats=25,
        random_state=42,
        scoring="f1",
    )

    plot_importance(meta["features"], result.importances_mean, result.importances_std)
    print("✅ Wrote results/feature_importance.png")


if __name__ == "__main__":
    main()
|
scripts/fetch_weather.sh
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
# Fetch hourly weather for a configurable location from the Open-Meteo API
# and store the raw JSON at data/weather.json, logging progress to $LOG_FILE.
set -euo pipefail
mkdir -p data logs

# Optional overrides from .env; fall back to Lagos, Nigeria defaults.
source .env 2>/dev/null || true
: "${LAT:=6.5244}"
: "${LON:=3.3792}"
: "${CITY:=Lagos}"
: "${PAST_DAYS:=30}"

STAMP="$(date +%Y-%m-%d_%H-%M-%S)"
LOG_FILE=${LOG_FILE:-logs/app.log}

echo "[${STAMP}] Fetching ${PAST_DAYS} past days for ${CITY} (${LAT}, ${LON})"

# Request exactly the hourly variables the downstream export/training scripts expect.
URL="https://api.open-meteo.com/v1/forecast?latitude=${LAT}&longitude=${LON}&hourly=temperature_2m,relative_humidity_2m,cloudcover,pressure_msl,wind_speed_10m,precipitation,rain&timezone=Africa%2FLagos&past_days=${PAST_DAYS}"

# curl -f makes HTTP errors fatal (set -e aborts), so a bad response never
# overwrites data/weather.json with an error page.
{
curl -sfL "$URL" -o data/weather.json
echo "[$STAMP] Saved to data/weather.json"
} | tee -a "$LOG_FILE"
|
scripts/intro_ml.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tiny intro example: fit humidity ~ temperature with linear regression
and plot the fitted line."""
import os

import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

# Imagine 5 days of temperatures (°C)
# `x` is the input feature (temperature), reshaped to a column vector
x = np.array([25, 27, 30, 32, 35]).reshape(-1, 1)
# `y` is the output label (humidity percentage)
y = np.array([50, 55, 63, 70, 74])

model = LinearRegression()
model.fit(x, y)

pred = model.predict([[28]])
print(f"Predicted humidity for 28°C: {pred[0]:.2f}%")

plt.scatter(x, y, color='blue', label='data')
plt.plot(x, model.predict(x), color='red', label='model')
plt.xlabel('Temperature (°C)')
plt.ylabel('Humidity (%)')
plt.legend()
plt.tight_layout()
# BUG FIX: savefig raised FileNotFoundError when results/ did not exist yet.
os.makedirs("results", exist_ok=True)
plt.savefig("results/intro_regression.png")
print("✅ Saved results/intro_regression.png")

print("slope:", model.coef_)
print("intercept:", model.intercept_)
|
scripts/log_predict.py
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
import argparse, os, json
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
from datetime import datetime
|
| 5 |
+
import joblib, pandas as pd, numpy as np, subprocess
|
| 6 |
+
|
| 7 |
+
MODEL = Path("models/rain_xgb_tuned.joblib")
|
| 8 |
+
META = Path("models/rain_xgb_tuned_meta.json")
|
| 9 |
+
HOURLY = Path("results/hourly.csv")
|
| 10 |
+
LOGS = Path("logs"); LOGS.mkdir(exist_ok=True)
|
| 11 |
+
PRED_LOG = LOGS / "predictions.csv"
|
| 12 |
+
|
| 13 |
+
def ensure_hourly(lat, lon, past_days=90):
    """Make sure results/hourly.csv exists and return it as a DataFrame.

    The raw fetch runs only when the CSV is missing; the hourly export is
    re-run on every call, regenerating the CSV from whatever
    data/weather.json currently holds.
    NOTE(review): when the CSV exists but data/weather.json is stale, the
    export rewrites the CSV from old data without a fresh fetch — confirm
    that is intended (cron runs the fetch separately).
    """
    env = os.environ.copy()
    # Pass location and history window to the fetch script via environment.
    env["LAT"], env["LON"], env["PAST_DAYS"] = str(lat), str(lon), str(past_days)
    if (not HOURLY.exists()):
        subprocess.run(["bash", "scripts/fetch_weather.sh"], check=True, env=env)
    subprocess.run(["python3", "scripts/export_hourly.py"], check=True, env=env)
    return pd.read_csv(HOURLY, parse_dates=["time"])
|
| 20 |
+
|
| 21 |
+
def build_features_like_training(df, features):
    """Rebuild the feature matrix via the trainer's own build_features.

    Loads scripts/train_xgb_tuned_final.py straight from disk so the
    feature engineering can never drift from training, then returns the
    requested feature columns in order.
    """
    import importlib.util

    trainer_path = "scripts/train_xgb_tuned_final.py"
    spec = importlib.util.spec_from_file_location("train_xgb_tuned_final", trainer_path)
    trainer = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(trainer)

    feature_frame = trainer.build_features(df)
    return feature_frame[features]
|
| 29 |
+
|
| 30 |
+
def main():
|
| 31 |
+
ap = argparse.ArgumentParser()
|
| 32 |
+
ap.add_argument("--city", default="Lagos")
|
| 33 |
+
ap.add_argument("--lat", type=float, default=6.5244)
|
| 34 |
+
ap.add_argument("--lon", type=float, default=3.3792)
|
| 35 |
+
ap.add_argument("--mode", choices=["default","recall","precision"], default="default")
|
| 36 |
+
args = ap.parse_args()
|
| 37 |
+
|
| 38 |
+
meta = json.loads(META.read_text())
|
| 39 |
+
thr = meta["thresholds"]; feats = meta["features"]; H = meta["horizon_hours"]; event_mm = meta["event_mm"]
|
| 40 |
+
|
| 41 |
+
df = ensure_hourly(args.lat, args.lon, 90)
|
| 42 |
+
Xdf = build_features_like_training(df.copy(), feats)
|
| 43 |
+
if Xdf.empty: raise SystemExit("Not enough rows to build features")
|
| 44 |
+
|
| 45 |
+
clf = joblib.load(MODEL)
|
| 46 |
+
p = float(clf.predict_proba(Xdf.iloc[[-1]].values)[0,1])
|
| 47 |
+
tmap = {"default":thr["default"], "recall":thr["high_recall"], "precision":thr["high_precision"]}
|
| 48 |
+
t = float(tmap[args.mode])
|
| 49 |
+
decision = "RAIN" if p >= t else "No rain"
|
| 50 |
+
|
| 51 |
+
row = {
|
| 52 |
+
"ts_pred": df.loc[Xdf.index, "time"].iloc[-1].strftime("%Y-%m-%d %H:%M:%S"),
|
| 53 |
+
"logged_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
| 54 |
+
"city": args.city, "lat": args.lat, "lon": args.lon,
|
| 55 |
+
"mode": args.mode, "horizon_h": H, "event_mm": event_mm,
|
| 56 |
+
"p": p, "threshold": t, "decision": decision,
|
| 57 |
+
"y_true": "", # to be filled by backfill
|
| 58 |
+
}
|
| 59 |
+
if not PRED_LOG.exists():
|
| 60 |
+
pd.DataFrame([row]).to_csv(PRED_LOG, index=False)
|
| 61 |
+
else:
|
| 62 |
+
pd.DataFrame([row]).to_csv(PRED_LOG, mode="a", header=False, index=False)
|
| 63 |
+
|
| 64 |
+
print(f"Logged: {row}")
|
| 65 |
+
|
| 66 |
+
if __name__ == "__main__":
|
| 67 |
+
main()
|
scripts/make_cover.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from PIL import Image, ImageDraw, ImageFont
import os

# Stitch the two result charts into one vertical "cover" image with a centred
# title, and mirror the output into assets/ for the README.
RESULTS = "results"
IMG1 = os.path.join(RESULTS, "temps.png")
IMG2 = os.path.join(RESULTS, "precip.png")
OUT = os.path.join(RESULTS, "cover.png")
ASSETS_DIR = "assets"
ASSET_OUT = os.path.join(ASSETS_DIR, "cover.png")

def fail(msg):
    """Print an error marker and abort with exit status 1."""
    print(f"❌ {msg}")
    raise SystemExit(1)

if not os.path.exists(IMG1):
    fail(f"Missing {IMG1}. Run `make viz` first.")
if not os.path.exists(IMG2):
    fail(f"Missing {IMG2}. Run `make viz` first.")

img1 = Image.open(IMG1).convert("RGB")
img2 = Image.open(IMG2).convert("RGB")

# Scale both charts to the narrower of the two widths so they stack cleanly.
w = min(img1.width, img2.width)
def resize_to_width(im, target_w):
    """Resize *im* to target_w pixels wide, preserving aspect ratio."""
    new_h = int(im.height * target_w / im.width)
    return im.resize((target_w, new_h))

img1 = resize_to_width(img1, w)
img2 = resize_to_width(img2, w)

pad = 16        # whitespace between/around panels, in pixels
title_h = 48    # vertical room reserved below the charts for the title
H = img1.height + img2.height + title_h + pad * 4
W = w + pad * 2

canvas = Image.new("RGB", (W, H), "white")

# Paste the two charts top-to-bottom, tracking the running y offset.
y = pad
canvas.paste(img1, (pad, y)); y += img1.height + pad
canvas.paste(img2, (pad, y)); y += img2.height + pad

draw = ImageDraw.Draw(canvas)
title = "Weather Data Fetcher — Automated Pipeline"
try:
    font = ImageFont.load_default()
except Exception:
    font = None  # Pillow draws with its built-in default when font is None

# textbbox returns (x0, y0, x1, y1); the [2:] slice gives width and height,
# used here to centre the title horizontally under the charts.
tw, th = draw.textbbox((0,0), title, font=font)[2:]
tx = (W - tw) // 2
ty = y
draw.text((tx, ty), title, fill="black", font=font)

os.makedirs(RESULTS, exist_ok=True)
canvas.save(OUT, optimize=True)
if ASSETS_DIR:
    os.makedirs(ASSETS_DIR, exist_ok=True)
    canvas.save(ASSET_OUT, optimize=True)
print(f"✅ Created {OUT}")
|
scripts/monitor_weekly.py
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
"""Weekly monitoring report over the prediction log.

Reads logs/predictions.csv (rows written by log_predict.py, labels filled in
by the backfill job), computes per-week/per-mode precision, recall, F1, alert
rate and Brier score, writes results/weekly_report.csv, and renders an overall
calibration plot to results/calibration.png.
"""
from pathlib import Path
import pandas as pd, numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_fscore_support, brier_score_loss

LOG = Path("logs/predictions.csv")
OUT = Path("results"); OUT.mkdir(exist_ok=True)

def week_key(ts): # ISO year-week
    """Return the ISO year-week key for a timestamp, e.g. '2024-W07'."""
    iso = ts.isocalendar()
    return f"{iso.year}-W{iso.week:02d}"

def calibration_plot(p, y, bins=10, out_png="results/calibration.png"):
    """Save a reliability diagram of predicted probability vs observed frequency.

    Bins probabilities into quantile buckets (qcut, duplicate edges dropped),
    plots mean predicted p against the empirical positive rate per bucket,
    and annotates each point with its sample count.
    """
    df = pd.DataFrame({"p":p, "y":y}).dropna()
    df["bin"] = pd.qcut(df["p"], q=bins, duplicates="drop")
    g = df.groupby("bin").agg(avg_p=("p","mean"), frac_pos=("y","mean"), n=("y","size")).reset_index(drop=True)
    plt.figure()
    # Diagonal = perfect calibration.
    plt.plot([0,1],[0,1], linestyle="--")
    plt.plot(g["avg_p"], g["frac_pos"], marker="o")
    plt.xlabel("Predicted probability")
    plt.ylabel("Observed frequency")
    plt.title("Calibration")
    for i, n in enumerate(g["n"]):
        plt.annotate(str(int(n)), (g["avg_p"].iloc[i], g["frac_pos"].iloc[i]))
    plt.tight_layout()
    plt.savefig(out_png, dpi=300); plt.close()

def main():
    """Build the weekly report and calibration plot; no-op if nothing labelled."""
    if not LOG.exists():
        print("No logs yet.")
        return
    df = pd.read_csv(LOG, parse_dates=["ts_pred","logged_at"])
    # Keep only labelled rows. NOTE(review): this matches the exact strings
    # "0"/"1"; if pandas parses y_true as float the values stringify as
    # "0.0"/"1.0" and would be dropped — confirm against backfill_labels.py.
    df = df[df["y_true"].astype(str).isin(["0","1"])].copy()
    if df.empty:
        print("No rows with y_true yet.")
        return
    df["y_true"] = df["y_true"].astype(int)
    df["week"] = df["ts_pred"].apply(week_key)

    # Weekly metrics per mode
    rows = []
    for (wk, mode), grp in df.groupby(["week","mode"]):
        y = grp["y_true"].values
        # decision at time of logging
        yhat = (grp["p"].values >= grp["threshold"].values).astype(int)
        P,R,F1,_ = precision_recall_fscore_support(y, yhat, average="binary", zero_division=0)
        alerts = float(yhat.mean())
        brier = brier_score_loss(y, grp["p"].values)
        rows.append({"week":wk,"mode":mode,"n":len(grp),"precision":P,"recall":R,"f1":F1,"alert_rate":alerts,"brier":brier})
    rep = pd.DataFrame(rows).sort_values(["week","mode"])
    rep.to_csv(OUT/"weekly_report.csv", index=False)
    print(rep)

    # Overall calibration (all modes combined)
    calibration_plot(df["p"].values, df["y_true"].values, bins=12, out_png=str(OUT/"calibration.png"))
    print("Saved:", OUT/"weekly_report.csv", "and", OUT/"calibration.png")

if __name__ == "__main__":
    main()
|
scripts/plot_pr_roc.py
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Plot Precision–Recall and ROC curves for the hourly rain classifier.

Rebuilds the 'rain within the next H hours' label the same way the training
script does, scores the held-out (chronological) test split, and writes
results/pr_curve.png and results/roc_curve.png.
"""
import json
import os
import joblib
import pandas as pd
import numpy as np
from sklearn.metrics import precision_recall_curve, roc_curve, auc
from sklearn.model_selection import train_test_split

RESULTS_DIR = "results"
# Point matplotlib's cache dirs somewhere writable before importing it
# (needed on read-only home dirs, e.g. hosted Spaces).
os.environ.setdefault("MPLCONFIGDIR", os.path.join(RESULTS_DIR, ".matplotlib"))
os.environ.setdefault("XDG_CACHE_HOME", os.path.join(RESULTS_DIR, ".cache"))

from pathlib import Path

Path(os.environ["MPLCONFIGDIR"]).mkdir(parents=True, exist_ok=True)
Path(os.environ["XDG_CACHE_HOME"]).mkdir(parents=True, exist_ok=True)

import matplotlib

matplotlib.use("Agg")
import matplotlib.pyplot as plt

meta = json.load(open("models/rain_model_meta.json"))
clf = joblib.load("models/rain_classifier_hourly.joblib")
df = pd.read_csv("results/hourly.csv", parse_dates=["time"])

H = meta["horizon_hours"]
features = meta["features"]

# Label: 1 iff any precipitation falls within the next H hours.
prec = df["precip_mm"].values
# BUGFIX: only the first len(prec) - H rows have a complete future window.
# Previously all rows were kept, so the trailing H rows were silently labelled
# 0 even though their outcome is unknown, biasing the curves.
n_valid = len(prec) - H
precip_next = np.zeros(n_valid, dtype=int)
for i in range(n_valid):
    precip_next[i] = 1 if np.any(prec[i + 1 : i + 1 + H] > 0) else 0

df = df.iloc[:n_valid].copy()
df["rain_next6h"] = precip_next

X = df[features].values
y = df["rain_next6h"].values

# Chronological split (shuffle=False) — same protocol as training.
_, X_test, _, y_test = train_test_split(X, y, test_size=0.3, shuffle=False)
proba = clf.predict_proba(X_test)[:, 1]

precision, recall, _ = precision_recall_curve(y_test, proba)
fpr, tpr, _ = roc_curve(y_test, proba)

plt.figure()
plt.plot(recall, precision)
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title("Precision–Recall")
plt.tight_layout()
plt.savefig("results/pr_curve.png")
plt.close()

plt.figure()
plt.plot(fpr, tpr)
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title(f"ROC (AUC={auc(fpr, tpr):.2f})")
plt.tight_layout()
plt.savefig("results/roc_curve.png")
plt.close()

print("✅ Wrote results/pr_curve.png and results/roc_curve.png")
|
scripts/plot_weather.py
ADDED
|
@@ -0,0 +1,72 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
import os
from shutil import copyfile


# Render daily temperature and precipitation charts from data/weather.json
# into results/, then mirror both PNGs into assets/ for the README.
RESULTS_DIR = "results"
ASSETS_DIR = "assets"
TEMPS_RESULTS = os.path.join(RESULTS_DIR, "temps.png")
PRECIP_RESULTS = os.path.join(RESULTS_DIR, "precip.png")
TEMPS_ASSET = os.path.join(ASSETS_DIR, "temps.png")
PRECIP_ASSET = os.path.join(ASSETS_DIR, "precip.png")

# Redirect matplotlib's cache dirs somewhere writable (read-only $HOME on
# hosted runtimes). NOTE(review): matplotlib is imported below before
# ensure_dirs() runs in main(); confirm it tolerates the dirs not existing yet.
os.environ.setdefault("MPLCONFIGDIR", os.path.join(RESULTS_DIR, ".matplotlib"))
os.environ.setdefault("XDG_CACHE_HOME", os.path.join(RESULTS_DIR, ".cache"))

import matplotlib

matplotlib.use("Agg")  # headless backend — no display needed
import matplotlib.pyplot as plt
import pandas as pd


def ensure_dirs():
    """Create output and cache directories if they do not exist."""
    os.makedirs(RESULTS_DIR, exist_ok=True)
    os.makedirs(ASSETS_DIR, exist_ok=True)
    os.makedirs(os.environ["MPLCONFIGDIR"], exist_ok=True)
    os.makedirs(os.environ["XDG_CACHE_HOME"], exist_ok=True)


def mirror_asset(src: str, dest: str) -> None:
    """Copy a rendered chart into the assets/ mirror."""
    copyfile(src, dest)


def main():
    """Read daily weather JSON, plot temps and precipitation, mirror to assets."""
    with open("data/weather.json") as handle:
        data = json.load(handle)

    days = pd.to_datetime(data["daily"]["time"])
    tmax = pd.Series(data["daily"]["temperature_2m_max"])
    tmin = pd.Series(data["daily"]["temperature_2m_min"])
    # precipitation_sum may be absent in the payload; default to zeros.
    prec = pd.Series(data["daily"].get("precipitation_sum", [0] * len(days)))

    ensure_dirs()

    # Temperature line chart (max and min per day).
    plt.figure()
    plt.plot(days, tmax, marker="o", label="Max °C")
    plt.plot(days, tmin, marker="o", label="Min °C")
    plt.xticks(rotation=45, ha="right")
    plt.title("Daily Temperatures (°C)")
    plt.legend()
    plt.tight_layout()
    plt.savefig(TEMPS_RESULTS)
    plt.close()

    mirror_asset(TEMPS_RESULTS, TEMPS_ASSET)

    # Precipitation bar chart
    plt.figure()
    plt.bar(days, prec)
    plt.xticks(rotation=45, ha="right")
    plt.title("Daily Precipitation (mm)")
    plt.tight_layout()
    plt.savefig(PRECIP_RESULTS)
    plt.close()

    mirror_asset(PRECIP_RESULTS, PRECIP_ASSET)

    print(f"✅ Wrote {TEMPS_RESULTS} / {PRECIP_RESULTS}")
    print(f"✅ Updated assets at {TEMPS_ASSET} / {PRECIP_ASSET}")

if __name__ == "__main__":
    main()
|
scripts/predict_rain.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Predict rain over the next horizon for the most recent hour of data.

Reads results/hourly.csv, rebuilds the engineered features for the last row,
scores the saved hourly classifier, and prints the probability plus the
high-recall and high-precision threshold decisions.
"""
import json, joblib
import pandas as pd

# Load latest hour from results/hourly.csv, predict next 6h rain
df = pd.read_csv("results/hourly.csv", parse_dates=["time"])
row = df.iloc[-1:].copy()

# Rebuild the training-time engineered features at the last timestamp:
# first difference (d_*) and 3-hour moving average (ma3_*) of each channel.
for col in ["temp_c","humidity","cloudcover","pressure","wind_speed","precip_mm","rain_mm"]:
    row[f"d_{col}"] = df[col].diff().iloc[-1]
    row[f"ma3_{col}"] = df[col].rolling(3).mean().iloc[-1]

# Load model + meta. The meta file is the single source of truth for feature
# names and order — a hand-built `features` list previously duplicated it
# without ever being used (dead code, now removed), as were the unused
# `sys`/`numpy` imports.
clf = joblib.load("models/rain_classifier_hourly.joblib")
meta = json.load(open("models/rain_model_meta.json"))
X = row[meta["features"]].values

proba = float(clf.predict_proba(X)[0,1])
thr_r = meta["thresholds"]["high_recall"]
thr_p = meta["thresholds"]["high_precision"]

print(f"Latest hour: {row['time'].iloc[0]}")
print(f"P(rain next {meta['horizon_hours']}h) = {proba:.3f}")
print(f"High-Recall mode: {'RAIN' if proba>=thr_r else 'No rain'} (thr={thr_r:.2f})")
print(f"High-Precision mode:{'RAIN' if proba>=thr_p else 'No rain'} (thr={thr_p:.2f})")
print(f"High-Precision mode:{'RAIN' if proba>=thr_p else 'No rain'} (thr={thr_p:.2f})")
|
scripts/process_weather.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from dotenv import load_dotenv
import os, json, sys, logging

# Turn the raw Open-Meteo payload (data/weather.json) into human-readable
# results/summary.txt and machine-readable results/summary.csv.
load_dotenv()
LAT = os.getenv("LAT", "6.5244")      # default: Lagos
LON = os.getenv("LON", "3.3792")
CITY = os.getenv("CITY", "Lagos")
LOG_FILE = os.getenv("LOG_FILE", "logs/app.log")

os.makedirs("logs", exist_ok=True)
logging.basicConfig(
    filename=LOG_FILE,
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
)

logging.info(f"Processing weather for {CITY} ({LAT}, {LON})")

IN, OUT_DIR = "data/weather.json", "results"
OUT = os.path.join(OUT_DIR, "summary.txt")
os.makedirs(OUT_DIR, exist_ok=True)

logging.info("Reading weather.json")

try:
    with open(IN) as f:
        data = json.load(f)
except FileNotFoundError:
    # Exit 1: input missing — the fetch step was not run.
    print("weather.json not found. Run `make download`.", file=sys.stderr); sys.exit(1)

try:
    daily = data["daily"]
    days = daily["time"]
    tmax = daily["temperature_2m_max"]
    tmin = daily["temperature_2m_min"]
    # precipitation_sum may be absent; default to zeros.
    prec = daily.get("precipitation_sum", [0]*len(days))
except Exception as e:
    # Exit 2: payload present but its shape is not what we expect.
    print(f"Unexpected JSON structure: {e}", file=sys.stderr); sys.exit(2)

# Human-readable per-day summary.
with open(OUT, "w") as f:
    f.write("Lagos (Africa/Lagos) – Daily summary\n")
    f.write("-----------------------------------\n")
    for d, lo, hi, p in zip(days, tmin, tmax, prec):
        f.write(f"{d}: {lo}°C – {hi}°C | precip: {p} mm\n")

logging.info(f"Wrote summary to {OUT}")

print(f"✅ Wrote {OUT}")

# Same data as CSV for downstream scripts.
import csv
with open(os.path.join(OUT_DIR, "summary.csv"), "w", newline="") as f:
    w = csv.writer(f)
    w.writerow(["date", "temp_min_c", "temp_max_c", "precip_mm"])
    for d, lo, hi, p in zip(days, tmin, tmax, prec):
        w.writerow([d, lo, hi, p])
print("✅ Wrote results/summary.csv")
|
scripts/rain_cli.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import argparse, json, joblib, pandas as pd

def main():
    """Print a one-line rain warning for the chosen operating mode."""
    parser = argparse.ArgumentParser(description="Rain warning in next 6h")
    parser.add_argument("--mode", choices=["recall","precision","default"], default="recall")
    opts = parser.parse_args()

    meta = json.load(open("models/rain_model_meta.json"))
    model = joblib.load("models/rain_classifier_hourly.joblib")

    hourly = pd.read_csv("results/hourly.csv", parse_dates=["time"])
    latest = hourly.iloc[-1:].copy()

    # Recreate the engineered features exactly as at training time:
    # first difference and 3-hour moving average of each base channel.
    for name in ("temp_c","humidity","cloudcover","pressure","wind_speed","precip_mm","rain_mm"):
        latest[f"d_{name}"] = hourly[name].diff().iloc[-1]
        latest[f"ma3_{name}"] = hourly[name].rolling(3).mean().iloc[-1]

    feats = latest[meta["features"]].values
    prob = float(model.predict_proba(feats)[0,1])

    # Map the CLI mode onto the threshold key stored in the model metadata.
    mode_to_key = {"default": "default", "recall": "high_recall", "precision": "high_precision"}
    cutoff = meta["thresholds"][mode_to_key[opts.mode]]

    verdict = "RAIN" if prob >= cutoff else "No rain"
    print(f"{latest['time'].iloc[0]} | P(rain ≤{meta['horizon_hours']}h)={prob:.3f} | mode={opts.mode} thr={cutoff:.2f} → {verdict}")

if __name__ == "__main__":
    main()
|
scripts/start_services.sh
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
# Launch the Streamlit dashboard in the background and the FastAPI app in the
# foreground; on exit (normal or signal) tear both down cleanly.
set -euo pipefail

STREAMLIT_PORT="${STREAMLIT_PORT:-8501}"
UVICORN_PORT="${UVICORN_PORT:-${PORT:-8000}}"
HOST="0.0.0.0"

echo "Environment: PORT=${PORT:-<unset>} STREAMLIT_PORT=${STREAMLIT_PORT} UVICORN_PORT=${UVICORN_PORT}"

export STREAMLIT_SERVER_HEADLESS=true
export STREAMLIT_SERVER_PORT="${STREAMLIT_PORT}"
export STREAMLIT_SERVER_ADDRESS="${HOST}"

echo "🌐 Starting Streamlit on port ${STREAMLIT_PORT}"
streamlit run streamlit_app.py --server.port "${STREAMLIT_PORT}" --server.address "${HOST}" &
STREAMLIT_PID=$!

cleanup() {
    echo "🛑 Shutting down services..."
    if kill -0 "${STREAMLIT_PID}" 2>/dev/null; then
        kill "${STREAMLIT_PID}" 2>/dev/null || true
        wait "${STREAMLIT_PID}" 2>/dev/null || true
    fi
}
trap cleanup EXIT INT TERM

echo "🚀 Starting FastAPI (uvicorn) on port ${UVICORN_PORT}"
# BUGFIX: previously this line used `exec`, which replaces the shell process,
# so the EXIT/INT/TERM traps set above could never fire and the background
# Streamlit process was orphaned whenever uvicorn exited. Running uvicorn as a
# regular foreground child keeps the traps alive; cleanup() then reaps
# Streamlit when the script terminates.
python -m uvicorn app.main:app --host "${HOST}" --port "${UVICORN_PORT}" --proxy-headers
|
scripts/time_series_cv_demo.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Time-series cross-validation demo for the hourly rain classifier.

Evaluates a scaled logistic regression with TimeSeriesSplit so every fold
trains on the past and tests on the future, and reports per-fold F1.
"""
import numpy as np
import pandas as pd
from sklearn.model_selection import TimeSeriesSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

# Load your hourly data
df = pd.read_csv("results/hourly.csv")
df = df.dropna().reset_index(drop=True)

# Features and target
features = ["temp_c","humidity","cloudcover","pressure","wind_speed","precip_mm","rain_mm"]
X = df[features].values

# Label: 1 iff ANY rain falls within the next H hours.
# BUGFIX: the previous label (`precip_mm.shift(-6) > 0`) only looked at the
# single hour exactly 6 steps ahead, which contradicted the "rain in next 6h"
# comment and the any-rain-in-window labels used by the training scripts.
H = 6
prec = df["precip_mm"].values
n_valid = len(prec) - H  # trailing H rows have no complete future window
y = np.array(
    [1 if np.any(prec[i + 1 : i + 1 + H] > 0) else 0 for i in range(n_valid)],
    dtype=int,
)
X = X[:n_valid]

# Time-series CV setup
tscv = TimeSeriesSplit(n_splits=5)

f1_scores = []
for fold, (train_idx, test_idx) in enumerate(tscv.split(X)):
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # Scale then fit a class-balanced logistic regression per fold.
    clf = Pipeline([
        ("scaler", StandardScaler()),
        ("model", LogisticRegression(max_iter=1000, class_weight="balanced"))
    ])
    clf.fit(X_train, y_train)
    preds = clf.predict(X_test)
    score = f1_score(y_test, preds)
    f1_scores.append(score)
    print(f"Fold {fold+1} F1: {score:.3f}")

print("\nAverage F1 across folds:", np.mean(f1_scores).round(3))
|
scripts/train_classify_rain.py
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    confusion_matrix, classification_report,
    roc_auc_score, roc_curve, precision_recall_fscore_support
)

# Train a daily "will it rain tomorrow?" logistic-regression classifier from
# results/daily.csv, compare it against naive baselines, and save the model.
df = pd.read_csv("results/daily.csv")

# Label: tomorrow's precipitation, shifted back onto today's row.
df["precip_tomorrow"] = df["precip_mm"].shift(-1)
df = df.dropna() # drop last row without tomorrow

df["rain_tomorrow"] = (df["precip_tomorrow"] > 0).astype(int)

features = [
    "temp_max_c",
    "temp_min_c",
    "cloudcover",
    "wind_speed",
    "humidity_max",
    "humidity_min",
    "precip_mm", # rain today often implies rain persists
]
X = df[features].values
y = df["rain_tomorrow"].values

# Chronological split — no shuffling, so the test set is strictly the future.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, shuffle=False
)

clf = Pipeline([
    ("scaler", StandardScaler()),
    ("logreg", LogisticRegression(max_iter=200,
                                  class_weight="balanced"))
])

clf.fit(X_train, y_train)

proba = clf.predict_proba(X_test)[:, 1] # P(rain)
pred_default = (proba >= 0.5).astype(int) # default threshold

# Fix the label order so cm.ravel() is always (tn, fp, fn, tp), even when a
# class is missing from the test slice.
labels = [0, 1]
cm = confusion_matrix(y_test, pred_default, labels=labels)
tn, fp, fn, tp = cm.ravel()

prec, rec, f1, _ = precision_recall_fscore_support(
    y_test, pred_default, average="binary", zero_division=0
)
# AUC is undefined when the test labels or scores are constant.
auc = (
    roc_auc_score(y_test, proba)
    if len(np.unique(y_test)) > 1 and len(np.unique(proba)) > 1
    else float("nan")
)

print("📊 Confusion Matrix (threshold=0.50)")
print(cm)
auc_str = f"{auc:.3f}" if np.isfinite(auc) else "n/a"
print(f"\nPrecision: {prec:.3f} Recall: {rec:.3f} F1: {f1:.3f} ROC-AUC: {auc_str}")

print("\nDetailed report:")
print(classification_report(y_test, pred_default, digits=3, zero_division=0, labels=labels))

# Baseline 1: constant "no rain" prediction.
always_no = np.zeros_like(y_test)
prec0, rec0, f10, _ = precision_recall_fscore_support(
    y_test, always_no, average="binary", zero_division=0
)
print("\n⚠️ Baseline — always 'no rain'")
print(f"Precision: {prec0:.3f} Recall: {rec0:.3f} F1: {f10:.3f}")

# Baseline 2: persistence — predict tomorrow's rain from today's precip.
# The slice [-len(y_test)-1:-1] shifts by one day so today's value is paired
# with tomorrow's label; assumes y_test covers the tail of df — TODO confirm.
today_rain = (df["precip_mm"].values[-len(y_test)-1:-1] > 0).astype(int)
precp, recp, f1p, _ = precision_recall_fscore_support(
    y_test, today_rain, average="binary", zero_division=0
)
print("\n🧠 Baseline — 'tomorrow rain = today rain'")
print(f"Precision: {precp:.3f} Recall: {recp:.3f} F1: {f1p:.3f}")

# Lower threshold trades precision for recall.
thr = 0.35
pred_tuned = (proba >= thr).astype(int)
prec_t, rec_t, f1_t, _ = precision_recall_fscore_support(
    y_test, pred_tuned, average="binary", zero_division=0
)
print(f"\n🎛️ Threshold {thr:.2f} → Precision: {prec_t:.3f} Recall: {rec_t:.3f} F1: {f1_t:.3f}")

import joblib
os.makedirs("models", exist_ok=True)
joblib.dump(clf, "models/rain_classifier.joblib")
print("\n💾 Saved: models/rain_classifier.joblib")
|
scripts/train_classify_rain_hourly.py
ADDED
|
@@ -0,0 +1,143 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Train the hourly 'rain within the next 6 hours' classifier.

Builds the windowed label from results/hourly.csv, trains a class-balanced
logistic regression on a chronological split, reports metrics against naive
baselines and at alternate thresholds, and saves the model plus a metadata
JSON consumed by the prediction scripts.
"""
import os
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    confusion_matrix, classification_report, roc_auc_score,
    precision_recall_fscore_support
)

df = pd.read_csv("results/hourly.csv", parse_dates=["time"])

# Label: rain_next6h = 1 iff any precipitation falls in the next H hours.
H = 6
prec = df["precip_mm"].values
# BUGFIX: only the first len(prec) - H rows have a complete future window.
# The previous code kept every row (`df.iloc[:len(precip_next) - (0)]`), so
# the trailing H rows were silently labelled 0 even though their future is
# unknown, polluting both training and evaluation.
n_valid = len(prec) - H
precip_next = np.zeros(n_valid, dtype=int)
for i in range(n_valid):
    precip_next[i] = 1 if np.any(prec[i+1:i+1+H] > 0) else 0

df = df.iloc[:n_valid].copy()
df["rain_next6h"] = precip_next


features = [
    "temp_c","humidity","cloudcover","pressure","wind_speed",
    "precip_mm","rain_mm"
]
X = df[features].values
y = df["rain_next6h"].values

print("Class balance (0=no-rain, 1=rain-in-next6h):", np.bincount(y))

# Chronological split — the test set is strictly the future.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, shuffle=False
)

clf = Pipeline([
    ("scaler", StandardScaler()),
    ("logreg", LogisticRegression(max_iter=500, class_weight="balanced"))
])
clf.fit(X_train, y_train)

proba = clf.predict_proba(X_test)[:, 1]
pred_050 = (proba >= 0.50).astype(int)

cm = confusion_matrix(y_test, pred_050)
print("\n📊 Confusion Matrix (thr=0.50)")
print(cm)

prec, rec, f1, _ = precision_recall_fscore_support(
    y_test, pred_050, average="binary", zero_division=0
)
# AUC raises when the test labels are all one class; report NaN instead.
try:
    auc = roc_auc_score(y_test, proba)
except ValueError:
    auc = float("nan")

print(f"Precision: {prec:.3f} Recall: {rec:.3f} F1: {f1:.3f} ROC-AUC: {auc:.3f}")

print("\nDetailed report:")
print(classification_report(y_test, pred_050, digits=3, zero_division=0))

# Baselines
always_no = np.zeros_like(y_test)
p0, r0, f10, _ = precision_recall_fscore_support(
    y_test, always_no, average="binary", zero_division=0
)
print("\n🧠 Baseline — always 'no rain'")
print(f"Precision: {p0:.3f} Recall: {r0:.3f} F1: {f10:.3f}")

# Persistence baseline: "it rained in the previous H hours → it will rain".
recent_rain = (
    pd.Series(df["precip_mm"])
    .rolling(window=H, min_periods=1)
    .sum()
    .shift(1)      # use only past hours — no leakage from the current hour
    .fillna(0)
    > 0
).astype(int).values
prev6_test = recent_rain[-len(y_test):]
pp, rp, f1p, _ = precision_recall_fscore_support(y_test, prev6_test, average="binary", zero_division=0)
print("\n🧠 Baseline — persistence (prev 6h)")
print(f"Precision: {pp:.3f} Recall: {rp:.3f} F1: {f1p:.3f}")

# Threshold tuning: a lower cut favours recall, a higher cut favours precision.
thr_recall = 0.35
thr_precision = 0.65

pred_recall = (proba >= thr_recall).astype(int)
pred_precision = (proba >= thr_precision).astype(int)

pr_recall, rc_recall, f1_recall, _ = precision_recall_fscore_support(
    y_test, pred_recall, average="binary", zero_division=0
)
pr_precision, rc_precision, f1_precision, _ = precision_recall_fscore_support(
    y_test, pred_precision, average="binary", zero_division=0
)

print(f"\n🎛️ Threshold {thr_recall:.2f} → Precision: {pr_recall:.3f} Recall: {rc_recall:.3f} F1: {f1_recall:.3f}")
print(f"🎛️ Threshold {thr_precision:.2f} → Precision: {pr_precision:.3f} Recall: {rc_precision:.3f} F1: {f1_precision:.3f}")

import joblib
os.makedirs("models", exist_ok=True)
joblib.dump(clf, "models/rain_classifier_hourly.joblib")
print("\n💾 Saved: models/rain_classifier_hourly.joblib")

# Metadata consumed by predict_rain.py / rain_cli.py: feature order,
# operating-point thresholds and the metrics observed at each.
meta = {
    "horizon_hours": H,
    "features": features,
    "thresholds": {
        "default": 0.50,
        "high_recall": thr_recall,
        "high_precision": thr_precision,
    },
    "metrics": {
        "default": {"precision": float(prec), "recall": float(rec), "f1": float(f1)},
        "high_recall": {
            "precision": float(pr_recall),
            "recall": float(rc_recall),
            "f1": float(f1_recall),
        },
        "high_precision": {
            "precision": float(pr_precision),
            "recall": float(rc_precision),
            "f1": float(f1_precision),
        },
        "baseline_persistence": {
            "precision": float(pp),
            "recall": float(rp),
            "f1": float(f1p),
        },
    },
}

with open("models/rain_model_meta.json", "w") as fh:
    import json
    json.dump(meta, fh, indent=2)

print("📝 Saved: models/rain_model_meta.json")
|