File size: 2,411 Bytes
6eff894
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import json
import os
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.inspection import permutation_importance
from sklearn.model_selection import train_test_split

# Directory that receives this script's outputs and, by default, the
# Matplotlib config and XDG cache directories set up below.
RESULTS_DIR = "results"

# setdefault leaves any value already present in the environment untouched, so
# these paths only apply when the caller has not chosen their own.
# NOTE(review): presumably done so the script works where the home directory is
# not writable — confirm.
os.environ.setdefault("MPLCONFIGDIR", os.path.join(RESULTS_DIR, ".matplotlib"))
os.environ.setdefault("XDG_CACHE_HOME", os.path.join(RESULTS_DIR, ".cache"))

# Create whichever directories ended up configured (ours or the caller's).
Path(os.environ["MPLCONFIGDIR"]).mkdir(parents=True, exist_ok=True)
Path(os.environ["XDG_CACHE_HOME"]).mkdir(parents=True, exist_ok=True)

# Matplotlib is imported only after the environment variables above are set,
# which is why these imports are not at the top of the file.
import matplotlib

# Select the Agg backend before pyplot is imported; the script only saves the
# figure to a file and never shows a window.
matplotlib.use("Agg")
import matplotlib.pyplot as plt


def build_dataset(
    meta: dict, csv_path: str = "results/hourly.csv"
) -> tuple[np.ndarray, np.ndarray]:
    """Build the feature matrix and "rain within the horizon" labels.

    Row ``i`` is labelled 1 when any of the next ``horizon_hours`` rows
    (``i + 1`` through ``i + horizon_hours``) has ``precip_mm > 0``, else 0.
    The trailing ``horizon_hours`` rows have no complete look-ahead window
    and are dropped from both outputs.

    Args:
        meta: Model metadata; must provide ``"horizon_hours"`` (a
            non-negative int) and ``"features"`` (column names to use as
            model inputs).
        csv_path: CSV of observations containing a ``time`` column, a
            ``precip_mm`` column and every column named in
            ``meta["features"]``.

    Returns:
        ``(features, labels)`` with one row / entry per retained observation.
        Both are empty when the file has no more than ``horizon_hours`` rows.

    Raises:
        ValueError: If ``horizon_hours`` is negative.
    """
    horizon = meta["horizon_hours"]
    if horizon < 0:
        raise ValueError(f"horizon_hours must be non-negative, got {horizon}")

    df = pd.read_csv(csv_path, parse_dates=["time"])
    # Clamp at zero: a horizon longer than the data must yield no rows rather
    # than a negative slice bound that silently keeps the wrong rows.
    n_valid = max(len(df) - horizon, 0)

    # wet_before[k] = number of wet rows among the first k rows, so the count
    # of wet rows in [i + 1, i + 1 + horizon) is
    # wet_before[i + 1 + horizon] - wet_before[i + 1].
    wet = df["precip_mm"].to_numpy() > 0
    wet_before = np.concatenate(([0], np.cumsum(wet)))
    window_end = wet_before[1 + horizon : 1 + horizon + n_valid]
    window_start = wet_before[1 : 1 + n_valid]
    labels = (window_end - window_start > 0).astype(int)

    features = df[meta["features"]].iloc[:n_valid].to_numpy()
    return features, labels


def plot_importance(feature_names: list[str], importances: np.ndarray, std: np.ndarray) -> None:
    """Save a horizontal bar chart of feature importances, largest on top.

    The chart is written to ``feature_importance.png`` inside ``RESULTS_DIR``,
    which is created if missing.

    Args:
        feature_names: Feature labels, in the same order as ``importances``.
        importances: Mean importance per feature.
        std: Standard deviation of each importance, in the same order as
            ``importances``; drawn as horizontal error bars.
    """
    importances = np.asarray(importances)
    std = np.asarray(std)

    # Sort descending; invert_yaxis() below then puts the largest bar on top.
    order = np.argsort(importances)[::-1]
    names = np.asarray(feature_names)[order]
    y_pos = np.arange(len(names))

    fig, ax = plt.subplots(figsize=(8, 5))
    try:
        # std was previously accepted but never drawn; show it as error bars.
        ax.barh(y_pos, importances[order], xerr=std[order], align="center", capsize=3)
        ax.set_yticks(y_pos)
        ax.set_yticklabels(names)
        ax.invert_yaxis()
        ax.set_xlabel("Permutation importance (F1 drop)")
        ax.set_title("Rain classifier — feature importances")
        fig.tight_layout()

        Path(RESULTS_DIR).mkdir(parents=True, exist_ok=True)
        fig.savefig(os.path.join(RESULTS_DIR, "feature_importance.png"))
    finally:
        # Release the figure even if layout or saving fails.
        plt.close(fig)


def main() -> None:
    """Compute permutation importances for the saved rain classifier and plot them.

    Loads the model metadata and the fitted classifier from ``models/``,
    rebuilds the dataset, and scores permutation importance (F1, 25 repeats)
    on the last 30% of rows before writing the chart via ``plot_importance``.
    """
    # Context manager so the metadata file handle is closed deterministically.
    with open("models/rain_model_meta.json", encoding="utf-8") as meta_file:
        meta = json.load(meta_file)
    # NOTE: joblib.load unpickles arbitrary objects; only load trusted model files.
    model = joblib.load("models/rain_classifier_hourly.joblib")

    X, y = build_dataset(meta)
    # shuffle=False keeps row order, so the evaluation set is the final 30% of rows.
    _, X_test, _, y_test = train_test_split(X, y, test_size=0.3, shuffle=False)

    result = permutation_importance(
        model,
        X_test,
        y_test,
        n_repeats=25,
        random_state=42,
        scoring="f1",
    )

    plot_importance(meta["features"], result.importances_mean, result.importances_std)
    print("✅ Wrote results/feature_importance.png")


# Run the analysis only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()