File size: 5,407 Bytes
0ab0788
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
import streamlit as st
import polars as pl
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error

def _full_join_on_station(left: pl.DataFrame, right: pl.DataFrame) -> pl.DataFrame:
    """Full-join two per-station frames, keeping a single ``NUM_POSTE`` column."""
    # `how="outer"` is deprecated in polars and does not merge the key columns
    # (the right-hand key ends up in a separate `NUM_POSTE_right` column).
    # `how="full", coalesce=True` yields exactly one `NUM_POSTE` key column.
    return left.join(right, on="NUM_POSTE", how="full", coalesce=True)


def _date_of_max(
    df: pl.DataFrame, value_col: str, date_col: str, alias: str
) -> pl.DataFrame:
    """Return, per station, the ``date_col`` of the row with the largest ``value_col``."""
    return (
        # nulls_last=True keeps rows with a missing maximum from being ranked
        # ahead of real values; row order inside each group follows the sort.
        df.sort(value_col, descending=True, nulls_last=True)
        .group_by("NUM_POSTE")
        .agg(pl.col(date_col).first().alias(alias))
    )


def _month_from_date(date_col: str, alias: str) -> pl.Expr:
    """Build the expression extracting the month number from a date string column."""
    return (
        pl.col(date_col)
        # strict=False: values that do not match the format become null.
        .str.strptime(pl.Datetime, format="%Y-%m-%d %H:%M:%S%.f", strict=False)
        .dt.month()
        .alias(alias)
    )


def _most_frequent_month(df: pl.DataFrame, month_col: str, alias: str) -> pl.DataFrame:
    """Return, per station, the most frequent non-null value of ``month_col``."""
    return (
        df.drop_nulls(month_col)
        .group_by(["NUM_POSTE", month_col])
        .len()
        # Highest count first; ties are resolved toward the earliest month so
        # the result does not depend on the (unordered) group_by output.
        .sort(["NUM_POSTE", "len", month_col], descending=[False, True, False])
        # The default keep="any" gives no guarantee about which row survives;
        # keep="first" is required to actually select the top-ranked month.
        .unique(subset=["NUM_POSTE"], keep="first", maintain_order=True)
        .select(["NUM_POSTE", month_col])
        .rename({month_col: alias})
    )


def compute_statistic_per_point(df: pl.DataFrame, stat_key: str) -> pl.DataFrame:
    """Aggregate ``df`` to one row per station (``NUM_POSTE``) for a statistic.

    Supported ``stat_key`` values and the columns they produce (each one only
    when its source column is present in ``df``):

    * ``"mean"``: ``mean_all_mm_h`` (mean of ``mean_mm_h``) and
      ``mean_all_mm_j`` (mean of ``mean_mm_h * 24``).
    * ``"max"``: ``max_all_mm_h`` / ``max_all_mm_j`` (maximum of ``max_mm_*``).
    * ``"mean-max"``: ``max_mean_mm_h`` / ``max_mean_mm_j`` (mean of ``max_mm_*``).
    * ``"date"``: ``date_max_h`` / ``date_max_j``, the ``max_date_mm_*`` value
      of the row holding the largest ``max_mm_*``.
    * ``"month"``: ``mois_pluvieux_h`` / ``mois_pluvieux_j``, the most frequent
      month found in ``max_date_mm_*``; a missing side is filled with nulls.
    * ``"numday"``: ``jours_pluie_moyen`` (mean of ``n_days_gt1mm``).

    Args:
        df: Input frame containing a ``NUM_POSTE`` column.
        stat_key: Name of the statistic to compute.

    Returns:
        A frame keyed by ``NUM_POSTE`` with the columns listed above.

    Raises:
        ValueError: If ``stat_key`` is unknown, or if the columns required by
            ``"date"``, ``"month"`` or ``"numday"`` are missing.
    """
    cols = df.columns

    if stat_key == "mean":
        aggs = []
        if "mean_mm_h" in cols:
            hourly = pl.col("mean_mm_h")
            aggs = [
                hourly.mean().alias("mean_all_mm_h"),
                # Daily figure is derived from the hourly mean (x 24).
                (hourly * 24).mean().alias("mean_all_mm_j"),
            ]
        return df.group_by("NUM_POSTE").agg(aggs)

    if stat_key == "max":
        aggs = [
            pl.col(src).max().alias(dst)
            for src, dst in (("max_mm_h", "max_all_mm_h"), ("max_mm_j", "max_all_mm_j"))
            if src in cols
        ]
        return df.group_by("NUM_POSTE").agg(aggs)

    if stat_key == "mean-max":
        aggs = [
            pl.col(src).mean().alias(dst)
            for src, dst in (("max_mm_h", "max_mean_mm_h"), ("max_mm_j", "max_mean_mm_j"))
            if src in cols
        ]
        return df.group_by("NUM_POSTE").agg(aggs)

    if stat_key == "date":
        sources = [
            (value_col, date_col, alias)
            for value_col, date_col, alias in (
                ("max_mm_h", "max_date_mm_h", "date_max_h"),
                ("max_mm_j", "max_date_mm_j", "date_max_j"),
            )
            if value_col in cols and date_col in cols
        ]
        if not sources:
            raise ValueError("Aucune date de maximum disponible.")

        frames = [_date_of_max(df, *source) for source in sources]
        if len(frames) == 1:
            return frames[0]
        return _full_join_on_station(frames[0], frames[1])

    if stat_key == "month":
        has_h = "max_date_mm_h" in cols
        has_j = "max_date_mm_j" in cols
        if not (has_h or has_j):
            raise ValueError("Aucune date de maximum pour extraire les mois.")

        month_exprs = []
        if has_h:
            month_exprs.append(_month_from_date("max_date_mm_h", "mois_max_h"))
        if has_j:
            month_exprs.append(_month_from_date("max_date_mm_j", "mois_max_j"))
        df = df.with_columns(month_exprs)

        mois_h = _most_frequent_month(df, "mois_max_h", "mois_pluvieux_h") if has_h else None
        mois_j = _most_frequent_month(df, "mois_max_j", "mois_pluvieux_j") if has_j else None

        # At least one side exists here; pad the missing one with a null column
        # so the output always exposes both month columns.
        if mois_h is None:
            return mois_j.with_columns([pl.lit(None, dtype=pl.Int32).alias("mois_pluvieux_h")])
        if mois_j is None:
            return mois_h.with_columns([pl.lit(None, dtype=pl.Int32).alias("mois_pluvieux_j")])
        return _full_join_on_station(mois_h, mois_j)

    if stat_key == "numday":
        if "n_days_gt1mm" not in cols:
            raise ValueError("Colonne `n_days_gt1mm` manquante.")
        return (
            df.group_by("NUM_POSTE")
            .agg(pl.col("n_days_gt1mm").mean().alias("jours_pluie_moyen"))
        )

    raise ValueError(f"Statistique inconnue : {stat_key}")



def generate_metrics(df: pl.DataFrame, x_label: str = "AROME", y_label: str = "Station"):
    """Compute error metrics between two columns of ``df``.

    The error is defined as ``x - y`` (``x_label`` minus ``y_label``). Only
    rows where both values are finite (neither NaN nor infinite) are used.

    Args:
        df: Frame whose ``df[label].to_numpy()`` yields numeric values.
        x_label: Name of the first column (default ``"AROME"``).
        y_label: Name of the second column (default ``"Station"``).

    Returns:
        Tuple ``(me, mae, rmse, r2_corr)``: mean error, mean absolute error,
        root mean squared error and squared Pearson correlation. All four are
        NaN when no valid pair exists; ``r2_corr`` alone is NaN when fewer
        than two valid pairs remain or when either series is constant.
    """
    # Cast to float64 up front: subtracting unsigned-integer arrays would wrap
    # around instead of producing negative errors.
    x = np.asarray(df[x_label].to_numpy(), dtype=np.float64)
    y = np.asarray(df[y_label].to_numpy(), dtype=np.float64)

    if len(x) != len(y):
        st.error("Longueur x et y différente")
        return np.nan, np.nan, np.nan, np.nan

    # Keep only pairs where both values are usable; isfinite also rejects
    # +/-inf, which would otherwise turn every metric into inf/NaN.
    mask = np.isfinite(x) & np.isfinite(y)
    x_valid = x[mask]
    y_valid = y[mask]

    if x_valid.size == 0:
        st.warning("Aucune donnée valide après suppression des NaN.")
        return np.nan, np.nan, np.nan, np.nan

    # One shared error vector keeps the three metrics on the same convention.
    error = x_valid - y_valid
    me = np.mean(error)
    mae = np.mean(np.abs(error))
    rmse = np.sqrt(np.mean(np.square(error)))

    # Pearson correlation is undefined for a single point or a constant series;
    # checking first avoids numpy's divide-by-zero RuntimeWarning.
    has_spread = (
        x_valid.size > 1
        and x_valid.min() != x_valid.max()
        and y_valid.min() != y_valid.max()
    )
    if has_spread:
        r2_corr = np.corrcoef(x_valid, y_valid)[0, 1] ** 2
    else:
        r2_corr = np.nan

    return me, mae, rmse, r2_corr