import os
import json
import joblib
import numpy as np
import pandas as pd
from pathlib import Path
from datetime import timedelta
from io import BytesIO
import base64
import tensorflow as tf
from tensorflow.keras.utils import register_keras_serializable
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
from huggingface_hub import hf_hub_download

plt.style.use('seaborn-v0_8-darkgrid')

@register_keras_serializable(package="Custom", name="asymmetric_mse")
def asymmetric_mse(y_true, y_pred):
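    """Squared error that penalizes under-prediction (y_true > y_pred) more
    heavily, scaling the extra penalty with the relative size of the error."""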
    penalty_factor = 10.0
    error = y_true - y_pred
    denom = tf.maximum(tf.abs(y_true), 1.0)
    rel = tf.abs(error) / denom
    penalty = tf.where(error > 0, 1.0 + penalty_factor * rel, 1.0)
    loss = tf.square(error) * penalty
    return tf.reduce_mean(loss)

class DenguePredictor:
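    """Loads the trained municipal model, feature scalers, city index map and
    inference dataset (Hugging Face Hub, or a local Parquet file in offline
    mode), and produces weekly dengue case forecasts (6-week horizon) per
    municipality."""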
    def __init__(self, project_root=None, offline: bool = False, local_inference_path: str | None = None):
        self.project_root = Path(project_root) if project_root else Path(__file__).resolve().parent
        self.offline = bool(offline)
        self.local_inference_path = Path(local_inference_path) if local_inference_path else None
        self.sequence_length = 12
        self.horizon = 6
        self.anchor_lag_weeks = 2
        self.year_min_train = 2014
        self.year_max_train = 2025
        self.dynamic_features = [
            "numero_casos", "casos_velocidade", "casos_aceleracao", "casos_mm_4_semanas",
            "T2M", "T2M_MAX", "T2M_MIN", "PRECTOTCORR", "RH2M", "ALLSKY_SFC_SW_DWN",
            "week_sin", "week_cos", "year_norm", "notificacao"
        ]
        self.static_features = ["latitude", "longitude"]
        self.feature_names_pt = {
            "numero_casos": "Nº de Casos de Dengue",
            "T2M": "Temperatura Média (°C)",
            "PRECTOTCORR": "Precipitação (mm)"
        }
        self._loaded = False
        self.load_assets()

    def load_assets(self):
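        """Load scalers, the city index map, the Keras checkpoint and the
        inference dataset; online mode downloads the dataset from the Hugging
        Face Hub, offline mode requires a local Parquet file."""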
        models_dir = self.project_root / "models"
        scalers_dir = models_dir / "scalers"
        candidate_model_paths = [
            models_dir / "model_checkpoint_best_city.keras",
            models_dir / "model.keras",
        ]
        model_path = next((p for p in candidate_model_paths if p.exists()), None)
        city_map_path = models_dir / "city_to_idx.json"

        if not scalers_dir.exists():
            raise FileNotFoundError(str(scalers_dir) + " not found")

        self.scaler_dyn = joblib.load(scalers_dir / "scaler_dyn_global.pkl")
        self.scaler_static = joblib.load(scalers_dir / "scaler_static_global.pkl")
        self.scaler_target = joblib.load(scalers_dir / "scaler_target_global.pkl")

        if city_map_path.exists():
            with open(city_map_path, "r", encoding="utf-8") as fh:
                self.city_to_idx = {int(k): int(v) for k, v in json.load(fh).items()}
        else:
            self.city_to_idx = {}

        # Load inference dataset (HF online or local offline)
        df = None
        if self.offline:
            # Only .parquet files are accepted in offline mode
            candidate_paths = []
            if self.local_inference_path:
                candidate_paths.append(self.local_inference_path)
            candidate_paths.append(models_dir / "inference_data.parquet")

            found = None
            for p in candidate_paths:
                try:
                    if p and Path(p).exists() and str(p).lower().endswith(".parquet"):
                        found = Path(p)
                        break
                except Exception:
                    continue
            if not found:
                raise FileNotFoundError(
                    "Offline mode enabled but no local Parquet dataset found. "
                    "Place 'inference_data.parquet' under models/ or pass a valid 'local_inference_path' (.parquet)."
                )
            df = pd.read_parquet(found)
        else:
            inference_path = hf_hub_download(
                repo_id="previdengue/predict_inference_data",
                filename="inference_data.parquet",
                repo_type="dataset"
            )
            df = pd.read_parquet(inference_path)
        df["codigo_ibge"] = df["codigo_ibge"].astype(int)
        df["ano"] = df["ano"].astype(int)
        df["semana"] = df["semana"].astype(int)
        try:
            # Zero-pad the week so single-digit weeks parse correctly with %W
            df["date"] = pd.to_datetime(
                df["ano"].astype(str) + df["semana"].astype(str).str.zfill(2) + "0",
                format="%Y%W%w", errors="coerce"
            )
        except Exception:
            df["date"] = pd.NaT

        df = df.sort_values(by=["codigo_ibge", "ano", "semana"]).reset_index(drop=True)
        df["week_sin"] = np.sin(2 * np.pi * df["semana"] / 52)
        df["week_cos"] = np.cos(2 * np.pi * df["semana"] / 52)
        df["year_norm"] = (df["ano"] - self.year_min_train) / (self.year_max_train - self.year_min_train)
        df["notificacao"] = df["ano"].isin([2021, 2022]).astype(float)

        self.df_master = df
        self.municipios = df[["codigo_ibge", "municipio"]].drop_duplicates().sort_values("codigo_ibge")

        if model_path is None:
            raise FileNotFoundError(
                "No municipal model checkpoint found. Expected one of: "
                + ", ".join(str(p) for p in candidate_model_paths)
            )

        self.model = tf.keras.models.load_model(model_path, custom_objects={"asymmetric_mse": asymmetric_mse}, compile=False)
        self._loaded = True

    def plot_to_base64(self, fig):
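        """Serialize a Matplotlib figure to a base64-encoded PNG and close it."""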
        buf = BytesIO()
        fig.savefig(buf, format="png", bbox_inches="tight", facecolor=fig.get_facecolor())
        buf.seek(0)
        img_str = base64.b64encode(buf.read()).decode("utf-8")
        plt.close(fig)
        return img_str

    def _prepare_sequence(self, df_mun, end_idx=None):
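        """Recompute derived features (velocity, acceleration, 4-week moving
        average, seasonal encodings) and return the window of `sequence_length`
        rows ending at `end_idx` (inclusive); empty frame if history is short."""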
        df_all = df_mun.copy()
        df_all["notificacao"] = df_all["ano"].isin([2021, 2022]).astype(float)
        df_all["week_sin"] = np.sin(2 * np.pi * df_all["semana"] / 52)
        df_all["week_cos"] = np.cos(2 * np.pi * df_all["semana"] / 52)
        df_all["year_norm"] = (df_all["ano"] - self.year_min_train) / (self.year_max_train - self.year_min_train)
        df_all["casos_velocidade"] = df_all["numero_casos"].diff().fillna(0)
        df_all["casos_aceleracao"] = df_all["casos_velocidade"].diff().fillna(0)
        df_all["casos_mm_4_semanas"] = df_all["numero_casos"].rolling(4, min_periods=1).mean()
        if end_idx is None:
            end_idx = len(df_all) - 1
        start_idx = end_idx - self.sequence_length + 1
        if start_idx < 0:
            return df_all.iloc[0:0].copy()
        return df_all.iloc[start_idx:end_idx + 1].copy()

    def predict(self, ibge_code: int, show_plot=False, display_history_weeks=None):
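        """Forecast the next `horizon` weeks of dengue cases for one municipality.

        The input window ends `anchor_lag_weeks` weeks before the most recent
        record; returns historic data, predicted data and lag-based insights."""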
        if not self._loaded:
            raise RuntimeError("assets not loaded")

        df_mun = self.df_master[self.df_master["codigo_ibge"] == int(ibge_code)].copy().reset_index(drop=True)
        if df_mun.empty or len(df_mun) < self.sequence_length:
            raise ValueError(f"No data or insufficient history for ibge {ibge_code}")

        municipio_row = self.municipios[self.municipios["codigo_ibge"] == int(ibge_code)]
        municipality_name = municipio_row.iloc[0]["municipio"] if not municipio_row.empty else str(ibge_code)

        pred_point_idx = len(df_mun) - self.anchor_lag_weeks
        last_known_idx = pred_point_idx - 1
        if last_known_idx < self.sequence_length - 1:
            raise ValueError(f"Insufficient sequence window before forecast point for {ibge_code}")

        seq_df = self._prepare_sequence(df_mun, end_idx=last_known_idx)
        if len(seq_df) < self.sequence_length:
            raise ValueError(f"Insufficient sequence length for {ibge_code}")

        # Validate the feature set before slicing so missing columns raise a clear error
        missing_feats = [c for c in self.dynamic_features if c not in seq_df.columns]
        if missing_feats:
            raise ValueError(f"Missing dynamic features in dataframe: {missing_feats}")
        if hasattr(self.scaler_dyn, "n_features_in_") and self.scaler_dyn.n_features_in_ != len(self.dynamic_features):
            raise ValueError(
                f"Dynamic scaler expects {getattr(self.scaler_dyn, 'n_features_in_', 'unknown')} features, "
                f"but predictor assembled {len(self.dynamic_features)}. Ensure training and inference feature sets match."
            )

        dynamic_raw = seq_df[self.dynamic_features].values
        static_raw = seq_df[self.static_features].iloc[0].values.reshape(1, -1)

        dynamic_scaled = self.scaler_dyn.transform(dynamic_raw).reshape(1, self.sequence_length, -1)
        static_scaled = self.scaler_static.transform(static_raw)

        city_idx = int(self.city_to_idx.get(int(ibge_code), 0))
        city_input = np.array([[city_idx]], dtype=np.int32)

        y_pred = self.model.predict([dynamic_scaled, static_scaled, city_input], verbose=0)
        y_pred_reg = y_pred[0] if isinstance(y_pred, (list, tuple)) else y_pred

        y_pred_flat = y_pred_reg.reshape(-1, 1)
        y_pred_inv_flat = self.scaler_target.inverse_transform(y_pred_flat)
        y_pred_inv = y_pred_inv_flat.reshape(y_pred_reg.shape)
        pred_values = np.maximum(y_pred_inv.flatten(), 0.0)

        last_known_case = seq_df["numero_casos"].iloc[-1]
        connected_prediction = np.insert(pred_values, 0, last_known_case)

        last_real_date = seq_df["date"].iloc[-1] if "date" in seq_df.columns else None
        predicted_data = []
        for i, val in enumerate(connected_prediction[1:]):
            pred_date = (last_real_date + timedelta(weeks=i + 1)).strftime("%Y-%m-%d") if pd.notna(last_real_date) else None
            predicted_data.append({"date": pred_date, "predicted_cases": int(round(float(val)))})

        # History: return everything by default; if display_history_weeks > 0, limit the window
        hist_base = df_mun.iloc[:last_known_idx + 1].copy()
        if display_history_weeks is None or (isinstance(display_history_weeks, (int, float)) and display_history_weeks <= 0):
            hist_tail = hist_base
        else:
            hist_tail = hist_base.tail(min(len(hist_base), int(display_history_weeks))).copy()
        historic_data = []
        for _, row in hist_tail.iterrows():
            historic_data.append({
                "date": row["date"].strftime("%Y-%m-%d") if pd.notna(row.get("date")) else None,
                "cases": int(row["numero_casos"]) if pd.notna(row.get("numero_casos")) else None
            })
        # Insights: lag correlation analysis and strategic summary
        lag_plot_b64, strategic_summary, tipping_points = self.generate_lag_insights(df_mun)

        insights = {
            "lag_analysis_plot_base64": lag_plot_b64,
            "strategic_summary": strategic_summary,
            "tipping_points": tipping_points
        }

        return {
            "municipality_name": municipality_name,
            "ibge": int(ibge_code),
            "last_known_index": int(last_known_idx),
            "historic_data": historic_data,
            "predicted_data": predicted_data,
            "insights": insights,
        }

    def generate_lag_insights(self, df_mun: pd.DataFrame):
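        """Compute lagged correlations (1 to `max_lag` weeks) between case counts
        and climate drivers, plot them to a base64 PNG and return the plot, a
        textual summary and a list of tipping points."""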
        # Prepare analysis columns
        df_analysis = df_mun.rename(columns={
            "T2M": "Temperature_C",
            "PRECTOTCORR": "Precipitation_mm"
        })
        max_lag = 12
        cases_col = "numero_casos"
        lag_features = ["Temperature_C", "Precipitation_mm"]
        lag_correlations = {}

        for col in lag_features:
            if col in df_analysis.columns:
                corrs = []
                for lag in range(1, max_lag + 1):
                    try:
                        corr = df_analysis[cases_col].corr(df_analysis[col].shift(lag))
                    except Exception:
                        corr = np.nan
                    corrs.append(corr)
                lag_correlations[col] = corrs
            else:
                lag_correlations[col] = [np.nan] * max_lag

        # Plot
        fig, ax = plt.subplots(figsize=(10, 6), facecolor="#18181b")
        ax.set_facecolor("#18181b")
        for feature_name, corrs in lag_correlations.items():
            ax.plot(range(1, max_lag + 1), corrs, marker="o", linestyle="-", label=feature_name)
        ax.set_title("Lag Analysis", color="white")
        ax.set_xlabel("Lag (weeks)", color="white")
        ax.set_ylabel("Correlation with cases", color="white")
        ax.tick_params(colors="white")
        ax.legend(facecolor="#27272a", edgecolor="gray", labelcolor="white")
        ax.grid(True, which="both", linestyle="--", linewidth=0.5, color="#444")
        lag_plot_b64 = self.plot_to_base64(fig)

        # Summaries
        lag_peaks = {}
        for feature, corrs in lag_correlations.items():
            if corrs and not all(pd.isna(corrs)):
                peak = int(np.nanargmax(np.abs(np.array(corrs))) + 1)
            else:
                peak = "N/A"
            lag_peaks[feature] = peak

        temp_lag = lag_peaks.get("Temperature_C", "N/A")
        rain_lag = lag_peaks.get("Precipitation_mm", "N/A")
        summary = (
            f"O modelo identifica Temperatura e Precipitação como fatores climáticos chave. "
            f"Temperatura mostra impacto máximo após {temp_lag} semanas e precipitação após {rain_lag} semanas. "
            "Ações preventivas devem ser intensificadas nessas janelas após eventos climáticos extremos."
        )

        tipping_points = [
            {"factor": "Temperatura", "value": f"Maior impacto em {temp_lag} semanas"},
            {"factor": "Precipitação", "value": f"Maior impacto em {rain_lag} semanas"},
            {"factor": "Umidade", "value": "Aumenta a sobrevivência de mosquitos adultos"}
        ]

        return lag_plot_b64, summary, tipping_points
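
# Minimal usage sketch (illustrative; not part of the original module). It
# assumes the model, scalers and inference dataset described above are
# available, and simply picks the first municipality present in the data.
if __name__ == "__main__":
    predictor = DenguePredictor(offline=False)
    sample_ibge = int(predictor.municipios["codigo_ibge"].iloc[0])
    result = predictor.predict(sample_ibge)
    print(result["municipality_name"], result["predicted_data"])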