# Source: Hugging Face Space "ImanK12/RabiesRadar" (status when captured: Sleeping).
# File size: 17,361 bytes.

import streamlit as st
import pandas as pd
import joblib
import matplotlib as plt
from sklearn.tree import plot_tree
import io

# ✅ חייב להיות ראשון!
st.set_page_config(layout="centered", page_title="Rabies Prediction")

# ================== Dynamic styling ==================
# Custom CSS injected into the page: gradient background, heading colours,
# and rounded blue buttons / select boxes.
_CUSTOM_CSS = """
    <style>
    .stApp { background: linear-gradient(135deg, #f5f7fa, #c3cfe2); font-family: 'Arial', sans-serif; }
    h1 { color: #2c3e50; text-align: center; font-size: 3rem; font-weight: bold; }
    h2, h3 { color: #34495e; }
    div.stButton > button:first-child { background-color: #2980b9; color: white; font-size: 1.1rem; padding: 10px 24px; border-radius: 8px; border: none; transition: background-color 0.3s ease; }
    div.stButton > button:first-child:hover { background-color: #3498db; }
    div[data-baseweb="select"] > div { border-radius: 8px; border: 1px solid #2980b9; }
    </style>
    """
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)

def compute_similarity(df: pd.DataFrame, inp: pd.DataFrame, columns: list):
    """Rank the rows of ``df`` by their similarity to a single input record.

    For every column in ``columns`` a per-row score is computed:

    * numeric column: ``1 - |row - input| / max(column)``; the divisor falls
      back to 1 when the column maximum is 0. (The score is normalised by the
      column maximum, not by its range, so it can leave ``[0, 1]`` when the
      input lies far outside the data.)
    * any other column: 1 when the row value equals the input value, else 0.

    The similarity of a row is the mean of its per-column scores.

    Args:
        df: Existing records to compare against. It is NOT modified.
        inp: Frame whose first row is the record to compare with.
        columns: Names of the columns (present in both frames) to compare.

    Returns:
        A copy of ``df`` with an added ``'similarity'`` column, sorted from
        the most similar row to the least similar one.

    Raises:
        ValueError: If ``columns`` is empty (the mean would be undefined).
    """
    if len(columns) == 0:
        raise ValueError("columns must contain at least one column name")

    # Accumulate the per-column scores for all rows at once instead of
    # looping over the rows one by one.
    total = pd.Series(0.0, index=df.index)
    for col in columns:
        target = inp[col].values[0]
        if pd.api.types.is_numeric_dtype(df[col]):
            max_val = df[col].max()
            divisor = max_val if max_val != 0 else 1
            distance = (df[col].astype("float64") - target).abs()
            total = total + (1 - distance / divisor)
        else:
            # Categorical comparison: exact match counts as 1, anything else as 0.
            total = total + (df[col] == target).astype("float64")

    # Work on a copy so the caller's DataFrame does not silently gain a
    # 'similarity' column; assign positionally to be safe with any index.
    ranked = df.copy()
    ranked['similarity'] = (total / len(columns)).to_numpy()
    return ranked.sort_values('similarity', ascending=False, kind="stable")
    
# ================== הגדרות ==================
# Locations of the dataset and of the persisted model / preprocessing objects.
DATA_PATH = "./src/Rabies__Weather__War_Combined_1.4.25.xlsx"
MODEL_PATH = "./src/final_model_gradient_boosting.pkl"
OHE_PATH = "./src/preprocessing_onehot_encoder.pkl"
SCALER_PATH = "./src/preprocessing_scaler.pkl"
TARGET_ENCODERS_PATH = "./src/preprocessing_target_encoders.pkl"

# Column groups used throughout the script.
label_cols = [
    'Animal Species',
    'Rabies Species',
    'Settlement',
    'Region_Weather',
]
target_cols = ['Region', 'Month']
num_cols = [
    'x',
    'y',
    'Avg Temperature',
    'Monthly Precipitation (mm)',
    'Rainy Days',
]
extra_cols = ['War in Israel', 'Year']  # additional columns the model requires

# ================== Load the dataset, the model and the preprocessors ==================
df = pd.read_excel(DATA_PATH)
model, ohe, scaler, target_encoders = (
    joblib.load(artifact_path)
    for artifact_path in (MODEL_PATH, OHE_PATH, SCALER_PATH, TARGET_ENCODERS_PATH)
)

# ================== Unique values offered in the select boxes ==================
# One sorted list of distinct non-null values per categorical column, in the
# order the columns appear in ``label_cols``.
(
    animal_species_list,
    rabies_species_list,
    settlement_list,
    region_weather_list,
) = (sorted(df[column_name].dropna().unique()) for column_name in label_cols)



# ================== כותרת ==================
st.title("🐶 Rabies / Weather / War Prediction")
st.markdown("הזן נתונים חדשים לקבלת תחזית עבור **Region** ו־**Month**")

# ================== Input form ==================
# (label, options) for every dropdown and (label, default) for every float
# field, listed in the order the widgets are rendered.
_DROPDOWN_SPECS = (
    ("Animal Species", animal_species_list),
    ("Rabies Species", rabies_species_list),
    ("Settlement", settlement_list),
    ("Region Weather", region_weather_list),
    ("War in Israel", ["Yes", "No"]),
)
_FLOAT_FIELD_SPECS = (
    ("x", 0.0),
    ("y", 0.0),
    ("Avg Temperature", 20.0),
    ("Monthly Precipitation (mm)", 50.0),
    ("Rainy Days", 10.0),
)

with st.form("input_form"):
    st.subheader("✍️ הזן פרטי רשומה חדשה")

    # Categorical values are picked from lists.
    (
        animal_species,
        rabies_species,
        settlement,
        region_weather,
        war_in_israel,
    ) = [st.selectbox(label, options) for label, options in _DROPDOWN_SPECS]

    # Numeric values.
    x, y, avg_temp, precipitation, rainy_days = [
        st.number_input(label, value=default)
        for label, default in _FLOAT_FIELD_SPECS
    ]
    year = st.number_input("Year", min_value=1900, max_value=2100, value=2025)

    submitted = st.form_submit_button("🔮 Make Prediction >> ")

# ================== חיזוי ==================
if submitted:
    # Runs once the form is submitted: encodes the input, predicts Region and
    # Month, shows alerts / similar records / diagnostic plots and offers the
    # whole analysis as an Excel download. Any failure is reported in the UI.
    try:
        # Encode the Yes/No choice as 1/0.
        war_in_israel_val = 1 if war_in_israel == "Yes" else 0

        # Single-row DataFrame holding the raw form values.
        input_df = pd.DataFrame([{
            'Animal Species': animal_species,
            'Rabies Species': rabies_species,
            'Settlement': settlement,
            'Region_Weather': region_weather,
            'x': x,
            'y': y,
            'Avg Temperature': avg_temp,
            'Monthly Precipitation (mm)': precipitation,
            'Rainy Days': rainy_days,
            'War in Israel': war_in_israel_val,
            'Year': year
        }])

        # --- One-hot encode the categorical features ---
        encoded = ohe.transform(input_df[label_cols])
        if hasattr(encoded, "toarray"):
            # Densify a sparse encoder output before building the frame
            # (no-op when the encoder already returns a dense array).
            encoded = encoded.toarray()
        encoded_df = pd.DataFrame(
            encoded,
            columns=ohe.get_feature_names_out(label_cols),
            index=input_df.index,
        )

        # --- Scale the numeric features ---
        scaled_nums = scaler.transform(input_df[num_cols])
        scaled_df = pd.DataFrame(scaled_nums, columns=num_cols, index=input_df.index)

        # --- Combine all feature groups ---
        X_new = pd.concat([scaled_df, encoded_df, input_df[extra_cols]], axis=1)

        # Put the columns in the order the model was fitted with.
        X_new = X_new[model.estimators_[0].feature_names_in_]

        # --- Prediction (one value per target) ---
        y_pred = model.predict(X_new)[0]

        # --- Class probabilities per target ---
        # Per the original author's notes: estimators_[0] = Region, estimators_[1] = Month.
        region_proba = model.estimators_[0].predict_proba(X_new)[0]
        month_proba = model.estimators_[1].predict_proba(X_new)[0]

        # Map the encoded predictions back to the original labels.
        region_pred = target_encoders['Region'].inverse_transform([y_pred[0]])[0]
        month_pred = target_encoders['Month'].inverse_transform([y_pred[1]])[0]

        # Confidence in percent.
        # NOTE(review): indexing the probabilities with the predicted value assumes
        # the classes are encoded as 0..n-1 — confirm against the fitted model.
        region_confidence = region_proba[y_pred[0]] * 100
        month_confidence = month_proba[y_pred[1]] * 100

        # ================== Alerts Dictionary per Target ==================
        alerts_dict_region = {
            'Galil Golan': "⚠️ Region is 'Galil Golan', check coordinates, temperature, and precipitation values for consistency.",
            'Amakim': "⚠️ Region is 'Amakim', unusual feature values may affect prediction.",
            'Shfela Vahar': "⚠️ Region is 'Shfela Vahar', verify X/Y coordinates and weather features.",
            'Hasharon': "⚠️ Region is 'Hasharon', check numeric inputs for anomalies.",
            'Galil Maaravi': "⚠️ Region is 'Galil Maaravi', some features might be outside typical range.",
            'Negev': "⚠️ Region is 'Negev', check for extreme values in coordinates or weather data."
        }

        alerts_dict_month = {
            "January": "⚠️ Month is January, check if temperature, precipitation, and rainy days align with typical values.",
            "February": "⚠️ Month is February, unusual feature values may affect predictions.",
            "March": "⚠️ Month is March, verify coordinates and weather features for consistency.",
            "April": "⚠️ Month is April, check numeric inputs for anomalies.",
            "May": "⚠️ Month is May, some features might be outside typical range.",
            "June": "⚠️ Month is June, check for extreme values in coordinates or weather data.",
            "July": "⚠️ Month is July, unusual conditions may affect predictions.",
            "August": "⚠️ Month is August, verify temperature and precipitation values.",
            "September": "⚠️ Month is September, ensure numeric inputs are within reasonable range.",
            "October": "⚠️ Month is October, check if weather features match typical patterns.",
            "November": "⚠️ Month is November, anomalies in inputs may affect prediction.",
            "December": "⚠️ Month is December, verify coordinate and weather inputs."
        }

        # ================== Run alerts for both targets ==================
        # Alerts are advisory only: a label without a dictionary entry must not
        # abort the prediction with a KeyError, so missing entries are skipped.
        month_alert = alerts_dict_month.get(month_pred)
        if month_alert is not None:
            st.warning(month_alert)
        region_alert = alerts_dict_region.get(region_pred)
        if region_alert is not None:
            st.warning(region_alert)

        st.success(f"✅ Model Prediction: **Region = {region_pred} ({region_confidence:.2f}%), "
                   f"Month = {month_pred} ({month_confidence:.2f}%)**")

        st.subheader("🟢 Most Similar Record to Your Input (Similarity Based)")
        columns_to_compare = label_cols + num_cols + extra_cols  # all relevant columns
        # The full dataset, ranked from the most to the least similar record.
        most_similar_row = compute_similarity(df, input_df, columns_to_compare)
        st.write("The record from the existing dataset that is most similar to your input:")
        st.dataframe(most_similar_row)

        # Feature names in model order.
        feature_names = X_new.columns.tolist()

        # One inner estimator per target, taken at position [0, 0] of each
        # target model's ``estimators_`` array.
        # NOTE(review): presumably this is the first tree of a gradient-boosting
        # ensemble, so the importances below describe that single tree rather
        # than the whole ensemble — confirm this is intended.
        gb_region = model.estimators_[0].estimators_[0, 0]  # inner estimator for Region
        gb_month = model.estimators_[1].estimators_[0, 0]  # inner estimator for Month

        gb_targets = [gb_region, gb_month]

        # Plotting / statistics dependencies. pyplot must be imported here because
        # the file-level ``import matplotlib as plt`` binds the top-level package.
        import matplotlib.pyplot as plt
        from sklearn.tree import plot_tree
        import numpy as np
        import matplotlib.cm as cm
        import seaborn as sns
        from scipy.stats import pearsonr, chi2_contingency

        # ============================== plotting ==============================
        target_names = ['Region', 'Month']

        for target, tree in zip(target_names, gb_targets):
            st.subheader(f'Target Name : {target}')

            # Feature importances, largest first.
            importances = tree.feature_importances_
            indices = np.argsort(importances)[::-1]
            top_n = 4

            top_features = [feature_names[j] for j in indices[:top_n]]
            top_importances = importances[indices[:top_n]]

            # ===== Streamlit columns =====
            chart_col, tree_col = st.columns(2)

            # ===== Feature importance chart =====
            with chart_col:
                fig = plt.figure(figsize=(8, 6))
                colors = cm.viridis(np.linspace(0, 1, len(top_features)))
                plt.barh(top_features[::-1], top_importances[::-1], color=colors)
                plt.xlabel("Feature Importance")
                plt.title(f"Top 4 Features ({target})", color='darkblue')
                st.pyplot(fig)
                # Close (not just clear) the figure so figures do not pile up.
                plt.close(fig)

            # ===== Example Decision Tree =====
            with tree_col:
                fig = plt.figure(figsize=(8, 6))
                plot_tree(tree, feature_names=feature_names, filled=True, max_depth=3, rounded=True, fontsize=10)
                plt.title(f"Decision Tree (Depth=3) for {target}", color='darkgreen')
                st.pyplot(fig)
                plt.close(fig)

        # ================== Numeric Correlation ==================
        st.subheader("📊 Correlation Matrix (Numeric Features)")
        st.markdown("""
        The correlation matrix shows the pairwise **Pearson correlation coefficients** between numeric features.
        - Values close to **1** indicate a strong positive correlation.
        - Values close to **-1** indicate a strong negative correlation.
        - Values around **0** indicate little or no linear correlation.
        """)
        numeric_df = df[num_cols]
        corr_matrix = numeric_df.corr()
        fig = plt.figure(figsize=(8, 6))
        sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm", linewidths=0.5)
        plt.title("Correlation Matrix", color='darkblue', fontsize=14)
        st.pyplot(fig)
        plt.close(fig)

        st.subheader("📑 Pearson p-values (Numeric Features)")
        st.markdown("""
        The Pearson p-values indicate the statistical significance of the correlation between numeric features.
        - A **small p-value (typically < 0.05)** suggests that the correlation is statistically significant.
        - A **large p-value** suggests that the correlation could be due to random chance.
        - Diagonal cells are **NaN** because a feature's correlation with itself is not tested.
        """)
        pval_matrix = pd.DataFrame(np.zeros((len(num_cols), len(num_cols))), columns=num_cols, index=num_cols)
        for row_pos, row_feature in enumerate(num_cols):
            for col_pos, col_feature in enumerate(num_cols):
                if row_pos == col_pos:
                    pval_matrix.loc[row_feature, col_feature] = np.nan
                else:
                    # pearsonr returns (statistic, p-value); keep the p-value.
                    pval_matrix.loc[row_feature, col_feature] = pearsonr(
                        numeric_df[row_feature], numeric_df[col_feature]
                    )[1]
        st.dataframe(pval_matrix.style.background_gradient(cmap="coolwarm", axis=None).format("{:.3f}"))

        # ================== Categorical Correlation (Cramér's V) ==================
        st.subheader("📊 Cramér's V (Categorical Features + Targets)")

        explain_carmer = """
        
        Cramér's V measures the strength of association between categorical variables.  
        - Values range from **0 to 1**:
          - **0** → no association  
          - **1** → perfect association  
        - Higher values indicate stronger relationships between the categories.  
        - This includes both the original categorical features and the target variables (e.g., Region, Month).
        """

        st.markdown(explain_carmer)

        categorical_cols = label_cols + target_cols
        cat_df = df[categorical_cols].dropna()

        def cramers_v(x, y):
            """Return Cramér's V for two categorical series (NaN if undefined)."""
            cmatrix = pd.crosstab(x, y)
            chi2 = chi2_contingency(cmatrix)[0]
            n = cmatrix.sum().sum()
            phi2 = chi2 / n
            r, k = cmatrix.shape
            min_dim = min(k - 1, r - 1)
            if min_dim == 0:
                # A variable with a single category has no measurable association.
                return np.nan
            return np.sqrt(phi2 / min_dim)

        cramers_matrix = pd.DataFrame(np.zeros((len(categorical_cols), len(categorical_cols))),
                                      index=categorical_cols, columns=categorical_cols)
        for row_feature in categorical_cols:
            for col_feature in categorical_cols:
                if row_feature == col_feature:
                    cramers_matrix.loc[row_feature, col_feature] = 1.0
                else:
                    cramers_matrix.loc[row_feature, col_feature] = cramers_v(
                        cat_df[row_feature], cat_df[col_feature]
                    )

        fig = plt.figure(figsize=(10, 8))
        sns.heatmap(cramers_matrix, annot=True, fmt=".2f", cmap="viridis", linewidths=0.5)
        plt.title("Cramér's V Correlation (Categorical Features)", color='darkgreen', fontsize=14)
        st.pyplot(fig)
        plt.close(fig)

        # ================== Build the Excel report ==================
        download_df = input_df.copy()
        download_df['Predicted Region'] = region_pred
        download_df['Region Confidence (%)'] = region_confidence
        download_df['Predicted Month'] = month_pred
        download_df['Month Confidence (%)'] = month_confidence

        # Feature importances of both targets side by side.
        fi_region = pd.Series(gb_region.feature_importances_, index=feature_names, name='Region FI')
        fi_month = pd.Series(gb_month.feature_importances_, index=feature_names, name='Month FI')
        fi_df = pd.concat([fi_region, fi_month], axis=1).reset_index().rename(columns={'index': 'Feature'})

        pval_df = pval_matrix.reset_index().rename(columns={'index': 'Feature1'})
        cramers_df = cramers_matrix.reset_index().rename(columns={'index': 'Feature1'})
        corr_df = corr_matrix  # Pearson correlation matrix computed above

        excel_buffer = io.BytesIO()
        # One row per line of the explanation text.
        explain_carmer_to_save = pd.DataFrame(explain_carmer.split('\n'), columns=['Explanation'])
        with pd.ExcelWriter(excel_buffer, engine='xlsxwriter') as writer:
            download_df.to_excel(writer, sheet_name='Prediction', index=False)
            most_similar_row.to_excel(writer, sheet_name='Similar row table', index=False)
            fi_df.to_excel(writer, sheet_name='Feature Importances', index=False)
            pval_df.to_excel(writer, sheet_name='Pearson p-values', index=False)
            cramers_df.to_excel(writer, sheet_name='Cramers V', index=False)
            # Both frames share the 'Cramers V' sheet. Writing the explanation at
            # the default position would overwrite the first column of the matrix,
            # so it is placed below the matrix with one blank row in between.
            explain_carmer_to_save.to_excel(
                writer,
                sheet_name='Cramers V',
                index=False,
                startrow=len(cramers_df) + 2,
            )
            corr_df.to_excel(writer, sheet_name='Correlation Matrix', index=True)

        st.download_button(
            label="⬇️ Download Rabies Prediction Data",
            data=excel_buffer.getvalue(),
            file_name="Rabies_analysis.xlsx",
            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
        )

    except Exception as e:
        # Top-level UI boundary: surface any failure to the user instead of crashing.
        st.error(f"❌ שגיאה: {str(e)}")