import io

import joblib
import matplotlib.cm as cm
import matplotlib.pyplot as plt  # BUG FIX: was `import matplotlib as plt` (wrong module)
import numpy as np
import pandas as pd
import seaborn as sns
import streamlit as st
from scipy.stats import pearsonr, chi2_contingency
from sklearn.tree import plot_tree

# st.set_page_config must be the FIRST Streamlit call in the script.
st.set_page_config(page_title="Rabies Prediction", layout="centered")

# ================== dynamic styling ==================
st.markdown(""" """, unsafe_allow_html=True)


def compute_similarity(df: pd.DataFrame, inp: pd.DataFrame, columns: list) -> pd.DataFrame:
    """Score every row of *df* by similarity to the single record in *inp*.

    Numeric columns contribute ``1 - |row - input| / max(column)`` (guarded
    against a zero maximum; NOTE(review): this normalizes by the column max,
    not the min-max range — confirm this is the intended metric). Categorical
    columns contribute 1 on exact match, 0 otherwise. The per-row score is
    the mean over *columns*.

    Parameters
    ----------
    df : existing dataset to search.
    inp : DataFrame holding exactly one record to compare against.
    columns : column names to include in the comparison.

    Returns
    -------
    A *copy* of ``df`` with a ``'similarity'`` column, sorted descending.
    (BUG FIX: previously mutated the caller's DataFrame in place.)
    """
    input_row = inp.iloc[0]
    scores = []
    for _, row in df.iterrows():
        total = 0.0
        for col in columns:
            if pd.api.types.is_numeric_dtype(df[col]):
                max_val = df[col].max()
                total += 1 - abs(row[col] - input_row[col]) / (max_val if max_val != 0 else 1)
            else:
                total += (row[col] == input_row[col])
        scores.append(total / len(columns))
    result = df.copy()
    result['similarity'] = scores
    return result.sort_values('similarity', ascending=False)


# ================== configuration ==================
DATA_PATH = "./src/Rabies__Weather__War_Combined_1.4.25.xlsx"
MODEL_PATH = "./src/final_model_gradient_boosting.pkl"
OHE_PATH = "./src/preprocessing_onehot_encoder.pkl"
SCALER_PATH = "./src/preprocessing_scaler.pkl"
TARGET_ENCODERS_PATH = "./src/preprocessing_target_encoders.pkl"

label_cols = ['Animal Species', 'Rabies Species', 'Settlement', 'Region_Weather']
target_cols = ['Region', 'Month']
num_cols = ['x', 'y', 'Avg Temperature', 'Monthly Precipitation (mm)', 'Rainy Days']
extra_cols = ['War in Israel', 'Year']  # extra columns the model requires

# ================== load data, model and preprocessors ==================
df = pd.read_excel(DATA_PATH)
model = joblib.load(MODEL_PATH)
ohe = joblib.load(OHE_PATH)
scaler = joblib.load(SCALER_PATH)
target_encoders = joblib.load(TARGET_ENCODERS_PATH)

# ================== unique value lists for the selectboxes ==================
animal_species_list = sorted(df['Animal Species'].dropna().unique())
rabies_species_list = sorted(df['Rabies Species'].dropna().unique())
settlement_list = sorted(df['Settlement'].dropna().unique())
region_weather_list = sorted(df['Region_Weather'].dropna().unique())

# ================== page header ==================
st.title("🐶 Rabies / Weather / War Prediction")
st.markdown("הזן נתונים חדשים לקבלת תחזית עבור **Region** ו־**Month**")

# ================== input form ==================
with st.form("input_form"):
    st.subheader("✍️ הזן פרטי רשומה חדשה")

    # categorical inputs, restricted to values seen in the training data
    animal_species = st.selectbox("Animal Species", animal_species_list)
    rabies_species = st.selectbox("Rabies Species", rabies_species_list)
    settlement = st.selectbox("Settlement", settlement_list)
    region_weather = st.selectbox("Region Weather", region_weather_list)
    war_in_israel = st.selectbox("War in Israel", ["Yes", "No"])

    # numeric inputs
    x = st.number_input("x", value=0.0)
    y = st.number_input("y", value=0.0)
    avg_temp = st.number_input("Avg Temperature", value=20.0)
    precipitation = st.number_input("Monthly Precipitation (mm)", value=50.0)
    rainy_days = st.number_input("Rainy Days", value=10.0)
    year = st.number_input("Year", min_value=1900, max_value=2100, value=2025)

    submitted = st.form_submit_button("🔮 Make Prediction >> ")

# ================== prediction ==================
if submitted:
    try:
        # map Yes/No to the 0/1 encoding used at training time
        war_in_israel_val = 1 if war_in_israel == "Yes" else 0

        # single-row DataFrame in the raw (pre-preprocessing) schema
        input_df = pd.DataFrame([{
            'Animal Species': animal_species,
            'Rabies Species': rabies_species,
            'Settlement': settlement,
            'Region_Weather': region_weather,
            'x': x,
            'y': y,
            'Avg Temperature': avg_temp,
            'Monthly Precipitation (mm)': precipitation,
            'Rainy Days': rainy_days,
            'War in Israel': war_in_israel_val,
            'Year': year
        }])

        # --- one-hot encode the categorical columns ---
        encoded = ohe.transform(input_df[label_cols])
        encoded_df = pd.DataFrame(encoded,
                                  columns=ohe.get_feature_names_out(label_cols),
                                  index=input_df.index)

        # --- scale the numeric columns ---
        scaled_nums = scaler.transform(input_df[num_cols])
        scaled_df = pd.DataFrame(scaled_nums, columns=num_cols, index=input_df.index)

        # --- assemble the feature matrix in the column order the model expects ---
        X_new = pd.concat([scaled_df, encoded_df, input_df[extra_cols]], axis=1)
        X_new = X_new[model.estimators_[0].feature_names_in_]

        # --- predict both targets (MultiOutput model: [Region, Month]) ---
        y_pred = model.predict(X_new)[0]

        # per-class probabilities of each underlying estimator
        region_proba = model.estimators_[0].predict_proba(X_new)[0]  # estimator[0] = Region
        month_proba = model.estimators_[1].predict_proba(X_new)[0]   # estimator[1] = Month

        # decode the label-encoded predictions back to their original values
        region_pred = target_encoders['Region'].inverse_transform([y_pred[0]])[0]
        month_pred = target_encoders['Month'].inverse_transform([y_pred[1]])[0]

        # confidence percentages; assumes predict_proba columns line up with
        # the 0..K-1 label encoding — TODO confirm estimator.classes_ ordering
        region_confidence = region_proba[y_pred[0]] * 100
        month_confidence = month_proba[y_pred[1]] * 100

        # ================== Alerts Dictionary per Target ==================
        alerts_dict_region = {
            'Galil Golan': "⚠️ Region is 'Galil Golan', check coordinates, temperature, and precipitation values for consistency.",
            'Amakim': "⚠️ Region is 'Amakim', unusual feature values may affect prediction.",
            'Shfela Vahar': "⚠️ Region is 'Shfela Vahar', verify X/Y coordinates and weather features.",
            'Hasharon': "⚠️ Region is 'Hasharon', check numeric inputs for anomalies.",
            'Galil Maaravi': "⚠️ Region is 'Galil Maaravi', some features might be outside typical range.",
            'Negev': "⚠️ Region is 'Negev', check for extreme values in coordinates or weather data."
        }

        alerts_dict_month = {
            "January": "⚠️ Month is January, check if temperature, precipitation, and rainy days align with typical values.",
            "February": "⚠️ Month is February, unusual feature values may affect predictions.",
            "March": "⚠️ Month is March, verify coordinates and weather features for consistency.",
            "April": "⚠️ Month is April, check numeric inputs for anomalies.",
            "May": "⚠️ Month is May, some features might be outside typical range.",
            "June": "⚠️ Month is June, check for extreme values in coordinates or weather data.",
            "July": "⚠️ Month is July, unusual conditions may affect predictions.",
            "August": "⚠️ Month is August, verify temperature and precipitation values.",
            "September": "⚠️ Month is September, ensure numeric inputs are within reasonable range.",
            "October": "⚠️ Month is October, check if weather features match typical patterns.",
            "November": "⚠️ Month is November, anomalies in inputs may affect prediction.",
            "December": "⚠️ Month is December, verify coordinate and weather inputs."
        }

        # ================== Run alerts for both targets ==================
        # .get with a fallback so an unlisted label no longer raises KeyError
        st.warning(alerts_dict_month.get(month_pred, f"⚠️ Month is {month_pred}, verify input values."))
        st.warning(alerts_dict_region.get(region_pred, f"⚠️ Region is {region_pred}, verify input values."))

        st.success(f"✅ Model Prediction: **Region = {region_pred} ({region_confidence:.2f}%), "
                   f"Month = {month_pred} ({month_confidence:.2f}%)**")

        st.subheader("🟢 Most Similar Record to Your Input (Similarity Based)")
        columns_to_compare = label_cols + num_cols + extra_cols  # all relevant columns
        most_similar_row = compute_similarity(df, input_df, columns_to_compare)
        st.write("The record from the existing dataset that is most similar to your input:")
        st.dataframe(most_similar_row)

        # feature names in model order, used for importances and tree plots
        feature_names = X_new.columns.tolist()

        # first internal decision tree of each GradientBoosting estimator
        gb_region = model.estimators_[0].estimators_[0, 0]  # internal tree for Region
        gb_month = model.estimators_[1].estimators_[0, 0]   # internal tree for Month
        gb_targets = [gb_region, gb_month]

        # ============================== plotting ==============================
        target_names = ['Region', 'Month']
        for target, gb_model in zip(target_names, gb_targets):
            st.subheader(f'Target Name : {target}')

            # top-N feature importances, descending
            importances = gb_model.feature_importances_
            indices = np.argsort(importances)[::-1]
            top_n = 4
            top_features = [feature_names[j] for j in indices[:top_n]]
            top_importances = importances[indices[:top_n]]

            plot_col_left, plot_col_right = st.columns(2)

            # ===== feature-importance bar chart =====
            with plot_col_left:
                plt.figure(figsize=(8, 6))
                colors = cm.viridis(np.linspace(0, 1, top_n))
                plt.barh(top_features[::-1], top_importances[::-1], color=colors)
                plt.xlabel("Feature Importance")
                plt.title(f"Top 4 Features ({target})", color='darkblue')
                st.pyplot(plt.gcf())
                plt.clf()

            # ===== example decision tree =====
            with plot_col_right:
                plt.figure(figsize=(8, 6))
                plot_tree(gb_model, feature_names=feature_names, filled=True,
                          max_depth=3, rounded=True, fontsize=10)
                plt.title(f"Decision Tree (Depth=3) for {target}", color='darkgreen')
                st.pyplot(plt.gcf())
                plt.clf()

        # ================== Numeric Correlation ==================
        st.subheader("📊 Correlation Matrix (Numeric Features)")
        st.markdown("""
The correlation matrix shows the pairwise **Pearson correlation coefficients** between numeric features.
- Values close to **1** indicate a strong positive correlation.
- Values close to **-1** indicate a strong negative correlation.
- Values around **0** indicate little or no linear correlation.
""")
        numeric_df = df[num_cols]
        corr_matrix = numeric_df.corr()

        plt.figure(figsize=(8, 6))
        sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm", linewidths=0.5)
        plt.title("Correlation Matrix", color='darkblue', fontsize=14)
        st.pyplot(plt.gcf())
        plt.clf()

        st.subheader("📑 Pearson p-values (Numeric Features)")
        st.markdown("""
The Pearson p-values indicate the statistical significance of the correlation between numeric features.
- A **small p-value (typically < 0.05)** suggests that the correlation is statistically significant.
- A **large p-value** suggests that the correlation could be due to random chance.
- Diagonal cells are **NaN** because a feature's correlation with itself is not tested.
""")
        pval_matrix = pd.DataFrame(np.zeros((len(num_cols), len(num_cols))),
                                   columns=num_cols, index=num_cols)
        for a, col_a in enumerate(num_cols):
            for b, col_b in enumerate(num_cols):
                # diagonal is NaN; off-diagonal holds the Pearson test p-value
                pval_matrix.loc[col_a, col_b] = (
                    np.nan if a == b else pearsonr(numeric_df[col_a], numeric_df[col_b])[1]
                )
        st.dataframe(pval_matrix.style.background_gradient(cmap="coolwarm", axis=None).format("{:.3f}"))

        # ================== Categorical Correlation (Cramér's V) ==================
        st.subheader("📊 Cramér's V (Categorical Features + Targets)")
        explain_carmer = """
Cramér's V measures the strength of association between categorical variables.
- Values range from **0 to 1**:
- **0** → no association
- **1** → perfect association
- Higher values indicate stronger relationships between the categories.
- This includes both the original categorical features and the target variables (e.g., Region, Month).
"""
        st.markdown(explain_carmer)

        categorical_cols = label_cols + target_cols
        cat_df = df[categorical_cols].dropna()

        def cramers_v(x, y):
            """Cramér's V association (0..1) between two categorical series."""
            cmatrix = pd.crosstab(x, y)
            chi2 = chi2_contingency(cmatrix)[0]
            n = cmatrix.sum().sum()
            phi2 = chi2 / n
            r, k = cmatrix.shape
            return np.sqrt(phi2 / min(k - 1, r - 1))

        cramers_matrix = pd.DataFrame(np.zeros((len(categorical_cols), len(categorical_cols))),
                                      index=categorical_cols, columns=categorical_cols)
        for cat_a in categorical_cols:
            for cat_b in categorical_cols:
                cramers_matrix.loc[cat_a, cat_b] = (
                    1.0 if cat_a == cat_b else cramers_v(cat_df[cat_a], cat_df[cat_b])
                )

        plt.figure(figsize=(10, 8))
        sns.heatmap(cramers_matrix, annot=True, fmt=".2f", cmap="viridis", linewidths=0.5)
        plt.title("Cramér's V Correlation (Categorical Features)", color='darkgreen', fontsize=14)
        st.pyplot(plt.gcf())
        plt.clf()

        # ================== Excel export ==================
        download_df = input_df.copy()
        download_df['Predicted Region'] = region_pred
        download_df['Region Confidence (%)'] = region_confidence
        download_df['Predicted Month'] = month_pred
        download_df['Month Confidence (%)'] = month_confidence

        # feature importances of the two internal trees, side by side
        fi_region = pd.Series(gb_region.feature_importances_, index=feature_names, name='Region FI')
        fi_month = pd.Series(gb_month.feature_importances_, index=feature_names, name='Month FI')
        fi_df = pd.concat([fi_region, fi_month], axis=1).reset_index().rename(columns={'index': 'Feature'})

        pval_df = pval_matrix.reset_index().rename(columns={'index': 'Feature1'})
        cramers_df = cramers_matrix.reset_index().rename(columns={'index': 'Feature1'})
        corr_df = df[num_cols].corr()  # Pearson correlation matrix

        excel_buffer = io.BytesIO()
        explain_carmer_to_save = pd.DataFrame(explain_carmer.split('\n'), columns=['Explanation'])

        with pd.ExcelWriter(excel_buffer, engine='xlsxwriter') as writer:
            download_df.to_excel(writer, sheet_name='Prediction', index=False)
            most_similar_row.to_excel(writer, sheet_name='Similar row table', index=False)
            fi_df.to_excel(writer, sheet_name='Feature Importances', index=False)
            pval_df.to_excel(writer, sheet_name='Pearson p-values', index=False)
            cramers_df.to_excel(writer, sheet_name='Cramers V', index=False)
            # BUG FIX: was also written to 'Cramers V', clobbering the matrix sheet
            explain_carmer_to_save.to_excel(writer, sheet_name='Cramers V Explanation', index=False)
            corr_df.to_excel(writer, sheet_name='Correlation Matrix', index=True)

        st.download_button(
            label="⬇️ Download Rabies Prediction Data",
            data=excel_buffer.getvalue(),
            file_name="Rabies_analysis.xlsx",
            mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
        )

    except Exception as e:
        st.error(f"❌ שגיאה: {str(e)}")