Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import seaborn as sns | |
| import pandas as pd | |
| import numpy as np | |
| from sklearn.preprocessing import StandardScaler | |
| from sklearn.linear_model import LinearRegression | |
| from sklearn.ensemble import RandomForestRegressor | |
| from sklearn.svm import SVR | |
| from sklearn.neighbors import KNeighborsRegressor | |
| from sklearn.metrics import r2_score | |
| from sklearn.model_selection import train_test_split | |
| from scipy.stats import spearmanr | |
| import plotly.graph_objects as go | |
| from xgboost import XGBRegressor | |
# Global configuration: seed shared by every randomised step (split, sampling, models).
RANDOM_STATE = 42

# Custom CSS injected once to restyle tabs, expanders, buttons and headings.
_CUSTOM_CSS = """
<style>
.main {
background-color: #f8f9fa;
}
.stTabs [data-baseweb="tab-list"] {
gap: 8px;
background-color: white;
padding: 10px;
border-radius: 8px;
box-shadow: 0 1px 3px rgba(0,0,0,0.1);
}
.stTabs [data-baseweb="tab"] {
background-color: #f8f9fa;
border-radius: 6px;
padding: 10px 20px;
font-weight: 500;
}
.stTabs [aria-selected="true"] {
background-color: #0066cc;
color: white;
}
div[data-testid="stExpander"] {
background-color: white;
border: 1px solid #e0e0e0;
border-radius: 8px;
margin-bottom: 12px;
box-shadow: 0 1px 2px rgba(0,0,0,0.05);
}
div[data-testid="stExpander"] summary {
font-weight: 600;
color: #1a1a1a;
padding: 12px;
}
.stButton>button {
border-radius: 6px;
font-weight: 600;
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
}
h1 {
color: #1a1a1a;
font-weight: 700;
}
h2, h3 {
color: #333333;
font-weight: 600;
}
</style>
"""

# Short description displayed under the page title.
_INTRO_TEXT = """
Cette application évalue la capacité à imputer chaque variable d'un dataset en utilisant les autres variables.
"""

# Page setup is the first Streamlit call of the script; styling and header follow.
st.set_page_config(page_title="Analyse d'imputation", layout="wide")
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)
st.title("🔍 Analyse de fiabilité de l'imputation")
st.markdown(_INTRO_TEXT)
# Sidebar: data source selection and analysis parameters.
def _keep_complete_numeric(frame):
    """Keep only the numeric columns of *frame* and drop rows with missing values."""
    return frame.select_dtypes(include=[np.number]).dropna()


@st.cache_data(show_spinner=False)
def _list_seaborn_datasets():
    """Return the names of the Seaborn example datasets.

    Cached so the listing is not requested again on every script rerun
    (each widget interaction reruns the whole script).
    """
    return sns.get_dataset_names()


with st.sidebar:
    # Launch button; its value is read later by the main area.
    run_analysis = st.button("🚀 Lancer l'analyse", type="primary", use_container_width=True)
    st.header("⚙️ Configuration")

    # Data source
    data_source = st.radio(
        "Source des données",
        ["Jeu de données Seaborn", "Importer un fichier"],
        label_visibility="visible",
    )

    # `df` stays None until a usable dataset has been loaded.
    df = None
    if data_source == "Importer un fichier":
        uploaded_file = st.file_uploader("Importer un fichier CSV", type=["csv"])
        if uploaded_file is not None:
            dataset_name = uploaded_file.name
            try:
                # sep=None with the python engine lets pandas sniff the delimiter.
                df = _keep_complete_numeric(
                    pd.read_csv(uploaded_file, sep=None, engine='python')
                )
            except Exception as e:
                st.error(f"Erreur : {e}")
                df = None
    else:
        # Example datasets excluded from the selector.
        excluded_datasets = ['anagrams', 'anscombe', 'attention', 'brain_networks',
                             'car_crashes', 'dowjones', 'exercise', 'fmri', 'flights', 'geyser',
                             'planets', 'seaice']
        available_datasets = [
            d for d in sorted(_list_seaborn_datasets()) if d not in excluded_datasets
        ]
        default_dataset = "iris"
        default_index = (
            available_datasets.index(default_dataset)
            if default_dataset in available_datasets
            else 0
        )
        dataset_name = st.selectbox(
            "Dataset d'exemple",
            available_datasets,
            index=default_index,
        )
        try:
            df = _keep_complete_numeric(sns.load_dataset(dataset_name))
        except Exception as e:
            st.error(f"Erreur : {e}")
            df = None

    # Same emptiness check for both sources: a dataset with no complete
    # numeric row cannot be analysed.
    if df is not None and len(df) == 0:
        st.error("❌ Aucune donnée numérique après nettoyage.")
        df = None

    if df is not None and len(df.columns) > 1:
        st.subheader("Paramètres")

        # Modelling settings
        with st.expander("▶ Modélisation", expanded=True):
            algo = st.selectbox(
                "Algorithme de régression",
                ["Régression Linéaire", "Random Forest", "SVR", "KNN", "XGBoost"],
                help="Algorithme utilisé pour prédire chaque variable",
            )
            test_size = st.slider(
                "Taille de l'ensemble test (%)",
                min_value=10,
                max_value=50,
                value=30,
                step=5,
                help="Pourcentage des données pour le test",
            )

        # Cleaning and filtering settings
        with st.expander("▶ Réglages", expanded=False):
            corr_threshold = st.slider(
                "Seuil de corrélation (Spearman)",
                min_value=0.5,
                max_value=0.99,
                value=0.92,
                step=0.01,
                help="Variables avec corrélation > seuil seront considérées comme jumelles",
            )
            outlier_threshold = st.slider(
                "Seuil de suppression des outliers (écart-types)",
                min_value=1.0,
                max_value=6.0,
                value=6.0,
                step=0.5,
                help="Supprime les valeurs à plus de X écart-types de la moyenne",
            )
            sample_size = st.slider(
                "Échantillon du dataset (%)",
                min_value=10,
                max_value=100,
                value=100,
                step=10,
                help="Pourcentage du dataset à utiliser pour l'analyse",
            )
    elif df is not None:
        st.warning("⚠️ Le dataset doit contenir au moins 2 variables numériques.")
    else:
        st.info("👈 Veuillez sélectionner ou importer un jeu de données.")
# Utility functions
def get_model(algo_name):
    """Build a new, unfitted regressor for the algorithm label picked in the UI.

    Args:
        algo_name: Label of the algorithm ("Régression Linéaire",
            "Random Forest", "SVR" or "XGBoost").

    Returns:
        The matching estimator instance. Any other label (including "KNN")
        yields a 5-neighbour ``KNeighborsRegressor``.
    """
    # Factories are only invoked for the matching label, so exactly one
    # estimator is instantiated per call.
    factories = (
        ("Régression Linéaire", lambda: LinearRegression()),
        ("Random Forest", lambda: RandomForestRegressor(
            n_estimators=100, random_state=RANDOM_STATE, n_jobs=-1)),
        ("SVR", lambda: SVR(kernel='rbf')),
        ("XGBoost", lambda: XGBRegressor(
            n_estimators=100, random_state=RANDOM_STATE, n_jobs=-1, verbosity=0)),
    )
    for label, build in factories:
        if algo_name == label:
            return build()
    # Fallback: KNN
    return KNeighborsRegressor(n_neighbors=5)
def remove_outliers(df, threshold):
    """Drop every row holding a value further than *threshold* std-devs from its column mean.

    All column means and standard deviations are computed once, on the input
    frame, so the outcome does not depend on column order and removing a row
    because of one column never shifts the statistics used for another.

    Args:
        df: Numeric DataFrame to filter; it is not modified.
        threshold: Maximum allowed distance from the column mean, expressed
            in standard deviations of that column.

    Returns:
        A new DataFrame containing only the rows where every column is
        within its limit.
    """
    if df.empty:
        return df.copy()

    center = df.mean()
    # A column whose standard deviation is undefined (NaN, e.g. a single row)
    # cannot define an outlier: give it an infinite limit instead of letting
    # the NaN comparison reject every row.
    limit = (threshold * df.std()).fillna(np.inf)

    within_limit = (df - center).abs().le(limit, axis="columns")
    return df[within_limit.all(axis=1)].copy()
def remove_twin_variables(X, threshold):
    """Drop "twin" variables, i.e. columns almost redundant with a later column.

    The absolute Spearman correlation is computed for every pair of columns.
    A column is dropped when its correlation with at least one column placed
    after it in ``X`` is strictly greater than *threshold*, so within a group
    of twins the last column is the one that is kept.

    Args:
        X: DataFrame of predictors.
        threshold: Absolute Spearman correlation above which two columns are
            considered twins.

    Returns:
        A tuple ``(X_without_twins, dropped)`` where ``dropped`` lists the
        removed column names in their original column order.
    """
    corr_matrix = X.corr(method='spearman').abs()

    # Keep only the cells strictly above the diagonal so each pair is
    # examined once and self-correlations are ignored.
    above_diagonal = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    upper_tri = corr_matrix.where(above_diagonal)

    # Row i is flagged when it exceeds the threshold against any later column.
    has_later_twin = (upper_tri > threshold).any(axis=1)
    to_drop = [name for name, flagged in has_later_twin.items() if flagged]

    return X.drop(columns=to_drop), to_drop
def backward_elimination(X, y, p_threshold=0.05):
    """Select features by backward elimination on OLS p-values.

    An ordinary least squares model with intercept is fitted repeatedly; at
    each step the feature with the highest p-value is removed, until every
    remaining feature has a p-value at or below *p_threshold* or no feature
    is left.

    Args:
        X: DataFrame of candidate predictors.
        y: Target values aligned with ``X``.
        p_threshold: Significance level a feature must reach to be kept.

    Returns:
        The list of retained column names (possibly empty).
    """
    import statsmodels.api as sm

    # has_constant="add" forces the intercept column to be created. With the
    # default ("skip"), a column of X that statsmodels detects as constant
    # prevents 'const' from being added and the lookup below raises KeyError.
    X_with_const = sm.add_constant(X, has_constant="add")
    selected_features = list(X.columns)

    while selected_features:
        model = sm.OLS(y, X_with_const[['const'] + selected_features]).fit()
        # Ignore the intercept by label rather than by position.
        p_values = model.pvalues.drop('const')
        if p_values.max() > p_threshold:
            selected_features.remove(p_values.idxmax())
        else:
            break

    return selected_features
def evaluate_imputation(df, target_col, corr_threshold, test_size, algo):
    """Estimate how well *target_col* can be imputed from the other columns.

    Steps: standardise the predictors, remove twin predictors, split into
    train/test, optionally run backward elimination (linear regression only),
    fit the chosen model and score it with R² on the test set.

    Args:
        df: Numeric DataFrame holding the target and its predictors.
        target_col: Name of the column to predict.
        corr_threshold: Spearman threshold passed to ``remove_twin_variables``.
        test_size: Size of the test set, as a percentage.
        algo: Algorithm label passed to ``get_model``.

    Returns:
        A tuple ``(r2, selected_features, dropped_twins)``.
        ``r2`` is ``None`` when the evaluation cannot be run (fewer than 10
        rows, no predictor, or a failed split), ``0.0`` when no predictor
        survives or the model fails, and otherwise the test R² floored at 0.
    """
    X = df.drop(columns=[target_col])
    y = df[target_col]

    # Minimum amount of data required
    if len(X) < 10 or len(X.columns) == 0:
        return None, [], []

    # Standardisation
    # NOTE(review): the scaler is fitted on all rows before the train/test
    # split, so test rows influence the scaling — confirm this is intended.
    scaler = StandardScaler()
    X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

    # Twin removal
    X_filtered, dropped_twins = remove_twin_variables(X_scaled, corr_threshold)
    if len(X_filtered.columns) == 0:
        return 0.0, [], dropped_twins

    # Train/test split
    try:
        X_train, X_test, y_train, y_test = train_test_split(
            X_filtered, y, test_size=test_size / 100, random_state=RANDOM_STATE
        )
    except Exception:
        # `Exception` rather than a bare except, so KeyboardInterrupt and
        # SystemExit are never swallowed.
        return None, [], dropped_twins

    # Backward elimination (linear regression only, fixed 0.05 threshold)
    selected_features = list(X_train.columns)
    if algo == "Régression Linéaire" and len(X_train.columns) > 1:
        try:
            selected_features = backward_elimination(X_train, y_train, p_threshold=0.05)
        except Exception:
            # Best effort: if the selection fails, keep every predictor.
            pass

    if not selected_features:
        return 0.0, [], dropped_twins

    # Model training and scoring
    model = get_model(algo)
    try:
        model.fit(X_train[selected_features], y_train)
        y_pred = model.predict(X_test[selected_features])
        r2 = r2_score(y_test, y_pred)
    except Exception:
        return 0.0, selected_features, dropped_twins

    # A negative R² (worse than predicting the mean) is reported as 0.0.
    return max(0.0, r2), selected_features, dropped_twins
# Main interface
def _imputation_status(r2):
    """Return the status label shown in the results table for an R² score."""
    if r2 > 0.7:
        return '✅ Excellent'
    if r2 > 0.5:
        return '⚠️ Moyen'
    return '❌ Difficile'


def _bar_color(r2):
    """Return the bar colour for an R² score, using the same cut-offs as the status label."""
    if r2 > 0.7:
        return '#28a745'
    if r2 > 0.5:
        return '#ffc107'
    return '#dc3545'


def _build_results_figure(results_df, algo_label):
    """Build the bar chart of R² per variable, with the two threshold lines."""
    fig = go.Figure()
    fig.add_trace(go.Bar(
        x=results_df['Variable'],
        y=results_df['R²'],
        marker_color=[_bar_color(r) for r in results_df['R²']],
        text=results_df['R²'].round(3),
        textposition='outside',
        hovertemplate='<b>%{x}</b><br>R²: %{y:.3f}<extra></extra>'
    ))
    fig.add_hline(y=0.7, line_dash="dash", line_color="#28a745",
                  annotation_text="Excellent (0.7)", annotation_position="right")
    fig.add_hline(y=0.5, line_dash="dash", line_color="#ffc107",
                  annotation_text="Acceptable (0.5)", annotation_position="right")
    fig.update_layout(
        title=f"Fiabilité de l'imputation par variable ({algo_label})",
        xaxis_title="Variable",
        yaxis_title="R² Score",
        height=470,
        showlegend=False,
        hovermode='x',
        plot_bgcolor='white',
        paper_bgcolor='white',
        font=dict(family="Arial, sans-serif", size=12, color="#333333")
    )
    fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='#f0f0f0')
    fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='#f0f0f0')
    return fig


if df is not None and len(df.columns) > 1:
    tab1, tab2, tab3, tab4, tab5 = st.tabs(["📊 Analyse", "📋 Détails par variable", "📈 Statistiques", "💾 Données brutes", "ℹ️ Information"])

    # Raw data preview
    with tab4:
        st.dataframe(df.head(20), use_container_width=True)

    # Help / methodology
    with tab5:
        st.header("À propos de l'analyse")
        st.markdown(f"""
**Nom du dataset :** {dataset_name}
**Dataset :** {len(df)} lignes × {len(df.columns)} colonnes
**Interprétation du R² :**
- **R² > 0.7** : Imputation très fiable ✅
- **0.5 < R² < 0.7** : Imputation acceptable ⚠️
- **R² < 0.5** : Imputation difficile ❌
**Méthodologie :**
1. Chaque variable est tour à tour considérée comme cible
2. Les autres variables servent de prédicteurs
3. Suppression des variables jumelles (corrélation > {corr_threshold})
4. Évaluation avec {algo}
""")

    # Analysis: computed when the button is clicked, displayed from the session
    with tab1:
        analysis_failed = False
        if run_analysis:
            # Dataset preparation: outlier removal then sampling
            df_processed = df.copy()

            # The slider maximum (6.0) means "no outlier removal".
            if outlier_threshold < 6.0:
                rows_before = len(df_processed)
                df_processed = remove_outliers(df_processed, outlier_threshold)
                removed = rows_before - len(df_processed)
                # Guard the percentage against an empty dataset.
                removed_pct = removed / rows_before * 100 if rows_before else 0.0
                st.info(f"🧹 Outliers supprimés : {removed} lignes ({removed_pct:.1f}%)")

            if sample_size < 100:
                df_processed = df_processed.sample(frac=sample_size/100, random_state=RANDOM_STATE)
                st.info(f"📊 Échantillon utilisé : {len(df_processed)} lignes ({sample_size}% du dataset)")

            results = []
            progress_bar = st.progress(0)
            status_text = st.empty()
            n_columns = len(df_processed.columns)

            # Each column is evaluated in turn as the imputation target.
            for idx, col in enumerate(df_processed.columns):
                status_text.text(f"Analyse de '{col}' ({idx+1}/{n_columns})...")
                r2, selected_vars, dropped_twins = evaluate_imputation(
                    df_processed, col, corr_threshold, test_size, algo
                )
                if r2 is not None:
                    results.append({
                        'Variable': col,
                        'R²': r2,
                        'Prédicteurs': len(selected_vars),
                        'Jumelles': len(dropped_twins),
                        'Statut': _imputation_status(r2)
                    })
                progress_bar.progress((idx + 1) / n_columns)

            status_text.empty()
            progress_bar.empty()

            if results:
                # Stored in the session so every tab can display the results
                # on later reruns (the button is only True on the click run).
                st.session_state['results_df'] = pd.DataFrame(results).sort_values('R²', ascending=False)
                st.session_state['results_algo'] = algo
            else:
                analysis_failed = True
                # Do not keep showing results of a previous run after a failure.
                st.session_state.pop('results_df', None)
                st.session_state.pop('results_algo', None)
                st.error("❌ Aucun résultat. Vérifiez vos données.")

        if not analysis_failed:
            if 'results_df' in st.session_state:
                st.subheader("📈 Résultats de l'analyse")
                fig = _build_results_figure(
                    st.session_state['results_df'],
                    # Title shows the algorithm that produced the stored results.
                    st.session_state.get('results_algo', algo),
                )
                st.plotly_chart(fig, use_container_width=True)
            else:
                st.info("👈 Cliquez sur le bouton 'Lancer l'analyse' dans la sidebar")

    # Per-variable details table
    with tab2:
        if 'results_df' in st.session_state:
            st.subheader("📋 Détails par variable")
            results_display = st.session_state['results_df'].copy()
            styled_df = results_display.style.format({
                'R²': '{:.3f}'
            }).background_gradient(subset=['R²'], cmap='RdYlGn', vmin=0, vmax=1)
            st.dataframe(styled_df, use_container_width=True, hide_index=True, height=400)
        else:
            st.info("👈 Lancez d'abord une analyse pour voir les détails par variable")

    # Summary statistics
    with tab3:
        if 'results_df' in st.session_state:
            st.subheader("📈 Statistiques récapitulatives")
            results_df = st.session_state['results_df']
            scores = results_df['R²']
            total = len(results_df)
            # One metric per quality bucket; the three buckets partition the scores.
            buckets = [
                ("Nombre d'imputations fiables", scores > 0.7),
                ("Nombre d'imputations acceptables", (scores > 0.5) & (scores <= 0.7)),
                ("Nombre d'imputations déconseillées", scores <= 0.5),
            ]
            for column, (label, mask) in zip(st.columns(3), buckets):
                count = int(mask.sum())
                with column:
                    st.metric(label, count, delta=f"{count/total*100:.1f}%")
        else:
            st.info("👈 Lancez d'abord une analyse pour voir les statistiques")
else:
    st.info("👈 Veuillez sélectionner un jeu de données avec au moins 2 variables numériques.")