Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -3,40 +3,93 @@ import seaborn as sns
|
|
| 3 |
import pandas as pd
|
| 4 |
import numpy as np
|
| 5 |
|
| 6 |
-
from sklearn.preprocessing import StandardScaler
|
| 7 |
-
from sklearn.
|
| 8 |
-
from sklearn.
|
| 9 |
-
from sklearn.
|
| 10 |
-
from sklearn.
|
|
|
|
| 11 |
from sklearn.model_selection import train_test_split
|
| 12 |
-
#from scipy.stats import pearsonr
|
| 13 |
from scipy.stats import spearmanr
|
|
|
|
|
|
|
| 14 |
|
| 15 |
-
#
|
| 16 |
-
# Configuration Globale
|
| 17 |
-
# ------------------------------------------------------------
|
| 18 |
-
TEST_SIZE = 0.3
|
| 19 |
RANDOM_STATE = 42
|
| 20 |
|
| 21 |
-
st.set_page_config(page_title="Analyse d'
|
| 22 |
|
| 23 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
st.markdown(
|
| 25 |
"""
|
| 26 |
-
Cette application
|
| 27 |
-
|
| 28 |
-
- Pertinence marginale : corrélation ou information mutuelle avec la cible.
|
| 29 |
-
- Pertinence conditionnelle : valeur ajoutée d'une variable excluant les redondances après contrôle.
|
| 30 |
"""
|
| 31 |
)
|
| 32 |
|
| 33 |
-
#
|
| 34 |
-
# Sidebar: Dataset et Importation
|
| 35 |
-
# ------------------------------------------------------------
|
| 36 |
with st.sidebar:
|
|
|
|
| 37 |
st.header("⚙️ Configuration")
|
| 38 |
|
| 39 |
-
#
|
| 40 |
data_source = st.radio(
|
| 41 |
"Source des données",
|
| 42 |
["Jeu de données Seaborn", "Importer un fichier"],
|
|
@@ -49,41 +102,24 @@ with st.sidebar:
|
|
| 49 |
uploaded_file = st.file_uploader("Importer un fichier CSV", type=["csv"])
|
| 50 |
|
| 51 |
if uploaded_file is not None:
|
|
|
|
| 52 |
try:
|
| 53 |
df = pd.read_csv(uploaded_file, sep=None, engine='python')
|
| 54 |
-
|
| 55 |
-
# Seuil de valeurs manquantes (configurable)
|
| 56 |
-
missing_threshold = st.slider(
|
| 57 |
-
"Seuil max de valeurs manquantes (%)",
|
| 58 |
-
min_value=0,
|
| 59 |
-
max_value=100,
|
| 60 |
-
value=50,
|
| 61 |
-
help="Les colonnes avec plus de X% de valeurs manquantes seront supprimées"
|
| 62 |
-
)
|
| 63 |
-
|
| 64 |
-
# Calcul du pourcentage de valeurs manquantes par colonne
|
| 65 |
-
missing_pct = (df.isnull().sum() / len(df)) * 100
|
| 66 |
-
cols_to_drop = missing_pct[missing_pct > missing_threshold].index.tolist()
|
| 67 |
-
|
| 68 |
-
if cols_to_drop:
|
| 69 |
-
st.info(f"ℹ️ {len(cols_to_drop)} colonne(s) supprimée(s) (>{missing_threshold}% manquantes) : {', '.join(cols_to_drop)}")
|
| 70 |
-
df = df.drop(columns=cols_to_drop)
|
| 71 |
-
|
| 72 |
-
# Suppression des lignes avec valeurs manquantes restantes
|
| 73 |
df = df.dropna()
|
| 74 |
|
| 75 |
if len(df) == 0:
|
| 76 |
-
st.error("❌ Aucune donnée après nettoyage.
|
| 77 |
df = None
|
| 78 |
-
else:
|
| 79 |
-
|
| 80 |
except Exception as e:
|
| 81 |
st.error(f"Erreur : {e}")
|
| 82 |
df = None
|
| 83 |
else:
|
| 84 |
excluded_datasets = ['anagrams', 'anscombe', 'attention', 'brain_networks',
|
| 85 |
-
'car_crashes', 'dowjones','
|
| 86 |
-
'planets','seaice']
|
| 87 |
available_datasets = [d for d in sorted(sns.get_dataset_names()) if d not in excluded_datasets]
|
| 88 |
default_dataset = "iris"
|
| 89 |
default_index = available_datasets.index(default_dataset) if default_dataset in available_datasets else 0
|
|
@@ -92,227 +128,336 @@ with st.sidebar:
|
|
| 92 |
available_datasets,
|
| 93 |
index=default_index
|
| 94 |
)
|
| 95 |
-
#dataset_name = st.selectbox("Dataset d'exemple", available_datasets)
|
| 96 |
try:
|
| 97 |
df = sns.load_dataset(dataset_name)
|
|
|
|
| 98 |
df = df.dropna()
|
| 99 |
-
st.success(f"✅ Jeu '{dataset_name}' chargé")
|
| 100 |
except Exception as e:
|
| 101 |
st.error(f"Erreur : {e}")
|
| 102 |
df = None
|
| 103 |
|
| 104 |
-
if df is not None:
|
| 105 |
-
|
| 106 |
-
y = df[target]
|
| 107 |
-
X = df.drop(columns=[target])
|
| 108 |
|
| 109 |
-
#
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
task = "Regression" if (y.dtype.kind in "ifu" and y.nunique() > 10) else "Classification"
|
| 117 |
-
excluded_features = st.multiselect("Variables à exclure :", X.columns.tolist(), default=[])
|
| 118 |
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 128 |
else:
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
|
| 134 |
-
#
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
if
|
| 138 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 139 |
|
| 140 |
-
|
| 141 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 142 |
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 148 |
|
| 149 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 150 |
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
# ------------------------------------------------------------
|
| 164 |
with tab1:
|
| 165 |
-
if
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 181 |
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
|
| 186 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 187 |
|
| 188 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 189 |
|
| 190 |
-
#
|
| 191 |
-
|
| 192 |
-
st.error("❌ Pas assez de données pour créer les ensembles d'entraînement et de test.")
|
| 193 |
-
st.info(f"Données disponibles : {len(X)} lignes. Minimum requis : 2 lignes.")
|
| 194 |
-
st.stop()
|
| 195 |
|
| 196 |
-
|
|
|
|
| 197 |
|
| 198 |
-
|
| 199 |
-
if X_train_proc.shape[0] == 0 or X_train_proc.shape[1] == 0:
|
| 200 |
-
st.error("❌ Erreur : Les données transformées sont vides.")
|
| 201 |
-
st.info(f"Shape après transformation : {X_train_proc.shape}")
|
| 202 |
-
st.info(f"Variables numériques : {num_cols}")
|
| 203 |
-
st.info(f"Variables catégorielles : {cat_cols}")
|
| 204 |
-
st.stop()
|
| 205 |
|
| 206 |
-
|
|
|
|
| 207 |
|
| 208 |
-
|
| 209 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 210 |
|
| 211 |
-
|
| 212 |
-
|
|
|
|
|
|
|
| 213 |
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 220 |
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
"Importance seule (MI)": mi,
|
| 224 |
-
"Poids dans le modèle": np.abs(coefs),
|
| 225 |
-
"Sens": np.where(coefs > 0, "+", "-")
|
| 226 |
-
})
|
| 227 |
|
| 228 |
-
|
| 229 |
-
# res["Lien direct (Corr)"] = [pearsonr(X_train_proc[:, i], y_train)[0] for i in range(len(feature_names))]
|
| 230 |
-
|
| 231 |
-
if task == "Regression":
|
| 232 |
-
res["Lien direct (Corr)"] = [spearmanr(X_train_proc[:, i], y_train)[0] for i in range(len(feature_names))]
|
| 233 |
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
else:
|
| 245 |
-
res["Score synthétique"] = (mi_n + poids_n) / 2
|
| 246 |
-
|
| 247 |
-
res = res.sort_values("Score synthétique", ascending=False)
|
| 248 |
-
|
| 249 |
-
# Réorganisation des colonnes
|
| 250 |
-
cols = ["Variable", "Score synthétique", "Importance seule (MI)", "Poids dans le modèle", "Sens"]
|
| 251 |
-
if task == "Regression":
|
| 252 |
-
cols = ["Variable", "Score synthétique", "Importance seule (MI)", "Lien direct (Corr)", "Poids dans le modèle", "Sens"]
|
| 253 |
-
|
| 254 |
-
final_df = res[cols].copy()
|
| 255 |
-
|
| 256 |
-
# --- STYLISATION ET AFFICHAGE ---
|
| 257 |
-
# 1. Préparation du style pour la colonne Sens (couleurs)
|
| 258 |
-
def style_sign(val):
|
| 259 |
-
color = 'color: #2ecc71;' if val == '+' else 'color: #e74c3c;'
|
| 260 |
-
return f'{color} font-weight: bold; font-size: 20px;'
|
| 261 |
-
|
| 262 |
-
# 2. Application du formatage (2 décimales) et des gradients
|
| 263 |
-
num_cols_to_style = [c for c in cols if c not in ["Variable", "Sens", "Score synthétique"]]
|
| 264 |
-
|
| 265 |
-
styled_res = (final_df.style
|
| 266 |
-
.format({c: "{:.2f}" for c in cols if c not in ["Variable", "Sens"]})
|
| 267 |
-
.background_gradient(subset=num_cols_to_style, cmap="RdYlGn")
|
| 268 |
-
.map(style_sign, subset=['Sens'])
|
| 269 |
-
)
|
| 270 |
-
|
| 271 |
-
# 3. Affichage avec st.data_editor pour fixer la hauteur (6 lignes env = 250px)
|
| 272 |
-
st.data_editor(
|
| 273 |
-
styled_res,
|
| 274 |
-
use_container_width=True,
|
| 275 |
-
height=250, # Limite la hauteur avec scrollbar
|
| 276 |
-
hide_index=True,
|
| 277 |
-
disabled=True, # Empêche l'édition, agit comme un dataframe
|
| 278 |
-
column_config={
|
| 279 |
-
"Sens": st.column_config.Column(
|
| 280 |
-
"Sens",
|
| 281 |
-
help="Direction de l'influence",
|
| 282 |
-
width="small"
|
| 283 |
-
)
|
| 284 |
-
}
|
| 285 |
-
)
|
| 286 |
-
|
| 287 |
-
st.subheader("📖 Guide de lecture")
|
| 288 |
-
st.markdown(
|
| 289 |
-
"""
|
| 290 |
-
- **Score synthétique** : Note globale d'importance.
|
| 291 |
-
- **Importance seule (MI)** : Mesure la dépendance globale entre la variable et la cible. Contrairement à la corrélation qui ne voit que les lignes droites, l'Information Mutuelle détecte toutes les formes de relations (courbes, motifs complexes, etc.). Elle indique quelle quantité d'information "pure" cette variable partage avec la cible, sans tenir compte des autres variables.
|
| 292 |
-
- **Poids dans le modèle** : Contribution finale au modèle.
|
| 293 |
-
- **Sens (+) / (-)** : Direction de l'impact sur la cible.
|
| 294 |
-
"""
|
| 295 |
-
)
|
| 296 |
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
st.error("❌ Erreur d'analyse : données insuffisantes ou incompatibles")
|
| 300 |
-
st.warning("⚠️ Vérifiez que :")
|
| 301 |
-
st.markdown("""
|
| 302 |
-
- Vous n'avez pas exclu toutes les variables
|
| 303 |
-
- La variable cible choisie est appropriée (elle ne doit pas être identique à une variable prédictive)
|
| 304 |
-
- Il reste suffisamment de données après nettoyage
|
| 305 |
-
- Les variables ont suffisamment de variance
|
| 306 |
-
""")
|
| 307 |
-
else:
|
| 308 |
-
st.error(f"❌ Erreur : {str(e)}")
|
| 309 |
|
| 310 |
-
|
| 311 |
-
|
| 312 |
-
|
| 313 |
-
|
| 314 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 315 |
else:
|
| 316 |
-
st.info("
|
| 317 |
else:
|
| 318 |
-
st.info("👈 Veuillez sélectionner
|
|
|
|
| 3 |
import pandas as pd
|
| 4 |
import numpy as np
|
| 5 |
|
| 6 |
+
from sklearn.preprocessing import StandardScaler
|
| 7 |
+
from sklearn.linear_model import LinearRegression
|
| 8 |
+
from sklearn.ensemble import RandomForestRegressor
|
| 9 |
+
from sklearn.svm import SVR
|
| 10 |
+
from sklearn.neighbors import KNeighborsRegressor
|
| 11 |
+
from sklearn.metrics import r2_score
|
| 12 |
from sklearn.model_selection import train_test_split
|
|
|
|
| 13 |
from scipy.stats import spearmanr
|
| 14 |
+
import plotly.graph_objects as go
|
| 15 |
+
from xgboost import XGBRegressor
|
| 16 |
|
| 17 |
+
# Configuration globale
|
|
|
|
|
|
|
|
|
|
| 18 |
RANDOM_STATE = 42
|
| 19 |
|
| 20 |
+
st.set_page_config(page_title="Analyse d'imputation", layout="wide")
|
| 21 |
|
| 22 |
+
# CSS personnalisé pour un rendu plus professionnel
|
| 23 |
+
st.markdown("""
|
| 24 |
+
<style>
|
| 25 |
+
.main {
|
| 26 |
+
background-color: #f8f9fa;
|
| 27 |
+
}
|
| 28 |
+
.stTabs [data-baseweb="tab-list"] {
|
| 29 |
+
gap: 8px;
|
| 30 |
+
background-color: white;
|
| 31 |
+
padding: 10px;
|
| 32 |
+
border-radius: 8px;
|
| 33 |
+
box-shadow: 0 1px 3px rgba(0,0,0,0.1);
|
| 34 |
+
}
|
| 35 |
+
.stTabs [data-baseweb="tab"] {
|
| 36 |
+
background-color: #f8f9fa;
|
| 37 |
+
border-radius: 6px;
|
| 38 |
+
padding: 10px 20px;
|
| 39 |
+
font-weight: 500;
|
| 40 |
+
}
|
| 41 |
+
.stTabs [aria-selected="true"] {
|
| 42 |
+
background-color: #0066cc;
|
| 43 |
+
color: white;
|
| 44 |
+
}
|
| 45 |
+
div[data-testid="stExpander"] {
|
| 46 |
+
background-color: white;
|
| 47 |
+
border: 1px solid #e0e0e0;
|
| 48 |
+
border-radius: 8px;
|
| 49 |
+
margin-bottom: 12px;
|
| 50 |
+
box-shadow: 0 1px 2px rgba(0,0,0,0.05);
|
| 51 |
+
}
|
| 52 |
+
div[data-testid="stExpander"] summary {
|
| 53 |
+
font-weight: 600;
|
| 54 |
+
color: #1a1a1a;
|
| 55 |
+
padding: 12px;
|
| 56 |
+
}
|
| 57 |
+
.stButton>button {
|
| 58 |
+
border-radius: 6px;
|
| 59 |
+
font-weight: 600;
|
| 60 |
+
box-shadow: 0 2px 4px rgba(0,0,0,0.1);
|
| 61 |
+
}
|
| 62 |
+
h1 {
|
| 63 |
+
color: #1a1a1a;
|
| 64 |
+
font-weight: 700;
|
| 65 |
+
}
|
| 66 |
+
h2, h3 {
|
| 67 |
+
color: #333333;
|
| 68 |
+
font-weight: 600;
|
| 69 |
+
}
|
| 70 |
+
</style>
|
| 71 |
+
""", unsafe_allow_html=True)
|
| 72 |
+
|
| 73 |
+
st.title("🔍 Analyse de fiabilité de l'imputation")
|
| 74 |
+
# st.markdown(
|
| 75 |
+
# """
|
| 76 |
+
# Cette application évalue la capacité à imputer chaque variable d'un dataset en utilisant les autres variables.
|
| 77 |
+
|
| 78 |
+
# **Méthodologie :** Standardisation des données • Suppression des variables jumelles (corrélation Spearman) • Modélisation au choix • Évaluation de la qualité d'imputation (R²)
|
| 79 |
+
# """
|
| 80 |
+
# )
|
| 81 |
st.markdown(
|
| 82 |
"""
|
| 83 |
+
Cette application évalue la capacité à imputer chaque variable d'un dataset en utilisant les autres variables.
|
|
|
|
|
|
|
|
|
|
| 84 |
"""
|
| 85 |
)
|
| 86 |
|
| 87 |
+
# Sidebar: Configuration
|
|
|
|
|
|
|
| 88 |
with st.sidebar:
|
| 89 |
+
run_analysis = st.button("🚀 Lancer l'analyse", type="primary", use_container_width=True)
|
| 90 |
st.header("⚙️ Configuration")
|
| 91 |
|
| 92 |
+
# Source de données
|
| 93 |
data_source = st.radio(
|
| 94 |
"Source des données",
|
| 95 |
["Jeu de données Seaborn", "Importer un fichier"],
|
|
|
|
| 102 |
uploaded_file = st.file_uploader("Importer un fichier CSV", type=["csv"])
|
| 103 |
|
| 104 |
if uploaded_file is not None:
|
| 105 |
+
dataset_name = uploaded_file.name
|
| 106 |
try:
|
| 107 |
df = pd.read_csv(uploaded_file, sep=None, engine='python')
|
| 108 |
+
df = df.select_dtypes(include=[np.number])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 109 |
df = df.dropna()
|
| 110 |
|
| 111 |
if len(df) == 0:
|
| 112 |
+
st.error("❌ Aucune donnée numérique après nettoyage.")
|
| 113 |
df = None
|
| 114 |
+
#else:
|
| 115 |
+
# st.success(f"✅ Fichier chargé ! ({len(df)} lignes, {len(df.columns)} colonnes)")
|
| 116 |
except Exception as e:
|
| 117 |
st.error(f"Erreur : {e}")
|
| 118 |
df = None
|
| 119 |
else:
|
| 120 |
excluded_datasets = ['anagrams', 'anscombe', 'attention', 'brain_networks',
|
| 121 |
+
'car_crashes', 'dowjones', 'exercise', 'fmri','flights', 'geyser',
|
| 122 |
+
'planets', 'seaice']
|
| 123 |
available_datasets = [d for d in sorted(sns.get_dataset_names()) if d not in excluded_datasets]
|
| 124 |
default_dataset = "iris"
|
| 125 |
default_index = available_datasets.index(default_dataset) if default_dataset in available_datasets else 0
|
|
|
|
| 128 |
available_datasets,
|
| 129 |
index=default_index
|
| 130 |
)
|
|
|
|
| 131 |
try:
|
| 132 |
df = sns.load_dataset(dataset_name)
|
| 133 |
+
df = df.select_dtypes(include=[np.number])
|
| 134 |
df = df.dropna()
|
| 135 |
+
#st.success(f"✅ Jeu '{dataset_name}' chargé")
|
| 136 |
except Exception as e:
|
| 137 |
st.error(f"Erreur : {e}")
|
| 138 |
df = None
|
| 139 |
|
| 140 |
+
if df is not None and len(df.columns) > 1:
|
| 141 |
+
st.subheader("Paramètres")
|
|
|
|
|
|
|
| 142 |
|
| 143 |
+
# Expander Modélisation
|
| 144 |
+
with st.expander("▶ Modélisation", expanded=True):
|
| 145 |
+
algo = st.selectbox(
|
| 146 |
+
"Algorithme de régression",
|
| 147 |
+
["Régression Linéaire", "Random Forest", "SVR", "KNN", "XGBoost"],
|
| 148 |
+
help="Algorithme utilisé pour prédire chaque variable"
|
| 149 |
+
)
|
|
|
|
|
|
|
| 150 |
|
| 151 |
+
test_size = st.slider(
|
| 152 |
+
"Taille de l'ensemble test (%)",
|
| 153 |
+
min_value=10,
|
| 154 |
+
max_value=50,
|
| 155 |
+
value=30,
|
| 156 |
+
step=5,
|
| 157 |
+
help="Pourcentage des données pour le test"
|
| 158 |
+
)
|
| 159 |
+
|
| 160 |
+
# Expander Nettoyage & Filtres
|
| 161 |
+
with st.expander("▶ Réglages", expanded=False):
|
| 162 |
+
corr_threshold = st.slider(
|
| 163 |
+
"Seuil de corrélation (Spearman)",
|
| 164 |
+
min_value=0.5,
|
| 165 |
+
max_value=0.99,
|
| 166 |
+
value=0.92,
|
| 167 |
+
step=0.01,
|
| 168 |
+
help="Variables avec corrélation > seuil seront considérées comme jumelles"
|
| 169 |
+
)
|
| 170 |
+
|
| 171 |
+
outlier_threshold = st.slider(
|
| 172 |
+
"Seuil de suppression des outliers (écart-types)",
|
| 173 |
+
min_value=1.0,
|
| 174 |
+
max_value=6.0,
|
| 175 |
+
value=6.0,
|
| 176 |
+
step=0.5,
|
| 177 |
+
help="Supprime les valeurs à plus de X écart-types de la moyenne"
|
| 178 |
+
)
|
| 179 |
+
|
| 180 |
+
sample_size = st.slider(
|
| 181 |
+
"Échantillon du dataset (%)",
|
| 182 |
+
min_value=10,
|
| 183 |
+
max_value=100,
|
| 184 |
+
value=100,
|
| 185 |
+
step=10,
|
| 186 |
+
help="Pourcentage du dataset à utiliser pour l'analyse"
|
| 187 |
+
)
|
| 188 |
+
|
| 189 |
+
#st.divider()
|
| 190 |
+
#run_analysis = st.button("🚀 Lancer l'analyse", type="primary", use_container_width=True)
|
| 191 |
else:
|
| 192 |
+
if df is not None:
|
| 193 |
+
st.warning("⚠️ Le dataset doit contenir au moins 2 variables numériques.")
|
| 194 |
+
else:
|
| 195 |
+
st.info("👈 Veuillez sélectionner ou importer un jeu de données.")
|
| 196 |
|
| 197 |
+
# Fonctions utilitaires
|
| 198 |
+
def get_model(algo_name):
    """Build and return an untrained regressor for the selected algorithm name.

    Any name not in the table falls back to KNN, matching the sidebar's
    fixed set of choices where the unlisted option is "KNN".
    """
    factories = {
        "Régression Linéaire": lambda: LinearRegression(),
        "Random Forest": lambda: RandomForestRegressor(
            n_estimators=100, random_state=RANDOM_STATE, n_jobs=-1
        ),
        "SVR": lambda: SVR(kernel='rbf'),
        "XGBoost": lambda: XGBRegressor(
            n_estimators=100, random_state=RANDOM_STATE, n_jobs=-1, verbosity=0
        ),
    }
    # Unknown / "KNN" selection -> default factory.
    make = factories.get(algo_name, lambda: KNeighborsRegressor(n_neighbors=5))
    return make()
|
| 210 |
|
| 211 |
+
def remove_outliers(df, threshold):
    """Drop rows lying more than *threshold* standard deviations from a column mean.

    Columns are filtered sequentially: each column's mean/std are computed
    on the rows that survived the previous columns' filters, so column
    order influences the result (this matches the original behaviour).
    """
    filtered = df.copy()
    for column in filtered.columns:
        center = filtered[column].mean()
        spread = filtered[column].std()
        within_bounds = (filtered[column] - center).abs() <= threshold * spread
        filtered = filtered[within_bounds]
    return filtered
|
| 219 |
|
| 220 |
+
def remove_twin_variables(X, threshold):
    """Drop "twin" variables whose absolute Spearman correlation exceeds *threshold*.

    For each highly-correlated pair, the row-label member of the
    upper-triangular correlation matrix is dropped, so only one of the
    two twins survives. Returns the filtered frame and the list of
    dropped column names.
    """
    abs_corr = X.corr(method='spearman').abs()
    # Keep only the strict upper triangle so each pair is examined once.
    pair_mask = np.triu(np.ones(abs_corr.shape), k=1).astype(bool)
    upper = abs_corr.where(pair_mask)

    twins = set()
    for col in upper.columns:
        partners = upper.index[upper[col] > threshold]
        twins.update(partners.tolist())

    dropped = list(twins)
    return X.drop(columns=dropped), dropped
|
| 232 |
+
|
| 233 |
+
def backward_elimination(X, y, p_threshold=0.05):
    """Select features by OLS backward elimination on p-values.

    Repeatedly fits an OLS model (with intercept) and removes the single
    feature with the largest p-value, until every remaining feature has
    p <= *p_threshold* or no feature remains. Returns the surviving
    feature names.
    """
    # Local import: statsmodels is only needed for this optional step.
    import statsmodels.api as sm

    design = sm.add_constant(X)
    remaining = list(X.columns)

    while remaining:
        fit = sm.OLS(y, design[['const'] + remaining]).fit()
        # Skip the intercept's p-value; keep only the predictors'.
        feature_pvalues = fit.pvalues[1:]

        worst = feature_pvalues.idxmax()
        if feature_pvalues[worst] > p_threshold:
            remaining.remove(worst)
        else:
            break

    return remaining
|
| 252 |
+
|
| 253 |
+
def evaluate_imputation(df, target_col, corr_threshold, test_size, algo):
    """Evaluate how reliably *target_col* can be imputed from the other columns.

    Pipeline: standardize predictors, drop twin variables (Spearman
    correlation above *corr_threshold*), train/test split (*test_size*
    given in percent), optional backward elimination for linear
    regression, then fit the model chosen by *algo* and score it.

    Returns a tuple ``(r2, selected_features, dropped_twins)`` where
    ``r2`` is clipped to [0, ...] (a float) or ``None`` when there is
    too little data to evaluate.
    """
    X = df.drop(columns=[target_col])
    y = df[target_col]

    # Minimal data check: need enough rows and at least one predictor.
    if len(X) < 10 or len(X.columns) == 0:
        return None, [], []

    # Standardization so every predictor is on a comparable scale.
    scaler = StandardScaler()
    X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

    # Remove twin (near-duplicate) predictors.
    X_filtered, dropped_twins = remove_twin_variables(X_scaled, corr_threshold)

    if len(X_filtered.columns) == 0:
        return 0.0, [], dropped_twins

    # Train/test split; narrow except (bare `except:` would also swallow
    # SystemExit/KeyboardInterrupt).
    try:
        X_train, X_test, y_train, y_test = train_test_split(
            X_filtered, y, test_size=test_size / 100, random_state=RANDOM_STATE
        )
    except Exception:
        return None, [], dropped_twins

    # Backward elimination (only for linear regression, fixed 0.05 threshold).
    # Best-effort: on failure we fall back to using all predictors.
    selected_features = list(X_train.columns)
    if algo == "Régression Linéaire" and len(X_train.columns) > 1:
        try:
            selected_features = backward_elimination(X_train, y_train, p_threshold=0.05)
        except Exception:
            pass

    if len(selected_features) == 0:
        return 0.0, [], dropped_twins

    # Fit the chosen model and score on the held-out set.
    model = get_model(algo)
    X_train_selected = X_train[selected_features]
    X_test_selected = X_test[selected_features]

    try:
        model.fit(X_train_selected, y_train)
        y_pred = model.predict(X_test_selected)
        r2 = r2_score(y_test, y_pred)
        # Clip negative R² to 0.0 (float, consistent with the other return paths).
        return max(0.0, r2), selected_features, dropped_twins
    except Exception:
        return 0.0, selected_features, dropped_twins
|
| 303 |
+
|
| 304 |
+
# Interface principale
|
| 305 |
+
if df is not None and len(df.columns) > 1:
|
| 306 |
+
tab1, tab2, tab3, tab4, tab5 = st.tabs(["📊 Analyse", "📋 Détails par variable", "📈 Statistiques", "💾 Données brutes", "ℹ️ Information"])
|
| 307 |
+
|
| 308 |
+
with tab4:
|
| 309 |
+
st.dataframe(df.head(20), use_container_width=True)
|
| 310 |
+
|
| 311 |
+
with tab5:
|
| 312 |
+
st.header("À propos de l'analyse")
|
| 313 |
+
st.markdown(f"""
|
| 314 |
+
**Nom du dataset :** {dataset_name}
|
| 315 |
+
|
| 316 |
+
**Dataset :** {len(df)} lignes × {len(df.columns)} colonnes
|
| 317 |
|
| 318 |
+
**Interprétation du R² :**
|
| 319 |
+
- **R² > 0.7** : Imputation très fiable ✅
|
| 320 |
+
- **0.5 < R² < 0.7** : Imputation acceptable ⚠️
|
| 321 |
+
- **R² < 0.5** : Imputation difficile ❌
|
| 322 |
|
| 323 |
+
**Méthodologie :**
|
| 324 |
+
1. Chaque variable est tour à tour considérée comme cible
|
| 325 |
+
2. Les autres variables servent de prédicteurs
|
| 326 |
+
3. Suppression des variables jumelles (corrélation > {corr_threshold})
|
| 327 |
+
4. Évaluation avec {algo}
|
| 328 |
+
""")
|
| 329 |
+
|
|
|
|
| 330 |
with tab1:
|
| 331 |
+
if 'run_analysis' in locals() and run_analysis:
|
| 332 |
+
# Préparation du dataset avec outliers et échantillonnage
|
| 333 |
+
df_processed = df.copy()
|
| 334 |
+
|
| 335 |
+
# Suppression des outliers
|
| 336 |
+
if outlier_threshold < 6.0:
|
| 337 |
+
df_before = len(df_processed)
|
| 338 |
+
df_processed = remove_outliers(df_processed, outlier_threshold)
|
| 339 |
+
df_after = len(df_processed)
|
| 340 |
+
st.info(f"🧹 Outliers supprimés : {df_before - df_after} lignes ({(df_before - df_after)/df_before*100:.1f}%)")
|
| 341 |
+
|
| 342 |
+
# Échantillonnage
|
| 343 |
+
if sample_size < 100:
|
| 344 |
+
df_processed = df_processed.sample(frac=sample_size/100, random_state=RANDOM_STATE)
|
| 345 |
+
st.info(f"📊 Échantillon utilisé : {len(df_processed)} lignes ({sample_size}% du dataset)")
|
| 346 |
+
|
| 347 |
+
results = []
|
| 348 |
+
|
| 349 |
+
progress_bar = st.progress(0)
|
| 350 |
+
status_text = st.empty()
|
| 351 |
+
|
| 352 |
+
for idx, col in enumerate(df_processed.columns):
|
| 353 |
+
status_text.text(f"Analyse de '{col}' ({idx+1}/{len(df_processed.columns)})...")
|
| 354 |
|
| 355 |
+
r2, selected_vars, dropped_twins = evaluate_imputation(
|
| 356 |
+
df_processed, col, corr_threshold, test_size, algo
|
| 357 |
+
)
|
| 358 |
|
| 359 |
+
if r2 is not None:
|
| 360 |
+
results.append({
|
| 361 |
+
'Variable': col,
|
| 362 |
+
'R²': r2,
|
| 363 |
+
'Prédicteurs': len(selected_vars),
|
| 364 |
+
'Jumelles': len(dropped_twins),
|
| 365 |
+
'Statut': '✅ Excellent' if r2 > 0.7 else ('⚠️ Moyen' if r2 > 0.5 else '❌ Difficile')
|
| 366 |
+
})
|
| 367 |
|
| 368 |
+
progress_bar.progress((idx + 1) / len(df_processed.columns))
|
| 369 |
+
|
| 370 |
+
status_text.empty()
|
| 371 |
+
progress_bar.empty()
|
| 372 |
+
|
| 373 |
+
if results:
|
| 374 |
+
results_df = pd.DataFrame(results).sort_values('R²', ascending=False)
|
| 375 |
|
| 376 |
+
# Stocker dans session_state
|
| 377 |
+
st.session_state['results_df'] = results_df
|
|
|
|
|
|
|
|
|
|
| 378 |
|
| 379 |
+
# Graphique interactif
|
| 380 |
+
st.subheader("📈 Résultats de l'analyse")
|
| 381 |
|
| 382 |
+
fig = go.Figure()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 383 |
|
| 384 |
+
colors = ['#28a745' if r >= 0.7 else '#ffc107' if r >= 0.5 else '#dc3545'
|
| 385 |
+
for r in results_df['R²']]
|
| 386 |
|
| 387 |
+
fig.add_trace(go.Bar(
|
| 388 |
+
x=results_df['Variable'],
|
| 389 |
+
y=results_df['R²'],
|
| 390 |
+
marker_color=colors,
|
| 391 |
+
text=results_df['R²'].round(3),
|
| 392 |
+
textposition='outside',
|
| 393 |
+
hovertemplate='<b>%{x}</b><br>R²: %{y:.3f}<extra></extra>'
|
| 394 |
+
))
|
| 395 |
|
| 396 |
+
fig.add_hline(y=0.7, line_dash="dash", line_color="#28a745",
|
| 397 |
+
annotation_text="Excellent (0.7)", annotation_position="right")
|
| 398 |
+
fig.add_hline(y=0.5, line_dash="dash", line_color="#ffc107",
|
| 399 |
+
annotation_text="Acceptable (0.5)", annotation_position="right")
|
| 400 |
|
| 401 |
+
fig.update_layout(
|
| 402 |
+
title=f"Fiabilité de l'imputation par variable ({algo})",
|
| 403 |
+
xaxis_title="Variable",
|
| 404 |
+
yaxis_title="R² Score",
|
| 405 |
+
height=470,
|
| 406 |
+
showlegend=False,
|
| 407 |
+
hovermode='x',
|
| 408 |
+
plot_bgcolor='white',
|
| 409 |
+
paper_bgcolor='white',
|
| 410 |
+
font=dict(family="Arial, sans-serif", size=12, color="#333333")
|
| 411 |
+
)
|
| 412 |
|
| 413 |
+
fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='#f0f0f0')
|
| 414 |
+
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='#f0f0f0')
|
|
|
|
|
|
|
|
|
|
|
|
|
| 415 |
|
| 416 |
+
st.plotly_chart(fig, use_container_width=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 417 |
|
| 418 |
+
else:
|
| 419 |
+
st.error("❌ Aucun résultat. Vérifiez vos données.")
|
| 420 |
+
elif 'run_analysis' not in locals():
|
| 421 |
+
st.info("👈 Cliquez sur le bouton 'Lancer l'analyse' dans la sidebar")
|
| 422 |
+
else:
|
| 423 |
+
st.info("👈 Cliquez sur le bouton 'Lancer l'analyse' dans la sidebar")
|
| 424 |
+
|
| 425 |
+
with tab2:
|
| 426 |
+
if 'results_df' in st.session_state:
|
| 427 |
+
st.subheader("📋 Détails par variable")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 428 |
|
| 429 |
+
# Tableau avec style personnalisé
|
| 430 |
+
results_display = st.session_state['results_df'].copy()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 431 |
|
| 432 |
+
styled_df = results_display.style.format({
|
| 433 |
+
'R²': '{:.3f}'
|
| 434 |
+
}).background_gradient(subset=['R²'], cmap='RdYlGn', vmin=0, vmax=1)
|
| 435 |
+
|
| 436 |
+
st.dataframe(styled_df, use_container_width=True, hide_index=True, height=400)
|
| 437 |
+
else:
|
| 438 |
+
st.info("👈 Lancez d'abord une analyse pour voir les détails par variable")
|
| 439 |
+
|
| 440 |
+
with tab3:
|
| 441 |
+
if 'results_df' in st.session_state:
|
| 442 |
+
st.subheader("📈 Statistiques récapitulatives")
|
| 443 |
+
|
| 444 |
+
results_df = st.session_state['results_df']
|
| 445 |
+
|
| 446 |
+
# Statistiques récapitulatives
|
| 447 |
+
col1, col2, col3 = st.columns(3)
|
| 448 |
+
with col1:
|
| 449 |
+
excellent = len(results_df[results_df['R²'] > 0.7])
|
| 450 |
+
st.metric("Nombre d'imputations fiables", excellent,
|
| 451 |
+
delta=f"{excellent/len(results_df)*100:.1f}%")
|
| 452 |
+
with col2:
|
| 453 |
+
acceptable = len(results_df[(results_df['R²'] > 0.5) & (results_df['R²'] <= 0.7)])
|
| 454 |
+
st.metric("Nombre d'imputations acceptables", acceptable,
|
| 455 |
+
delta=f"{acceptable/len(results_df)*100:.1f}%")
|
| 456 |
+
with col3:
|
| 457 |
+
difficult = len(results_df[results_df['R²'] <= 0.5])
|
| 458 |
+
st.metric("Nombre d'imputations déconseillées", difficult,
|
| 459 |
+
delta=f"{difficult/len(results_df)*100:.1f}%")
|
| 460 |
else:
|
| 461 |
+
st.info("👈 Lancez d'abord une analyse pour voir les statistiques")
|
| 462 |
else:
|
| 463 |
+
st.info("👈 Veuillez sélectionner un jeu de données avec au moins 2 variables numériques.")
|