Spaces:
Sleeping
Sleeping
Upload 4 files
Browse files- Dockerfile +32 -20
- README.md +6 -12
- app.py +216 -0
- requirements.txt +8 -3
Dockerfile
CHANGED
|
@@ -1,20 +1,32 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
RUN apt-get update && apt-get install -y \
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# syntax = docker/dockerfile:1.4
FROM python:3.12-slim

# System build/runtime dependencies:
# - gcc/g++: compile any source-only wheels during pip install
# - libjpeg/zlib/libpng/freetype/openjp2/tiff: image codecs (Pillow/matplotlib backends)
# - curl: used by the HEALTHCHECK below
# - git: presumably for pip installs from git URLs — TODO confirm it is still needed
# --no-install-recommends keeps the layer small; the apt list cleanup does too.
RUN apt-get update && apt-get install -y --no-install-recommends \
    gcc \
    g++ \
    libjpeg62-turbo-dev \
    zlib1g-dev \
    libpng-dev \
    libfreetype6-dev \
    libopenjp2-7-dev \
    libtiff5-dev \
    curl \
    git \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Copy requirements first so the pip layer is cached unless requirements.txt changes.
COPY requirements.txt .

# Install everything (no pip cache, to keep the final image small).
RUN pip install --no-cache-dir -r requirements.txt

# Copy the application code (a single-file Streamlit app).
COPY app.py .

# Port + start command required by Hugging Face Spaces (app_port: 8501 in README).
EXPOSE 8501
# Streamlit exposes a built-in health endpoint at /_stcore/health.
HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health || exit 1
# CORS/XSRF protections are disabled because the app runs behind the Spaces proxy.
CMD ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0", "--server.enableCORS=false", "--server.enableXsrfProtection=false"]
|
README.md
CHANGED
|
@@ -1,19 +1,13 @@
|
|
| 1 |
---
|
| 2 |
-
title: Feature
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: docker
|
| 7 |
app_port: 8501
|
| 8 |
-
tags:
|
| 9 |
-
- streamlit
|
| 10 |
pinned: false
|
| 11 |
-
short_description: Robust variable selection method
|
| 12 |
---
|
| 13 |
|
| 14 |
-
#
|
| 15 |
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
|
| 19 |
-
forums](https://discuss.streamlit.io).
|
|
|
|
| 1 |
---
|
| 2 |
+
title: Feature selection
|
| 3 |
+
emoji: 📊
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: blue
|
| 6 |
sdk: docker
|
| 7 |
app_port: 8501
|
|
|
|
|
|
|
| 8 |
pinned: false
|
|
|
|
| 9 |
---
|
| 10 |
|
| 11 |
+
# 📊 Feature selection
|
| 12 |
|
| 13 |
+
Application Streamlit pour sélectionner les meilleures variables.
|
|
|
|
|
|
|
|
|
app.py
ADDED
|
@@ -0,0 +1,216 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""Streamlit app: feature-importance analysis.

Loads a dataset (seaborn sample or uploaded CSV), fits a linear model
(LinearRegression or LogisticRegression depending on the inferred task),
and contrasts each feature's *marginal* relevance (mutual information,
correlation) with its *conditional* relevance (model coefficient).
"""
import streamlit as st
import seaborn as sns
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.feature_selection import mutual_info_regression, mutual_info_classif
from sklearn.metrics import r2_score, accuracy_score
from sklearn.model_selection import train_test_split
from scipy.stats import pearsonr

# ------------------------------------------------------------
# Global configuration
# ------------------------------------------------------------
TEST_SIZE = 0.3       # hold-out fraction for the train/test split
RANDOM_STATE = 42     # fixed seed so results are reproducible across reruns

st.set_page_config(page_title="Analyse d'Importance", layout="wide")

st.title("🔍 Analyse de l'Importance des Caractéristiques")
st.markdown(
    """
Cette application illustre la différence entre la pertinence marginale et la pertinence conditionnelle d'une caractéristique.

- Pertinence marginale : corrélation ou information mutuelle avec la cible.
- Pertinence conditionnelle : valeur ajoutée d'une variable excluant les redondances après contrôle.
"""
)

# ------------------------------------------------------------
# Sidebar: dataset selection / upload
# ------------------------------------------------------------
with st.sidebar:
    st.header("⚙️ Configuration")

    # Data source: a bundled seaborn example dataset or a user-uploaded CSV.
    data_source = st.radio(
        "Source des données",
        ["Jeu de données Seaborn", "Importer un fichier"],
        label_visibility="visible"
    )

    df = None  # stays None until a dataset loads successfully

    if data_source == "Importer un fichier":
        uploaded_file = st.file_uploader("Importer un fichier CSV", type=["csv"])

        if uploaded_file is not None:
            try:
                # sep=None + engine='python' lets pandas sniff the delimiter.
                df = pd.read_csv(uploaded_file, sep=None, engine='python')
                # Rows with any missing value are dropped — simplest NA policy
                # for the downstream sklearn pipeline (which does not impute).
                df = df.dropna()
                st.success("✅ Fichier CSV chargé !")
            except Exception as e:
                st.error(f"Erreur : {e}")
                df = None
    else:
        # Datasets excluded because they are unsuitable for this demo
        # (presumably too large, non-tabular, or lacking a usable target —
        # TODO confirm the exact rationale).
        excluded_datasets = ['anagrams', 'anscombe', 'attention', 'brain_networks',
                             'car_crashes', 'dowjones','diamonds','flights','geyser',
                             'planets','seaice']
        available_datasets = [d for d in sorted(sns.get_dataset_names()) if d not in excluded_datasets]
        dataset_name = st.selectbox("Dataset d'exemple", available_datasets)
        try:
            # NOTE: sns.load_dataset fetches from the network on first use.
            df = sns.load_dataset(dataset_name)
            df = df.dropna()
            st.success(f"✅ Jeu '{dataset_name}' chargé")
        except Exception as e:
            st.error(f"Erreur : {e}")
            df = None

    if df is not None:
        # Target column choice; everything else becomes the feature matrix X.
        target = st.selectbox("Sélection cible (Y)", df.columns)
        y = df[target]
        X = df.drop(columns=[target])

        # Task heuristic: numeric target with more than 10 distinct values
        # is treated as regression, otherwise classification.
        task = "Regression" if (y.dtype.kind in "ifu" and y.nunique() > 10) else "Classification"
        excluded_features = st.multiselect("Variables à exclure :", X.columns.tolist(), default=[])
        X = X.drop(columns=excluded_features)
    else:
        st.info("👈 Veuillez sélectionner ou importer un jeu de données.")
        X = None
        y = None
        task = None

# ------------------------------------------------------------
# Tabs
# ------------------------------------------------------------
if df is not None and X is not None:
    tab1, tab2, tab3 = st.tabs(["📊 Analyse d'Importance", "📋 Données Brutes", "🔧 Types"])

    # Raw-data preview tab.
    with tab2:
        st.dataframe(df.head(20), use_container_width=True)

    # Variable-types tab: lists numeric vs categorical features side by side.
    with tab3:
        st.header("Types des variables")

        num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
        cat_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()

        col1, col2 = st.columns(2)

        with col1:
            st.subheader("Numériques")
            # `or ["None"]` shows a placeholder when the list is empty.
            for col in num_cols or ["None"]:
                st.write(f"- {col}")

        with col2:
            st.subheader("Catégorielles")
            for col in cat_cols or ["None"]:
                st.write(f"- {col}")

    # ------------------------------------------------------------
    # Main analysis (tab 1)
    # ------------------------------------------------------------
    with tab1:
        if len(X.columns) > 0:
            num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
            cat_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()

            # Standardize numeric features (so coefficients are comparable)
            # and one-hot-encode categoricals, dropping the first level to
            # avoid collinearity with the intercept.
            preprocess = ColumnTransformer(transformers=[
                ("num", StandardScaler(), num_cols),
                ("cat", OneHotEncoder(drop="first", handle_unknown="ignore", sparse_output=False), cat_cols)
            ])

            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE)
            # Fit the preprocessor on the training split only (no leakage).
            X_train_proc = preprocess.fit_transform(X_train)
            feature_names = preprocess.get_feature_names_out()

            model = LinearRegression() if task == "Regression" else LogisticRegression(max_iter=1000)
            model.fit(X_train_proc, y_train)

            y_pred = model.predict(preprocess.transform(X_test))
            # R² for regression, accuracy for classification.
            perf = r2_score(y_test, y_pred) if task == "Regression" else accuracy_score(y_test, y_pred)

            st.subheader("📊 Pertinence marginale vs conditionnelle")
            st.markdown(f"**🎯 Performance globale : {perf:.2f} ({'R²' if task == 'Regression' else 'Précision'})**")

            # Metrics per (encoded) feature:
            # - mutual information with the target = marginal relevance
            # - |coefficient| in the fitted linear model = conditional relevance
            mi = mutual_info_regression(X_train_proc, y_train, random_state=0) if task == "Regression" else mutual_info_classif(X_train_proc, y_train, random_state=0)
            # NOTE(review): for multiclass classification, coef_[0] reflects
            # only the first class's coefficients — confirm this is intended.
            coefs = model.coef_.ravel() if task == "Regression" else model.coef_[0]

            res = pd.DataFrame({
                "Variable": feature_names,
                "Importance seule (MI)": mi,
                "Poids dans le Modèle": np.abs(coefs),
                "Sens": np.where(coefs > 0, "+", "-")
            })

            if task == "Regression":
                # Pearson correlation of each processed column with the target.
                res["Lien Direct (Corr)"] = [pearsonr(X_train_proc[:, i], y_train)[0] for i in range(len(feature_names))]

            # Min-max normalization for the synthetic score; the 1e-10 guard
            # avoids division by zero when a column is constant.
            def normalize(s): return (s - s.min()) / (s.max() - s.min() + 1e-10)
            mi_n = normalize(res["Importance seule (MI)"])
            poids_n = normalize(res["Poids dans le Modèle"])

            if task == "Regression":
                corr_n = normalize(res["Lien Direct (Corr)"].abs())
                # Regression: average MI and |corr| (marginal), then average
                # with the model weight (conditional).
                res["Score synthétique"] = ((mi_n + corr_n) / 2 + poids_n) / 2
            else:
                res["Score synthétique"] = (mi_n + poids_n) / 2

            res = res.sort_values("Score synthétique", ascending=False)

            # Column ordering for display (regression adds the Corr column).
            cols = ["Variable", "Score synthétique", "Importance seule (MI)", "Poids dans le Modèle", "Sens"]
            if task == "Regression":
                cols = ["Variable", "Score synthétique", "Importance seule (MI)", "Lien Direct (Corr)", "Poids dans le Modèle", "Sens"]

            final_df = res[cols].copy()

            # --- Styling and display ---
            # 1. Color the "Sens" column: green for positive, red for negative.
            def style_sign(val):
                color = 'color: #2ecc71;' if val == '+' else 'color: #e74c3c;'
                return f'{color} font-weight: bold; font-size: 20px;'

            # 2. Two-decimal formatting plus a red-yellow-green gradient on the
            #    metric columns (the synthetic score is formatted but not shaded).
            num_cols_to_style = [c for c in cols if c not in ["Variable", "Sens", "Score synthétique"]]

            styled_res = (final_df.style
                .format({c: "{:.2f}" for c in cols if c not in ["Variable", "Sens"]})
                .background_gradient(subset=num_cols_to_style, cmap="RdYlGn")
                .map(style_sign, subset=['Sens'])
            )

            # 3. st.data_editor (rather than st.dataframe) to cap the height
            #    (~6 rows = 250px) with a scrollbar; disabled=True makes it
            #    behave like a read-only dataframe.
            st.data_editor(
                styled_res,
                use_container_width=True,
                height=250,
                hide_index=True,
                disabled=True,
                column_config={
                    "Sens": st.column_config.Column(
                        "Sens",
                        help="Direction de l'influence",
                        width="small"
                    )
                }
            )

            st.subheader("📖 Guide de lecture")
            st.markdown(
                """
- **Score synthétique** : Note globale d'importance.
- **Importance seule (MI)** : Mesure la dépendance globale entre la variable et la cible. Contrairement à la corrélation qui ne voit que les lignes droites, l'Information Mutuelle détecte toutes les formes de relations (courbes, motifs complexes, etc.). Elle indique quelle quantité d'information "pure" cette variable partage avec la cible, sans tenir compte des autres variables.
- **Poids dans le modèle** : Contribution finale au modèle.
- **Sens (+) / (-)** : Direction de l'impact sur la cible.
"""
            )
        else:
            st.info("ℹ️ Veuillez sélectionner au moins une variable.")
else:
    st.info("👈 Veuillez sélectionner ou importer un jeu de données pour commencer l'analyse.")
|
requirements.txt
CHANGED
|
@@ -1,3 +1,8 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
streamlit==1.31.0
|
| 3 |
+
pandas==2.1.4
|
| 4 |
+
numpy==1.26.3
|
| 5 |
+
seaborn==0.13.1
|
| 6 |
+
matplotlib==3.8.2
|
| 7 |
+
scipy==1.11.4
|
| 8 |
+
scikit-learn==1.4.0
|