# NOTE: file-viewer header residue kept as a comment: Eric2mangel - "Upload 4 files" - commit 72eee83 (verified).
import streamlit as st
import seaborn as sns
import pandas as pd
import numpy as np
import networkx as nx
import plotly.graph_objects as go
import matplotlib.pyplot as plt
from sklearn.feature_selection import mutual_info_regression
from scipy.stats import entropy
from concurrent.futures import ThreadPoolExecutor
# ------------------------------------------
# USER-TUNABLE SETTINGS
# ------------------------------------------
# Height of the interactive graph, in pixels; edit this value to resize it.
GRAPH_HEIGHT = 480
# ------------------------------------------

# Page chrome: browser-tab title and icon, wide layout, then the on-page
# title and subtitle.
st.set_page_config(
    page_title="Analyse de proximité",
    page_icon="🔗",
    layout="wide",
)
st.title("🔗 Cartographie des variables")
st.subheader("Méthode : Information Mutuelle Normalisée (IMN)")
# --- SIDEBAR ---
# Everything in this block renders in the sidebar. It leaves two module-level
# names for the rest of the script:
#   * `df`        - the loaded dataset with incomplete rows removed, or None
#                   when nothing usable could be loaded;
#   * `threshold` - minimum NMI for a link to be drawn (only set when `df`
#                   is not None).
with st.sidebar:
    st.header("⚙️ Configuration")
    data_source = st.radio("Source des données", ["Dataset Seaborn", "Importer un fichier"])
    df = None

    if data_source == "Importer un fichier":
        uploaded_file = st.file_uploader("Fichier CSV ou Excel", type=["csv", "xlsx", "xls"])
        if uploaded_file:
            file_extension = uploaded_file.name.split('.')[-1].lower()
            with st.spinner("Chargement du fichier..."):
                # Broad catch on purpose: this is the UI boundary, and any
                # parsing failure is reported to the user instead of crashing.
                try:
                    if file_extension == 'csv':
                        # sep=None with the python engine asks pandas to
                        # detect the delimiter automatically.
                        df = pd.read_csv(uploaded_file, sep=None, engine='python', encoding_errors='ignore')
                    elif file_extension in ('xlsx', 'xls'):
                        df = pd.read_excel(uploaded_file)
                except Exception as e:
                    st.error(f"Erreur lors du chargement : {str(e)}")
                    df = None
    else:
        # Built-in seaborn example datasets.
        dataset_name = st.selectbox(
            "Choisir un dataset",
            ["titanic", "tips", "iris", "penguins", "mpg", "planets", "flights", "diamonds"]
        )
        try:
            with st.spinner(f"Chargement du dataset {dataset_name}..."):
                df = sns.load_dataset(dataset_name)
        except Exception as e:
            # Was a bare `except:`; surface the cause instead of hiding it.
            st.error(f"Impossible de charger le dataset '{dataset_name}' : {e}")
            df = None

    if df is not None:
        # Only complete rows are analysed, whatever the source.
        df = df.dropna()
        if df.empty:
            # Without this guard the analysis below would fail on an empty frame.
            st.warning("Aucune ligne exploitable : chaque ligne contient au moins une valeur manquante.")
            df = None

    if df is not None:
        # Dataset size.
        st.metric("Lignes × Colonnes", f"{len(df)} × {len(df.columns)}")
        # A select_slider restricts the threshold to the interpretation tiers
        # listed in the legend below.
        threshold = st.select_slider(
            "Seuil de visibilité des liens (IMN)",
            options=[0, 0.1, 0.3, 0.6, 0.9],
            value=0.3,
            help="Filtre les liens selon les paliers d'interprétation définis ci-dessous."
        )
        st.info("💡 **Légende de l'IMN**")
        st.markdown("""
* **0.90 – 1.00** : Quasi-doublons
* **0.60 – 0.90** : Relation forte
* **0.30 – 0.60** : Relation modérée
* **0.10 – 0.30** : Relation faible
* **< 0.10** : Indépendance
""")
# --- COMPUTATION ---
# Minimum number of rows required before attempting the analysis.
# NOTE(review): chosen so that the nearest-neighbour estimator has more
# samples than neighbours when n_neighbors == 1 - confirm against sklearn docs.
MIN_ROWS = 2
# Pairs whose NMI reaches this value are treated as duplicates.
REDUNDANCY_CUTOFF = 0.99


def _encode_columns(frame):
    """Return a numeric copy of *frame* plus one "is discrete" flag per column.

    Numeric (including boolean) columns are cast to float32 and flagged as
    continuous. Every other dtype (object, category, string, datetime, ...)
    is replaced by its integer category codes and flagged as discrete, so
    such columns no longer fail on the float cast.
    """
    encoded = frame.copy()
    discrete_flags = []
    for col in frame.columns:
        if pd.api.types.is_numeric_dtype(frame[col]):
            encoded[col] = frame[col].astype(np.float32)
            discrete_flags.append(False)
        else:
            encoded[col] = frame[col].astype('category').cat.codes.astype(np.int32)
            discrete_flags.append(True)
    return encoded, discrete_flags


def _column_entropy(values, is_discrete):
    """Return the entropy used to normalise mutual information for one column.

    Discrete columns use their exact value counts; binning integer codes
    would merge categories as soon as there are more than 10 of them and
    underestimate the entropy. Continuous columns keep the original
    histogram estimate with at most 10 bins.
    """
    if is_discrete:
        counts = np.unique(values.to_numpy(), return_counts=True)[1]
        return entropy(counts)
    bins = min(10, values.nunique())
    hist = np.histogram(values, bins=bins, density=True)[0]
    # Small offset keeps empty bins from producing log(0).
    return entropy(hist + 1e-9)


@st.cache_data(show_spinner=False)
def compute_nmi_matrix(frame):
    """Return the symmetric normalised-mutual-information matrix of *frame*.

    Entry (i, j) is MI(i, j) / min(H_i, H_j) clipped to [0, 1], averaged over
    both estimation directions; the diagonal is 1. The result is cached so
    that widget changes (e.g. the threshold) do not trigger a recomputation.
    """
    encoded, discrete_flags = _encode_columns(frame)
    n_vars = encoded.shape[1]
    entropies = [
        _column_entropy(encoded.iloc[:, i], discrete_flags[i]) for i in range(n_vars)
    ]
    # Loop-invariant: 1 to 3 neighbours depending on the number of rows.
    n_neighbors = min(3, max(1, len(encoded) // 100))

    def compute_mi_row(i):
        """Return (i, MI of every column against column i)."""
        scores = mutual_info_regression(
            encoded,
            encoded.iloc[:, i],
            discrete_features=discrete_flags,
            random_state=42,
            n_neighbors=n_neighbors
        )
        return i, scores

    # One task per target column.
    with ThreadPoolExecutor(max_workers=4) as executor:
        results = list(executor.map(compute_mi_row, range(n_vars)))

    mi_matrix = np.zeros((n_vars, n_vars), dtype=np.float32)
    for i, scores in results:
        for j, s in enumerate(scores):
            if i == j:
                continue
            h_min = min(entropies[i], entropies[j])
            # A constant column has zero entropy: report no dependence.
            nmi = s / h_min if h_min > 0 else 0
            mi_matrix[i, j] = min(max(nmi, 0), 1.0)

    # Symmetrise (mean of both directions), then force a unit diagonal.
    mi_matrix = (mi_matrix + mi_matrix.T) / 2
    np.fill_diagonal(mi_matrix, 1.0)
    return mi_matrix


def _split_redundant(mi_matrix, columns, cutoff=REDUNDANCY_CUTOFF):
    """Return (indices to keep, list of reported duplicate pairs).

    Columns are scanned in order; a later column whose NMI with a kept column
    reaches *cutoff* is dropped. Each dropped column is reported once only.
    """
    to_keep = []
    redundant_pairs = []
    dropped = set()
    n_vars = len(columns)
    for i in range(n_vars):
        if i in dropped:
            continue
        to_keep.append(i)
        for j in range(i + 1, n_vars):
            if j in dropped:
                # Already attributed to an earlier kept variable.
                continue
            val_im = mi_matrix[i, j]
            if val_im >= cutoff:
                dropped.add(j)
                redundant_pairs.append({
                    "Variable conservée": columns[i],
                    "Doublon supprimé": columns[j],
                    "Score IMN": f"{val_im:.4f}"
                })
    return to_keep, redundant_pairs


def _build_graph(names, matrix, min_weight):
    """Return a graph with one node per name and an edge where NMI > min_weight."""
    graph = nx.Graph()
    for i in range(len(names)):
        for j in range(i + 1, len(names)):
            im_val = matrix[i, j]
            if im_val > min_weight:
                graph.add_edge(names[i], names[j], weight=float(im_val))
        # Keep isolated variables visible.
        if names[i] not in graph:
            graph.add_node(names[i])
    return graph


def _hover_text(graph, node, min_weight):
    """Return the HTML hover label of *node*: its links, strongest first."""
    header = f"<b>Variable : {node}</b><br><br>Liens (IMN > {min_weight}):<br>"
    links = sorted(graph.edges(node, data=True), key=lambda e: e[2]['weight'], reverse=True)
    if not links:
        return header + "<i>Aucun lien significatif</i>"
    return header + "".join(
        f"• {neighbor} : <b>{data['weight']:.4f}</b><br>" for _, neighbor, data in links
    )


if df is None:
    st.info("👈 Veuillez sélectionner ou importer un jeu de données.")
elif len(df) < MIN_ROWS:
    st.warning("Pas assez de lignes complètes pour calculer l'information mutuelle.")
else:
    with st.spinner("Calcul de l'information mutuelle en cours..."):
        mi_matrix = compute_nmi_matrix(df)

    to_keep, redundant_pairs = _split_redundant(mi_matrix, df.columns)
    final_vars = [df.columns[i] for i in to_keep]
    final_mi = mi_matrix[np.ix_(to_keep, to_keep)]

    G = _build_graph(final_vars, final_mi, threshold)
    # Fixed seed keeps the layout stable between reruns.
    pos = nx.spring_layout(G, k=1.2, seed=42)
    node_hover_texts = [_hover_text(G, node, threshold) for node in G.nodes()]

    # --- DISPLAY ---
    tab1, tab2, tab3, tab4 = st.tabs(["📊 Graphe interactif", "👯 Doublons filtrés", "📋 Matrice triangulaire", "📄 Aperçu des données"])

    with tab1:
        # One trace per edge so that each link gets its own width and colour.
        edge_traces = []
        for edge in G.edges(data=True):
            x0, y0 = pos[edge[0]]
            x1, y1 = pos[edge[1]]
            w = edge[2]['weight']
            color = f'rgba({int(255*w)}, {int(150*(1-w))}, {int(200*(1-w))}, {0.3 + 0.4*w})'
            edge_traces.append(go.Scatter(
                x=[x0, x1, None], y=[y0, y1, None],
                line=dict(width=w*12, color=color),
                hoverinfo='none',
                mode='lines'
            ))
        node_trace = go.Scatter(
            x=[pos[n][0] for n in G.nodes()],
            y=[pos[n][1] for n in G.nodes()],
            mode='markers+text',
            text=list(G.nodes()),
            textposition="bottom center",
            textfont=dict(color='white', size=11),
            marker=dict(
                # Marker size grows with the number of visible links.
                size=[10 + G.degree(n) * 5 for n in G.nodes()],
                color='#1f77b4',
                line=dict(width=2, color='white'),
                opacity=1
            ),
            hoverinfo='text',
            hovertext=node_hover_texts
        )
        fig = go.Figure(data=edge_traces + [node_trace])
        fig.update_layout(
            paper_bgcolor='rgba(15,15,25,1)',
            plot_bgcolor='rgba(0,0,0,0)',
            height=GRAPH_HEIGHT,
            showlegend=False,
            xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
            yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
            margin=dict(b=20, l=5, r=5, t=40),
            hoverlabel=dict(bgcolor="rgba(30, 30, 50, 0.9)", font_size=13)
        )
        st.plotly_chart(fig, use_container_width=True)

    with tab2:
        st.subheader("Variables supprimées (Redondance forte)")
        if redundant_pairs:
            st.dataframe(pd.DataFrame(redundant_pairs), use_container_width=True)
        else:
            st.info("Aucun doublon détecté.")

    with tab3:
        st.subheader("Matrice de proximité")
        df_imn = pd.DataFrame(final_mi, index=final_vars, columns=final_vars)
        # Hide the upper triangle (diagonal included): the matrix is symmetric.
        mask = np.triu(np.ones_like(df_imn, dtype=bool))
        fig_map, ax = plt.subplots(figsize=(10, 8), layout="constrained")
        fig_map.patch.set_facecolor('white')
        sns.heatmap(df_imn, mask=mask, cmap="coolwarm", vmax=1.0, vmin=0,
                    annot=True, fmt=".2f", square=True, linewidths=.5,
                    cbar_kws={"shrink": .8}, ax=ax, annot_kws={"size": 9})
        # Style the tick labels through the axes object instead of pyplot's
        # global "current axes" state.
        plt.setp(ax.get_xticklabels(), rotation=45, ha='right', color='black')
        plt.setp(ax.get_yticklabels(), rotation=0, color='black')
        ax.set_facecolor('white')
        st.pyplot(fig_map)
        # pyplot keeps every figure alive until closed; release it so figures
        # do not accumulate across reruns.
        plt.close(fig_map)

    with tab4:
        st.subheader("Aperçu des données (20 premières lignes)")
        st.dataframe(df.head(20), use_container_width=True)
# Footer: a horizontal rule, then a centred grey one-line reminder of what
# the NMI score means. Raw HTML is needed for the centring and colour.
_footer_html = """
<div style='text-align: center; color: gray;'>
<small>L'Information Mutuelle Normalisée (IMN) mesure la dépendance entre variables (0 = indépendance, 1 = dépendance totale)</small>
</div>
"""
st.markdown("---")
st.markdown(_footer_html, unsafe_allow_html=True)