# Streamlit app: maps the proximity between dataset variables using normalized mutual information.
from concurrent.futures import ThreadPoolExecutor

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import seaborn as sns
import streamlit as st
from scipy.stats import entropy
from sklearn.feature_selection import mutual_info_classif, mutual_info_regression
# ==========================================
# EASY PROGRAM CONFIGURATION
# ==========================================
# Height of the interactive graph, in pixels; change it to resize the chart.
GRAPH_HEIGHT = 480
# ==========================================

# Page-level settings, followed by the main title and the method subtitle.
st.set_page_config(
    page_title="Analyse de proximité",
    page_icon="🔗",
    layout="wide",
)
st.title("🔗 Cartographie des variables")
st.subheader("Méthode : Information Mutuelle Normalisée (IMN)")
# --- SIDEBAR ---
def _read_uploaded_table(uploaded_file):
    """Parse an uploaded CSV or Excel file and drop incomplete rows.

    Args:
        uploaded_file: File-like object exposing a ``name`` attribute, as
            returned by ``st.file_uploader``.

    Returns:
        A DataFrame with every row containing a missing value removed.

    Raises:
        ValueError: If the file extension is not csv, xlsx or xls.
        Exception: Whatever pandas raises when the file cannot be parsed.
    """
    extension = uploaded_file.name.rsplit(".", 1)[-1].lower()
    if extension == "csv":
        # sep=None with the python engine lets pandas sniff the delimiter.
        table = pd.read_csv(
            uploaded_file, sep=None, engine="python", encoding_errors="ignore"
        )
    elif extension in ("xlsx", "xls"):
        table = pd.read_excel(uploaded_file)
    else:
        # Report unsupported formats instead of silently loading nothing.
        raise ValueError(f"Format de fichier non pris en charge : .{extension}")
    return table.dropna()


# `df` stays None until a dataset has been loaded successfully; the rest of
# the script checks it to decide whether there is anything to analyse.
df = None

with st.sidebar:
    st.header("⚙️ Configuration")
    data_source = st.radio(
        "Source des données", ["Dataset Seaborn", "Importer un fichier"]
    )

    if data_source == "Importer un fichier":
        uploaded_file = st.file_uploader(
            "Fichier CSV ou Excel", type=["csv", "xlsx", "xls"]
        )
        if uploaded_file is not None:
            with st.spinner("Chargement du fichier..."):
                try:
                    df = _read_uploaded_table(uploaded_file)
                except Exception as e:
                    # UI boundary: report any parsing failure to the user
                    # rather than crashing the whole page.
                    st.error(f"Erreur lors du chargement : {str(e)}")
                    df = None
    else:
        # Built-in Seaborn sample datasets.
        dataset_name = st.selectbox(
            "Choisir un dataset",
            ["titanic", "tips", "iris", "penguins", "mpg", "planets", "flights", "diamonds"],
        )
        try:
            with st.spinner(f"Chargement du dataset {dataset_name}..."):
                df = sns.load_dataset(dataset_name).dropna()
        except Exception as e:
            # `except Exception` (not a bare except) so that interrupts still
            # propagate; the cause is shown like in the upload branch.
            st.error(f"Impossible de charger le dataset '{dataset_name}' : {e}")
            df = None

    if df is not None:
        # Dataset size after incomplete rows were dropped.
        st.metric("Lignes × Colonnes", f"{len(df)} × {len(df.columns)}")

        # Discrete steps matching the interpretation bands of the legend.
        threshold = st.select_slider(
            "Seuil de visibilité des liens (IMN)",
            options=[0, 0.1, 0.3, 0.6, 0.9],
            value=0.3,
            help="Filtre les liens selon les paliers d'interprétation définis ci-dessous.",
        )

        st.info("💡 **Légende de l'IMN**")
        st.markdown(
            "\n"
            "* **0.90 – 1.00** : Quasi-doublons\n"
            "* **0.60 – 0.90** : Relation forte\n"
            "* **0.30 – 0.60** : Relation modérée\n"
            "* **0.10 – 0.30** : Relation faible\n"
            "* **< 0.10** : Indépendance\n"
        )
# --- COMPUTATION LOGIC ---
# IMN at or above which two variables are reported as duplicates.
NEAR_DUPLICATE_CUTOFF = 0.99
# The nearest-neighbour mutual-information estimator needs at least two rows.
MIN_ROWS = 2


def _encode_columns(data):
    """Return a numeric copy of *data* and a per-column "is discrete" mask.

    Numeric (including boolean) columns are cast to float32 and flagged as
    continuous. Every other dtype (object, category, string, datetime, ...)
    is replaced by its integer category codes and flagged as discrete, so a
    column that cannot be cast to float no longer aborts the analysis.
    """
    encoded = data.copy()
    discrete_mask = []
    for col in data.columns:
        series = data[col]
        if pd.api.types.is_numeric_dtype(series):
            encoded[col] = series.astype(np.float32)
            discrete_mask.append(False)
        else:
            encoded[col] = series.astype("category").cat.codes.astype(np.int32)
            discrete_mask.append(True)
    return encoded, discrete_mask


def _column_entropy(values, is_discrete):
    """Return the Shannon entropy (in nats) of one encoded column.

    Discrete columns use their exact category frequencies. Continuous
    columns are binned into at most 10 equal-width bins. Raw counts are
    passed to ``scipy.stats.entropy``, which normalises them itself; this
    avoids adding a fixed smoothing term to density values whose magnitude
    depends on the scale of the column.
    """
    if is_discrete:
        counts = np.unique(values, return_counts=True)[1]
    else:
        n_bins = min(10, len(np.unique(values)))
        counts = np.histogram(values, bins=n_bins)[0]
    return float(entropy(counts))


# Cached so that moving the threshold slider (which reruns the script) does
# not recompute the matrix: it depends only on the data.
@st.cache_data(show_spinner=False)
def _normalized_mi_matrix(data):
    """Compute the symmetric normalized-mutual-information matrix of *data*.

    Each entry is MI(i, j) / min(H(i), H(j)), clipped to [0, 1] and averaged
    over both estimation directions; the diagonal is set to 1.

    Args:
        data: DataFrame without missing values, at least two rows.

    Returns:
        A square float array with one row/column per column of *data*.
    """
    encoded, discrete_mask = _encode_columns(data)
    n_vars = encoded.shape[1]
    entropies = np.array([
        _column_entropy(encoded.iloc[:, i].to_numpy(), discrete_mask[i])
        for i in range(n_vars)
    ])
    n_neighbors = min(3, max(1, len(encoded) // 100))

    def mi_row(i):
        # A categorical target must be estimated with the classification
        # variant; the regression variant treats the target as continuous.
        estimator = mutual_info_classif if discrete_mask[i] else mutual_info_regression
        return estimator(
            encoded,
            encoded.iloc[:, i],
            discrete_features=discrete_mask,
            random_state=42,
            n_neighbors=n_neighbors,
        )

    # One task per target column; executor.map preserves the column order.
    with ThreadPoolExecutor(max_workers=4) as executor:
        raw_mi = np.vstack(list(executor.map(mi_row, range(n_vars))))

    # Normalise by the smaller of the two entropies; pairs involving a
    # zero-entropy (constant) column get a score of 0.
    h_min = np.minimum.outer(entropies, entropies)
    nmi = np.divide(raw_mi, h_min, out=np.zeros_like(raw_mi), where=h_min > 0)
    nmi = np.clip(nmi, 0.0, 1.0)

    # The two estimation directions differ slightly; average them.
    nmi = (nmi + nmi.T) / 2
    np.fill_diagonal(nmi, 1.0)
    return nmi


def _find_near_duplicates(nmi, columns, cutoff=NEAR_DUPLICATE_CUTOFF):
    """Split variables into kept indices and (kept, dropped) duplicate pairs.

    Columns are scanned in order; a later column whose score with a kept
    column reaches *cutoff* is dropped. A column that was already dropped is
    not reported a second time.
    """
    kept = []
    pairs = []
    dropped = set()
    n_vars = len(columns)
    for i in range(n_vars):
        if i in dropped:
            continue
        kept.append(i)
        for j in range(i + 1, n_vars):
            if j in dropped:
                continue
            score = nmi[i, j]
            if score >= cutoff:
                dropped.add(j)
                pairs.append({
                    "Variable conservée": columns[i],
                    "Doublon supprimé": columns[j],
                    "Score IMN": f"{score:.4f}",
                })
    return kept, pairs


def _build_graph(variables, nmi, min_weight):
    """Build the graph with one node per variable and edges above *min_weight*."""
    graph = nx.Graph()
    # Add every variable first so isolated ones are always displayed.
    graph.add_nodes_from(variables)
    rows, cols = np.triu_indices(len(variables), k=1)
    for i, j in zip(rows, cols):
        weight = float(nmi[i, j])
        if weight > min_weight:
            graph.add_edge(variables[i], variables[j], weight=weight)
    return graph


def _node_hover_texts(graph, min_weight):
    """Return one HTML hover text per node, listing its links by decreasing weight."""
    texts = []
    for node in graph.nodes():
        links = sorted(
            graph.edges(node, data=True),
            key=lambda edge: edge[2]["weight"],
            reverse=True,
        )
        text = f"<b>Variable : {node}</b><br><br>Liens (IMN > {min_weight}):<br>"
        if links:
            text += "".join(
                f"• {neighbor} : <b>{attrs['weight']:.4f}</b><br>"
                for _, neighbor, attrs in links
            )
        else:
            text += "<i>Aucun lien significatif</i>"
        texts.append(text)
    return texts


if df is None:
    st.info("👈 Veuillez sélectionner ou importer un jeu de données.")
elif df.empty or len(df) < MIN_ROWS:
    # Dropping incomplete rows can leave nothing to analyse; say so instead
    # of failing inside the estimators.
    st.warning(
        "Le jeu de données ne contient pas assez de lignes complètes "
        "(après suppression des valeurs manquantes) pour calculer "
        "l'information mutuelle."
    )
else:
    with st.spinner("Calcul de l'information mutuelle en cours..."):
        mi_matrix = _normalized_mi_matrix(df)
        to_keep, redundant_pairs = _find_near_duplicates(mi_matrix, df.columns)
        final_vars = [df.columns[i] for i in to_keep]
        final_mi = mi_matrix[np.ix_(to_keep, to_keep)]

        G = _build_graph(final_vars, final_mi, threshold)
        pos = nx.spring_layout(G, k=1.2, seed=42)
        node_hover_texts = _node_hover_texts(G, threshold)

    # --- DISPLAY ---
    tab1, tab2, tab3, tab4 = st.tabs(
        ["📊 Graphe interactif", "👯 Doublons filtrés", "📋 Matrice triangulaire", "📄 Aperçu des données"]
    )

    with tab1:
        # One trace per edge so that width and colour can follow its weight.
        edge_traces = []
        for source, target, attrs in G.edges(data=True):
            x0, y0 = pos[source]
            x1, y1 = pos[target]
            w = attrs["weight"]
            color = (
                f"rgba({int(255 * w)}, {int(150 * (1 - w))}, "
                f"{int(200 * (1 - w))}, {0.3 + 0.4 * w:.3f})"
            )
            edge_traces.append(go.Scatter(
                x=[x0, x1, None],
                y=[y0, y1, None],
                line=dict(width=w * 12, color=color),
                hoverinfo="none",
                mode="lines",
            ))

        nodes = list(G.nodes())
        node_trace = go.Scatter(
            x=[pos[n][0] for n in nodes],
            y=[pos[n][1] for n in nodes],
            mode="markers+text",
            text=nodes,
            textposition="bottom center",
            textfont=dict(color="white", size=11),
            marker=dict(
                # Marker size grows with the number of visible links.
                size=[10 + G.degree(n) * 5 for n in nodes],
                color="#1f77b4",
                line=dict(width=2, color="white"),
                opacity=1,
            ),
            hoverinfo="text",
            hovertext=node_hover_texts,
        )

        fig = go.Figure(data=edge_traces + [node_trace])
        fig.update_layout(
            paper_bgcolor="rgba(15,15,25,1)",
            plot_bgcolor="rgba(0,0,0,0)",
            height=GRAPH_HEIGHT,
            showlegend=False,
            xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
            yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
            margin=dict(b=20, l=5, r=5, t=40),
            hoverlabel=dict(bgcolor="rgba(30, 30, 50, 0.9)", font_size=13),
        )
        st.plotly_chart(fig, use_container_width=True)

    with tab2:
        st.subheader("Variables supprimées (Redondance forte)")
        if redundant_pairs:
            st.dataframe(pd.DataFrame(redundant_pairs), use_container_width=True)
        else:
            st.info("Aucun doublon détecté.")

    with tab3:
        st.subheader("Matrice de proximité")
        df_imn = pd.DataFrame(final_mi, index=final_vars, columns=final_vars)
        # Hide the upper triangle and the diagonal (the matrix is symmetric).
        mask = np.triu(np.ones_like(df_imn, dtype=bool))
        fig_map, ax = plt.subplots(figsize=(10, 8), layout="constrained")
        fig_map.patch.set_facecolor("white")
        ax.set_facecolor("white")
        sns.heatmap(
            df_imn, mask=mask, cmap="coolwarm", vmax=1.0, vmin=0,
            annot=True, fmt=".2f", square=True, linewidths=.5,
            cbar_kws={"shrink": .8}, ax=ax, annot_kws={"size": 9},
        )
        # Style the tick labels through the axes, not the global pyplot state.
        plt.setp(ax.get_xticklabels(), rotation=45, ha="right", color="black")
        plt.setp(ax.get_yticklabels(), rotation=0, color="black")
        st.pyplot(fig_map)
        # Release the figure; otherwise one figure accumulates per rerun.
        plt.close(fig_map)

    with tab4:
        st.subheader("Aperçu des données (20 premières lignes)")
        st.dataframe(df.head(20), use_container_width=True)
# Footer: a horizontal rule followed by a centred one-line reminder of what
# the IMN score means. The HTML is rendered as-is (unsafe_allow_html=True).
FOOTER_HTML = """
<div style='text-align: center; color: gray;'>
<small>L'Information Mutuelle Normalisée (IMN) mesure la dépendance entre variables (0 = indépendance, 1 = dépendance totale)</small>
</div>
"""

st.markdown("---")
st.markdown(FOOTER_HTML, unsafe_allow_html=True)