# Spaces: Eric2mangel / Mapping_variables (Running)
# File size: 10,625 Bytes
# 72eee83

import streamlit as st
import seaborn as sns
import pandas as pd
import numpy as np
import networkx as nx
import plotly.graph_objects as go
import matplotlib.pyplot as plt
from sklearn.feature_selection import mutual_info_regression
from scipy.stats import entropy
from concurrent.futures import ThreadPoolExecutor

# ==========================================
# EASY PROGRAM CONFIGURATION
# ==========================================
# Height of the interactive graph, in pixels; edit this value to resize it.
GRAPH_HEIGHT = 480
# ==========================================

# Page-level settings followed by the main title and subtitle.
st.set_page_config(
    page_title="Analyse de proximité",
    page_icon="🔗",
    layout="wide",
)
st.title("🔗 Cartographie des variables")
st.subheader("Méthode : Information Mutuelle Normalisée (IMN)")

# --- SIDEBAR ---
# Lets the user pick a data source, loads it into `df` (rows containing any
# missing value are dropped) and asks for the link-visibility threshold.
# After this block `df` is either a DataFrame with at least two rows or None;
# `threshold` is only defined when `df` is not None.
with st.sidebar:
    st.header("⚙️ Configuration")
    data_source = st.radio("Source des données", ["Dataset Seaborn", "Importer un fichier"])

    df = None
    if data_source == "Importer un fichier":
        uploaded_file = st.file_uploader("Fichier CSV ou Excel", type=["csv", "xlsx", "xls"])
        if uploaded_file:
            file_extension = uploaded_file.name.split('.')[-1].lower()

            with st.spinner("Chargement du fichier..."):
                try:
                    if file_extension == 'csv':
                        # sep=None with the python engine lets pandas detect the separator.
                        df = pd.read_csv(uploaded_file, sep=None, engine='python', encoding_errors='ignore')
                    elif file_extension in ['xlsx', 'xls']:
                        df = pd.read_excel(uploaded_file)
                    if df is not None:
                        df = df.dropna()
                except Exception as e:
                    st.error(f"Erreur lors du chargement : {e}")
                    df = None
    else:
        # Built-in Seaborn dataset selection.
        dataset_name = st.selectbox(
            "Choisir un dataset",
            ["titanic", "tips", "iris", "penguins", "mpg", "planets", "flights", "diamonds"]
        )
        try:
            with st.spinner(f"Chargement du dataset {dataset_name}..."):
                df = sns.load_dataset(dataset_name).dropna()
        except Exception as e:
            # `except Exception` rather than a bare `except:` so that
            # KeyboardInterrupt / SystemExit are not swallowed, and the cause
            # is reported like in the file-upload branch.
            st.error(f"Impossible de charger le dataset '{dataset_name}' : {e}")
            df = None

    # dropna() may leave nothing usable (e.g. every row has one empty cell).
    # The analysis below bins each column with min(10, n_unique) bins and runs
    # a nearest-neighbour estimator, neither of which works on 0 or 1 row.
    if df is not None and len(df) < 2:
        st.warning(
            "Le jeu de données contient moins de 2 lignes complètes après "
            "suppression des valeurs manquantes : analyse impossible."
        )
        df = None

    if df is not None:
        # Dataset size.
        st.metric("Lignes × Colonnes", f"{len(df)} × {len(df.columns)}")

        # select_slider instead of a plain slider: only the interpretation
        # tiers listed in the legend below can be selected.
        threshold = st.select_slider(
            "Seuil de visibilité des liens (IMN)",
            options=[0, 0.1, 0.3, 0.6, 0.9],
            value=0.3,
            help="Filtre les liens selon les paliers d'interprétation définis ci-dessous."
        )

    st.info("💡 **Légende de l'IMN**")
    st.markdown("""

    * **0.90 – 1.00** : Quasi-doublons

    * **0.60 – 0.90** : Relation forte

    * **0.30 – 0.60** : Relation modérée

    * **0.10 – 0.30** : Relation faible

    * **< 0.10** : Indépendance

    """)

# --- LOGIQUE DE CALCUL ---
if df is not None:
    with st.spinner("Calcul de l'information mutuelle en cours..."):
        # Pipeline: encode columns -> per-column entropies -> pairwise mutual
        # information -> normalise and symmetrise -> drop near-duplicates ->
        # build the graph, its layout and the hover texts.
        # Names consumed by the display code further down: G, pos,
        # node_hover_texts, redundant_pairs, final_vars, final_mi.

        # --- Column encoding ---
        # Numeric dtypes (ints, floats, bools) are cast to float32 and treated
        # as continuous. Everything else (object, pandas string dtype,
        # category, datetime, ...) is replaced by integer category codes and
        # flagged as discrete. Testing "is numeric" instead of
        # "is object/category" keeps string-dtype and datetime columns away
        # from the float cast, which would raise on them.
        df_calc = df.copy()
        discrete_map = []

        for col in df.columns:
            if pd.api.types.is_numeric_dtype(df[col]):
                df_calc[col] = df[col].astype(np.float32)
                discrete_map.append(False)
            else:
                df_calc[col] = df[col].astype('category').cat.codes.astype(np.int32)
                discrete_map.append(True)

        n_vars = len(df.columns)
        mi_matrix = np.zeros((n_vars, n_vars), dtype=np.float32)

        # --- Entropies (used as the normalisation denominator) ---
        # Discrete columns: exact entropy from category frequencies. Binning
        # the codes into at most 10 histogram bins merges categories as soon
        # as there are more than 10 of them, which underestimates the entropy
        # and inflates the normalised score.
        # Continuous columns: histogram estimate with at most 10 bins; the
        # 1e-9 offset keeps empty bins away from log(0).
        entropies = []
        for i in range(n_vars):
            column = df_calc.iloc[:, i]
            if discrete_map[i]:
                probs = column.value_counts(normalize=True).to_numpy()
                entropies.append(entropy(probs))
            else:
                bins = min(10, len(column.unique()))
                hist = np.histogram(column, bins=bins, density=True)[0]
                entropies.append(entropy(hist + 1e-9))

        # Neighbour count for the estimator: 1 below 200 rows, 2 below 300,
        # capped at 3. Computed once instead of on every call.
        n_neighbors = min(3, max(1, len(df_calc) // 100))

        def compute_mi_row(i):
            """Return ``(i, scores)``; ``scores[j]`` is the estimated mutual
            information between column ``j`` (feature) and column ``i`` (target).
            """
            # NOTE(review): mutual_info_regression is used even when column i
            # is categorical; scikit-learn provides mutual_info_classif for
            # discrete targets — consider switching for those rows.
            scores = mutual_info_regression(
                df_calc,
                df_calc.iloc[:, i],
                discrete_features=discrete_map,
                random_state=42,
                n_neighbors=n_neighbors
            )
            return i, scores

        # Each row is independent of the others, so rows are dispatched to a
        # pool of 4 worker threads.
        with ThreadPoolExecutor(max_workers=4) as executor:
            results = list(executor.map(compute_mi_row, range(n_vars)))

        # --- Normalisation ---
        # Divide by the smaller of the two entropies and clip to [0, 1];
        # a zero entropy (constant column) yields a score of 0.
        for i, scores in results:
            for j, s in enumerate(scores):
                if i == j:
                    mi_matrix[i, j] = 1.0
                else:
                    h_min = min(entropies[i], entropies[j])
                    nmi = s / h_min if h_min > 0 else 0
                    mi_matrix[i, j] = min(max(nmi, 0), 1.0)

        # Symmetrise by averaging both estimation directions.
        mi_matrix = (mi_matrix + mi_matrix.T) / 2
        np.fill_diagonal(mi_matrix, 1.0)

        # --- Near-duplicate removal ---
        # The first variable of a redundant group (score >= 0.99) is kept and
        # the later ones are dropped.
        to_keep = []
        redundant_pairs = []
        seen = set()

        for i in range(n_vars):
            if i in seen:
                continue
            for j in range(i + 1, n_vars):
                # A column already dropped must not be reported a second time
                # against another kept variable.
                if j in seen:
                    continue
                val_im = mi_matrix[i, j]
                if val_im >= 0.99:
                    seen.add(j)
                    redundant_pairs.append({
                        "Variable conservée": df.columns[i],
                        "Doublon supprimé": df.columns[j],
                        "Score IMN": f"{val_im:.4f}"
                    })
            to_keep.append(i)

        final_vars = [df.columns[i] for i in to_keep]
        final_mi = mi_matrix[np.ix_(to_keep, to_keep)]

        # --- Graph ---
        # One node per kept variable; an edge wherever the score is strictly
        # above the sidebar threshold. Isolated variables are added explicitly
        # so they still appear in the plot.
        G = nx.Graph()
        for i in range(len(final_vars)):
            for j in range(i + 1, len(final_vars)):
                im_val = final_mi[i, j]
                if im_val > threshold:
                    G.add_edge(final_vars[i], final_vars[j], weight=float(im_val))
            if final_vars[i] not in G:
                G.add_node(final_vars[i])

        # Fixed seed so the layout is the same on every rerun.
        pos = nx.spring_layout(G, k=1.2, seed=42)

        # --- Hover texts ---
        # Built in G.nodes() order, which is the order the node trace uses.
        node_hover_texts = []
        for node in G.nodes():
            hover_text = f"<b>Variable : {node}</b><br><br>Liens (IMN > {threshold}):<br>"
            neighbors = G.edges(node, data=True)
            sorted_neighbors = sorted(neighbors, key=lambda x: x[2]['weight'], reverse=True)

            if not sorted_neighbors:
                hover_text += "<i>Aucun lien significatif</i>"
            else:
                for _, neighbor, data in sorted_neighbors:
                    hover_text += f"• {neighbor} : <b>{data['weight']:.4f}</b><br>"
            node_hover_texts.append(hover_text)

    # --- DISPLAY ---
    tab1, tab2, tab3, tab4 = st.tabs(["📊 Graphe interactif", "👯 Doublons filtrés", "📋 Matrice triangulaire", "📄 Aperçu des données"])

    with tab1:
        # One line trace per edge, so every link gets its own width and colour
        # derived from its weight.
        edge_traces = []
        for src, dst, attrs in G.edges(data=True):
            w = attrs['weight']
            x0, y0 = pos[src]
            x1, y1 = pos[dst]
            color = f'rgba({int(255*w)}, {int(150*(1-w))}, {int(200*(1-w))}, {0.3 + 0.4*w})'
            segment = go.Scatter(
                x=[x0, x1, None],
                y=[y0, y1, None],
                line=dict(width=w*12, color=color),
                hoverinfo='none',
                mode='lines',
            )
            edge_traces.append(segment)

        # Single trace for all nodes; marker size grows with the node degree.
        nodes = list(G.nodes())
        node_x = [pos[n][0] for n in nodes]
        node_y = [pos[n][1] for n in nodes]
        marker_sizes = [10 + G.degree(n) * 5 for n in nodes]
        node_marker = dict(
            size=marker_sizes,
            color='#1f77b4',
            line=dict(width=2, color='white'),
            opacity=1,
        )
        node_trace = go.Scatter(
            x=node_x,
            y=node_y,
            mode='markers+text',
            text=nodes,
            textposition="bottom center",
            textfont=dict(color='white', size=11),
            marker=node_marker,
            hoverinfo='text',
            hovertext=node_hover_texts,
        )

        # Dark canvas without axes, grid or legend.
        hidden_axis = dict(showgrid=False, zeroline=False, showticklabels=False)
        fig = go.Figure(data=[*edge_traces, node_trace])
        fig.update_layout(
            paper_bgcolor='rgba(15,15,25,1)',
            plot_bgcolor='rgba(0,0,0,0)',
            height=GRAPH_HEIGHT,
            showlegend=False,
            xaxis=dict(hidden_axis),
            yaxis=dict(hidden_axis),
            margin=dict(b=20, l=5, r=5, t=40),
            hoverlabel=dict(bgcolor="rgba(30, 30, 50, 0.9)", font_size=13),
        )
        st.plotly_chart(fig, use_container_width=True)

    with tab2:
        st.subheader("Variables supprimées (Redondance forte)")
        # An empty list means no variable was dropped as a near-duplicate.
        if not redundant_pairs:
            st.info("Aucun doublon détecté.")
        else:
            duplicates_table = pd.DataFrame(redundant_pairs)
            st.dataframe(duplicates_table, use_container_width=True)

    with tab3:
        # Lower-triangular heatmap of the normalised scores between the
        # variables that were kept.
        st.subheader("Matrice de proximité")
        df_imn = pd.DataFrame(final_mi, index=final_vars, columns=final_vars)
        # Mask the upper triangle and the diagonal: the matrix is symmetric
        # and its diagonal is always 1.
        mask = np.triu(np.ones_like(df_imn, dtype=bool))

        fig_map, ax = plt.subplots(figsize=(10, 8), layout="constrained")
        fig_map.patch.set_facecolor('white')

        sns.heatmap(df_imn, mask=mask, cmap="coolwarm", vmax=1.0, vmin=0,
                    annot=True, fmt=".2f", square=True, linewidths=.5,
                    cbar_kws={"shrink": .8}, ax=ax, annot_kws={"size": 9})

        # Style the tick labels through `ax` rather than plt.xticks/plt.yticks,
        # which act on pyplot's global "current axes".
        plt.setp(ax.get_xticklabels(), rotation=45, ha='right', color='black')
        plt.setp(ax.get_yticklabels(), rotation=0, color='black')
        ax.set_facecolor('white')

        st.pyplot(fig_map)
        # pyplot keeps every figure it creates until it is closed; without
        # this, one figure would pile up in memory on each script rerun.
        plt.close(fig_map)
    
    with tab4:
        # Raw preview of the loaded table, limited to its first 20 rows.
        st.subheader("Aperçu des données (20 premières lignes)")
        preview = df.head(20)
        st.dataframe(preview, use_container_width=True)
else:
    st.info("👈 Veuillez sélectionner ou importer un jeu de données.")

# Footer: horizontal rule followed by a centred one-line reminder of the
# metric, rendered as raw HTML.
footer_html = """

<div style='text-align: center; color: gray;'>

    <small>L'Information Mutuelle Normalisée (IMN) mesure la dépendance entre variables (0 = indépendance, 1 = dépendance totale)</small>

</div>

"""
st.markdown("---")
st.markdown(footer_html, unsafe_allow_html=True)