Spaces:

Eric2mangel
/

Mapping_variables

Sleeping

App Files Files Community

Eric2mangel committed on Jan 4

Commit

72eee83

verified ·

1 Parent(s): 2e35e54

Upload 4 files

Browse files

Import des fichiers

Files changed (4) hide show

Dockerfile +34 -20
README.md +41 -19
app.py +262 -0
requirements.txt +10 -3

Dockerfile CHANGED Viewed

@@ -1,20 +1,34 @@
-FROM python:3.13.5-slim
-WORKDIR /app
-RUN apt-get update && apt-get install -y \
-    build-essential \
-    curl \
-    git \
-    && rm -rf /var/lib/apt/lists/*
-COPY requirements.txt ./
-COPY src/ ./src/
-RUN pip3 install -r requirements.txt
-EXPOSE 8501
-HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
-ENTRYPOINT ["streamlit", "run", "src/streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]

+# syntax = docker/dockerfile:1.4
+FROM python:3.12-slim
+# Install the required system dependencies (compilers, image-library headers, curl, git)
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    gcc \
+    g++ \
+    libjpeg62-turbo-dev \
+    zlib1g-dev \
+    libpng-dev \
+    libfreetype6-dev \
+    libopenjp2-7-dev \
+    libtiff5-dev \
+    curl \
+    git \
+    && rm -rf /var/lib/apt/lists/*
+WORKDIR /app
+# Copy requirements first (better layer caching)
+COPY requirements.txt .
+# Install everything (without the pip cache, to reduce the final image size)
+RUN pip install --no-cache-dir -r requirements.txt
+# Copy the application code
+COPY app.py .
+# Port + mandatory start command for HF Spaces
+EXPOSE 8501
+HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health || exit 1
+# NOTE(review): CORS and XSRF protection are switched off below - presumably so file uploads work behind the Spaces proxy; confirm this is still required.
+CMD ["streamlit", "run", "app.py", "--server.port=8501", "--server.address=0.0.0.0", "--server.enableCORS=false", "--server.enableXsrfProtection=false"]

README.md CHANGED Viewed

@@ -1,19 +1,41 @@
----
-title: Mapping Variables
-emoji: 🚀
-colorFrom: red
-colorTo: red
-sdk: docker
-app_port: 8501
-tags:
-- streamlit
-pinned: false
-short_description: Mapping variables with mutual information index
----
-# Welcome to Streamlit!
-Edit `/src/streamlit_app.py` to customize this app to your heart's desire. :heart:
-If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
-forums](https://discuss.streamlit.io).

+---
+title: Cartographie des variables
+emoji: 🔗
+colorFrom: blue
+colorTo: blue
+sdk: docker
+app_port: 8501
+pinned: false
+---
+# 🔗 Cartographie des variables
+Application Streamlit pour analyser les relations entre variables via l'Information Mutuelle Normalisée (IMN).
+## Fonctionnalités
+- Import de jeux de données Seaborn
+- Import de fichiers CSV et Excel (détection automatique du séparateur)
+- Calcul de l'Information Mutuelle Normalisée entre toutes les variables
+- Visualisation interactive des relations sous forme de graphe réseau
+- Identification automatique des variables redondantes (doublons)
+- Matrice de proximité triangulaire avec heatmap
+- Seuils d'interprétation configurables
+## Interprétation de l'IMN
+- **0.90 – 1.00** : Quasi-doublons (variables redondantes)
+- **0.60 – 0.90** : Relation forte
+- **0.30 – 0.60** : Relation modérée
+- **0.10 – 0.30** : Relation faible
+- **< 0.10** : Indépendance
+## Utilisation
+1. Sélectionnez une source de données (dataset Seaborn ou fichier importé)
+2. Ajustez le seuil de visibilité des liens selon vos besoins
+3. Explorez les 4 onglets :
+   - **Graphe interactif** : visualisation réseau des relations
+   - **Doublons filtrés** : variables redondantes détectées
+   - **Matrice triangulaire** : heatmap de toutes les relations
+   - **Aperçu des données** : visualisation du dataset

app.py ADDED Viewed

	@@ -0,0 +1,262 @@

+import streamlit as st
+import seaborn as sns
+import pandas as pd
+import numpy as np
+import networkx as nx
+import plotly.graph_objects as go
+import matplotlib.pyplot as plt
+from sklearn.feature_selection import mutual_info_regression
+from scipy.stats import entropy
+from concurrent.futures import ThreadPoolExecutor
+# ==========================================
+# EASY PROGRAM CONFIGURATION
+# ==========================================
+GRAPH_HEIGHT = 480  # Height of the network graph in pixels; change this value to resize it
+# ==========================================
+# Page setup: browser-tab title and icon, wide layout, then the on-page title and subtitle.
+st.set_page_config(page_title="Analyse de proximité", page_icon="🔗", layout="wide")
+st.title("🔗 Cartographie des variables")
+st.subheader("Méthode : Information Mutuelle Normalisée (IMN)")
+# --- SIDEBAR ---
+with st.sidebar:
+    st.header("⚙️ Configuration")
+    data_source = st.radio("Source des données", ["Dataset Seaborn", "Importer un fichier"])
+    df = None
+    if data_source == "Importer un fichier":
+        uploaded_file = st.file_uploader("Fichier CSV ou Excel", type=["csv", "xlsx", "xls"])
+        if uploaded_file:
+            file_extension = uploaded_file.name.split('.')[-1].lower()
+            with st.spinner("Chargement du fichier..."):
+                try:
+                    if file_extension == 'csv':
+                        # Tentative de lecture CSV avec détection automatique du séparateur
+                        df = pd.read_csv(uploaded_file, sep=None, engine='python', encoding_errors='ignore')
+                        df = df.dropna()
+                    elif file_extension in ['xlsx', 'xls']:
+                        df = pd.read_excel(uploaded_file)
+                        df = df.dropna()
+                except Exception as e:
+                    st.error(f"Erreur lors du chargement : {str(e)}")
+                    df = None
+    else:
+        # Sélection du dataset Seaborn
+        dataset_name = st.selectbox(
+            "Choisir un dataset",
+            ["titanic", "tips", "iris", "penguins", "mpg", "planets", "flights", "diamonds"]
+        )
+        try:
+            with st.spinner(f"Chargement du dataset {dataset_name}..."):
+                df = sns.load_dataset(dataset_name).dropna()
+        except:
+            st.error(f"Impossible de charger le dataset '{dataset_name}'")
+            df = None
+    if df is not None:
+        # Affichage de la taille du dataset
+        st.metric("Lignes × Colonnes", f"{len(df)} × {len(df.columns)}")
+        # Remplacement du slider classique par un select_slider avec vos paliers spécifiques
+        threshold = st.select_slider(
+            "Seuil de visibilité des liens (IMN)",
+            options=[0, 0.1, 0.3, 0.6, 0.9],
+            value=0.3,
+            help="Filtre les liens selon les paliers d'interprétation définis ci-dessous."
+        )
+    st.info("💡 **Légende de l'IMN**")
+    st.markdown("""
+    * **0.90 – 1.00** : Quasi-doublons
+    * **0.60 – 0.90** : Relation forte
+    * **0.30 – 0.60** : Relation modérée
+    * **0.10 – 0.30** : Relation faible
+    * **< 0.10** : Indépendance
+    """)
+# --- LOGIQUE DE CALCUL ---
+if df is not None:
+    with st.spinner("Calcul de l'information mutuelle en cours..."):
+        # Préparation des colonnes avec typage optimisé
+        df_calc = df.copy()
+        discrete_map = []
+        for col in df.columns:
+            if df[col].dtype == 'object' or df[col].dtype.name == 'category':
+                df_calc[col] = df[col].astype('category').cat.codes.astype(np.int32)
+                discrete_map.append(True)
+            else:
+                df_calc[col] = df_calc[col].astype(np.float32)
+                discrete_map.append(False)
+        n_vars = len(df.columns)
+        mi_matrix = np.zeros((n_vars, n_vars), dtype=np.float32)
+        # Calcul des entropies avec bins adaptés
+        entropies = []
+        for i in range(n_vars):
+            bins = min(10, len(df_calc.iloc[:, i].unique()))
+            hist = np.histogram(df_calc.iloc[:, i], bins=bins, density=True)[0]
+            entropies.append(entropy(hist + 1e-9))
+        # Fonction pour calculer une ligne de la matrice
+        def compute_mi_row(i):
+            scores = mutual_info_regression(
+                df_calc,
+                df_calc.iloc[:, i],
+                discrete_features=discrete_map,
+                random_state=42,
+                n_neighbors=min(3, max(1, len(df_calc) // 100))
+            )
+            return i, scores
+        # Calcul parallélisé de la matrice d'information mutuelle
+        with ThreadPoolExecutor(max_workers=4) as executor:
+            results = list(executor.map(compute_mi_row, range(n_vars)))
+        # Remplissage de la matrice et symétrisation
+        for i, scores in results:
+            for j, s in enumerate(scores):
+                if i == j:
+                    mi_matrix[i, j] = 1.0
+                else:
+                    h_min = min(entropies[i], entropies[j])
+                    nmi = s / h_min if h_min > 0 else 0
+                    mi_matrix[i, j] = min(max(nmi, 0), 1.0)
+        # Symétrisation de la matrice (moyenne des deux directions)
+        mi_matrix = (mi_matrix + mi_matrix.T) / 2
+        np.fill_diagonal(mi_matrix, 1.0)
+        to_keep = []
+        redundant_pairs = []
+        seen = set()
+        for i in range(n_vars):
+            if i in seen: continue
+            for j in range(i + 1, n_vars):
+                val_im = mi_matrix[i, j]
+                if val_im >= 0.99:
+                    seen.add(j)
+                    redundant_pairs.append({
+                        "Variable conservée": df.columns[i],
+                        "Doublon supprimé": df.columns[j],
+                        "Score IMN": f"{val_im:.4f}"
+                    })
+            to_keep.append(i)
+        final_vars = [df.columns[i] for i in to_keep]
+        final_mi = mi_matrix[np.ix_(to_keep, to_keep)]
+        G = nx.Graph()
+        for i in range(len(final_vars)):
+            for j in range(i + 1, len(final_vars)):
+                im_val = final_mi[i, j]
+                if im_val > threshold:
+                    G.add_edge(final_vars[i], final_vars[j], weight=float(im_val))
+            if final_vars[i] not in G:
+                G.add_node(final_vars[i])
+        pos = nx.spring_layout(G, k=1.2, seed=42)
+        node_hover_texts = []
+        for node in G.nodes():
+            hover_text = f"<b>Variable : {node}</b><br><br>Liens (IMN > {threshold}):<br>"
+            neighbors = G.edges(node, data=True)
+            sorted_neighbors = sorted(neighbors, key=lambda x: x[2]['weight'], reverse=True)
+            if not sorted_neighbors:
+                hover_text += "<i>Aucun lien significatif</i>"
+            else:
+                for _, neighbor, data in sorted_neighbors:
+                    hover_text += f"• {neighbor} : <b>{data['weight']:.4f}</b><br>"
+            node_hover_texts.append(hover_text)
+    # --- AFFICHAGE ---
+    tab1, tab2, tab3, tab4 = st.tabs(["📊 Graphe interactif", "👯 Doublons filtrés", "📋 Matrice triangulaire", "📄 Aperçu des données"])
+    with tab1:
+        edge_traces = []
+        for edge in G.edges(data=True):
+            x0, y0 = pos[edge[0]]
+            x1, y1 = pos[edge[1]]
+            w = edge[2]['weight']
+            color = f'rgba({int(255*w)}, {int(150*(1-w))}, {int(200*(1-w))}, {0.3 + 0.4*w})'
+            edge_traces.append(go.Scatter(
+                x=[x0, x1, None], y=[y0, y1, None],
+                line=dict(width=w*12, color=color),
+                hoverinfo='none',
+                mode='lines'
+            ))
+        node_trace = go.Scatter(
+            x=[pos[n][0] for n in G.nodes()],
+            y=[pos[n][1] for n in G.nodes()],
+            mode='markers+text',
+            text=list(G.nodes()),
+            textposition="bottom center",
+            textfont=dict(color='white', size=11),
+            marker=dict(
+                size=[10 + G.degree(n) * 5 for n in G.nodes()],
+                color='#1f77b4',
+                line=dict(width=2, color='white'),
+                opacity=1
+            ),
+            hoverinfo='text',
+            hovertext=node_hover_texts
+        )
+        fig = go.Figure(data=edge_traces + [node_trace])
+        fig.update_layout(
+            paper_bgcolor='rgba(15,15,25,1)',
+            plot_bgcolor='rgba(0,0,0,0)',
+            height=GRAPH_HEIGHT,
+            showlegend=False,
+            xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
+            yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
+            margin=dict(b=20, l=5, r=5, t=40),
+            hoverlabel=dict(bgcolor="rgba(30, 30, 50, 0.9)", font_size=13)
+        )
+        st.plotly_chart(fig, use_container_width=True)
+    with tab2:
+        st.subheader("Variables supprimées (Redondance forte)")
+        if redundant_pairs:
+            st.dataframe(pd.DataFrame(redundant_pairs), use_container_width=True)
+        else:
+            st.info("Aucun doublon détecté.")
+    with tab3:
+        st.subheader("Matrice de proximité")
+        df_imn = pd.DataFrame(final_mi, index=final_vars, columns=final_vars)
+        mask = np.triu(np.ones_like(df_imn, dtype=bool))
+        fig_map, ax = plt.subplots(figsize=(10, 8), layout="constrained")
+        fig_map.patch.set_facecolor('white')
+        sns.heatmap(df_imn, mask=mask, cmap="coolwarm", vmax=1.0, vmin=0,
+                    annot=True, fmt=".2f", square=True, linewidths=.5,
+                    cbar_kws={"shrink": .8}, ax=ax, annot_kws={"size": 9})
+        plt.xticks(rotation=45, ha='right', color='black')
+        plt.yticks(rotation=0, color='black')
+        ax.set_facecolor('white')
+        st.pyplot(fig_map)
+    with tab4:
+        # Raw preview: only the first 20 rows of the loaded DataFrame are displayed
+        st.subheader("Aperçu des données (20 premières lignes)")
+        st.dataframe(df.head(20), use_container_width=True)
+else:
+    st.info("👈 Veuillez sélectionner ou importer un jeu de données.")
+# Footer
+# Horizontal rule followed by a centred one-line reminder of what the IMN measures.
+# unsafe_allow_html is required for the raw <div>; the markup is a static literal.
+st.markdown("---")
+st.markdown("""
+<div style='text-align: center; color: gray;'>
+    <small>L'Information Mutuelle Normalisée (IMN) mesure la dépendance entre variables (0 = indépendance, 1 = dépendance totale)</small>
+</div>
+""", unsafe_allow_html=True)

requirements.txt CHANGED Viewed

@@ -1,3 +1,10 @@
-altair
-pandas
-streamlit

+streamlit==1.40.2
+pandas==2.2.3
+numpy==2.2.1
+seaborn==0.13.2
+matplotlib==3.10.0
+scipy==1.15.1
+scikit-learn==1.6.1
+networkx==3.4.2
+plotly==5.24.1
+openpyxl==3.1.5