# Hugging Face Space: Eric2mangel/Partial_correlations (status: Sleeping)
# File size: 8,162 Bytes

import streamlit as st
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
import itertools
import matplotlib.pyplot as plt

# ------------------------------
# Streamlit configuration
# ------------------------------
st.set_page_config(layout="wide")

st.sidebar.title("Exploration des corrélations partielles")
st.sidebar.markdown("---")
st.sidebar.header("Configuration")

# Dataset selection
# Streamlit re-executes the script on every widget interaction, so the scan of
# all Seaborn datasets is cached when the installed Streamlit exposes
# ``st.cache_data``; otherwise the function simply runs uncached.
_cache_data = getattr(st, "cache_data", lambda func: func)


@_cache_data
def _list_usable_datasets():
    """Return the Seaborn dataset names usable by this app.

    A dataset is usable when, after dropping rows with missing values, it has
    at least three ``float64``/``int64`` columns.

    Returns:
        list[str]: Dataset names, in the order given by
        ``sns.get_dataset_names()``.
    """
    usable = []
    for name in sns.get_dataset_names():
        try:
            candidate = sns.load_dataset(name).dropna()
        except Exception:
            # Best effort: a dataset that cannot be loaded is skipped.
            # ``Exception`` (not a bare ``except``) so that KeyboardInterrupt,
            # SystemExit and other BaseException-based signals still propagate.
            continue
        numeric_cols = candidate.select_dtypes(include=['float64', 'int64']).columns
        if len(numeric_cols) >= 3:
            usable.append(name)
    return usable


available_datasets = _list_usable_datasets()

# With no usable dataset the selectbox would return None and the load below
# would fail with an unrelated error; stop with an explicit message instead.
if not available_datasets:
    st.error("Aucun jeu de données Seaborn exploitable n'a pu être chargé.")
    st.stop()

dataset_name = st.sidebar.selectbox("Sélectionnez un jeu de données Seaborn :", available_datasets)

df = sns.load_dataset(dataset_name).dropna()

# Restrict the analysis to the numeric columns
df_numeric = df.select_dtypes(include=['float64', 'int64'])

# Pearson / Spearman switch
corr_type = st.sidebar.radio("Type de corrélation :", ["Pearson", "Spearman"], index=0)

# Variables whose influence is to be removed
all_vars = df_numeric.columns.tolist()
control_vars = st.sidebar.multiselect("Variables dont vous voulez éliminer l'influence :", all_vars)

# Every numeric column that is not controlled for stays in the matrices
vars_remaining = [col for col in all_vars if col not in control_vars]

# A correlation needs at least two variables
if len(vars_remaining) < 2:
    st.error("Il faut au moins deux variables restantes pour afficher une corrélation.")
    st.stop()

# ------------------------------
# Spearman preprocessing (when selected): work on ranks
# ------------------------------
use_spearman = corr_type == "Spearman"
df_for_corr = df_numeric.rank() if use_spearman else df_numeric.copy()

# ------------------------------
# Raw correlation matrix
# ------------------------------
corr_method = "spearman" if use_spearman else "pearson"
corr_raw = df_for_corr[vars_remaining].corr(method=corr_method)

# ------------------------------
# Fonction corrélation partielle
# ------------------------------
def partial_corr(df, controls, method="pearson"):
    """Return the matrix of partial correlations between non-control columns.

    Every column of ``df`` that is not listed in ``controls`` is regressed
    (ordinary least squares with an intercept) on the control columns, and the
    Pearson correlations between the resulting residuals are returned. When
    ``controls`` is empty, the plain correlation matrix is returned.

    Args:
        df: DataFrame of numeric columns without missing values. For a
            Spearman-type partial correlation pass rank-transformed data
            (``df.rank()``); Pearson on ranks equals Spearman.
        controls: Column names whose linear influence is removed.
        method: Correlation method passed to ``DataFrame.corr`` when
            ``controls`` is empty. Ignored otherwise, because residuals are
            always correlated with Pearson.

    Returns:
        pandas.DataFrame: Square matrix indexed and labelled by the
        non-control columns, in their order in ``df``, with a diagonal of
        exactly 1.0.
    """
    controls = list(controls)
    vars_to_corr = [v for v in df.columns if v not in controls]

    if not vars_to_corr:
        return pd.DataFrame(np.zeros((0, 0)), index=vars_to_corr, columns=vars_to_corr)

    if controls:
        targets = df[vars_to_corr].to_numpy(dtype=float)
        predictors = df[controls].to_numpy(dtype=float)

        # Centring both sides is equivalent to fitting an intercept.
        targets = targets - targets.mean(axis=0)
        predictors = predictors - predictors.mean(axis=0)

        # A single least-squares solve residualises every variable at once,
        # instead of refitting two regressions for each ordered pair.
        coefficients = np.linalg.lstsq(predictors, targets, rcond=None)[0]
        residuals = pd.DataFrame(targets - predictors @ coefficients, columns=vars_to_corr)
        corr = residuals.corr()
    else:
        corr = df[vars_to_corr].corr(method=method)

    # Force the diagonal to exactly 1.0, including for degenerate columns
    # whose self-correlation would otherwise come out as NaN.
    values = corr.to_numpy(dtype=float, copy=True)
    np.fill_diagonal(values, 1.0)
    return pd.DataFrame(values, index=vars_to_corr, columns=vars_to_corr)

# ------------------------------
# Partial correlation matrix
# ------------------------------
corr_partial = partial_corr(df_for_corr, control_vars)

# ------------------------------
# Differences (partial - raw)
# ------------------------------
# One entry per unordered pair of remaining variables, i.e. the strict upper
# triangle of the matrices, in row-major order.
diff_data = [
    {
        'Paire': f"{first} - {second}",
        'Différence': corr_partial.loc[first, second] - corr_raw.loc[first, second],
    }
    for first, second in itertools.combinations(vars_remaining, 2)
]

# Sorted from the most negative to the most positive change
df_diff = pd.DataFrame(diff_data).sort_values('Différence', ascending=True)

# ------------------------------
# MAIN TABS
# ------------------------------
def _show_heatmap(column, title, corr_matrix):
    """Render the lower triangle of a correlation matrix as a heatmap.

    Args:
        column: Streamlit container (here a column) to draw into.
        title: Text written above the figure.
        corr_matrix: Square DataFrame of correlation values.
    """
    with column:
        st.write(title)
        # Hide the upper triangle and the diagonal.
        upper_mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
        fig, ax = plt.subplots(figsize=(5, 4))
        sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", center=0, ax=ax,
                    mask=upper_mask, square=True, vmin=-1, vmax=1,
                    cbar_kws={'shrink': 0.75}, annot_kws={'size': 9})
        ax.tick_params(axis='both', labelsize=8)
        # tight_layout() is deliberately not called here, and the figure is
        # rendered at its own size so both heatmaps look the same.
        st.pyplot(fig, use_container_width=False)
        # Close the figure: pyplot keeps every figure it creates alive, and
        # the script is re-executed on each widget interaction.
        plt.close(fig)


tab1, tab2 = st.tabs(["📊 Matrices", "📄 Données"])

# ----------- TAB 1 -----------
with tab1:
    col1, col2 = st.columns(2)

    # Column 1: raw correlation, column 2: partial correlation
    _show_heatmap(col1, f"**Corrélation brute ({corr_type})**", corr_raw)
    _show_heatmap(col2, f"**Corrélation partielle ({corr_type})**", corr_partial)

    # Bar chart of the differences, full width below the heatmaps
    st.write("**Différences (Partielle - Brute)**")
    # Grow the figure with the number of pairs so the bar labels do not
    # overlap; 2.2 in remains the height for small numbers of pairs.
    fig3, ax3 = plt.subplots(figsize=(12, max(2.2, 0.25 * len(df_diff))))
    colors = ['#d7191c' if x < 0 else '#2b83ba' for x in df_diff['Différence']]
    ax3.barh(df_diff['Paire'], df_diff['Différence'], color=colors, height=0.55)
    ax3.axvline(0, color='black', linewidth=0.8, linestyle='--')
    ax3.set_xlabel('Différence de corrélation', fontsize=8)
    ax3.tick_params(axis='both', labelsize=7.5)
    ax3.grid(axis='x', alpha=0.3, linestyle=':')
    fig3.tight_layout()
    st.pyplot(fig3)
    plt.close(fig3)

    # NOTE(review): the colours follow the sign of (partial - raw). For a
    # negative correlation, a negative difference means a stronger (not a
    # weaker) association, so the caption wording should be confirmed.
    st.caption("🔵 Corrélation renforcée après contrôle | 🔴 Corrélation affaiblie après contrôle")

# ----------- TAB 2 -----------
with tab2:
    st.write("**Aperçu des données (10 premières lignes)**")
    st.dataframe(df_numeric.head(10))