Spaces:

Eric2mangel
/

Partial_correlations

Sleeping

App Files Files Community

Partial_correlations / app.py

Eric2mangel

Upload app.py

7b5300c verified 2 months ago

raw

history blame contribute delete

8.16 kB

	import streamlit as st
	import seaborn as sns
	import pandas as pd
	import numpy as np
	from sklearn.linear_model import LinearRegression
	import itertools
	import matplotlib.pyplot as plt

	# ------------------------------
	# Streamlit configuration
	# ------------------------------
	st.set_page_config(layout="wide")

	st.sidebar.title("Exploration des corrélations partielles")
	st.sidebar.markdown("---")
	st.sidebar.header("Configuration")


	# NOTE(review): st.cache_data needs a reasonably recent Streamlit (1.18+,
	# to be confirmed against the version pinned for this Space).
	@st.cache_data
	def _eligible_dataset_names(min_numeric_cols=3):
	    """Return the Seaborn dataset names usable by this app.

	    A dataset is kept when, after dropping rows with missing values, it has
	    at least ``min_numeric_cols`` columns of dtype float64 or int64.

	    The scan loads every dataset, so the result is cached: Streamlit re-runs
	    the whole script on each widget interaction and the list does not depend
	    on any widget value.
	    """
	    eligible = []
	    for name in sns.get_dataset_names():
	        try:
	            candidate = sns.load_dataset(name).dropna()
	            numeric_cols = candidate.select_dtypes(include=['float64', 'int64']).columns
	        except Exception:
	            # Best-effort scan: a dataset that cannot be loaded is skipped.
	            # Unlike a bare ``except``, this lets KeyboardInterrupt and
	            # SystemExit propagate.
	            continue
	        if len(numeric_cols) >= min_numeric_cols:
	            eligible.append(name)
	    return eligible


	# Dataset selection
	available_datasets = _eligible_dataset_names()

	# With no candidate the selectbox would return None and the load below
	# would fail with an unrelated error, so stop early with a clear message.
	if not available_datasets:
	    st.error("Aucun jeu de données Seaborn exploitable n'a pu être chargé.")
	    st.stop()

	dataset_name = st.sidebar.selectbox("Sélectionnez un jeu de données Seaborn :", available_datasets)

	df = sns.load_dataset(dataset_name).dropna()

	# Keep only the numeric columns (float64 / int64)
	df_numeric = df.select_dtypes(include=['float64', 'int64'])

	# Correlation flavour: Pearson (default) or Spearman
	corr_type = st.sidebar.radio(
	    label="Type de corrélation :",
	    options=["Pearson", "Spearman"],
	    index=0,
	)

	# Variables whose influence should be removed (the control variables)
	all_vars = df_numeric.columns.tolist()
	control_vars = st.sidebar.multiselect(
	    label="Variables dont vous voulez éliminer l'influence :",
	    options=all_vars,
	)

	# Every numeric variable that is not a control stays in the matrices
	vars_remaining = [name for name in all_vars if name not in control_vars]

	# A correlation needs at least one pair of variables
	if len(vars_remaining) <= 1:
	    st.error("Il faut au moins deux variables restantes pour afficher une corrélation.")
	    st.stop()

	# ------------------------------
	# Spearman preprocessing (when selected)
	# ------------------------------
	# For Spearman the data are replaced by their ranks; this ranked frame is
	# also what the partial-correlation step receives further down.
	use_spearman = corr_type == "Spearman"
	df_for_corr = df_numeric.rank() if use_spearman else df_numeric.copy()

	# ------------------------------
	# Raw correlation matrix
	# ------------------------------
	# Restricted to the variables that are not used as controls
	corr_method = "spearman" if use_spearman else "pearson"
	corr_raw = df_for_corr[vars_remaining].corr(method=corr_method)

	# ------------------------------
	# Fonction corrélation partielle
	# ------------------------------
	def partial_corr(df, controls):
	    """Return the partial-correlation matrix of the non-control columns.

	    Every column of ``df`` that is not listed in ``controls`` is regressed on
	    the control columns (ordinary least squares with an intercept) and the
	    Pearson correlations between the residuals are returned. With no
	    controls the result is the plain Pearson correlation matrix.

	    The function only ever computes Pearson correlations on the frame it is
	    given; to obtain rank-based (Spearman-style) results, pass a frame that
	    has already been rank-transformed.

	    Args:
	        df: DataFrame of numeric columns. Assumed to contain no missing
	            values (the app drops them before calling).
	        controls: Names of the columns of ``df`` whose influence is removed.

	    Returns:
	        A square DataFrame indexed and labelled by the non-control columns,
	        in ``df`` column order, with 1.0 on the diagonal.
	    """
	    controls = list(controls)
	    vars_to_corr = [v for v in df.columns if v not in controls]

	    # Nothing left to correlate: return an empty square frame.
	    if not vars_to_corr:
	        return pd.DataFrame(index=vars_to_corr, columns=vars_to_corr, dtype=float)

	    if controls:
	        # Residualise all variables in one least-squares solve instead of
	        # refitting two regressions for every ordered pair of variables.
	        targets = df[vars_to_corr].to_numpy(dtype=float)
	        design = np.column_stack(
	            [np.ones(len(df)), df[controls].to_numpy(dtype=float)]
	        )
	        coefs, *_ = np.linalg.lstsq(design, targets, rcond=None)
	        to_correlate = pd.DataFrame(targets - design @ coefs, columns=vars_to_corr)
	    else:
	        to_correlate = df[vars_to_corr]

	    values = to_correlate.corr().to_numpy(copy=True)
	    # The diagonal is 1.0 by definition, even for a zero-variance column
	    # whose computed self-correlation would be NaN.
	    np.fill_diagonal(values, 1.0)
	    return pd.DataFrame(values, index=vars_to_corr, columns=vars_to_corr)

	# ------------------------------
	# Partial correlation matrix
	# ------------------------------
	corr_partial = partial_corr(df_for_corr, control_vars)

	# ------------------------------
	# Differences (partial minus raw)
	# ------------------------------
	# One entry per unordered pair of remaining variables, i.e. the upper
	# triangle of the matrices without the diagonal.
	diff_data = [
	    {
	        'Paire': f"{var1} - {var2}",
	        'Différence': corr_partial.loc[var1, var2] - corr_raw.loc[var1, var2],
	    }
	    for var1, var2 in itertools.combinations(vars_remaining, 2)
	]

	# Sorted from the most negative to the most positive difference
	df_diff = pd.DataFrame(diff_data).sort_values('Différence', ascending=True)

	# ------------------------------
	# MAIN TABS
	# ------------------------------
	tab1, tab2 = st.tabs(["📊 Matrices", "📄 Données"])


	def _show_heatmap(title, matrix):
	    """Render the lower triangle of a correlation matrix in the active container."""
	    st.write(title)
	    # Mask the upper triangle and the diagonal so each pair appears once
	    upper_mask = np.triu(np.ones_like(matrix, dtype=bool))
	    fig, ax = plt.subplots(figsize=(5, 4))
	    sns.heatmap(matrix, annot=True, cmap="coolwarm", center=0, ax=ax,
	                mask=upper_mask, square=True, vmin=-1, vmax=1,
	                cbar_kws={'shrink': 0.75}, annot_kws={'size': 9})
	    ax.tick_params(axis='both', labelsize=8)
	    # Deliberately no tight_layout and no container stretching, so the
	    # figure keeps its fixed size.
	    st.pyplot(fig, use_container_width=False)
	    # Streamlit re-runs the script on every interaction; close the figure
	    # so it does not stay registered in pyplot.
	    plt.close(fig)


	# ----------- TAB 1 -----------
	with tab1:
	    col1, col2 = st.columns(2)

	    # Column 1: raw correlation
	    with col1:
	        _show_heatmap(f"Corrélation brute ({corr_type})", corr_raw)

	    # Column 2: partial correlation
	    with col2:
	        _show_heatmap(f"Corrélation partielle ({corr_type})", corr_partial)

	    # Difference chart, full width below the two heatmaps
	    st.write("Différences (Partielle - Brute)")
	    # Grow the figure with the number of pairs so bar labels stay readable;
	    # up to 7 pairs the height is the original 2.2 inches.
	    chart_height = max(2.2, 0.3 * len(df_diff))
	    fig3, ax3 = plt.subplots(figsize=(12, chart_height))
	    colors = ['#d7191c' if x < 0 else '#2b83ba' for x in df_diff['Différence']]
	    ax3.barh(df_diff['Paire'], df_diff['Différence'], color=colors, height=0.55)
	    ax3.axvline(0, color='black', linewidth=0.8, linestyle='--')
	    ax3.set_xlabel('Différence de corrélation', fontsize=8)
	    ax3.tick_params(axis='both', labelsize=7.5)
	    ax3.grid(axis='x', alpha=0.3, linestyle=':')
	    # Lay out this figure explicitly rather than pyplot's implicit current one
	    fig3.tight_layout()
	    st.pyplot(fig3)
	    plt.close(fig3)

	    # Colours follow the sign of (partial - raw). That sign does not say
	    # whether a correlation got stronger or weaker (for a negative
	    # correlation a positive difference is a weakening), so the legend
	    # describes the sign itself.
	    st.caption("🔵 Différence positive (partielle > brute) | 🔴 Différence négative (partielle < brute)")

	# ----------- TAB 2 -----------
	# Preview of the numeric columns, written straight into the second tab
	tab2.write("Aperçu des données (10 premières lignes)")
	tab2.dataframe(df_numeric.head(10))