Spaces:

Eric2mangel
/

Mapping_variables

Sleeping

App Files Files Community

Mapping_variables / app.py

Eric2mangel

Upload 4 files

72eee83 verified 26 days ago

raw

history blame contribute delete

10.6 kB

	import streamlit as st
	import seaborn as sns
	import pandas as pd
	import numpy as np
	import networkx as nx
	import plotly.graph_objects as go
	import matplotlib.pyplot as plt
	from sklearn.feature_selection import mutual_info_regression
	from scipy.stats import entropy
	from concurrent.futures import ThreadPoolExecutor

	# ------------------------------------------
	# EASY PROGRAM CONFIGURATION
	# ------------------------------------------
	# Height of the graph in pixels; change this value to resize the graph.
	GRAPH_HEIGHT = 480
	# ------------------------------------------

	# Page-level settings first, then the main title and the method subtitle.
	st.set_page_config(
	    layout="wide",
	    page_icon="🔗",
	    page_title="Analyse de proximité",
	)
	st.title("🔗 Cartographie des variables")
	st.subheader("Méthode : Information Mutuelle Normalisée (IMN)")

	# --- SIDEBAR ---
	# Lets the user choose a data source, loads it into `df` (rows containing
	# missing values are dropped), then shows the dataset size, the IMN
	# threshold selector and the interpretation legend.
	# After this block `df` is either a DataFrame or None, and `threshold` is
	# defined only when `df` is not None.
	with st.sidebar:
	    st.header("⚙️ Configuration")
	    data_source = st.radio("Source des données", ["Dataset Seaborn", "Importer un fichier"])

	    df = None
	    if data_source == "Importer un fichier":
	        uploaded_file = st.file_uploader("Fichier CSV ou Excel", type=["csv", "xlsx", "xls"])
	        if uploaded_file:
	            file_extension = uploaded_file.name.split('.')[-1].lower()

	            with st.spinner("Chargement du fichier..."):
	                try:
	                    if file_extension == 'csv':
	                        # sep=None with the python engine lets pandas detect the separator
	                        df = pd.read_csv(uploaded_file, sep=None, engine='python', encoding_errors='ignore')
	                    elif file_extension in ('xlsx', 'xls'):
	                        df = pd.read_excel(uploaded_file)
	                    # Any other extension leaves df as None (the uploader already restricts types)
	                    if df is not None:
	                        df = df.dropna()
	                except Exception as e:
	                    # Broad on purpose: the readers raise many different error types
	                    st.error(f"Erreur lors du chargement : {e}")
	                    df = None
	    else:
	        # Built-in Seaborn dataset selection
	        dataset_name = st.selectbox(
	            "Choisir un dataset",
	            ["titanic", "tips", "iris", "penguins", "mpg", "planets", "flights", "diamonds"]
	        )
	        try:
	            with st.spinner(f"Chargement du dataset {dataset_name}..."):
	                df = sns.load_dataset(dataset_name).dropna()
	        except Exception as e:
	            # `except Exception` instead of a bare `except` so that
	            # KeyboardInterrupt/SystemExit propagate, and the cause is shown
	            st.error(f"Impossible de charger le dataset '{dataset_name}' : {e}")
	            df = None

	    if df is not None:
	        # Dataset size
	        st.metric("Lignes × Colonnes", f"{len(df)} × {len(df.columns)}")

	        # select_slider restricts the threshold to the interpretation tiers listed in the legend
	        threshold = st.select_slider(
	            "Seuil de visibilité des liens (IMN)",
	            options=[0, 0.1, 0.3, 0.6, 0.9],
	            value=0.3,
	            help="Filtre les liens selon les paliers d'interprétation définis ci-dessous."
	        )

	        st.info("💡 Légende de l'IMN")
	        st.markdown("""
	* 0.90 – 1.00 : Quasi-doublons
	* 0.60 – 0.90 : Relation forte
	* 0.30 – 0.60 : Relation modérée
	* 0.10 – 0.30 : Relation faible
	* < 0.10 : Indépendance
	""")

	# --- COMPUTATION LOGIC ---
	# Computes a symmetric normalised mutual-information (IMN) matrix between
	# all columns of `df`, drops near-duplicate variables (IMN >= 0.99), builds
	# a graph whose edges are the pairs with IMN > `threshold`, and renders the
	# four result tabs. Reads `df` and `threshold` set by the sidebar.
	if df is None:
	    st.info("👈 Veuillez sélectionner ou importer un jeu de données.")
	elif len(df) < 2 or df.shape[1] == 0:
	    # Dropping incomplete rows can leave nothing to analyse; the
	    # nearest-neighbour MI estimator cannot run on fewer than two samples.
	    st.warning("Pas assez de données après suppression des valeurs manquantes (au moins 2 lignes complètes sont nécessaires).")
	else:
	    with st.spinner("Calcul de l'information mutuelle en cours..."):
	        # Numeric encoding: numeric/bool columns become float32 (continuous),
	        # everything else (object, category, string, datetime, ...) becomes
	        # integer category codes and is flagged as discrete.
	        df_calc = df.copy()
	        discrete_map = []

	        for col in df.columns:
	            if pd.api.types.is_numeric_dtype(df[col]):
	                df_calc[col] = df_calc[col].astype(np.float32)
	                discrete_map.append(False)
	            else:
	                df_calc[col] = df[col].astype('category').cat.codes.astype(np.int32)
	                discrete_map.append(True)

	        n_vars = len(df.columns)

	        # Per-variable entropy, used as the normaliser of the MI scores.
	        # Discrete columns use their exact category frequencies: binning the
	        # codes would merge categories when codes are non-contiguous or when
	        # there are more than 10 of them, under-estimating the entropy.
	        # Continuous columns keep the histogram estimate (at most 10 bins).
	        entropies = []
	        for idx, is_discrete in enumerate(discrete_map):
	            values = df_calc.iloc[:, idx].to_numpy()
	            if is_discrete:
	                counts = np.unique(values, return_counts=True)[1]
	                entropies.append(entropy(counts))
	            else:
	                bins = min(10, len(np.unique(values)))
	                hist = np.histogram(values, bins=bins, density=True)[0]
	                entropies.append(entropy(hist + 1e-9))

	        # Loop-invariant: 1 neighbour below 200 rows, 2 below 300, else 3
	        n_neighbors = min(3, max(1, len(df_calc) // 100))

	        def compute_mi_row(i):
	            """Return (i, scores): MI of every column against column i as target."""
	            # NOTE(review): discrete targets are also passed to
	            # mutual_info_regression; mutual_info_classif may be more
	            # appropriate for them — confirm before changing the estimator.
	            scores = mutual_info_regression(
	                df_calc,
	                df_calc.iloc[:, i],
	                discrete_features=discrete_map,
	                random_state=42,
	                n_neighbors=n_neighbors
	            )
	            return i, scores

	        # One task per target column; executor.map preserves the input order
	        with ThreadPoolExecutor(max_workers=4) as executor:
	            results = list(executor.map(compute_mi_row, range(n_vars)))

	        # Normalise MI[i, j] by min(H_i, H_j), clamp to [0, 1]; pairs whose
	        # smaller entropy is 0 (constant variable) get a score of 0.
	        raw_mi = np.vstack([scores for _, scores in results]).astype(np.float64)
	        h = np.asarray(entropies, dtype=np.float64)
	        h_min = np.minimum.outer(h, h)
	        nmi = np.divide(raw_mi, h_min, out=np.zeros_like(raw_mi), where=h_min > 0)
	        mi_matrix = np.clip(nmi, 0.0, 1.0).astype(np.float32)

	        # Symmetrise (mean of both directions) and force the diagonal to 1
	        mi_matrix = (mi_matrix + mi_matrix.T) / 2
	        np.fill_diagonal(mi_matrix, 1.0)

	        # Redundancy filter: keep the first variable of each near-duplicate
	        # group and record every dropped variable exactly once.
	        to_keep = []
	        redundant_pairs = []
	        seen = set()

	        for i in range(n_vars):
	            if i in seen:
	                continue
	            for j in range(i + 1, n_vars):
	                if j in seen:
	                    # Already dropped as a duplicate of an earlier variable
	                    continue
	                val_im = mi_matrix[i, j]
	                if val_im >= 0.99:
	                    seen.add(j)
	                    redundant_pairs.append({
	                        "Variable conservée": df.columns[i],
	                        "Doublon supprimé": df.columns[j],
	                        "Score IMN": f"{val_im:.4f}"
	                    })
	            to_keep.append(i)

	        final_vars = [df.columns[i] for i in to_keep]
	        final_mi = mi_matrix[np.ix_(to_keep, to_keep)]

	        # Graph: every kept variable is a node (isolated ones included),
	        # edges only for pairs strictly above the selected threshold.
	        G = nx.Graph()
	        G.add_nodes_from(final_vars)
	        for i in range(len(final_vars)):
	            for j in range(i + 1, len(final_vars)):
	                im_val = final_mi[i, j]
	                if im_val > threshold:
	                    G.add_edge(final_vars[i], final_vars[j], weight=float(im_val))

	        pos = nx.spring_layout(G, k=1.2, seed=42)

	        # Hover text per node, in G.nodes() order: neighbours by decreasing IMN
	        node_hover_texts = []
	        for node in G.nodes():
	            header = f"<b>Variable : {node}</b><br><br>Liens (IMN > {threshold}):<br>"
	            links = sorted(G.edges(node, data=True), key=lambda e: e[2]['weight'], reverse=True)
	            if links:
	                details = "".join(
	                    f"• {neighbor} : <b>{data['weight']:.4f}</b><br>"
	                    for _, neighbor, data in links
	                )
	            else:
	                details = "<i>Aucun lien significatif</i>"
	            node_hover_texts.append(header + details)

	    # --- DISPLAY ---
	    tab1, tab2, tab3, tab4 = st.tabs(["📊 Graphe interactif", "👯 Doublons filtrés", "📋 Matrice triangulaire", "📄 Aperçu des données"])

	    with tab1:
	        # One trace per edge so that each edge gets its own width and colour
	        edge_traces = []
	        for source, target, data in G.edges(data=True):
	            x0, y0 = pos[source]
	            x1, y1 = pos[target]
	            w = data['weight']
	            # Colour and opacity are linear functions of the weight w in [0, 1]
	            color = f'rgba({int(255 * w)}, {int(150 * (1 - w))}, {int(200 * (1 - w))}, {0.3 + 0.4 * w})'

	            edge_traces.append(go.Scatter(
	                x=[x0, x1, None], y=[y0, y1, None],
	                line=dict(width=w * 12, color=color),
	                hoverinfo='none',
	                mode='lines'
	            ))

	        node_trace = go.Scatter(
	            x=[pos[n][0] for n in G.nodes()],
	            y=[pos[n][1] for n in G.nodes()],
	            mode='markers+text',
	            text=list(G.nodes()),
	            textposition="bottom center",
	            textfont=dict(color='white', size=11),
	            marker=dict(
	                # Marker size grows with the number of visible links
	                size=[10 + G.degree(n) * 5 for n in G.nodes()],
	                color='#1f77b4',
	                line=dict(width=2, color='white'),
	                opacity=1
	            ),
	            hoverinfo='text',
	            hovertext=node_hover_texts
	        )

	        fig = go.Figure(data=edge_traces + [node_trace])
	        fig.update_layout(
	            paper_bgcolor='rgba(15,15,25,1)',
	            plot_bgcolor='rgba(0,0,0,0)',
	            height=GRAPH_HEIGHT,
	            showlegend=False,
	            xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
	            yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
	            margin=dict(b=20, l=5, r=5, t=40),
	            hoverlabel=dict(bgcolor="rgba(30, 30, 50, 0.9)", font_size=13)
	        )
	        st.plotly_chart(fig, use_container_width=True)

	    with tab2:
	        st.subheader("Variables supprimées (Redondance forte)")
	        if redundant_pairs:
	            st.dataframe(pd.DataFrame(redundant_pairs), use_container_width=True)
	        else:
	            st.info("Aucun doublon détecté.")

	    with tab3:
	        st.subheader("Matrice de proximité")
	        df_imn = pd.DataFrame(final_mi, index=final_vars, columns=final_vars)
	        # Hide the diagonal and the upper triangle (the matrix is symmetric)
	        mask = np.triu(np.ones_like(df_imn, dtype=bool))

	        fig_map, ax = plt.subplots(figsize=(10, 8), layout="constrained")
	        fig_map.patch.set_facecolor('white')

	        sns.heatmap(df_imn, mask=mask, cmap="coolwarm", vmax=1.0, vmin=0,
	                    annot=True, fmt=".2f", square=True, linewidths=.5,
	                    cbar_kws={"shrink": .8}, ax=ax, annot_kws={"size": 9})

	        # Style the tick labels through `ax` rather than the global pyplot
	        # "current axes", which is shared state
	        plt.setp(ax.get_xticklabels(), rotation=45, ha='right', color='black')
	        plt.setp(ax.get_yticklabels(), rotation=0, color='black')
	        ax.set_facecolor('white')

	        st.pyplot(fig_map)
	        # Release the figure: pyplot keeps every figure alive until closed,
	        # so each rerun would otherwise leak one
	        plt.close(fig_map)

	    with tab4:
	        st.subheader("Aperçu des données (20 premières lignes)")
	        st.dataframe(df.head(20), use_container_width=True)

	# Footer: a horizontal rule, then a centred grey one-line reminder of the IMN scale
	st.markdown("---")
	footer_html = """
	<div style='text-align: center; color: gray;'>
	<small>L'Information Mutuelle Normalisée (IMN) mesure la dépendance entre variables (0 = indépendance, 1 = dépendance totale)</small>
	</div>
	"""
	# unsafe_allow_html is required for the raw <div>/<small> markup to be rendered
	st.markdown(footer_html, unsafe_allow_html=True)