Spaces:

Aliou12
/

streamlit-app

Build error

App Files Files Community

streamlit-app / app.py

Aliou12

le nouveau déploiement de l'application

14201cd about 1 year ago

raw

history blame contribute delete

4.69 kB

	import streamlit as st
	import pandas as pd
	import numpy as np
	import seaborn as sns
	import matplotlib.pyplot as plt
	import scipy.stats as stats
	import statsmodels.api as sm
	import statsmodels.formula.api as smf
	from sklearn.cluster import KMeans
	from sklearn.preprocessing import LabelEncoder
	from statsmodels.stats.multicomp import pairwise_tukeyhsd

	# Streamlit page: two-way ANOVA of customer ratings by product line and
	# payment method, followed by post-hoc tests, plots, a linear regression
	# and a K-Means clustering.

	# Columns the analysis relies on (after 'Product line' has been renamed).
	REQUIRED_COLUMNS = ['Product_line', 'Payment', 'Rating']
	# scipy's Shapiro-Wilk test needs at least 3 observations, and K-Means
	# needs at least as many samples as clusters (3 below).
	MIN_ROWS = 3
	# Residuals are subsampled above this size before the Shapiro-Wilk test.
	SHAPIRO_MAX_SAMPLES = 5000
	N_CLUSTERS = 3


	def prepare_data(raw):
	    """Return the cleaned three-column frame used by every analysis step.

	    The 'Product line' column is renamed to 'Product_line' so it can be
	    used in a model formula, 'Rating' is coerced to numeric, and every
	    row with a missing or non-numeric value is dropped. Coercion happens
	    BEFORE the drop so that no NaN can reach the models or the clustering.

	    Args:
	        raw: DataFrame as read from the uploaded CSV.

	    Returns:
	        A new DataFrame (the input is not modified) with columns
	        'Product_line' and 'Payment' as categoricals and 'Rating' as a
	        NaN-free numeric column. The original row index is preserved.

	    Raises:
	        KeyError: if a required column is absent from ``raw``.
	    """
	    renamed = raw.rename(columns={'Product line': 'Product_line'})
	    missing = [col for col in REQUIRED_COLUMNS if col not in renamed.columns]
	    if missing:
	        raise KeyError(f"Colonnes manquantes : {', '.join(missing)}")

	    # Explicit copy: the frame is modified below and must not alias `raw`.
	    data = renamed[REQUIRED_COLUMNS].copy()
	    data['Rating'] = pd.to_numeric(data['Rating'], errors='coerce')
	    data = data.dropna()

	    # Categories are derived after cleaning, so every category has rows.
	    data['Product_line'] = data['Product_line'].astype('category')
	    data['Payment'] = data['Payment'].astype('category')
	    return data


	def show_figure(fig):
	    """Render ``fig`` on the page, then close it so it is not kept open by pyplot."""
	    st.pyplot(fig)
	    plt.close(fig)


	def main():
	    """Render the page: upload, assumption checks, ANOVA, plots, regression, clustering."""
	    # Page title
	    st.title("📊 Analyse des Évaluations des Clients avec ANOVA")

	    # File upload; nothing else is rendered until a file is provided.
	    uploaded_file = st.file_uploader("📂 Téléchargez le fichier 'supermarket_sales.csv'", type=["csv"])
	    if uploaded_file is None:
	        return

	    # Load and clean the data
	    raw = pd.read_csv(uploaded_file)
	    try:
	        data = prepare_data(raw)
	    except KeyError as err:
	        # err.args[0] is the bare message (str(err) would add quotes).
	        st.error(f"Erreur : {err.args[0]}")
	        return

	    dropped = len(raw) - len(data)
	    if dropped:
	        st.warning(f"⚠️ {dropped} ligne(s) ignorée(s) : valeurs manquantes ou non numériques.")

	    # The tests below need several rows and at least two levels per factor.
	    if (
	        len(data) < MIN_ROWS
	        or data['Product_line'].nunique() < 2
	        or data['Payment'].nunique() < 2
	    ):
	        st.error(
	            "Erreur : Données insuffisantes après nettoyage "
	            "(au moins 3 lignes et 2 modalités par facteur sont nécessaires)."
	        )
	        return

	    # Data preview
	    st.subheader("📊 Aperçu des Données")
	    st.write(data.head())

	    # ============================
	    # Assumption checks
	    # ============================

	    st.subheader("🧪 Vérification des Hypothèses")

	    # Normality of the residuals (Shapiro-Wilk) on the two-factor model
	    # with interaction; large residual sets are subsampled reproducibly.
	    model = smf.ols('Rating ~ C(Product_line) * C(Payment)', data=data).fit()
	    residuals = model.resid
	    if len(residuals) > SHAPIRO_MAX_SAMPLES:
	        residuals_sample = pd.Series(residuals).sample(SHAPIRO_MAX_SAMPLES, random_state=42)
	    else:
	        residuals_sample = residuals

	    shapiro_test = stats.shapiro(residuals_sample)
	    st.write(f"✅ Test de Shapiro-Wilk (Normalité) : p-value = {shapiro_test.pvalue:.4f}")

	    # Homogeneity of variances (Levene) across product lines
	    group_list = [
	        group.dropna().values
	        for _, group in data.groupby('Product_line', observed=True)['Rating']
	    ]
	    levene_test = stats.levene(*group_list)
	    st.write(f"✅ Test de Levene (Homogénéité des variances) : p-value = {levene_test.pvalue:.4f}")

	    # ============================
	    # Two-way ANOVA
	    # ============================

	    st.subheader("📊 ANOVA à Deux Facteurs")
	    anova_table = sm.stats.anova_lm(model, typ=2)
	    st.write(anova_table)

	    # ============================
	    # Post-hoc comparisons (Tukey HSD)
	    # ============================

	    st.subheader("📌 Comparaisons Post-Hoc (Tukey HSD)")
	    # No NaN check needed here: prepare_data() already removed such rows.
	    tukey = pairwise_tukeyhsd(data['Rating'], data['Product_line'])
	    st.write(tukey.summary())

	    # ============================
	    # Result plots
	    # ============================

	    st.subheader("📊 Visualisation des Résultats")

	    # Boxplot of ratings per product line, split by payment method
	    fig, ax = plt.subplots(figsize=(10, 5))
	    sns.boxplot(x='Product_line', y='Rating', hue='Payment', data=data, ax=ax)
	    # Rotate through the axes object rather than pyplot's "current figure".
	    ax.tick_params(axis='x', labelrotation=45)
	    show_figure(fig)

	    # Heatmap of mean rating per (product line, payment) pair; pairs with
	    # no rows are shown as 0.
	    mean_ratings = (
	        data.groupby(['Product_line', 'Payment'], observed=True)['Rating']
	        .mean()
	        .unstack()
	        .fillna(0)
	    )
	    fig, ax = plt.subplots(figsize=(8, 5))
	    sns.heatmap(mean_ratings, annot=True, cmap='coolwarm', ax=ax)
	    show_figure(fig)

	    # ============================
	    # Multiple linear regression
	    # ============================

	    st.subheader("📈 Régression Linéaire Multiple")
	    lm_model = smf.ols('Rating ~ C(Product_line) + C(Payment)', data=data).fit()
	    st.write(lm_model.summary())

	    # ============================
	    # Customer clustering (K-Means)
	    # ============================

	    st.subheader("🎯 Clustering des Clients (K-Means)")

	    encoder = LabelEncoder()
	    data['Product_line_encoded'] = encoder.fit_transform(data['Product_line'])
	    kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=42, n_init=10)
	    data['Cluster'] = kmeans.fit_predict(data[['Rating', 'Product_line_encoded']])

	    # Scatter plot of the clusters, with the encoded x positions relabelled
	    # by the original product line names.
	    fig, ax = plt.subplots(figsize=(8, 5))
	    sns.scatterplot(x='Product_line_encoded', y='Rating', hue=data['Cluster'].astype(str), palette='viridis', data=data, ax=ax)
	    ax.set_xticks(list(range(len(encoder.classes_))))
	    ax.set_xticklabels(encoder.classes_, rotation=45)
	    show_figure(fig)


	# NOTE: `streamlit run` executes the script under the name "__main__", so
	# the page is still rendered; the guard only keeps imports side-effect free.
	if __name__ == "__main__":
	    main()