Spaces:

ericjedha
/

fraud-detection-streamlit

Sleeping

App Files Community

ericjedha committed on Oct 30, 2025

Commit

42542c5

verified ·

1 Parent(s): 3c41440

Update app.py

Browse files

Files changed (1) hide show

app.py +130 -11

app.py CHANGED Viewed

@@ -5,7 +5,6 @@ import plotly.graph_objects as go
 from sqlalchemy import create_engine, text
 from datetime import datetime, timedelta
 import os
-from skimpy import skim
 # ========================== CONFIGURATION ==========================
 st.set_page_config(
@@ -274,19 +273,139 @@ def page_eda():
         st.error("Impossible de charger les données")
         return
-    # ========================== 1. RÉSUMÉ AVEC SKIMPY ==========================
-    st.markdown("## 📋 Résumé des données avec Skimpy")
-    # Capturer la sortie de skim dans un buffer
-    import io
-    from contextlib import redirect_stdout
-    buffer = io.StringIO()
-    with redirect_stdout(buffer):
-        skim(df)
-    skim_output = buffer.getvalue()
-    st.text(skim_output)
     st.markdown("---")

 from sqlalchemy import create_engine, text
 from datetime import datetime, timedelta
 import os
 # ========================== CONFIGURATION ==========================
 st.set_page_config(
         st.error("Impossible de charger les données")
         return
+    # ========================== 1. RÉSUMÉ DU DATASET ==========================
+    st.markdown("## 📋 Résumé du Dataset")
+    # Informations générales
+    col1, col2, col3, col4 = st.columns(4)
+    with col1:
+        st.metric("📊 Nombre de lignes", f"{len(df):,}")
+    with col2:
+        st.metric("📋 Nombre de colonnes", f"{len(df.columns)}")
+    with col3:
+        st.metric("💾 Taille mémoire", f"{df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
+    with col4:
+        duplicates = df.duplicated().sum()
+        st.metric("🔄 Doublons", f"{duplicates:,}")
+    # Valeurs manquantes
+    st.markdown("### 🔍 Valeurs manquantes")
+    missing = df.isnull().sum()
+    missing_pct = (missing / len(df) * 100).round(2)
+    missing_df = pd.DataFrame({
+        'Colonne': missing.index,
+        'Manquantes': missing.values,
+        'Pourcentage': missing_pct.values
+    })
+    missing_df = missing_df[missing_df['Manquantes'] > 0].sort_values('Manquantes', ascending=False)
+    if not missing_df.empty:
+        fig_missing = px.bar(
+            missing_df,
+            x='Colonne',
+            y='Pourcentage',
+            title='Pourcentage de valeurs manquantes par colonne',
+            color='Pourcentage',
+            color_continuous_scale='Reds',
+            text=missing_df['Pourcentage'].apply(lambda x: f"{x:.1f}%")
+        )
+        fig_missing.update_layout(showlegend=False, height=400)
+        st.plotly_chart(fig_missing, use_container_width=True)
+    else:
+        st.success("✅ Aucune valeur manquante dans le dataset !")
+    # Statistiques descriptives
+    st.markdown("### 📊 Statistiques descriptives (Variables numériques)")
+    # Sélecteur de colonnes numériques
+    numeric_cols_all = df.select_dtypes(include=['float64', 'int64']).columns.tolist()
+    selected_stats_cols = st.multiselect(
+        "Choisissez les colonnes à analyser",
+        numeric_cols_all,
+        default=numeric_cols_all[:5]
+    )
+    if selected_stats_cols:
+        stats_df = df[selected_stats_cols].describe().T
+        stats_df['missing'] = df[selected_stats_cols].isnull().sum().values
+        stats_df['missing_pct'] = (stats_df['missing'] / len(df) * 100).round(2)
+        # Formater pour l'affichage
+        display_stats = stats_df[['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max', 'missing', 'missing_pct']]
+        display_stats.columns = ['Count', 'Moyenne', 'Écart-type', 'Min', 'Q1', 'Médiane', 'Q3', 'Max', 'Manquantes', 'Manquantes (%)']
+        st.dataframe(
+            display_stats.style.format({
+                'Moyenne': '{:.2f}',
+                'Écart-type': '{:.2f}',
+                'Min': '{:.2f}',
+                'Q1': '{:.2f}',
+                'Médiane': '{:.2f}',
+                'Q3': '{:.2f}',
+                'Max': '{:.2f}',
+                'Manquantes (%)': '{:.2f}'
+            }),
+            use_container_width=True
+        )
+        # Distribution des variables numériques
+        st.markdown("### 📈 Distributions des variables numériques")
+        selected_dist = st.selectbox("Choisissez une variable à visualiser", selected_stats_cols)
+        col_hist, col_box = st.columns(2)
+        with col_hist:
+            fig_hist = px.histogram(
+                df,
+                x=selected_dist,
+                nbins=50,
+                title=f"Distribution de {selected_dist}",
+                color_discrete_sequence=['#636EFA']
+            )
+            fig_hist.update_layout(showlegend=False, height=350)
+            st.plotly_chart(fig_hist, use_container_width=True)
+        with col_box:
+            fig_box = px.box(
+                df,
+                y=selected_dist,
+                title=f"Box plot de {selected_dist}",
+                color_discrete_sequence=['#636EFA']
+            )
+            fig_box.update_layout(showlegend=False, height=350)
+            st.plotly_chart(fig_box, use_container_width=True)
+    # Variables catégorielles
+    st.markdown("### 🏷️ Variables catégorielles")
+    categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
+    if categorical_cols:
+        selected_cat = st.selectbox("Choisissez une variable catégorielle", categorical_cols)
+        value_counts = df[selected_cat].value_counts().head(15)
+        col_bar, col_info = st.columns([2, 1])
+        with col_bar:
+            fig_cat = px.bar(
+                x=value_counts.index,
+                y=value_counts.values,
+                title=f"Top 15 valeurs de {selected_cat}",
+                labels={'x': selected_cat, 'y': 'Count'},
+                color=value_counts.values,
+                color_continuous_scale='Blues'
+            )
+            fig_cat.update_layout(showlegend=False, height=400)
+            st.plotly_chart(fig_cat, use_container_width=True)
+        with col_info:
+            st.markdown("#### Statistiques")
+            st.metric("Valeurs uniques", df[selected_cat].nunique())
+            st.metric("Valeur la plus fréquente", value_counts.index[0])
+            st.metric("Fréquence max", f"{value_counts.values[0]:,}")
+            st.metric("% de la plus fréquente", f"{(value_counts.values[0] / len(df) * 100):.1f}%")
+    else:
+        st.info("Aucune variable catégorielle détectée")
     st.markdown("---")