Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import pandas as pd | |
| import plotly.express as px | |
| import plotly.graph_objects as go | |
| from sqlalchemy import create_engine, text | |
| from datetime import datetime, timedelta | |
| import os | |
| # ========================== CONFIGURATION ========================== | |
| st.set_page_config( | |
| page_title="Fraud Detection Dashboard", | |
| page_icon="🕵🏻", | |
| layout="wide", | |
| initial_sidebar_state="expanded" | |
| ) | |
| # Couleurs (variables pour personnalisation facile) | |
| COLOR_FRAUD = "#FF4B4B" # Rouge | |
| COLOR_NO_FRAUD = "#00CC66" # Vert | |
| COLOR_SAVED = "#FFD700" # Or pour l'argent économisé | |
| # ========================== CONNEXION BASE DE DONNÉES ========================== | |
| def get_db_connection(): | |
| """Connexion directe à Neon DB (non-pooler, stable pour petit volume)""" | |
| try: | |
| database_url = os.environ.get("NEON_DB_FRAUD_URL") | |
| if not database_url: | |
| st.error("❌ Variable NEON_DB_FRAUD_URL non trouvée dans les secrets Hugging Face") | |
| st.stop() | |
| # Connexion directe sans pooler → autorise les options PostgreSQL | |
| engine = create_engine( | |
| database_url, | |
| pool_pre_ping=True, | |
| pool_recycle=3600, | |
| connect_args={ | |
| "connect_timeout": 10, | |
| "options": "-c statement_timeout=30000" | |
| } | |
| ) | |
| return engine | |
| except Exception as e: | |
| st.error(f"❌ Erreur de connexion à la base de données: {e}") | |
| st.stop() | |
| # ========================== CHARGEMENT DES DONNÉES ========================== | |
| def load_csv_data(): | |
| """Charge le fichier CSV pour l'EDA""" | |
| try: | |
| df = pd.read_csv("fraudTest.csv") | |
| return df | |
| except Exception as e: | |
| st.error(f"❌ Erreur lors du chargement du fichier CSV: {e}") | |
| return pd.DataFrame() | |
| # ========================== REQUÊTES SQL OPTIMISÉES ========================== | |
| def load_all_data(): | |
| """Charge toutes les transactions - APPELÉ SEULEMENT APRÈS CLIC SUR REFRESH""" | |
| engine = get_db_connection() | |
| query = text(""" | |
| SELECT | |
| trans_num, merchant, category, amt, gender, city, zip, city_pop, job, | |
| hour, day, month, year, pred_is_fraud, is_fraud_ground_truth, | |
| transaction_time, created_at | |
| FROM fraud_predictions | |
| ORDER BY created_at DESC | |
| LIMIT 10000 | |
| """) | |
| try: | |
| with engine.connect() as conn: | |
| df = pd.read_sql(query, conn) | |
| df['created_at'] = pd.to_datetime(df['created_at']) | |
| return df | |
| except Exception as e: | |
| st.error(f"❌ Erreur lors du chargement des données: {e}") | |
| return pd.DataFrame() | |
| def load_last_24h_data(): | |
| """Charge les transactions des dernières 24h""" | |
| engine = get_db_connection() | |
| query = text(""" | |
| SELECT trans_num, merchant, category, amt, gender, city, pred_is_fraud, created_at | |
| FROM fraud_predictions | |
| WHERE created_at >= NOW() - INTERVAL '24 HOURS' | |
| ORDER BY created_at DESC | |
| """) | |
| try: | |
| with engine.connect() as conn: | |
| df = pd.read_sql(query, conn) | |
| df['created_at'] = pd.to_datetime(df['created_at']) | |
| return df | |
| except Exception as e: | |
| st.error(f"❌ Erreur lors du chargement des données 24h: {e}") | |
| return pd.DataFrame() | |
| def load_last_7_days_stats(): | |
| """Charge les stats agrégées des 7 derniers jours""" | |
| engine = get_db_connection() | |
| query = text(""" | |
| SELECT DATE(created_at) as date, | |
| SUM(CASE WHEN pred_is_fraud = 1 THEN 1 ELSE 0 END) as frauds, | |
| SUM(CASE WHEN pred_is_fraud = 0 THEN 1 ELSE 0 END) as no_frauds | |
| FROM fraud_predictions | |
| WHERE created_at >= NOW() - INTERVAL '30 DAYS' | |
| GROUP BY DATE(created_at) | |
| ORDER BY date ASC | |
| """) | |
| try: | |
| with engine.connect() as conn: | |
| df = pd.read_sql(query, conn) | |
| return df | |
| except Exception as e: | |
| st.error(f"❌ Erreur lors du chargement des stats 7 jours: {e}") | |
| return pd.DataFrame() | |
| def load_dashboard_summary(): | |
| """Charge les métriques pour le dashboard (robuste à toutes versions SQLAlchemy)""" | |
| engine = get_db_connection() | |
| query = text(""" | |
| SELECT | |
| COUNT(*) as total_transactions, | |
| COALESCE(SUM(CASE WHEN pred_is_fraud = 1 THEN 1 ELSE 0 END), 0) as total_frauds, | |
| COALESCE(SUM(CASE WHEN pred_is_fraud = 0 THEN 1 ELSE 0 END), 0) as total_no_frauds, | |
| COALESCE(SUM(CASE WHEN pred_is_fraud = 1 THEN amt ELSE 0 END), 0) as total_fraud_amount | |
| FROM fraud_predictions; | |
| """) | |
| try: | |
| with engine.connect() as conn: | |
| # mappins().first() renvoie un dict-like (compatible SQLAlchemy 1.x/2.x) | |
| result = conn.execute(query).mappings().first() | |
| if not result: | |
| return {'total_frauds': 0, 'total_no_frauds': 0, 'total_fraud_amount': 0.0} | |
| # Convertir explicitement en float pour éviter Decimal * float errors | |
| total_fraud_amount = result.get('total_fraud_amount', 0) or 0 | |
| try: | |
| total_fraud_amount = float(total_fraud_amount) | |
| except (TypeError, ValueError): | |
| total_fraud_amount = 0.0 | |
| return { | |
| 'total_frauds': int(result.get('total_frauds', 0) or 0), | |
| 'total_no_frauds': int(result.get('total_no_frauds', 0) or 0), | |
| 'total_fraud_amount': total_fraud_amount | |
| } | |
| except Exception as e: | |
| st.error(f"❌ Erreur lors du chargement du résumé: {e}") | |
| return {'total_frauds': 0, 'total_no_frauds': 0, 'total_fraud_amount': 0.0} | |
| # ========================== PAGE: DASHBOARD ========================== | |
| def page_dashboard(): | |
| st.title("🕵🏻 Fraud Detection Dashboard") | |
| # Message d'instruction | |
| st.info("👇 Cliquez sur **Refresh Data** pour charger les données") | |
| # Bouton refresh qui contrôle le chargement | |
| if st.button("🔄 Refresh Data", type="primary", key="dashboard_refresh"): | |
| st.session_state.dashboard_loaded = True | |
| # Ne charger que si le bouton a été cliqué | |
| if not st.session_state.get('dashboard_loaded', False): | |
| st.warning("⚠️ Cliquez sur 'Refresh Data' pour afficher le dashboard") | |
| return | |
| with st.spinner("Chargement des données..."): | |
| # Charger d'abord le résumé (rapide) | |
| summary = load_dashboard_summary() | |
| df_7days = load_last_7_days_stats() | |
| # ========================== MÉTRIQUES ========================== | |
| total_frauds = summary['total_frauds'] | |
| total_no_frauds = summary['total_no_frauds'] | |
| saved_amount = int(summary['total_fraud_amount'] * 1.5) | |
| col1, col2, col3 = st.columns(3) | |
| with col1: | |
| st.markdown(f""" | |
| <div style="background-color: {COLOR_FRAUD}; padding: 20px; border-radius: 10px; text-align: center;"> | |
| <h3 style="color: white; margin: 0;">🚨 Frauds</h3> | |
| <h1 style="color: white; margin: 10px 0;">{total_frauds}</h1> | |
| </div> | |
| """, unsafe_allow_html=True) | |
| with col2: | |
| st.markdown(f""" | |
| <div style="background-color: {COLOR_NO_FRAUD}; padding: 20px; border-radius: 10px; text-align: center;"> | |
| <h3 style="color: white; margin: 0;">✅ No Frauds</h3> | |
| <h1 style="color: white; margin: 10px 0;">{total_no_frauds}</h1> | |
| </div> | |
| """, unsafe_allow_html=True) | |
| with col3: | |
| st.markdown(f""" | |
| <div style="background-color: {COLOR_SAVED}; padding: 20px; border-radius: 10px; text-align: center;"> | |
| <h3 style="color: white; margin: 0;">💰 Saved Amount</h3> | |
| <h1 style="color: white; margin: 10px 0;">${saved_amount}</h1> | |
| </div> | |
| """, unsafe_allow_html=True) | |
| st.markdown("<br>", unsafe_allow_html=True) | |
| # ========================== GRAPHIQUES ========================== | |
| col_pie, col_saved_detail = st.columns([1, 1]) | |
| with col_pie: | |
| fig_pie = go.Figure(data=[go.Pie( | |
| labels=['Frauds', 'No Frauds'], | |
| values=[total_frauds, total_no_frauds], | |
| marker=dict(colors=[COLOR_FRAUD, COLOR_NO_FRAUD]), | |
| hole=0.4, | |
| textinfo='label+percent', | |
| textfont_size=14 | |
| )]) | |
| fig_pie.update_layout(title="Distribution Fraud vs No Fraud", showlegend=True, height=400) | |
| st.plotly_chart(fig_pie, use_container_width=True) | |
| with col_saved_detail: | |
| total_fraud_amount = summary['total_fraud_amount'] | |
| additional_costs = total_fraud_amount * 0.5 | |
| st.markdown("### 💵 Breakdown of Saved Amount") | |
| st.markdown(f""" | |
| - **Total Fraud Amounts**: ${total_fraud_amount:,.2f} | |
| - **Estimated Additional Costs** (chargebacks, fees): ${additional_costs:,.2f} | |
| - **Total Saved**: ${saved_amount:,.2f} | |
| """) | |
| fig_breakdown = go.Figure(data=[ | |
| go.Bar(name='Fraud Amount', x=['Saved'], y=[total_fraud_amount], marker_color=COLOR_FRAUD), | |
| go.Bar(name='Additional Costs', x=['Saved'], y=[additional_costs], marker_color=COLOR_SAVED) | |
| ]) | |
| fig_breakdown.update_layout(barmode='stack', showlegend=True, height=250, yaxis_title="Amount ($)") | |
| st.plotly_chart(fig_breakdown, use_container_width=True) | |
| # ========================== GRAPHIQUE 7 JOURS ========================== | |
| st.markdown("### 📊 Fraud Trend - Last 30 Days") | |
| if not df_7days.empty: | |
| fig_trend = go.Figure() | |
| fig_trend.add_trace(go.Bar(name='Frauds', x=df_7days['date'], y=df_7days['frauds'], marker_color=COLOR_FRAUD)) | |
| fig_trend.add_trace(go.Bar(name='No Frauds', x=df_7days['date'], y=df_7days['no_frauds'], marker_color=COLOR_NO_FRAUD)) | |
| fig_trend.update_layout(barmode='stack', xaxis_title="Date", yaxis_title="Number of Transactions", height=400, showlegend=True, hovermode='x unified') | |
| st.plotly_chart(fig_trend, use_container_width=True) | |
| else: | |
| st.info("Pas encore de données sur 7 jours") | |
| # ========================== PAGE: EDA ========================== | |
| def page_eda(): | |
| st.title("📊 Exploratory Data Analysis") | |
| st.info("👇 Cliquez sur **Load Data** pour charger les données EDA") | |
| if st.button("🔄 Load Data", type="primary", key="eda_refresh"): | |
| st.session_state.eda_loaded = True | |
| if not st.session_state.get('eda_loaded', False): | |
| st.warning("⚠️ Cliquez sur 'Load Data' pour afficher l'analyse") | |
| return | |
| with st.spinner("Chargement des données..."): | |
| df = load_csv_data() | |
| if df.empty: | |
| st.error("Impossible de charger les données") | |
| return | |
| # ========================== 1. RÉSUMÉ DU DATASET ========================== | |
| st.markdown("## 📋 Résumé du Dataset") | |
| # Informations générales | |
| col1, col2, col3, col4 = st.columns(4) | |
| with col1: | |
| st.metric("📊 Nombre de lignes", f"{len(df):,}") | |
| with col2: | |
| st.metric("📋 Nombre de colonnes", f"{len(df.columns)}") | |
| with col3: | |
| st.metric("💾 Taille mémoire", f"{df.memory_usage(deep=True).sum() / 1024**2:.2f} MB") | |
| with col4: | |
| duplicates = df.duplicated().sum() | |
| st.metric("🔄 Doublons", f"{duplicates:,}") | |
| # Valeurs manquantes | |
| st.markdown("### 🔍 Valeurs manquantes") | |
| missing = df.isnull().sum() | |
| missing_pct = (missing / len(df) * 100).round(2) | |
| missing_df = pd.DataFrame({ | |
| 'Colonne': missing.index, | |
| 'Manquantes': missing.values, | |
| 'Pourcentage': missing_pct.values | |
| }) | |
| missing_df = missing_df[missing_df['Manquantes'] > 0].sort_values('Manquantes', ascending=False) | |
| if not missing_df.empty: | |
| fig_missing = px.bar( | |
| missing_df, | |
| x='Colonne', | |
| y='Pourcentage', | |
| title='Pourcentage de valeurs manquantes par colonne', | |
| color='Pourcentage', | |
| color_continuous_scale='Reds', | |
| text=missing_df['Pourcentage'].apply(lambda x: f"{x:.1f}%") | |
| ) | |
| fig_missing.update_layout(showlegend=False, height=400) | |
| st.plotly_chart(fig_missing, use_container_width=True) | |
| else: | |
| st.success("✅ Aucune valeur manquante dans le dataset !") | |
| # Statistiques descriptives | |
| st.markdown("### 📊 Statistiques descriptives (Variables numériques)") | |
| # Sélecteur de colonnes numériques | |
| numeric_cols_all = df.select_dtypes(include=['float64', 'int64']).columns.tolist() | |
| selected_stats_cols = st.multiselect( | |
| "Choisissez les colonnes à analyser", | |
| numeric_cols_all, | |
| default=numeric_cols_all[:5] | |
| ) | |
| if selected_stats_cols: | |
| stats_df = df[selected_stats_cols].describe().T | |
| stats_df['missing'] = df[selected_stats_cols].isnull().sum().values | |
| stats_df['missing_pct'] = (stats_df['missing'] / len(df) * 100).round(2) | |
| # Formater pour l'affichage | |
| display_stats = stats_df[['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max', 'missing', 'missing_pct']] | |
| display_stats.columns = ['Count', 'Moyenne', 'Écart-type', 'Min', 'Q1', 'Médiane', 'Q3', 'Max', 'Manquantes', 'Manquantes (%)'] | |
| st.dataframe( | |
| display_stats.style.format({ | |
| 'Moyenne': '{:.2f}', | |
| 'Écart-type': '{:.2f}', | |
| 'Min': '{:.2f}', | |
| 'Q1': '{:.2f}', | |
| 'Médiane': '{:.2f}', | |
| 'Q3': '{:.2f}', | |
| 'Max': '{:.2f}', | |
| 'Manquantes (%)': '{:.2f}' | |
| }), | |
| use_container_width=True | |
| ) | |
| # Distribution des variables numériques | |
| st.markdown("### 📈 Distributions des variables numériques") | |
| selected_dist = st.selectbox("Choisissez une variable à visualiser", selected_stats_cols) | |
| col_hist, col_box = st.columns(2) | |
| with col_hist: | |
| fig_hist = px.histogram( | |
| df, | |
| x=selected_dist, | |
| nbins=50, | |
| title=f"Distribution de {selected_dist}", | |
| color_discrete_sequence=['#636EFA'] | |
| ) | |
| fig_hist.update_layout(showlegend=False, height=350) | |
| st.plotly_chart(fig_hist, use_container_width=True) | |
| with col_box: | |
| fig_box = px.box( | |
| df, | |
| y=selected_dist, | |
| title=f"Box plot de {selected_dist}", | |
| color_discrete_sequence=['#636EFA'] | |
| ) | |
| fig_box.update_layout(showlegend=False, height=350) | |
| st.plotly_chart(fig_box, use_container_width=True) | |
| # Variables catégorielles | |
| st.markdown("### 🏷️ Variables catégorielles") | |
| categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist() | |
| if categorical_cols: | |
| selected_cat = st.selectbox("Choisissez une variable catégorielle", categorical_cols) | |
| value_counts = df[selected_cat].value_counts().head(15) | |
| col_bar, col_info = st.columns([2, 1]) | |
| with col_bar: | |
| fig_cat = px.bar( | |
| x=value_counts.index, | |
| y=value_counts.values, | |
| title=f"Top 15 valeurs de {selected_cat}", | |
| labels={'x': selected_cat, 'y': 'Count'}, | |
| color=value_counts.values, | |
| color_continuous_scale='Blues' | |
| ) | |
| fig_cat.update_layout(showlegend=False, height=400) | |
| st.plotly_chart(fig_cat, use_container_width=True) | |
| with col_info: | |
| st.markdown("#### Statistiques") | |
| st.metric("Valeurs uniques", df[selected_cat].nunique()) | |
| st.metric("Valeur la plus fréquente", value_counts.index[0]) | |
| st.metric("Fréquence max", f"{value_counts.values[0]:,}") | |
| st.metric("% de la plus fréquente", f"{(value_counts.values[0] / len(df) * 100):.1f}%") | |
| else: | |
| st.info("Aucune variable catégorielle détectée") | |
| st.markdown("---") | |
| # ========================== 2. DISTRIBUTION FRAUDE vs NON-FRAUDE ========================== | |
| st.markdown("## 🥧 Distribution des transactions") | |
| fraud_counts = df["is_fraud"].value_counts().reset_index() | |
| fraud_counts.columns = ["is_fraud", "count"] | |
| fraud_counts["label"] = fraud_counts["is_fraud"].map({0: "Non frauduleuse", 1: "Frauduleuse"}) | |
| fig_pie = px.pie( | |
| fraud_counts, | |
| values="count", | |
| names="label", | |
| title="Répartition des transactions : frauduleuses vs non frauduleuses", | |
| color_discrete_sequence=["#636EFA", "#EF553B"], | |
| hole=0.4 | |
| ) | |
| fig_pie.update_traces(textinfo="percent+label") | |
| st.plotly_chart(fig_pie, use_container_width=True) | |
| # Statistiques | |
| col1, col2, col3 = st.columns(3) | |
| with col1: | |
| st.metric("Total transactions", f"{len(df):,}") | |
| with col2: | |
| st.metric("Fraudes", f"{fraud_counts[fraud_counts['is_fraud']==1]['count'].values[0]:,}") | |
| with col3: | |
| fraud_rate = (fraud_counts[fraud_counts['is_fraud']==1]['count'].values[0] / len(df)) * 100 | |
| st.metric("Taux de fraude", f"{fraud_rate:.2f}%") | |
| st.markdown("---") | |
| # ========================== 3. CARTE GÉOGRAPHIQUE ========================== | |
| st.markdown("## 🗺️ Localisation géographique des transactions") | |
| # Vérifier si les colonnes existent | |
| if 'merch_lat' in df.columns and 'merch_long' in df.columns: | |
| df_geo = df.dropna(subset=["merch_lat", "merch_long"]) | |
| # Option d'échantillonnage pour performance | |
| sample_size = st.slider("Nombre de points à afficher", 1000, min(50000, len(df_geo)), 10000, step=1000) | |
| df_sample = df_geo.sample(n=min(sample_size, len(df_geo)), random_state=42) | |
| # Ajouter le label | |
| df_sample["fraud_label"] = df_sample["is_fraud"].map({0: "Non frauduleuse", 1: "Frauduleuse"}) | |
| fig_map = px.scatter_mapbox( | |
| df_sample, | |
| lat="merch_lat", | |
| lon="merch_long", | |
| color="fraud_label", | |
| color_discrete_map={"Non frauduleuse": "#636EFA", "Frauduleuse": "#EF553B"}, | |
| title=f"Localisation des transactions ({sample_size} points échantillonnés)", | |
| mapbox_style="open-street-map", | |
| zoom=3, | |
| height=700, | |
| hover_data=["amt", "category", "merchant"] | |
| ) | |
| fig_map.update_layout( | |
| legend_title_text="Type de transaction", | |
| margin={"r":0,"t":50,"l":0,"b":0} | |
| ) | |
| st.plotly_chart(fig_map, use_container_width=True) | |
| else: | |
| st.warning("⚠️ Les colonnes de géolocalisation (merch_lat, merch_long) ne sont pas disponibles dans le dataset") | |
| st.markdown("---") | |
| # ========================== 4. FRAUDES PAR GENRE ========================== | |
| st.markdown("## 👥 Analyse par genre") | |
| if 'gender' in df.columns: | |
| # Nombre de fraudes par genre | |
| fraud_by_gender = df[df["is_fraud"] == 1]["gender"].value_counts().reset_index() | |
| fraud_by_gender.columns = ["gender", "count"] | |
| fraud_by_gender["gender_label"] = fraud_by_gender["gender"].map({"M": "Homme", "F": "Femme"}) | |
| col1, col2 = st.columns(2) | |
| with col1: | |
| fig_gender = px.bar( | |
| fraud_by_gender, | |
| x="gender_label", | |
| y="count", | |
| color="gender_label", | |
| color_discrete_map={"Homme": "#1f77b4", "Femme": "#ff7f0e"}, | |
| title="Nombre de fraudes par genre", | |
| labels={"count": "Nombre de fraudes", "gender_label": "Genre"}, | |
| text="count" | |
| ) | |
| fig_gender.update_layout(showlegend=False) | |
| st.plotly_chart(fig_gender, use_container_width=True) | |
| with col2: | |
| # Taux de fraude par genre | |
| gender_stats = df.groupby('gender')['is_fraud'].agg(['sum', 'count']).reset_index() | |
| gender_stats['fraud_rate'] = (gender_stats['sum'] / gender_stats['count']) * 100 | |
| gender_stats['gender_label'] = gender_stats['gender'].map({"M": "Homme", "F": "Femme"}) | |
| fig_rate = px.bar( | |
| gender_stats, | |
| x="gender_label", | |
| y="fraud_rate", | |
| color="gender_label", | |
| color_discrete_map={"Homme": "#1f77b4", "Femme": "#ff7f0e"}, | |
| title="Taux de fraude par genre (%)", | |
| labels={"fraud_rate": "Taux de fraude (%)", "gender_label": "Genre"}, | |
| text=gender_stats['fraud_rate'].apply(lambda x: f"{x:.2f}%") | |
| ) | |
| fig_rate.update_layout(showlegend=False) | |
| st.plotly_chart(fig_rate, use_container_width=True) | |
| else: | |
| st.warning("⚠️ La colonne 'gender' n'est pas disponible dans le dataset") | |
| st.markdown("---") | |
| # ========================== 5. CORRÉLATIONS ET PAIRPLOT ========================== | |
| st.markdown("## 🔍 Analyse de corrélations") | |
| # Sélectionner les colonnes numériques | |
| numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns.tolist() | |
| # Retirer is_fraud de la liste | |
| if 'is_fraud' in numeric_cols: | |
| numeric_cols.remove('is_fraud') | |
| # Limiter aux colonnes les plus pertinentes par défaut | |
| default_cols = ['amt', 'city_pop', 'lat', 'long'][:4] | |
| default_cols = [col for col in default_cols if col in numeric_cols] | |
| selected_cols = st.multiselect( | |
| "Choisissez les variables à analyser (3-5 variables recommandées)", | |
| numeric_cols, | |
| default=default_cols[:4], | |
| max_selections=6 | |
| ) | |
| if selected_cols and len(selected_cols) >= 2: | |
| # ========================== MATRICE DE CORRÉLATION ========================== | |
| st.markdown("### 📊 Matrice de corrélation") | |
| corr_matrix = df[selected_cols].corr() | |
| # Créer une heatmap annotée plus lisible | |
| fig_corr = go.Figure(data=go.Heatmap( | |
| z=corr_matrix.values, | |
| x=corr_matrix.columns, | |
| y=corr_matrix.columns, | |
| colorscale='RdBu_r', | |
| zmid=0, | |
| text=corr_matrix.values, | |
| texttemplate='%{text:.2f}', | |
| textfont={"size": 12}, | |
| colorbar=dict(title="Corrélation") | |
| )) | |
| fig_corr.update_layout( | |
| title="Matrice de corrélation de Pearson", | |
| xaxis_title="", | |
| yaxis_title="", | |
| height=500, | |
| width=500 | |
| ) | |
| st.plotly_chart(fig_corr, use_container_width=True) | |
| # Afficher les corrélations les plus fortes | |
| corr_pairs = [] | |
| for i in range(len(corr_matrix.columns)): | |
| for j in range(i+1, len(corr_matrix.columns)): | |
| corr_pairs.append({ | |
| 'Variable 1': corr_matrix.columns[i], | |
| 'Variable 2': corr_matrix.columns[j], | |
| 'Corrélation': corr_matrix.iloc[i, j] | |
| }) | |
| corr_df = pd.DataFrame(corr_pairs).sort_values('Corrélation', key=abs, ascending=False) | |
| st.markdown("#### 🔝 Top 5 des corrélations") | |
| st.dataframe( | |
| corr_df.head(5).style.background_gradient( | |
| subset=['Corrélation'], | |
| cmap='RdBu_r', | |
| vmin=-1, | |
| vmax=1 | |
| ).format({'Corrélation': '{:.3f}'}), | |
| use_container_width=True | |
| ) | |
| # ========================== PAIRPLOT STYLE SEABORN ========================== | |
| st.markdown("### 🎨 Pairplot (style Seaborn)") | |
| # Échantillonner pour la performance | |
| sample_size_pair = min(1000, len(df)) | |
| df_pair = df[selected_cols + ['is_fraud']].sample(n=sample_size_pair, random_state=42) | |
| df_pair['fraud_label'] = df_pair['is_fraud'].map({0: "Non frauduleuse", 1: "Frauduleuse"}) | |
| # Créer une grille de subplots | |
| n_vars = len(selected_cols) | |
| from plotly.subplots import make_subplots | |
| fig_pair = make_subplots( | |
| rows=n_vars, | |
| cols=n_vars, | |
| shared_xaxes=True, | |
| shared_yaxes=True, | |
| vertical_spacing=0.02, | |
| horizontal_spacing=0.02 | |
| ) | |
| colors = {'Non frauduleuse': '#636EFA', 'Frauduleuse': '#EF553B'} | |
| for i, var_y in enumerate(selected_cols): | |
| for j, var_x in enumerate(selected_cols): | |
| row = i + 1 | |
| col = j + 1 | |
| if i == j: | |
| # Diagonale : histogrammes | |
| for fraud_label in ['Non frauduleuse', 'Frauduleuse']: | |
| data = df_pair[df_pair['fraud_label'] == fraud_label][var_x] | |
| fig_pair.add_trace( | |
| go.Histogram( | |
| x=data, | |
| name=fraud_label, | |
| marker_color=colors[fraud_label], | |
| opacity=0.7, | |
| showlegend=(i == 0 and j == 0) | |
| ), | |
| row=row, col=col | |
| ) | |
| elif i > j: | |
| # Triangle inférieur : scatter plots | |
| for fraud_label in ['Non frauduleuse', 'Frauduleuse']: | |
| data = df_pair[df_pair['fraud_label'] == fraud_label] | |
| fig_pair.add_trace( | |
| go.Scatter( | |
| x=data[var_x], | |
| y=data[var_y], | |
| mode='markers', | |
| name=fraud_label, | |
| marker=dict( | |
| color=colors[fraud_label], | |
| size=4, | |
| opacity=0.6 | |
| ), | |
| showlegend=False | |
| ), | |
| row=row, col=col | |
| ) | |
| # Labels uniquement sur les bords | |
| if i == n_vars - 1: | |
| fig_pair.update_xaxes(title_text=var_x, row=row, col=col) | |
| if j == 0: | |
| fig_pair.update_yaxes(title_text=var_y, row=row, col=col) | |
| fig_pair.update_layout( | |
| height=200 * n_vars, | |
| title_text=f"Pairplot - {sample_size_pair} échantillons", | |
| showlegend=True | |
| ) | |
| st.plotly_chart(fig_pair, use_container_width=True) | |
| st.info("💡 **Lecture du pairplot** : La diagonale montre la distribution de chaque variable. Le triangle inférieur montre les relations entre paires de variables.") | |
| elif selected_cols and len(selected_cols) < 2: | |
| st.warning("⚠️ Veuillez sélectionner au moins 2 variables pour voir les corrélations") | |
| else: | |
| st.warning("⚠️ Veuillez sélectionner des variables à analyser") | |
| # ========================== PAGE: FRAUDES (24h) ========================== | |
| def page_frauds(): | |
| st.title("🚨 Fraudes Détectées (Dernières 24h)") | |
| st.info("👇 Cliquez sur **Refresh Data** pour charger les fraudes") | |
| if st.button("🔄 Refresh Data", type="primary", key="frauds_refresh"): | |
| st.session_state.frauds_loaded = True | |
| if not st.session_state.get('frauds_loaded', False): | |
| st.warning("⚠️ Cliquez sur 'Refresh Data' pour afficher les fraudes") | |
| return | |
| with st.spinner("Chargement des fraudes..."): | |
| df = load_last_24h_data() | |
| # 🔧 Convertir created_at en heure de Paris | |
| if 'created_at' in df.columns and not df.empty: | |
| df['created_at'] = pd.to_datetime(df['created_at'], utc=True) | |
| df['created_at'] = df['created_at'].dt.tz_convert('Europe/Paris') | |
| # Formater pour affichage plus lisible | |
| df['created_at_display'] = df['created_at'].dt.strftime('%Y-%m-%d %H:%M:%S') | |
| df_frauds = df[df['pred_is_fraud'] == 1] | |
| st.markdown(f""" | |
| <div style="background-color: {COLOR_FRAUD}; padding: 15px; border-radius: 10px; text-align: center; margin-bottom: 20px;"> | |
| <h2 style="color: white; margin: 0;">🚨 {len(df_frauds)} Fraudes détectées</h2> | |
| </div> | |
| """, unsafe_allow_html=True) | |
| if not df_frauds.empty: | |
| # Utiliser created_at_display au lieu de created_at pour l'affichage | |
| display_cols = ['trans_num','merchant','category','amt','city','gender','created_at_display'] | |
| df_display = df_frauds[display_cols].copy() | |
| df_display = df_display.rename(columns={'created_at_display': 'Date/Heure (Paris)'}) | |
| st.dataframe( | |
| df_display.sort_values('Date/Heure (Paris)', ascending=False), | |
| use_container_width=True, | |
| height=600 | |
| ) | |
| col1, col2, col3 = st.columns(3) | |
| with col1: | |
| st.metric("Montant total", f"${df_frauds['amt'].sum():,.2f}") | |
| with col2: | |
| st.metric("Montant moyen", f"${df_frauds['amt'].mean():,.2f}") | |
| with col3: | |
| st.metric("Montant max", f"${df_frauds['amt'].max():,.2f}") | |
| else: | |
| st.success("✅ Aucune fraude détectée dans les dernières 24h !") | |
| # ========================== PAGE: NON FRAUDES (24h) ========================== | |
| def page_no_frauds(): | |
| st.title("✅ Transactions Légitimes (Dernières 24h)") | |
| st.info("👇 Cliquez sur **Refresh Data** pour charger les transactions") | |
| if st.button("🔄 Refresh Data", type="primary", key="no_frauds_refresh"): | |
| st.session_state.no_frauds_loaded = True | |
| if not st.session_state.get('no_frauds_loaded', False): | |
| st.warning("⚠️ Cliquez sur 'Refresh Data' pour afficher les transactions") | |
| return | |
| with st.spinner("Chargement des transactions légitimes..."): | |
| df = load_last_24h_data() | |
| # 🔧 Convertir created_at en heure de Paris | |
| if 'created_at' in df.columns and not df.empty: | |
| df['created_at'] = pd.to_datetime(df['created_at'], utc=True) | |
| df['created_at'] = df['created_at'].dt.tz_convert('Europe/Paris') | |
| # Formater pour affichage plus lisible | |
| df['created_at_display'] = df['created_at'].dt.strftime('%Y-%m-%d %H:%M:%S') | |
| df_no_frauds = df[df['pred_is_fraud'] == 0] | |
| st.markdown(f""" | |
| <div style="background-color: {COLOR_NO_FRAUD}; padding: 15px; border-radius: 10px; text-align: center; margin-bottom: 20px;"> | |
| <h2 style="color: white; margin: 0;">✅ {len(df_no_frauds)} Transactions légitimes</h2> | |
| </div> | |
| """, unsafe_allow_html=True) | |
| if not df_no_frauds.empty: | |
| # Utiliser created_at_display au lieu de created_at pour l'affichage | |
| display_cols = ['trans_num','merchant','category','amt','city','gender','created_at_display'] | |
| df_display = df_no_frauds[display_cols].copy() | |
| df_display = df_display.rename(columns={'created_at_display': 'Date/Heure (Paris)'}) | |
| st.dataframe( | |
| df_display.sort_values('Date/Heure (Paris)', ascending=False), | |
| use_container_width=True, | |
| height=600 | |
| ) | |
| col1, col2, col3 = st.columns(3) | |
| with col1: | |
| st.metric("Montant total", f"${df_no_frauds['amt'].sum():,.2f}") | |
| with col2: | |
| st.metric("Montant moyen", f"${df_no_frauds['amt'].mean():,.2f}") | |
| with col3: | |
| st.metric("Montant max", f"${df_no_frauds['amt'].max():,.2f}") | |
| else: | |
| st.warning("⚠️ Aucune transaction légitime dans les dernières 24h") | |
| # ========================== NAVIGATION ========================== | |
| def main(): | |
| st.sidebar.title("Navigation") | |
| page = st.sidebar.radio( | |
| "Go to", | |
| ["🏠 Dashboard", "📊 EDA", "🚨 Fraudes (24h)", "✅ Non Fraudes (24h)"] | |
| ) | |
| st.sidebar.markdown("---") | |
| st.sidebar.markdown(""" | |
| ### ℹ️ À propos | |
| Dashboard de détection de fraude en temps réel. | |
| **🔄 Refresh** : Cliquez sur le bouton pour charger/actualiser les données. | |
| **⚡ Optimisé** : Les données ne se chargent que sur demande pour économiser les ressources. | |
| **📊 Données** : | |
| - Dashboard: Stats temps réel | |
| - EDA: Analyse du dataset complet | |
| - Détail: Dernières 24h | |
| """) | |
| if page == "🏠 Dashboard": | |
| page_dashboard() | |
| elif page == "📊 EDA": | |
| page_eda() | |
| elif page == "🚨 Fraudes (24h)": | |
| page_frauds() | |
| elif page == "✅ Non Fraudes (24h)": | |
| page_no_frauds() | |
| if __name__ == "__main__": | |
| main() |