Spaces:

klydekushy
/

Vortex-Flux

Running

File size: 21,936 Bytes

f986738

import streamlit as st
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from datetime import datetime
import numpy as np
import time

def show_ml_feature_store(client, sheet_name):
    """
    Affiche le ML Feature Store & Analytics
    
    Args:
        client: Client gspread authentifié
        sheet_name: Nom de la Google Sheet
    """
    
    # Fonction helper pour charger les données
    def get_data_from_sheet(sheet_name_tab):
        try:
            sh = client.open(sheet_name)
            ws = sh.worksheet(sheet_name_tab)
            return pd.DataFrame(ws.get_all_records())
        except:
            return pd.DataFrame()
    
    # === GOTHAM SURVEILLANCE THEME CSS ===
    st.markdown("""
    
        <style>
        @import url('https://fonts.googleapis.com/css2?family=Space+Grotesk:wght@300;400;500;600;700&display=swap');
        
        /* Fond global - tons surveillance */
        .stApp {
            background: linear-gradient(135deg, #0d1117 0%, #161b22 50%, #1c2128 100%);
        }
        
        /* Application Space Grotesk UNIQUEMENT aux contenus textuels - PAS aux boutons ni icônes */
        .stApp h1, .stApp h2, .stApp h3, .stApp h4, .stApp h5, .stApp h6,
        .stApp p:not([data-testid]),
        .stMarkdown, 
        .stText {
            font-family: 'Space Grotesk', sans-serif !important;
        }
        
        /* Headers style surveillance discrète */
        .stApp h1, .stApp h2, .stApp h3 {
            color: #58a6ff !important;
            font-family: 'Space Grotesk', sans-serif !important;
            font-weight: 500 !important;
            letter-spacing: 0.5px;
            text-shadow: none;
        }
        
        .stApp h1 {
            font-size: 1.8rem !important;
            border-bottom: 1px solid rgba(88, 166, 255, 0.2);
            padding-bottom: 12px;
        }
        
        .stApp h2 {
            font-size: 1.3rem !important;
            color: #8b949e !important;
        }
        
        /* Metrics cards - style ops center */
        [data-testid="stMetric"] {
            background: rgba(22, 27, 34, 0.6);
            border: 1px solid rgba(48, 54, 61, 0.8);
            border-radius: 6px;
            padding: 16px;
            box-shadow: 0 1px 3px rgba(0, 0, 0, 0.4);
            backdrop-filter: blur(8px);
        }
        
        [data-testid="stMetric"] label {
            color: #8b949e !important;
            font-size: 0.75rem !important;
            font-weight: 500 !important;
            text-transform: uppercase;
            letter-spacing: 0.8px;
            font-family: 'Space Grotesk', sans-serif !important;
        }
        
        [data-testid="stMetric"] [data-testid="stMetricValue"] {
            color: #c9d1d9 !important;
            font-size: 1.6rem !important;
            font-weight: 600 !important;
            text-shadow: none;
            font-family: 'Space Grotesk', sans-serif !important;
        }
        
        [data-testid="stMetric"] [data-testid="stMetricDelta"] {
            color: #58a6ff !important;
            font-size: 0.85rem !important;
            font-weight: 400;
            font-family: 'Space Grotesk', sans-serif !important;
        }
        
        /* Boutons style ops - SANS Space Grotesk */
        .stButton > button {
            background: rgba(22, 27, 34, 0.8);
            border: 1px solid rgba(48, 54, 61, 1);
            color: #c9d1d9 !important;
            font-weight: 500;
            font-size: 0.85rem;
            letter-spacing: 0.5px;
            border-radius: 4px;
            padding: 10px 20px;
            transition: all 0.2s ease;
            box-shadow: 0 1px 2px rgba(0, 0, 0, 0.3);
        }
        
        .stButton > button:hover {
            background: rgba(33, 38, 45, 1);
            border-color: rgba(88, 166, 255, 0.4);
            box-shadow: 0 0 8px rgba(88, 166, 255, 0.15);
        }
        
        /* Expanders style surveillance */
        .streamlit-expanderHeader {
            background: rgba(22, 27, 34, 0.4);
            border-left: 2px solid rgba(88, 166, 255, 0.5);
            color: #8b949e !important;
            font-weight: 500;
            font-size: 0.9rem;
            letter-spacing: 0.3px;
            padding: 12px 16px;
            border-radius: 3px;
            font-family: 'Space Grotesk', sans-serif !important;
        }
        
        .streamlit-expanderHeader:hover {
            background: rgba(22, 27, 34, 0.6);
            border-left-color: rgba(88, 166, 255, 0.8);
        }
        
        /* Dataframe style */
        .stDataFrame {
            border: 1px solid rgba(48, 54, 61, 0.6);
            border-radius: 4px;
            overflow: hidden;
            font-size: 0.85rem;
        }
        
        /* Info boxes */
        .stAlert {
            background: rgba(22, 27, 34, 0.6);
            border: 1px solid rgba(48, 54, 61, 0.8);
            border-left: 3px solid rgba(88, 166, 255, 0.6);
            border-radius: 4px;
            color: #8b949e;
            font-size: 0.9rem;
            font-family: 'Space Grotesk', sans-serif !important;
        }
        
        /* Checkbox */
        .stCheckbox label {
            color: #8b949e !important;
            font-weight: 500;
            font-size: 0.85rem;
            font-family: 'Space Grotesk', sans-serif !important;
        }
        
        /* Divider */
        hr {
            border: none;
            height: 1px;
            background: rgba(48, 54, 61, 0.6);
            margin: 2rem 0;
        }
        
        /* Download button - SANS Space Grotesk */
        .stDownloadButton > button {
            background: rgba(88, 166, 255, 0.1);
            border: 1px solid rgba(88, 166, 255, 0.4);
            color: #58a6ff !important;
            font-weight: 600;
        }
        
        .stDownloadButton > button:hover {
            background: rgba(88, 166, 255, 0.15);
            border-color: rgba(88, 166, 255, 0.6);
        }
        
        /* Scrollbar customization */
        ::-webkit-scrollbar {
            width: 8px;
            height: 8px;
        }
        
        ::-webkit-scrollbar-track {
            background: rgba(13, 17, 23, 0.4);
        }
        
        ::-webkit-scrollbar-thumb {
            background: rgba(48, 54, 61, 0.8);
            border-radius: 4px;
        }
        
        ::-webkit-scrollbar-thumb:hover {
            background: rgba(88, 166, 255, 0.3);
        }
        </style>
        """, unsafe_allow_html=True)
    
    # Header principal
    st.markdown("""
    <h1 style='text-align: left; margin-bottom: 24px;'>
    ML Feature Store & Analytics
    </h1>
    """, unsafe_allow_html=True)
    
    # Boutons de contrôle
    col_refresh, col_auto = st.columns([3, 1])
    with col_refresh:
        if st.button("Actualiser les données", use_container_width=True):
            st.rerun()
    with col_auto:
        auto_refresh = st.checkbox("Auto 30s", value=False)
    
    if auto_refresh:
        st_autorefresh = st.empty()
        with st_autorefresh:
            st.info("✅ Actualisation automatique activée")
        time.sleep(30)
        st.rerun()
    
    # Chargement des données pour le feature store
    try:
        df_clients = get_data_from_sheet("Clients_KYC")
        df_prets = get_data_from_sheet("Prets_Master")
    except Exception as e:
        st.error(f"❌ Erreur de lecture : {e}")
        st.stop()
    
    if df_clients.empty or df_prets.empty:
        st.warning("⚠️ Données insuffisantes")
        st.stop()
    
    # Fusion des données
    df_full = pd.merge(df_prets, df_clients, on="ID_Client", how="left", suffixes=('_Pret', '_Client'))
    
    # Thème Plotly surveillance discret
    plotly_template = {
        'layout': {
            'plot_bgcolor': 'rgba(13, 17, 23, 0.8)',
            'paper_bgcolor': 'rgba(22, 27, 34, 0.3)',
            'font': {'color': '#8b949e', 'family': 'Space Grotesk', 'size': 12},
            'title': {
                'font': {'size': 14, 'color': '#c9d1d9', 'family': 'Space Grotesk'},
                'x': 0,
                'xanchor': 'left'
            },
            'xaxis': {
                'gridcolor': 'rgba(48, 54, 61, 0.3)',
                'linecolor': 'rgba(48, 54, 61, 0.5)',
                'zerolinecolor': 'rgba(48, 54, 61, 0.5)',
                'tickfont': {'family': 'Space Grotesk', 'size': 11}
            },
            'yaxis': {
                'gridcolor': 'rgba(48, 54, 61, 0.3)',
                'linecolor': 'rgba(48, 54, 61, 0.5)',
                'zerolinecolor': 'rgba(48, 54, 61, 0.5)',
                'tickfont': {'family': 'Space Grotesk', 'size': 11}
            },
            'legend': {
                'font': {'family': 'Space Grotesk', 'size': 11}
            }
        }
    }
    
    # FEATURE ENGINEERING
    st.markdown("<h2>Feature Engineering Pipeline</h2>", unsafe_allow_html=True)
    
    # 1. FEATURES DÉMOGRAPHIQUES
    with st.expander(" Features Démographiques", expanded=True):
        df_full['Date_Naissance'] = pd.to_datetime(df_full['Date_Naissance'], errors='coerce')
        df_full['Age'] = ((datetime.now() - df_full['Date_Naissance']).dt.days / 365.25).round(0)
        df_full['Pers_Charge'] = pd.to_numeric(df_full['Pers_Charge'], errors='coerce').fillna(0)
        df_full['Categorie_Age'] = pd.cut(df_full['Age'], 
                                         bins=[0, 25, 35, 45, 55, 100],
                                         labels=['18-25', '26-35', '36-45', '46-55', '55+'])
        
        col1, col2, col3, col4 = st.columns(4)
        col1.metric("Âge moyen", f"{df_full['Age'].mean():.1f} ans", 
                    delta=f"σ {df_full['Age'].std():.1f}")
        col2.metric("Pers. charge", f"{df_full['Pers_Charge'].mean():.1f}",
                    delta=f"Max {df_full['Pers_Charge'].max():.0f}")
        col3.metric("Mariés", f"{(df_full['Etat_Civil']=='Marié(e)').sum()}")
        col4.metric("Célibataires", f"{(df_full['Etat_Civil']=='Célibataire').sum()}")
        
        fig_age = px.histogram(df_full, x='Age', nbins=20, 
                               title="Distribution des âges",
                               color_discrete_sequence=['#58a6ff'],
                               template=plotly_template)
        fig_age.update_traces(marker_line_color='rgba(139, 148, 158, 0.3)', marker_line_width=0.5, opacity=0.8)
        st.plotly_chart(fig_age, use_container_width=True)
    
    # 2. FEATURES FINANCIÈRES
    with st.expander(" Features Financières", expanded=True):
        df_full['Revenus_Mensuels'] = pd.to_numeric(df_full['Revenus_Mensuels'], errors='coerce').fillna(0)
        df_full['Autres_Revenus'] = pd.to_numeric(df_full['Autres_Revenus'], errors='coerce').fillna(0)
        df_full['Charges_Estimees'] = pd.to_numeric(df_full['Charges_Estimees'], errors='coerce').fillna(0)
        df_full['Patrimoine_Declare'] = pd.to_numeric(df_full['Patrimoine_Declare'], errors='coerce').fillna(0)
        df_full['Montant_Capital'] = pd.to_numeric(df_full['Montant_Capital'], errors='coerce').fillna(0)
        df_full['Montant_Versement'] = pd.to_numeric(df_full['Montant_Versement'], errors='coerce').fillna(0)
        
        df_full['Revenus_Totaux'] = df_full['Revenus_Mensuels'] + df_full['Autres_Revenus']
        df_full['Revenu_Net'] = df_full['Revenus_Totaux'] - df_full['Charges_Estimees']
        df_full['Ratio_Charges_Revenus'] = (df_full['Charges_Estimees'] / df_full['Revenus_Totaux']).replace([np.inf, -np.inf], 0).fillna(0)
        df_full['Taux_Effort_Mensuel'] = ((df_full['Montant_Versement'] * 4.33) / df_full['Revenus_Totaux']).replace([np.inf, -np.inf], 0).fillna(0) * 100
        df_full['Capacite_Endettement'] = df_full['Revenus_Totaux'] * 0.33
        df_full['Reste_A_Vivre'] = df_full['Revenus_Totaux'] - df_full['Charges_Estimees'] - (df_full['Montant_Versement'] * 4.33)
        
        col1, col2, col3, col4 = st.columns(4)
        col1.metric("Revenu moyen", f"{df_full['Revenus_Totaux'].mean():,.0f} XOF",
                    delta=f"Med {df_full['Revenus_Totaux'].median():,.0f}")
        col2.metric("Taux effort", f"{df_full['Taux_Effort_Mensuel'].mean():.1f}%",
                    delta="⚠️ Risque" if df_full['Taux_Effort_Mensuel'].mean() > 33 else "✅ Sain")
        col3.metric("Reste à vivre", f"{df_full['Reste_A_Vivre'].mean():,.0f} XOF",
                    delta=f"Min {df_full['Reste_A_Vivre'].min():,.0f}")
        col4.metric("Patrimoine", f"{df_full['Patrimoine_Declare'].mean():,.0f} XOF")
        
        fig_effort = px.box(df_full, y='Taux_Effort_Mensuel', 
                            title="Taux d'effort mensuel (%)",
                            color_discrete_sequence=['#58a6ff'],
                            template=plotly_template)
        fig_effort.add_hline(y=33, line_dash="dash", line_color="rgba(248, 81, 73, 0.6)", 
                             line_width=1.5,
                             annotation_text="Seuil critique 33%",
                             annotation_font_color="#8b949e",
                             annotation_font_size=11)
        fig_effort.update_traces(opacity=0.7)
        st.plotly_chart(fig_effort, use_container_width=True)
    
    # 3. FEATURES PROFESSIONNELLES
    with st.expander("💼 Features Professionnelles", expanded=True):
        df_full['Anciennete_Emploi'] = pd.to_numeric(df_full['Anciennete_Emploi'], errors='coerce').fillna(0)
        df_full['Anciennete_Revenu'] = pd.to_numeric(df_full['Anciennete_Revenu'], errors='coerce').fillna(0)
        df_full['Stabilite_Pro'] = df_full['Anciennete_Emploi'].apply(
            lambda x: 'Élevée' if x >= 36 else ('Moyenne' if x >= 12 else 'Faible')
        )
        
        col1, col2, col3 = st.columns(3)
        col1.metric("Ancienneté", f"{df_full['Anciennete_Emploi'].mean():.1f} mois")
        col2.metric("CDI / Stable", f"{(df_full['Statut_Pro']=='CDI').sum()}")
        col3.metric("Indépendants", f"{(df_full['Statut_Pro']=='Indépendant').sum()}")
        
        secteur_count = df_full['Secteur_Activite'].value_counts().head(10)
        fig_secteur = px.bar(x=secteur_count.values, y=secteur_count.index,
                             orientation='h',
                             title="Top 10 secteurs d'activité",
                             color_discrete_sequence=['#58a6ff'],
                             template=plotly_template)
        fig_secteur.update_traces(marker_line_color='rgba(139, 148, 158, 0.2)', marker_line_width=0.5, opacity=0.8)
        st.plotly_chart(fig_secteur, use_container_width=True)
    
    # 4. FEATURES DE PRÊTS
    with st.expander(" Features de Prêts & Historique", expanded=True):
        # Conversion des colonnes en numérique AVANT le groupby
        df_prets['Montant_Capital'] = pd.to_numeric(df_prets['Montant_Capital'], errors='coerce').fillna(0)
        df_prets['Duree_Semaines'] = pd.to_numeric(df_prets['Duree_Semaines'], errors='coerce').fillna(0)
        
        pret_history = df_prets.groupby('ID_Client').agg({
            'ID_Pret': 'count',
            'Montant_Capital': ['sum', 'mean', 'max'],
            'Duree_Semaines': 'mean',
            'Statut': lambda x: (x == 'Actif').sum()
        }).reset_index()
        
        pret_history.columns = ['ID_Client', 'Nb_Prets_Total', 'Montant_Total_Emprunte',
                                'Montant_Moyen_Pret', 'Montant_Max_Pret', 
                                'Duree_Moyenne_Semaines', 'Nb_Prets_Actifs']
        
        df_full = pd.merge(df_full, pret_history, on='ID_Client', how='left')
        df_full['Ratio_Pret_Revenu'] = (df_full['Montant_Capital'] / df_full['Revenus_Totaux']).replace([np.inf, -np.inf], 0).fillna(0)
        df_full['Frequence_Emprunt'] = df_full['Nb_Prets_Total']
        
        col1, col2, col3, col4 = st.columns(4)
        col1.metric("Prêts / client", f"{df_full['Nb_Prets_Total'].mean():.1f}",
                    delta=f"Max {df_full['Nb_Prets_Total'].max():.0f}")
        col2.metric("Montant moy", f"{df_full['Montant_Moyen_Pret'].mean():,.0f} XOF")
        col3.metric("Durée moy", f"{df_full['Duree_Moyenne_Semaines'].mean():.0f} sem")
        col4.metric("Prêts actifs", f"{df_full['Nb_Prets_Actifs'].mean():.1f}")
        
        fig_montant = px.scatter(df_full, x='Revenus_Totaux', y='Montant_Capital',
                                 size='Nb_Prets_Total', color='Statut_Pro',
                                 title="Montant prêt vs revenus",
                                 hover_data=['Nom_Complet_Client'],
                                 color_discrete_sequence=['#58a6ff', '#79c0ff', '#a5d6ff', '#c9d1d9'],
                                 template=plotly_template)
        fig_montant.update_traces(opacity=0.7)
        st.plotly_chart(fig_montant, use_container_width=True)
    
    # 5. FEATURES GÉOGRAPHIQUES
    with st.expander(" Features Géographiques", expanded=True):
        ville_count = df_full['Ville'].value_counts()
        quartier_count = df_full['Quartier'].value_counts().head(10)
        
        col1, col2 = st.columns(2)
        with col1:
            st.metric("Villes", len(ville_count))
            fig_ville = px.pie(values=ville_count.values, names=ville_count.index,
                              title="Répartition par ville",
                              color_discrete_sequence=['#58a6ff', '#79c0ff', '#a5d6ff', '#c9d1d9', '#8b949e'],
                              template=plotly_template)
            fig_ville.update_traces(textfont_family='Space Grotesk', opacity=0.85)
            st.plotly_chart(fig_ville, use_container_width=True)
        
        with col2:
            st.metric("Quartiers", len(quartier_count))
            fig_quartier = px.bar(x=quartier_count.values, y=quartier_count.index,
                                  orientation='h', title="Top 10 quartiers",
                                  color_discrete_sequence=['#58a6ff'],
                                  template=plotly_template)
            fig_quartier.update_traces(opacity=0.8)
            st.plotly_chart(fig_quartier, use_container_width=True)
    
    # 6. FEATURES TEMPORELLES
    with st.expander(" Features Temporelles", expanded=True):
        df_full['Date_Creation'] = pd.to_datetime(df_full.get('Date_Creation', pd.Series()), errors='coerce')
        df_full['Anciennete_Client_Jours'] = (datetime.now() - df_full['Date_Creation']).dt.days
        df_full['Anciennete_Client_Mois'] = (df_full['Anciennete_Client_Jours'] / 30.44).round(1)
        
        col1, col2 = st.columns(2)
        col1.metric("Ancienneté moy", f"{df_full['Anciennete_Client_Mois'].mean():.1f} mois")
        col2.metric("Client + ancien", f"{df_full['Anciennete_Client_Mois'].max():.0f} mois")
        
        df_full['Mois_Creation'] = df_full['Date_Creation'].dt.to_period('M').astype(str)
        timeline = df_full.groupby('Mois_Creation').size().reset_index(name='Nb_Prets')
        fig_timeline = px.line(timeline, x='Mois_Creation', y='Nb_Prets',
                              title="Évolution temporelle des prêts",
                              markers=True,
                              color_discrete_sequence=['#58a6ff'],
                              template=plotly_template)
        fig_timeline.update_traces(line_width=2, marker_size=6, opacity=0.9)
        st.plotly_chart(fig_timeline, use_container_width=True)
    
    # MÉTRIQUES GLOBALES
    st.divider()
    st.markdown("<h2> Métriques Globales</h2>", unsafe_allow_html=True)
    col1, col2, col3 = st.columns(3)
    col1.metric("Taux défaut", "4.2 %", delta="Simulé")
    col2.metric("Montant moy", "10,000 XOF")
    col3.metric("Durée moy", "12 semaines")
    
    # TABLE DATASET
    st.divider()
    st.markdown("<h2> Dataset ML Enrichi</h2>", unsafe_allow_html=True)
    
    features_cols = [
        'ID_Pret', 'ID_Client', 'Nom_Complet_Client', 'Age', 'Categorie_Age',
        'Revenus_Totaux', 'Revenu_Net', 'Ratio_Charges_Revenus', 
        'Taux_Effort_Mensuel', 'Reste_A_Vivre', 'Patrimoine_Declare',
        'Stabilite_Pro', 'Anciennete_Emploi', 'Statut_Pro',
        'Nb_Prets_Total', 'Montant_Moyen_Pret', 'Ratio_Pret_Revenu',
        'Ville', 'Quartier', 'Anciennete_Client_Mois', 'Statut'
    ]
    
    features_cols = [col for col in features_cols if col in df_full.columns]
    df_display = df_full[features_cols].copy()
    numeric_cols = df_display.select_dtypes(include=[np.number]).columns
    df_display[numeric_cols] = df_display[numeric_cols].round(2)
    
    st.dataframe(df_display, use_container_width=True, height=400)
    
    # EXPORT
    st.divider()
    st.markdown("<h2> Export Dataset ML</h2>", unsafe_allow_html=True)
    
    col1, col2 = st.columns([2, 1])
    with col1:
        csv = df_full.to_csv(index=False).encode('utf-8')
        st.download_button(
            label=" Télécharger dataset complet (.csv)",
            data=csv,
            file_name=f'dataset_credit_scoring_{datetime.now().strftime("%Y%m%d_%H%M%S")}.csv',
            mime='text/csv',
            use_container_width=True
        )
    
    with col2:
        st.info(f"**{len(df_full)} lignes** \n**{len(df_full.columns)} features**")
    
    with st.expander(" Liste des features créées"):
        created_features = [
            'Age', 'Categorie_Age', 'Revenus_Totaux', 'Revenu_Net',
            'Ratio_Charges_Revenus', 'Taux_Effort_Mensuel', 'Capacite_Endettement',
            'Reste_A_Vivre', 'Stabilite_Pro', 'Nb_Prets_Total', 'Montant_Total_Emprunte',
            'Montant_Moyen_Pret', 'Montant_Max_Pret', 'Ratio_Pret_Revenu',
            'Frequence_Emprunt', 'Anciennete_Client_Jours', 'Anciennete_Client_Mois'
        ]
        st.markdown("**Features engineering:**")
        for i, feat in enumerate(created_features, 1):
            st.markdown(f"`{i:02d}` {feat}")