import json
import os
import pickle
import re
import tempfile
import threading
import time

import matplotlib.pyplot as plt
import pandas as pd
import plotly.graph_objects as go
import requests
import shap
import streamlit as st
from evidently import Report
from evidently.metrics import DriftedColumnsCount, ValueDrift
from evidently.tests import lte
from huggingface_hub import hf_hub_download
from streamlit.components.v1 import html as st_html

from functions import most_important_features_min_max

# Per-feature bounds used to constrain the input widgets of the single-request page.
features_min_max = most_important_features_min_max()

# -----------------------------------------------------------------------
# Global configuration – URLs of the HF Spaces services
# -----------------------------------------------------------------------
API_URL = os.environ.get("API_URL", "https://cedm-oc-mlops-projet-2.hf.space")
DASHBOARD_P3_URL = "https://huggingface.co/spaces/CedM/oc_mlops_projet_3"
KEEP_ALIVE_INTERVAL = 5 * 60  # 5 minutes – free-tier HF Spaces CPU sleeps after ~10-15 min of inactivity

# Decision threshold (as a fraction) assumed when the API response has no "threshold" key.
DEFAULT_THRESHOLD = 0.474

# Features shown on the single-request page and covered by the detailed drift report.
important_features = [
    "EXT_SOURCE_3",
    "EXT_SOURCE_2",
    "EXT_SOURCE_1",
    "DAYS_EMPLOYED",
    "PAYMENT_RATE",
    "INSTAL_DPD_MEAN",
    "PREV_CNT_PAYMENT_MEAN",
    "AMT_ANNUITY",
    "CODE_GENDER",
    "PREV_NAME_PRODUCT_TYPE_walk-in_MEAN",
]
# Bound handed to the `lte` share test of DriftedColumnsCount in the summary report.
threshold_drift = 0.50

_EXT_SOURCE_HELP = "Score normalisé provenant d'une source de données externe"


# -----------------------------------------------------------------------
# Keep-alive: keep the HF Spaces containers in the "running" state.
# @st.cache_resource makes sure the thread is started only once.
# -----------------------------------------------------------------------
@st.cache_resource(show_spinner=False)
def start_keep_alive():
    """Start the daemon thread that pings the API and the project-3 dashboard.

    The loop pings both services immediately, then sleeps
    ``KEEP_ALIVE_INTERVAL`` seconds before the next round.

    Returns:
        The started ``threading.Thread``.
    """
    targets = (
        ("API", f"{API_URL}/health"),
        ("Projet 3", DASHBOARD_P3_URL),
    )

    def _loop():
        while True:
            for name, url in targets:
                # Broad catch on purpose: a failed ping must never kill the loop.
                try:
                    r = requests.get(url, timeout=30)
                    print(f"[keep-alive] Ping {name} → HTTP {r.status_code}")
                except Exception as e:
                    print(f"[keep-alive] Ping {name} échoué: {e}")
            time.sleep(KEEP_ALIVE_INTERVAL)

    thread = threading.Thread(target=_loop, daemon=True, name="keep-alive-api")
    thread.start()
    return thread


start_keep_alive()


# -----------------------------------------------------------------------
# Cached loaders and helpers
# -----------------------------------------------------------------------
@st.cache_resource(show_spinner=False)
def load_shap_explainer():
    """Load the SHAP explainer from its pickle file.

    Cached so the file is unpickled once instead of on every prediction.
    NOTE(review): the cached object is shared between sessions – confirm the
    explainer is only read, never mutated.
    SECURITY: ``pickle.load`` executes arbitrary code; the file must stay a
    trusted artefact shipped with the application.
    """
    with open("./hgb_shap_explainer.pkl", "rb") as f:
        return pickle.load(f)


@st.cache_data(show_spinner=False)
def load_model_columns():
    """Return the ordered column names of the reference CSV, without
    ``SK_ID_CURR`` and ``TARGET``."""
    df_ref = pd.read_csv("./train_data_sp2_subsample_1.csv", sep=";", encoding="utf-8", nrows=1)
    return [c for c in df_ref.columns if c not in ("SK_ID_CURR", "TARGET")]


@st.cache_data(show_spinner=False, ttl=3600)
def build_evidently_html(current_df, reference_df, report: int) -> str:
    """Run an Evidently drift report and return it as an HTML string.

    Args:
        current_df: Data to evaluate.
        reference_df: Data the current data is compared against.
        report: ``0`` for one ``ValueDrift`` metric per entry of
            ``important_features``; any other value for a single
            ``DriftedColumnsCount`` metric.

    Returns:
        The HTML produced by Evidently's ``save_html``.
    """
    if report == 0:
        # Chi-square for CODE_GENDER, Kolmogorov-Smirnov for every other feature.
        metrics = [
            ValueDrift(column=col, method="chisquare" if col == "CODE_GENDER" else "ks")
            for col in important_features
        ]
    else:
        metrics = [DriftedColumnsCount(share_tests=[lte(threshold_drift)])]

    snapshot = Report(metrics).run(current_df, reference_df)

    # The report is written to a path and read back; create a closed temp
    # file for that and always delete it, even when saving fails.
    tmp = tempfile.NamedTemporaryFile(suffix=".html", delete=False)
    tmp.close()
    try:
        snapshot.save_html(tmp.name)
        with open(tmp.name, "r", encoding="utf-8") as f:
            return f.read()
    finally:
        os.remove(tmp.name)


def drop_unnecessary_columns(df: pd.DataFrame):
    """Return ``df`` without the identifier / target / bookkeeping columns
    (those that are absent are ignored)."""
    return df.drop(columns=["SK_ID_CURR", "TARGET", "_prediction", "_timestamp"], errors="ignore")


@st.cache_data(show_spinner=False, ttl=3600)
def load_reference_data():
    """Load the gzip-compressed reference dataset pickle."""
    return pd.read_pickle("./train_data_sp1.pkl", compression="gzip")


def _feature_number_input(label, default, step, help_text, key):
    """Render a ``st.number_input`` for ``label`` bounded by ``features_min_max``."""
    bounds = features_min_max[label]
    return st.number_input(
        label,
        min_value=bounds["min"],
        max_value=bounds["max"],
        value=default,
        step=step,
        help=help_text,
        key=key,
    )


# -----------------------------------------------------------------------
# Page configuration
# -----------------------------------------------------------------------
st.set_page_config(page_title="Prêt à Dépenser", layout="wide", page_icon="💳")

# -----------------------------------------------------------------------
# Global CSS – professional, mobile-responsive design
# NOTE(review): the HTML/CSS payload of the `unsafe_allow_html` markdown calls
# in this file is not visible in the reviewed copy; the strings are kept as found.
# -----------------------------------------------------------------------
st.markdown(""" """, unsafe_allow_html=True)

# -----------------------------------------------------------------------
# SIDEBAR – logo, navigation, information
# -----------------------------------------------------------------------
with st.sidebar:
    st.image("Logo_Pret_a_Depenser.png", use_container_width=True)
    st.markdown("*Outil de Scoring des demandes de crédit*")
    st.divider()
    page = st.radio(
        "Navigation",
        options=[
            "❓ Demande simple",
            "📋 Demande en lot",
            "📊 Dérive des données",
            "⚡ Latence & Erreurs API",
        ],
        label_visibility="collapsed",
    )
    st.divider()
    st.markdown(
        "\n**À propos de cet outil :**\n"
        "- Prédiction rapide (10 variables)\n"
        "- Prédiction en lot (fichier CSV)\n"
        "- Surveillance de la dérive des données\n"
        "- Monitoring de l'API (latence / erreurs)\n"
    )
    st.markdown(f'\n\n🔗 API : {API_URL}\n\n', unsafe_allow_html=True)

# -----------------------------------------------------------------------
# Main header (content area)
# -----------------------------------------------------------------------
st.markdown(f"""

Outil de Scoring des demandes de crédit

Prédiction par machine learning · Interprétabilité SHAP · Surveillance des données

""", unsafe_allow_html=True)

# =====================================================================================================================
# PAGE 1 – Single prediction
# =====================================================================================================================
if page == "❓ Demande simple":
    st.markdown('\n\n', unsafe_allow_html=True)
    st.subheader("🔢 Paramètres de la demande")
    st.caption("Les variables sont classées par ordre d'importance (SHAP). Remplissez les champs puis lancez la prédiction.")
    st.markdown('\n\n', unsafe_allow_html=True)

    with st.container():
        col1, col2, col3, col4, col5 = st.columns(5)
        with col1:
            ext_source_3 = _feature_number_input("EXT_SOURCE_3", 0.0, 0.01, _EXT_SOURCE_HELP, 'ext_source_3')
        with col2:
            ext_source_2 = _feature_number_input("EXT_SOURCE_2", 0.0, 0.01, _EXT_SOURCE_HELP, 'ext_source_2')
        with col3:
            ext_source_1 = _feature_number_input("EXT_SOURCE_1", 0.0, 0.01, _EXT_SOURCE_HELP, 'ext_source_1')
        with col4:
            days_employed = _feature_number_input(
                "DAYS_EMPLOYED", 0, 1,
                "Nombre de jours avant la demande où la personne a commencé son emploi actuel (chiffre négatif)",
                'days_employed',
            )
        with col5:
            payment_rate = _feature_number_input(
                "PAYMENT_RATE", 0.0, 0.01,
                "PAYMENT_RATE = AMT_ANNUITY / AMT_CREDIT",
                'payment_rate',
            )

        col6, col7, col8, col9, col10 = st.columns(5)
        with col6:
            instal_dpd_mean = _feature_number_input(
                "INSTAL_DPD_MEAN", 0.0, 1.0,
                "Moyenne des jours de retard sur les paiements des crédits précédents (si négatif, mettre 0)",
                'instal_dpd_mean',
            )
        with col7:
            prev_cnt_payment_mean = _feature_number_input(
                "PREV_CNT_PAYMENT_MEAN", 0.0, 1.0,
                "Moyenne des durées des crédits précédents",
                'prev_cnt_payment_mean',
            )
        with col8:
            amt_annuity = _feature_number_input("AMT_ANNUITY", 0.0, 1.0, "Annuité du prêt", 'amt_annuity')
        with col9:
            gender_codes = features_min_max["CODE_GENDER"]
            code_gender = st.selectbox(
                "CODE_GENDER",
                (gender_codes["femme"], gender_codes["homme"]),
                index=0,
                help="Sexe du client (0:Femme, 1:Homme)",
                key='code_gender',
            )
        with col10:
            prev_name_product_type_walk_in_mean = _feature_number_input(
                "PREV_NAME_PRODUCT_TYPE_walk-in_MEAN", 0.0, 0.01,
                "Ratio de demandes précédentes faites en agence (walk-in)",
                'prev_name_product_type_walk_in_mean',
            )

    st.divider()

    if st.button("🔮 Lancer la prédiction", type="primary"):
        st.subheader("Résultat de la prédiction", divider="blue")
        features = {
            "EXT_SOURCE_3": ext_source_3,
            "EXT_SOURCE_2": ext_source_2,
            "EXT_SOURCE_1": ext_source_1,
            "DAYS_EMPLOYED": days_employed,
            "PAYMENT_RATE": payment_rate,
            "INSTAL_DPD_MEAN": instal_dpd_mean,
            "PREV_CNT_PAYMENT_MEAN": prev_cnt_payment_mean,
            "AMT_ANNUITY": amt_annuity,
            "CODE_GENDER": code_gender,
            "PREV_NAME_PRODUCT_TYPE_walk-in_MEAN": prev_name_product_type_walk_in_mean,
        }

        try:
            with st.spinner("Analyse en cours..."):
                response = requests.post(f"{API_URL}/predict", json={"features": features}, timeout=30)

            if response.status_code == 200:
                result = response.json()
                prediction = result.get("prediction")
                proba_rejet = result.get("probability", 0) * 100
                seuil = result.get("threshold", DEFAULT_THRESHOLD) * 100
                # The API's own decision drives both columns so they cannot disagree.
                rejected = prediction == 1

                col_result1, col_result2 = st.columns([1, 2])
                with col_result1:
                    if rejected:
                        st.error("⚠️ **DOSSIER REJETÉ**")
                        st.metric(
                            label="Probabilité de défaut",
                            value=f"{proba_rejet:.1f}%",
                            delta=f"+{proba_rejet - seuil:.1f}% au-dessus du seuil",
                            delta_color="inverse",
                        )
                    else:
                        st.success("✅ **DOSSIER ACCEPTÉ**")
                        # "inverse" so that being below the threshold (negative delta) is green.
                        st.metric(
                            label="Probabilité de défaut",
                            value=f"{proba_rejet:.1f}%",
                            delta=f"{proba_rejet - seuil:.1f}% sous le seuil",
                            delta_color="inverse",
                        )
                with col_result2:
                    decision_label = 'REJET' if rejected else 'ACCEPTÉ'
                    st.info(
                        f"**Règle de décision :**\n- Seuil de rejet : **{seuil:.1f}%**\n"
                        f"- Probabilité de défaut : **{proba_rejet:.2f}%**\n"
                        f"- Décision : **{decision_label}**"
                    )
            else:
                st.error(f"❌ Erreur API: {response.status_code} - {response.text}")
        # Timeout first: ConnectTimeout derives from both classes and is a timeout.
        except requests.exceptions.Timeout:
            st.error("❌ Timeout : l'API met trop de temps à répondre.")
        except requests.exceptions.ConnectionError:
            st.error("❌ Impossible de se connecter à l'API.")
        except Exception as e:
            st.error(f"❌ Erreur inattendue : {str(e)}")

        st.subheader("Interprétabilité SHAP", divider="gray")
        col20, col21 = st.columns(2)
        with col20:
            with st.container(border=True):
                st.markdown("**🌍 Vue globale du modèle**")
                st.image("./hgb_shap_global.png", use_container_width=True)
        with col21:
            with st.container(border=True):
                st.markdown("**🔍 Vue locale – cette demande**")
                try:
                    explainer = load_shap_explainer()
                    model_columns = load_model_columns()
                    # Columns the form does not provide become NaN after the reindex.
                    features_df = pd.DataFrame([features]).reindex(columns=model_columns)
                    shap_values = explainer(features_df)
                    fig_shap, _ = plt.subplots()
                    try:
                        shap.plots.waterfall(shap_values[0], max_display=10, show=False)
                        st.pyplot(fig_shap, bbox_inches='tight')
                    finally:
                        # Always release the figure, even when plotting fails.
                        plt.close(fig_shap)
                except Exception as e:
                    st.warning(f"⚠️ Impossible d'afficher le graphique SHAP local : {str(e)}")

# =====================================================================================================================
# PAGE 2 – Batch prediction
# =====================================================================================================================
elif page == "📋 Demande en lot":
    st.markdown('\n\n', unsafe_allow_html=True)
    st.subheader("📂 Import du fichier de demandes")
    st.caption("Formats acceptés : CSV avec séparateur `;`, encodage UTF-8, colonne `SK_ID_CURR`. Maximum 1 000 lignes traitées.")
    st.markdown('\n\n', unsafe_allow_html=True)

    uploaded_file = st.file_uploader(
        "Sélectionnez un fichier CSV pré-traité",
        type=["csv"],
        help="Le fichier doit contenir toutes les variables nécessaires (sep=';', encodage UTF-8).",
    )

    if uploaded_file is not None:
        try:
            dataframe = pd.read_csv(uploaded_file, sep=';', encoding='utf-8', index_col='SK_ID_CURR', nrows=1000)
            st.subheader("Aperçu des données chargées", divider="gray")
            st.dataframe(dataframe, use_container_width=True)
        except Exception as e:
            st.error(f"❌ Erreur lors de la lecture du fichier CSV : {e}")
            st.stop()

        st.divider()

        if st.button("🔮 Lancer les prédictions en lot", type="primary", key="predict_batch"):
            try:
                with st.spinner("Analyse en cours..."):
                    # The preview above consumed the stream; rewind before uploading.
                    uploaded_file.seek(0)
                    response = requests.post(
                        f"{API_URL}/predict/file",
                        files={"file": (uploaded_file.name, uploaded_file, "text/csv")},
                        timeout=60,
                    )

                if response.status_code == 200:
                    result = response.json()
                    predictions = result.get("predictions", [])
                    probabilities = result.get("probabilities", [])
                    seuil = result.get("threshold", DEFAULT_THRESHOLD) * 100
                    count = result.get("count", 0)

                    if len(predictions) != len(dataframe) or len(probabilities) != len(dataframe):
                        # The preview is capped at 1000 rows while the whole file is uploaded:
                        # refuse to attach results to the wrong rows.
                        st.error(
                            f"❌ L'API a renvoyé {len(predictions)} prédictions pour {len(dataframe)} lignes chargées : "
                            "impossible d'associer les résultats aux demandes."
                        )
                    else:
                        st.success(f"✅ **{count} prédictions effectuées avec succès !**")

                        nb_acceptes = predictions.count(0)
                        nb_rejetes = predictions.count(1)
                        col_stats1, col_stats2, col_stats3 = st.columns(3)
                        with col_stats1:
                            st.metric("Total des demandes", count)
                        with col_stats2:
                            st.metric(
                                "Dossiers acceptés ✅",
                                nb_acceptes,
                                delta=f"{nb_acceptes/count*100:.1f}%" if count > 0 else "0%",
                            )
                        with col_stats3:
                            st.metric(
                                "Dossiers rejetés ⚠️",
                                nb_rejetes,
                                delta=f"{nb_rejetes/count*100:.1f}%" if count > 0 else "0%",
                                delta_color="inverse",
                            )

                        dataframe_result = dataframe.copy()
                        dataframe_result['Probabilite_defaut'] = [round(p * 100, 2) for p in probabilities]
                        dataframe_result['Prediction'] = predictions

                        st.subheader("Détail des prédictions", divider="gray")
                        st.dataframe(
                            dataframe_result[['Probabilite_defaut', 'Prediction']].style.map(
                                lambda x: 'background-color: #ffcccc' if x == 1 else 'background-color: #ccffcc',
                                subset=['Prediction'],
                            ),
                            use_container_width=True,
                        )

                        csv_result = dataframe_result.to_csv(index=True, sep=';', encoding='utf-8')
                        st.download_button(
                            label="📥 Télécharger les résultats (CSV)",
                            data=csv_result,
                            file_name="predictions_resultats.csv",
                            mime="text/csv",
                        )
                        st.info(f"**Seuil de décision appliqué : {seuil:.1f}%** — Les dossiers dont la probabilité dépasse ce seuil sont rejetés.")
                else:
                    st.error(f"❌ Erreur API: {response.status_code} - {response.text}")
            # Timeout first: ConnectTimeout derives from both classes and is a timeout.
            except requests.exceptions.Timeout:
                st.error("❌ Timeout : l'API met trop de temps à répondre.")
            except requests.exceptions.ConnectionError:
                st.error("❌ Impossible de se connecter à l'API.")
            except Exception as e:
                st.error(f"❌ Erreur inattendue : {str(e)}")

# =====================================================================================================================
# PAGE 3 – Data drift
# =====================================================================================================================
elif page == "📊 Dérive des données":
    st.markdown('\n\n', unsafe_allow_html=True)
    st.subheader("📊 Analyse de la dérive des données")
    st.markdown(
        "🚧 La génération des rapports est disponible dès que le dataset actuel contient **au moins 100 lignes**. "
        "L'analyse porte sur les **1 000 dernières lignes** pour garantir fiabilité et performance."
    )
    st.markdown('\n\n', unsafe_allow_html=True)

    reference_df = drop_unnecessary_columns(load_reference_data())

    try:
        current_file_path = hf_hub_download(repo_id="CedM/oc_mlops_projet_2", filename="data_io.csv", repo_type="dataset")
        current_df = pd.read_csv(current_file_path, encoding="utf-8", sep=";", index_col='SK_ID_CURR').tail(1000)
        current_df = drop_unnecessary_columns(current_df)
    except FileNotFoundError:
        st.error("❌ Fichier 'data_io.csv' introuvable sur le dépôt Hugging Face.")
        st.stop()
    except Exception as e:
        st.error(f"❌ Erreur lors du téléchargement : {str(e)}")
        st.stop()

    current_df_count = current_df.shape[0]
    st.info(f"📂 **Dataset actuel :** {current_df_count} lignes | 📂 **Dataset de référence :** {reference_df.shape[0]} lignes")
    st.divider()

    if current_df_count < 100:
        st.warning("⚠️ Le dataset actuel contient moins de 100 lignes. L'analyse sera disponible dès 100 lignes.")
    else:
        report_type = st.selectbox(
            "Type de rapport",
            options=[
                ("Rapport détaillé – 10 variables les plus importantes (SHAP)", 0),
                ("Rapport synthétique – toutes les autres variables", 1),
            ],
            format_func=lambda x: x[0],
            key="report_type",
        )

        if st.button("🔮 Lancer l'analyse de dérive", type="primary", key="run_data_drift", help="Compare le dataset actuel au dataset de référence."):
            try:
                with st.spinner("Génération du rapport Evidently en cours..."):
                    if report_type[1] == 0:
                        # Detailed report: keep only the important features that are present.
                        ref_df_filtered = reference_df[[c for c in reference_df.columns if c in important_features]]
                        cur_df_filtered = current_df[[c for c in current_df.columns if c in important_features]]
                    else:
                        # Summary report: everything except the important features.
                        ref_df_filtered = reference_df.drop(columns=[c for c in important_features if c in reference_df.columns])
                        cur_df_filtered = current_df.drop(columns=[c for c in important_features if c in current_df.columns])
                    html_content = build_evidently_html(cur_df_filtered, ref_df_filtered, report=report_type[1])

                col_i1, col_i2 = st.columns(2)
                col_i1.metric("Variables dans le dataset de référence", ref_df_filtered.shape[1])
                col_i2.metric("Variables dans le dataset actuel", cur_df_filtered.shape[1])
                st.subheader("Rapport Evidently", divider="gray")
                st_html(html_content, height=900, scrolling=True)
            except Exception as e:
                st.error(f"❌ Erreur inattendue : {str(e)}")
# =====================================================================================================================
# PAGE 4 – API latency & errors
# =====================================================================================================================

# Patterns of the API's success messages, compiled once rather than per log line.
_SINGLE_PREDICTION_RE = re.compile(r"Prédiction effectuée avec succès:.*temps=([\d.]+)s\)")
_BATCH_PREDICTION_RE = re.compile(
    r"Prédictions effectuées avec succès: (\d+) résultats \(temps d'exécution: ([\d.]+)s\)"
)
# Width of the analysis window, measured back from the most recent log entry.
_LOG_WINDOW = pd.Timedelta(hours=72)


def _keep_recent(frame, cutoff_time):
    """Return the rows of ``frame`` whose ``timestamp`` is at or after ``cutoff_time``."""
    if frame.empty:
        return frame
    frame["timestamp"] = pd.to_datetime(frame["timestamp"], utc=True, errors="coerce")
    frame = frame.dropna(subset=["timestamp"])
    return frame[frame["timestamp"] >= cutoff_time]


def load_api_logs():
    """Download and parse the API's JSON-lines log.

    Returns:
        A dict with three keys:
        ``"df"`` – one row per successful prediction call (``timestamp``,
        ``endpoint``, ``latency``, ``prediction_count``);
        ``"errors_df"`` – one row per ERROR/WARNING entry (``timestamp``,
        ``level``, ``message``, ``module``);
        ``"error_counts"`` – ``{"ERROR": n, "WARNING": n, "TOTAL": n}``.
        Both frames are restricted to the 72 hours preceding the most recent
        timestamp found in the file (or the current time when none parsed).
        When the log file cannot be found, an error is shown and empty frames
        with zero counts are returned.
    """
    try:
        log_file_path = hf_hub_download(
            repo_id="CedM/oc_mlops_projet_2",
            filename="api_log.jsonl",
            repo_type="dataset",
            force_download=True,
        )
    except Exception:
        # Best effort: any download failure falls back to a local copy of the log.
        log_file_path = "../api_log.jsonl"

    latency_rows = []
    event_rows = []
    seen_timestamps = []
    try:
        with open(log_file_path, 'r', encoding='utf-8') as f:
            for line in f:
                try:
                    log_entry = json.loads(line.strip())
                except json.JSONDecodeError:
                    continue
                # A valid JSON line that is not an object carries no log fields.
                if not isinstance(log_entry, dict):
                    continue

                ts = pd.to_datetime(log_entry.get("timestamp"), utc=True, errors="coerce")
                if pd.isna(ts):
                    continue
                seen_timestamps.append(ts)

                # A null or non-string message must not break the regex search.
                message = str(log_entry.get("message") or "")
                match_single = _SINGLE_PREDICTION_RE.search(message)
                match_batch = None if match_single else _BATCH_PREDICTION_RE.search(message)
                if match_single:
                    latency_rows.append({
                        "timestamp": ts,
                        "endpoint": "/predict",
                        "latency": float(match_single.group(1)),
                        "prediction_count": 1,
                    })
                elif match_batch:
                    latency_rows.append({
                        "timestamp": ts,
                        "endpoint": "/predict/file",
                        "latency": float(match_batch.group(2)),
                        "prediction_count": int(match_batch.group(1)),
                    })

                level = str(log_entry.get("level", "")).upper()
                if level in ("ERROR", "WARNING"):
                    event_rows.append({
                        "timestamp": ts,
                        "level": level,
                        "message": message,
                        "module": log_entry.get("module", ""),
                    })
    except FileNotFoundError:
        st.error("❌ Fichier 'api_log.jsonl' introuvable.")
        return {
            "df": pd.DataFrame(),
            "errors_df": pd.DataFrame(),
            "error_counts": {"ERROR": 0, "WARNING": 0, "TOTAL": 0},
        }

    # The window is anchored on the newest entry, not on the wall clock.
    last_ts = max(seen_timestamps) if seen_timestamps else pd.Timestamp.now(tz='UTC')
    cutoff_time = last_ts - _LOG_WINDOW

    df = _keep_recent(pd.DataFrame(latency_rows), cutoff_time)
    df_err = _keep_recent(pd.DataFrame(event_rows), cutoff_time)

    if df_err.empty:
        error_counts = {"ERROR": 0, "WARNING": 0, "TOTAL": 0}
    else:
        error_counts = {
            "ERROR": int((df_err["level"] == "ERROR").sum()),
            "WARNING": int((df_err["level"] == "WARNING").sum()),
            "TOTAL": int(df_err.shape[0]),
        }
    return {"df": df, "errors_df": df_err, "error_counts": error_counts}


# Standalone `if`: the navigation labels are mutually exclusive, so this behaves
# exactly like one more branch of the page-selection chain.
if page == "⚡ Latence & Erreurs API":
    # NOTE(review): the HTML payload of these two markdown calls is not visible
    # in the reviewed copy; the strings are kept as found.
    st.markdown('\n\n', unsafe_allow_html=True)
    st.subheader("⚡ Monitoring de l'API – Latence & Erreurs")
    st.caption("Fenêtre d'analyse : **72 dernières heures**. Les logs sont mis à jour après chaque appel aux endpoints `/predict` et `/predict/file`.")
    st.markdown('\n\n', unsafe_allow_html=True)

    # Clicking reruns the script, and the logs are re-downloaded on every run below.
    if st.button("🔄 Rafraîchir les données", type="primary", key="refresh_logs", help="Recharge les logs depuis le HF Dataset."):
        st.session_state['refresh_logs_triggered'] = True

    with st.spinner("Chargement des logs..."):
        result = load_api_logs()

    df_logs = result.get('df', pd.DataFrame())
    df_errors = result.get('errors_df', pd.DataFrame())
    error_counts = result.get('error_counts', {})

    if df_logs.empty and df_errors.empty:
        st.warning("⚠️ Aucune donnée de latence ou d'erreur disponible dans les logs de l'API.")
    else:
        if not df_logs.empty:
            st.info(f"📊 **{len(df_logs)} appels de prédiction** enregistrés sur les 72 dernières heures.")
            st.subheader("Latence au fil du temps", divider="gray")

            fig = go.Figure()
            colors = {"/predict": "#2e86de", "/predict/file": "#e67e22"}
            for endpoint in df_logs["endpoint"].unique():
                df_ep = df_logs[df_logs["endpoint"] == endpoint].sort_values("timestamp")
                color = colors.get(endpoint, "#333")
                fig.add_trace(go.Scatter(
                    x=df_ep["timestamp"],
                    y=df_ep["latency"],
                    mode='lines+markers',
                    name=endpoint,
                    marker=dict(size=8, color=color, line=dict(width=1, color='DarkSlateGrey')),
                    line=dict(color=color, width=2),
                    # Plotly hover labels only break lines on <br>; raw newlines are collapsed.
                    hovertemplate="%{x}<br>Latence: %{y:.4f}s<br>Nb prédictions: %{customdata}",
                    customdata=df_ep["prediction_count"],
                ))
                # Mean line and ±1σ band need at least two points (std is NaN otherwise).
                if len(df_ep) >= 2:
                    mean_l = df_ep["latency"].mean()
                    std_l = df_ep["latency"].std()
                    fig.add_hline(
                        y=mean_l,
                        line_dash="dash",
                        line_color=color,
                        annotation_text=f"Moy. {endpoint}: {mean_l:.4f}s",
                        annotation_position="right",
                        annotation_font_size=10,
                    )
                    fig.add_hrect(
                        y0=max(0, mean_l - std_l),
                        y1=mean_l + std_l,
                        fillcolor=color,
                        opacity=0.1,
                        line_width=0,
                        annotation_text=f"±σ {endpoint}: {std_l:.4f}s",
                        annotation_position="right",
                        annotation_font_size=9,
                    )

            fig.update_layout(
                title="Latence de prédiction de l'API (fenêtre 72h)",
                xaxis_title="Date et heure (UTC)",
                yaxis_title="Temps d'exécution (s)",
                hovermode="x unified",
                legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.01),
                height=480,
                plot_bgcolor="#f8fafc",
                paper_bgcolor="#f8fafc",
            )
            st.plotly_chart(fig, use_container_width=True)

        st.subheader("Erreurs & Warnings (72h)", divider="gray")
        col_err, col_warn, col_total = st.columns(3)
        with col_err:
            st.metric("🔴 Erreurs (ERROR)", error_counts.get('ERROR', 0))
        with col_warn:
            st.metric("🟡 Warnings (WARNING)", error_counts.get('WARNING', 0))
        with col_total:
            st.metric("📋 Total événements", error_counts.get('TOTAL', 0))

        if not df_errors.empty:
            st.subheader("Détail des événements", divider="gray")
            st.dataframe(df_errors.sort_values('timestamp', ascending=False).reset_index(drop=True), use_container_width=True)