{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "72d11d95",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "✅ Evidently importe\n"
     ]
    }
   ],
   "source": [
    "# Evidently imports used to compare the reference and production distributions:\n",
    "# - Report: builds the automatic report\n",
    "# - DataDriftPreset: bundle of metrics used to detect data drift\n",
    "# - ColumnMapping: tells Evidently the type of each column (numerical/categorical)\n",
    "\n",
    "import json\n",
    "from pathlib import Path\n",
    "\n",
    "import pandas as pd\n",
    "\n",
    "try:\n",
    "    from evidently.legacy.report import Report\n",
    "    from evidently.legacy.metric_preset import DataDriftPreset\n",
    "    from evidently.legacy.pipeline.column_mapping import ColumnMapping\n",
    "except ImportError:\n",
    "    # Fallback for Evidently layouts that do not provide the `evidently.legacy` package\n",
    "    from evidently.report import Report\n",
    "    from evidently.metric_preset import DataDriftPreset\n",
    "    from evidently.pipeline.column_mapping import ColumnMapping\n",
    "\n",
    "print(\"✅ Evidently importe\")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9b33c429",
   "metadata": {},
   "source": [
    "## Chargement référence et données production"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "61a259c2",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "⚠️ Colonnes vides supprimées : 31\n",
      "✅ Référence : 10000 lignes | Production : 500 lignes\n",
      " Colonnes analysées : 711\n"
     ]
    }
   ],
   "source": [
    "# Load both datasets and make them comparable:\n",
    "# 1. Reference = training-time data, read from ../reference/reference.csv\n",
    "# 2. Production = the `input_features` payloads found in the inference logs\n",
    "# 3. Cleaning: turn \"\" (empty values) into missing values\n",
    "# 4. Alignment: keep only the columns present in both datasets\n",
    "\n",
    "# Reference (training)\n",
    "reference = pd.read_csv(\"../reference/reference.csv\")\n",
    "\n",
    "# Production: one JSON object per log line, flattened into one column per feature\n",
    "LOG_FILE = Path(\"../logs/predictions.jsonl\")\n",
    "logs = pd.read_json(LOG_FILE, lines=True)\n",
    "production = pd.json_normalize(logs['input_features'])\n",
    "\n",
    "# Cleaning (\"\" -> NA). infer_objects() then attempts a soft conversion of object\n",
    "# columns to a more specific dtype.\n",
    "# NOTE(review): columns that now hold pd.NA may stay `object` -- confirm that the\n",
    "# production dtypes match the reference dtypes used for the column mapping.\n",
    "production = production.replace(\"\", pd.NA).infer_objects()\n",
    "\n",
    "# Keep only the columns shared with the reference, in the reference's column order.\n",
    "# (A plain set intersection yields an arbitrary order that can change between kernel\n",
    "# sessions, which would reshuffle the report from one run to the next.)\n",
    "production_cols = set(production.columns)\n",
    "common_cols = [col for col in reference.columns if col in production_cols]\n",
    "if not common_cols:\n",
    "    # Fail here with a clear message instead of deep inside Evidently.\n",
    "    raise ValueError(\"Aucune colonne commune entre la référence et la production\")\n",
    "reference = reference[common_cols]\n",
    "production = production[common_cols]\n",
    "\n",
    "# Drop columns that are 100% missing in either dataset to avoid Evidently errors\n",
    "empty_ref = reference.columns[reference.isna().all()].tolist()\n",
    "empty_prod = production.columns[production.isna().all()].tolist()\n",
    "empty_cols = sorted(set(empty_ref) | set(empty_prod))\n",
    "if empty_cols:\n",
    "    reference = reference.drop(columns=empty_cols)\n",
    "    production = production.drop(columns=empty_cols)\n",
    "    print(f\"⚠️ Colonnes vides supprimées : {len(empty_cols)}\")\n",
    "\n",
    "print(f\"✅ Référence : {len(reference)} lignes | Production : {len(production)} lignes\")\n",
    "print(f\" Colonnes analysées : {len(reference.columns)}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8a5feb72",
   "metadata": {},
   "source": [
    "## Calcul du data drift + génération du rapport"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "8e4c48a8",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " Numériques : 580 | Catégorique : 131\n",
      "✅ Rapport généré : reports/data_drift_report.html\n"
     ]
    }
   ],
   "source": [
    "# ColumnMapping tells Evidently which columns are numerical and which are categorical\n",
    "# so that it can pick a drift test suited to each column type.\n",
    "# The split is derived from the dtypes of the reference dataset.\n",
    "# NOTE(review): the statistical test actually applied is chosen by Evidently's defaults\n",
    "# (e.g. KS / chi-squared or a distance measure) -- check `stattest_name` in the report\n",
    "# rather than assuming a specific test.\n",
    "\n",
    "column_mapping = ColumnMapping()\n",
    "column_mapping.numerical_features = reference.select_dtypes(include=['number']).columns.tolist()\n",
    "column_mapping.categorical_features = reference.select_dtypes(include=['object', 'bool']).columns.tolist()\n",
    "\n",
    "print(f\" Numériques : {len(column_mapping.numerical_features)} | Catégorique : {len(column_mapping.categorical_features)}\")\n",
    "\n",
    "# DataDriftPreset is run with its default settings: it produces a per-column drift\n",
    "# verdict and a dataset-level drift summary, using Evidently's default thresholds.\n",
    "data_drift_report = Report(metrics=[DataDriftPreset()])\n",
    "data_drift_report.run(reference_data=reference, current_data=production, column_mapping=column_mapping)\n",
    "\n",
    "# Save the interactive HTML dashboard; parents=True so a missing parent folder\n",
    "# does not abort the run.\n",
    "REPORT_DIR = Path(\"../reports\")\n",
    "REPORT_DIR.mkdir(parents=True, exist_ok=True)\n",
    "report_path = REPORT_DIR / \"data_drift_report.html\"\n",
    "data_drift_report.save_html(str(report_path))\n",
    "print(\"✅ Rapport généré : reports/data_drift_report.html\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e6e9f4c5",
   "metadata": {},
   "source": [
    "## Alertes automatiques"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "c5497ce9",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "🔴 ALERTE : Drift détecté sur 1 features !\n",
      " Exemples : ['AMT_INCOME_TOTAL']\n",
      "\n",
      " 📋 Recommandations : \n",
      " - Vérifier source des données (anomalie/changement)\n",
      " - Envisager réentraînement du modèle\n",
      " - Ajouter monitoring continu sur ces features\n",
      "\n",
      "📊 Ouvre le fichier reports/data_drift_report.html dans ton navigateur pour le dashboard complet\n"
     ]
    }
   ],
   "source": [
    "# Alerting on top of the Evidently report:\n",
    "# - Pull the per-column results out of the report dictionary\n",
    "# - DRIFT_SCORE_THRESHOLD = 0.3: a drift score above it is treated as moderate-to-strong drift\n",
    "#     * 0.1-0.3 = slight (tolerated)\n",
    "#     * > 0.3   = alert (intervention recommended)\n",
    "# - Tune the threshold to the business need (stricter = more alerts)\n",
    "DRIFT_SCORE_THRESHOLD = 0.3\n",
    "\n",
    "\n",
    "def is_alerting(info, threshold=DRIFT_SCORE_THRESHOLD):\n",
    "    \"\"\"Tell whether one column's drift result should raise an alert.\n",
    "\n",
    "    Args:\n",
    "        info: per-column entry of the report's \"drift_by_columns\" mapping.\n",
    "        threshold: minimum drift score required for distance-based tests.\n",
    "\n",
    "    Returns:\n",
    "        True when Evidently flagged the column and, for distance-based tests,\n",
    "        its drift score is strictly above `threshold`.\n",
    "    \"\"\"\n",
    "    if not info.get(\"drift_detected\"):\n",
    "        return False\n",
    "    # A p-value gets SMALLER as drift gets stronger, so a column flagged by a p-value\n",
    "    # test has a small score and would never pass \"score > threshold\"; trust\n",
    "    # Evidently's own verdict in that case.\n",
    "    # NOTE(review): relies on the test's display name containing \"p_value\" -- confirm\n",
    "    # against the stattest names produced by the installed Evidently version.\n",
    "    if \"p_value\" in str(info.get(\"stattest_name\", \"\")).lower():\n",
    "        return True\n",
    "    score = info.get(\"drift_score\")\n",
    "    # A missing score cannot be compared; treat it as \"no alert\" instead of crashing.\n",
    "    return score is not None and score > threshold\n",
    "\n",
    "\n",
    "# Locate the first metric whose result carries the per-column drift table\n",
    "report_dict = data_drift_report.as_dict()\n",
    "metrics = report_dict.get(\"metrics\", [])\n",
    "drift_summary = next(\n",
    "    (m[\"result\"][\"drift_by_columns\"] for m in metrics\n",
    "     if \"drift_by_columns\" in m.get(\"result\", {})),\n",
    "    None,\n",
    ")\n",
    "\n",
    "if drift_summary is None:\n",
    "    # Show what the report does contain to help diagnose a layout change\n",
    "    sample_keys = [list(m.get(\"result\", {}).keys()) for m in metrics[:3]]\n",
    "    print(\"⚠️ Impossible de trouver 'drift_by_columns' dans le rapport Evidently\")\n",
    "    print(f\" Exemples de clés disponibles : {sample_keys}\")\n",
    "else:\n",
    "    drifted_features = [col for col, info in drift_summary.items() if is_alerting(info)]\n",
    "\n",
    "    if drifted_features:\n",
    "        print(f\"🔴 ALERTE : Drift détecté sur {len(drifted_features)} features !\")\n",
    "        print(f\" Exemples : {drifted_features[:5]}\")\n",
    "        print(\"\\n 📋 Recommandations : \")\n",
    "        print(\" - Vérifier source des données (anomalie/changement)\")\n",
    "        print(\" - Envisager réentraînement du modèle\")\n",
    "        print(\" - Ajouter monitoring continu sur ces features\")\n",
    "    else:\n",
    "        print(\"✅ Aucun drift majeur détecté\")\n",
    "\n",
    "print(\"\\n📊 Ouvre le fichier reports/data_drift_report.html dans ton navigateur pour le dashboard complet\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "OC_P6",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}