{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "425434fa", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Configuration chargée avec succès !\n", "MLflow Experiment: OC_P6_Credit_Scoring\n", "Model: LogisticRegression\n", "Fixed threshold: 0.5\n" ] } ], "source": [ "# ============================================================================\n", "# VERSION 1: Baseline LogisticRegression - paramètres par défaut\n", "# ============================================================================\n", "# Objectif: modèle le plus simple possible, sans aucune gestion du déséquilibre\n", "# ni ajustement de seuil\n", "# Validation: StratifiedKFold (5 folds) pour conserver la proportion de classes\n", "# Modèle: LogisticRegression() avec max_iter=1000, random_state=42\n", "# Features: X_train, y_train, X_test, y_test (seront scalés avec StandardScaler)\n", "# Seuil fixe: 0.5\n", "# Métriques par fold: AUC-ROC, Accuracy, F1-score, Recall classe 1\n", "# Coût métier: 10 * FN + 1 * FP (avec seuil=0.5)\n", "# MLflow: run_name=\"V1_LogisticRegression_Baseline\"\n", "# Tags: version=\"1\", model=\"LogisticRegression\"\n", "\n", "import datetime\n", "import numpy as np\n", "import pandas as pd\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.model_selection import StratifiedKFold\n", "from sklearn.preprocessing import StandardScaler\n", "from sklearn.metrics import roc_auc_score, accuracy_score, f1_score, recall_score, confusion_matrix\n", "from sklearn.exceptions import ConvergenceWarning\n", "import warnings\n", "\n", "warnings.filterwarnings('ignore', message='.*Failed to resolve installed pip version.*')\n", "warnings.filterwarnings('ignore', category=FutureWarning, message='.*penalty.*deprecated.*')\n", "warnings.filterwarnings('ignore', category=ConvergenceWarning)\n", "\n", "# ============================================================================\n", "# CONFIGURATION\n", "# 
============================================================================\n", "MLFLOW_TRACKING_URI = \"http://127.0.0.1:5000\"\n", "MLFLOW_EXPERIMENT_NAME = \"OC_P6_Credit_Scoring\"\n", "\n", "PROJECT_VERSION = \"1.0\"\n", "MODEL_NAME = \"LogisticRegression\"\n", "NOTEBOOK_NAME = \"04_regression\"\n", "RUN_DATE = datetime.datetime.now()\n", "\n", "DATA_PATH = \"../data/processed/\"\n", "TRAIN_FILE = \"features_train.csv\"\n", "TEST_FILE = \"features_test.csv\"\n", "\n", "# Configuration du modèle baseline (paramètres par défaut)\n", "MODEL_CONFIG_V1 = {\n", " \"max_iter\": 1000,\n", " \"random_state\": 42\n", "}\n", "\n", "RANDOM_STATE = 42\n", "THRESHOLD_FIXED = 0.5 # Seuil fixe pour les prédictions\n", "\n", "print(\"Configuration chargée avec succès !\")\n", "print(f\"MLflow Experiment: {MLFLOW_EXPERIMENT_NAME}\")\n", "print(f\"Model: {MODEL_NAME}\")\n", "print(f\"Fixed threshold: {THRESHOLD_FIXED}\")\n" ] }, { "cell_type": "code", "execution_count": 2, "id": "a076e751", "metadata": {}, "outputs": [], "source": [ "# Configuration MLflow\n", "from src.mlflow_config import configure_mlflow\n", "\n", "mlflow = configure_mlflow(autolog=False)\n" ] }, { "cell_type": "code", "execution_count": 3, "id": "fc246658", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "✓ Données chargées:\n", " X_train: (10000, 741)\n", " y_train: (10000,)\n", " X_test: (10000, 741)\n", " y_test: (10000,)\n", " Classe 0: 9225, Classe 1: 775\n", "\n", "✓ Vérification initiale des NaN:\n", " Total NaN in X_train: 5146964\n", " Total NaN in X_test: 5106144\n", "\n", "✓ Identification des colonnes vides (100% NaN):\n", " Colonnes vides dans X_train: 17\n", " Colonnes vides dans X_test: 1\n", " Suppression de 17 colonnes vides...\n", " X_train après suppression: (10000, 724)\n", " X_test après suppression: (10000, 724)\n", "\n", "✓ Imputation des NaN restants:\n", " NaN restants in X_train: 4976964\n", " NaN restants in X_test: 4936311\n", " Imputation avec 
la médiane...\n", " X_train après imputation: (10000, 724)\n", " X_test après imputation: (10000, 724)\n", " Vérification post-imputation:\n", " NaN in X_train: 0\n", " NaN in X_test: 0\n", "\n", "✓ Données finales après nettoyage:\n", " X_train: (10000, 724)\n", " y_train: (10000,)\n", " X_test: (10000, 724)\n", " y_test: (10000,)\n" ] } ], "source": [ "# ============================================================================\n", "# CHARGEMENT ET PRÉPARATION DES DONNÉES\n", "# ============================================================================\n", "\n", "# Chargement des données d'entraînement\n", "X_train = pd.read_csv(DATA_PATH + TRAIN_FILE)\n", "y_train = X_train.pop(\"TARGET\")\n", "\n", "# Chargement des données de test\n", "X_test = pd.read_csv(DATA_PATH + TEST_FILE)\n", "y_test = X_test.pop(\"TARGET\")\n", "\n", "print(f\"✓ Données chargées:\")\n", "print(f\" X_train: {X_train.shape}\")\n", "print(f\" y_train: {y_train.shape}\")\n", "print(f\" X_test: {X_test.shape}\")\n", "print(f\" y_test: {y_test.shape}\")\n", "print(f\" Classe 0: {(y_train==0).sum()}, Classe 1: {(y_train==1).sum()}\")\n", "\n", "# ============================================================================\n", "# NETTOYAGE DES DONNÉES: Suppression des colonnes avec 100% NaN\n", "# ============================================================================\n", "from sklearn.impute import SimpleImputer\n", "\n", "# Vérifier les NaN\n", "nan_train = X_train.isna().sum().sum()\n", "nan_test = X_test.isna().sum().sum()\n", "\n", "print(f\"\\n✓ Vérification initiale des NaN:\")\n", "print(f\" Total NaN in X_train: {nan_train}\")\n", "print(f\" Total NaN in X_test: {nan_test}\")\n", "\n", "# Identifier et supprimer les colonnes entièrement NaN dans X_train\n", "empty_cols_train = X_train.columns[X_train.isna().all()].tolist()\n", "empty_cols_test = X_test.columns[X_test.isna().all()].tolist()\n", "\n", "print(f\"\\n✓ Identification des colonnes vides (100% NaN):\")\n", 
"print(f\" Colonnes vides dans X_train: {len(empty_cols_train)}\")\n", "print(f\" Colonnes vides dans X_test: {len(empty_cols_test)}\")\n", "\n", "# Supprimer les colonnes vides (union des deux ensembles)\n", "cols_to_drop = set(empty_cols_train) | set(empty_cols_test)\n", "if cols_to_drop:\n", " print(f\" Suppression de {len(cols_to_drop)} colonnes vides...\")\n", " X_train = X_train.drop(columns=list(cols_to_drop))\n", " X_test = X_test.drop(columns=list(cols_to_drop))\n", " print(f\" X_train après suppression: {X_train.shape}\")\n", " print(f\" X_test après suppression: {X_test.shape}\")\n", "\n", "# ============================================================================\n", "# IMPUTATION DES VALEURS NaN RESTANTES AVEC LA MÉDIANE\n", "# ============================================================================\n", "\n", "nan_train_remaining = X_train.isna().sum().sum()\n", "nan_test_remaining = X_test.isna().sum().sum()\n", "\n", "print(f\"\\n✓ Imputation des NaN restants:\")\n", "print(f\" NaN restants in X_train: {nan_train_remaining}\")\n", "print(f\" NaN restants in X_test: {nan_test_remaining}\")\n", "\n", "if nan_train_remaining > 0 or nan_test_remaining > 0:\n", " print(f\" Imputation avec la médiane...\")\n", " \n", " # Créer un imputer avec stratégie médiane\n", " imputer = SimpleImputer(strategy='median')\n", " \n", " # Fit sur X_train et transformer X_train et X_test\n", " X_train_imputed = imputer.fit_transform(X_train)\n", " X_test_imputed = imputer.transform(X_test)\n", " \n", " # Reconvertir en DataFrame\n", " X_train = pd.DataFrame(X_train_imputed, columns=X_train.columns)\n", " X_test = pd.DataFrame(X_test_imputed, columns=X_test.columns)\n", " \n", " print(f\" X_train après imputation: {X_train.shape}\")\n", " print(f\" X_test après imputation: {X_test.shape}\")\n", " print(f\" Vérification post-imputation:\")\n", " print(f\" NaN in X_train: {X_train.isna().sum().sum()}\")\n", " print(f\" NaN in X_test: {X_test.isna().sum().sum()}\")\n", 
"else:\n", " print(f\" Aucun NaN à imputer !\")\n", "\n", "print(f\"\\n✓ Données finales après nettoyage:\")\n", "print(f\" X_train: {X_train.shape}\")\n", "print(f\" y_train: {y_train.shape}\")\n", "print(f\" X_test: {X_test.shape}\")\n", "print(f\" y_test: {y_test.shape}\")\n" ] }, { "cell_type": "code", "execution_count": 4, "id": "44d75270", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "✓ Features standardisées (StandardScaler):\n", " Shape train: (10000, 724)\n", " Mean: -0.00000000 (≈ 0)\n", " Std: 0.874353 (≈ 1)\n" ] } ], "source": [ "# ============================================================================\n", "# STANDARDISATION DES FEATURES\n", "# ============================================================================\n", "# La régression logistique est sensible à l'échelle des features\n", "# Utiliser StandardScaler (fit sur train, transform sur test)\n", "\n", "scaler = StandardScaler()\n", "X_train_scaled = scaler.fit_transform(X_train)\n", "X_test_scaled = scaler.transform(X_test)\n", "\n", "# Reconvertir en DataFrame pour conserver les noms de colonnes\n", "X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)\n", "X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns)\n", "\n", "print(f\"\\n✓ Features standardisées (StandardScaler):\")\n", "print(f\" Shape train: {X_train_scaled.shape}\")\n", "print(f\" Mean: {X_train_scaled.mean().mean():.8f} (≈ 0)\")\n", "print(f\" Std: {X_train_scaled.std().mean():.6f} (≈ 1)\")\n" ] }, { "cell_type": "code", "execution_count": 5, "id": "59eabb43", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Fold 1/5 | AUC=0.6995 | Acc=0.9170 | F1=0.0568 | Recall=0.0323 | Cost=1516\n", "Fold 2/5 | AUC=0.6972 | Acc=0.9225 | F1=0.1243 | Recall=0.0710 | Cost=1451\n", "Fold 3/5 | AUC=0.7036 | Acc=0.9160 | F1=0.0562 | Recall=0.0323 | Cost=1518\n", "Fold 4/5 | AUC=0.7061 | Acc=0.9140 | F1=0.0444 | Recall=0.0258 | 
Cost=1531\n", "Fold 5/5 | AUC=0.6984 | Acc=0.9145 | F1=0.0339 | Recall=0.0194 | Cost=1539\n", "\n", "✓ Cross-Validation LogisticRegression V1 terminée\n", " AUC moyen: 0.7010 ± 0.0038\n", " F1 moyen: 0.0631 ± 0.0355\n", " Recall moyen: 0.0361 ± 0.0202\n", " Coût métier moyen: 1511.00 ± 34.85\n", " Seuil optimal: 0.50\n", "🏃 View run V1_LogisticRegression_Baseline at: http://127.0.0.1:5000/#/experiments/1/runs/00e6a5708f0340678afb3fe611ba11c8\n", "🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1\n" ] } ], "source": [ "# ============================================================================\n", "# CROSS-VALIDATION: LogisticRegression V1 Baseline\n", "# ============================================================================\n", "# StratifiedKFold (5 folds) pour conserver la proportion de classes\n", "# Seuil fixe = 0.5 pour les prédictions (pas d'optimisation)\n", "\n", "from src.mlflow_config import configure_mlflow\n", "\n", "mlflow = configure_mlflow(autolog=False)\n", "\n", "# Terminer tout run actif avant de commencer\n", "mlflow.end_run()\n", "\n", "RUN_NAME_V1 = \"V1_LogisticRegression_Baseline\"\n", "\n", "fold_results = []\n", "\n", "with mlflow.start_run(run_name=RUN_NAME_V1):\n", " # ========== Logging des paramètres et tags ==========\n", " mlflow.log_params(MODEL_CONFIG_V1)\n", " mlflow.set_tag(\"version\", \"1\")\n", " mlflow.set_tag(\"model\", \"LogisticRegression\")\n", " mlflow.set_tag(\"notebook\", NOTEBOOK_NAME)\n", " mlflow.set_tag(\"phase\", \"baseline_cv\")\n", " mlflow.set_tag(\"threshold\", str(THRESHOLD_FIXED))\n", " mlflow.set_tag(\"scaling\", \"StandardScaler\")\n", " mlflow.set_tag(\"model_type\", \"LogisticRegression\")\n", " \n", " # ========== StratifiedKFold (5 folds) ==========\n", " skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)\n", " \n", " for fold_idx, (train_idx, val_idx) in enumerate(skf.split(X_train_scaled, y_train), start=1):\n", " X_tr, X_val = X_train_scaled.iloc[train_idx], 
X_train_scaled.iloc[val_idx]\n", " y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]\n", " \n", " # ========== Entraînement ==========\n", " model = LogisticRegression(**MODEL_CONFIG_V1)\n", " model.fit(X_tr, y_tr)\n", " \n", " # ========== Prédictions ==========\n", " y_val_proba = model.predict_proba(X_val)[:, 1] # Probabilités classe 1\n", " y_val_pred = (y_val_proba >= THRESHOLD_FIXED).astype(int) # Seuil fixe 0.5\n", " \n", " # ========== Métriques ==========\n", " auc = roc_auc_score(y_val, y_val_proba)\n", " accuracy = accuracy_score(y_val, y_val_pred)\n", " f1 = f1_score(y_val, y_val_pred)\n", " recall = recall_score(y_val, y_val_pred)\n", " \n", " # ========== Coût métier (seuil=0.5) ==========\n", " tn, fp, fn, tp = confusion_matrix(y_val, y_val_pred).ravel()\n", " cost = 10 * fn + 1 * fp\n", " \n", " fold_results.append({\n", " \"fold\": fold_idx,\n", " \"auc\": auc,\n", " \"accuracy\": accuracy,\n", " \"f1_score\": f1,\n", " \"recall_class1\": recall,\n", " \"business_cost_min\": cost,\n", " \"optimal_threshold\": THRESHOLD_FIXED,\n", " \"tp\": tp,\n", " \"fp\": fp,\n", " \"fn\": fn,\n", " \"tn\": tn\n", " })\n", " \n", " print(f\"Fold {fold_idx}/5 | AUC={auc:.4f} | Acc={accuracy:.4f} | \"\n", " f\"F1={f1:.4f} | Recall={recall:.4f} | Cost={cost:.0f}\")\n", " \n", " # ========== Agrégation des résultats ==========\n", " cv_results_df = pd.DataFrame(fold_results)\n", " \n", " metrics_mean = {\n", " \"auc\": cv_results_df[\"auc\"].mean(),\n", " \"f1_score\": cv_results_df[\"f1_score\"].mean(),\n", " \"recall_class1\": cv_results_df[\"recall_class1\"].mean(),\n", " \"business_cost_min\": cv_results_df[\"business_cost_min\"].mean(),\n", " \"optimal_threshold\": THRESHOLD_FIXED,\n", " }\n", " \n", " metrics_std = {\n", " \"auc\": cv_results_df[\"auc\"].std(),\n", " \"f1_score\": cv_results_df[\"f1_score\"].std(),\n", " \"recall_class1\": cv_results_df[\"recall_class1\"].std(),\n", " \"business_cost_min\": 
cv_results_df[\"business_cost_min\"].std(),\n", " }\n", " \n", " # ========== Logging dans MLFlow ==========\n", " # Utiliser les MÊMES noms que le schéma standard MLflow (sans préfixe)\n", " mlflow.log_metric(\"auc\", metrics_mean[\"auc\"])\n", " mlflow.log_metric(\"f1_score\", metrics_mean[\"f1_score\"])\n", " mlflow.log_metric(\"recall_class1\", metrics_mean[\"recall_class1\"])\n", " mlflow.log_metric(\"business_cost_min\", metrics_mean[\"business_cost_min\"])\n", " mlflow.log_metric(\"optimal_threshold\", metrics_mean[\"optimal_threshold\"])\n", " \n", " # Log artefact JSON avec détails par fold\n", " mlflow.log_dict(cv_results_df.to_dict(orient=\"records\"), \"cv_results_per_fold.json\")\n", " \n", " print(\"\\n✓ Cross-Validation LogisticRegression V1 terminée\")\n", " print(f\" AUC moyen: {metrics_mean['auc']:.4f} ± {metrics_std['auc']:.4f}\")\n", " print(f\" F1 moyen: {metrics_mean['f1_score']:.4f} ± {metrics_std['f1_score']:.4f}\")\n", " print(f\" Recall moyen: {metrics_mean['recall_class1']:.4f} ± {metrics_std['recall_class1']:.4f}\")\n", " print(f\" Coût métier moyen: {metrics_mean['business_cost_min']:.2f} ± {metrics_std['business_cost_min']:.2f}\")\n", " print(f\" Seuil optimal: {metrics_mean['optimal_threshold']:.2f}\")\n" ] }, { "cell_type": "code", "execution_count": 6, "id": "e9269e02", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "==============================================================================================================\n", "TABLEAU RÉCAPITULATIF: Métriques par fold\n", "==============================================================================================================\n", " fold auc f1_score recall_class1 business_cost_min optimal_threshold\n", " 1 0.699467 0.056818 0.032258 1516 0.5\n", " 2 0.697180 0.124294 0.070968 1451 0.5\n", " 3 0.703589 0.056180 0.032258 1518 0.5\n", " 4 0.706146 0.044444 0.025806 1531 0.5\n", " 5 0.698425 0.033898 0.019355 1539 0.5\n", "\n", 
"==============================================================================================================\n", "RÉSUMÉ GLOBAL: Moyennes et Écart-types sur 5 folds\n", "==============================================================================================================\n", " Métrique Moyenne Écart-type\n", " AUC-ROC 0.7010 0.0038\n", " F1-Score 0.0631 0.0355\n", "Recall Classe 1 0.0361 0.0202\n", "Coût Métier Min 1511.00 34.85\n", " Seuil Optimal 0.50 -\n", "==============================================================================================================\n" ] } ], "source": [ "# ============================================================================\n", "# TABLEAU RÉCAPITULATIF: Métriques par fold\n", "# ============================================================================\n", "\n", "print(\"\\n\" + \"=\"*110)\n", "print(\"TABLEAU RÉCAPITULATIF: Métriques par fold\")\n", "print(\"=\"*110)\n", "\n", "display_df = cv_results_df[[\"fold\", \"auc\", \"f1_score\", \"recall_class1\", \"business_cost_min\", \"optimal_threshold\"]].copy()\n", "print(display_df.to_string(index=False))\n", "\n", "# Afficher les moyennes et écart-types\n", "print(\"\\n\" + \"=\"*110)\n", "print(\"RÉSUMÉ GLOBAL: Moyennes et Écart-types sur 5 folds\")\n", "print(\"=\"*110)\n", "\n", "summary_data = {\n", " \"Métrique\": [\"AUC-ROC\", \"F1-Score\", \"Recall Classe 1\", \"Coût Métier Min\", \"Seuil Optimal\"],\n", " \"Moyenne\": [\n", " f\"{metrics_mean['auc']:.4f}\",\n", " f\"{metrics_mean['f1_score']:.4f}\",\n", " f\"{metrics_mean['recall_class1']:.4f}\",\n", " f\"{metrics_mean['business_cost_min']:.2f}\",\n", " f\"{metrics_mean['optimal_threshold']:.2f}\",\n", " ],\n", " \"Écart-type\": [\n", " f\"{metrics_std['auc']:.4f}\",\n", " f\"{metrics_std['f1_score']:.4f}\",\n", " f\"{metrics_std['recall_class1']:.4f}\",\n", " f\"{metrics_std['business_cost_min']:.2f}\",\n", " \"-\",\n", " ]\n", "}\n", "\n", "summary_df = pd.DataFrame(summary_data)\n", 
"print(summary_df.to_string(index=False))\n", "print(\"=\"*110)\n" ] }, { "cell_type": "code", "execution_count": 7, "id": "0dcbf61a", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "✓ Modèle final LogisticRegression V1 entraîné sur l'ensemble train complet\n", " Nombre de features: 724\n", " Intercept: -3.504665\n", " Norme des coefficients: 3.948967\n" ] } ], "source": [ "# ============================================================================\n", "# ENTRAÎNEMENT FINAL: LogisticRegression sur l'ensemble train complet\n", "# ============================================================================\n", "\n", "final_model_v1 = LogisticRegression(**MODEL_CONFIG_V1)\n", "final_model_v1.fit(X_train_scaled, y_train)\n", "\n", "print(\"\\n✓ Modèle final LogisticRegression V1 entraîné sur l'ensemble train complet\")\n", "print(f\" Nombre de features: {X_train_scaled.shape[1]}\")\n", "print(f\" Intercept: {final_model_v1.intercept_[0]:.6f}\")\n", "print(f\" Norme des coefficients: {np.linalg.norm(final_model_v1.coef_):.6f}\")\n" ] }, { "cell_type": "code", "execution_count": 8, "id": "0bdf34fb", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "✓ Vérification initiale y_test:\n", " y_test shape: (10000,)\n", " NaN in y_test: 10000\n", "\n", "⚠️ ATTENTION: y_test est entièrement NaN - Évaluation test IGNORÉE\n", " Les données de test n'ont pas de cible valide.\n" ] } ], "source": [ "# ============================================================================\n", "# ÉVALUATION SUR L'ENSEMBLE TEST\n", "# ============================================================================\n", "# Utiliser le même seuil fixe de 0.5\n", "\n", "# Vérifier et nettoyer les NaN dans y_test\n", "print(f\"✓ Vérification initiale y_test:\")\n", "print(f\" y_test shape: {y_test.shape}\")\n", "print(f\" NaN in y_test: {y_test.isna().sum()}\")\n", "\n", "# Vérifier si y_test est entièrement NaN\n", "if 
y_test.isna().sum() == len(y_test):\n", " print(f\"\\n⚠️ ATTENTION: y_test est entièrement NaN - Évaluation test IGNORÉE\")\n", " print(f\" Les données de test n'ont pas de cible valide.\")\n", " test_auc = None\n", " test_accuracy = None\n", " test_f1 = None\n", " test_recall = None\n", " test_cost = None\n", " tp_test = None\n", " fp_test = None\n", " fn_test = None\n", " tn_test = None\n", " \n", "else:\n", " # Supprimer les lignes avec NaN dans y_test\n", " if y_test.isna().sum() > 0:\n", " print(f\" Suppression de {y_test.isna().sum()} lignes avec NaN dans y_test...\")\n", " mask_test_clean = ~y_test.isna()\n", " y_test = y_test[mask_test_clean]\n", " X_test_scaled = X_test_scaled[mask_test_clean]\n", " print(f\" y_test après suppression: {y_test.shape}\")\n", " print(f\" X_test_scaled après suppression: {X_test_scaled.shape}\")\n", "\n", " # Réinitialiser les indices\n", " y_test.reset_index(drop=True, inplace=True)\n", " X_test_scaled.reset_index(drop=True, inplace=True)\n", "\n", " # Prédictions sur le test\n", " y_test_proba = final_model_v1.predict_proba(X_test_scaled)[:, 1]\n", " y_test_pred = (y_test_proba >= THRESHOLD_FIXED).astype(int)\n", "\n", " # Métriques sur le test\n", " test_auc = roc_auc_score(y_test, y_test_proba)\n", " test_accuracy = accuracy_score(y_test, y_test_pred)\n", " test_f1 = f1_score(y_test, y_test_pred)\n", " test_recall = recall_score(y_test, y_test_pred)\n", "\n", " # Coût métier\n", " tn_test, fp_test, fn_test, tp_test = confusion_matrix(y_test, y_test_pred).ravel()\n", " test_cost = 10 * fn_test + 1 * fp_test\n", "\n", " print(\"\\n\" + \"=\"*80)\n", " print(\"ÉVALUATION SUR ENSEMBLE TEST (seuil=0.5)\")\n", " print(\"=\"*80)\n", " print(f\"AUC-ROC: {test_auc:.4f}\")\n", " print(f\"Accuracy: {test_accuracy:.4f}\")\n", " print(f\"F1-Score: {test_f1:.4f}\")\n", " print(f\"Recall Classe 1: {test_recall:.4f}\")\n", " print(f\"Coût Métier: {test_cost:.0f}\")\n", " print(f\"\\nConfusion Matrix:\")\n", " print(f\" TP: 
{int(tp_test):6d} | FP: {int(fp_test):6d}\")\n", " print(f\" FN: {int(fn_test):6d} | TN: {int(tn_test):6d}\")\n", " print(\"=\"*80)\n" ] }, { "cell_type": "code", "execution_count": 9, "id": "623e9bd1", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "2026/02/06 01:40:36 WARNING mlflow.models.model: `artifact_path` is deprecated. Please use `name` instead.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "⚠️ ATTENTION: Métriques test non disponibles (y_test était entièrement NaN)\n", " Les métriques CV sont utilisées.\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "2026/02/06 01:40:39 WARNING mlflow.utils.environment: Failed to resolve installed pip version. ``pip`` will be added to conda.yaml environment spec without a version specifier.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "✓ Métriques et artefacts du modèle loggés dans MLflow\n", " AUC (CV): 0.7010\n", " F1 (CV): 0.0631\n", " Recall (CV): 0.0361\n", " Business Cost Min (CV): 1511.00\n", "\n", " ℹ️ Pour enregistrer le modèle dans la Model Registry :\n", " - Allez à http://127.0.0.1:5000/#/experiments/1\n", " - Trouvez le run 'V1_LogisticRegression_Test_Evaluation'\n", " - Dans l'onglet 'Artifacts', cliquez 'Register Model'\n", " - Sélectionnez ou créez le nom 'LogisticRegression_V1'\n", "🏃 View run V1_LogisticRegression_Test_Evaluation at: http://127.0.0.1:5000/#/experiments/1/runs/b98cbeb8fddc435f998b929565c06021\n", "🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1\n" ] } ], "source": [ "# ============================================================================\n", "# LOGGING MLFLOW: Sauvegarde des métriques et artefacts du modèle\n", "# ============================================================================\n", "# Logger les métriques CV et le modèle comme artefact\n", "# IMPORTANT: Le modèle n'est PAS enregistré dans la Model Registry automatiquement\n", "\n", "# Terminer le run CV 
précédent\n", "mlflow.end_run()\n", "\n", "with mlflow.start_run(run_name=\"V1_LogisticRegression_Test_Evaluation\"):\n", " # Logging des paramètres\n", " mlflow.log_params(MODEL_CONFIG_V1)\n", " \n", " # Tags\n", " mlflow.set_tag(\"version\", \"1\")\n", " mlflow.set_tag(\"model\", \"LogisticRegression\")\n", " mlflow.set_tag(\"phase\", \"test_evaluation\")\n", " mlflow.set_tag(\"threshold\", str(THRESHOLD_FIXED))\n", " mlflow.set_tag(\"model_type\", \"LogisticRegression\")\n", " \n", " # TOUJOURS logger les métriques CV (pour que le modèle ait des métriques)\n", " mlflow.log_metric(\"auc\", metrics_mean[\"auc\"])\n", " mlflow.log_metric(\"f1_score\", metrics_mean[\"f1_score\"])\n", " mlflow.log_metric(\"recall_class1\", metrics_mean[\"recall_class1\"])\n", " mlflow.log_metric(\"business_cost_min\", metrics_mean[\"business_cost_min\"])\n", " mlflow.log_metric(\"optimal_threshold\", metrics_mean[\"optimal_threshold\"])\n", " \n", " # Si métriques test disponibles, les logger aussi (avec suffixe pour différencier)\n", " if test_auc is not None:\n", " mlflow.log_metric(\"test_auc\", test_auc)\n", " mlflow.log_metric(\"test_f1_score\", test_f1)\n", " mlflow.log_metric(\"test_recall_class1\", test_recall)\n", " mlflow.log_metric(\"test_business_cost_min\", test_cost)\n", " \n", " # Résultats test en artefact\n", " test_results = {\n", " \"auc\": float(test_auc),\n", " \"f1_score\": float(test_f1),\n", " \"recall_class1\": float(test_recall),\n", " \"business_cost_min\": float(test_cost),\n", " \"optimal_threshold\": float(THRESHOLD_FIXED),\n", " \"confusion_matrix\": {\n", " \"tp\": int(tp_test),\n", " \"fp\": int(fp_test),\n", " \"fn\": int(fn_test),\n", " \"tn\": int(tn_test),\n", " }\n", " }\n", " mlflow.log_dict(test_results, \"test_evaluation.json\")\n", " \n", " print(f\"\\n✓ Métriques test loggées\")\n", " print(f\" Test AUC: {test_auc:.4f}\")\n", " print(f\" Test F1: {test_f1:.4f}\")\n", " print(f\" Test Recall: {test_recall:.4f}\")\n", " print(f\" Test Business 
Cost Min: {test_cost:.0f}\")\n", " else:\n", " print(f\"\\n⚠️ ATTENTION: Métriques test non disponibles (y_test était entièrement NaN)\")\n", " print(f\" Les métriques CV sont utilisées.\")\n", " mlflow.set_tag(\"test_metrics_available\", \"false\")\n", " \n", " # LOG: Sauvegarder le modèle comme artefact (accessible via MLflow)\n", " # IMPORTANT: Le modèle n'est PAS enregistré dans la Model Registry automatiquement\n", " # Cela doit être fait manuellement via l'interface MLflow\n", " mlflow.sklearn.log_model(\n", " final_model_v1,\n", " artifact_path=\"logistic_regression_v1\"\n", " )\n", " \n", " print(f\"\\n✓ Métriques et artefacts du modèle loggés dans MLflow\")\n", " print(f\" AUC (CV): {metrics_mean['auc']:.4f}\")\n", " print(f\" F1 (CV): {metrics_mean['f1_score']:.4f}\")\n", " print(f\" Recall (CV): {metrics_mean['recall_class1']:.4f}\")\n", " print(f\" Business Cost Min (CV): {metrics_mean['business_cost_min']:.2f}\")\n", " print(f\"\\n ℹ️ Pour enregistrer le modèle dans la Model Registry :\")\n", " print(f\" - Allez à http://127.0.0.1:5000/#/experiments/1\")\n", " print(f\" - Trouvez le run 'V1_LogisticRegression_Test_Evaluation'\")\n", " print(f\" - Dans l'onglet 'Artifacts', cliquez 'Register Model'\")\n", " print(f\" - Sélectionnez ou créez le nom 'LogisticRegression_V1'\")\n" ] }, { "cell_type": "code", "execution_count": 10, "id": "49e25787", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "====================================================================================================\n", "COMPARAISON: Cross-Validation vs Test Set\n", "====================================================================================================\n", "\n", "⚠️ ATTENTION: Métriques test non disponibles\n", " (y_test était entièrement NaN - Évaluation test ignorée)\n", "\n", " Affichage des métriques de Cross-Validation uniquement:\n", " AUC moyen: 0.7010 ± 0.0038\n", " F1 moyen: 0.0631 ± 0.0355\n", " Recall moyen: 
0.0361 ± 0.0202\n", " Coût métier moyen: 1511.00 ± 34.85\n", " Seuil optimal: 0.50\n", "====================================================================================================\n" ] } ], "source": [ "# ============================================================================\n", "# COMPARAISON: Cross-Validation vs Test\n", "# ============================================================================\n", "# Vérifier la stabilité du modèle (généralisation)\n", "\n", "print(\"\\n\" + \"=\"*100)\n", "print(\"COMPARAISON: Cross-Validation vs Test Set\")\n", "print(\"=\"*100)\n", "\n", "if test_auc is None:\n", " print(\"\\n⚠️ ATTENTION: Métriques test non disponibles\")\n", " print(\" (y_test était entièrement NaN - Évaluation test ignorée)\")\n", " print(\"\\n Affichage des métriques de Cross-Validation uniquement:\")\n", " print(f\" AUC moyen: {metrics_mean['auc']:.4f} ± {metrics_std['auc']:.4f}\")\n", " print(f\" F1 moyen: {metrics_mean['f1_score']:.4f} ± {metrics_std['f1_score']:.4f}\")\n", " print(f\" Recall moyen: {metrics_mean['recall_class1']:.4f} ± {metrics_std['recall_class1']:.4f}\")\n", " print(f\" Coût métier moyen: {metrics_mean['business_cost_min']:.2f} ± {metrics_std['business_cost_min']:.2f}\")\n", " print(f\" Seuil optimal: {metrics_mean['optimal_threshold']:.2f}\")\n", "else:\n", " comparison_data = {\n", " \"Métrique\": [\"AUC-ROC\", \"F1-Score\", \"Recall Classe 1\", \"Coût Métier Min\", \"Seuil Optimal\"],\n", " \"CV Mean\": [\n", " f\"{metrics_mean['auc']:.4f}\",\n", " f\"{metrics_mean['f1_score']:.4f}\",\n", " f\"{metrics_mean['recall_class1']:.4f}\",\n", " f\"{metrics_mean['business_cost_min']:.2f}\",\n", " f\"{metrics_mean['optimal_threshold']:.2f}\",\n", " ],\n", " \"Test\": [\n", " f\"{test_auc:.4f}\",\n", " f\"{test_f1:.4f}\",\n", " f\"{test_recall:.4f}\",\n", " f\"{test_cost:.2f}\",\n", " f\"{THRESHOLD_FIXED:.2f}\",\n", " ],\n", " \"Diff (Test-CV)\": [\n", " f\"{test_auc - metrics_mean['auc']:+.4f}\",\n", " f\"{test_f1 - 
metrics_mean['f1_score']:+.4f}\",\n", " f\"{test_recall - metrics_mean['recall_class1']:+.4f}\",\n", " f\"{test_cost - metrics_mean['business_cost_min']:+.2f}\",\n", " \"0.00\",\n", " ]\n", " }\n", " \n", " comparison_df = pd.DataFrame(comparison_data)\n", " print(comparison_df.to_string(index=False))\n", "\n", "print(\"=\"*100)\n" ] }, { "cell_type": "code", "execution_count": null, "id": "92864e1d", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 11, "id": "267e8211", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Fold 1/5 | AUC=0.6886 | Acc=0.6995 | F1=0.2324 | Recall=0.5871 | Cost=1177\n", "Fold 2/5 | AUC=0.6828 | Acc=0.6980 | F1=0.2412 | Recall=0.6194 | Cost=1135\n", "Fold 3/5 | AUC=0.7118 | Acc=0.7265 | F1=0.2476 | Recall=0.5806 | Cost=1132\n", "Fold 4/5 | AUC=0.7035 | Acc=0.7000 | F1=0.2347 | Recall=0.5935 | Cost=1167\n", "Fold 5/5 | AUC=0.6920 | Acc=0.7185 | F1=0.2277 | Recall=0.5355 | Cost=1211\n", "\n", "✓ Cross-Validation LogisticRegression V2.1 (class_weight='balanced') terminée\n", " AUC moyen: 0.6957 ± 0.0117\n", " F1 moyen: 0.2367 ± 0.0078\n", " Recall moyen: 0.5832 ± 0.0305\n", " Coût métier moyen: 1164.40 ± 32.60\n", " Seuil optimal: 0.50\n", "🏃 View run V2_LogisticRegression_ClassWeightBalanced at: http://127.0.0.1:5000/#/experiments/1/runs/d8b12c8475984c75b995472e30f56f69\n", "🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1\n" ] } ], "source": [ "# ============================================================================\n", "# VERSION 2.1: LogisticRegression avec class_weight='balanced'\n", "# ============================================================================\n", "# Objectif: Gérer le déséquilibre des classes avec class_weight='balanced'\n", "# Validation: StratifiedKFold (5 folds)\n", "# Modèle: LogisticRegression(max_iter=1000, random_state=42, solver='saga', class_weight='balanced', penalty='l2')\n", "# Features: X_train_scaled, y_train 
(déjà scalées)\n", "# Seuil fixe: 0.5\n", "# Métriques par fold: AUC-ROC, Accuracy, F1-score, Recall classe 1\n", "# Coût métier: 10 * FN + 1 * FP (avec seuil=0.5)\n", "# MLflow: run_name=\"V2_LogisticRegression_ClassWeightBalanced\"\n", "# Tags: version=\"2\", imbalance_handling=\"class_weight\"\n", "\n", "from sklearn.pipeline import Pipeline\n", "\n", "# Configuration du modèle V2.1 (class_weight balanced)\n", "MODEL_CONFIG_V2_1 = {\n", " \"max_iter\": 3000,\n", " \"random_state\": 42,\n", " \"solver\": \"saga\",\n", " \"class_weight\": \"balanced\"\n", "}\n", "\n", "RUN_NAME_V2_1 = \"V2_LogisticRegression_ClassWeightBalanced\"\n", "\n", "fold_results_v2_1 = []\n", "\n", "# Terminer tout run actif\n", "mlflow.end_run()\n", "\n", "with mlflow.start_run(run_name=RUN_NAME_V2_1):\n", " # ========== Logging des paramètres et tags ==========\n", " mlflow.log_params(MODEL_CONFIG_V2_1)\n", " mlflow.set_tag(\"version\", \"2\")\n", " mlflow.set_tag(\"model\", \"LogisticRegression\")\n", " mlflow.set_tag(\"notebook\", NOTEBOOK_NAME)\n", " mlflow.set_tag(\"phase\", \"imbalance_handling_cv\")\n", " mlflow.set_tag(\"threshold\", str(THRESHOLD_FIXED))\n", " mlflow.set_tag(\"scaling\", \"StandardScaler\")\n", " mlflow.set_tag(\"imbalance_handling\", \"class_weight\")\n", " mlflow.set_tag(\"model_type\", \"LogisticRegression\")\n", " \n", " # ========== StratifiedKFold (5 folds) ==========\n", " skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)\n", " \n", " for fold_idx, (train_idx, val_idx) in enumerate(skf.split(X_train_scaled, y_train), start=1):\n", " X_tr, X_val = X_train_scaled.iloc[train_idx], X_train_scaled.iloc[val_idx]\n", " y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]\n", " \n", " # ========== Entraînement ==========\n", " model = LogisticRegression(**MODEL_CONFIG_V2_1)\n", " model.fit(X_tr, y_tr)\n", " \n", " # ========== Prédictions ==========\n", " y_val_proba = model.predict_proba(X_val)[:, 1]\n", " y_val_pred = 
(y_val_proba >= THRESHOLD_FIXED).astype(int)\n", " \n", " # ========== Métriques ==========\n", " auc = roc_auc_score(y_val, y_val_proba)\n", " accuracy = accuracy_score(y_val, y_val_pred)\n", " f1 = f1_score(y_val, y_val_pred)\n", " recall = recall_score(y_val, y_val_pred)\n", " \n", " # ========== Coût métier (seuil=0.5) ==========\n", " tn, fp, fn, tp = confusion_matrix(y_val, y_val_pred).ravel()\n", " cost = 10 * fn + 1 * fp\n", " \n", " fold_results_v2_1.append({\n", " \"fold\": fold_idx,\n", " \"auc\": auc,\n", " \"accuracy\": accuracy,\n", " \"f1_score\": f1,\n", " \"recall_class1\": recall,\n", " \"business_cost_min\": cost,\n", " \"optimal_threshold\": THRESHOLD_FIXED,\n", " \"tp\": tp,\n", " \"fp\": fp,\n", " \"fn\": fn,\n", " \"tn\": tn\n", " })\n", " \n", " print(f\"Fold {fold_idx}/5 | AUC={auc:.4f} | Acc={accuracy:.4f} | \"\n", " f\"F1={f1:.4f} | Recall={recall:.4f} | Cost={cost:.0f}\")\n", " \n", " # ========== Agrégation des résultats ==========\n", " cv_results_v2_1_df = pd.DataFrame(fold_results_v2_1)\n", " \n", " metrics_mean_v2_1 = {\n", " \"auc\": cv_results_v2_1_df[\"auc\"].mean(),\n", " \"f1_score\": cv_results_v2_1_df[\"f1_score\"].mean(),\n", " \"recall_class1\": cv_results_v2_1_df[\"recall_class1\"].mean(),\n", " \"business_cost_min\": cv_results_v2_1_df[\"business_cost_min\"].mean(),\n", " \"optimal_threshold\": THRESHOLD_FIXED,\n", " }\n", " \n", " metrics_std_v2_1 = {\n", " \"auc\": cv_results_v2_1_df[\"auc\"].std(),\n", " \"f1_score\": cv_results_v2_1_df[\"f1_score\"].std(),\n", " \"recall_class1\": cv_results_v2_1_df[\"recall_class1\"].std(),\n", " \"business_cost_min\": cv_results_v2_1_df[\"business_cost_min\"].std(),\n", " }\n", " \n", " # ========== Logging dans MLFlow ==========\n", " mlflow.log_metric(\"auc\", metrics_mean_v2_1[\"auc\"])\n", " mlflow.log_metric(\"f1_score\", metrics_mean_v2_1[\"f1_score\"])\n", " mlflow.log_metric(\"recall_class1\", metrics_mean_v2_1[\"recall_class1\"])\n", " 
mlflow.log_metric(\"business_cost_min\", metrics_mean_v2_1[\"business_cost_min\"])\n", " mlflow.log_metric(\"optimal_threshold\", metrics_mean_v2_1[\"optimal_threshold\"])\n", " \n", " # Log artefact JSON avec détails par fold\n", " mlflow.log_dict(cv_results_v2_1_df.to_dict(orient=\"records\"), \"cv_results_per_fold.json\")\n", " \n", " print(\"\\n✓ Cross-Validation LogisticRegression V2.1 (class_weight='balanced') terminée\")\n", " print(f\" AUC moyen: {metrics_mean_v2_1['auc']:.4f} ± {metrics_std_v2_1['auc']:.4f}\")\n", " print(f\" F1 moyen: {metrics_mean_v2_1['f1_score']:.4f} ± {metrics_std_v2_1['f1_score']:.4f}\")\n", " print(f\" Recall moyen: {metrics_mean_v2_1['recall_class1']:.4f} ± {metrics_std_v2_1['recall_class1']:.4f}\")\n", " print(f\" Coût métier moyen: {metrics_mean_v2_1['business_cost_min']:.2f} ± {metrics_std_v2_1['business_cost_min']:.2f}\")\n", " print(f\" Seuil optimal: {metrics_mean_v2_1['optimal_threshold']:.2f}\")" ] }, { "cell_type": "code", "execution_count": 12, "id": "06214200", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "✓ Modèle final LogisticRegression V2.1 entraîné sur l'ensemble train complet\n", " Nombre de features: 724\n", " Intercept: -0.917467\n", " Norme des coefficients: 2.441323\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "2026/02/06 01:48:02 WARNING mlflow.models.model: `artifact_path` is deprecated. Please use `name` instead.\n", "2026/02/06 01:48:04 WARNING mlflow.utils.environment: Failed to resolve installed pip version. 
``pip`` will be added to conda.yaml environment spec without a version specifier.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "✓ Modèle V2.1 enregistré dans MLflow\n", " AUC (CV): 0.6957\n", " F1 (CV): 0.2367\n", " Recall (CV): 0.5832\n", " Business Cost Min (CV): 1164.40\n", "🏃 View run V2.1_LogisticRegression_ClassWeight_Final at: http://127.0.0.1:5000/#/experiments/1/runs/0bc8f5f187c94a349c72011de4524c77\n", "🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1\n" ] } ], "source": [ "# ============================================================================\n", "# ENTRAÎNEMENT FINAL V2.1: LogisticRegression avec class_weight='balanced'\n", "# ============================================================================\n", "\n", "final_model_v2_1 = LogisticRegression(**MODEL_CONFIG_V2_1)\n", "final_model_v2_1.fit(X_train_scaled, y_train)\n", "\n", "print(\"\\n✓ Modèle final LogisticRegression V2.1 entraîné sur l'ensemble train complet\")\n", "print(f\" Nombre de features: {X_train_scaled.shape[1]}\")\n", "print(f\" Intercept: {final_model_v2_1.intercept_[0]:.6f}\")\n", "print(f\" Norme des coefficients: {np.linalg.norm(final_model_v2_1.coef_):.6f}\")\n", "\n", "# ============================================================================\n", "# LOGGING MLFLOW V2.1: Sauvegarde du modèle\n", "# ============================================================================\n", "\n", "mlflow.end_run()\n", "\n", "with mlflow.start_run(run_name=\"V2.1_LogisticRegression_ClassWeight_Final\"):\n", " # Logging des paramètres\n", " mlflow.log_params(MODEL_CONFIG_V2_1)\n", " \n", " # Tags\n", " mlflow.set_tag(\"version\", \"2.1\")\n", " mlflow.set_tag(\"model\", \"LogisticRegression\")\n", " mlflow.set_tag(\"phase\", \"final_model\")\n", " mlflow.set_tag(\"threshold\", str(THRESHOLD_FIXED))\n", " mlflow.set_tag(\"imbalance_handling\", \"class_weight\")\n", " mlflow.set_tag(\"model_type\", \"LogisticRegression\")\n", " \n", " # Logger les 
métriques CV\n", " mlflow.log_metric(\"auc\", metrics_mean_v2_1[\"auc\"])\n", " mlflow.log_metric(\"f1_score\", metrics_mean_v2_1[\"f1_score\"])\n", " mlflow.log_metric(\"recall_class1\", metrics_mean_v2_1[\"recall_class1\"])\n", " mlflow.log_metric(\"business_cost_min\", metrics_mean_v2_1[\"business_cost_min\"])\n", " mlflow.log_metric(\"optimal_threshold\", metrics_mean_v2_1[\"optimal_threshold\"])\n", " \n", " # Sauvegarder le modèle comme artefact\n", " mlflow.sklearn.log_model(\n", " final_model_v2_1,\n", " artifact_path=\"logistic_regression_v2_1_class_weight\"\n", " )\n", " \n", " print(f\"\\n✓ Modèle V2.1 enregistré dans MLflow\")\n", " print(f\" AUC (CV): {metrics_mean_v2_1['auc']:.4f}\")\n", " print(f\" F1 (CV): {metrics_mean_v2_1['f1_score']:.4f}\")\n", " print(f\" Recall (CV): {metrics_mean_v2_1['recall_class1']:.4f}\")\n", " print(f\" Business Cost Min (CV): {metrics_mean_v2_1['business_cost_min']:.2f}\")" ] }, { "cell_type": "code", "execution_count": 13, "id": "bf6d4baa", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Fold 1/5 | AUC=0.6801 | Acc=0.7145 | F1=0.2397 | Recall=0.5806 | Cost=1156\n", "Fold 2/5 | AUC=0.6807 | Acc=0.6985 | F1=0.2299 | Recall=0.5806 | Cost=1188\n", "Fold 3/5 | AUC=0.7055 | Acc=0.7375 | F1=0.2553 | Recall=0.5806 | Cost=1110\n", "Fold 4/5 | AUC=0.6872 | Acc=0.7190 | F1=0.2301 | Recall=0.5419 | Cost=1201\n", "Fold 5/5 | AUC=0.6914 | Acc=0.7435 | F1=0.2377 | Recall=0.5161 | Cost=1188\n", "\n", "✓ Cross-Validation LogisticRegression V2.2 (SMOTE) terminée\n", " AUC moyen: 0.6890 ± 0.0104\n", " F1 moyen: 0.2386 ± 0.0104\n", " Recall moyen: 0.5600 ± 0.0297\n", " Coût métier moyen: 1168.60 ± 36.73\n", " Seuil optimal: 0.50\n", "🏃 View run V2_LogisticRegression_SMOTE at: http://127.0.0.1:5000/#/experiments/1/runs/dab29ff5c5a14880bb75287b1c5bcd5c\n", "🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1\n" ] } ], "source": [ "# 
============================================================================\n", "# VERSION 2.2: LogisticRegression avec SMOTE\n", "# ============================================================================\n", "# Objectif: Gérer le déséquilibre des classes avec SMOTE\n", "# Validation: StratifiedKFold (5 folds)\n", "# Modèle: LogisticRegression(max_iter=1000, random_state=42, solver='saga', penalty='l2')\n", "# Pipeline: StandardScaler -> SMOTE -> LogisticRegression (pour éviter le data leakage)\n", "# Features: X_train, y_train (seront scalées dans le pipeline)\n", "# Seuil fixe: 0.5\n", "# Métriques par fold: AUC-ROC, Accuracy, F1-score, Recall classe 1\n", "# Coût métier: 10 * FN + 1 * FP (avec seuil=0.5)\n", "# MLflow: run_name=\"V2_LogisticRegression_SMOTE\"\n", "# Tags: version=\"2\", imbalance_handling=\"smote\"\n", "\n", "from imblearn.over_sampling import SMOTE\n", "from imblearn.pipeline import Pipeline as ImbPipeline\n", "\n", "# Configuration du modèle V2.2 (SMOTE)\n", "MODEL_CONFIG_V2_2 = {\n", " \"max_iter\": 3000,\n", " \"random_state\": 42,\n", " \"solver\": \"saga\"\n", "}\n", "\n", "RUN_NAME_V2_2 = \"V2_LogisticRegression_SMOTE\"\n", "\n", "fold_results_v2_2 = []\n", "\n", "# Terminer tout run actif\n", "mlflow.end_run()\n", "\n", "with mlflow.start_run(run_name=RUN_NAME_V2_2):\n", " # ========== Logging des paramètres et tags ==========\n", " mlflow.log_params(MODEL_CONFIG_V2_2)\n", " mlflow.set_tag(\"version\", \"2\")\n", " mlflow.set_tag(\"model\", \"LogisticRegression\")\n", " mlflow.set_tag(\"notebook\", NOTEBOOK_NAME)\n", " mlflow.set_tag(\"phase\", \"imbalance_handling_cv\")\n", " mlflow.set_tag(\"threshold\", str(THRESHOLD_FIXED))\n", " mlflow.set_tag(\"scaling\", \"StandardScaler\")\n", " mlflow.set_tag(\"imbalance_handling\", \"smote\")\n", " mlflow.set_tag(\"model_type\", \"LogisticRegression\")\n", " \n", " # ========== StratifiedKFold (5 folds) ==========\n", " skf = StratifiedKFold(n_splits=5, shuffle=True, 
random_state=RANDOM_STATE)\n", " \n", " for fold_idx, (train_idx, val_idx) in enumerate(skf.split(X_train, y_train), start=1):\n", " X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]\n", " y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]\n", " \n", " # ========== Pipeline: Scaler -> SMOTE -> Model ==========\n", " # SMOTE est appliqué uniquement sur le train de chaque fold\n", " pipeline = ImbPipeline([\n", " ('scaler', StandardScaler()),\n", " ('smote', SMOTE(random_state=RANDOM_STATE)),\n", " ('model', LogisticRegression(**MODEL_CONFIG_V2_2))\n", " ])\n", " \n", " # ========== Entraînement ==========\n", " pipeline.fit(X_tr, y_tr)\n", " \n", " # ========== Prédictions ==========\n", " y_val_proba = pipeline.predict_proba(X_val)[:, 1]\n", " y_val_pred = (y_val_proba >= THRESHOLD_FIXED).astype(int)\n", " \n", " # ========== Métriques ==========\n", " auc = roc_auc_score(y_val, y_val_proba)\n", " accuracy = accuracy_score(y_val, y_val_pred)\n", " f1 = f1_score(y_val, y_val_pred)\n", " recall = recall_score(y_val, y_val_pred)\n", " \n", " # ========== Coût métier (seuil=0.5) ==========\n", " tn, fp, fn, tp = confusion_matrix(y_val, y_val_pred).ravel()\n", " cost = 10 * fn + 1 * fp\n", " \n", " fold_results_v2_2.append({\n", " \"fold\": fold_idx,\n", " \"auc\": auc,\n", " \"accuracy\": accuracy,\n", " \"f1_score\": f1,\n", " \"recall_class1\": recall,\n", " \"business_cost_min\": cost,\n", " \"optimal_threshold\": THRESHOLD_FIXED,\n", " \"tp\": tp,\n", " \"fp\": fp,\n", " \"fn\": fn,\n", " \"tn\": tn\n", " })\n", " \n", " print(f\"Fold {fold_idx}/5 | AUC={auc:.4f} | Acc={accuracy:.4f} | \"\n", " f\"F1={f1:.4f} | Recall={recall:.4f} | Cost={cost:.0f}\")\n", " \n", " # ========== Agrégation des résultats ==========\n", " cv_results_v2_2_df = pd.DataFrame(fold_results_v2_2)\n", " \n", " metrics_mean_v2_2 = {\n", " \"auc\": cv_results_v2_2_df[\"auc\"].mean(),\n", " \"f1_score\": cv_results_v2_2_df[\"f1_score\"].mean(),\n", " \"recall_class1\": 
cv_results_v2_2_df[\"recall_class1\"].mean(),\n", " \"business_cost_min\": cv_results_v2_2_df[\"business_cost_min\"].mean(),\n", " \"optimal_threshold\": THRESHOLD_FIXED,\n", " }\n", " \n", " metrics_std_v2_2 = {\n", " \"auc\": cv_results_v2_2_df[\"auc\"].std(),\n", " \"f1_score\": cv_results_v2_2_df[\"f1_score\"].std(),\n", " \"recall_class1\": cv_results_v2_2_df[\"recall_class1\"].std(),\n", " \"business_cost_min\": cv_results_v2_2_df[\"business_cost_min\"].std(),\n", " }\n", " \n", " # ========== Logging dans MLFlow ==========\n", " mlflow.log_metric(\"auc\", metrics_mean_v2_2[\"auc\"])\n", " mlflow.log_metric(\"f1_score\", metrics_mean_v2_2[\"f1_score\"])\n", " mlflow.log_metric(\"recall_class1\", metrics_mean_v2_2[\"recall_class1\"])\n", " mlflow.log_metric(\"business_cost_min\", metrics_mean_v2_2[\"business_cost_min\"])\n", " mlflow.log_metric(\"optimal_threshold\", metrics_mean_v2_2[\"optimal_threshold\"])\n", " \n", " # Log artefact JSON avec détails par fold\n", " mlflow.log_dict(cv_results_v2_2_df.to_dict(orient=\"records\"), \"cv_results_per_fold.json\")\n", " \n", " print(\"\\n✓ Cross-Validation LogisticRegression V2.2 (SMOTE) terminée\")\n", " print(f\" AUC moyen: {metrics_mean_v2_2['auc']:.4f} ± {metrics_std_v2_2['auc']:.4f}\")\n", " print(f\" F1 moyen: {metrics_mean_v2_2['f1_score']:.4f} ± {metrics_std_v2_2['f1_score']:.4f}\")\n", " print(f\" Recall moyen: {metrics_mean_v2_2['recall_class1']:.4f} ± {metrics_std_v2_2['recall_class1']:.4f}\")\n", " print(f\" Coût métier moyen: {metrics_mean_v2_2['business_cost_min']:.2f} ± {metrics_std_v2_2['business_cost_min']:.2f}\")\n", " print(f\" Seuil optimal: {metrics_mean_v2_2['optimal_threshold']:.2f}\")" ] }, { "cell_type": "code", "execution_count": 14, "id": "2d115187", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "✓ Pipeline final LogisticRegression V2.2 (SMOTE) entraîné sur l'ensemble train complet\n", " Nombre de features: 724\n", " Intercept: -1.226644\n", " 
Norme des coefficients: 3.213375\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "2026/02/06 02:01:57 WARNING mlflow.models.model: `artifact_path` is deprecated. Please use `name` instead.\n", "2026/02/06 02:01:59 WARNING mlflow.utils.environment: Failed to resolve installed pip version. ``pip`` will be added to conda.yaml environment spec without a version specifier.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "✓ Pipeline V2.2 enregistré dans MLflow\n", " AUC (CV): 0.6890\n", " F1 (CV): 0.2386\n", " Recall (CV): 0.5600\n", " Business Cost Min (CV): 1168.60\n", "🏃 View run V2.2_LogisticRegression_SMOTE_Final at: http://127.0.0.1:5000/#/experiments/1/runs/9a1cd90834c84f43a6b660e9dcc0a408\n", "🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1\n" ] } ], "source": [ "# ============================================================================\n", "# ENTRAÎNEMENT FINAL V2.2: LogisticRegression avec SMOTE\n", "# ============================================================================\n", "\n", "# Pipeline complet avec SMOTE sur l'ensemble train\n", "final_pipeline_v2_2 = ImbPipeline([\n", " ('scaler', StandardScaler()),\n", " ('smote', SMOTE(random_state=RANDOM_STATE)),\n", " ('model', LogisticRegression(**MODEL_CONFIG_V2_2))\n", "])\n", "\n", "final_pipeline_v2_2.fit(X_train, y_train)\n", "\n", "print(\"\\n✓ Pipeline final LogisticRegression V2.2 (SMOTE) entraîné sur l'ensemble train complet\")\n", "print(f\" Nombre de features: {X_train.shape[1]}\")\n", "print(f\" Intercept: {final_pipeline_v2_2.named_steps['model'].intercept_[0]:.6f}\")\n", "print(f\" Norme des coefficients: {np.linalg.norm(final_pipeline_v2_2.named_steps['model'].coef_):.6f}\")\n", "\n", "# ============================================================================\n", "# LOGGING MLFLOW V2.2: Sauvegarde du modèle\n", "# ============================================================================\n", "\n", "mlflow.end_run()\n", "\n", "with 
mlflow.start_run(run_name=\"V2.2_LogisticRegression_SMOTE_Final\"):\n", " # Logging des paramètres\n", " mlflow.log_params(MODEL_CONFIG_V2_2)\n", " \n", " # Tags\n", " mlflow.set_tag(\"version\", \"2.2\")\n", " mlflow.set_tag(\"model\", \"LogisticRegression\")\n", " mlflow.set_tag(\"phase\", \"final_model\")\n", " mlflow.set_tag(\"threshold\", str(THRESHOLD_FIXED))\n", " mlflow.set_tag(\"imbalance_handling\", \"smote\")\n", " mlflow.set_tag(\"model_type\", \"LogisticRegression\")\n", " \n", " # Logger les métriques CV\n", " mlflow.log_metric(\"auc\", metrics_mean_v2_2[\"auc\"])\n", " mlflow.log_metric(\"f1_score\", metrics_mean_v2_2[\"f1_score\"])\n", " mlflow.log_metric(\"recall_class1\", metrics_mean_v2_2[\"recall_class1\"])\n", " mlflow.log_metric(\"business_cost_min\", metrics_mean_v2_2[\"business_cost_min\"])\n", " mlflow.log_metric(\"optimal_threshold\", metrics_mean_v2_2[\"optimal_threshold\"])\n", " \n", " # Sauvegarder le pipeline complet comme artefact\n", " mlflow.sklearn.log_model(\n", " final_pipeline_v2_2,\n", " artifact_path=\"logistic_regression_v2_2_smote\"\n", " )\n", " \n", " print(f\"\\n✓ Pipeline V2.2 enregistré dans MLflow\")\n", " print(f\" AUC (CV): {metrics_mean_v2_2['auc']:.4f}\")\n", " print(f\" F1 (CV): {metrics_mean_v2_2['f1_score']:.4f}\")\n", " print(f\" Recall (CV): {metrics_mean_v2_2['recall_class1']:.4f}\")\n", " print(f\" Business Cost Min (CV): {metrics_mean_v2_2['business_cost_min']:.2f}\")" ] }, { "cell_type": "code", "execution_count": 15, "id": "9693605b", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "========================================================================================================================\n", "TABLEAU COMPARATIF: V1 Baseline vs V2.1 Class Weight vs V2.2 SMOTE\n", "========================================================================================================================\n", " Version AUC F1-Score Recall Classe 1 Coût Métier Min Imbalance 
Handling\n", " V1 Baseline 0.7010 ± 0.0038 0.0631 ± 0.0355 0.0361 ± 0.0202 1511.00 ± 34.85 None\n", "V2.1 Class Weight 0.6957 ± 0.0117 0.2367 ± 0.0078 0.5832 ± 0.0305 1164.40 ± 32.60 class_weight='balanced'\n", " V2.2 SMOTE 0.6890 ± 0.0104 0.2386 ± 0.0104 0.5600 ± 0.0297 1168.60 ± 36.73 SMOTE\n", "========================================================================================================================\n", "\n", "========================================================================================================================\n", "ANALYSE DES AMÉLIORATIONS (vs V1 Baseline)\n", "========================================================================================================================\n", " Version Δ AUC Δ F1-Score Δ Recall Classe 1 Δ Coût Métier\n", "V2.1 Class Weight -0.0052 +0.1736 +0.5471 -346.60\n", " V2.2 SMOTE -0.0120 +0.1754 +0.5239 -342.40\n", "========================================================================================================================\n", "\n", "✓ Meilleure version par métrique:\n", " AUC: V1\n", " F1-Score: V2.2\n", " Recall Classe 1: V2.1\n", " Coût Métier Min: V2.1\n", "========================================================================================================================\n" ] } ], "source": [ "# ============================================================================\n", "# TABLEAU COMPARATIF: V1 Baseline vs V2 Class Weight vs V2 SMOTE\n", "# ============================================================================\n", "\n", "print(\"\\n\" + \"=\"*120)\n", "print(\"TABLEAU COMPARATIF: V1 Baseline vs V2.1 Class Weight vs V2.2 SMOTE\")\n", "print(\"=\"*120)\n", "\n", "comparison_data = {\n", " \"Version\": [\"V1 Baseline\", \"V2.1 Class Weight\", \"V2.2 SMOTE\"],\n", " \"AUC\": [\n", " f\"{metrics_mean['auc']:.4f} ± {metrics_std['auc']:.4f}\",\n", " f\"{metrics_mean_v2_1['auc']:.4f} ± {metrics_std_v2_1['auc']:.4f}\",\n", " f\"{metrics_mean_v2_2['auc']:.4f} ± 
{metrics_std_v2_2['auc']:.4f}\",\n", " ],\n", " \"F1-Score\": [\n", " f\"{metrics_mean['f1_score']:.4f} ± {metrics_std['f1_score']:.4f}\",\n", " f\"{metrics_mean_v2_1['f1_score']:.4f} ± {metrics_std_v2_1['f1_score']:.4f}\",\n", " f\"{metrics_mean_v2_2['f1_score']:.4f} ± {metrics_std_v2_2['f1_score']:.4f}\",\n", " ],\n", " \"Recall Classe 1\": [\n", " f\"{metrics_mean['recall_class1']:.4f} ± {metrics_std['recall_class1']:.4f}\",\n", " f\"{metrics_mean_v2_1['recall_class1']:.4f} ± {metrics_std_v2_1['recall_class1']:.4f}\",\n", " f\"{metrics_mean_v2_2['recall_class1']:.4f} ± {metrics_std_v2_2['recall_class1']:.4f}\",\n", " ],\n", " \"Coût Métier Min\": [\n", " f\"{metrics_mean['business_cost_min']:.2f} ± {metrics_std['business_cost_min']:.2f}\",\n", " f\"{metrics_mean_v2_1['business_cost_min']:.2f} ± {metrics_std_v2_1['business_cost_min']:.2f}\",\n", " f\"{metrics_mean_v2_2['business_cost_min']:.2f} ± {metrics_std_v2_2['business_cost_min']:.2f}\",\n", " ],\n", " \"Imbalance Handling\": [\n", " \"None\",\n", " \"class_weight='balanced'\",\n", " \"SMOTE\",\n", " ]\n", "}\n", "\n", "comparison_df = pd.DataFrame(comparison_data)\n", "print(comparison_df.to_string(index=False))\n", "print(\"=\"*120)\n", "\n", "# Analyse des améliorations\n", "print(\"\\n\" + \"=\"*120)\n", "print(\"ANALYSE DES AMÉLIORATIONS (vs V1 Baseline)\")\n", "print(\"=\"*120)\n", "\n", "improvement_data = {\n", " \"Version\": [\"V2.1 Class Weight\", \"V2.2 SMOTE\"],\n", " \"Δ AUC\": [\n", " f\"{metrics_mean_v2_1['auc'] - metrics_mean['auc']:+.4f}\",\n", " f\"{metrics_mean_v2_2['auc'] - metrics_mean['auc']:+.4f}\",\n", " ],\n", " \"Δ F1-Score\": [\n", " f\"{metrics_mean_v2_1['f1_score'] - metrics_mean['f1_score']:+.4f}\",\n", " f\"{metrics_mean_v2_2['f1_score'] - metrics_mean['f1_score']:+.4f}\",\n", " ],\n", " \"Δ Recall Classe 1\": [\n", " f\"{metrics_mean_v2_1['recall_class1'] - metrics_mean['recall_class1']:+.4f}\",\n", " f\"{metrics_mean_v2_2['recall_class1'] - 
metrics_mean['recall_class1']:+.4f}\",\n", " ],\n", " \"Δ Coût Métier\": [\n", " f\"{metrics_mean_v2_1['business_cost_min'] - metrics_mean['business_cost_min']:+.2f}\",\n", " f\"{metrics_mean_v2_2['business_cost_min'] - metrics_mean['business_cost_min']:+.2f}\",\n", " ]\n", "}\n", "\n", "improvement_df = pd.DataFrame(improvement_data)\n", "print(improvement_df.to_string(index=False))\n", "print(\"=\"*120)\n", "\n", "# Déterminer la meilleure version\n", "best_auc_version = [\"V1\", \"V2.1\", \"V2.2\"][\n", " np.argmax([metrics_mean['auc'], metrics_mean_v2_1['auc'], metrics_mean_v2_2['auc']])\n", "]\n", "best_f1_version = [\"V1\", \"V2.1\", \"V2.2\"][\n", " np.argmax([metrics_mean['f1_score'], metrics_mean_v2_1['f1_score'], metrics_mean_v2_2['f1_score']])\n", "]\n", "best_recall_version = [\"V1\", \"V2.1\", \"V2.2\"][\n", " np.argmax([metrics_mean['recall_class1'], metrics_mean_v2_1['recall_class1'], metrics_mean_v2_2['recall_class1']])\n", "]\n", "best_cost_version = [\"V1\", \"V2.1\", \"V2.2\"][\n", " np.argmin([metrics_mean['business_cost_min'], metrics_mean_v2_1['business_cost_min'], metrics_mean_v2_2['business_cost_min']])\n", "]\n", "\n", "print(\"\\n✓ Meilleure version par métrique:\")\n", "print(f\" AUC: {best_auc_version}\")\n", "print(f\" F1-Score: {best_f1_version}\")\n", "print(f\" Recall Classe 1: {best_recall_version}\")\n", "print(f\" Coût Métier Min: {best_cost_version}\")\n", "print(\"=\"*120)" ] }, { "cell_type": "code", "execution_count": null, "id": "d9d91d18", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "id": "7b014974", "metadata": {}, "source": [ "# VERSION 3: Meilleur modèle avec scaling robuste optimisé\n", "\n", "Objectif: Réentraîner le meilleur modèle (V2.1 class_weight='balanced') avec un scaling plus adapté\n", "- **RobustScaler**: Utilise la médiane et l'IQR (moins sensible aux outliers que StandardScaler)\n", "- Validation: StratifiedKFold (5 folds)\n", "- Modèle: LogisticRegression avec 
class_weight='balanced'\n", "- Enregistrement dans MLflow Model Registry sous le nom \"regression\"" ] }, { "cell_type": "code", "execution_count": null, "id": "4b365be7", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 17, "id": "3fb11f15", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "✓ Features scalées avec RobustScaler (médiane + IQR):\n", " Shape train: (10000, 724)\n", " Median: 0.00000000 (≈ 0)\n", " IQR (Interquartile Range): 0.052486\n", "\n", " Comparaison vs StandardScaler:\n", " StandardScaler - Mean: -0.00000000, Std: 0.874353\n", " RobustScaler - Median: 0.00000000, IQR: 0.052486\n" ] } ], "source": [ "# ============================================================================\n", "# VERSION 3: Meilleur modèle avec RobustScaler (adapté aux outliers)\n", "# ============================================================================\n", "# Objectif: Améliorer le scaling pour des features avec outliers\n", "# RobustScaler utilise la médiane et l'IQR au lieu de la moyenne et l'écart-type\n", "# Plus robuste face aux valeurs extrêmes dans les données de crédit\n", "\n", "from sklearn.preprocessing import RobustScaler\n", "\n", "# Créer le RobustScaler\n", "robust_scaler = RobustScaler()\n", "X_train_robust = robust_scaler.fit_transform(X_train)\n", "X_test_robust = robust_scaler.transform(X_test)\n", "\n", "# Reconvertir en DataFrame\n", "X_train_robust = pd.DataFrame(X_train_robust, columns=X_train.columns)\n", "X_test_robust = pd.DataFrame(X_test_robust, columns=X_test.columns)\n", "\n", "print(f\"\\n✓ Features scalées avec RobustScaler (médiane + IQR):\")\n", "print(f\" Shape train: {X_train_robust.shape}\")\n", "print(f\" Median: {X_train_robust.median().mean():.8f} (≈ 0)\")\n", "print(f\" IQR (Interquartile Range): {(X_train_robust.quantile(0.75) - X_train_robust.quantile(0.25)).mean():.6f}\")\n", "print(f\"\\n Comparaison vs StandardScaler:\")\n", "print(f\" 
StandardScaler - Mean: {X_train_scaled.mean().mean():.8f}, Std: {X_train_scaled.std().mean():.6f}\")\n", "print(f\" RobustScaler - Median: {X_train_robust.median().mean():.8f}, IQR: {(X_train_robust.quantile(0.75) - X_train_robust.quantile(0.25)).mean():.6f}\")" ] }, { "cell_type": "code", "execution_count": null, "id": "ecfe6509", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 18, "id": "afbc053c", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Fold 1/5 | AUC=0.5488 | Acc=0.3675 | F1=0.1516 | Recall=0.7290 | Cost=1643\n", "Fold 2/5 | AUC=0.5648 | Acc=0.4400 | F1=0.1592 | Recall=0.6839 | Cost=1561\n", "Fold 3/5 | AUC=0.5284 | Acc=0.3270 | F1=0.1492 | Recall=0.7613 | Cost=1679\n", "Fold 4/5 | AUC=0.5628 | Acc=0.3750 | F1=0.1554 | Recall=0.7419 | Cost=1610\n", "Fold 5/5 | AUC=0.5070 | Acc=0.3575 | F1=0.1462 | Recall=0.7097 | Cost=1690\n", "\n", "✓ Cross-Validation LogisticRegression V3 (RobustScaler + class_weight) terminée\n", " AUC moyen: 0.5424 ± 0.0245\n", " F1 moyen: 0.1523 ± 0.0051\n", " Recall moyen: 0.7252 ± 0.0298\n", " Coût métier moyen: 1636.60 ± 52.71\n", " Seuil optimal: 0.50\n", "🏃 View run V3_LogisticRegression_RobustScaler_ClassWeight at: http://127.0.0.1:5000/#/experiments/1/runs/f3c1d8a8220a4e5193cba3eb73b30df6\n", "🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1\n" ] } ], "source": [ "# ============================================================================\n", "# CROSS-VALIDATION V3: Meilleur modèle (V2.1) avec RobustScaler\n", "# ============================================================================\n", "\n", "MODEL_CONFIG_V3 = {\n", " \"max_iter\": 3000,\n", " \"random_state\": 42,\n", " \"solver\": \"saga\",\n", " \"class_weight\": \"balanced\"\n", "}\n", "\n", "RUN_NAME_V3 = \"V3_LogisticRegression_RobustScaler_ClassWeight\"\n", "\n", "fold_results_v3 = []\n", "\n", "# Terminer tout run actif\n", "mlflow.end_run()\n", "\n", "with 
mlflow.start_run(run_name=RUN_NAME_V3):\n", " # ========== Logging des paramètres et tags ==========\n", " mlflow.log_params(MODEL_CONFIG_V3)\n", " mlflow.set_tag(\"version\", \"3\")\n", " mlflow.set_tag(\"model\", \"LogisticRegression\")\n", " mlflow.set_tag(\"notebook\", NOTEBOOK_NAME)\n", " mlflow.set_tag(\"phase\", \"robust_scaling_cv\")\n", " mlflow.set_tag(\"threshold\", str(THRESHOLD_FIXED))\n", " mlflow.set_tag(\"scaling\", \"RobustScaler\")\n", " mlflow.set_tag(\"imbalance_handling\", \"class_weight\")\n", " mlflow.set_tag(\"model_type\", \"LogisticRegression\")\n", " \n", " # ========== StratifiedKFold (5 folds) ==========\n", " skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)\n", " \n", " for fold_idx, (train_idx, val_idx) in enumerate(skf.split(X_train_robust, y_train), start=1):\n", " X_tr, X_val = X_train_robust.iloc[train_idx], X_train_robust.iloc[val_idx]\n", " y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]\n", " \n", " # ========== Entraînement ==========\n", " model = LogisticRegression(**MODEL_CONFIG_V3)\n", " model.fit(X_tr, y_tr)\n", " \n", " # ========== Prédictions ==========\n", " y_val_proba = model.predict_proba(X_val)[:, 1]\n", " y_val_pred = (y_val_proba >= THRESHOLD_FIXED).astype(int)\n", " \n", " # ========== Métriques ==========\n", " auc = roc_auc_score(y_val, y_val_proba)\n", " accuracy = accuracy_score(y_val, y_val_pred)\n", " f1 = f1_score(y_val, y_val_pred)\n", " recall = recall_score(y_val, y_val_pred)\n", " \n", " # ========== Coût métier (seuil=0.5) ==========\n", " tn, fp, fn, tp = confusion_matrix(y_val, y_val_pred).ravel()\n", " cost = 10 * fn + 1 * fp\n", " \n", " fold_results_v3.append({\n", " \"fold\": fold_idx,\n", " \"auc\": auc,\n", " \"accuracy\": accuracy,\n", " \"f1_score\": f1,\n", " \"recall_class1\": recall,\n", " \"business_cost_min\": cost,\n", " \"optimal_threshold\": THRESHOLD_FIXED,\n", " \"tp\": tp,\n", " \"fp\": fp,\n", " \"fn\": fn,\n", " \"tn\": tn\n", " })\n", " 
\n", " print(f\"Fold {fold_idx}/5 | AUC={auc:.4f} | Acc={accuracy:.4f} | \"\n", " f\"F1={f1:.4f} | Recall={recall:.4f} | Cost={cost:.0f}\")\n", " \n", " # ========== Agrégation des résultats ==========\n", " cv_results_v3_df = pd.DataFrame(fold_results_v3)\n", " \n", " metrics_mean_v3 = {\n", " \"auc\": cv_results_v3_df[\"auc\"].mean(),\n", " \"f1_score\": cv_results_v3_df[\"f1_score\"].mean(),\n", " \"recall_class1\": cv_results_v3_df[\"recall_class1\"].mean(),\n", " \"business_cost_min\": cv_results_v3_df[\"business_cost_min\"].mean(),\n", " \"optimal_threshold\": THRESHOLD_FIXED,\n", " }\n", " \n", " metrics_std_v3 = {\n", " \"auc\": cv_results_v3_df[\"auc\"].std(),\n", " \"f1_score\": cv_results_v3_df[\"f1_score\"].std(),\n", " \"recall_class1\": cv_results_v3_df[\"recall_class1\"].std(),\n", " \"business_cost_min\": cv_results_v3_df[\"business_cost_min\"].std(),\n", " }\n", " \n", " # ========== Logging dans MLFlow ==========\n", " mlflow.log_metric(\"auc\", metrics_mean_v3[\"auc\"])\n", " mlflow.log_metric(\"f1_score\", metrics_mean_v3[\"f1_score\"])\n", " mlflow.log_metric(\"recall_class1\", metrics_mean_v3[\"recall_class1\"])\n", " mlflow.log_metric(\"business_cost_min\", metrics_mean_v3[\"business_cost_min\"])\n", " mlflow.log_metric(\"optimal_threshold\", metrics_mean_v3[\"optimal_threshold\"])\n", " \n", " # Log artefact JSON avec détails par fold\n", " mlflow.log_dict(cv_results_v3_df.to_dict(orient=\"records\"), \"cv_results_per_fold.json\")\n", " \n", " print(\"\\n✓ Cross-Validation LogisticRegression V3 (RobustScaler + class_weight) terminée\")\n", " print(f\" AUC moyen: {metrics_mean_v3['auc']:.4f} ± {metrics_std_v3['auc']:.4f}\")\n", " print(f\" F1 moyen: {metrics_mean_v3['f1_score']:.4f} ± {metrics_std_v3['f1_score']:.4f}\")\n", " print(f\" Recall moyen: {metrics_mean_v3['recall_class1']:.4f} ± {metrics_std_v3['recall_class1']:.4f}\")\n", " print(f\" Coût métier moyen: {metrics_mean_v3['business_cost_min']:.2f} ± 
{metrics_std_v3['business_cost_min']:.2f}\")\n", " print(f\" Seuil optimal: {metrics_mean_v3['optimal_threshold']:.2f}\")" ] }, { "cell_type": "code", "execution_count": null, "id": "140396bb", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 19, "id": "e7ef636c", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "✓ Modèle final LogisticRegression V3 entraîné sur l'ensemble train complet\n", " Nombre de features: 724\n", " Intercept: -0.000000\n", " Norme des coefficients: 0.000000\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "2026/02/06 02:11:47 WARNING mlflow.models.model: `artifact_path` is deprecated. Please use `name` instead.\n", "2026/02/06 02:11:49 WARNING mlflow.utils.environment: Failed to resolve installed pip version. ``pip`` will be added to conda.yaml environment spec without a version specifier.\n", "Registered model 'regression' already exists. Creating a new version of this model...\n", "2026/02/06 02:11:49 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. 
Model name: regression, version 3\n", "Created version '3' of model 'regression'.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "✓ Modèle V3 enregistré dans MLflow Model Registry sous le nom 'regression'\n", " AUC (CV): 0.5424\n", " F1 (CV): 0.1523\n", " Recall (CV): 0.7252\n", " Business Cost Min (CV): 1636.60\n", "\n", " Model URI: models:/m-6f0e559865f84c4a9bae981ffb44747e\n", " ℹ️ Ce modèle est maintenant disponible dans le Model Registry\n", " Accessible via: mlflow.sklearn.load_model('models:/regression/latest')\n", "🏃 View run V3_LogisticRegression_RobustScaler_Final at: http://127.0.0.1:5000/#/experiments/1/runs/6d7ce4bf0fa94725a3b69b3f85e5bdc8\n", "🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1\n" ] } ], "source": [ "# ============================================================================\n", "# ENTRAÎNEMENT FINAL V3 + ENREGISTREMENT DANS MODEL REGISTRY\n", "# ============================================================================\n", "\n", "# Entraîner le modèle final sur tout le train set avec RobustScaler\n", "final_model_v3 = LogisticRegression(**MODEL_CONFIG_V3)\n", "final_model_v3.fit(X_train_robust, y_train)\n", "\n", "print(\"\\n✓ Modèle final LogisticRegression V3 entraîné sur l'ensemble train complet\")\n", "print(f\" Nombre de features: {X_train_robust.shape[1]}\")\n", "print(f\" Intercept: {final_model_v3.intercept_[0]:.6f}\")\n", "print(f\" Norme des coefficients: {np.linalg.norm(final_model_v3.coef_):.6f}\")\n", "\n", "# ============================================================================\n", "# LOGGING MLFLOW V3 + ENREGISTREMENT DANS MODEL REGISTRY\n", "# ============================================================================\n", "\n", "mlflow.end_run()\n", "\n", "with mlflow.start_run(run_name=\"V3_LogisticRegression_RobustScaler_Final\"):\n", " # Logging des paramètres\n", " mlflow.log_params(MODEL_CONFIG_V3)\n", " \n", " # Tags\n", " mlflow.set_tag(\"version\", \"3\")\n", " 
mlflow.set_tag(\"model\", \"LogisticRegression\")\n", " mlflow.set_tag(\"phase\", \"final_model\")\n", " mlflow.set_tag(\"threshold\", str(THRESHOLD_FIXED))\n", " mlflow.set_tag(\"scaling\", \"RobustScaler\")\n", " mlflow.set_tag(\"imbalance_handling\", \"class_weight\")\n", " mlflow.set_tag(\"model_type\", \"LogisticRegression\")\n", " mlflow.set_tag(\"best_model\", \"true\")\n", " \n", " # Logger les métriques CV\n", " mlflow.log_metric(\"auc\", metrics_mean_v3[\"auc\"])\n", " mlflow.log_metric(\"f1_score\", metrics_mean_v3[\"f1_score\"])\n", " mlflow.log_metric(\"recall_class1\", metrics_mean_v3[\"recall_class1\"])\n", " mlflow.log_metric(\"business_cost_min\", metrics_mean_v3[\"business_cost_min\"])\n", " mlflow.log_metric(\"optimal_threshold\", metrics_mean_v3[\"optimal_threshold\"])\n", " \n", " # Sauvegarder le modèle ET l'enregistrer dans le Model Registry\n", " model_info = mlflow.sklearn.log_model(\n", " final_model_v3,\n", " name=\"logistic_regression_v3_robust_scaler\",  # 'artifact_path' is deprecated in recent MLflow; 'name' is the replacement\n", " registered_model_name=\"regression\" # Enregistrement automatique dans Model Registry\n", " )\n", " \n", " print(f\"\\n✓ Modèle V3 enregistré dans MLflow Model Registry sous le nom 'regression'\")\n", " print(f\" AUC (CV): {metrics_mean_v3['auc']:.4f}\")\n", " print(f\" F1 (CV): {metrics_mean_v3['f1_score']:.4f}\")\n", " print(f\" Recall (CV): {metrics_mean_v3['recall_class1']:.4f}\")\n", " print(f\" Business Cost Min (CV): {metrics_mean_v3['business_cost_min']:.2f}\")\n", " print(f\"\\n Model URI: {model_info.model_uri}\")\n", " print(f\" ℹ️ Ce modèle est maintenant disponible dans le Model Registry\")\n", " print(f\" Accessible via: mlflow.sklearn.load_model('models:/regression/latest')\")" ] }, { "cell_type": "code", "execution_count": 23, "id": "7af82263", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", 
"==================================================================================================================================\n", "TABLEAU COMPARATIF FINAL: Toutes les versions\n", "==================================================================================================================================\n", " Version Scaling Imbalance AUC F1-Score Recall Coût Métier\n", " V1 Baseline StandardScaler None 0.7010 ± 0.0038 0.0631 ± 0.0355 0.0361 ± 0.0202 1511.00 ± 34.85\n", " V2.1 Class Weight StandardScaler class_weight 0.6957 ± 0.0117 0.2367 ± 0.0078 0.5832 ± 0.0305 1164.40 ± 32.60\n", " V2.2 SMOTE StandardScaler SMOTE 0.6890 ± 0.0104 0.2386 ± 0.0104 0.5600 ± 0.0297 1168.60 ± 36.73\n", "V3 RobustScaler + CW RobustScaler class_weight 0.5424 ± 0.0245 0.1523 ± 0.0051 0.7252 ± 0.0298 1636.60 ± 52.71\n", "==================================================================================================================================\n", "\n", "✓ Meilleure version par métrique:\n", " AUC: V1 (0.7010)\n", " F1-Score: V2.2 (0.2386)\n", " Recall Classe 1: V3 (0.7252)\n", " Coût Métier Min: V2.1 (1164.40)\n", "\n", "==================================================================================================================================\n", "✓ MODÈLE FINAL SÉLECTIONNÉ: V2.1 StandardScaler + class_weight='balanced'\n", " Basé sur le Coût Métier (métrique métier principale): 1164.40\n", " Note: V3 a été enregistré dans Model Registry pour démonstration,\n", " mais V2.1 StandardScaler + class_weight='balanced' a de meilleures performances\n", "==================================================================================================================================\n" ] } ], "source": [ "# ============================================================================\n", "# TABLEAU COMPARATIF FINAL: Toutes les versions (V1, V2.1, V2.2, V3)\n", "# ============================================================================\n", "\n", "print(\"\\n\" + 
\"=\"*130)\n", "print(\"TABLEAU COMPARATIF FINAL: Toutes les versions\")\n", "print(\"=\"*130)\n", "\n", "comparison_data_final = {\n", " \"Version\": [\"V1 Baseline\", \"V2.1 Class Weight\", \"V2.2 SMOTE\", \"V3 RobustScaler + CW\"],\n", " \"Scaling\": [\"StandardScaler\", \"StandardScaler\", \"StandardScaler\", \"RobustScaler\"],\n", " \"Imbalance\": [\"None\", \"class_weight\", \"SMOTE\", \"class_weight\"],\n", " \"AUC\": [\n", " f\"{metrics_mean['auc']:.4f} ± {metrics_std['auc']:.4f}\",\n", " f\"{metrics_mean_v2_1['auc']:.4f} ± {metrics_std_v2_1['auc']:.4f}\",\n", " f\"{metrics_mean_v2_2['auc']:.4f} ± {metrics_std_v2_2['auc']:.4f}\",\n", " f\"{metrics_mean_v3['auc']:.4f} ± {metrics_std_v3['auc']:.4f}\",\n", " ],\n", " \"F1-Score\": [\n", " f\"{metrics_mean['f1_score']:.4f} ± {metrics_std['f1_score']:.4f}\",\n", " f\"{metrics_mean_v2_1['f1_score']:.4f} ± {metrics_std_v2_1['f1_score']:.4f}\",\n", " f\"{metrics_mean_v2_2['f1_score']:.4f} ± {metrics_std_v2_2['f1_score']:.4f}\",\n", " f\"{metrics_mean_v3['f1_score']:.4f} ± {metrics_std_v3['f1_score']:.4f}\",\n", " ],\n", " \"Recall\": [\n", " f\"{metrics_mean['recall_class1']:.4f} ± {metrics_std['recall_class1']:.4f}\",\n", " f\"{metrics_mean_v2_1['recall_class1']:.4f} ± {metrics_std_v2_1['recall_class1']:.4f}\",\n", " f\"{metrics_mean_v2_2['recall_class1']:.4f} ± {metrics_std_v2_2['recall_class1']:.4f}\",\n", " f\"{metrics_mean_v3['recall_class1']:.4f} ± {metrics_std_v3['recall_class1']:.4f}\",\n", " ],\n", " \"Coût Métier\": [\n", " f\"{metrics_mean['business_cost_min']:.2f} ± {metrics_std['business_cost_min']:.2f}\",\n", " f\"{metrics_mean_v2_1['business_cost_min']:.2f} ± {metrics_std_v2_1['business_cost_min']:.2f}\",\n", " f\"{metrics_mean_v2_2['business_cost_min']:.2f} ± {metrics_std_v2_2['business_cost_min']:.2f}\",\n", " f\"{metrics_mean_v3['business_cost_min']:.2f} ± {metrics_std_v3['business_cost_min']:.2f}\",\n", " ]\n", "}\n", "\n", "comparison_df_final = pd.DataFrame(comparison_data_final)\n", 
"print(comparison_df_final.to_string(index=False))\n", "print(\"=\"*130)\n", "\n", "# Déterminer la meilleure version finale\n", "all_metrics = {\n", " \"V1\": metrics_mean,\n", " \"V2.1\": metrics_mean_v2_1,\n", " \"V2.2\": metrics_mean_v2_2,\n", " \"V3\": metrics_mean_v3\n", "}\n", "\n", "best_auc_v = max(all_metrics.items(), key=lambda x: x[1]['auc'])\n", "best_f1_v = max(all_metrics.items(), key=lambda x: x[1]['f1_score'])\n", "best_recall_v = max(all_metrics.items(), key=lambda x: x[1]['recall_class1'])\n", "best_cost_v = min(all_metrics.items(), key=lambda x: x[1]['business_cost_min'])\n", "\n", "print(\"\\n✓ Meilleure version par métrique:\")\n", "print(f\" AUC: {best_auc_v[0]} ({best_auc_v[1]['auc']:.4f})\")\n", "print(f\" F1-Score: {best_f1_v[0]} ({best_f1_v[1]['f1_score']:.4f})\")\n", "print(f\" Recall Classe 1: {best_recall_v[0]} ({best_recall_v[1]['recall_class1']:.4f})\")\n", "print(f\" Coût Métier Min: {best_cost_v[0]} ({best_cost_v[1]['business_cost_min']:.2f})\")\n", "\n", "# Sélection dynamique basée sur le Coût Métier (métrique métier principale)\n", "best_overall_version = best_cost_v[0]\n", "\n", "version_names = {\n", " \"V1\": \"V1 Baseline\",\n", " \"V2.1\": \"V2.1 StandardScaler + class_weight='balanced'\",\n", " \"V2.2\": \"V2.2 StandardScaler + SMOTE\",\n", " \"V3\": \"V3 RobustScaler + class_weight='balanced'\"\n", "}\n", "\n", "print(\"\\n\" + \"=\"*130)\n", "print(f\"✓ MODÈLE FINAL SÉLECTIONNÉ: {version_names[best_overall_version]}\")\n", "print(f\" Basé sur le Coût Métier (métrique métier principale): {best_cost_v[1]['business_cost_min']:.2f}\")\n", "\n", "if best_overall_version == \"V3\":\n", " print(\" Enregistré dans MLflow Model Registry sous le nom: 'regression'\")\n", "else:\n", " print(f\" Note: V3 a été enregistré dans Model Registry pour démonstration,\")\n", " print(f\" mais {version_names[best_overall_version]} a de meilleures performances\")\n", "\n", "print(\"=\"*130)" ] } ], "metadata": { "kernelspec": { "display_name": 
"OC_P6", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.3" } }, "nbformat": 4, "nbformat_minor": 5 }