{ "project": { "name": "GDM Risk Prediction - France", "objective": "Prediction of gestational diabetes mellitus risk in the 1st trimester without biological variables", "version": "1.0.0", "trained_at": "2026-05-26T16:31:03", "status": "validated" }, "selected_model": { "name": "Logistic Regression with Isotonic Calibration", "short_name": "dg_lr_isotonic", "bundle_path": "artifacts/models/dg_lr_isotonic_bundle.joblib", "rationale": "AUC-ROC (0.759) and AUC-PR (0.334) within 0.001 of the best candidate (SVM: 0.760 / 0.335) and ahead of XGBoost and Random Forest; Brier Score (0.088) tied for best; interpretable for clinical context" }, "candidate_models": [ { "name": "Logistic Regression", "AUC_ROC": 0.759, "AUC_PR": 0.334, "Brier_Score": 0.088, "selected": true }, { "name": "XGBoost", "AUC_ROC": 0.753, "AUC_PR": 0.32, "Brier_Score": 0.089, "selected": false }, { "name": "Random Forest", "AUC_ROC": 0.748, "AUC_PR": 0.298, "Brier_Score": 0.09, "selected": false }, { "name": "SVM", "AUC_ROC": 0.76, "AUC_PR": 0.335, "Brier_Score": 0.088, "selected": false, "sensitivity_internal": 0.884, "specificity_internal": 0.413, "external_detection_rate": 0.947, "external_FN": 13, "DCA_points_utiles": 99 } ], "thresholding": { "decision_threshold": 0.06, "threshold_file": "artifacts/threshold.json", "rationale": "Threshold optimized for high sensitivity (88.7%) and high NPV (96.6%) to minimize false negatives in a screening context" }, "performance": { "dataset": "Internal test set (80/20 stratified split of n=30000; test n=6000)", "AUC_ROC": 0.759394470526865, "AUC_PR": 0.3337401809714899, "Brier_Score": 0.088, "sensitivity": 0.8874074074074074, "specificity": 0.4060093896713615, "VPP": 0.15922381711855396, "VPN": 0.966041108132261, "F1_score": 0.27000225377507325, "balanced_accuracy": 0.6467083985393844, "MCC": 0.19171550055591624, "TP": 599, "FP": 3163, "TN": 2162, "FN": 76, "metrics_file": "artifacts/metrics_final.json" }, "external_validation": { "cohort_positive_243": { "n": 243, "detection_rate_LR": 0.9259, "false_negatives_LR": 18 }, "cohort_mixed_455": { "n":
455, "description": "243 positive + 212 negative cases" } }, "data": { "dataset_name": "dataset_dg_france_30000_final", "n_rows": 30000, "n_features": 18, "target": "gdm_label", "target_values": [ "Non", "Oui" ], "prevalence_train": "imbalanced (~11% positive)", "class_balancing": "class_weight=balanced" }, "features": { "input_schema_path": "artifacts/feature_schema/input_schema.json", "n_features": 18, "features_used": [ "age_maternel", "parite", "niveau_etude", "zone_residence", "imc", "ta_systolique", "ta_diastolique", "hta_chronique", "sedentarite", "tabagisme", "alcoolisme", "atcd_gdm", "atcd_macrosomie", "atcd_preeclampsie", "atcd_familial_diabete_1er_deg", "sopk", "grossesse_multiple", "sa_premiere_consult" ], "excluded": { "identifiers": [ "patient_id", "centre_id", "pays" ], "administrative": [ "annee_inclusion" ], "biological_leakage": [ "glycemie_jeun_1T", "ogtt_0min", "ogtt_60min", "ogtt_120min", "critere_dg" ], "collinear": [ "poids_kg", "taille_cm" ] } }, "preprocessing": { "pipeline": "sklearn.pipeline.Pipeline (embedded in bundle)", "categorical_missing": "filled with Non_renseigne before encoding", "numeric_missing": "SimpleImputer(strategy=median)", "encoding": "OneHotEncoder(handle_unknown=ignore)", "scaling": "StandardScaler on numeric features", "note": "Full preprocessing pipeline is saved inside the .joblib bundle — no separate preprocessing step needed at inference" }, "inference": { "how_to_load": "import joblib; bundle = joblib.load(\"artifacts/models/dg_lr_isotonic_bundle.joblib\")", "bundle_keys": [ "model", "preprocessor", "feature_names", "threshold", "calibrator" ], "prediction_example": "proba = bundle[\"model\"].predict_proba(X_preprocessed)[:,1]; label = int(proba[0] >= bundle[\"threshold\"])" }, "compliance": { "disclaimer": "Research prototype only. Not validated as a medical device. Not to be used for autonomous clinical decision-making without appropriate medical, ethical, institutional and regulatory validation.", "data_not_included": true, "data_reason": "Raw and processed data contain potentially sensitive patient information and are excluded from version control per GDPR principles" } }