projet_dg-models / models /metadata_model.json
Darryl237's picture
Upload folder using huggingface_hub
38b3fe7 verified
Raw
History Blame Contribute Delete
5.11 kB
{
"project": {
"name": "GDM Risk Prediction - France",
"objective": "Prediction of gestational diabetes mellitus risk at the 1st trimester without biological variables",
"version": "1.0.0",
"trained_at": "2026-05-26T16:31:03",
"status": "validated"
},
"selected_model": {
"name": "Logistic Regression with Isotonic Calibration",
"short_name": "dg_lr_isotonic",
"bundle_path": "artifacts/models/dg_lr_isotonic_bundle.joblib",
"rationale": "AUC-ROC (0.759), AUC-PR (0.334) and Brier Score (0.088) within 0.001 of the best candidate (SVM: 0.760 / 0.335 / 0.088); preferred for interpretability in clinical context"
},
"candidate_models": [
{
"name": "Logistic Regression",
"AUC_ROC": 0.759,
"AUC_PR": 0.334,
"Brier_Score": 0.088,
"selected": true
},
{
"name": "XGBoost",
"AUC_ROC": 0.753,
"AUC_PR": 0.32,
"Brier_Score": 0.089,
"selected": false
},
{
"name": "Random Forest",
"AUC_ROC": 0.748,
"AUC_PR": 0.298,
"Brier_Score": 0.09,
"selected": false
},
{
"name": "SVM",
"AUC_ROC": 0.76,
"AUC_PR": 0.335,
"Brier_Score": 0.088,
"selected": false,
"sensitivity_internal": 0.884,
"specificity_internal": 0.413,
"external_detection_rate": 0.947,
"external_FN": 13,
"DCA_points_utiles": 99
}
],
"thresholding": {
"decision_threshold": 0.06,
"threshold_file": "artifacts/threshold.json",
"rationale": "Threshold optimized for high sensitivity (88.7%) and high NPV (96.6%) to minimize false negatives in a screening context"
},
"performance": {
"dataset": "Internal test set (80/20 stratified split of n=30000; test n=6000)",
"AUC_ROC": 0.759394470526865,
"AUC_PR": 0.3337401809714899,
"Brier_Score": 0.088,
"sensitivity": 0.8874074074074074,
"specificity": 0.4060093896713615,
"VPP": 0.15922381711855396,
"VPN": 0.966041108132261,
"F1_score": 0.27000225377507325,
"balanced_accuracy": 0.6467083985393844,
"MCC": 0.19171550055591624,
"TP": 599,
"FP": 3163,
"TN": 2162,
"FN": 76,
"metrics_file": "artifacts/metrics_final.json"
},
"external_validation": {
"cohort_positive_243": {
"n": 243,
"detection_rate_LR": 0.9259,
"false_negatives_LR": 18
},
"cohort_mixed_455": {
"n": 455,
"description": "243 positive + 212 negative cases"
}
},
"data": {
"dataset_name": "dataset_dg_france_30000_final",
"n_rows": 30000,
"n_features": 18,
"target": "gdm_label",
"target_values": [
"Non",
"Oui"
],
"prevalence_train": "imbalanced (~11% positive)",
"class_balancing": "class_weight=balanced"
},
"features": {
"input_schema_path": "artifacts/feature_schema/input_schema.json",
"n_features": 18,
"features_used": [
"age_maternel",
"parite",
"niveau_etude",
"zone_residence",
"imc",
"ta_systolique",
"ta_diastolique",
"hta_chronique",
"sedentarite",
"tabagisme",
"alcoolisme",
"atcd_gdm",
"atcd_macrosomie",
"atcd_preeclampsie",
"atcd_familial_diabete_1er_deg",
"sopk",
"grossesse_multiple",
"sa_premiere_consult"
],
"excluded": {
"identifiers": [
"patient_id",
"centre_id",
"pays"
],
"administrative": [
"annee_inclusion"
],
"biological_leakage": [
"glycemie_jeun_1T",
"ogtt_0min",
"ogtt_60min",
"ogtt_120min",
"critere_dg"
],
"collinear": [
"poids_kg",
"taille_cm"
]
}
},
"preprocessing": {
"pipeline": "sklearn.pipeline.Pipeline (embedded in bundle)",
"categorical_missing": "filled with Non_renseigne before encoding",
"numeric_missing": "SimpleImputer(strategy=median)",
"encoding": "OneHotEncoder(handle_unknown=ignore)",
"scaling": "StandardScaler on numeric features",
"note": "Full preprocessing pipeline is saved inside the .joblib bundle — no separate preprocessing step needed at inference"
},
"inference": {
"how_to_load": "import joblib; bundle = joblib.load(\"artifacts/models/dg_lr_isotonic_bundle.joblib\")",
"bundle_keys": [
"model",
"preprocessor",
"feature_names",
"threshold",
"calibrator"
],
"prediction_example": "proba = bundle[\"model\"].predict_proba(X_preprocessed)[:,1]; label = int(proba[0] >= bundle[\"threshold\"])"
},
"compliance": {
"disclaimer": "Research prototype only. Not validated as a medical device. Not to be used for autonomous clinical decision-making without appropriate medical, ethical, institutional and regulatory validation.",
"data_not_included": true,
"data_reason": "Raw and processed data contain potentially sensitive patient information and are excluded from version control per GDPR principles"
}
}