projet_dg-models / models /metadata_model.json

Upload folder using huggingface_hub

38b3fe7 verified 8 days ago

5.11 kB

	{
	"project": {
	"name": "GDM Risk Prediction - France",
	"objective": "Prediction of gestational diabetes mellitus risk at the 1st trimester without biological variables",
	"version": "1.0.0",
	"trained_at": "2026-05-26T16:31:03",
	"status": "validated"
	},
	"selected_model": {
	"name": "Logistic Regression with Isotonic Calibration",
	"short_name": "dg_lr_isotonic",
	"bundle_path": "artifacts/models/dg_lr_isotonic_bundle.joblib",
	"rationale": "Best AUC-ROC (0.759) and AUC-PR (0.334) among all candidates; best Brier Score (0.088); interpretable for clinical context"
	},
	"candidate_models": [
	{
	"name": "Logistic Regression",
	"AUC_ROC": 0.759,
	"AUC_PR": 0.334,
	"Brier_Score": 0.088,
	"selected": true
	},
	{
	"name": "XGBoost",
	"AUC_ROC": 0.753,
	"AUC_PR": 0.32,
	"Brier_Score": 0.089,
	"selected": false
	},
	{
	"name": "Random Forest",
	"AUC_ROC": 0.748,
	"AUC_PR": 0.298,
	"Brier_Score": 0.09,
	"selected": false
	},
	{
	"name": "SVM",
	"AUC_ROC": 0.76,
	"AUC_PR": 0.335,
	"Brier_Score": 0.088,
	"selected": false,
	"sensitivity_internal": 0.884,
	"specificity_internal": 0.413,
	"external_detection_rate": 0.947,
	"external_FN": 13,
	"DCA_points_utiles": 99
	}
	],
	"thresholding": {
	"decision_threshold": 0.06,
	"threshold_file": "artifacts/threshold.json",
	"rationale": "Threshold optimized for high sensitivity (88.7%) and high NPV (96.6%) to minimize false negatives in a screening context"
	},
	"performance": {
	"dataset": "Internal test set (80/20 stratified split, n=30000)",
	"AUC_ROC": 0.759394470526865,
	"AUC_PR": 0.3337401809714899,
	"Brier_Score": 0.088,
	"sensitivity": 0.8874074074074074,
	"specificity": 0.4060093896713615,
	"VPP": 0.15922381711855396,
	"VPN": 0.966041108132261,
	"F1_score": 0.27000225377507325,
	"balanced_accuracy": 0.6467083985393844,
	"MCC": 0.19171550055591624,
	"TP": 599,
	"FP": 3163,
	"TN": 2162,
	"FN": 76,
	"metrics_file": "artifacts/metrics_final.json"
	},
	"external_validation": {
	"cohort_positive_243": {
	"n": 243,
	"detection_rate_LR": 0.9259,
	"false_negatives_LR": 18
	},
	"cohort_mixed_455": {
	"n": 455,
	"description": "243 positive + 212 negative cases"
	}
	},
	"data": {
	"dataset_name": "dataset_dg_france_30000_final",
	"n_rows": 30000,
	"n_features": 18,
	"target": "gdm_label",
	"target_values": [
	"Non",
	"Oui"
	],
	"prevalence_train": "imbalanced (~22% positive)",
	"class_balancing": "class_weight=balanced"
	},
	"features": {
	"input_schema_path": "artifacts/feature_schema/input_schema.json",
	"n_features": 18,
	"features_used": [
	"age_maternel",
	"parite",
	"niveau_etude",
	"zone_residence",
	"imc",
	"ta_systolique",
	"ta_diastolique",
	"hta_chronique",
	"sedentarite",
	"tabagisme",
	"alcoolisme",
	"atcd_gdm",
	"atcd_macrosomie",
	"atcd_preeclampsie",
	"atcd_familial_diabete_1er_deg",
	"sopk",
	"grossesse_multiple",
	"sa_premiere_consult"
	],
	"excluded": {
	"identifiers": [
	"patient_id",
	"centre_id",
	"pays"
	],
	"administrative": [
	"annee_inclusion"
	],
	"biological_leakage": [
	"glycemie_jeun_1T",
	"ogtt_0min",
	"ogtt_60min",
	"ogtt_120min",
	"critere_dg"
	],
	"collinear": [
	"poids_kg",
	"taille_cm"
	]
	}
	},
	"preprocessing": {
	"pipeline": "sklearn.pipeline.Pipeline (embedded in bundle)",
	"categorical_missing": "filled with Non_renseigne before encoding",
	"numeric_missing": "SimpleImputer(strategy=median)",
	"encoding": "OneHotEncoder(handle_unknown=ignore)",
	"scaling": "StandardScaler on numeric features",
	"note": "Full preprocessing pipeline is saved inside the .joblib bundle — no separate preprocessing step needed at inference"
	},
	"inference": {
	"how_to_load": "import joblib; bundle = joblib.load(\"artifacts/models/dg_lr_isotonic_bundle.joblib\")",
	"bundle_keys": [
	"model",
	"preprocessor",
	"feature_names",
	"threshold",
	"calibrator"
	],
	"prediction_example": "proba = bundle[\"model\"].predict_proba(X_preprocessed)[:,1]; label = int(proba[0] >= bundle[\"threshold\"])"
	},
	"compliance": {
	"disclaimer": "Research prototype only. Not validated as a medical device. Not to be used for autonomous clinical decision-making without appropriate medical, ethical, institutional and regulatory validation.",
	"data_not_included": true,
	"data_reason": "Raw and processed data contain potentially sensitive patient information and are excluded from version control per GDPR principles"
	}
	}