Spaces:
Sleeping
Sleeping
Sync from GitHub via hub-sync
Browse files- artifacts/models/p1_historical_metadata.json +38 -5
- artifacts/models/p1_historical_pipeline.joblib +2 -2
- artifacts/models/p23_simulation_metadata.json +33 -2
- artifacts/models/p23_simulation_pipeline.joblib +2 -2
- scripts/__pycache__/__init__.cpython-312.pyc +0 -0
- scripts/__pycache__/mlflow_config.cpython-312.pyc +0 -0
- scripts/__pycache__/runtime_model_specs.cpython-312.pyc +0 -0
- scripts/deployment_payload.py +28 -8
- scripts/experience_1.py +52 -5
- scripts/mlflow_config.py +55 -0
- scripts/mlflow_logging.py +108 -0
- scripts/prediction_adjustment.py +8 -4
- scripts/project_config.py +7 -7
- scripts/promote_registered_model.py +293 -132
- scripts/run_full_pipeline.py +152 -51
- scripts/runtime_model_specs.py +41 -0
- scripts/train_historical_model.py +9 -3
- scripts/train_simulation_model.py +97 -0
- streamlit/requirements.txt +1 -0
artifacts/models/p1_historical_metadata.json
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
{
|
| 2 |
"artifact_role": "P1_historical_prediction_model",
|
|
|
|
| 3 |
"training_notebook": "notebooks/experience_1.ipynb",
|
| 4 |
"training_script": "scripts/experience_1.py",
|
| 5 |
"training_entrypoint": "scripts/experience_1.py",
|
|
@@ -12,7 +13,7 @@
|
|
| 12 |
"parameter_grid_size": 3,
|
| 13 |
"tuning_stage": "systematic_grid_search",
|
| 14 |
"regularization_profile": "parameter_grid_search",
|
| 15 |
-
"trained_at_utc": "2026-05-
|
| 16 |
"dataset_source": "/Users/steph/Code/Python/Jupyter/OCR_Projet12/artifacts/experiments/experience_1/dataset_consolide_historique_colonnes.csv",
|
| 17 |
"target_year": 2016,
|
| 18 |
"target_column": "target_yield_t_ha_2016",
|
|
@@ -105,12 +106,44 @@
|
|
| 105 |
"area_role": "group_only_not_feature",
|
| 106 |
"split_strategy": "GroupShuffleSplit(area, test_size=0.2, random_state=42)",
|
| 107 |
"metrics": {
|
| 108 |
-
"test_rmse": 2.
|
| 109 |
"test_mae": 0.8025563824483579,
|
| 110 |
"test_r2": 0.9468391265704531,
|
| 111 |
-
"cv_val_rmse_mean": 1.
|
| 112 |
-
"cv_val_mae_mean": 0.
|
| 113 |
"cv_val_r2_mean": 0.9623090308612253
|
| 114 |
},
|
| 115 |
-
"mlflow_run_id": "
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 116 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"artifact_role": "P1_historical_prediction_model",
|
| 3 |
+
"runtime_model_role": "historical",
|
| 4 |
"training_notebook": "notebooks/experience_1.ipynb",
|
| 5 |
"training_script": "scripts/experience_1.py",
|
| 6 |
"training_entrypoint": "scripts/experience_1.py",
|
|
|
|
| 13 |
"parameter_grid_size": 3,
|
| 14 |
"tuning_stage": "systematic_grid_search",
|
| 15 |
"regularization_profile": "parameter_grid_search",
|
| 16 |
+
"trained_at_utc": "2026-05-10T22:36:57.656730+00:00",
|
| 17 |
"dataset_source": "/Users/steph/Code/Python/Jupyter/OCR_Projet12/artifacts/experiments/experience_1/dataset_consolide_historique_colonnes.csv",
|
| 18 |
"target_year": 2016,
|
| 19 |
"target_column": "target_yield_t_ha_2016",
|
|
|
|
| 106 |
"area_role": "group_only_not_feature",
|
| 107 |
"split_strategy": "GroupShuffleSplit(area, test_size=0.2, random_state=42)",
|
| 108 |
"metrics": {
|
| 109 |
+
"test_rmse": 2.059250524000323,
|
| 110 |
"test_mae": 0.8025563824483579,
|
| 111 |
"test_r2": 0.9468391265704531,
|
| 112 |
+
"cv_val_rmse_mean": 1.581435712388605,
|
| 113 |
+
"cv_val_mae_mean": 0.6463164060176475,
|
| 114 |
"cv_val_r2_mean": 0.9623090308612253
|
| 115 |
},
|
| 116 |
+
"mlflow_run_id": "1b8857069dc941109703fbee6fb2b61a",
|
| 117 |
+
"registered_model_name": "p1_historical_pipeline",
|
| 118 |
+
"registered_model_version": "7",
|
| 119 |
+
"registered_model_run_id": "8dee2459e8b84ccba75596514fd5a70a",
|
| 120 |
+
"model_uri": "models:/p1_historical_pipeline/7",
|
| 121 |
+
"registry_source_run_id": "8dee2459e8b84ccba75596514fd5a70a",
|
| 122 |
+
"registered_model_stage": "None",
|
| 123 |
+
"registered_model_source": "models:/m-45ba375e4c5345adad84f2ea32d9df9f",
|
| 124 |
+
"tracking_uri": "sqlite:////Users/steph/Code/Python/Jupyter/OCR_Projet12/artifacts/mlflow.db",
|
| 125 |
+
"exported_at_utc": "2026-05-10T22:37:27.427985+00:00",
|
| 126 |
+
"artifact_path": "artifacts/models/p1_historical_pipeline.joblib",
|
| 127 |
+
"metadata_path": "artifacts/models/p1_historical_metadata.json",
|
| 128 |
+
"output_path": "artifacts/models/p1_historical_pipeline.joblib",
|
| 129 |
+
"output_metadata_path": "artifacts/models/p1_historical_metadata.json",
|
| 130 |
+
"role": "historical",
|
| 131 |
+
"source_run_name": "experience_1__runtime_historical",
|
| 132 |
+
"source_experiment_id": "4",
|
| 133 |
+
"source_run_metrics": {
|
| 134 |
+
"test_rmse": 2.059250524000323,
|
| 135 |
+
"test_mae": 0.8025563824483579,
|
| 136 |
+
"test_r2": 0.9468391265704531,
|
| 137 |
+
"cv_val_rmse_mean": 1.581435712388605,
|
| 138 |
+
"cv_val_mae_mean": 0.6463164060176475,
|
| 139 |
+
"cv_val_r2_mean": 0.9623090308612253
|
| 140 |
+
},
|
| 141 |
+
"source_run_params": {
|
| 142 |
+
"experience_name": "experience_1",
|
| 143 |
+
"runtime_model_role": "historical",
|
| 144 |
+
"registered_model_name": "p1_historical_pipeline",
|
| 145 |
+
"training_entrypoint": "scripts/experience_1.py",
|
| 146 |
+
"target_year": "2016",
|
| 147 |
+
"best_candidate_model_name": "random_forest_search_01"
|
| 148 |
+
}
|
| 149 |
}
|
artifacts/models/p1_historical_pipeline.joblib
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:807b6071dc5a98528ac45dc0a799af64bfa0ace91f3d855eb392adce5a529242
|
| 3 |
+
size 3646451
|
artifacts/models/p23_simulation_metadata.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
{
|
| 2 |
"model_name": "linear_regression",
|
| 3 |
-
"trained_at_utc": "2026-05-
|
| 4 |
"dataset_source": "data/simulation/crop_yield.csv",
|
| 5 |
"feature_columns": [
|
| 6 |
"region",
|
|
@@ -22,5 +22,36 @@
|
|
| 22 |
"test_r2": 0.9139501848982343
|
| 23 |
},
|
| 24 |
"strategy": "2_models_3_predictions_combined",
|
| 25 |
-
"role": "
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"model_name": "linear_regression",
|
| 3 |
+
"trained_at_utc": "2026-05-10T22:37:15.882903+00:00",
|
| 4 |
"dataset_source": "data/simulation/crop_yield.csv",
|
| 5 |
"feature_columns": [
|
| 6 |
"region",
|
|
|
|
| 22 |
"test_r2": 0.9139501848982343
|
| 23 |
},
|
| 24 |
"strategy": "2_models_3_predictions_combined",
|
| 25 |
+
"role": "simulation",
|
| 26 |
+
"runtime_model_role": "simulation",
|
| 27 |
+
"registered_model_name": "p23_simulation_pipeline",
|
| 28 |
+
"registered_model_version": "6",
|
| 29 |
+
"registered_model_run_id": "7a9fc5eba2a146058618994287c53538",
|
| 30 |
+
"model_uri": "models:/p23_simulation_pipeline/6",
|
| 31 |
+
"registered_model_stage": "None",
|
| 32 |
+
"registered_model_source": "models:/m-1d4b5fa94ca945809af904589800a72a",
|
| 33 |
+
"tracking_uri": "sqlite:////Users/steph/Code/Python/Jupyter/OCR_Projet12/artifacts/mlflow.db",
|
| 34 |
+
"exported_at_utc": "2026-05-10T22:37:27.484951+00:00",
|
| 35 |
+
"artifact_path": "artifacts/models/p23_simulation_pipeline.joblib",
|
| 36 |
+
"metadata_path": "artifacts/models/p23_simulation_metadata.json",
|
| 37 |
+
"output_path": "artifacts/models/p23_simulation_pipeline.joblib",
|
| 38 |
+
"output_metadata_path": "artifacts/models/p23_simulation_metadata.json",
|
| 39 |
+
"source_run_name": "simulation_runtime__runtime_model",
|
| 40 |
+
"source_experiment_id": "5",
|
| 41 |
+
"source_run_metrics": {
|
| 42 |
+
"train_rmse": 0.49987819477652967,
|
| 43 |
+
"train_mae": 0.39889442485099674,
|
| 44 |
+
"train_r2": 0.9130634145187704,
|
| 45 |
+
"test_rmse": 0.49668933266948173,
|
| 46 |
+
"test_mae": 0.39606497055687334,
|
| 47 |
+
"test_r2": 0.9139501848982343
|
| 48 |
+
},
|
| 49 |
+
"source_run_params": {
|
| 50 |
+
"runtime_model_role": "simulation",
|
| 51 |
+
"registered_model_name": "p23_simulation_pipeline",
|
| 52 |
+
"training_entrypoint": "scripts/train_simulation_model.py",
|
| 53 |
+
"model_name": "linear_regression",
|
| 54 |
+
"dataset_source": "data/simulation/crop_yield.csv",
|
| 55 |
+
"sample_size": "200000"
|
| 56 |
+
}
|
| 57 |
}
|
artifacts/models/p23_simulation_pipeline.joblib
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bb2405e6848232fddcd2a15c44384decf4f0b3d98f2d5b0e948296f172c34ec7
|
| 3 |
+
size 4870
|
scripts/__pycache__/__init__.cpython-312.pyc
ADDED
|
Binary file (269 Bytes). View file
|
|
|
scripts/__pycache__/mlflow_config.cpython-312.pyc
ADDED
|
Binary file (3.2 kB). View file
|
|
|
scripts/__pycache__/runtime_model_specs.cpython-312.pyc
ADDED
|
Binary file (1.6 kB). View file
|
|
|
scripts/deployment_payload.py
CHANGED
|
@@ -9,15 +9,23 @@ from __future__ import annotations
|
|
| 9 |
import argparse
|
| 10 |
from pathlib import Path
|
| 11 |
import shutil
|
|
|
|
| 12 |
|
| 13 |
|
| 14 |
PROJECT_ROOT = Path(__file__).resolve().parents[1]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
|
| 16 |
DEPLOYMENT_REQUIRED_ARTIFACTS = [
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
Path("artifacts/experiments/experience_1/dataset_consolide_historique_colonnes.csv"),
|
| 22 |
]
|
| 23 |
|
|
@@ -44,10 +52,22 @@ PAYLOAD_FILE_SPECS = [
|
|
| 44 |
(Path("data/dataset_consolide.csv"), Path("data/dataset_consolide.csv")),
|
| 45 |
(Path("data/simulation/crop_yield.csv"), Path("data/simulation/crop_yield.csv")),
|
| 46 |
(Path("main.py"), Path("main.py")),
|
| 47 |
-
(
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
(
|
| 52 |
Path("artifacts/experiments/experience_1/dataset_consolide_historique_colonnes.csv"),
|
| 53 |
Path("artifacts/experiments/experience_1/dataset_consolide_historique_colonnes.csv"),
|
|
|
|
| 9 |
import argparse
|
| 10 |
from pathlib import Path
|
| 11 |
import shutil
|
| 12 |
+
import sys
|
| 13 |
|
| 14 |
|
| 15 |
PROJECT_ROOT = Path(__file__).resolve().parents[1]
|
| 16 |
+
if str(PROJECT_ROOT) not in sys.path:
|
| 17 |
+
sys.path.insert(0, str(PROJECT_ROOT))
|
| 18 |
+
|
| 19 |
+
from scripts.runtime_model_specs import (
|
| 20 |
+
HISTORICAL_RUNTIME_MODEL_SPEC,
|
| 21 |
+
SIMULATION_RUNTIME_MODEL_SPEC,
|
| 22 |
+
)
|
| 23 |
|
| 24 |
DEPLOYMENT_REQUIRED_ARTIFACTS = [
|
| 25 |
+
HISTORICAL_RUNTIME_MODEL_SPEC.output_model_path.relative_to(PROJECT_ROOT),
|
| 26 |
+
HISTORICAL_RUNTIME_MODEL_SPEC.output_metadata_path.relative_to(PROJECT_ROOT),
|
| 27 |
+
SIMULATION_RUNTIME_MODEL_SPEC.output_model_path.relative_to(PROJECT_ROOT),
|
| 28 |
+
SIMULATION_RUNTIME_MODEL_SPEC.output_metadata_path.relative_to(PROJECT_ROOT),
|
| 29 |
Path("artifacts/experiments/experience_1/dataset_consolide_historique_colonnes.csv"),
|
| 30 |
]
|
| 31 |
|
|
|
|
| 52 |
(Path("data/dataset_consolide.csv"), Path("data/dataset_consolide.csv")),
|
| 53 |
(Path("data/simulation/crop_yield.csv"), Path("data/simulation/crop_yield.csv")),
|
| 54 |
(Path("main.py"), Path("main.py")),
|
| 55 |
+
(
|
| 56 |
+
HISTORICAL_RUNTIME_MODEL_SPEC.output_model_path.relative_to(PROJECT_ROOT),
|
| 57 |
+
HISTORICAL_RUNTIME_MODEL_SPEC.output_model_path.relative_to(PROJECT_ROOT),
|
| 58 |
+
),
|
| 59 |
+
(
|
| 60 |
+
HISTORICAL_RUNTIME_MODEL_SPEC.output_metadata_path.relative_to(PROJECT_ROOT),
|
| 61 |
+
HISTORICAL_RUNTIME_MODEL_SPEC.output_metadata_path.relative_to(PROJECT_ROOT),
|
| 62 |
+
),
|
| 63 |
+
(
|
| 64 |
+
SIMULATION_RUNTIME_MODEL_SPEC.output_model_path.relative_to(PROJECT_ROOT),
|
| 65 |
+
SIMULATION_RUNTIME_MODEL_SPEC.output_model_path.relative_to(PROJECT_ROOT),
|
| 66 |
+
),
|
| 67 |
+
(
|
| 68 |
+
SIMULATION_RUNTIME_MODEL_SPEC.output_metadata_path.relative_to(PROJECT_ROOT),
|
| 69 |
+
SIMULATION_RUNTIME_MODEL_SPEC.output_metadata_path.relative_to(PROJECT_ROOT),
|
| 70 |
+
),
|
| 71 |
(
|
| 72 |
Path("artifacts/experiments/experience_1/dataset_consolide_historique_colonnes.csv"),
|
| 73 |
Path("artifacts/experiments/experience_1/dataset_consolide_historique_colonnes.csv"),
|
scripts/experience_1.py
CHANGED
|
@@ -44,13 +44,20 @@ PROJECT_ROOT = Path(__file__).resolve().parents[1]
|
|
| 44 |
if str(PROJECT_ROOT) not in sys.path:
|
| 45 |
sys.path.insert(0, str(PROJECT_ROOT))
|
| 46 |
|
| 47 |
-
from scripts.mlflow_logging import log_named_sklearn_model
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
from scripts.project_config import DEFAULT_CONFIG_PATH, load_preparation_config
|
|
|
|
| 49 |
|
| 50 |
|
| 51 |
SEED = 42
|
| 52 |
CV_N_SPLITS = 4
|
| 53 |
-
MLFLOW_EXPERIMENT_NAME =
|
| 54 |
SEARCH_SPACE_DEFINITION = {
|
| 55 |
"search_method": "parameter_grid",
|
| 56 |
"scope": "all_candidate_families",
|
|
@@ -310,9 +317,9 @@ def build_experience_paths(
|
|
| 310 |
cv_dir.mkdir(parents=True, exist_ok=True)
|
| 311 |
models_dir.mkdir(parents=True, exist_ok=True)
|
| 312 |
|
| 313 |
-
resolved_tracking_uri = tracking_uri or
|
| 314 |
mlflow_db_path = Path(resolved_tracking_uri.removeprefix("sqlite:///")).resolve()
|
| 315 |
-
mlflow_artifacts_dir =
|
| 316 |
mlflow_experiment_artifact_dir = mlflow_artifacts_dir / MLFLOW_EXPERIMENT_NAME
|
| 317 |
mlflow_experiment_artifact_dir.mkdir(parents=True, exist_ok=True)
|
| 318 |
|
|
@@ -1246,6 +1253,7 @@ def export_p1_artifact(
|
|
| 1246 |
|
| 1247 |
p1_metadata = {
|
| 1248 |
"artifact_role": "P1_historical_prediction_model",
|
|
|
|
| 1249 |
"training_notebook": "notebooks/experience_1.ipynb",
|
| 1250 |
"training_script": "scripts/experience_1.py",
|
| 1251 |
"training_entrypoint": "scripts/experience_1.py",
|
|
@@ -1277,6 +1285,41 @@ def export_p1_artifact(
|
|
| 1277 |
"mlflow_run_id": str(results_df.loc[0, "run_id"]) if "run_id" in results_df.columns else None,
|
| 1278 |
}
|
| 1279 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1280 |
joblib.dump(p1_pipeline, paths.p1_model_path)
|
| 1281 |
paths.p1_metadata_path.write_text(
|
| 1282 |
json.dumps(p1_metadata, indent=2, ensure_ascii=True),
|
|
@@ -1305,7 +1348,7 @@ def run_experience_1(
|
|
| 1305 |
"""
|
| 1306 |
resolved_config_path = Path(config_path) if config_path is not None else DEFAULT_CONFIG_PATH
|
| 1307 |
config = load_preparation_config(resolved_config_path, ensure_dirs=True)
|
| 1308 |
-
resolved_tracking_uri = tracking_uri or
|
| 1309 |
paths = build_experience_paths(
|
| 1310 |
artifacts_dir=Path(config["ARTIFACTS_DIR"]),
|
| 1311 |
tracking_uri=resolved_tracking_uri,
|
|
@@ -1362,6 +1405,10 @@ def run_experience_1(
|
|
| 1362 |
"best_test_rmse": float(results_df.loc[0, "test_rmse"]),
|
| 1363 |
"best_test_r2": float(results_df.loc[0, "test_r2"]),
|
| 1364 |
"tracked_models": list(results_df["model"]),
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1365 |
"p1_metadata": p1_metadata,
|
| 1366 |
}
|
| 1367 |
|
|
|
|
| 44 |
if str(PROJECT_ROOT) not in sys.path:
|
| 45 |
sys.path.insert(0, str(PROJECT_ROOT))
|
| 46 |
|
| 47 |
+
from scripts.mlflow_logging import log_and_register_sklearn_model, log_named_sklearn_model
|
| 48 |
+
from scripts.mlflow_config import (
|
| 49 |
+
DEFAULT_MLFLOW_TRACKING_URI,
|
| 50 |
+
EXPERIENCE_1_EXPERIMENT_NAME,
|
| 51 |
+
mlflow_artifacts_dir_for_tracking_uri,
|
| 52 |
+
normalize_tracking_uri,
|
| 53 |
+
)
|
| 54 |
from scripts.project_config import DEFAULT_CONFIG_PATH, load_preparation_config
|
| 55 |
+
from scripts.runtime_model_specs import HISTORICAL_RUNTIME_MODEL_SPEC
|
| 56 |
|
| 57 |
|
| 58 |
SEED = 42
|
| 59 |
CV_N_SPLITS = 4
|
| 60 |
+
MLFLOW_EXPERIMENT_NAME = EXPERIENCE_1_EXPERIMENT_NAME
|
| 61 |
SEARCH_SPACE_DEFINITION = {
|
| 62 |
"search_method": "parameter_grid",
|
| 63 |
"scope": "all_candidate_families",
|
|
|
|
| 317 |
cv_dir.mkdir(parents=True, exist_ok=True)
|
| 318 |
models_dir.mkdir(parents=True, exist_ok=True)
|
| 319 |
|
| 320 |
+
resolved_tracking_uri = normalize_tracking_uri(tracking_uri or DEFAULT_MLFLOW_TRACKING_URI)
|
| 321 |
mlflow_db_path = Path(resolved_tracking_uri.removeprefix("sqlite:///")).resolve()
|
| 322 |
+
mlflow_artifacts_dir = mlflow_artifacts_dir_for_tracking_uri(resolved_tracking_uri)
|
| 323 |
mlflow_experiment_artifact_dir = mlflow_artifacts_dir / MLFLOW_EXPERIMENT_NAME
|
| 324 |
mlflow_experiment_artifact_dir.mkdir(parents=True, exist_ok=True)
|
| 325 |
|
|
|
|
| 1253 |
|
| 1254 |
p1_metadata = {
|
| 1255 |
"artifact_role": "P1_historical_prediction_model",
|
| 1256 |
+
"runtime_model_role": HISTORICAL_RUNTIME_MODEL_SPEC.role,
|
| 1257 |
"training_notebook": "notebooks/experience_1.ipynb",
|
| 1258 |
"training_script": "scripts/experience_1.py",
|
| 1259 |
"training_entrypoint": "scripts/experience_1.py",
|
|
|
|
| 1285 |
"mlflow_run_id": str(results_df.loc[0, "run_id"]) if "run_id" in results_df.columns else None,
|
| 1286 |
}
|
| 1287 |
|
| 1288 |
+
with mlflow.start_run(run_name=f"{MLFLOW_EXPERIMENT_NAME}__runtime_historical") as runtime_run:
|
| 1289 |
+
mlflow.log_param("experience_name", MLFLOW_EXPERIMENT_NAME)
|
| 1290 |
+
mlflow.log_param("runtime_model_role", HISTORICAL_RUNTIME_MODEL_SPEC.role)
|
| 1291 |
+
mlflow.log_param("registered_model_name", HISTORICAL_RUNTIME_MODEL_SPEC.registered_model_name)
|
| 1292 |
+
mlflow.log_param("training_entrypoint", "scripts/experience_1.py")
|
| 1293 |
+
mlflow.log_param("target_year", context.target_year)
|
| 1294 |
+
mlflow.log_param("best_candidate_model_name", best_model_name)
|
| 1295 |
+
mlflow.log_metric("test_rmse", p1_metadata["metrics"]["test_rmse"])
|
| 1296 |
+
mlflow.log_metric("test_mae", p1_metadata["metrics"]["test_mae"])
|
| 1297 |
+
mlflow.log_metric("test_r2", p1_metadata["metrics"]["test_r2"])
|
| 1298 |
+
mlflow.log_metric("cv_val_rmse_mean", p1_metadata["metrics"]["cv_val_rmse_mean"])
|
| 1299 |
+
mlflow.log_metric("cv_val_mae_mean", p1_metadata["metrics"]["cv_val_mae_mean"])
|
| 1300 |
+
mlflow.log_metric("cv_val_r2_mean", p1_metadata["metrics"]["cv_val_r2_mean"])
|
| 1301 |
+
mlflow.log_artifact(str(paths.dataset_path))
|
| 1302 |
+
mlflow.log_artifact(str(paths.model_results_path))
|
| 1303 |
+
runtime_registration = log_and_register_sklearn_model(
|
| 1304 |
+
p1_pipeline,
|
| 1305 |
+
artifact_name=HISTORICAL_RUNTIME_MODEL_SPEC.registered_model_name,
|
| 1306 |
+
registered_model_name=HISTORICAL_RUNTIME_MODEL_SPEC.registered_model_name,
|
| 1307 |
+
model_metadata={
|
| 1308 |
+
"runtime_model_role": HISTORICAL_RUNTIME_MODEL_SPEC.role,
|
| 1309 |
+
"training_entrypoint": "scripts/experience_1.py",
|
| 1310 |
+
},
|
| 1311 |
+
)
|
| 1312 |
+
|
| 1313 |
+
p1_metadata.update(
|
| 1314 |
+
{
|
| 1315 |
+
"registered_model_name": runtime_registration["registered_model_name"],
|
| 1316 |
+
"registered_model_version": runtime_registration["registered_model_version"],
|
| 1317 |
+
"registered_model_run_id": runtime_registration["run_id"],
|
| 1318 |
+
"model_uri": runtime_registration["model_uri"],
|
| 1319 |
+
"registry_source_run_id": runtime_run.info.run_id,
|
| 1320 |
+
}
|
| 1321 |
+
)
|
| 1322 |
+
|
| 1323 |
joblib.dump(p1_pipeline, paths.p1_model_path)
|
| 1324 |
paths.p1_metadata_path.write_text(
|
| 1325 |
json.dumps(p1_metadata, indent=2, ensure_ascii=True),
|
|
|
|
| 1348 |
"""
|
| 1349 |
resolved_config_path = Path(config_path) if config_path is not None else DEFAULT_CONFIG_PATH
|
| 1350 |
config = load_preparation_config(resolved_config_path, ensure_dirs=True)
|
| 1351 |
+
resolved_tracking_uri = normalize_tracking_uri(tracking_uri or DEFAULT_MLFLOW_TRACKING_URI)
|
| 1352 |
paths = build_experience_paths(
|
| 1353 |
artifacts_dir=Path(config["ARTIFACTS_DIR"]),
|
| 1354 |
tracking_uri=resolved_tracking_uri,
|
|
|
|
| 1405 |
"best_test_rmse": float(results_df.loc[0, "test_rmse"]),
|
| 1406 |
"best_test_r2": float(results_df.loc[0, "test_r2"]),
|
| 1407 |
"tracked_models": list(results_df["model"]),
|
| 1408 |
+
"registered_model_name": p1_metadata.get("registered_model_name"),
|
| 1409 |
+
"registered_model_version": p1_metadata.get("registered_model_version"),
|
| 1410 |
+
"registered_model_run_id": p1_metadata.get("registered_model_run_id"),
|
| 1411 |
+
"model_uri": p1_metadata.get("model_uri"),
|
| 1412 |
"p1_metadata": p1_metadata,
|
| 1413 |
}
|
| 1414 |
|
scripts/mlflow_config.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Configuration MLflow commune aux scripts et a l'interface locale."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
PROJECT_ROOT = Path(__file__).resolve().parents[1]
|
| 9 |
+
MLFLOW_DB_PATH = (PROJECT_ROOT / "artifacts" / "mlflow.db").resolve()
|
| 10 |
+
MLFLOW_ARTIFACTS_DIR = (PROJECT_ROOT / "artifacts" / "mlruns").resolve()
|
| 11 |
+
DEFAULT_MLFLOW_TRACKING_URI = f"sqlite:///{MLFLOW_DB_PATH}"
|
| 12 |
+
|
| 13 |
+
EXPERIENCE_1_EXPERIMENT_NAME = "experience_1"
|
| 14 |
+
SIMULATION_RUNTIME_EXPERIMENT_NAME = "simulation_runtime"
|
| 15 |
+
FULL_PIPELINE_EXPERIMENT_NAME = "run_full_pipeline"
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def normalize_tracking_uri(tracking_uri: str | None = None) -> str:
|
| 19 |
+
"""Retourne un tracking URI MLflow stable depuis la racine du projet."""
|
| 20 |
+
resolved_uri = tracking_uri or DEFAULT_MLFLOW_TRACKING_URI
|
| 21 |
+
if not resolved_uri.startswith("sqlite:///"):
|
| 22 |
+
return resolved_uri
|
| 23 |
+
|
| 24 |
+
db_path = Path(resolved_uri.removeprefix("sqlite:///"))
|
| 25 |
+
if not db_path.is_absolute():
|
| 26 |
+
db_path = (PROJECT_ROOT / db_path).resolve()
|
| 27 |
+
db_path.parent.mkdir(parents=True, exist_ok=True)
|
| 28 |
+
return f"sqlite:///{db_path}"
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def ensure_mlflow_directories() -> None:
|
| 32 |
+
"""Cree les dossiers MLflow attendus par le projet."""
|
| 33 |
+
MLFLOW_DB_PATH.parent.mkdir(parents=True, exist_ok=True)
|
| 34 |
+
MLFLOW_ARTIFACTS_DIR.mkdir(parents=True, exist_ok=True)
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
def mlflow_artifacts_dir_for_tracking_uri(tracking_uri: str | None = None) -> Path:
|
| 38 |
+
"""Retourne la racine d'artefacts adaptee au tracking URI fourni."""
|
| 39 |
+
resolved_uri = normalize_tracking_uri(tracking_uri)
|
| 40 |
+
if resolved_uri == DEFAULT_MLFLOW_TRACKING_URI:
|
| 41 |
+
artifact_root = MLFLOW_ARTIFACTS_DIR
|
| 42 |
+
elif resolved_uri.startswith("sqlite:///"):
|
| 43 |
+
artifact_root = Path(resolved_uri.removeprefix("sqlite:///")).resolve().parent / "mlruns"
|
| 44 |
+
else:
|
| 45 |
+
artifact_root = MLFLOW_ARTIFACTS_DIR
|
| 46 |
+
|
| 47 |
+
artifact_root.mkdir(parents=True, exist_ok=True)
|
| 48 |
+
return artifact_root
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def experiment_artifact_location(experiment_name: str, tracking_uri: str | None = None) -> str:
|
| 52 |
+
"""Retourne l'emplacement d'artefacts standard d'une experience MLflow."""
|
| 53 |
+
artifact_dir = mlflow_artifacts_dir_for_tracking_uri(tracking_uri) / experiment_name
|
| 54 |
+
artifact_dir.mkdir(parents=True, exist_ok=True)
|
| 55 |
+
return artifact_dir.resolve().as_uri()
|
scripts/mlflow_logging.py
CHANGED
|
@@ -3,6 +3,7 @@
|
|
| 3 |
from __future__ import annotations
|
| 4 |
|
| 5 |
import json
|
|
|
|
| 6 |
from pathlib import Path
|
| 7 |
from typing import Any
|
| 8 |
|
|
@@ -10,6 +11,30 @@ import mlflow
|
|
| 10 |
import mlflow.pyfunc
|
| 11 |
import mlflow.sklearn
|
| 12 |
import pandas as pd
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
|
| 14 |
|
| 15 |
def sanitize_logged_model_name(raw_name: str) -> str:
|
|
@@ -53,6 +78,89 @@ def log_named_sklearn_model(estimator: Any, *, model_name: str) -> str:
|
|
| 53 |
return logged_model_name
|
| 54 |
|
| 55 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
class EvaluationPredictionLookupModel(mlflow.pyfunc.PythonModel):
|
| 57 |
"""MLflow pyfunc model exposing precomputed evaluation predictions by key lookup.
|
| 58 |
|
|
|
|
| 3 |
from __future__ import annotations
|
| 4 |
|
| 5 |
import json
|
| 6 |
+
import logging
|
| 7 |
from pathlib import Path
|
| 8 |
from typing import Any
|
| 9 |
|
|
|
|
| 11 |
import mlflow.pyfunc
|
| 12 |
import mlflow.sklearn
|
| 13 |
import pandas as pd
|
| 14 |
+
from mlflow.tracking import MlflowClient
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
SKLEARN_PICKLE_WARNING_PREFIX = (
|
| 18 |
+
"Saving scikit-learn models in the pickle or cloudpickle format requires exercising caution"
|
| 19 |
+
)
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
class _SuppressSklearnPickleWarning(logging.Filter):
|
| 23 |
+
"""Filtre le warning MLflow repete sur la serialisation pickle/cloudpickle."""
|
| 24 |
+
|
| 25 |
+
def filter(self, record: logging.LogRecord) -> bool:
|
| 26 |
+
"""Retourne `False` uniquement pour le warning verbeux attendu."""
|
| 27 |
+
return SKLEARN_PICKLE_WARNING_PREFIX not in record.getMessage()
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def configure_mlflow_sklearn_logging() -> None:
|
| 31 |
+
"""Rend les logs MLflow sklearn lisibles pendant les entrainements longs."""
|
| 32 |
+
logger = logging.getLogger("mlflow.sklearn")
|
| 33 |
+
if not any(isinstance(item, _SuppressSklearnPickleWarning) for item in logger.filters):
|
| 34 |
+
logger.addFilter(_SuppressSklearnPickleWarning())
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
configure_mlflow_sklearn_logging()
|
| 38 |
|
| 39 |
|
| 40 |
def sanitize_logged_model_name(raw_name: str) -> str:
|
|
|
|
| 78 |
return logged_model_name
|
| 79 |
|
| 80 |
|
| 81 |
+
def _registered_model_version_sort_key(version: Any) -> tuple[int, str]:
|
| 82 |
+
"""Produit une cle de tri robuste pour les versions du registry MLflow."""
|
| 83 |
+
raw_version = str(getattr(version, "version", version))
|
| 84 |
+
return (int(raw_version), raw_version) if raw_version.isdigit() else (-1, raw_version)
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
def resolve_registered_model_version_for_run(
|
| 88 |
+
*,
|
| 89 |
+
registered_model_name: str,
|
| 90 |
+
run_id: str,
|
| 91 |
+
tracking_uri: str | None = None,
|
| 92 |
+
) -> Any:
|
| 93 |
+
"""Recupere la version du registry associee a un run MLflow donne.
|
| 94 |
+
|
| 95 |
+
Args:
|
| 96 |
+
registered_model_name: Nom du registered model a inspecter.
|
| 97 |
+
run_id: Identifiant du run source.
|
| 98 |
+
tracking_uri: Tracking URI MLflow optionnel.
|
| 99 |
+
|
| 100 |
+
Returns:
|
| 101 |
+
Any: Objet version retourne par le client MLflow.
|
| 102 |
+
"""
|
| 103 |
+
client = MlflowClient(tracking_uri=tracking_uri)
|
| 104 |
+
versions = [
|
| 105 |
+
version
|
| 106 |
+
for version in client.search_model_versions(f"name = '{registered_model_name}'")
|
| 107 |
+
if str(getattr(version, "run_id", "")) == str(run_id)
|
| 108 |
+
]
|
| 109 |
+
if not versions:
|
| 110 |
+
raise RuntimeError(
|
| 111 |
+
"Registered model version could not be resolved for "
|
| 112 |
+
f"model={registered_model_name!r} and run_id={run_id!r}."
|
| 113 |
+
)
|
| 114 |
+
return max(versions, key=_registered_model_version_sort_key)
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
def log_and_register_sklearn_model(
|
| 118 |
+
estimator: Any,
|
| 119 |
+
*,
|
| 120 |
+
artifact_name: str,
|
| 121 |
+
registered_model_name: str,
|
| 122 |
+
model_metadata: dict[str, Any] | None = None,
|
| 123 |
+
await_registration_for: int = 300,
|
| 124 |
+
) -> dict[str, str]:
|
| 125 |
+
"""Journalise un estimateur et l'enregistre comme registered model MLflow.
|
| 126 |
+
|
| 127 |
+
Args:
|
| 128 |
+
estimator: Estimateur scikit-learn a enregistrer.
|
| 129 |
+
artifact_name: Nom de l'artefact de run.
|
| 130 |
+
registered_model_name: Nom du registered model cible.
|
| 131 |
+
model_metadata: Metadonnees MLflow optionnelles.
|
| 132 |
+
await_registration_for: Duree d'attente maximale de l'enregistrement.
|
| 133 |
+
|
| 134 |
+
Returns:
|
| 135 |
+
dict[str, str]: Contexte de registry resolu apres l'enregistrement.
|
| 136 |
+
"""
|
| 137 |
+
active_run = mlflow.active_run()
|
| 138 |
+
if active_run is None:
|
| 139 |
+
raise RuntimeError("An active MLflow run is required before registering a model.")
|
| 140 |
+
|
| 141 |
+
logged_model_name = sanitize_logged_model_name(artifact_name)
|
| 142 |
+
model_info = mlflow.sklearn.log_model(
|
| 143 |
+
estimator,
|
| 144 |
+
name=logged_model_name,
|
| 145 |
+
registered_model_name=registered_model_name,
|
| 146 |
+
metadata=model_metadata,
|
| 147 |
+
await_registration_for=await_registration_for,
|
| 148 |
+
)
|
| 149 |
+
resolved_version = resolve_registered_model_version_for_run(
|
| 150 |
+
registered_model_name=registered_model_name,
|
| 151 |
+
run_id=active_run.info.run_id,
|
| 152 |
+
tracking_uri=mlflow.get_tracking_uri(),
|
| 153 |
+
)
|
| 154 |
+
return {
|
| 155 |
+
"logged_model_name": logged_model_name,
|
| 156 |
+
"registered_model_name": registered_model_name,
|
| 157 |
+
"registered_model_version": str(resolved_version.version),
|
| 158 |
+
"model_uri": f"models:/{registered_model_name}/{resolved_version.version}",
|
| 159 |
+
"run_id": active_run.info.run_id,
|
| 160 |
+
"logged_model_uri": str(getattr(model_info, "model_uri", "")),
|
| 161 |
+
}
|
| 162 |
+
|
| 163 |
+
|
| 164 |
class EvaluationPredictionLookupModel(mlflow.pyfunc.PythonModel):
|
| 165 |
"""MLflow pyfunc model exposing precomputed evaluation predictions by key lookup.
|
| 166 |
|
scripts/prediction_adjustment.py
CHANGED
|
@@ -24,15 +24,19 @@ from sklearn.model_selection import train_test_split
|
|
| 24 |
from sklearn.pipeline import Pipeline
|
| 25 |
from sklearn.preprocessing import OneHotEncoder
|
| 26 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
from scripts.simulation_dataset import load_normalized_simulation_dataset
|
| 28 |
|
| 29 |
PROJECT_ROOT = Path(__file__).resolve().parents[1]
|
| 30 |
HISTORICAL_WIDE_DATASET_PATH = PROJECT_ROOT / "artifacts/experiments/experience_1/dataset_consolide_historique_colonnes.csv"
|
| 31 |
-
HISTORICAL_MODEL_PATH =
|
| 32 |
-
HISTORICAL_METADATA_PATH =
|
| 33 |
SIMULATION_DATASET_PATH = PROJECT_ROOT / "data/simulation/crop_yield.csv"
|
| 34 |
-
SIMULATION_MODEL_PATH =
|
| 35 |
-
SIMULATION_METADATA_PATH =
|
| 36 |
|
| 37 |
SEED = 42
|
| 38 |
SIMULATION_SAMPLE_SIZE = 200_000
|
|
|
|
| 24 |
from sklearn.pipeline import Pipeline
|
| 25 |
from sklearn.preprocessing import OneHotEncoder
|
| 26 |
|
| 27 |
+
from scripts.runtime_model_specs import (
|
| 28 |
+
HISTORICAL_RUNTIME_MODEL_SPEC,
|
| 29 |
+
SIMULATION_RUNTIME_MODEL_SPEC,
|
| 30 |
+
)
|
| 31 |
from scripts.simulation_dataset import load_normalized_simulation_dataset
|
| 32 |
|
| 33 |
PROJECT_ROOT = Path(__file__).resolve().parents[1]
|
| 34 |
HISTORICAL_WIDE_DATASET_PATH = PROJECT_ROOT / "artifacts/experiments/experience_1/dataset_consolide_historique_colonnes.csv"
|
| 35 |
+
HISTORICAL_MODEL_PATH = HISTORICAL_RUNTIME_MODEL_SPEC.output_model_path
|
| 36 |
+
HISTORICAL_METADATA_PATH = HISTORICAL_RUNTIME_MODEL_SPEC.output_metadata_path
|
| 37 |
SIMULATION_DATASET_PATH = PROJECT_ROOT / "data/simulation/crop_yield.csv"
|
| 38 |
+
SIMULATION_MODEL_PATH = SIMULATION_RUNTIME_MODEL_SPEC.output_model_path
|
| 39 |
+
SIMULATION_METADATA_PATH = SIMULATION_RUNTIME_MODEL_SPEC.output_metadata_path
|
| 40 |
|
| 41 |
SEED = 42
|
| 42 |
SIMULATION_SAMPLE_SIZE = 200_000
|
scripts/project_config.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
"""Centralise le chargement de la configuration de
|
| 2 |
|
| 3 |
from pathlib import Path
|
| 4 |
|
|
@@ -10,7 +10,7 @@ DEFAULT_CONFIG_PATH = PROJECT_ROOT / "config" / "project_paths.yaml"
|
|
| 10 |
|
| 11 |
|
| 12 |
def _resolve_path(raw_path: str) -> Path:
|
| 13 |
-
"""
|
| 14 |
path = Path(raw_path)
|
| 15 |
if path.is_absolute():
|
| 16 |
return path
|
|
@@ -18,13 +18,13 @@ def _resolve_path(raw_path: str) -> Path:
|
|
| 18 |
|
| 19 |
|
| 20 |
def ensure_preparation_directories(config: dict[str, object]) -> dict[str, object]:
|
| 21 |
-
"""
|
| 22 |
|
| 23 |
Args:
|
| 24 |
config: Configuration chargee depuis `project_paths.yaml`.
|
| 25 |
|
| 26 |
Returns:
|
| 27 |
-
dict[str, object]: Configuration
|
| 28 |
"""
|
| 29 |
artifacts_dir = config["ARTIFACTS_DIR"]
|
| 30 |
pca_artifacts_dir = config["PCA_ARTIFACTS_DIR"]
|
|
@@ -45,14 +45,14 @@ def load_preparation_config(
|
|
| 45 |
*,
|
| 46 |
ensure_dirs: bool = False,
|
| 47 |
) -> dict[str, object]:
|
| 48 |
-
"""Charge la configuration de
|
| 49 |
|
| 50 |
Args:
|
| 51 |
config_path: Chemin optionnel vers un fichier YAML de configuration.
|
| 52 |
-
ensure_dirs:
|
| 53 |
|
| 54 |
Returns:
|
| 55 |
-
dict[str, object]: Configuration
|
| 56 |
"""
|
| 57 |
path = config_path or DEFAULT_CONFIG_PATH
|
| 58 |
raw_config = yaml.safe_load(path.read_text())
|
|
|
|
| 1 |
+
"""Centralise le chargement de la configuration de préparation du projet."""
|
| 2 |
|
| 3 |
from pathlib import Path
|
| 4 |
|
|
|
|
| 10 |
|
| 11 |
|
| 12 |
def _resolve_path(raw_path: str) -> Path:
|
| 13 |
+
"""Résout un chemin de configuration relativement à la racine du projet."""
|
| 14 |
path = Path(raw_path)
|
| 15 |
if path.is_absolute():
|
| 16 |
return path
|
|
|
|
| 18 |
|
| 19 |
|
| 20 |
def ensure_preparation_directories(config: dict[str, object]) -> dict[str, object]:
|
| 21 |
+
"""Crée les dossiers cibles attendus par la préparation.
|
| 22 |
|
| 23 |
Args:
|
| 24 |
config: Configuration chargee depuis `project_paths.yaml`.
|
| 25 |
|
| 26 |
Returns:
|
| 27 |
+
dict[str, object]: Configuration inchangée, pour permettre le chainage.
|
| 28 |
"""
|
| 29 |
artifacts_dir = config["ARTIFACTS_DIR"]
|
| 30 |
pca_artifacts_dir = config["PCA_ARTIFACTS_DIR"]
|
|
|
|
| 45 |
*,
|
| 46 |
ensure_dirs: bool = False,
|
| 47 |
) -> dict[str, object]:
|
| 48 |
+
"""Charge la configuration de préparation depuis le fichier YAML du projet.
|
| 49 |
|
| 50 |
Args:
|
| 51 |
config_path: Chemin optionnel vers un fichier YAML de configuration.
|
| 52 |
+
ensure_dirs: Crée les dossiers cibles si `True`.
|
| 53 |
|
| 54 |
Returns:
|
| 55 |
+
dict[str, object]: Configuration normalisée avec des `Path` résolus.
|
| 56 |
"""
|
| 57 |
path = config_path or DEFAULT_CONFIG_PATH
|
| 58 |
raw_config = yaml.safe_load(path.read_text())
|
scripts/promote_registered_model.py
CHANGED
|
@@ -1,12 +1,9 @@
|
|
| 1 |
-
"""
|
| 2 |
-
|
| 3 |
-
Le script applique volontairement une selection stricte du modele source pour
|
| 4 |
-
eviter les exports ambigus quand le registre MLflow contient plusieurs modeles.
|
| 5 |
-
"""
|
| 6 |
|
| 7 |
from __future__ import annotations
|
| 8 |
|
| 9 |
import argparse
|
|
|
|
| 10 |
from datetime import datetime, timezone
|
| 11 |
import json
|
| 12 |
from pathlib import Path
|
|
@@ -17,45 +14,71 @@ import mlflow
|
|
| 17 |
import mlflow.sklearn
|
| 18 |
from mlflow.tracking import MlflowClient
|
| 19 |
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
|
| 26 |
|
| 27 |
def parse_args() -> argparse.Namespace:
|
| 28 |
-
"""Construit l'interface
|
| 29 |
parser = argparse.ArgumentParser(
|
| 30 |
description=(
|
| 31 |
-
"
|
| 32 |
-
"
|
| 33 |
)
|
| 34 |
)
|
| 35 |
parser.add_argument(
|
| 36 |
"--tracking-uri",
|
| 37 |
-
default=
|
| 38 |
help="Tracking URI MLflow. Par defaut: base SQLite locale du projet.",
|
| 39 |
)
|
| 40 |
parser.add_argument(
|
| 41 |
-
"--
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
default=None,
|
| 43 |
-
help="Nom du registered model
|
| 44 |
)
|
| 45 |
parser.add_argument(
|
| 46 |
-
"--version",
|
| 47 |
default=None,
|
| 48 |
-
help="Version du
|
| 49 |
)
|
| 50 |
parser.add_argument(
|
| 51 |
-
"--
|
| 52 |
-
default=
|
| 53 |
-
help="
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
)
|
| 55 |
parser.add_argument(
|
| 56 |
-
"--
|
| 57 |
-
|
| 58 |
-
help="
|
| 59 |
)
|
| 60 |
return parser.parse_args()
|
| 61 |
|
|
@@ -64,73 +87,114 @@ def project_relative_path(path: Path) -> str:
|
|
| 64 |
"""Retourne un chemin relatif au projet si possible."""
|
| 65 |
resolved = path.resolve()
|
| 66 |
try:
|
| 67 |
-
return str(resolved.relative_to(
|
| 68 |
except ValueError:
|
| 69 |
return str(resolved)
|
| 70 |
|
| 71 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
def normalize_registered_model_names(models: list[Any]) -> list[str]:
|
| 73 |
"""Extrait et trie les noms de registered models MLflow."""
|
| 74 |
return sorted(str(model.name) for model in models)
|
| 75 |
|
| 76 |
|
| 77 |
-
def
|
| 78 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
|
| 80 |
-
Args:
|
| 81 |
-
available_names: Noms presents dans le registre MLflow.
|
| 82 |
-
requested_name: Nom explicitement demande, si fourni.
|
| 83 |
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
if requested_name not in available_names:
|
| 89 |
-
available = ", ".join(available_names) if available_names else "
|
| 90 |
raise ValueError(
|
| 91 |
-
f"
|
| 92 |
-
f"
|
| 93 |
)
|
| 94 |
return requested_name
|
| 95 |
|
| 96 |
-
|
|
|
|
| 97 |
raise ValueError(
|
| 98 |
-
"
|
| 99 |
-
"
|
| 100 |
)
|
| 101 |
-
|
| 102 |
-
if len(available_names) > 1:
|
| 103 |
-
available = ", ".join(available_names)
|
| 104 |
raise ValueError(
|
| 105 |
-
"
|
| 106 |
-
f"
|
| 107 |
-
f"Modeles disponibles: {available}."
|
| 108 |
)
|
| 109 |
-
|
| 110 |
-
return available_names[0]
|
| 111 |
|
| 112 |
|
| 113 |
-
def _version_sort_key(version:
|
| 114 |
"""Produit une cle de tri robuste pour les versions MLflow."""
|
| 115 |
-
|
| 116 |
-
return (int(
|
| 117 |
|
| 118 |
|
| 119 |
-
def
|
| 120 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 121 |
if requested_version is not None:
|
| 122 |
for version in versions:
|
| 123 |
if str(version.version) == str(requested_version):
|
| 124 |
return version
|
| 125 |
-
available = ", ".join(str(version.version) for version in versions) if versions else "
|
| 126 |
raise ValueError(
|
| 127 |
-
f"
|
|
|
|
|
|
|
| 128 |
)
|
| 129 |
|
| 130 |
if not versions:
|
| 131 |
-
raise ValueError(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 132 |
|
| 133 |
-
return
|
| 134 |
|
| 135 |
|
| 136 |
def read_json_if_exists(path: Path) -> dict[str, Any]:
|
|
@@ -140,48 +204,49 @@ def read_json_if_exists(path: Path) -> dict[str, Any]:
|
|
| 140 |
return json.loads(path.read_text(encoding="utf-8"))
|
| 141 |
|
| 142 |
|
| 143 |
-
def
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 156 |
|
| 157 |
|
| 158 |
def build_export_metadata(
|
| 159 |
*,
|
| 160 |
existing_metadata: dict[str, Any],
|
|
|
|
| 161 |
registered_model_name: str,
|
| 162 |
model_version: Any,
|
| 163 |
tracking_uri: str,
|
| 164 |
model_output_path: Path,
|
|
|
|
| 165 |
source_run: Any | None,
|
| 166 |
) -> dict[str, Any]:
|
| 167 |
-
"""Construit les metadonnees de tracabilite de l'export
|
| 168 |
-
|
| 169 |
-
Args:
|
| 170 |
-
existing_metadata: Metadonnees deja presentes sur disque.
|
| 171 |
-
registered_model_name: Nom du registered model exporte.
|
| 172 |
-
model_version: Version MLflow exportee.
|
| 173 |
-
tracking_uri: Tracking URI source.
|
| 174 |
-
model_output_path: Chemin du joblib genere.
|
| 175 |
-
source_run: Run MLflow source, si disponible.
|
| 176 |
-
|
| 177 |
-
Returns:
|
| 178 |
-
dict[str, Any]: Metadonnees consolidees de l'export.
|
| 179 |
-
"""
|
| 180 |
metadata = dict(existing_metadata)
|
| 181 |
-
|
| 182 |
metadata.update(
|
| 183 |
{
|
| 184 |
-
"
|
| 185 |
"registered_model_name": registered_model_name,
|
| 186 |
"registered_model_version": str(model_version.version),
|
| 187 |
"registered_model_stage": str(getattr(model_version, "current_stage", "None") or "None"),
|
|
@@ -191,8 +256,10 @@ def build_export_metadata(
|
|
| 191 |
"tracking_uri": tracking_uri,
|
| 192 |
"exported_at_utc": datetime.now(timezone.utc).isoformat(),
|
| 193 |
"artifact_path": project_relative_path(model_output_path),
|
| 194 |
-
"
|
| 195 |
-
"
|
|
|
|
|
|
|
| 196 |
}
|
| 197 |
)
|
| 198 |
|
|
@@ -207,69 +274,163 @@ def build_export_metadata(
|
|
| 207 |
return metadata
|
| 208 |
|
| 209 |
|
| 210 |
-
def
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
estimator = mlflow.sklearn.load_model(model_uri)
|
| 221 |
-
model_output_path.parent.mkdir(parents=True, exist_ok=True)
|
| 222 |
-
joblib.dump(estimator, model_output_path)
|
| 223 |
-
|
| 224 |
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
|
| 230 |
-
metadata_output_path = Path(args.output_metadata_path).resolve()
|
| 231 |
|
| 232 |
-
mlflow.set_tracking_uri(tracking_uri)
|
| 233 |
-
client = MlflowClient(tracking_uri=tracking_uri)
|
| 234 |
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 240 |
)
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
export_registered_model(
|
| 247 |
-
tracking_uri=tracking_uri,
|
| 248 |
registered_model_name=registered_model_name,
|
| 249 |
-
|
| 250 |
-
|
| 251 |
)
|
|
|
|
| 252 |
|
| 253 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 254 |
export_metadata = build_export_metadata(
|
| 255 |
existing_metadata=existing_metadata,
|
|
|
|
| 256 |
registered_model_name=registered_model_name,
|
| 257 |
model_version=selected_version,
|
| 258 |
tracking_uri=tracking_uri,
|
| 259 |
-
model_output_path=
|
|
|
|
| 260 |
source_run=source_run,
|
| 261 |
)
|
| 262 |
-
|
| 263 |
-
|
| 264 |
json.dumps(json_ready(export_metadata), indent=2, ensure_ascii=True),
|
| 265 |
encoding="utf-8",
|
| 266 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 267 |
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 273 |
|
| 274 |
|
| 275 |
if __name__ == "__main__":
|
|
|
|
| 1 |
+
"""Promeut les deux registered models runtime depuis MLflow vers le disque."""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
from __future__ import annotations
|
| 4 |
|
| 5 |
import argparse
|
| 6 |
+
from dataclasses import replace
|
| 7 |
from datetime import datetime, timezone
|
| 8 |
import json
|
| 9 |
from pathlib import Path
|
|
|
|
| 14 |
import mlflow.sklearn
|
| 15 |
from mlflow.tracking import MlflowClient
|
| 16 |
|
| 17 |
+
from scripts.mlflow_config import normalize_tracking_uri
|
| 18 |
+
from scripts.runtime_model_specs import (
|
| 19 |
+
DEFAULT_MLFLOW_TRACKING_URI,
|
| 20 |
+
DEFAULT_MODELS_DIR,
|
| 21 |
+
HISTORICAL_RUNTIME_MODEL_SPEC,
|
| 22 |
+
RuntimeModelSpec,
|
| 23 |
+
SIMULATION_RUNTIME_MODEL_SPEC,
|
| 24 |
+
)
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
REQUIRED_RUNTIME_METADATA_FIELDS = {
|
| 28 |
+
"runtime_model_role",
|
| 29 |
+
"registered_model_name",
|
| 30 |
+
"registered_model_version",
|
| 31 |
+
"registered_model_run_id",
|
| 32 |
+
"model_uri",
|
| 33 |
+
"tracking_uri",
|
| 34 |
+
"exported_at_utc",
|
| 35 |
+
"artifact_path",
|
| 36 |
+
"metadata_path",
|
| 37 |
+
}
|
| 38 |
|
| 39 |
|
| 40 |
def parse_args() -> argparse.Namespace:
    """Build and evaluate the command-line interface of the promotion script.

    Returns:
        argparse.Namespace: Parsed options (`tracking_uri`, `models_dir`, the
        per-role registered model names and versions, and the `json` flag).
    """
    cli = argparse.ArgumentParser(
        description=(
            "Promote the two MLflow registered models used by the FastAPI runtime "
            "and export them to artifacts/models/."
        )
    )
    cli.add_argument(
        "--tracking-uri",
        default=DEFAULT_MLFLOW_TRACKING_URI,
        help="Tracking URI MLflow. Par defaut: base SQLite locale du projet.",
    )
    cli.add_argument(
        "--models-dir",
        default=str(DEFAULT_MODELS_DIR),
        help="Dossier cible pour les artefacts runtime exportes.",
    )

    # The four selector options share the same shape (optional, default None),
    # so they are registered from a table; the order drives the --help output.
    selector_options = (
        ("--historical-registered-model", "Nom du registered model historique a exporter."),
        ("--historical-version", "Version MLflow du modele historique a exporter."),
        ("--simulation-registered-model", "Nom du registered model local/simulation a exporter."),
        ("--simulation-version", "Version MLflow du modele local/simulation a exporter."),
    )
    for option_flag, option_help in selector_options:
        cli.add_argument(option_flag, default=None, help=option_help)

    cli.add_argument(
        "--json",
        action="store_true",
        help="Imprime le resume de promotion au format JSON.",
    )
    return cli.parse_args()
|
| 84 |
|
|
|
|
| 87 |
"""Retourne un chemin relatif au projet si possible."""
|
| 88 |
resolved = path.resolve()
|
| 89 |
try:
|
| 90 |
+
return str(resolved.relative_to(Path(__file__).resolve().parents[1]))
|
| 91 |
except ValueError:
|
| 92 |
return str(resolved)
|
| 93 |
|
| 94 |
|
| 95 |
+
def json_ready(value: Any) -> Any:
    """Recursively convert a Python value into a JSON-serialisable one.

    Args:
        value: Arbitrary value, possibly nested in dicts, lists or tuples.

    Returns:
        Any: Dicts with stringified keys, lists for both lists and tuples,
        `str` for `Path`, ISO-8601 text for `datetime`; any other value is
        returned unchanged.
    """
    if isinstance(value, dict):
        converted = {}
        for key, item in value.items():
            converted[str(key)] = json_ready(item)
        return converted
    if isinstance(value, (list, tuple)):
        # Tuples are flattened to lists, as JSON has a single array type.
        return [json_ready(item) for item in value]
    if isinstance(value, Path):
        return str(value)
    if isinstance(value, datetime):
        return value.isoformat()
    return value
|
| 108 |
+
|
| 109 |
+
|
| 110 |
def normalize_registered_model_names(models: list[Any]) -> list[str]:
    """Return the names of the given registered models, stringified and sorted.

    Args:
        models: Objects exposing a `name` attribute.

    Returns:
        list[str]: Names in ascending order.
    """
    names = [str(entry.name) for entry in models]
    names.sort()
    return names
|
| 113 |
|
| 114 |
|
| 115 |
+
def with_models_dir(spec: RuntimeModelSpec, models_dir: Path) -> RuntimeModelSpec:
    """Return a copy of `spec` whose output files live under `models_dir`.

    Only the parent directory changes: the file names of the model and
    metadata outputs are kept as they are in `spec`.

    Args:
        spec: Runtime model specification to copy.
        models_dir: Directory that should hold the exported files.

    Returns:
        RuntimeModelSpec: New specification with both output paths relocated.
    """
    relocated_model_path = models_dir / spec.output_model_path.name
    relocated_metadata_path = models_dir / spec.output_metadata_path.name
    return replace(
        spec,
        output_model_path=relocated_model_path,
        output_metadata_path=relocated_metadata_path,
    )
|
| 122 |
|
|
|
|
|
|
|
|
|
|
| 123 |
|
| 124 |
+
def resolve_registered_model_name_for_role(
    *,
    role_spec: RuntimeModelSpec,
    available_names: list[str],
    requested_name: str | None = None,
) -> str:
    """Pick the registered model name to promote for one runtime role.

    Args:
        role_spec: Specification of the role; its `role` and
            `registered_model_name` attributes are read.
        available_names: Names currently present in the MLflow registry.
        requested_name: Explicit name to use instead of the role default.

    Returns:
        str: `requested_name` when given and available, otherwise the entry of
        `available_names` equal to `role_spec.registered_model_name`.

    Raises:
        ValueError: If the requested name is unavailable, or if the role's
            default name matches zero or several available names.
    """
    if requested_name is not None:
        if requested_name in available_names:
            return requested_name
        if available_names:
            available = ", ".join(available_names)
        else:
            available = "none"
        raise ValueError(
            f"Requested registered model {requested_name!r} for role "
            f"{role_spec.role!r} was not found. Available registered models: {available}."
        )

    candidates = [
        candidate
        for candidate in available_names
        if candidate == role_spec.registered_model_name
    ]
    if len(candidates) == 1:
        return candidates[0]
    if candidates:
        raise ValueError(
            f"Multiple candidate registered models found for role {role_spec.role!r}. "
            f"Please pass --{role_spec.role}-registered-model."
        )
    raise ValueError(
        f"No MLflow registered model found for role {role_spec.role!r}. "
        f"Expected one of: {role_spec.registered_model_name}."
    )
|
|
|
|
| 152 |
|
| 153 |
|
| 154 |
+
def _version_sort_key(version: Any) -> tuple[int, str]:
|
| 155 |
"""Produit une cle de tri robuste pour les versions MLflow."""
|
| 156 |
+
raw_value = str(getattr(version, "version", version))
|
| 157 |
+
return (int(raw_value), raw_value) if raw_value.isdigit() else (-1, raw_value)
|
| 158 |
|
| 159 |
|
| 160 |
+
def resolve_model_version_for_role(
    versions: list[Any],
    *,
    role_spec: RuntimeModelSpec,
    registered_model_name: str,
    requested_version: str | None = None,
    allow_latest_version: bool = False,
) -> Any:
    """Select the model version to export for one runtime role.

    Args:
        versions: Candidate version objects exposing a `version` attribute.
        role_spec: Specification of the role; only `role` is read, for messages.
        registered_model_name: Name of the registered model, for messages.
        requested_version: Explicit version to select, compared as text.
        allow_latest_version: When no version is requested, pick the highest
            one instead of failing on ambiguity.

    Returns:
        Any: The matching version object.

    Raises:
        ValueError: If the requested version does not exist, if there is no
            version at all, or if several versions exist without an explicit
            choice and `allow_latest_version` is false.
    """
    if requested_version is not None:
        wanted = str(requested_version)
        for candidate in versions:
            if str(candidate.version) == wanted:
                return candidate
        if versions:
            available = ", ".join(str(candidate.version) for candidate in versions)
        else:
            available = "none"
        raise ValueError(
            f"Requested version {requested_version!r} for role {role_spec.role!r} and "
            f"registered model {registered_model_name!r} does not exist. "
            f"Available versions: {available}."
        )

    if not versions:
        raise ValueError(
            f"Registered model exists but no version could be resolved for role "
            f"{role_spec.role!r}."
        )

    if allow_latest_version:
        return max(versions, key=_version_sort_key)

    if len(versions) == 1:
        return versions[0]

    # Several versions and no explicit choice: refuse to guess.
    ordered_versions = sorted(versions, key=_version_sort_key)
    available = ", ".join(str(candidate.version) for candidate in ordered_versions)
    raise ValueError(
        f"Multiple versions are available for role {role_spec.role!r} and "
        f"registered model {registered_model_name!r}. "
        f"Please pass --{role_spec.role}-version. Available versions: {available}."
    )
|
| 198 |
|
| 199 |
|
| 200 |
def read_json_if_exists(path: Path) -> dict[str, Any]:
|
|
|
|
| 204 |
return json.loads(path.read_text(encoding="utf-8"))
|
| 205 |
|
| 206 |
|
| 207 |
+
def export_registered_model(
    *,
    tracking_uri: str,
    registered_model_name: str,
    model_version: Any,
    model_output_path: Path,
) -> None:
    """Load a registered model from MLflow and dump it locally with `joblib`.

    Args:
        tracking_uri: MLflow tracking URI to activate before loading.
        registered_model_name: Name of the registered model to load.
        model_version: Version object; its `version` attribute is used in the URI.
        model_output_path: Destination file; parent directories are created.
    """
    mlflow.set_tracking_uri(tracking_uri)
    loaded_estimator = mlflow.sklearn.load_model(
        f"models:/{registered_model_name}/{model_version.version}"
    )
    target_directory = model_output_path.parent
    target_directory.mkdir(parents=True, exist_ok=True)
    joblib.dump(loaded_estimator, model_output_path)
|
| 220 |
+
|
| 221 |
+
|
| 222 |
+
def validate_exported_artifact(model_output_path: Path) -> None:
    """Check that the exported joblib artifact exists and can be reloaded.

    Args:
        model_output_path: Path of the artifact to verify.

    Raises:
        RuntimeError: If the file does not exist or `joblib.load` fails on it.
    """
    failure_message = f"Exported artifact is missing or cannot be loaded: {model_output_path}"
    if not model_output_path.exists():
        raise RuntimeError(failure_message)
    try:
        joblib.load(model_output_path)
    except Exception as exc:  # pragma: no cover - defensive branch
        raise RuntimeError(failure_message) from exc
|
| 232 |
|
| 233 |
|
| 234 |
def build_export_metadata(
|
| 235 |
*,
|
| 236 |
existing_metadata: dict[str, Any],
|
| 237 |
+
role_spec: RuntimeModelSpec,
|
| 238 |
registered_model_name: str,
|
| 239 |
model_version: Any,
|
| 240 |
tracking_uri: str,
|
| 241 |
model_output_path: Path,
|
| 242 |
+
metadata_output_path: Path,
|
| 243 |
source_run: Any | None,
|
| 244 |
) -> dict[str, Any]:
|
| 245 |
+
"""Construit les metadonnees de tracabilite de l'export runtime."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 246 |
metadata = dict(existing_metadata)
|
|
|
|
| 247 |
metadata.update(
|
| 248 |
{
|
| 249 |
+
"runtime_model_role": role_spec.role,
|
| 250 |
"registered_model_name": registered_model_name,
|
| 251 |
"registered_model_version": str(model_version.version),
|
| 252 |
"registered_model_stage": str(getattr(model_version, "current_stage", "None") or "None"),
|
|
|
|
| 256 |
"tracking_uri": tracking_uri,
|
| 257 |
"exported_at_utc": datetime.now(timezone.utc).isoformat(),
|
| 258 |
"artifact_path": project_relative_path(model_output_path),
|
| 259 |
+
"metadata_path": project_relative_path(metadata_output_path),
|
| 260 |
+
"output_path": project_relative_path(model_output_path),
|
| 261 |
+
"output_metadata_path": project_relative_path(metadata_output_path),
|
| 262 |
+
"role": role_spec.role,
|
| 263 |
}
|
| 264 |
)
|
| 265 |
|
|
|
|
| 274 |
return metadata
|
| 275 |
|
| 276 |
|
| 277 |
+
def validate_runtime_metadata(metadata: dict[str, Any], *, role_spec: RuntimeModelSpec) -> None:
    """Check that exported metadata is complete and matches the expected role.

    Args:
        metadata: Export metadata to verify.
        role_spec: Specification of the role; only `role` is read.

    Raises:
        RuntimeError: If any field of `REQUIRED_RUNTIME_METADATA_FIELDS` is
            absent or falsy, or if `runtime_model_role` differs from the role.
    """
    absent_fields = [
        required_field
        for required_field in REQUIRED_RUNTIME_METADATA_FIELDS
        if not metadata.get(required_field)
    ]
    # Sorted so the error message is stable regardless of set iteration order.
    absent_fields.sort()
    if absent_fields:
        raise RuntimeError(
            f"Metadata validation failed for role {role_spec.role!r}. "
            f"Missing fields: {', '.join(absent_fields)}."
        )

    if metadata.get("runtime_model_role") == role_spec.role:
        return
    raise RuntimeError(
        f"Metadata validation failed for role {role_spec.role!r}. "
        f"Unexpected runtime_model_role={metadata.get('runtime_model_role')!r}."
    )
|
|
|
|
| 293 |
|
|
|
|
|
|
|
| 294 |
|
| 295 |
+
def promote_single_registered_model(
    *,
    client: MlflowClient,
    tracking_uri: str,
    role_spec: RuntimeModelSpec,
    available_names: list[str],
    requested_name: str | None = None,
    requested_version: str | None = None,
    allow_latest_version: bool = False,
) -> dict[str, Any]:
    """Promote one runtime registered model from MLflow to local files.

    The model name and version are resolved, the model is exported to
    `role_spec.output_model_path`, the artifact is reloaded as a check, and
    the traceability metadata is validated and then written to
    `role_spec.output_metadata_path`.

    Args:
        client: MLflow client used to list versions and fetch the source run.
        tracking_uri: MLflow tracking URI used for the export.
        role_spec: Specification of the runtime role being promoted.
        available_names: Registered model names present in the registry.
        requested_name: Explicit registered model name, if any.
        requested_version: Explicit version, if any.
        allow_latest_version: Pick the highest version when none is requested.

    Returns:
        dict[str, Any]: Summary with the role, registered model name, version,
        run id, model URI and the project-relative artifact and metadata paths.

    Raises:
        RuntimeError: If the export fails; validation helpers may also raise it.
    """
    registered_model_name = resolve_registered_model_name_for_role(
        role_spec=role_spec,
        available_names=available_names,
        requested_name=requested_name,
    )
    versions = list(client.search_model_versions(f"name = '{registered_model_name}'"))
    selected_version = resolve_model_version_for_role(
        versions,
        role_spec=role_spec,
        registered_model_name=registered_model_name,
        requested_version=requested_version,
        allow_latest_version=allow_latest_version,
    )
    # The source run is optional: a version without run_id yields None.
    source_run = client.get_run(selected_version.run_id) if getattr(selected_version, "run_id", None) else None

    try:
        export_registered_model(
            tracking_uri=tracking_uri,
            registered_model_name=registered_model_name,
            model_version=selected_version,
            model_output_path=role_spec.output_model_path,
        )
    except Exception as exc:  # pragma: no cover - defensive branch
        raise RuntimeError(
            f"Export failed for role {role_spec.role!r} and model {registered_model_name!r}."
        ) from exc

    validate_exported_artifact(role_spec.output_model_path)
    existing_metadata = read_json_if_exists(role_spec.output_metadata_path)
    export_metadata = build_export_metadata(
        existing_metadata=existing_metadata,
        role_spec=role_spec,
        registered_model_name=registered_model_name,
        model_version=selected_version,
        tracking_uri=tracking_uri,
        model_output_path=role_spec.output_model_path,
        metadata_output_path=role_spec.output_metadata_path,
        source_run=source_run,
    )
    # Validate BEFORE persisting so that an incoherent metadata file is never
    # written to disk when validation fails.
    validate_runtime_metadata(export_metadata, role_spec=role_spec)
    role_spec.output_metadata_path.parent.mkdir(parents=True, exist_ok=True)
    role_spec.output_metadata_path.write_text(
        json.dumps(json_ready(export_metadata), indent=2, ensure_ascii=True),
        encoding="utf-8",
    )

    return {
        "role": role_spec.role,
        "registered_model_name": registered_model_name,
        "registered_model_version": str(selected_version.version),
        "registered_model_run_id": getattr(selected_version, "run_id", None),
        "model_uri": f"models:/{registered_model_name}/{selected_version.version}",
        "artifact_path": project_relative_path(role_spec.output_model_path),
        "metadata_path": project_relative_path(role_spec.output_metadata_path),
    }
|
| 361 |
+
|
| 362 |
+
|
| 363 |
+
def promote_registered_models(
    *,
    tracking_uri: str = DEFAULT_MLFLOW_TRACKING_URI,
    models_dir: str | Path = DEFAULT_MODELS_DIR,
    historical_registered_model: str | None = None,
    historical_version: str | None = None,
    simulation_registered_model: str | None = None,
    simulation_version: str | None = None,
    allow_latest_version: bool = False,
) -> dict[str, Any]:
    """Promote both runtime registered models from MLflow.

    The historical model is promoted first, then the simulation model.

    Args:
        tracking_uri: MLflow tracking URI, normalised before use.
        models_dir: Directory receiving the exported artifacts.
        historical_registered_model: Explicit name for the historical model.
        historical_version: Explicit version for the historical model.
        simulation_registered_model: Explicit name for the simulation model.
        simulation_version: Explicit version for the simulation model.
        allow_latest_version: Pick the highest version when none is requested.

    Returns:
        dict[str, Any]: The tracking URI, the project-relative models
        directory, and one summary per role under `historical` and `simulation`.
    """
    tracking_uri = normalize_tracking_uri(tracking_uri)
    resolved_models_dir = Path(models_dir).resolve()

    # One entry per role: (summary key, relocated spec, requested name, requested version).
    role_requests = (
        (
            "historical",
            with_models_dir(HISTORICAL_RUNTIME_MODEL_SPEC, resolved_models_dir),
            historical_registered_model,
            historical_version,
        ),
        (
            "simulation",
            with_models_dir(SIMULATION_RUNTIME_MODEL_SPEC, resolved_models_dir),
            simulation_registered_model,
            simulation_version,
        ),
    )

    mlflow.set_tracking_uri(tracking_uri)
    client = MlflowClient(tracking_uri=tracking_uri)
    available_names = normalize_registered_model_names(list(client.search_registered_models()))

    role_summaries = {}
    for summary_key, relocated_spec, wanted_name, wanted_version in role_requests:
        role_summaries[summary_key] = promote_single_registered_model(
            client=client,
            tracking_uri=tracking_uri,
            role_spec=relocated_spec,
            available_names=available_names,
            requested_name=wanted_name,
            requested_version=wanted_version,
            allow_latest_version=allow_latest_version,
        )

    return {
        "tracking_uri": tracking_uri,
        "models_dir": project_relative_path(resolved_models_dir),
        **role_summaries,
    }
|
| 408 |
|
| 409 |
+
|
| 410 |
+
def main() -> None:
    """Run the runtime promotion from the command line and print a summary.

    With `--json` the full summary is printed as indented JSON; otherwise one
    `[promotion]` line is printed per role.
    """
    cli_options = parse_args()
    summary = promote_registered_models(
        tracking_uri=str(cli_options.tracking_uri),
        models_dir=cli_options.models_dir,
        historical_registered_model=cli_options.historical_registered_model,
        historical_version=cli_options.historical_version,
        simulation_registered_model=cli_options.simulation_registered_model,
        simulation_version=cli_options.simulation_version,
    )

    if cli_options.json:
        print(json.dumps(summary, indent=2, ensure_ascii=True))
    else:
        for role_name in ("historical", "simulation"):
            role_summary = summary[role_name]
            fields = (
                f"role={role_summary['role']}",
                f"registered_model={role_summary['registered_model_name']}",
                f"version={role_summary['registered_model_version']}",
                f"artifact={role_summary['artifact_path']}",
            )
            print("[promotion] " + " ".join(fields))
|
| 434 |
|
| 435 |
|
| 436 |
if __name__ == "__main__":
|
scripts/run_full_pipeline.py
CHANGED
|
@@ -1,31 +1,122 @@
|
|
| 1 |
-
"""Orchestre la chaine locale
|
| 2 |
|
| 3 |
from __future__ import annotations
|
| 4 |
|
| 5 |
import argparse
|
| 6 |
import json
|
|
|
|
|
|
|
| 7 |
from pathlib import Path
|
| 8 |
import sys
|
| 9 |
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
PROJECT_ROOT = Path(__file__).resolve().parents[1]
|
| 12 |
if str(PROJECT_ROOT) not in sys.path:
|
| 13 |
sys.path.insert(0, str(PROJECT_ROOT))
|
| 14 |
|
| 15 |
-
from scripts.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
from scripts.run_preparation import run_preparation
|
| 17 |
from scripts.train_historical_model import train_historical_model
|
| 18 |
from scripts.train_simulation_model import train_simulation_model
|
| 19 |
from scripts.validate_runtime import validate_runtime
|
| 20 |
|
| 21 |
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
|
| 30 |
|
| 31 |
def parse_args() -> argparse.Namespace:
|
|
@@ -38,21 +129,11 @@ def parse_args() -> argparse.Namespace:
|
|
| 38 |
action="store_true",
|
| 39 |
help="Reuse the existing preparation outputs instead of re-executing preparation.ipynb.",
|
| 40 |
)
|
| 41 |
-
parser.add_argument(
|
| 42 |
-
"--run-experience-2",
|
| 43 |
-
action="store_true",
|
| 44 |
-
help="Optionally execute the abandoned complementary temporal notebook.",
|
| 45 |
-
)
|
| 46 |
parser.add_argument(
|
| 47 |
"--skip-runtime-validation",
|
| 48 |
action="store_true",
|
| 49 |
help="Skip the final smoke test against the runtime service.",
|
| 50 |
)
|
| 51 |
-
parser.add_argument(
|
| 52 |
-
"--run-experience-3",
|
| 53 |
-
action="store_true",
|
| 54 |
-
help="Also execute notebooks/experience_3.ipynb after the artifacts are rebuilt.",
|
| 55 |
-
)
|
| 56 |
parser.add_argument(
|
| 57 |
"--reuse-simulation-artifact",
|
| 58 |
action="store_true",
|
|
@@ -75,6 +156,21 @@ def parse_args() -> argparse.Namespace:
|
|
| 75 |
default="python3",
|
| 76 |
help="Jupyter kernel used to execute notebook-backed stages.",
|
| 77 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
parser.add_argument(
|
| 79 |
"--json",
|
| 80 |
action="store_true",
|
|
@@ -86,74 +182,78 @@ def parse_args() -> argparse.Namespace:
|
|
| 86 |
def run_full_pipeline(
|
| 87 |
*,
|
| 88 |
skip_preparation: bool = False,
|
| 89 |
-
run_experience_2: bool = False,
|
| 90 |
skip_runtime_validation: bool = False,
|
| 91 |
-
run_experience_3: bool = False,
|
| 92 |
reuse_simulation_artifact: bool = False,
|
| 93 |
simulation_sample_size: int = 200_000,
|
| 94 |
notebook_timeout_seconds: int = 7200,
|
| 95 |
kernel_name: str = "python3",
|
|
|
|
|
|
|
|
|
|
| 96 |
) -> dict[str, object]:
|
| 97 |
"""Execute les principales etapes de regeneration des artefacts.
|
| 98 |
|
| 99 |
Args:
|
| 100 |
skip_preparation: Saute `preparation.ipynb` si les sorties existent deja.
|
| 101 |
-
run_experience_2: Execute explicitement le notebook temporel abandonne.
|
| 102 |
skip_runtime_validation: Saute le smoke test final.
|
| 103 |
-
run_experience_3: Execute aussi le notebook de verification de stack.
|
| 104 |
reuse_simulation_artifact: Reutilise le modele local existant au lieu de le reentrainer.
|
| 105 |
simulation_sample_size: Taille d'echantillon pour le modele local.
|
| 106 |
notebook_timeout_seconds: Timeout applique a chaque notebook execute.
|
| 107 |
kernel_name: Kernel Jupyter a utiliser.
|
|
|
|
|
|
|
|
|
|
| 108 |
|
| 109 |
Returns:
|
| 110 |
dict[str, object]: Resume des etapes executees et des artefacts verifies.
|
| 111 |
"""
|
| 112 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 113 |
|
| 114 |
if not skip_preparation:
|
| 115 |
results["preparation"] = run_preparation(
|
| 116 |
timeout_seconds=notebook_timeout_seconds,
|
| 117 |
kernel_name=kernel_name,
|
| 118 |
)
|
|
|
|
|
|
|
| 119 |
|
| 120 |
results["historical_model"] = train_historical_model(
|
|
|
|
| 121 |
cv_splits=4,
|
| 122 |
)
|
| 123 |
|
| 124 |
-
if run_experience_2:
|
| 125 |
-
print(f"[experience_2] Executing {relative_to_project(EXPERIENCE_2_NOTEBOOK_PATH)}")
|
| 126 |
-
execute_notebook(
|
| 127 |
-
EXPERIENCE_2_NOTEBOOK_PATH,
|
| 128 |
-
timeout_seconds=notebook_timeout_seconds,
|
| 129 |
-
kernel_name=kernel_name,
|
| 130 |
-
)
|
| 131 |
-
resolved_outputs = ensure_paths_exist(EXPERIENCE_2_OUTPUTS, label="experience_2 outputs")
|
| 132 |
-
print("[experience_2] Outputs validated")
|
| 133 |
-
results["experience_2"] = {
|
| 134 |
-
"notebook": relative_to_project(EXPERIENCE_2_NOTEBOOK_PATH),
|
| 135 |
-
"outputs": [relative_to_project(path) for path in resolved_outputs],
|
| 136 |
-
}
|
| 137 |
-
|
| 138 |
results["simulation_model"] = train_simulation_model(
|
| 139 |
force_retrain=not reuse_simulation_artifact,
|
| 140 |
save_artifact=True,
|
| 141 |
sample_size=simulation_sample_size,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 142 |
)
|
| 143 |
-
|
| 144 |
-
if run_experience_3:
|
| 145 |
-
print(f"[experience_3] Executing {relative_to_project(EXPERIENCE_3_NOTEBOOK_PATH)}")
|
| 146 |
-
execute_notebook(
|
| 147 |
-
EXPERIENCE_3_NOTEBOOK_PATH,
|
| 148 |
-
timeout_seconds=notebook_timeout_seconds,
|
| 149 |
-
kernel_name=kernel_name,
|
| 150 |
-
)
|
| 151 |
-
results["experience_3"] = {
|
| 152 |
-
"notebook": relative_to_project(EXPERIENCE_3_NOTEBOOK_PATH),
|
| 153 |
-
}
|
| 154 |
|
| 155 |
if not skip_runtime_validation:
|
| 156 |
results["runtime_validation"] = validate_runtime()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 157 |
|
| 158 |
return results
|
| 159 |
|
|
@@ -163,13 +263,14 @@ def main() -> None:
|
|
| 163 |
args = parse_args()
|
| 164 |
summary = run_full_pipeline(
|
| 165 |
skip_preparation=args.skip_preparation,
|
| 166 |
-
run_experience_2=args.run_experience_2,
|
| 167 |
skip_runtime_validation=args.skip_runtime_validation,
|
| 168 |
-
run_experience_3=args.run_experience_3,
|
| 169 |
reuse_simulation_artifact=args.reuse_simulation_artifact,
|
| 170 |
simulation_sample_size=args.simulation_sample_size,
|
| 171 |
notebook_timeout_seconds=args.notebook_timeout_seconds,
|
| 172 |
kernel_name=args.kernel_name,
|
|
|
|
|
|
|
|
|
|
| 173 |
)
|
| 174 |
if args.json:
|
| 175 |
print(json.dumps(summary, indent=2, ensure_ascii=True))
|
|
|
|
| 1 |
+
"""Orchestre la chaine locale officielle, de la preparation a la validation."""
|
| 2 |
|
| 3 |
from __future__ import annotations
|
| 4 |
|
| 5 |
import argparse
|
| 6 |
import json
|
| 7 |
+
import math
|
| 8 |
+
from numbers import Real
|
| 9 |
from pathlib import Path
|
| 10 |
import sys
|
| 11 |
|
| 12 |
+
import mlflow
|
| 13 |
+
from mlflow.tracking import MlflowClient
|
| 14 |
+
|
| 15 |
|
| 16 |
PROJECT_ROOT = Path(__file__).resolve().parents[1]
|
| 17 |
if str(PROJECT_ROOT) not in sys.path:
|
| 18 |
sys.path.insert(0, str(PROJECT_ROOT))
|
| 19 |
|
| 20 |
+
from scripts.mlflow_config import (
|
| 21 |
+
DEFAULT_MLFLOW_TRACKING_URI,
|
| 22 |
+
FULL_PIPELINE_EXPERIMENT_NAME,
|
| 23 |
+
experiment_artifact_location,
|
| 24 |
+
normalize_tracking_uri,
|
| 25 |
+
)
|
| 26 |
+
from scripts.promote_registered_model import promote_registered_models
|
| 27 |
from scripts.run_preparation import run_preparation
|
| 28 |
from scripts.train_historical_model import train_historical_model
|
| 29 |
from scripts.train_simulation_model import train_simulation_model
|
| 30 |
from scripts.validate_runtime import validate_runtime
|
| 31 |
|
| 32 |
|
| 33 |
+
def _ensure_full_pipeline_experiment(tracking_uri: str) -> None:
    """Select the MLflow experiment that records full-pipeline executions.

    Points MLflow at ``tracking_uri``, ends every run that is still active,
    creates the experiment named ``FULL_PIPELINE_EXPERIMENT_NAME`` when the
    tracking backend does not know it yet, then makes it the current
    experiment.

    Args:
        tracking_uri: MLflow tracking URI to use for the lookup and creation.
    """
    mlflow.set_tracking_uri(tracking_uri)

    # Keep ending runs until MLflow reports that none is active.
    while mlflow.active_run() is not None:
        mlflow.end_run()

    tracking_client = MlflowClient(tracking_uri=tracking_uri)
    if tracking_client.get_experiment_by_name(FULL_PIPELINE_EXPERIMENT_NAME) is None:
        artifact_root = experiment_artifact_location(
            FULL_PIPELINE_EXPERIMENT_NAME,
            tracking_uri=tracking_uri,
        )
        tracking_client.create_experiment(
            FULL_PIPELINE_EXPERIMENT_NAME,
            artifact_location=artifact_root,
        )

    mlflow.set_experiment(FULL_PIPELINE_EXPERIMENT_NAME)
|
| 50 |
+
|
| 51 |
+
|
| 52 |
+
def _log_numeric_metrics(prefix: str, metrics: object) -> None:
|
| 53 |
+
"""Journalise les metriques numeriques disponibles dans un dictionnaire."""
|
| 54 |
+
if not isinstance(metrics, dict):
|
| 55 |
+
return
|
| 56 |
+
|
| 57 |
+
for metric_name, metric_value in metrics.items():
|
| 58 |
+
if isinstance(metric_value, bool) or not isinstance(metric_value, Real):
|
| 59 |
+
continue
|
| 60 |
+
numeric_value = float(metric_value)
|
| 61 |
+
if math.isfinite(numeric_value):
|
| 62 |
+
mlflow.log_metric(f"{prefix}_{metric_name}", numeric_value)
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def _log_param_if_present(name: str, value: object) -> None:
|
| 66 |
+
"""Journalise un parametre MLflow seulement s'il est renseigne."""
|
| 67 |
+
if value is not None:
|
| 68 |
+
mlflow.log_param(name, value)
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
def log_pipeline_summary_to_mlflow(
    summary: dict[str, object],
    *,
    tracking_uri: str,
    skip_preparation: bool,
    skip_runtime_validation: bool,
    reuse_simulation_artifact: bool,
    simulation_sample_size: int,
) -> dict[str, str]:
    """Record one execution of `run_full_pipeline.py` as an MLflow run.

    The run receives the pipeline options as parameters, the registered
    model name/version and numeric metrics of the historical and simulation
    sections of ``summary``, the runtime-validation outcome, and the whole
    summary as the ``pipeline_summary.json`` artifact.

    Args:
        summary: Pipeline result dictionary. It is round-tripped through JSON
            (non-serializable values become strings) before being logged.
        tracking_uri: MLflow tracking URI; normalized before use.
        skip_preparation: Whether the preparation stage was skipped.
        skip_runtime_validation: Whether the runtime validation was skipped.
        reuse_simulation_artifact: Whether the existing simulation artifact
            was reused instead of retrained.
        simulation_sample_size: Sample size requested for the simulation model.

    Returns:
        dict[str, str]: ``experiment_name``, ``run_id`` and ``tracking_uri``
        of the MLflow run that was created.
    """
    resolved_tracking_uri = normalize_tracking_uri(tracking_uri)
    _ensure_full_pipeline_experiment(resolved_tracking_uri)
    serializable_summary = json.loads(json.dumps(summary, ensure_ascii=True, default=str))

    with mlflow.start_run(run_name=FULL_PIPELINE_EXPERIMENT_NAME) as run:
        mlflow.log_param("entrypoint", "scripts/run_full_pipeline.py")
        mlflow.log_param("skip_preparation", bool(skip_preparation))
        mlflow.log_param("skip_runtime_validation", bool(skip_runtime_validation))
        mlflow.log_param("reuse_simulation_artifact", bool(reuse_simulation_artifact))
        mlflow.log_param("simulation_sample_size", int(simulation_sample_size))

        # Both model sections share the same layout, so log them uniformly.
        model_sections = (
            ("historical", "historical_model"),
            ("simulation", "simulation_model"),
        )
        for prefix, section_key in model_sections:
            section = serializable_summary.get(section_key, {})
            if not isinstance(section, dict):
                continue
            _log_param_if_present(
                f"{prefix}_registered_model",
                section.get("registered_model_name"),
            )
            _log_param_if_present(
                f"{prefix}_registered_model_version",
                section.get("registered_model_version"),
            )
            _log_numeric_metrics(prefix, section.get("metrics"))

        runtime_validation = serializable_summary.get("runtime_validation", {})
        if isinstance(runtime_validation, dict):
            validation_skipped = bool(runtime_validation.get("skipped", False))
            mlflow.log_param("runtime_validation_skipped", validation_skipped)
            # Without an explicit status, do not report "executed" for a
            # validation that the summary marks as skipped.
            fallback_status = "skipped" if validation_skipped else "executed"
            _log_param_if_present(
                "runtime_validation_status",
                runtime_validation.get("status", fallback_status),
            )

        mlflow.log_dict(serializable_summary, "pipeline_summary.json")
        return {
            "experiment_name": FULL_PIPELINE_EXPERIMENT_NAME,
            "run_id": run.info.run_id,
            "tracking_uri": resolved_tracking_uri,
        }
|
| 120 |
|
| 121 |
|
| 122 |
def parse_args() -> argparse.Namespace:
|
|
|
|
| 129 |
action="store_true",
|
| 130 |
help="Reuse the existing preparation outputs instead of re-executing preparation.ipynb.",
|
| 131 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 132 |
parser.add_argument(
|
| 133 |
"--skip-runtime-validation",
|
| 134 |
action="store_true",
|
| 135 |
help="Skip the final smoke test against the runtime service.",
|
| 136 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 137 |
parser.add_argument(
|
| 138 |
"--reuse-simulation-artifact",
|
| 139 |
action="store_true",
|
|
|
|
| 156 |
default="python3",
|
| 157 |
help="Jupyter kernel used to execute notebook-backed stages.",
|
| 158 |
)
|
| 159 |
+
parser.add_argument(
|
| 160 |
+
"--tracking-uri",
|
| 161 |
+
default=DEFAULT_MLFLOW_TRACKING_URI,
|
| 162 |
+
help="Tracking URI MLflow partage entre entrainement et promotion.",
|
| 163 |
+
)
|
| 164 |
+
parser.add_argument(
|
| 165 |
+
"--historical-version",
|
| 166 |
+
default=None,
|
| 167 |
+
help="Version MLflow historique a promouvoir. Par defaut, le pipeline prend la derniere version.",
|
| 168 |
+
)
|
| 169 |
+
parser.add_argument(
|
| 170 |
+
"--simulation-version",
|
| 171 |
+
default=None,
|
| 172 |
+
help="Version MLflow simulation a promouvoir. Par defaut, le pipeline prend la derniere version.",
|
| 173 |
+
)
|
| 174 |
parser.add_argument(
|
| 175 |
"--json",
|
| 176 |
action="store_true",
|
|
|
|
| 182 |
def run_full_pipeline(
    *,
    skip_preparation: bool = False,
    skip_runtime_validation: bool = False,
    reuse_simulation_artifact: bool = False,
    simulation_sample_size: int = 200_000,
    notebook_timeout_seconds: int = 7200,
    kernel_name: str = "python3",
    tracking_uri: str = DEFAULT_MLFLOW_TRACKING_URI,
    historical_version: str | None = None,
    simulation_version: str | None = None,
) -> dict[str, object]:
    """Run the main artifact-regeneration stages in order.

    Stages: preparation (optional), historical model training, simulation
    model training, registered-model promotion, runtime validation
    (optional), and finally an MLflow trace of the whole execution.

    Args:
        skip_preparation: Skip `preparation.ipynb` when its outputs already exist.
        skip_runtime_validation: Skip the final smoke test.
        reuse_simulation_artifact: Reuse the existing local model instead of retraining it.
        simulation_sample_size: Sample size for the local simulation model.
        notebook_timeout_seconds: Timeout applied to each executed notebook.
        kernel_name: Jupyter kernel to use.
        tracking_uri: MLflow tracking URI shared by training and promotion.
        historical_version: Historical model version to promote; the latest one when omitted.
        simulation_version: Simulation model version to promote; the latest one when omitted.

    Returns:
        dict[str, object]: Summary of the executed stages and verified artifacts.
    """
    tracking_uri = normalize_tracking_uri(tracking_uri)
    mlflow_section = {
        "tracking_uri": tracking_uri,
        "pipeline_experiment": FULL_PIPELINE_EXPERIMENT_NAME,
    }
    results: dict[str, object] = {"mlflow": mlflow_section}

    # Skipped stages still get an entry so the summary always has the same keys.
    if skip_preparation:
        results["preparation"] = {"skipped": True}
    else:
        results["preparation"] = run_preparation(
            timeout_seconds=notebook_timeout_seconds,
            kernel_name=kernel_name,
        )

    results["historical_model"] = train_historical_model(
        tracking_uri=tracking_uri,
        cv_splits=4,
    )

    retrain_simulation = not reuse_simulation_artifact
    results["simulation_model"] = train_simulation_model(
        force_retrain=retrain_simulation,
        save_artifact=True,
        sample_size=simulation_sample_size,
        tracking_uri=tracking_uri,
    )

    results["registered_model_promotion"] = promote_registered_models(
        tracking_uri=tracking_uri,
        historical_version=historical_version,
        simulation_version=simulation_version,
        allow_latest_version=True,
    )

    if skip_runtime_validation:
        results["runtime_validation"] = {"skipped": True}
    else:
        results["runtime_validation"] = validate_runtime()

    # Logged last so the MLflow trace contains every stage result above.
    results["pipeline_run"] = log_pipeline_summary_to_mlflow(
        results,
        tracking_uri=tracking_uri,
        skip_preparation=skip_preparation,
        skip_runtime_validation=skip_runtime_validation,
        reuse_simulation_artifact=reuse_simulation_artifact,
        simulation_sample_size=simulation_sample_size,
    )

    return results
|
| 259 |
|
|
|
|
| 263 |
args = parse_args()
|
| 264 |
summary = run_full_pipeline(
|
| 265 |
skip_preparation=args.skip_preparation,
|
|
|
|
| 266 |
skip_runtime_validation=args.skip_runtime_validation,
|
|
|
|
| 267 |
reuse_simulation_artifact=args.reuse_simulation_artifact,
|
| 268 |
simulation_sample_size=args.simulation_sample_size,
|
| 269 |
notebook_timeout_seconds=args.notebook_timeout_seconds,
|
| 270 |
kernel_name=args.kernel_name,
|
| 271 |
+
tracking_uri=args.tracking_uri,
|
| 272 |
+
historical_version=args.historical_version,
|
| 273 |
+
simulation_version=args.simulation_version,
|
| 274 |
)
|
| 275 |
if args.json:
|
| 276 |
print(json.dumps(summary, indent=2, ensure_ascii=True))
|
scripts/runtime_model_specs.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Centralise les contrats des deux modeles runtime du projet."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from dataclasses import dataclass
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
|
| 8 |
+
from scripts.mlflow_config import DEFAULT_MLFLOW_TRACKING_URI, PROJECT_ROOT
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
DEFAULT_MODELS_DIR = PROJECT_ROOT / "artifacts" / "models"
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
@dataclass(frozen=True)
class RuntimeModelSpec:
    """Describe one runtime model expected by the final API.

    Instances are immutable because the dataclass is declared frozen.
    """

    # Short role identifier of the model.
    role: str
    # Name under which the model is registered.
    registered_model_name: str
    # Location of the serialized model file.
    output_model_path: Path
    # Location of the JSON metadata file that accompanies the model.
    output_metadata_path: Path
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
# Contract of the historical runtime model and its on-disk artifacts.
HISTORICAL_RUNTIME_MODEL_SPEC = RuntimeModelSpec(
    role="historical",
    registered_model_name="p1_historical_pipeline",
    output_model_path=DEFAULT_MODELS_DIR.joinpath("p1_historical_pipeline.joblib"),
    output_metadata_path=DEFAULT_MODELS_DIR.joinpath("p1_historical_metadata.json"),
)

# Contract of the simulation runtime model and its on-disk artifacts.
SIMULATION_RUNTIME_MODEL_SPEC = RuntimeModelSpec(
    role="simulation",
    registered_model_name="p23_simulation_pipeline",
    output_model_path=DEFAULT_MODELS_DIR.joinpath("p23_simulation_pipeline.joblib"),
    output_metadata_path=DEFAULT_MODELS_DIR.joinpath("p23_simulation_metadata.json"),
)

# Lookup table of every runtime model spec, keyed by role.
RUNTIME_MODEL_SPECS = {
    spec.role: spec
    for spec in (HISTORICAL_RUNTIME_MODEL_SPEC, SIMULATION_RUNTIME_MODEL_SPEC)
}
|
scripts/train_historical_model.py
CHANGED
|
@@ -14,16 +14,17 @@ if str(PROJECT_ROOT) not in sys.path:
|
|
| 14 |
|
| 15 |
from scripts.experience_1 import run_experience_1
|
| 16 |
from scripts.pipeline_utils import ensure_paths_exist, relative_to_project
|
|
|
|
| 17 |
|
| 18 |
|
| 19 |
EXPERIENCE_1_SCRIPT_PATH = Path("scripts/experience_1.py")
|
| 20 |
HISTORICAL_OUTPUTS = [
|
| 21 |
Path("artifacts/experiments/experience_1/dataset_consolide_historique_colonnes.csv"),
|
| 22 |
Path("artifacts/experiments/experience_1/model_results.csv"),
|
| 23 |
-
|
| 24 |
-
|
| 25 |
]
|
| 26 |
-
HISTORICAL_METADATA_PATH =
|
| 27 |
|
| 28 |
|
| 29 |
def parse_args() -> argparse.Namespace:
|
|
@@ -68,10 +69,15 @@ def train_historical_model(
|
|
| 68 |
)
|
| 69 |
return {
|
| 70 |
"script": relative_to_project(EXPERIENCE_1_SCRIPT_PATH),
|
|
|
|
| 71 |
"training_notebook_reference": metadata.get("training_notebook"),
|
| 72 |
"outputs": [relative_to_project(path) for path in resolved_outputs],
|
| 73 |
"model_name": metadata.get("model_name"),
|
| 74 |
"target_year": metadata.get("target_year"),
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
"metrics": metrics,
|
| 76 |
}
|
| 77 |
|
|
|
|
| 14 |
|
| 15 |
from scripts.experience_1 import run_experience_1
|
| 16 |
from scripts.pipeline_utils import ensure_paths_exist, relative_to_project
|
| 17 |
+
from scripts.runtime_model_specs import HISTORICAL_RUNTIME_MODEL_SPEC
|
| 18 |
|
| 19 |
|
| 20 |
EXPERIENCE_1_SCRIPT_PATH = Path("scripts/experience_1.py")
|
| 21 |
HISTORICAL_OUTPUTS = [
|
| 22 |
Path("artifacts/experiments/experience_1/dataset_consolide_historique_colonnes.csv"),
|
| 23 |
Path("artifacts/experiments/experience_1/model_results.csv"),
|
| 24 |
+
HISTORICAL_RUNTIME_MODEL_SPEC.output_model_path.relative_to(PROJECT_ROOT),
|
| 25 |
+
HISTORICAL_RUNTIME_MODEL_SPEC.output_metadata_path.relative_to(PROJECT_ROOT),
|
| 26 |
]
|
| 27 |
+
HISTORICAL_METADATA_PATH = HISTORICAL_RUNTIME_MODEL_SPEC.output_metadata_path.relative_to(PROJECT_ROOT)
|
| 28 |
|
| 29 |
|
| 30 |
def parse_args() -> argparse.Namespace:
|
|
|
|
| 69 |
)
|
| 70 |
return {
|
| 71 |
"script": relative_to_project(EXPERIENCE_1_SCRIPT_PATH),
|
| 72 |
+
"artifact_source": "retrained",
|
| 73 |
"training_notebook_reference": metadata.get("training_notebook"),
|
| 74 |
"outputs": [relative_to_project(path) for path in resolved_outputs],
|
| 75 |
"model_name": metadata.get("model_name"),
|
| 76 |
"target_year": metadata.get("target_year"),
|
| 77 |
+
"registered_model_name": metadata.get("registered_model_name"),
|
| 78 |
+
"registered_model_version": metadata.get("registered_model_version"),
|
| 79 |
+
"registered_model_run_id": metadata.get("registered_model_run_id"),
|
| 80 |
+
"model_uri": metadata.get("model_uri"),
|
| 81 |
"metrics": metrics,
|
| 82 |
}
|
| 83 |
|
scripts/train_simulation_model.py
CHANGED
|
@@ -3,26 +3,41 @@
|
|
| 3 |
from __future__ import annotations
|
| 4 |
|
| 5 |
import argparse
|
|
|
|
| 6 |
from pathlib import Path
|
| 7 |
import sys
|
| 8 |
|
|
|
|
|
|
|
|
|
|
| 9 |
|
| 10 |
PROJECT_ROOT = Path(__file__).resolve().parents[1]
|
| 11 |
if str(PROJECT_ROOT) not in sys.path:
|
| 12 |
sys.path.insert(0, str(PROJECT_ROOT))
|
| 13 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
from scripts.pipeline_utils import ensure_paths_exist, relative_to_project
|
| 15 |
from scripts.prediction_adjustment import (
|
| 16 |
SIMULATION_METADATA_PATH,
|
| 17 |
SIMULATION_MODEL_PATH,
|
| 18 |
load_or_train_simulation_model,
|
| 19 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
|
| 21 |
|
| 22 |
SIMULATION_OUTPUTS = [
|
| 23 |
SIMULATION_MODEL_PATH,
|
| 24 |
SIMULATION_METADATA_PATH,
|
| 25 |
]
|
|
|
|
| 26 |
|
| 27 |
|
| 28 |
def parse_args() -> argparse.Namespace:
|
|
@@ -46,14 +61,66 @@ def parse_args() -> argparse.Namespace:
|
|
| 46 |
action="store_true",
|
| 47 |
help="Train in memory without rewriting the model artifacts.",
|
| 48 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
return parser.parse_args()
|
| 50 |
|
| 51 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
def train_simulation_model(
|
| 53 |
*,
|
| 54 |
force_retrain: bool = False,
|
| 55 |
save_artifact: bool = True,
|
| 56 |
sample_size: int = 200_000,
|
|
|
|
| 57 |
) -> dict[str, object]:
|
| 58 |
"""Charge ou reentraine le modele local de simulation.
|
| 59 |
|
|
@@ -61,18 +128,42 @@ def train_simulation_model(
|
|
| 61 |
force_retrain: Force le reentrainement meme si les artefacts existent.
|
| 62 |
save_artifact: Ecrit les artefacts sur disque si `True`.
|
| 63 |
sample_size: Nombre maximal de lignes echantillonnees pour l'entrainement.
|
|
|
|
| 64 |
|
| 65 |
Returns:
|
| 66 |
dict[str, object]: Resume du dataset utilise, des metriques et des sorties.
|
| 67 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
loaded_model, simulation_df = load_or_train_simulation_model(
|
| 69 |
force_retrain=force_retrain,
|
| 70 |
save_artifact=save_artifact,
|
| 71 |
sample_size=sample_size,
|
| 72 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
|
| 74 |
output_paths: list[str] = []
|
| 75 |
if save_artifact:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 76 |
resolved_outputs = ensure_paths_exist(SIMULATION_OUTPUTS, label="simulation model outputs")
|
| 77 |
output_paths = [relative_to_project(path) for path in resolved_outputs]
|
| 78 |
|
|
@@ -85,6 +176,11 @@ def train_simulation_model(
|
|
| 85 |
return {
|
| 86 |
"dataset_rows": int(len(simulation_df)),
|
| 87 |
"sample_size": loaded_model.metadata.get("sample_size"),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
"metrics": metrics,
|
| 89 |
"outputs": output_paths,
|
| 90 |
}
|
|
@@ -97,6 +193,7 @@ def main() -> None:
|
|
| 97 |
force_retrain=args.force_retrain,
|
| 98 |
save_artifact=not args.no_save,
|
| 99 |
sample_size=args.sample_size,
|
|
|
|
| 100 |
)
|
| 101 |
|
| 102 |
|
|
|
|
| 3 |
from __future__ import annotations
|
| 4 |
|
| 5 |
import argparse
|
| 6 |
+
import json
|
| 7 |
from pathlib import Path
|
| 8 |
import sys
|
| 9 |
|
| 10 |
+
import mlflow
|
| 11 |
+
from mlflow.tracking import MlflowClient
|
| 12 |
+
|
| 13 |
|
| 14 |
PROJECT_ROOT = Path(__file__).resolve().parents[1]
|
| 15 |
if str(PROJECT_ROOT) not in sys.path:
|
| 16 |
sys.path.insert(0, str(PROJECT_ROOT))
|
| 17 |
|
| 18 |
+
from scripts.mlflow_logging import log_and_register_sklearn_model
|
| 19 |
+
from scripts.mlflow_config import (
|
| 20 |
+
SIMULATION_RUNTIME_EXPERIMENT_NAME,
|
| 21 |
+
experiment_artifact_location,
|
| 22 |
+
normalize_tracking_uri,
|
| 23 |
+
)
|
| 24 |
from scripts.pipeline_utils import ensure_paths_exist, relative_to_project
|
| 25 |
from scripts.prediction_adjustment import (
|
| 26 |
SIMULATION_METADATA_PATH,
|
| 27 |
SIMULATION_MODEL_PATH,
|
| 28 |
load_or_train_simulation_model,
|
| 29 |
)
|
| 30 |
+
from scripts.runtime_model_specs import (
|
| 31 |
+
DEFAULT_MLFLOW_TRACKING_URI,
|
| 32 |
+
SIMULATION_RUNTIME_MODEL_SPEC,
|
| 33 |
+
)
|
| 34 |
|
| 35 |
|
| 36 |
SIMULATION_OUTPUTS = [
|
| 37 |
SIMULATION_MODEL_PATH,
|
| 38 |
SIMULATION_METADATA_PATH,
|
| 39 |
]
|
| 40 |
+
SIMULATION_MLFLOW_EXPERIMENT_NAME = SIMULATION_RUNTIME_EXPERIMENT_NAME
|
| 41 |
|
| 42 |
|
| 43 |
def parse_args() -> argparse.Namespace:
|
|
|
|
| 61 |
action="store_true",
|
| 62 |
help="Train in memory without rewriting the model artifacts.",
|
| 63 |
)
|
| 64 |
+
parser.add_argument(
|
| 65 |
+
"--tracking-uri",
|
| 66 |
+
default=DEFAULT_MLFLOW_TRACKING_URI,
|
| 67 |
+
help="Tracking URI MLflow utilise pour journaliser et enregistrer le modele.",
|
| 68 |
+
)
|
| 69 |
return parser.parse_args()
|
| 70 |
|
| 71 |
|
| 72 |
+
def _ensure_simulation_mlflow_experiment(tracking_uri: str) -> None:
    """Select the MLflow experiment used by the simulation training step.

    Normalizes ``tracking_uri``, points MLflow at it, creates the experiment
    named ``SIMULATION_MLFLOW_EXPERIMENT_NAME`` when the tracking backend
    does not know it yet, then makes it the current experiment.

    Args:
        tracking_uri: MLflow tracking URI; normalized before use.
    """
    normalized_uri = normalize_tracking_uri(tracking_uri)
    mlflow.set_tracking_uri(normalized_uri)

    tracking_client = MlflowClient(tracking_uri=normalized_uri)
    if tracking_client.get_experiment_by_name(SIMULATION_MLFLOW_EXPERIMENT_NAME) is None:
        artifact_root = experiment_artifact_location(
            SIMULATION_MLFLOW_EXPERIMENT_NAME,
            tracking_uri=normalized_uri,
        )
        tracking_client.create_experiment(
            SIMULATION_MLFLOW_EXPERIMENT_NAME,
            artifact_location=artifact_root,
        )

    mlflow.set_experiment(SIMULATION_MLFLOW_EXPERIMENT_NAME)
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
def _register_simulation_runtime_model(
    *,
    loaded_model,
    tracking_uri: str,
) -> dict[str, str]:
    """Log the local simulation model to MLflow and register it.

    Opens a run in the simulation experiment, logs descriptive parameters
    and the metrics found in ``loaded_model.metadata``, then delegates the
    model logging and registration to ``log_and_register_sklearn_model``.

    Args:
        loaded_model: Loaded simulation model; this function reads its
            ``metadata`` mapping and its ``pipeline`` attribute.
        tracking_uri: MLflow tracking URI used for the run and the registry.

    Returns:
        dict[str, str]: Whatever ``log_and_register_sklearn_model`` returns.
    """
    _ensure_simulation_mlflow_experiment(tracking_uri)
    metadata = loaded_model.metadata
    # Metadata may carry `"metrics": None`; treat it like a missing entry.
    metrics = metadata.get("metrics") or {}

    with mlflow.start_run(run_name=f"{SIMULATION_MLFLOW_EXPERIMENT_NAME}__runtime_model"):
        mlflow.log_param("runtime_model_role", SIMULATION_RUNTIME_MODEL_SPEC.role)
        mlflow.log_param("registered_model_name", SIMULATION_RUNTIME_MODEL_SPEC.registered_model_name)
        mlflow.log_param("training_entrypoint", "scripts/train_simulation_model.py")
        mlflow.log_param("model_name", metadata.get("model_name"))
        mlflow.log_param("dataset_source", metadata.get("dataset_source"))
        mlflow.log_param("sample_size", metadata.get("sample_size"))

        for metric_name, metric_value in metrics.items():
            if metric_value is None:
                continue
            try:
                numeric_value = float(metric_value)
            except (TypeError, ValueError):
                # A non-numeric metadata entry must not abort the registration.
                continue
            mlflow.log_metric(metric_name, numeric_value)

        # Registered while the run is still active.
        return log_and_register_sklearn_model(
            loaded_model.pipeline,
            artifact_name=SIMULATION_RUNTIME_MODEL_SPEC.registered_model_name,
            registered_model_name=SIMULATION_RUNTIME_MODEL_SPEC.registered_model_name,
            model_metadata={
                "runtime_model_role": SIMULATION_RUNTIME_MODEL_SPEC.role,
                "training_entrypoint": "scripts/train_simulation_model.py",
            },
        )
|
| 116 |
+
|
| 117 |
+
|
| 118 |
def train_simulation_model(
|
| 119 |
*,
|
| 120 |
force_retrain: bool = False,
|
| 121 |
save_artifact: bool = True,
|
| 122 |
sample_size: int = 200_000,
|
| 123 |
+
tracking_uri: str = DEFAULT_MLFLOW_TRACKING_URI,
|
| 124 |
) -> dict[str, object]:
|
| 125 |
"""Charge ou reentraine le modele local de simulation.
|
| 126 |
|
|
|
|
| 128 |
force_retrain: Force le reentrainement meme si les artefacts existent.
|
| 129 |
save_artifact: Ecrit les artefacts sur disque si `True`.
|
| 130 |
sample_size: Nombre maximal de lignes echantillonnees pour l'entrainement.
|
| 131 |
+
tracking_uri: Tracking URI MLflow utilise pour le registry.
|
| 132 |
|
| 133 |
Returns:
|
| 134 |
dict[str, object]: Resume du dataset utilise, des metriques et des sorties.
|
| 135 |
"""
|
| 136 |
+
tracking_uri = normalize_tracking_uri(tracking_uri)
|
| 137 |
+
reused_existing_artifact = (
|
| 138 |
+
not force_retrain
|
| 139 |
+
and SIMULATION_MODEL_PATH.exists()
|
| 140 |
+
and SIMULATION_METADATA_PATH.exists()
|
| 141 |
+
)
|
| 142 |
loaded_model, simulation_df = load_or_train_simulation_model(
|
| 143 |
force_retrain=force_retrain,
|
| 144 |
save_artifact=save_artifact,
|
| 145 |
sample_size=sample_size,
|
| 146 |
)
|
| 147 |
+
registration = _register_simulation_runtime_model(
|
| 148 |
+
loaded_model=loaded_model,
|
| 149 |
+
tracking_uri=tracking_uri,
|
| 150 |
+
)
|
| 151 |
+
loaded_model.metadata.update(
|
| 152 |
+
{
|
| 153 |
+
"runtime_model_role": SIMULATION_RUNTIME_MODEL_SPEC.role,
|
| 154 |
+
"registered_model_name": registration["registered_model_name"],
|
| 155 |
+
"registered_model_version": registration["registered_model_version"],
|
| 156 |
+
"registered_model_run_id": registration["run_id"],
|
| 157 |
+
"model_uri": registration["model_uri"],
|
| 158 |
+
}
|
| 159 |
+
)
|
| 160 |
|
| 161 |
output_paths: list[str] = []
|
| 162 |
if save_artifact:
|
| 163 |
+
SIMULATION_METADATA_PATH.write_text(
|
| 164 |
+
json.dumps(loaded_model.metadata, indent=2, ensure_ascii=True),
|
| 165 |
+
encoding="utf-8",
|
| 166 |
+
)
|
| 167 |
resolved_outputs = ensure_paths_exist(SIMULATION_OUTPUTS, label="simulation model outputs")
|
| 168 |
output_paths = [relative_to_project(path) for path in resolved_outputs]
|
| 169 |
|
|
|
|
| 176 |
return {
|
| 177 |
"dataset_rows": int(len(simulation_df)),
|
| 178 |
"sample_size": loaded_model.metadata.get("sample_size"),
|
| 179 |
+
"artifact_source": "reused_existing" if reused_existing_artifact else "retrained",
|
| 180 |
+
"registered_model_name": registration["registered_model_name"],
|
| 181 |
+
"registered_model_version": registration["registered_model_version"],
|
| 182 |
+
"registered_model_run_id": registration["run_id"],
|
| 183 |
+
"model_uri": registration["model_uri"],
|
| 184 |
"metrics": metrics,
|
| 185 |
"outputs": output_paths,
|
| 186 |
}
|
|
|
|
| 193 |
force_retrain=args.force_retrain,
|
| 194 |
save_artifact=not args.no_save,
|
| 195 |
sample_size=args.sample_size,
|
| 196 |
+
tracking_uri=args.tracking_uri,
|
| 197 |
)
|
| 198 |
|
| 199 |
|
streamlit/requirements.txt
CHANGED
|
@@ -5,5 +5,6 @@ pandas==2.3.3
|
|
| 5 |
Pillow==11.3.0
|
| 6 |
requests==2.32.5
|
| 7 |
scikit-learn==1.8.0
|
|
|
|
| 8 |
streamlit==1.49.1
|
| 9 |
uvicorn==0.42.0
|
|
|
|
| 5 |
Pillow==11.3.0
|
| 6 |
requests==2.32.5
|
| 7 |
scikit-learn==1.8.0
|
| 8 |
+
shap
|
| 9 |
streamlit==1.49.1
|
| 10 |
uvicorn==0.42.0
|