# bdv/src/model/train.py
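"""Training entry point: fit and select a multi-output regressor predicting
vote shares per political bloc for each polling station.

The script loads the feature panel, builds a temporal train/valid/test split,
runs a per-election cross-validation over small hyper-parameter grids, selects
the best configuration, refits it on train+valid, and writes the model,
metrics, residual intervals and a model card to disk.
"""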
from __future__ import annotations
import argparse
import json
import logging
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Optional, Tuple
import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, RegressorMixin, clone
from sklearn.ensemble import HistGradientBoostingClassifier, HistGradientBoostingRegressor
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Ridge
from sklearn.metrics import (
explained_variance_score,
mean_absolute_error,
mean_squared_error,
median_absolute_error,
r2_score,
)
from sklearn.model_selection import TimeSeriesSplit
from sklearn.multioutput import MultiOutputRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.utils.validation import check_is_fitted
# Ensure project root is on sys.path when running as a script
PROJECT_ROOT = Path(__file__).resolve().parents[2]
if str(PROJECT_ROOT) not in sys.path:
sys.path.append(str(PROJECT_ROOT))
from src.constants import CANDIDATE_CATEGORIES
LOGGER = logging.getLogger(__name__)
TARGET_COLS = [f"target_share_{c}" for c in CANDIDATE_CATEGORIES]
META_COLS = [
"commune_code",
"code_bv",
"election_type",
"election_year",
"round",
"date_scrutin",
"target_sum_before_renorm",
"target_sum_after_renorm",
]
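# Hand-written hyper-parameter grids: one list of candidate configurations per
# model family. Optional backends (lightgbm, xgboost, catboost) are skipped at
# runtime when the library is not installed.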
MODEL_GRIDS: Dict[str, List[Dict[str, object]]] = {
"ridge": [
{"alpha": 0.1},
{"alpha": 1.0},
{"alpha": 10.0},
{"alpha": 50.0},
],
"hist_gradient_boosting": [
{"max_depth": 3, "learning_rate": 0.08, "max_iter": 400, "min_samples_leaf": 30, "l2_regularization": 0.1},
{"max_depth": 4, "learning_rate": 0.05, "max_iter": 600, "min_samples_leaf": 20, "l2_regularization": 0.1},
{"max_depth": 4, "learning_rate": 0.1, "max_iter": 300, "min_samples_leaf": 50, "l2_regularization": 1.0},
{"max_depth": 6, "learning_rate": 0.05, "max_iter": 500, "min_samples_leaf": 40, "l2_regularization": 0.5},
{"max_depth": 3, "learning_rate": 0.05, "max_iter": 500, "min_samples_leaf": 80, "l2_regularization": 1.0},
{"max_depth": 3, "learning_rate": 0.04, "max_iter": 600, "min_samples_leaf": 120, "l2_regularization": 2.0},
{"max_depth": 2, "learning_rate": 0.08, "max_iter": 500, "min_samples_leaf": 150, "l2_regularization": 3.0},
],
"lightgbm": [
{"n_estimators": 600, "learning_rate": 0.05, "num_leaves": 31, "subsample": 0.8, "colsample_bytree": 0.8},
{"n_estimators": 400, "learning_rate": 0.08, "num_leaves": 16, "min_child_samples": 30, "subsample": 0.7, "colsample_bytree": 0.7},
],
"xgboost": [
{"n_estimators": 600, "learning_rate": 0.05, "max_depth": 6, "subsample": 0.8, "colsample_bytree": 0.8},
{"n_estimators": 400, "learning_rate": 0.08, "max_depth": 4, "subsample": 0.7, "colsample_bytree": 0.7},
],
"two_stage_hgb": [
{
"clf_params": {"max_depth": 3, "learning_rate": 0.08, "max_iter": 300, "min_samples_leaf": 30, "l2_regularization": 0.1},
"reg_params": {"max_depth": 3, "learning_rate": 0.08, "max_iter": 400, "min_samples_leaf": 30, "l2_regularization": 0.1},
"epsilon": 1e-4,
"use_logit": True,
"use_proba": True,
},
{
"clf_params": {"max_depth": 2, "learning_rate": 0.1, "max_iter": 300, "min_samples_leaf": 60, "l2_regularization": 0.2},
"reg_params": {"max_depth": 2, "learning_rate": 0.08, "max_iter": 500, "min_samples_leaf": 60, "l2_regularization": 0.5},
"epsilon": 1e-4,
"use_logit": True,
"use_proba": True,
},
],
"catboost": [
{"depth": 6, "learning_rate": 0.05, "iterations": 500},
{"depth": 4, "learning_rate": 0.08, "iterations": 400},
],
}
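# Year boundaries of the temporal split: train <= train_end_year,
# valid in (train_end_year, valid_end_year], test >= test_start_year.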
@dataclass
class SplitConfig:
train_end_year: int
valid_end_year: int
test_start_year: int
def load_panel(path: Path) -> pd.DataFrame:
if not path.exists():
raise FileNotFoundError(f"Panel introuvable : {path}")
if path.suffix == ".parquet":
df = pd.read_parquet(path)
else:
df = pd.read_csv(path, sep=";")
df["election_year"] = pd.to_numeric(df["election_year"], errors="coerce")
df["round"] = pd.to_numeric(df["round"], errors="coerce")
return df
def get_feature_columns(df: pd.DataFrame) -> List[str]:
exclude = set(TARGET_COLS + META_COLS)
candidates = [c for c in df.columns if c not in exclude]
numeric_feats = [c for c in candidates if pd.api.types.is_numeric_dtype(df[c])]
return numeric_feats
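# Split rows by election year. With the CLI defaults (2019 / 2021 / 2022) the
# three sets are disjoint; other boundaries can leave a gap or an overlap
# between valid and test, since test simply takes every year >= test_start_year.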
def temporal_split(df: pd.DataFrame, cfg: SplitConfig) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
train = df[df["election_year"] <= cfg.train_end_year]
valid = df[(df["election_year"] > cfg.train_end_year) & (df["election_year"] <= cfg.valid_end_year)]
test = df[df["election_year"] >= cfg.test_start_year]
return train, valid, test
def make_preprocessor(feature_cols: List[str]) -> ColumnTransformer:
return ColumnTransformer(
transformers=[
("num", Pipeline([("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]), feature_cols)
],
remainder="drop",
)
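# Clip raw predictions to [0, 1] and renormalise each row so the bloc shares
# sum to 1 (rows summing to 0 stay at zero because the divisor is forced to 1).
# Illustrative example: [0.2, -0.1, 0.5] -> clipped [0.2, 0.0, 0.5]
# -> normalised [~0.286, 0.0, ~0.714].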
def normalize_predictions(y_pred: np.ndarray) -> np.ndarray:
y_pred = np.clip(y_pred, 0, 1)
sums = y_pred.sum(axis=1, keepdims=True)
sums[sums == 0] = 1
return y_pred / sums
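# Score predictions after clipping/renormalisation: global MAE, RMSE, MedAE,
# R2, explained variance, WAPE, sMAPE and bias over the flattened shares,
# winner accuracy (argmax of predicted vs. true shares per row), and one MAE
# per candidate category.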
def regression_metrics(y_true: np.ndarray, y_pred: np.ndarray) -> Dict[str, float]:
y_true = np.asarray(y_true)
y_pred = np.asarray(y_pred)
y_pred = normalize_predictions(y_pred)
y_true_flat = y_true.reshape(-1)
y_pred_flat = y_pred.reshape(-1)
mae = float(mean_absolute_error(y_true_flat, y_pred_flat))
rmse = float(np.sqrt(mean_squared_error(y_true_flat, y_pred_flat)))
medae = float(median_absolute_error(y_true_flat, y_pred_flat))
r2 = float(r2_score(y_true_flat, y_pred_flat)) if len(y_true_flat) > 1 else np.nan
evs = float(explained_variance_score(y_true_flat, y_pred_flat)) if len(y_true_flat) > 1 else np.nan
denom = float(np.sum(np.abs(y_true_flat)))
wape = float(np.sum(np.abs(y_true_flat - y_pred_flat)) / denom) if denom > 0 else np.nan
smape = float(np.mean(2 * np.abs(y_pred_flat - y_true_flat) / (np.abs(y_true_flat) + np.abs(y_pred_flat) + 1e-9)))
bias = float(np.mean(y_pred_flat - y_true_flat))
winner_true = np.argmax(y_true, axis=1)
winner_pred = np.argmax(y_pred, axis=1)
winner_acc = float(np.mean(winner_true == winner_pred)) if len(winner_true) else np.nan
metrics = {
"mae_mean": mae,
"rmse": rmse,
"medae": medae,
"r2": r2,
"explained_var": evs,
"wape": wape,
"smape": smape,
"bias": bias,
"winner_accuracy": winner_acc,
}
for idx, cat in enumerate(CANDIDATE_CATEGORIES):
metrics[f"mae_{cat}"] = float(mean_absolute_error(y_true[:, idx], y_pred[:, idx]))
return metrics
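# Build temporal CV folds at the election-event level. Rows are grouped by
# (election_type, election_year, round), events are ordered by date_scrutin
# (falling back to election_year when all dates are missing), and
# TimeSeriesSplit is applied to the ordered events so that every polling
# station of a given election lands in the same fold.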
def build_event_folds(df: pd.DataFrame, n_splits: int) -> List[Tuple[np.ndarray, np.ndarray]]:
if df.empty:
return []
work = df.copy()
work["date_scrutin"] = pd.to_datetime(work.get("date_scrutin"), errors="coerce") # type: ignore
if work["date_scrutin"].isna().all():
work["date_scrutin"] = pd.to_datetime(work["election_year"], format="%Y", errors="coerce")
work["event_key"] = (
work["election_type"].astype(str).str.lower().str.strip()
+ "|"
+ work["election_year"].astype(str)
+ "|"
+ work["round"].astype(str)
)
events = (
work[["event_key", "date_scrutin"]]
.dropna(subset=["event_key", "date_scrutin"])
.drop_duplicates()
.sort_values("date_scrutin")
.reset_index(drop=True)
)
if len(events) < 2:
return []
max_splits = min(n_splits, len(events) - 1)
tscv = TimeSeriesSplit(n_splits=max_splits)
folds = []
for train_evt_idx, test_evt_idx in tscv.split(events):
train_keys = set(events.iloc[train_evt_idx]["event_key"])
test_keys = set(events.iloc[test_evt_idx]["event_key"])
train_idx = work.index[work["event_key"].isin(train_keys)].to_numpy()
test_idx = work.index[work["event_key"].isin(test_keys)].to_numpy()
folds.append((train_idx, test_idx))
return folds
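# Hurdle-style regressor for a single target column: a classifier first
# predicts whether the share exceeds `epsilon`, then a regressor is fitted on
# the positive rows only (optionally on the logit scale). The prediction is
# P(positive) * back-transformed regression output, or a hard 0/1 gate on the
# classifier when use_proba is False. Degenerate training sets (all positive
# or all zero) fall back to a constant probability.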
class TwoStageRegressor(BaseEstimator, RegressorMixin):
def __init__(
self,
classifier: Optional[BaseEstimator] = None,
regressor: Optional[BaseEstimator] = None,
epsilon: float = 1e-4,
positive_threshold: float = 0.5,
use_proba: bool = True,
use_logit: bool = True,
logit_eps: float = 1e-6,
) -> None:
self.classifier = classifier
self.regressor = regressor
self.epsilon = epsilon
self.positive_threshold = positive_threshold
self.use_proba = use_proba
self.use_logit = use_logit
self.logit_eps = logit_eps
def _default_classifier(self) -> BaseEstimator:
return HistGradientBoostingClassifier(random_state=42)
def _default_regressor(self) -> BaseEstimator:
return HistGradientBoostingRegressor(random_state=42)
def fit(self, X, y):
y = np.asarray(y).ravel()
mask_pos = y > self.epsilon
self._constant_proba = None
if mask_pos.all() or (~mask_pos).all():
self._constant_proba = float(mask_pos.mean())
self.classifier_ = None
else:
classifier = self.classifier if self.classifier is not None else self._default_classifier()
self.classifier_ = clone(classifier)
self.classifier_.fit(X, mask_pos.astype(int))
self.regressor_ = None
if mask_pos.any():
regressor = self.regressor if self.regressor is not None else self._default_regressor()
self.regressor_ = clone(regressor)
y_reg = y[mask_pos]
if self.use_logit:
y_reg = np.clip(y_reg, self.logit_eps, 1 - self.logit_eps)
y_reg = np.log(y_reg / (1 - y_reg))
self.regressor_.fit(X[mask_pos], y_reg)
return self
def predict(self, X):
if self._constant_proba is not None:
proba = np.full(len(X), self._constant_proba, dtype=float)
else:
check_is_fitted(self, ["classifier_"])
if self.use_proba and hasattr(self.classifier_, "predict_proba"):
proba = self.classifier_.predict_proba(X)[:, 1] # type: ignore
else:
proba = self.classifier_.predict(X) # type: ignore
proba = np.asarray(proba, dtype=float)
if self.regressor_ is None:
reg_pred = np.zeros(len(proba), dtype=float)
else:
reg_pred = np.asarray(self.regressor_.predict(X), dtype=float)
if self.use_logit:
reg_pred = 1 / (1 + np.exp(-reg_pred))
reg_pred = np.clip(reg_pred, 0, 1)
if self.use_proba:
preds = proba * reg_pred
else:
preds = np.where(proba >= self.positive_threshold, reg_pred, 0.0)
return preds
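# Minimal fit/predict wrapper giving CatBoostRegressor a scikit-learn style
# interface (get_params/set_params so clone works). make_model uses it when
# the installed catboost does not expose __sklearn_tags__, which recent
# scikit-learn releases expect from estimators used inside meta-estimators.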
class CatBoostRegressorWrapper(BaseEstimator, RegressorMixin):
def __init__(self, **params: float | int | str):
self.params = dict(params)
self.model_ = None
def fit(self, X, y, **fit_params):
from catboost import CatBoostRegressor
self.model_ = CatBoostRegressor(**self.params) # type: ignore
self.model_.fit(X, y, **fit_params)
return self
def predict(self, X):
if self.model_ is None:
raise ValueError("CatBoostRegressorWrapper n'est pas entraîné.")
return self.model_.predict(X)
def get_params(self, deep: bool = True):
return dict(self.params)
def set_params(self, **params):
self.params.update(params)
return self
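# Build the full pipeline for one candidate: median imputation + standard
# scaling on the numeric features, then a MultiOutputRegressor (one estimator
# per target column) around the chosen base model. Returns None when an
# optional backend is not importable so the caller can skip that candidate.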
def make_model(model_name: str, feature_cols: List[str], params: Dict[str, object]) -> Optional[Pipeline]:
preprocessor = make_preprocessor(feature_cols)
if model_name == "ridge":
estimator = Ridge(**params) # type: ignore
elif model_name == "hist_gradient_boosting":
estimator = HistGradientBoostingRegressor(random_state=42, **params) # type: ignore
elif model_name == "lightgbm":
try:
from lightgbm import LGBMRegressor
except Exception:
LOGGER.info("LightGBM indisponible, ignoré.")
return None
estimator = LGBMRegressor(random_state=42, force_row_wise=True, verbosity=-1, **params) # type: ignore
elif model_name == "xgboost":
try:
from xgboost import XGBRegressor
except Exception:
LOGGER.info("XGBoost indisponible, ignoré.")
return None
estimator = XGBRegressor(random_state=42, **params)
elif model_name == "two_stage_hgb":
clf_params = params.get("clf_params", {})
reg_params = params.get("reg_params", {})
estimator = TwoStageRegressor(
classifier=HistGradientBoostingClassifier(random_state=42, **clf_params), # type: ignore
regressor=HistGradientBoostingRegressor(random_state=42, **reg_params), # type: ignore
epsilon=params.get("epsilon", 1e-4), # type: ignore
positive_threshold=params.get("positive_threshold", 0.5), # type: ignore
use_proba=bool(params.get("use_proba", True)),
use_logit=bool(params.get("use_logit", True)),
logit_eps=params.get("logit_eps", 1e-6), # type: ignore
)
elif model_name == "catboost":
try:
from catboost import CatBoostRegressor
except Exception:
LOGGER.info("CatBoost indisponible, ignoré.")
return None
if not hasattr(CatBoostRegressor, "__sklearn_tags__"):
estimator = CatBoostRegressorWrapper(verbose=0, random_state=42, **params) # type: ignore
else:
estimator = CatBoostRegressor(verbose=0, random_state=42, **params) # type: ignore
else:
raise ValueError(f"Modèle inconnu: {model_name}")
# n_jobs=1 to avoid process-based parallelism issues in some environments.
model = MultiOutputRegressor(estimator, n_jobs=1) # type: ignore
return Pipeline(
steps=[
("preprocess", preprocessor),
("model", model),
]
)
def evaluate(model: Pipeline, X, y_true: np.ndarray) -> Dict[str, float]:
if X is None or len(X) == 0:
return {"mae_mean": np.nan}
y_pred = model.predict(X)
return regression_metrics(y_true, y_pred) # type: ignore
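# Event-based cross-validation: for every fold from build_event_folds, clone
# and refit the pipeline on the training events, score the held-out events,
# then average each metric across folds (nanmean) under a `cv_` prefix and
# report the number of folds actually used.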
def evaluate_cv(
model: Pipeline,
df: pd.DataFrame,
feature_cols: List[str],
n_splits: int,
target_cols: List[str],
) -> Dict[str, float]:
folds = build_event_folds(df, n_splits)
if not folds:
return {"folds_used": 0}
metrics_acc: Dict[str, list[float]] = {}
for train_idx, test_idx in folds:
model_clone = clone(model)
X_train = df.iloc[train_idx][feature_cols]
y_train = df.iloc[train_idx][target_cols].values
X_test = df.iloc[test_idx][feature_cols]
y_test = df.iloc[test_idx][target_cols].values
model_clone.fit(X_train, y_train)
fold_metrics = evaluate(model_clone, X_test, y_test)
for key, value in fold_metrics.items():
metrics_acc.setdefault(key, []).append(value)
summary = {f"cv_{k}": float(np.nanmean(v)) for k, v in metrics_acc.items()}
summary["folds_used"] = len(folds)
return summary
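# Re-run the event-based CV with the selected configuration and collect
# out-of-fold residuals (prediction minus truth) per category. The residuals
# are summarised as the requested quantiles plus mean/std/n, which downstream
# code can use as empirical prediction intervals.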
def compute_cv_residual_intervals(
model: Pipeline,
df: pd.DataFrame,
feature_cols: List[str],
target_cols: List[str],
n_splits: int,
quantiles: Tuple[float, ...] = (0.05, 0.1, 0.9, 0.95),
) -> Dict[str, object]:
folds = build_event_folds(df, n_splits)
if not folds:
return {"folds_used": 0, "quantiles": list(quantiles), "residuals": {}}
residuals_by_cat: Dict[str, list[float]] = {cat: [] for cat in CANDIDATE_CATEGORIES}
for train_idx, test_idx in folds:
model_clone = clone(model)
X_train = df.iloc[train_idx][feature_cols]
y_train = df.iloc[train_idx][target_cols].values
X_test = df.iloc[test_idx][feature_cols]
y_test = df.iloc[test_idx][target_cols].values
model_clone.fit(X_train, y_train)
y_pred = model_clone.predict(X_test)
y_pred = normalize_predictions(y_pred)
resid = y_pred - y_test
for idx, cat in enumerate(CANDIDATE_CATEGORIES):
residuals_by_cat[cat].extend(resid[:, idx].tolist())
quantile_keys = [f"q{int(q * 100):02d}" for q in quantiles]
summary: Dict[str, Dict[str, float]] = {}
for cat, values in residuals_by_cat.items():
arr = np.asarray(values, dtype=float)
if arr.size == 0:
continue
q_vals = np.quantile(arr, quantiles).tolist()
entry = {key: float(val) for key, val in zip(quantile_keys, q_vals)}
entry["mean"] = float(np.mean(arr))
entry["std"] = float(np.std(arr))
entry["n"] = int(arr.size)
summary[cat] = entry
return {
"folds_used": len(folds),
"quantiles": list(quantiles),
"residuals": summary,
}
def add_cv_selection_helpers(cv_summary: pd.DataFrame) -> pd.DataFrame:
work = cv_summary.copy()
block_cols = [c for c in work.columns if c.startswith("cv_mae_") and c != "cv_mae_mean"]
if block_cols:
work["worst_block_mae"] = work[block_cols].max(axis=1)
if "cv_bias" in work.columns:
work["bias_abs"] = work["cv_bias"].abs()
return work
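# Model selection: when possible, keep only configurations with
# |cv_bias| <= 0.02, then sort ascending by cv_mae_mean, worst per-block MAE,
# absolute bias, cv_rmse and cv_smape (whichever columns exist) and return the
# winning model name and its params.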
def select_best_model(cv_summary: pd.DataFrame) -> Tuple[str, Dict[str, object]]:
if cv_summary.empty:
raise RuntimeError("Aucun modèle évalué.")
work = add_cv_selection_helpers(cv_summary)
bias_threshold = 0.02
candidates = work
if "bias_abs" in work.columns:
filtered = work[work["bias_abs"] <= bias_threshold]
if not filtered.empty:
candidates = filtered
sort_cols = [c for c in ["cv_mae_mean", "worst_block_mae", "bias_abs", "cv_rmse", "cv_smape"] if c in candidates.columns]
best_row = candidates.sort_values(sort_cols, na_position="last").iloc[0]
return str(best_row["model"]), dict(best_row["params"])
def save_metrics(
metrics: Dict[str, Dict[str, Dict[str, float]]],
output_dir: Path,
cv_summary: pd.DataFrame | None = None,
) -> None:
output_dir.mkdir(parents=True, exist_ok=True)
with (output_dir / "metrics.json").open("w", encoding="utf-8") as f:
json.dump(metrics, f, indent=2)
if cv_summary is not None and not cv_summary.empty:
cv_summary.to_csv(output_dir / "cv_summary.csv", index=False)
lines = ["# Métriques (parts, 0-1)\n"]
for model_name, splits in metrics.items():
lines.append(f"## {model_name}")
for split, vals in splits.items():
lines.append(
f"- {split} mae_mean: {vals.get('mae_mean', float('nan')):.4f}, "
f"rmse: {vals.get('rmse', float('nan')):.4f}, "
f"wape: {vals.get('wape', float('nan')):.4f}, "
f"winner_acc: {vals.get('winner_accuracy', float('nan')):.3f}"
)
lines.append("")
(output_dir / "metrics.md").write_text("\n".join(lines), encoding="utf-8")
def save_model_card(
model_name: str,
cfg: SplitConfig,
feature_cols: List[str],
metrics: Dict[str, Dict[str, Dict[str, float]]],
output_dir: Path,
) -> None:
lines = [
"# Model card",
f"- Modèle: {model_name}",
f"- Split temporel: train<= {cfg.train_end_year}, valid<= {cfg.valid_end_year}, test>= {cfg.test_start_year}",
f"- Features: {len(feature_cols)} colonnes numériques (lags, écarts national, swing, turnout)",
"- Cibles: parts par bloc (7 catégories) renormalisées.",
"- Métriques principales (MAE moyen, jeux valid/test):",
f" - Valid: {metrics[model_name]['valid'].get('mae_mean', float('nan')):.4f}",
f" - Test: {metrics[model_name]['test'].get('mae_mean', float('nan')):.4f}",
]
output_dir.mkdir(parents=True, exist_ok=True)
(output_dir / "model_card.md").write_text("\n".join(lines), encoding="utf-8")
def plot_mae_per_category(model_name: str, mae_scores: Dict[str, float], output_dir: Path) -> None:
try:
import matplotlib.pyplot as plt
except Exception:
LOGGER.warning("Matplotlib indisponible, skip figure.")
return
if not all(f"mae_{c}" in mae_scores for c in CANDIDATE_CATEGORIES):
LOGGER.warning("Scores MAE par categorie indisponibles, skip figure.")
return
cats = CANDIDATE_CATEGORIES
values = [mae_scores[f"mae_{c}"] for c in cats]
plt.figure(figsize=(8, 4))
plt.bar(cats, values, color="#2c7fb8")
plt.xticks(rotation=30, ha="right")
plt.ylabel("MAE (part)")
plt.title(f"MAE par catégorie - {model_name}")
output_dir.mkdir(parents=True, exist_ok=True)
plt.tight_layout()
plt.savefig(output_dir / "mae_per_category.png")
plt.close()
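# Example invocation (paths and values are illustrative; they match the CLI
# defaults below, and the script adds the project root to sys.path itself):
#   python src/model/train.py --panel data/processed/panel.parquet \
#       --train-end-year 2019 --valid-end-year 2021 --test-start-year 2022 \
#       --models ridge hist_gradient_boosting --cv-splits 4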
def main() -> None:
parser = argparse.ArgumentParser(description="Entraînement et évaluation temporelle multi-blocs.")
parser.add_argument("--panel", type=Path, default=Path("data/processed/panel.parquet"), help="Dataset panel parquet.")
parser.add_argument("--models-dir", type=Path, default=Path("models"), help="Répertoire de sauvegarde des modèles.")
parser.add_argument("--reports-dir", type=Path, default=Path("reports"), help="Répertoire de sortie des rapports.")
parser.add_argument("--train-end-year", type=int, default=2019, help="Dernière année incluse dans le train.")
parser.add_argument("--valid-end-year", type=int, default=2021, help="Dernière année incluse dans la validation.")
parser.add_argument("--test-start-year", type=int, default=2022, help="Première année du test (inclusif).")
parser.add_argument("--cv-splits", type=int, default=4, help="Nombre de folds temporels pour la CV par scrutin.")
parser.add_argument("--no-tune", action="store_true", help="Désactiver la recherche d'hyperparamètres.")
parser.add_argument("--max-trials", type=int, default=0, help="Limiter le nombre d'essais par modèle (0=all).")
parser.add_argument(
"--models",
nargs="+",
default=list(MODEL_GRIDS.keys()),
help="Liste des modèles à tester (ridge, hist_gradient_boosting, lightgbm, xgboost, two_stage_hgb, catboost).",
)
args = parser.parse_args()
logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
cfg = SplitConfig(train_end_year=args.train_end_year, valid_end_year=args.valid_end_year, test_start_year=args.test_start_year)
panel = load_panel(args.panel)
panel = panel.dropna(subset=TARGET_COLS)
feature_cols = get_feature_columns(panel)
all_na = [c for c in feature_cols if panel[c].isna().all()]
if all_na:
LOGGER.warning("Features supprimées car entièrement NA: %s", all_na)
feature_cols = [c for c in feature_cols if c not in all_na]
train_df, valid_df, test_df = temporal_split(panel, cfg)
train_valid_df = panel[panel["election_year"] < cfg.test_start_year].copy().reset_index(drop=True)
models_to_run = [m for m in args.models if m in MODEL_GRIDS]
if not models_to_run:
raise RuntimeError("Aucun modèle demandé n'est reconnu.")
cv_rows: List[Dict[str, object]] = []
if not args.no_tune:
rng = np.random.default_rng(42)
for model_name in models_to_run:
grid = MODEL_GRIDS[model_name]
if args.max_trials and len(grid) > args.max_trials:
indices = rng.choice(len(grid), size=args.max_trials, replace=False)
grid = [grid[i] for i in indices]
for params in grid:
model = make_model(model_name, feature_cols, params)
if model is None:
continue
cv_metrics = evaluate_cv(model, train_valid_df, feature_cols, args.cv_splits, TARGET_COLS)
row = {"model": model_name, "params": params, **cv_metrics}
cv_rows.append(row)
cv_summary = pd.DataFrame(cv_rows)
    # Guard against runs where no fold could be built: dropna on a missing
    # cv_mae_mean column would raise a KeyError.
    if not cv_summary.empty and "cv_mae_mean" in cv_summary.columns:
        cv_summary = cv_summary.dropna(subset=["cv_mae_mean"])
        cv_summary = add_cv_selection_helpers(cv_summary)
    else:
        cv_summary = pd.DataFrame()
if not cv_summary.empty:
best_model_name, best_params = select_best_model(cv_summary)
LOGGER.info("Meilleur modèle CV: %s %s", best_model_name, best_params)
else:
best_model_name = models_to_run[0]
best_params = MODEL_GRIDS[best_model_name][0]
LOGGER.warning("Pas de CV disponible, fallback sur %s %s", best_model_name, best_params)
residual_payload = {}
model_for_intervals = make_model(best_model_name, feature_cols, best_params)
if model_for_intervals is not None and not train_valid_df.empty:
residual_payload = compute_cv_residual_intervals(
model_for_intervals,
train_valid_df,
feature_cols,
TARGET_COLS,
args.cv_splits,
)
if residual_payload.get("residuals"):
args.reports_dir.mkdir(parents=True, exist_ok=True)
(args.reports_dir / "residual_intervals.json").write_text(
json.dumps(
{
"model": best_model_name,
**residual_payload,
},
indent=2,
),
encoding="utf-8",
)
X_train, y_train = train_df[feature_cols], train_df[TARGET_COLS].values
X_valid, y_valid = valid_df[feature_cols], valid_df[TARGET_COLS].values
X_test, y_test = test_df[feature_cols], test_df[TARGET_COLS].values
X_train_valid, y_train_valid = train_valid_df[feature_cols], train_valid_df[TARGET_COLS].values
eval_results: Dict[str, Dict[str, Dict[str, float]]] = {}
best_model_eval = make_model(best_model_name, feature_cols, best_params)
if best_model_eval is None:
raise RuntimeError(f"Modèle indisponible: {best_model_name}")
best_model_eval.fit(X_train, y_train)
eval_results[best_model_name] = {
"train": evaluate(best_model_eval, X_train, y_train),
"valid": evaluate(best_model_eval, X_valid, y_valid),
"test": evaluate(best_model_eval, X_test, y_test),
"train_valid": evaluate(best_model_eval, X_train_valid, y_train_valid),
}
best_model_final = make_model(best_model_name, feature_cols, best_params)
if best_model_final is None:
raise RuntimeError(f"Modèle indisponible: {best_model_name}")
best_model_final.fit(X_train_valid, y_train_valid)
args.models_dir.mkdir(parents=True, exist_ok=True)
joblib.dump(best_model_final, args.models_dir / f"{best_model_name}.joblib")
LOGGER.info("Modèle sauvegardé dans %s", args.models_dir / f"{best_model_name}.joblib")
(args.models_dir / "feature_columns.json").write_text(json.dumps(feature_cols, indent=2), encoding="utf-8")
(args.models_dir / "best_model.json").write_text(json.dumps({"name": best_model_name}, indent=2), encoding="utf-8")
save_metrics(eval_results, args.reports_dir, cv_summary=cv_summary)
plot_mae_per_category(best_model_name, eval_results[best_model_name]["test"], args.reports_dir / "figures")
save_model_card(best_model_name, cfg, feature_cols, eval_results, args.models_dir)
if __name__ == "__main__":
main()