SchemeImpactNet / src /model.py
sammeeer's picture
Inital schemeimpactnet deployment
f87e795
"""
model.py
--------
V4 Multi-Algorithm Model Selection for MNREGA district-level forecasting.
Algorithms compared via walk-forward CV:
- GradientBoostingRegressor (current champion)
- RandomForestRegressor
- XGBoost
- LightGBM
- Ridge (linear baseline)
- ElasticNet (regularised linear baseline)
Selection criterion: mean R² across walk-forward CV years (excl. 2022 anomaly).
Best model is saved to models/mnrega_best_model.pkl.
W&B logging:
- Each algorithm gets its own W&B run (group="mnrega_model_selection")
- Per-year CV metrics logged as time-series
- Feature importance logged as bar chart
- Model comparison summary table logged
- Best model flagged with tag "champion"
Usage:
export WANDB_API_KEY=your_key # or wandb login
python main.py --stage 3
"""
import os
import pickle
import warnings
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import Ridge, ElasticNet
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
warnings.filterwarnings("ignore")
# Optional imports — graceful fallback if not installed
try:
from xgboost import XGBRegressor
HAS_XGB = True
except ImportError:
HAS_XGB = False
print("[model] xgboost not installed β€” skipping")
try:
from lightgbm import LGBMRegressor
HAS_LGB = True
except ImportError:
HAS_LGB = False
print("[model] lightgbm not installed β€” skipping")
try:
import wandb
HAS_WANDB = True
except ImportError:
HAS_WANDB = False
print("[model] wandb not installed β€” metrics will be logged locally only")
from src.features import FEATURE_COLS
# Column that every candidate model is trained to predict.
TARGET = "person_days_lakhs"
# Output locations (relative paths, so they resolve against the current
# working directory of the process that imports this module).
FIGURES_DIR = os.path.join("reports", "figures")
OUTPUT_DIR = os.path.join("data", "processed")
MODELS_DIR = "models"
# Pickle bundle written by _save_model and read back by load_model.
MODEL_PATH = os.path.join(MODELS_DIR, "mnrega_best_model.pkl")
# Weights & Biases project / run-group names used by _wandb_log_all.
WANDB_PROJECT = "SchemeImpactNet"
WANDB_GROUP = "mnrega_model_selection"
# Import-time side effect: make sure every output directory exists.
os.makedirs(FIGURES_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(MODELS_DIR, exist_ok=True)
# Walk-forward CV test years
WF_TEST_YEARS = [2018, 2019, 2020, 2021, 2022, 2023, 2024]
# ── Algorithm registry ────────────────────────────────────────────────────────
def _build_candidates() -> dict:
    """
    Assemble the registry of candidate regressors.

    Returns:
        dict mapping algorithm name -> unfitted estimator. Linear models are
        wrapped in a Pipeline that standardises the inputs first; XGBoost and
        LightGBM are only registered when their import succeeded.
    """

    def _scaled(linear_model):
        # Linear baselines get a StandardScaler in front of them.
        return Pipeline([
            ("scaler", StandardScaler()),
            ("model", linear_model),
        ])

    # Settings shared by the two optional boosting libraries.
    boosting_kwargs = dict(
        n_estimators=200, max_depth=4, learning_rate=0.03,
        subsample=0.7, colsample_bytree=0.8,
        reg_alpha=0.1, reg_lambda=1.0,
        random_state=42,
    )

    # Insertion order is kept deliberately: it drives tie-breaking and the
    # order in which models appear in tables and charts.
    registry = {}
    registry["GradientBoosting"] = GradientBoostingRegressor(
        n_estimators=200, max_depth=4, learning_rate=0.03,
        subsample=0.7, min_samples_leaf=10, random_state=42,
    )
    registry["RandomForest"] = RandomForestRegressor(
        n_estimators=300, max_depth=8, min_samples_leaf=10,
        n_jobs=-1, random_state=42,
    )
    registry["Ridge"] = _scaled(Ridge(alpha=10.0))
    registry["ElasticNet"] = _scaled(
        ElasticNet(alpha=0.1, l1_ratio=0.5, max_iter=2000)
    )

    if HAS_XGB:
        registry["XGBoost"] = XGBRegressor(verbosity=0, **boosting_kwargs)
    if HAS_LGB:
        registry["LightGBM"] = LGBMRegressor(verbosity=-1, **boosting_kwargs)

    return registry
# ── Main entry point ──────────────────────────────────────────────────────────
def run_model(df: pd.DataFrame) -> pd.DataFrame:
    """
    Run the full model-selection pipeline.

    Steps:
      1. Walk-forward CV for each algorithm candidate
      2. Select best by mean R² (excl. 2022)
      3. Train winner on all data
      4. Save model + metadata pkl
      5. Log to W&B (only when wandb could be imported)
      6. Generate figures
      7. Save predictions CSV + text report

    Args:
        df: Input table. Must contain the TARGET column, "financial_year",
            "state", "district", "lag1_pd" and the feature columns.

    Returns:
        The predictions DataFrame produced by ``_predict_all``.

    Raises:
        ValueError: if none of the expected feature columns is present.
    """
    print("\n[model] ── V4 Multi-Algorithm Model Selection ───────────────")
    features = _get_features(df)
    # Build the registry once; the same instances are used for the log line,
    # for cross-validation (which copies them) and for the final fit.
    candidates = _build_candidates()
    print(f"[model] Features ({len(features)}): {features}")
    print(f"[model] Algorithms: {list(candidates.keys())}")
    if not features:
        raise ValueError("[model] No usable feature columns found in df")

    # ── Walk-forward CV for all candidates ────────────────────────────────
    all_cv_results = {}
    for name, estimator in candidates.items():
        print(f"\n[model] ── {name} ──")
        all_cv_results[name] = _walk_forward_cv(df, features, estimator, name)

    # ── Select best model ─────────────────────────────────────────────────
    best_name, best_cv = _select_best(all_cv_results)
    print(f"\n[model] ✓ Best model: {best_name}")

    # ── Print full comparison table ───────────────────────────────────────
    _print_comparison_table(all_cv_results)

    # ── Train winner on all data ──────────────────────────────────────────
    print(f"\n[model] Training {best_name} on all {len(df):,} district-years...")
    best_estimator = candidates[best_name]
    X_all = df[features].fillna(0)
    y_all = df[TARGET]
    best_estimator.fit(X_all, y_all)

    # ── Save best model ───────────────────────────────────────────────────
    # Persist before talking to W&B so that a failure in the optional remote
    # logging step cannot lose the freshly trained model.
    _save_model(best_name, best_estimator, features, best_cv, all_cv_results, df)

    # ── Log to W&B ────────────────────────────────────────────────────────
    if HAS_WANDB:
        _wandb_log_all(all_cv_results, best_name, best_estimator, features, df)

    # ── Figures ───────────────────────────────────────────────────────────
    _plot_model_comparison(all_cv_results, best_name)
    _plot_cv_per_year(all_cv_results, best_name)
    _plot_feature_importance(best_name, best_estimator, features)

    # ── Predictions + report ──────────────────────────────────────────────
    predictions_df = _predict_all(best_estimator, df, features)
    _save_predictions(predictions_df)
    _save_model_report(best_name, best_cv, all_cv_results, features, best_estimator)

    print("\n[model] ── V4 Pipeline Complete ─────────────────────────────\n")
    return predictions_df
# ── Walk-forward CV ───────────────────────────────────────────────────────────
def _walk_forward_cv(
    df: pd.DataFrame,
    features: list,
    estimator,
    name: str,
) -> pd.DataFrame:
    """Walk-forward CV: train on years < T, evaluate on T.

    For every year in WF_TEST_YEARS a fresh copy of ``estimator`` is fitted on
    all rows with an earlier "financial_year" and scored on that year's rows.
    Folds with fewer than 200 training rows or fewer than 50 test rows are
    skipped. Each fold is also scored against a naive baseline that predicts
    the "lag1_pd" column.

    Args:
        df: Input table with "financial_year", "lag1_pd", TARGET and features.
        features: Feature column names; missing values are filled with 0.
        estimator: Unfitted estimator (never fitted in place, only copied).
        name: Algorithm name, used in the "no usable fold" message.

    Returns:
        One row per evaluated year with columns year, n, r2, mae, rmse, mape,
        naive_r2, naive_mae, r2_gain, mae_gain. The columns are present even
        when no fold could be evaluated.
    """
    # Function-scope import (the module has no top-level ``import copy``),
    # hoisted out of the fold loop.
    import copy

    columns = [
        "year", "n", "r2", "mae", "rmse", "mape",
        "naive_r2", "naive_mae", "r2_gain", "mae_gain",
    ]

    print(f" {'Year':<6} {'n':>5} {'R²':>8} {'MAE':>8} {'RMSE':>8} {'Naive R²':>10} {'R² gain':>8}")
    print(f" {'-'*68}")

    rows = []
    for test_yr in WF_TEST_YEARS:
        tr = df[df["financial_year"] < test_yr]
        te = df[df["financial_year"] == test_yr]
        if len(tr) < 200 or len(te) < 50:
            continue

        # Deep copy so the caller's estimator instance stays unfitted.
        m = copy.deepcopy(estimator)
        m.fit(tr[features].fillna(0), tr[TARGET])

        y_true = te[TARGET]
        pred = m.predict(te[features].fillna(0))
        # Naive baseline: previous value; gaps fall back to the fold's mean target.
        naive = te["lag1_pd"].fillna(y_true.mean()).values

        r2 = r2_score(y_true, pred)
        mae = mean_absolute_error(y_true, pred)
        rmse = np.sqrt(mean_squared_error(y_true, pred))
        naive_r2 = r2_score(y_true, naive)
        naive_mae = mean_absolute_error(y_true, naive)
        # The 1e-9 only prevents division by zero; near-zero targets still
        # produce very large percentage errors.
        mape = np.mean(np.abs((y_true.values - pred) / (y_true.values + 1e-9))) * 100

        print(f" {test_yr:<6} {len(te):>5} {r2:>8.4f} {mae:>8.3f} {rmse:>8.3f} "
              f"{naive_r2:>10.4f} {r2-naive_r2:>+8.4f}")
        rows.append({
            "year": test_yr, "n": len(te),
            "r2": round(r2, 4),
            "mae": round(mae, 3),
            "rmse": round(rmse, 3),
            "mape": round(mape, 3),
            "naive_r2": round(naive_r2, 4),
            "naive_mae": round(naive_mae, 3),
            "r2_gain": round(r2 - naive_r2, 4),
            "mae_gain": round(naive_mae - mae, 3),
        })

    cv = pd.DataFrame(rows, columns=columns)
    if cv.empty:
        # Keep the schema (numeric dtype) so callers can still index cv["year"]
        # and take means (NaN) instead of hitting a KeyError.
        print(f" [model] {name}: no walk-forward fold had enough rows to evaluate")
        return cv.astype(float)

    ex22 = cv[cv["year"] != 2022]
    print(f" → Mean R²={cv['r2'].mean():.4f} excl.2022 R²={ex22['r2'].mean():.4f} "
          f"MAE={cv['mae'].mean():.3f}L")
    return cv
# ── Model selection ───────────────────────────────────────────────────────────
def _select_best(all_cv: dict) -> tuple:
"""Select best model by mean RΒ² excluding 2022 anomaly year."""
scores = {}
for name, cv in all_cv.items():
ex22 = cv[cv["year"] != 2022]
scores[name] = ex22["r2"].mean()
best_name = max(scores, key=scores.get)
print(f"\n[model] Model selection (mean RΒ² excl. 2022):")
for name, score in sorted(scores.items(), key=lambda x: -x[1]):
marker = " ← BEST" if name == best_name else ""
print(f" {name:<20}: {score:.4f}{marker}")
return best_name, all_cv[best_name]
def _print_comparison_table(all_cv: dict) -> None:
print(f"\n[model] Full comparison (all years):")
print(f" {'Model':<20} {'RΒ²':>8} {'excl22 RΒ²':>10} {'MAE':>8} {'RMSE':>8} {'RΒ²gain':>8}")
print(f" {'-'*72}")
for name, cv in all_cv.items():
ex22 = cv[cv["year"] != 2022]
print(f" {name:<20} {cv['r2'].mean():>8.4f} {ex22['r2'].mean():>10.4f} "
f"{cv['mae'].mean():>8.3f} {cv['rmse'].mean():>8.3f} "
f"{cv['r2_gain'].mean():>+8.4f}")
# ── W&B logging ───────────────────────────────────────────────────────────────
def _wandb_log_all(
    all_cv: dict,
    best_name: str,
    best_estimator,
    features: list,
    df: pd.DataFrame,
) -> None:
    """Log all model results to W&B — one run per algorithm + one summary run.

    Args:
        all_cv: Mapping of algorithm name -> walk-forward CV DataFrame.
        best_name: Name of the selected model; its run is tagged "champion".
        best_estimator: Fitted winner, used for the feature-importance chart.
        features: Feature column names the models were trained on.
        df: Full input table; only used for dataset-size summary values.
    """
    for name, cv in all_cv.items():
        _wandb_log_algorithm_run(name, cv, best_name, best_estimator, features, df)
    _wandb_log_summary_run(all_cv, best_name)
    print(f"[model] W&B logs complete → project: {WANDB_PROJECT} / group: {WANDB_GROUP}")


def _wandb_log_algorithm_run(
    name: str,
    cv: pd.DataFrame,
    best_name: str,
    best_estimator,
    features: list,
    df: pd.DataFrame,
) -> None:
    """Log one algorithm's walk-forward CV results as its own W&B run."""
    is_best = name == best_name
    ex22 = cv[cv["year"] != 2022]
    run = wandb.init(
        project=WANDB_PROJECT,
        group=WANDB_GROUP,
        name=name,
        tags=["champion"] if is_best else [],
        config={
            "algorithm": name,
            "n_features": len(features),
            "features": features,
            "wf_test_years": WF_TEST_YEARS,
            "target": TARGET,
            "is_best": is_best,
        },
        reinit=True,
    )
    # try/finally: always close the run, even if a log call raises, so that it
    # cannot stay open while the next run is initialised.
    try:
        # Per-year CV metrics as time series
        for _, row in cv.iterrows():
            run.log({
                "year": int(row["year"]),
                "r2": row["r2"],
                "mae": row["mae"],
                "rmse": row["rmse"],
                "mape": row["mape"],
                "naive_r2": row["naive_r2"],
                "r2_gain": row["r2_gain"],
                "mae_gain": row["mae_gain"],
                "is_anomaly_year": int(row["year"]) == 2022,
            })

        # Summary metrics
        run.summary.update({
            "cv_mean_r2": round(cv["r2"].mean(), 4),
            "cv_ex22_r2": round(ex22["r2"].mean(), 4),
            "cv_mean_mae": round(cv["mae"].mean(), 3),
            "cv_mean_rmse": round(cv["rmse"].mean(), 3),
            "cv_mean_mape": round(cv["mape"].mean(), 3),
            "cv_r2_gain": round(cv["r2_gain"].mean(), 4),
            "n_districts": df["district"].nunique(),
            "n_states": df["state"].nunique(),
            "train_years": len(df["financial_year"].unique()),
        })

        # Feature importance: only the winner has a fitted estimator available.
        if is_best:
            fi = _get_feature_importance(name, best_estimator, features)
            if fi is not None:
                fi_table = wandb.Table(
                    columns=["feature", "importance"],
                    data=[[f, v] for f, v in sorted(fi.items(), key=lambda x: -x[1])],
                )
                run.log({"feature_importance": wandb.plot.bar(
                    fi_table, "feature", "importance",
                    title=f"Feature Importance — {name}",
                )})

        # CV R² chart per year
        cv_table = wandb.Table(
            dataframe=cv[["year", "r2", "naive_r2", "mae", "rmse", "r2_gain"]]
        )
        run.log({
            "cv_results_table": cv_table,
            "cv_r2_chart": wandb.plot.line_series(
                xs=cv["year"].tolist(),
                ys=[cv["r2"].tolist(), cv["naive_r2"].tolist()],
                keys=["Model R²", "Naive R²"],
                title=f"Walk-Forward CV R² — {name}",
                xname="Financial Year",
            ),
        })
    finally:
        run.finish()


def _wandb_log_summary_run(all_cv: dict, best_name: str) -> None:
    """Log the cross-model comparison table and bar chart as a summary run."""
    summary_rows = []
    for name, cv in all_cv.items():
        ex22 = cv[cv["year"] != 2022]
        summary_rows.append([
            name,
            round(cv["r2"].mean(), 4),
            round(ex22["r2"].mean(), 4),
            round(cv["mae"].mean(), 3),
            round(cv["rmse"].mean(), 3),
            round(cv["mape"].mean(), 3),
            round(cv["r2_gain"].mean(), 4),
            name == best_name,
        ])

    best_cv = all_cv[best_name]
    best_ex22_r2 = round(best_cv[best_cv["year"] != 2022]["r2"].mean(), 4)

    run = wandb.init(
        project=WANDB_PROJECT,
        group=WANDB_GROUP,
        name="model_selection_summary",
        tags=["summary"],
        reinit=True,
    )
    try:
        summary_table = wandb.Table(
            columns=["model", "mean_r2", "ex22_r2", "mean_mae",
                     "mean_rmse", "mean_mape", "r2_gain", "is_best"],
            data=summary_rows,
        )
        run.log({
            "model_comparison": summary_table,
            "best_model": best_name,
            "best_ex22_r2": best_ex22_r2,
        })
        # Comparison bar chart
        run.log({
            "r2_comparison": wandb.plot.bar(
                wandb.Table(
                    columns=["model", "ex22_r2"],
                    data=[[r[0], r[2]] for r in summary_rows],
                ),
                "model", "ex22_r2",
                title="Model Comparison — R² excl. 2022",
            )
        })
    finally:
        run.finish()
# ── Figures ───────────────────────────────────────────────────────────────────
def _plot_model_comparison(all_cv: dict, best_name: str) -> None:
    """Bar chart comparing all models on mean R² (all years and excl. 2022).

    Left panel: mean R² per model, over all folds and excluding 2022.
    Right panel: mean MAE per model with the winner highlighted.
    The figure is written to FIGURES_DIR/06_model_comparison.png.

    Args:
        all_cv: Mapping of algorithm name -> walk-forward CV DataFrame.
        best_name: Name of the selected model (must be a key of ``all_cv``).
    """
    names = list(all_cv.keys())
    mean_r2 = [all_cv[n]["r2"].mean() for n in names]
    ex22_r2 = [all_cv[n][all_cv[n]["year"] != 2022]["r2"].mean() for n in names]
    mean_mae = [all_cv[n]["mae"].mean() for n in names]
    x = np.arange(len(names))
    w = 0.35

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

    ax1.bar(x - w/2, mean_r2, w, label="All years", alpha=0.8, color="#42A5F5")
    ax1.bar(x + w/2, ex22_r2, w, label="excl. 2022", alpha=0.8, color="#26A69A")
    ax1.set_xticks(x)
    ax1.set_xticklabels(names, rotation=20, ha="right")
    ax1.set_ylabel("Mean R² (Walk-Forward CV)")
    ax1.set_title("Model Comparison — R² Score")
    ax1.set_ylim(0, 1)
    ax1.legend()

    # Annotate best (placed just above its "excl. 2022" bar)
    best_idx = names.index(best_name)
    ax1.annotate("★ BEST", xy=(best_idx + w/2, ex22_r2[best_idx] + 0.01),
                 ha="center", color="#E53935", fontsize=9, fontweight="bold")

    mae_bars = ax2.bar(x, mean_mae, alpha=0.8,
                       color=["#E53935" if n == best_name else "#78909C" for n in names])
    ax2.set_xticks(x)
    ax2.set_xticklabels(names, rotation=20, ha="right")
    ax2.set_ylabel("Mean MAE (lakh person-days)")
    ax2.set_title("Model Comparison — MAE")
    for bar in mae_bars:
        ax2.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.1,
                 f"{bar.get_height():.2f}", ha="center", va="bottom", fontsize=8)

    fig.suptitle("SchemeImpactNet V4 — Algorithm Selection Results", fontsize=12, fontweight="bold")
    fig.tight_layout()
    path = os.path.join(FIGURES_DIR, "06_model_comparison.png")
    fig.savefig(path, dpi=150, bbox_inches="tight")
    # Close this specific figure rather than whichever one is current.
    plt.close(fig)
    print(f"[model] Saved: {path}")
def _plot_cv_per_year(all_cv: dict, best_name: str) -> None:
    """Line chart: R² and MAE per test year for every algorithm.

    The selected model is drawn with a thicker solid line; all others are
    dashed. The 2020 and 2022 test years are shaded on both panels.
    The figure is written to FIGURES_DIR/07_cv_per_year.png.

    Args:
        all_cv: Mapping of algorithm name -> walk-forward CV DataFrame.
        best_name: Name of the selected model.
    """
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    colors = plt.cm.tab10(np.linspace(0, 1, len(all_cv)))

    for (name, cv), color in zip(all_cv.items(), colors):
        is_best = name == best_name
        line_style = dict(
            marker="o", label=name, color=color,
            linewidth=2.5 if is_best else 1.2,
            linestyle="-" if is_best else "--",
            alpha=1.0 if is_best else 0.65,
        )
        axes[0].plot(cv["year"], cv["r2"], **line_style)
        axes[1].plot(cv["year"], cv["mae"], **line_style)

    for ax in axes:
        ax.axvspan(2021.5, 2022.5, alpha=0.08, color="red", label="2022 anomaly")
        ax.axvspan(2019.5, 2020.5, alpha=0.05, color="orange", label="COVID-2020")
        ax.set_xticks(WF_TEST_YEARS)
        ax.set_xlabel("Financial Year")
        ax.legend(fontsize=8)

    axes[0].set_ylabel("R²")
    axes[0].set_title("Walk-Forward CV R² by Year")
    axes[1].set_ylabel("MAE (lakh PD)")
    axes[1].set_title("Walk-Forward CV MAE by Year")

    fig.suptitle("All Models — Walk-Forward CV Results", fontsize=12, fontweight="bold")
    fig.tight_layout()
    path = os.path.join(FIGURES_DIR, "07_cv_per_year.png")
    fig.savefig(path, dpi=150, bbox_inches="tight")
    # Close this specific figure rather than whichever one is current.
    plt.close(fig)
    print(f"[model] Saved: {path}")
def _plot_feature_importance(name: str, estimator, features: list) -> None:
    """Horizontal bar chart of the best model's feature importances.

    Does nothing when ``_get_feature_importance`` returns None. Bars above
    the 75th percentile are drawn in red. The figure is written to
    FIGURES_DIR/08_feature_importance.png and the top five features are
    printed.

    Args:
        name: Algorithm name, used in the title and console output.
        estimator: Fitted estimator to inspect.
        features: Feature names, in the order the estimator was trained on.
    """
    fi = _get_feature_importance(name, estimator, features)
    if fi is None:
        return

    imp = pd.Series(fi).sort_values()
    # Loop-invariant: compute the highlight threshold once.
    threshold = imp.quantile(0.75)
    colors = ["#E53935" if value > threshold else "#42A5F5" for value in imp]

    fig, ax = plt.subplots(figsize=(8, max(5, len(imp) * 0.35)))
    imp.plot(kind="barh", ax=ax, color=colors)
    ax.set_title(f"Feature Importances — {name} (Best Model)")
    ax.set_xlabel("Importance Score")
    fig.tight_layout()
    path = os.path.join(FIGURES_DIR, "08_feature_importance.png")
    fig.savefig(path, dpi=150, bbox_inches="tight")
    # Close this specific figure rather than whichever one is current.
    plt.close(fig)
    print(f"[model] Saved: {path}")

    print(f"\n[model] Top 5 features ({name}):")
    for feat, val in imp.sort_values(ascending=False).head(5).items():
        print(f" {feat:<35}: {val:.4f}")
def _get_feature_importance(name: str, estimator, features: list):
"""Extract feature importance β€” works for tree models and linear models."""
if estimator is None:
return None
try:
# Tree-based: direct feature_importances_
if hasattr(estimator, "feature_importances_"):
return dict(zip(features, estimator.feature_importances_))
# Pipeline with tree inside
if hasattr(estimator, "named_steps"):
inner = list(estimator.named_steps.values())[-1]
if hasattr(inner, "feature_importances_"):
return dict(zip(features, inner.feature_importances_))
if hasattr(inner, "coef_"):
return dict(zip(features, np.abs(inner.coef_)))
# XGBoost / LightGBM
if hasattr(estimator, "feature_importances_"):
return dict(zip(features, estimator.feature_importances_))
except Exception:
pass
return None
# ── Model persistence ─────────────────────────────────────────────────────────
def _save_model(
    best_name: str,
    best_estimator,
    features: list,
    best_cv: pd.DataFrame,
    all_cv: dict,
    df: pd.DataFrame,
) -> None:
    """Pickle the winning estimator together with its metadata to MODEL_PATH.

    The bundle is a plain dict holding the fitted model, its name, the feature
    list, dataset descriptors, the winner's CV results and a per-model
    comparison summary.

    Args:
        best_name: Name of the selected algorithm.
        best_estimator: Fitted winning estimator.
        features: Feature column names used for training.
        best_cv: Walk-forward CV results of the winner.
        all_cv: Mapping of algorithm name -> CV DataFrame for every candidate.
        df: Training table (read for years / district / state counts).
    """
    ex22 = best_cv[best_cv["year"] != 2022]

    # Build comparison summary for the bundle
    comparison = {
        name: {
            "mean_r2": round(cv["r2"].mean(), 4),
            "ex22_r2": round(cv[cv["year"] != 2022]["r2"].mean(), 4),
            "mean_mae": round(cv["mae"].mean(), 3),
            "mean_rmse": round(cv["rmse"].mean(), 3),
        }
        for name, cv in all_cv.items()
    }

    bundle = {
        "model": best_estimator,
        "model_name": best_name,
        "features": features,
        "target": TARGET,
        # NOTE(review): hard-coded constant; its derivation is not visible in
        # this module. Confirm against whatever consumes the bundle.
        "covid_multiplier": 1.447,
        "train_years": sorted(df["financial_year"].unique().tolist()),
        "n_districts": df["district"].nunique(),
        "n_states": df["state"].nunique(),
        "feature_importance": _get_feature_importance(best_name, best_estimator, features),
        "cv_results": best_cv.to_dict(),
        "cv_mean_r2": round(best_cv["r2"].mean(), 4),
        "cv_ex22_r2": round(ex22["r2"].mean(), 4),
        "cv_mean_mae": round(best_cv["mae"].mean(), 3),
        "all_model_comparison": comparison,
    }
    with open(MODEL_PATH, "wb") as f:
        pickle.dump(bundle, f)

    print(f"\n[model] Model saved → {MODEL_PATH}")
    print(f"[model] Best: {best_name} | ex22 R²={ex22['r2'].mean():.4f} | MAE={best_cv['mae'].mean():.3f}L")
def load_model(path: str = MODEL_PATH) -> dict:
    """Load the saved best model bundle.

    Args:
        path: Location of the pickle written by ``_save_model``.

    Returns:
        The bundle dict (keys include "model", "model_name", "features",
        "cv_ex22_r2" and "cv_mean_mae").

    Raises:
        OSError: if ``path`` cannot be opened (e.g. FileNotFoundError).
    """
    # SECURITY: pickle.load can execute arbitrary code embedded in the file.
    # Only load bundles produced by this pipeline; never point this at a path
    # supplied by an untrusted source.
    with open(path, "rb") as f:
        bundle = pickle.load(f)
    print(f"[model] Loaded: {bundle['model_name']} from {path}")
    print(f"[model] ex22 R²={bundle['cv_ex22_r2']} | MAE={bundle['cv_mean_mae']}L")
    return bundle
# ── Prediction helpers ────────────────────────────────────────────────────────
def _predict_all(estimator, df: pd.DataFrame, features: list) -> pd.DataFrame:
    """Score every row of ``df`` and return identifiers, target and errors.

    Returns a copy of the state / district / financial_year / TARGET columns
    with three extra columns: the prediction rounded to 3 decimals, the signed
    error (actual minus prediction) and its absolute value.
    """
    feature_matrix = df[features].fillna(0)
    rounded_preds = estimator.predict(feature_matrix).round(3)

    result = df[["state", "district", "financial_year", TARGET]].copy()
    result["predicted_persondays"] = rounded_preds

    residual = (result[TARGET] - result["predicted_persondays"]).round(3)
    result["prediction_error"] = residual
    result["abs_error"] = result["prediction_error"].abs()
    return result
def _save_predictions(df: pd.DataFrame) -> None:
    """Write the predictions table to OUTPUT_DIR/mnrega_predictions.csv.

    Args:
        df: Predictions DataFrame; written without the index column.
    """
    path = os.path.join(OUTPUT_DIR, "mnrega_predictions.csv")
    df.to_csv(path, index=False)
    print(f"[model] Predictions saved → {path}")
# ── Report ────────────────────────────────────────────────────────────────────
def _save_model_report(
    best_name: str,
    best_cv: pd.DataFrame,
    all_cv: dict,
    features: list,
    best_estimator,
) -> None:
    """Write a plain-text model-selection report to reports/model_report.txt.

    The report has the algorithm comparison table, the winner's aggregate CV
    metrics, feature importances (when available) and the winner's
    year-by-year CV table.

    Args:
        best_name: Name of the selected algorithm.
        best_cv: Walk-forward CV results of the winner.
        all_cv: Mapping of algorithm name -> CV DataFrame for every candidate.
        features: Feature column names used for training.
        best_estimator: Fitted winning estimator (for feature importances).
    """
    ex22 = best_cv[best_cv["year"] != 2022]
    path = os.path.join("reports", "model_report.txt")
    os.makedirs("reports", exist_ok=True)

    # Explicit UTF-8: the report contains non-ASCII characters (R², —, ←, –)
    # that the platform default encoding may not be able to represent.
    with open(path, "w", encoding="utf-8") as f:
        f.write("SchemeImpactNet — V4 Model Selection Report\n")
        f.write("=" * 60 + "\n\n")
        f.write(f"Best Model : {best_name}\n")
        f.write(f"Selection : max mean R² excl. 2022 (walk-forward CV)\n")
        f.write(f"Features : {len(features)}\n")
        f.write(f"Evaluation : Walk-forward CV (2018–2024)\n\n")

        f.write("Algorithm Comparison:\n")
        f.write(f" {'Model':<20} {'R²':>8} {'ex22 R²':>10} {'MAE':>8} {'RMSE':>8}\n")
        f.write(f" {'-'*60}\n")
        for name, cv in all_cv.items():
            e22 = cv[cv["year"] != 2022]
            marker = " ← BEST" if name == best_name else ""
            f.write(f" {name:<20} {cv['r2'].mean():>8.4f} "
                    f"{e22['r2'].mean():>10.4f} {cv['mae'].mean():>8.3f} "
                    f"{cv['rmse'].mean():>8.3f}{marker}\n")

        f.write(f"\nBest Model ({best_name}) Walk-Forward CV:\n")
        f.write(f" Mean R² : {best_cv['r2'].mean():.4f}\n")
        f.write(f" excl.2022 R²: {ex22['r2'].mean():.4f}\n")
        f.write(f" Mean MAE : {best_cv['mae'].mean():.3f} lakh\n")
        f.write(f" Mean RMSE : {best_cv['rmse'].mean():.3f} lakh\n")
        f.write(f" R² gain : {best_cv['r2_gain'].mean():+.4f} vs naive lag-1\n\n")

        # Static context lines; the figures here are literals, not computed.
        f.write(f"Previous (leaked) R²: 0.9963\n")
        f.write(f"Leakage source: works_completed (r=1.0 with target)\n\n")
        f.write(f"2022 anomaly: West Bengal -93 to -98% reporting drop. Excl. R²={ex22['r2'].mean():.4f}\n\n")

        fi = _get_feature_importance(best_name, best_estimator, features)
        if fi:
            f.write("Feature Importances:\n")
            for feat, val in sorted(fi.items(), key=lambda x: -x[1]):
                f.write(f" {feat:<35} {val:.4f}\n")

        f.write(f"\nYear-by-year CV ({best_name}):\n")
        f.write(best_cv.to_string(index=False))

    print(f"[model] Report saved → {path}")
# ── Feature list helper ───────────────────────────────────────────────────────
def _get_features(df: pd.DataFrame) -> list:
    """Return the FEATURE_COLS entries that exist in ``df``, in registry order.

    Prints a warning listing any expected feature that is absent.
    """
    usable, absent = [], []
    for col in FEATURE_COLS:
        bucket = usable if col in df.columns else absent
        bucket.append(col)

    if absent:
        print(f"[model] Warning: {len(absent)} features not in df: {absent}")
    return usable