| """ |
| src/models/train_regressor.py |
| ================================ |
| Trains and evaluates flight price regression models: |
| - Linear Regression |
| - Random Forest Regressor |
| - Gradient Boosting Regressor |
| - ANN Regressor (ReLU + linear output) |
| |
| Selects best model by RMSE on held-out test set. |
| """ |
|
|
| import warnings |
| warnings.filterwarnings("ignore") |
|
|
| import json |
| import numpy as np |
| import pandas as pd |
| import matplotlib |
| matplotlib.use("Agg") |
| import matplotlib.pyplot as plt |
| from datetime import datetime |
| from src.utils.logger import get_logger |
| from src.models.model_selector import ( |
| sensitivity_analysis, save_model, save_metrics, walk_forward_cv |
| ) |
| from config.settings import ( |
| PROCESSED_DIR, FIGURES_DIR, METRICS_DIR, RANDOM_SEED, TEST_SIZE, |
| REGRESSOR_FEATURES, REGRESSOR_TARGET |
| ) |
|
|
| logger = get_logger(__name__) |
|
|
| try: |
| from sklearn.linear_model import LinearRegression |
| from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor |
| from sklearn.neural_network import MLPRegressor |
| from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score |
| SKLEARN_OK = True |
| except ImportError: |
| logger.warning("scikit-learn not installed — using numpy fallback regressors") |
| SKLEARN_OK = False |
|
|
|
|
| |
|
|
| class _NumpyLinearRegression: |
| def __init__(self): |
| self.w = None; self.b = 0.0 |
|
|
| def fit(self, X, y): |
| X_ = np.column_stack([np.ones(len(X)), X]) |
| try: |
| coef = np.linalg.lstsq(X_, y, rcond=None)[0] |
| except Exception: |
| coef = np.zeros(X_.shape[1]) |
| self.b = coef[0]; self.w = coef[1:] |
| return self |
|
|
| def predict(self, X): |
| return X @ self.w + self.b |
|
|
| def get_params(self, **kw): return {} |
|
|
|
|
| class _NumpyRFRegressor: |
| def __init__(self, n_estimators=30, max_depth=6, seed=42): |
| self.n_estimators = n_estimators |
| self.max_depth = max_depth |
| self.seed = seed |
| self.trees = [] |
|
|
| def _build_tree(self, X, y, depth=0): |
| if depth >= self.max_depth or len(y) < 5: |
| return {"leaf": True, "value": np.mean(y)} |
| rng = np.random.default_rng(self.seed + depth * 7) |
| n_feat = max(1, int(np.sqrt(X.shape[1]))) |
| feat_idx = rng.choice(X.shape[1], n_feat, replace=False) |
| best_fi, best_t, best_mse = 0, 0, float("inf") |
| for fi in feat_idx: |
| for t in np.percentile(X[:, fi], [25, 50, 75]): |
| l = y[X[:, fi] <= t]; r = y[X[:, fi] > t] |
| if len(l) == 0 or len(r) == 0: continue |
| mse = len(l) * l.var() + len(r) * r.var() |
| if mse < best_mse: best_mse, best_fi, best_t = mse, fi, t |
| lm = X[:, best_fi] <= best_t |
| return {"leaf": False, "feat": best_fi, "thresh": best_t, |
| "left": self._build_tree(X[lm], y[lm], depth+1), |
| "right": self._build_tree(X[~lm], y[~lm], depth+1)} |
|
|
| def _pred_tree(self, node, x): |
| if node["leaf"]: return node["value"] |
| if x[node["feat"]] <= node["thresh"]: return self._pred_tree(node["left"], x) |
| return self._pred_tree(node["right"], x) |
|
|
| def fit(self, X, y): |
| rng = np.random.default_rng(self.seed) |
| self.trees = [] |
| for i in range(self.n_estimators): |
| idx = rng.choice(len(X), len(X), replace=True) |
| self.trees.append(self._build_tree(X[idx], y[idx])) |
| self.feature_importances_ = np.ones(X.shape[1]) / X.shape[1] |
| return self |
|
|
| def predict(self, X): |
| preds = np.array([[self._pred_tree(t, x) for t in self.trees] |
| for x in X], dtype=float) |
| result = np.nanmean(preds, axis=1) |
| result = np.where(np.isfinite(result), result, 0.0) |
| return result |
|
|
| def get_params(self, **kw): return {"n_estimators": self.n_estimators} |
|
|
|
|
| class _NumpyANNRegressor: |
| """2-layer ANN regression with ReLU hidden, linear output.""" |
| def __init__(self, hidden=(16, 8), lr=0.001, n_iter=200, seed=42): |
| self.hidden = hidden; self.lr = lr; self.n_iter = n_iter; self.seed = seed |
| self.weights = []; self.biases = [] |
|
|
| def _relu(self, z): return np.maximum(0, z) |
| def _drelu(self, z): return (z > 0).astype(float) |
|
|
| def fit(self, X, y): |
| rng = np.random.default_rng(self.seed) |
| layers = [X.shape[1]] + list(self.hidden) + [1] |
| self.weights = [rng.normal(0, np.sqrt(2/layers[i]), (layers[i], layers[i+1])) |
| for i in range(len(layers)-1)] |
| self.biases = [np.zeros(layers[i+1]) for i in range(len(layers)-1)] |
| y_ = y.reshape(-1, 1).astype(float) |
| for _ in range(self.n_iter): |
| acts = [X] |
| zs = [] |
| for i, (W, b) in enumerate(zip(self.weights, self.biases)): |
| z = acts[-1] @ W + b |
| zs.append(z) |
| acts.append(self._relu(z) if i < len(self.weights)-1 else z) |
| delta = (acts[-1] - y_) / len(y_) |
| for i in reversed(range(len(self.weights))): |
| dW = acts[i].T @ delta |
| db = delta.mean(axis=0) |
| if i > 0: |
| delta = delta @ self.weights[i].T * self._drelu(zs[i-1]) |
| self.weights[i] -= self.lr * dW |
| self.biases[i] -= self.lr * db |
| return self |
|
|
| def predict(self, X): |
| a = X |
| for i, (W, b) in enumerate(zip(self.weights, self.biases)): |
| a = (self._relu(a @ W + b) if i < len(self.weights)-1 |
| else a @ W + b) |
| return a.flatten() |
|
|
| def get_params(self, **kw): return {"hidden": self.hidden, "lr": self.lr} |
|
|
|
|
| |
|
|
| def _compute_metrics(model, X_test, y_test, model_name, y_mean): |
| pred = model.predict(X_test) |
| |
| pred_dn = pred * y_mean["std"] + y_mean["mean"] |
| true_dn = y_test * y_mean["std"] + y_mean["mean"] |
|
|
| rmse = float(np.sqrt(np.mean((pred_dn - true_dn)**2))) |
| mae = float(np.mean(np.abs(pred_dn - true_dn))) |
| ss_res = np.sum((true_dn - pred_dn)**2) |
| ss_tot = np.sum((true_dn - true_dn.mean())**2) |
| r2 = float(1 - ss_res / (ss_tot + 1e-10)) |
|
|
| return {"model": model_name, |
| "rmse": round(rmse, 2), "mae": round(mae, 2), "r2": round(r2, 4), |
| "pred": pred_dn.tolist()[:100], "true": true_dn.tolist()[:100]} |
|
|
|
|
| |
|
|
def train_regressor() -> dict:
    """Train, select and persist flight-price regression models.

    Pipeline: load processed features, add a leakage-safe route target
    encoding, split chronologically, standardise X and y on train-set
    statistics, train all candidate models, select the best by test
    RMSE, optionally tune it with GridSearchCV, run walk-forward CV and
    sensitivity analysis, then save model, scaler, metrics and figures.

    Returns:
        The metrics/metadata dict for the best model, or {} if every
        candidate failed to train.
    """
    logger.info("=" * 60)
    logger.info("Training flight price regression models")
    logger.info("=" * 60)

    df = pd.read_csv(PROCESSED_DIR / "features_regression.csv", low_memory=False)
    logger.info("Loaded %d rows", len(df))

    # Build a route key; fall back to one bucket when columns are absent.
    if "origin" in df.columns and "destination" in df.columns:
        df["route"] = df["origin"] + "-" + df["destination"]
    elif "route" not in df.columns:
        df["route"] = "unknown"

    # Target-encode route using ONLY the training slice (no test leakage).
    n_train_prelim = int(len(df) * (1 - TEST_SIZE))
    train_part = df.iloc[:n_train_prelim]
    route_means = train_part.groupby("route")[REGRESSOR_TARGET].mean().to_dict()
    global_mean = float(train_part[REGRESSOR_TARGET].mean())
    df["route_mean_price"] = df["route"].map(route_means).fillna(global_mean)

    # Export a serving-time lookup of per-route mean prices.
    # FIX: r.split("-")[1] raised IndexError for dash-less route keys
    # (e.g. the "unknown" fallback); str.partition degrades gracefully.
    # Uses the module-level json import instead of re-importing locally.
    _route_lookup_out = {}
    for r, v in route_means.items():
        _origin, _, _destination = r.partition("-")
        _route_lookup_out[r] = {
            "mean": round(v, 2), "origin": _origin, "destination": _destination,
        }
    (METRICS_DIR / "route_price_lookup_v2.json").write_text(
        json.dumps({"routes": _route_lookup_out, "global_mean": round(global_mean, 2)}, indent=2)
    )
    logger.info("Route encoding added. Routes: %s", list(route_means.keys()))

    # Keep only configured features present in the data; the route
    # encoding always participates.
    avail_feats = [f for f in REGRESSOR_FEATURES if f in df.columns]
    if "route_mean_price" not in avail_feats:
        avail_feats = ["route_mean_price"] + avail_feats
    logger.info("Final feature set (%d): %s", len(avail_feats), avail_feats)

    X = df[avail_feats].fillna(0).values.astype(float)
    y = df[REGRESSOR_TARGET].fillna(df[REGRESSOR_TARGET].median()).values.astype(float)

    # Chronological split — assumes rows are time-ordered upstream
    # (TODO confirm); no shuffling so the test set is strictly "future".
    n_train = int(len(X) * (1 - TEST_SIZE))
    X_train, X_test = X[:n_train], X[n_train:]
    y_train, y_test = y[:n_train], y[n_train:]

    # Standardise features with train-set statistics only.
    X_mean = X_train.mean(axis=0)
    X_std = X_train.std(axis=0) + 1e-8
    X_train_n = (X_train - X_mean) / X_std
    X_test_n = (X_test - X_mean) / X_std

    # Standardise the target too; metrics de-normalise via this meta.
    y_mu = y_train.mean()
    y_sigma = y_train.std() + 1e-8
    y_train_n = (y_train - y_mu) / y_sigma
    y_test_n = (y_test - y_mu) / y_sigma
    y_norm_meta = {"mean": float(y_mu), "std": float(y_sigma)}

    logger.info("Train: %d | Test: %d | Price range: $%.0f–$%.0f",
                n_train, len(X_test), y.min(), y.max())

    # Candidate models: sklearn when available, numpy fallbacks otherwise.
    if SKLEARN_OK:
        candidates = {
            "Linear Regression": LinearRegression(),
            "Random Forest": RandomForestRegressor(
                n_estimators=100, max_depth=8, random_state=RANDOM_SEED),
            "Gradient Boosting": GradientBoostingRegressor(
                n_estimators=100, max_depth=4, learning_rate=0.1, random_state=RANDOM_SEED),
            "ANN": MLPRegressor(
                hidden_layer_sizes=(16, 8), activation="relu", max_iter=500,
                random_state=RANDOM_SEED, learning_rate_init=0.001),
        }
    else:
        candidates = {
            "Linear Regression": _NumpyLinearRegression(),
            "Random Forest": _NumpyRFRegressor(n_estimators=30, max_depth=6, seed=RANDOM_SEED),
            "ANN": _NumpyANNRegressor(hidden=(16, 8), lr=0.005, n_iter=150, seed=RANDOM_SEED),
        }

    results = []
    trained_models = {}

    for name, model in candidates.items():
        logger.info("Training: %s ...", name)
        try:
            model.fit(X_train_n, y_train_n)
            metrics = _compute_metrics(model, X_test_n, y_test_n, name, y_norm_meta)
            results.append(metrics)
            trained_models[name] = model
            logger.info(" RMSE=$%.2f | MAE=$%.2f | R²=%.4f",
                        metrics["rmse"], metrics["mae"], metrics["r2"])
        except Exception as e:
            # One failing model must not abort the whole run.
            logger.error("Training failed for %s: %s", name, e)

    if not results:
        return {}

    # Model selection: lowest RMSE on the held-out test set wins.
    best = min(results, key=lambda r: r["rmse"])
    best_name = best["model"]
    best_model = trained_models[best_name]
    logger.info("Best model: %s (RMSE=$%.2f, R²=%.4f)",
                best_name, best["rmse"], best["r2"])

    # ---- Optional hyper-parameter tuning of the winning model ----
    grid_search_results = {}
    if SKLEARN_OK:
        _param_grids_reg = {
            "Linear Regression": {},
            "Random Forest": {
                "n_estimators": [50, 100, 200], "max_depth": [5, 8, 12],
            },
            "Gradient Boosting": {
                "n_estimators": [50, 100], "max_depth": [3, 4],
                "learning_rate": [0.05, 0.1],
            },
            "ANN": {
                "hidden_layer_sizes": [(16, 8), (32, 16)],
                "learning_rate_init": [0.001, 0.005],
            },
        }
        if best_name in _param_grids_reg and _param_grids_reg[best_name]:
            try:
                from sklearn.model_selection import GridSearchCV as _GridSearchCV
                _base_map_reg = {
                    "Random Forest": RandomForestRegressor(random_state=RANDOM_SEED),
                    "Gradient Boosting": GradientBoostingRegressor(random_state=RANDOM_SEED),
                    "ANN": MLPRegressor(
                        activation="relu", max_iter=500, random_state=RANDOM_SEED),
                }
                _gs_reg = _GridSearchCV(
                    _base_map_reg[best_name], _param_grids_reg[best_name],
                    cv=3, scoring="neg_root_mean_squared_error",
                    n_jobs=-1, refit=True,
                )
                _gs_reg.fit(X_train_n, y_train_n)
                _tuned_reg = _gs_reg.best_estimator_
                _tuned_reg_metrics = _compute_metrics(
                    _tuned_reg, X_test_n, y_test_n, f"{best_name} (Tuned)", y_norm_meta)
                grid_search_results = {
                    "best_params": _gs_reg.best_params_,
                    "best_cv_rmse": round(float(-_gs_reg.best_score_), 2),
                    "tuned_test_rmse": _tuned_reg_metrics.get("rmse", 999),
                    "tuned_test_r2": _tuned_reg_metrics.get("r2", 0),
                }
                # Promote the tuned model only if it beats the incumbent
                # on held-out RMSE.
                if _tuned_reg_metrics["rmse"] < best["rmse"]:
                    best_model = _tuned_reg
                    best = _tuned_reg_metrics
                    logger.info(
                        "GridSearchCV improved regressor: RMSE=$%.2f R²=%.4f params=%s",
                        _tuned_reg_metrics["rmse"], _tuned_reg_metrics["r2"],
                        _gs_reg.best_params_,
                    )
                else:
                    logger.info(
                        "GridSearchCV: existing model already optimal "
                        "(cv_rmse=%.2f)", -_gs_reg.best_score_)
            except Exception as _ge_reg:
                # Tuning is best-effort; the untuned best model still ships.
                logger.warning("GridSearchCV (regression) failed (non-fatal): %s", _ge_reg)

    # ---- Walk-forward (time-ordered) cross-validation of the winner ----
    wf_cv_results = {}
    try:
        from sklearn.base import clone as _clone_reg
        X_wfcv_reg = pd.DataFrame(X, columns=avail_feats)
        y_wfcv_reg = pd.Series(y, name=REGRESSOR_TARGET)

        def _best_reg_factory():
            # Fresh unfitted copy of the winner per CV fold; numpy linear
            # model is the fallback when cloning is unavailable.
            if SKLEARN_OK:
                try:
                    return _clone_reg(best_model)
                except Exception:
                    pass
            return _NumpyLinearRegression()

        wf_cv_results = walk_forward_cv(
            _best_reg_factory, X_wfcv_reg, y_wfcv_reg, n_splits=5, task="regression")
        logger.info(
            "Walk-forward CV — mean_rmse=%.4f ± %.4f | mean_r2=%.4f ± %.4f",
            wf_cv_results.get("mean_rmse", 0) or 0,
            wf_cv_results.get("std_rmse", 0) or 0,
            wf_cv_results.get("mean_r2", 0) or 0,
            wf_cv_results.get("std_r2", 0) or 0,
        )
    except Exception as _wfe_reg:
        logger.warning("Walk-forward CV (regression) failed (non-fatal): %s", _wfe_reg)

    # ---- Explainability artefacts ----
    feat_df = pd.DataFrame(X_test_n, columns=avail_feats)
    sensitivity = sensitivity_analysis(best_model, feat_df)
    sensitivity.to_csv(METRICS_DIR / "sensitivity_regressor.csv", index=False)

    fi_data = {}
    if hasattr(best_model, "feature_importances_"):
        fi_data = dict(zip(avail_feats, best_model.feature_importances_.tolist()))

    # ---- Persist metadata, model and scaler ----
    model_meta = {
        "best_model": best_name,
        "scaler_mean": X_mean.tolist(),
        "scaler_std": X_std.tolist(),
        "y_mean": y_mu, "y_std": y_sigma,
        "features": avail_feats,
        "target": REGRESSOR_TARGET,
        "all_models": [{k: v for k, v in r.items() if k not in ["pred", "true"]}
                       for r in results],
        "feature_importance": fi_data,
        # NOTE(review): datetime.utcnow() is deprecated (3.12+); kept so
        # the serialized timestamp format stays unchanged for consumers.
        "trained_at": datetime.utcnow().isoformat(),
        "n_train": int(n_train), "n_test": int(len(X_test)),
        "grid_search": grid_search_results,
        "walk_forward_cv": wf_cv_results,
    }
    save_metrics(model_meta, "regressor")
    save_model(best_model, "best_regressor")
    save_model({"mean": X_mean, "std": X_std,
                "y_mean": y_mu, "y_std": y_sigma, "features": avail_feats},
               "regressor_scaler")

    # ---- Figures ----
    _plot_model_comparison(results)
    _plot_pred_vs_actual(best, best_name)
    _plot_feature_importance(sensitivity, best_name)
    _run_shap_regressor(best_model, best_name, X_train_n, X_test_n, avail_feats)

    return model_meta
|
|
|
|
def _plot_model_comparison(results):
    """Bar-chart RMSE, MAE and R² side by side for every trained model."""
    names = [entry["model"] for entry in results]
    palette = ["#1f77b4", "#ff7f0e", "#2ca02c", "#d62728"]
    fig, axes = plt.subplots(1, 3, figsize=(12, 5))
    for ax, metric in zip(axes, ["rmse", "mae", "r2"]):
        values = [entry[metric] for entry in results]
        positions = range(len(names))
        ax.bar(positions, values, color=palette[:len(names)])
        ax.set_xticks(positions)
        # Break multi-word names onto two lines so labels stay readable.
        ax.set_xticklabels([n.replace(" ", "\n") for n in names], fontsize=8)
        ax.set_title(metric.upper())
        ax.grid(True, alpha=0.3, axis="y")
    plt.suptitle("Regression Model Comparison", fontsize=13)
    plt.tight_layout()
    fig.savefig(FIGURES_DIR / "model_comparison_regressor.png", dpi=150)
    plt.close(fig)
|
|
|
|
def _plot_pred_vs_actual(metrics, model_name):
    """Scatter predicted vs actual prices with an identity reference line."""
    if "pred" not in metrics or "true" not in metrics:
        return
    pred = metrics["pred"]
    true = metrics["true"]
    fig, ax = plt.subplots(figsize=(7, 6))
    ax.scatter(true, pred, alpha=0.4, s=20, color="#1f77b4")
    # Span the diagonal across the combined value range of both series.
    combined = true + pred
    lims = [min(combined), max(combined)]
    ax.plot(lims, lims, "r--", label="Perfect prediction")
    ax.set_xlabel("Actual Price (USD)")
    ax.set_ylabel("Predicted Price (USD)")
    ax.set_title(f"Predicted vs Actual — {model_name}")
    ax.legend()
    ax.grid(True, alpha=0.3)
    plt.tight_layout()
    fig.savefig(FIGURES_DIR / "pred_vs_actual_regressor.png", dpi=150)
    plt.close(fig)
|
|
|
|
def _plot_feature_importance(sensitivity_df, model_name):
    """Horizontal bar chart of the ten most sensitive features."""
    if sensitivity_df.empty:
        return
    top = sensitivity_df.head(10)
    fig, ax = plt.subplots(figsize=(8, 6))
    # Reverse so the highest-sensitivity feature renders at the top.
    ax.barh(top["feature"][::-1], top["sensitivity"][::-1], color="#ff7f0e")
    ax.set_xlabel("Sensitivity")
    ax.set_title(f"Feature Sensitivity — {model_name}")
    ax.grid(True, alpha=0.3, axis="x")
    plt.tight_layout()
    fig.savefig(FIGURES_DIR / "feature_sensitivity_regressor.png", dpi=150)
    plt.close(fig)
|
|
|
|
def _run_shap_regressor(model, model_name: str, X_train_n, X_test_n, feature_names):
    """Compute SHAP values for the best regressor and save outputs.

    Best-effort explainability step: any failure (shap not installed,
    unsupported model, etc.) is logged and swallowed so training still
    completes.

    Args:
        model: Fitted best regressor (sklearn or numpy fallback).
        model_name: Display name used in the plot title.
        X_train_n: Normalised training features (background sample source).
        X_test_n: Normalised test features to explain.
        feature_names: Column names aligned with the feature matrices.
    """
    try:
        import shap
        import matplotlib
        matplotlib.use("Agg")  # headless backend; harmless to re-set
        import matplotlib.pyplot as plt

        X_train_df = pd.DataFrame(X_train_n, columns=feature_names)
        X_test_df = pd.DataFrame(X_test_n, columns=feature_names)

        # Pick the cheapest applicable explainer: tree models get
        # TreeExplainer, linear models LinearExplainer, everything else
        # the model-agnostic (and slowest) KernelExplainer.
        if hasattr(model, "feature_importances_"):
            explainer = shap.TreeExplainer(model)
            shap_vals = explainer.shap_values(X_test_df)
        elif hasattr(model, "coef_"):
            background = shap.sample(X_train_df, min(50, len(X_train_df)), random_state=42)
            explainer = shap.LinearExplainer(model, background)
            shap_vals = explainer.shap_values(X_test_df)
        else:
            background = shap.sample(X_train_df, min(50, len(X_train_df)), random_state=42)
            explainer = shap.KernelExplainer(model.predict, background)
            shap_vals = explainer.shap_values(X_test_df, nsamples=100)

        # Global importance: mean absolute SHAP value per feature.
        mean_shap = np.abs(shap_vals).mean(axis=0)
        shap_df = pd.DataFrame({
            "feature": feature_names,
            "mean_abs_shap": mean_shap.tolist()
        }).sort_values("mean_abs_shap", ascending=False)
        shap_df.to_csv(METRICS_DIR / "shap_regressor.csv", index=False)

        # Colour bars by impact magnitude (thresholds are heuristic).
        fig, ax = plt.subplots(figsize=(8, 5))
        colors = ["#D97706" if v > 0.1 else "#1A2B5E" if v > 0.01 else "#CBD5E1"
                  for v in shap_df["mean_abs_shap"]]
        ax.barh(shap_df["feature"][::-1], shap_df["mean_abs_shap"][::-1], color=colors[::-1])
        ax.set_xlabel("Mean |SHAP value| (USD impact on price prediction)")
        ax.set_title(f"SHAP Feature Importance — {model_name} (Price Regressor)")
        ax.grid(True, alpha=0.3, axis="x")
        plt.tight_layout()
        fig.savefig(FIGURES_DIR / "shap_regressor.png", dpi=150)
        plt.close(fig)
        logger.info("✓ SHAP regressor: saved shap_regressor.csv + shap_regressor.png")

    except Exception as e:
        logger.warning("SHAP regressor failed (non-fatal): %s", e)
|
|
|
|
if __name__ == "__main__":
    # CLI entry point: run the full training pipeline and print a summary.
    summary = train_regressor()
    print(f"\nBest model: {summary.get('best_model')}")
    for entry in summary.get("all_models", []):
        print(f" {entry['model']}: RMSE=${entry['rmse']:.2f} MAE=${entry['mae']:.2f} R²={entry['r2']:.4f}")
|
|