# Repository metadata (scrape residue kept as a comment): bhanug2026 — "Initial commit", 47c6cfd
"""
src/models/train_regressor.py
================================
Trains and evaluates flight price regression models:
- Linear Regression
- Random Forest Regressor
- Gradient Boosting Regressor
- ANN Regressor (ReLU + linear output)
Selects best model by RMSE on held-out test set.
"""
import warnings
warnings.filterwarnings("ignore")
import json
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
from datetime import datetime
from src.utils.logger import get_logger
from src.models.model_selector import (
sensitivity_analysis, save_model, save_metrics, walk_forward_cv
)
from config.settings import (
PROCESSED_DIR, FIGURES_DIR, METRICS_DIR, RANDOM_SEED, TEST_SIZE,
REGRESSOR_FEATURES, REGRESSOR_TARGET
)
logger = get_logger(__name__)
try:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
SKLEARN_OK = True
except ImportError:
logger.warning("scikit-learn not installed — using numpy fallback regressors")
SKLEARN_OK = False
# ── Numpy Fallback Regressors ─────────────────────────────────────────────────
class _NumpyLinearRegression:
def __init__(self):
self.w = None; self.b = 0.0
def fit(self, X, y):
X_ = np.column_stack([np.ones(len(X)), X])
try:
coef = np.linalg.lstsq(X_, y, rcond=None)[0]
except Exception:
coef = np.zeros(X_.shape[1])
self.b = coef[0]; self.w = coef[1:]
return self
def predict(self, X):
return X @ self.w + self.b
def get_params(self, **kw): return {}
class _NumpyRFRegressor:
def __init__(self, n_estimators=30, max_depth=6, seed=42):
self.n_estimators = n_estimators
self.max_depth = max_depth
self.seed = seed
self.trees = []
def _build_tree(self, X, y, depth=0):
if depth >= self.max_depth or len(y) < 5:
return {"leaf": True, "value": np.mean(y)}
rng = np.random.default_rng(self.seed + depth * 7)
n_feat = max(1, int(np.sqrt(X.shape[1])))
feat_idx = rng.choice(X.shape[1], n_feat, replace=False)
best_fi, best_t, best_mse = 0, 0, float("inf")
for fi in feat_idx:
for t in np.percentile(X[:, fi], [25, 50, 75]):
l = y[X[:, fi] <= t]; r = y[X[:, fi] > t]
if len(l) == 0 or len(r) == 0: continue
mse = len(l) * l.var() + len(r) * r.var()
if mse < best_mse: best_mse, best_fi, best_t = mse, fi, t
lm = X[:, best_fi] <= best_t
return {"leaf": False, "feat": best_fi, "thresh": best_t,
"left": self._build_tree(X[lm], y[lm], depth+1),
"right": self._build_tree(X[~lm], y[~lm], depth+1)}
def _pred_tree(self, node, x):
if node["leaf"]: return node["value"]
if x[node["feat"]] <= node["thresh"]: return self._pred_tree(node["left"], x)
return self._pred_tree(node["right"], x)
def fit(self, X, y):
rng = np.random.default_rng(self.seed)
self.trees = []
for i in range(self.n_estimators):
idx = rng.choice(len(X), len(X), replace=True)
self.trees.append(self._build_tree(X[idx], y[idx]))
self.feature_importances_ = np.ones(X.shape[1]) / X.shape[1]
return self
def predict(self, X):
preds = np.array([[self._pred_tree(t, x) for t in self.trees]
for x in X], dtype=float)
result = np.nanmean(preds, axis=1)
result = np.where(np.isfinite(result), result, 0.0)
return result
def get_params(self, **kw): return {"n_estimators": self.n_estimators}
class _NumpyANNRegressor:
"""2-layer ANN regression with ReLU hidden, linear output."""
def __init__(self, hidden=(16, 8), lr=0.001, n_iter=200, seed=42):
self.hidden = hidden; self.lr = lr; self.n_iter = n_iter; self.seed = seed
self.weights = []; self.biases = []
def _relu(self, z): return np.maximum(0, z)
def _drelu(self, z): return (z > 0).astype(float)
def fit(self, X, y):
rng = np.random.default_rng(self.seed)
layers = [X.shape[1]] + list(self.hidden) + [1]
self.weights = [rng.normal(0, np.sqrt(2/layers[i]), (layers[i], layers[i+1]))
for i in range(len(layers)-1)]
self.biases = [np.zeros(layers[i+1]) for i in range(len(layers)-1)]
y_ = y.reshape(-1, 1).astype(float)
for _ in range(self.n_iter):
acts = [X]
zs = []
for i, (W, b) in enumerate(zip(self.weights, self.biases)):
z = acts[-1] @ W + b
zs.append(z)
acts.append(self._relu(z) if i < len(self.weights)-1 else z)
delta = (acts[-1] - y_) / len(y_)
for i in reversed(range(len(self.weights))):
dW = acts[i].T @ delta
db = delta.mean(axis=0)
if i > 0:
delta = delta @ self.weights[i].T * self._drelu(zs[i-1])
self.weights[i] -= self.lr * dW
self.biases[i] -= self.lr * db
return self
def predict(self, X):
a = X
for i, (W, b) in enumerate(zip(self.weights, self.biases)):
a = (self._relu(a @ W + b) if i < len(self.weights)-1
else a @ W + b)
return a.flatten()
def get_params(self, **kw): return {"hidden": self.hidden, "lr": self.lr}
# ── Metrics ───────────────────────────────────────────────────────────────────
def _compute_metrics(model, X_test, y_test, model_name, y_mean):
pred = model.predict(X_test)
# Denormalise
pred_dn = pred * y_mean["std"] + y_mean["mean"]
true_dn = y_test * y_mean["std"] + y_mean["mean"]
rmse = float(np.sqrt(np.mean((pred_dn - true_dn)**2)))
mae = float(np.mean(np.abs(pred_dn - true_dn)))
ss_res = np.sum((true_dn - pred_dn)**2)
ss_tot = np.sum((true_dn - true_dn.mean())**2)
r2 = float(1 - ss_res / (ss_tot + 1e-10))
return {"model": model_name,
"rmse": round(rmse, 2), "mae": round(mae, 2), "r2": round(r2, 4),
"pred": pred_dn.tolist()[:100], "true": true_dn.tolist()[:100]}
# ── Main ──────────────────────────────────────────────────────────────────────
def train_regressor() -> dict:
    """Train candidate price regressors, validate the winner, persist artefacts.

    Pipeline: load processed features → target-encode route on the training
    split → chronological train/test split → z-normalise X and y → train all
    candidates → pick lowest test RMSE → optional GridSearchCV tuning
    (sklearn only) → walk-forward CV → sensitivity analysis + plots + SHAP →
    save model, scaler and metrics.

    Returns:
        The metrics/metadata dict that is also saved via ``save_metrics``,
        or ``{}`` if every candidate failed to train.
    """
    logger.info("=" * 60)
    logger.info("Training flight price regression models")
    logger.info("=" * 60)
    df = pd.read_csv(PROCESSED_DIR / "features_regression.csv", low_memory=False)
    logger.info("Loaded %d rows", len(df))
    # ── Route encoding (v4 fix) ───────────────────────────────────────────────
    # Route is the dominant price driver (~$80-90 spread across routes) but was
    # not in REGRESSOR_FEATURES, causing the model to be near-constant by route.
    # We encode route as its mean target price (target encoding on training split
    # only, to avoid leakage). This gives the model a continuous, interpretable
    # route signal while keeping the architecture unchanged.
    if "origin" in df.columns and "destination" in df.columns:
        df["route"] = df["origin"] + "-" + df["destination"]
    elif "route" not in df.columns:
        df["route"] = "unknown"
    # Compute split before target encoding to prevent leakage
    # NOTE(review): head/tail split assumes rows are chronologically ordered —
    # confirm the upstream feature builder preserves time order.
    n_train_prelim = int(len(df) * (1 - TEST_SIZE))
    train_part = df.iloc[:n_train_prelim]
    route_means = train_part.groupby("route")[REGRESSOR_TARGET].mean().to_dict()
    global_mean = float(train_part[REGRESSOR_TARGET].mean())
    # Routes unseen in the training split fall back to the global training mean.
    df["route_mean_price"] = df["route"].map(route_means).fillna(global_mean)
    # Save route lookup for UI tool
    import json as _json
    # NOTE(review): split("-") assumes route keys contain exactly one hyphen
    # (i.e. airport codes themselves contain none) — confirm.
    _route_lookup_out = {
        r: {"mean": round(v, 2), "origin": r.split("-")[0], "destination": r.split("-")[1]}
        for r, v in route_means.items()
    }
    (METRICS_DIR / "route_price_lookup_v2.json").write_text(
        _json.dumps({"routes": _route_lookup_out, "global_mean": round(global_mean, 2)}, indent=2)
    )
    logger.info("Route encoding added. Routes: %s", list(route_means.keys()))
    # Use available features from settings + route_mean_price
    avail_feats = [f for f in REGRESSOR_FEATURES if f in df.columns]
    if "route_mean_price" not in avail_feats:
        avail_feats = ["route_mean_price"] + avail_feats
    logger.info("Final feature set (%d): %s", len(avail_feats), avail_feats)
    # ─────────────────────────────────────────────────────────────────────────
    X = df[avail_feats].fillna(0).values.astype(float)
    y = df[REGRESSOR_TARGET].fillna(df[REGRESSOR_TARGET].median()).values.astype(float)
    # Same split fraction as the preliminary encoding split above.
    n_train = int(len(X) * (1 - TEST_SIZE))
    X_train, X_test = X[:n_train], X[n_train:]
    y_train, y_test = y[:n_train], y[n_train:]
    # Normalise X (statistics from the training split only; 1e-8 avoids /0)
    X_mean = X_train.mean(axis=0); X_std = X_train.std(axis=0) + 1e-8
    X_train_n = (X_train - X_mean) / X_std
    X_test_n = (X_test - X_mean) / X_std
    # Normalise y
    y_mu = y_train.mean(); y_sigma = y_train.std() + 1e-8
    y_train_n = (y_train - y_mu) / y_sigma
    y_test_n = (y_test - y_mu) / y_sigma
    y_norm_meta = {"mean": float(y_mu), "std": float(y_sigma)}
    logger.info("Train: %d | Test: %d | Price range: $%.0f–$%.0f",
                n_train, len(X_test), y.min(), y.max())
    if SKLEARN_OK:
        candidates = {
            "Linear Regression": LinearRegression(),
            "Random Forest": RandomForestRegressor(
                n_estimators=100, max_depth=8, random_state=RANDOM_SEED),
            "Gradient Boosting": GradientBoostingRegressor(
                n_estimators=100, max_depth=4, learning_rate=0.1, random_state=RANDOM_SEED),
            "ANN": MLPRegressor(
                hidden_layer_sizes=(16, 8), activation="relu", max_iter=500,
                random_state=RANDOM_SEED, learning_rate_init=0.001),
        }
    else:
        # Numpy fallbacks; note there is no Gradient Boosting equivalent here.
        candidates = {
            "Linear Regression": _NumpyLinearRegression(),
            "Random Forest": _NumpyRFRegressor(n_estimators=30, max_depth=6, seed=RANDOM_SEED),
            "ANN": _NumpyANNRegressor(hidden=(16, 8), lr=0.005, n_iter=150, seed=RANDOM_SEED),
        }
    results = []
    trained_models = {}
    for name, model in candidates.items():
        logger.info("Training: %s ...", name)
        try:
            model.fit(X_train_n, y_train_n)
            metrics = _compute_metrics(model, X_test_n, y_test_n, name, y_norm_meta)
            results.append(metrics)
            trained_models[name] = model
            logger.info(" RMSE=$%.2f | MAE=$%.2f | R²=%.4f",
                        metrics["rmse"], metrics["mae"], metrics["r2"])
        except Exception as e:
            # One failed candidate must not abort the whole comparison.
            logger.error("Training failed for %s: %s", name, e)
    if not results:
        return {}
    best = min(results, key=lambda r: r["rmse"])
    best_name = best["model"]
    best_model = trained_models[best_name]
    logger.info("Best model: %s (RMSE=$%.2f, R²=%.4f)",
                best_name, best["rmse"], best["r2"])
    # GridSearchCV — hyperparameter tuning on the best model type
    grid_search_results = {}
    if SKLEARN_OK:
        _param_grids_reg = {
            "Linear Regression": {}, # no hyperparams to tune
            "Random Forest": {
                "n_estimators": [50, 100, 200], "max_depth": [5, 8, 12],
            },
            "Gradient Boosting": {
                "n_estimators": [50, 100], "max_depth": [3, 4],
                "learning_rate": [0.05, 0.1],
            },
            "ANN": {
                "hidden_layer_sizes": [(16, 8), (32, 16)],
                "learning_rate_init": [0.001, 0.005],
            },
        }
        if best_name in _param_grids_reg and _param_grids_reg[best_name]:
            try:
                from sklearn.model_selection import GridSearchCV as _GridSearchCV
                # Fresh unfitted estimators for the grid search.
                _base_map_reg = {
                    "Random Forest": RandomForestRegressor(random_state=RANDOM_SEED),
                    "Gradient Boosting": GradientBoostingRegressor(random_state=RANDOM_SEED),
                    "ANN": MLPRegressor(
                        activation="relu", max_iter=500, random_state=RANDOM_SEED),
                }
                _gs_reg = _GridSearchCV(
                    _base_map_reg[best_name], _param_grids_reg[best_name],
                    cv=3, scoring="neg_root_mean_squared_error",
                    n_jobs=-1, refit=True,
                )
                _gs_reg.fit(X_train_n, y_train_n)
                _tuned_reg = _gs_reg.best_estimator_
                _tuned_reg_metrics = _compute_metrics(
                    _tuned_reg, X_test_n, y_test_n, f"{best_name} (Tuned)", y_norm_meta)
                grid_search_results = {
                    "best_params": _gs_reg.best_params_,
                    # best_score_ is negative RMSE under this scoring string.
                    "best_cv_rmse": round(float(-_gs_reg.best_score_), 2),
                    "tuned_test_rmse": _tuned_reg_metrics.get("rmse", 999),
                    "tuned_test_r2": _tuned_reg_metrics.get("r2", 0),
                }
                # Adopt the tuned model only if it beats the untuned test RMSE.
                if _tuned_reg_metrics["rmse"] < best["rmse"]:
                    best_model = _tuned_reg
                    best = _tuned_reg_metrics
                    logger.info(
                        "GridSearchCV improved regressor: RMSE=$%.2f R²=%.4f params=%s",
                        _tuned_reg_metrics["rmse"], _tuned_reg_metrics["r2"],
                        _gs_reg.best_params_,
                    )
                else:
                    logger.info(
                        "GridSearchCV: existing model already optimal "
                        "(cv_rmse=%.2f)", -_gs_reg.best_score_)
            except Exception as _ge_reg:
                logger.warning("GridSearchCV (regression) failed (non-fatal): %s", _ge_reg)
    # Walk-forward cross-validation (time-series robust)
    wf_cv_results = {}
    try:
        from sklearn.base import clone as _clone_reg
        X_wfcv_reg = pd.DataFrame(X, columns=avail_feats)
        y_wfcv_reg = pd.Series(y, name=REGRESSOR_TARGET)
        def _best_reg_factory():
            # Returns a fresh, unfitted copy of the winning model per CV fold;
            # falls back to plain OLS when cloning is impossible.
            if SKLEARN_OK:
                try:
                    return _clone_reg(best_model)
                except Exception:
                    pass
            return _NumpyLinearRegression()
        wf_cv_results = walk_forward_cv(
            _best_reg_factory, X_wfcv_reg, y_wfcv_reg, n_splits=5, task="regression")
        logger.info(
            "Walk-forward CV — mean_rmse=%.4f ± %.4f | mean_r2=%.4f ± %.4f",
            wf_cv_results.get("mean_rmse", 0) or 0,
            wf_cv_results.get("std_rmse", 0) or 0,
            wf_cv_results.get("mean_r2", 0) or 0,
            wf_cv_results.get("std_r2", 0) or 0,
        )
    except Exception as _wfe_reg:
        logger.warning("Walk-forward CV (regression) failed (non-fatal): %s", _wfe_reg)
    # Sensitivity analysis
    feat_df = pd.DataFrame(X_test_n, columns=avail_feats)
    sensitivity = sensitivity_analysis(best_model, feat_df)
    sensitivity.to_csv(METRICS_DIR / "sensitivity_regressor.csv", index=False)
    fi_data = {}
    if hasattr(best_model, "feature_importances_"):
        fi_data = dict(zip(avail_feats, best_model.feature_importances_.tolist()))
    # NOTE(review): y_mu/y_sigma are numpy scalars and datetime.utcnow() is
    # deprecated (Py3.12) — confirm save_metrics serialises these as intended.
    model_meta = {
        "best_model": best_name,
        "scaler_mean": X_mean.tolist(),
        "scaler_std": X_std.tolist(),
        "y_mean": y_mu, "y_std": y_sigma,
        "features": avail_feats,
        "target": REGRESSOR_TARGET,
        "all_models": [{k: v for k, v in r.items() if k not in ["pred", "true"]}
                       for r in results],
        "feature_importance": fi_data,
        "trained_at": datetime.utcnow().isoformat(),
        "n_train": int(n_train), "n_test": int(len(X_test)),
        "grid_search": grid_search_results,
        "walk_forward_cv": wf_cv_results,
    }
    save_metrics(model_meta, "regressor")
    save_model(best_model, "best_regressor")
    save_model({"mean": X_mean, "std": X_std,
                "y_mean": y_mu, "y_std": y_sigma, "features": avail_feats},
               "regressor_scaler")
    _plot_model_comparison(results)
    _plot_pred_vs_actual(best, best_name)
    _plot_feature_importance(sensitivity, best_name)
    _run_shap_regressor(best_model, best_name, X_train_n, X_test_n, avail_feats)
    return model_meta
def _plot_model_comparison(results):
    """Save a 3-panel bar chart (RMSE, MAE, R²) comparing all regressors."""
    fig, axes = plt.subplots(1, 3, figsize=(12, 5))
    names = [entry["model"] for entry in results]
    palette = ["#1f77b4", "#ff7f0e", "#2ca02c", "#d62728"]
    for axis, key in zip(axes, ["rmse", "mae", "r2"]):
        heights = [entry[key] for entry in results]
        axis.bar(range(len(names)), heights, color=palette[:len(names)])
        axis.set_xticks(range(len(names)))
        # Break multi-word model names across lines so ticks stay readable.
        axis.set_xticklabels([n.replace(" ", "\n") for n in names], fontsize=8)
        axis.set_title(key.upper())
        axis.grid(True, alpha=0.3, axis="y")
    plt.suptitle("Regression Model Comparison", fontsize=13)
    plt.tight_layout()
    fig.savefig(FIGURES_DIR / "model_comparison_regressor.png", dpi=150)
    plt.close(fig)
def _plot_pred_vs_actual(metrics, model_name):
    """Scatter predicted vs actual prices with a y = x reference line."""
    if "pred" not in metrics or "true" not in metrics:
        return
    predicted = metrics["pred"]
    actual = metrics["true"]
    fig, ax = plt.subplots(figsize=(7, 6))
    ax.scatter(actual, predicted, alpha=0.4, s=20, color="#1f77b4")
    # Shared axis limits so the diagonal spans the full data range.
    span = [min(actual + predicted), max(actual + predicted)]
    ax.plot(span, span, "r--", label="Perfect prediction")
    ax.set_xlabel("Actual Price (USD)")
    ax.set_ylabel("Predicted Price (USD)")
    ax.set_title(f"Predicted vs Actual — {model_name}")
    ax.legend()
    ax.grid(True, alpha=0.3)
    plt.tight_layout()
    fig.savefig(FIGURES_DIR / "pred_vs_actual_regressor.png", dpi=150)
    plt.close(fig)
def _plot_feature_importance(sensitivity_df, model_name):
    """Horizontal bar chart of the top-10 most sensitive features."""
    if sensitivity_df.empty:
        return
    fig, ax = plt.subplots(figsize=(8, 6))
    top_rows = sensitivity_df.head(10)
    # Reverse so the most sensitive feature ends up at the top of the chart.
    ax.barh(top_rows["feature"][::-1], top_rows["sensitivity"][::-1], color="#ff7f0e")
    ax.set_xlabel("Sensitivity")
    ax.set_title(f"Feature Sensitivity — {model_name}")
    ax.grid(True, alpha=0.3, axis="x")
    plt.tight_layout()
    fig.savefig(FIGURES_DIR / "feature_sensitivity_regressor.png", dpi=150)
    plt.close(fig)
def _run_shap_regressor(model, model_name: str, X_train_n, X_test_n, feature_names):
    """Compute SHAP values for the best regressor and save outputs."""
    try:
        import shap
        import matplotlib
        matplotlib.use("Agg")
        import matplotlib.pyplot as plt
        train_df = pd.DataFrame(X_train_n, columns=feature_names)
        test_df = pd.DataFrame(X_test_n, columns=feature_names)
        # Pick the cheapest explainer the model supports: tree > linear > kernel.
        if hasattr(model, "feature_importances_"):
            shap_vals = shap.TreeExplainer(model).shap_values(test_df)
        elif hasattr(model, "coef_"):
            background = shap.sample(train_df, min(50, len(train_df)), random_state=42)
            shap_vals = shap.LinearExplainer(model, background).shap_values(test_df)
        else:
            background = shap.sample(train_df, min(50, len(train_df)), random_state=42)
            explainer = shap.KernelExplainer(model.predict, background)
            shap_vals = explainer.shap_values(test_df, nsamples=100)
        importance = pd.DataFrame({
            "feature": feature_names,
            "mean_abs_shap": np.abs(shap_vals).mean(axis=0).tolist()
        }).sort_values("mean_abs_shap", ascending=False)
        importance.to_csv(METRICS_DIR / "shap_regressor.csv", index=False)
        fig, ax = plt.subplots(figsize=(8, 5))
        # Colour tiers: strong (>0.1), moderate (>0.01), negligible.
        bar_colors = []
        for value in importance["mean_abs_shap"]:
            if value > 0.1:
                bar_colors.append("#D97706")
            elif value > 0.01:
                bar_colors.append("#1A2B5E")
            else:
                bar_colors.append("#CBD5E1")
        ax.barh(importance["feature"][::-1], importance["mean_abs_shap"][::-1],
                color=bar_colors[::-1])
        ax.set_xlabel("Mean |SHAP value| (USD impact on price prediction)")
        ax.set_title(f"SHAP Feature Importance — {model_name} (Price Regressor)")
        ax.grid(True, alpha=0.3, axis="x")
        plt.tight_layout()
        fig.savefig(FIGURES_DIR / "shap_regressor.png", dpi=150)
        plt.close(fig)
        logger.info("✓ SHAP regressor: saved shap_regressor.csv + shap_regressor.png")
    except Exception as e:
        logger.warning("SHAP regressor failed (non-fatal): %s", e)
# Script entry point: run the full training pipeline and print a short summary
# of every candidate model's test metrics.
if __name__ == "__main__":
    metrics = train_regressor()
    print(f"\nBest model: {metrics.get('best_model')}")
    for m in metrics.get("all_models", []):
        print(f" {m['model']}: RMSE=${m['rmse']:.2f} MAE=${m['mae']:.2f} R²={m['r2']:.4f}")