File size: 13,353 Bytes
404b7cf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
"""
APOO ML Module — Travel Time Prediction with Uncertainty
=========================================================
XGBoost quantile regression for travel time prediction.
SHAP explainability for feature importance.
"""

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import shap
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

from apoo_core import IndianTrafficGenerator


# ============================================================
# 1. FEATURE ENGINEERING
# ============================================================

# Names of the DataFrame columns used as model inputs. The order here is the
# column order of the feature matrix returned by prepare_features().
FEATURE_COLUMNS = [
    "link_length_m",
    "speed_limit_kmh",
    "num_lanes",
    "gradient_pct",
    "side_friction",
    "pct_two_wheeler",
    "pct_car",
    "pct_auto",
    "pct_bus",
    "pct_truck",
    "density_veh_km_lane",
    "weather_speed_factor",
    "time_of_day_sin",
    "time_of_day_cos",
    "is_peak",
    "is_weekend",
    "platoon_size",
    "platoon_pcu",
    "upstream_queue_pcu",
    "downstream_queue_pcu",
]

# Name of the DataFrame column holding the regression target.
TARGET_COLUMN = "actual_travel_time_s"


def prepare_features(df: pd.DataFrame):
    """Split a training frame into model inputs and the prediction target.

    Returns an ``(X, y)`` pair: ``X`` is an independent copy of the
    ``FEATURE_COLUMNS`` columns (in that order) and ``y`` is an independent
    copy of the ``TARGET_COLUMN`` column. A missing column raises ``KeyError``.
    """
    features = df.loc[:, FEATURE_COLUMNS].copy()
    target = df.loc[:, TARGET_COLUMN].copy()
    return features, target


# ============================================================
# 2. XGBOOST QUANTILE REGRESSION MODELS
# ============================================================

class APOOPredictor:
    """
    Uncertainty-aware travel time predictor using XGBoost quantile regression.

    Trains 3 models: P10 (lower bound), P50 (median), P90 (upper bound).
    This gives a nominal 80% prediction interval for each travel time estimate.

    The three models are fitted independently, so their raw outputs can
    occasionally cross (e.g. P10 above P50). ``predict`` widens the bounds to
    always enclose the median so the reported interval and uncertainty stay
    well-formed.
    """

    def __init__(self, n_estimators: int = 300, max_depth: int = 6,
                 learning_rate: float = 0.05):
        """Store the hyperparameters shared by all quantile models.

        Args:
            n_estimators: Passed to each ``xgb.XGBRegressor`` as ``n_estimators``.
            max_depth: Passed to each ``xgb.XGBRegressor`` as ``max_depth``.
            learning_rate: Passed to each ``xgb.XGBRegressor`` as ``learning_rate``.
        """
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.learning_rate = learning_rate
        # Fitted regressors keyed by quantile; empty until train() is called.
        self.models = {}
        self.quantiles = [0.1, 0.5, 0.9]
        # Copy so edits to this list can never leak into the module constant.
        self.feature_names = list(FEATURE_COLUMNS)
        # Validation metrics from the most recent train() call with a val set.
        self.train_metrics = {}
        # SHAP explanation of the median model (None if SHAP failed or untrained).
        self.shap_values = None
        self.explainer = None

    def train(self, X_train, y_train, X_val=None, y_val=None):
        """Fit one quantile model per entry in ``self.quantiles``.

        Args:
            X_train: Training features (DataFrame or array-like of rows).
            y_train: Training targets.
            X_val: Optional validation features. When given, validation
                metrics are computed, printed and stored in ``train_metrics``.
            y_val: Validation targets; used only when ``X_val`` is given.

        Returns:
            ``self``, to allow call chaining.
        """
        # Keep the importance table aligned with what the models really saw,
        # even when the caller trains on a different set of columns.
        columns = getattr(X_train, "columns", None)
        if columns is not None:
            self.feature_names = list(columns)

        # The evaluation sets are the same for every quantile; build them once.
        eval_set = [(X_train, y_train)]
        if X_val is not None:
            eval_set.append((X_val, y_val))

        for q in self.quantiles:
            print(f"  Training Q{q:.0%} model...")
            model = xgb.XGBRegressor(
                objective='reg:quantileerror',
                quantile_alpha=q,
                n_estimators=self.n_estimators,
                max_depth=self.max_depth,
                learning_rate=self.learning_rate,
                subsample=0.8,
                colsample_bytree=0.8,
                tree_method='hist',
                random_state=42,
            )
            model.fit(
                X_train, y_train,
                eval_set=eval_set,
                verbose=False,
            )
            self.models[q] = model

        # Compute metrics on validation set
        if X_val is not None:
            self._compute_metrics(X_val, y_val)

        # Compute SHAP values on at most the first 500 training rows for speed
        self._compute_shap(X_train[:500])

        return self

    def predict(self, X):
        """Predict the median travel time with uncertainty bounds.

        Args:
            X: Feature rows accepted by the fitted models.

        Returns:
            ``(p50, p10, p90, uncertainty)`` arrays, where ``uncertainty`` is
            half the width of the ``[p10, p90]`` interval. The bounds are
            guaranteed to satisfy ``p10 <= p50 <= p90``.

        Raises:
            KeyError: If the models have not been trained yet.
        """
        p10 = self.models[0.1].predict(X)
        p50 = self.models[0.5].predict(X)
        p90 = self.models[0.9].predict(X)
        # Independently fitted quantile models may cross; widen the bounds to
        # enclose the median so the interval is never inverted and the
        # uncertainty is never negative. Non-crossing rows are unchanged.
        p10 = np.minimum(p10, p50)
        p90 = np.maximum(p90, p50)
        uncertainty = (p90 - p10) / 2
        return p50, p10, p90, uncertainty

    def _compute_metrics(self, X_val, y_val):
        """Compute, store and print validation metrics.

        Point metrics (MAE, RMSE, R², MAPE) are computed on the median
        prediction; interval metrics use the same ``[p10, p90]`` bounds that
        ``predict`` returns, so they describe what callers actually receive.
        """
        p50, p10, p90, _ = self.predict(X_val)
        actual = np.asarray(y_val)

        mae = mean_absolute_error(actual, p50)
        rmse = np.sqrt(mean_squared_error(actual, p50))
        r2 = r2_score(actual, p50)

        # Coverage: % of actual values within [P10, P90]
        in_interval = ((actual >= p10) & (actual <= p90)).mean() * 100

        # Mean interval width
        mean_width = np.mean(p90 - p10)

        # Denominator is clipped at 1 so near-zero targets cannot blow up MAPE.
        mape = np.mean(np.abs(actual - p50) / np.clip(actual, 1, None)) * 100

        self.train_metrics = {
            "MAE (s)": round(mae, 2),
            "RMSE (s)": round(rmse, 2),
            "R² Score": round(r2, 4),
            "80% PI Coverage (%)": round(in_interval, 1),
            "Mean PI Width (s)": round(mean_width, 2),
            "MAPE (%)": round(mape, 2),
        }

        print("  Validation Metrics:")
        for k, v in self.train_metrics.items():
            print(f"    {k}: {v}")

    def _compute_shap(self, X_sample):
        """Compute SHAP values of the median model for explainability.

        Best-effort: on any failure a warning is printed and the SHAP state is
        cleared, which makes the SHAP plot methods use the fallback plot.
        """
        try:
            self.explainer = shap.TreeExplainer(self.models[0.5])
            self.shap_values = self.explainer(X_sample)
        except Exception as e:
            print(f"  SHAP computation warning: {e}")
            # Fallback: use basic feature importance
            self.explainer = None
            self.shap_values = None

    def get_feature_importance(self) -> pd.DataFrame:
        """Return the median model's feature importances, largest first.

        Returns:
            DataFrame with columns ``Feature`` (from ``self.feature_names``)
            and ``Importance`` (the model's ``feature_importances_``).
        """
        model = self.models[0.5]
        importance = model.feature_importances_
        return pd.DataFrame({
            "Feature": self.feature_names,
            "Importance": importance,
        }).sort_values("Importance", ascending=False)

    # ---- Plotting Methods ----

    def plot_predictions_vs_actual(self, X_val, y_val, title=""):
        """Plot predictions with uncertainty bands and the residual histogram.

        Args:
            X_val: Feature rows to predict on.
            y_val: Actual travel times (Series or array-like).
            title: Optional suffix appended to the left panel's title.

        Returns:
            The matplotlib figure, already closed in pyplot so it does not
            accumulate in pyplot's figure registry; it can still be saved.
        """
        p50, p10, p90, _ = self.predict(X_val)
        actual = np.asarray(y_val)

        fig, (ax, ax2) = plt.subplots(1, 2, figsize=(14, 6))

        # Left: samples sorted by actual value, with the uncertainty band
        order = np.argsort(actual)
        x_range = np.arange(len(actual))
        ax.scatter(x_range, actual[order], alpha=0.3, s=8, color='#2c3e50', label='Actual', zorder=3)
        ax.plot(x_range, p50[order], color='#e74c3c', linewidth=1.5, label='Predicted (P50)', zorder=4)
        ax.fill_between(x_range, p10[order], p90[order], alpha=0.2, color='#3498db',
                        label='80% PI [P10-P90]', zorder=2)
        suffix = f" — {title}" if title else ""
        ax.set_xlabel("Sample Index (sorted by actual)", fontsize=11)
        ax.set_ylabel("Travel Time (seconds)", fontsize=11)
        ax.set_title(f"Predictions with Uncertainty Bands{suffix}", fontsize=12)
        ax.legend(fontsize=10)
        ax.grid(alpha=0.3)

        # Right: Residual distribution
        residuals = actual - p50
        mean_residual = np.mean(residuals)
        ax2.hist(residuals, bins=50, alpha=0.7, color='#3498db', edgecolor='white')
        ax2.axvline(0, color='#e74c3c', linestyle='--', linewidth=2, label='Zero Error')
        ax2.axvline(mean_residual, color='#f39c12', linestyle='--', linewidth=2,
                    label=f'Mean: {mean_residual:.1f}s')
        ax2.set_xlabel("Residual (Actual - Predicted) [seconds]", fontsize=11)
        ax2.set_ylabel("Count", fontsize=11)
        ax2.set_title("Residual Distribution", fontsize=12)
        ax2.legend(fontsize=10)
        ax2.grid(alpha=0.3)

        plt.tight_layout()
        plt.close(fig)
        return fig

    def _render_shap_plot(self, draw, title, figsize, title_fontsize):
        """Run a SHAP drawing callback on a fresh figure and return that figure.

        The figure is always closed in pyplot, including when ``draw`` raises,
        so a failed plot cannot leak an open figure.
        """
        fig, _ = plt.subplots(figsize=figsize)
        try:
            draw()
            plt.title(title, fontsize=title_fontsize, fontweight='bold')
            plt.tight_layout()
            # SHAP draws through pyplot's current figure, so re-fetch it
            # rather than assuming it is still the one created above.
            fig = plt.gcf()
        finally:
            plt.close(fig)
        return fig

    def plot_shap_beeswarm(self, max_display=15):
        """SHAP beeswarm plot showing feature impact distribution.

        Falls back to the native importance bar chart when no SHAP values
        are available.
        """
        if self.shap_values is None:
            return self._fallback_importance_plot()

        return self._render_shap_plot(
            lambda: shap.plots.beeswarm(self.shap_values, max_display=max_display, show=False),
            "SHAP Feature Impact on Travel Time Prediction",
            figsize=(11, 7),
            title_fontsize=13,
        )

    def plot_shap_bar(self, max_display=15):
        """SHAP global feature importance bar plot.

        Falls back to the native importance bar chart when no SHAP values
        are available.
        """
        if self.shap_values is None:
            return self._fallback_importance_plot()

        return self._render_shap_plot(
            lambda: shap.plots.bar(self.shap_values, max_display=max_display, show=False),
            "Global Feature Importance (Mean |SHAP|)",
            figsize=(10, 6),
            title_fontsize=13,
        )

    def plot_shap_waterfall(self, X_sample, idx=0):
        """SHAP waterfall plot for a single prediction.

        Args:
            X_sample: Feature rows; row ``idx`` (positional) is explained.
            idx: Position of the row to explain.

        Returns:
            The waterfall figure, or the fallback importance plot when SHAP is
            unavailable or explaining/drawing the row fails.
        """
        if self.shap_values is None or self.explainer is None:
            return self._fallback_importance_plot()

        try:
            sv = self.explainer(X_sample[idx:idx + 1])
            return self._render_shap_plot(
                lambda: shap.plots.waterfall(sv[0], show=False),
                f"SHAP Waterfall — Prediction Breakdown (Sample {idx})",
                figsize=(10, 6),
                title_fontsize=12,
            )
        except Exception as e:
            # Best-effort plot: report why SHAP failed instead of hiding it,
            # and let KeyboardInterrupt/SystemExit propagate.
            print(f"  SHAP waterfall warning: {e}")
            return self._fallback_importance_plot()

    def _fallback_importance_plot(self):
        """Fallback: horizontal bar chart of the top 15 native importances."""
        # Reverse so the most important feature ends up at the top of the chart.
        top = self.get_feature_importance().head(15).iloc[::-1]

        fig, ax = plt.subplots(figsize=(10, 6))
        ax.barh(top["Feature"], top["Importance"],
                color='#3498db', edgecolor='white')
        ax.set_xlabel("Feature Importance (Gain)", fontsize=11)
        ax.set_title("XGBoost Feature Importance (Fallback)", fontsize=13, fontweight='bold')
        ax.grid(alpha=0.3, axis='x')
        plt.tight_layout()
        plt.close(fig)
        return fig

    def plot_quantile_calibration(self, X_val, y_val):
        """Check if quantile predictions are well-calibrated.

        For each trained quantile, plots the observed fraction of actual
        values at or below that model's prediction against the nominal
        quantile. Raw per-model predictions are used on purpose so each model
        is diagnosed individually.
        """
        fig, ax = plt.subplots(figsize=(8, 6))

        actual = np.asarray(y_val)
        test_quantiles = list(self.quantiles)
        observed_below = [
            (actual <= self.models[q].predict(X_val)).mean()
            for q in test_quantiles
        ]

        ax.plot([0, 1], [0, 1], 'k--', alpha=0.5, label='Perfect Calibration')
        ax.scatter(test_quantiles, observed_below, s=120, c='#e74c3c',
                   zorder=5, edgecolors='white', linewidth=2)
        ax.plot(test_quantiles, observed_below, 'r-', alpha=0.7, linewidth=2, label='Model')

        for q, obs in zip(test_quantiles, observed_below):
            ax.annotate(f'Q{q:.0%}: {obs:.1%}', (q, obs),
                        textcoords="offset points", xytext=(10, 10), fontsize=10)

        ax.set_xlabel("Predicted Quantile", fontsize=12)
        ax.set_ylabel("Observed Fraction Below", fontsize=12)
        ax.set_title("Quantile Calibration Plot", fontsize=13, fontweight='bold')
        ax.legend(fontsize=11)
        ax.set_xlim(-0.05, 1.05)
        ax.set_ylim(-0.05, 1.05)
        ax.grid(alpha=0.3)
        plt.tight_layout()
        plt.close(fig)
        return fig


# ============================================================
# 3. TRAINING PIPELINE
# ============================================================

def train_apoo_model(n_samples: int = 5000, city_type: str = "metro"):
    """Run the end-to-end APOO training pipeline on synthetic data.

    Generates data with ``IndianTrafficGenerator``, holds out 20% of it for
    validation, trains an ``APOOPredictor`` and prints progress along the way.
    Nothing is written to disk here.

    Args:
        n_samples: Forwarded to ``generate_training_data``.
        city_type: Forwarded to ``generate_training_data``.

    Returns:
        ``(predictor, X_train, X_val, y_train, y_val, df)``.
    """
    banner = "=" * 60
    print(banner)
    print("APOO ML Training Pipeline")
    print(banner)

    # Stage 1: synthesize the dataset with a fixed generator seed.
    print("\n[1/4] Generating synthetic Indian traffic data...")
    generator = IndianTrafficGenerator(seed=42)
    df = generator.generate_training_data(n_samples=n_samples, city_type=city_type)
    print(f"  Generated {len(df)} samples with {len(FEATURE_COLUMNS)} features")
    target = df[TARGET_COLUMN]
    print(
        f"  Target stats: mean={target.mean():.1f}s, "
        f"std={target.std():.1f}s, "
        f"range=[{target.min():.1f}, {target.max():.1f}]s"
    )

    # Stage 2: build the feature matrix and make a seeded 80/20 split.
    print("\n[2/4] Preparing features & splitting data...")
    X, y = prepare_features(df)
    split = train_test_split(X, y, test_size=0.2, random_state=42)
    X_train, X_val, y_train, y_val = split
    print(f"  Train: {len(X_train)}, Validation: {len(X_val)}")

    # Stage 3: fit the quantile models; validation metrics are computed
    # inside train() because a validation set is supplied.
    print("\n[3/4] Training XGBoost quantile models...")
    predictor = APOOPredictor(n_estimators=300, max_depth=6, learning_rate=0.05)
    predictor.train(X_train, y_train, X_val, y_val)

    # Stage 4: report the outcome (no artifacts are saved by this function).
    print("\n[4/4] Training complete!")
    print(f"  Model metrics: {predictor.train_metrics}")

    return predictor, X_train, X_val, y_train, y_val, df


if __name__ == "__main__":
    # Script entry point: train on 5000 synthetic samples, then write two
    # diagnostic plots as PNG files under /app.
    predictor, X_train, X_val, y_train, y_val, df = train_apoo_model(n_samples=5000)

    # Shared savefig settings for both output images.
    save_options = {"dpi": 150, "bbox_inches": "tight"}

    # Each figure is rendered and saved before the next one is created.
    fig1 = predictor.plot_predictions_vs_actual(X_val, y_val)
    fig1.savefig("/app/pred_vs_actual.png", **save_options)

    fig2 = predictor.plot_shap_beeswarm()
    fig2.savefig("/app/shap_beeswarm.png", **save_options)

    print("Plots saved.")