# Demand prediction & surge-pricing model — training and inference pipeline.
| import numpy as np | |
| import pandas as pd | |
| from xgboost import XGBRegressor | |
| from sklearn.preprocessing import LabelEncoder | |
| from sklearn.metrics import r2_score | |
| import pickle | |
| from datetime import datetime | |
| from db_client import get_db_connection, COLLECTION | |
# ================= CONFIG =================
MODEL_PATH = "demand_prediction_model.pkl"  # path where the pickled model bundle is written/read
RANDOM_STATE = 42  # fixed seed for reproducible XGBoost training
# Module-level encoder: fitted in prepare_training_features and pickled with
# the model so inference reuses the exact same h3_cell -> int mapping.
le = LabelEncoder()
# =========================================
| # ------------------------------------------------------------------ | |
| # DATA LOADING | |
| # ------------------------------------------------------------------ | |
def export_data_from_mongo():
    """Load the entire Mongo collection into a DataFrame.

    Returns:
        pd.DataFrame: one row per document, with the Mongo ``_id`` field
        projected out so it never leaks into the feature pipeline.
    """
    print("🔹 Exporting data from MongoDB...")
    with get_db_connection() as db:
        documents = db[COLLECTION].find({}, {"_id": 0})
        frame = pd.DataFrame(list(documents))
    print(f"✅ Loaded {len(frame)} rows from MongoDB")
    return frame
| # ------------------------------------------------------------------ | |
| # TRAIN / VALIDATION SPLIT | |
| # ------------------------------------------------------------------ | |
def per_h3_time_split(data, val_hours=24):
    """Hold out the last *val_hours* time-ordered rows of each h3 cell.

    Generalizes the previously hard-coded 24-row (one day of hourly data)
    validation window into a parameter; the default preserves the original
    behavior exactly.

    Args:
        data: DataFrame containing at least ``h3_cell``, ``timestamp`` and
            ``demand`` columns.
        val_hours: number of trailing rows per cell reserved for validation
            (must be >= 1; default 24).

    Returns:
        Tuple ``(train, val, val_true)`` of DataFrames with reset indexes.
        ``val`` has its ``demand`` column masked with NaN to prevent target
        leakage; ``val_true`` keeps the true targets for scoring.

    Note:
        Cells with at most *val_hours* rows contribute nothing to ``train``
        (all of their rows land in the validation set).
    """
    train_parts, val_parts, truth_parts = [], [], []
    for _, group in data.groupby("h3_cell"):
        group = group.sort_values("timestamp")
        val_part = group.tail(val_hours).copy()
        train_part = group.iloc[:-val_hours].copy()
        # Keep the real targets aside BEFORE masking them in val_part.
        truth_parts.append(val_part[['h3_cell', 'timestamp', 'demand']].copy())
        val_part["demand"] = np.nan  # mask targets to prevent leakage
        train_parts.append(train_part)
        val_parts.append(val_part)
    return (
        pd.concat(train_parts).reset_index(drop=True),
        pd.concat(val_parts).reset_index(drop=True),
        pd.concat(truth_parts).reset_index(drop=True),
    )
| # ------------------------------------------------------------------ | |
| # TRAINING FEATURE PIPELINE (HISTORICAL DATA ONLY) | |
| # ------------------------------------------------------------------ | |
def prepare_training_features(data):
    """Derive calendar/trend features and encode h3 cells for training.

    Adds weekday/month/quarter, a linear day-number trend plus its square,
    and a label-encoded cell id, then drops NaN rows — acceptable only for
    historical training data, never for inference rows.

    Returns:
        Tuple ``(data, X, y, feature_columns)`` where ``X``/``y`` are the
        feature matrix and ``demand`` target drawn from the cleaned frame.
    """
    print("Preparing TRAINING features...")
    data["timestamp"] = pd.to_datetime(data["timestamp"])
    data = data.sort_values(["h3_cell", "timestamp"]).reset_index(drop=True)

    stamps = data["timestamp"]
    data["Weekday"] = stamps.dt.weekday
    data["Month"] = stamps.dt.month
    data["Quarter"] = stamps.dt.quarter
    data["day_number"] = (stamps - stamps.min()).dt.days
    data["trend_sq"] = data["day_number"] ** 2
    # Fit the shared module-level encoder here so the identical mapping is
    # pickled with the model and reused at inference time.
    data["h3_cell_enc"] = le.fit_transform(data["h3_cell"])

    # 🚨 TRAINING ONLY — safe to drop NaNs here
    data = data.dropna().reset_index(drop=True)

    feature_columns = [
        "hour_sin",
        "hour_cos",
        "is_weekend",
        "isHoliday",
        "neighbor_availability",
        "h3_cell_enc",
        "Weekday",
        "Month",
        "Quarter",
        "day_number",
        "trend_sq",
    ]
    return data, data[feature_columns], data["demand"], feature_columns
| # ------------------------------------------------------------------ | |
| # MODEL TRAINING | |
| # ------------------------------------------------------------------ | |
def train_model():
    """Run the full training pipeline: load, featurize, split, fit, persist.

    Fits an XGBoost regressor on all but the last day of each cell's data,
    reports the held-out R², and pickles a bundle (model, feature list,
    fitted encoder, timestamp) to MODEL_PATH.

    Returns:
        The fitted XGBRegressor.
    """
    frame = export_data_from_mongo()
    frame, _, _, feature_columns = prepare_training_features(frame)
    train_df, val_df, val_truth = per_h3_time_split(frame)

    regressor = XGBRegressor(
        n_estimators=600,
        learning_rate=0.05,
        max_depth=8,
        subsample=0.9,
        colsample_bytree=0.8,
        objective="reg:squarederror",
        random_state=RANDOM_STATE,
        n_jobs=-1,
    )
    regressor.fit(train_df[feature_columns], train_df["demand"])

    # Score on the per-cell hold-out; true targets come from val_truth
    # because val_df carries NaN demand by construction.
    preds = regressor.predict(val_df[feature_columns])
    print("✅ Validation R²:", r2_score(val_truth["demand"], preds))

    bundle = {
        "model": regressor,
        "features": feature_columns,
        "encoder": le,
        "trained_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    }
    with open(MODEL_PATH, "wb") as f:
        pickle.dump(bundle, f)
    print(f"💾 Model saved → {MODEL_PATH}")
    return regressor
| # ------------------------------------------------------------------ | |
| # INFERENCE FEATURE BUILDER | |
| # ------------------------------------------------------------------ | |
def build_inference_features(doc, encoder, feature_columns):
    """Turn one Mongo document into a single-row feature matrix.

    Calendar features come from the document timestamp; the trend features
    are pinned to 0 because the training-time day offset is unknown at
    inference. The cell id is mapped through the already-fitted *encoder*
    (raises for cells unseen during training).
    """
    row = pd.DataFrame([doc])
    stamps = pd.to_datetime(row["timestamp"])
    row["timestamp"] = stamps
    row["Weekday"] = stamps.dt.weekday
    row["Month"] = stamps.dt.month
    row["Quarter"] = stamps.dt.quarter
    # Neutral trend values for inference
    row["day_number"] = 0
    row["trend_sq"] = 0
    # Encode H3
    row["h3_cell_enc"] = encoder.transform(row["h3_cell"])
    # Safe default when the document lacks the availability field
    if "neighbor_availability" not in row.columns:
        row["neighbor_availability"] = 1.0
    return row[feature_columns]
| # ------------------------------------------------------------------ | |
| # DEMAND + PRICING PREDICTION | |
| # ------------------------------------------------------------------ | |
def predict_demand(h3_cell, timestamp):
    """Predict demand for a cell/hour and derive a surge pricing factor.

    Loads the pickled model bundle, looks up the stored record for
    ``(h3_cell, timestamp)`` and returns ``{"error": "Record not found"}``
    when it is absent. Otherwise returns a demand factor clamped to the
    [1.0, 2.0] band, combining predicted demand pressure and scarcity.
    """
    if isinstance(timestamp, str):
        timestamp = pd.to_datetime(timestamp)

    with open(MODEL_PATH, "rb") as f:
        bundle = pickle.load(f)

    with get_db_connection() as db:
        doc = db[COLLECTION].find_one({
            "h3_cell": h3_cell,
            "timestamp": timestamp
        })
    if not doc:
        return {"error": "Record not found"}

    features = build_inference_features(doc, bundle["encoder"], bundle["features"])
    predicted_demand = float(bundle["model"].predict(features)[0])

    # Pressure: predicted demand per unit of capacity (capacity floored at 1
    # to avoid division by zero). Scarcity: how far availability is below 1.
    capacity = max(doc.get("total_capacity", 1), 1)
    availability_ratio = doc.get("availability_ratio", 1)
    pressure = predicted_demand / capacity
    scarcity = max(0, 1 - availability_ratio)

    raw_factor = 1 + 0.6 * pressure + 0.4 * scarcity
    # Clamp the surge factor to the allowed [1.0, 2.0] band.
    return min(max(raw_factor, 1.0), 2.0)
| # ------------------------------------------------------------------ | |
# Script entry point: run the full training pipeline when executed directly.
if __name__ == "__main__":
    train_model()