# demand-prediction-api / train_model.py
# Author: gulabjam — "First commit" (e118065)
import numpy as np
import pandas as pd
from xgboost import XGBRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import r2_score
import pickle
from datetime import datetime
from db_client import get_db_connection, COLLECTION
# ================= CONFIG =================
MODEL_PATH = "demand_prediction_model.pkl"  # where the pickled model bundle is written/read
RANDOM_STATE = 42  # fixed seed for reproducible XGBoost training
# NOTE(review): module-level encoder shared by training (fit in
# prepare_training_features) and persistence (pickled in train_model) —
# mutable global state; confirm no concurrent retrain/predict usage.
le = LabelEncoder()
# =========================================
# ------------------------------------------------------------------
# DATA LOADING
# ------------------------------------------------------------------
def export_data_from_mongo():
    """Fetch every document of the demand collection into a DataFrame.

    The Mongo ``_id`` field is excluded via the projection, so the frame
    holds only feature/label columns.
    """
    print("🔹 Exporting data from MongoDB...")
    with get_db_connection() as db:
        documents = list(db[COLLECTION].find({}, {"_id": 0}))
        frame = pd.DataFrame(documents)
    print(f"✅ Loaded {len(frame)} rows from MongoDB")
    return frame
# ------------------------------------------------------------------
# TRAIN / VALIDATION SPLIT
# ------------------------------------------------------------------
def per_h3_time_split(data, val_size=24):
    """Split history into train/validation by holding out the most recent
    ``val_size`` rows of every H3 cell.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain "h3_cell", "timestamp" and "demand" columns.
    val_size : int, default 24
        Number of trailing rows per cell to hold out (must be >= 1).
        Cells with <= val_size rows contribute all rows to validation
        and none to training.

    Returns
    -------
    (train, val, val_true) : tuple of DataFrames
        ``val`` has its "demand" masked to NaN; ``val_true`` keeps the
        ground-truth demand for scoring and is row-aligned with ``val``.
    """
    train_parts, val_parts, truth_parts = [], [], []
    for _, group in data.groupby("h3_cell"):
        group = group.sort_values("timestamp")
        holdout = group.tail(val_size).copy()
        # Preserve the true labels before masking them in the validation set.
        truth_parts.append(holdout[["h3_cell", "timestamp", "demand"]].copy())
        holdout["demand"] = np.nan
        train_parts.append(group.iloc[:-val_size].copy())
        val_parts.append(holdout)
    if not train_parts:
        # Empty input: return empty frames instead of letting pd.concat([])
        # raise ValueError.
        empty = data.iloc[0:0].copy()
        return empty, empty.copy(), empty.copy()
    return (
        pd.concat(train_parts).reset_index(drop=True),
        pd.concat(val_parts).reset_index(drop=True),
        pd.concat(truth_parts).reset_index(drop=True),
    )
# ------------------------------------------------------------------
# TRAINING FEATURE PIPELINE (HISTORICAL DATA ONLY)
# ------------------------------------------------------------------
def prepare_training_features(data):
    """Derive calendar/trend features from raw history and fit the encoder.

    Returns ``(data, X, y, feature_columns)`` where ``data`` is the cleaned
    (NaN-dropped, sorted) frame, ``X`` the design matrix and ``y`` the
    "demand" target.
    """
    print("Preparing TRAINING features...")
    data["timestamp"] = pd.to_datetime(data["timestamp"])
    data = data.sort_values(["h3_cell", "timestamp"]).reset_index(drop=True)

    ts = data["timestamp"]
    data["Weekday"] = ts.dt.weekday
    data["Month"] = ts.dt.month
    data["Quarter"] = ts.dt.quarter
    # Linear + quadratic trend measured in days since the earliest record.
    data["day_number"] = (ts - ts.min()).dt.days
    data["trend_sq"] = data["day_number"] ** 2

    # NOTE(review): `le` is the module-level LabelEncoder; fitting it here
    # keeps the pickled encoder in sync with the cells seen in training.
    data["h3_cell_enc"] = le.fit_transform(data["h3_cell"])

    # 🚨 TRAINING ONLY — safe to drop rows with any missing value here.
    data = data.dropna().reset_index(drop=True)

    feature_columns = [
        "hour_sin", "hour_cos", "is_weekend", "isHoliday",
        "neighbor_availability", "h3_cell_enc",
        "Weekday", "Month", "Quarter", "day_number", "trend_sq",
    ]
    return data, data[feature_columns], data["demand"], feature_columns
# ------------------------------------------------------------------
# MODEL TRAINING
# ------------------------------------------------------------------
def train_model():
    """End-to-end training: load, feature-engineer, split, fit, score, save.

    Persists a bundle (model, feature list, fitted encoder, timestamp) to
    MODEL_PATH and returns the trained regressor.
    """
    raw = export_data_from_mongo()
    frame, _X, _y, feature_columns = prepare_training_features(raw)
    train_raw, val_raw, val_true = per_h3_time_split(frame)

    regressor = XGBRegressor(
        n_estimators=600,
        learning_rate=0.05,
        max_depth=8,
        subsample=0.9,
        colsample_bytree=0.8,
        objective="reg:squarederror",
        random_state=RANDOM_STATE,
        n_jobs=-1,
    )
    regressor.fit(train_raw[feature_columns], train_raw["demand"])

    # Score on the masked validation rows; val_true is row-aligned with
    # val_raw by construction of the split.
    preds = regressor.predict(val_raw[feature_columns])
    print("✅ Validation R²:", r2_score(val_true["demand"], preds))

    bundle = {
        "model": regressor,
        "features": feature_columns,
        "encoder": le,
        "trained_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    }
    with open(MODEL_PATH, "wb") as f:
        pickle.dump(bundle, f)
    print(f"💾 Model saved → {MODEL_PATH}")
    return regressor
# ------------------------------------------------------------------
# INFERENCE FEATURE BUILDER
# ------------------------------------------------------------------
def build_inference_features(doc, encoder, feature_columns):
    """Turn a single document into a one-row feature matrix for prediction.

    *doc* must already carry the stored model features (hour_sin, hour_cos,
    is_weekend, isHoliday, ...); only calendar, trend and encoding columns
    are derived here.
    """
    row = pd.DataFrame([doc])
    ts = pd.to_datetime(row["timestamp"])
    row["timestamp"] = ts
    row["Weekday"] = ts.dt.weekday
    row["Month"] = ts.dt.month
    row["Quarter"] = ts.dt.quarter

    # Trend features are pinned to zero at inference time (neutral trend).
    row["day_number"] = 0
    row["trend_sq"] = 0

    # Map the H3 cell through the encoder fitted during training.
    row["h3_cell_enc"] = encoder.transform(row["h3_cell"])

    # Default applied when the document lacks a neighbor_availability field.
    row["neighbor_availability"] = row.get("neighbor_availability", 1.0)
    return row[feature_columns]
# ------------------------------------------------------------------
# DEMAND + PRICING PREDICTION
# ------------------------------------------------------------------
def predict_demand(h3_cell, timestamp):
    """Predict demand for one (cell, timestamp) and derive a pricing factor.

    Returns a float clamped to [1.0, 2.0], or ``{"error": ...}`` when no
    matching record exists.  NOTE(review): the mixed return type
    (dict vs float) is preserved for caller compatibility.
    """
    if isinstance(timestamp, str):
        timestamp = pd.to_datetime(timestamp)

    # Load the persisted bundle produced by train_model().
    with open(MODEL_PATH, "rb") as f:
        bundle = pickle.load(f)

    with get_db_connection() as db:
        doc = db[COLLECTION].find_one({
            "h3_cell": h3_cell,
            "timestamp": timestamp,
        })
    if not doc:
        return {"error": "Record not found"}

    features = build_inference_features(doc, bundle["encoder"], bundle["features"])
    predicted_demand = float(bundle["model"].predict(features)[0])

    # Pricing heuristic: pressure = predicted demand per unit capacity,
    # scarcity = how little availability remains; clamp the combined
    # factor to [1.0, 2.0].
    capacity = max(doc.get("total_capacity", 1), 1)
    demand_pressure = predicted_demand / capacity
    scarcity = max(0, 1 - doc.get("availability_ratio", 1))
    factor = 1 + 0.6 * demand_pressure + 0.4 * scarcity
    return min(max(factor, 1.0), 2.0)
# ------------------------------------------------------------------
# Script entry point: retrain the model and overwrite MODEL_PATH.
if __name__ == "__main__":
    train_model()