from fastapi import APIRouter import pandas as pd import numpy as np from sklearn.ensemble import GradientBoostingClassifier from sklearn.preprocessing import StandardScaler from sklearn.model_selection import train_test_split import joblib import os from app.core.config import settings router = APIRouter() @router.post("/admin/train") def train_models(): """ Training and saving HVR model directly on the server. Call once after deployment to generate compatible model files. """ try: data_path = os.path.join(settings.MODEL_DIR, "train_data.csv") if not os.path.exists(data_path): return { "status": "error", "message": f"Training data not found at {data_path}. Upload train_data.csv to the models/ folder first." } raw = pd.read_csv(data_path) raw.columns = raw.columns.str.strip() raw["InvoiceDate"] = pd.to_datetime(raw["InvoiceDate"], infer_datetime_format=True) raw = raw[raw["Quantity"] > 0] raw = raw[raw["UnitPrice"] > 0] raw["LineTotal"] = raw["Quantity"] * raw["UnitPrice"] raw = raw.dropna(subset=["CustomerID"]) raw["CustomerID"] = raw["CustomerID"].astype(int) split_date = raw["InvoiceDate"].min() + pd.DateOffset(months=8) train_raw = raw[raw["InvoiceDate"] < split_date] predict_raw = raw[raw["InvoiceDate"] >= split_date] def compute_features(df, reference_date): return df.groupby("CustomerID").agg( Recency = ("InvoiceDate", lambda x: (reference_date - x.max()).days), Frequency = ("InvoiceNo", "nunique"), Monetary = ("LineTotal", "sum"), AvgOrderValue = ("LineTotal", "mean"), TotalItems = ("Quantity", "sum"), DistinctProducts = ("StockCode", "nunique"), TenureDays = ("InvoiceDate", lambda x: (x.max() - x.min()).days), AvgItemsPerOrder = ("Quantity", "mean"), ).reset_index() train_features = compute_features(train_raw, split_date) predict_summary = predict_raw.groupby("CustomerID").agg( future_monetary = ("LineTotal", "sum"), future_frequency = ("InvoiceNo", "nunique"), ).reset_index() monetary_threshold = predict_summary["future_monetary"].quantile(0.60) predict_summary["is_high_value_future"] = ( (predict_summary["future_monetary"] >= monetary_threshold) & (predict_summary["future_frequency"] >= 2) ).astype(int) df_model = train_features.merge( predict_summary[["CustomerID", "is_high_value_future"]], on="CustomerID", how="inner" ) df_model["monetary_per_day"] = df_model["Monetary"] / (df_model["TenureDays"] + 1) df_model["orders_per_day"] = df_model["Frequency"] / (df_model["TenureDays"] + 1) df_model["avg_gap"] = df_model["TenureDays"] / (df_model["Frequency"].clip(lower=1)) df_model["spend_diversity"] = df_model["Monetary"] / (df_model["DistinctProducts"] + 1) df_model["basket_value"] = df_model["Monetary"] / (df_model["TotalItems"] + 1) feature_cols = [ "Recency", "Frequency", "Monetary", "AvgOrderValue", "TotalItems", "DistinctProducts", "TenureDays", "AvgItemsPerOrder", "monetary_per_day", "orders_per_day", "avg_gap", "spend_diversity", "basket_value" ] X = df_model[feature_cols].replace([np.inf, -np.inf], np.nan).fillna(0) for col in X.columns: X[col] = X[col].clip(upper=X[col].quantile(0.999)) y = df_model["is_high_value_future"] customer_first_date = train_raw.groupby("CustomerID")["InvoiceDate"].min() df_model["first_purchase"] = df_model["CustomerID"].map(customer_first_date) temporal_split = df_model["first_purchase"].quantile(0.80) train_mask = df_model["first_purchase"] <= temporal_split X_train = X[train_mask] X_test = X[~train_mask] y_train = y[train_mask] y_test = y[~train_mask] scaler = StandardScaler() X_train_scaled = scaler.fit_transform(X_train) X_test_scaled = scaler.transform(X_test) weights = np.where(y_train == 1, 2.0, 1.0) model = GradientBoostingClassifier( n_estimators = 200, learning_rate = 0.05, max_depth = 3, min_samples_leaf= 20, subsample = 0.8, random_state = 42 ) model.fit(X_train_scaled, y_train, sample_weight=weights) from sklearn.metrics import roc_auc_score y_prob = model.predict_proba(X_test_scaled)[:, 1] auc = roc_auc_score(y_test, y_prob) os.makedirs(settings.MODEL_DIR, exist_ok=True) joblib.dump(model, os.path.join(settings.MODEL_DIR, "hvr_model.pkl")) joblib.dump(scaler, os.path.join(settings.MODEL_DIR, "hvr_scaler.pkl")) joblib.dump(feature_cols,os.path.join(settings.MODEL_DIR, "hvr_features.pkl")) return { "status": "success", "auc": round(auc, 4), "train_size": int(train_mask.sum()), "test_size": int((~train_mask).sum()), "features": feature_cols, "model_dir": settings.MODEL_DIR, "message": "Models trained and saved successfully. Pipeline is ready." } except Exception as e: return {"status": "error", "message": str(e)}