import pandas as pd import numpy as np import json import os from datetime import datetime # ================================================ # CONFIGURATION # ================================================ REFERENCE_DATA_PATH = "data/train_data.csv" LIVE_DATA_PATH = "data/live_data.csv" REPORT_PATH = "reports/drift_report.html" # ================================================ # STEP 1 - Load Reference Data # ================================================ def load_reference_data(): if os.path.exists(REFERENCE_DATA_PATH): print("Loading reference data from file...") df = pd.read_csv(REFERENCE_DATA_PATH) df['ds'] = pd.to_datetime(df['ds']) else: print("Generating sample reference data...") np.random.seed(42) dates = pd.date_range(start="2021-01-01", end="2022-12-31", freq="D") trend = np.linspace(100, 150, len(dates)) yearly = 30 * np.sin(2 * np.pi * np.arange(len(dates)) / 365) noise = np.random.normal(0, 8, len(dates)) df = pd.DataFrame({ "ds": dates, "y": (trend + yearly + noise).clip(min=10), "month": dates.month, "dayofweek": dates.dayofweek, "quarter": dates.quarter }) return df # ================================================ # STEP 2 - Load Live Data # ================================================ def load_live_data(): if os.path.exists(LIVE_DATA_PATH): print("Loading live data from file...") df = pd.read_csv(LIVE_DATA_PATH) df['ds'] = pd.to_datetime(df['ds']) else: print("Generating sample live data (simulating drift)...") np.random.seed(99) dates = pd.date_range(start="2024-01-01", end="2024-01-31", freq="D") trend = np.linspace(200, 250, len(dates)) noise = np.random.normal(0, 20, len(dates)) df = pd.DataFrame({ "ds": dates, "y": (trend + noise).clip(min=10), "month": dates.month, "dayofweek": dates.dayofweek, "quarter": dates.quarter }) return df # ================================================ # STEP 3 - Run Drift Detection # ================================================ def run_drift_detection(reference_df, live_df): print("Running drift detection...") feature_cols = ['y', 'month', 'dayofweek', 'quarter'] ref = reference_df[feature_cols].copy() curr = live_df[feature_cols].copy() drift_results = {} drifted_count = 0 for col in feature_cols: ref_mean = ref[col].mean() curr_mean = curr[col].mean() ref_std = ref[col].std() # Simple z-score drift detection if ref_std > 0: z_score = abs(curr_mean - ref_mean) / ref_std drifted = z_score > 2.0 else: drifted = False drift_results[col] = { "ref_mean": round(ref_mean, 4), "curr_mean": round(curr_mean, 4), "drifted": drifted } if drifted: drifted_count += 1 print(f" DRIFT in {col}: ref={ref_mean:.2f} curr={curr_mean:.2f}") else: print(f" OK {col}: ref={ref_mean:.2f} curr={curr_mean:.2f}") drift_share = drifted_count / len(feature_cols) drift_detected = drift_share > 0.5 return { "drift_detected": drift_detected, "drift_share": round(drift_share, 4), "feature_drift": drift_results } # ================================================ # STEP 4 - Get Forecast Metrics # ================================================ def get_forecast_metrics(reference_df, live_df): print("Calculating forecast metrics...") try: from prophet import Prophet model = Prophet( seasonality_mode="multiplicative", yearly_seasonality=True ) model.fit(reference_df[['ds', 'y']]) future = model.make_future_dataframe(periods=len(live_df), freq='D') forecast = model.predict(future) forecast_live = forecast.tail(len(live_df)) actual = live_df['y'].values predicted = forecast_live['yhat'].values rmse = np.sqrt(np.mean((actual - predicted) ** 2)) mae = np.mean(np.abs(actual - predicted)) mape = np.mean(np.abs((actual - predicted) / actual)) * 100 print(f" RMSE: {rmse:.4f}") print(f" MAE: {mae:.4f}") print(f" MAPE: {mape:.4f}%") return {"rmse": round(rmse, 4), "mae": round(mae, 4), "mape": round(mape, 4)} except Exception as e: print(f" Prophet metrics skipped: {e}") return {"rmse": None, "mae": None, "mape": None} # ================================================ # STEP 5 - Save HTML Report # ================================================ def save_report(drift_results, forecast_metrics): os.makedirs("reports", exist_ok=True) drift_color = "red" if drift_results['drift_detected'] else "green" drift_text = "DRIFT DETECTED" if drift_results['drift_detected'] else "NO DRIFT" rows = "" for col, result in drift_results['feature_drift'].items(): color = "red" if result['drifted'] else "green" rows += f"""
Generated: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}
{drift_text}
Drifted Features: {drift_results['drift_share']*100:.1f}%
RMSE: {forecast_metrics['rmse']}
MAE: {forecast_metrics['mae']}
MAPE: {forecast_metrics['mape']}%
| Feature | Reference Mean | Current Mean | Drifted |
|---|