"""Feature drift detection based on the two-sample Kolmogorov-Smirnov test."""

import json
from pathlib import Path
from typing import Any, Dict, Union

import numpy as np
import pandas as pd
from scipy import stats


class DriftDetector:
    """Compare numeric columns of new data against a reference DataFrame.

    Each numeric column of the reference data that is also present in the
    current data is tested with ``scipy.stats.ks_2samp``; a column is
    reported as drifted when the test's p-value is below ``threshold``.
    """

    def __init__(self, reference_data: pd.DataFrame, threshold: float = 0.05):
        """Store the reference data and the p-value cut-off.

        Args:
            reference_data: DataFrame whose numeric columns define the
                baseline distributions.
            threshold: p-values strictly below this value count as drift.
        """
        self.reference_data = reference_data
        self.threshold = threshold

    def detect_drift(self, current_data: pd.DataFrame) -> Dict[str, Any]:
        """Detect drift using the Kolmogorov-Smirnov test.

        Args:
            current_data: DataFrame to compare against the reference data.
                Reference columns missing from it are skipped.

        Returns:
            A dict with three keys:

            * ``"drift_detected"``: True if at least one column drifted.
            * ``"drifted_features"``: names of the drifted columns.
            * ``"drift_scores"``: per-column dict with ``"statistic"``,
              ``"p_value"`` and ``"drift"``.

            Every value is a built-in Python type, so the report can be
            passed to ``json.dump`` as-is (column names permitting).
        """
        drift_report: Dict[str, Any] = {
            "drift_detected": False,
            "drifted_features": [],
            "drift_scores": {},
        }

        numeric_columns = self.reference_data.select_dtypes(
            include=[np.number]
        ).columns

        for col in numeric_columns:
            if col not in current_data.columns:
                continue

            # KS test for numerical features; NaNs are dropped per column.
            # NOTE(review): an all-NaN column yields an empty sample here;
            # confirm how the installed SciPy version handles that case.
            statistic, p_value = stats.ks_2samp(
                self.reference_data[col].dropna(),
                current_data[col].dropna(),
            )

            # ks_2samp returns NumPy scalars, so the comparison would be a
            # numpy.bool_, which json.dump rejects. Coerce to a plain bool.
            has_drift = bool(p_value < self.threshold)

            drift_report["drift_scores"][col] = {
                "statistic": float(statistic),
                "p_value": float(p_value),
                "drift": has_drift,
            }

            if has_drift:
                drift_report["drift_detected"] = True
                drift_report["drifted_features"].append(col)

        return drift_report

    def save_report(self, report: Dict[str, Any], output_path: Union[str, Path]):
        """Write ``report`` to ``output_path`` as indented JSON.

        Missing parent directories are created first.

        Args:
            report: The report dict, e.g. as returned by ``detect_drift``.
            output_path: Destination file; a ``Path`` or a plain string.
        """
        # Accept plain strings too; Path(Path(...)) is a no-op for Path input.
        output_path = Path(output_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)
        # Explicit encoding so the output does not depend on the platform default.
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(report, f, indent=2)