File size: 1,633 Bytes
b53ee19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import pandas as pd
import numpy as np
from scipy import stats
from typing import Dict, Any
import json
from pathlib import Path


class DriftDetector:
    """Detect distribution drift in numeric features against a reference dataset.

    Each numeric column of the reference data is compared with the column of
    the same name in a newer dataset using the two-sample Kolmogorov-Smirnov
    test. A feature is flagged as drifted when the test's p-value falls below
    ``threshold``.
    """

    def __init__(self, reference_data: pd.DataFrame, threshold: float = 0.05):
        """Store the baseline data and the p-value cut-off.

        Args:
            reference_data: Baseline frame; its numeric columns define which
                features are checked.
            threshold: p-value below which a feature is reported as drifted.
        """
        self.reference_data = reference_data
        self.threshold = threshold

    def detect_drift(self, current_data: pd.DataFrame) -> Dict[str, Any]:
        """Detect drift using the two-sample Kolmogorov-Smirnov test.

        Only numeric reference columns that also exist in ``current_data`` are
        tested. Missing values are dropped before testing, and a feature that
        has no remaining observations in either frame is skipped because the
        test cannot be computed for it.

        Args:
            current_data: Frame to compare against the reference data.

        Returns:
            A JSON-serializable dict with the keys ``"drift_detected"``
            (bool), ``"drifted_features"`` (list of column names) and
            ``"drift_scores"`` (per-column ``statistic``, ``p_value`` and
            ``drift`` flag).
        """
        drift_report: Dict[str, Any] = {
            "drift_detected": False,
            "drifted_features": [],
            "drift_scores": {},
        }

        numeric_columns = self.reference_data.select_dtypes(
            include=[np.number]
        ).columns

        for col in numeric_columns:
            if col not in current_data.columns:
                continue

            reference_sample = self.reference_data[col].dropna()
            current_sample = current_data[col].dropna()

            # An empty sample gives no usable KS result (depending on the
            # SciPy version it raises or yields NaN), so skip the feature.
            if reference_sample.empty or current_sample.empty:
                continue

            # KS test for numerical features
            statistic, p_value = stats.ks_2samp(reference_sample, current_sample)

            # Comparing a NumPy scalar yields numpy.bool_, which json.dump
            # cannot serialize; store a built-in bool instead.
            drifted = bool(p_value < self.threshold)

            drift_report["drift_scores"][col] = {
                "statistic": float(statistic),
                "p_value": float(p_value),
                "drift": drifted,
            }

            if drifted:
                drift_report["drift_detected"] = True
                drift_report["drifted_features"].append(col)

        return drift_report

    def save_report(self, report: Dict[str, Any], output_path: Path):
        """Write ``report`` as indented JSON to ``output_path``.

        Missing parent directories are created. ``output_path`` may be a
        ``Path`` or a plain string path.

        Args:
            report: Report dict, e.g. as returned by ``detect_drift``.
            output_path: Destination file; overwritten if it exists.
        """
        # Coerce so that callers passing a str path do not fail on `.parent`.
        output_path = Path(output_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(report, f, indent=2)