File size: 1,452 Bytes
aac9e56
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
# src/drift_detection.py
import os
import json
import pandas as pd
from datetime import timedelta
from river.drift import ADWIN, PageHinkley

DATA_PATH = "data/processed/merged_features.csv"
OUT_DIR = "drift_reports"
os.makedirs(OUT_DIR, exist_ok=True)

FEATURES = ["return_lag1", "volume_lag1", "sentiment_lag1"]
CURRENT_DAYS = 30

def main():
    """Run ADWIN and Page-Hinkley drift detection over the most recent window.

    Reads merged features from DATA_PATH, restricts the frame to the last
    CURRENT_DAYS days, streams each feature in FEATURES through both
    detectors, and writes a JSON summary to OUT_DIR/drift_summary.json.

    Raises:
        RuntimeError: if no rows fall inside the recent window.
    """
    df = pd.read_csv(DATA_PATH)
    df["Date"] = pd.to_datetime(df["Date"])

    cutoff = df["Date"].max() - timedelta(days=CURRENT_DAYS)
    recent = df[df["Date"] >= cutoff]

    if recent.empty:
        raise RuntimeError("No recent data for drift detection")

    results = {}

    for feature in FEATURES:
        adwin = ADWIN()
        ph = PageHinkley()

        # BUG FIX: `drift_detected` reflects only the *most recent* update,
        # so the original reported just the final sample's state and lost any
        # drift flagged mid-stream. Accumulate "any drift seen" instead, and
        # keep the sample indices where a detector fired (the original built
        # a drift_points list but never used it).
        adwin_drift = False
        ph_drift = False
        drift_points = []  # positions (within the NaN-dropped stream) where drift fired

        for i, val in enumerate(recent[feature].dropna()):
            adwin.update(val)
            ph.update(val)

            if adwin.drift_detected:
                adwin_drift = True
            if ph.drift_detected:
                ph_drift = True
            if adwin.drift_detected or ph.drift_detected:
                drift_points.append(i)

        results[feature] = {
            "adwin_drift": adwin_drift,
            "page_hinkley_drift": ph_drift,
            "drift_flag": adwin_drift or ph_drift,
            # Additive key: where in the window drift was flagged.
            "drift_points": drift_points,
        }

    out_path = os.path.join(OUT_DIR, "drift_summary.json")
    # `fh` avoids shadowing: the original reused `f` for both the feature
    # loop variable and the file handle.
    with open(out_path, "w") as fh:
        json.dump(results, fh, indent=4)

    print("River drift detection completed")
    print(json.dumps(results, indent=2))

if __name__ == "__main__":
    main()