Spaces:
Sleeping
Sleeping
File size: 1,452 Bytes
aac9e56 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 |
# src/drift_detection.py
import os
import json
import pandas as pd
from datetime import timedelta
from river.drift import ADWIN, PageHinkley
# Input CSV produced by the feature-merge step; must contain a "Date"
# column plus the lagged feature columns listed in FEATURES.
DATA_PATH = "data/processed/merged_features.csv"
# Directory where the drift summary JSON is written.
OUT_DIR = "drift_reports"
os.makedirs(OUT_DIR, exist_ok=True)  # idempotent: safe if the dir already exists
# Feature columns streamed through the drift detectors.
FEATURES = ["return_lag1", "volume_lag1", "sentiment_lag1"]
# Size of the trailing window (in days) considered "recent" data.
CURRENT_DAYS = 30
def main():
    """Run ADWIN and Page-Hinkley drift detection on recent feature data.

    Loads the merged feature CSV, keeps only rows within the trailing
    CURRENT_DAYS window, streams each column in FEATURES through both
    detectors, and writes a per-feature summary to
    ``drift_reports/drift_summary.json``.

    Raises:
        RuntimeError: if no rows fall inside the recent window.
    """
    df = pd.read_csv(DATA_PATH)
    df["Date"] = pd.to_datetime(df["Date"])

    # Restrict to the trailing CURRENT_DAYS window relative to the newest row.
    cutoff = df["Date"].max() - timedelta(days=CURRENT_DAYS)
    recent = df[df["Date"] >= cutoff]
    if recent.empty:
        raise RuntimeError("No recent data for drift detection")

    results = {}
    for feature in FEATURES:
        adwin = ADWIN()
        ph = PageHinkley()
        adwin_hits = 0
        ph_hits = 0
        for val in recent[feature].dropna():
            adwin.update(val)
            ph.update(val)
            # drift_detected reflects only the most recent update, so we
            # must count hits while streaming; checking once after the
            # loop would miss any drift that occurred mid-window.
            if adwin.drift_detected:
                adwin_hits += 1
            if ph.drift_detected:
                ph_hits += 1
        results[feature] = {
            "adwin_drift": adwin_hits > 0,
            "page_hinkley_drift": ph_hits > 0,
            "drift_flag": adwin_hits > 0 or ph_hits > 0,
        }

    out_path = os.path.join(OUT_DIR, "drift_summary.json")
    with open(out_path, "w") as fh:
        json.dump(results, fh, indent=4)

    print("River drift detection completed")
    print(json.dumps(results, indent=2))


if __name__ == "__main__":
    main()
|