File size: 8,750 Bytes
1aa566a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
"""
Monitoring & drift endpoints.

GET  /monitor/metrics     → rolling performance metrics
GET  /monitor/drift       → run drift check on recent live data vs reference
POST /monitor/retrain     → manually trigger retraining evaluation
GET  /monitor/history     → drift + performance history
"""
from __future__ import annotations

import json
import time
from pathlib import Path
from typing import Any

import pandas as pd
from fastapi import APIRouter, HTTPException, Request

from src.api.schemas import (
    DriftCheckResponse,
    PerformanceMetricsResponse,
    RetrainingResponse,
)
from src.utils.config import settings, resolve
from src.utils.logging_config import get_logger

router = APIRouter(prefix="/monitor", tags=["Monitoring"])
log = get_logger(__name__)


@router.get("/metrics", response_model=PerformanceMetricsResponse)
async def get_metrics(request: Request) -> PerformanceMetricsResponse:
    """
    Report the rolling performance metrics held by ``app.state.monitor``.

    When the monitor has no metrics yet (``compute_metrics()`` returns
    ``None``), the error fields are ``None``, ``n_samples`` comes from
    ``matched_count()`` and the timestamp is the current UTC time.
    Otherwise every value is taken from the metrics mapping.
    """
    tracker = request.app.state.monitor
    rolling = tracker.compute_metrics()
    reference_rmse = tracker.get_baseline_rmse()

    if rolling is not None:
        payload = {
            "rmse": rolling["rmse"],
            "mae": rolling["mae"],
            "r2": rolling["r2"],
            "n_samples": rolling["n_samples"],
            "n_pending": tracker.pending_count(),
            "baseline_rmse": reference_rmse,
            "timestamp": rolling["timestamp"],
        }
    else:
        # Nothing to score yet: report counts only, stamped with "now" in UTC.
        payload = {
            "rmse": None,
            "mae": None,
            "r2": None,
            "n_samples": tracker.matched_count(),
            "n_pending": tracker.pending_count(),
            "baseline_rmse": reference_rmse,
            "timestamp": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
        }

    return PerformanceMetricsResponse(**payload)


# Strong references to in-flight background retraining tasks. The event loop
# keeps only weak references to tasks, so a task that nothing else references
# may be garbage-collected before it finishes.
_background_tasks: set = set()


@router.get("/drift", response_model=DriftCheckResponse)
async def check_drift(request: Request) -> DriftCheckResponse:
    """
    Run drift detection on the matched live data and decide on retraining.

    Steps, in order:
      1. Feature drift on the matched live dataframe, restricted to the
         configured feature columns that are present in it.
      2. Performance drift, when both rolling metrics and a baseline RMSE exist.
      3. Root-cause analysis, only when feature drift was detected.
      4. Retraining decision via ``app.state.trigger``; a positive decision
         schedules ``_run_retraining`` as a background task.

    Returns:
        A ``DriftCheckResponse`` whose ``action`` is one of
        ``"insufficient_data"``, ``"retraining_triggered"``,
        ``"drift_detected_monitoring"`` or ``"no_action"``.

    Raises:
        HTTPException: 503 when the drift detector has no reference dataset.
    """
    import asyncio

    app_state = request.app.state
    drift_detector = app_state.drift_detector
    rca = app_state.rca
    monitor = app_state.monitor

    if not drift_detector.has_reference():
        raise HTTPException(status_code=503, detail="Reference dataset not loaded yet.")

    # Live feature data comes from predictions that have been matched with labels.
    live_df = monitor.get_matched_dataframe()
    min_samples = settings.monitoring.drift.min_samples_for_drift_test

    if len(live_df) < min_samples:
        return DriftCheckResponse(
            drift_detected=False,
            root_cause=[],
            performance_drop=None,
            action="insufficient_data",
            feature_results={},
            drifted_features=[],
            rca_details=None,
            timestamp=time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
        )

    # Feature drift: only test configured features that actually exist live.
    feature_cols = [c for c in settings.data.features if c in live_df.columns]
    feat_report = drift_detector.detect_feature_drift(live_df, features=feature_cols)

    # Performance drift: skipped unless both current metrics and a baseline exist.
    metrics = monitor.compute_metrics()
    baseline = monitor.get_baseline_rmse()
    perf_report: dict = {"drift_detected": False}
    performance_drop = None

    if metrics and baseline:
        perf_report = drift_detector.detect_performance_drift(metrics["rmse"], baseline)
        if perf_report["drift_detected"]:
            pct = perf_report["pct_change"]
            performance_drop = f"{pct:.1f}%"

    # Root-cause analysis is only meaningful when some feature drifted.
    rca_result: dict = {}
    if feat_report["drift_detected"]:
        rca_result = rca.analyze(feat_report)

    # Retraining decision
    trigger = app_state.trigger
    decision = trigger.should_retrain(
        feature_drift_report=feat_report,
        performance_report=perf_report,
        samples_since_last_retrain=app_state.samples_since_last_retrain,
    )

    # NOTE(review): performance-only drift without a retrain decision reports
    # "no_action" while drift_detected is True — confirm this is intended.
    action = "no_action"
    if decision["should_retrain"]:
        action = "retraining_triggered"
        # Fire-and-forget retraining. Hold a strong reference until the task
        # finishes so it cannot be garbage-collected mid-run.
        task = asyncio.create_task(_run_retraining(app_state, rca_result))
        _background_tasks.add(task)
        task.add_done_callback(_background_tasks.discard)
    elif feat_report["drift_detected"]:
        action = "drift_detected_monitoring"

    return DriftCheckResponse(
        drift_detected=feat_report["drift_detected"] or perf_report.get("drift_detected", False),
        root_cause=feat_report.get("drifted_features", []),
        performance_drop=performance_drop,
        action=action,
        feature_results=feat_report.get("feature_results", {}),
        drifted_features=feat_report.get("drifted_features", []),
        rca_details=rca_result.get("root_causes"),
        timestamp=time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
    )


@router.post("/retrain", response_model=RetrainingResponse)
async def trigger_retrain(request: Request) -> RetrainingResponse:
    """
    Run the retraining pipeline on demand, without consulting the drift trigger.

    Requires at least 50 matched (labeled) samples; otherwise a response with
    ``triggered=False`` is returned and nothing is trained. When the pipeline
    reports a promotion, the champion is reloaded from the registry and, if one
    is returned, swapped into ``app.state`` together with a new version tag and
    (when available) a new baseline RMSE.
    """
    state = request.app.state
    monitor = state.monitor
    retrainer = state.retrain_pipeline

    labeled = monitor.get_matched_dataframe()
    n_labeled = len(labeled)
    if n_labeled < 50:
        return RetrainingResponse(
            triggered=False,
            promoted=None,
            improvement_pct=None,
            root_causes=[],
            action="insufficient_labeled_data",
            message=f"Only {n_labeled} labeled samples available. Need at least 50.",
            timestamp=time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
        )

    # NOTE(review): the evaluation frame is sampled from the training frame, and
    # pipeline.run is invoked synchronously inside this coroutine — confirm both
    # are intended.
    holdout = labeled.sample(frac=0.2, random_state=42)
    result = retrainer.run(
        training_df=labeled,
        eval_df=holdout,
        tags={"trigger": "manual"},
    )

    was_promoted = result["promoted"]
    if was_promoted:
        champion = state.registry.load_champion()
        if champion:
            state.model = champion
            state.model_version = f"v{int(time.time())}"
            challenger_rmse = result["challenger_metrics"].get("rmse")
            if challenger_rmse:
                monitor.set_baseline_rmse(challenger_rmse)

    if was_promoted:
        outcome = "champion_promoted"
    else:
        outcome = "challenger_not_promoted"

    cause_names = [cause["feature"] for cause in result.get("root_causes", [])]

    return RetrainingResponse(
        triggered=True,
        promoted=was_promoted,
        improvement_pct=result.get("improvement_pct"),
        root_causes=cause_names,
        action=outcome,
        message="Manual retraining completed.",
        timestamp=result["timestamp"],
    )


@router.get("/history")
async def get_history(
    limit: int = 100,
    log_type: str = "drift",
) -> list[dict]:
    """
    Return the most recent entries of one JSON-lines log file.

    Args:
        limit: Maximum number of trailing log lines to consider. ``0`` yields
            an empty list.
        log_type: 'drift' | 'performance' | 'retrain' | 'feedback'

    Returns:
        The parsed JSON values of the last ``limit`` lines, oldest first.
        Lines that are not valid JSON are skipped, so fewer than ``limit``
        entries may be returned. A missing log file yields an empty list.

    Raises:
        HTTPException: 400 for an unknown ``log_type`` or a negative ``limit``.
    """
    from collections import deque

    log_paths = {
        "drift": resolve(settings.monitoring.drift_report_path),
        "performance": resolve(settings.monitoring.performance_log_path),
        "retrain": resolve(settings.retraining.retrain_log_path),
        "feedback": resolve(settings.delayed_feedback.feedback_log_path),
    }

    path = log_paths.get(log_type)
    if path is None:
        raise HTTPException(status_code=400, detail=f"Unknown log_type '{log_type}'")

    # ``lines[-0:]`` is the whole list and a negative limit would slice from the
    # front, so both cases are handled explicitly instead of being sliced.
    if limit < 0:
        raise HTTPException(status_code=400, detail=f"limit must be >= 0, got {limit}")
    if limit == 0:
        return []

    # Stream the file and keep only the trailing ``limit`` lines in memory.
    try:
        with path.open(encoding="utf-8") as handle:
            tail = deque(handle, maxlen=limit)
    except FileNotFoundError:
        return []

    entries = []
    skipped = 0
    for line in tail:
        try:
            entries.append(json.loads(line))
        except json.JSONDecodeError:
            # Best effort: a blank or corrupt line must not fail the request.
            skipped += 1

    if skipped:
        log.debug("Skipped %d malformed line(s) in %s log.", skipped, log_type)
    return entries


# ------------------------------------------------------------------
# Background task helper
# ------------------------------------------------------------------

async def _run_retraining(app_state: Any, rca_result: dict) -> None:
    """Fire-and-forget retraining coroutine."""
    try:
        monitor = app_state.monitor
        pipeline = app_state.retrain_pipeline
        live_df = monitor.get_matched_dataframe()

        if len(live_df) < 50:
            log.warning("Not enough labeled samples for retraining (%d).", len(live_df))
            return

        result = pipeline.run(
            training_df=live_df,
            eval_df=live_df.sample(frac=0.2, random_state=42),
            rca_report=rca_result,
            tags={"trigger": "auto_drift"},
        )

        if result["promoted"]:
            model = app_state.registry.load_champion()
            if model:
                app_state.model = model
                app_state.model_version = f"v{int(time.time())}"
                if result["challenger_metrics"].get("rmse"):
                    monitor.set_baseline_rmse(result["challenger_metrics"]["rmse"])
            app_state.trigger.record_retrain_completed()
            app_state.samples_since_last_retrain = 0
            log.info("Auto-retraining complete β€” new champion promoted.")
    except Exception as exc:
        log.error("Background retraining failed: %s", exc, exc_info=True)