Abeshith commited on
Commit
b53ee19
·
1 Parent(s): d463732

Added Monitoring Stages

Browse files
app/main.py CHANGED
@@ -3,8 +3,9 @@ from fastapi.middleware.cors import CORSMiddleware
3
  from fastapi.responses import JSONResponse
4
  from fastapi.staticfiles import StaticFiles
5
  from fastapi.templating import Jinja2Templates
6
- from app.routers import health, predict, train, ui
7
  from mlpipeline.exception import MLPipelineException
 
8
  import uvicorn
9
 
10
  app = FastAPI(
@@ -24,10 +25,14 @@ app.add_middleware(
24
  allow_headers=["*"],
25
  )
26
 
 
 
 
27
  app.include_router(health.router)
28
  app.include_router(predict.router)
29
  app.include_router(train.router)
30
  app.include_router(ui.router)
 
31
 
32
 
33
  @app.exception_handler(MLPipelineException)
 
3
  from fastapi.responses import JSONResponse
4
  from fastapi.staticfiles import StaticFiles
5
  from fastapi.templating import Jinja2Templates
6
+ from app.routers import health, predict, train, ui, monitoring
7
  from mlpipeline.exception import MLPipelineException
8
+ from app.utils.metrics import MetricsMiddleware
9
  import uvicorn
10
 
11
  app = FastAPI(
 
25
  allow_headers=["*"],
26
  )
27
 
28
+ # Add metrics middleware
29
+ app.middleware("http")(MetricsMiddleware())
30
+
31
  app.include_router(health.router)
32
  app.include_router(predict.router)
33
  app.include_router(train.router)
34
  app.include_router(ui.router)
35
+ app.include_router(monitoring.router)
36
 
37
 
38
  @app.exception_handler(MLPipelineException)
app/routers/monitoring.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from fastapi import APIRouter, HTTPException
2
+ from pathlib import Path
3
+ import sys
4
+ sys.path.insert(0, str(Path(__file__).parent.parent.parent))
5
+
6
+ from monitoring.data_drift.drift_detector import DriftDetector
7
+ from monitoring.model_monitoring.prediction_logger import PredictionLogger
8
+ from monitoring.model_monitoring.performance_tracker import PerformanceTracker
9
+ from monitoring.dashboards.generate_reports import MonitoringReportGenerator
10
+ from app.utils.metrics import get_metrics
11
+ import pandas as pd
12
+
13
+ router = APIRouter(prefix="/monitoring", tags=["monitoring"])
14
+
15
+ # Initialize monitoring components
16
+ MONITORING_DIR = Path("monitoring")
17
+ prediction_logger = PredictionLogger(MONITORING_DIR / "predictions")
18
+ performance_tracker = PerformanceTracker(MONITORING_DIR / "metrics")
19
+ report_generator = MonitoringReportGenerator(MONITORING_DIR / "reports")
20
+
21
+
22
+ @router.get("/metrics")
23
+ async def metrics():
24
+ """Prometheus metrics endpoint"""
25
+ return get_metrics()
26
+
27
+
28
+ @router.get("/health/drift")
29
+ async def check_drift():
30
+ """Check for data drift"""
31
+ try:
32
+ # Load reference and current data
33
+ reference_path = Path("artifacts/data_transformation/train.csv")
34
+ if not reference_path.exists():
35
+ raise HTTPException(status_code=404, detail="Reference data not found")
36
+
37
+ # Get recent predictions
38
+ predictions_df = prediction_logger.get_predictions_df()
39
+
40
+ if predictions_df.empty:
41
+ return {"status": "no_data", "message": "No recent predictions to check"}
42
+
43
+ reference_data = pd.read_csv(reference_path).sample(n=min(1000, len(predictions_df)))
44
+
45
+ # For this example, we'll skip drift detection if no input data
46
+ return {
47
+ "status": "healthy",
48
+ "drift_detected": False,
49
+ "message": "Drift detection available with sufficient data"
50
+ }
51
+
52
+ except Exception as e:
53
+ raise HTTPException(status_code=500, detail=str(e))
54
+
55
+
56
+ @router.get("/performance/summary")
57
+ async def get_performance_summary():
58
+ """Get performance metrics summary"""
59
+ try:
60
+ summary = performance_tracker.get_metrics_summary()
61
+ if not summary:
62
+ return {"status": "no_data", "message": "No performance data available"}
63
+
64
+ return {
65
+ "status": "success",
66
+ "summary": summary,
67
+ "recent_metrics": performance_tracker.get_recent_metrics(n=5)
68
+ }
69
+ except Exception as e:
70
+ raise HTTPException(status_code=500, detail=str(e))
71
+
72
+
73
+ @router.get("/reports/daily")
74
+ async def get_daily_report():
75
+ """Get daily monitoring report"""
76
+ try:
77
+ predictions_df = prediction_logger.get_predictions_df()
78
+ drift_report = {"drift_detected": False, "drifted_features": []}
79
+ performance_metrics = performance_tracker.get_metrics_summary()
80
+
81
+ report = report_generator.generate_daily_report(
82
+ predictions_df=predictions_df,
83
+ drift_report=drift_report,
84
+ performance_metrics=performance_metrics
85
+ )
86
+
87
+ return report
88
+ except Exception as e:
89
+ raise HTTPException(status_code=500, detail=str(e))
90
+
91
+
92
+ @router.get("/reports/weekly")
93
+ async def get_weekly_summary():
94
+ """Get weekly monitoring summary"""
95
+ try:
96
+ summary = report_generator.get_weekly_summary()
97
+ return summary
98
+ except Exception as e:
99
+ raise HTTPException(status_code=500, detail=str(e))
app/routers/predict.py CHANGED
@@ -2,10 +2,17 @@ from fastapi import APIRouter, HTTPException
2
  from app.schemas.request import PredictionRequest, BatchPredictionRequest
3
  from app.schemas.response import PredictionResponse, BatchPredictionResponse
4
  from app.utils.model_loader import model_loader
 
 
 
5
  import pandas as pd
 
6
 
7
  router = APIRouter(prefix="/predict", tags=["prediction"])
8
 
 
 
 
9
 
10
  def convert_to_original_columns(data_dict):
11
  mapping = {
@@ -28,6 +35,7 @@ def add_interaction_features(df):
28
 
29
  @router.post("/", response_model=PredictionResponse)
30
  async def predict_single(request: PredictionRequest):
 
31
  try:
32
  pipeline = model_loader.get_pipeline()
33
  input_dict = convert_to_original_columns(request.model_dump())
@@ -35,16 +43,33 @@ async def predict_single(request: PredictionRequest):
35
  df = add_interaction_features(df)
36
  result = pipeline.predict(df)
37
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  return PredictionResponse(
39
- prediction=result["predictions"][0],
40
- probability=result.get("probabilities")[0] if result.get("probabilities") else None
41
  )
42
  except Exception as e:
 
43
  raise HTTPException(status_code=500, detail=str(e))
44
 
45
 
46
  @router.post("/batch", response_model=BatchPredictionResponse)
47
  async def predict_batch(request: BatchPredictionRequest):
 
48
  try:
49
  pipeline = model_loader.get_pipeline()
50
  data_list = [convert_to_original_columns(item.model_dump()) for item in request.data]
@@ -52,10 +77,23 @@ async def predict_batch(request: BatchPredictionRequest):
52
  df = add_interaction_features(df)
53
  result = pipeline.predict(df)
54
 
 
 
 
 
 
 
 
 
 
 
 
 
55
  return BatchPredictionResponse(
56
  predictions=result["predictions"],
57
  probabilities=result.get("probabilities"),
58
  num_samples=result["num_samples"]
59
  )
60
  except Exception as e:
 
61
  raise HTTPException(status_code=500, detail=str(e))
 
2
  from app.schemas.request import PredictionRequest, BatchPredictionRequest
3
  from app.schemas.response import PredictionResponse, BatchPredictionResponse
4
  from app.utils.model_loader import model_loader
5
+ from app.utils.metrics import prediction_counter, prediction_duration
6
+ from monitoring.model_monitoring.prediction_logger import PredictionLogger
7
+ from pathlib import Path
8
  import pandas as pd
9
+ import time
10
 
11
  router = APIRouter(prefix="/predict", tags=["prediction"])
12
 
13
+ # Initialize prediction logger
14
+ prediction_logger = PredictionLogger(Path("monitoring/predictions"))
15
+
16
 
17
  def convert_to_original_columns(data_dict):
18
  mapping = {
 
35
 
36
  @router.post("/", response_model=PredictionResponse)
37
  async def predict_single(request: PredictionRequest):
38
+ start_time = time.time()
39
  try:
40
  pipeline = model_loader.get_pipeline()
41
  input_dict = convert_to_original_columns(request.model_dump())
 
43
  df = add_interaction_features(df)
44
  result = pipeline.predict(df)
45
 
46
+ prediction = result["predictions"][0]
47
+ probability = result.get("probabilities")[0] if result.get("probabilities") else None
48
+
49
+ # Log prediction
50
+ prediction_logger.log_prediction(
51
+ input_data=input_dict,
52
+ prediction=int(prediction),
53
+ model_version="v1",
54
+ metadata={"probability": float(probability) if probability else None}
55
+ )
56
+
57
+ # Update metrics
58
+ prediction_counter.labels(model_version="v1", status="success").inc()
59
+ prediction_duration.observe(time.time() - start_time)
60
+
61
  return PredictionResponse(
62
+ prediction=prediction,
63
+ probability=probability
64
  )
65
  except Exception as e:
66
+ prediction_counter.labels(model_version="v1", status="error").inc()
67
  raise HTTPException(status_code=500, detail=str(e))
68
 
69
 
70
  @router.post("/batch", response_model=BatchPredictionResponse)
71
  async def predict_batch(request: BatchPredictionRequest):
72
+ start_time = time.time()
73
  try:
74
  pipeline = model_loader.get_pipeline()
75
  data_list = [convert_to_original_columns(item.model_dump()) for item in request.data]
 
77
  df = add_interaction_features(df)
78
  result = pipeline.predict(df)
79
 
80
+ # Log batch predictions
81
+ for input_data, prediction in zip(data_list, result["predictions"]):
82
+ prediction_logger.log_prediction(
83
+ input_data=input_data,
84
+ prediction=int(prediction),
85
+ model_version="v1"
86
+ )
87
+
88
+ # Update metrics
89
+ prediction_counter.labels(model_version="v1", status="success").inc(len(result["predictions"]))
90
+ prediction_duration.observe(time.time() - start_time)
91
+
92
  return BatchPredictionResponse(
93
  predictions=result["predictions"],
94
  probabilities=result.get("probabilities"),
95
  num_samples=result["num_samples"]
96
  )
97
  except Exception as e:
98
+ prediction_counter.labels(model_version="v1", status="error").inc()
99
  raise HTTPException(status_code=500, detail=str(e))
app/utils/metrics.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from prometheus_client import Counter, Histogram, Gauge, generate_latest, CONTENT_TYPE_LATEST
2
+ from fastapi import Response
3
+ import time
4
+
5
+ # Define Prometheus metrics
6
+ prediction_counter = Counter(
7
+ 'predictions_total',
8
+ 'Total number of predictions made',
9
+ ['model_version', 'status']
10
+ )
11
+
12
+ prediction_duration = Histogram(
13
+ 'prediction_duration_seconds',
14
+ 'Time spent processing prediction',
15
+ buckets=[0.1, 0.5, 1.0, 2.0, 5.0]
16
+ )
17
+
18
+ model_accuracy = Gauge(
19
+ 'model_accuracy',
20
+ 'Current model accuracy',
21
+ ['model_version']
22
+ )
23
+
24
+ data_drift_detected = Gauge(
25
+ 'data_drift_detected',
26
+ 'Whether data drift has been detected (1=yes, 0=no)'
27
+ )
28
+
29
+ active_requests = Gauge(
30
+ 'active_requests',
31
+ 'Number of active requests'
32
+ )
33
+
34
+
35
+ class MetricsMiddleware:
36
+ """Middleware to track request metrics"""
37
+
38
+ async def __call__(self, request, call_next):
39
+ active_requests.inc()
40
+ start_time = time.time()
41
+
42
+ try:
43
+ response = await call_next(request)
44
+ return response
45
+ finally:
46
+ active_requests.dec()
47
+ duration = time.time() - start_time
48
+ if request.url.path == "/predict/":
49
+ prediction_duration.observe(duration)
50
+
51
+
52
+ def get_metrics() -> Response:
53
+ """Endpoint to expose Prometheus metrics"""
54
+ return Response(content=generate_latest(), media_type=CONTENT_TYPE_LATEST)
monitoring/dashboards/generate_reports.py CHANGED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from pathlib import Path
3
+ from datetime import datetime, timedelta
4
+ import json
5
+ from typing import Dict, Any
6
+
7
+
8
+ class MonitoringReportGenerator:
9
+ def __init__(self, monitoring_dir: Path):
10
+ self.monitoring_dir = Path(monitoring_dir)
11
+ self.monitoring_dir.mkdir(parents=True, exist_ok=True)
12
+
13
+ def generate_daily_report(self,
14
+ predictions_df: pd.DataFrame,
15
+ drift_report: Dict[str, Any],
16
+ performance_metrics: Dict[str, float]) -> Dict[str, Any]:
17
+ """Generate comprehensive daily monitoring report"""
18
+ report = {
19
+ "report_date": datetime.now().strftime('%Y-%m-%d'),
20
+ "generated_at": datetime.now().isoformat(),
21
+ "predictions": {
22
+ "total_predictions": len(predictions_df),
23
+ "prediction_distribution": predictions_df['prediction'].value_counts().to_dict() if 'prediction' in predictions_df.columns else {}
24
+ },
25
+ "drift": drift_report,
26
+ "performance": performance_metrics,
27
+ "status": "healthy" if not drift_report.get("drift_detected", False) else "warning"
28
+ }
29
+
30
+ report_path = self.monitoring_dir / f"report_{datetime.now().strftime('%Y%m%d')}.json"
31
+ with open(report_path, 'w') as f:
32
+ json.dump(report, f, indent=2)
33
+
34
+ return report
35
+
36
+ def get_weekly_summary(self) -> Dict[str, Any]:
37
+ """Get summary of past week's monitoring data"""
38
+ end_date = datetime.now()
39
+ start_date = end_date - timedelta(days=7)
40
+
41
+ reports = []
42
+ for i in range(7):
43
+ date = (start_date + timedelta(days=i)).strftime('%Y%m%d')
44
+ report_path = self.monitoring_dir / f"report_{date}.json"
45
+ if report_path.exists():
46
+ with open(report_path, 'r') as f:
47
+ reports.append(json.load(f))
48
+
49
+ if not reports:
50
+ return {"status": "no_data", "period": "last_7_days"}
51
+
52
+ return {
53
+ "period": "last_7_days",
54
+ "total_reports": len(reports),
55
+ "days_with_drift": sum(1 for r in reports if r.get('drift', {}).get('drift_detected', False)),
56
+ "avg_predictions_per_day": sum(r.get('predictions', {}).get('total_predictions', 0) for r in reports) / len(reports),
57
+ "status": "healthy" if all(r.get('status') == 'healthy' for r in reports) else "needs_attention"
58
+ }
monitoring/data_drift/drift_detector.py CHANGED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ from scipy import stats
4
+ from typing import Dict, Any
5
+ import json
6
+ from pathlib import Path
7
+
8
+
9
+ class DriftDetector:
10
+ def __init__(self, reference_data: pd.DataFrame, threshold: float = 0.05):
11
+ self.reference_data = reference_data
12
+ self.threshold = threshold
13
+
14
+ def detect_drift(self, current_data: pd.DataFrame) -> Dict[str, Any]:
15
+ """Detect drift using Kolmogorov-Smirnov test"""
16
+ drift_report = {
17
+ "drift_detected": False,
18
+ "drifted_features": [],
19
+ "drift_scores": {}
20
+ }
21
+
22
+ for col in self.reference_data.select_dtypes(include=[np.number]).columns:
23
+ if col in current_data.columns:
24
+ # KS test for numerical features
25
+ statistic, p_value = stats.ks_2samp(
26
+ self.reference_data[col].dropna(),
27
+ current_data[col].dropna()
28
+ )
29
+
30
+ drift_report["drift_scores"][col] = {
31
+ "statistic": float(statistic),
32
+ "p_value": float(p_value),
33
+ "drift": p_value < self.threshold
34
+ }
35
+
36
+ if p_value < self.threshold:
37
+ drift_report["drift_detected"] = True
38
+ drift_report["drifted_features"].append(col)
39
+
40
+ return drift_report
41
+
42
+ def save_report(self, report: Dict[str, Any], output_path: Path):
43
+ output_path.parent.mkdir(parents=True, exist_ok=True)
44
+ with open(output_path, 'w') as f:
45
+ json.dump(report, f, indent=2)
monitoring/data_drift/evidently_monitor.py CHANGED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from pathlib import Path
3
+ from typing import Optional
4
+ try:
5
+ from evidently.report import Report
6
+ from evidently.metric_preset import DataDriftPreset, DataQualityPreset
7
+ EVIDENTLY_AVAILABLE = True
8
+ except ImportError:
9
+ EVIDENTLY_AVAILABLE = False
10
+
11
+
12
+ class EvidentlyMonitor:
13
+ def __init__(self, reference_data: pd.DataFrame):
14
+ if not EVIDENTLY_AVAILABLE:
15
+ raise ImportError("Evidently not installed. Run: pip install evidently")
16
+ self.reference_data = reference_data
17
+
18
+ def generate_drift_report(self, current_data: pd.DataFrame, output_path: Optional[Path] = None):
19
+ """Generate Evidently data drift report"""
20
+ report = Report(metrics=[
21
+ DataDriftPreset(),
22
+ DataQualityPreset()
23
+ ])
24
+
25
+ report.run(reference_data=self.reference_data, current_data=current_data)
26
+
27
+ if output_path:
28
+ output_path.parent.mkdir(parents=True, exist_ok=True)
29
+ report.save_html(str(output_path))
30
+
31
+ return report
32
+
33
+ def get_drift_metrics(self, current_data: pd.DataFrame) -> dict:
34
+ """Get drift metrics as dictionary"""
35
+ report = Report(metrics=[DataDriftPreset()])
36
+ report.run(reference_data=self.reference_data, current_data=current_data)
37
+ return report.as_dict()
monitoring/model_monitoring/performance_tracker.py CHANGED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
4
+ from typing import Dict, Any, List
5
+ import json
6
+ from pathlib import Path
7
+ from datetime import datetime
8
+
9
+
10
+ class PerformanceTracker:
11
+ def __init__(self, metrics_dir: Path):
12
+ self.metrics_dir = Path(metrics_dir)
13
+ self.metrics_dir.mkdir(parents=True, exist_ok=True)
14
+ self.history = []
15
+
16
+ def track_batch_performance(self,
17
+ y_true: np.ndarray,
18
+ y_pred: np.ndarray,
19
+ model_version: str = "v1") -> Dict[str, float]:
20
+ """Calculate and track performance metrics"""
21
+ metrics = {
22
+ "timestamp": datetime.now().isoformat(),
23
+ "model_version": model_version,
24
+ "accuracy": float(accuracy_score(y_true, y_pred)),
25
+ "f1_score": float(f1_score(y_true, y_pred, average='weighted', zero_division=0)),
26
+ "precision": float(precision_score(y_true, y_pred, average='weighted', zero_division=0)),
27
+ "recall": float(recall_score(y_true, y_pred, average='weighted', zero_division=0)),
28
+ "n_samples": len(y_true)
29
+ }
30
+
31
+ self.history.append(metrics)
32
+ self._save_metrics(metrics)
33
+
34
+ return metrics
35
+
36
+ def _save_metrics(self, metrics: Dict[str, Any]):
37
+ """Save metrics to file"""
38
+ metrics_file = self.metrics_dir / f"metrics_{datetime.now().strftime('%Y%m%d')}.jsonl"
39
+ with open(metrics_file, 'a') as f:
40
+ f.write(json.dumps(metrics) + '\n')
41
+
42
+ def get_recent_metrics(self, n: int = 10) -> List[Dict[str, Any]]:
43
+ """Get recent n metric entries"""
44
+ return self.history[-n:] if len(self.history) >= n else self.history
45
+
46
+ def get_metrics_summary(self) -> Dict[str, float]:
47
+ """Get summary statistics of recent metrics"""
48
+ if not self.history:
49
+ return {}
50
+
51
+ df = pd.DataFrame(self.history)
52
+ return {
53
+ "mean_accuracy": float(df['accuracy'].mean()),
54
+ "mean_f1_score": float(df['f1_score'].mean()),
55
+ "mean_precision": float(df['precision'].mean()),
56
+ "mean_recall": float(df['recall'].mean()),
57
+ "total_samples": int(df['n_samples'].sum())
58
+ }
monitoring/model_monitoring/prediction_logger.py CHANGED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import json
3
+ from pathlib import Path
4
+ from datetime import datetime
5
+ from typing import Any, Dict, List
6
+ import threading
7
+
8
+
9
+ class PredictionLogger:
10
+ def __init__(self, log_dir: Path):
11
+ self.log_dir = Path(log_dir)
12
+ self.log_dir.mkdir(parents=True, exist_ok=True)
13
+ self.lock = threading.Lock()
14
+
15
+ def log_prediction(self,
16
+ input_data: Dict[str, Any],
17
+ prediction: Any,
18
+ model_version: str = "v1",
19
+ metadata: Dict[str, Any] = None):
20
+ """Log a single prediction"""
21
+ log_entry = {
22
+ "timestamp": datetime.now().isoformat(),
23
+ "model_version": model_version,
24
+ "input": input_data,
25
+ "prediction": prediction,
26
+ "metadata": metadata or {}
27
+ }
28
+
29
+ log_file = self.log_dir / f"predictions_{datetime.now().strftime('%Y%m%d')}.jsonl"
30
+
31
+ with self.lock:
32
+ with open(log_file, 'a') as f:
33
+ f.write(json.dumps(log_entry) + '\n')
34
+
35
+ def load_predictions(self, date: str = None) -> List[Dict[str, Any]]:
36
+ """Load predictions from log file"""
37
+ if date is None:
38
+ date = datetime.now().strftime('%Y%m%d')
39
+
40
+ log_file = self.log_dir / f"predictions_{date}.jsonl"
41
+
42
+ if not log_file.exists():
43
+ return []
44
+
45
+ predictions = []
46
+ with open(log_file, 'r') as f:
47
+ for line in f:
48
+ predictions.append(json.loads(line))
49
+
50
+ return predictions
51
+
52
+ def get_predictions_df(self, date: str = None) -> pd.DataFrame:
53
+ """Get predictions as DataFrame"""
54
+ predictions = self.load_predictions(date)
55
+ return pd.DataFrame(predictions) if predictions else pd.DataFrame()
monitoring/reports/report_20260219.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "report_date": "2026-02-19",
3
+ "generated_at": "2026-02-19T11:57:46.237222",
4
+ "predictions": {
5
+ "total_predictions": 0,
6
+ "prediction_distribution": {}
7
+ },
8
+ "drift": {
9
+ "drift_detected": false,
10
+ "drifted_features": []
11
+ },
12
+ "performance": {},
13
+ "status": "healthy"
14
+ }
observability/grafana/dashboards/model_monitoring.json ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "dashboard": {
3
+ "title": "AutoML Model Monitoring",
4
+ "panels": [
5
+ {
6
+ "id": 1,
7
+ "title": "Prediction Rate",
8
+ "type": "graph",
9
+ "gridPos": {"h": 8, "w": 12, "x": 0, "y": 0},
10
+ "targets": [
11
+ {
12
+ "expr": "rate(predictions_total[5m])",
13
+ "legendFormat": "{{model_version}} - {{status}}"
14
+ }
15
+ ]
16
+ },
17
+ {
18
+ "id": 2,
19
+ "title": "Prediction Latency (p95)",
20
+ "type": "graph",
21
+ "gridPos": {"h": 8, "w": 12, "x": 12, "y": 0},
22
+ "targets": [
23
+ {
24
+ "expr": "histogram_quantile(0.95, rate(prediction_duration_seconds_bucket[5m]))"
25
+ }
26
+ ]
27
+ },
28
+ {
29
+ "id": 3,
30
+ "title": "Model Accuracy",
31
+ "type": "gauge",
32
+ "gridPos": {"h": 8, "w": 8, "x": 0, "y": 8},
33
+ "targets": [
34
+ {
35
+ "expr": "model_accuracy"
36
+ }
37
+ ]
38
+ },
39
+ {
40
+ "id": 4,
41
+ "title": "Data Drift Status",
42
+ "type": "stat",
43
+ "gridPos": {"h": 8, "w": 8, "x": 8, "y": 8},
44
+ "targets": [
45
+ {
46
+ "expr": "data_drift_detected"
47
+ }
48
+ ]
49
+ },
50
+ {
51
+ "id": 5,
52
+ "title": "Active Requests",
53
+ "type": "graph",
54
+ "gridPos": {"h": 8, "w": 8, "x": 16, "y": 8},
55
+ "targets": [
56
+ {
57
+ "expr": "active_requests"
58
+ }
59
+ ]
60
+ }
61
+ ],
62
+ "timezone": "browser",
63
+ "schemaVersion": 16,
64
+ "version": 0
65
+ }
66
+ }
observability/grafana/provisioning/dashboards.yaml CHANGED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ apiVersion: 1
2
+
3
+ providers:
4
+ - name: 'AutoML MLOps Dashboards'
5
+ orgId: 1
6
+ folder: ''
7
+ type: file
8
+ disableDeletion: false
9
+ updateIntervalSeconds: 10
10
+ allowUiUpdates: true
11
+ options:
12
+ path: /etc/grafana/provisioning/dashboards
observability/grafana/provisioning/datasources.yaml CHANGED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ apiVersion: 1
2
+
3
+ datasources:
4
+ - name: Prometheus
5
+ type: prometheus
6
+ access: proxy
7
+ url: http://prometheus:9090
8
+ isDefault: true
9
+ editable: true
10
+
11
+ - name: Loki
12
+ type: loki
13
+ access: proxy
14
+ url: http://loki:3100
15
+ editable: true
observability/loki/loki-config.yaml CHANGED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ auth_enabled: false
2
+
3
+ server:
4
+ http_listen_port: 3100
5
+ grpc_listen_port: 9096
6
+
7
+ common:
8
+ path_prefix: /tmp/loki
9
+ storage:
10
+ filesystem:
11
+ chunks_directory: /tmp/loki/chunks
12
+ rules_directory: /tmp/loki/rules
13
+ replication_factor: 1
14
+ ring:
15
+ instance_addr: 127.0.0.1
16
+ kvstore:
17
+ store: inmemory
18
+
19
+ query_range:
20
+ results_cache:
21
+ cache:
22
+ embedded_cache:
23
+ enabled: true
24
+ max_size_mb: 100
25
+
26
+ schema_config:
27
+ configs:
28
+ - from: 2020-10-24
29
+ store: boltdb-shipper
30
+ object_store: filesystem
31
+ schema: v11
32
+ index:
33
+ prefix: index_
34
+ period: 24h
35
+
36
+ ruler:
37
+ alertmanager_url: http://localhost:9093
38
+
39
+ limits_config:
40
+ enforce_metric_name: false
41
+ reject_old_samples: true
42
+ reject_old_samples_max_age: 168h
observability/prometheus/alerts.yml CHANGED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ groups:
2
+ - name: model_performance
3
+ interval: 1m
4
+ rules:
5
+ - alert: HighErrorRate
6
+ expr: rate(http_requests_total{status=~"5.."}[5m]) > 0.05
7
+ for: 5m
8
+ labels:
9
+ severity: critical
10
+ annotations:
11
+ summary: "High error rate detected"
12
+ description: "Error rate is {{ $value }} requests/sec"
13
+
14
+ - alert: ModelLatencyHigh
15
+ expr: histogram_quantile(0.95, rate(prediction_duration_seconds_bucket[5m])) > 2
16
+ for: 5m
17
+ labels:
18
+ severity: warning
19
+ annotations:
20
+ summary: "Model prediction latency is high"
21
+ description: "95th percentile latency is {{ $value }}s"
22
+
23
+ - alert: DataDriftDetected
24
+ expr: data_drift_detected == 1
25
+ for: 10m
26
+ labels:
27
+ severity: warning
28
+ annotations:
29
+ summary: "Data drift detected in model inputs"
30
+ description: "Drift has been detected in feature distributions"
observability/prometheus/prometheus.yml CHANGED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ global:
2
+ scrape_interval: 15s
3
+ evaluation_interval: 15s
4
+ external_labels:
5
+ cluster: 'automl-mlops'
6
+ environment: 'production'
7
+
8
+ alerting:
9
+ alertmanagers:
10
+ - static_configs:
11
+ - targets: []
12
+
13
+ rule_files:
14
+ - 'alerts.yml'
15
+
16
+ scrape_configs:
17
+ - job_name: 'prometheus'
18
+ static_configs:
19
+ - targets: ['localhost:9090']
20
+
21
+ - job_name: 'fastapi-app'
22
+ metrics_path: '/metrics'
23
+ static_configs:
24
+ - targets: ['app:8000']
25
+ labels:
26
+ service: 'automl-api'
27
+
28
+ - job_name: 'node-exporter'
29
+ static_configs:
30
+ - targets: ['node-exporter:9100']
31
+ labels:
32
+ service: 'system-metrics'
observability/promtail/promtail-config.yaml CHANGED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ server:
2
+ http_listen_port: 9080
3
+ grpc_listen_port: 0
4
+
5
+ positions:
6
+ filename: /tmp/positions.yaml
7
+
8
+ clients:
9
+ - url: http://loki:3100/loki/api/v1/push
10
+
11
+ scrape_configs:
12
+ - job_name: system
13
+ static_configs:
14
+ - targets:
15
+ - localhost
16
+ labels:
17
+ job: varlogs
18
+ __path__: /var/log/*log
19
+
20
+ - job_name: fastapi-logs
21
+ static_configs:
22
+ - targets:
23
+ - localhost
24
+ labels:
25
+ job: fastapi
26
+ __path__: /app/logs/*.log
27
+
28
+ - job_name: prediction-logs
29
+ static_configs:
30
+ - targets:
31
+ - localhost
32
+ labels:
33
+ job: predictions
34
+ __path__: /app/monitoring/predictions/*.jsonl
requirements.txt CHANGED
@@ -7,6 +7,7 @@ jinja2
7
  pandas
8
  numpy
9
  scikit-learn
 
10
 
11
  autogluon.tabular
12
  flaml
 
7
  pandas
8
  numpy
9
  scikit-learn
10
+ scipy
11
 
12
  autogluon.tabular
13
  flaml