Spaces:

Abeshith
/

AutoML_MLOps_PipeLine

Sleeping

App Files Files Community

Abeshith committed on Feb 23

Commit

b53ee19

1 Parent(s): d463732

Added Monitoring Stages

Browse files

Files changed (18) hide show

app/main.py +6 -1
app/routers/monitoring.py +99 -0
app/routers/predict.py +40 -2
app/utils/metrics.py +54 -0
monitoring/dashboards/generate_reports.py +58 -0
monitoring/data_drift/drift_detector.py +45 -0
monitoring/data_drift/evidently_monitor.py +37 -0
monitoring/model_monitoring/performance_tracker.py +58 -0
monitoring/model_monitoring/prediction_logger.py +55 -0
monitoring/reports/report_20260219.json +14 -0
observability/grafana/dashboards/model_monitoring.json +66 -0
observability/grafana/provisioning/dashboards.yaml +12 -0
observability/grafana/provisioning/datasources.yaml +15 -0
observability/loki/loki-config.yaml +42 -0
observability/prometheus/alerts.yml +30 -0
observability/prometheus/prometheus.yml +32 -0
observability/promtail/promtail-config.yaml +34 -0
requirements.txt +1 -0

app/main.py CHANGED Viewed

@@ -3,8 +3,9 @@ from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import JSONResponse
 from fastapi.staticfiles import StaticFiles
 from fastapi.templating import Jinja2Templates
-from app.routers import health, predict, train, ui
 from mlpipeline.exception import MLPipelineException
 import uvicorn
 app = FastAPI(
@@ -24,10 +25,14 @@ app.add_middleware(
     allow_headers=["*"],
 )
 app.include_router(health.router)
 app.include_router(predict.router)
 app.include_router(train.router)
 app.include_router(ui.router)
 @app.exception_handler(MLPipelineException)

 from fastapi.responses import JSONResponse
 from fastapi.staticfiles import StaticFiles
 from fastapi.templating import Jinja2Templates
+from app.routers import health, predict, train, ui, monitoring
 from mlpipeline.exception import MLPipelineException
+from app.utils.metrics import MetricsMiddleware
 import uvicorn
 app = FastAPI(
     allow_headers=["*"],
 )
+# Add metrics middleware
+app.middleware("http")(MetricsMiddleware())
 app.include_router(health.router)
 app.include_router(predict.router)
 app.include_router(train.router)
 app.include_router(ui.router)
+app.include_router(monitoring.router)
 @app.exception_handler(MLPipelineException)

app/routers/monitoring.py ADDED Viewed

	@@ -0,0 +1,99 @@

+from fastapi import APIRouter, HTTPException
+from pathlib import Path
+import sys
+sys.path.insert(0, str(Path(__file__).parent.parent.parent))
+from monitoring.data_drift.drift_detector import DriftDetector
+from monitoring.model_monitoring.prediction_logger import PredictionLogger
+from monitoring.model_monitoring.performance_tracker import PerformanceTracker
+from monitoring.dashboards.generate_reports import MonitoringReportGenerator
+from app.utils.metrics import get_metrics
+import pandas as pd
+router = APIRouter(prefix="/monitoring", tags=["monitoring"])  # every route in this module is served under /monitoring
+# Shared monitoring components, created once at import time; MONITORING_DIR is relative, so it resolves against the process working directory
+MONITORING_DIR = Path("monitoring")
+prediction_logger = PredictionLogger(MONITORING_DIR / "predictions")
+performance_tracker = PerformanceTracker(MONITORING_DIR / "metrics")
+report_generator = MonitoringReportGenerator(MONITORING_DIR / "reports")
+@router.get("/metrics")
+async def metrics():
+    """Prometheus metrics endpoint"""
+    return get_metrics()
+@router.get("/health/drift")
+async def check_drift():
+    """Check for data drift"""
+    try:
+        # Load reference and current data
+        reference_path = Path("artifacts/data_transformation/train.csv")
+        if not reference_path.exists():
+            raise HTTPException(status_code=404, detail="Reference data not found")
+        # Get recent predictions
+        predictions_df = prediction_logger.get_predictions_df()
+        if predictions_df.empty:
+            return {"status": "no_data", "message": "No recent predictions to check"}
+        reference_data = pd.read_csv(reference_path).sample(n=min(1000, len(predictions_df)))
+        # For this example, we'll skip drift detection if no input data
+        return {
+            "status": "healthy",
+            "drift_detected": False,
+            "message": "Drift detection available with sufficient data"
+        }
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+@router.get("/performance/summary")
+async def get_performance_summary():
+    """Get performance metrics summary"""
+    try:
+        summary = performance_tracker.get_metrics_summary()
+        if not summary:
+            return {"status": "no_data", "message": "No performance data available"}
+        return {
+            "status": "success",
+            "summary": summary,
+            "recent_metrics": performance_tracker.get_recent_metrics(n=5)
+        }
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+@router.get("/reports/daily")
+async def get_daily_report():
+    """Get daily monitoring report"""
+    try:
+        predictions_df = prediction_logger.get_predictions_df()
+        drift_report = {"drift_detected": False, "drifted_features": []}
+        performance_metrics = performance_tracker.get_metrics_summary()
+        report = report_generator.generate_daily_report(
+            predictions_df=predictions_df,
+            drift_report=drift_report,
+            performance_metrics=performance_metrics
+        )
+        return report
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+@router.get("/reports/weekly")
+async def get_weekly_summary():
+    """Get weekly monitoring summary"""
+    try:
+        summary = report_generator.get_weekly_summary()
+        return summary
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))

app/routers/predict.py CHANGED Viewed

@@ -2,10 +2,17 @@ from fastapi import APIRouter, HTTPException
 from app.schemas.request import PredictionRequest, BatchPredictionRequest
 from app.schemas.response import PredictionResponse, BatchPredictionResponse
 from app.utils.model_loader import model_loader
 import pandas as pd
 router = APIRouter(prefix="/predict", tags=["prediction"])
 def convert_to_original_columns(data_dict):
     mapping = {
@@ -28,6 +35,7 @@ def add_interaction_features(df):
 @router.post("/", response_model=PredictionResponse)
 async def predict_single(request: PredictionRequest):
     try:
         pipeline = model_loader.get_pipeline()
         input_dict = convert_to_original_columns(request.model_dump())
@@ -35,16 +43,33 @@ async def predict_single(request: PredictionRequest):
         df = add_interaction_features(df)
         result = pipeline.predict(df)
         return PredictionResponse(
-            prediction=result["predictions"][0],
-            probability=result.get("probabilities")[0] if result.get("probabilities") else None
         )
     except Exception as e:
         raise HTTPException(status_code=500, detail=str(e))
 @router.post("/batch", response_model=BatchPredictionResponse)
 async def predict_batch(request: BatchPredictionRequest):
     try:
         pipeline = model_loader.get_pipeline()
         data_list = [convert_to_original_columns(item.model_dump()) for item in request.data]
@@ -52,10 +77,23 @@ async def predict_batch(request: BatchPredictionRequest):
         df = add_interaction_features(df)
         result = pipeline.predict(df)
         return BatchPredictionResponse(
             predictions=result["predictions"],
             probabilities=result.get("probabilities"),
             num_samples=result["num_samples"]
         )
     except Exception as e:
         raise HTTPException(status_code=500, detail=str(e))

 from app.schemas.request import PredictionRequest, BatchPredictionRequest
 from app.schemas.response import PredictionResponse, BatchPredictionResponse
 from app.utils.model_loader import model_loader
+from app.utils.metrics import prediction_counter, prediction_duration
+from monitoring.model_monitoring.prediction_logger import PredictionLogger
+from pathlib import Path
 import pandas as pd
+import time
 router = APIRouter(prefix="/predict", tags=["prediction"])
+# Initialize prediction logger
+prediction_logger = PredictionLogger(Path("monitoring/predictions"))
 def convert_to_original_columns(data_dict):
     mapping = {
 @router.post("/", response_model=PredictionResponse)
 async def predict_single(request: PredictionRequest):
+    start_time = time.time()
     try:
         pipeline = model_loader.get_pipeline()
         input_dict = convert_to_original_columns(request.model_dump())
         df = add_interaction_features(df)
         result = pipeline.predict(df)
+        prediction = result["predictions"][0]
+        probability = result.get("probabilities")[0] if result.get("probabilities") else None
+        # Log prediction
+        prediction_logger.log_prediction(
+            input_data=input_dict,
+            prediction=int(prediction),
+            model_version="v1",
+            metadata={"probability": float(probability) if probability else None}
+        )
+        # Update metrics
+        prediction_counter.labels(model_version="v1", status="success").inc()
+        prediction_duration.observe(time.time() - start_time)
         return PredictionResponse(
+            prediction=prediction,
+            probability=probability
         )
     except Exception as e:
+        prediction_counter.labels(model_version="v1", status="error").inc()
         raise HTTPException(status_code=500, detail=str(e))
 @router.post("/batch", response_model=BatchPredictionResponse)
 async def predict_batch(request: BatchPredictionRequest):
+    start_time = time.time()
     try:
         pipeline = model_loader.get_pipeline()
         data_list = [convert_to_original_columns(item.model_dump()) for item in request.data]
         df = add_interaction_features(df)
         result = pipeline.predict(df)
+        # Log batch predictions
+        for input_data, prediction in zip(data_list, result["predictions"]):
+            prediction_logger.log_prediction(
+                input_data=input_data,
+                prediction=int(prediction),
+                model_version="v1"
+            )
+        # Update metrics
+        prediction_counter.labels(model_version="v1", status="success").inc(len(result["predictions"]))
+        prediction_duration.observe(time.time() - start_time)
         return BatchPredictionResponse(
             predictions=result["predictions"],
             probabilities=result.get("probabilities"),
             num_samples=result["num_samples"]
         )
     except Exception as e:
+        prediction_counter.labels(model_version="v1", status="error").inc()
         raise HTTPException(status_code=500, detail=str(e))

app/utils/metrics.py ADDED Viewed

	@@ -0,0 +1,54 @@

+from prometheus_client import Counter, Histogram, Gauge, generate_latest, CONTENT_TYPE_LATEST
+from fastapi import Response
+import time
+# Prometheus metrics, created once when this module is imported
+prediction_counter = Counter(
+    'predictions_total',
+    'Total number of predictions made',
+    ['model_version', 'status']  # label names; callers must supply both via .labels(...)
+)
+prediction_duration = Histogram(
+    'prediction_duration_seconds',
+    'Time spent processing prediction',
+    buckets=[0.1, 0.5, 1.0, 2.0, 5.0]  # bucket upper bounds, in seconds
+)
+model_accuracy = Gauge(
+    'model_accuracy',
+    'Current model accuracy',
+    ['model_version']
+)
+data_drift_detected = Gauge(
+    'data_drift_detected',
+    'Whether data drift has been detected (1=yes, 0=no)'
+)
+active_requests = Gauge(
+    'active_requests',
+    'Number of active requests'
+)
+class MetricsMiddleware:
+    """Middleware to track request metrics"""
+    async def __call__(self, request, call_next):
+        active_requests.inc()
+        start_time = time.time()
+        try:
+            response = await call_next(request)
+            return response
+        finally:
+            active_requests.dec()
+            duration = time.time() - start_time
+            if request.url.path == "/predict/":
+                prediction_duration.observe(duration)
+def get_metrics() -> Response:
+    """Endpoint to expose Prometheus metrics"""
+    return Response(content=generate_latest(), media_type=CONTENT_TYPE_LATEST)

monitoring/dashboards/generate_reports.py CHANGED Viewed

	@@ -0,0 +1,58 @@

+import pandas as pd
+from pathlib import Path
+from datetime import datetime, timedelta
+import json
+from typing import Dict, Any
+class MonitoringReportGenerator:
+    def __init__(self, monitoring_dir: Path):
+        self.monitoring_dir = Path(monitoring_dir)
+        self.monitoring_dir.mkdir(parents=True, exist_ok=True)
+    def generate_daily_report(self,
+                            predictions_df: pd.DataFrame,
+                            drift_report: Dict[str, Any],
+                            performance_metrics: Dict[str, float]) -> Dict[str, Any]:
+        """Generate comprehensive daily monitoring report"""
+        report = {
+            "report_date": datetime.now().strftime('%Y-%m-%d'),
+            "generated_at": datetime.now().isoformat(),
+            "predictions": {
+                "total_predictions": len(predictions_df),
+                "prediction_distribution": predictions_df['prediction'].value_counts().to_dict() if 'prediction' in predictions_df.columns else {}
+            },
+            "drift": drift_report,
+            "performance": performance_metrics,
+            "status": "healthy" if not drift_report.get("drift_detected", False) else "warning"
+        }
+        report_path = self.monitoring_dir / f"report_{datetime.now().strftime('%Y%m%d')}.json"
+        with open(report_path, 'w') as f:
+            json.dump(report, f, indent=2)
+        return report
+    def get_weekly_summary(self) -> Dict[str, Any]:
+        """Get summary of past week's monitoring data"""
+        end_date = datetime.now()
+        start_date = end_date - timedelta(days=7)
+        reports = []
+        for i in range(7):
+            date = (start_date + timedelta(days=i)).strftime('%Y%m%d')
+            report_path = self.monitoring_dir / f"report_{date}.json"
+            if report_path.exists():
+                with open(report_path, 'r') as f:
+                    reports.append(json.load(f))
+        if not reports:
+            return {"status": "no_data", "period": "last_7_days"}
+        return {
+            "period": "last_7_days",
+            "total_reports": len(reports),
+            "days_with_drift": sum(1 for r in reports if r.get('drift', {}).get('drift_detected', False)),
+            "avg_predictions_per_day": sum(r.get('predictions', {}).get('total_predictions', 0) for r in reports) / len(reports),
+            "status": "healthy" if all(r.get('status') == 'healthy' for r in reports) else "needs_attention"
+        }

monitoring/data_drift/drift_detector.py CHANGED Viewed

	@@ -0,0 +1,45 @@

+import pandas as pd
+import numpy as np
+from scipy import stats
+from typing import Dict, Any
+import json
+from pathlib import Path
+class DriftDetector:
+    def __init__(self, reference_data: pd.DataFrame, threshold: float = 0.05):
+        self.reference_data = reference_data
+        self.threshold = threshold
+    def detect_drift(self, current_data: pd.DataFrame) -> Dict[str, Any]:
+        """Detect drift using Kolmogorov-Smirnov test"""
+        drift_report = {
+            "drift_detected": False,
+            "drifted_features": [],
+            "drift_scores": {}
+        }
+        for col in self.reference_data.select_dtypes(include=[np.number]).columns:
+            if col in current_data.columns:
+                # KS test for numerical features
+                statistic, p_value = stats.ks_2samp(
+                    self.reference_data[col].dropna(),
+                    current_data[col].dropna()
+                )
+                drift_report["drift_scores"][col] = {
+                    "statistic": float(statistic),
+                    "p_value": float(p_value),
+                    "drift": p_value < self.threshold
+                }
+                if p_value < self.threshold:
+                    drift_report["drift_detected"] = True
+                    drift_report["drifted_features"].append(col)
+        return drift_report
+    def save_report(self, report: Dict[str, Any], output_path: Path):
+        output_path.parent.mkdir(parents=True, exist_ok=True)
+        with open(output_path, 'w') as f:
+            json.dump(report, f, indent=2)

monitoring/data_drift/evidently_monitor.py CHANGED Viewed

	@@ -0,0 +1,37 @@

+import pandas as pd
+from pathlib import Path
+from typing import Optional
+try:
+    from evidently.report import Report
+    from evidently.metric_preset import DataDriftPreset, DataQualityPreset
+    EVIDENTLY_AVAILABLE = True
+except ImportError:
+    EVIDENTLY_AVAILABLE = False
+class EvidentlyMonitor:
+    def __init__(self, reference_data: pd.DataFrame):
+        if not EVIDENTLY_AVAILABLE:
+            raise ImportError("Evidently not installed. Run: pip install evidently")
+        self.reference_data = reference_data
+    def generate_drift_report(self, current_data: pd.DataFrame, output_path: Optional[Path] = None):
+        """Generate Evidently data drift report"""
+        report = Report(metrics=[
+            DataDriftPreset(),
+            DataQualityPreset()
+        ])
+        report.run(reference_data=self.reference_data, current_data=current_data)
+        if output_path:
+            output_path.parent.mkdir(parents=True, exist_ok=True)
+            report.save_html(str(output_path))
+        return report
+    def get_drift_metrics(self, current_data: pd.DataFrame) -> dict:
+        """Get drift metrics as dictionary"""
+        report = Report(metrics=[DataDriftPreset()])
+        report.run(reference_data=self.reference_data, current_data=current_data)
+        return report.as_dict()

monitoring/model_monitoring/performance_tracker.py CHANGED Viewed

	@@ -0,0 +1,58 @@

+import pandas as pd
+import numpy as np
+from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
+from typing import Dict, Any, List
+import json
+from pathlib import Path
+from datetime import datetime
+class PerformanceTracker:
+    def __init__(self, metrics_dir: Path):
+        self.metrics_dir = Path(metrics_dir)
+        self.metrics_dir.mkdir(parents=True, exist_ok=True)
+        self.history = []
+    def track_batch_performance(self,
+                               y_true: np.ndarray,
+                               y_pred: np.ndarray,
+                               model_version: str = "v1") -> Dict[str, float]:
+        """Calculate and track performance metrics"""
+        metrics = {
+            "timestamp": datetime.now().isoformat(),
+            "model_version": model_version,
+            "accuracy": float(accuracy_score(y_true, y_pred)),
+            "f1_score": float(f1_score(y_true, y_pred, average='weighted', zero_division=0)),
+            "precision": float(precision_score(y_true, y_pred, average='weighted', zero_division=0)),
+            "recall": float(recall_score(y_true, y_pred, average='weighted', zero_division=0)),
+            "n_samples": len(y_true)
+        }
+        self.history.append(metrics)
+        self._save_metrics(metrics)
+        return metrics
+    def _save_metrics(self, metrics: Dict[str, Any]):
+        """Save metrics to file"""
+        metrics_file = self.metrics_dir / f"metrics_{datetime.now().strftime('%Y%m%d')}.jsonl"
+        with open(metrics_file, 'a') as f:
+            f.write(json.dumps(metrics) + '\n')
+    def get_recent_metrics(self, n: int = 10) -> List[Dict[str, Any]]:
+        """Get recent n metric entries"""
+        return self.history[-n:] if len(self.history) >= n else self.history
+    def get_metrics_summary(self) -> Dict[str, float]:
+        """Get summary statistics of recent metrics"""
+        if not self.history:
+            return {}
+        df = pd.DataFrame(self.history)
+        return {
+            "mean_accuracy": float(df['accuracy'].mean()),
+            "mean_f1_score": float(df['f1_score'].mean()),
+            "mean_precision": float(df['precision'].mean()),
+            "mean_recall": float(df['recall'].mean()),
+            "total_samples": int(df['n_samples'].sum())
+        }

monitoring/model_monitoring/prediction_logger.py CHANGED Viewed

	@@ -0,0 +1,55 @@

+import pandas as pd
+import json
+from pathlib import Path
+from datetime import datetime
+from typing import Any, Dict, List
+import threading
+class PredictionLogger:
+    def __init__(self, log_dir: Path):
+        self.log_dir = Path(log_dir)
+        self.log_dir.mkdir(parents=True, exist_ok=True)
+        self.lock = threading.Lock()
+    def log_prediction(self,
+                      input_data: Dict[str, Any],
+                      prediction: Any,
+                      model_version: str = "v1",
+                      metadata: Dict[str, Any] = None):
+        """Log a single prediction"""
+        log_entry = {
+            "timestamp": datetime.now().isoformat(),
+            "model_version": model_version,
+            "input": input_data,
+            "prediction": prediction,
+            "metadata": metadata or {}
+        }
+        log_file = self.log_dir / f"predictions_{datetime.now().strftime('%Y%m%d')}.jsonl"
+        with self.lock:
+            with open(log_file, 'a') as f:
+                f.write(json.dumps(log_entry) + '\n')
+    def load_predictions(self, date: str = None) -> List[Dict[str, Any]]:
+        """Load predictions from log file"""
+        if date is None:
+            date = datetime.now().strftime('%Y%m%d')
+        log_file = self.log_dir / f"predictions_{date}.jsonl"
+        if not log_file.exists():
+            return []
+        predictions = []
+        with open(log_file, 'r') as f:
+            for line in f:
+                predictions.append(json.loads(line))
+        return predictions
+    def get_predictions_df(self, date: str = None) -> pd.DataFrame:
+        """Get predictions as DataFrame"""
+        predictions = self.load_predictions(date)
+        return pd.DataFrame(predictions) if predictions else pd.DataFrame()

monitoring/reports/report_20260219.json ADDED Viewed

	@@ -0,0 +1,14 @@

+{
+  "report_date": "2026-02-19",
+  "generated_at": "2026-02-19T11:57:46.237222",
+  "predictions": {
+    "total_predictions": 0,
+    "prediction_distribution": {}
+  },
+  "drift": {
+    "drift_detected": false,
+    "drifted_features": []
+  },
+  "performance": {},
+  "status": "healthy"
+}

observability/grafana/dashboards/model_monitoring.json ADDED Viewed

	@@ -0,0 +1,66 @@

+{
+  "dashboard": {
+    "title": "AutoML Model Monitoring",
+    "panels": [
+      {
+        "id": 1,
+        "title": "Prediction Rate",
+        "type": "graph",
+        "gridPos": {"h": 8, "w": 12, "x": 0, "y": 0},
+        "targets": [
+          {
+            "expr": "rate(predictions_total[5m])",
+            "legendFormat": "{{model_version}} - {{status}}"
+          }
+        ]
+      },
+      {
+        "id": 2,
+        "title": "Prediction Latency (p95)",
+        "type": "graph",
+        "gridPos": {"h": 8, "w": 12, "x": 12, "y": 0},
+        "targets": [
+          {
+            "expr": "histogram_quantile(0.95, rate(prediction_duration_seconds_bucket[5m]))"
+          }
+        ]
+      },
+      {
+        "id": 3,
+        "title": "Model Accuracy",
+        "type": "gauge",
+        "gridPos": {"h": 8, "w": 8, "x": 0, "y": 8},
+        "targets": [
+          {
+            "expr": "model_accuracy"
+          }
+        ]
+      },
+      {
+        "id": 4,
+        "title": "Data Drift Status",
+        "type": "stat",
+        "gridPos": {"h": 8, "w": 8, "x": 8, "y": 8},
+        "targets": [
+          {
+            "expr": "data_drift_detected"
+          }
+        ]
+      },
+      {
+        "id": 5,
+        "title": "Active Requests",
+        "type": "graph",
+        "gridPos": {"h": 8, "w": 8, "x": 16, "y": 8},
+        "targets": [
+          {
+            "expr": "active_requests"
+          }
+        ]
+      }
+    ],
+    "timezone": "browser",
+    "schemaVersion": 16,
+    "version": 0
+  }
+}

observability/grafana/provisioning/dashboards.yaml CHANGED Viewed

	@@ -0,0 +1,12 @@

+apiVersion: 1
+providers:
+  - name: 'AutoML MLOps Dashboards'
+    orgId: 1
+    folder: ''
+    type: file
+    disableDeletion: false
+    updateIntervalSeconds: 10
+    allowUiUpdates: true
+    options:
+      path: /etc/grafana/provisioning/dashboards

observability/grafana/provisioning/datasources.yaml CHANGED Viewed

	@@ -0,0 +1,15 @@

+apiVersion: 1
+datasources:
+  - name: Prometheus
+    type: prometheus
+    access: proxy
+    url: http://prometheus:9090
+    isDefault: true
+    editable: true
+  - name: Loki
+    type: loki
+    access: proxy
+    url: http://loki:3100
+    editable: true

observability/loki/loki-config.yaml CHANGED Viewed

	@@ -0,0 +1,42 @@

+auth_enabled: false
+server:
+  http_listen_port: 3100
+  grpc_listen_port: 9096
+common:
+  path_prefix: /tmp/loki
+  storage:
+    filesystem:
+      chunks_directory: /tmp/loki/chunks
+      rules_directory: /tmp/loki/rules
+  replication_factor: 1
+  ring:
+    instance_addr: 127.0.0.1
+    kvstore:
+      store: inmemory
+query_range:
+  results_cache:
+    cache:
+      embedded_cache:
+        enabled: true
+        max_size_mb: 100
+schema_config:
+  configs:
+    - from: 2020-10-24
+      store: boltdb-shipper
+      object_store: filesystem
+      schema: v11
+      index:
+        prefix: index_
+        period: 24h
+ruler:
+  alertmanager_url: http://localhost:9093
+limits_config:
+  enforce_metric_name: false
+  reject_old_samples: true
+  reject_old_samples_max_age: 168h

observability/prometheus/alerts.yml CHANGED Viewed

	@@ -0,0 +1,30 @@

+groups:
+  - name: model_performance
+    interval: 1m
+    rules:
+      - alert: HighErrorRate
+        # predictions_total with status="error" is the error counter the app's metrics module defines
+        expr: sum(rate(predictions_total{status="error"}[5m])) > 0.05
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "High error rate detected"
+          description: "Error rate is {{ $value }} requests/sec"
+      - alert: ModelLatencyHigh
+        expr: histogram_quantile(0.95, rate(prediction_duration_seconds_bucket[5m])) > 2
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Model prediction latency is high"
+          description: "95th percentile latency is {{ $value }}s"
+      - alert: DataDriftDetected
+        expr: data_drift_detected == 1
+        for: 10m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Data drift detected in model inputs"
+          description: "Drift has been detected in feature distributions"

observability/prometheus/prometheus.yml CHANGED Viewed

	@@ -0,0 +1,32 @@

+global:
+  scrape_interval: 15s
+  evaluation_interval: 15s
+  external_labels:
+    cluster: 'automl-mlops'
+    environment: 'production'
+alerting:
+  alertmanagers:
+    - static_configs:
+        - targets: []
+rule_files:
+  - 'alerts.yml'
+scrape_configs:
+  - job_name: 'prometheus'
+    static_configs:
+      - targets: ['localhost:9090']
+  - job_name: 'fastapi-app'
+    # The metrics endpoint lives on the monitoring router (prefix /monitoring), not at the root
+    metrics_path: '/monitoring/metrics'
+    static_configs:
+      - targets: ['app:8000']
+        labels:
+          service: 'automl-api'
+  - job_name: 'node-exporter'
+    static_configs:
+      - targets: ['node-exporter:9100']
+        labels:
+          service: 'system-metrics'

observability/promtail/promtail-config.yaml CHANGED Viewed

	@@ -0,0 +1,34 @@

+server:
+  http_listen_port: 9080
+  grpc_listen_port: 0
+positions:
+  filename: /tmp/positions.yaml
+clients:
+  - url: http://loki:3100/loki/api/v1/push
+scrape_configs:
+  - job_name: system
+    static_configs:
+      - targets:
+          - localhost
+        labels:
+          job: varlogs
+          __path__: /var/log/*log
+  - job_name: fastapi-logs
+    static_configs:
+      - targets:
+          - localhost
+        labels:
+          job: fastapi
+          __path__: /app/logs/*.log
+  - job_name: prediction-logs
+    static_configs:
+      - targets:
+          - localhost
+        labels:
+          job: predictions
+          __path__: /app/monitoring/predictions/*.jsonl

requirements.txt CHANGED Viewed

@@ -7,6 +7,7 @@ jinja2
 pandas
 numpy
 scikit-learn
 autogluon.tabular
 flaml

 pandas
 numpy
 scikit-learn
+scipy
 autogluon.tabular
 flaml