Merge pull request #33 from se4ai2526-uniba/Milestone-6
Browse files- .gitattributes +1 -0
- Dockerfile +3 -0
- README.md +42 -0
- docker-compose.yml +73 -0
- hopcroft_skill_classification_tool_competition/main.py +77 -2
- monitoring/README.md +207 -0
- monitoring/alertmanager/config.yml +21 -0
- monitoring/drift/scripts/prepare_baseline.py +119 -0
- monitoring/drift/scripts/requirements.txt +6 -0
- monitoring/drift/scripts/run_drift_check.py +216 -0
- monitoring/grafana/dashboards/hopcroft_dashboard.json +358 -0
- monitoring/grafana/provisioning/dashboards/dashboard.yml +13 -0
- monitoring/grafana/provisioning/dashboards/hopcroft_dashboard.json +358 -0
- monitoring/grafana/provisioning/datasources/prometheus.yml +14 -0
- monitoring/locust/README.md +101 -0
- monitoring/locust/locustfile.py +99 -0
- monitoring/prometheus/alert_rules.yml +32 -0
- monitoring/prometheus/prometheus.yml +32 -0
- monitoring/screenshots/incident acknowlege mail.png +3 -0
- monitoring/screenshots/incident acknowlege.png +3 -0
- monitoring/screenshots/incident mail.png +3 -0
- monitoring/screenshots/incident resolved mail.png +3 -0
- monitoring/screenshots/incident resolved.png +3 -0
- monitoring/screenshots/incident.png +3 -0
- monitoring/screenshots/monitors.png +3 -0
- nginx.conf +92 -0
- reports/alerting_test_report/alerting_report.md +31 -0
- reports/alerting_test_report/alertmanager_firing.png +3 -0
- reports/alerting_test_report/prometheus_firing.png +3 -0
- requirements.txt +3 -0
- scripts/start_space.sh +65 -11
.gitattributes
CHANGED
|
@@ -1 +1,2 @@
|
|
|
|
|
| 1 |
*.png filter=lfs diff=lfs merge=lfs -text
|
|
|
|
| 1 |
+
docs/img/*.png filter=lfs diff=lfs merge=lfs -text
|
| 2 |
*.png filter=lfs diff=lfs merge=lfs -text
|
Dockerfile
CHANGED
|
@@ -11,6 +11,9 @@ ENV PYTHONDONTWRITEBYTECODE=1 \
|
|
| 11 |
RUN apt-get update && apt-get install -y \
|
| 12 |
git \
|
| 13 |
dos2unix \
|
|
|
|
|
|
|
|
|
|
| 14 |
&& rm -rf /var/lib/apt/lists/*
|
| 15 |
|
| 16 |
# Create a non-root user
|
|
|
|
| 11 |
RUN apt-get update && apt-get install -y \
|
| 12 |
git \
|
| 13 |
dos2unix \
|
| 14 |
+
nginx \
|
| 15 |
+
procps \
|
| 16 |
+
curl \
|
| 17 |
&& rm -rf /var/lib/apt/lists/*
|
| 18 |
|
| 19 |
# Create a non-root user
|
README.md
CHANGED
|
@@ -5,6 +5,7 @@ colorFrom: blue
|
|
| 5 |
colorTo: green
|
| 6 |
sdk: docker
|
| 7 |
app_port: 7860
|
|
|
|
| 8 |
---
|
| 9 |
|
| 10 |
# Hopcroft_Skill-Classification-Tool-Competition
|
|
@@ -477,6 +478,47 @@ docker-compose down
|
|
| 477 |
```
|
| 478 |
|
| 479 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 480 |
## Demo UI (Streamlit)
|
| 481 |
|
| 482 |
The Streamlit GUI provides an interactive web interface for the skill classification API.
|
|
|
|
| 5 |
colorTo: green
|
| 6 |
sdk: docker
|
| 7 |
app_port: 7860
|
| 8 |
+
api_docs_url: /docs
|
| 9 |
---
|
| 10 |
|
| 11 |
# Hopcroft_Skill-Classification-Tool-Competition
|
|
|
|
| 478 |
```
|
| 479 |
|
| 480 |
|
| 481 |
+
--------
|
| 482 |
+
|
| 483 |
+
## Hugging Face Spaces Deployment
|
| 484 |
+
|
| 485 |
+
This project is configured to run on [Hugging Face Spaces](https://huggingface.co/spaces) using Docker.
|
| 486 |
+
|
| 487 |
+
### 1. Setup Space
|
| 488 |
+
1. Create a new Space on Hugging Face.
|
| 489 |
+
2. Select **Docker** as the SDK.
|
| 490 |
+
3. Choose the **Blank** template or upload your code.
|
| 491 |
+
|
| 492 |
+
### 2. Configure Secrets
|
| 493 |
+
To enable the application to pull models from DagsHub via DVC, you must configure the following **Variables and Secrets** in your Space settings:
|
| 494 |
+
|
| 495 |
+
| Name | Type | Description |
|
| 496 |
+
|------|------|-------------|
|
| 497 |
+
| `DAGSHUB_USERNAME` | Secret | Your DagsHub username. |
|
| 498 |
+
| `DAGSHUB_TOKEN` | Secret | Your DagsHub access token (Settings -> Tokens). |
|
| 499 |
+
|
| 500 |
+
> [!IMPORTANT]
|
| 501 |
+
> These secrets are injected into the container at runtime. The `scripts/start_space.sh` script uses them to authenticate DVC and pull the required model files (`.pkl`) before starting the API and GUI.
|
| 502 |
+
|
| 503 |
+
### 3. Automated Startup
|
| 504 |
+
The deployment follows this automated flow:
|
| 505 |
+
1. **Dockerfile**: Builds the environment, installs dependencies, and sets up Nginx.
|
| 506 |
+
2. **scripts/start_space.sh**:
|
| 507 |
+
- Configures DVC with your secrets.
|
| 508 |
+
- Pulls models from the DagsHub remote.
|
| 509 |
+
- Starts the **FastAPI** backend (port 8000).
|
| 510 |
+
- Starts the **Streamlit** frontend (port 8501).
|
| 511 |
+
- Starts **Nginx** (port 7860) as a reverse proxy to route traffic.
|
| 512 |
+
|
| 513 |
+
### 4. Direct Access
|
| 514 |
+
Once deployed, your Space will be available at:
|
| 515 |
+
`https://huggingface.co/spaces/se4ai2526-uniba/Hopcroft`
|
| 516 |
+
|
| 517 |
+
The API documentation will be accessible at:
|
| 518 |
+
`https://huggingface.co/spaces/se4ai2526-uniba/Hopcroft/docs`
|
| 519 |
+
|
| 520 |
+
--------
|
| 521 |
+
|
| 522 |
## Demo UI (Streamlit)
|
| 523 |
|
| 524 |
The Streamlit GUI provides an interactive web interface for the skill classification API.
|
docker-compose.yml
CHANGED
|
@@ -47,6 +47,75 @@ services:
|
|
| 47 |
condition: service_healthy
|
| 48 |
restart: unless-stopped
|
| 49 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
networks:
|
| 51 |
hopcroft-net:
|
| 52 |
driver: bridge
|
|
@@ -54,3 +123,7 @@ networks:
|
|
| 54 |
volumes:
|
| 55 |
hopcroft-logs:
|
| 56 |
driver: local
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
condition: service_healthy
|
| 48 |
restart: unless-stopped
|
| 49 |
|
| 50 |
+
prometheus:
|
| 51 |
+
image: prom/prometheus:latest
|
| 52 |
+
container_name: prometheus
|
| 53 |
+
volumes:
|
| 54 |
+
- ./monitoring/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
|
| 55 |
+
- ./monitoring/prometheus/alert_rules.yml:/etc/prometheus/alert_rules.yml
|
| 56 |
+
ports:
|
| 57 |
+
- "9090:9090"
|
| 58 |
+
networks:
|
| 59 |
+
- hopcroft-net
|
| 60 |
+
depends_on:
|
| 61 |
+
- alertmanager
|
| 62 |
+
restart: unless-stopped
|
| 63 |
+
|
| 64 |
+
alertmanager:
|
| 65 |
+
image: prom/alertmanager:latest
|
| 66 |
+
container_name: alertmanager
|
| 67 |
+
volumes:
|
| 68 |
+
- ./monitoring/alertmanager/config.yml:/etc/alertmanager/config.yml
|
| 69 |
+
ports:
|
| 70 |
+
- "9093:9093"
|
| 71 |
+
networks:
|
| 72 |
+
- hopcroft-net
|
| 73 |
+
restart: unless-stopped
|
| 74 |
+
|
| 75 |
+
grafana:
|
| 76 |
+
image: grafana/grafana:latest
|
| 77 |
+
container_name: grafana
|
| 78 |
+
ports:
|
| 79 |
+
- "3000:3000"
|
| 80 |
+
environment:
|
| 81 |
+
- GF_SECURITY_ADMIN_USER=admin
|
| 82 |
+
- GF_SECURITY_ADMIN_PASSWORD=admin
|
| 83 |
+
- GF_USERS_ALLOW_SIGN_UP=false
|
| 84 |
+
- GF_SERVER_ROOT_URL=http://localhost:3000
|
| 85 |
+
volumes:
|
| 86 |
+
# Provisioning: auto-configure datasources and dashboards
|
| 87 |
+
- ./monitoring/grafana/provisioning/datasources:/etc/grafana/provisioning/datasources
|
| 88 |
+
- ./monitoring/grafana/provisioning/dashboards:/etc/grafana/provisioning/dashboards
|
| 89 |
+
- ./monitoring/grafana/dashboards:/var/lib/grafana/dashboards
|
| 90 |
+
# Persistent storage for Grafana data
|
| 91 |
+
- grafana-data:/var/lib/grafana
|
| 92 |
+
networks:
|
| 93 |
+
- hopcroft-net
|
| 94 |
+
depends_on:
|
| 95 |
+
- prometheus
|
| 96 |
+
restart: unless-stopped
|
| 97 |
+
healthcheck:
|
| 98 |
+
test: ["CMD-SHELL", "curl -f http://localhost:3000/api/health || exit 1"]
|
| 99 |
+
interval: 30s
|
| 100 |
+
timeout: 10s
|
| 101 |
+
retries: 3
|
| 102 |
+
|
| 103 |
+
pushgateway:
|
| 104 |
+
image: prom/pushgateway:latest
|
| 105 |
+
container_name: pushgateway
|
| 106 |
+
ports:
|
| 107 |
+
- "9091:9091"
|
| 108 |
+
networks:
|
| 109 |
+
- hopcroft-net
|
| 110 |
+
restart: unless-stopped
|
| 111 |
+
command:
|
| 112 |
+
- '--web.listen-address=:9091'
|
| 113 |
+
- '--persistence.file=/data/pushgateway.data'
|
| 114 |
+
- '--persistence.interval=5m'
|
| 115 |
+
volumes:
|
| 116 |
+
- pushgateway-data:/data
|
| 117 |
+
|
| 118 |
+
|
| 119 |
networks:
|
| 120 |
hopcroft-net:
|
| 121 |
driver: bridge
|
|
|
|
| 123 |
volumes:
|
| 124 |
hopcroft-logs:
|
| 125 |
driver: local
|
| 126 |
+
grafana-data:
|
| 127 |
+
driver: local
|
| 128 |
+
pushgateway-data:
|
| 129 |
+
driver: local
|
hopcroft_skill_classification_tool_competition/main.py
CHANGED
|
@@ -22,9 +22,17 @@ import os
|
|
| 22 |
import time
|
| 23 |
from typing import List
|
| 24 |
|
| 25 |
-
from fastapi import FastAPI, HTTPException, status
|
| 26 |
from fastapi.responses import JSONResponse, RedirectResponse
|
| 27 |
import mlflow
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
from pydantic import ValidationError
|
| 29 |
|
| 30 |
from hopcroft_skill_classification_tool_competition.api_models import (
|
|
@@ -40,6 +48,34 @@ from hopcroft_skill_classification_tool_competition.api_models import (
|
|
| 40 |
from hopcroft_skill_classification_tool_competition.config import MLFLOW_CONFIG
|
| 41 |
from hopcroft_skill_classification_tool_competition.modeling.predict import SkillPredictor
|
| 42 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
predictor = None
|
| 44 |
model_version = "1.0.0"
|
| 45 |
|
|
@@ -85,6 +121,43 @@ app = FastAPI(
|
|
| 85 |
)
|
| 86 |
|
| 87 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
@app.get("/", tags=["Root"])
|
| 89 |
async def root():
|
| 90 |
"""Return basic API information."""
|
|
@@ -143,9 +216,11 @@ async def predict_skills(issue: IssueInput) -> PredictionRecord:
|
|
| 143 |
|
| 144 |
# Combine text fields if needed, or just use issue_text
|
| 145 |
# The predictor expects a single string
|
|
|
|
| 146 |
full_text = f"{issue.issue_text} {issue.issue_description or ''} {issue.repo_name or ''}"
|
| 147 |
|
| 148 |
-
|
|
|
|
| 149 |
|
| 150 |
# Convert to Pydantic models
|
| 151 |
predictions = [
|
|
|
|
| 22 |
import time
|
| 23 |
from typing import List
|
| 24 |
|
| 25 |
+
from fastapi import FastAPI, HTTPException, status, Request, Response
|
| 26 |
from fastapi.responses import JSONResponse, RedirectResponse
|
| 27 |
import mlflow
|
| 28 |
+
from prometheus_client import (
|
| 29 |
+
CONTENT_TYPE_LATEST,
|
| 30 |
+
Counter,
|
| 31 |
+
Gauge,
|
| 32 |
+
Histogram,
|
| 33 |
+
Summary,
|
| 34 |
+
generate_latest,
|
| 35 |
+
)
|
| 36 |
from pydantic import ValidationError
|
| 37 |
|
| 38 |
from hopcroft_skill_classification_tool_competition.api_models import (
|
|
|
|
| 48 |
from hopcroft_skill_classification_tool_competition.config import MLFLOW_CONFIG
|
| 49 |
from hopcroft_skill_classification_tool_competition.modeling.predict import SkillPredictor
|
| 50 |
|
| 51 |
+
# Define Prometheus Metrics
|
| 52 |
+
# Counter: Total number of requests
|
| 53 |
+
REQUESTS_TOTAL = Counter(
|
| 54 |
+
"hopcroft_requests_total",
|
| 55 |
+
"Total number of requests",
|
| 56 |
+
["method", "endpoint", "http_status"],
|
| 57 |
+
)
|
| 58 |
+
|
| 59 |
+
# Histogram: Request duration
|
| 60 |
+
REQUEST_DURATION_SECONDS = Histogram(
|
| 61 |
+
"hopcroft_request_duration_seconds",
|
| 62 |
+
"Request duration in seconds",
|
| 63 |
+
["method", "endpoint"],
|
| 64 |
+
)
|
| 65 |
+
|
| 66 |
+
# Gauge: In-progress requests
|
| 67 |
+
IN_PROGRESS_REQUESTS = Gauge(
|
| 68 |
+
"hopcroft_in_progress_requests",
|
| 69 |
+
"Number of requests currently in progress",
|
| 70 |
+
["method", "endpoint"],
|
| 71 |
+
)
|
| 72 |
+
|
| 73 |
+
# Summary: Model prediction time
|
| 74 |
+
MODEL_PREDICTION_SECONDS = Summary(
|
| 75 |
+
"hopcroft_prediction_processing_seconds",
|
| 76 |
+
"Time spent processing model predictions",
|
| 77 |
+
)
|
| 78 |
+
|
| 79 |
predictor = None
|
| 80 |
model_version = "1.0.0"
|
| 81 |
|
|
|
|
| 121 |
)
|
| 122 |
|
| 123 |
|
| 124 |
+
@app.middleware("http")
|
| 125 |
+
async def monitor_requests(request: Request, call_next):
|
| 126 |
+
"""Middleware to collect Prometheus metrics for each request."""
|
| 127 |
+
method = request.method
|
| 128 |
+
# Use a simplified path or template if possible to avoid high cardinality
|
| 129 |
+
# For now, using request.url.path is acceptable for this scale
|
| 130 |
+
endpoint = request.url.path
|
| 131 |
+
|
| 132 |
+
IN_PROGRESS_REQUESTS.labels(method=method, endpoint=endpoint).inc()
|
| 133 |
+
start_time = time.time()
|
| 134 |
+
|
| 135 |
+
try:
|
| 136 |
+
response = await call_next(request)
|
| 137 |
+
status_code = response.status_code
|
| 138 |
+
REQUESTS_TOTAL.labels(
|
| 139 |
+
method=method, endpoint=endpoint, http_status=status_code
|
| 140 |
+
).inc()
|
| 141 |
+
return response
|
| 142 |
+
except Exception as e:
|
| 143 |
+
REQUESTS_TOTAL.labels(
|
| 144 |
+
method=method, endpoint=endpoint, http_status=500
|
| 145 |
+
).inc()
|
| 146 |
+
raise e
|
| 147 |
+
finally:
|
| 148 |
+
duration = time.time() - start_time
|
| 149 |
+
REQUEST_DURATION_SECONDS.labels(method=method, endpoint=endpoint).observe(
|
| 150 |
+
duration
|
| 151 |
+
)
|
| 152 |
+
IN_PROGRESS_REQUESTS.labels(method=method, endpoint=endpoint).dec()
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
@app.get("/metrics", tags=["Observability"])
|
| 156 |
+
async def metrics():
|
| 157 |
+
"""Expose Prometheus metrics."""
|
| 158 |
+
return Response(content=generate_latest(), media_type=CONTENT_TYPE_LATEST)
|
| 159 |
+
|
| 160 |
+
|
| 161 |
@app.get("/", tags=["Root"])
|
| 162 |
async def root():
|
| 163 |
"""Return basic API information."""
|
|
|
|
| 216 |
|
| 217 |
# Combine text fields if needed, or just use issue_text
|
| 218 |
# The predictor expects a single string
|
| 219 |
+
# The predictor expects a single string
|
| 220 |
full_text = f"{issue.issue_text} {issue.issue_description or ''} {issue.repo_name or ''}"
|
| 221 |
|
| 222 |
+
with MODEL_PREDICTION_SECONDS.time():
|
| 223 |
+
predictions_data = predictor.predict(full_text)
|
| 224 |
|
| 225 |
# Convert to Pydantic models
|
| 226 |
predictions = [
|
monitoring/README.md
ADDED
|
@@ -0,0 +1,207 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Metrics Collection & Verification
|
| 2 |
+
|
| 3 |
+
This directory contains the configuration for Prometheus monitoring.
|
| 4 |
+
|
| 5 |
+
## Configuration
|
| 6 |
+
- **Prometheus Config**: `prometheus/prometheus.yml`
|
| 7 |
+
- **Scrape Target**: `hopcroft-api:8080`
|
| 8 |
+
- **Metrics Endpoint**: `http://localhost:8080/metrics`
|
| 9 |
+
|
| 10 |
+
## Verification Queries (PromQL)
|
| 11 |
+
|
| 12 |
+
You can run these queries in the Prometheus Expression Browser (`http://localhost:9090/graph`):
|
| 13 |
+
|
| 14 |
+
### 1. Request Rate (Counter)
|
| 15 |
+
Shows the rate of requests per second over the last minute.
|
| 16 |
+
```promql
|
| 17 |
+
rate(hopcroft_requests_total[1m])
|
| 18 |
+
```
|
| 19 |
+
|
| 20 |
+
### 2. Average Request Duration (Histogram)
|
| 21 |
+
Calculates average latency.
|
| 22 |
+
```promql
|
| 23 |
+
rate(hopcroft_request_duration_seconds_sum[5m]) / rate(hopcroft_request_duration_seconds_count[5m])
|
| 24 |
+
```
|
| 25 |
+
|
| 26 |
+
### 3. Current In-Progress Requests (Gauge)
|
| 27 |
+
Shows how many requests are currently being processed.
|
| 28 |
+
```promql
|
| 29 |
+
hopcroft_in_progress_requests
|
| 30 |
+
```
|
| 31 |
+
|
| 32 |
+
### 4. Model Prediction Time (Summary)
|
| 33 |
+
Shows the 90th percentile of model prediction time.
|
| 34 |
+
```promql
|
| 35 |
+
hopcroft_prediction_processing_seconds{quantile="0.9"}
|
| 36 |
+
```
|
| 37 |
+
|
| 38 |
+
---
|
| 39 |
+
|
| 40 |
+
## Uptime Monitoring (Better Stack)
|
| 41 |
+
|
| 42 |
+
We used Better Stack Uptime to monitor the availability of the production deployment hosted on Hugging Face Spaces.
|
| 43 |
+
|
| 44 |
+
**Base URL**
|
| 45 |
+
- https://dacrow13-hopcroft-skill-classification.hf.space
|
| 46 |
+
|
| 47 |
+
**Monitored endpoints**
|
| 48 |
+
- https://dacrow13-hopcroft-skill-classification.hf.space/health
|
| 49 |
+
- https://dacrow13-hopcroft-skill-classification.hf.space/openapi.json
|
| 50 |
+
- https://dacrow13-hopcroft-skill-classification.hf.space/docs
|
| 51 |
+
|
| 52 |
+
**Checks and alerts**
|
| 53 |
+
- Monitors are configured to run from multiple locations.
|
| 54 |
+
- Email notifications are enabled for failures.
|
| 55 |
+
- A failure scenario was tested to confirm Better Stack reports the server error details.
|
| 56 |
+
|
| 57 |
+
- Screenshots are available in `monitoring/screenshots/`.
|
| 58 |
+
|
| 59 |
+
---
|
| 60 |
+
|
| 61 |
+
## Grafana Dashboard
|
| 62 |
+
|
| 63 |
+
Grafana provides real-time visualization of system metrics and drift detection status.
|
| 64 |
+
|
| 65 |
+
### Configuration
|
| 66 |
+
- **Port**: `3000`
|
| 67 |
+
- **Credentials**: `admin` / `admin`
|
| 68 |
+
- **Dashboard**: Hopcroft Monitoring Dashboard
|
| 69 |
+
- **Datasource**: Prometheus (auto-provisioned)
|
| 70 |
+
- **Provisioning Files**:
|
| 71 |
+
- Datasources: `grafana/provisioning/datasources/prometheus.yml`
|
| 72 |
+
- Dashboards: `grafana/provisioning/dashboards/dashboard.yml`
|
| 73 |
+
- Dashboard JSON: `grafana/dashboards/hopcroft_dashboard.json`
|
| 74 |
+
|
| 75 |
+
### Dashboard Panels
|
| 76 |
+
1. **API Request Rate**: Rate of incoming requests per endpoint
|
| 77 |
+
2. **API Latency**: Average response time per endpoint
|
| 78 |
+
3. **Drift Detection Status**: Real-time drift detection indicator (0=No Drift, 1=Drift Detected)
|
| 79 |
+
4. **Drift P-Value**: Statistical significance of detected drift
|
| 80 |
+
5. **Drift Distance**: Kolmogorov-Smirnov distance metric
|
| 81 |
+
|
| 82 |
+
### Access
|
| 83 |
+
Navigate to `http://localhost:3000` and login with the provided credentials. The dashboard refreshes every 10 seconds.
|
| 84 |
+
|
| 85 |
+
---
|
| 86 |
+
|
| 87 |
+
## Data Drift Detection
|
| 88 |
+
|
| 89 |
+
Automated distribution shift detection using statistical testing to monitor model input data quality.
|
| 90 |
+
|
| 91 |
+
### Algorithm
|
| 92 |
+
- **Method**: Kolmogorov-Smirnov Two-Sample Test (scipy-based)
|
| 93 |
+
- **Baseline Data**: 1000 samples from training set
|
| 94 |
+
- **Detection Threshold**: p-value < 0.05 (with Bonferroni correction)
|
| 95 |
+
- **Metrics Published**: drift_detected, drift_p_value, drift_distance, drift_check_timestamp
|
| 96 |
+
|
| 97 |
+
### Scripts
|
| 98 |
+
|
| 99 |
+
#### Baseline Preparation
|
| 100 |
+
**Script**: `drift/scripts/prepare_baseline.py`
|
| 101 |
+
|
| 102 |
+
Functionality:
|
| 103 |
+
- Loads data from SQLite database (`data/raw/skillscope_data.db`)
|
| 104 |
+
- Extracts numeric features only
|
| 105 |
+
- Samples 1000 representative records
|
| 106 |
+
- Saves to `drift/baseline/reference_data.pkl`
|
| 107 |
+
|
| 108 |
+
Usage:
|
| 109 |
+
```bash
|
| 110 |
+
cd monitoring/drift/scripts
|
| 111 |
+
python prepare_baseline.py
|
| 112 |
+
```
|
| 113 |
+
|
| 114 |
+
#### Drift Detection
|
| 115 |
+
**Script**: `drift/scripts/run_drift_check.py`
|
| 116 |
+
|
| 117 |
+
Functionality:
|
| 118 |
+
- Loads baseline reference data
|
| 119 |
+
- Compares with new production data
|
| 120 |
+
- Performs KS test on each feature
|
| 121 |
+
- Pushes metrics to Pushgateway
|
| 122 |
+
- Saves results to `drift/reports/`
|
| 123 |
+
|
| 124 |
+
Usage:
|
| 125 |
+
```bash
|
| 126 |
+
cd monitoring/drift/scripts
|
| 127 |
+
python run_drift_check.py
|
| 128 |
+
```
|
| 129 |
+
|
| 130 |
+
### Verification
|
| 131 |
+
Check Pushgateway metrics:
|
| 132 |
+
```bash
|
| 133 |
+
curl http://localhost:9091/metrics | grep drift
|
| 134 |
+
```
|
| 135 |
+
|
| 136 |
+
Query in Prometheus:
|
| 137 |
+
```promql
|
| 138 |
+
drift_detected
|
| 139 |
+
drift_p_value
|
| 140 |
+
drift_distance
|
| 141 |
+
```
|
| 142 |
+
|
| 143 |
+
---
|
| 144 |
+
|
| 145 |
+
## Pushgateway
|
| 146 |
+
|
| 147 |
+
Pushgateway collects metrics from short-lived jobs such as the drift detection script.
|
| 148 |
+
|
| 149 |
+
### Configuration
|
| 150 |
+
- **Port**: `9091`
|
| 151 |
+
- **Persistence**: Enabled with 5-minute intervals
|
| 152 |
+
- **Data Volume**: `pushgateway-data`
|
| 153 |
+
|
| 154 |
+
### Metrics Endpoint
|
| 155 |
+
Access metrics at `http://localhost:9091/metrics`
|
| 156 |
+
|
| 157 |
+
### Integration
|
| 158 |
+
The drift detection script pushes metrics to Pushgateway, which are then scraped by Prometheus and displayed in Grafana.
|
| 159 |
+
|
| 160 |
+
---
|
| 161 |
+
|
| 162 |
+
## Alerting
|
| 163 |
+
|
| 164 |
+
Alert rules are defined in `prometheus/alert_rules.yml`:
|
| 165 |
+
|
| 166 |
+
- **High Latency**: Triggered when average latency exceeds 2 seconds
|
| 167 |
+
- **High Error Rate**: Triggered when error rate exceeds 5%
|
| 168 |
+
- **Data Drift Detected**: Triggered when drift_detected = 1
|
| 169 |
+
|
| 170 |
+
Alerts are routed to Alertmanager (`http://localhost:9093`) and can be configured to send notifications via email, Slack, or other channels in `alertmanager/config.yml`.
|
| 171 |
+
|
| 172 |
+
---
|
| 173 |
+
|
| 174 |
+
## Complete Stack Usage
|
| 175 |
+
|
| 176 |
+
### Starting All Services
|
| 177 |
+
```bash
|
| 178 |
+
# Start all monitoring services
|
| 179 |
+
docker compose up -d
|
| 180 |
+
|
| 181 |
+
# Verify all containers are running
|
| 182 |
+
docker compose ps
|
| 183 |
+
|
| 184 |
+
# Check Prometheus targets
|
| 185 |
+
curl http://localhost:9090/targets
|
| 186 |
+
|
| 187 |
+
# Check Grafana health
|
| 188 |
+
curl http://localhost:3000/api/health
|
| 189 |
+
```
|
| 190 |
+
|
| 191 |
+
### Running Drift Detection Workflow
|
| 192 |
+
|
| 193 |
+
1. **Prepare Baseline (One-time setup)**
|
| 194 |
+
```bash
|
| 195 |
+
cd monitoring/drift/scripts
|
| 196 |
+
python prepare_baseline.py
|
| 197 |
+
```
|
| 198 |
+
|
| 199 |
+
2. **Execute Drift Check**
|
| 200 |
+
```bash
|
| 201 |
+
python run_drift_check.py
|
| 202 |
+
```
|
| 203 |
+
|
| 204 |
+
3. **Verify Results**
|
| 205 |
+
- Check Pushgateway: `http://localhost:9091`
|
| 206 |
+
- Check Prometheus: `http://localhost:9090/graph`
|
| 207 |
+
- Check Grafana: `http://localhost:3000`
|
monitoring/alertmanager/config.yml
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
global:
|
| 2 |
+
resolve_timeout: 5m
|
| 3 |
+
|
| 4 |
+
route:
|
| 5 |
+
group_by: ['alertname', 'severity']
|
| 6 |
+
group_wait: 10s
|
| 7 |
+
group_interval: 10s
|
| 8 |
+
repeat_interval: 1h
|
| 9 |
+
receiver: 'log-receiver'
|
| 10 |
+
|
| 11 |
+
receivers:
|
| 12 |
+
- name: 'log-receiver'
|
| 13 |
+
webhook_configs:
|
| 14 |
+
- url: 'http://hopcroft-api:8080/health'
|
| 15 |
+
|
| 16 |
+
inhibition_rules:
|
| 17 |
+
- source_match:
|
| 18 |
+
severity: 'critical'
|
| 19 |
+
target_match:
|
| 20 |
+
severity: 'warning'
|
| 21 |
+
equal: ['alertname', 'dev', 'instance']
|
monitoring/drift/scripts/prepare_baseline.py
ADDED
|
@@ -0,0 +1,119 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Prepare baseline/reference data for drift detection.
|
| 3 |
+
This script samples representative data from the training set.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import pickle
|
| 7 |
+
import pandas as pd
|
| 8 |
+
import numpy as np
|
| 9 |
+
import sqlite3
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
from sklearn.model_selection import train_test_split
|
| 12 |
+
|
| 13 |
+
# Paths
|
| 14 |
+
PROJECT_ROOT = Path(__file__).parent.parent.parent.parent
|
| 15 |
+
BASELINE_DIR = Path(__file__).parent.parent / "baseline"
|
| 16 |
+
BASELINE_DIR.mkdir(parents=True, exist_ok=True)
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def load_training_data():
|
| 20 |
+
"""Load the original training dataset from SQLite database."""
|
| 21 |
+
# Load from SQLite database
|
| 22 |
+
db_path = PROJECT_ROOT / "data" / "raw" / "skillscope_data.db"
|
| 23 |
+
|
| 24 |
+
if not db_path.exists():
|
| 25 |
+
raise FileNotFoundError(f"Database not found at {db_path}")
|
| 26 |
+
|
| 27 |
+
print(f"Loading data from database: {db_path}")
|
| 28 |
+
conn = sqlite3.connect(db_path)
|
| 29 |
+
|
| 30 |
+
# Load from the main table
|
| 31 |
+
query = "SELECT * FROM nlbse_tool_competition_data_by_issue LIMIT 10000"
|
| 32 |
+
df = pd.read_sql_query(query, conn)
|
| 33 |
+
conn.close()
|
| 34 |
+
|
| 35 |
+
print(f"Loaded {len(df)} training samples")
|
| 36 |
+
return df
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def prepare_baseline(df, sample_size=1000, random_state=42):
|
| 40 |
+
"""
|
| 41 |
+
Sample representative baseline data.
|
| 42 |
+
|
| 43 |
+
Args:
|
| 44 |
+
df: Training dataframe
|
| 45 |
+
sample_size: Number of samples for baseline
|
| 46 |
+
random_state: Random seed for reproducibility
|
| 47 |
+
|
| 48 |
+
Returns:
|
| 49 |
+
Baseline dataframe
|
| 50 |
+
"""
|
| 51 |
+
# Stratified sampling if you have labels
|
| 52 |
+
if 'label' in df.columns:
|
| 53 |
+
_, baseline_df = train_test_split(
|
| 54 |
+
df,
|
| 55 |
+
test_size=sample_size,
|
| 56 |
+
random_state=random_state,
|
| 57 |
+
stratify=df['label']
|
| 58 |
+
)
|
| 59 |
+
else:
|
| 60 |
+
baseline_df = df.sample(n=min(sample_size, len(df)), random_state=random_state)
|
| 61 |
+
|
| 62 |
+
print(f"Sampled {len(baseline_df)} baseline samples")
|
| 63 |
+
return baseline_df
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def extract_features(df):
|
| 67 |
+
"""
|
| 68 |
+
Extract features used for drift detection.
|
| 69 |
+
Should match the features used by your model.
|
| 70 |
+
"""
|
| 71 |
+
|
| 72 |
+
# Select only numeric columns, exclude labels and IDs
|
| 73 |
+
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
|
| 74 |
+
exclude_cols = ['label', 'id', 'timestamp', 'issue_id', 'file_id', 'method_id', 'class_id']
|
| 75 |
+
feature_columns = [col for col in numeric_cols if col not in exclude_cols]
|
| 76 |
+
|
| 77 |
+
X = df[feature_columns].values
|
| 78 |
+
|
| 79 |
+
print(f"Extracted {X.shape[1]} numeric features from {X.shape[0]} samples")
|
| 80 |
+
return X
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
def save_baseline(baseline_data, filename="reference_data.pkl"):
|
| 84 |
+
"""Save baseline data to disk."""
|
| 85 |
+
baseline_path = BASELINE_DIR / filename
|
| 86 |
+
|
| 87 |
+
with open(baseline_path, 'wb') as f:
|
| 88 |
+
pickle.dump(baseline_data, f)
|
| 89 |
+
|
| 90 |
+
print(f"Baseline saved to {baseline_path}")
|
| 91 |
+
print(f" Shape: {baseline_data.shape}")
|
| 92 |
+
print(f" Size: {baseline_path.stat().st_size / 1024:.2f} KB")
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
def main():
|
| 96 |
+
"""Main execution."""
|
| 97 |
+
print("=" * 60)
|
| 98 |
+
print("Preparing Baseline Data for Drift Detection")
|
| 99 |
+
print("=" * 60)
|
| 100 |
+
|
| 101 |
+
# Load data
|
| 102 |
+
df = load_training_data()
|
| 103 |
+
|
| 104 |
+
# Sample baseline
|
| 105 |
+
baseline_df = prepare_baseline(df, sample_size=1000)
|
| 106 |
+
|
| 107 |
+
# Extract features
|
| 108 |
+
X_baseline = extract_features(baseline_df)
|
| 109 |
+
|
| 110 |
+
# Save
|
| 111 |
+
save_baseline(X_baseline)
|
| 112 |
+
|
| 113 |
+
print("\n" + "=" * 60)
|
| 114 |
+
print("Baseline preparation complete!")
|
| 115 |
+
print("=" * 60)
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
if __name__ == "__main__":
|
| 119 |
+
main()
|
monitoring/drift/scripts/requirements.txt
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
alibi-detect>=0.11.4
|
| 2 |
+
pandas>=2.0.0
|
| 3 |
+
numpy>=1.24.0
|
| 4 |
+
scikit-learn>=1.3.0
|
| 5 |
+
requests>=2.31.0
|
| 6 |
+
mlflow>=2.8.0
|
monitoring/drift/scripts/run_drift_check.py
ADDED
|
@@ -0,0 +1,216 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Data Drift Detection using Scipy KS Test.
|
| 3 |
+
Detects distribution shifts between baseline and new data.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import pickle
|
| 7 |
+
import json
|
| 8 |
+
import requests
|
| 9 |
+
import numpy as np
|
| 10 |
+
import pandas as pd
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
from datetime import datetime
|
| 13 |
+
from scipy.stats import ks_2samp
|
| 14 |
+
from typing import Dict, Tuple
|
| 15 |
+
|
| 16 |
+
# --- Configuration ---------------------------------------------------------
# Filesystem layout: this script lives in <repo>/monitoring/drift/scripts/.
_SCRIPT_DIR = Path(__file__).parent
_DRIFT_DIR = _SCRIPT_DIR.parent

PROJECT_ROOT = _SCRIPT_DIR.parent.parent.parent   # repository root
BASELINE_DIR = _DRIFT_DIR / "baseline"            # reference data produced by prepare_baseline.py
REPORTS_DIR = _DRIFT_DIR / "reports"              # JSON drift reports are written here
REPORTS_DIR.mkdir(parents=True, exist_ok=True)    # make sure the output dir exists up front

# Pushgateway endpoint used to expose short-lived job metrics to Prometheus.
PUSHGATEWAY_URL = "http://localhost:9091"
# Significance level for the KS test (Bonferroni-adjusted per feature later).
P_VALUE_THRESHOLD = 0.05
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def load_baseline() -> np.ndarray:
    """Load the pickled reference (baseline) feature matrix.

    Returns:
        The baseline array previously produced by ``prepare_baseline.py``.

    Raises:
        FileNotFoundError: when the baseline pickle has not been generated yet.
    """
    path = BASELINE_DIR / "reference_data.pkl"

    if not path.exists():
        raise FileNotFoundError(
            f"Baseline data not found at {path}\n"
            f"Run `python prepare_baseline.py` first!"
        )

    with path.open('rb') as fh:
        data = pickle.load(fh)

    print(f"Loaded baseline data: {data.shape}")
    return data
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def load_new_data(n_samples: int = 500) -> np.ndarray:
    """
    Load new/production data to check for drift.

    In production, this would fetch from:
    - Database
    - S3 bucket
    - API logs
    - Data lake

    For now, load from ``data/test.csv`` if it exists, otherwise simulate
    new data by perturbing the baseline.

    Args:
        n_samples: Maximum number of samples to return (default 500,
            matching the previous hard-coded value).

    Returns:
        2-D feature matrix with at most ``n_samples`` rows.
    """
    # Option 1: Load from file
    data_path = PROJECT_ROOT / "data" / "test.csv"
    if data_path.exists():
        df = pd.read_csv(data_path)
        # Extract same features as baseline (drop metadata columns).
        feature_columns = [col for col in df.columns if col not in ['label', 'id', 'timestamp']]
        X_new = df[feature_columns].values[:n_samples]
        print(f"Loaded new data from file: {X_new.shape}")
        return X_new

    # Option 2: Simulate (for testing)
    print("Simulating new data (no test file found)")
    X_baseline = load_baseline()
    # Guard against baselines smaller than n_samples: the noise matrix must
    # have the same shape as the slice it is added to, otherwise numpy
    # raises a broadcasting error.
    rows = min(n_samples, X_baseline.shape[0])
    # Add slight Gaussian noise to simulate drift.
    X_new = X_baseline[:rows] + np.random.normal(0, 0.1, (rows, X_baseline.shape[1]))
    return X_new
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
def run_drift_detection(X_baseline: np.ndarray, X_new: np.ndarray,
                        alpha: float = 0.05) -> Dict:
    """
    Run Kolmogorov-Smirnov drift detection using scipy.

    One two-sample KS test is run per feature column; the per-test
    significance level is Bonferroni-corrected (``alpha / num_features``)
    to control the family-wise error rate across features.

    Args:
        X_baseline: Reference data, shape (n_baseline, n_features).
        X_new: New data to check, shape (n_new, n_features).
        alpha: Family-wise significance level (default 0.05, matching the
            module-level ``P_VALUE_THRESHOLD``).

    Returns:
        Drift detection results: timestamp, drift flag (0/1), minimum
        p-value, adjusted threshold, maximum KS distance and sample counts.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("Running Drift Detection (Kolmogorov-Smirnov Test)")
    print(banner)

    # Run one KS test per feature column.
    p_values = []
    distances = []
    for i in range(X_baseline.shape[1]):
        statistic, p_value = ks_2samp(X_baseline[:, i], X_new[:, i])
        p_values.append(p_value)
        distances.append(statistic)

    # Aggregate: the most significant feature drives the decision.
    min_p_value = np.min(p_values)
    max_distance = np.max(distances)

    # Bonferroni correction for multiple testing.
    adjusted_threshold = alpha / X_baseline.shape[1]
    drift_detected = min_p_value < adjusted_threshold

    results = {
        "timestamp": datetime.now().isoformat(),
        "drift_detected": int(drift_detected),
        "p_value": float(min_p_value),
        "threshold": adjusted_threshold,
        "distance": float(max_distance),
        "baseline_samples": X_baseline.shape[0],
        "new_samples": X_new.shape[0],
        "num_features": X_baseline.shape[1]
    }

    print(f"\nResults:")
    print(f"  Drift Detected: {'YES' if results['drift_detected'] else 'NO'}")
    print(f"  P-Value: {results['p_value']:.6f} (adjusted threshold: {adjusted_threshold:.6f})")
    print(f"  Distance: {results['distance']:.6f}")
    print(f"  Baseline: {X_baseline.shape[0]} samples")
    print(f"  New Data: {X_new.shape[0]} samples")

    return results
|
| 127 |
+
|
| 128 |
+
|
| 129 |
+
def save_report(results: Dict):
    """Persist the drift results as a timestamped JSON file under REPORTS_DIR."""
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    report_path = REPORTS_DIR / f"drift_report_{stamp}.json"

    with report_path.open('w') as fh:
        json.dump(results, fh, indent=2)

    print(f"\nReport saved to: {report_path}")
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
def push_to_prometheus(results: Dict):
    """
    Push drift metrics to Prometheus via Pushgateway.

    This allows Prometheus to scrape short-lived job metrics.

    Failures are logged to stdout but never raised, so a down gateway does
    not abort the drift check itself.
    """
    # Prometheus text exposition format; the trailing newline is required
    # by the parser.
    metrics = f"""# TYPE drift_detected gauge
# HELP drift_detected Whether data drift was detected (1=yes, 0=no)
drift_detected {results['drift_detected']}

# TYPE drift_p_value gauge
# HELP drift_p_value P-value from drift detection test
drift_p_value {results['p_value']}

# TYPE drift_distance gauge
# HELP drift_distance Statistical distance between distributions
drift_distance {results['distance']}

# TYPE drift_check_timestamp gauge
# HELP drift_check_timestamp Unix timestamp of last drift check
drift_check_timestamp {datetime.now().timestamp()}
"""

    try:
        response = requests.post(
            f"{PUSHGATEWAY_URL}/metrics/job/drift_detection/instance/hopcroft",
            data=metrics,
            headers={'Content-Type': 'text/plain'},
            # Without a timeout requests can block forever if the gateway
            # accepts the connection but never responds.
            timeout=10,
        )
        response.raise_for_status()
        print(f"Metrics pushed to Pushgateway at {PUSHGATEWAY_URL}")
    except requests.exceptions.RequestException as e:
        print(f"Failed to push to Pushgateway: {e}")
        print(f"  Make sure Pushgateway is running: docker compose ps pushgateway")
|
| 174 |
+
|
| 175 |
+
|
| 176 |
+
def main():
    """Main execution: load data, detect drift, report, push metrics.

    Returns:
        Process exit code: 0 when no drift, 1 on drift or any error.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("Hopcroft Data Drift Detection")
    print(banner)

    try:
        # Load reference and current data.
        baseline = load_baseline()
        new_data = load_new_data()

        # Run the statistical comparison.
        results = run_drift_detection(baseline, new_data)

        # Persist the JSON report, then expose metrics to Prometheus.
        save_report(results)
        push_to_prometheus(results)

        print("\n" + banner)
        print("Drift Detection Complete!")
        print(banner)

        if not results['drift_detected']:
            print("\nNo significant drift detected")
            return 0
        print("\nWARNING: Data drift detected!")
        print(f"  P-value: {results['p_value']:.6f} < {P_VALUE_THRESHOLD}")
        return 1

    except Exception as e:
        print(f"\nError: {e}")
        import traceback
        traceback.print_exc()
        return 1
|
| 213 |
+
|
| 214 |
+
|
| 215 |
+
if __name__ == "__main__":
    # Raise SystemExit instead of calling the site-provided exit() builtin,
    # which is intended for interactive use and is absent under `python -S`.
    raise SystemExit(main())
|
monitoring/grafana/dashboards/hopcroft_dashboard.json
ADDED
|
@@ -0,0 +1,358 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"annotations": {
|
| 3 |
+
"list": [
|
| 4 |
+
{
|
| 5 |
+
"builtIn": 1,
|
| 6 |
+
"datasource": "-- Grafana --",
|
| 7 |
+
"enable": true,
|
| 8 |
+
"hide": true,
|
| 9 |
+
"iconColor": "rgba(0, 211, 255, 1)",
|
| 10 |
+
"name": "Annotations & Alerts",
|
| 11 |
+
"type": "dashboard"
|
| 12 |
+
}
|
| 13 |
+
]
|
| 14 |
+
},
|
| 15 |
+
"editable": true,
|
| 16 |
+
"gnetId": null,
|
| 17 |
+
"graphTooltip": 1,
|
| 18 |
+
"id": null,
|
| 19 |
+
"links": [],
|
| 20 |
+
"panels": [
|
| 21 |
+
{
|
| 22 |
+
"datasource": "Prometheus",
|
| 23 |
+
"fieldConfig": {
|
| 24 |
+
"defaults": {
|
| 25 |
+
"color": {
|
| 26 |
+
"mode": "thresholds"
|
| 27 |
+
},
|
| 28 |
+
"mappings": [],
|
| 29 |
+
"thresholds": {
|
| 30 |
+
"mode": "absolute",
|
| 31 |
+
"steps": [
|
| 32 |
+
{
|
| 33 |
+
"color": "green",
|
| 34 |
+
"value": null
|
| 35 |
+
},
|
| 36 |
+
{
|
| 37 |
+
"color": "red",
|
| 38 |
+
"value": 80
|
| 39 |
+
}
|
| 40 |
+
]
|
| 41 |
+
},
|
| 42 |
+
"unit": "reqps"
|
| 43 |
+
}
|
| 44 |
+
},
|
| 45 |
+
"gridPos": {
|
| 46 |
+
"h": 8,
|
| 47 |
+
"w": 6,
|
| 48 |
+
"x": 0,
|
| 49 |
+
"y": 0
|
| 50 |
+
},
|
| 51 |
+
"id": 1,
|
| 52 |
+
"options": {
|
| 53 |
+
"orientation": "auto",
|
| 54 |
+
"reduceOptions": {
|
| 55 |
+
"calcs": ["lastNotNull"],
|
| 56 |
+
"fields": "",
|
| 57 |
+
"values": false
|
| 58 |
+
},
|
| 59 |
+
"showThresholdLabels": false,
|
| 60 |
+
"showThresholdMarkers": true
|
| 61 |
+
},
|
| 62 |
+
"pluginVersion": "9.0.0",
|
| 63 |
+
"targets": [
|
| 64 |
+
{
|
| 65 |
+
"expr": "rate(fastapi_requests_total[1m])",
|
| 66 |
+
"refId": "A"
|
| 67 |
+
}
|
| 68 |
+
],
|
| 69 |
+
"title": "Request Rate",
|
| 70 |
+
"type": "gauge",
|
| 71 |
+
"description": "Number of requests per second handled by the API"
|
| 72 |
+
},
|
| 73 |
+
{
|
| 74 |
+
"datasource": "Prometheus",
|
| 75 |
+
"fieldConfig": {
|
| 76 |
+
"defaults": {
|
| 77 |
+
"color": {
|
| 78 |
+
"mode": "palette-classic"
|
| 79 |
+
},
|
| 80 |
+
"custom": {
|
| 81 |
+
"axisLabel": "",
|
| 82 |
+
"axisPlacement": "auto",
|
| 83 |
+
"barAlignment": 0,
|
| 84 |
+
"drawStyle": "line",
|
| 85 |
+
"fillOpacity": 10,
|
| 86 |
+
"gradientMode": "none",
|
| 87 |
+
"hideFrom": {
|
| 88 |
+
"tooltip": false,
|
| 89 |
+
"viz": false,
|
| 90 |
+
"legend": false
|
| 91 |
+
},
|
| 92 |
+
"lineInterpolation": "linear",
|
| 93 |
+
"lineWidth": 1,
|
| 94 |
+
"pointSize": 5,
|
| 95 |
+
"scaleDistribution": {
|
| 96 |
+
"type": "linear"
|
| 97 |
+
},
|
| 98 |
+
"showPoints": "never",
|
| 99 |
+
"spanNulls": true
|
| 100 |
+
},
|
| 101 |
+
"mappings": [],
|
| 102 |
+
"thresholds": {
|
| 103 |
+
"mode": "absolute",
|
| 104 |
+
"steps": [
|
| 105 |
+
{
|
| 106 |
+
"color": "green",
|
| 107 |
+
"value": null
|
| 108 |
+
}
|
| 109 |
+
]
|
| 110 |
+
},
|
| 111 |
+
"unit": "ms"
|
| 112 |
+
}
|
| 113 |
+
},
|
| 114 |
+
"gridPos": {
|
| 115 |
+
"h": 8,
|
| 116 |
+
"w": 18,
|
| 117 |
+
"x": 6,
|
| 118 |
+
"y": 0
|
| 119 |
+
},
|
| 120 |
+
"id": 2,
|
| 121 |
+
"options": {
|
| 122 |
+
"legend": {
|
| 123 |
+
"calcs": ["mean", "max"],
|
| 124 |
+
"displayMode": "table",
|
| 125 |
+
"placement": "right"
|
| 126 |
+
},
|
| 127 |
+
"tooltip": {
|
| 128 |
+
"mode": "multi"
|
| 129 |
+
}
|
| 130 |
+
},
|
| 131 |
+
"pluginVersion": "9.0.0",
|
| 132 |
+
"targets": [
|
| 133 |
+
{
|
| 134 |
+
"expr": "histogram_quantile(0.95, rate(fastapi_request_duration_seconds_bucket[5m])) * 1000",
|
| 135 |
+
"legendFormat": "p95",
|
| 136 |
+
"refId": "A"
|
| 137 |
+
},
|
| 138 |
+
{
|
| 139 |
+
"expr": "histogram_quantile(0.50, rate(fastapi_request_duration_seconds_bucket[5m])) * 1000",
|
| 140 |
+
"legendFormat": "p50 (median)",
|
| 141 |
+
"refId": "B"
|
| 142 |
+
}
|
| 143 |
+
],
|
| 144 |
+
"title": "Request Latency (p50, p95)",
|
| 145 |
+
"type": "timeseries",
|
| 146 |
+
"description": "API response time percentiles over time"
|
| 147 |
+
},
|
| 148 |
+
{
|
| 149 |
+
"datasource": "Prometheus",
|
| 150 |
+
"fieldConfig": {
|
| 151 |
+
"defaults": {
|
| 152 |
+
"color": {
|
| 153 |
+
"mode": "thresholds"
|
| 154 |
+
},
|
| 155 |
+
"mappings": [
|
| 156 |
+
{
|
| 157 |
+
"options": {
|
| 158 |
+
"0": {
|
| 159 |
+
"color": "red",
|
| 160 |
+
"index": 1,
|
| 161 |
+
"text": "No Drift"
|
| 162 |
+
},
|
| 163 |
+
"1": {
|
| 164 |
+
"color": "green",
|
| 165 |
+
"index": 0,
|
| 166 |
+
"text": "Drift Detected"
|
| 167 |
+
}
|
| 168 |
+
},
|
| 169 |
+
"type": "value"
|
| 170 |
+
}
|
| 171 |
+
],
|
| 172 |
+
"thresholds": {
|
| 173 |
+
"mode": "absolute",
|
| 174 |
+
"steps": [
|
| 175 |
+
{
|
| 176 |
+
"color": "green",
|
| 177 |
+
"value": null
|
| 178 |
+
}
|
| 179 |
+
]
|
| 180 |
+
}
|
| 181 |
+
}
|
| 182 |
+
},
|
| 183 |
+
"gridPos": {
|
| 184 |
+
"h": 6,
|
| 185 |
+
"w": 6,
|
| 186 |
+
"x": 0,
|
| 187 |
+
"y": 8
|
| 188 |
+
},
|
| 189 |
+
"id": 3,
|
| 190 |
+
"options": {
|
| 191 |
+
"orientation": "auto",
|
| 192 |
+
"reduceOptions": {
|
| 193 |
+
"calcs": ["lastNotNull"],
|
| 194 |
+
"fields": "",
|
| 195 |
+
"values": false
|
| 196 |
+
},
|
| 197 |
+
"showThresholdLabels": false,
|
| 198 |
+
"showThresholdMarkers": true,
|
| 199 |
+
"text": {}
|
| 200 |
+
},
|
| 201 |
+
"pluginVersion": "9.0.0",
|
| 202 |
+
"targets": [
|
| 203 |
+
{
|
| 204 |
+
"expr": "drift_detected",
|
| 205 |
+
"refId": "A"
|
| 206 |
+
}
|
| 207 |
+
],
|
| 208 |
+
"title": "Data Drift Status",
|
| 209 |
+
"type": "stat",
|
| 210 |
+
"description": "Current data drift detection status (1 = drift detected, 0 = no drift)"
|
| 211 |
+
},
|
| 212 |
+
{
|
| 213 |
+
"datasource": "Prometheus",
|
| 214 |
+
"fieldConfig": {
|
| 215 |
+
"defaults": {
|
| 216 |
+
"color": {
|
| 217 |
+
"mode": "thresholds"
|
| 218 |
+
},
|
| 219 |
+
"decimals": 4,
|
| 220 |
+
"mappings": [],
|
| 221 |
+
"thresholds": {
|
| 222 |
+
"mode": "absolute",
|
| 223 |
+
"steps": [
|
| 224 |
+
{
              "color": "red",
              "value": null
            },
            {
              "color": "yellow",
              "value": 0.01
            },
            {
              "color": "green",
              "value": 0.05
            }
|
| 236 |
+
]
|
| 237 |
+
},
|
| 238 |
+
"unit": "short"
|
| 239 |
+
}
|
| 240 |
+
},
|
| 241 |
+
"gridPos": {
|
| 242 |
+
"h": 6,
|
| 243 |
+
"w": 6,
|
| 244 |
+
"x": 6,
|
| 245 |
+
"y": 8
|
| 246 |
+
},
|
| 247 |
+
"id": 4,
|
| 248 |
+
"options": {
|
| 249 |
+
"orientation": "auto",
|
| 250 |
+
"reduceOptions": {
|
| 251 |
+
"calcs": ["lastNotNull"],
|
| 252 |
+
"fields": "",
|
| 253 |
+
"values": false
|
| 254 |
+
},
|
| 255 |
+
"showThresholdLabels": false,
|
| 256 |
+
"showThresholdMarkers": true,
|
| 257 |
+
"text": {}
|
| 258 |
+
},
|
| 259 |
+
"pluginVersion": "9.0.0",
|
| 260 |
+
"targets": [
|
| 261 |
+
{
|
| 262 |
+
"expr": "drift_p_value",
|
| 263 |
+
"refId": "A"
|
| 264 |
+
}
|
| 265 |
+
],
|
| 266 |
+
"title": "Drift P-Value",
|
| 267 |
+
"type": "stat",
|
| 268 |
+
"description": "Statistical significance of detected drift (lower = more significant)"
|
| 269 |
+
},
|
| 270 |
+
{
|
| 271 |
+
"datasource": "Prometheus",
|
| 272 |
+
"fieldConfig": {
|
| 273 |
+
"defaults": {
|
| 274 |
+
"color": {
|
| 275 |
+
"mode": "palette-classic"
|
| 276 |
+
},
|
| 277 |
+
"custom": {
|
| 278 |
+
"axisLabel": "",
|
| 279 |
+
"axisPlacement": "auto",
|
| 280 |
+
"barAlignment": 0,
|
| 281 |
+
"drawStyle": "line",
|
| 282 |
+
"fillOpacity": 10,
|
| 283 |
+
"gradientMode": "none",
|
| 284 |
+
"hideFrom": {
|
| 285 |
+
"tooltip": false,
|
| 286 |
+
"viz": false,
|
| 287 |
+
"legend": false
|
| 288 |
+
},
|
| 289 |
+
"lineInterpolation": "linear",
|
| 290 |
+
"lineWidth": 1,
|
| 291 |
+
"pointSize": 5,
|
| 292 |
+
"scaleDistribution": {
|
| 293 |
+
"type": "linear"
|
| 294 |
+
},
|
| 295 |
+
"showPoints": "auto",
|
| 296 |
+
"spanNulls": false
|
| 297 |
+
},
|
| 298 |
+
"mappings": [],
|
| 299 |
+
"thresholds": {
|
| 300 |
+
"mode": "absolute",
|
| 301 |
+
"steps": [
|
| 302 |
+
{
|
| 303 |
+
"color": "green",
|
| 304 |
+
"value": null
|
| 305 |
+
}
|
| 306 |
+
]
|
| 307 |
+
},
|
| 308 |
+
"unit": "short"
|
| 309 |
+
}
|
| 310 |
+
},
|
| 311 |
+
"gridPos": {
|
| 312 |
+
"h": 6,
|
| 313 |
+
"w": 12,
|
| 314 |
+
"x": 12,
|
| 315 |
+
"y": 8
|
| 316 |
+
},
|
| 317 |
+
"id": 5,
|
| 318 |
+
"options": {
|
| 319 |
+
"legend": {
|
| 320 |
+
"calcs": ["mean", "lastNotNull"],
|
| 321 |
+
"displayMode": "table",
|
| 322 |
+
"placement": "right"
|
| 323 |
+
},
|
| 324 |
+
"tooltip": {
|
| 325 |
+
"mode": "multi"
|
| 326 |
+
}
|
| 327 |
+
},
|
| 328 |
+
"pluginVersion": "9.0.0",
|
| 329 |
+
"targets": [
|
| 330 |
+
{
|
| 331 |
+
"expr": "drift_distance",
|
| 332 |
+
"legendFormat": "Distance",
|
| 333 |
+
"refId": "A"
|
| 334 |
+
}
|
| 335 |
+
],
|
| 336 |
+
"title": "Drift Distance Over Time",
|
| 337 |
+
"type": "timeseries",
|
| 338 |
+
"description": "Statistical distance between baseline and current data distribution"
|
| 339 |
+
}
|
| 340 |
+
],
|
| 341 |
+
"refresh": "10s",
|
| 342 |
+
"schemaVersion": 36,
|
| 343 |
+
"style": "dark",
|
| 344 |
+
"tags": ["hopcroft", "ml", "monitoring"],
|
| 345 |
+
"templating": {
|
| 346 |
+
"list": []
|
| 347 |
+
},
|
| 348 |
+
"time": {
|
| 349 |
+
"from": "now-1h",
|
| 350 |
+
"to": "now"
|
| 351 |
+
},
|
| 352 |
+
"timepicker": {},
|
| 353 |
+
"timezone": "",
|
| 354 |
+
"title": "Hopcroft ML Model Monitoring",
|
| 355 |
+
"uid": "hopcroft-ml-dashboard",
|
| 356 |
+
"version": 1,
|
| 357 |
+
"weekStart": ""
|
| 358 |
+
}
|
monitoring/grafana/provisioning/dashboards/dashboard.yml
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
apiVersion: 1
|
| 2 |
+
|
| 3 |
+
providers:
|
| 4 |
+
- name: 'Hopcroft Dashboards'
|
| 5 |
+
orgId: 1
|
| 6 |
+
folder: ''
|
| 7 |
+
type: file
|
| 8 |
+
disableDeletion: false
|
| 9 |
+
updateIntervalSeconds: 10
|
| 10 |
+
allowUiUpdates: true
|
| 11 |
+
options:
|
| 12 |
+
path: /var/lib/grafana/dashboards
|
| 13 |
+
foldersFromFilesStructure: true
|
monitoring/grafana/provisioning/dashboards/hopcroft_dashboard.json
ADDED
|
@@ -0,0 +1,358 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"annotations": {
|
| 3 |
+
"list": [
|
| 4 |
+
{
|
| 5 |
+
"builtIn": 1,
|
| 6 |
+
"datasource": "-- Grafana --",
|
| 7 |
+
"enable": true,
|
| 8 |
+
"hide": true,
|
| 9 |
+
"iconColor": "rgba(0, 211, 255, 1)",
|
| 10 |
+
"name": "Annotations & Alerts",
|
| 11 |
+
"type": "dashboard"
|
| 12 |
+
}
|
| 13 |
+
]
|
| 14 |
+
},
|
| 15 |
+
"editable": true,
|
| 16 |
+
"gnetId": null,
|
| 17 |
+
"graphTooltip": 1,
|
| 18 |
+
"id": null,
|
| 19 |
+
"links": [],
|
| 20 |
+
"panels": [
|
| 21 |
+
{
|
| 22 |
+
"datasource": "Prometheus",
|
| 23 |
+
"fieldConfig": {
|
| 24 |
+
"defaults": {
|
| 25 |
+
"color": {
|
| 26 |
+
"mode": "thresholds"
|
| 27 |
+
},
|
| 28 |
+
"mappings": [],
|
| 29 |
+
"thresholds": {
|
| 30 |
+
"mode": "absolute",
|
| 31 |
+
"steps": [
|
| 32 |
+
{
|
| 33 |
+
"color": "green",
|
| 34 |
+
"value": null
|
| 35 |
+
},
|
| 36 |
+
{
|
| 37 |
+
"color": "red",
|
| 38 |
+
"value": 80
|
| 39 |
+
}
|
| 40 |
+
]
|
| 41 |
+
},
|
| 42 |
+
"unit": "reqps"
|
| 43 |
+
}
|
| 44 |
+
},
|
| 45 |
+
"gridPos": {
|
| 46 |
+
"h": 8,
|
| 47 |
+
"w": 6,
|
| 48 |
+
"x": 0,
|
| 49 |
+
"y": 0
|
| 50 |
+
},
|
| 51 |
+
"id": 1,
|
| 52 |
+
"options": {
|
| 53 |
+
"orientation": "auto",
|
| 54 |
+
"reduceOptions": {
|
| 55 |
+
"calcs": ["lastNotNull"],
|
| 56 |
+
"fields": "",
|
| 57 |
+
"values": false
|
| 58 |
+
},
|
| 59 |
+
"showThresholdLabels": false,
|
| 60 |
+
"showThresholdMarkers": true
|
| 61 |
+
},
|
| 62 |
+
"pluginVersion": "9.0.0",
|
| 63 |
+
"targets": [
|
| 64 |
+
{
|
| 65 |
+
"expr": "rate(fastapi_requests_total[1m])",
|
| 66 |
+
"refId": "A"
|
| 67 |
+
}
|
| 68 |
+
],
|
| 69 |
+
"title": "Request Rate",
|
| 70 |
+
"type": "gauge",
|
| 71 |
+
"description": "Number of requests per second handled by the API"
|
| 72 |
+
},
|
| 73 |
+
{
|
| 74 |
+
"datasource": "Prometheus",
|
| 75 |
+
"fieldConfig": {
|
| 76 |
+
"defaults": {
|
| 77 |
+
"color": {
|
| 78 |
+
"mode": "palette-classic"
|
| 79 |
+
},
|
| 80 |
+
"custom": {
|
| 81 |
+
"axisLabel": "",
|
| 82 |
+
"axisPlacement": "auto",
|
| 83 |
+
"barAlignment": 0,
|
| 84 |
+
"drawStyle": "line",
|
| 85 |
+
"fillOpacity": 10,
|
| 86 |
+
"gradientMode": "none",
|
| 87 |
+
"hideFrom": {
|
| 88 |
+
"tooltip": false,
|
| 89 |
+
"viz": false,
|
| 90 |
+
"legend": false
|
| 91 |
+
},
|
| 92 |
+
"lineInterpolation": "linear",
|
| 93 |
+
"lineWidth": 1,
|
| 94 |
+
"pointSize": 5,
|
| 95 |
+
"scaleDistribution": {
|
| 96 |
+
"type": "linear"
|
| 97 |
+
},
|
| 98 |
+
"showPoints": "never",
|
| 99 |
+
"spanNulls": true
|
| 100 |
+
},
|
| 101 |
+
"mappings": [],
|
| 102 |
+
"thresholds": {
|
| 103 |
+
"mode": "absolute",
|
| 104 |
+
"steps": [
|
| 105 |
+
{
|
| 106 |
+
"color": "green",
|
| 107 |
+
"value": null
|
| 108 |
+
}
|
| 109 |
+
]
|
| 110 |
+
},
|
| 111 |
+
"unit": "ms"
|
| 112 |
+
}
|
| 113 |
+
},
|
| 114 |
+
"gridPos": {
|
| 115 |
+
"h": 8,
|
| 116 |
+
"w": 18,
|
| 117 |
+
"x": 6,
|
| 118 |
+
"y": 0
|
| 119 |
+
},
|
| 120 |
+
"id": 2,
|
| 121 |
+
"options": {
|
| 122 |
+
"legend": {
|
| 123 |
+
"calcs": ["mean", "max"],
|
| 124 |
+
"displayMode": "table",
|
| 125 |
+
"placement": "right"
|
| 126 |
+
},
|
| 127 |
+
"tooltip": {
|
| 128 |
+
"mode": "multi"
|
| 129 |
+
}
|
| 130 |
+
},
|
| 131 |
+
"pluginVersion": "9.0.0",
|
| 132 |
+
"targets": [
|
| 133 |
+
{
|
| 134 |
+
"expr": "histogram_quantile(0.95, rate(fastapi_request_duration_seconds_bucket[5m])) * 1000",
|
| 135 |
+
"legendFormat": "p95",
|
| 136 |
+
"refId": "A"
|
| 137 |
+
},
|
| 138 |
+
{
|
| 139 |
+
"expr": "histogram_quantile(0.50, rate(fastapi_request_duration_seconds_bucket[5m])) * 1000",
|
| 140 |
+
"legendFormat": "p50 (median)",
|
| 141 |
+
"refId": "B"
|
| 142 |
+
}
|
| 143 |
+
],
|
| 144 |
+
"title": "Request Latency (p50, p95)",
|
| 145 |
+
"type": "timeseries",
|
| 146 |
+
"description": "API response time percentiles over time"
|
| 147 |
+
},
|
| 148 |
+
{
|
| 149 |
+
"datasource": "Prometheus",
|
| 150 |
+
"fieldConfig": {
|
| 151 |
+
"defaults": {
|
| 152 |
+
"color": {
|
| 153 |
+
"mode": "thresholds"
|
| 154 |
+
},
|
| 155 |
+
"mappings": [
|
| 156 |
+
{
|
| 157 |
+
"options": {
|
| 158 |
+
"0": {
|
| 159 |
+
"color": "red",
|
| 160 |
+
"index": 1,
|
| 161 |
+
"text": "No Drift"
|
| 162 |
+
},
|
| 163 |
+
"1": {
|
| 164 |
+
"color": "green",
|
| 165 |
+
"index": 0,
|
| 166 |
+
"text": "Drift Detected"
|
| 167 |
+
}
|
| 168 |
+
},
|
| 169 |
+
"type": "value"
|
| 170 |
+
}
|
| 171 |
+
],
|
| 172 |
+
"thresholds": {
|
| 173 |
+
"mode": "absolute",
|
| 174 |
+
"steps": [
|
| 175 |
+
{
|
| 176 |
+
"color": "green",
|
| 177 |
+
"value": null
|
| 178 |
+
}
|
| 179 |
+
]
|
| 180 |
+
}
|
| 181 |
+
}
|
| 182 |
+
},
|
| 183 |
+
"gridPos": {
|
| 184 |
+
"h": 6,
|
| 185 |
+
"w": 6,
|
| 186 |
+
"x": 0,
|
| 187 |
+
"y": 8
|
| 188 |
+
},
|
| 189 |
+
"id": 3,
|
| 190 |
+
"options": {
|
| 191 |
+
"orientation": "auto",
|
| 192 |
+
"reduceOptions": {
|
| 193 |
+
"calcs": ["lastNotNull"],
|
| 194 |
+
"fields": "",
|
| 195 |
+
"values": false
|
| 196 |
+
},
|
| 197 |
+
"showThresholdLabels": false,
|
| 198 |
+
"showThresholdMarkers": true,
|
| 199 |
+
"text": {}
|
| 200 |
+
},
|
| 201 |
+
"pluginVersion": "9.0.0",
|
| 202 |
+
"targets": [
|
| 203 |
+
{
|
| 204 |
+
"expr": "drift_detected",
|
| 205 |
+
"refId": "A"
|
| 206 |
+
}
|
| 207 |
+
],
|
| 208 |
+
"title": "Data Drift Status",
|
| 209 |
+
"type": "stat",
|
| 210 |
+
"description": "Current data drift detection status (1 = drift detected, 0 = no drift)"
|
| 211 |
+
},
|
| 212 |
+
{
|
| 213 |
+
"datasource": "Prometheus",
|
| 214 |
+
"fieldConfig": {
|
| 215 |
+
"defaults": {
|
| 216 |
+
"color": {
|
| 217 |
+
"mode": "thresholds"
|
| 218 |
+
},
|
| 219 |
+
"decimals": 4,
|
| 220 |
+
"mappings": [],
|
| 221 |
+
"thresholds": {
|
| 222 |
+
"mode": "absolute",
|
| 223 |
+
"steps": [
|
| 224 |
+
{
              "color": "red",
              "value": null
            },
            {
              "color": "yellow",
              "value": 0.01
            },
            {
              "color": "green",
              "value": 0.05
            }
|
| 236 |
+
]
|
| 237 |
+
},
|
| 238 |
+
"unit": "short"
|
| 239 |
+
}
|
| 240 |
+
},
|
| 241 |
+
"gridPos": {
|
| 242 |
+
"h": 6,
|
| 243 |
+
"w": 6,
|
| 244 |
+
"x": 6,
|
| 245 |
+
"y": 8
|
| 246 |
+
},
|
| 247 |
+
"id": 4,
|
| 248 |
+
"options": {
|
| 249 |
+
"orientation": "auto",
|
| 250 |
+
"reduceOptions": {
|
| 251 |
+
"calcs": ["lastNotNull"],
|
| 252 |
+
"fields": "",
|
| 253 |
+
"values": false
|
| 254 |
+
},
|
| 255 |
+
"showThresholdLabels": false,
|
| 256 |
+
"showThresholdMarkers": true,
|
| 257 |
+
"text": {}
|
| 258 |
+
},
|
| 259 |
+
"pluginVersion": "9.0.0",
|
| 260 |
+
"targets": [
|
| 261 |
+
{
|
| 262 |
+
"expr": "drift_p_value",
|
| 263 |
+
"refId": "A"
|
| 264 |
+
}
|
| 265 |
+
],
|
| 266 |
+
"title": "Drift P-Value",
|
| 267 |
+
"type": "stat",
|
| 268 |
+
"description": "Statistical significance of detected drift (lower = more significant)"
|
| 269 |
+
},
|
| 270 |
+
{
|
| 271 |
+
"datasource": "Prometheus",
|
| 272 |
+
"fieldConfig": {
|
| 273 |
+
"defaults": {
|
| 274 |
+
"color": {
|
| 275 |
+
"mode": "palette-classic"
|
| 276 |
+
},
|
| 277 |
+
"custom": {
|
| 278 |
+
"axisLabel": "",
|
| 279 |
+
"axisPlacement": "auto",
|
| 280 |
+
"barAlignment": 0,
|
| 281 |
+
"drawStyle": "line",
|
| 282 |
+
"fillOpacity": 10,
|
| 283 |
+
"gradientMode": "none",
|
| 284 |
+
"hideFrom": {
|
| 285 |
+
"tooltip": false,
|
| 286 |
+
"viz": false,
|
| 287 |
+
"legend": false
|
| 288 |
+
},
|
| 289 |
+
"lineInterpolation": "linear",
|
| 290 |
+
"lineWidth": 1,
|
| 291 |
+
"pointSize": 5,
|
| 292 |
+
"scaleDistribution": {
|
| 293 |
+
"type": "linear"
|
| 294 |
+
},
|
| 295 |
+
"showPoints": "auto",
|
| 296 |
+
"spanNulls": false
|
| 297 |
+
},
|
| 298 |
+
"mappings": [],
|
| 299 |
+
"thresholds": {
|
| 300 |
+
"mode": "absolute",
|
| 301 |
+
"steps": [
|
| 302 |
+
{
|
| 303 |
+
"color": "green",
|
| 304 |
+
"value": null
|
| 305 |
+
}
|
| 306 |
+
]
|
| 307 |
+
},
|
| 308 |
+
"unit": "short"
|
| 309 |
+
}
|
| 310 |
+
},
|
| 311 |
+
"gridPos": {
|
| 312 |
+
"h": 6,
|
| 313 |
+
"w": 12,
|
| 314 |
+
"x": 12,
|
| 315 |
+
"y": 8
|
| 316 |
+
},
|
| 317 |
+
"id": 5,
|
| 318 |
+
"options": {
|
| 319 |
+
"legend": {
|
| 320 |
+
"calcs": ["mean", "lastNotNull"],
|
| 321 |
+
"displayMode": "table",
|
| 322 |
+
"placement": "right"
|
| 323 |
+
},
|
| 324 |
+
"tooltip": {
|
| 325 |
+
"mode": "multi"
|
| 326 |
+
}
|
| 327 |
+
},
|
| 328 |
+
"pluginVersion": "9.0.0",
|
| 329 |
+
"targets": [
|
| 330 |
+
{
|
| 331 |
+
"expr": "drift_distance",
|
| 332 |
+
"legendFormat": "Distance",
|
| 333 |
+
"refId": "A"
|
| 334 |
+
}
|
| 335 |
+
],
|
| 336 |
+
"title": "Drift Distance Over Time",
|
| 337 |
+
"type": "timeseries",
|
| 338 |
+
"description": "Statistical distance between baseline and current data distribution"
|
| 339 |
+
}
|
| 340 |
+
],
|
| 341 |
+
"refresh": "10s",
|
| 342 |
+
"schemaVersion": 36,
|
| 343 |
+
"style": "dark",
|
| 344 |
+
"tags": ["hopcroft", "ml", "monitoring"],
|
| 345 |
+
"templating": {
|
| 346 |
+
"list": []
|
| 347 |
+
},
|
| 348 |
+
"time": {
|
| 349 |
+
"from": "now-1h",
|
| 350 |
+
"to": "now"
|
| 351 |
+
},
|
| 352 |
+
"timepicker": {},
|
| 353 |
+
"timezone": "",
|
| 354 |
+
"title": "Hopcroft ML Model Monitoring",
|
| 355 |
+
"uid": "hopcroft-ml-dashboard",
|
| 356 |
+
"version": 1,
|
| 357 |
+
"weekStart": ""
|
| 358 |
+
}
|
monitoring/grafana/provisioning/datasources/prometheus.yml
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
apiVersion: 1
|
| 2 |
+
|
| 3 |
+
datasources:
|
| 4 |
+
- name: Prometheus
|
| 5 |
+
type: prometheus
|
| 6 |
+
access: proxy
|
| 7 |
+
uid: prometheus
|
| 8 |
+
orgId: 1
|
| 9 |
+
url: http://prometheus:9090
|
| 10 |
+
isDefault: true
|
| 11 |
+
editable: true
|
| 12 |
+
jsonData:
|
| 13 |
+
httpMethod: POST
|
| 14 |
+
timeInterval: "15s"
|
monitoring/locust/README.md
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Locust Load Testing - Skill Classification API
|
| 2 |
+
|
| 3 |
+
Questa directory contiene gli script per il load testing della Skill Classification API utilizzando [Locust](https://locust.io/).
|
| 4 |
+
|
| 5 |
+
## Prerequisiti
|
| 6 |
+
|
| 7 |
+
Assicurati di avere Python installato e installa Locust:
|
| 8 |
+
|
| 9 |
+
```bash
|
| 10 |
+
pip install locust
|
| 11 |
+
```
|
| 12 |
+
|
| 13 |
+
## Avvio del Test
|
| 14 |
+
|
| 15 |
+
### 1. Avvia Locust
|
| 16 |
+
|
| 17 |
+
Dalla directory `monitoring/locust/`, esegui:
|
| 18 |
+
|
| 19 |
+
```bash
|
| 20 |
+
locust -f locustfile.py
|
| 21 |
+
```
|
| 22 |
+
|
| 23 |
+
### 2. Accedi alla Web UI
|
| 24 |
+
|
| 25 |
+
Apri il browser e vai a: **http://localhost:8089**
|
| 26 |
+
|
| 27 |
+
### 3. Configura il Test
|
| 28 |
+
|
| 29 |
+
Nella Web UI, configura i seguenti parametri:
|
| 30 |
+
|
| 31 |
+
| Parametro | Descrizione | Valore Consigliato |
|
| 32 |
+
|-----------|-------------|-------------------|
|
| 33 |
+
| **Host** | URL dell'API da testare | `http://localhost:8080` (Docker) o `http://localhost:8000` (locale) |
|
| 34 |
+
| **Number of users** | Numero totale di utenti simulati | 10-100 |
|
| 35 |
+
| **Spawn rate** | Utenti da creare al secondo | 1-10 |
|
| 36 |
+
|
| 37 |
+
### 4. Avvia il Test
|
| 38 |
+
|
| 39 |
+
Clicca su **"Start swarming"** per avviare il test di carico.
|
| 40 |
+
|
| 41 |
+
## Task Implementati
|
| 42 |
+
|
| 43 |
+
Lo script simula il comportamento di utenti reali con i seguenti task:
|
| 44 |
+
|
| 45 |
+
| Task | Endpoint | Metodo | Peso | Descrizione |
|
| 46 |
+
|------|----------|--------|------|-------------|
|
| 47 |
+
| **Predizione Singola** | `/predict` | POST | 3 | Classifica un singolo issue text. Task principale, eseguito più frequentemente. |
|
| 48 |
+
| **Predizione Batch** | `/predict/batch` | POST | 1 | Classifica multipli issue text in una singola richiesta. |
|
| 49 |
+
| **Monitoraggio e Storia** | `/predictions`, `/health` | GET | 1 | Visualizza la cronologia delle predizioni e verifica lo stato del sistema. |
|
| 50 |
+
|
| 51 |
+
### Distribuzione dei Pesi
|
| 52 |
+
|
| 53 |
+
Con i pesi configurati (3:1:1), la distribuzione approssimativa delle richieste è:
|
| 54 |
+
- **60%** - Predizione Singola
|
| 55 |
+
- **20%** - Predizione Batch
|
| 56 |
+
- **20%** - Monitoraggio e Storia
|
| 57 |
+
|
| 58 |
+
### Tempo di Attesa
|
| 59 |
+
|
| 60 |
+
Ogni utente attende tra **1 e 5 secondi** tra un task e l'altro per simulare un comportamento realistico.
|
| 61 |
+
|
| 62 |
+
## Metriche Monitorate
|
| 63 |
+
|
| 64 |
+
Durante il test, Locust fornisce le seguenti metriche in tempo reale:
|
| 65 |
+
|
| 66 |
+
- **RPS (Requests Per Second)**: Numero di richieste al secondo
|
| 67 |
+
- **Response Time**: Tempo medio/mediano/percentili di risposta
|
| 68 |
+
- **Failure Rate**: Percentuale di richieste fallite
|
| 69 |
+
- **Active Users**: Numero di utenti attualmente attivi
|
| 70 |
+
|
| 71 |
+
## Opzioni Avanzate
|
| 72 |
+
|
| 73 |
+
### Esecuzione Headless (senza UI)
|
| 74 |
+
|
| 75 |
+
```bash
|
| 76 |
+
locust -f locustfile.py --headless -u 50 -r 5 -t 5m --host http://localhost:8000
|
| 77 |
+
```
|
| 78 |
+
|
| 79 |
+
| Opzione | Descrizione |
|
| 80 |
+
|---------|-------------|
|
| 81 |
+
| `--headless` | Esegui senza Web UI |
|
| 82 |
+
| `-u 50` | 50 utenti simulati |
|
| 83 |
+
| `-r 5` | 5 utenti creati al secondo |
|
| 84 |
+
| `-t 5m` | Durata del test: 5 minuti |
|
| 85 |
+
| `--host` | URL dell'API |
|
| 86 |
+
|
| 87 |
+
### Esportazione Risultati
|
| 88 |
+
|
| 89 |
+
```bash
|
| 90 |
+
locust -f locustfile.py --headless -u 50 -r 5 -t 5m --host http://localhost:8000 --csv=results
|
| 91 |
+
```
|
| 92 |
+
|
| 93 |
+
Questo creerà file CSV con i risultati del test.
|
| 94 |
+
|
| 95 |
+
## Struttura File
|
| 96 |
+
|
| 97 |
+
```
|
| 98 |
+
monitoring/locust/
|
| 99 |
+
├── locustfile.py # Script principale di load testing
|
| 100 |
+
└── README.md # Questa documentazione
|
| 101 |
+
```
|
monitoring/locust/locustfile.py
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Locust Load Testing Script for Skill Classification API
|
| 3 |
+
|
| 4 |
+
This script defines user behavior for load testing the prediction and monitoring
|
| 5 |
+
endpoints of the Skill Classification API.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from locust import HttpUser, task, between
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class SkillClassificationUser(HttpUser):
|
| 12 |
+
"""
|
| 13 |
+
Simulated user for load testing the Skill Classification API.
|
| 14 |
+
|
| 15 |
+
This user performs the following actions:
|
| 16 |
+
- Single predictions (most frequent)
|
| 17 |
+
- Batch predictions
|
| 18 |
+
- Monitoring and health checks
|
| 19 |
+
"""
|
| 20 |
+
|
| 21 |
+
# Default host for the API (can be overridden via --host flag or Web UI)
|
| 22 |
+
# Use http://localhost:8080 for Docker or http://localhost:8000 for local dev
|
| 23 |
+
host = "http://localhost:8080"
|
| 24 |
+
|
| 25 |
+
# Wait between 1 and 5 seconds between tasks to simulate real user behavior
|
| 26 |
+
wait_time = between(1, 5)
|
| 27 |
+
|
| 28 |
+
@task(3)
|
| 29 |
+
def predict_single(self):
|
| 30 |
+
"""
|
| 31 |
+
Task 1: Single Prediction (Weight: 3)
|
| 32 |
+
|
| 33 |
+
Performs a POST request to /predict with a single issue text.
|
| 34 |
+
This is the main task and executes more frequently due to higher weight.
|
| 35 |
+
"""
|
| 36 |
+
payload = {
|
| 37 |
+
"issue_text": "Fix authentication bug in login module"
|
| 38 |
+
}
|
| 39 |
+
|
| 40 |
+
with self.client.post(
|
| 41 |
+
"/predict",
|
| 42 |
+
json=payload,
|
| 43 |
+
catch_response=True
|
| 44 |
+
) as response:
|
| 45 |
+
if response.status_code == 201:
|
| 46 |
+
response.success()
|
| 47 |
+
else:
|
| 48 |
+
response.failure(f"Prediction failed with status {response.status_code}")
|
| 49 |
+
|
| 50 |
+
@task(1)
|
| 51 |
+
def predict_batch(self):
|
| 52 |
+
"""
|
| 53 |
+
Task 2: Batch Prediction (Weight: 1)
|
| 54 |
+
|
| 55 |
+
Performs a POST request to /predict/batch with multiple issue texts.
|
| 56 |
+
"""
|
| 57 |
+
payload = {
|
| 58 |
+
"issues": [
|
| 59 |
+
{"issue_text": "Test 1"},
|
| 60 |
+
{"issue_text": "Test 2"}
|
| 61 |
+
]
|
| 62 |
+
}
|
| 63 |
+
|
| 64 |
+
with self.client.post(
|
| 65 |
+
"/predict/batch",
|
| 66 |
+
json=payload,
|
| 67 |
+
catch_response=True
|
| 68 |
+
) as response:
|
| 69 |
+
if response.status_code == 200:
|
| 70 |
+
response.success()
|
| 71 |
+
else:
|
| 72 |
+
response.failure(f"Batch prediction failed with status {response.status_code}")
|
| 73 |
+
|
| 74 |
+
@task(1)
|
| 75 |
+
def monitoring_and_history(self):
|
| 76 |
+
"""
|
| 77 |
+
Task 3: Monitoring and History (Weight: 1)
|
| 78 |
+
|
| 79 |
+
Performs GET requests to check prediction history and system health.
|
| 80 |
+
"""
|
| 81 |
+
# Check prediction history
|
| 82 |
+
with self.client.get(
|
| 83 |
+
"/predictions",
|
| 84 |
+
catch_response=True
|
| 85 |
+
) as response:
|
| 86 |
+
if 200 <= response.status_code < 300:
|
| 87 |
+
response.success()
|
| 88 |
+
else:
|
| 89 |
+
response.failure(f"Predictions history failed with status {response.status_code}")
|
| 90 |
+
|
| 91 |
+
# Check system health
|
| 92 |
+
with self.client.get(
|
| 93 |
+
"/health",
|
| 94 |
+
catch_response=True
|
| 95 |
+
) as response:
|
| 96 |
+
if 200 <= response.status_code < 300:
|
| 97 |
+
response.success()
|
| 98 |
+
else:
|
| 99 |
+
response.failure(f"Health check failed with status {response.status_code}")
|
monitoring/prometheus/alert_rules.yml
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
groups:
|
| 2 |
+
- name: hopcroft_alerts
|
| 3 |
+
rules:
|
| 4 |
+
- alert: ServiceDown
|
| 5 |
+
expr: up == 0
|
| 6 |
+
for: 1m
|
| 7 |
+
labels:
|
| 8 |
+
severity: critical
|
| 9 |
+
annotations:
|
| 10 |
+
summary: "Service {{ $labels.instance }} is down"
|
| 11 |
+
description: "The job {{ $labels.job }} has been down for more than 1 minute."
|
| 12 |
+
|
| 13 |
+
- alert: HighErrorRate
|
| 14 |
+
expr: |
|
| 15 |
+
sum(rate(hopcroft_requests_total{http_status=~"5.."}[5m]))
|
| 16 |
+
/
|
| 17 |
+
sum(rate(hopcroft_requests_total[5m])) > 0.1
|
| 18 |
+
for: 5m
|
| 19 |
+
labels:
|
| 20 |
+
severity: warning
|
| 21 |
+
annotations:
|
| 22 |
+
summary: "High error rate on {{ $labels.instance }}"
|
| 23 |
+
description: "Error rate is above 10% for the last 5 minutes (current value: {{ $value | printf \"%.2f\" }})."
|
| 24 |
+
|
| 25 |
+
- alert: SlowRequests
|
| 26 |
+
expr: histogram_quantile(0.95, sum by (le, endpoint) (rate(hopcroft_request_duration_seconds_bucket[5m]))) > 2
|
| 27 |
+
for: 5m
|
| 28 |
+
labels:
|
| 29 |
+
severity: warning
|
| 30 |
+
annotations:
|
| 31 |
+
summary: "Slow requests on {{ $labels.endpoint }}"
|
| 32 |
+
description: "95th percentile of request latency is above 2s (current value: {{ $value | printf \"%.2f\" }}s)."
|
monitoring/prometheus/prometheus.yml
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
global:
|
| 2 |
+
scrape_interval: 15s
|
| 3 |
+
evaluation_interval: 15s
|
| 4 |
+
external_labels:
|
| 5 |
+
monitor: 'hopcroft-monitor'
|
| 6 |
+
environment: 'development'
|
| 7 |
+
|
| 8 |
+
rule_files:
|
| 9 |
+
- "alert_rules.yml"
|
| 10 |
+
|
| 11 |
+
alerting:
|
| 12 |
+
alertmanagers:
|
| 13 |
+
- static_configs:
|
| 14 |
+
- targets:
|
| 15 |
+
- 'alertmanager:9093'
|
| 16 |
+
|
| 17 |
+
scrape_configs:
|
| 18 |
+
- job_name: 'hopcroft-api'
|
| 19 |
+
metrics_path: '/metrics'
|
| 20 |
+
static_configs:
|
| 21 |
+
- targets: ['hopcroft-api:8080']
|
| 22 |
+
scrape_interval: 10s
|
| 23 |
+
|
| 24 |
+
- job_name: 'prometheus'
|
| 25 |
+
static_configs:
|
| 26 |
+
- targets: ['localhost:9090']
|
| 27 |
+
|
| 28 |
+
- job_name: 'pushgateway'
|
| 29 |
+
honor_labels: true
|
| 30 |
+
static_configs:
|
| 31 |
+
- targets: ['pushgateway:9091']
|
| 32 |
+
scrape_interval: 30s
|
monitoring/screenshots/incident acknowlege mail.png
ADDED
|
Git LFS Details
|
monitoring/screenshots/incident acknowlege.png
ADDED
|
Git LFS Details
|
monitoring/screenshots/incident mail.png
ADDED
|
Git LFS Details
|
monitoring/screenshots/incident resolved mail.png
ADDED
|
Git LFS Details
|
monitoring/screenshots/incident resolved.png
ADDED
|
Git LFS Details
|
monitoring/screenshots/incident.png
ADDED
|
Git LFS Details
|
monitoring/screenshots/monitors.png
ADDED
|
Git LFS Details
|
nginx.conf
ADDED
|
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
worker_processes 1;
|
| 2 |
+
pid /tmp/nginx.pid;
|
| 3 |
+
error_log stderr info; # Log to stderr to see errors in HF Space Logs
|
| 4 |
+
|
| 5 |
+
events {
|
| 6 |
+
worker_connections 1024;
|
| 7 |
+
}
|
| 8 |
+
|
| 9 |
+
http {
|
| 10 |
+
include /etc/nginx/mime.types;
|
| 11 |
+
default_type application/octet-stream;
|
| 12 |
+
|
| 13 |
+
# HF Space runs as non-root, use /tmp for everything
|
| 14 |
+
access_log /dev/stdout;
|
| 15 |
+
client_body_temp_path /tmp/client_temp;
|
| 16 |
+
proxy_temp_path /tmp/proxy_temp;
|
| 17 |
+
fastcgi_temp_path /tmp/fastcgi_temp;
|
| 18 |
+
uwsgi_temp_path /tmp/uwsgi_temp;
|
| 19 |
+
scgi_temp_path /tmp/scgi_temp;
|
| 20 |
+
|
| 21 |
+
sendfile on;
|
| 22 |
+
keepalive_timeout 65;
|
| 23 |
+
|
| 24 |
+
upstream streamlit {
|
| 25 |
+
server 127.0.0.1:8501;
|
| 26 |
+
}
|
| 27 |
+
|
| 28 |
+
upstream fastapi {
|
| 29 |
+
server 127.0.0.1:8000;
|
| 30 |
+
}
|
| 31 |
+
|
| 32 |
+
server {
|
| 33 |
+
listen 7860;
|
| 34 |
+
server_name localhost;
|
| 35 |
+
|
| 36 |
+
# Health endpoint for HF readiness check
|
| 37 |
+
location /health {
|
| 38 |
+
proxy_pass http://fastapi/health;
|
| 39 |
+
proxy_set_header Host $host;
|
| 40 |
+
}
|
| 41 |
+
|
| 42 |
+
# FastAPI Documentation
|
| 43 |
+
location /docs {
|
| 44 |
+
proxy_pass http://fastapi/docs;
|
| 45 |
+
proxy_set_header Host $host;
|
| 46 |
+
proxy_set_header X-Real-IP $remote_addr;
|
| 47 |
+
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
|
| 48 |
+
proxy_set_header X-Forwarded-Proto $scheme;
|
| 49 |
+
}
|
| 50 |
+
|
| 51 |
+
location /redoc {
|
| 52 |
+
proxy_pass http://fastapi/redoc;
|
| 53 |
+
proxy_set_header Host $host;
|
| 54 |
+
}
|
| 55 |
+
|
| 56 |
+
location /openapi.json {
|
| 57 |
+
proxy_pass http://fastapi/openapi.json;
|
| 58 |
+
proxy_set_header Host $host;
|
| 59 |
+
}
|
| 60 |
+
|
| 61 |
+
# FastAPI API Endpoints
|
| 62 |
+
location /predict {
|
| 63 |
+
proxy_pass http://fastapi/predict;
|
| 64 |
+
proxy_set_header Host $host;
|
| 65 |
+
}
|
| 66 |
+
|
| 67 |
+
location /predictions {
|
| 68 |
+
proxy_pass http://fastapi/predictions;
|
| 69 |
+
proxy_set_header Host $host;
|
| 70 |
+
}
|
| 71 |
+
|
| 72 |
+
# Streamlit (Catch-all)
|
| 73 |
+
location / {
|
| 74 |
+
proxy_pass http://streamlit;
|
| 75 |
+
proxy_set_header Host $host;
|
| 76 |
+
proxy_set_header X-Real-IP $remote_addr;
|
| 77 |
+
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
|
| 78 |
+
proxy_set_header X-Forwarded-Proto $scheme;
|
| 79 |
+
proxy_set_header X-Forwarded-Host $host;
|
| 80 |
+
|
| 81 |
+
# WebSocket support for Streamlit
|
| 82 |
+
proxy_http_version 1.1;
|
| 83 |
+
proxy_set_header Upgrade $http_upgrade;
|
| 84 |
+
proxy_set_header Connection "upgrade";
|
| 85 |
+
proxy_read_timeout 86400;
|
| 86 |
+
|
| 87 |
+
# Prevent 502 if Streamlit is slow
|
| 88 |
+
proxy_connect_timeout 60s;
|
| 89 |
+
proxy_send_timeout 60s;
|
| 90 |
+
}
|
| 91 |
+
}
|
| 92 |
+
}
|
reports/alerting_test_report/alerting_report.md
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Prometheus + Alertmanager Alerting Report
|
| 2 |
+
|
| 3 |
+
This report documents the configuration and verification of the alerting system for the Hopcroft Project.
|
| 4 |
+
|
| 5 |
+
## 1. Alerting Rules
|
| 6 |
+
The `monitoring/prometheus/alert_rules.yml` file is configured with the following rules:
|
| 7 |
+
- **ServiceDown**: Triggers if a service is unreachable for 1 minute.
|
| 8 |
+
- **HighErrorRate**: Triggers if the error rate exceeds 10%.
|
| 9 |
+
- **SlowRequests**: Triggers if the 95th percentile of request latency exceeds 2 seconds.
|
| 10 |
+
|
| 11 |
+
## 2. Alertmanager Configuration
|
| 12 |
+
The `monitoring/alertmanager/config.yml` file includes:
|
| 13 |
+
- **Grouping**: Alerts are grouped by `alertname` and `severity`.
|
| 14 |
+
- **Inhibition**: Critical alerts suppress warning-level alerts.
|
| 15 |
+
- **Receiver**: A webhook receiver is configured to forward notifications.
|
| 16 |
+
|
| 17 |
+
## 3. Verification of "Firing" Alert
|
| 18 |
+
The test was conducted by stopping the `hopcroft-api` container and waiting for the 1-minute threshold to be reached.
|
| 19 |
+
|
| 20 |
+
### Verification Proofs:
|
| 21 |
+
|
| 22 |
+
1. **Prometheus - Alert Firing**:
|
| 23 |
+
The following image shows the `ServiceDown` alert in the **FIRING** state within the Prometheus dashboard.
|
| 24 |
+

|
| 25 |
+
|
| 26 |
+
2. **Alertmanager - Notification Received**:
|
| 27 |
+
The following image shows the Alertmanager interface with the alert correctly received from Prometheus.
|
| 28 |
+

|
| 29 |
+
|
| 30 |
+
### Restoration
|
| 31 |
+
Following verification, the `hopcroft-api` service was restarted and monitored until it returned to a healthy state.
|
reports/alerting_test_report/alertmanager_firing.png
ADDED
|
Git LFS Details
|
reports/alerting_test_report/prometheus_firing.png
ADDED
|
Git LFS Details
|
requirements.txt
CHANGED
|
@@ -23,6 +23,7 @@ sentence-transformers
|
|
| 23 |
|
| 24 |
# API Framework
|
| 25 |
fastapi[standard]>=0.115.0
|
|
|
|
| 26 |
pydantic>=2.0.0
|
| 27 |
uvicorn>=0.30.0
|
| 28 |
httpx>=0.27.0
|
|
@@ -47,6 +48,8 @@ pytest-json-report>=1.5.0
|
|
| 47 |
pytest-cov>=4.0.0
|
| 48 |
pytest-xdist>=3.0.0
|
| 49 |
|
|
|
|
|
|
|
| 50 |
# Data validation and quality
|
| 51 |
great_expectations>=0.18.0
|
| 52 |
deepchecks>=0.18.0
|
|
|
|
| 23 |
|
| 24 |
# API Framework
|
| 25 |
fastapi[standard]>=0.115.0
|
| 26 |
+
prometheus-client>=0.17.0
|
| 27 |
pydantic>=2.0.0
|
| 28 |
uvicorn>=0.30.0
|
| 29 |
httpx>=0.27.0
|
|
|
|
| 48 |
pytest-cov>=4.0.0
|
| 49 |
pytest-xdist>=3.0.0
|
| 50 |
|
| 51 |
+
# Load testing
|
| 52 |
+
locust>=2.20.0
|
| 53 |
# Data validation and quality
|
| 54 |
great_expectations>=0.18.0
|
| 55 |
deepchecks>=0.18.0
|
scripts/start_space.sh
CHANGED
|
@@ -16,28 +16,82 @@ USER=${DAGSHUB_USERNAME:-$MLFLOW_TRACKING_USERNAME}
|
|
| 16 |
PASS=${DAGSHUB_TOKEN:-$MLFLOW_TRACKING_PASSWORD}
|
| 17 |
|
| 18 |
if [ -n "$USER" ] && [ -n "$PASS" ]; then
|
| 19 |
-
echo "Configuring DVC authentication for DagsHub..."
|
| 20 |
# Configure local config (not committed)
|
| 21 |
dvc remote modify origin --local auth basic
|
| 22 |
dvc remote modify origin --local user "$USER"
|
| 23 |
dvc remote modify origin --local password "$PASS"
|
| 24 |
else
|
| 25 |
-
echo "WARNING: No DagsHub credentials found. DVC pull might fail if the remote is private."
|
| 26 |
fi
|
| 27 |
|
| 28 |
-
echo "Pulling models from DVC..."
|
| 29 |
# Pull only the necessary files for inference
|
| 30 |
dvc pull models/random_forest_tfidf_gridsearch.pkl.dvc \
|
| 31 |
models/tfidf_vectorizer.pkl.dvc \
|
| 32 |
-
models/label_names.pkl.dvc
|
| 33 |
|
| 34 |
-
|
| 35 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
|
| 37 |
# Wait for API to start
|
| 38 |
-
echo "Waiting for API to start..."
|
| 39 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
|
| 41 |
-
echo "
|
| 42 |
-
|
| 43 |
-
streamlit run hopcroft_skill_classification_tool_competition/streamlit_app.py --server.port 7860 --server.address 0.0.0.0
|
|
|
|
PASS=${DAGSHUB_TOKEN:-$MLFLOW_TRACKING_PASSWORD}

if [ -n "$USER" ] && [ -n "$PASS" ]; then
    echo "$(date) - Configuring DVC authentication for DagsHub..."
    # Configure local config (not committed)
    dvc remote modify origin --local auth basic
    dvc remote modify origin --local user "$USER"
    dvc remote modify origin --local password "$PASS"
else
    echo "$(date) - WARNING: No DagsHub credentials found. DVC pull might fail if the remote is private."
fi

echo "$(date) - Pulling models from DVC..."
# Pull only the necessary files for inference; deliberately best-effort so the
# Space still boots and surfaces the problem instead of dying silently.
dvc pull models/random_forest_tfidf_gridsearch.pkl.dvc \
         models/tfidf_vectorizer.pkl.dvc \
         models/label_names.pkl.dvc || echo "DVC pull failed, but continuing..."

# Create Nginx temp directories (HF Space runs as non-root, so nginx.conf
# points all temp paths at /tmp)
mkdir -p /tmp/client_temp /tmp/proxy_temp /tmp/fastcgi_temp /tmp/uwsgi_temp /tmp/scgi_temp

echo "$(date) - Checking models existence..."
ls -la models/

echo "$(date) - Starting FastAPI application in background..."
# Using 0.0.0.0 to be safe
uvicorn hopcroft_skill_classification_tool_competition.main:app --host 0.0.0.0 --port 8000 >> /tmp/fastapi.log 2>&1 &

# Wait for API to start: poll /health every 2s, up to 30 attempts (~60s).
echo "$(date) - Waiting for API to start (30s)..."
api_up=0
for i in {1..30}; do
    if curl -s http://127.0.0.1:8000/health > /dev/null; then
        echo "$(date) - API is UP!"
        api_up=1
        break
    fi
    echo "$(date) - Waiting... ($i/30)"
    sleep 2
done
# Surface the failure explicitly instead of proceeding silently.
if [ "$api_up" -ne 1 ]; then
    echo "$(date) - WARNING: API did not become healthy in time. Recent FastAPI log:"
    tail -n 50 /tmp/fastapi.log || true
fi

echo "$(date) - Starting Nginx reverse proxy..."
if ! command -v nginx > /dev/null 2>&1; then
    echo "$(date) - ERROR: nginx not found in PATH"
    exit 1
fi
nginx -c /app/nginx.conf -g "daemon off;" >> /tmp/nginx_startup.log 2>&1 &

echo "$(date) - Waiting for Nginx to initialize..."
sleep 5

# Check if Nginx is running via its pid file (declared in nginx.conf).
# This replaces the fragile `ps aux | grep` pattern, which can false-positive
# on unrelated processes whose command line mentions "nginx".
if [ -f /tmp/nginx.pid ] && kill -0 "$(cat /tmp/nginx.pid)" 2> /dev/null; then
    echo "$(date) - Nginx is running."
else
    echo "$(date) - ERROR: Nginx failed to start. Logs:"
    cat /tmp/nginx_startup.log
fi

echo "$(date) - Final backend check before starting Streamlit..."
curl -v http://127.0.0.1:8000/health || echo "FastAPI health check failed!"

echo "$(date) - Starting Streamlit application on 127.0.0.1:8501..."
export API_BASE_URL="http://127.0.0.1:8000"
streamlit run hopcroft_skill_classification_tool_competition/streamlit_app.py \
    --server.port 8501 \
    --server.address 127.0.0.1 \
    --server.enableCORS=false \
    --server.enableXsrfProtection=false \
    --server.headless true &

# Wait for Streamlit to start: same 2s x 30 polling loop against /healthz.
echo "$(date) - Waiting for Streamlit to start (30s)..."
for i in {1..30}; do
    if curl -s http://127.0.0.1:8501/healthz > /dev/null; then
        echo "$(date) - Streamlit is UP!"
        break
    fi
    echo "$(date) - Waiting for Streamlit... ($i/30)"
    sleep 2
done

# Keep the container's foreground process alive and stream logs for debugging.
echo "$(date) - Process started. Tailing Nginx logs for debug..."
tail -f /tmp/nginx_startup.log /tmp/fastapi.log
|
|
|