antofra10 commited on
Commit
6c56755
·
unverified ·
2 Parent(s): 73d22a4 4ba57df

Merge pull request #33 from se4ai2526-uniba/Milestone-6

Browse files
.gitattributes CHANGED
@@ -1 +1,2 @@
 
1
  *.png filter=lfs diff=lfs merge=lfs -text
 
1
+ docs/img/*.png filter=lfs diff=lfs merge=lfs -text
2
  *.png filter=lfs diff=lfs merge=lfs -text
Dockerfile CHANGED
@@ -11,6 +11,9 @@ ENV PYTHONDONTWRITEBYTECODE=1 \
11
  RUN apt-get update && apt-get install -y \
12
  git \
13
  dos2unix \
 
 
 
14
  && rm -rf /var/lib/apt/lists/*
15
 
16
  # Create a non-root user
 
11
  RUN apt-get update && apt-get install -y \
12
  git \
13
  dos2unix \
14
+ nginx \
15
+ procps \
16
+ curl \
17
  && rm -rf /var/lib/apt/lists/*
18
 
19
  # Create a non-root user
README.md CHANGED
@@ -5,6 +5,7 @@ colorFrom: blue
5
  colorTo: green
6
  sdk: docker
7
  app_port: 7860
 
8
  ---
9
 
10
  # Hopcroft_Skill-Classification-Tool-Competition
@@ -477,6 +478,47 @@ docker-compose down
477
  ```
478
 
479
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
480
  ## Demo UI (Streamlit)
481
 
482
  The Streamlit GUI provides an interactive web interface for the skill classification API.
 
5
  colorTo: green
6
  sdk: docker
7
  app_port: 7860
8
+ api_docs_url: /docs
9
  ---
10
 
11
  # Hopcroft_Skill-Classification-Tool-Competition
 
478
  ```
479
 
480
 
481
+ --------
482
+
483
+ ## Hugging Face Spaces Deployment
484
+
485
+ This project is configured to run on [Hugging Face Spaces](https://huggingface.co/spaces) using Docker.
486
+
487
+ ### 1. Setup Space
488
+ 1. Create a new Space on Hugging Face.
489
+ 2. Select **Docker** as the SDK.
490
+ 3. Choose the **Blank** template or upload your code.
491
+
492
+ ### 2. Configure Secrets
493
+ To enable the application to pull models from DagsHub via DVC, you must configure the following **Variables and Secrets** in your Space settings:
494
+
495
+ | Name | Type | Description |
496
+ |------|------|-------------|
497
+ | `DAGSHUB_USERNAME` | Secret | Your DagsHub username. |
498
+ | `DAGSHUB_TOKEN` | Secret | Your DagsHub access token (Settings -> Tokens). |
499
+
500
+ > [!IMPORTANT]
501
+ > These secrets are injected into the container at runtime. The `scripts/start_space.sh` script uses them to authenticate DVC and pull the required model files (`.pkl`) before starting the API and GUI.
502
+
503
+ ### 3. Automated Startup
504
+ The deployment follows this automated flow:
505
+ 1. **Dockerfile**: Builds the environment, installs dependencies, and sets up Nginx.
506
+ 2. **scripts/start_space.sh**:
507
+ - Configures DVC with your secrets.
508
+ - Pulls models from the DagsHub remote.
509
+ - Starts the **FastAPI** backend (port 8000).
510
+ - Starts the **Streamlit** frontend (port 8501).
511
+ - Starts **Nginx** (port 7860) as a reverse proxy to route traffic.
512
+
513
+ ### 4. Direct Access
514
+ Once deployed, your Space will be available at:
515
+ `https://huggingface.co/spaces/se4ai2526-uniba/Hopcroft`
516
+
517
+ The API documentation will be accessible at:
518
+ `https://huggingface.co/spaces/se4ai2526-uniba/Hopcroft/docs`
519
+
520
+ --------
521
+
522
  ## Demo UI (Streamlit)
523
 
524
  The Streamlit GUI provides an interactive web interface for the skill classification API.
docker-compose.yml CHANGED
@@ -47,6 +47,75 @@ services:
47
  condition: service_healthy
48
  restart: unless-stopped
49
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  networks:
51
  hopcroft-net:
52
  driver: bridge
@@ -54,3 +123,7 @@ networks:
54
  volumes:
55
  hopcroft-logs:
56
  driver: local
 
 
 
 
 
47
  condition: service_healthy
48
  restart: unless-stopped
49
 
50
+ prometheus:
51
+ image: prom/prometheus:latest
52
+ container_name: prometheus
53
+ volumes:
54
+ - ./monitoring/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
55
+ - ./monitoring/prometheus/alert_rules.yml:/etc/prometheus/alert_rules.yml
56
+ ports:
57
+ - "9090:9090"
58
+ networks:
59
+ - hopcroft-net
60
+ depends_on:
61
+ - alertmanager
62
+ restart: unless-stopped
63
+
64
+ alertmanager:
65
+ image: prom/alertmanager:latest
66
+ container_name: alertmanager
67
+ volumes:
68
+ - ./monitoring/alertmanager/config.yml:/etc/alertmanager/config.yml
69
+ ports:
70
+ - "9093:9093"
71
+ networks:
72
+ - hopcroft-net
73
+ restart: unless-stopped
74
+
75
+ grafana:
76
+ image: grafana/grafana:latest
77
+ container_name: grafana
78
+ ports:
79
+ - "3000:3000"
80
+ environment:
81
+ - GF_SECURITY_ADMIN_USER=admin
82
+ - GF_SECURITY_ADMIN_PASSWORD=admin
83
+ - GF_USERS_ALLOW_SIGN_UP=false
84
+ - GF_SERVER_ROOT_URL=http://localhost:3000
85
+ volumes:
86
+ # Provisioning: auto-configure datasources and dashboards
87
+ - ./monitoring/grafana/provisioning/datasources:/etc/grafana/provisioning/datasources
88
+ - ./monitoring/grafana/provisioning/dashboards:/etc/grafana/provisioning/dashboards
89
+ - ./monitoring/grafana/dashboards:/var/lib/grafana/dashboards
90
+ # Persistent storage for Grafana data
91
+ - grafana-data:/var/lib/grafana
92
+ networks:
93
+ - hopcroft-net
94
+ depends_on:
95
+ - prometheus
96
+ restart: unless-stopped
97
+ healthcheck:
98
+ test: ["CMD-SHELL", "curl -f http://localhost:3000/api/health || exit 1"]
99
+ interval: 30s
100
+ timeout: 10s
101
+ retries: 3
102
+
103
+ pushgateway:
104
+ image: prom/pushgateway:latest
105
+ container_name: pushgateway
106
+ ports:
107
+ - "9091:9091"
108
+ networks:
109
+ - hopcroft-net
110
+ restart: unless-stopped
111
+ command:
112
+ - '--web.listen-address=:9091'
113
+ - '--persistence.file=/data/pushgateway.data'
114
+ - '--persistence.interval=5m'
115
+ volumes:
116
+ - pushgateway-data:/data
117
+
118
+
119
  networks:
120
  hopcroft-net:
121
  driver: bridge
 
123
  volumes:
124
  hopcroft-logs:
125
  driver: local
126
+ grafana-data:
127
+ driver: local
128
+ pushgateway-data:
129
+ driver: local
hopcroft_skill_classification_tool_competition/main.py CHANGED
@@ -22,9 +22,17 @@ import os
22
  import time
23
  from typing import List
24
 
25
- from fastapi import FastAPI, HTTPException, status
26
  from fastapi.responses import JSONResponse, RedirectResponse
27
  import mlflow
 
 
 
 
 
 
 
 
28
  from pydantic import ValidationError
29
 
30
  from hopcroft_skill_classification_tool_competition.api_models import (
@@ -40,6 +48,34 @@ from hopcroft_skill_classification_tool_competition.api_models import (
40
  from hopcroft_skill_classification_tool_competition.config import MLFLOW_CONFIG
41
  from hopcroft_skill_classification_tool_competition.modeling.predict import SkillPredictor
42
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
  predictor = None
44
  model_version = "1.0.0"
45
 
@@ -85,6 +121,43 @@ app = FastAPI(
85
  )
86
 
87
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
  @app.get("/", tags=["Root"])
89
  async def root():
90
  """Return basic API information."""
@@ -143,9 +216,11 @@ async def predict_skills(issue: IssueInput) -> PredictionRecord:
143
 
144
  # Combine text fields if needed, or just use issue_text
145
  # The predictor expects a single string
 
146
  full_text = f"{issue.issue_text} {issue.issue_description or ''} {issue.repo_name or ''}"
147
 
148
- predictions_data = predictor.predict(full_text)
 
149
 
150
  # Convert to Pydantic models
151
  predictions = [
 
22
  import time
23
  from typing import List
24
 
25
+ from fastapi import FastAPI, HTTPException, status, Request, Response
26
  from fastapi.responses import JSONResponse, RedirectResponse
27
  import mlflow
28
+ from prometheus_client import (
29
+ CONTENT_TYPE_LATEST,
30
+ Counter,
31
+ Gauge,
32
+ Histogram,
33
+ Summary,
34
+ generate_latest,
35
+ )
36
  from pydantic import ValidationError
37
 
38
  from hopcroft_skill_classification_tool_competition.api_models import (
 
48
  from hopcroft_skill_classification_tool_competition.config import MLFLOW_CONFIG
49
  from hopcroft_skill_classification_tool_competition.modeling.predict import SkillPredictor
50
 
51
+ # Define Prometheus Metrics
52
+ # Counter: Total number of requests
53
+ REQUESTS_TOTAL = Counter(
54
+ "hopcroft_requests_total",
55
+ "Total number of requests",
56
+ ["method", "endpoint", "http_status"],
57
+ )
58
+
59
+ # Histogram: Request duration
60
+ REQUEST_DURATION_SECONDS = Histogram(
61
+ "hopcroft_request_duration_seconds",
62
+ "Request duration in seconds",
63
+ ["method", "endpoint"],
64
+ )
65
+
66
+ # Gauge: In-progress requests
67
+ IN_PROGRESS_REQUESTS = Gauge(
68
+ "hopcroft_in_progress_requests",
69
+ "Number of requests currently in progress",
70
+ ["method", "endpoint"],
71
+ )
72
+
73
+ # Summary: Model prediction time
74
+ MODEL_PREDICTION_SECONDS = Summary(
75
+ "hopcroft_prediction_processing_seconds",
76
+ "Time spent processing model predictions",
77
+ )
78
+
79
  predictor = None
80
  model_version = "1.0.0"
81
 
 
121
  )
122
 
123
 
124
+ @app.middleware("http")
125
+ async def monitor_requests(request: Request, call_next):
126
+ """Middleware to collect Prometheus metrics for each request."""
127
+ method = request.method
128
+ # Use a simplified path or template if possible to avoid high cardinality
129
+ # For now, using request.url.path is acceptable for this scale
130
+ endpoint = request.url.path
131
+
132
+ IN_PROGRESS_REQUESTS.labels(method=method, endpoint=endpoint).inc()
133
+ start_time = time.time()
134
+
135
+ try:
136
+ response = await call_next(request)
137
+ status_code = response.status_code
138
+ REQUESTS_TOTAL.labels(
139
+ method=method, endpoint=endpoint, http_status=status_code
140
+ ).inc()
141
+ return response
142
+ except Exception as e:
143
+ REQUESTS_TOTAL.labels(
144
+ method=method, endpoint=endpoint, http_status=500
145
+ ).inc()
146
+ raise e
147
+ finally:
148
+ duration = time.time() - start_time
149
+ REQUEST_DURATION_SECONDS.labels(method=method, endpoint=endpoint).observe(
150
+ duration
151
+ )
152
+ IN_PROGRESS_REQUESTS.labels(method=method, endpoint=endpoint).dec()
153
+
154
+
155
+ @app.get("/metrics", tags=["Observability"])
156
+ async def metrics():
157
+ """Expose Prometheus metrics."""
158
+ return Response(content=generate_latest(), media_type=CONTENT_TYPE_LATEST)
159
+
160
+
161
  @app.get("/", tags=["Root"])
162
  async def root():
163
  """Return basic API information."""
 
216
 
217
  # Combine text fields if needed, or just use issue_text
218
  # The predictor expects a single string
219
+ # The predictor expects a single string
220
  full_text = f"{issue.issue_text} {issue.issue_description or ''} {issue.repo_name or ''}"
221
 
222
+ with MODEL_PREDICTION_SECONDS.time():
223
+ predictions_data = predictor.predict(full_text)
224
 
225
  # Convert to Pydantic models
226
  predictions = [
monitoring/README.md ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Metrics Collection & Verification
2
+
3
+ This directory contains the configuration for Prometheus monitoring.
4
+
5
+ ## Configuration
6
+ - **Prometheus Config**: `prometheus/prometheus.yml`
7
+ - **Scrape Target**: `hopcroft-api:8080`
8
+ - **Metrics Endpoint**: `http://localhost:8080/metrics`
9
+
10
+ ## Verification Queries (PromQL)
11
+
12
+ You can run these queries in the Prometheus Expression Browser (`http://localhost:9090/graph`):
13
+
14
+ ### 1. Request Rate (Counter)
15
+ Shows the rate of requests per second over the last minute.
16
+ ```promql
17
+ rate(hopcroft_requests_total[1m])
18
+ ```
19
+
20
+ ### 2. Average Request Duration (Histogram)
21
+ Calculates average latency.
22
+ ```promql
23
+ rate(hopcroft_request_duration_seconds_sum[5m]) / rate(hopcroft_request_duration_seconds_count[5m])
24
+ ```
25
+
26
+ ### 3. Current In-Progress Requests (Gauge)
27
+ Shows how many requests are currently being processed.
28
+ ```promql
29
+ hopcroft_in_progress_requests
30
+ ```
31
+
32
+ ### 4. Model Prediction Time (Summary)
33
+ Shows the 90th percentile of model prediction time.
34
+ ```promql
35
+ hopcroft_prediction_processing_seconds{quantile="0.9"}
36
+ ```
37
+
38
+ ---
39
+
40
+ ## Uptime Monitoring (Better Stack)
41
+
42
+ We used Better Stack Uptime to monitor the availability of the production deployment hosted on Hugging Face Spaces.
43
+
44
+ **Base URL**
45
+ - https://dacrow13-hopcroft-skill-classification.hf.space
46
+
47
+ **Monitored endpoints**
48
+ - https://dacrow13-hopcroft-skill-classification.hf.space/health
49
+ - https://dacrow13-hopcroft-skill-classification.hf.space/openapi.json
50
+ - https://dacrow13-hopcroft-skill-classification.hf.space/docs
51
+
52
+ **Checks and alerts**
53
+ - Monitors are configured to run from multiple locations.
54
+ - Email notifications are enabled for failures.
55
+ - A failure scenario was tested to confirm Better Stack reports the server error details.
56
+
57
+ - Screenshots are available in `monitoring/screenshots/`.
58
+
59
+ ---
60
+
61
+ ## Grafana Dashboard
62
+
63
+ Grafana provides real-time visualization of system metrics and drift detection status.
64
+
65
+ ### Configuration
66
+ - **Port**: `3000`
67
+ - **Credentials**: `admin` / `admin`
68
+ - **Dashboard**: Hopcroft Monitoring Dashboard
69
+ - **Datasource**: Prometheus (auto-provisioned)
70
+ - **Provisioning Files**:
71
+ - Datasources: `grafana/provisioning/datasources/prometheus.yml`
72
+ - Dashboards: `grafana/provisioning/dashboards/dashboard.yml`
73
+ - Dashboard JSON: `grafana/dashboards/hopcroft_dashboard.json`
74
+
75
+ ### Dashboard Panels
76
+ 1. **API Request Rate**: Rate of incoming requests per endpoint
77
+ 2. **API Latency**: Average response time per endpoint
78
+ 3. **Drift Detection Status**: Real-time drift detection indicator (0=No Drift, 1=Drift Detected)
79
+ 4. **Drift P-Value**: Statistical significance of detected drift
80
+ 5. **Drift Distance**: Kolmogorov-Smirnov distance metric
81
+
82
+ ### Access
83
+ Navigate to `http://localhost:3000` and login with the provided credentials. The dashboard refreshes every 10 seconds.
84
+
85
+ ---
86
+
87
+ ## Data Drift Detection
88
+
89
+ Automated distribution shift detection using statistical testing to monitor model input data quality.
90
+
91
+ ### Algorithm
92
+ - **Method**: Kolmogorov-Smirnov Two-Sample Test (scipy-based)
93
+ - **Baseline Data**: 1000 samples from training set
94
+ - **Detection Threshold**: p-value < 0.05 (with Bonferroni correction)
95
+ - **Metrics Published**: drift_detected, drift_p_value, drift_distance, drift_check_timestamp
96
+
97
+ ### Scripts
98
+
99
+ #### Baseline Preparation
100
+ **Script**: `drift/scripts/prepare_baseline.py`
101
+
102
+ Functionality:
103
+ - Loads data from SQLite database (`data/raw/skillscope_data.db`)
104
+ - Extracts numeric features only
105
+ - Samples 1000 representative records
106
+ - Saves to `drift/baseline/reference_data.pkl`
107
+
108
+ Usage:
109
+ ```bash
110
+ cd monitoring/drift/scripts
111
+ python prepare_baseline.py
112
+ ```
113
+
114
+ #### Drift Detection
115
+ **Script**: `drift/scripts/run_drift_check.py`
116
+
117
+ Functionality:
118
+ - Loads baseline reference data
119
+ - Compares with new production data
120
+ - Performs KS test on each feature
121
+ - Pushes metrics to Pushgateway
122
+ - Saves results to `drift/reports/`
123
+
124
+ Usage:
125
+ ```bash
126
+ cd monitoring/drift/scripts
127
+ python run_drift_check.py
128
+ ```
129
+
130
+ ### Verification
131
+ Check Pushgateway metrics:
132
+ ```bash
133
+ curl http://localhost:9091/metrics | grep drift
134
+ ```
135
+
136
+ Query in Prometheus:
137
+ ```promql
138
+ drift_detected
139
+ drift_p_value
140
+ drift_distance
141
+ ```
142
+
143
+ ---
144
+
145
+ ## Pushgateway
146
+
147
+ Pushgateway collects metrics from short-lived jobs such as the drift detection script.
148
+
149
+ ### Configuration
150
+ - **Port**: `9091`
151
+ - **Persistence**: Enabled with 5-minute intervals
152
+ - **Data Volume**: `pushgateway-data`
153
+
154
+ ### Metrics Endpoint
155
+ Access metrics at `http://localhost:9091/metrics`
156
+
157
+ ### Integration
158
+ The drift detection script pushes metrics to Pushgateway, which are then scraped by Prometheus and displayed in Grafana.
159
+
160
+ ---
161
+
162
+ ## Alerting
163
+
164
+ Alert rules are defined in `prometheus/alert_rules.yml`:
165
+
166
+ - **High Latency**: Triggered when average latency exceeds 2 seconds
167
+ - **High Error Rate**: Triggered when error rate exceeds 5%
168
+ - **Data Drift Detected**: Triggered when drift_detected = 1
169
+
170
+ Alerts are routed to Alertmanager (`http://localhost:9093`) and can be configured to send notifications via email, Slack, or other channels in `alertmanager/config.yml`.
171
+
172
+ ---
173
+
174
+ ## Complete Stack Usage
175
+
176
+ ### Starting All Services
177
+ ```bash
178
+ # Start all monitoring services
179
+ docker compose up -d
180
+
181
+ # Verify all containers are running
182
+ docker compose ps
183
+
184
+ # Check Prometheus targets
185
+ curl http://localhost:9090/targets
186
+
187
+ # Check Grafana health
188
+ curl http://localhost:3000/api/health
189
+ ```
190
+
191
+ ### Running Drift Detection Workflow
192
+
193
+ 1. **Prepare Baseline (One-time setup)**
194
+ ```bash
195
+ cd monitoring/drift/scripts
196
+ python prepare_baseline.py
197
+ ```
198
+
199
+ 2. **Execute Drift Check**
200
+ ```bash
201
+ python run_drift_check.py
202
+ ```
203
+
204
+ 3. **Verify Results**
205
+ - Check Pushgateway: `http://localhost:9091`
206
+ - Check Prometheus: `http://localhost:9090/graph`
207
+ - Check Grafana: `http://localhost:3000`
monitoring/alertmanager/config.yml ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ global:
2
+ resolve_timeout: 5m
3
+
4
+ route:
5
+ group_by: ['alertname', 'severity']
6
+ group_wait: 10s
7
+ group_interval: 10s
8
+ repeat_interval: 1h
9
+ receiver: 'log-receiver'
10
+
11
+ receivers:
12
+ - name: 'log-receiver'
13
+ webhook_configs:
14
+ - url: 'http://hopcroft-api:8080/health'
15
+
16
+ inhibition_rules:
17
+ - source_match:
18
+ severity: 'critical'
19
+ target_match:
20
+ severity: 'warning'
21
+ equal: ['alertname', 'dev', 'instance']
monitoring/drift/scripts/prepare_baseline.py ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Prepare baseline/reference data for drift detection.
3
+ This script samples representative data from the training set.
4
+ """
5
+
6
+ import pickle
7
+ import pandas as pd
8
+ import numpy as np
9
+ import sqlite3
10
+ from pathlib import Path
11
+ from sklearn.model_selection import train_test_split
12
+
13
+ # Paths
14
+ PROJECT_ROOT = Path(__file__).parent.parent.parent.parent
15
+ BASELINE_DIR = Path(__file__).parent.parent / "baseline"
16
+ BASELINE_DIR.mkdir(parents=True, exist_ok=True)
17
+
18
+
19
+ def load_training_data():
20
+ """Load the original training dataset from SQLite database."""
21
+ # Load from SQLite database
22
+ db_path = PROJECT_ROOT / "data" / "raw" / "skillscope_data.db"
23
+
24
+ if not db_path.exists():
25
+ raise FileNotFoundError(f"Database not found at {db_path}")
26
+
27
+ print(f"Loading data from database: {db_path}")
28
+ conn = sqlite3.connect(db_path)
29
+
30
+ # Load from the main table
31
+ query = "SELECT * FROM nlbse_tool_competition_data_by_issue LIMIT 10000"
32
+ df = pd.read_sql_query(query, conn)
33
+ conn.close()
34
+
35
+ print(f"Loaded {len(df)} training samples")
36
+ return df
37
+
38
+
39
+ def prepare_baseline(df, sample_size=1000, random_state=42):
40
+ """
41
+ Sample representative baseline data.
42
+
43
+ Args:
44
+ df: Training dataframe
45
+ sample_size: Number of samples for baseline
46
+ random_state: Random seed for reproducibility
47
+
48
+ Returns:
49
+ Baseline dataframe
50
+ """
51
+ # Stratified sampling if you have labels
52
+ if 'label' in df.columns:
53
+ _, baseline_df = train_test_split(
54
+ df,
55
+ test_size=sample_size,
56
+ random_state=random_state,
57
+ stratify=df['label']
58
+ )
59
+ else:
60
+ baseline_df = df.sample(n=min(sample_size, len(df)), random_state=random_state)
61
+
62
+ print(f"Sampled {len(baseline_df)} baseline samples")
63
+ return baseline_df
64
+
65
+
66
+ def extract_features(df):
67
+ """
68
+ Extract features used for drift detection.
69
+ Should match the features used by your model.
70
+ """
71
+
72
+ # Select only numeric columns, exclude labels and IDs
73
+ numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
74
+ exclude_cols = ['label', 'id', 'timestamp', 'issue_id', 'file_id', 'method_id', 'class_id']
75
+ feature_columns = [col for col in numeric_cols if col not in exclude_cols]
76
+
77
+ X = df[feature_columns].values
78
+
79
+ print(f"Extracted {X.shape[1]} numeric features from {X.shape[0]} samples")
80
+ return X
81
+
82
+
83
+ def save_baseline(baseline_data, filename="reference_data.pkl"):
84
+ """Save baseline data to disk."""
85
+ baseline_path = BASELINE_DIR / filename
86
+
87
+ with open(baseline_path, 'wb') as f:
88
+ pickle.dump(baseline_data, f)
89
+
90
+ print(f"Baseline saved to {baseline_path}")
91
+ print(f" Shape: {baseline_data.shape}")
92
+ print(f" Size: {baseline_path.stat().st_size / 1024:.2f} KB")
93
+
94
+
95
+ def main():
96
+ """Main execution."""
97
+ print("=" * 60)
98
+ print("Preparing Baseline Data for Drift Detection")
99
+ print("=" * 60)
100
+
101
+ # Load data
102
+ df = load_training_data()
103
+
104
+ # Sample baseline
105
+ baseline_df = prepare_baseline(df, sample_size=1000)
106
+
107
+ # Extract features
108
+ X_baseline = extract_features(baseline_df)
109
+
110
+ # Save
111
+ save_baseline(X_baseline)
112
+
113
+ print("\n" + "=" * 60)
114
+ print("Baseline preparation complete!")
115
+ print("=" * 60)
116
+
117
+
118
+ if __name__ == "__main__":
119
+ main()
monitoring/drift/scripts/requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ alibi-detect>=0.11.4
2
+ pandas>=2.0.0
3
+ numpy>=1.24.0
4
+ scikit-learn>=1.3.0
5
+ requests>=2.31.0
6
+ mlflow>=2.8.0
monitoring/drift/scripts/run_drift_check.py ADDED
@@ -0,0 +1,216 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Data Drift Detection using Scipy KS Test.
3
+ Detects distribution shifts between baseline and new data.
4
+ """
5
+
6
+ import pickle
7
+ import json
8
+ import requests
9
+ import numpy as np
10
+ import pandas as pd
11
+ from pathlib import Path
12
+ from datetime import datetime
13
+ from scipy.stats import ks_2samp
14
+ from typing import Dict, Tuple
15
+
16
+ # Configuration
17
+ PROJECT_ROOT = Path(__file__).parent.parent.parent.parent
18
+ BASELINE_DIR = Path(__file__).parent.parent / "baseline"
19
+ REPORTS_DIR = Path(__file__).parent.parent / "reports"
20
+ REPORTS_DIR.mkdir(parents=True, exist_ok=True)
21
+
22
+ PUSHGATEWAY_URL = "http://localhost:9091"
23
+ P_VALUE_THRESHOLD = 0.05 # Significance level
24
+
25
+
26
+ def load_baseline() -> np.ndarray:
27
+ """Load reference/baseline data."""
28
+ baseline_path = BASELINE_DIR / "reference_data.pkl"
29
+
30
+ if not baseline_path.exists():
31
+ raise FileNotFoundError(
32
+ f"Baseline data not found at {baseline_path}\n"
33
+ f"Run `python prepare_baseline.py` first!"
34
+ )
35
+
36
+ with open(baseline_path, 'rb') as f:
37
+ X_baseline = pickle.load(f)
38
+
39
+ print(f"Loaded baseline data: {X_baseline.shape}")
40
+ return X_baseline
41
+
42
+
43
+ def load_new_data() -> np.ndarray:
44
+ """
45
+ Load new/production data to check for drift.
46
+
47
+ In production, this would fetch from:
48
+ - Database
49
+ - S3 bucket
50
+ - API logs
51
+ - Data lake
52
+
53
+ For now, simulate or load from file.
54
+ """
55
+
56
+ # Option 1: Load from file
57
+ data_path = PROJECT_ROOT / "data" / "test.csv"
58
+ if data_path.exists():
59
+ df = pd.read_csv(data_path)
60
+ # Extract same features as baseline
61
+ feature_columns = [col for col in df.columns if col not in ['label', 'id', 'timestamp']]
62
+ X_new = df[feature_columns].values[:500] # Take 500 samples
63
+ print(f"Loaded new data from file: {X_new.shape}")
64
+ return X_new
65
+
66
+ # Option 2: Simulate (for testing)
67
+ print("Simulating new data (no test file found)")
68
+ X_baseline = load_baseline()
69
+ # Add slight shift to simulate drift
70
+ X_new = X_baseline[:500] + np.random.normal(0, 0.1, (500, X_baseline.shape[1]))
71
+ return X_new
72
+
73
+
74
+ def run_drift_detection(X_baseline: np.ndarray, X_new: np.ndarray) -> Dict:
75
+ """
76
+ Run Kolmogorov-Smirnov drift detection using scipy.
77
+
78
+ Args:
79
+ X_baseline: Reference data
80
+ X_new: New data to check
81
+
82
+ Returns:
83
+ Drift detection results
84
+ """
85
+ print("\n" + "=" * 60)
86
+ print("Running Drift Detection (Kolmogorov-Smirnov Test)")
87
+ print("=" * 60)
88
+
89
+ # Run KS test for each feature
90
+ p_values = []
91
+ distances = []
92
+
93
+ for i in range(X_baseline.shape[1]):
94
+ statistic, p_value = ks_2samp(X_baseline[:, i], X_new[:, i])
95
+ p_values.append(p_value)
96
+ distances.append(statistic)
97
+
98
+ # Aggregate results
99
+ min_p_value = np.min(p_values)
100
+ max_distance = np.max(distances)
101
+
102
+ # Apply Bonferroni correction for multiple testing
103
+ adjusted_threshold = P_VALUE_THRESHOLD / X_baseline.shape[1]
104
+ drift_detected = min_p_value < adjusted_threshold
105
+
106
+ # Extract results
107
+ results = {
108
+ "timestamp": datetime.now().isoformat(),
109
+ "drift_detected": int(drift_detected),
110
+ "p_value": float(min_p_value),
111
+ "threshold": adjusted_threshold,
112
+ "distance": float(max_distance),
113
+ "baseline_samples": X_baseline.shape[0],
114
+ "new_samples": X_new.shape[0],
115
+ "num_features": X_baseline.shape[1]
116
+ }
117
+
118
+ # Print results
119
+ print(f"\nResults:")
120
+ print(f" Drift Detected: {'YES' if results['drift_detected'] else 'NO'}")
121
+ print(f" P-Value: {results['p_value']:.6f} (adjusted threshold: {adjusted_threshold:.6f})")
122
+ print(f" Distance: {results['distance']:.6f}")
123
+ print(f" Baseline: {X_baseline.shape[0]} samples")
124
+ print(f" New Data: {X_new.shape[0]} samples")
125
+
126
+ return results
127
+
128
+
129
+ def save_report(results: Dict):
130
+ """Save drift detection report to file."""
131
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
132
+ report_path = REPORTS_DIR / f"drift_report_{timestamp}.json"
133
+
134
+ with open(report_path, 'w') as f:
135
+ json.dump(results, f, indent=2)
136
+
137
+ print(f"\nReport saved to: {report_path}")
138
+
139
+
140
+ def push_to_prometheus(results: Dict):
141
+ """
142
+ Push drift metrics to Prometheus via Pushgateway.
143
+
144
+ This allows Prometheus to scrape short-lived job metrics.
145
+ """
146
+ metrics = f"""# TYPE drift_detected gauge
147
+ # HELP drift_detected Whether data drift was detected (1=yes, 0=no)
148
+ drift_detected {results['drift_detected']}
149
+
150
+ # TYPE drift_p_value gauge
151
+ # HELP drift_p_value P-value from drift detection test
152
+ drift_p_value {results['p_value']}
153
+
154
+ # TYPE drift_distance gauge
155
+ # HELP drift_distance Statistical distance between distributions
156
+ drift_distance {results['distance']}
157
+
158
+ # TYPE drift_check_timestamp gauge
159
+ # HELP drift_check_timestamp Unix timestamp of last drift check
160
+ drift_check_timestamp {datetime.now().timestamp()}
161
+ """
162
+
163
+ try:
164
+ response = requests.post(
165
+ f"{PUSHGATEWAY_URL}/metrics/job/drift_detection/instance/hopcroft",
166
+ data=metrics,
167
+ headers={'Content-Type': 'text/plain'}
168
+ )
169
+ response.raise_for_status()
170
+ print(f"Metrics pushed to Pushgateway at {PUSHGATEWAY_URL}")
171
+ except requests.exceptions.RequestException as e:
172
+ print(f"Failed to push to Pushgateway: {e}")
173
+ print(f" Make sure Pushgateway is running: docker compose ps pushgateway")
174
+
175
+
176
+ def main():
177
+ """Main execution."""
178
+ print("\n" + "=" * 60)
179
+ print("Hopcroft Data Drift Detection")
180
+ print("=" * 60)
181
+
182
+ try:
183
+ # Load data
184
+ X_baseline = load_baseline()
185
+ X_new = load_new_data()
186
+
187
+ # Run drift detection
188
+ results = run_drift_detection(X_baseline, X_new)
189
+
190
+ # Save report
191
+ save_report(results)
192
+
193
+ # Push to Prometheus
194
+ push_to_prometheus(results)
195
+
196
+ print("\n" + "=" * 60)
197
+ print("Drift Detection Complete!")
198
+ print("=" * 60)
199
+
200
+ if results['drift_detected']:
201
+ print("\nWARNING: Data drift detected!")
202
+ print(f" P-value: {results['p_value']:.6f} < {P_VALUE_THRESHOLD}")
203
+ return 1
204
+ else:
205
+ print("\nNo significant drift detected")
206
+ return 0
207
+
208
+ except Exception as e:
209
+ print(f"\nError: {e}")
210
+ import traceback
211
+ traceback.print_exc()
212
+ return 1
213
+
214
+
215
+ if __name__ == "__main__":
216
+ exit(main())
monitoring/grafana/dashboards/hopcroft_dashboard.json ADDED
@@ -0,0 +1,358 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "annotations": {
3
+ "list": [
4
+ {
5
+ "builtIn": 1,
6
+ "datasource": "-- Grafana --",
7
+ "enable": true,
8
+ "hide": true,
9
+ "iconColor": "rgba(0, 211, 255, 1)",
10
+ "name": "Annotations & Alerts",
11
+ "type": "dashboard"
12
+ }
13
+ ]
14
+ },
15
+ "editable": true,
16
+ "gnetId": null,
17
+ "graphTooltip": 1,
18
+ "id": null,
19
+ "links": [],
20
+ "panels": [
21
+ {
22
+ "datasource": "Prometheus",
23
+ "fieldConfig": {
24
+ "defaults": {
25
+ "color": {
26
+ "mode": "thresholds"
27
+ },
28
+ "mappings": [],
29
+ "thresholds": {
30
+ "mode": "absolute",
31
+ "steps": [
32
+ {
33
+ "color": "green",
34
+ "value": null
35
+ },
36
+ {
37
+ "color": "red",
38
+ "value": 80
39
+ }
40
+ ]
41
+ },
42
+ "unit": "reqps"
43
+ }
44
+ },
45
+ "gridPos": {
46
+ "h": 8,
47
+ "w": 6,
48
+ "x": 0,
49
+ "y": 0
50
+ },
51
+ "id": 1,
52
+ "options": {
53
+ "orientation": "auto",
54
+ "reduceOptions": {
55
+ "calcs": ["lastNotNull"],
56
+ "fields": "",
57
+ "values": false
58
+ },
59
+ "showThresholdLabels": false,
60
+ "showThresholdMarkers": true
61
+ },
62
+ "pluginVersion": "9.0.0",
63
+ "targets": [
64
+ {
65
+ "expr": "rate(fastapi_requests_total[1m])",
66
+ "refId": "A"
67
+ }
68
+ ],
69
+ "title": "Request Rate",
70
+ "type": "gauge",
71
+ "description": "Number of requests per second handled by the API"
72
+ },
73
+ {
74
+ "datasource": "Prometheus",
75
+ "fieldConfig": {
76
+ "defaults": {
77
+ "color": {
78
+ "mode": "palette-classic"
79
+ },
80
+ "custom": {
81
+ "axisLabel": "",
82
+ "axisPlacement": "auto",
83
+ "barAlignment": 0,
84
+ "drawStyle": "line",
85
+ "fillOpacity": 10,
86
+ "gradientMode": "none",
87
+ "hideFrom": {
88
+ "tooltip": false,
89
+ "viz": false,
90
+ "legend": false
91
+ },
92
+ "lineInterpolation": "linear",
93
+ "lineWidth": 1,
94
+ "pointSize": 5,
95
+ "scaleDistribution": {
96
+ "type": "linear"
97
+ },
98
+ "showPoints": "never",
99
+ "spanNulls": true
100
+ },
101
+ "mappings": [],
102
+ "thresholds": {
103
+ "mode": "absolute",
104
+ "steps": [
105
+ {
106
+ "color": "green",
107
+ "value": null
108
+ }
109
+ ]
110
+ },
111
+ "unit": "ms"
112
+ }
113
+ },
114
+ "gridPos": {
115
+ "h": 8,
116
+ "w": 18,
117
+ "x": 6,
118
+ "y": 0
119
+ },
120
+ "id": 2,
121
+ "options": {
122
+ "legend": {
123
+ "calcs": ["mean", "max"],
124
+ "displayMode": "table",
125
+ "placement": "right"
126
+ },
127
+ "tooltip": {
128
+ "mode": "multi"
129
+ }
130
+ },
131
+ "pluginVersion": "9.0.0",
132
+ "targets": [
133
+ {
134
+ "expr": "histogram_quantile(0.95, rate(fastapi_request_duration_seconds_bucket[5m])) * 1000",
135
+ "legendFormat": "p95",
136
+ "refId": "A"
137
+ },
138
+ {
139
+ "expr": "histogram_quantile(0.50, rate(fastapi_request_duration_seconds_bucket[5m])) * 1000",
140
+ "legendFormat": "p50 (median)",
141
+ "refId": "B"
142
+ }
143
+ ],
144
+ "title": "Request Latency (p50, p95)",
145
+ "type": "timeseries",
146
+ "description": "API response time percentiles over time"
147
+ },
148
+ {
149
+ "datasource": "Prometheus",
150
+ "fieldConfig": {
151
+ "defaults": {
152
+ "color": {
153
+ "mode": "thresholds"
154
+ },
155
+ "mappings": [
156
+ {
157
+ "options": {
158
+ "0": {
159
+ "color": "green",
160
+ "index": 1,
161
+ "text": "No Drift"
162
+ },
163
+ "1": {
164
+ "color": "red",
165
+ "index": 0,
166
+ "text": "Drift Detected"
167
+ }
168
+ },
169
+ "type": "value"
170
+ }
171
+ ],
172
+ "thresholds": {
173
+ "mode": "absolute",
174
+ "steps": [
175
+ {
176
+ "color": "green",
177
+ "value": null
178
+ }
179
+ ]
180
+ }
181
+ }
182
+ },
183
+ "gridPos": {
184
+ "h": 6,
185
+ "w": 6,
186
+ "x": 0,
187
+ "y": 8
188
+ },
189
+ "id": 3,
190
+ "options": {
191
+ "orientation": "auto",
192
+ "reduceOptions": {
193
+ "calcs": ["lastNotNull"],
194
+ "fields": "",
195
+ "values": false
196
+ },
197
+ "showThresholdLabels": false,
198
+ "showThresholdMarkers": true,
199
+ "text": {}
200
+ },
201
+ "pluginVersion": "9.0.0",
202
+ "targets": [
203
+ {
204
+ "expr": "drift_detected",
205
+ "refId": "A"
206
+ }
207
+ ],
208
+ "title": "Data Drift Status",
209
+ "type": "stat",
210
+ "description": "Current data drift detection status (1 = drift detected, 0 = no drift)"
211
+ },
212
+ {
213
+ "datasource": "Prometheus",
214
+ "fieldConfig": {
215
+ "defaults": {
216
+ "color": {
217
+ "mode": "thresholds"
218
+ },
219
+ "decimals": 4,
220
+ "mappings": [],
221
+ "thresholds": {
222
+ "mode": "absolute",
223
+ "steps": [
224
+ {
225
+ "color": "red",
226
+ "value": null
227
+ },
228
+ {
229
+ "color": "yellow",
230
+ "value": 0.01
231
+ },
232
+ {
233
+ "color": "green",
234
+ "value": 0.05
235
+ }
236
+ ]
237
+ },
238
+ "unit": "short"
239
+ }
240
+ },
241
+ "gridPos": {
242
+ "h": 6,
243
+ "w": 6,
244
+ "x": 6,
245
+ "y": 8
246
+ },
247
+ "id": 4,
248
+ "options": {
249
+ "orientation": "auto",
250
+ "reduceOptions": {
251
+ "calcs": ["lastNotNull"],
252
+ "fields": "",
253
+ "values": false
254
+ },
255
+ "showThresholdLabels": false,
256
+ "showThresholdMarkers": true,
257
+ "text": {}
258
+ },
259
+ "pluginVersion": "9.0.0",
260
+ "targets": [
261
+ {
262
+ "expr": "drift_p_value",
263
+ "refId": "A"
264
+ }
265
+ ],
266
+ "title": "Drift P-Value",
267
+ "type": "stat",
268
+ "description": "Statistical significance of detected drift (lower = more significant)"
269
+ },
270
+ {
271
+ "datasource": "Prometheus",
272
+ "fieldConfig": {
273
+ "defaults": {
274
+ "color": {
275
+ "mode": "palette-classic"
276
+ },
277
+ "custom": {
278
+ "axisLabel": "",
279
+ "axisPlacement": "auto",
280
+ "barAlignment": 0,
281
+ "drawStyle": "line",
282
+ "fillOpacity": 10,
283
+ "gradientMode": "none",
284
+ "hideFrom": {
285
+ "tooltip": false,
286
+ "viz": false,
287
+ "legend": false
288
+ },
289
+ "lineInterpolation": "linear",
290
+ "lineWidth": 1,
291
+ "pointSize": 5,
292
+ "scaleDistribution": {
293
+ "type": "linear"
294
+ },
295
+ "showPoints": "auto",
296
+ "spanNulls": false
297
+ },
298
+ "mappings": [],
299
+ "thresholds": {
300
+ "mode": "absolute",
301
+ "steps": [
302
+ {
303
+ "color": "green",
304
+ "value": null
305
+ }
306
+ ]
307
+ },
308
+ "unit": "short"
309
+ }
310
+ },
311
+ "gridPos": {
312
+ "h": 6,
313
+ "w": 12,
314
+ "x": 12,
315
+ "y": 8
316
+ },
317
+ "id": 5,
318
+ "options": {
319
+ "legend": {
320
+ "calcs": ["mean", "lastNotNull"],
321
+ "displayMode": "table",
322
+ "placement": "right"
323
+ },
324
+ "tooltip": {
325
+ "mode": "multi"
326
+ }
327
+ },
328
+ "pluginVersion": "9.0.0",
329
+ "targets": [
330
+ {
331
+ "expr": "drift_distance",
332
+ "legendFormat": "Distance",
333
+ "refId": "A"
334
+ }
335
+ ],
336
+ "title": "Drift Distance Over Time",
337
+ "type": "timeseries",
338
+ "description": "Statistical distance between baseline and current data distribution"
339
+ }
340
+ ],
341
+ "refresh": "10s",
342
+ "schemaVersion": 36,
343
+ "style": "dark",
344
+ "tags": ["hopcroft", "ml", "monitoring"],
345
+ "templating": {
346
+ "list": []
347
+ },
348
+ "time": {
349
+ "from": "now-1h",
350
+ "to": "now"
351
+ },
352
+ "timepicker": {},
353
+ "timezone": "",
354
+ "title": "Hopcroft ML Model Monitoring",
355
+ "uid": "hopcroft-ml-dashboard",
356
+ "version": 1,
357
+ "weekStart": ""
358
+ }
monitoring/grafana/provisioning/dashboards/dashboard.yml ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ apiVersion: 1
2
+
3
+ providers:
4
+ - name: 'Hopcroft Dashboards'
5
+ orgId: 1
6
+ folder: ''
7
+ type: file
8
+ disableDeletion: false
9
+ updateIntervalSeconds: 10
10
+ allowUiUpdates: true
11
+ options:
12
+ path: /var/lib/grafana/dashboards
13
+ foldersFromFilesStructure: true
monitoring/grafana/provisioning/dashboards/hopcroft_dashboard.json ADDED
@@ -0,0 +1,358 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "annotations": {
3
+ "list": [
4
+ {
5
+ "builtIn": 1,
6
+ "datasource": "-- Grafana --",
7
+ "enable": true,
8
+ "hide": true,
9
+ "iconColor": "rgba(0, 211, 255, 1)",
10
+ "name": "Annotations & Alerts",
11
+ "type": "dashboard"
12
+ }
13
+ ]
14
+ },
15
+ "editable": true,
16
+ "gnetId": null,
17
+ "graphTooltip": 1,
18
+ "id": null,
19
+ "links": [],
20
+ "panels": [
21
+ {
22
+ "datasource": "Prometheus",
23
+ "fieldConfig": {
24
+ "defaults": {
25
+ "color": {
26
+ "mode": "thresholds"
27
+ },
28
+ "mappings": [],
29
+ "thresholds": {
30
+ "mode": "absolute",
31
+ "steps": [
32
+ {
33
+ "color": "green",
34
+ "value": null
35
+ },
36
+ {
37
+ "color": "red",
38
+ "value": 80
39
+ }
40
+ ]
41
+ },
42
+ "unit": "reqps"
43
+ }
44
+ },
45
+ "gridPos": {
46
+ "h": 8,
47
+ "w": 6,
48
+ "x": 0,
49
+ "y": 0
50
+ },
51
+ "id": 1,
52
+ "options": {
53
+ "orientation": "auto",
54
+ "reduceOptions": {
55
+ "calcs": ["lastNotNull"],
56
+ "fields": "",
57
+ "values": false
58
+ },
59
+ "showThresholdLabels": false,
60
+ "showThresholdMarkers": true
61
+ },
62
+ "pluginVersion": "9.0.0",
63
+ "targets": [
64
+ {
65
+ "expr": "rate(fastapi_requests_total[1m])",
66
+ "refId": "A"
67
+ }
68
+ ],
69
+ "title": "Request Rate",
70
+ "type": "gauge",
71
+ "description": "Number of requests per second handled by the API"
72
+ },
73
+ {
74
+ "datasource": "Prometheus",
75
+ "fieldConfig": {
76
+ "defaults": {
77
+ "color": {
78
+ "mode": "palette-classic"
79
+ },
80
+ "custom": {
81
+ "axisLabel": "",
82
+ "axisPlacement": "auto",
83
+ "barAlignment": 0,
84
+ "drawStyle": "line",
85
+ "fillOpacity": 10,
86
+ "gradientMode": "none",
87
+ "hideFrom": {
88
+ "tooltip": false,
89
+ "viz": false,
90
+ "legend": false
91
+ },
92
+ "lineInterpolation": "linear",
93
+ "lineWidth": 1,
94
+ "pointSize": 5,
95
+ "scaleDistribution": {
96
+ "type": "linear"
97
+ },
98
+ "showPoints": "never",
99
+ "spanNulls": true
100
+ },
101
+ "mappings": [],
102
+ "thresholds": {
103
+ "mode": "absolute",
104
+ "steps": [
105
+ {
106
+ "color": "green",
107
+ "value": null
108
+ }
109
+ ]
110
+ },
111
+ "unit": "ms"
112
+ }
113
+ },
114
+ "gridPos": {
115
+ "h": 8,
116
+ "w": 18,
117
+ "x": 6,
118
+ "y": 0
119
+ },
120
+ "id": 2,
121
+ "options": {
122
+ "legend": {
123
+ "calcs": ["mean", "max"],
124
+ "displayMode": "table",
125
+ "placement": "right"
126
+ },
127
+ "tooltip": {
128
+ "mode": "multi"
129
+ }
130
+ },
131
+ "pluginVersion": "9.0.0",
132
+ "targets": [
133
+ {
134
+ "expr": "histogram_quantile(0.95, rate(fastapi_request_duration_seconds_bucket[5m])) * 1000",
135
+ "legendFormat": "p95",
136
+ "refId": "A"
137
+ },
138
+ {
139
+ "expr": "histogram_quantile(0.50, rate(fastapi_request_duration_seconds_bucket[5m])) * 1000",
140
+ "legendFormat": "p50 (median)",
141
+ "refId": "B"
142
+ }
143
+ ],
144
+ "title": "Request Latency (p50, p95)",
145
+ "type": "timeseries",
146
+ "description": "API response time percentiles over time"
147
+ },
148
+ {
149
+ "datasource": "Prometheus",
150
+ "fieldConfig": {
151
+ "defaults": {
152
+ "color": {
153
+ "mode": "thresholds"
154
+ },
155
+ "mappings": [
156
+ {
157
+ "options": {
158
+ "0": {
159
+ "color": "green",
160
+ "index": 1,
161
+ "text": "No Drift"
162
+ },
163
+ "1": {
164
+ "color": "red",
165
+ "index": 0,
166
+ "text": "Drift Detected"
167
+ }
168
+ },
169
+ "type": "value"
170
+ }
171
+ ],
172
+ "thresholds": {
173
+ "mode": "absolute",
174
+ "steps": [
175
+ {
176
+ "color": "green",
177
+ "value": null
178
+ }
179
+ ]
180
+ }
181
+ }
182
+ },
183
+ "gridPos": {
184
+ "h": 6,
185
+ "w": 6,
186
+ "x": 0,
187
+ "y": 8
188
+ },
189
+ "id": 3,
190
+ "options": {
191
+ "orientation": "auto",
192
+ "reduceOptions": {
193
+ "calcs": ["lastNotNull"],
194
+ "fields": "",
195
+ "values": false
196
+ },
197
+ "showThresholdLabels": false,
198
+ "showThresholdMarkers": true,
199
+ "text": {}
200
+ },
201
+ "pluginVersion": "9.0.0",
202
+ "targets": [
203
+ {
204
+ "expr": "drift_detected",
205
+ "refId": "A"
206
+ }
207
+ ],
208
+ "title": "Data Drift Status",
209
+ "type": "stat",
210
+ "description": "Current data drift detection status (1 = drift detected, 0 = no drift)"
211
+ },
212
+ {
213
+ "datasource": "Prometheus",
214
+ "fieldConfig": {
215
+ "defaults": {
216
+ "color": {
217
+ "mode": "thresholds"
218
+ },
219
+ "decimals": 4,
220
+ "mappings": [],
221
+ "thresholds": {
222
+ "mode": "absolute",
223
+ "steps": [
224
+ {
225
+ "color": "red",
226
+ "value": null
227
+ },
228
+ {
229
+ "color": "yellow",
230
+ "value": 0.01
231
+ },
232
+ {
233
+ "color": "green",
234
+ "value": 0.05
235
+ }
236
+ ]
237
+ },
238
+ "unit": "short"
239
+ }
240
+ },
241
+ "gridPos": {
242
+ "h": 6,
243
+ "w": 6,
244
+ "x": 6,
245
+ "y": 8
246
+ },
247
+ "id": 4,
248
+ "options": {
249
+ "orientation": "auto",
250
+ "reduceOptions": {
251
+ "calcs": ["lastNotNull"],
252
+ "fields": "",
253
+ "values": false
254
+ },
255
+ "showThresholdLabels": false,
256
+ "showThresholdMarkers": true,
257
+ "text": {}
258
+ },
259
+ "pluginVersion": "9.0.0",
260
+ "targets": [
261
+ {
262
+ "expr": "drift_p_value",
263
+ "refId": "A"
264
+ }
265
+ ],
266
+ "title": "Drift P-Value",
267
+ "type": "stat",
268
+ "description": "Statistical significance of detected drift (lower = more significant)"
269
+ },
270
+ {
271
+ "datasource": "Prometheus",
272
+ "fieldConfig": {
273
+ "defaults": {
274
+ "color": {
275
+ "mode": "palette-classic"
276
+ },
277
+ "custom": {
278
+ "axisLabel": "",
279
+ "axisPlacement": "auto",
280
+ "barAlignment": 0,
281
+ "drawStyle": "line",
282
+ "fillOpacity": 10,
283
+ "gradientMode": "none",
284
+ "hideFrom": {
285
+ "tooltip": false,
286
+ "viz": false,
287
+ "legend": false
288
+ },
289
+ "lineInterpolation": "linear",
290
+ "lineWidth": 1,
291
+ "pointSize": 5,
292
+ "scaleDistribution": {
293
+ "type": "linear"
294
+ },
295
+ "showPoints": "auto",
296
+ "spanNulls": false
297
+ },
298
+ "mappings": [],
299
+ "thresholds": {
300
+ "mode": "absolute",
301
+ "steps": [
302
+ {
303
+ "color": "green",
304
+ "value": null
305
+ }
306
+ ]
307
+ },
308
+ "unit": "short"
309
+ }
310
+ },
311
+ "gridPos": {
312
+ "h": 6,
313
+ "w": 12,
314
+ "x": 12,
315
+ "y": 8
316
+ },
317
+ "id": 5,
318
+ "options": {
319
+ "legend": {
320
+ "calcs": ["mean", "lastNotNull"],
321
+ "displayMode": "table",
322
+ "placement": "right"
323
+ },
324
+ "tooltip": {
325
+ "mode": "multi"
326
+ }
327
+ },
328
+ "pluginVersion": "9.0.0",
329
+ "targets": [
330
+ {
331
+ "expr": "drift_distance",
332
+ "legendFormat": "Distance",
333
+ "refId": "A"
334
+ }
335
+ ],
336
+ "title": "Drift Distance Over Time",
337
+ "type": "timeseries",
338
+ "description": "Statistical distance between baseline and current data distribution"
339
+ }
340
+ ],
341
+ "refresh": "10s",
342
+ "schemaVersion": 36,
343
+ "style": "dark",
344
+ "tags": ["hopcroft", "ml", "monitoring"],
345
+ "templating": {
346
+ "list": []
347
+ },
348
+ "time": {
349
+ "from": "now-1h",
350
+ "to": "now"
351
+ },
352
+ "timepicker": {},
353
+ "timezone": "",
354
+ "title": "Hopcroft ML Model Monitoring",
355
+ "uid": "hopcroft-ml-dashboard",
356
+ "version": 1,
357
+ "weekStart": ""
358
+ }
monitoring/grafana/provisioning/datasources/prometheus.yml ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ apiVersion: 1
2
+
3
+ datasources:
4
+ - name: Prometheus
5
+ type: prometheus
6
+ access: proxy
7
+ uid: prometheus
8
+ orgId: 1
9
+ url: http://prometheus:9090
10
+ isDefault: true
11
+ editable: true
12
+ jsonData:
13
+ httpMethod: POST
14
+ timeInterval: "15s"
monitoring/locust/README.md ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Locust Load Testing - Skill Classification API
2
+
3
+ Questa directory contiene gli script per il load testing della Skill Classification API utilizzando [Locust](https://locust.io/).
4
+
5
+ ## Prerequisiti
6
+
7
+ Assicurati di avere Python installato e installa Locust:
8
+
9
+ ```bash
10
+ pip install locust
11
+ ```
12
+
13
+ ## Avvio del Test
14
+
15
+ ### 1. Avvia Locust
16
+
17
+ Dalla directory `monitoring/locust/`, esegui:
18
+
19
+ ```bash
20
+ locust -f locustfile.py
21
+ ```
22
+
23
+ ### 2. Accedi alla Web UI
24
+
25
+ Apri il browser e vai a: **http://localhost:8089**
26
+
27
+ ### 3. Configura il Test
28
+
29
+ Nella Web UI, configura i seguenti parametri:
30
+
31
+ | Parametro | Descrizione | Valore Consigliato |
32
+ |-----------|-------------|-------------------|
33
+ | **Host** | URL dell'API da testare | `http://localhost:8080` (Docker) o `http://localhost:8000` (locale) |
34
+ | **Number of users** | Numero totale di utenti simulati | 10-100 |
35
+ | **Spawn rate** | Utenti da creare al secondo | 1-10 |
36
+
37
+ ### 4. Avvia il Test
38
+
39
+ Clicca su **"Start swarming"** per avviare il test di carico.
40
+
41
+ ## Task Implementati
42
+
43
+ Lo script simula il comportamento di utenti reali con i seguenti task:
44
+
45
+ | Task | Endpoint | Metodo | Peso | Descrizione |
46
+ |------|----------|--------|------|-------------|
47
+ | **Predizione Singola** | `/predict` | POST | 3 | Classifica un singolo issue text. Task principale, eseguito più frequentemente. |
48
+ | **Predizione Batch** | `/predict/batch` | POST | 1 | Classifica multipli issue text in una singola richiesta. |
49
+ | **Monitoraggio e Storia** | `/predictions`, `/health` | GET | 1 | Visualizza la cronologia delle predizioni e verifica lo stato del sistema. |
50
+
51
+ ### Distribuzione dei Pesi
52
+
53
+ Con i pesi configurati (3:1:1), la distribuzione approssimativa delle richieste è:
54
+ - **60%** - Predizione Singola
55
+ - **20%** - Predizione Batch
56
+ - **20%** - Monitoraggio e Storia
57
+
58
+ ### Tempo di Attesa
59
+
60
+ Ogni utente attende tra **1 e 5 secondi** tra un task e l'altro per simulare un comportamento realistico.
61
+
62
+ ## Metriche Monitorate
63
+
64
+ Durante il test, Locust fornisce le seguenti metriche in tempo reale:
65
+
66
+ - **RPS (Requests Per Second)**: Numero di richieste al secondo
67
+ - **Response Time**: Tempo medio/mediano/percentili di risposta
68
+ - **Failure Rate**: Percentuale di richieste fallite
69
+ - **Active Users**: Numero di utenti attualmente attivi
70
+
71
+ ## Opzioni Avanzate
72
+
73
+ ### Esecuzione Headless (senza UI)
74
+
75
+ ```bash
76
+ locust -f locustfile.py --headless -u 50 -r 5 -t 5m --host http://localhost:8000
77
+ ```
78
+
79
+ | Opzione | Descrizione |
80
+ |---------|-------------|
81
+ | `--headless` | Esegui senza Web UI |
82
+ | `-u 50` | 50 utenti simulati |
83
+ | `-r 5` | 5 utenti creati al secondo |
84
+ | `-t 5m` | Durata del test: 5 minuti |
85
+ | `--host` | URL dell'API |
86
+
87
+ ### Esportazione Risultati
88
+
89
+ ```bash
90
+ locust -f locustfile.py --headless -u 50 -r 5 -t 5m --host http://localhost:8000 --csv=results
91
+ ```
92
+
93
+ Questo creerà file CSV con i risultati del test.
94
+
95
+ ## Struttura File
96
+
97
+ ```
98
+ monitoring/locust/
99
+ ├── locustfile.py # Script principale di load testing
100
+ └── README.md # Questa documentazione
101
+ ```
monitoring/locust/locustfile.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Locust Load Testing Script for Skill Classification API
3
+
4
+ This script defines user behavior for load testing the prediction and monitoring
5
+ endpoints of the Skill Classification API.
6
+ """
7
+
8
+ from locust import HttpUser, task, between
9
+
10
+
11
+ class SkillClassificationUser(HttpUser):
12
+ """
13
+ Simulated user for load testing the Skill Classification API.
14
+
15
+ This user performs the following actions:
16
+ - Single predictions (most frequent)
17
+ - Batch predictions
18
+ - Monitoring and health checks
19
+ """
20
+
21
+ # Default host for the API (can be overridden via --host flag or Web UI)
22
+ # Use http://localhost:8080 for Docker or http://localhost:8000 for local dev
23
+ host = "http://localhost:8080"
24
+
25
+ # Wait between 1 and 5 seconds between tasks to simulate real user behavior
26
+ wait_time = between(1, 5)
27
+
28
+ @task(3)
29
+ def predict_single(self):
30
+ """
31
+ Task 1: Single Prediction (Weight: 3)
32
+
33
+ Performs a POST request to /predict with a single issue text.
34
+ This is the main task and executes more frequently due to higher weight.
35
+ """
36
+ payload = {
37
+ "issue_text": "Fix authentication bug in login module"
38
+ }
39
+
40
+ with self.client.post(
41
+ "/predict",
42
+ json=payload,
43
+ catch_response=True
44
+ ) as response:
45
+ if response.status_code == 201:
46
+ response.success()
47
+ else:
48
+ response.failure(f"Prediction failed with status {response.status_code}")
49
+
50
+ @task(1)
51
+ def predict_batch(self):
52
+ """
53
+ Task 2: Batch Prediction (Weight: 1)
54
+
55
+ Performs a POST request to /predict/batch with multiple issue texts.
56
+ """
57
+ payload = {
58
+ "issues": [
59
+ {"issue_text": "Test 1"},
60
+ {"issue_text": "Test 2"}
61
+ ]
62
+ }
63
+
64
+ with self.client.post(
65
+ "/predict/batch",
66
+ json=payload,
67
+ catch_response=True
68
+ ) as response:
69
+ if response.status_code == 200:
70
+ response.success()
71
+ else:
72
+ response.failure(f"Batch prediction failed with status {response.status_code}")
73
+
74
+ @task(1)
75
+ def monitoring_and_history(self):
76
+ """
77
+ Task 3: Monitoring and History (Weight: 1)
78
+
79
+ Performs GET requests to check prediction history and system health.
80
+ """
81
+ # Check prediction history
82
+ with self.client.get(
83
+ "/predictions",
84
+ catch_response=True
85
+ ) as response:
86
+ if 200 <= response.status_code < 300:
87
+ response.success()
88
+ else:
89
+ response.failure(f"Predictions history failed with status {response.status_code}")
90
+
91
+ # Check system health
92
+ with self.client.get(
93
+ "/health",
94
+ catch_response=True
95
+ ) as response:
96
+ if 200 <= response.status_code < 300:
97
+ response.success()
98
+ else:
99
+ response.failure(f"Health check failed with status {response.status_code}")
monitoring/prometheus/alert_rules.yml ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ groups:
2
+ - name: hopcroft_alerts
3
+ rules:
4
+ - alert: ServiceDown
5
+ expr: up == 0
6
+ for: 1m
7
+ labels:
8
+ severity: critical
9
+ annotations:
10
+ summary: "Service {{ $labels.instance }} is down"
11
+ description: "The job {{ $labels.job }} has been down for more than 1 minute."
12
+
13
+ - alert: HighErrorRate
14
+ expr: |
15
+ sum by (instance) (rate(hopcroft_requests_total{http_status=~"5.."}[5m]))
16
+ /
17
+ sum by (instance) (rate(hopcroft_requests_total[5m])) > 0.1
18
+ for: 5m
19
+ labels:
20
+ severity: warning
21
+ annotations:
22
+ summary: "High error rate on {{ $labels.instance }}"
23
+ description: "Error rate is above 10% for the last 5 minutes (current value: {{ $value | printf \"%.2f\" }})."
24
+
25
+ - alert: SlowRequests
26
+ expr: histogram_quantile(0.95, sum by (le, endpoint) (rate(hopcroft_request_duration_seconds_bucket[5m]))) > 2
27
+ for: 5m
28
+ labels:
29
+ severity: warning
30
+ annotations:
31
+ summary: "Slow requests on {{ $labels.endpoint }}"
32
+ description: "95th percentile of request latency is above 2s (current value: {{ $value | printf \"%.2f\" }}s)."
monitoring/prometheus/prometheus.yml ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ global:
2
+ scrape_interval: 15s
3
+ evaluation_interval: 15s
4
+ external_labels:
5
+ monitor: 'hopcroft-monitor'
6
+ environment: 'development'
7
+
8
+ rule_files:
9
+ - "alert_rules.yml"
10
+
11
+ alerting:
12
+ alertmanagers:
13
+ - static_configs:
14
+ - targets:
15
+ - 'alertmanager:9093'
16
+
17
+ scrape_configs:
18
+ - job_name: 'hopcroft-api'
19
+ metrics_path: '/metrics'
20
+ static_configs:
21
+ - targets: ['hopcroft-api:8080']
22
+ scrape_interval: 10s
23
+
24
+ - job_name: 'prometheus'
25
+ static_configs:
26
+ - targets: ['localhost:9090']
27
+
28
+ - job_name: 'pushgateway'
29
+ honor_labels: true
30
+ static_configs:
31
+ - targets: ['pushgateway:9091']
32
+ scrape_interval: 30s
monitoring/screenshots/incident acknowlege mail.png ADDED

Git LFS Details

  • SHA256: 0447e51deb21dcea0b8d9b1662f6c550324d905515077c516f6cb8a7cdd17e92
  • Pointer size: 131 Bytes
  • Size of remote file: 103 kB
monitoring/screenshots/incident acknowlege.png ADDED

Git LFS Details

  • SHA256: 8444eb1725d6ce3e04718b02f4f6bf567d7df069f5b8d9b38266f9cfea2ccb45
  • Pointer size: 131 Bytes
  • Size of remote file: 201 kB
monitoring/screenshots/incident mail.png ADDED

Git LFS Details

  • SHA256: dc754c8d812d966a9ea1cb45611402a524e2ec3326fa6fbbb5d9db6d9d299e92
  • Pointer size: 131 Bytes
  • Size of remote file: 108 kB
monitoring/screenshots/incident resolved mail.png ADDED

Git LFS Details

  • SHA256: c55ee1a6ecf074e41ef073e27c04f3061008176a5fc5c7f9346e8e6ba078a645
  • Pointer size: 131 Bytes
  • Size of remote file: 110 kB
monitoring/screenshots/incident resolved.png ADDED

Git LFS Details

  • SHA256: 8fa04b3e42bb88ccd3a321352b75ada58eb640079d178cf7331105820ffdbbcf
  • Pointer size: 131 Bytes
  • Size of remote file: 208 kB
monitoring/screenshots/incident.png ADDED

Git LFS Details

  • SHA256: 6c98d1b7b44ba732b02be6a722bd6b0baa6e66f8982a17137a453a2088edc9d3
  • Pointer size: 131 Bytes
  • Size of remote file: 193 kB
monitoring/screenshots/monitors.png ADDED

Git LFS Details

  • SHA256: 8130d58c03ba6d02b42d527ec6dd9f7cf14f5116dacae6f6e713cb14c93e8a36
  • Pointer size: 130 Bytes
  • Size of remote file: 71.8 kB
nginx.conf ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ worker_processes 1;
2
+ pid /tmp/nginx.pid;
3
+ error_log stderr info; # Log to stderr to see errors in HF Space Logs
4
+
5
+ events {
6
+ worker_connections 1024;
7
+ }
8
+
9
+ http {
10
+ include /etc/nginx/mime.types;
11
+ default_type application/octet-stream;
12
+
13
+ # HF Space runs as non-root, use /tmp for everything
14
+ access_log /dev/stdout;
15
+ client_body_temp_path /tmp/client_temp;
16
+ proxy_temp_path /tmp/proxy_temp;
17
+ fastcgi_temp_path /tmp/fastcgi_temp;
18
+ uwsgi_temp_path /tmp/uwsgi_temp;
19
+ scgi_temp_path /tmp/scgi_temp;
20
+
21
+ sendfile on;
22
+ keepalive_timeout 65;
23
+
24
+ upstream streamlit {
25
+ server 127.0.0.1:8501;
26
+ }
27
+
28
+ upstream fastapi {
29
+ server 127.0.0.1:8000;
30
+ }
31
+
32
+ server {
33
+ listen 7860;
34
+ server_name localhost;
35
+
36
+ # Health endpoint for HF readiness check
37
+ location /health {
38
+ proxy_pass http://fastapi/health;
39
+ proxy_set_header Host $host;
40
+ }
41
+
42
+ # FastAPI Documentation
43
+ location /docs {
44
+ proxy_pass http://fastapi/docs;
45
+ proxy_set_header Host $host;
46
+ proxy_set_header X-Real-IP $remote_addr;
47
+ proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
48
+ proxy_set_header X-Forwarded-Proto $scheme;
49
+ }
50
+
51
+ location /redoc {
52
+ proxy_pass http://fastapi/redoc;
53
+ proxy_set_header Host $host;
54
+ }
55
+
56
+ location /openapi.json {
57
+ proxy_pass http://fastapi/openapi.json;
58
+ proxy_set_header Host $host;
59
+ }
60
+
61
+ # FastAPI API Endpoints
62
+ location /predict {
63
+ proxy_pass http://fastapi/predict;
64
+ proxy_set_header Host $host;
65
+ }
66
+
67
+ location /predictions {
68
+ proxy_pass http://fastapi/predictions;
69
+ proxy_set_header Host $host;
70
+ }
71
+
72
+ # Streamlit (Catch-all)
73
+ location / {
74
+ proxy_pass http://streamlit;
75
+ proxy_set_header Host $host;
76
+ proxy_set_header X-Real-IP $remote_addr;
77
+ proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
78
+ proxy_set_header X-Forwarded-Proto $scheme;
79
+ proxy_set_header X-Forwarded-Host $host;
80
+
81
+ # WebSocket support for Streamlit
82
+ proxy_http_version 1.1;
83
+ proxy_set_header Upgrade $http_upgrade;
84
+ proxy_set_header Connection "upgrade";
85
+ proxy_read_timeout 86400;
86
+
87
+ # Prevent 502 if Streamlit is slow
88
+ proxy_connect_timeout 60s;
89
+ proxy_send_timeout 60s;
90
+ }
91
+ }
92
+ }
reports/alerting_test_report/alerting_report.md ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Prometheus + Alertmanager Alerting Report
2
+
3
+ This report documents the configuration and verification of the alerting system for the Hopcroft Project.
4
+
5
+ ## 1. Alerting Rules
6
+ The `monitoring/prometheus/alert_rules.yml` file is configured with the following rules:
7
+ - **ServiceDown**: Triggers if a service is unreachable for 1 minute.
8
+ - **HighErrorRate**: Triggers if the error rate exceeds 10%.
9
+ - **SlowRequests**: Triggers if the 95th percentile of request latency exceeds 2 seconds.
10
+
11
+ ## 2. Alertmanager Configuration
12
+ The `monitoring/alertmanager/config.yml` file includes:
13
+ - **Grouping**: Alerts are grouped by `alertname` and `severity`.
14
+ - **Inhibition**: Critical alerts suppress warning-level alerts.
15
+ - **Receiver**: A webhook receiver is configured to forward notifications.
16
+
17
+ ## 3. Verification of "Firing" Alert
18
+ The test was conducted by stopping the `hopcroft-api` container and waiting for the 1-minute threshold to be reached.
19
+
20
+ ### Verification Proofs:
21
+
22
+ 1. **Prometheus - Alert Firing**:
23
+ The following image shows the `ServiceDown` alert in the **FIRING** state within the Prometheus dashboard.
24
+ ![Prometheus Firing Alert](prometheus_firing.png)
25
+
26
+ 2. **Alertmanager - Notification Received**:
27
+ The following image shows the Alertmanager interface with the alert correctly received from Prometheus.
28
+ ![Alertmanager Firing Alert](alertmanager_firing.png)
29
+
30
+ ### Restoration
31
+ Following verification, the `hopcroft-api` service was restarted and monitored until it returned to a healthy state.
reports/alerting_test_report/alertmanager_firing.png ADDED

Git LFS Details

  • SHA256: e1e229bae18738bef152e7401dc49c2f746f811c0dc0d8368153e97a2f50f6a0
  • Pointer size: 130 Bytes
  • Size of remote file: 46.2 kB
reports/alerting_test_report/prometheus_firing.png ADDED

Git LFS Details

  • SHA256: 2642504fc3d9c32e5f126f275bd4ff409cb0c832a9bb3836dd6fbd10452079d8
  • Pointer size: 130 Bytes
  • Size of remote file: 67.6 kB
requirements.txt CHANGED
@@ -23,6 +23,7 @@ sentence-transformers
23
 
24
  # API Framework
25
  fastapi[standard]>=0.115.0
 
26
  pydantic>=2.0.0
27
  uvicorn>=0.30.0
28
  httpx>=0.27.0
@@ -47,6 +48,8 @@ pytest-json-report>=1.5.0
47
  pytest-cov>=4.0.0
48
  pytest-xdist>=3.0.0
49
 
 
 
50
  # Data validation and quality
51
  great_expectations>=0.18.0
52
  deepchecks>=0.18.0
 
23
 
24
  # API Framework
25
  fastapi[standard]>=0.115.0
26
+ prometheus-client>=0.17.0
27
  pydantic>=2.0.0
28
  uvicorn>=0.30.0
29
  httpx>=0.27.0
 
48
  pytest-cov>=4.0.0
49
  pytest-xdist>=3.0.0
50
 
51
+ # Load testing
52
+ locust>=2.20.0
53
  # Data validation and quality
54
  great_expectations>=0.18.0
55
  deepchecks>=0.18.0
scripts/start_space.sh CHANGED
@@ -16,28 +16,82 @@ USER=${DAGSHUB_USERNAME:-$MLFLOW_TRACKING_USERNAME}
16
  PASS=${DAGSHUB_TOKEN:-$MLFLOW_TRACKING_PASSWORD}
17
 
18
  if [ -n "$USER" ] && [ -n "$PASS" ]; then
19
- echo "Configuring DVC authentication for DagsHub..."
20
  # Configure local config (not committed)
21
  dvc remote modify origin --local auth basic
22
  dvc remote modify origin --local user "$USER"
23
  dvc remote modify origin --local password "$PASS"
24
  else
25
- echo "WARNING: No DagsHub credentials found. DVC pull might fail if the remote is private."
26
  fi
27
 
28
- echo "Pulling models from DVC..."
29
  # Pull only the necessary files for inference
30
  dvc pull models/random_forest_tfidf_gridsearch.pkl.dvc \
31
  models/tfidf_vectorizer.pkl.dvc \
32
- models/label_names.pkl.dvc
33
 
34
- echo "Starting FastAPI application in background..."
35
- uvicorn hopcroft_skill_classification_tool_competition.main:app --host 0.0.0.0 --port 8000 &
 
 
 
 
 
 
 
36
 
37
  # Wait for API to start
38
- echo "Waiting for API to start..."
39
- sleep 10
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
 
41
- echo "Starting Streamlit application..."
42
- export API_BASE_URL="http://localhost:8000"
43
- streamlit run hopcroft_skill_classification_tool_competition/streamlit_app.py --server.port 7860 --server.address 0.0.0.0
 
16
  PASS=${DAGSHUB_TOKEN:-$MLFLOW_TRACKING_PASSWORD}
17
 
18
  if [ -n "$USER" ] && [ -n "$PASS" ]; then
19
+ echo "$(date) - Configuring DVC authentication for DagsHub..."
20
  # Configure local config (not committed)
21
  dvc remote modify origin --local auth basic
22
  dvc remote modify origin --local user "$USER"
23
  dvc remote modify origin --local password "$PASS"
24
  else
25
+ echo "$(date) - WARNING: No DagsHub credentials found. DVC pull might fail if the remote is private."
26
  fi
27
 
28
+ echo "$(date) - Pulling models from DVC..."
29
  # Pull only the necessary files for inference
30
  dvc pull models/random_forest_tfidf_gridsearch.pkl.dvc \
31
  models/tfidf_vectorizer.pkl.dvc \
32
+ models/label_names.pkl.dvc || echo "DVC pull failed, but continuing..."
33
 
34
+ # Create Nginx temp directories
35
+ mkdir -p /tmp/client_temp /tmp/proxy_temp /tmp/fastcgi_temp /tmp/uwsgi_temp /tmp/scgi_temp
36
+
37
+ echo "$(date) - Checking models existence..."
38
+ ls -la models/
39
+
40
+ echo "$(date) - Starting FastAPI application in background..."
41
+ # Bind to 0.0.0.0 so the API is reachable by Nginx and the local health checks
42
+ uvicorn hopcroft_skill_classification_tool_competition.main:app --host 0.0.0.0 --port 8000 >> /tmp/fastapi.log 2>&1 &
43
 
44
  # Wait for API to start
45
+ echo "$(date) - Waiting for API to start (up to 60s)..."
46
+ for i in {1..30}; do
47
+ if curl -s http://127.0.0.1:8000/health > /dev/null; then
48
+ echo "$(date) - API is UP!"
49
+ break
50
+ fi
51
+ echo "$(date) - Waiting... ($i/30)"
52
+ sleep 2
53
+ done
54
+
55
+ echo "$(date) - Starting Nginx reverse proxy..."
56
+ if ! command -v nginx &> /dev/null; then
57
+ echo "$(date) - ERROR: nginx not found in PATH"
58
+ exit 1
59
+ fi
60
+ nginx -c /app/nginx.conf -g "daemon off;" >> /tmp/nginx_startup.log 2>&1 &
61
+
62
+ echo "$(date) - Waiting for Nginx to initialize..."
63
+ sleep 5
64
+
65
+ # Check if Nginx is running
66
+ if ps aux | grep -v grep | grep -q "nginx"; then
67
+ echo "$(date) - Nginx is running."
68
+ else
69
+ echo "$(date) - ERROR: Nginx failed to start. Logs:"
70
+ cat /tmp/nginx_startup.log
71
+ fi
72
+
73
+ echo "$(date) - Final backend check before starting Streamlit..."
74
+ curl -v http://127.0.0.1:8000/health || echo "FastAPI health check failed!"
75
+
76
+ echo "$(date) - Starting Streamlit application on 127.0.0.1:8501..."
77
+ export API_BASE_URL="http://127.0.0.1:8000"
78
+ streamlit run hopcroft_skill_classification_tool_competition/streamlit_app.py \
79
+ --server.port 8501 \
80
+ --server.address 127.0.0.1 \
81
+ --server.enableCORS=false \
82
+ --server.enableXsrfProtection=false \
83
+ --server.headless true &
84
+
85
+ # Wait for Streamlit to start
86
+ echo "$(date) - Waiting for Streamlit to start (up to 60s)..."
87
+ for i in {1..30}; do
88
+ if curl -s http://127.0.0.1:8501/healthz > /dev/null; then
89
+ echo "$(date) - Streamlit is UP!"
90
+ break
91
+ fi
92
+ echo "$(date) - Waiting for Streamlit... ($i/30)"
93
+ sleep 2
94
+ done
95
 
96
+ echo "$(date) - Processes started. Tailing Nginx and FastAPI logs for debug..."
97
+ tail -f /tmp/nginx_startup.log /tmp/fastapi.log