Mauro Carlucci committed on
Commit
7f13b65
·
unverified ·
2 Parent(s): 27bcd6a 0b25b12

Merge pull request #41 from se4ai2526-uniba/Fix-Grafana-Prometheus

Browse files
Dockerfile CHANGED
@@ -40,9 +40,26 @@ RUN wget https://github.com/prometheus/prometheus/releases/download/v2.45.0/prom
40
  mkdir -p /etc/prometheus /var/lib/prometheus && \
41
  rm -rf prometheus-*
42
 
43
- COPY monitoring/grafana/provisioning /etc/grafana/provisioning
44
- COPY monitoring/grafana/dashboards /var/lib/grafana/dashboards
45
- COPY monitoring/prometheus/prometheus.yml /etc/prometheus/prometheus.yml
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
 
47
  # Copy requirements first for caching
48
  COPY requirements.txt .
@@ -58,8 +75,8 @@ COPY --chown=user:user . .
58
  # Ensure the user has permissions on the app directory (needed for dvc init if .dvc is missing)
59
  RUN chown -R user:user /app
60
 
61
- # Fix line endings and permissions for the start script
62
- RUN dos2unix docker/scripts/start_space.sh && \
63
  chmod +x docker/scripts/start_space.sh
64
 
65
  # Install the project itself
 
40
  mkdir -p /etc/prometheus /var/lib/prometheus && \
41
  rm -rf prometheus-*
42
 
43
+ # Alertmanager
44
+ RUN wget https://github.com/prometheus/alertmanager/releases/download/v0.25.0/alertmanager-0.25.0.linux-amd64.tar.gz && \
45
+ tar xvfz alertmanager-*.tar.gz && \
46
+ mv alertmanager-*/alertmanager /usr/local/bin/ && \
47
+ mv alertmanager-*/amtool /usr/local/bin/ && \
48
+ mkdir -p /etc/alertmanager /var/lib/alertmanager && \
49
+ rm -rf alertmanager-*
50
+
51
+ # Pushgateway
52
+ RUN wget https://github.com/prometheus/pushgateway/releases/download/v1.6.0/pushgateway-1.6.0.linux-amd64.tar.gz && \
53
+ tar xvfz pushgateway-*.tar.gz && \
54
+ mv pushgateway-*/pushgateway /usr/local/bin/ && \
55
+ rm -rf pushgateway-*
56
+
57
+ COPY --chown=user monitoring/grafana/provisioning /etc/grafana/provisioning
58
+ COPY --chown=user monitoring/grafana/dashboards /var/lib/grafana/dashboards
59
+ COPY --chown=user monitoring/prometheus/prometheus.yml /etc/prometheus/prometheus.yml
60
+ COPY --chown=user monitoring/prometheus/alert_rules.yml /etc/prometheus/alert_rules.yml
61
+ COPY --chown=user monitoring/alertmanager/config.yml /etc/alertmanager/config.yml
62
+
63
 
64
  # Copy requirements first for caching
65
  COPY requirements.txt .
 
75
  # Ensure the user has permissions on the app directory (needed for dvc init if .dvc is missing)
76
  RUN chown -R user:user /app
77
 
78
+ # Fix line endings and permissions for the start script and configs
79
+ RUN find . -name "*.sh" -o -name "*.yml" -o -name "*.ini" -o -name "*.json" | xargs dos2unix && \
80
  chmod +x docker/scripts/start_space.sh
81
 
82
  # Install the project itself
docker/nginx.conf CHANGED
@@ -33,6 +33,14 @@ http {
33
  server 127.0.0.1:9090;
34
  }
35
 
 
 
 
 
 
 
 
 
36
  server {
37
  listen 7860;
38
  server_name localhost;
@@ -73,6 +81,15 @@ http {
73
  proxy_set_header Host $host;
74
  }
75
 
 
 
 
 
 
 
 
 
 
76
  # Grafana
77
  location = /grafana {
78
  return 301 /grafana/;
@@ -108,6 +125,24 @@ http {
108
  proxy_set_header X-Forwarded-Proto $scheme;
109
  }
110
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
111
  # Streamlit (Catch-all)
112
  location / {
113
  proxy_pass http://streamlit;
 
33
  server 127.0.0.1:9090;
34
  }
35
 
36
+ upstream alertmanager {
37
+ server 127.0.0.1:9093;
38
+ }
39
+
40
+ upstream pushgateway {
41
+ server 127.0.0.1:9091;
42
+ }
43
+
44
  server {
45
  listen 7860;
46
  server_name localhost;
 
81
  proxy_set_header Host $host;
82
  }
83
 
84
+ # FastAPI Metrics endpoint for Prometheus
85
+ location /metrics {
86
+ proxy_pass http://fastapi/metrics;
87
+ proxy_set_header Host $host;
88
+ proxy_set_header X-Real-IP $remote_addr;
89
+ proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
90
+ proxy_set_header X-Forwarded-Proto $scheme;
91
+ }
92
+
93
  # Grafana
94
  location = /grafana {
95
  return 301 /grafana/;
 
125
  proxy_set_header X-Forwarded-Proto $scheme;
126
  }
127
 
128
+ # Alertmanager UI
129
+ location /alertmanager/ {
130
+ proxy_pass http://alertmanager;
131
+ proxy_set_header Host $host;
132
+ proxy_set_header X-Real-IP $remote_addr;
133
+ proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
134
+ proxy_set_header X-Forwarded-Proto $scheme;
135
+ }
136
+
137
+ # Pushgateway UI
138
+ location /pushgateway/ {
139
+ proxy_pass http://pushgateway;
140
+ proxy_set_header Host $host;
141
+ proxy_set_header X-Real-IP $remote_addr;
142
+ proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
143
+ proxy_set_header X-Forwarded-Proto $scheme;
144
+ }
145
+
146
  # Streamlit (Catch-all)
147
  location / {
148
  proxy_pass http://streamlit;
docker/scripts/start_space.sh CHANGED
@@ -53,23 +53,47 @@ for i in {1..30}; do
53
  done
54
 
55
  echo "$(date) - Configuring and starting Prometheus..."
56
- # Create a config for the space
57
- cat <<EOF > /tmp/prometheus.yml
58
- global:
59
- scrape_interval: 15s
60
- evaluation_interval: 15s
61
-
62
- scrape_configs:
63
- - job_name: 'hopcroft-api'
64
- metrics_path: '/metrics'
65
- static_configs:
66
- - targets: ['127.0.0.1:8000']
67
- scrape_interval: 10s
68
-
69
- - job_name: 'prometheus'
70
- static_configs:
71
- - targets: ['127.0.0.1:9090']
72
- EOF
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
 
74
  # Determine Prometheus External URL
75
  # Always use relative path so it works on both huggingface.co and .hf.space domains
@@ -101,15 +125,23 @@ else
101
  GRAFANA_ROOT_URL="http://localhost:3000/grafana/"
102
  fi
103
 
 
 
 
 
104
  echo "$(date) - Starting Grafana with Root URL: $GRAFANA_ROOT_URL"
105
- grafana-server --homepath=/usr/share/grafana \
 
 
106
  --config=/app/monitoring/grafana/grafana.ini \
107
- cfg:default.paths.data=/tmp/grafana_data \
108
- cfg:default.paths.logs=/tmp/grafana_logs \
109
- cfg:default.paths.plugins=/usr/share/grafana/plugins \
 
110
  cfg:server.root_url="$GRAFANA_ROOT_URL" \
111
  cfg:server.serve_from_sub_path=true \
112
- >> /tmp/grafana.log 2>&1 &
 
113
 
114
  # Wait for Grafana to start
115
  echo "$(date) - Waiting for Grafana (20s)..."
@@ -124,6 +156,12 @@ for i in {1..20}; do
124
  sleep 1
125
  done
126
 
 
 
 
 
 
 
127
 
128
  echo "$(date) - Starting Nginx reverse proxy..."
129
  if ! command -v nginx &> /dev/null; then
@@ -166,5 +204,5 @@ for i in {1..30}; do
166
  sleep 2
167
  done
168
 
169
- echo "$(date) - Process started. Tailing Nginx logs for debug..."
170
- tail -f /tmp/nginx_startup.log /tmp/fastapi.log
 
53
  done
54
 
55
  echo "$(date) - Configuring and starting Prometheus..."
56
+ # Patch Grafana Datasource for Localhost (HF Space) and fix URL path
57
+ # Replace prometheus:9090 with 127.0.0.1:9090/prometheus in all datasource configs
58
+ find /app/monitoring/grafana/provisioning/datasources -name '*.yml' -exec sed -i 's/prometheus:9090/127.0.0.1:9090\/prometheus/g' {} +
59
+
60
+ # Copy production configs to /tmp for modification
61
+ cp /etc/prometheus/prometheus.yml /tmp/prometheus.yml
62
+ cp /etc/prometheus/alert_rules.yml /tmp/alert_rules.yml
63
+ cp /etc/alertmanager/config.yml /tmp/alertmanager.yml
64
+
65
+ # Modify Prometheus config for local execution (replace docker-compose service names with localhost)
66
+ # hopcroft-api:8080 -> 127.0.0.1:8000 (API runs on 8000 in Space)
67
+ sed -i 's/hopcroft-api:8080/127.0.0.1:8000/g' /tmp/prometheus.yml
68
+ # Alertmanager: hopcroft-api:8080 -> 127.0.0.1:8000
69
+ sed -i 's/hopcroft-api:8080/127.0.0.1:8000/g' /tmp/alertmanager.yml
70
+ # alertmanager:9093 -> 127.0.0.1:9093
71
+ sed -i 's/alertmanager:9093/127.0.0.1:9093/g' /tmp/prometheus.yml
72
+ # pushgateway:9091 -> 127.0.0.1:9091
73
+ sed -i 's/pushgateway:9091/127.0.0.1:9091/g' /tmp/prometheus.yml
74
+ # Point the rule_files entry at the absolute path of the copied alert rules (/tmp/alert_rules.yml)
75
+ sed -i 's|"alert_rules.yml"|"/tmp/alert_rules.yml"|g' /tmp/prometheus.yml
76
+
77
+ # FIX: Add path prefixes to match --web.route-prefix arguments
78
+ # Add metrics_path for self-scraping prometheus
79
+ sed -i 's/job_name: "prometheus"/job_name: "prometheus"\n metrics_path: "\/prometheus\/metrics"/g' /tmp/prometheus.yml
80
+ # Add metrics_path for pushgateway
81
+ sed -i 's/job_name: "pushgateway"/job_name: "pushgateway"\n metrics_path: "\/pushgateway\/metrics"/g' /tmp/prometheus.yml
82
+ # Add path_prefix for Alertmanager
83
+ sed -i 's/ - static_configs:/ - path_prefix: "\/alertmanager\/"\n static_configs:/g' /tmp/prometheus.yml
84
+
85
+ echo "$(date) - Starting Alertmanager..."
86
+ alertmanager \
87
+ --config.file=/tmp/alertmanager.yml \
88
+ --storage.path=/tmp/alertmanager_data \
89
+ --web.route-prefix=/alertmanager/ \
90
+ >> /tmp/alertmanager.log 2>&1 &
91
+
92
+ echo "$(date) - Starting Pushgateway..."
93
+ pushgateway \
94
+ --persistence.file=/tmp/pushgateway_data \
95
+ --web.route-prefix=/pushgateway/ \
96
+ >> /tmp/pushgateway.log 2>&1 &
97
 
98
  # Determine Prometheus External URL
99
  # Always use relative path so it works on both huggingface.co and .hf.space domains
 
125
  GRAFANA_ROOT_URL="http://localhost:3000/grafana/"
126
  fi
127
 
128
+ # Locate Grafana binary
129
+ GRAFANA_BIN=$(which grafana-server || echo "/usr/sbin/grafana-server")
130
+ echo "$(date) - Found Grafana binary at: $GRAFANA_BIN"
131
+
132
  echo "$(date) - Starting Grafana with Root URL: $GRAFANA_ROOT_URL"
133
+
134
+ # Use the project's grafana.ini which we have permissions to read
135
+ $GRAFANA_BIN --homepath=/usr/share/grafana \
136
  --config=/app/monitoring/grafana/grafana.ini \
137
+ cfg:paths.data=/tmp/grafana_data \
138
+ cfg:paths.logs=/tmp/grafana_logs \
139
+ cfg:paths.plugins=/usr/share/grafana/plugins \
140
+ cfg:paths.provisioning=/app/monitoring/grafana/provisioning \
141
  cfg:server.root_url="$GRAFANA_ROOT_URL" \
142
  cfg:server.serve_from_sub_path=true \
143
+ cfg:server.http_port=3000 \
144
+ > /tmp/grafana.log 2>&1 &
145
 
146
  # Wait for Grafana to start
147
  echo "$(date) - Waiting for Grafana (20s)..."
 
156
  sleep 1
157
  done
158
 
159
+ # If Grafana is still down, print logs
160
+ if ! curl -s http://127.0.0.1:3000/api/health > /dev/null; then
161
+ echo "$(date) - ERROR: Grafana failed to start within 20 seconds. Dumping logs:"
162
+ cat /tmp/grafana.log
163
+ fi
164
+
165
 
166
  echo "$(date) - Starting Nginx reverse proxy..."
167
  if ! command -v nginx &> /dev/null; then
 
204
  sleep 2
205
  done
206
 
207
+ echo "$(date) - Process started. Tailing logs for debug..."
208
+ tail -f /tmp/nginx_startup.log /tmp/fastapi.log /tmp/grafana.log /tmp/prometheus.log
docs/design_choices.md CHANGED
@@ -465,11 +465,12 @@ async def monitor_requests(request, call_next):
465
  ### Grafana Visualization
466
 
467
  **Dashboard Panels:**
468
- 1. API Request Rate (time series)
469
- 2. API Latency Percentiles (heatmap)
470
- 3. Drift Detection Status (stat panel)
471
- 4. Drift P-Value Trend (time series)
472
- 5. Error Rate (gauge)
 
473
 
474
  **Data Sources:**
475
  - Prometheus: Real-time metrics
 
465
  ### Grafana Visualization
466
 
467
  **Dashboard Panels:**
468
+ 1. Request Rate (gauge)
469
+ 2. Request Latency p50/p95 (time series)
470
+ 3. In-Progress Requests (stat panel)
471
+ 4. Error Rate 5xx (stat panel)
472
+ 5. Model Prediction Time (time series)
473
+ 6. Requests by Endpoint (bar chart)
474
 
475
  **Data Sources:**
476
  - Prometheus: Real-time metrics
docs/milestone_summaries.md CHANGED
@@ -242,10 +242,12 @@ Jobs:
242
 
243
  ### Grafana Dashboards
244
 
245
- - **API Request Rate**: Real-time requests per second
246
- - **API Latency**: P50, P90, P99 percentiles
247
- - **Drift Detection Status**: Binary indicator (0/1)
248
- - **Drift P-Value**: Statistical significance metric
 
 
249
 
250
  ### Data Drift Detection
251
 
 
242
 
243
  ### Grafana Dashboards
244
 
245
+ - **Request Rate**: Real-time requests per second
246
+ - **Request Latency (p50, p95)**: Response time percentiles
247
+ - **In-Progress Requests**: Currently processing requests
248
+ - **Error Rate (5xx)**: Failed request percentage
249
+ - **Model Prediction Time**: Inference latency
250
+ - **Requests by Endpoint**: Traffic distribution
251
 
252
  ### Data Drift Detection
253
 
docs/user_guide.md CHANGED
@@ -406,11 +406,12 @@ The pre-configured dashboard includes:
406
 
407
  | Panel | Description |
408
  |-------|-------------|
409
- | API Request Rate | Real-time requests per endpoint |
410
- | API Latency | Response time distribution |
411
- | Drift Detection Status | Binary indicator (0=No Drift, 1=Drift) |
412
- | Drift P-Value | Statistical significance |
413
- | Drift Distance | KS test distance metric |
 
414
 
415
  ### Data Drift Detection
416
 
 
406
 
407
  | Panel | Description |
408
  |-------|-------------|
409
+ | Request Rate | Real-time requests per second |
410
+ | Request Latency (p50, p95) | Response time percentiles |
411
+ | In-Progress Requests | Currently processing requests |
412
+ | Error Rate (5xx) | Percentage of failed requests |
413
+ | Model Prediction Time | Average model inference latency |
414
+ | Requests by Endpoint | Traffic distribution per endpoint |
415
 
416
  ### Data Drift Detection
417
 
monitoring/alertmanager/config.yml CHANGED
@@ -2,20 +2,20 @@ global:
2
  resolve_timeout: 5m
3
 
4
  route:
5
- group_by: ['alertname', 'severity']
6
  group_wait: 10s
7
  group_interval: 10s
8
  repeat_interval: 1h
9
- receiver: 'log-receiver'
10
 
11
  receivers:
12
- - name: 'log-receiver'
13
  webhook_configs:
14
- - url: 'http://hopcroft-api:8080/health'
15
 
16
  inhibition_rules:
17
  - source_match:
18
- severity: 'critical'
19
  target_match:
20
- severity: 'warning'
21
- equal: ['alertname', 'dev', 'instance']
 
2
  resolve_timeout: 5m
3
 
4
  route:
5
+ group_by: ["alertname", "severity"]
6
  group_wait: 10s
7
  group_interval: 10s
8
  repeat_interval: 1h
9
+ receiver: "log-receiver"
10
 
11
  receivers:
12
+ - name: "log-receiver"
13
  webhook_configs:
14
+ - url: "http://hopcroft-api:8080/health"
15
 
16
  inhibition_rules:
17
  - source_match:
18
+ severity: "critical"
19
  target_match:
20
+ severity: "warning"
21
+ equal: ["alertname", "dev", "instance"]
monitoring/grafana/dashboards/hopcroft_dashboard.json CHANGED
@@ -62,7 +62,7 @@
62
  "pluginVersion": "9.0.0",
63
  "targets": [
64
  {
65
- "expr": "rate(fastapi_requests_total[1m])",
66
  "refId": "A"
67
  }
68
  ],
@@ -131,12 +131,12 @@
131
  "pluginVersion": "9.0.0",
132
  "targets": [
133
  {
134
- "expr": "histogram_quantile(0.95, rate(fastapi_request_duration_seconds_bucket[5m])) * 1000",
135
  "legendFormat": "p95",
136
  "refId": "A"
137
  },
138
  {
139
- "expr": "histogram_quantile(0.50, rate(fastapi_request_duration_seconds_bucket[5m])) * 1000",
140
  "legendFormat": "p50 (median)",
141
  "refId": "B"
142
  }
@@ -152,32 +152,25 @@
152
  "color": {
153
  "mode": "thresholds"
154
  },
155
- "mappings": [
156
- {
157
- "options": {
158
- "0": {
159
- "color": "red",
160
- "index": 1,
161
- "text": "No Drift"
162
- },
163
- "1": {
164
- "color": "green",
165
- "index": 0,
166
- "text": "Drift Detected"
167
- }
168
- },
169
- "type": "value"
170
- }
171
- ],
172
  "thresholds": {
173
  "mode": "absolute",
174
  "steps": [
175
  {
176
  "color": "green",
177
  "value": null
 
 
 
 
 
 
 
 
178
  }
179
  ]
180
- }
 
181
  }
182
  },
183
  "gridPos": {
@@ -201,13 +194,13 @@
201
  "pluginVersion": "9.0.0",
202
  "targets": [
203
  {
204
- "expr": "drift_detected",
205
  "refId": "A"
206
  }
207
  ],
208
- "title": "Data Drift Status",
209
  "type": "stat",
210
- "description": "Current data drift detection status (1 = drift detected, 0 = no drift)"
211
  },
212
  {
213
  "datasource": "Prometheus",
@@ -216,7 +209,7 @@
216
  "color": {
217
  "mode": "thresholds"
218
  },
219
- "decimals": 4,
220
  "mappings": [],
221
  "thresholds": {
222
  "mode": "absolute",
@@ -235,7 +228,7 @@
235
  }
236
  ]
237
  },
238
- "unit": "short"
239
  }
240
  },
241
  "gridPos": {
@@ -259,13 +252,13 @@
259
  "pluginVersion": "9.0.0",
260
  "targets": [
261
  {
262
- "expr": "drift_p_value",
263
  "refId": "A"
264
  }
265
  ],
266
- "title": "Drift P-Value",
267
  "type": "stat",
268
- "description": "Statistical significance of detected drift (lower = more significant)"
269
  },
270
  {
271
  "datasource": "Prometheus",
@@ -305,7 +298,7 @@
305
  }
306
  ]
307
  },
308
- "unit": "short"
309
  }
310
  },
311
  "gridPos": {
@@ -328,14 +321,84 @@
328
  "pluginVersion": "9.0.0",
329
  "targets": [
330
  {
331
- "expr": "drift_distance",
332
- "legendFormat": "Distance",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
333
  "refId": "A"
334
  }
335
  ],
336
- "title": "Drift Distance Over Time",
337
  "type": "timeseries",
338
- "description": "Statistical distance between baseline and current data distribution"
339
  }
340
  ],
341
  "refresh": "10s",
@@ -353,6 +416,6 @@
353
  "timezone": "",
354
  "title": "Hopcroft ML Model Monitoring",
355
  "uid": "hopcroft-ml-dashboard",
356
- "version": 1,
357
  "weekStart": ""
358
  }
 
62
  "pluginVersion": "9.0.0",
63
  "targets": [
64
  {
65
+ "expr": "sum(rate(hopcroft_requests_total[1m]))",
66
  "refId": "A"
67
  }
68
  ],
 
131
  "pluginVersion": "9.0.0",
132
  "targets": [
133
  {
134
+ "expr": "histogram_quantile(0.95, sum(rate(hopcroft_request_duration_seconds_bucket[5m])) by (le)) * 1000",
135
  "legendFormat": "p95",
136
  "refId": "A"
137
  },
138
  {
139
+ "expr": "histogram_quantile(0.50, sum(rate(hopcroft_request_duration_seconds_bucket[5m])) by (le)) * 1000",
140
  "legendFormat": "p50 (median)",
141
  "refId": "B"
142
  }
 
152
  "color": {
153
  "mode": "thresholds"
154
  },
155
+ "mappings": [],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
156
  "thresholds": {
157
  "mode": "absolute",
158
  "steps": [
159
  {
160
  "color": "green",
161
  "value": null
162
+ },
163
+ {
164
+ "color": "yellow",
165
+ "value": 1
166
+ },
167
+ {
168
+ "color": "red",
169
+ "value": 5
170
  }
171
  ]
172
+ },
173
+ "unit": "short"
174
  }
175
  },
176
  "gridPos": {
 
194
  "pluginVersion": "9.0.0",
195
  "targets": [
196
  {
197
+ "expr": "sum(hopcroft_in_progress_requests)",
198
  "refId": "A"
199
  }
200
  ],
201
+ "title": "In-Progress Requests",
202
  "type": "stat",
203
+ "description": "Number of requests currently being processed"
204
  },
205
  {
206
  "datasource": "Prometheus",
 
209
  "color": {
210
  "mode": "thresholds"
211
  },
212
+ "decimals": 2,
213
  "mappings": [],
214
  "thresholds": {
215
  "mode": "absolute",
 
228
  }
229
  ]
230
  },
231
+ "unit": "percentunit"
232
  }
233
  },
234
  "gridPos": {
 
252
  "pluginVersion": "9.0.0",
253
  "targets": [
254
  {
255
+ "expr": "sum(rate(hopcroft_requests_total{http_status=~\"5..\"}[5m])) / sum(rate(hopcroft_requests_total[5m]))",
256
  "refId": "A"
257
  }
258
  ],
259
+ "title": "Error Rate (5xx)",
260
  "type": "stat",
261
+ "description": "Percentage of requests resulting in 5xx errors"
262
  },
263
  {
264
  "datasource": "Prometheus",
 
298
  }
299
  ]
300
  },
301
+ "unit": "s"
302
  }
303
  },
304
  "gridPos": {
 
321
  "pluginVersion": "9.0.0",
322
  "targets": [
323
  {
324
+ "expr": "rate(hopcroft_prediction_processing_seconds_sum[5m]) / rate(hopcroft_prediction_processing_seconds_count[5m])",
325
+ "legendFormat": "Avg Prediction Time",
326
+ "refId": "A"
327
+ }
328
+ ],
329
+ "title": "Model Prediction Time",
330
+ "type": "timeseries",
331
+ "description": "Average time spent processing model predictions"
332
+ },
333
+ {
334
+ "datasource": "Prometheus",
335
+ "fieldConfig": {
336
+ "defaults": {
337
+ "color": {
338
+ "mode": "palette-classic"
339
+ },
340
+ "custom": {
341
+ "axisLabel": "",
342
+ "axisPlacement": "auto",
343
+ "barAlignment": 0,
344
+ "drawStyle": "bars",
345
+ "fillOpacity": 80,
346
+ "gradientMode": "none",
347
+ "hideFrom": {
348
+ "tooltip": false,
349
+ "viz": false,
350
+ "legend": false
351
+ },
352
+ "lineInterpolation": "linear",
353
+ "lineWidth": 1,
354
+ "pointSize": 5,
355
+ "scaleDistribution": {
356
+ "type": "linear"
357
+ },
358
+ "showPoints": "never",
359
+ "spanNulls": false
360
+ },
361
+ "mappings": [],
362
+ "thresholds": {
363
+ "mode": "absolute",
364
+ "steps": [
365
+ {
366
+ "color": "green",
367
+ "value": null
368
+ }
369
+ ]
370
+ },
371
+ "unit": "short"
372
+ }
373
+ },
374
+ "gridPos": {
375
+ "h": 8,
376
+ "w": 24,
377
+ "x": 0,
378
+ "y": 14
379
+ },
380
+ "id": 6,
381
+ "options": {
382
+ "legend": {
383
+ "calcs": ["sum"],
384
+ "displayMode": "table",
385
+ "placement": "right"
386
+ },
387
+ "tooltip": {
388
+ "mode": "multi"
389
+ }
390
+ },
391
+ "pluginVersion": "9.0.0",
392
+ "targets": [
393
+ {
394
+ "expr": "sum by (endpoint) (increase(hopcroft_requests_total[5m]))",
395
+ "legendFormat": "{{endpoint}}",
396
  "refId": "A"
397
  }
398
  ],
399
+ "title": "Requests by Endpoint",
400
  "type": "timeseries",
401
+ "description": "Number of requests per endpoint over time"
402
  }
403
  ],
404
  "refresh": "10s",
 
416
  "timezone": "",
417
  "title": "Hopcroft ML Model Monitoring",
418
  "uid": "hopcroft-ml-dashboard",
419
+ "version": 2,
420
  "weekStart": ""
421
  }
monitoring/grafana/provisioning/dashboards/dashboard.yml CHANGED
@@ -1,13 +1,13 @@
1
  apiVersion: 1
2
 
3
  providers:
4
- - name: 'Hopcroft Dashboards'
5
  orgId: 1
6
- folder: ''
7
  type: file
8
  disableDeletion: false
9
  updateIntervalSeconds: 10
10
  allowUiUpdates: true
11
  options:
12
  path: /var/lib/grafana/dashboards
13
- foldersFromFilesStructure: true
 
1
  apiVersion: 1
2
 
3
  providers:
4
+ - name: "Hopcroft Dashboards"
5
  orgId: 1
6
+ folder: "Hopcroft Project"
7
  type: file
8
  disableDeletion: false
9
  updateIntervalSeconds: 10
10
  allowUiUpdates: true
11
  options:
12
  path: /var/lib/grafana/dashboards
13
+ foldersFromFilesStructure: false
monitoring/grafana/provisioning/dashboards/hopcroft_dashboard.json DELETED
@@ -1,358 +0,0 @@
1
- {
2
- "annotations": {
3
- "list": [
4
- {
5
- "builtIn": 1,
6
- "datasource": "-- Grafana --",
7
- "enable": true,
8
- "hide": true,
9
- "iconColor": "rgba(0, 211, 255, 1)",
10
- "name": "Annotations & Alerts",
11
- "type": "dashboard"
12
- }
13
- ]
14
- },
15
- "editable": true,
16
- "gnetId": null,
17
- "graphTooltip": 1,
18
- "id": null,
19
- "links": [],
20
- "panels": [
21
- {
22
- "datasource": "Prometheus",
23
- "fieldConfig": {
24
- "defaults": {
25
- "color": {
26
- "mode": "thresholds"
27
- },
28
- "mappings": [],
29
- "thresholds": {
30
- "mode": "absolute",
31
- "steps": [
32
- {
33
- "color": "green",
34
- "value": null
35
- },
36
- {
37
- "color": "red",
38
- "value": 80
39
- }
40
- ]
41
- },
42
- "unit": "reqps"
43
- }
44
- },
45
- "gridPos": {
46
- "h": 8,
47
- "w": 6,
48
- "x": 0,
49
- "y": 0
50
- },
51
- "id": 1,
52
- "options": {
53
- "orientation": "auto",
54
- "reduceOptions": {
55
- "calcs": ["lastNotNull"],
56
- "fields": "",
57
- "values": false
58
- },
59
- "showThresholdLabels": false,
60
- "showThresholdMarkers": true
61
- },
62
- "pluginVersion": "9.0.0",
63
- "targets": [
64
- {
65
- "expr": "rate(fastapi_requests_total[1m])",
66
- "refId": "A"
67
- }
68
- ],
69
- "title": "Request Rate",
70
- "type": "gauge",
71
- "description": "Number of requests per second handled by the API"
72
- },
73
- {
74
- "datasource": "Prometheus",
75
- "fieldConfig": {
76
- "defaults": {
77
- "color": {
78
- "mode": "palette-classic"
79
- },
80
- "custom": {
81
- "axisLabel": "",
82
- "axisPlacement": "auto",
83
- "barAlignment": 0,
84
- "drawStyle": "line",
85
- "fillOpacity": 10,
86
- "gradientMode": "none",
87
- "hideFrom": {
88
- "tooltip": false,
89
- "viz": false,
90
- "legend": false
91
- },
92
- "lineInterpolation": "linear",
93
- "lineWidth": 1,
94
- "pointSize": 5,
95
- "scaleDistribution": {
96
- "type": "linear"
97
- },
98
- "showPoints": "never",
99
- "spanNulls": true
100
- },
101
- "mappings": [],
102
- "thresholds": {
103
- "mode": "absolute",
104
- "steps": [
105
- {
106
- "color": "green",
107
- "value": null
108
- }
109
- ]
110
- },
111
- "unit": "ms"
112
- }
113
- },
114
- "gridPos": {
115
- "h": 8,
116
- "w": 18,
117
- "x": 6,
118
- "y": 0
119
- },
120
- "id": 2,
121
- "options": {
122
- "legend": {
123
- "calcs": ["mean", "max"],
124
- "displayMode": "table",
125
- "placement": "right"
126
- },
127
- "tooltip": {
128
- "mode": "multi"
129
- }
130
- },
131
- "pluginVersion": "9.0.0",
132
- "targets": [
133
- {
134
- "expr": "histogram_quantile(0.95, rate(fastapi_request_duration_seconds_bucket[5m])) * 1000",
135
- "legendFormat": "p95",
136
- "refId": "A"
137
- },
138
- {
139
- "expr": "histogram_quantile(0.50, rate(fastapi_request_duration_seconds_bucket[5m])) * 1000",
140
- "legendFormat": "p50 (median)",
141
- "refId": "B"
142
- }
143
- ],
144
- "title": "Request Latency (p50, p95)",
145
- "type": "timeseries",
146
- "description": "API response time percentiles over time"
147
- },
148
- {
149
- "datasource": "Prometheus",
150
- "fieldConfig": {
151
- "defaults": {
152
- "color": {
153
- "mode": "thresholds"
154
- },
155
- "mappings": [
156
- {
157
- "options": {
158
- "0": {
159
- "color": "red",
160
- "index": 1,
161
- "text": "No Drift"
162
- },
163
- "1": {
164
- "color": "green",
165
- "index": 0,
166
- "text": "Drift Detected"
167
- }
168
- },
169
- "type": "value"
170
- }
171
- ],
172
- "thresholds": {
173
- "mode": "absolute",
174
- "steps": [
175
- {
176
- "color": "green",
177
- "value": null
178
- }
179
- ]
180
- }
181
- }
182
- },
183
- "gridPos": {
184
- "h": 6,
185
- "w": 6,
186
- "x": 0,
187
- "y": 8
188
- },
189
- "id": 3,
190
- "options": {
191
- "orientation": "auto",
192
- "reduceOptions": {
193
- "calcs": ["lastNotNull"],
194
- "fields": "",
195
- "values": false
196
- },
197
- "showThresholdLabels": false,
198
- "showThresholdMarkers": true,
199
- "text": {}
200
- },
201
- "pluginVersion": "9.0.0",
202
- "targets": [
203
- {
204
- "expr": "drift_detected",
205
- "refId": "A"
206
- }
207
- ],
208
- "title": "Data Drift Status",
209
- "type": "stat",
210
- "description": "Current data drift detection status (1 = drift detected, 0 = no drift)"
211
- },
212
- {
213
- "datasource": "Prometheus",
214
- "fieldConfig": {
215
- "defaults": {
216
- "color": {
217
- "mode": "thresholds"
218
- },
219
- "decimals": 4,
220
- "mappings": [],
221
- "thresholds": {
222
- "mode": "absolute",
223
- "steps": [
224
- {
225
- "color": "green",
226
- "value": null
227
- },
228
- {
229
- "color": "yellow",
230
- "value": 0.01
231
- },
232
- {
233
- "color": "red",
234
- "value": 0.05
235
- }
236
- ]
237
- },
238
- "unit": "short"
239
- }
240
- },
241
- "gridPos": {
242
- "h": 6,
243
- "w": 6,
244
- "x": 6,
245
- "y": 8
246
- },
247
- "id": 4,
248
- "options": {
249
- "orientation": "auto",
250
- "reduceOptions": {
251
- "calcs": ["lastNotNull"],
252
- "fields": "",
253
- "values": false
254
- },
255
- "showThresholdLabels": false,
256
- "showThresholdMarkers": true,
257
- "text": {}
258
- },
259
- "pluginVersion": "9.0.0",
260
- "targets": [
261
- {
262
- "expr": "drift_p_value",
263
- "refId": "A"
264
- }
265
- ],
266
- "title": "Drift P-Value",
267
- "type": "stat",
268
- "description": "Statistical significance of detected drift (lower = more significant)"
269
- },
270
- {
271
- "datasource": "Prometheus",
272
- "fieldConfig": {
273
- "defaults": {
274
- "color": {
275
- "mode": "palette-classic"
276
- },
277
- "custom": {
278
- "axisLabel": "",
279
- "axisPlacement": "auto",
280
- "barAlignment": 0,
281
- "drawStyle": "line",
282
- "fillOpacity": 10,
283
- "gradientMode": "none",
284
- "hideFrom": {
285
- "tooltip": false,
286
- "viz": false,
287
- "legend": false
288
- },
289
- "lineInterpolation": "linear",
290
- "lineWidth": 1,
291
- "pointSize": 5,
292
- "scaleDistribution": {
293
- "type": "linear"
294
- },
295
- "showPoints": "auto",
296
- "spanNulls": false
297
- },
298
- "mappings": [],
299
- "thresholds": {
300
- "mode": "absolute",
301
- "steps": [
302
- {
303
- "color": "green",
304
- "value": null
305
- }
306
- ]
307
- },
308
- "unit": "short"
309
- }
310
- },
311
- "gridPos": {
312
- "h": 6,
313
- "w": 12,
314
- "x": 12,
315
- "y": 8
316
- },
317
- "id": 5,
318
- "options": {
319
- "legend": {
320
- "calcs": ["mean", "lastNotNull"],
321
- "displayMode": "table",
322
- "placement": "right"
323
- },
324
- "tooltip": {
325
- "mode": "multi"
326
- }
327
- },
328
- "pluginVersion": "9.0.0",
329
- "targets": [
330
- {
331
- "expr": "drift_distance",
332
- "legendFormat": "Distance",
333
- "refId": "A"
334
- }
335
- ],
336
- "title": "Drift Distance Over Time",
337
- "type": "timeseries",
338
- "description": "Statistical distance between baseline and current data distribution"
339
- }
340
- ],
341
- "refresh": "10s",
342
- "schemaVersion": 36,
343
- "style": "dark",
344
- "tags": ["hopcroft", "ml", "monitoring"],
345
- "templating": {
346
- "list": []
347
- },
348
- "time": {
349
- "from": "now-1h",
350
- "to": "now"
351
- },
352
- "timepicker": {},
353
- "timezone": "",
354
- "title": "Hopcroft ML Model Monitoring",
355
- "uid": "hopcroft-ml-dashboard",
356
- "version": 1,
357
- "weekStart": ""
358
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
monitoring/grafana/provisioning/datasources/prometheus.yml CHANGED
@@ -11,4 +11,4 @@ datasources:
11
  editable: true
12
  jsonData:
13
  httpMethod: POST
14
- timeInterval: "15s"
 
11
  editable: true
12
  jsonData:
13
  httpMethod: POST
14
+ timeInterval: "15s"
monitoring/prometheus/prometheus.yml CHANGED
@@ -2,8 +2,8 @@ global:
2
  scrape_interval: 15s
3
  evaluation_interval: 15s
4
  external_labels:
5
- monitor: 'hopcroft-monitor'
6
- environment: 'development'
7
 
8
  rule_files:
9
  - "alert_rules.yml"
@@ -12,21 +12,21 @@ alerting:
12
  alertmanagers:
13
  - static_configs:
14
  - targets:
15
- - 'alertmanager:9093'
16
 
17
  scrape_configs:
18
- - job_name: 'hopcroft-api'
19
- metrics_path: '/metrics'
20
  static_configs:
21
- - targets: ['hopcroft-api:8080']
22
  scrape_interval: 10s
23
 
24
- - job_name: 'prometheus'
25
  static_configs:
26
- - targets: ['localhost:9090']
27
 
28
- - job_name: 'pushgateway'
29
- honor_labels: true
30
  static_configs:
31
- - targets: ['pushgateway:9091']
32
  scrape_interval: 30s
 
2
  scrape_interval: 15s
3
  evaluation_interval: 15s
4
  external_labels:
5
+ monitor: "hopcroft-monitor"
6
+ environment: "development"
7
 
8
  rule_files:
9
  - "alert_rules.yml"
 
12
  alertmanagers:
13
  - static_configs:
14
  - targets:
15
+ - "alertmanager:9093"
16
 
17
  scrape_configs:
18
+ - job_name: "hopcroft-api"
19
+ metrics_path: "/metrics"
20
  static_configs:
21
+ - targets: ["hopcroft-api:8080"]
22
  scrape_interval: 10s
23
 
24
+ - job_name: "prometheus"
25
  static_configs:
26
+ - targets: ["localhost:9090"]
27
 
28
+ - job_name: "pushgateway"
29
+ honor_labels: true
30
  static_configs:
31
+ - targets: ["pushgateway:9091"]
32
  scrape_interval: 30s