giuto committed on
Commit
1396866
·
1 Parent(s): 8ed9c1d

Update Grafana and Prometheus configurations, enhance drift detection scripts, and add monitoring dashboard

Browse files
docker-compose.yml CHANGED
@@ -72,7 +72,7 @@ services:
72
  - hopcroft-net
73
  restart: unless-stopped
74
 
75
- grafana:
76
  image: grafana/grafana:latest
77
  container_name: grafana
78
  ports:
@@ -82,7 +82,6 @@ grafana:
82
  - GF_SECURITY_ADMIN_PASSWORD=admin
83
  - GF_USERS_ALLOW_SIGN_UP=false
84
  - GF_SERVER_ROOT_URL=http://localhost:3000
85
- - GF_INSTALL_PLUGINS=grafana-piechart-panel
86
  volumes:
87
  # Provisioning: auto-configure datasources and dashboards
88
  - ./monitoring/grafana/provisioning/datasources:/etc/grafana/provisioning/datasources
@@ -101,7 +100,7 @@ grafana:
101
  timeout: 10s
102
  retries: 3
103
 
104
- pushgateway:
105
  image: prom/pushgateway:latest
106
  container_name: pushgateway
107
  ports:
 
72
  - hopcroft-net
73
  restart: unless-stopped
74
 
75
+ grafana:
76
  image: grafana/grafana:latest
77
  container_name: grafana
78
  ports:
 
82
  - GF_SECURITY_ADMIN_PASSWORD=admin
83
  - GF_USERS_ALLOW_SIGN_UP=false
84
  - GF_SERVER_ROOT_URL=http://localhost:3000
 
85
  volumes:
86
  # Provisioning: auto-configure datasources and dashboards
87
  - ./monitoring/grafana/provisioning/datasources:/etc/grafana/provisioning/datasources
 
100
  timeout: 10s
101
  retries: 3
102
 
103
+ pushgateway:
104
  image: prom/pushgateway:latest
105
  container_name: pushgateway
106
  ports:
monitoring/drift/scripts/prepare_baseline.py CHANGED
@@ -6,6 +6,7 @@ This script samples representative data from the training set.
6
  import pickle
7
  import pandas as pd
8
  import numpy as np
 
9
  from pathlib import Path
10
  from sklearn.model_selection import train_test_split
11
 
@@ -16,14 +17,21 @@ BASELINE_DIR.mkdir(parents=True, exist_ok=True)
16
 
17
 
18
  def load_training_data():
19
- """Load the original training dataset."""
20
- # Adjust path to your actual data
21
- data_path = PROJECT_ROOT / "data" / "train.csv"
22
 
23
- if not data_path.exists():
24
- raise FileNotFoundError(f"Training data not found at {data_path}")
 
 
 
 
 
 
 
 
25
 
26
- df = pd.read_csv(data_path)
27
  print(f"Loaded {len(df)} training samples")
28
  return df
29
 
@@ -61,10 +69,14 @@ def extract_features(df):
61
  Should match the features used by your model.
62
  """
63
 
64
- feature_columns = [col for col in df.columns if col not in ['label', 'id', 'timestamp']]
 
 
 
 
65
  X = df[feature_columns].values
66
 
67
- print(f"Extracted {X.shape[1]} features from {X.shape[0]} samples")
68
  return X
69
 
70
 
 
6
  import pickle
7
  import pandas as pd
8
  import numpy as np
9
+ import sqlite3
10
  from pathlib import Path
11
  from sklearn.model_selection import train_test_split
12
 
 
17
 
18
 
19
  def load_training_data():
20
+ """Load the original training dataset from SQLite database."""
21
+ # Load from SQLite database
22
+ db_path = PROJECT_ROOT / "data" / "raw" / "skillscope_data.db"
23
 
24
+ if not db_path.exists():
25
+ raise FileNotFoundError(f"Database not found at {db_path}")
26
+
27
+ print(f"Loading data from database: {db_path}")
28
+ conn = sqlite3.connect(db_path)
29
+
30
+ # Load from the main table
31
+ query = "SELECT * FROM nlbse_tool_competition_data_by_issue LIMIT 10000"
32
+ df = pd.read_sql_query(query, conn)
33
+ conn.close()
34
 
 
35
  print(f"Loaded {len(df)} training samples")
36
  return df
37
 
 
69
  Should match the features used by your model.
70
  """
71
 
72
+ # Select only numeric columns, exclude labels and IDs
73
+ numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
74
+ exclude_cols = ['label', 'id', 'timestamp', 'issue_id', 'file_id', 'method_id', 'class_id']
75
+ feature_columns = [col for col in numeric_cols if col not in exclude_cols]
76
+
77
  X = df[feature_columns].values
78
 
79
+ print(f"Extracted {X.shape[1]} numeric features from {X.shape[0]} samples")
80
  return X
81
 
82
 
monitoring/drift/scripts/run_drift_check.py CHANGED
@@ -1,5 +1,5 @@
1
  """
2
- Data Drift Detection using Alibi Detect.
3
  Detects distribution shifts between baseline and new data.
4
  """
5
 
@@ -10,7 +10,7 @@ import numpy as np
10
  import pandas as pd
11
  from pathlib import Path
12
  from datetime import datetime
13
- from alibi_detect.cd import KSDrift
14
  from typing import Dict, Tuple
15
 
16
  # Configuration
@@ -73,7 +73,7 @@ def load_new_data() -> np.ndarray:
73
 
74
  def run_drift_detection(X_baseline: np.ndarray, X_new: np.ndarray) -> Dict:
75
  """
76
- Run Kolmogorov-Smirnov drift detection.
77
 
78
  Args:
79
  X_baseline: Reference data
@@ -86,24 +86,30 @@ def run_drift_detection(X_baseline: np.ndarray, X_new: np.ndarray) -> Dict:
86
  print("Running Drift Detection (Kolmogorov-Smirnov Test)")
87
  print("=" * 60)
88
 
89
- # Initialize detector
90
- cd = KSDrift(
91
- X_baseline,
92
- p_val=P_VALUE_THRESHOLD,
93
- alternative='two-sided',
94
- correction='bonferroni' # Multiple testing correction
95
- )
96
 
97
- # Run detection
98
- preds = cd.predict(X_new)
 
 
 
 
 
 
 
 
 
 
99
 
100
  # Extract results
101
  results = {
102
  "timestamp": datetime.now().isoformat(),
103
- "drift_detected": int(preds['data']['is_drift']),
104
- "p_value": float(preds['data']['p_val']),
105
- "threshold": P_VALUE_THRESHOLD,
106
- "distance": float(preds['data']['distance']),
107
  "baseline_samples": X_baseline.shape[0],
108
  "new_samples": X_new.shape[0],
109
  "num_features": X_baseline.shape[1]
@@ -112,7 +118,7 @@ def run_drift_detection(X_baseline: np.ndarray, X_new: np.ndarray) -> Dict:
112
  # Print results
113
  print(f"\nResults:")
114
  print(f" Drift Detected: {'YES' if results['drift_detected'] else 'NO'}")
115
- print(f" P-Value: {results['p_value']:.6f} (threshold: {P_VALUE_THRESHOLD})")
116
  print(f" Distance: {results['distance']:.6f}")
117
  print(f" Baseline: {X_baseline.shape[0]} samples")
118
  print(f" New Data: {X_new.shape[0]} samples")
 
1
  """
2
+ Data Drift Detection using Scipy KS Test.
3
  Detects distribution shifts between baseline and new data.
4
  """
5
 
 
10
  import pandas as pd
11
  from pathlib import Path
12
  from datetime import datetime
13
+ from scipy.stats import ks_2samp
14
  from typing import Dict, Tuple
15
 
16
  # Configuration
 
73
 
74
  def run_drift_detection(X_baseline: np.ndarray, X_new: np.ndarray) -> Dict:
75
  """
76
+ Run Kolmogorov-Smirnov drift detection using scipy.
77
 
78
  Args:
79
  X_baseline: Reference data
 
86
  print("Running Drift Detection (Kolmogorov-Smirnov Test)")
87
  print("=" * 60)
88
 
89
+ # Run KS test for each feature
90
+ p_values = []
91
+ distances = []
 
 
 
 
92
 
93
+ for i in range(X_baseline.shape[1]):
94
+ statistic, p_value = ks_2samp(X_baseline[:, i], X_new[:, i])
95
+ p_values.append(p_value)
96
+ distances.append(statistic)
97
+
98
+ # Aggregate results
99
+ min_p_value = np.min(p_values)
100
+ max_distance = np.max(distances)
101
+
102
+ # Apply Bonferroni correction for multiple testing
103
+ adjusted_threshold = P_VALUE_THRESHOLD / X_baseline.shape[1]
104
+ drift_detected = min_p_value < adjusted_threshold
105
 
106
  # Extract results
107
  results = {
108
  "timestamp": datetime.now().isoformat(),
109
+ "drift_detected": int(drift_detected),
110
+ "p_value": float(min_p_value),
111
+ "threshold": adjusted_threshold,
112
+ "distance": float(max_distance),
113
  "baseline_samples": X_baseline.shape[0],
114
  "new_samples": X_new.shape[0],
115
  "num_features": X_baseline.shape[1]
 
118
  # Print results
119
  print(f"\nResults:")
120
  print(f" Drift Detected: {'YES' if results['drift_detected'] else 'NO'}")
121
+ print(f" P-Value: {results['p_value']:.6f} (adjusted threshold: {adjusted_threshold:.6f})")
122
  print(f" Distance: {results['distance']:.6f}")
123
  print(f" Baseline: {X_baseline.shape[0]} samples")
124
  print(f" New Data: {X_new.shape[0]} samples")
monitoring/grafana/dashboards/hopcroft_dashboard.json ADDED
@@ -0,0 +1,358 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "annotations": {
3
+ "list": [
4
+ {
5
+ "builtIn": 1,
6
+ "datasource": "-- Grafana --",
7
+ "enable": true,
8
+ "hide": true,
9
+ "iconColor": "rgba(0, 211, 255, 1)",
10
+ "name": "Annotations & Alerts",
11
+ "type": "dashboard"
12
+ }
13
+ ]
14
+ },
15
+ "editable": true,
16
+ "gnetId": null,
17
+ "graphTooltip": 1,
18
+ "id": null,
19
+ "links": [],
20
+ "panels": [
21
+ {
22
+ "datasource": "Prometheus",
23
+ "fieldConfig": {
24
+ "defaults": {
25
+ "color": {
26
+ "mode": "thresholds"
27
+ },
28
+ "mappings": [],
29
+ "thresholds": {
30
+ "mode": "absolute",
31
+ "steps": [
32
+ {
33
+ "color": "green",
34
+ "value": null
35
+ },
36
+ {
37
+ "color": "red",
38
+ "value": 80
39
+ }
40
+ ]
41
+ },
42
+ "unit": "reqps"
43
+ }
44
+ },
45
+ "gridPos": {
46
+ "h": 8,
47
+ "w": 6,
48
+ "x": 0,
49
+ "y": 0
50
+ },
51
+ "id": 1,
52
+ "options": {
53
+ "orientation": "auto",
54
+ "reduceOptions": {
55
+ "calcs": ["lastNotNull"],
56
+ "fields": "",
57
+ "values": false
58
+ },
59
+ "showThresholdLabels": false,
60
+ "showThresholdMarkers": true
61
+ },
62
+ "pluginVersion": "9.0.0",
63
+ "targets": [
64
+ {
65
+ "expr": "rate(fastapi_requests_total[1m])",
66
+ "refId": "A"
67
+ }
68
+ ],
69
+ "title": "Request Rate",
70
+ "type": "gauge",
71
+ "description": "Number of requests per second handled by the API"
72
+ },
73
+ {
74
+ "datasource": "Prometheus",
75
+ "fieldConfig": {
76
+ "defaults": {
77
+ "color": {
78
+ "mode": "palette-classic"
79
+ },
80
+ "custom": {
81
+ "axisLabel": "",
82
+ "axisPlacement": "auto",
83
+ "barAlignment": 0,
84
+ "drawStyle": "line",
85
+ "fillOpacity": 10,
86
+ "gradientMode": "none",
87
+ "hideFrom": {
88
+ "tooltip": false,
89
+ "viz": false,
90
+ "legend": false
91
+ },
92
+ "lineInterpolation": "linear",
93
+ "lineWidth": 1,
94
+ "pointSize": 5,
95
+ "scaleDistribution": {
96
+ "type": "linear"
97
+ },
98
+ "showPoints": "never",
99
+ "spanNulls": true
100
+ },
101
+ "mappings": [],
102
+ "thresholds": {
103
+ "mode": "absolute",
104
+ "steps": [
105
+ {
106
+ "color": "green",
107
+ "value": null
108
+ }
109
+ ]
110
+ },
111
+ "unit": "ms"
112
+ }
113
+ },
114
+ "gridPos": {
115
+ "h": 8,
116
+ "w": 18,
117
+ "x": 6,
118
+ "y": 0
119
+ },
120
+ "id": 2,
121
+ "options": {
122
+ "legend": {
123
+ "calcs": ["mean", "max"],
124
+ "displayMode": "table",
125
+ "placement": "right"
126
+ },
127
+ "tooltip": {
128
+ "mode": "multi"
129
+ }
130
+ },
131
+ "pluginVersion": "9.0.0",
132
+ "targets": [
133
+ {
134
+ "expr": "histogram_quantile(0.95, rate(fastapi_request_duration_seconds_bucket[5m])) * 1000",
135
+ "legendFormat": "p95",
136
+ "refId": "A"
137
+ },
138
+ {
139
+ "expr": "histogram_quantile(0.50, rate(fastapi_request_duration_seconds_bucket[5m])) * 1000",
140
+ "legendFormat": "p50 (median)",
141
+ "refId": "B"
142
+ }
143
+ ],
144
+ "title": "Request Latency (p50, p95)",
145
+ "type": "timeseries",
146
+ "description": "API response time percentiles over time"
147
+ },
148
+ {
149
+ "datasource": "Prometheus",
150
+ "fieldConfig": {
151
+ "defaults": {
152
+ "color": {
153
+ "mode": "thresholds"
154
+ },
155
+ "mappings": [
156
+ {
157
+ "options": {
158
+ "0": {
159
+ "color": "red",
160
+ "index": 1,
161
+ "text": "No Drift"
162
+ },
163
+ "1": {
164
+ "color": "green",
165
+ "index": 0,
166
+ "text": "Drift Detected"
167
+ }
168
+ },
169
+ "type": "value"
170
+ }
171
+ ],
172
+ "thresholds": {
173
+ "mode": "absolute",
174
+ "steps": [
175
+ {
176
+ "color": "green",
177
+ "value": null
178
+ }
179
+ ]
180
+ }
181
+ }
182
+ },
183
+ "gridPos": {
184
+ "h": 6,
185
+ "w": 6,
186
+ "x": 0,
187
+ "y": 8
188
+ },
189
+ "id": 3,
190
+ "options": {
191
+ "orientation": "auto",
192
+ "reduceOptions": {
193
+ "calcs": ["lastNotNull"],
194
+ "fields": "",
195
+ "values": false
196
+ },
197
+ "showThresholdLabels": false,
198
+ "showThresholdMarkers": true,
199
+ "text": {}
200
+ },
201
+ "pluginVersion": "9.0.0",
202
+ "targets": [
203
+ {
204
+ "expr": "drift_detected",
205
+ "refId": "A"
206
+ }
207
+ ],
208
+ "title": "Data Drift Status",
209
+ "type": "stat",
210
+ "description": "Current data drift detection status (1 = drift detected, 0 = no drift)"
211
+ },
212
+ {
213
+ "datasource": "Prometheus",
214
+ "fieldConfig": {
215
+ "defaults": {
216
+ "color": {
217
+ "mode": "thresholds"
218
+ },
219
+ "decimals": 4,
220
+ "mappings": [],
221
+ "thresholds": {
222
+ "mode": "absolute",
223
+ "steps": [
224
+ {
225
+ "color": "green",
226
+ "value": null
227
+ },
228
+ {
229
+ "color": "yellow",
230
+ "value": 0.01
231
+ },
232
+ {
233
+ "color": "red",
234
+ "value": 0.05
235
+ }
236
+ ]
237
+ },
238
+ "unit": "short"
239
+ }
240
+ },
241
+ "gridPos": {
242
+ "h": 6,
243
+ "w": 6,
244
+ "x": 6,
245
+ "y": 8
246
+ },
247
+ "id": 4,
248
+ "options": {
249
+ "orientation": "auto",
250
+ "reduceOptions": {
251
+ "calcs": ["lastNotNull"],
252
+ "fields": "",
253
+ "values": false
254
+ },
255
+ "showThresholdLabels": false,
256
+ "showThresholdMarkers": true,
257
+ "text": {}
258
+ },
259
+ "pluginVersion": "9.0.0",
260
+ "targets": [
261
+ {
262
+ "expr": "drift_p_value",
263
+ "refId": "A"
264
+ }
265
+ ],
266
+ "title": "Drift P-Value",
267
+ "type": "stat",
268
+ "description": "Statistical significance of detected drift (lower = more significant)"
269
+ },
270
+ {
271
+ "datasource": "Prometheus",
272
+ "fieldConfig": {
273
+ "defaults": {
274
+ "color": {
275
+ "mode": "palette-classic"
276
+ },
277
+ "custom": {
278
+ "axisLabel": "",
279
+ "axisPlacement": "auto",
280
+ "barAlignment": 0,
281
+ "drawStyle": "line",
282
+ "fillOpacity": 10,
283
+ "gradientMode": "none",
284
+ "hideFrom": {
285
+ "tooltip": false,
286
+ "viz": false,
287
+ "legend": false
288
+ },
289
+ "lineInterpolation": "linear",
290
+ "lineWidth": 1,
291
+ "pointSize": 5,
292
+ "scaleDistribution": {
293
+ "type": "linear"
294
+ },
295
+ "showPoints": "auto",
296
+ "spanNulls": false
297
+ },
298
+ "mappings": [],
299
+ "thresholds": {
300
+ "mode": "absolute",
301
+ "steps": [
302
+ {
303
+ "color": "green",
304
+ "value": null
305
+ }
306
+ ]
307
+ },
308
+ "unit": "short"
309
+ }
310
+ },
311
+ "gridPos": {
312
+ "h": 6,
313
+ "w": 12,
314
+ "x": 12,
315
+ "y": 8
316
+ },
317
+ "id": 5,
318
+ "options": {
319
+ "legend": {
320
+ "calcs": ["mean", "lastNotNull"],
321
+ "displayMode": "table",
322
+ "placement": "right"
323
+ },
324
+ "tooltip": {
325
+ "mode": "multi"
326
+ }
327
+ },
328
+ "pluginVersion": "9.0.0",
329
+ "targets": [
330
+ {
331
+ "expr": "drift_distance",
332
+ "legendFormat": "Distance",
333
+ "refId": "A"
334
+ }
335
+ ],
336
+ "title": "Drift Distance Over Time",
337
+ "type": "timeseries",
338
+ "description": "Statistical distance between baseline and current data distribution"
339
+ }
340
+ ],
341
+ "refresh": "10s",
342
+ "schemaVersion": 36,
343
+ "style": "dark",
344
+ "tags": ["hopcroft", "ml", "monitoring"],
345
+ "templating": {
346
+ "list": []
347
+ },
348
+ "time": {
349
+ "from": "now-1h",
350
+ "to": "now"
351
+ },
352
+ "timepicker": {},
353
+ "timezone": "",
354
+ "title": "Hopcroft ML Model Monitoring",
355
+ "uid": "hopcroft-ml-dashboard",
356
+ "version": 1,
357
+ "weekStart": ""
358
+ }
monitoring/grafana/provisioning/datasources/prometheus.yml CHANGED
@@ -4,6 +4,8 @@ datasources:
4
  - name: Prometheus
5
  type: prometheus
6
  access: proxy
 
 
7
  url: http://prometheus:9090
8
  isDefault: true
9
  editable: true
 
4
  - name: Prometheus
5
  type: prometheus
6
  access: proxy
7
+ uid: prometheus
8
+ orgId: 1
9
  url: http://prometheus:9090
10
  isDefault: true
11
  editable: true
monitoring/prometheus/prometheus.yml CHANGED
@@ -16,18 +16,15 @@ alerting:
16
 
17
  scrape_configs:
18
  - job_name: 'hopcroft-api'
 
19
  static_configs:
20
  - targets: ['hopcroft-api:8080']
 
 
21
  - job_name: 'prometheus'
22
  static_configs:
23
  - targets: ['localhost:9090']
24
 
25
- - job_name: 'hopcroft-api'
26
- metrics_path: '/metrics'
27
- static_configs:
28
- - targets: ['hopcroft-api:8080']
29
- scrape_interval: 10s
30
-
31
  - job_name: 'pushgateway'
32
  honor_labels: true
33
  static_configs:
 
16
 
17
  scrape_configs:
18
  - job_name: 'hopcroft-api'
19
+ metrics_path: '/metrics'
20
  static_configs:
21
  - targets: ['hopcroft-api:8080']
22
+ scrape_interval: 10s
23
+
24
  - job_name: 'prometheus'
25
  static_configs:
26
  - targets: ['localhost:9090']
27
 
 
 
 
 
 
 
28
  - job_name: 'pushgateway'
29
  honor_labels: true
30
  static_configs:
requirements.txt CHANGED
@@ -65,6 +65,4 @@ pytest-html
65
  pytest-json-report
66
 
67
  # GUI
68
- streamlit>=1.28.0
69
-
70
- alibi-detect>=0.11.4
 
65
  pytest-json-report
66
 
67
  # GUI
68
+ streamlit>=1.28.0