ardaatahan committed on
Commit 150521a · unverified · 2 Parent(s): e48caf9 4a8873b

Merge pull request #12 from argmaxinc/SW-710-add-regression-test-dashboard-check-rule-for-wer-discrepancy

.github/scripts/test_wer_regression_check.py ADDED
@@ -0,0 +1,483 @@
+ #!/usr/bin/env python3
+ """
+ Test script for WER regression detection
+ Tests all regression detection functions with synthetic and real data
+ """
+
+ import json
+ import sys
+ from wer_regression_check import (
+     detect_device_regressions,
+     detect_os_regressions,
+     detect_release_regressions,
+     detect_speed_device_regressions,
+     detect_speed_os_regressions,
+     detect_speed_release_regressions,
+     detect_tokens_device_regressions,
+     detect_tokens_os_regressions,
+     detect_tokens_release_regressions,
+     generate_slack_message,
+     load_performance_data
+ )
+
+
+ def test_wer_detection_with_synthetic_data():
+     """Test WER detection with known synthetic data"""
+     print("\n" + "="*80)
+     print("TEST 1: WER Detection with Synthetic Data")
+     print("="*80)
+
+     # Create synthetic data where we know there should be regressions
+     # Historical data (best performances)
+     historical_data = [
+         # Model A: iPhone has best WER of 10%
+         {"model": "model-a", "device": "iPhone 15", "os": "iOS 18", "average_wer": 10.0, "speed": 10.0, "tokens_per_second": 50.0},
+         {"model": "model-a", "device": "iPhone 15", "os": "iOS 18", "average_wer": 10.5, "speed": 9.5, "tokens_per_second": 48.0},
+         {"model": "model-a", "device": "iPad Pro", "os": "iOS 18", "average_wer": 10.2, "speed": 9.8, "tokens_per_second": 49.0},
+
+         # Model B: iOS 17 has best WER of 10%
+         {"model": "model-b", "device": "iPhone 15", "os": "iOS 17", "average_wer": 10.0, "speed": 20.0, "tokens_per_second": 100.0},
+         {"model": "model-b", "device": "iPhone 15", "os": "iOS 18", "average_wer": 10.5, "speed": 19.0, "tokens_per_second": 95.0},
+
+         # Model C: No regression scenario
+         {"model": "model-c", "device": "iPhone 15", "os": "iOS 18", "average_wer": 10.0, "speed": 10.0, "tokens_per_second": 50.0},
+         {"model": "model-c", "device": "iPad Pro", "os": "iOS 18", "average_wer": 10.5, "speed": 9.5, "tokens_per_second": 48.0},
+     ]
+
+     # Current data (latest release with regressions)
+     current_data = [
+         # Model A: iPad Pro has regressed to 15% WER (~47% worse than its best 10.2%)
+         {"model": "model-a", "device": "iPad Pro", "os": "iOS 18", "average_wer": 15.0, "speed": 8.0, "tokens_per_second": 40.0},
+         {"model": "model-a", "device": "iPhone 15", "os": "iOS 18", "average_wer": 10.3, "speed": 9.7, "tokens_per_second": 49.5},
+
+         # Model B: iOS 18 has regressed to 13% WER (~24% worse than its best 10.5%)
+         {"model": "model-b", "device": "iPhone 15", "os": "iOS 18", "average_wer": 13.0, "speed": 15.0, "tokens_per_second": 75.0},
+
+         # Model C: still within the 20% threshold (11% vs its best 10.5%)
+         {"model": "model-c", "device": "iPad Pro", "os": "iOS 18", "average_wer": 11.0, "speed": 9.0, "tokens_per_second": 45.0},
+     ]
+
+     # Test device regressions
+     device_regressions = detect_device_regressions(current_data, historical_data, threshold=20.0)
+     print(f"\n✓ Device WER Regressions Found: {len(device_regressions)}")
+
+     # Debug: print all found regressions
+     for r in device_regressions:
+         print(f" - {r['model']}: {r['device']} has {r['current_value']}% WER vs best {r['best_value']}% (diff: {r['percentage_diff']}%)")
+
+     # Model A should trigger (iPad Pro is ~47% worse than its historical best)
+     # Model C should NOT trigger (iPad Pro is only ~5% worse)
+     assert len(device_regressions) >= 1, f"Expected at least 1 device regression, got {len(device_regressions)}"
+
+     # Verify model-a is in the regressions
+     model_a_regressions = [r for r in device_regressions if r["model"] == "model-a"]
+     assert len(model_a_regressions) > 0, "Expected model-a to have device regression"
+     print("\n✓ Model-a correctly flagged for device regression")
+
+     # Test OS regressions
+     os_regressions = detect_os_regressions(current_data, historical_data, threshold=20.0)
+     print(f"\n✓ OS WER Regressions Found: {len(os_regressions)}")
+
+     # Debug: print all found OS regressions
+     for r in os_regressions:
+         print(f" - {r['model']}: {r['os']} has {r['current_value']}% WER vs best {r['best_value']}% (diff: {r['percentage_diff']}%)")
+
+     assert len(os_regressions) >= 1, f"Expected at least 1 OS regression, got {len(os_regressions)}"
+
+     # Verify model-b is in the regressions
+     model_b_regressions = [r for r in os_regressions if r["model"] == "model-b"]
+     assert len(model_b_regressions) > 0, "Expected model-b to have OS regression"
+     print("\n✓ Model-b correctly flagged for OS regression")
+
+     print("\n✅ TEST 1 PASSED: WER detection works correctly with synthetic data")
+     return True
+
+
+ def test_speed_detection_with_synthetic_data():
+     """Test speed detection with known synthetic data"""
+     print("\n" + "="*80)
+     print("TEST 2: Speed Detection with Synthetic Data")
+     print("="*80)
+
+     # Historical data (best performances)
+     historical_data = [
+         # Model A: iPhone has best speed of 100
+         {"model": "model-a", "device": "iPhone 15", "os": "iOS 18", "average_wer": 10.0, "speed": 100.0, "tokens_per_second": 200.0},
+         {"model": "model-a", "device": "iPhone 15", "os": "iOS 18", "average_wer": 10.5, "speed": 95.0, "tokens_per_second": 190.0},
+         {"model": "model-a", "device": "iPad Pro", "os": "iOS 18", "average_wer": 11.0, "speed": 98.0, "tokens_per_second": 195.0},
+     ]
+
+     # Current data (with speed regression)
+     current_data = [
+         # Model A: iPad Pro has regressed to 60 speed (~39% slower than its best 98)
+         {"model": "model-a", "device": "iPad Pro", "os": "iOS 18", "average_wer": 11.0, "speed": 60.0, "tokens_per_second": 120.0},
+         {"model": "model-a", "device": "iPhone 15", "os": "iOS 18", "average_wer": 10.2, "speed": 97.0, "tokens_per_second": 195.0},
+     ]
+
+     # Test device speed regressions
+     speed_device_regressions = detect_speed_device_regressions(current_data, historical_data, threshold=20.0)
+     print(f"\n✓ Device Speed Regressions Found: {len(speed_device_regressions)}")
+     assert len(speed_device_regressions) == 1, f"Expected 1 speed device regression, got {len(speed_device_regressions)}"
+     print(f" - {speed_device_regressions[0]['model']}: {speed_device_regressions[0]['device']} has {speed_device_regressions[0]['current_value']}x speed vs best {speed_device_regressions[0]['best_value']}x")
+
+     print("\n✅ TEST 2 PASSED: Speed detection works correctly with synthetic data")
+     return True
+
+
+ def test_tokens_detection_with_synthetic_data():
+     """Test tokens per second detection with known synthetic data"""
+     print("\n" + "="*80)
+     print("TEST 3: Tokens/Second Detection with Synthetic Data")
+     print("="*80)
+
+     # Historical data (best performances)
+     historical_data = [
+         # Model A: iPhone has best tokens/sec of 500
+         {"model": "model-a", "device": "iPhone 15", "os": "iOS 18", "average_wer": 10.0, "speed": 100.0, "tokens_per_second": 500.0},
+         {"model": "model-a", "device": "iPad Pro", "os": "iOS 18", "average_wer": 10.0, "speed": 98.0, "tokens_per_second": 490.0},
+     ]
+
+     # Current data (with tokens/sec regression)
+     current_data = [
+         # Model A: iPad Pro has regressed to 300 tokens/sec (~39% slower than its best 490)
+         {"model": "model-a", "device": "iPad Pro", "os": "iOS 18", "average_wer": 10.0, "speed": 80.0, "tokens_per_second": 300.0},
+         {"model": "model-a", "device": "iPhone 15", "os": "iOS 18", "average_wer": 10.1, "speed": 99.0, "tokens_per_second": 495.0},
+     ]
+
+     # Test device tokens regressions
+     tokens_device_regressions = detect_tokens_device_regressions(current_data, historical_data, threshold=20.0)
+     print(f"\n✓ Device Tokens/Sec Regressions Found: {len(tokens_device_regressions)}")
+     assert len(tokens_device_regressions) == 1, f"Expected 1 tokens device regression, got {len(tokens_device_regressions)}"
+     print(f" - {tokens_device_regressions[0]['model']}: {tokens_device_regressions[0]['device']} has {tokens_device_regressions[0]['current_value']} tokens/sec vs best {tokens_device_regressions[0]['best_value']}")
+
+     print("\n✅ TEST 3 PASSED: Tokens/sec detection works correctly with synthetic data")
+     return True
+
+
+ def test_release_regression_detection():
+     """Test release-to-release regression detection"""
+     print("\n" + "="*80)
+     print("TEST 4: Release-to-Release Regression Detection")
+     print("="*80)
+
+     # Previous release data (best performance)
+     previous_data = [
+         {"model": "model-a", "device": "iPhone 15", "os": "iOS 18", "average_wer": 10.0, "speed": 100.0, "tokens_per_second": 500.0},
+         {"model": "model-a", "device": "iPhone 15", "os": "iOS 18", "average_wer": 10.5, "speed": 95.0, "tokens_per_second": 490.0},
+     ]
+
+     # Current release data (degraded performance - 50% worse WER, 40% lower speed and tokens/sec)
+     current_data = [
+         {"model": "model-a", "device": "iPhone 15", "os": "iOS 18", "average_wer": 15.0, "speed": 60.0, "tokens_per_second": 300.0},
+     ]
+
+     # Test WER release regression
+     wer_release_regressions = detect_release_regressions(current_data, previous_data, threshold=20.0)
+     print(f"\n✓ WER Release Regressions Found: {len(wer_release_regressions)}")
+     assert len(wer_release_regressions) == 1, f"Expected 1 WER release regression, got {len(wer_release_regressions)}"
+     print(f" - {wer_release_regressions[0]['model']}: WER increased from {wer_release_regressions[0]['best_historical_value']}% to {wer_release_regressions[0]['current_value']}%")
+
+     # Test speed release regression
+     speed_release_regressions = detect_speed_release_regressions(current_data, previous_data, threshold=20.0)
+     print(f"\n✓ Speed Release Regressions Found: {len(speed_release_regressions)}")
+     assert len(speed_release_regressions) == 1, f"Expected 1 speed release regression, got {len(speed_release_regressions)}"
+     print(f" - {speed_release_regressions[0]['model']}: Speed decreased from {speed_release_regressions[0]['best_historical_value']}x to {speed_release_regressions[0]['current_value']}x")
+
+     # Test tokens release regression
+     tokens_release_regressions = detect_tokens_release_regressions(current_data, previous_data, threshold=20.0)
+     print(f"\n✓ Tokens/Sec Release Regressions Found: {len(tokens_release_regressions)}")
+     assert len(tokens_release_regressions) == 1, f"Expected 1 tokens release regression, got {len(tokens_release_regressions)}"
+     print(f" - {tokens_release_regressions[0]['model']}: Tokens/sec decreased from {tokens_release_regressions[0]['best_historical_value']} to {tokens_release_regressions[0]['current_value']}")
+
+     print("\n✅ TEST 4 PASSED: Release-to-release regression detection works correctly")
+     return True
+
+
+ def test_slack_message_generation():
+     """Test Slack message generation"""
+     print("\n" + "="*80)
+     print("TEST 5: Slack Message Generation")
+     print("="*80)
+
+     # Create sample regressions
+     sample_regressions = [
+         {
+             "type": "device_wer_discrepancy",
+             "metric": "WER",
+             "model": "test-model",
+             "device": "iPad Pro",
+             "current_value": 35.0,
+             "best_value": 25.0,
+             "best_device": "iPhone 15",
+             "best_os": "iOS 18",
+             "percentage_diff": 40.0
+         },
+         {
+             "type": "device_speed_discrepancy",
+             "metric": "Speed",
+             "model": "test-model",
+             "device": "iPad Pro",
+             "current_value": 60.0,
+             "best_value": 100.0,
+             "best_device": "iPhone 15",
+             "best_os": "iOS 18",
+             "percentage_diff": 40.0
+         }
+     ]
+
+     # Generate Slack message
+     slack_payload = generate_slack_message(sample_regressions)
+
+     assert slack_payload is not None, "Expected Slack payload to be generated"
+     assert "blocks" in slack_payload, "Expected 'blocks' in Slack payload"
+     assert len(slack_payload["blocks"]) > 0, "Expected at least one block in Slack payload"
+
+     print("\n✓ Slack Message Generated Successfully")
+     print(f" - Total blocks: {len(slack_payload['blocks'])}")
+     print("\n📧 Full Slack Message Payload:")
+     print("=" * 80)
+     print(json.dumps(slack_payload, indent=2))
+     print("=" * 80)
+
+     print("\n✅ TEST 5 PASSED: Slack message generation works correctly")
+     return True
+
+
+ def test_edge_cases():
+     """Test edge cases"""
+     print("\n" + "="*80)
+     print("TEST 6: Edge Cases")
+     print("="*80)
+
+     # Test with a single data point (should not trigger any regressions - no historical comparison)
+     single_current = [
+         {"model": "model-a", "device": "iPhone 15", "os": "iOS 18", "average_wer": 10.0, "speed": 100.0, "tokens_per_second": 500.0},
+     ]
+     empty_historical = []
+
+     device_regressions = detect_device_regressions(single_current, empty_historical, threshold=20.0)
+     assert len(device_regressions) == 0, f"Expected 0 regressions with no historical data, got {len(device_regressions)}"
+     print("✓ Single data point with no historical data handled correctly (no regressions)")
+
+     # Test with empty current data
+     empty_regressions = detect_device_regressions([], single_current, threshold=20.0)
+     assert len(empty_regressions) == 0, "Expected 0 regressions with empty current data"
+     print("✓ Empty current data handled correctly")
+
+     # Test with missing fields (tokens_per_second missing)
+     partial_historical = [
+         {"model": "model-a", "device": "iPhone 15", "os": "iOS 18", "average_wer": 10.0, "speed": 100.0},
+         {"model": "model-a", "device": "iPad Pro", "os": "iOS 18", "average_wer": 10.5, "speed": 95.0},
+     ]
+     partial_current = [
+         {"model": "model-a", "device": "iPad Pro", "os": "iOS 18", "average_wer": 30.0, "speed": 80.0},
+     ]
+
+     # Should still work for WER and speed
+     device_regressions = detect_device_regressions(partial_current, partial_historical, threshold=20.0)
+     print(f"✓ Partial data (missing tokens) handled correctly: {len(device_regressions)} WER regressions found")
+
+     # Should not crash for tokens
+     tokens_regressions = detect_tokens_device_regressions(partial_current, partial_historical, threshold=20.0)
+     assert len(tokens_regressions) == 0, "Expected 0 tokens regressions when field is missing"
+     print("✓ Missing tokens_per_second field handled gracefully")
+
+     print("\n✅ TEST 6 PASSED: Edge cases handled correctly")
+     return True
+
+
+ def test_with_real_data_sample():
+     """Test with a small sample of real data to verify calculations"""
+     print("\n" + "="*80)
+     print("TEST 7: Real Data Sample Verification")
+     print("="*80)
+
+     try:
+         # Load a sample of real data
+         real_data = load_performance_data("dashboard_data/performance_data.json")
+
+         if len(real_data) == 0:
+             print("⚠️ No real data found, skipping this test")
+             return True
+
+         print(f"✓ Loaded {len(real_data)} real data points")
+
+         # Get unique models
+         models = set(entry["model"] for entry in real_data)
+         print(f"✓ Found {len(models)} unique models")
+
+         # Split into historical (first 90%) and current (last 10%) for testing
+         split_point = int(len(real_data) * 0.9)
+         historical_data = real_data[:split_point] if split_point > 0 else real_data
+         current_data = real_data[split_point:] if split_point > 0 else real_data[:10]
+
+         # Run detection on real data
+         device_regressions = detect_device_regressions(current_data, historical_data, threshold=20.0)
+         os_regressions = detect_os_regressions(current_data, historical_data, threshold=20.0)
+         speed_device_regressions = detect_speed_device_regressions(current_data, historical_data, threshold=20.0)
+         tokens_device_regressions = detect_tokens_device_regressions(current_data, historical_data, threshold=20.0)
+
+         print("\n✓ Real Data Analysis:")
+         print(f" - WER device regressions: {len(device_regressions)}")
+         print(f" - WER OS regressions: {len(os_regressions)}")
+         print(f" - Speed device regressions: {len(speed_device_regressions)}")
+         print(f" - Tokens device regressions: {len(tokens_device_regressions)}")
+
+         # Show a few examples if any found
+         if device_regressions:
+             print("\n Example WER regression:")
+             r = device_regressions[0]
+             print(f" Model: {r['model']}")
+             print(f" Device: {r['device']} on {r['os']}")
+             print(f" Current: {r['current_value']}% WER")
+             print(f" Historical best: {r['best_value']}% WER")
+             print(f" Deviation: +{r['percentage_diff']}%")
+
+         if speed_device_regressions:
+             print("\n Example Speed regression:")
+             r = speed_device_regressions[0]
+             print(f" Model: {r['model']}")
+             print(f" Device: {r['device']} on {r['os']}")
+             print(f" Current: {r['current_value']}x speed")
+             print(f" Historical best: {r['best_value']}x speed")
+             print(f" Slower by: {r['percentage_diff']}%")
+
+         print("\n✅ TEST 7 PASSED: Real data processed successfully")
+         return True
+
+     except FileNotFoundError:
+         print("⚠️ dashboard_data/performance_data.json not found, skipping real data test")
+         return True
+     except Exception as e:
+         print(f"❌ Error processing real data: {e}")
+         return False
+
+
+ def manual_verification_helper():
+     """Print data for manual verification"""
+     print("\n" + "="*80)
+     print("MANUAL VERIFICATION HELPER")
+     print("="*80)
+
+     try:
+         real_data = load_performance_data("dashboard_data/performance_data.json")
+
+         # Pick a model to analyze in detail
+         models = {}
+         for entry in real_data:
+             model = entry["model"]
+             if model not in models:
+                 models[model] = []
+             models[model].append(entry)
+
+         # Find a model with multiple entries
+         for model_name, entries in list(models.items())[:3]:  # Check first 3 models
+             if len(entries) >= 3:
+                 print(f"\n📊 Model: {model_name}")
+                 print(f" Total data points: {len(entries)}")
+
+                 # Show WER stats
+                 wer_values = [e["average_wer"] for e in entries]
+                 print("\n WER Analysis:")
+                 print(f" - Best (min): {min(wer_values):.2f}%")
+                 print(f" - Worst (max): {max(wer_values):.2f}%")
+                 print(f" - Difference: {((max(wer_values) - min(wer_values)) / min(wer_values) * 100):.1f}%")
+
+                 # Show by device
+                 devices = {}
+                 for entry in entries:
+                     device = entry["device"]
+                     if device not in devices:
+                         devices[device] = []
+                     devices[device].append(entry["average_wer"])
+
+                 print("\n WER by Device:")
+                 for device, wers in devices.items():
+                     avg_wer = sum(wers) / len(wers)
+                     num_samples = len(wers)
+                     print(f" - {device}: {avg_wer:.2f}% avg ({num_samples} test runs)")
+
+                 # Show speed stats if available
+                 if "speed" in entries[0]:
+                     speed_values = [e["speed"] for e in entries]
+                     print("\n Speed Analysis:")
+                     print(f" - Best (max): {max(speed_values):.2f}x")
+                     print(f" - Worst (min): {min(speed_values):.2f}x")
+                     print(f" - Difference: {((max(speed_values) - min(speed_values)) / max(speed_values) * 100):.1f}%")
+
+                 break
+
+         print("\n" + "="*80)
+         print("Use the above data to manually verify regression detection logic")
+         print("="*80)
+
+     except Exception as e:
+         print(f"Could not load data for manual verification: {e}")
+
+
+ def run_all_tests():
+     """Run all tests"""
+     print("\n" + "="*80)
+     print("🧪 RUNNING ALL REGRESSION DETECTION TESTS")
+     print("="*80)
+
+     tests = [
+         ("WER Detection (Synthetic)", test_wer_detection_with_synthetic_data),
+         ("Speed Detection (Synthetic)", test_speed_detection_with_synthetic_data),
+         ("Tokens Detection (Synthetic)", test_tokens_detection_with_synthetic_data),
+         ("Release Regression Detection", test_release_regression_detection),
+         ("Slack Message Generation", test_slack_message_generation),
+         ("Edge Cases", test_edge_cases),
+         ("Real Data Sample", test_with_real_data_sample),
+     ]
+
+     passed = 0
+     failed = 0
+
+     for test_name, test_func in tests:
+         try:
+             if test_func():
+                 passed += 1
+             else:
+                 failed += 1
+                 print(f"\n❌ {test_name} FAILED")
+         except AssertionError as e:
+             failed += 1
+             print(f"\n❌ {test_name} FAILED: {e}")
+         except Exception as e:
+             failed += 1
+             print(f"\n❌ {test_name} ERROR: {e}")
+             import traceback
+             traceback.print_exc()
+
+     # Print summary
+     print("\n" + "="*80)
+     print("TEST SUMMARY")
+     print("="*80)
+     print(f"✅ Passed: {passed}/{len(tests)}")
+     print(f"❌ Failed: {failed}/{len(tests)}")
+
+     if failed == 0:
+         print("\n🎉 ALL TESTS PASSED! The implementation is working correctly.")
+         print("\nNext steps:")
+         print("1. Run the manual verification helper to spot-check real data")
+         print("2. Test in a non-production environment first")
+         print("3. Monitor the first few runs carefully")
+     else:
+         print(f"\n⚠️ {failed} test(s) failed. Please review and fix issues.")
+         return False
+
+     return True
+
+
+ if __name__ == "__main__":
+     success = run_all_tests()
+
+     # Optionally run the manual verification helper
+     print("\n" + "="*80)
+     response = input("Run manual verification helper? (y/n): ")
+     if response.lower() == 'y':
+         manual_verification_helper()
+
+     sys.exit(0 if success else 1)
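For reference, the record shape these tests assume is one JSON object per line of dashboard_data/performance_data.json. A minimal sketch follows; the field names are taken from the synthetic fixtures above and from load_performance_data, while the comments and the commit_hash value are illustrative:

    # A minimal sketch of one JSONL record, assuming only the fields the detectors read:
    sample_record = {
        "model": "model-a",            # model identifier
        "device": "iPhone 15",         # device the benchmark ran on
        "os": "iOS 18",                # OS version
        "average_wer": 10.0,           # percent; lower is better
        "speed": 10.0,                 # real-time factor; higher is better
        "tokens_per_second": 50.0,     # decoder throughput; higher is better
        "commit_hash": "<release>",    # illustrative; used to filter rows by release
    }

The script can then be run directly (python .github/scripts/test_wer_regression_check.py); it exits non-zero if any test fails.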
.github/scripts/wer_regression_check.py ADDED
@@ -0,0 +1,921 @@
+ #!/usr/bin/env python3
+ """
+ WhisperKit Performance Regression Detection Script
+
+ This script detects significant performance regressions per model by:
+ - Tracking the best (lowest) WER for each model+device and model+OS combination
+ - Tracking the best (highest) speed and tokens per second in the same way
+ - Comparing every configuration in the current release against those best baselines
+ - Alerting if any configuration deviates by more than 20%
+
+ Any regression above the threshold triggers a Slack alert.
+ """
+
+ import json
+ import os
+ import statistics
+ from collections import defaultdict
+ from typing import Dict, List, Optional
+
+
+ def load_performance_data(file_path: str, commit_hash: Optional[str] = None) -> List[Dict]:
+     """Load performance data from a JSONL file, optionally filtering by commit hash."""
+     data = []
+     try:
+         with open(file_path, "r") as f:
+             for line in f:
+                 try:
+                     item = json.loads(line.strip())
+                     if commit_hash is None or item.get("commit_hash") == commit_hash:
+                         data.append(item)
+                 except json.JSONDecodeError:
+                     continue
+     except FileNotFoundError:
+         print(f"Warning: Performance data file not found: {file_path}")
+         return []
+
+     return data
+
+
+ def calculate_wer_statistics(wer_values: List[float]) -> Dict[str, float]:
+     """Calculate WER statistics for a list of values."""
+     if not wer_values:
+         return {"mean": 0, "median": 0, "min": 0, "max": 0, "std": 0}
+
+     return {
+         "mean": statistics.mean(wer_values),
+         "median": statistics.median(wer_values),
+         "min": min(wer_values),
+         "max": max(wer_values),
+         "std": statistics.stdev(wer_values) if len(wer_values) > 1 else 0
+     }
+
+
+ def detect_device_regressions(current_data: List[Dict], all_historical_data: List[Dict], threshold: float = 20.0) -> List[Dict]:
+     """
+     Detect WER regressions for devices in the current release.
+     Compares current data points against the historical best for each model+device combination.
+     Returns a list of regression alerts.
+     """
+     regressions = []
+
+     # Build the historical best WER for each model+device combination
+     historical_best = {}
+     best_configs = {}
+     for entry in all_historical_data:
+         key = (entry["model"], entry["device"])
+         if key not in historical_best:
+             historical_best[key] = entry["average_wer"]
+             best_configs[key] = entry
+         elif entry["average_wer"] < historical_best[key]:
+             historical_best[key] = entry["average_wer"]
+             best_configs[key] = entry
+
+     # Check each current data point against the historical best
+     for entry in current_data:
+         key = (entry["model"], entry["device"])
+
+         if key not in historical_best:
+             continue  # No historical data for this combination
+
+         best_wer = historical_best[key]
+         best_config = best_configs[key]
+         current_wer = entry["average_wer"]
+
+         if best_wer > 0:  # Avoid division by zero
+             pct_diff = (current_wer - best_wer) / best_wer * 100
+
+             # Only flag if current is significantly worse than the historical best
+             if pct_diff > threshold:
+                 regressions.append({
+                     "type": "device_wer_discrepancy",
+                     "metric": "WER",
+                     "model": entry["model"],
+                     "device": entry["device"],
+                     "os": entry["os"],
+                     "current_value": round(current_wer, 2),
+                     "best_value": round(best_wer, 2),
+                     "best_device": best_config["device"],
+                     "best_os": best_config["os"],
+                     "percentage_diff": round(pct_diff, 1)
+                 })
+
+     return regressions
+
+
+ def detect_os_regressions(current_data: List[Dict], all_historical_data: List[Dict], threshold: float = 20.0) -> List[Dict]:
+     """
+     Detect WER regressions for OS versions in the current release.
+     Compares current data points against the historical best for each model+OS combination.
+     Returns a list of regression alerts.
+     """
+     regressions = []
+
+     # Build the historical best WER for each model+OS combination
+     historical_best = {}
+     best_configs = {}
+     for entry in all_historical_data:
+         key = (entry["model"], entry["os"])
+         if key not in historical_best:
+             historical_best[key] = entry["average_wer"]
+             best_configs[key] = entry
+         elif entry["average_wer"] < historical_best[key]:
+             historical_best[key] = entry["average_wer"]
+             best_configs[key] = entry
+
+     # Check each current data point against the historical best
+     for entry in current_data:
+         key = (entry["model"], entry["os"])
+
+         if key not in historical_best:
+             continue  # No historical data for this combination
+
+         best_wer = historical_best[key]
+         best_config = best_configs[key]
+         current_wer = entry["average_wer"]
+
+         if best_wer > 0:  # Avoid division by zero
+             pct_diff = (current_wer - best_wer) / best_wer * 100
+
+             # Only flag if current is significantly worse than the historical best
+             if pct_diff > threshold:
+                 regressions.append({
+                     "type": "os_wer_discrepancy",
+                     "metric": "WER",
+                     "model": entry["model"],
+                     "device": entry["device"],
+                     "os": entry["os"],
+                     "current_value": round(current_wer, 2),
+                     "best_value": round(best_wer, 2),
+                     "best_device": best_config["device"],
+                     "best_os": best_config["os"],
+                     "percentage_diff": round(pct_diff, 1)
+                 })
+
+     return regressions
+
+
+ def detect_release_regressions(current_data: List[Dict], previous_data: List[Dict],
+                                threshold: float = 20.0) -> List[Dict]:
+     """
+     Detect WER regressions in the current release for each model.
+     Compares current WER against the best (lowest) WER from the previous release.
+     Returns a list of regression alerts.
+     """
+     regressions = []
+
+     if not previous_data:
+         print("No previous release data available for comparison")
+         return regressions
+
+     # Use the previous release's data as the historical baseline
+     all_historical = previous_data
+
+     # Group by model
+     model_current = defaultdict(list)
+     model_historical = defaultdict(list)
+
+     for entry in current_data:
+         model_current[entry["model"]].append(entry)
+
+     for entry in all_historical:
+         model_historical[entry["model"]].append(entry)
+
+     # Check each model
+     for model in model_current.keys():
+         if model not in model_historical:
+             continue  # No historical data for this model
+
+         # Find the best historical WER for this model
+         best_historical_wer = min(entry["average_wer"] for entry in model_historical[model])
+         best_config = next(e for e in model_historical[model] if e["average_wer"] == best_historical_wer)
+
+         # Check each current configuration against the best historical
+         for current_entry in model_current[model]:
+             current_wer = current_entry["average_wer"]
+
+             if best_historical_wer > 0:  # Avoid division by zero
+                 pct_change = (current_wer - best_historical_wer) / best_historical_wer * 100
+
+                 # Only flag significant WER increases (regressions)
+                 if pct_change > threshold:
+                     regressions.append({
+                         "type": "release_wer_regression",
+                         "metric": "WER",
+                         "model": model,
+                         "device": current_entry["device"],
+                         "os": current_entry["os"],
+                         "current_value": round(current_wer, 2),
+                         "best_historical_value": round(best_historical_wer, 2),
+                         "best_device": best_config["device"],
+                         "best_os": best_config["os"],
+                         "percentage_increase": round(pct_change, 1)
+                     })
+
+     return regressions
+
+
+ def detect_speed_device_regressions(current_data: List[Dict], all_historical_data: List[Dict], threshold: float = 20.0) -> List[Dict]:
+     """
+     Detect speed regressions for devices in the current release.
+     Compares current data points against the historical best for each model+device combination.
+     Returns a list of regression alerts.
+     """
+     regressions = []
+
+     # Build the historical best speed for each model+device combination
+     historical_best = {}
+     best_configs = {}
+     for entry in all_historical_data:
+         if "speed" not in entry:
+             continue
+         key = (entry["model"], entry["device"])
+         if key not in historical_best:
+             historical_best[key] = entry["speed"]
+             best_configs[key] = entry
+         elif entry["speed"] > historical_best[key]:
+             historical_best[key] = entry["speed"]
+             best_configs[key] = entry
+
+     # Check each current data point against the historical best
+     for entry in current_data:
+         if "speed" not in entry:
+             continue
+
+         key = (entry["model"], entry["device"])
+
+         if key not in historical_best:
+             continue  # No historical data for this combination
+
+         best_speed = historical_best[key]
+         best_config = best_configs[key]
+         current_speed = entry["speed"]
+
+         if best_speed > 0:  # Avoid division by zero
+             pct_diff = (best_speed - current_speed) / best_speed * 100
+
+             # Only flag if current is significantly slower than the historical best
+             if pct_diff > threshold:
+                 regressions.append({
+                     "type": "device_speed_discrepancy",
+                     "metric": "Speed",
+                     "model": entry["model"],
+                     "device": entry["device"],
+                     "os": entry["os"],
+                     "current_value": round(current_speed, 2),
+                     "best_value": round(best_speed, 2),
+                     "best_device": best_config["device"],
+                     "best_os": best_config["os"],
+                     "percentage_diff": round(pct_diff, 1)
+                 })
+
+     return regressions
+
+
+ def detect_speed_os_regressions(current_data: List[Dict], all_historical_data: List[Dict], threshold: float = 20.0) -> List[Dict]:
+     """
+     Detect speed regressions for OS versions in the current release.
+     Compares current data points against the historical best for each model+OS combination.
+     Returns a list of regression alerts.
+     """
+     regressions = []
+
+     # Build the historical best speed for each model+OS combination
+     historical_best = {}
+     best_configs = {}
+     for entry in all_historical_data:
+         if "speed" not in entry:
+             continue
+         key = (entry["model"], entry["os"])
+         if key not in historical_best:
+             historical_best[key] = entry["speed"]
+             best_configs[key] = entry
+         elif entry["speed"] > historical_best[key]:
+             historical_best[key] = entry["speed"]
+             best_configs[key] = entry
+
+     # Check each current data point against the historical best
+     for entry in current_data:
+         if "speed" not in entry:
+             continue
+
+         key = (entry["model"], entry["os"])
+
+         if key not in historical_best:
+             continue  # No historical data for this combination
+
+         best_speed = historical_best[key]
+         best_config = best_configs[key]
+         current_speed = entry["speed"]
+
+         if best_speed > 0:  # Avoid division by zero
+             pct_diff = (best_speed - current_speed) / best_speed * 100
+
+             # Only flag if current is significantly slower than the historical best
+             if pct_diff > threshold:
+                 regressions.append({
+                     "type": "os_speed_discrepancy",
+                     "metric": "Speed",
+                     "model": entry["model"],
+                     "device": entry["device"],
+                     "os": entry["os"],
+                     "current_value": round(current_speed, 2),
+                     "best_value": round(best_speed, 2),
+                     "best_device": best_config["device"],
+                     "best_os": best_config["os"],
+                     "percentage_diff": round(pct_diff, 1)
+                 })
+
+     return regressions
+
+
+ def detect_speed_release_regressions(current_data: List[Dict], previous_data: List[Dict],
+                                      threshold: float = 20.0) -> List[Dict]:
+     """
+     Detect speed regressions in the current release for each model.
+     Compares current speed against the best (highest) speed from the previous release.
+     Returns a list of regression alerts.
+     """
+     regressions = []
+
+     if not previous_data:
+         return regressions
+
+     # Group by model
+     model_current = defaultdict(list)
+     model_historical = defaultdict(list)
+
+     for entry in current_data:
+         if "speed" in entry:
+             model_current[entry["model"]].append(entry)
+
+     for entry in previous_data:
+         if "speed" in entry:
+             model_historical[entry["model"]].append(entry)
+
+     # Check each model
+     for model in model_current.keys():
+         if model not in model_historical:
+             continue  # No historical data for this model
+
+         # Find the best historical speed for this model
+         best_historical_speed = max(entry["speed"] for entry in model_historical[model])
+         best_config = next(e for e in model_historical[model] if e["speed"] == best_historical_speed)
+
+         # Check each current configuration against the best historical
+         for current_entry in model_current[model]:
+             current_speed = current_entry["speed"]
+
+             if best_historical_speed > 0:  # Avoid division by zero
+                 pct_change = (best_historical_speed - current_speed) / best_historical_speed * 100
+
+                 # Only flag significant speed decreases (regressions)
+                 if pct_change > threshold:
+                     regressions.append({
+                         "type": "release_speed_regression",
+                         "metric": "Speed",
+                         "model": model,
+                         "device": current_entry["device"],
+                         "os": current_entry["os"],
+                         "current_value": round(current_speed, 2),
+                         "best_historical_value": round(best_historical_speed, 2),
+                         "best_device": best_config["device"],
+                         "best_os": best_config["os"],
+                         "percentage_decrease": round(pct_change, 1)
+                     })
+
+     return regressions
+
+
+ def detect_tokens_device_regressions(current_data: List[Dict], all_historical_data: List[Dict], threshold: float = 20.0) -> List[Dict]:
+     """
+     Detect tokens per second regressions for devices in the current release.
+     Compares current data points against the historical best for each model+device combination.
+     Returns a list of regression alerts.
+     """
+     regressions = []
+
+     # Build the historical best tokens/sec for each model+device combination
+     historical_best = {}
+     best_configs = {}
+     for entry in all_historical_data:
+         if "tokens_per_second" not in entry:
+             continue
+         key = (entry["model"], entry["device"])
+         if key not in historical_best:
+             historical_best[key] = entry["tokens_per_second"]
+             best_configs[key] = entry
+         elif entry["tokens_per_second"] > historical_best[key]:
+             historical_best[key] = entry["tokens_per_second"]
+             best_configs[key] = entry
+
+     # Check each current data point against the historical best
+     for entry in current_data:
+         if "tokens_per_second" not in entry:
+             continue
+
+         key = (entry["model"], entry["device"])
+
+         if key not in historical_best:
+             continue  # No historical data for this combination
+
+         best_tokens = historical_best[key]
+         best_config = best_configs[key]
+         current_tokens = entry["tokens_per_second"]
+
+         if best_tokens > 0:  # Avoid division by zero
+             pct_diff = (best_tokens - current_tokens) / best_tokens * 100
+
+             # Only flag if current is significantly slower than the historical best
+             if pct_diff > threshold:
+                 regressions.append({
+                     "type": "device_tokens_discrepancy",
+                     "metric": "Tokens/Second",
+                     "model": entry["model"],
+                     "device": entry["device"],
+                     "os": entry["os"],
+                     "current_value": round(current_tokens, 2),
+                     "best_value": round(best_tokens, 2),
+                     "best_device": best_config["device"],
+                     "best_os": best_config["os"],
+                     "percentage_diff": round(pct_diff, 1)
+                 })
+
+     return regressions
+
+
+ def detect_tokens_os_regressions(current_data: List[Dict], all_historical_data: List[Dict], threshold: float = 20.0) -> List[Dict]:
+     """
+     Detect tokens per second regressions for OS versions in the current release.
+     Compares current data points against the historical best for each model+OS combination.
+     Returns a list of regression alerts.
+     """
+     regressions = []
+
+     # Build the historical best tokens/sec for each model+OS combination
+     historical_best = {}
+     best_configs = {}
+     for entry in all_historical_data:
+         if "tokens_per_second" not in entry:
+             continue
+         key = (entry["model"], entry["os"])
+         if key not in historical_best:
+             historical_best[key] = entry["tokens_per_second"]
+             best_configs[key] = entry
+         elif entry["tokens_per_second"] > historical_best[key]:
+             historical_best[key] = entry["tokens_per_second"]
+             best_configs[key] = entry
+
+     # Check each current data point against the historical best
+     for entry in current_data:
+         if "tokens_per_second" not in entry:
+             continue
+
+         key = (entry["model"], entry["os"])
+
+         if key not in historical_best:
+             continue  # No historical data for this combination
+
+         best_tokens = historical_best[key]
+         best_config = best_configs[key]
+         current_tokens = entry["tokens_per_second"]
+
+         if best_tokens > 0:  # Avoid division by zero
+             pct_diff = (best_tokens - current_tokens) / best_tokens * 100
+
+             # Only flag if current is significantly slower than the historical best
+             if pct_diff > threshold:
+                 regressions.append({
+                     "type": "os_tokens_discrepancy",
+                     "metric": "Tokens/Second",
+                     "model": entry["model"],
+                     "device": entry["device"],
+                     "os": entry["os"],
+                     "current_value": round(current_tokens, 2),
+                     "best_value": round(best_tokens, 2),
+                     "best_device": best_config["device"],
+                     "best_os": best_config["os"],
+                     "percentage_diff": round(pct_diff, 1)
+                 })
+
+     return regressions
+
+
+ def detect_tokens_release_regressions(current_data: List[Dict], previous_data: List[Dict],
+                                       threshold: float = 20.0) -> List[Dict]:
+     """
+     Detect tokens per second regressions in the current release for each model.
+     Compares current tokens/sec against the best (highest) tokens/sec from the previous release.
+     Returns a list of regression alerts.
+     """
+     regressions = []
+
+     if not previous_data:
+         return regressions
+
+     # Group by model
+     model_current = defaultdict(list)
+     model_historical = defaultdict(list)
+
+     for entry in current_data:
+         if "tokens_per_second" in entry:
+             model_current[entry["model"]].append(entry)
+
+     for entry in previous_data:
+         if "tokens_per_second" in entry:
+             model_historical[entry["model"]].append(entry)
+
+     # Check each model
+     for model in model_current.keys():
+         if model not in model_historical:
+             continue  # No historical data for this model
+
+         # Find the best historical tokens/sec for this model
+         best_historical_tokens = max(entry["tokens_per_second"] for entry in model_historical[model])
+         best_config = next(e for e in model_historical[model] if e["tokens_per_second"] == best_historical_tokens)
+
+         # Check each current configuration against the best historical
+         for current_entry in model_current[model]:
+             current_tokens = current_entry["tokens_per_second"]
+
+             if best_historical_tokens > 0:  # Avoid division by zero
+                 pct_change = (best_historical_tokens - current_tokens) / best_historical_tokens * 100
+
+                 # Only flag significant tokens/sec decreases (regressions)
+                 if pct_change > threshold:
+                     regressions.append({
+                         "type": "release_tokens_regression",
+                         "metric": "Tokens/Second",
+                         "model": model,
+                         "device": current_entry["device"],
+                         "os": current_entry["os"],
+                         "current_value": round(current_tokens, 2),
+                         "best_historical_value": round(best_historical_tokens, 2),
+                         "best_device": best_config["device"],
+                         "best_os": best_config["os"],
+                         "percentage_decrease": round(pct_change, 1)
+                     })
+
+     return regressions
+
+
+ def generate_slack_message(regressions: List[Dict]) -> Optional[Dict]:
+     """Generate a Slack message payload for performance regression alerts."""
+
+     if not regressions:
+         return None
+
+     blocks = [
+         {
+             "type": "header",
+             "text": {
+                 "type": "plain_text",
+                 "text": "⚠️ WhisperKit Performance Regression Alert",
+                 "emoji": True
+             }
+         },
+         {
+             "type": "context",
+             "elements": [
+                 {
+                     "type": "mrkdwn",
+                     "text": f"*Detected {len(regressions)} significant performance regression(s)*"
+                 }
+             ]
+         },
+         {"type": "divider"}
+     ]
+
+     # Group regressions by type
+     wer_device = [r for r in regressions if r["type"] == "device_wer_discrepancy"]
+     wer_os = [r for r in regressions if r["type"] == "os_wer_discrepancy"]
+     wer_release = [r for r in regressions if r["type"] == "release_wer_regression"]
+
+     speed_device = [r for r in regressions if r["type"] == "device_speed_discrepancy"]
+     speed_os = [r for r in regressions if r["type"] == "os_speed_discrepancy"]
+     speed_release = [r for r in regressions if r["type"] == "release_speed_regression"]
+
+     tokens_device = [r for r in regressions if r["type"] == "device_tokens_discrepancy"]
+     tokens_os = [r for r in regressions if r["type"] == "os_tokens_discrepancy"]
+     tokens_release = [r for r in regressions if r["type"] == "release_tokens_regression"]
+
+     # WER Regressions
+     if wer_device:
+         blocks.append({
+             "type": "section",
+             "text": {
+                 "type": "mrkdwn",
+                 "text": "*WER Device Discrepancies:*"
+             }
+         })
+
+         for regression in wer_device:
+             blocks.append({
+                 "type": "section",
+                 "text": {
+                     "type": "mrkdwn",
+                     "text": f"*{regression['model']}*\n"
+                             f"• {regression['device']}: {regression['current_value']}% WER\n"
+                             f"• Best: {regression['best_value']}% WER ({regression['best_device']} on {regression['best_os']})\n"
+                             f"• Deviation: +{regression['percentage_diff']}%"
+                 }
+             })
+
+     if wer_os:
+         if wer_device:
+             blocks.append({"type": "divider"})
+
+         blocks.append({
+             "type": "section",
+             "text": {
+                 "type": "mrkdwn",
+                 "text": "*WER OS Version Discrepancies:*"
+             }
+         })
+
+         for regression in wer_os:
+             blocks.append({
+                 "type": "section",
+                 "text": {
+                     "type": "mrkdwn",
+                     "text": f"*{regression['model']}*\n"
+                             f"• {regression['os']}: {regression['current_value']}% WER\n"
+                             f"• Best: {regression['best_value']}% WER ({regression['best_device']} on {regression['best_os']})\n"
+                             f"• Deviation: +{regression['percentage_diff']}%"
+                 }
+             })
+
+     if wer_release:
+         if wer_device or wer_os:
+             blocks.append({"type": "divider"})
+
+         blocks.append({
+             "type": "section",
+             "text": {
+                 "type": "mrkdwn",
+                 "text": "*WER Release-to-Release Regressions:*"
+             }
+         })
+
+         for regression in wer_release:
+             blocks.append({
+                 "type": "section",
+                 "text": {
+                     "type": "mrkdwn",
+                     "text": f"*{regression['model']}* on {regression['device']} ({regression['os']})\n"
+                             f"• Current: {regression['current_value']}% WER\n"
+                             f"• Best Historical: {regression['best_historical_value']}% WER ({regression['best_device']} on {regression['best_os']})\n"
+                             f"• Increase: +{regression['percentage_increase']}%"
+                 }
+             })
+
+     # Speed Regressions
+     if speed_device:
+         if wer_device or wer_os or wer_release:
+             blocks.append({"type": "divider"})
+
+         blocks.append({
+             "type": "section",
+             "text": {
+                 "type": "mrkdwn",
+                 "text": "*Speed Device Discrepancies:*"
+             }
+         })
+
+         for regression in speed_device:
+             blocks.append({
+                 "type": "section",
+                 "text": {
+                     "type": "mrkdwn",
+                     "text": f"*{regression['model']}*\n"
+                             f"• {regression['device']}: {regression['current_value']}x speed\n"
+                             f"• Best: {regression['best_value']}x speed ({regression['best_device']} on {regression['best_os']})\n"
+                             f"• Slower by: {regression['percentage_diff']}%"
+                 }
+             })
+
+     if speed_os:
+         if any([wer_device, wer_os, wer_release, speed_device]):
+             blocks.append({"type": "divider"})
+
+         blocks.append({
+             "type": "section",
+             "text": {
+                 "type": "mrkdwn",
+                 "text": "*Speed OS Version Discrepancies:*"
+             }
+         })
+
+         for regression in speed_os:
+             blocks.append({
+                 "type": "section",
+                 "text": {
+                     "type": "mrkdwn",
+                     "text": f"*{regression['model']}*\n"
+                             f"• {regression['os']}: {regression['current_value']}x speed\n"
+                             f"• Best: {regression['best_value']}x speed ({regression['best_device']} on {regression['best_os']})\n"
+                             f"• Slower by: {regression['percentage_diff']}%"
+                 }
+             })
+
+     if speed_release:
+         if any([wer_device, wer_os, wer_release, speed_device, speed_os]):
+             blocks.append({"type": "divider"})
+
+         blocks.append({
+             "type": "section",
+             "text": {
+                 "type": "mrkdwn",
+                 "text": "*Speed Release-to-Release Regressions:*"
+             }
+         })
+
+         for regression in speed_release:
+             blocks.append({
+                 "type": "section",
+                 "text": {
+                     "type": "mrkdwn",
+                     "text": f"*{regression['model']}* on {regression['device']} ({regression['os']})\n"
+                             f"• Current: {regression['current_value']}x speed\n"
+                             f"• Best Historical: {regression['best_historical_value']}x speed ({regression['best_device']} on {regression['best_os']})\n"
+                             f"• Slower by: {regression.get('percentage_decrease', regression.get('percentage_increase', 0))}%"
+                 }
+             })
+
+     # Tokens Per Second Regressions
+     if tokens_device:
+         if any([wer_device, wer_os, wer_release, speed_device, speed_os, speed_release]):
+             blocks.append({"type": "divider"})
+
+         blocks.append({
+             "type": "section",
+             "text": {
+                 "type": "mrkdwn",
+                 "text": "*Tokens/Second Device Discrepancies:*"
+             }
+         })
+
+         for regression in tokens_device:
+             blocks.append({
+                 "type": "section",
+                 "text": {
+                     "type": "mrkdwn",
+                     "text": f"*{regression['model']}*\n"
+                             f"• {regression['device']}: {regression['current_value']} tokens/sec\n"
+                             f"• Best: {regression['best_value']} tokens/sec ({regression['best_device']} on {regression['best_os']})\n"
+                             f"• Slower by: {regression['percentage_diff']}%"
+                 }
+             })
+
+     if tokens_os:
+         if any([wer_device, wer_os, wer_release, speed_device, speed_os, speed_release, tokens_device]):
+             blocks.append({"type": "divider"})
+
+         blocks.append({
+             "type": "section",
+             "text": {
+                 "type": "mrkdwn",
+                 "text": "*Tokens/Second OS Version Discrepancies:*"
+             }
+         })
+
+         for regression in tokens_os:
+             blocks.append({
+                 "type": "section",
+                 "text": {
+                     "type": "mrkdwn",
+                     "text": f"*{regression['model']}*\n"
+                             f"• {regression['os']}: {regression['current_value']} tokens/sec\n"
+                             f"• Best: {regression['best_value']} tokens/sec ({regression['best_device']} on {regression['best_os']})\n"
+                             f"• Slower by: {regression['percentage_diff']}%"
+                 }
+             })
+
+     if tokens_release:
+         if any([wer_device, wer_os, wer_release, speed_device, speed_os, speed_release, tokens_device, tokens_os]):
+             blocks.append({"type": "divider"})
+
+         blocks.append({
+             "type": "section",
+             "text": {
+                 "type": "mrkdwn",
+                 "text": "*Tokens/Second Release-to-Release Regressions:*"
+             }
+         })
+
+         for regression in tokens_release:
+             blocks.append({
+                 "type": "section",
+                 "text": {
+                     "type": "mrkdwn",
+                     "text": f"*{regression['model']}* on {regression['device']} ({regression['os']})\n"
+                             f"• Current: {regression['current_value']} tokens/sec\n"
+                             f"• Best Historical: {regression['best_historical_value']} tokens/sec ({regression['best_device']} on {regression['best_os']})\n"
+                             f"• Slower by: {regression.get('percentage_decrease', regression.get('percentage_increase', 0))}%"
+                 }
+             })
+
+     return {"blocks": blocks}
+
+
+ def check_performance_regressions():
+     """Main function to check for performance regressions and generate alerts."""
+
+     # Load version data to get commit hashes
+     try:
+         with open("dashboard_data/version.json", "r") as f:
+             version_data = json.load(f)
+     except FileNotFoundError:
+         print("Error: version.json not found")
+         return
+
+     releases = version_data.get("releases", [])
+     if len(releases) < 1:
+         print("Not enough release data for comparison")
+         return
+
+     # Get the current and previous commit hashes
+     current_commit = releases[-1] if releases else None
+     previous_commit = releases[-2] if len(releases) >= 2 else None
+
+     print(f"Checking performance regressions for current commit: {current_commit}")
+     if previous_commit:
+         print(f"Comparing against previous commit: {previous_commit}")
+
+     # Load performance data - get all historical data for cross-version analysis
+     all_historical_data = load_performance_data("dashboard_data/performance_data.json")
+     current_data = load_performance_data("dashboard_data/performance_data.json", current_commit)
+     previous_data = load_performance_data("dashboard_data/performance_data.json", previous_commit) if previous_commit else []
+
+     print(f"Loaded {len(current_data)} current data points, {len(previous_data)} previous data points")
+     print(f"Loaded {len(all_historical_data)} total historical data points for cross-version analysis")
+
+     all_regressions = []
+
+     # WER Checks
+     print("\n=== Checking WER Regressions ===")
+     device_regressions = detect_device_regressions(current_data, all_historical_data, threshold=20.0)
+     all_regressions.extend(device_regressions)
+     print(f"Found {len(device_regressions)} WER device discrepancies")
+
+     os_regressions = detect_os_regressions(current_data, all_historical_data, threshold=20.0)
+     all_regressions.extend(os_regressions)
+     print(f"Found {len(os_regressions)} WER OS discrepancies")
+
+     release_regressions = detect_release_regressions(current_data, previous_data, threshold=20.0)
+     all_regressions.extend(release_regressions)
+     print(f"Found {len(release_regressions)} WER release regressions")
+
+     # Speed Checks
+     print("\n=== Checking Speed Regressions ===")
+     speed_device_regressions = detect_speed_device_regressions(current_data, all_historical_data, threshold=20.0)
+     all_regressions.extend(speed_device_regressions)
+     print(f"Found {len(speed_device_regressions)} speed device discrepancies")
+
+     speed_os_regressions = detect_speed_os_regressions(current_data, all_historical_data, threshold=20.0)
+     all_regressions.extend(speed_os_regressions)
+     print(f"Found {len(speed_os_regressions)} speed OS discrepancies")
+
+     speed_release_regressions = detect_speed_release_regressions(current_data, previous_data, threshold=20.0)
+     all_regressions.extend(speed_release_regressions)
+     print(f"Found {len(speed_release_regressions)} speed release regressions")
+
+     # Tokens Per Second Checks
+     print("\n=== Checking Tokens/Second Regressions ===")
+     tokens_device_regressions = detect_tokens_device_regressions(current_data, all_historical_data, threshold=20.0)
+     all_regressions.extend(tokens_device_regressions)
+     print(f"Found {len(tokens_device_regressions)} tokens/sec device discrepancies")
+
+     tokens_os_regressions = detect_tokens_os_regressions(current_data, all_historical_data, threshold=20.0)
+     all_regressions.extend(tokens_os_regressions)
+     print(f"Found {len(tokens_os_regressions)} tokens/sec OS discrepancies")
+
+     tokens_release_regressions = detect_tokens_release_regressions(current_data, previous_data, threshold=20.0)
+     all_regressions.extend(tokens_release_regressions)
+     print(f"Found {len(tokens_release_regressions)} tokens/sec release regressions")
+
+     # Generate outputs
+     github_output = os.getenv("GITHUB_OUTPUT")
+     if github_output:
+         with open(github_output, "a") as f:
+             print(f"has_performance_regressions={'true' if all_regressions else 'false'}", file=f)
+             print(f"performance_regression_count={len(all_regressions)}", file=f)
+
+             if all_regressions:
+                 slack_payload = generate_slack_message(all_regressions)
+                 if slack_payload:
+                     f.write("performance_regression_slack_payload<<EOF\n")
+                     json.dump(slack_payload, f, indent=2)
+                     f.write("\nEOF\n")
+
+     # Print summary for debugging
+     if all_regressions:
+         print(f"\n⚠️ ALERT: Found {len(all_regressions)} performance regressions!")
+         for regression in all_regressions:
+             print(f" - {regression['type']}: {regression.get('model', 'N/A')}")
+     else:
+         print("\n✅ No significant performance regressions detected")
+
+
+ if __name__ == "__main__":
+     check_performance_regressions()
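As a quick sanity check of the shared 20% rule, the two directions of the percentage computation can be worked through by hand. This mirrors the arithmetic in the detectors above, with values reused from the synthetic tests:

    # WER: higher is worse, so the deviation is measured relative to the best (lowest) value
    best_wer, current_wer = 10.2, 15.0
    pct_diff = (current_wer - best_wer) / best_wer * 100        # ≈ 47.1 > 20 → flagged

    # Speed and tokens/sec: lower is worse, so the difference is taken the other way around
    best_speed, current_speed = 98.0, 60.0
    pct_diff = (best_speed - current_speed) / best_speed * 100  # ≈ 38.8 > 20 → flagged

Note that both formulas normalize by the best value, so a WER that doubles reads as +100% while a speed that halves reads as -50%.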
.github/workflows/dataset_update.yml CHANGED
@@ -146,3 +146,18 @@ jobs:
             }
         env:
           SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
+
+      - name: Check for WER Regressions
+        if: steps.check.outputs.has_updates == 'true'
+        id: wer_check
+        run: python .github/scripts/wer_regression_check.py
+
+      - name: Alert WER Regressions
+        if: steps.check.outputs.has_updates == 'true' && steps.wer_check.outputs.has_performance_regressions == 'true'
+        uses: slackapi/slack-github-action@v1.27.0
+        with:
+          channel-id: ${{ secrets.SLACK_CHANNEL_ID }}
+          payload: |
+            ${{ steps.wer_check.outputs.performance_regression_slack_payload }}
+        env:
+          SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
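For context on how the two pieces connect: check_performance_regressions() appends its step outputs to the file named by GITHUB_OUTPUT, using the multiline heredoc form below for the JSON payload, which is what the payload: | interpolation in the Alert step consumes. A sketch, with illustrative count and blocks:

    has_performance_regressions=true
    performance_regression_count=<count>
    performance_regression_slack_payload<<EOF
    {
      "blocks": [ ... ]
    }
    EOF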