Nyk commited on
Commit
58cb911
·
1 Parent(s): 687f7e3

fix(workload): harden signal recommendations and add route e2e coverage

Browse files
README.md CHANGED
@@ -418,6 +418,32 @@ pnpm test:e2e # Playwright E2E
418
  pnpm quality:gate # All checks
419
  ```
420
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
421
  ## Roadmap
422
 
423
  See [open issues](https://github.com/builderz-labs/mission-control/issues) for planned work and the [v1.0.0 release notes](https://github.com/builderz-labs/mission-control/releases/tag/v1.0.0) for what shipped.
 
418
  pnpm quality:gate # All checks
419
  ```
420
 
421
+ ## Workload Signals Contract
422
+
423
+ `GET /api/workload` returns a workload snapshot and one recommendation:
424
+
425
+ - `normal`: system healthy, submit freely
426
+ - `throttle`: reduce submission rate / defer non-critical work
427
+ - `shed`: submit only critical work
428
+ - `pause`: hold submissions until capacity returns
429
+
430
+ Low-signal behavior:
431
+
432
+ - `capacity.error_rate_5m` is clamped to `[0,1]`
433
+ - `queue.estimated_wait_confidence` is `calculated` or `unknown`
434
+ - queue breakdown maps include stable keys even when counts are zero
435
+
436
+ Runtime-tunable thresholds:
437
+
438
+ - `MC_WORKLOAD_QUEUE_DEPTH_NORMAL`
439
+ - `MC_WORKLOAD_QUEUE_DEPTH_THROTTLE`
440
+ - `MC_WORKLOAD_QUEUE_DEPTH_SHED`
441
+ - `MC_WORKLOAD_BUSY_RATIO_THROTTLE`
442
+ - `MC_WORKLOAD_BUSY_RATIO_SHED`
443
+ - `MC_WORKLOAD_ERROR_RATE_THROTTLE`
444
+ - `MC_WORKLOAD_ERROR_RATE_SHED`
445
+ - `MC_WORKLOAD_RECENT_WINDOW_SECONDS`
446
+
447
  ## Roadmap
448
 
449
  See [open issues](https://github.com/builderz-labs/mission-control/issues) for planned work and the [v1.0.0 release notes](https://github.com/builderz-labs/mission-control/releases/tag/v1.0.0) for what shipped.
openapi.json CHANGED
@@ -4626,6 +4626,93 @@
4626
  }
4627
  }
4628
  },
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4629
  "/api/events": {
4630
  "get": {
4631
  "tags": [
 
4626
  }
4627
  }
4628
  },
4629
+ "/api/workload": {
4630
+ "get": {
4631
+ "tags": [
4632
+ "Monitoring"
4633
+ ],
4634
+ "summary": "Get real-time workload recommendation",
4635
+ "description": "Returns system workload metrics and an actionable recommendation: `normal`, `throttle`, `shed`, or `pause`. Thresholds are runtime-configurable via `MC_WORKLOAD_*` environment variables.",
4636
+ "operationId": "getWorkloadSignals",
4637
+ "responses": {
4638
+ "200": {
4639
+ "description": "Workload snapshot and recommendation",
4640
+ "content": {
4641
+ "application/json": {
4642
+ "schema": {
4643
+ "type": "object",
4644
+ "properties": {
4645
+ "timestamp": { "type": "integer" },
4646
+ "workspace_id": { "type": "integer" },
4647
+ "capacity": {
4648
+ "type": "object",
4649
+ "properties": {
4650
+ "active_tasks": { "type": "integer" },
4651
+ "tasks_last_5m": { "type": "integer" },
4652
+ "errors_last_5m": { "type": "integer" },
4653
+ "error_rate_5m": { "type": "number", "minimum": 0, "maximum": 1 },
4654
+ "completions_last_hour": { "type": "integer" },
4655
+ "avg_completion_rate_per_hour": { "type": "number" }
4656
+ }
4657
+ },
4658
+ "queue": {
4659
+ "type": "object",
4660
+ "properties": {
4661
+ "total_pending": { "type": "integer" },
4662
+ "by_status": { "type": "object", "additionalProperties": { "type": "integer" } },
4663
+ "by_priority": { "type": "object", "additionalProperties": { "type": "integer" } },
4664
+ "oldest_pending_age_seconds": { "type": ["integer", "null"] },
4665
+ "estimated_wait_seconds": { "type": ["integer", "null"] },
4666
+ "estimated_wait_confidence": { "type": "string", "enum": ["calculated", "unknown"] }
4667
+ }
4668
+ },
4669
+ "agents": {
4670
+ "type": "object",
4671
+ "properties": {
4672
+ "total": { "type": "integer" },
4673
+ "online": { "type": "integer" },
4674
+ "busy": { "type": "integer" },
4675
+ "idle": { "type": "integer" },
4676
+ "offline": { "type": "integer" },
4677
+ "busy_ratio": { "type": "number", "minimum": 0, "maximum": 1 },
4678
+ "load_distribution": {
4679
+ "type": "array",
4680
+ "items": {
4681
+ "type": "object",
4682
+ "properties": {
4683
+ "agent": { "type": "string" },
4684
+ "assigned": { "type": "integer" },
4685
+ "in_progress": { "type": "integer" }
4686
+ }
4687
+ }
4688
+ }
4689
+ }
4690
+ },
4691
+ "recommendation": {
4692
+ "type": "object",
4693
+ "properties": {
4694
+ "action": { "type": "string", "enum": ["normal", "throttle", "shed", "pause"] },
4695
+ "reason": { "type": "string" },
4696
+ "details": { "type": "array", "items": { "type": "string" } },
4697
+ "submit_ok": { "type": "boolean" },
4698
+ "suggested_delay_ms": { "type": "integer" }
4699
+ }
4700
+ },
4701
+ "thresholds": {
4702
+ "type": "object",
4703
+ "description": "Effective runtime thresholds after environment overrides."
4704
+ }
4705
+ }
4706
+ }
4707
+ }
4708
+ }
4709
+ },
4710
+ "401": {
4711
+ "$ref": "#/components/responses/Unauthorized"
4712
+ }
4713
+ }
4714
+ }
4715
+ },
4716
  "/api/events": {
4717
  "get": {
4718
  "tags": [
playwright.config.ts CHANGED
@@ -18,9 +18,22 @@ export default defineConfig({
18
  { name: 'chromium', use: { ...devices['Desktop Chrome'] } }
19
  ],
20
  webServer: {
21
- command: 'pnpm start',
22
  url: 'http://127.0.0.1:3005',
23
  reuseExistingServer: true,
24
- timeout: 30_000,
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  }
26
  })
 
18
  { name: 'chromium', use: { ...devices['Desktop Chrome'] } }
19
  ],
20
  webServer: {
21
+ command: 'node .next/standalone/server.js',
22
  url: 'http://127.0.0.1:3005',
23
  reuseExistingServer: true,
24
+ timeout: 120_000,
25
+ env: {
26
+ ...process.env,
27
+ HOSTNAME: process.env.HOSTNAME || '127.0.0.1',
28
+ PORT: process.env.PORT || '3005',
29
+ MC_DISABLE_RATE_LIMIT: process.env.MC_DISABLE_RATE_LIMIT || '1',
30
+ MC_WORKLOAD_QUEUE_DEPTH_THROTTLE: process.env.MC_WORKLOAD_QUEUE_DEPTH_THROTTLE || '1000',
31
+ MC_WORKLOAD_QUEUE_DEPTH_SHED: process.env.MC_WORKLOAD_QUEUE_DEPTH_SHED || '2000',
32
+ MC_WORKLOAD_ERROR_RATE_THROTTLE: process.env.MC_WORKLOAD_ERROR_RATE_THROTTLE || '1',
33
+ MC_WORKLOAD_ERROR_RATE_SHED: process.env.MC_WORKLOAD_ERROR_RATE_SHED || '1',
34
+ API_KEY: process.env.API_KEY || 'test-api-key-e2e-12345',
35
+ AUTH_USER: process.env.AUTH_USER || 'testadmin',
36
+ AUTH_PASS: process.env.AUTH_PASS || 'testpass1234!',
37
+ },
38
  }
39
  })
src/app/api/workload/route.ts CHANGED
@@ -56,16 +56,27 @@ export async function GET(request: NextRequest) {
56
  }
57
 
58
  // Configurable thresholds for recommendation engine
59
- const THRESHOLDS = {
60
- queue_depth_normal: 20,
61
- queue_depth_throttle: 50,
62
- queue_depth_shed: 100,
63
- busy_agent_ratio_throttle: 0.8,
64
- busy_agent_ratio_shed: 0.95,
65
- error_rate_throttle: 0.1,
66
- error_rate_shed: 0.25,
67
- recent_window_seconds: 300, // 5 minutes for recent activity
68
- };
 
 
 
 
 
 
 
 
 
 
 
69
 
70
  interface CapacityMetrics {
71
  active_tasks: number;
@@ -82,6 +93,7 @@ interface QueueMetrics {
82
  by_priority: Record<string, number>;
83
  oldest_pending_age_seconds: number | null;
84
  estimated_wait_seconds: number | null;
 
85
  }
86
 
87
  interface AgentMetrics {
@@ -124,11 +136,13 @@ function buildCapacityMetrics(db: any, workspaceId: number, now: number): Capaci
124
  `SELECT COUNT(*) as c FROM tasks WHERE workspace_id = ? AND status = 'done' AND updated_at >= ?`
125
  ).get(workspaceId, dayAgo) as any).c;
126
 
 
 
127
  return {
128
  active_tasks: activeTasks,
129
  tasks_last_5m: tasksLast5m,
130
  errors_last_5m: errorsLast5m,
131
- error_rate_5m: totalLast5m > 0 ? Math.round((errorsLast5m / totalLast5m) * 10000) / 10000 : 0,
132
  completions_last_hour: completionsLastHour,
133
  avg_completion_rate_per_hour: Math.round((completionsLastDay / 24) * 100) / 100,
134
  };
@@ -165,12 +179,23 @@ function buildQueueMetrics(db: any, workspaceId: number): QueueMetrics {
165
  ? Math.round((totalPending / completionsLastHour) * 3600)
166
  : null;
167
 
 
 
 
 
 
 
 
 
 
 
168
  return {
169
  total_pending: totalPending,
170
- by_status: Object.fromEntries(byStatus.map(r => [r.status, r.count])),
171
- by_priority: Object.fromEntries(byPriority.map(r => [r.priority, r.count])),
172
  oldest_pending_age_seconds: oldestAge,
173
  estimated_wait_seconds: estimatedWait,
 
174
  };
175
  }
176
 
@@ -260,9 +285,9 @@ function computeRecommendation(
260
  }
261
 
262
  // No online agents = pause
263
- if (agents.online === 0 && agents.total > 0) {
264
  level = 'pause';
265
- reasons.push('No agents online');
266
  }
267
 
268
  const delayMap: Record<RecommendationLevel, number> = {
 
56
  }
57
 
58
  // Configurable thresholds for recommendation engine
59
+ function numEnv(name: string, fallback: number): number {
60
+ const raw = process.env[name];
61
+ if (!raw || raw.trim().length === 0) return fallback;
62
+ const parsed = Number(raw);
63
+ return Number.isFinite(parsed) ? parsed : fallback;
64
+ }
65
+
66
+ function buildThresholds() {
67
+ return {
68
+ queue_depth_normal: numEnv('MC_WORKLOAD_QUEUE_DEPTH_NORMAL', 20),
69
+ queue_depth_throttle: numEnv('MC_WORKLOAD_QUEUE_DEPTH_THROTTLE', 50),
70
+ queue_depth_shed: numEnv('MC_WORKLOAD_QUEUE_DEPTH_SHED', 100),
71
+ busy_agent_ratio_throttle: numEnv('MC_WORKLOAD_BUSY_RATIO_THROTTLE', 0.8),
72
+ busy_agent_ratio_shed: numEnv('MC_WORKLOAD_BUSY_RATIO_SHED', 0.95),
73
+ error_rate_throttle: numEnv('MC_WORKLOAD_ERROR_RATE_THROTTLE', 0.1),
74
+ error_rate_shed: numEnv('MC_WORKLOAD_ERROR_RATE_SHED', 0.25),
75
+ recent_window_seconds: Math.max(1, Math.floor(numEnv('MC_WORKLOAD_RECENT_WINDOW_SECONDS', 300))),
76
+ };
77
+ }
78
+
79
+ const THRESHOLDS = buildThresholds();
80
 
81
  interface CapacityMetrics {
82
  active_tasks: number;
 
93
  by_priority: Record<string, number>;
94
  oldest_pending_age_seconds: number | null;
95
  estimated_wait_seconds: number | null;
96
+ estimated_wait_confidence: 'calculated' | 'unknown';
97
  }
98
 
99
  interface AgentMetrics {
 
136
  `SELECT COUNT(*) as c FROM tasks WHERE workspace_id = ? AND status = 'done' AND updated_at >= ?`
137
  ).get(workspaceId, dayAgo) as any).c;
138
 
139
+ const safeErrorRate = totalLast5m > 0 ? errorsLast5m / totalLast5m : 0;
140
+
141
  return {
142
  active_tasks: activeTasks,
143
  tasks_last_5m: tasksLast5m,
144
  errors_last_5m: errorsLast5m,
145
+ error_rate_5m: Math.max(0, Math.min(1, Math.round(safeErrorRate * 10000) / 10000)),
146
  completions_last_hour: completionsLastHour,
147
  avg_completion_rate_per_hour: Math.round((completionsLastDay / 24) * 100) / 100,
148
  };
 
179
  ? Math.round((totalPending / completionsLastHour) * 3600)
180
  : null;
181
 
182
+ const statusMap = Object.fromEntries(byStatus.map(r => [r.status, r.count]));
183
+ for (const status of pendingStatuses) {
184
+ if (typeof statusMap[status] !== 'number') statusMap[status] = 0;
185
+ }
186
+
187
+ const priorityMap = Object.fromEntries(byPriority.map(r => [r.priority, r.count]));
188
+ for (const priority of ['low', 'medium', 'high', 'critical', 'urgent']) {
189
+ if (typeof priorityMap[priority] !== 'number') priorityMap[priority] = 0;
190
+ }
191
+
192
  return {
193
  total_pending: totalPending,
194
+ by_status: statusMap,
195
+ by_priority: priorityMap,
196
  oldest_pending_age_seconds: oldestAge,
197
  estimated_wait_seconds: estimatedWait,
198
+ estimated_wait_confidence: estimatedWait === null ? 'unknown' : 'calculated',
199
  };
200
  }
201
 
 
285
  }
286
 
287
  // No online agents = pause
288
+ if (agents.online === 0) {
289
  level = 'pause';
290
+ reasons.push(agents.total > 0 ? 'No agents online' : 'No agents registered');
291
  }
292
 
293
  const delayMap: Record<RecommendationLevel, number> = {
tests/workload-signals.spec.ts ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import { test, expect } from '@playwright/test'
2
+ import { API_KEY_HEADER, createTestAgent, deleteTestAgent, createTestTask, deleteTestTask } from './helpers'
3
+
4
+ test.describe('Workload Signals API', () => {
5
+ const agentCleanup: number[] = []
6
+ const taskCleanup: number[] = []
7
+
8
+ test.afterEach(async ({ request }) => {
9
+ for (const id of taskCleanup) {
10
+ await deleteTestTask(request, id).catch(() => {})
11
+ }
12
+ taskCleanup.length = 0
13
+
14
+ for (const id of agentCleanup) {
15
+ await deleteTestAgent(request, id).catch(() => {})
16
+ }
17
+ agentCleanup.length = 0
18
+ })
19
+
20
+ test('returns normal recommendation under light load', async ({ request }) => {
21
+ const { id } = await createTestAgent(request, { status: 'idle' })
22
+ agentCleanup.push(id)
23
+
24
+ const res = await request.get('/api/workload', { headers: API_KEY_HEADER })
25
+ expect(res.status()).toBe(200)
26
+ const body = await res.json()
27
+
28
+ expect(body.recommendation.action).toBe('normal')
29
+ expect(body.recommendation.submit_ok).toBe(true)
30
+ })
31
+
32
+ test('returns throttle recommendation at high busy ratio', async ({ request }) => {
33
+ const idleAgent = await createTestAgent(request, { status: 'idle' })
34
+ agentCleanup.push(idleAgent.id)
35
+ for (let i = 0; i < 4; i++) {
36
+ const busyAgent = await createTestAgent(request, { status: 'busy' })
37
+ agentCleanup.push(busyAgent.id)
38
+ }
39
+
40
+ const res = await request.get('/api/workload', { headers: API_KEY_HEADER })
41
+ expect(res.status()).toBe(200)
42
+ const body = await res.json()
43
+
44
+ expect(body.recommendation.action).toBe('throttle')
45
+ expect(body.recommendation.submit_ok).toBe(true)
46
+ })
47
+
48
+ test('returns shed recommendation at critical busy ratio', async ({ request }) => {
49
+ const idleAgent = await createTestAgent(request, { status: 'idle' })
50
+ agentCleanup.push(idleAgent.id)
51
+ for (let i = 0; i < 19; i++) {
52
+ const busyAgent = await createTestAgent(request, { status: 'busy' })
53
+ agentCleanup.push(busyAgent.id)
54
+ }
55
+
56
+ const res = await request.get('/api/workload', { headers: API_KEY_HEADER })
57
+ expect(res.status()).toBe(200)
58
+ const body = await res.json()
59
+
60
+ expect(body.recommendation.action).toBe('shed')
61
+ expect(body.recommendation.submit_ok).toBe(false)
62
+ })
63
+
64
+ test('returns pause recommendation when no agents are online', async ({ request }) => {
65
+ const res = await request.get('/api/workload', { headers: API_KEY_HEADER })
66
+ expect(res.status()).toBe(200)
67
+ const body = await res.json()
68
+
69
+ expect(body.agents.online).toBe(0)
70
+ expect(body.recommendation.action).toBe('pause')
71
+ expect(body.recommendation.submit_ok).toBe(false)
72
+ })
73
+
74
+ test('returns consistent response for low-signal conditions', async ({ request }) => {
75
+ const { id } = await createTestAgent(request, { status: 'idle' })
76
+ agentCleanup.push(id)
77
+
78
+ const task = await createTestTask(request, { status: 'inbox' })
79
+ taskCleanup.push(task.id)
80
+
81
+ const res = await request.get('/api/workload', { headers: API_KEY_HEADER })
82
+ expect(res.status()).toBe(200)
83
+ const body = await res.json()
84
+
85
+ expect(body.capacity.error_rate_5m).toBeGreaterThanOrEqual(0)
86
+ expect(body.capacity.error_rate_5m).toBeLessThanOrEqual(1)
87
+ expect(body.queue.by_status).toHaveProperty('inbox')
88
+ expect(body.queue.by_status).toHaveProperty('assigned')
89
+ expect(body.queue.by_status).toHaveProperty('in_progress')
90
+ expect(body.queue.by_priority).toHaveProperty('critical')
91
+ expect(body.queue.by_priority).toHaveProperty('high')
92
+ expect(body.queue.by_priority).toHaveProperty('medium')
93
+ expect(body.queue.by_priority).toHaveProperty('low')
94
+ expect(['calculated', 'unknown']).toContain(body.queue.estimated_wait_confidence)
95
+ })
96
+ })