HuggingClaw-MissionControl

Runtime error

App Files Files Community

Nyk committed on Mar 5

Commit

58cb911

1 Parent(s): 687f7e3

fix(workload): harden signal recommendations and add route e2e coverage

Browse files

Files changed (5) hide show

README.md +26 -0
openapi.json +87 -0
playwright.config.ts +15 -2
src/app/api/workload/route.ts +40 -15
tests/workload-signals.spec.ts +96 -0

README.md CHANGED Viewed

@@ -418,6 +418,32 @@ pnpm test:e2e         # Playwright E2E
 pnpm quality:gate     # All checks
 ```
 ## Roadmap
 See [open issues](https://github.com/builderz-labs/mission-control/issues) for planned work and the [v1.0.0 release notes](https://github.com/builderz-labs/mission-control/releases/tag/v1.0.0) for what shipped.

 pnpm quality:gate     # All checks
 ```
+## Workload Signals Contract
+`GET /api/workload` returns a workload snapshot and one recommendation:
+- `normal`: system healthy, submit freely
+- `throttle`: reduce submission rate / defer non-critical work
+- `shed`: submit only critical work
+- `pause`: hold submissions until capacity returns
+Low-signal behavior:
+- `capacity.error_rate_5m` is clamped to `[0,1]`
+- `queue.estimated_wait_confidence` is `calculated` or `unknown`
+- `queue.by_status` and `queue.by_priority` always include their known keys, reporting `0` for buckets with no tasks
+Runtime-tunable thresholds:
+- `MC_WORKLOAD_QUEUE_DEPTH_NORMAL`
+- `MC_WORKLOAD_QUEUE_DEPTH_THROTTLE`
+- `MC_WORKLOAD_QUEUE_DEPTH_SHED`
+- `MC_WORKLOAD_BUSY_RATIO_THROTTLE`
+- `MC_WORKLOAD_BUSY_RATIO_SHED`
+- `MC_WORKLOAD_ERROR_RATE_THROTTLE`
+- `MC_WORKLOAD_ERROR_RATE_SHED`
+- `MC_WORKLOAD_RECENT_WINDOW_SECONDS`
 ## Roadmap
 See [open issues](https://github.com/builderz-labs/mission-control/issues) for planned work and the [v1.0.0 release notes](https://github.com/builderz-labs/mission-control/releases/tag/v1.0.0) for what shipped.

openapi.json CHANGED Viewed

@@ -4626,6 +4626,93 @@
         }
       }
     },
     "/api/events": {
       "get": {
         "tags": [

         }
       }
     },
+    "/api/workload": {
+      "get": {
+        "tags": [
+          "Monitoring"
+        ],
+        "summary": "Get real-time workload recommendation",
+        "description": "Returns system workload metrics and an actionable recommendation: `normal`, `throttle`, `shed`, or `pause`. Thresholds are runtime-configurable via `MC_WORKLOAD_*` environment variables.",
+        "operationId": "getWorkloadSignals",
+        "responses": {
+          "200": {
+            "description": "Workload snapshot and recommendation",
+            "content": {
+              "application/json": {
+                "schema": {
+                  "type": "object",
+                  "properties": {
+                    "timestamp": { "type": "integer" },
+                    "workspace_id": { "type": "integer" },
+                    "capacity": {
+                      "type": "object",
+                      "properties": {
+                        "active_tasks": { "type": "integer" },
+                        "tasks_last_5m": { "type": "integer" },
+                        "errors_last_5m": { "type": "integer" },
+                        "error_rate_5m": { "type": "number", "minimum": 0, "maximum": 1 },
+                        "completions_last_hour": { "type": "integer" },
+                        "avg_completion_rate_per_hour": { "type": "number" }
+                      }
+                    },
+                    "queue": {
+                      "type": "object",
+                      "properties": {
+                        "total_pending": { "type": "integer" },
+                        "by_status": { "type": "object", "additionalProperties": { "type": "integer" } },
+                        "by_priority": { "type": "object", "additionalProperties": { "type": "integer" } },
+                        "oldest_pending_age_seconds": { "type": ["integer", "null"] },
+                        "estimated_wait_seconds": { "type": ["integer", "null"] },
+                        "estimated_wait_confidence": { "type": "string", "enum": ["calculated", "unknown"] }
+                      }
+                    },
+                    "agents": {
+                      "type": "object",
+                      "properties": {
+                        "total": { "type": "integer" },
+                        "online": { "type": "integer" },
+                        "busy": { "type": "integer" },
+                        "idle": { "type": "integer" },
+                        "offline": { "type": "integer" },
+                        "busy_ratio": { "type": "number", "minimum": 0, "maximum": 1 },
+                        "load_distribution": {
+                          "type": "array",
+                          "items": {
+                            "type": "object",
+                            "properties": {
+                              "agent": { "type": "string" },
+                              "assigned": { "type": "integer" },
+                              "in_progress": { "type": "integer" }
+                            }
+                          }
+                        }
+                      }
+                    },
+                    "recommendation": {
+                      "type": "object",
+                      "properties": {
+                        "action": { "type": "string", "enum": ["normal", "throttle", "shed", "pause"] },
+                        "reason": { "type": "string" },
+                        "details": { "type": "array", "items": { "type": "string" } },
+                        "submit_ok": { "type": "boolean" },
+                        "suggested_delay_ms": { "type": "integer" }
+                      }
+                    },
+                    "thresholds": {
+                      "type": "object",
+                      "description": "Effective runtime thresholds after environment overrides."
+                    }
+                  }
+                }
+              }
+            }
+          },
+          "401": {
+            "$ref": "#/components/responses/Unauthorized"
+          }
+        }
+      }
+    },
     "/api/events": {
       "get": {
         "tags": [

playwright.config.ts CHANGED Viewed

@@ -18,9 +18,22 @@ export default defineConfig({
     { name: 'chromium', use: { ...devices['Desktop Chrome'] } }
   ],
   webServer: {
-    command: 'pnpm start',
     url: 'http://127.0.0.1:3005',
     reuseExistingServer: true,
-    timeout: 30_000,
   }
 })

     { name: 'chromium', use: { ...devices['Desktop Chrome'] } }
   ],
   webServer: {
+    command: 'node .next/standalone/server.js',
     url: 'http://127.0.0.1:3005',
     reuseExistingServer: true,
+    timeout: 120_000,
+    env: {
+      ...process.env,
+      HOSTNAME: process.env.HOSTNAME || '127.0.0.1',
+      PORT: process.env.PORT || '3005',
+      MC_DISABLE_RATE_LIMIT: process.env.MC_DISABLE_RATE_LIMIT || '1',
+      MC_WORKLOAD_QUEUE_DEPTH_THROTTLE: process.env.MC_WORKLOAD_QUEUE_DEPTH_THROTTLE || '1000',
+      MC_WORKLOAD_QUEUE_DEPTH_SHED: process.env.MC_WORKLOAD_QUEUE_DEPTH_SHED || '2000',
+      MC_WORKLOAD_ERROR_RATE_THROTTLE: process.env.MC_WORKLOAD_ERROR_RATE_THROTTLE || '1',
+      MC_WORKLOAD_ERROR_RATE_SHED: process.env.MC_WORKLOAD_ERROR_RATE_SHED || '1',
+      API_KEY: process.env.API_KEY || 'test-api-key-e2e-12345',
+      AUTH_USER: process.env.AUTH_USER || 'testadmin',
+      AUTH_PASS: process.env.AUTH_PASS || 'testpass1234!',
+    },
   }
 })

src/app/api/workload/route.ts CHANGED Viewed

@@ -56,16 +56,27 @@ export async function GET(request: NextRequest) {
 }
 // Configurable thresholds for recommendation engine
-const THRESHOLDS = {
-  queue_depth_normal: 20,
-  queue_depth_throttle: 50,
-  queue_depth_shed: 100,
-  busy_agent_ratio_throttle: 0.8,
-  busy_agent_ratio_shed: 0.95,
-  error_rate_throttle: 0.1,
-  error_rate_shed: 0.25,
-  recent_window_seconds: 300, // 5 minutes for recent activity
-};
 interface CapacityMetrics {
   active_tasks: number;
@@ -82,6 +93,7 @@ interface QueueMetrics {
   by_priority: Record<string, number>;
   oldest_pending_age_seconds: number | null;
   estimated_wait_seconds: number | null;
 }
 interface AgentMetrics {
@@ -124,11 +136,13 @@ function buildCapacityMetrics(db: any, workspaceId: number, now: number): Capaci
     `SELECT COUNT(*) as c FROM tasks WHERE workspace_id = ? AND status = 'done' AND updated_at >= ?`
   ).get(workspaceId, dayAgo) as any).c;
   return {
     active_tasks: activeTasks,
     tasks_last_5m: tasksLast5m,
     errors_last_5m: errorsLast5m,
-    error_rate_5m: totalLast5m > 0 ? Math.round((errorsLast5m / totalLast5m) * 10000) / 10000 : 0,
     completions_last_hour: completionsLastHour,
     avg_completion_rate_per_hour: Math.round((completionsLastDay / 24) * 100) / 100,
   };
@@ -165,12 +179,23 @@ function buildQueueMetrics(db: any, workspaceId: number): QueueMetrics {
     ? Math.round((totalPending / completionsLastHour) * 3600)
     : null;
   return {
     total_pending: totalPending,
-    by_status: Object.fromEntries(byStatus.map(r => [r.status, r.count])),
-    by_priority: Object.fromEntries(byPriority.map(r => [r.priority, r.count])),
     oldest_pending_age_seconds: oldestAge,
     estimated_wait_seconds: estimatedWait,
   };
 }
@@ -260,9 +285,9 @@ function computeRecommendation(
   }
   // No online agents = pause
-  if (agents.online === 0 && agents.total > 0) {
     level = 'pause';
-    reasons.push('No agents online');
   }
   const delayMap: Record<RecommendationLevel, number> = {

 }
 // Configurable thresholds for recommendation engine
+function numEnv(name: string, fallback: number): number { // Read env var `name` as a finite number; return `fallback` when it is missing or unusable.
+  const raw = process.env[name];
+  if (!raw || raw.trim().length === 0) return fallback; // unset, empty, or whitespace-only value
+  const parsed = Number(raw);
+  return Number.isFinite(parsed) ? parsed : fallback; // NaN (non-numeric text) and +/-Infinity fall back; no range check is applied here
+}
+function buildThresholds() { // Assemble the recommendation thresholds; each value comes from its MC_WORKLOAD_* env var via numEnv, else the literal default.
+  return {
+    queue_depth_normal: numEnv('MC_WORKLOAD_QUEUE_DEPTH_NORMAL', 20),
+    queue_depth_throttle: numEnv('MC_WORKLOAD_QUEUE_DEPTH_THROTTLE', 50),
+    queue_depth_shed: numEnv('MC_WORKLOAD_QUEUE_DEPTH_SHED', 100),
+    busy_agent_ratio_throttle: numEnv('MC_WORKLOAD_BUSY_RATIO_THROTTLE', 0.8),
+    busy_agent_ratio_shed: numEnv('MC_WORKLOAD_BUSY_RATIO_SHED', 0.95),
+    error_rate_throttle: numEnv('MC_WORKLOAD_ERROR_RATE_THROTTLE', 0.1),
+    error_rate_shed: numEnv('MC_WORKLOAD_ERROR_RATE_SHED', 0.25),
+    recent_window_seconds: Math.max(1, Math.floor(numEnv('MC_WORKLOAD_RECENT_WINDOW_SECONDS', 300))), // floored to a whole number and never below 1
+  };
+}
+const THRESHOLDS = buildThresholds(); // built once at module evaluation; NOTE(review): env changes after load are presumably not reflected — confirm
 interface CapacityMetrics {
   active_tasks: number;
   by_priority: Record<string, number>;
   oldest_pending_age_seconds: number | null;
   estimated_wait_seconds: number | null;
+  estimated_wait_confidence: 'calculated' | 'unknown';
 }
 interface AgentMetrics {
     `SELECT COUNT(*) as c FROM tasks WHERE workspace_id = ? AND status = 'done' AND updated_at >= ?`
   ).get(workspaceId, dayAgo) as any).c;
+  const safeErrorRate = totalLast5m > 0 ? errorsLast5m / totalLast5m : 0;
   return {
     active_tasks: activeTasks,
     tasks_last_5m: tasksLast5m,
     errors_last_5m: errorsLast5m,
+    error_rate_5m: Math.max(0, Math.min(1, Math.round(safeErrorRate * 10000) / 10000)),
     completions_last_hour: completionsLastHour,
     avg_completion_rate_per_hour: Math.round((completionsLastDay / 24) * 100) / 100,
   };
     ? Math.round((totalPending / completionsLastHour) * 3600)
     : null;
+  const statusMap = Object.fromEntries(byStatus.map(r => [r.status, r.count]));
+  for (const status of pendingStatuses) {
+    if (typeof statusMap[status] !== 'number') statusMap[status] = 0;
+  }
+  const priorityMap = Object.fromEntries(byPriority.map(r => [r.priority, r.count]));
+  for (const priority of ['low', 'medium', 'high', 'critical', 'urgent']) {
+    if (typeof priorityMap[priority] !== 'number') priorityMap[priority] = 0;
+  }
   return {
     total_pending: totalPending,
+    by_status: statusMap,
+    by_priority: priorityMap,
     oldest_pending_age_seconds: oldestAge,
     estimated_wait_seconds: estimatedWait,
+    estimated_wait_confidence: estimatedWait === null ? 'unknown' : 'calculated',
   };
 }
   }
   // No online agents = pause
+  if (agents.online === 0) {
     level = 'pause';
+    reasons.push(agents.total > 0 ? 'No agents online' : 'No agents registered');
   }
   const delayMap: Record<RecommendationLevel, number> = {

tests/workload-signals.spec.ts ADDED Viewed

	@@ -0,0 +1,96 @@

+import { test, expect } from '@playwright/test'
+import { API_KEY_HEADER, createTestAgent, deleteTestAgent, createTestTask, deleteTestTask } from './helpers'
+test.describe('Workload Signals API', () => { // E2E checks of GET /api/workload: recommendation action per load level, plus response-shape guarantees.
+  const agentCleanup: number[] = [] // ids of agents created by the current test; deleted and cleared in afterEach
+  const taskCleanup: number[] = [] // ids of tasks created by the current test; deleted and cleared in afterEach
+  test.afterEach(async ({ request }) => {
+    for (const id of taskCleanup) {
+      await deleteTestTask(request, id).catch(() => {}) // best-effort cleanup: a failed delete is ignored
+    }
+    taskCleanup.length = 0
+    for (const id of agentCleanup) {
+      await deleteTestAgent(request, id).catch(() => {}) // best-effort cleanup: a failed delete is ignored
+    }
+    agentCleanup.length = 0
+  })
+  test('returns normal recommendation under light load', async ({ request }) => {
+    const { id } = await createTestAgent(request, { status: 'idle' }) // a single idle agent, no tasks created
+    agentCleanup.push(id)
+    const res = await request.get('/api/workload', { headers: API_KEY_HEADER })
+    expect(res.status()).toBe(200)
+    const body = await res.json()
+    expect(body.recommendation.action).toBe('normal')
+    expect(body.recommendation.submit_ok).toBe(true)
+  })
+  test('returns throttle recommendation at high busy ratio', async ({ request }) => {
+    const idleAgent = await createTestAgent(request, { status: 'idle' })
+    agentCleanup.push(idleAgent.id)
+    for (let i = 0; i < 4; i++) { // 4 busy + 1 idle; presumably a busy ratio of 0.8 if these are the only agents — TODO confirm
+      const busyAgent = await createTestAgent(request, { status: 'busy' })
+      agentCleanup.push(busyAgent.id)
+    }
+    const res = await request.get('/api/workload', { headers: API_KEY_HEADER })
+    expect(res.status()).toBe(200)
+    const body = await res.json()
+    expect(body.recommendation.action).toBe('throttle')
+    expect(body.recommendation.submit_ok).toBe(true)
+  })
+  test('returns shed recommendation at critical busy ratio', async ({ request }) => {
+    const idleAgent = await createTestAgent(request, { status: 'idle' })
+    agentCleanup.push(idleAgent.id)
+    for (let i = 0; i < 19; i++) { // 19 busy + 1 idle; presumably a busy ratio of 0.95 if these are the only agents — TODO confirm
+      const busyAgent = await createTestAgent(request, { status: 'busy' })
+      agentCleanup.push(busyAgent.id)
+    }
+    const res = await request.get('/api/workload', { headers: API_KEY_HEADER })
+    expect(res.status()).toBe(200)
+    const body = await res.json()
+    expect(body.recommendation.action).toBe('shed')
+    expect(body.recommendation.submit_ok).toBe(false)
+  })
+  test('returns pause recommendation when no agents are online', async ({ request }) => { // NOTE(review): creates no agents, so it relies on the workspace having none online — confirm isolation from other specs
+    const res = await request.get('/api/workload', { headers: API_KEY_HEADER })
+    expect(res.status()).toBe(200)
+    const body = await res.json()
+    expect(body.agents.online).toBe(0)
+    expect(body.recommendation.action).toBe('pause')
+    expect(body.recommendation.submit_ok).toBe(false)
+  })
+  test('returns consistent response for low-signal conditions', async ({ request }) => {
+    const { id } = await createTestAgent(request, { status: 'idle' })
+    agentCleanup.push(id)
+    const task = await createTestTask(request, { status: 'inbox' }) // one pending task so the queue section has data
+    taskCleanup.push(task.id)
+    const res = await request.get('/api/workload', { headers: API_KEY_HEADER })
+    expect(res.status()).toBe(200)
+    const body = await res.json()
+    expect(body.capacity.error_rate_5m).toBeGreaterThanOrEqual(0) // error rate must stay within [0, 1]
+    expect(body.capacity.error_rate_5m).toBeLessThanOrEqual(1)
+    expect(body.queue.by_status).toHaveProperty('inbox') // status keys must be present even when their count is zero
+    expect(body.queue.by_status).toHaveProperty('assigned')
+    expect(body.queue.by_status).toHaveProperty('in_progress')
+    expect(body.queue.by_priority).toHaveProperty('critical') // priority keys must be present even when their count is zero
+    expect(body.queue.by_priority).toHaveProperty('high')
+    expect(body.queue.by_priority).toHaveProperty('medium')
+    expect(body.queue.by_priority).toHaveProperty('low')
+    expect(['calculated', 'unknown']).toContain(body.queue.estimated_wait_confidence)
+  })
+})