HuggingClaw-MissionControl / src /lib /agent-evals.ts
nyk
feat(refactor): ready for manual QA after main sync (#274)
b6ecafa unverified
/**
* Agent Evals — four-layer evaluation engine for agent performance.
*
* Layer 1 (Output): Task completion and correctness scoring
* Layer 2 (Trace): Convergence analysis and reasoning coherence
* Layer 3 (Component): Tool reliability from MCP call logs
* Layer 4 (Drift): Rolling baseline comparison with threshold detection
*/
import { getDatabase } from '@/lib/db'
/** Tag naming which of the four evaluation layers (see the module header) produced a result. */
export type EvalLayer = 'output' | 'trace' | 'component' | 'drift'
/**
 * Outcome of a single evaluation check.
 */
export interface EvalResult {
/** Evaluation layer that produced this result. */
layer: EvalLayer
/** Numeric score; the evaluators in this file round it to two decimals (typically in the 0–1 range). */
score: number
/** Whether the score (or, for trace evals, the looping flag) met the evaluator's pass criterion. */
passed: boolean
/** Human-readable summary of the figures behind the score. */
detail: string
}
/**
 * Comparison of one metric's current value against its baseline.
 */
export interface DriftResult {
/** Metric name; checkDrift leaves it as '' and the caller fills it in. */
metric: string
/** Value measured over the current window. */
current: number
/** Value measured over the baseline window. */
baseline: number
/** Relative change |current - baseline| / |baseline|, rounded to four decimals. */
delta: number
/** True when the relative change exceeds `threshold`. */
drifted: boolean
/** Relative-change threshold the comparison was made against. */
threshold: number
}
// ---------------------------------------------------------------------------
// Layer 1: Output Evals
// ---------------------------------------------------------------------------
/**
 * Layer 1 (Output): fraction of an agent's recent tasks that reached `done`.
 *
 * Counts the rows in `tasks` assigned to the agent in the given workspace
 * whose `created_at` is later than the look-back cutoff, and scores the agent
 * as completed / total.
 *
 * @param agentName - Matched against `tasks.assigned_to`.
 * @param hours - Look-back window in hours (default 168, i.e. 7 days).
 * @param workspaceId - Matched against `tasks.workspace_id` (default 1).
 * @returns An `output` result whose score is rounded to two decimals and which
 *   passes at a completion ratio of 0.7 or more. With no tasks in the window
 *   the score defaults to 1.0.
 */
export function evalTaskCompletion(
  agentName: string,
  hours: number = 168,
  workspaceId: number = 1,
): EvalResult {
  type CompletionRow = { total: number | null; completed: number | null }

  const db = getDatabase()
  // Cutoff in Unix seconds; it is compared directly against created_at.
  const since = Math.floor(Date.now() / 1000) - hours * 3600
  // Only the two aggregates that feed the score are selected. SUM() yields
  // NULL when no row matches, hence the `?? 0` fallbacks below.
  const row = db.prepare(`
SELECT
COUNT(*) as total,
SUM(CASE WHEN status = 'done' THEN 1 ELSE 0 END) as completed
FROM tasks
WHERE assigned_to = ? AND workspace_id = ? AND created_at > ?
`).get(agentName, workspaceId, since) as CompletionRow | undefined

  const total = row?.total ?? 0
  const completed = row?.completed ?? 0
  // An agent with no tasks in the window is treated as fully compliant.
  const score = total > 0 ? completed / total : 1.0

  return {
    layer: 'output',
    score: Math.round(score * 100) / 100,
    passed: score >= 0.7,
    detail: `${completed}/${total} tasks completed (${(score * 100).toFixed(0)}%)`,
  }
}
/**
 * Layer 1 (Output): correctness of an agent's finished tasks.
 *
 * Looks at `tasks` rows with status `done` for the agent and workspace inside
 * the look-back window. The base score is the share of those tasks whose
 * outcome is `success`. When at least one task carries a `feedback_rating`,
 * the score becomes 60% success rate plus 40% average rating, with the rating
 * mapped from an assumed 1–5 scale onto 0–1.
 *
 * @param agentName - Matched against `tasks.assigned_to`.
 * @param hours - Look-back window in hours (default 168, i.e. 7 days).
 * @param workspaceId - Matched against `tasks.workspace_id` (default 1).
 * @returns An `output` result whose score is rounded to two decimals and which
 *   passes at 0.6 or more. With no finished tasks the success rate defaults
 *   to 1.0.
 */
export function evalCorrectnessScore(
  agentName: string,
  hours: number = 168,
  workspaceId: number = 1,
): EvalResult {
  type CorrectnessRow = {
    total: number | null
    successful: number | null
    avg_rating: number | null
  }

  const db = getDatabase()
  const since = Math.floor(Date.now() / 1000) - hours * 3600
  // AVG() already skips NULL ratings and returns NULL when none are present.
  const row = db.prepare(`
SELECT
COUNT(*) as total,
SUM(CASE WHEN outcome = 'success' THEN 1 ELSE 0 END) as successful,
AVG(feedback_rating) as avg_rating
FROM tasks
WHERE assigned_to = ? AND workspace_id = ? AND status = 'done' AND created_at > ?
`).get(agentName, workspaceId, since) as CorrectnessRow | undefined

  const total = row?.total ?? 0
  const successful = row?.successful ?? 0
  const successRate = total > 0 ? successful / total : 1.0
  const avgRating = row?.avg_rating ?? null

  // Map the rating from the assumed 1–5 scale onto 0–1. The clamp keeps an
  // out-of-range average from pushing the blended score outside 0–1.
  const ratingScore = avgRating === null
    ? null
    : Math.min(1, Math.max(0, (avgRating - 1) / 4))

  const score = ratingScore === null
    ? successRate
    : successRate * 0.6 + ratingScore * 0.4

  const ratingNote = avgRating === null ? '' : `, avg rating ${avgRating.toFixed(1)}`

  return {
    layer: 'output',
    score: Math.round(score * 100) / 100,
    passed: score >= 0.6,
    detail: `Correctness: ${(score * 100).toFixed(0)}% (${successful}/${total} successful${ratingNote})`,
  }
}
/**
 * Runs both Layer 1 (Output) checks for an agent.
 *
 * @param agentName - Agent whose tasks are evaluated.
 * @param hours - Look-back window in hours, forwarded to both checks (default 168).
 * @param workspaceId - Workspace filter, forwarded to both checks (default 1).
 * @returns The task-completion result followed by the correctness result.
 */
export function runOutputEvals(
  agentName: string,
  hours: number = 168,
  workspaceId: number = 1,
): EvalResult[] {
  const completion = evalTaskCompletion(agentName, hours, workspaceId)
  const correctness = evalCorrectnessScore(agentName, hours, workspaceId)
  return [completion, correctness]
}
// ---------------------------------------------------------------------------
// Layer 2: Trace Evals
// ---------------------------------------------------------------------------
/**
 * Scores how tightly a run's tool usage converged.
 *
 * The measure is the number of calls per distinct tool. At or below
 * `loopThreshold` the score is 1.0; above it the score falls off as
 * `loopThreshold / ratio` and the run is flagged as looping.
 *
 * @param totalToolCalls - Total number of tool calls observed.
 * @param uniqueTools - Number of distinct tools among those calls.
 * @param loopThreshold - Calls-per-tool ratio above which the run counts as
 *   looping (default 3.0).
 * @returns The score rounded to two decimals, and the looping flag. When
 *   `uniqueTools` is 0 there is nothing to judge and the result is a perfect,
 *   non-looping score.
 */
export function convergenceScore(
  totalToolCalls: number,
  uniqueTools: number,
  loopThreshold: number = 3.0,
): { score: number; looping: boolean } {
  if (uniqueTools === 0) return { score: 1.0, looping: false }

  const callsPerTool = totalToolCalls / uniqueTools
  // With zero calls the division below yields Infinity, which Math.min caps at 1.0.
  const rawScore = Math.min(1.0, loopThreshold / callsPerTool)

  return {
    score: Math.round(rawScore * 100) / 100,
    looping: callsPerTool > loopThreshold,
  }
}
/**
 * Layer 2 (Trace): checks an agent's recent tool calls for looping.
 *
 * Reads the total number of calls and the number of distinct `tool_name`
 * values from `mcp_call_log` for the agent and workspace inside the look-back
 * window, then hands both figures to convergenceScore.
 *
 * @param agentName - Matched against `mcp_call_log.agent_name`.
 * @param hours - Look-back window in hours (default 24).
 * @param workspaceId - Matched against `mcp_call_log.workspace_id` (default 1).
 * @returns A `trace` result carrying the convergence score; it passes when no
 *   looping was detected.
 */
export function evalReasoningCoherence(
  agentName: string,
  hours: number = 24,
  workspaceId: number = 1,
): EvalResult {
  const db = getDatabase()
  const cutoff = Math.floor(Date.now() / 1000) - hours * 3600
  const stats = db.prepare(`
SELECT
COUNT(*) as total_calls,
COUNT(DISTINCT tool_name) as unique_tools
FROM mcp_call_log
WHERE agent_name = ? AND workspace_id = ? AND created_at > ?
`).get(agentName, workspaceId, cutoff) as any

  const callCount = stats?.total_calls ?? 0
  const toolCount = stats?.unique_tools ?? 0
  const convergence = convergenceScore(callCount, toolCount)

  // The ratio is only meaningful when at least one distinct tool was used.
  let ratioLabel = 'N/A'
  if (toolCount > 0) {
    ratioLabel = (callCount / toolCount).toFixed(1)
  }
  const loopNote = convergence.looping ? ' — LOOPING DETECTED' : ''

  return {
    layer: 'trace',
    score: convergence.score,
    passed: !convergence.looping,
    detail: `Convergence: ${callCount} calls across ${toolCount} unique tools (ratio ${ratioLabel})${loopNote}`,
  }
}
// ---------------------------------------------------------------------------
// Layer 3: Component Evals
// ---------------------------------------------------------------------------
/**
 * Layer 3 (Component): share of an agent's recent tool calls that succeeded.
 *
 * Counts `mcp_call_log` rows for the agent and workspace inside the look-back
 * window, and how many of them have `success = 1`.
 *
 * @param agentName - Matched against `mcp_call_log.agent_name`.
 * @param hours - Look-back window in hours (default 24).
 * @param workspaceId - Matched against `mcp_call_log.workspace_id` (default 1).
 * @returns A `component` result whose score is rounded to two decimals and
 *   which passes at a success ratio of 0.8 or more. With no calls in the
 *   window the score defaults to 1.0.
 */
export function evalToolReliability(
  agentName: string,
  hours: number = 24,
  workspaceId: number = 1,
): EvalResult {
  const db = getDatabase()
  const cutoff = Math.floor(Date.now() / 1000) - hours * 3600
  const stats = db.prepare(`
SELECT
COUNT(*) as total,
SUM(CASE WHEN success = 1 THEN 1 ELSE 0 END) as successes
FROM mcp_call_log
WHERE agent_name = ? AND workspace_id = ? AND created_at > ?
`).get(agentName, workspaceId, cutoff) as any

  const callCount = stats?.total ?? 0
  const okCount = stats?.successes ?? 0

  // No calls in the window counts as fully reliable.
  let reliability = 1.0
  if (callCount > 0) {
    reliability = okCount / callCount
  }
  const percent = (reliability * 100).toFixed(0)

  return {
    layer: 'component',
    score: Math.round(reliability * 100) / 100,
    passed: reliability >= 0.8,
    detail: `Tool reliability: ${okCount}/${callCount} successful (${percent}%)`,
  }
}
// ---------------------------------------------------------------------------
// Layer 4: Drift Detection
// ---------------------------------------------------------------------------
/** Default relative-change threshold (10%) beyond which a metric counts as drifted. */
const DRIFT_THRESHOLD = 0.10

/**
 * Compares a current value against a baseline and reports relative drift.
 *
 * The relative change is |current - baseline| / |baseline|. With a zero
 * baseline it is 1.0 when the current value is non-zero and 0.0 otherwise.
 *
 * @param current - Value for the current window.
 * @param baseline - Value for the baseline window.
 * @param threshold - Relative change that must be exceeded to count as drift
 *   (default `DRIFT_THRESHOLD`).
 * @returns A result with `metric` left as '' for the caller to fill in, and
 *   `delta` rounded to four decimals.
 */
export function checkDrift(
  current: number,
  baseline: number,
  threshold: number = DRIFT_THRESHOLD,
): DriftResult {
  // Floating-point subtraction adds noise: 1.1 - 1.0 is 0.10000000000000009,
  // so an exact 10% rise would be flagged while an exact 10% drop
  // (0.09999999999999998) would not. A small tolerance keeps values that sit
  // on the threshold from counting as drift in either direction.
  const FLOAT_TOLERANCE = 1e-9

  let relativeChange: number
  if (baseline !== 0) {
    relativeChange = Math.abs(current - baseline) / Math.abs(baseline)
  } else {
    relativeChange = current !== 0 ? 1.0 : 0.0
  }

  return {
    metric: '',
    current,
    baseline,
    delta: Math.round(relativeChange * 10000) / 10000,
    drifted: relativeChange - threshold > FLOAT_TOLERANCE,
    threshold,
  }
}
/**
 * Layer 4 (Drift): compares an agent's last seven days with an earlier
 * baseline window on three metrics.
 *
 * Windows, in Unix seconds:
 *   - current:  created_at > now - 7 days
 *   - baseline: now - 28 days < created_at <= now - 7 days (three weeks long)
 *
 * Metrics, in the order returned:
 *   1. `avg_tokens_per_session` — AVG(input_tokens + output_tokens) over
 *      `token_usage` rows; 0 when a window has no rows.
 *   2. `tool_success_rate` — successes / total over `mcp_call_log`; 1.0 when a
 *      window has no rows.
 *   3. `task_completion_rate` — done / total over `tasks`; 1.0 when a window
 *      has no rows.
 *
 * NOTE(review): the `token_usage` queries filter by agent only, while the
 * other two also filter by `workspaceId` — confirm whether that is intended.
 *
 * @param agentName - Agent whose metrics are compared.
 * @param workspaceId - Workspace filter for the tool and task metrics (default 1).
 * @returns One DriftResult per metric, each produced by checkDrift with its
 *   default threshold.
 */
export function runDriftCheck(
  agentName: string,
  workspaceId: number = 1,
): DriftResult[] {
  const db = getDatabase()

  const SECONDS_PER_DAY = 86400
  const nowSec = Math.floor(Date.now() / 1000)
  const recentFrom = nowSec - 7 * SECONDS_PER_DAY
  const baseFrom = nowSec - 4 * 7 * SECONDS_PER_DAY
  // The baseline window ends where the current one begins.
  const baseTo = recentFrom

  // Share of `hits` among `total`, or 1.0 when the window holds no rows.
  const rateOf = (row: any, hits: 'successes' | 'completed'): number => {
    if ((row?.total ?? 0) > 0) {
      return row[hits] / row.total
    }
    return 1.0
  }

  // Runs checkDrift and stamps the metric name onto its result.
  const labelled = (metric: string, current: number, baseline: number): DriftResult => {
    const result = checkDrift(current, baseline)
    result.metric = metric
    return result
  }

  // --- Average tokens per token_usage row ---
  const recentTokens = db.prepare(`
SELECT AVG(input_tokens + output_tokens) as avg_tokens
FROM token_usage
WHERE agent_name = ? AND created_at > ?
`).get(agentName, recentFrom) as any
  const baseTokens = db.prepare(`
SELECT AVG(input_tokens + output_tokens) as avg_tokens
FROM token_usage
WHERE agent_name = ? AND created_at > ? AND created_at <= ?
`).get(agentName, baseFrom, baseTo) as any
  const tokenDrift = labelled(
    'avg_tokens_per_session',
    recentTokens?.avg_tokens ?? 0,
    baseTokens?.avg_tokens ?? 0,
  )

  // --- Tool success rate ---
  const recentTools = db.prepare(`
SELECT
COUNT(*) as total,
SUM(CASE WHEN success = 1 THEN 1 ELSE 0 END) as successes
FROM mcp_call_log
WHERE agent_name = ? AND workspace_id = ? AND created_at > ?
`).get(agentName, workspaceId, recentFrom) as any
  const baseTools = db.prepare(`
SELECT
COUNT(*) as total,
SUM(CASE WHEN success = 1 THEN 1 ELSE 0 END) as successes
FROM mcp_call_log
WHERE agent_name = ? AND workspace_id = ? AND created_at > ? AND created_at <= ?
`).get(agentName, workspaceId, baseFrom, baseTo) as any
  const toolDrift = labelled(
    'tool_success_rate',
    rateOf(recentTools, 'successes'),
    rateOf(baseTools, 'successes'),
  )

  // --- Task completion rate ---
  const recentTasks = db.prepare(`
SELECT
COUNT(*) as total,
SUM(CASE WHEN status = 'done' THEN 1 ELSE 0 END) as completed
FROM tasks
WHERE assigned_to = ? AND workspace_id = ? AND created_at > ?
`).get(agentName, workspaceId, recentFrom) as any
  const baseTasks = db.prepare(`
SELECT
COUNT(*) as total,
SUM(CASE WHEN status = 'done' THEN 1 ELSE 0 END) as completed
FROM tasks
WHERE assigned_to = ? AND workspace_id = ? AND created_at > ? AND created_at <= ?
`).get(agentName, workspaceId, baseFrom, baseTo) as any
  const taskDrift = labelled(
    'task_completion_rate',
    rateOf(recentTasks, 'completed'),
    rateOf(baseTasks, 'completed'),
  )

  return [tokenDrift, toolDrift, taskDrift]
}
/**
 * Builds a week-by-week history of the three drift metrics for an agent.
 *
 * Each entry covers one seven-day window ending `i` weeks before now, selected
 * by `weekStart < created_at <= weekEnd` (Unix seconds). Entries are ordered
 * oldest first.
 *
 * Unlike the 0–1 scores used elsewhere in this file, `successRate` and
 * `completionRate` are percentages (0–100) rounded to two decimals, and they
 * default to 100 for a week with no rows. `avgTokens` is rounded to an integer
 * and defaults to 0.
 *
 * NOTE(review): the `token_usage` query filters by agent only, while the other
 * two also filter by `workspaceId` — confirm whether that is intended.
 *
 * @param agentName - Agent whose history is reported.
 * @param weeks - Number of weekly windows to report (default 8). Values below
 *   1 produce an empty timeline.
 * @param workspaceId - Workspace filter for the tool and task metrics (default 1).
 * @returns One entry per week.
 */
export function getDriftTimeline(
  agentName: string,
  weeks: number = 8,
  workspaceId: number = 1,
): Array<{ weekStart: number; avgTokens: number; successRate: number; completionRate: number }> {
  const db = getDatabase()
  const now = Math.floor(Date.now() / 1000)
  const timeline: ReturnType<typeof getDriftTimeline> = []

  // The loop below would not run for weeks < 1 (or NaN); return before
  // preparing any statement so this case still issues no SQL.
  if (!(weeks >= 1)) return timeline

  // The three statements do not depend on the week, so they are prepared once
  // and re-executed with different bounds.
  const tokensStmt = db.prepare(`
SELECT AVG(input_tokens + output_tokens) as avg_tokens
FROM token_usage
WHERE agent_name = ? AND created_at > ? AND created_at <= ?
`)
  const toolsStmt = db.prepare(`
SELECT
COUNT(*) as total,
SUM(CASE WHEN success = 1 THEN 1 ELSE 0 END) as successes
FROM mcp_call_log
WHERE agent_name = ? AND workspace_id = ? AND created_at > ? AND created_at <= ?
`)
  const tasksStmt = db.prepare(`
SELECT
COUNT(*) as total,
SUM(CASE WHEN status = 'done' THEN 1 ELSE 0 END) as completed
FROM tasks
WHERE assigned_to = ? AND workspace_id = ? AND created_at > ? AND created_at <= ?
`)

  const SECONDS_PER_WEEK = 7 * 86400

  // Walk from the oldest week (i = weeks - 1) to the most recent (i = 0).
  for (let i = weeks - 1; i >= 0; i--) {
    const weekStart = now - (i + 1) * SECONDS_PER_WEEK
    const weekEnd = now - i * SECONDS_PER_WEEK

    const tokens = tokensStmt.get(agentName, weekStart, weekEnd) as any
    const tools = toolsStmt.get(agentName, workspaceId, weekStart, weekEnd) as any
    const tasks = tasksStmt.get(agentName, workspaceId, weekStart, weekEnd) as any

    timeline.push({
      weekStart,
      avgTokens: Math.round(tokens?.avg_tokens ?? 0),
      successRate: (tools?.total ?? 0) > 0
        ? Math.round((tools.successes / tools.total) * 10000) / 100
        : 100,
      completionRate: (tasks?.total ?? 0) > 0
        ? Math.round((tasks.completed / tasks.total) * 10000) / 100
        : 100,
    })
  }

  return timeline
}