/**
 * Agent Evals — four-layer evaluation engine for agent performance.
 *
 * Layer 1 (Output): Task completion and correctness scoring
 * Layer 2 (Trace): Convergence analysis and reasoning coherence
 * Layer 3 (Component): Tool reliability from MCP call logs
 * Layer 4 (Drift): Rolling baseline comparison with threshold detection
 */

import { getDatabase } from '@/lib/db'

export type EvalLayer = 'output' | 'trace' | 'component' | 'drift'

/** Outcome of a single evaluation: a score, a pass/fail verdict and a human-readable summary. */
export interface EvalResult {
  layer: EvalLayer
  score: number
  passed: boolean
  detail: string
}

/** Comparison of one metric between the current window and its baseline window. */
export interface DriftResult {
  metric: string
  current: number
  baseline: number
  /** Relative change versus the baseline, rounded to 4 decimals. */
  delta: number
  drifted: boolean
  threshold: number
}

// ---------------------------------------------------------------------------
// Internal helpers and row shapes
// ---------------------------------------------------------------------------

const SECONDS_PER_HOUR = 3600
const SECONDS_PER_WEEK = 7 * 86400

/** Aggregate row for the task queries. SUM() yields NULL when no rows match. */
interface TaskCountRow {
  total: number
  completed: number | null
}

/** Aggregate row for the correctness query. */
interface CorrectnessRow {
  total: number
  successful: number | null
  avg_rating: number | null
}

/** Aggregate row for the tool-call success queries. */
interface ToolCountRow {
  total: number
  successes: number | null
}

/** Aggregate row for the call-mix query used by the convergence check. */
interface CallMixRow {
  total_calls: number
  unique_tools: number
}

/** Aggregate row for the token queries. AVG() yields NULL when no rows match. */
interface AvgTokensRow {
  avg_tokens: number | null
}

/** Current time as whole Unix seconds, the unit every `created_at` cutoff below is expressed in. */
function unixNow(): number {
  return Math.floor(Date.now() / 1000)
}

/** Unix-seconds cutoff lying `hours` hours before now. */
function cutoffHoursAgo(hours: number): number {
  return unixNow() - hours * SECONDS_PER_HOUR
}

/** Rounds to two decimal places. */
function round2(value: number): number {
  return Math.round(value * 100) / 100
}

/**
 * Returns `part / total`, or 1.0 when `total` is missing or zero.
 * A missing or NULL `part` counts as 0.
 */
function rateOrPerfect(
  part: number | null | undefined,
  total: number | null | undefined,
): number {
  const denominator = total ?? 0
  return denominator > 0 ? (part ?? 0) / denominator : 1.0
}

// ---------------------------------------------------------------------------
// Layer 1: Output Evals
// ---------------------------------------------------------------------------

/**
 * Scores the share of an agent's tasks that reached status 'done'.
 *
 * @param agentName - Value matched against `tasks.assigned_to`.
 * @param hours - Look-back window in hours (default 168, i.e. 7 days).
 * @param workspaceId - Value matched against `tasks.workspace_id`.
 * @returns An 'output' result that passes when the completion ratio is at
 *   least 0.7. With no tasks in the window the score is 1.0.
 */
export function evalTaskCompletion(
  agentName: string,
  hours: number = 168,
  workspaceId: number = 1,
): EvalResult {
  const db = getDatabase()
  const since = cutoffHoursAgo(hours)

  const row = db.prepare(`
    SELECT
      COUNT(*) as total,
      SUM(CASE WHEN status = 'done' THEN 1 ELSE 0 END) as completed
    FROM tasks
    WHERE assigned_to = ? AND workspace_id = ? AND created_at > ?
  `).get(agentName, workspaceId, since) as TaskCountRow | undefined

  const total = row?.total ?? 0
  const completed = row?.completed ?? 0
  const score = rateOrPerfect(completed, total)

  return {
    layer: 'output',
    score: round2(score),
    // The verdict uses the unrounded ratio.
    passed: score >= 0.7,
    detail: `${completed}/${total} tasks completed (${(score * 100).toFixed(0)}%)`,
  }
}

/**
 * Scores how often an agent's finished tasks succeeded, blended with the
 * average feedback rating when one exists.
 *
 * Only tasks with status 'done' are considered. When an average rating is
 * available the score is 60% success rate plus 40% rating, with the rating
 * normalised to 0-1 on the assumption that it lies on a 1-5 scale.
 *
 * @param agentName - Value matched against `tasks.assigned_to`.
 * @param hours - Look-back window in hours (default 168, i.e. 7 days).
 * @param workspaceId - Value matched against `tasks.workspace_id`.
 * @returns An 'output' result that passes when the blended score is at least 0.6.
 */
export function evalCorrectnessScore(
  agentName: string,
  hours: number = 168,
  workspaceId: number = 1,
): EvalResult {
  const db = getDatabase()
  const since = cutoffHoursAgo(hours)

  const row = db.prepare(`
    SELECT
      COUNT(*) as total,
      SUM(CASE WHEN outcome = 'success' THEN 1 ELSE 0 END) as successful,
      AVG(CASE WHEN feedback_rating IS NOT NULL THEN feedback_rating ELSE NULL END) as avg_rating
    FROM tasks
    WHERE assigned_to = ? AND workspace_id = ? AND status = 'done' AND created_at > ?
  `).get(agentName, workspaceId, since) as CorrectnessRow | undefined

  const total = row?.total ?? 0
  const successful = row?.successful ?? 0
  const successRate = rateOrPerfect(successful, total)
  const avgRating = row?.avg_rating

  // Blend success rate with feedback rating if available
  // (normalized to 0-1 assuming 1-5 scale).
  const score = avgRating != null
    ? successRate * 0.6 + ((avgRating - 1) / 4) * 0.4
    : successRate
  const ratingSuffix = avgRating != null ? `, avg rating ${avgRating.toFixed(1)}` : ''

  return {
    layer: 'output',
    score: round2(score),
    passed: score >= 0.6,
    detail: `Correctness: ${(score * 100).toFixed(0)}% (${successful}/${total} successful${ratingSuffix})`,
  }
}

/**
 * Runs both output-layer evals with the same arguments.
 *
 * @returns The task-completion result followed by the correctness result.
 */
export function runOutputEvals(
  agentName: string,
  hours: number = 168,
  workspaceId: number = 1,
): EvalResult[] {
  return [
    evalTaskCompletion(agentName, hours, workspaceId),
    evalCorrectnessScore(agentName, hours, workspaceId),
  ]
}

// ---------------------------------------------------------------------------
// Layer 2: Trace Evals
// ---------------------------------------------------------------------------

/**
 * Derives a convergence score from the ratio of total tool calls to distinct
 * tools used.
 *
 * @param totalToolCalls - Number of tool calls made.
 * @param uniqueTools - Number of distinct tools among those calls.
 * @returns `score` is `min(1, 3 / ratio)` rounded to 2 decimals; `looping` is
 *   true when the ratio exceeds 3.0. With zero unique tools the result is a
 *   score of 1.0 and no looping.
 */
export function convergenceScore(
  totalToolCalls: number,
  uniqueTools: number,
): { score: number; looping: boolean } {
  if (uniqueTools === 0) {
    return { score: 1.0, looping: false }
  }

  const ratio = totalToolCalls / uniqueTools
  // ratio > 3.0 indicates looping behavior
  return {
    score: round2(Math.min(1.0, 3.0 / ratio)),
    looping: ratio > 3.0,
  }
}

/**
 * Evaluates reasoning coherence from the agent's logged MCP calls, using
 * {@link convergenceScore} on the total-call and distinct-tool counts.
 *
 * @param agentName - Value matched against `mcp_call_log.agent_name`.
 * @param hours - Look-back window in hours (default 24).
 * @param workspaceId - Value matched against `mcp_call_log.workspace_id`.
 * @returns A 'trace' result that passes when no looping is detected.
 */
export function evalReasoningCoherence(
  agentName: string,
  hours: number = 24,
  workspaceId: number = 1,
): EvalResult {
  const db = getDatabase()
  const since = cutoffHoursAgo(hours)

  const row = db.prepare(`
    SELECT
      COUNT(*) as total_calls,
      COUNT(DISTINCT tool_name) as unique_tools
    FROM mcp_call_log
    WHERE agent_name = ? AND workspace_id = ? AND created_at > ?
  `).get(agentName, workspaceId, since) as CallMixRow | undefined

  const total = row?.total_calls ?? 0
  const unique = row?.unique_tools ?? 0
  const { score, looping } = convergenceScore(total, unique)

  const ratioText = unique > 0 ? (total / unique).toFixed(1) : 'N/A'
  const loopingSuffix = looping ? ' — LOOPING DETECTED' : ''

  return {
    layer: 'trace',
    score,
    passed: !looping,
    detail: `Convergence: ${total} calls across ${unique} unique tools (ratio ${ratioText})${loopingSuffix}`,
  }
}

// ---------------------------------------------------------------------------
// Layer 3: Component Evals
// ---------------------------------------------------------------------------

/**
 * Scores the share of the agent's logged MCP calls whose `success` flag is 1.
 *
 * @param agentName - Value matched against `mcp_call_log.agent_name`.
 * @param hours - Look-back window in hours (default 24).
 * @param workspaceId - Value matched against `mcp_call_log.workspace_id`.
 * @returns A 'component' result that passes when the success ratio is at
 *   least 0.8. With no calls in the window the score is 1.0.
 */
export function evalToolReliability(
  agentName: string,
  hours: number = 24,
  workspaceId: number = 1,
): EvalResult {
  const db = getDatabase()
  const since = cutoffHoursAgo(hours)

  const row = db.prepare(`
    SELECT
      COUNT(*) as total,
      SUM(CASE WHEN success = 1 THEN 1 ELSE 0 END) as successes
    FROM mcp_call_log
    WHERE agent_name = ? AND workspace_id = ? AND created_at > ?
  `).get(agentName, workspaceId, since) as ToolCountRow | undefined

  const total = row?.total ?? 0
  const successes = row?.successes ?? 0
  const score = rateOrPerfect(successes, total)

  return {
    layer: 'component',
    score: round2(score),
    passed: score >= 0.8,
    detail: `Tool reliability: ${successes}/${total} successful (${(score * 100).toFixed(0)}%)`,
  }
}

// ---------------------------------------------------------------------------
// Layer 4: Drift Detection
// ---------------------------------------------------------------------------

const DRIFT_THRESHOLD = 0.10

/**
 * Compares a current value with its baseline and flags drift when the
 * relative change exceeds `threshold`.
 *
 * The relative change is `|current - baseline| / |baseline|`. With a zero
 * baseline it is 1.0 when `current` is non-zero and 0.0 otherwise.
 *
 * @param current - Value measured in the current window.
 * @param baseline - Value measured in the baseline window.
 * @param threshold - Relative change above which drift is reported (default 0.10).
 * @returns A result whose `metric` is an empty string, for the caller to fill
 *   in. `drifted` is decided on the unrounded delta.
 */
export function checkDrift(
  current: number,
  baseline: number,
  threshold: number = DRIFT_THRESHOLD,
): DriftResult {
  let delta: number
  if (baseline !== 0) {
    delta = Math.abs(current - baseline) / Math.abs(baseline)
  } else {
    delta = current !== 0 ? 1.0 : 0.0
  }

  return {
    metric: '',
    current,
    baseline,
    delta: Math.round(delta * 10000) / 10000,
    drifted: delta > threshold,
    threshold,
  }
}

/**
 * Compares the agent's last 7 days against the 4 weeks immediately before
 * them, for three metrics.
 *
 * Metrics, in return order: 'avg_tokens_per_session', 'tool_success_rate',
 * 'task_completion_rate'. A window with no token rows contributes 0; a window
 * with no tool calls or tasks contributes a rate of 1.0.
 *
 * @param agentName - Agent whose rows are compared.
 * @param workspaceId - Workspace filter applied to the tool-call and task queries.
 * @returns One {@link DriftResult} per metric, using the default threshold.
 */
export function runDriftCheck(
  agentName: string,
  workspaceId: number = 1,
): DriftResult[] {
  const db = getDatabase()
  const now = unixNow()

  // Current window: last 7 days
  const currentStart = now - SECONDS_PER_WEEK
  // Baseline window: 4 weeks ending 1 week ago. The start is anchored to the
  // baseline's end (not to `now`) so the window really spans four weeks.
  const baselineEnd = currentStart
  const baselineStart = baselineEnd - 4 * SECONDS_PER_WEEK

  // Metric: avg tokens per session
  // NOTE(review): unlike the other two metrics, token_usage is not filtered by
  // workspace — confirm whether that table carries a workspace_id column.
  const currentTokens = db.prepare(`
    SELECT AVG(input_tokens + output_tokens) as avg_tokens
    FROM token_usage
    WHERE agent_name = ? AND created_at > ?
  `).get(agentName, currentStart) as AvgTokensRow | undefined

  const baselineTokens = db.prepare(`
    SELECT AVG(input_tokens + output_tokens) as avg_tokens
    FROM token_usage
    WHERE agent_name = ? AND created_at > ? AND created_at <= ?
  `).get(agentName, baselineStart, baselineEnd) as AvgTokensRow | undefined

  const tokenDrift: DriftResult = {
    ...checkDrift(currentTokens?.avg_tokens ?? 0, baselineTokens?.avg_tokens ?? 0),
    metric: 'avg_tokens_per_session',
  }

  // Metric: tool success rate
  const currentTools = db.prepare(`
    SELECT
      COUNT(*) as total,
      SUM(CASE WHEN success = 1 THEN 1 ELSE 0 END) as successes
    FROM mcp_call_log
    WHERE agent_name = ? AND workspace_id = ? AND created_at > ?
  `).get(agentName, workspaceId, currentStart) as ToolCountRow | undefined

  const baselineTools = db.prepare(`
    SELECT
      COUNT(*) as total,
      SUM(CASE WHEN success = 1 THEN 1 ELSE 0 END) as successes
    FROM mcp_call_log
    WHERE agent_name = ? AND workspace_id = ? AND created_at > ? AND created_at <= ?
  `).get(agentName, workspaceId, baselineStart, baselineEnd) as ToolCountRow | undefined

  const toolDrift: DriftResult = {
    ...checkDrift(
      rateOrPerfect(currentTools?.successes, currentTools?.total),
      rateOrPerfect(baselineTools?.successes, baselineTools?.total),
    ),
    metric: 'tool_success_rate',
  }

  // Metric: task completion rate
  const currentTasks = db.prepare(`
    SELECT
      COUNT(*) as total,
      SUM(CASE WHEN status = 'done' THEN 1 ELSE 0 END) as completed
    FROM tasks
    WHERE assigned_to = ? AND workspace_id = ? AND created_at > ?
  `).get(agentName, workspaceId, currentStart) as TaskCountRow | undefined

  const baselineTasks = db.prepare(`
    SELECT
      COUNT(*) as total,
      SUM(CASE WHEN status = 'done' THEN 1 ELSE 0 END) as completed
    FROM tasks
    WHERE assigned_to = ? AND workspace_id = ? AND created_at > ? AND created_at <= ?
  `).get(agentName, workspaceId, baselineStart, baselineEnd) as TaskCountRow | undefined

  const taskDrift: DriftResult = {
    ...checkDrift(
      rateOrPerfect(currentTasks?.completed, currentTasks?.total),
      rateOrPerfect(baselineTasks?.completed, baselineTasks?.total),
    ),
    metric: 'task_completion_rate',
  }

  return [tokenDrift, toolDrift, taskDrift]
}

/**
 * Builds a week-by-week history of the three drift metrics, oldest week first.
 *
 * Each entry covers `(weekStart, weekStart + 7 days]`, with the final week
 * ending now. `successRate` and `completionRate` are percentages rounded to
 * 2 decimals and default to 100 for a week with no rows; `avgTokens` is
 * rounded to an integer and defaults to 0.
 *
 * @param agentName - Agent whose rows are summarised.
 * @param weeks - Number of 7-day buckets to return (default 8).
 * @param workspaceId - Workspace filter applied to the tool-call and task queries.
 * @returns One entry per week, or an empty array when `weeks` is below 1.
 */
export function getDriftTimeline(
  agentName: string,
  weeks: number = 8,
  workspaceId: number = 1,
): Array<{ weekStart: number; avgTokens: number; successRate: number; completionRate: number }> {
  const db = getDatabase()
  const now = unixNow()
  const timeline: ReturnType<typeof getDriftTimeline> = []

  // The loop below would not run, so skip preparing statements altogether.
  if (!(weeks >= 1)) {
    return timeline
  }

  // Prepared once and reused for every week instead of re-preparing per iteration.
  const tokensStmt = db.prepare(`
    SELECT AVG(input_tokens + output_tokens) as avg_tokens
    FROM token_usage
    WHERE agent_name = ? AND created_at > ? AND created_at <= ?
  `)
  const toolsStmt = db.prepare(`
    SELECT
      COUNT(*) as total,
      SUM(CASE WHEN success = 1 THEN 1 ELSE 0 END) as successes
    FROM mcp_call_log
    WHERE agent_name = ? AND workspace_id = ? AND created_at > ? AND created_at <= ?
  `)
  const tasksStmt = db.prepare(`
    SELECT
      COUNT(*) as total,
      SUM(CASE WHEN status = 'done' THEN 1 ELSE 0 END) as completed
    FROM tasks
    WHERE assigned_to = ? AND workspace_id = ? AND created_at > ? AND created_at <= ?
  `)

  for (let i = weeks - 1; i >= 0; i--) {
    const weekStart = now - (i + 1) * SECONDS_PER_WEEK
    const weekEnd = now - i * SECONDS_PER_WEEK

    const tokens = tokensStmt.get(agentName, weekStart, weekEnd) as AvgTokensRow | undefined
    const tools = toolsStmt.get(agentName, workspaceId, weekStart, weekEnd) as ToolCountRow | undefined
    const tasks = tasksStmt.get(agentName, workspaceId, weekStart, weekEnd) as TaskCountRow | undefined

    timeline.push({
      weekStart,
      avgTokens: Math.round(tokens?.avg_tokens ?? 0),
      // An empty week yields a rate of 1.0, which becomes 100 here.
      successRate: Math.round(rateOrPerfect(tools?.successes, tools?.total) * 10000) / 100,
      completionRate: Math.round(rateOrPerfect(tasks?.completed, tasks?.total) * 10000) / 100,
    })
  }

  return timeline
}