Spaces:

tao-shen
/

HuggingClaw-MissionControl

Sleeping

HuggingClaw-MissionControl / src /lib /agent-evals.ts

nyk

feat(refactor): ready for manual QA after main sync (#274)

b6ecafa unverified 3 months ago

11.1 kB

	/**
	* Agent Evals — four-layer evaluation engine for agent performance.
	*
	* Layer 1 (Output): Task completion and correctness scoring
	* Layer 2 (Trace): Convergence analysis and reasoning coherence
	* Layer 3 (Component): Tool reliability from MCP call logs
	* Layer 4 (Drift): Rolling baseline comparison with threshold detection
	*/

	import { getDatabase } from '@/lib/db'

	/** Identifies which of the four evaluation layers produced a result. */
	export type EvalLayer = 'output' | 'trace' | 'component' | 'drift'

	/** Outcome of a single evaluation check. */
	export interface EvalResult {
	/** Evaluation layer the check belongs to. */
	layer: EvalLayer
	/** Numeric score for the check. */
	score: number
	/** Whether the check's pass criterion was met. */
	passed: boolean
	/** Human-readable summary of what was measured. */
	detail: string
	}

	/** Comparison of one metric's current value against its baseline. */
	export interface DriftResult {
	/** Name of the metric being compared. */
	metric: string
	/** Value observed in the current window. */
	current: number
	/** Value observed in the baseline window. */
	baseline: number
	/** Size of the change between `current` and `baseline`. */
	delta: number
	/** Whether the change counts as drift. */
	drifted: boolean
	/** Threshold the change was compared against. */
	threshold: number
	}

	// ---------------------------------------------------------------------------
	// Layer 1: Output Evals
	// ---------------------------------------------------------------------------

	/**
	 * Layer 1 (output): fraction of an agent's recently created tasks that reached
	 * status `done`.
	 *
	 * When no tasks fall inside the window the score is 1.0. The check passes when
	 * the unrounded completion ratio is at least 0.7.
	 *
	 * @param agentName - Matched against `tasks.assigned_to`.
	 * @param hours - Look-back window in hours (default 168, i.e. seven days).
	 * @param workspaceId - Matched against `tasks.workspace_id`.
	 * @returns An `output`-layer result whose score is rounded to two decimals.
	 */
	export function evalTaskCompletion(
	  agentName: string,
	  hours: number = 168,
	  workspaceId: number = 1,
	): EvalResult {
	  const db = getDatabase()
	  // Window start in epoch seconds; compared against `created_at`.
	  const since = Math.floor(Date.now() / 1000) - hours * 3600

	  // Only the aggregates that are actually read below are selected.
	  const row = db.prepare(`
	    SELECT
	      COUNT(*) as total,
	      SUM(CASE WHEN status = 'done' THEN 1 ELSE 0 END) as completed
	    FROM tasks
	    WHERE assigned_to = ? AND workspace_id = ? AND created_at > ?
	  `).get(agentName, workspaceId, since) as
	    | { total: number | null; completed: number | null }
	    | undefined

	  // A missing row or a NULL aggregate is treated as zero.
	  const total = row?.total ?? 0
	  const completed = row?.completed ?? 0
	  const score = total > 0 ? completed / total : 1.0

	  return {
	    layer: 'output',
	    score: Math.round(score * 100) / 100,
	    passed: score >= 0.7,
	    detail: `${completed}/${total} tasks completed (${(score * 100).toFixed(0)}%)`,
	  }
	}

	/**
	 * Layer 1 (output): correctness of an agent's finished tasks.
	 *
	 * Looks at tasks with status `done` inside the window and computes the share
	 * whose outcome is `success`. When at least one task carries a feedback
	 * rating, the success rate (weight 0.6) is blended with the average rating
	 * (weight 0.4) after mapping the rating from an assumed 1–5 scale onto 0–1.
	 * The mapped rating is clamped to [0, 1] so that a rating outside the assumed
	 * scale cannot push the score outside its valid range.
	 *
	 * When no finished tasks fall inside the window the success rate is 1.0. The
	 * check passes when the unrounded score is at least 0.6.
	 *
	 * @param agentName - Matched against `tasks.assigned_to`.
	 * @param hours - Look-back window in hours (default 168, i.e. seven days).
	 * @param workspaceId - Matched against `tasks.workspace_id`.
	 * @returns An `output`-layer result whose score is rounded to two decimals.
	 */
	export function evalCorrectnessScore(
	  agentName: string,
	  hours: number = 168,
	  workspaceId: number = 1,
	): EvalResult {
	  const db = getDatabase()
	  // Window start in epoch seconds; compared against `created_at`.
	  const since = Math.floor(Date.now() / 1000) - hours * 3600

	  // AVG skips NULL inputs, so unrated tasks do not affect avg_rating.
	  const row = db.prepare(`
	    SELECT
	      COUNT(*) as total,
	      SUM(CASE WHEN outcome = 'success' THEN 1 ELSE 0 END) as successful,
	      AVG(feedback_rating) as avg_rating
	    FROM tasks
	    WHERE assigned_to = ? AND workspace_id = ? AND status = 'done' AND created_at > ?
	  `).get(agentName, workspaceId, since) as
	    | { total: number | null; successful: number | null; avg_rating: number | null }
	    | undefined

	  const total = row?.total ?? 0
	  const successful = row?.successful ?? 0
	  const successRate = total > 0 ? successful / total : 1.0
	  const avgRating = row?.avg_rating ?? null

	  let score = successRate
	  if (avgRating !== null) {
	    // Map 1..5 onto 0..1, then clamp in case the stored rating is off-scale.
	    const normalizedRating = Math.min(1, Math.max(0, (avgRating - 1) / 4))
	    score = successRate * 0.6 + normalizedRating * 0.4
	  }

	  const ratingNote = avgRating !== null ? `, avg rating ${avgRating.toFixed(1)}` : ''

	  return {
	    layer: 'output',
	    score: Math.round(score * 100) / 100,
	    passed: score >= 0.6,
	    detail: `Correctness: ${(score * 100).toFixed(0)}% (${successful}/${total} successful${ratingNote})`,
	  }
	}

	/**
	 * Runs both output-layer checks for an agent over the same window.
	 *
	 * @param agentName - Agent to evaluate.
	 * @param hours - Look-back window in hours, forwarded to both checks (default 168).
	 * @param workspaceId - Workspace filter, forwarded to both checks.
	 * @returns The task-completion result followed by the correctness result.
	 */
	export function runOutputEvals(
	  agentName: string,
	  hours: number = 168,
	  workspaceId: number = 1,
	): EvalResult[] {
	  const completion = evalTaskCompletion(agentName, hours, workspaceId)
	  const correctness = evalCorrectnessScore(agentName, hours, workspaceId)
	  return [completion, correctness]
	}

	// ---------------------------------------------------------------------------
	// Layer 2: Trace Evals
	// ---------------------------------------------------------------------------

	/**
	 * Scores how quickly a run converged, based on calls made per distinct tool.
	 *
	 * A calls-per-tool ratio above 3.0 is reported as looping. The score is
	 * `3.0 / ratio`, capped at 1.0 and rounded to two decimals. When no tools
	 * were used at all the run is scored 1.0 and not looping.
	 *
	 * @param totalToolCalls - Number of tool calls made.
	 * @param uniqueTools - Number of distinct tools among those calls.
	 * @returns The rounded score and the looping flag.
	 */
	export function convergenceScore(
	  totalToolCalls: number,
	  uniqueTools: number,
	): { score: number; looping: boolean } {
	  if (uniqueTools === 0) {
	    return { score: 1.0, looping: false }
	  }

	  const callsPerTool = totalToolCalls / uniqueTools
	  const looping = callsPerTool > 3.0
	  const capped = Math.min(1.0, 3.0 / callsPerTool)
	  const score = Math.round(capped * 100) / 100

	  return { score, looping }
	}

	/**
	 * Layer 2 (trace): flags looping behaviour from an agent's recent MCP calls.
	 *
	 * Counts the rows and the distinct `tool_name` values in `mcp_call_log` for
	 * the look-back window and passes both numbers to `convergenceScore`. The
	 * check passes whenever looping is not reported.
	 *
	 * @param agentName - Matched against `mcp_call_log.agent_name`.
	 * @param hours - Look-back window in hours (default 24).
	 * @param workspaceId - Matched against `mcp_call_log.workspace_id`.
	 * @returns A `trace`-layer result carrying the convergence score.
	 */
	export function evalReasoningCoherence(
	  agentName: string,
	  hours: number = 24,
	  workspaceId: number = 1,
	): EvalResult {
	  const db = getDatabase()
	  // Window start in epoch seconds; compared against `created_at`.
	  const windowStart = Math.floor(Date.now() / 1000) - hours * 3600

	  const stats = db.prepare(`
	SELECT
	COUNT(*) as total_calls,
	COUNT(DISTINCT tool_name) as unique_tools
	FROM mcp_call_log
	WHERE agent_name = ? AND workspace_id = ? AND created_at > ?
	`).get(agentName, workspaceId, windowStart) as any

	  const callCount = stats?.total_calls ?? 0
	  const toolCount = stats?.unique_tools ?? 0
	  const verdict = convergenceScore(callCount, toolCount)

	  // The ratio can only be shown when at least one tool was used.
	  let ratioText = 'N/A'
	  if (toolCount > 0) {
	    ratioText = (callCount / toolCount).toFixed(1)
	  }
	  const loopNote = verdict.looping ? ' — LOOPING DETECTED' : ''

	  return {
	    layer: 'trace',
	    score: verdict.score,
	    passed: !verdict.looping,
	    detail: `Convergence: ${callCount} calls across ${toolCount} unique tools (ratio ${ratioText})${loopNote}`,
	  }
	}

	// ---------------------------------------------------------------------------
	// Layer 3: Component Evals
	// ---------------------------------------------------------------------------

	/**
	 * Layer 3 (component): share of an agent's recent MCP calls that succeeded.
	 *
	 * Reads `mcp_call_log` for the look-back window and divides the rows with
	 * `success = 1` by the total row count. With no calls in the window the
	 * score is 1.0. The check passes when the unrounded ratio is at least 0.8.
	 *
	 * @param agentName - Matched against `mcp_call_log.agent_name`.
	 * @param hours - Look-back window in hours (default 24).
	 * @param workspaceId - Matched against `mcp_call_log.workspace_id`.
	 * @returns A `component`-layer result whose score is rounded to two decimals.
	 */
	export function evalToolReliability(
	  agentName: string,
	  hours: number = 24,
	  workspaceId: number = 1,
	): EvalResult {
	  const db = getDatabase()
	  // Window start in epoch seconds; compared against `created_at`.
	  const windowStart = Math.floor(Date.now() / 1000) - hours * 3600

	  const stats = db.prepare(`
	SELECT
	COUNT(*) as total,
	SUM(CASE WHEN success = 1 THEN 1 ELSE 0 END) as successes
	FROM mcp_call_log
	WHERE agent_name = ? AND workspace_id = ? AND created_at > ?
	`).get(agentName, workspaceId, windowStart) as any

	  const callCount = stats?.total ?? 0
	  const okCount = stats?.successes ?? 0

	  // No calls at all counts as fully reliable.
	  let reliability = 1.0
	  if (callCount > 0) {
	    reliability = okCount / callCount
	  }
	  const percentText = (reliability * 100).toFixed(0)

	  return {
	    layer: 'component',
	    score: Math.round(reliability * 100) / 100,
	    passed: reliability >= 0.8,
	    detail: `Tool reliability: ${okCount}/${callCount} successful (${percentText}%)`,
	  }
	}

	// ---------------------------------------------------------------------------
	// Layer 4: Drift Detection
	// ---------------------------------------------------------------------------

	/** Relative change above which a metric is reported as drifted, unless a caller overrides it. */
	const DRIFT_THRESHOLD = 0.10

	/**
	 * Measures how far `current` has moved away from `baseline`, relative to the
	 * baseline's magnitude.
	 *
	 * With a zero baseline the relative change is undefined, so the delta is
	 * 1.0 when `current` is non-zero and 0.0 when both are zero. The drift flag
	 * is decided on the unrounded delta; the returned delta is rounded to four
	 * decimals. `metric` is returned as an empty string for the caller to fill in.
	 *
	 * @param current - Value observed in the current window.
	 * @param baseline - Value observed in the baseline window.
	 * @param threshold - Relative change above which drift is reported.
	 * @returns The comparison result with an empty `metric` name.
	 */
	export function checkDrift(
	  current: number,
	  baseline: number,
	  threshold: number = DRIFT_THRESHOLD,
	): DriftResult {
	  let relativeChange: number
	  if (baseline !== 0) {
	    relativeChange = Math.abs(current - baseline) / Math.abs(baseline)
	  } else if (current !== 0) {
	    relativeChange = 1.0
	  } else {
	    relativeChange = 0.0
	  }

	  const drifted = relativeChange > threshold
	  const delta = Math.round(relativeChange * 10000) / 10000

	  return { metric: '', current, baseline, delta, drifted, threshold }
	}

	/**
	 * Layer 4 (drift): compares an agent's last seven days against an earlier
	 * baseline window for three metrics.
	 *
	 * Windows, in epoch seconds relative to the moment of the call:
	 * - current: rows with `created_at` after now − 7 days
	 * - baseline: rows with `created_at` after now − 28 days and up to now − 7 days,
	 *   i.e. the three weeks that precede the current window
	 *
	 * Metrics, returned in this order: `avg_tokens_per_session`,
	 * `tool_success_rate`, `task_completion_rate`. A missing token average is
	 * treated as 0; a window without tool calls or tasks is treated as a rate
	 * of 1.0.
	 *
	 * NOTE(review): the `token_usage` queries are not filtered by workspace,
	 * unlike the other two tables — confirm whether that is intentional.
	 *
	 * @param agentName - Agent whose rows are compared.
	 * @param workspaceId - Applied to the `mcp_call_log` and `tasks` queries.
	 * @returns One drift result per metric, each using `checkDrift`'s default threshold.
	 */
	export function runDriftCheck(
	  agentName: string,
	  workspaceId: number = 1,
	): DriftResult[] {
	  const db = getDatabase()
	  const nowSec = Math.floor(Date.now() / 1000)

	  // The baseline window ends exactly where the current window begins.
	  const currentStart = nowSec - 7 * 86400
	  const baselineStart = nowSec - 4 * 7 * 86400
	  const baselineEnd = currentStart

	  // Share of `part` within `total`; an empty window counts as a perfect 1.0.
	  const rateOf = (row: any, part: string): number =>
	    (row?.total ?? 0) > 0 ? row[part] / row.total : 1.0

	  // --- average tokens per session ---
	  const tokensNow = db.prepare(`
	SELECT AVG(input_tokens + output_tokens) as avg_tokens
	FROM token_usage
	WHERE agent_name = ? AND created_at > ?
	`).get(agentName, currentStart) as any

	  const tokensBefore = db.prepare(`
	SELECT AVG(input_tokens + output_tokens) as avg_tokens
	FROM token_usage
	WHERE agent_name = ? AND created_at > ? AND created_at <= ?
	`).get(agentName, baselineStart, baselineEnd) as any

	  const tokenDrift: DriftResult = {
	    ...checkDrift(tokensNow?.avg_tokens ?? 0, tokensBefore?.avg_tokens ?? 0),
	    metric: 'avg_tokens_per_session',
	  }

	  // --- tool success rate ---
	  const toolsNow = db.prepare(`
	SELECT
	COUNT(*) as total,
	SUM(CASE WHEN success = 1 THEN 1 ELSE 0 END) as successes
	FROM mcp_call_log
	WHERE agent_name = ? AND workspace_id = ? AND created_at > ?
	`).get(agentName, workspaceId, currentStart) as any

	  const toolsBefore = db.prepare(`
	SELECT
	COUNT(*) as total,
	SUM(CASE WHEN success = 1 THEN 1 ELSE 0 END) as successes
	FROM mcp_call_log
	WHERE agent_name = ? AND workspace_id = ? AND created_at > ? AND created_at <= ?
	`).get(agentName, workspaceId, baselineStart, baselineEnd) as any

	  const toolDrift: DriftResult = {
	    ...checkDrift(rateOf(toolsNow, 'successes'), rateOf(toolsBefore, 'successes')),
	    metric: 'tool_success_rate',
	  }

	  // --- task completion rate ---
	  const tasksNow = db.prepare(`
	SELECT
	COUNT(*) as total,
	SUM(CASE WHEN status = 'done' THEN 1 ELSE 0 END) as completed
	FROM tasks
	WHERE assigned_to = ? AND workspace_id = ? AND created_at > ?
	`).get(agentName, workspaceId, currentStart) as any

	  const tasksBefore = db.prepare(`
	SELECT
	COUNT(*) as total,
	SUM(CASE WHEN status = 'done' THEN 1 ELSE 0 END) as completed
	FROM tasks
	WHERE assigned_to = ? AND workspace_id = ? AND created_at > ? AND created_at <= ?
	`).get(agentName, workspaceId, baselineStart, baselineEnd) as any

	  const taskDrift: DriftResult = {
	    ...checkDrift(rateOf(tasksNow, 'completed'), rateOf(tasksBefore, 'completed')),
	    metric: 'task_completion_rate',
	  }

	  return [tokenDrift, toolDrift, taskDrift]
	}

	/**
	 * Builds a week-by-week history of the three drift metrics for an agent.
	 *
	 * The timeline covers `weeks` consecutive seven-day buckets ending at the
	 * moment of the call, ordered oldest first. Each bucket includes rows with
	 * `created_at` after its start and up to and including its end (epoch
	 * seconds).
	 *
	 * Per bucket:
	 * - `avgTokens`: average of `input_tokens + output_tokens`, rounded to an
	 *   integer; 0 when no average is available
	 * - `successRate`: percentage of MCP calls with `success = 1`, two decimals;
	 *   100 when the bucket has no calls
	 * - `completionRate`: percentage of tasks with status `done`, two decimals;
	 *   100 when the bucket has no tasks
	 *
	 * The three statements are prepared once and reused for every bucket rather
	 * than being re-prepared on each loop iteration.
	 *
	 * @param agentName - Agent whose rows are aggregated.
	 * @param weeks - Number of weekly buckets (default 8); values below 1 yield an empty timeline.
	 * @param workspaceId - Applied to the `mcp_call_log` and `tasks` queries.
	 * @returns One entry per bucket, oldest first.
	 */
	export function getDriftTimeline(
	  agentName: string,
	  weeks: number = 8,
	  workspaceId: number = 1,
	): Array<{ weekStart: number; avgTokens: number; successRate: number; completionRate: number }> {
	  const db = getDatabase()
	  const now = Math.floor(Date.now() / 1000)
	  const timeline: Array<{ weekStart: number; avgTokens: number; successRate: number; completionRate: number }> = []

	  // No bucket would be produced (also true for NaN), so skip preparing statements.
	  if (!(weeks >= 1)) return timeline

	  // The SQL does not depend on the bucket, so prepare each statement once.
	  const tokenStmt = db.prepare(`
	    SELECT AVG(input_tokens + output_tokens) as avg_tokens
	    FROM token_usage
	    WHERE agent_name = ? AND created_at > ? AND created_at <= ?
	  `)
	  const toolStmt = db.prepare(`
	    SELECT
	      COUNT(*) as total,
	      SUM(CASE WHEN success = 1 THEN 1 ELSE 0 END) as successes
	    FROM mcp_call_log
	    WHERE agent_name = ? AND workspace_id = ? AND created_at > ? AND created_at <= ?
	  `)
	  const taskStmt = db.prepare(`
	    SELECT
	      COUNT(*) as total,
	      SUM(CASE WHEN status = 'done' THEN 1 ELSE 0 END) as completed
	    FROM tasks
	    WHERE assigned_to = ? AND workspace_id = ? AND created_at > ? AND created_at <= ?
	  `)

	  // Walk from the oldest bucket (i = weeks - 1) to the most recent (i = 0).
	  for (let i = weeks - 1; i >= 0; i--) {
	    const weekStart = now - (i + 1) * 7 * 86400
	    const weekEnd = now - i * 7 * 86400

	    const tokens = tokenStmt.get(agentName, weekStart, weekEnd) as
	      | { avg_tokens: number | null }
	      | undefined
	    const tools = toolStmt.get(agentName, workspaceId, weekStart, weekEnd) as
	      | { total: number | null; successes: number | null }
	      | undefined
	    const tasks = taskStmt.get(agentName, workspaceId, weekStart, weekEnd) as
	      | { total: number | null; completed: number | null }
	      | undefined

	    const toolTotal = tools?.total ?? 0
	    const taskTotal = tasks?.total ?? 0

	    timeline.push({
	      weekStart,
	      avgTokens: Math.round(tokens?.avg_tokens ?? 0),
	      // Percentages with two decimals; an empty bucket reports 100.
	      successRate: toolTotal > 0
	        ? Math.round(((tools?.successes ?? 0) / toolTotal) * 10000) / 100
	        : 100,
	      completionRate: taskTotal > 0
	        ? Math.round(((tasks?.completed ?? 0) / taskTotal) * 10000) / 100
	        : 100,
	    })
	  }

	  return timeline
	}