// Source: Hugging Face Space tao-shen/HuggingClaw-MissionControl (status: Sleeping)
// File size: 11,066 bytes · revision b6ecafa

/**
 * Agent Evals — four-layer evaluation engine for agent performance.
 *
 * Layer 1 (Output): Task completion and correctness scoring
 * Layer 2 (Trace): Convergence analysis and reasoning coherence
 * Layer 3 (Component): Tool reliability from MCP call logs
 * Layer 4 (Drift): Rolling baseline comparison with threshold detection
 */

import { getDatabase } from '@/lib/db'

/** Identifies which of the four evaluation layers a result belongs to. */
export type EvalLayer = 'output' | 'trace' | 'component' | 'drift'

/** Outcome of a single evaluation check. */
export interface EvalResult {
  /** Layer the check belongs to. */
  layer: EvalLayer
  /** Numeric score; the evaluators in this file report it rounded to two decimals. */
  score: number
  /** Whether the check met its pass criterion. */
  passed: boolean
  /** Human-readable summary of the figures behind the score. */
  detail: string
}

/** Comparison of one metric's current value against its baseline value. */
export interface DriftResult {
  /** Metric name; `checkDrift` returns it empty and the caller fills it in. */
  metric: string
  /** Value measured for the current window. */
  current: number
  /** Value measured for the baseline window. */
  baseline: number
  /**
   * Relative change |current - baseline| / |baseline|, rounded to four decimals.
   * With a zero baseline it is 1.0 when `current` is non-zero and 0.0 otherwise.
   */
  delta: number
  /** True when the relative change is greater than `threshold`. */
  drifted: boolean
  /** Relative-change limit the delta was compared against. */
  threshold: number
}

// ---------------------------------------------------------------------------
// Layer 1: Output Evals
// ---------------------------------------------------------------------------

/**
 * Output eval: fraction of the agent's tasks created inside the look-back
 * window whose status is `done`.
 *
 * @param agentName - Matched against `tasks.assigned_to`.
 * @param hours - Size of the look-back window in hours (default 168).
 * @param workspaceId - Matched against `tasks.workspace_id`.
 * @returns An `output`-layer result. The score is the completion ratio rounded
 *   to two decimals; the check passes when the unrounded ratio is at least 0.7.
 *   A window containing no tasks scores 1.0.
 */
export function evalTaskCompletion(
  agentName: string,
  hours: number = 168,
  workspaceId: number = 1,
): EvalResult {
  const db = getDatabase()
  // Window start in Unix seconds; the query compares `created_at` against it.
  const windowStart = Math.floor(Date.now() / 1000) - hours * 3600

  const statement = db.prepare(`
    SELECT
      COUNT(*) as total,
      SUM(CASE WHEN status = 'done' THEN 1 ELSE 0 END) as completed,
      SUM(CASE WHEN outcome = 'success' THEN 1 ELSE 0 END) as successful
    FROM tasks
    WHERE assigned_to = ? AND workspace_id = ? AND created_at > ?
  `)
  const counts = statement.get(agentName, workspaceId, windowStart) as
    | { total: number | null; completed: number | null }
    | undefined

  // The SUM column may come back null (and the row itself may be missing); count those as zero.
  const taskCount = counts?.total ?? 0
  const doneCount = counts?.completed ?? 0

  let completionRate = 1.0
  if (taskCount > 0) {
    completionRate = doneCount / taskCount
  }
  const percentLabel = (completionRate * 100).toFixed(0)

  return {
    layer: 'output',
    score: Math.round(completionRate * 100) / 100,
    passed: completionRate >= 0.7,
    detail: `${doneCount}/${taskCount} tasks completed (${percentLabel}%)`,
  }
}

/**
 * Output eval: correctness of the agent's finished tasks.
 *
 * Looks at tasks with status `done` created inside the look-back window and
 * computes the share whose outcome is `success`. When at least one task has a
 * `feedback_rating`, the average rating is blended in: 60% success rate plus
 * 40% rating, with the rating mapped from an assumed 1-5 scale onto 0-1.
 *
 * @param agentName - Matched against `tasks.assigned_to`.
 * @param hours - Size of the look-back window in hours (default 168).
 * @param workspaceId - Matched against `tasks.workspace_id`.
 * @returns An `output`-layer result. The score is rounded to two decimals and
 *   the check passes when the unrounded score is at least 0.6. A window with
 *   no finished tasks and no ratings scores 1.0.
 */
export function evalCorrectnessScore(
  agentName: string,
  hours: number = 168,
  workspaceId: number = 1,
): EvalResult {
  const db = getDatabase()
  // Window start in Unix seconds; the query compares `created_at` against it.
  const since = Math.floor(Date.now() / 1000) - hours * 3600

  const row = db.prepare(`
    SELECT
      COUNT(*) as total,
      SUM(CASE WHEN outcome = 'success' THEN 1 ELSE 0 END) as successful,
      AVG(CASE WHEN feedback_rating IS NOT NULL THEN feedback_rating ELSE NULL END) as avg_rating
    FROM tasks
    WHERE assigned_to = ? AND workspace_id = ? AND status = 'done' AND created_at > ?
  `).get(agentName, workspaceId, since) as
    | { total: number | null; successful: number | null; avg_rating: number | null }
    | undefined

  const total = row?.total ?? 0
  const successful = row?.successful ?? 0
  const successRate = total > 0 ? successful / total : 1.0
  const avgRating = row?.avg_rating ?? null

  let score = successRate
  if (avgRating !== null) {
    // Map the assumed 1-5 scale onto 0-1, then clamp: a rating stored outside
    // that scale must not push the blended score below 0 or above 1.
    const normalizedRating = Math.min(1, Math.max(0, (avgRating - 1) / 4))
    score = successRate * 0.6 + normalizedRating * 0.4
  }

  const ratingNote = avgRating !== null ? `, avg rating ${avgRating.toFixed(1)}` : ''

  return {
    layer: 'output',
    score: Math.round(score * 100) / 100,
    passed: score >= 0.6,
    detail: `Correctness: ${(score * 100).toFixed(0)}% (${successful}/${total} successful${ratingNote})`,
  }
}

/**
 * Runs both output-layer evals for an agent with the same window and workspace.
 *
 * @param agentName - Agent to evaluate.
 * @param hours - Look-back window in hours, forwarded to both evals (default 168).
 * @param workspaceId - Workspace forwarded to both evals.
 * @returns The task-completion result followed by the correctness result.
 */
export function runOutputEvals(
  agentName: string,
  hours: number = 168,
  workspaceId: number = 1,
): EvalResult[] {
  const completion = evalTaskCompletion(agentName, hours, workspaceId)
  const correctness = evalCorrectnessScore(agentName, hours, workspaceId)
  return [completion, correctness]
}

// ---------------------------------------------------------------------------
// Layer 2: Trace Evals
// ---------------------------------------------------------------------------

/**
 * Scores how well a run converged, based on calls made per distinct tool.
 *
 * @param totalToolCalls - Number of tool calls made.
 * @param uniqueTools - Number of distinct tools among those calls.
 * @returns `score` is 1.0 while the calls-per-tool ratio is at most 3.0 and
 *   falls off as `3.0 / ratio` beyond that (rounded to two decimals);
 *   `looping` is true when the ratio is greater than 3.0. With zero unique
 *   tools the result is a perfect score and no looping.
 */
export function convergenceScore(
  totalToolCalls: number,
  uniqueTools: number,
): { score: number; looping: boolean } {
  // Calls-per-tool ratio above which the run is treated as looping.
  const loopRatio = 3.0

  if (uniqueTools === 0) {
    return { score: 1.0, looping: false }
  }

  const callsPerTool = totalToolCalls / uniqueTools
  const capped = Math.min(1.0, loopRatio / callsPerTool)
  const score = Math.round(capped * 100) / 100
  const looping = callsPerTool > loopRatio

  return { score, looping }
}

/**
 * Trace eval: checks the agent's recent MCP calls for looping.
 *
 * Counts all calls and the distinct tool names logged for the agent inside
 * the look-back window and hands both to `convergenceScore`.
 *
 * @param agentName - Matched against `mcp_call_log.agent_name`.
 * @param hours - Size of the look-back window in hours (default 24).
 * @param workspaceId - Matched against `mcp_call_log.workspace_id`.
 * @returns A `trace`-layer result carrying the convergence score; it passes
 *   when `convergenceScore` does not flag looping.
 */
export function evalReasoningCoherence(
  agentName: string,
  hours: number = 24,
  workspaceId: number = 1,
): EvalResult {
  const db = getDatabase()
  // Window start in Unix seconds; the query compares `created_at` against it.
  const windowStart = Math.floor(Date.now() / 1000) - hours * 3600

  const statement = db.prepare(`
    SELECT
      COUNT(*) as total_calls,
      COUNT(DISTINCT tool_name) as unique_tools
    FROM mcp_call_log
    WHERE agent_name = ? AND workspace_id = ? AND created_at > ?
  `)
  const stats = statement.get(agentName, workspaceId, windowStart) as
    | { total_calls: number | null; unique_tools: number | null }
    | undefined

  const callCount = stats?.total_calls ?? 0
  const toolCount = stats?.unique_tools ?? 0
  const verdict = convergenceScore(callCount, toolCount)

  // The ratio is undefined without any tools, so the summary shows N/A instead.
  const ratioLabel = toolCount > 0 ? (callCount / toolCount).toFixed(1) : 'N/A'
  const loopNote = verdict.looping ? ' — LOOPING DETECTED' : ''

  return {
    layer: 'trace',
    score: verdict.score,
    passed: !verdict.looping,
    detail: `Convergence: ${callCount} calls across ${toolCount} unique tools (ratio ${ratioLabel})${loopNote}`,
  }
}

// ---------------------------------------------------------------------------
// Layer 3: Component Evals
// ---------------------------------------------------------------------------

/**
 * Component eval: share of the agent's logged MCP calls that succeeded.
 *
 * @param agentName - Matched against `mcp_call_log.agent_name`.
 * @param hours - Size of the look-back window in hours (default 24).
 * @param workspaceId - Matched against `mcp_call_log.workspace_id`.
 * @returns A `component`-layer result. The score is the success ratio rounded
 *   to two decimals; the check passes when the unrounded ratio is at least
 *   0.8. A window with no calls scores 1.0.
 */
export function evalToolReliability(
  agentName: string,
  hours: number = 24,
  workspaceId: number = 1,
): EvalResult {
  const db = getDatabase()
  // Window start in Unix seconds; the query compares `created_at` against it.
  const windowStart = Math.floor(Date.now() / 1000) - hours * 3600

  const statement = db.prepare(`
    SELECT
      COUNT(*) as total,
      SUM(CASE WHEN success = 1 THEN 1 ELSE 0 END) as successes
    FROM mcp_call_log
    WHERE agent_name = ? AND workspace_id = ? AND created_at > ?
  `)
  const tally = statement.get(agentName, workspaceId, windowStart) as
    | { total: number | null; successes: number | null }
    | undefined

  // The SUM column may come back null (and the row itself may be missing); count those as zero.
  const callCount = tally?.total ?? 0
  const okCount = tally?.successes ?? 0

  let reliability = 1.0
  if (callCount > 0) {
    reliability = okCount / callCount
  }
  const percentLabel = (reliability * 100).toFixed(0)

  return {
    layer: 'component',
    score: Math.round(reliability * 100) / 100,
    passed: reliability >= 0.8,
    detail: `Tool reliability: ${okCount}/${callCount} successful (${percentLabel}%)`,
  }
}

// ---------------------------------------------------------------------------
// Layer 4: Drift Detection
// ---------------------------------------------------------------------------

/** Default relative change (10%) beyond which a metric counts as drifted. */
const DRIFT_THRESHOLD = 0.10

/**
 * Compares a current value with a baseline value and reports relative drift.
 *
 * @param current - Value for the current window.
 * @param baseline - Value for the baseline window.
 * @param threshold - Relative change that must be exceeded to flag drift
 *   (defaults to `DRIFT_THRESHOLD`).
 * @returns A drift result with an empty `metric` for the caller to fill in.
 *   `delta` is |current - baseline| / |baseline| rounded to four decimals;
 *   with a zero baseline it is 1.0 for a non-zero current value and 0.0
 *   otherwise. `drifted` compares the unrounded delta against `threshold`.
 */
export function checkDrift(
  current: number,
  baseline: number,
  threshold: number = DRIFT_THRESHOLD,
): DriftResult {
  let relativeChange: number
  if (baseline !== 0) {
    relativeChange = Math.abs(current - baseline) / Math.abs(baseline)
  } else if (current !== 0) {
    // No meaningful ratio against a zero baseline: any non-zero value counts as a full change.
    relativeChange = 1.0
  } else {
    relativeChange = 0.0
  }

  const roundedChange = Math.round(relativeChange * 10000) / 10000

  return {
    metric: '',
    current,
    baseline,
    delta: roundedChange,
    drifted: relativeChange > threshold,
    threshold,
  }
}

/**
 * Drift check: compares three metrics for the last 7 days against a baseline
 * window and runs each pair through `checkDrift` with its default threshold.
 *
 * Metrics, in the order returned:
 *  - `avg_tokens_per_session`: average of `input_tokens + output_tokens` over
 *    `token_usage` rows (0 when a window has no rows);
 *  - `tool_success_rate`: share of `mcp_call_log` rows with `success = 1`
 *    (1.0 when a window has no rows);
 *  - `task_completion_rate`: share of `tasks` rows with status `done`
 *    (1.0 when a window has no rows).
 *
 * @param agentName - Agent whose rows are examined.
 * @param workspaceId - Applied to the `mcp_call_log` and `tasks` queries; the
 *   `token_usage` queries filter by agent only.
 * @returns One drift result per metric, with `metric` filled in.
 */
export function runDriftCheck(
  agentName: string,
  workspaceId: number = 1,
): DriftResult[] {
  type AvgRow = { avg_tokens: number | null } | undefined
  type ToolRow = { total: number; successes: number } | undefined
  type TaskRow = { total: number; completed: number } | undefined

  const db = getDatabase()
  const weekSeconds = 7 * 86400
  const now = Math.floor(Date.now() / 1000)

  // Current window: everything newer than 7 days ago.
  const currentStart = now - weekSeconds
  // Baseline window: starts four weeks ago and ends where the current window
  // begins, so it spans three weeks.
  // NOTE(review): confirm a three-week baseline (rather than a full four) is intended.
  const baselineStart = now - 4 * weekSeconds
  const baselineEnd = currentStart

  // Runs the comparison and labels the result with its metric name.
  const labelled = (metric: string, current: number, baseline: number): DriftResult => {
    const result = checkDrift(current, baseline)
    result.metric = metric
    return result
  }

  // --- average tokens ------------------------------------------------------
  const tokensNow = db.prepare(`
    SELECT AVG(input_tokens + output_tokens) as avg_tokens
    FROM token_usage
    WHERE agent_name = ? AND created_at > ?
  `).get(agentName, currentStart) as AvgRow

  const tokensBefore = db.prepare(`
    SELECT AVG(input_tokens + output_tokens) as avg_tokens
    FROM token_usage
    WHERE agent_name = ? AND created_at > ? AND created_at <= ?
  `).get(agentName, baselineStart, baselineEnd) as AvgRow

  const tokenDrift = labelled(
    'avg_tokens_per_session',
    tokensNow?.avg_tokens ?? 0,
    tokensBefore?.avg_tokens ?? 0,
  )

  // --- tool success rate ---------------------------------------------------
  const toolsNow = db.prepare(`
    SELECT
      COUNT(*) as total,
      SUM(CASE WHEN success = 1 THEN 1 ELSE 0 END) as successes
    FROM mcp_call_log
    WHERE agent_name = ? AND workspace_id = ? AND created_at > ?
  `).get(agentName, workspaceId, currentStart) as ToolRow

  const toolsBefore = db.prepare(`
    SELECT
      COUNT(*) as total,
      SUM(CASE WHEN success = 1 THEN 1 ELSE 0 END) as successes
    FROM mcp_call_log
    WHERE agent_name = ? AND workspace_id = ? AND created_at > ? AND created_at <= ?
  `).get(agentName, workspaceId, baselineStart, baselineEnd) as ToolRow

  // A window without any calls is treated as fully successful.
  const successRateOf = (row: ToolRow): number =>
    row != null && row.total > 0 ? row.successes / row.total : 1.0

  const toolDrift = labelled(
    'tool_success_rate',
    successRateOf(toolsNow),
    successRateOf(toolsBefore),
  )

  // --- task completion rate ------------------------------------------------
  const tasksNow = db.prepare(`
    SELECT
      COUNT(*) as total,
      SUM(CASE WHEN status = 'done' THEN 1 ELSE 0 END) as completed
    FROM tasks
    WHERE assigned_to = ? AND workspace_id = ? AND created_at > ?
  `).get(agentName, workspaceId, currentStart) as TaskRow

  const tasksBefore = db.prepare(`
    SELECT
      COUNT(*) as total,
      SUM(CASE WHEN status = 'done' THEN 1 ELSE 0 END) as completed
    FROM tasks
    WHERE assigned_to = ? AND workspace_id = ? AND created_at > ? AND created_at <= ?
  `).get(agentName, workspaceId, baselineStart, baselineEnd) as TaskRow

  // A window without any tasks is treated as fully completed.
  const completionRateOf = (row: TaskRow): number =>
    row != null && row.total > 0 ? row.completed / row.total : 1.0

  const taskDrift = labelled(
    'task_completion_rate',
    completionRateOf(tasksNow),
    completionRateOf(tasksBefore),
  )

  return [tokenDrift, toolDrift, taskDrift]
}

/**
 * Builds a week-by-week history of the drift metrics, oldest week first.
 *
 * Each entry covers the 7-day span `(weekStart, weekStart + 7 days]` counted
 * back from the current time, and holds:
 *  - `avgTokens`: average of `input_tokens + output_tokens` over `token_usage`
 *    rows, rounded to an integer (0 when the week has no rows);
 *  - `successRate`: percentage (0-100, two decimals) of `mcp_call_log` rows
 *    with `success = 1` (100 when the week has no rows);
 *  - `completionRate`: percentage (0-100, two decimals) of `tasks` rows with
 *    status `done` (100 when the week has no rows).
 *
 * @param agentName - Agent whose rows are examined.
 * @param weeks - Number of weekly entries to produce (default 8).
 * @param workspaceId - Applied to the `mcp_call_log` and `tasks` queries; the
 *   `token_usage` query filters by agent only.
 * @returns One entry per week; empty when `weeks` is below 1.
 */
export function getDriftTimeline(
  agentName: string,
  weeks: number = 8,
  workspaceId: number = 1,
): Array<{ weekStart: number; avgTokens: number; successRate: number; completionRate: number }> {
  const db = getDatabase()
  const now = Math.floor(Date.now() / 1000)
  const weekSeconds = 7 * 86400
  const timeline: Array<{ weekStart: number; avgTokens: number; successRate: number; completionRate: number }> = []

  // Same condition under which the loop below would not run at all; bail out
  // before preparing any statement.
  if (!(weeks >= 1)) return timeline

  // The three statements are identical for every week, so prepare each once
  // and re-run it with different window bounds.
  const tokenStatement = db.prepare(`
      SELECT AVG(input_tokens + output_tokens) as avg_tokens
      FROM token_usage
      WHERE agent_name = ? AND created_at > ? AND created_at <= ?
    `)

  const toolStatement = db.prepare(`
      SELECT
        COUNT(*) as total,
        SUM(CASE WHEN success = 1 THEN 1 ELSE 0 END) as successes
      FROM mcp_call_log
      WHERE agent_name = ? AND workspace_id = ? AND created_at > ? AND created_at <= ?
    `)

  const taskStatement = db.prepare(`
      SELECT
        COUNT(*) as total,
        SUM(CASE WHEN status = 'done' THEN 1 ELSE 0 END) as completed
      FROM tasks
      WHERE assigned_to = ? AND workspace_id = ? AND created_at > ? AND created_at <= ?
    `)

  // Walk from the oldest week (i = weeks - 1) to the most recent (i = 0).
  for (let i = weeks - 1; i >= 0; i--) {
    const weekStart = now - (i + 1) * weekSeconds
    const weekEnd = now - i * weekSeconds

    const tokens = tokenStatement.get(agentName, weekStart, weekEnd) as
      | { avg_tokens: number | null }
      | undefined
    const tools = toolStatement.get(agentName, workspaceId, weekStart, weekEnd) as
      | { total: number; successes: number }
      | undefined
    const tasks = taskStatement.get(agentName, workspaceId, weekStart, weekEnd) as
      | { total: number; completed: number }
      | undefined

    timeline.push({
      weekStart,
      avgTokens: Math.round(tokens?.avg_tokens ?? 0),
      // Rates are stored as percentages with two decimals; an empty week counts as 100%.
      successRate: tools != null && tools.total > 0
        ? Math.round((tools.successes / tools.total) * 10000) / 100
        : 100,
      completionRate: tasks != null && tasks.total > 0
        ? Math.round((tasks.completed / tasks.total) * 10000) / 100
        : 100,
    })
  }

  return timeline
}