// HuggingClaw-MissionControl/src/lib/__tests__/agent-evals.test.ts
// nyk
// feat(refactor): ready for manual QA after main sync (#274)
// b6ecafa unverified
import { describe, it, expect, vi, beforeEach } from 'vitest'
// Stubs for the statement object that `prepare` returns: `get`, `all` and `run`.
// `get` has no default return value; the tests below set it with `mockReturnValue`.
const mockGet = vi.fn()
// `all` returns a fresh empty array on every call.
const mockAll = vi.fn(() => [])
// `run` returns a fixed result object: { lastInsertRowid: 1, changes: 1 }.
const mockRun = vi.fn(() => ({ lastInsertRowid: 1, changes: 1 }))
// Every `prepare(...)` call returns the same trio of stubs, whatever arguments it receives.
const mockPrepare = vi.fn(() => ({ get: mockGet, all: mockAll, run: mockRun }))
// Replace '@/lib/db' with a module whose `getDatabase()` returns an object exposing only `prepare`.
// `mockPrepare` is dereferenced when `getDatabase()` is called, not when the factory itself runs.
vi.mock('@/lib/db', () => ({
getDatabase: () => ({ prepare: mockPrepare }),
}))
import { convergenceScore, checkDrift, evalTaskCompletion, evalCorrectnessScore } from '@/lib/agent-evals'
// Behaviour of convergenceScore(a, b). Going by the test titles, the two
// arguments form a calls-to-unique-tools ratio — TODO confirm the parameter
// names against the implementation.
describe('convergenceScore', () => {
  it('returns score 1.0 when no unique tools', () => {
    const { score, looping } = convergenceScore(0, 0)

    expect(score).toBe(1.0)
    expect(looping).toBe(false)
  })

  it('returns score 1.0 when ratio is 1:1', () => {
    const { score, looping } = convergenceScore(5, 5)

    expect(score).toBe(1.0)
    expect(looping).toBe(false)
  })

  // 15 / 5 sits exactly on the 3:1 boundary and must still count as healthy.
  it('returns score 1.0 when ratio is exactly 3:1', () => {
    const { score, looping } = convergenceScore(15, 5)

    expect(score).toBe(1.0)
    expect(looping).toBe(false)
  })

  // 20 / 5 = 4:1, just past the boundary.
  it('detects looping when ratio exceeds 3:1', () => {
    const { score, looping } = convergenceScore(20, 5)

    expect(looping).toBe(true)
    expect(score).toBeLessThan(1.0)
  })

  it('returns lower score with higher ratio', () => {
    const mildRepeat = convergenceScore(6, 5)
    const heavyRepeat = convergenceScore(30, 5)

    expect(heavyRepeat.score).toBeLessThan(mildRepeat.score)
  })

  // An extreme 1000:1 ratio must not push the score outside [0, 1].
  it('clamps score between 0 and 1', () => {
    const { score } = convergenceScore(1000, 1)

    expect(score).toBeGreaterThanOrEqual(0)
    expect(score).toBeLessThanOrEqual(1)
  })
})
// Behaviour of checkDrift(current, baseline, threshold?). The worked example in
// the last test shows delta is the change relative to the baseline value.
describe('checkDrift', () => {
  it('returns no drift when current equals baseline', () => {
    const { drifted, delta } = checkDrift(0.8, 0.8)

    expect(drifted).toBe(false)
    expect(delta).toBe(0)
  })

  it('detects drift when delta exceeds threshold', () => {
    const { drifted, delta } = checkDrift(0.5, 0.8, 0.10)

    expect(drifted).toBe(true)
    expect(delta).toBeGreaterThan(0.10)
  })

  it('returns no drift when delta is within threshold', () => {
    const { drifted } = checkDrift(0.79, 0.8, 0.10)

    expect(drifted).toBe(false)
  })

  // A zero baseline with a non-zero current value is expected to report delta 1.0.
  it('handles zero baseline correctly', () => {
    const { drifted, delta } = checkDrift(0.5, 0)

    expect(drifted).toBe(true)
    expect(delta).toBe(1.0)
  })

  it('handles both zero correctly', () => {
    const { drifted, delta } = checkDrift(0, 0)

    expect(drifted).toBe(false)
    expect(delta).toBe(0)
  })

  it('uses default threshold of 0.10', () => {
    // delta = |0.95 - 0.8| / 0.8 = 0.1875, which is above the default threshold
    const { drifted, threshold } = checkDrift(0.95, 0.8)

    expect(drifted).toBe(true)
    expect(threshold).toBe(0.10)
  })
})
// Behaviour of evalTaskCompletion(agent, 168, 1) against the mocked database.
// The meaning of the numeric arguments is not visible from this file — TODO confirm.
describe('evalTaskCompletion', () => {
  // Sets the row that the mocked statement's `get` will return.
  const stubTaskRow = (total: number, completed: number, successful: number) => {
    mockGet.mockReturnValue({ total, completed, successful })
  }

  beforeEach(function resetMocks() {
    vi.clearAllMocks()
  })

  it('returns score based on completed/total ratio', () => {
    stubTaskRow(10, 7, 5)

    const outcome = evalTaskCompletion('test-agent', 168, 1)

    expect(outcome.layer).toBe('output')
    expect(outcome.score).toBe(0.7)
    expect(outcome.passed).toBe(true)
  })

  // With zero tasks the agent is expected to get a perfect, passing score.
  it('returns score 1.0 when no tasks exist', () => {
    stubTaskRow(0, 0, 0)

    const outcome = evalTaskCompletion('new-agent', 168, 1)

    expect(outcome.score).toBe(1.0)
    expect(outcome.passed).toBe(true)
  })

  it('fails when completion rate is below 70%', () => {
    stubTaskRow(10, 5, 3)

    const outcome = evalTaskCompletion('slow-agent', 168, 1)

    expect(outcome.score).toBe(0.5)
    expect(outcome.passed).toBe(false)
  })
})
// Behaviour of evalCorrectnessScore(agent, 168, 1) against the mocked database.
// The meaning of the numeric arguments is not visible from this file — TODO confirm.
describe('evalCorrectnessScore', () => {
  beforeEach(() => {
    vi.clearAllMocks()
  })

  // Without feedback (avg_rating null) the score is expected to be successful / total.
  it('returns success rate when no feedback ratings', () => {
    mockGet.mockReturnValue({ total: 10, successful: 8, avg_rating: null })

    const result = evalCorrectnessScore('test-agent', 168, 1)

    expect(result.layer).toBe('output')
    expect(result.score).toBe(0.8)
    expect(result.passed).toBe(true)
  })

  it('blends success rate with feedback rating', () => {
    mockGet.mockReturnValue({ total: 10, successful: 10, avg_rating: 4.0 })

    const result = evalCorrectnessScore('rated-agent', 168, 1)

    // score = 1.0 * 0.6 + ((4-1)/4) * 0.4 = 0.6 + 0.3 = 0.9
    // The blend is a weighted sum of floats, so compare with a tight tolerance
    // rather than exact identity; a different but equivalent evaluation order
    // in the implementation could otherwise be off by one ULP.
    expect(result.score).toBeCloseTo(0.9, 10)
    expect(result.passed).toBe(true)
  })

  it('fails when score is below 0.6', () => {
    mockGet.mockReturnValue({ total: 10, successful: 3, avg_rating: null })

    const result = evalCorrectnessScore('bad-agent', 168, 1)

    expect(result.score).toBe(0.3)
    expect(result.passed).toBe(false)
  })
})