Spaces:
Sleeping
Sleeping
| import { describe, it, expect, vi, beforeEach } from 'vitest' | |
| const mockGet = vi.fn() | |
| const mockAll = vi.fn(() => []) | |
| const mockRun = vi.fn(() => ({ lastInsertRowid: 1, changes: 1 })) | |
| const mockPrepare = vi.fn(() => ({ get: mockGet, all: mockAll, run: mockRun })) | |
| vi.mock('@/lib/db', () => ({ | |
| getDatabase: () => ({ prepare: mockPrepare }), | |
| })) | |
| import { convergenceScore, checkDrift, evalTaskCompletion, evalCorrectnessScore } from '@/lib/agent-evals' | |
| describe('convergenceScore', () => { | |
| it('returns score 1.0 when no unique tools', () => { | |
| const result = convergenceScore(0, 0) | |
| expect(result.score).toBe(1.0) | |
| expect(result.looping).toBe(false) | |
| }) | |
| it('returns score 1.0 when ratio is 1:1', () => { | |
| const result = convergenceScore(5, 5) | |
| expect(result.score).toBe(1.0) | |
| expect(result.looping).toBe(false) | |
| }) | |
| it('returns score 1.0 when ratio is exactly 3:1', () => { | |
| const result = convergenceScore(15, 5) | |
| expect(result.score).toBe(1.0) | |
| expect(result.looping).toBe(false) | |
| }) | |
| it('detects looping when ratio exceeds 3:1', () => { | |
| const result = convergenceScore(20, 5) | |
| expect(result.looping).toBe(true) | |
| expect(result.score).toBeLessThan(1.0) | |
| }) | |
| it('returns lower score with higher ratio', () => { | |
| const low = convergenceScore(6, 5) | |
| const high = convergenceScore(30, 5) | |
| expect(high.score).toBeLessThan(low.score) | |
| }) | |
| it('clamps score between 0 and 1', () => { | |
| const result = convergenceScore(1000, 1) | |
| expect(result.score).toBeGreaterThanOrEqual(0) | |
| expect(result.score).toBeLessThanOrEqual(1) | |
| }) | |
| }) | |
| describe('checkDrift', () => { | |
| it('returns no drift when current equals baseline', () => { | |
| const result = checkDrift(0.8, 0.8) | |
| expect(result.drifted).toBe(false) | |
| expect(result.delta).toBe(0) | |
| }) | |
| it('detects drift when delta exceeds threshold', () => { | |
| const result = checkDrift(0.5, 0.8, 0.10) | |
| expect(result.drifted).toBe(true) | |
| expect(result.delta).toBeGreaterThan(0.10) | |
| }) | |
| it('returns no drift when delta is within threshold', () => { | |
| const result = checkDrift(0.79, 0.8, 0.10) | |
| expect(result.drifted).toBe(false) | |
| }) | |
| it('handles zero baseline correctly', () => { | |
| const result = checkDrift(0.5, 0) | |
| expect(result.drifted).toBe(true) | |
| expect(result.delta).toBe(1.0) | |
| }) | |
| it('handles both zero correctly', () => { | |
| const result = checkDrift(0, 0) | |
| expect(result.drifted).toBe(false) | |
| expect(result.delta).toBe(0) | |
| }) | |
| it('uses default threshold of 0.10', () => { | |
| const result = checkDrift(0.95, 0.8) | |
| // delta = |0.95 - 0.8| / 0.8 = 0.1875 | |
| expect(result.drifted).toBe(true) | |
| expect(result.threshold).toBe(0.10) | |
| }) | |
| }) | |
| describe('evalTaskCompletion', () => { | |
| beforeEach(() => { | |
| vi.clearAllMocks() | |
| }) | |
| it('returns score based on completed/total ratio', () => { | |
| mockGet.mockReturnValue({ total: 10, completed: 7, successful: 5 }) | |
| const result = evalTaskCompletion('test-agent', 168, 1) | |
| expect(result.layer).toBe('output') | |
| expect(result.score).toBe(0.7) | |
| expect(result.passed).toBe(true) | |
| }) | |
| it('returns score 1.0 when no tasks exist', () => { | |
| mockGet.mockReturnValue({ total: 0, completed: 0, successful: 0 }) | |
| const result = evalTaskCompletion('new-agent', 168, 1) | |
| expect(result.score).toBe(1.0) | |
| expect(result.passed).toBe(true) | |
| }) | |
| it('fails when completion rate is below 70%', () => { | |
| mockGet.mockReturnValue({ total: 10, completed: 5, successful: 3 }) | |
| const result = evalTaskCompletion('slow-agent', 168, 1) | |
| expect(result.score).toBe(0.5) | |
| expect(result.passed).toBe(false) | |
| }) | |
| }) | |
| describe('evalCorrectnessScore', () => { | |
| beforeEach(() => { | |
| vi.clearAllMocks() | |
| }) | |
| it('returns success rate when no feedback ratings', () => { | |
| mockGet.mockReturnValue({ total: 10, successful: 8, avg_rating: null }) | |
| const result = evalCorrectnessScore('test-agent', 168, 1) | |
| expect(result.layer).toBe('output') | |
| expect(result.score).toBe(0.8) | |
| expect(result.passed).toBe(true) | |
| }) | |
| it('blends success rate with feedback rating', () => { | |
| mockGet.mockReturnValue({ total: 10, successful: 10, avg_rating: 4.0 }) | |
| const result = evalCorrectnessScore('rated-agent', 168, 1) | |
| // score = 1.0 * 0.6 + ((4-1)/4) * 0.4 = 0.6 + 0.3 = 0.9 | |
| expect(result.score).toBe(0.9) | |
| expect(result.passed).toBe(true) | |
| }) | |
| it('fails when score is below 0.6', () => { | |
| mockGet.mockReturnValue({ total: 10, successful: 3, avg_rating: null }) | |
| const result = evalCorrectnessScore('bad-agent', 168, 1) | |
| expect(result.score).toBe(0.3) | |
| expect(result.passed).toBe(false) | |
| }) | |
| }) | |