// File: biodsbench-adapter/src/harness/evaluation/batchRunner.test.ts
// Provenance: uploaded by starpacker52 ("Add files using upload-large-folder tool"), commit 57c9ddf, 4.37 kB.
import { describe, expect, test } from 'bun:test'
import { runEvaluationBatch, type SpawnEvaluationWorker } from './batchRunner.js'
describe('runEvaluationBatch', () => {
test('spawns one worker process per task in source batch mode', async () => {
const spawned: Array<{ command: string; args: string[]; timeoutMs?: number }> = []
const spawnWorker: SpawnEvaluationWorker = async request => {
spawned.push({
command: request.command,
args: request.args,
timeoutMs: request.timeoutMs,
})
return { taskId: request.taskId, exitCode: 0 }
}
const result = await runEvaluationBatch({
taskIds: ['task_a', 'task_b'],
tasksDir: 'tasks',
runsDir: 'output/runs',
maxRounds: 2,
maxTurnsPerRound: 9,
timeoutSeconds: 120,
concurrency: 3,
workerTimeoutGraceSeconds: 10,
temperature: 0.2,
thinking: 'adaptive',
timestamp: '20260513_010203',
systemPromptPath: 'config/debug-prompt.md',
verbose: false,
spawnWorker,
})
expect(result.ok).toBe(true)
expect(spawned).toHaveLength(2)
expect(spawned[0].args).toContain('--worker-run')
expect(spawned[0].args).toContain('--task')
expect(spawned[0].args).toContain('task_a')
expect(spawned[0].args).toContain('--max-turns-per-round')
expect(spawned[0].args).toContain('9')
expect(spawned[0].args).toContain('--temperature')
expect(spawned[0].args).toContain('0.2')
expect(spawned[0].args).toContain('--thinking')
expect(spawned[0].args).toContain('adaptive')
expect(spawned[0].timeoutMs).toBe(130000)
expect(spawned[1].args).toContain('task_b')
})
test('runs workers as a fixed-size pipeline', async () => {
const started: string[] = []
const resolvers = new Map<string, (exitCode: number) => void>()
const spawnWorker: SpawnEvaluationWorker = request => {
started.push(request.taskId)
return new Promise(resolve => {
resolvers.set(request.taskId, exitCode =>
resolve({ taskId: request.taskId, exitCode }),
)
})
}
const running = runEvaluationBatch({
taskIds: ['a', 'b', 'c', 'd', 'e'],
tasksDir: 'tasks',
runsDir: 'output/runs',
maxRounds: 1,
timeoutSeconds: 120,
concurrency: 3,
temperature: 1,
thinking: 'disabled',
verbose: false,
spawnWorker,
})
await Promise.resolve()
expect(started).toEqual(['a', 'b', 'c'])
resolvers.get('b')?.(0)
await Promise.resolve()
expect(started).toEqual(['a', 'b', 'c', 'd'])
resolvers.get('a')?.(0)
await Promise.resolve()
expect(started).toEqual(['a', 'b', 'c', 'd', 'e'])
for (const taskId of ['c', 'd', 'e']) {
resolvers.get(taskId)?.(0)
}
const result = await running
expect(result.ok).toBe(true)
expect(result.workers.map(worker => worker.taskId)).toEqual(['a', 'b', 'c', 'd', 'e'])
})
test('continues launching queued workers after a worker fails', async () => {
const started: string[] = []
const spawnWorker: SpawnEvaluationWorker = async request => {
started.push(request.taskId)
return { taskId: request.taskId, exitCode: request.taskId === 'a' ? 1 : 0 }
}
const result = await runEvaluationBatch({
taskIds: ['a', 'b', 'c'],
tasksDir: 'tasks',
runsDir: 'output/runs',
maxRounds: 1,
timeoutSeconds: 120,
concurrency: 1,
temperature: 1,
thinking: 'disabled',
verbose: false,
spawnWorker,
})
expect(started).toEqual(['a', 'b', 'c'])
expect(result.ok).toBe(false)
expect(result.workers.map(worker => worker.exitCode)).toEqual([1, 0, 0])
})
test('passes worker watchdog timeout to spawned workers', async () => {
const timeouts: Array<number | undefined> = []
const spawnWorker: SpawnEvaluationWorker = async request => {
timeouts.push(request.timeoutMs)
return { taskId: request.taskId, exitCode: 0 }
}
await runEvaluationBatch({
taskIds: ['task_a'],
tasksDir: 'tasks',
runsDir: 'output/runs',
maxRounds: 1,
timeoutSeconds: 10,
workerTimeoutGraceSeconds: 7,
concurrency: 3,
temperature: 1,
thinking: 'disabled',
verbose: false,
spawnWorker,
})
expect(timeouts).toEqual([17000])
})
})