biodsbench-adapter / src /harness /evaluation /sourceTaskLoop.test.ts
starpacker52's picture
Add files using upload-large-folder tool
2c2dc59 verified
Raw
History Blame Contribute Delete
19 kB
import { existsSync } from 'fs'
import { mkdir, mkdtemp, readFile, writeFile } from 'fs/promises'
import { dirname, join } from 'path'
import { tmpdir } from 'os'
import { describe, expect, test } from 'bun:test'
import { runSourceTaskLoop } from './sourceTaskLoop.js'
import type {
JudgeRunner,
SourceAgentSession,
SourceAgentTurnInput,
} from './types.js'
async function makeTask(root: string, taskId: string, withRuntime = false): Promise<void> {
const taskDir = join(root, taskId)
await mkdir(join(taskDir, 'visible_data'), { recursive: true })
await mkdir(join(taskDir, 'evaluation'), { recursive: true })
await writeFile(join(taskDir, 'README.md'), '# Demo\n', 'utf8')
await writeFile(join(taskDir, 'visible_data', 'cases.json'), '[]', 'utf8')
await writeFile(join(taskDir, 'evaluation', 'judge.py'), '', 'utf8')
if (withRuntime) {
const pythonRel =
process.platform === 'win32'
? 'envs/runtime/.venv/Scripts/python.exe'
: 'envs/runtime/.venv-posix/bin/python'
const pythonAbs = join(taskDir, ...pythonRel.split('/'))
await mkdir(dirname(pythonAbs), { recursive: true })
await writeFile(pythonAbs, '', 'utf8')
await mkdir(join(taskDir, 'envs'), { recursive: true })
await writeFile(
join(taskDir, 'envs', 'env_manifest.json'),
JSON.stringify({
default_env: 'runtime',
envs: {
runtime: {
python: {
[process.platform === 'win32' ? 'windows' : 'posix']: pythonRel,
},
},
},
}),
'utf8',
)
}
await writeFile(
join(taskDir, 'task_manifest.json'),
JSON.stringify({
version: 1,
task_id: taskId,
public_bundle: withRuntime
? ['README.md', 'visible_data/', 'envs/']
: ['README.md', 'visible_data/'],
private_judge_bundle: ['evaluation/'],
entrypoints: withRuntime ? { environment: 'envs/env_manifest.json' } : {},
submission: { output_dir: 'outputs' },
}),
'utf8',
)
}
describe('runSourceTaskLoop', () => {
test('interrupts and closes agent event generator when agent inference times out', async () => {
const root = await mkdtemp(join(tmpdir(), 'source-loop-timeout-close-'))
const tasksDir = join(root, 'tasks')
const runsDir = join(root, 'runs')
await makeTask(tasksDir, 'timeout_close_task', true)
let generatorClosed = false
let interrupted = false
let disposed = false
let releaseGenerator!: () => void
async function* hangingSubmit() {
try {
await new Promise<void>(resolve => {
releaseGenerator = resolve
})
} finally {
generatorClosed = true
}
}
const result = await runSourceTaskLoop({
taskId: 'timeout_close_task',
tasksDir,
runsDir,
maxRounds: 1,
timeoutSeconds: 1,
sessionDisposeGraceMs: 50,
sessionFactory: async () => ({
submit: hangingSubmit,
interrupt() {
interrupted = true
releaseGenerator()
},
async dispose() {
disposed = true
},
}),
judge: {
async run() {
throw new Error('judge should not run')
},
},
})
expect(result.status).toBe('timeout')
expect(generatorClosed).toBe(true)
expect(interrupted).toBe(true)
expect(disposed).toBe(true)
})
test('does not hang forever when session dispose never resolves', async () => {
const root = await mkdtemp(join(tmpdir(), 'source-loop-dispose-hang-'))
const tasksDir = join(root, 'tasks')
const runsDir = join(root, 'runs')
await makeTask(tasksDir, 'dispose_hang_task', true)
const result = await runSourceTaskLoop({
taskId: 'dispose_hang_task',
tasksDir,
runsDir,
maxRounds: 1,
timeoutSeconds: 1,
sessionDisposeGraceMs: 50,
sessionFactory: async () => ({
async *submit() {
throw new Error('force dispose path')
},
async dispose() {
await new Promise(() => {})
},
}),
judge: {
async run() {
throw new Error('judge should not run')
},
},
})
expect(result.status).toBe('failed')
const events = await readFile(join(result.run.logsDir, 'run_events.jsonl'), 'utf8')
expect(events).toContain('session_dispose_timeout')
})
test('uses one source agent session across multiple judge feedback turns', async () => {
const root = await mkdtemp(join(tmpdir(), 'source-loop-'))
const tasksDir = join(root, 'tasks')
const runsDir = join(root, 'runs')
await makeTask(tasksDir, 'demo_task', true)
const prompts: string[] = []
const startMaxTurns: Array<number | undefined> = []
const turnMaxTurns: Array<number | undefined> = []
const judgeRuntimePythons: string[] = []
const sessionRuntimePythons: string[] = []
let sessionCreations = 0
let disposed = false
const session: SourceAgentSession = {
async *submit(input: SourceAgentTurnInput) {
prompts.push(input.prompt)
turnMaxTurns.push(input.maxTurnsPerRound)
sessionRuntimePythons.push(input.runtime.python)
yield { type: 'assistant_text', text: `turn ${prompts.length}` }
yield {
type: 'finalize',
summary: 'ready',
files: ['outputs/case_000.npz'],
}
},
async dispose() {
disposed = true
},
}
const judge: JudgeRunner = {
async run(input) {
judgeRuntimePythons.push(input.runtime.python)
return prompts.length < 3
? {
status: 'fail',
reward: 0,
feedback: `missing final detail ${prompts.length}`,
raw: { status: 'fail' },
}
: {
status: 'pass',
reward: 1,
feedback: 'ok',
raw: { status: 'pass' },
}
},
}
const result = await runSourceTaskLoop({
taskId: 'demo_task',
tasksDir,
runsDir,
maxRounds: 3,
maxTurnsPerRound: 7,
timeoutSeconds: 30,
sessionFactory: async input => {
sessionCreations++
startMaxTurns.push(input.maxTurnsPerRound)
return session
},
judge,
})
expect(result.status).toBe('success')
expect(result.rounds).toBe(3)
expect(sessionCreations).toBe(1)
expect(startMaxTurns).toEqual([7])
expect(turnMaxTurns).toEqual([7, 7, 7])
expect(prompts).toHaveLength(3)
expect(prompts[0]).toContain('round_plan_file: workspace/plans/round_01.md')
expect(prompts[0]).toContain('# Demo')
expect(prompts[1]).toContain('<judge_feedback>')
expect(prompts[1]).toContain('message: missing final detail 1')
expect(prompts[1]).toContain('workspace/plans/round_02.md')
expect(prompts[2]).toContain('message: missing final detail 2')
expect(prompts[2]).toContain('workspace/plans/round_03.md')
expect(new Set(sessionRuntimePythons).size).toBe(1)
expect(judgeRuntimePythons).toEqual(sessionRuntimePythons)
expect(disposed).toBe(true)
expect(existsSync(join(result.run.logsDir, 'trajectory.clean.jsonl'))).toBe(true)
const clean = await readFile(join(result.run.logsDir, 'trajectory.clean.jsonl'), 'utf8')
expect(clean).toContain('"kind":"judge_result"')
expect(clean).not.toContain('result_path')
expect(clean).not.toContain('.judge_private')
expect(clean).not.toContain('"system_prompt"')
const raw = await readFile(join(result.run.logsDir, 'trajectory.raw.jsonl'), 'utf8')
expect(raw).toContain('"kind":"judge_result_raw"')
})
test('returns infra_error before creating a session when runtime is missing', async () => {
const root = await mkdtemp(join(tmpdir(), 'source-loop-infra-'))
const tasksDir = join(root, 'tasks')
const runsDir = join(root, 'runs')
await makeTask(tasksDir, 'broken_runtime', false)
await mkdir(join(tasksDir, 'broken_runtime', 'envs'), { recursive: true })
await writeFile(
join(tasksDir, 'broken_runtime', 'envs', 'env_manifest.json'),
JSON.stringify({
default_env: 'runtime',
envs: {
runtime: {
python: {
windows: 'envs/runtime/.venv/Scripts/python.exe',
posix: 'envs/runtime/.venv/bin/python',
},
},
},
}),
'utf8',
)
const manifestPath = join(tasksDir, 'broken_runtime', 'task_manifest.json')
const manifest = JSON.parse(await readFile(manifestPath, 'utf8'))
manifest.public_bundle.push('envs/')
manifest.entrypoints = { environment: 'envs/env_manifest.json' }
await writeFile(manifestPath, JSON.stringify(manifest), 'utf8')
let sessionCreations = 0
const result = await runSourceTaskLoop({
taskId: 'broken_runtime',
tasksDir,
runsDir,
maxRounds: 1,
timeoutSeconds: 30,
sessionFactory: async () => {
sessionCreations++
throw new Error('should not create session')
},
judge: {
async run() {
throw new Error('judge should not run')
},
},
})
expect(result.status).toBe('infra_error')
expect(sessionCreations).toBe(0)
expect(result.lastJudgeResult).toBeUndefined()
})
test('does not impose a per-round turn cap unless explicitly requested', async () => {
const root = await mkdtemp(join(tmpdir(), 'source-loop-unlimited-'))
const tasksDir = join(root, 'tasks')
const runsDir = join(root, 'runs')
await makeTask(tasksDir, 'unlimited_task', true)
const startMaxTurns: Array<number | undefined> = []
const turnMaxTurns: Array<number | undefined> = []
const result = await runSourceTaskLoop({
taskId: 'unlimited_task',
tasksDir,
runsDir,
maxRounds: 1,
timeoutSeconds: 30,
sessionFactory: async input => {
startMaxTurns.push(input.maxTurnsPerRound)
return {
async *submit(turnInput: SourceAgentTurnInput) {
turnMaxTurns.push(turnInput.maxTurnsPerRound)
yield { type: 'finalize', summary: 'ready', files: [] }
},
}
},
judge: {
async run() {
return {
status: 'pass',
reward: 1,
feedback: 'ok',
raw: { status: 'pass' },
}
},
},
})
expect(result.status).toBe('success')
expect(startMaxTurns).toEqual([undefined])
expect(turnMaxTurns).toEqual([undefined])
})
test('requests same-session recovery when an agent turn ends without finalize', async () => {
const root = await mkdtemp(join(tmpdir(), 'source-loop-recovery-'))
const tasksDir = join(root, 'tasks')
const runsDir = join(root, 'runs')
await makeTask(tasksDir, 'recovery_task', true)
const prompts: string[] = []
let sessionCreations = 0
let judgeCalls = 0
const result = await runSourceTaskLoop({
taskId: 'recovery_task',
tasksDir,
runsDir,
maxRounds: 1,
timeoutSeconds: 30,
sessionFactory: async () => {
sessionCreations++
return {
async *submit(input: SourceAgentTurnInput) {
prompts.push(input.prompt)
if (prompts.length === 1) {
yield {
type: 'agent_result',
subtype: 'success',
stopReason: 'end_turn',
durationMs: 10,
usage: { input_tokens: 12, output_tokens: 3 },
} as never
return
}
yield { type: 'assistant_text', text: 'Recovering by submitting output.' }
yield {
type: 'finalize',
summary: 'ready after recovery',
files: ['outputs/case_000.npz'],
}
},
}
},
judge: {
async run() {
judgeCalls++
return {
status: 'pass',
reward: 1,
feedback: 'ok',
raw: { status: 'pass' },
}
},
},
})
expect(result.status).toBe('success')
expect(result.rounds).toBe(1)
expect(sessionCreations).toBe(1)
expect(judgeCalls).toBe(1)
expect(prompts).toHaveLength(2)
expect(prompts[1]).toContain('<no_finalize_recovery>')
expect(prompts[1]).toContain('call finalize_submission now')
const events = await readFile(join(result.run.logsDir, 'run_events.jsonl'), 'utf8')
expect(events).toContain('"type":"agent_recovery_started"')
expect(events).toContain('"type":"agent_recovery_finished"')
const clean = await readFile(join(result.run.logsDir, 'trajectory.clean.jsonl'), 'utf8')
expect(clean).toContain('"kind":"agent_result"')
expect(clean).toContain('"stop_reason":"end_turn"')
expect(clean).toContain('"kind":"recovery_started"')
expect(clean).toContain('"kind":"recovery_finished"')
expect(clean).toContain('"finalized":true')
expect(clean).toContain('ready after recovery')
})
test('does not judge or consume a round when validation never passes', async () => {
const root = await mkdtemp(join(tmpdir(), 'source-loop-validation-fail-'))
const tasksDir = join(root, 'tasks')
const runsDir = join(root, 'runs')
await makeTask(tasksDir, 'validation_fail_task', true)
let judgeCalls = 0
const result = await runSourceTaskLoop({
taskId: 'validation_fail_task',
tasksDir,
runsDir,
maxRounds: 1,
timeoutSeconds: 30,
llmOptions: { temperature: 1, thinking: 'disabled' },
sessionFactory: async () => ({
async *submit(input: SourceAgentTurnInput) {
if (input.prompt.includes('<no_finalize_recovery>')) return
yield {
type: 'run_warning',
code: 'missing_round_plan',
message: 'workspace/plans/round_01.md is missing.',
}
yield {
type: 'submission_validation_failed',
result: {
ok: false,
normalizedFiles: [],
issues: [
{
code: 'missing_output_file',
path: 'outputs/case_000.npz',
message: 'outputs/case_000.npz is missing',
},
],
},
}
},
}),
judge: {
async run() {
judgeCalls++
throw new Error('judge should not run')
},
},
})
expect(result.status).toBe('failed')
expect(result.rounds).toBe(0)
expect(judgeCalls).toBe(0)
const summary = JSON.parse(
await readFile(join(result.run.logsDir, 'run_summary.json'), 'utf8'),
)
expect(summary.run_metadata.temperature_configured).toBe(1)
expect(summary.run_metadata.temperature_sent).toBe(1)
expect(summary.validation_attempts).toHaveLength(1)
expect(summary.validation_attempts[0].ok).toBe(false)
expect(summary.warnings).toHaveLength(1)
expect(summary.warnings[0].code).toBe('missing_round_plan')
const events = await readFile(join(result.run.logsDir, 'run_events.jsonl'), 'utf8')
expect(events).toContain('"type":"submission_validation_failed"')
expect(events).toContain('"type":"run_warning"')
const clean = await readFile(join(result.run.logsDir, 'trajectory.clean.jsonl'), 'utf8')
expect(clean).toContain('"kind":"submission_validation_failed"')
expect(clean).toContain('"kind":"trajectory_warning"')
})
test('invalid validation followed by valid finalize consumes one judge round', async () => {
const root = await mkdtemp(join(tmpdir(), 'source-loop-validation-retry-'))
const tasksDir = join(root, 'tasks')
const runsDir = join(root, 'runs')
await makeTask(tasksDir, 'validation_retry_task', true)
let judgeCalls = 0
const result = await runSourceTaskLoop({
taskId: 'validation_retry_task',
tasksDir,
runsDir,
maxRounds: 2,
timeoutSeconds: 30,
sessionFactory: async () => ({
async *submit() {
yield {
type: 'submission_validation_failed',
result: {
ok: false,
normalizedFiles: [],
issues: [
{
code: 'shape_mismatch',
path: 'outputs/case_000.npz',
key: 'reconstruction',
message: 'shape mismatch',
},
],
},
}
yield {
type: 'submission_validation_passed',
result: {
ok: true,
normalizedFiles: ['outputs/case_000.npz'],
issues: [],
},
}
yield {
type: 'finalize',
summary: 'ready after retry',
files: ['outputs/case_000.npz'],
}
},
}),
judge: {
async run() {
judgeCalls++
return {
status: 'pass',
reward: 1,
feedback: 'ok',
raw: { status: 'pass' },
}
},
},
})
expect(result.status).toBe('success')
expect(result.rounds).toBe(1)
expect(judgeCalls).toBe(1)
const summary = JSON.parse(
await readFile(join(result.run.logsDir, 'run_summary.json'), 'utf8'),
)
expect(summary.validation_attempts.map((attempt: { ok: boolean }) => attempt.ok)).toEqual([
false,
true,
])
})
test('stops draining agent events immediately after finalize', async () => {
const root = await mkdtemp(join(tmpdir(), 'source-loop-finalize-terminal-'))
const tasksDir = join(root, 'tasks')
const runsDir = join(root, 'runs')
await makeTask(tasksDir, 'finalize_terminal_task', true)
const result = await runSourceTaskLoop({
taskId: 'finalize_terminal_task',
tasksDir,
runsDir,
maxRounds: 1,
timeoutSeconds: 30,
sessionFactory: async () => ({
async *submit() {
yield {
type: 'submission_validation_passed',
result: {
ok: true,
normalizedFiles: ['outputs/case_000.npz'],
issues: [],
},
}
yield {
type: 'finalize',
summary: 'ready',
files: ['outputs/case_000.npz'],
}
yield {
type: 'assistant_text',
text: 'BUG: this event should not be consumed after finalize.',
}
},
}),
judge: {
async run() {
return {
status: 'pass',
reward: 1,
feedback: 'ok',
raw: { status: 'pass' },
}
},
},
})
expect(result.status).toBe('success')
const clean = await readFile(join(result.run.logsDir, 'trajectory.clean.jsonl'), 'utf8')
expect(clean).toContain('"kind":"finalize"')
expect(clean).not.toContain('BUG: this event should not be consumed')
})
})