biodsbench-adapter / src /harness /evaluation /sourceTaskLoop.test.ts

Add files using upload-large-folder tool

2c2dc59 verified 17 days ago

19 kB

	import { existsSync } from 'fs'
	import { mkdir, mkdtemp, readFile, writeFile } from 'fs/promises'
	import { dirname, join } from 'path'
	import { tmpdir } from 'os'
	import { describe, expect, test } from 'bun:test'
	import { runSourceTaskLoop } from './sourceTaskLoop.js'
	import type {
	JudgeRunner,
	SourceAgentSession,
	SourceAgentTurnInput,
	} from './types.js'

	async function makeTask(root: string, taskId: string, withRuntime = false): Promise<void> {
	const taskDir = join(root, taskId)
	await mkdir(join(taskDir, 'visible_data'), { recursive: true })
	await mkdir(join(taskDir, 'evaluation'), { recursive: true })
	await writeFile(join(taskDir, 'README.md'), '# Demo\n', 'utf8')
	await writeFile(join(taskDir, 'visible_data', 'cases.json'), '[]', 'utf8')
	await writeFile(join(taskDir, 'evaluation', 'judge.py'), '', 'utf8')
	if (withRuntime) {
	const pythonRel =
	process.platform === 'win32'
	? 'envs/runtime/.venv/Scripts/python.exe'
	: 'envs/runtime/.venv-posix/bin/python'
	const pythonAbs = join(taskDir, ...pythonRel.split('/'))
	await mkdir(dirname(pythonAbs), { recursive: true })
	await writeFile(pythonAbs, '', 'utf8')
	await mkdir(join(taskDir, 'envs'), { recursive: true })
	await writeFile(
	join(taskDir, 'envs', 'env_manifest.json'),
	JSON.stringify({
	default_env: 'runtime',
	envs: {
	runtime: {
	python: {
	[process.platform === 'win32' ? 'windows' : 'posix']: pythonRel,
	},
	},
	},
	}),
	'utf8',
	)
	}
	await writeFile(
	join(taskDir, 'task_manifest.json'),
	JSON.stringify({
	version: 1,
	task_id: taskId,
	public_bundle: withRuntime
	? ['README.md', 'visible_data/', 'envs/']
	: ['README.md', 'visible_data/'],
	private_judge_bundle: ['evaluation/'],
	entrypoints: withRuntime ? { environment: 'envs/env_manifest.json' } : {},
	submission: { output_dir: 'outputs' },
	}),
	'utf8',
	)
	}

	describe('runSourceTaskLoop', () => {
	test('interrupts and closes agent event generator when agent inference times out', async () => {
	const root = await mkdtemp(join(tmpdir(), 'source-loop-timeout-close-'))
	const tasksDir = join(root, 'tasks')
	const runsDir = join(root, 'runs')
	await makeTask(tasksDir, 'timeout_close_task', true)
	let generatorClosed = false
	let interrupted = false
	let disposed = false
	let releaseGenerator!: () => void

	async function* hangingSubmit() {
	try {
	await new Promise<void>(resolve => {
	releaseGenerator = resolve
	})
	} finally {
	generatorClosed = true
	}
	}

	const result = await runSourceTaskLoop({
	taskId: 'timeout_close_task',
	tasksDir,
	runsDir,
	maxRounds: 1,
	timeoutSeconds: 1,
	sessionDisposeGraceMs: 50,
	sessionFactory: async () => ({
	submit: hangingSubmit,
	interrupt() {
	interrupted = true
	releaseGenerator()
	},
	async dispose() {
	disposed = true
	},
	}),
	judge: {
	async run() {
	throw new Error('judge should not run')
	},
	},
	})

	expect(result.status).toBe('timeout')
	expect(generatorClosed).toBe(true)
	expect(interrupted).toBe(true)
	expect(disposed).toBe(true)
	})

	test('does not hang forever when session dispose never resolves', async () => {
	const root = await mkdtemp(join(tmpdir(), 'source-loop-dispose-hang-'))
	const tasksDir = join(root, 'tasks')
	const runsDir = join(root, 'runs')
	await makeTask(tasksDir, 'dispose_hang_task', true)

	const result = await runSourceTaskLoop({
	taskId: 'dispose_hang_task',
	tasksDir,
	runsDir,
	maxRounds: 1,
	timeoutSeconds: 1,
	sessionDisposeGraceMs: 50,
	sessionFactory: async () => ({
	async *submit() {
	throw new Error('force dispose path')
	},
	async dispose() {
	await new Promise(() => {})
	},
	}),
	judge: {
	async run() {
	throw new Error('judge should not run')
	},
	},
	})

	expect(result.status).toBe('failed')
	const events = await readFile(join(result.run.logsDir, 'run_events.jsonl'), 'utf8')
	expect(events).toContain('session_dispose_timeout')
	})

	test('uses one source agent session across multiple judge feedback turns', async () => {
	const root = await mkdtemp(join(tmpdir(), 'source-loop-'))
	const tasksDir = join(root, 'tasks')
	const runsDir = join(root, 'runs')
	await makeTask(tasksDir, 'demo_task', true)
	const prompts: string[] = []
	const startMaxTurns: Array<number \| undefined> = []
	const turnMaxTurns: Array<number \| undefined> = []
	const judgeRuntimePythons: string[] = []
	const sessionRuntimePythons: string[] = []
	let sessionCreations = 0
	let disposed = false
	const session: SourceAgentSession = {
	async *submit(input: SourceAgentTurnInput) {
	prompts.push(input.prompt)
	turnMaxTurns.push(input.maxTurnsPerRound)
	sessionRuntimePythons.push(input.runtime.python)
	yield { type: 'assistant_text', text: `turn ${prompts.length}` }
	yield {
	type: 'finalize',
	summary: 'ready',
	files: ['outputs/case_000.npz'],
	}
	},
	async dispose() {
	disposed = true
	},
	}
	const judge: JudgeRunner = {
	async run(input) {
	judgeRuntimePythons.push(input.runtime.python)
	return prompts.length < 3
	? {
	status: 'fail',
	reward: 0,
	feedback: `missing final detail ${prompts.length}`,
	raw: { status: 'fail' },
	}
	: {
	status: 'pass',
	reward: 1,
	feedback: 'ok',
	raw: { status: 'pass' },
	}
	},
	}

	const result = await runSourceTaskLoop({
	taskId: 'demo_task',
	tasksDir,
	runsDir,
	maxRounds: 3,
	maxTurnsPerRound: 7,
	timeoutSeconds: 30,
	sessionFactory: async input => {
	sessionCreations++
	startMaxTurns.push(input.maxTurnsPerRound)
	return session
	},
	judge,
	})

	expect(result.status).toBe('success')
	expect(result.rounds).toBe(3)
	expect(sessionCreations).toBe(1)
	expect(startMaxTurns).toEqual([7])
	expect(turnMaxTurns).toEqual([7, 7, 7])
	expect(prompts).toHaveLength(3)
	expect(prompts[0]).toContain('round_plan_file: workspace/plans/round_01.md')
	expect(prompts[0]).toContain('# Demo')
	expect(prompts[1]).toContain('<judge_feedback>')
	expect(prompts[1]).toContain('message: missing final detail 1')
	expect(prompts[1]).toContain('workspace/plans/round_02.md')
	expect(prompts[2]).toContain('message: missing final detail 2')
	expect(prompts[2]).toContain('workspace/plans/round_03.md')
	expect(new Set(sessionRuntimePythons).size).toBe(1)
	expect(judgeRuntimePythons).toEqual(sessionRuntimePythons)
	expect(disposed).toBe(true)
	expect(existsSync(join(result.run.logsDir, 'trajectory.clean.jsonl'))).toBe(true)
	const clean = await readFile(join(result.run.logsDir, 'trajectory.clean.jsonl'), 'utf8')
	expect(clean).toContain('"kind":"judge_result"')
	expect(clean).not.toContain('result_path')
	expect(clean).not.toContain('.judge_private')
	expect(clean).not.toContain('"system_prompt"')
	const raw = await readFile(join(result.run.logsDir, 'trajectory.raw.jsonl'), 'utf8')
	expect(raw).toContain('"kind":"judge_result_raw"')
	})

	test('returns infra_error before creating a session when runtime is missing', async () => {
	const root = await mkdtemp(join(tmpdir(), 'source-loop-infra-'))
	const tasksDir = join(root, 'tasks')
	const runsDir = join(root, 'runs')
	await makeTask(tasksDir, 'broken_runtime', false)
	await mkdir(join(tasksDir, 'broken_runtime', 'envs'), { recursive: true })
	await writeFile(
	join(tasksDir, 'broken_runtime', 'envs', 'env_manifest.json'),
	JSON.stringify({
	default_env: 'runtime',
	envs: {
	runtime: {
	python: {
	windows: 'envs/runtime/.venv/Scripts/python.exe',
	posix: 'envs/runtime/.venv/bin/python',
	},
	},
	},
	}),
	'utf8',
	)
	const manifestPath = join(tasksDir, 'broken_runtime', 'task_manifest.json')
	const manifest = JSON.parse(await readFile(manifestPath, 'utf8'))
	manifest.public_bundle.push('envs/')
	manifest.entrypoints = { environment: 'envs/env_manifest.json' }
	await writeFile(manifestPath, JSON.stringify(manifest), 'utf8')
	let sessionCreations = 0

	const result = await runSourceTaskLoop({
	taskId: 'broken_runtime',
	tasksDir,
	runsDir,
	maxRounds: 1,
	timeoutSeconds: 30,
	sessionFactory: async () => {
	sessionCreations++
	throw new Error('should not create session')
	},
	judge: {
	async run() {
	throw new Error('judge should not run')
	},
	},
	})

	expect(result.status).toBe('infra_error')
	expect(sessionCreations).toBe(0)
	expect(result.lastJudgeResult).toBeUndefined()
	})

	test('does not impose a per-round turn cap unless explicitly requested', async () => {
	const root = await mkdtemp(join(tmpdir(), 'source-loop-unlimited-'))
	const tasksDir = join(root, 'tasks')
	const runsDir = join(root, 'runs')
	await makeTask(tasksDir, 'unlimited_task', true)
	const startMaxTurns: Array<number \| undefined> = []
	const turnMaxTurns: Array<number \| undefined> = []

	const result = await runSourceTaskLoop({
	taskId: 'unlimited_task',
	tasksDir,
	runsDir,
	maxRounds: 1,
	timeoutSeconds: 30,
	sessionFactory: async input => {
	startMaxTurns.push(input.maxTurnsPerRound)
	return {
	async *submit(turnInput: SourceAgentTurnInput) {
	turnMaxTurns.push(turnInput.maxTurnsPerRound)
	yield { type: 'finalize', summary: 'ready', files: [] }
	},
	}
	},
	judge: {
	async run() {
	return {
	status: 'pass',
	reward: 1,
	feedback: 'ok',
	raw: { status: 'pass' },
	}
	},
	},
	})

	expect(result.status).toBe('success')
	expect(startMaxTurns).toEqual([undefined])
	expect(turnMaxTurns).toEqual([undefined])
	})

	test('requests same-session recovery when an agent turn ends without finalize', async () => {
	const root = await mkdtemp(join(tmpdir(), 'source-loop-recovery-'))
	const tasksDir = join(root, 'tasks')
	const runsDir = join(root, 'runs')
	await makeTask(tasksDir, 'recovery_task', true)
	const prompts: string[] = []
	let sessionCreations = 0
	let judgeCalls = 0

	const result = await runSourceTaskLoop({
	taskId: 'recovery_task',
	tasksDir,
	runsDir,
	maxRounds: 1,
	timeoutSeconds: 30,
	sessionFactory: async () => {
	sessionCreations++
	return {
	async *submit(input: SourceAgentTurnInput) {
	prompts.push(input.prompt)
	if (prompts.length === 1) {
	yield {
	type: 'agent_result',
	subtype: 'success',
	stopReason: 'end_turn',
	durationMs: 10,
	usage: { input_tokens: 12, output_tokens: 3 },
	} as never
	return
	}
	yield { type: 'assistant_text', text: 'Recovering by submitting output.' }
	yield {
	type: 'finalize',
	summary: 'ready after recovery',
	files: ['outputs/case_000.npz'],
	}
	},
	}
	},
	judge: {
	async run() {
	judgeCalls++
	return {
	status: 'pass',
	reward: 1,
	feedback: 'ok',
	raw: { status: 'pass' },
	}
	},
	},
	})

	expect(result.status).toBe('success')
	expect(result.rounds).toBe(1)
	expect(sessionCreations).toBe(1)
	expect(judgeCalls).toBe(1)
	expect(prompts).toHaveLength(2)
	expect(prompts[1]).toContain('<no_finalize_recovery>')
	expect(prompts[1]).toContain('call finalize_submission now')

	const events = await readFile(join(result.run.logsDir, 'run_events.jsonl'), 'utf8')
	expect(events).toContain('"type":"agent_recovery_started"')
	expect(events).toContain('"type":"agent_recovery_finished"')

	const clean = await readFile(join(result.run.logsDir, 'trajectory.clean.jsonl'), 'utf8')
	expect(clean).toContain('"kind":"agent_result"')
	expect(clean).toContain('"stop_reason":"end_turn"')
	expect(clean).toContain('"kind":"recovery_started"')
	expect(clean).toContain('"kind":"recovery_finished"')
	expect(clean).toContain('"finalized":true')
	expect(clean).toContain('ready after recovery')
	})

	test('does not judge or consume a round when validation never passes', async () => {
	const root = await mkdtemp(join(tmpdir(), 'source-loop-validation-fail-'))
	const tasksDir = join(root, 'tasks')
	const runsDir = join(root, 'runs')
	await makeTask(tasksDir, 'validation_fail_task', true)
	let judgeCalls = 0

	const result = await runSourceTaskLoop({
	taskId: 'validation_fail_task',
	tasksDir,
	runsDir,
	maxRounds: 1,
	timeoutSeconds: 30,
	llmOptions: { temperature: 1, thinking: 'disabled' },
	sessionFactory: async () => ({
	async *submit(input: SourceAgentTurnInput) {
	if (input.prompt.includes('<no_finalize_recovery>')) return
	yield {
	type: 'run_warning',
	code: 'missing_round_plan',
	message: 'workspace/plans/round_01.md is missing.',
	}
	yield {
	type: 'submission_validation_failed',
	result: {
	ok: false,
	normalizedFiles: [],
	issues: [
	{
	code: 'missing_output_file',
	path: 'outputs/case_000.npz',
	message: 'outputs/case_000.npz is missing',
	},
	],
	},
	}
	},
	}),
	judge: {
	async run() {
	judgeCalls++
	throw new Error('judge should not run')
	},
	},
	})

	expect(result.status).toBe('failed')
	expect(result.rounds).toBe(0)
	expect(judgeCalls).toBe(0)

	const summary = JSON.parse(
	await readFile(join(result.run.logsDir, 'run_summary.json'), 'utf8'),
	)
	expect(summary.run_metadata.temperature_configured).toBe(1)
	expect(summary.run_metadata.temperature_sent).toBe(1)
	expect(summary.validation_attempts).toHaveLength(1)
	expect(summary.validation_attempts[0].ok).toBe(false)
	expect(summary.warnings).toHaveLength(1)
	expect(summary.warnings[0].code).toBe('missing_round_plan')

	const events = await readFile(join(result.run.logsDir, 'run_events.jsonl'), 'utf8')
	expect(events).toContain('"type":"submission_validation_failed"')
	expect(events).toContain('"type":"run_warning"')

	const clean = await readFile(join(result.run.logsDir, 'trajectory.clean.jsonl'), 'utf8')
	expect(clean).toContain('"kind":"submission_validation_failed"')
	expect(clean).toContain('"kind":"trajectory_warning"')
	})

	test('invalid validation followed by valid finalize consumes one judge round', async () => {
	const root = await mkdtemp(join(tmpdir(), 'source-loop-validation-retry-'))
	const tasksDir = join(root, 'tasks')
	const runsDir = join(root, 'runs')
	await makeTask(tasksDir, 'validation_retry_task', true)
	let judgeCalls = 0

	const result = await runSourceTaskLoop({
	taskId: 'validation_retry_task',
	tasksDir,
	runsDir,
	maxRounds: 2,
	timeoutSeconds: 30,
	sessionFactory: async () => ({
	async *submit() {
	yield {
	type: 'submission_validation_failed',
	result: {
	ok: false,
	normalizedFiles: [],
	issues: [
	{
	code: 'shape_mismatch',
	path: 'outputs/case_000.npz',
	key: 'reconstruction',
	message: 'shape mismatch',
	},
	],
	},
	}
	yield {
	type: 'submission_validation_passed',
	result: {
	ok: true,
	normalizedFiles: ['outputs/case_000.npz'],
	issues: [],
	},
	}
	yield {
	type: 'finalize',
	summary: 'ready after retry',
	files: ['outputs/case_000.npz'],
	}
	},
	}),
	judge: {
	async run() {
	judgeCalls++
	return {
	status: 'pass',
	reward: 1,
	feedback: 'ok',
	raw: { status: 'pass' },
	}
	},
	},
	})

	expect(result.status).toBe('success')
	expect(result.rounds).toBe(1)
	expect(judgeCalls).toBe(1)

	const summary = JSON.parse(
	await readFile(join(result.run.logsDir, 'run_summary.json'), 'utf8'),
	)
	expect(summary.validation_attempts.map((attempt: { ok: boolean }) => attempt.ok)).toEqual([
	false,
	true,
	])
	})

	test('stops draining agent events immediately after finalize', async () => {
	const root = await mkdtemp(join(tmpdir(), 'source-loop-finalize-terminal-'))
	const tasksDir = join(root, 'tasks')
	const runsDir = join(root, 'runs')
	await makeTask(tasksDir, 'finalize_terminal_task', true)

	const result = await runSourceTaskLoop({
	taskId: 'finalize_terminal_task',
	tasksDir,
	runsDir,
	maxRounds: 1,
	timeoutSeconds: 30,
	sessionFactory: async () => ({
	async *submit() {
	yield {
	type: 'submission_validation_passed',
	result: {
	ok: true,
	normalizedFiles: ['outputs/case_000.npz'],
	issues: [],
	},
	}
	yield {
	type: 'finalize',
	summary: 'ready',
	files: ['outputs/case_000.npz'],
	}
	yield {
	type: 'assistant_text',
	text: 'BUG: this event should not be consumed after finalize.',
	}
	},
	}),
	judge: {
	async run() {
	return {
	status: 'pass',
	reward: 1,
	feedback: 'ok',
	raw: { status: 'pass' },
	}
	},
	},
	})

	expect(result.status).toBe('success')
	const clean = await readFile(join(result.run.logsDir, 'trajectory.clean.jsonl'), 'utf8')
	expect(clean).toContain('"kind":"finalize"')
	expect(clean).not.toContain('BUG: this event should not be consumed')
	})
	})