ml-intern

Running

ml-intern / tests /unit /test_doom_loop_polling.py

Aksel Joonas Reedi

Keep repetition guard internal (#144)

e8ed637 unverified 23 days ago

3.42 kB

	"""Regression test for doom-loop false-positive on legitimate polling.

	Reproduces the failure mode in observatory sessions 40fcb414 ($32.59),
	8e90352e ($62.63), and 403178bf ($5.71) on 2026-04-25: the agent polled a
	long-running job with `bash sleep 300 && wc -l output` four times in a
	row. The arguments were byte-identical, but the results moved (27210 →
	36454 → 45770 → 55138 — actual progress). The detector hashed args only
	and false-fired the repetition guard, which made the agent abandon perfectly valid
	polling.

	After the fix the signature includes the tool result hash, so identical
	args + different results no longer trips the detector.
	"""

	from litellm import ChatCompletionMessageToolCall, Message

	from agent.core.doom_loop import check_for_doom_loop


	def _assistant(call_id: str, name: str, args: str) -> Message:
	return Message(
	role="assistant",
	content=None,
	tool_calls=[
	ChatCompletionMessageToolCall(
	id=call_id,
	type="function",
	function={"name": name, "arguments": args},
	)
	],
	)


	def _tool(call_id: str, name: str, content: str) -> Message:
	return Message(role="tool", content=content, tool_call_id=call_id, name=name)


	_POLL_ARGS = '{"command": "sleep 300 && ls /app/images/ \| wc -l"}'


	def test_polling_with_progressing_results_does_not_fire():
	msgs = [
	Message(role="user", content="run the job"),
	_assistant("c1", "bash", _POLL_ARGS),
	_tool("c1", "bash", "27210"),
	_assistant("c2", "bash", _POLL_ARGS),
	_tool("c2", "bash", "36454"),
	_assistant("c3", "bash", _POLL_ARGS),
	_tool("c3", "bash", "45770"),
	_assistant("c4", "bash", _POLL_ARGS),
	_tool("c4", "bash", "55138"),
	]
	assert check_for_doom_loop(msgs) is None


	def test_truly_stuck_polling_with_identical_results_still_fires():
	"""If the same poll returns the same number, the job is genuinely
	stuck and the detector SHOULD fire."""
	msgs = [
	_assistant("c1", "bash", _POLL_ARGS),
	_tool("c1", "bash", "55138"),
	_assistant("c2", "bash", _POLL_ARGS),
	_tool("c2", "bash", "55138"),
	_assistant("c3", "bash", _POLL_ARGS),
	_tool("c3", "bash", "55138"),
	]
	prompt = check_for_doom_loop(msgs)
	assert prompt is not None
	assert "REPETITION GUARD" in prompt
	assert "bash" in prompt


	def test_identical_calls_with_no_results_yet_still_fires():
	"""If three identical calls have no tool results (e.g. all cancelled
	or errored before a result was recorded), treat as a real loop."""
	msgs = [
	_assistant("c1", "write", '{"path": "/tmp/x", "content": "..."}'),
	_assistant("c2", "write", '{"path": "/tmp/x", "content": "..."}'),
	_assistant("c3", "write", '{"path": "/tmp/x", "content": "..."}'),
	]
	prompt = check_for_doom_loop(msgs)
	assert prompt is not None
	assert "REPETITION GUARD" in prompt
	assert "write" in prompt


	def test_different_args_does_not_fire():
	msgs = [
	_assistant("c1", "bash", '{"command": "ls /a"}'),
	_tool("c1", "bash", "ok"),
	_assistant("c2", "bash", '{"command": "ls /b"}'),
	_tool("c2", "bash", "ok"),
	_assistant("c3", "bash", '{"command": "ls /c"}'),
	_tool("c3", "bash", "ok"),
	]
	assert check_for_doom_loop(msgs) is None