| """Regression test for doom-loop false-positive on legitimate polling. |
| |
| Reproduces the failure mode in observatory sessions 40fcb414 ($32.59), |
| 8e90352e ($62.63), and 403178bf ($5.71) on 2026-04-25: the agent polled a |
| long-running job with `bash sleep 300 && wc -l output` four times in a |
| row. The arguments were byte-identical, but the results moved (27210 β |
| 36454 β 45770 β 55138 β actual progress). The detector hashed args only |
| and false-fired the repetition guard, which made the agent abandon perfectly valid |
| polling. |
| |
| After the fix the signature includes the tool result hash, so identical |
| args + different results no longer trips the detector. |
| """ |
|
|
| from litellm import ChatCompletionMessageToolCall, Message |
|
|
| from agent.core.doom_loop import check_for_doom_loop |
|
|
|
|
| def _assistant(call_id: str, name: str, args: str) -> Message: |
| return Message( |
| role="assistant", |
| content=None, |
| tool_calls=[ |
| ChatCompletionMessageToolCall( |
| id=call_id, |
| type="function", |
| function={"name": name, "arguments": args}, |
| ) |
| ], |
| ) |
|
|
|
|
| def _tool(call_id: str, name: str, content: str) -> Message: |
| return Message(role="tool", content=content, tool_call_id=call_id, name=name) |
|
|
|
|
| _POLL_ARGS = '{"command": "sleep 300 && ls /app/images/ | wc -l"}' |
|
|
|
|
| def test_polling_with_progressing_results_does_not_fire(): |
| msgs = [ |
| Message(role="user", content="run the job"), |
| _assistant("c1", "bash", _POLL_ARGS), |
| _tool("c1", "bash", "27210"), |
| _assistant("c2", "bash", _POLL_ARGS), |
| _tool("c2", "bash", "36454"), |
| _assistant("c3", "bash", _POLL_ARGS), |
| _tool("c3", "bash", "45770"), |
| _assistant("c4", "bash", _POLL_ARGS), |
| _tool("c4", "bash", "55138"), |
| ] |
| assert check_for_doom_loop(msgs) is None |
|
|
|
|
| def test_truly_stuck_polling_with_identical_results_still_fires(): |
| """If the same poll returns the same number, the job is genuinely |
| stuck and the detector SHOULD fire.""" |
| msgs = [ |
| _assistant("c1", "bash", _POLL_ARGS), |
| _tool("c1", "bash", "55138"), |
| _assistant("c2", "bash", _POLL_ARGS), |
| _tool("c2", "bash", "55138"), |
| _assistant("c3", "bash", _POLL_ARGS), |
| _tool("c3", "bash", "55138"), |
| ] |
| prompt = check_for_doom_loop(msgs) |
| assert prompt is not None |
| assert "REPETITION GUARD" in prompt |
| assert "bash" in prompt |
|
|
|
|
| def test_identical_calls_with_no_results_yet_still_fires(): |
| """If three identical calls have no tool results (e.g. all cancelled |
| or errored before a result was recorded), treat as a real loop.""" |
| msgs = [ |
| _assistant("c1", "write", '{"path": "/tmp/x", "content": "..."}'), |
| _assistant("c2", "write", '{"path": "/tmp/x", "content": "..."}'), |
| _assistant("c3", "write", '{"path": "/tmp/x", "content": "..."}'), |
| ] |
| prompt = check_for_doom_loop(msgs) |
| assert prompt is not None |
| assert "REPETITION GUARD" in prompt |
| assert "write" in prompt |
|
|
|
|
| def test_different_args_does_not_fire(): |
| msgs = [ |
| _assistant("c1", "bash", '{"command": "ls /a"}'), |
| _tool("c1", "bash", "ok"), |
| _assistant("c2", "bash", '{"command": "ls /b"}'), |
| _tool("c2", "bash", "ok"), |
| _assistant("c3", "bash", '{"command": "ls /c"}'), |
| _tool("c3", "bash", "ok"), |
| ] |
| assert check_for_doom_loop(msgs) is None |
|
|