"""Tests for error-kind hint routing on the DEFAULT composite (ADR-012 #2). The default composite is template -> raw-error -> judge. Before ADR-012 the raw-error layer consumed ANY site carrying an `error_message`, including style/communication/effort sites — exactly the sites the LLM judge exists to cover. These tests validate the DEFAULT path (raw-error NOT disabled): a style/communication site WITH an error_message routes through to the judge, while tool/runtime sites still use the raw-error layer. """ from __future__ import annotations from composer_replication.hint_generator import ( RoutingHintGenerator, RawErrorHintGenerator, default_composite, is_tool_runtime_kind, ) # --- the headline acceptance: style site reaches judge on the DEFAULT path --- def test_style_site_with_error_message_reaches_judge_on_default_composite(): calls = {"n": 0} def fake_complete(prompt: str) -> str: calls["n"] += 1 return "Be more concise; you repeated the same explanation twice." # NOTE: raw-error is ENABLED (the default). Pre-ADR-012 this would have been # eaten by the raw-error layer and the judge never called. comp = default_composite(llm_complete=fake_complete) # enable_raw_error=True hint = comp.generate( "verbose_communication", {"error_message": "The agent restated the plan three times."}, ) assert hint == "Be more concise; you repeated the same explanation twice." assert calls["n"] == 1, "style site must reach the judge, not the raw-error layer" def test_effort_site_with_message_routes_to_judge(): calls = {"n": 0} def fake_complete(prompt: str) -> str: calls["n"] += 1 return "Don't pad the answer; one example suffices." comp = default_composite(llm_complete=fake_complete) hint = comp.generate("low_effort_style", {"error_message": "padding detected"}) assert hint == "Don't pad the answer; one example suffices." assert calls["n"] == 1 # --- tool/runtime sites still served by raw-error (no regression) ----------- def test_tool_runtime_site_still_served_by_raw_error_no_judge(): calls = {"n": 0} def fake_complete(prompt: str) -> str: calls["n"] += 1 return "JUDGE (should not be called)" comp = default_composite(llm_complete=fake_complete) # an unmapped *runtime* error (no template) -> raw-error layer, not judge. hint = comp.generate("weird_runtime_error", {"error_message": "Segfault at 0x0"}) assert hint is not None assert "Segfault at 0x0" in hint assert calls["n"] == 0, "tool/runtime sites must be served by raw-error, not judge" def test_template_site_unaffected_by_routing(): comp = default_composite() # no judge hint = comp.generate("tool_not_found", {"available_tools": ["read", "write"]}) assert hint is not None and "Available tools" in hint # --- the route predicate ---------------------------------------------------- def test_route_predicate_classifies_kinds(): # tool/runtime for k in ("tool_not_found", "json_decode", "type_error", "runtime_error", "repeated_failure", "weird_runtime_error", "some_exception", "weird_unmapped_error"): assert is_tool_runtime_kind(k) is True, k # style/communication/effort for k in ("verbose_communication", "low_effort_style", "tone_violation", "rambling_explanation", "bad_formatting"): assert is_tool_runtime_kind(k) is False, k def test_routing_generator_returns_none_for_style_kind(): routed = RoutingHintGenerator(RawErrorHintGenerator()) # style kind WITH a message -> None (defer to judge), even though the inner # raw-error layer would have produced a hint. assert routed.generate("verbose_style", {"error_message": "too long"}) is None # tool/runtime kind WITH a message -> inner fires. out = routed.generate("runtime_error", {"error_message": "boom"}) assert out is not None and "boom" in out