File size: 3,996 Bytes
d02d724
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
"""Tests for error-kind hint routing on the DEFAULT composite (ADR-012 #2).

The default composite is template -> raw-error -> judge. Before ADR-012 the
raw-error layer consumed ANY site carrying an `error_message`, including
style/communication/effort sites — exactly the sites the LLM judge exists to
cover. These tests validate the DEFAULT path (raw-error NOT disabled): a
style/communication site WITH an error_message routes through to the judge,
while tool/runtime sites still use the raw-error layer.
"""
from __future__ import annotations

from composer_replication.hint_generator import (
    RoutingHintGenerator,
    RawErrorHintGenerator,
    default_composite,
    is_tool_runtime_kind,
)


# --- the headline acceptance: style site reaches judge on the DEFAULT path ---

def test_style_site_with_error_message_reaches_judge_on_default_composite():
    """A style site that carries an ``error_message`` is answered by the judge.

    The composite is built with its defaults, so the raw-error layer is left
    enabled; the hint must nevertheless come from the fake judge, which must
    have been invoked exactly once.
    """
    judge_reply = "Be more concise; you repeated the same explanation twice."
    prompts_seen = []

    def fake_complete(prompt: str) -> str:
        # Stand-in for the LLM judge: remember the call, hand back a canned hint.
        prompts_seen.append(prompt)
        return judge_reply

    # Raw-error stays ENABLED (the default). Pre-ADR-012 the raw-error layer
    # would have consumed this site and the judge would never have run.
    composite = default_composite(llm_complete=fake_complete)
    site_context = {"error_message": "The agent restated the plan three times."}
    hint = composite.generate("verbose_communication", site_context)

    assert hint == judge_reply
    assert len(prompts_seen) == 1, (
        "style site must reach the judge, not the raw-error layer"
    )


def test_effort_site_with_message_routes_to_judge():
    """An effort/style site with an ``error_message`` is served by the judge.

    Uses the default composite and checks both the returned hint and that the
    fake judge was called exactly once.
    """
    judge_reply = "Don't pad the answer; one example suffices."
    judge_calls = 0

    def fake_complete(prompt: str) -> str:
        # Count invocations so the test can prove the judge was consulted.
        nonlocal judge_calls
        judge_calls += 1
        return judge_reply

    composite = default_composite(llm_complete=fake_complete)
    site_context = {"error_message": "padding detected"}
    hint = composite.generate("low_effort_style", site_context)

    assert hint == judge_reply
    assert judge_calls == 1


# --- tool/runtime sites still served by raw-error (no regression) -----------

def test_tool_runtime_site_still_served_by_raw_error_no_judge():
    """A runtime-error site gets a hint containing its message, without the judge.

    The fake judge records every prompt it receives; the test requires that
    record to stay empty while the hint still echoes the raw error message.
    """
    judge_prompts = []

    def fake_complete(prompt: str) -> str:
        # Any call here is a routing regression; the reply makes that obvious.
        judge_prompts.append(prompt)
        return "JUDGE (should not be called)"

    raw_message = "Segfault at 0x0"
    composite = default_composite(llm_complete=fake_complete)
    # An unmapped *runtime* error (no template) -> raw-error layer, not judge.
    hint = composite.generate("weird_runtime_error", {"error_message": raw_message})

    assert hint is not None
    assert raw_message in hint
    assert not judge_prompts, (
        "tool/runtime sites must be served by raw-error, not judge"
    )


def test_template_site_unaffected_by_routing():
    """A ``tool_not_found`` site still yields a hint mentioning "Available tools".

    The composite is built without an ``llm_complete`` callable, so no judge
    is supplied for this check.
    """
    composite = default_composite()  # no judge
    site_context = {"available_tools": ["read", "write"]}
    hint = composite.generate("tool_not_found", site_context)

    assert hint is not None
    assert "Available tools" in hint


# --- the route predicate ----------------------------------------------------

def test_route_predicate_classifies_kinds():
    """``is_tool_runtime_kind`` returns exactly ``True``/``False`` per kind group.

    Each row pairs the expected boolean with the kinds that must produce it;
    the failing kind is used as the assertion message.
    """
    expectations = (
        # tool/runtime
        (
            True,
            (
                "tool_not_found",
                "json_decode",
                "type_error",
                "runtime_error",
                "repeated_failure",
                "weird_runtime_error",
                "some_exception",
                "weird_unmapped_error",
            ),
        ),
        # style/communication/effort
        (
            False,
            (
                "verbose_communication",
                "low_effort_style",
                "tone_violation",
                "rambling_explanation",
                "bad_formatting",
            ),
        ),
    )
    for expected, kinds in expectations:
        for kind in kinds:
            # Identity check: the predicate must return a real bool, not a truthy value.
            assert is_tool_runtime_kind(kind) is expected, kind


def test_routing_generator_returns_none_for_style_kind():
    """``RoutingHintGenerator`` yields ``None`` for a style kind, a hint for a runtime kind.

    The wrapped generator is a ``RawErrorHintGenerator``; both calls pass an
    ``error_message`` so only the site kind differs between them.
    """
    router = RoutingHintGenerator(RawErrorHintGenerator())

    # Style kind WITH a message -> None (defer to judge), even though the
    # inner raw-error layer would have produced a hint.
    deferred = router.generate("verbose_style", {"error_message": "too long"})
    assert deferred is None

    # Tool/runtime kind WITH a message -> the inner generator fires.
    served = router.generate("runtime_error", {"error_message": "boom"})
    assert served is not None
    assert "boom" in served