feat(wave-a): close ADR-011 (SDPO alignment indices) + ADR-012 (review findings)

d02d724 29 days ago

4 kB

	"""Tests for error-kind hint routing on the DEFAULT composite (ADR-012 #2).

	The default composite is template -> raw-error -> judge. Before ADR-012 the
	raw-error layer consumed ANY site carrying an `error_message`, including
	style/communication/effort sites — exactly the sites the LLM judge exists to
	cover. These tests validate the DEFAULT path (raw-error NOT disabled): a
	style/communication site WITH an error_message routes through to the judge,
	while tool/runtime sites still use the raw-error layer.
	"""
	from __future__ import annotations

	from composer_replication.hint_generator import (
	RoutingHintGenerator,
	RawErrorHintGenerator,
	default_composite,
	is_tool_runtime_kind,
	)


	# --- the headline acceptance: style site reaches judge on the DEFAULT path ---

	def test_style_site_with_error_message_reaches_judge_on_default_composite():
	    """Headline acceptance: a style site with an error_message gets the judge's hint.

	    The composite is built with only ``llm_complete`` supplied, i.e. the
	    raw-error layer is left at its default (enabled, per the ADR-012 notes).
	    The returned hint must be the judge's text and the judge must have been
	    invoked exactly once.
	    """
	    judge_reply = "Be more concise; you repeated the same explanation twice."
	    judge_calls = 0

	    def fake_complete(prompt: str) -> str:
	        # Stand-in for the LLM judge: count the invocation, return canned text.
	        nonlocal judge_calls
	        judge_calls += 1
	        return "Be more concise; you repeated the same explanation twice."

	    # Pre-ADR-012 the raw-error layer would have consumed this site and the
	    # judge would never have been called.
	    composite = default_composite(llm_complete=fake_complete)
	    site_payload = {"error_message": "The agent restated the plan three times."}
	    produced = composite.generate("verbose_communication", site_payload)

	    assert produced == judge_reply
	    assert judge_calls == 1, "style site must reach the judge, not the raw-error layer"


	def test_effort_site_with_message_routes_to_judge():
	    """An effort/style kind carrying an error_message is answered by the judge.

	    Checks both the returned hint text and that the fake judge was invoked
	    exactly once.
	    """
	    seen_prompts = []

	    def fake_complete(prompt: str) -> str:
	        # Record each prompt so the number of judge invocations can be checked.
	        seen_prompts.append(prompt)
	        return "Don't pad the answer; one example suffices."

	    composite = default_composite(llm_complete=fake_complete)
	    produced = composite.generate(
	        "low_effort_style",
	        {"error_message": "padding detected"},
	    )

	    assert produced == "Don't pad the answer; one example suffices."
	    assert len(seen_prompts) == 1


	# --- tool/runtime sites still served by raw-error (no regression) -----------

	def test_tool_runtime_site_still_served_by_raw_error_no_judge():
	    """Regression guard: a runtime-error kind gets a hint without the judge running.

	    The hint must contain the raw error text, and the fake judge's call
	    counter must stay at zero.
	    """
	    raw_message = "Segfault at 0x0"
	    judge_calls = 0

	    def fake_complete(prompt: str) -> str:
	        # Any invocation here means routing sent a runtime site to the judge.
	        nonlocal judge_calls
	        judge_calls += 1
	        return "JUDGE (should not be called)"

	    composite = default_composite(llm_complete=fake_complete)
	    # An unmapped runtime error (no template) -> raw-error layer, not judge.
	    produced = composite.generate("weird_runtime_error", {"error_message": raw_message})

	    assert produced is not None
	    assert raw_message in produced
	    assert judge_calls == 0, "tool/runtime sites must be served by raw-error, not judge"


	def test_template_site_unaffected_by_routing():
	    """A ``tool_not_found`` site still yields a hint mentioning "Available tools".

	    The composite is built with no arguments, so no judge callable is supplied.
	    """
	    composite = default_composite()
	    site_payload = {"available_tools": ["read", "write"]}
	    produced = composite.generate("tool_not_found", site_payload)

	    assert produced is not None
	    assert "Available tools" in produced


	# --- the route predicate ----------------------------------------------------

	def test_route_predicate_classifies_kinds():
	    """``is_tool_runtime_kind`` returns True for tool/runtime kinds, False for style kinds.

	    The failing kind is used as the assertion message so a misclassification
	    is easy to identify.
	    """
	    tool_runtime_kinds = (
	        "tool_not_found",
	        "json_decode",
	        "type_error",
	        "runtime_error",
	        "repeated_failure",
	        "weird_runtime_error",
	        "some_exception",
	        "weird_unmapped_error",
	    )
	    style_kinds = (
	        "verbose_communication",
	        "low_effort_style",
	        "tone_violation",
	        "rambling_explanation",
	        "bad_formatting",
	    )

	    # Tool/runtime kinds are checked first, then style/communication/effort.
	    for expected, kinds in ((True, tool_runtime_kinds), (False, style_kinds)):
	        for kind in kinds:
	            # Identity check: the predicate must return the bool singleton itself.
	            assert is_tool_runtime_kind(kind) is expected, kind


	def test_routing_generator_returns_none_for_style_kind():
	    """``RoutingHintGenerator`` yields None for a style kind and a hint for a runtime kind.

	    Both calls carry an error_message, so the difference in outcome comes
	    from the kind alone.
	    """
	    router = RoutingHintGenerator(RawErrorHintGenerator())

	    # Style kind WITH a message -> None (defer to judge), even though the
	    # wrapped raw-error generator would have produced a hint.
	    style_result = router.generate("verbose_style", {"error_message": "too long"})
	    assert style_result is None

	    # Tool/runtime kind WITH a message -> the wrapped generator fires.
	    runtime_result = router.generate("runtime_error", {"error_message": "boom"})
	    assert runtime_result is not None
	    assert "boom" in runtime_result