# Page header left over from the web scrape (not part of the module):
# Spaces: Running · File size: 8,811 Bytes
# Scrape residue: revision b89c8aa (line-number gutter 1-200 removed).
"""Tests for the scripted expert oracle baseline (§22.1).

Verifies:
- Expert follows gold SOP for each ticket within budget constraints.
- Priority ordering: higher-priority tickets get first substantive actions.
- Returns a trace dict with score breakdown.
- No invalid actions are ever produced.
- Deterministic: same seed → identical trace.
- Score breakdown includes proper terminal business + UJCS scoring.
"""
from __future__ import annotations
import pytest
from ..baseline.scripted_expert import ScriptedExpert
from ..models import (
ActionType,
TaskDifficulty,
)
from ..server.triagesieve_env_environment import TriageSieveEnvironment
@pytest.fixture
def env() -> TriageSieveEnvironment:
    """Provide a newly constructed ``TriageSieveEnvironment``."""
    environment = TriageSieveEnvironment()
    return environment
@pytest.fixture
def expert(env: TriageSieveEnvironment) -> ScriptedExpert:
    """Provide a ``ScriptedExpert`` constructed around the ``env`` fixture."""
    policy = ScriptedExpert(env)
    return policy
class TestScriptedExpertAPI:
    """Public surface of ``ScriptedExpert``: constructor and trace layout."""

    def test_constructor_accepts_environment(self, env: TriageSieveEnvironment) -> None:
        """The environment handed to the constructor is exposed as ``.env``."""
        policy = ScriptedExpert(env)
        assert policy.env is env

    def test_run_episode_returns_trace_dict(self, expert: ScriptedExpert) -> None:
        """``run_episode`` returns a dict holding the top-level trace keys."""
        trace = expert.run_episode(seed=7, difficulty=TaskDifficulty.EASY)
        assert isinstance(trace, dict)
        # Checked one key at a time so a failure names the missing key.
        for key in (
            "episode_id",
            "seed",
            "action_sequence",
            "final_score",
            "score_breakdown",
        ):
            assert key in trace

    def test_run_episode_returns_done(self, expert: ScriptedExpert) -> None:
        """The returned trace has ``done`` set to exactly ``True``."""
        trace = expert.run_episode(seed=7, difficulty=TaskDifficulty.EASY)
        done_flag = trace["done"]
        assert done_flag is True

    def test_trace_score_breakdown_keys(self, expert: ScriptedExpert) -> None:
        """``score_breakdown`` contains every expected component key."""
        trace = expert.run_episode(seed=7, difficulty=TaskDifficulty.EASY)
        breakdown = trace["score_breakdown"]
        for key in (
            "terminal_business_score",
            "ujcs_openenv",
            "episode_penalties",
            "priority_order_score",
            "invalid_action_count",
            "reassignment_count",
        ):
            assert key in breakdown
class TestScriptedExpertCorrectness:
    """The expert emits only valid actions and scores well on known seeds."""

    @staticmethod
    def _assert_every_step_ok(seed: int, difficulty: TaskDifficulty) -> None:
        """Run one episode on a fresh environment; every step must report "ok".

        Args:
            seed: Seed forwarded to ``run_episode``.
            difficulty: Difficulty forwarded to ``run_episode``.
        """
        environment = TriageSieveEnvironment()
        policy = ScriptedExpert(environment)
        trace = policy.run_episode(seed=seed, difficulty=difficulty)
        for entry in trace["action_sequence"]:
            assert entry["result"] == "ok", (
                f"seed={seed}, step {entry['step']}: {entry['result']}"
            )

    @pytest.mark.parametrize("seed", range(20))
    def test_no_invalid_actions_easy(self, seed: int) -> None:
        """Seeds 0-19 at EASY difficulty yield only "ok" step results."""
        self._assert_every_step_ok(seed, TaskDifficulty.EASY)

    @pytest.mark.parametrize("seed", range(10))
    def test_no_invalid_actions_medium(self, seed: int) -> None:
        """Seeds 0-9 at MEDIUM difficulty yield only "ok" step results."""
        self._assert_every_step_ok(seed, TaskDifficulty.MEDIUM)

    @pytest.mark.parametrize("seed", range(10))
    def test_no_invalid_actions_hard(self, seed: int) -> None:
        """Seeds 0-9 at HARD difficulty yield only "ok" step results."""
        self._assert_every_step_ok(seed, TaskDifficulty.HARD)

    def test_zero_invalid_actions_in_breakdown(self, expert: ScriptedExpert) -> None:
        """The breakdown reports ``invalid_action_count == 0`` (seed 7, EASY)."""
        trace = expert.run_episode(seed=7, difficulty=TaskDifficulty.EASY)
        breakdown = trace["score_breakdown"]
        assert breakdown["invalid_action_count"] == 0

    def test_zero_reassignments(self, expert: ScriptedExpert) -> None:
        """The breakdown reports ``reassignment_count == 0`` (seed 7, EASY)."""
        trace = expert.run_episode(seed=7, difficulty=TaskDifficulty.EASY)
        breakdown = trace["score_breakdown"]
        assert breakdown["reassignment_count"] == 0

    def test_zero_episode_penalties(self, expert: ScriptedExpert) -> None:
        """The breakdown reports ``episode_penalties == 0.0`` (seed 7, EASY)."""
        trace = expert.run_episode(seed=7, difficulty=TaskDifficulty.EASY)
        breakdown = trace["score_breakdown"]
        assert breakdown["episode_penalties"] == 0.0

    def test_positive_terminal_business_score(self, expert: ScriptedExpert) -> None:
        """``terminal_business_score`` is strictly positive (seed 7, EASY)."""
        trace = expert.run_episode(seed=7, difficulty=TaskDifficulty.EASY)
        breakdown = trace["score_breakdown"]
        assert breakdown["terminal_business_score"] > 0.0

    def test_normal_archetype_high_score(self) -> None:
        """Seed=7 (failed_invoice_charge_dispute, no missing fields) scores >= 0.80."""
        policy = ScriptedExpert(TriageSieveEnvironment())
        trace = policy.run_episode(seed=7, difficulty=TaskDifficulty.EASY)
        assert trace["final_score"] >= 0.80, (
            f"Expert scored {trace['final_score']:.3f} on seed=7 easy"
        )
class TestScriptedExpertDeterminism:
    """Repeat runs with equal inputs match; a different seed changes the episode."""

    @staticmethod
    def _fresh_easy_trace(seed: int) -> dict:
        """Run an EASY episode for ``seed`` on a brand-new expert and environment."""
        policy = ScriptedExpert(TriageSieveEnvironment())
        return policy.run_episode(seed=seed, difficulty=TaskDifficulty.EASY)

    def test_same_seed_same_trace(self) -> None:
        """Two independent seed-7 runs agree on final score and action sequence."""
        first = self._fresh_easy_trace(7)
        second = self._fresh_easy_trace(7)
        assert first["final_score"] == second["final_score"]
        assert first["action_sequence"] == second["action_sequence"]

    def test_different_seed_different_trace(self) -> None:
        """Runs with seeds 7 and 0 report different ``episode_id`` values."""
        first = self._fresh_easy_trace(7)
        second = self._fresh_easy_trace(0)
        assert first["episode_id"] != second["episode_id"]
class TestScriptedExpertActionSequence:
    """Action types in the trace follow the expected SOP ordering."""

    def test_easy_begins_with_open(self, expert: ScriptedExpert) -> None:
        """The first recorded action of a seed-7 EASY episode is OPEN_TICKET."""
        trace = expert.run_episode(seed=7, difficulty=TaskDifficulty.EASY)
        opening = trace["action_sequence"][0]
        assert opening["action"]["action_type"] == ActionType.OPEN_TICKET.value

    def test_ends_with_finish_episode(self, expert: ScriptedExpert) -> None:
        """The last recorded action of a seed-1 EASY episode is FINISH_EPISODE."""
        trace = expert.run_episode(seed=1, difficulty=TaskDifficulty.EASY)
        closing = trace["action_sequence"][-1]
        assert closing["action"]["action_type"] == ActionType.FINISH_EPISODE.value

    def test_classify_follows_open(self) -> None:
        """For normal (non-special) archetypes, classify follows open."""
        policy = ScriptedExpert(TriageSieveEnvironment())
        # seed=7 = failed_invoice_charge_dispute (normal flow)
        trace = policy.run_episode(seed=7, difficulty=TaskDifficulty.EASY)
        action_kinds = [
            entry["action"]["action_type"] for entry in trace["action_sequence"]
        ]
        assert action_kinds[0] == ActionType.OPEN_TICKET.value
        assert action_kinds[1] == ActionType.CLASSIFY_TICKET.value
class TestScriptedExpertMultiTicket:
    """Multi-ticket priority ordering (§19)."""

    def test_hard_processes_multiple_tickets(self) -> None:
        """A seed-0 HARD episode records at least two OPEN_TICKET actions."""
        policy = ScriptedExpert(TriageSieveEnvironment())
        trace = policy.run_episode(seed=0, difficulty=TaskDifficulty.HARD)
        # NOTE(review): this counts OPEN_TICKET actions, not distinct ticket
        # ids, so re-opening one ticket would also satisfy it — confirm intent.
        ticket_ids_opened = []
        for entry in trace["action_sequence"]:
            action = entry["action"]
            if action["action_type"] == ActionType.OPEN_TICKET.value:
                ticket_ids_opened.append(action.get("ticket_id"))
        assert len(ticket_ids_opened) >= 2, "Hard episode should open multiple tickets"

    def test_medium_priority_order_score(self) -> None:
        """A seed-0 MEDIUM episode reaches a ``priority_order_score`` of at least 0.5."""
        policy = ScriptedExpert(TriageSieveEnvironment())
        trace = policy.run_episode(seed=0, difficulty=TaskDifficulty.MEDIUM)
        breakdown = trace["score_breakdown"]
        assert breakdown["priority_order_score"] >= 0.5
|