# Source: Hugging Face Space "Angshuman28/triagesieve_env" (status: Running)
# File size: 8,811 bytes; revision b89c8aa

"""Tests for the scripted expert oracle baseline (§22.1).



Verifies:
- Expert follows the gold SOP for each ticket within budget constraints.
- Priority ordering: higher-priority tickets get the first substantive actions.
- Returns a trace dict with a score breakdown.
- No invalid actions are ever produced.
- Deterministic: same seed → identical trace.
- Score breakdown includes proper terminal business + UJCS scoring.

"""

from __future__ import annotations

import pytest

from ..baseline.scripted_expert import ScriptedExpert
from ..models import (
    ActionType,
    TaskDifficulty,
)
from ..server.triagesieve_env_environment import TriageSieveEnvironment


@pytest.fixture
def env() -> TriageSieveEnvironment:
    """Provide a newly constructed ``TriageSieveEnvironment`` for each test."""
    environment = TriageSieveEnvironment()
    return environment


@pytest.fixture
def expert(env: TriageSieveEnvironment) -> ScriptedExpert:
    """Provide a ``ScriptedExpert`` built around the ``env`` fixture's environment."""
    policy = ScriptedExpert(env)
    return policy


class TestScriptedExpertAPI:
    """Public surface of ``ScriptedExpert``: construction and trace layout."""

    def test_constructor_accepts_environment(self, env: TriageSieveEnvironment) -> None:
        """The expert exposes the very environment object it was built with."""
        policy = ScriptedExpert(env)
        assert policy.env is env

    def test_run_episode_returns_trace_dict(self, expert: ScriptedExpert) -> None:
        """``run_episode`` returns a dict that carries the top-level trace keys."""
        result = expert.run_episode(seed=7, difficulty=TaskDifficulty.EASY)
        assert isinstance(result, dict)
        # Checked in a fixed order so the first missing key is the one reported.
        for key in (
            "episode_id",
            "seed",
            "action_sequence",
            "final_score",
            "score_breakdown",
        ):
            assert key in result

    def test_run_episode_returns_done(self, expert: ScriptedExpert) -> None:
        """The returned trace is flagged as done (identity check against ``True``)."""
        result = expert.run_episode(seed=7, difficulty=TaskDifficulty.EASY)
        done_flag = result["done"]
        assert done_flag is True

    def test_trace_score_breakdown_keys(self, expert: ScriptedExpert) -> None:
        """The score breakdown contains every expected component key."""
        result = expert.run_episode(seed=7, difficulty=TaskDifficulty.EASY)
        components = result["score_breakdown"]
        for key in (
            "terminal_business_score",
            "ujcs_openenv",
            "episode_penalties",
            "priority_order_score",
            "invalid_action_count",
            "reassignment_count",
        ):
            assert key in components


class TestScriptedExpertCorrectness:
    """No invalid actions, correct scores for known archetypes."""

    @staticmethod
    def _assert_all_actions_ok(seed: int, difficulty: TaskDifficulty) -> None:
        """Run one episode on a fresh environment and check every step result.

        Shared by the per-difficulty parametrized tests so the check lives in
        one place.

        Args:
            seed: Episode seed forwarded to ``ScriptedExpert.run_episode``.
            difficulty: Difficulty tier forwarded to ``run_episode``.

        Raises:
            AssertionError: If the action sequence is empty, or if any entry's
                ``"result"`` is not ``"ok"``.
        """
        env = TriageSieveEnvironment()
        expert = ScriptedExpert(env)
        trace = expert.run_episode(seed=seed, difficulty=difficulty)
        actions = trace["action_sequence"]
        # Guard against a vacuous pass: looping over an empty sequence would
        # otherwise assert nothing at all.
        assert actions, f"seed={seed}: expert produced an empty action sequence"
        for entry in actions:
            assert entry["result"] == "ok", (
                f"seed={seed}, step {entry['step']}: {entry['result']}"
            )

    @pytest.mark.parametrize("seed", range(20))
    def test_no_invalid_actions_easy(self, seed: int) -> None:
        """Expert should never produce an invalid action on any easy seed."""
        self._assert_all_actions_ok(seed, TaskDifficulty.EASY)

    @pytest.mark.parametrize("seed", range(10))
    def test_no_invalid_actions_medium(self, seed: int) -> None:
        """Expert should never produce an invalid action on medium seeds."""
        self._assert_all_actions_ok(seed, TaskDifficulty.MEDIUM)

    @pytest.mark.parametrize("seed", range(10))
    def test_no_invalid_actions_hard(self, seed: int) -> None:
        """Expert should never produce an invalid action on hard seeds."""
        self._assert_all_actions_ok(seed, TaskDifficulty.HARD)

    def test_zero_invalid_actions_in_breakdown(self, expert: ScriptedExpert) -> None:
        """Score breakdown must show 0 invalid actions."""
        trace = expert.run_episode(seed=7, difficulty=TaskDifficulty.EASY)
        assert trace["score_breakdown"]["invalid_action_count"] == 0

    def test_zero_reassignments(self, expert: ScriptedExpert) -> None:
        """Score breakdown must show 0 reassignments (no ticket routed twice)."""
        trace = expert.run_episode(seed=7, difficulty=TaskDifficulty.EASY)
        assert trace["score_breakdown"]["reassignment_count"] == 0

    def test_zero_episode_penalties(self, expert: ScriptedExpert) -> None:
        """Expert produces no penalties."""
        trace = expert.run_episode(seed=7, difficulty=TaskDifficulty.EASY)
        assert trace["score_breakdown"]["episode_penalties"] == 0.0

    def test_positive_terminal_business_score(self, expert: ScriptedExpert) -> None:
        """Expert achieves a positive terminal business score."""
        trace = expert.run_episode(seed=7, difficulty=TaskDifficulty.EASY)
        assert trace["score_breakdown"]["terminal_business_score"] > 0.0

    def test_normal_archetype_high_score(self) -> None:
        """Seed=7 (failed_invoice_charge_dispute, no missing fields) scores >= 0.80."""
        env = TriageSieveEnvironment()
        expert = ScriptedExpert(env)
        trace = expert.run_episode(seed=7, difficulty=TaskDifficulty.EASY)
        assert trace["final_score"] >= 0.80, (
            f"Expert scored {trace['final_score']:.3f} on seed=7 easy"
        )


class TestScriptedExpertDeterminism:
    """Repeatability of episodes across independent expert/environment pairs."""

    def test_same_seed_same_trace(self) -> None:
        """Two fresh experts on seed 7 (easy) agree on score and action sequence."""
        # Each run gets its own environment and expert; runs happen in order.
        first_run, second_run = [
            ScriptedExpert(TriageSieveEnvironment()).run_episode(
                seed=7, difficulty=TaskDifficulty.EASY
            )
            for _ in range(2)
        ]

        for key in ("final_score", "action_sequence"):
            assert first_run[key] == second_run[key]

    def test_different_seed_different_trace(self) -> None:
        """Episodes for seed 7 and seed 0 (easy) report different episode ids."""
        runs = [
            ScriptedExpert(TriageSieveEnvironment()).run_episode(
                seed=seed, difficulty=TaskDifficulty.EASY
            )
            for seed in (7, 0)
        ]

        seed_seven_id = runs[0]["episode_id"]
        seed_zero_id = runs[1]["episode_id"]
        assert seed_seven_id != seed_zero_id


class TestScriptedExpertActionSequence:
    """Checks on which action types appear where in the recorded sequence."""

    def test_easy_begins_with_open(self, expert: ScriptedExpert) -> None:
        """The first recorded action of the seed-7 easy episode opens a ticket."""
        steps = expert.run_episode(seed=7, difficulty=TaskDifficulty.EASY)["action_sequence"]
        opening_type = steps[0]["action"]["action_type"]
        assert opening_type == ActionType.OPEN_TICKET.value

    def test_ends_with_finish_episode(self, expert: ScriptedExpert) -> None:
        """The last recorded action of the seed-1 easy episode finishes the episode."""
        steps = expert.run_episode(seed=1, difficulty=TaskDifficulty.EASY)["action_sequence"]
        closing_type = steps[-1]["action"]["action_type"]
        assert closing_type == ActionType.FINISH_EPISODE.value

    def test_classify_follows_open(self) -> None:
        """For normal (non-special) archetypes, classify follows open."""
        policy = ScriptedExpert(TriageSieveEnvironment())
        # seed=7 = failed_invoice_charge_dispute (normal flow)
        run = policy.run_episode(seed=7, difficulty=TaskDifficulty.EASY)
        # Collect every step's type up front, then inspect the first two.
        action_types = []
        for step in run["action_sequence"]:
            action_types.append(step["action"]["action_type"])
        assert action_types[0] == ActionType.OPEN_TICKET.value
        assert action_types[1] == ActionType.CLASSIFY_TICKET.value


class TestScriptedExpertMultiTicket:
    """Multi-ticket episodes and priority ordering (§19)."""

    def test_hard_processes_multiple_tickets(self) -> None:
        """The seed-0 hard episode records at least two open-ticket actions."""
        policy = ScriptedExpert(TriageSieveEnvironment())
        run = policy.run_episode(seed=0, difficulty=TaskDifficulty.HARD)
        open_type = ActionType.OPEN_TICKET.value
        opened = []
        for step in run["action_sequence"]:
            if step["action"]["action_type"] != open_type:
                continue
            # One element per open action; the id is looked up leniently.
            opened.append(step["action"].get("ticket_id"))
        assert len(opened) >= 2, "Hard episode should open multiple tickets"

    def test_medium_priority_order_score(self) -> None:
        """The seed-0 medium episode reaches a priority-order score of at least 0.5."""
        policy = ScriptedExpert(TriageSieveEnvironment())
        run = policy.run_episode(seed=0, difficulty=TaskDifficulty.MEDIUM)
        ordering_score = run["score_breakdown"]["priority_order_score"]
        assert ordering_score >= 0.5