File size: 1,378 Bytes
95f11da
c8ebaee
 
 
 
379f291
95f11da
379f291
 
 
 
 
 
 
 
 
 
 
 
 
c8ebaee
 
 
 
 
 
 
 
 
 
 
 
b7aa1f0
c8ebaee
 
 
 
 
 
b7aa1f0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
from evaluation.grading import grade_episode
from evaluation.rubrics import (
    CASE_DIMENSION_WEIGHTS,
    ChargebackOpsEpisodeRubric,
)
from server.chargeback_ops_environment import ChargebackOpsEnvironment
from scenarios.simulation import get_task


def test_grade_episode_bounds():
    """Grading a freshly reset, not-yet-completed episode yields a score in [0, 1]."""
    task_id = "queue_optimization_hard"

    environment = ChargebackOpsEnvironment()
    environment.reset(task_id=task_id)

    # Collect the grader inputs up front so the call below reads as plain data.
    task = get_task(task_id)
    progress = environment._progress_by_case  # type: ignore[attr-defined]
    steps_taken = environment.state.step_count
    episode_id = environment.state.episode_id or ""  # grader receives "" when the id is falsy

    graded = grade_episode(task, progress, steps_taken, episode_id, completed=False)

    score = graded.normalized_score
    assert 0.0 <= score <= 1.0


def test_environment_exposes_rubric_tree():
    """The env must wire an OpenEnv Rubric that exposes all 8 scoring dimensions.

    Checks three things:

    * ``env.rubric`` is a ``ChargebackOpsEpisodeRubric``;
    * its ``named_rubrics()`` tree contains ``case_rubric``, the
      ``case_rubric.aggregator`` node and one ``rubric_<i>`` child per
      scoring dimension;
    * ``CASE_DIMENSION_WEIGHTS`` has one weight per dimension and the weights
      sum to 1.0 (within 1e-6).
    """
    # Deliberately hard-coded rather than derived from CASE_DIMENSION_WEIGHTS:
    # adding or removing a dimension should require touching this test. Kept in
    # one place so the tree check and the weight-count check cannot drift apart.
    expected_dimension_count = 8

    env = ChargebackOpsEnvironment()
    assert isinstance(env.rubric, ChargebackOpsEpisodeRubric)

    names = {name for name, _ in env.rubric.named_rubrics()}
    expected = {
        "case_rubric",
        "case_rubric.aggregator",
        *(
            f"case_rubric.aggregator.rubric_{i}"
            for i in range(expected_dimension_count)
        ),
    }
    # Report exactly which nodes are absent instead of a bare False.
    missing = expected - names
    assert not missing, f"rubric tree is missing: {sorted(missing)}"

    # Weights must sum to 1.0 (WeightedSum enforces this at construction but
    # we lock the constant here so weight changes stay intentional).
    assert abs(sum(CASE_DIMENSION_WEIGHTS) - 1.0) < 1e-6
    assert len(CASE_DIMENSION_WEIGHTS) == expected_dimension_count