# NOTE: page-scrape residue from the file viewer, not part of the module:
#   "Spaces: Sleeping", "File size: 1,378 Bytes",
#   revision hashes 95f11da c8ebaee 379f291 b7aa1f0, and a 1-41 line-number gutter.
from evaluation.grading import grade_episode
from evaluation.rubrics import (
CASE_DIMENSION_WEIGHTS,
ChargebackOpsEpisodeRubric,
)
from server.chargeback_ops_environment import ChargebackOpsEnvironment
from scenarios.simulation import get_task
def test_grade_episode_bounds():
    """Grading a freshly reset, unfinished episode yields a score within [0, 1]."""
    task_id = "queue_optimization_hard"

    environment = ChargebackOpsEnvironment()
    environment.reset(task_id=task_id)

    # Gather the grader inputs one at a time, in the order the call consumes them.
    task = get_task(task_id)
    progress = environment._progress_by_case  # type: ignore[attr-defined]
    steps_taken = environment.state.step_count
    # The grader is handed an empty string when the episode id is falsy.
    episode_id = environment.state.episode_id or ""

    report = grade_episode(task, progress, steps_taken, episode_id, completed=False)

    score = report.normalized_score
    assert 0.0 <= score <= 1.0
def test_environment_exposes_rubric_tree():
    """The env must wire an OpenEnv Rubric that exposes all 8 scoring dimensions.

    Checks that ``env.rubric`` is a ``ChargebackOpsEpisodeRubric``, that its
    ``named_rubrics()`` listing contains the case rubric, its aggregator and
    one ``rubric_<i>`` child per dimension, and that ``CASE_DIMENSION_WEIGHTS``
    holds one weight per dimension with the weights summing to 1.0.
    """
    # Single place for the dimension count so the expected child-rubric names
    # and the weight-length check below cannot drift apart.
    dimension_count = 8

    env = ChargebackOpsEnvironment()
    assert isinstance(env.rubric, ChargebackOpsEpisodeRubric)

    names = {name for name, _ in env.rubric.named_rubrics()}
    expected = {
        "case_rubric",
        "case_rubric.aggregator",
        *(f"case_rubric.aggregator.rubric_{i}" for i in range(dimension_count)),
    }
    # Same pass/fail condition as a subset check, but a failure lists exactly
    # which rubric names are absent.
    missing = expected - names
    assert not missing, f"rubric tree is missing: {sorted(missing)}"

    # Weights must sum to 1.0 (WeightedSum enforces this at construction but
    # we lock the constant here so weight changes stay intentional).
    assert abs(sum(CASE_DIMENSION_WEIGHTS) - 1.0) < 1e-6
    assert len(CASE_DIMENSION_WEIGHTS) == dimension_count
|