# ChargeBackOps / tests / test_grader.py
# pauldebanshu19's picture
# feat: Implement multi-round dispute lifecycle with arbitration scoring and related tests
# b7aa1f0
from evaluation.grading import grade_episode
from evaluation.rubrics import (
CASE_DIMENSION_WEIGHTS,
ChargebackOpsEpisodeRubric,
)
from server.chargeback_ops_environment import ChargebackOpsEnvironment
from scenarios.simulation import get_task
def test_grade_episode_bounds():
    """Grading a freshly reset, unfinished episode yields a score within [0, 1]."""
    task_id = "queue_optimization_hard"
    environment = ChargebackOpsEnvironment()
    environment.reset(task_id=task_id)

    # Gather the grader inputs in the same order they are passed below.
    task = get_task(task_id)
    progress = environment._progress_by_case  # type: ignore[attr-defined]
    steps_taken = environment.state.step_count
    episode_id = environment.state.episode_id or ""

    report = grade_episode(task, progress, steps_taken, episode_id, completed=False)

    score = report.normalized_score
    assert 0.0 <= score <= 1.0
def test_environment_exposes_rubric_tree():
    """The env must wire an OpenEnv Rubric that exposes all 8 scoring dimensions.

    Checks that ``env.rubric`` is a ``ChargebackOpsEpisodeRubric``, that its
    ``named_rubrics()`` include the case rubric, its aggregator and one
    ``rubric_<i>`` child per dimension, and that ``CASE_DIMENSION_WEIGHTS``
    holds one weight per dimension with the weights summing to 1.0.
    """
    # Hard-coded on purpose, and kept in one place, so that adding or removing
    # a scoring dimension requires a deliberate change to this test.
    dimension_count = 8

    env = ChargebackOpsEnvironment()
    assert isinstance(env.rubric, ChargebackOpsEpisodeRubric)

    names = {name for name, _ in env.rubric.named_rubrics()}
    expected = {
        "case_rubric",
        "case_rubric.aggregator",
        *(f"case_rubric.aggregator.rubric_{i}" for i in range(dimension_count)),
    }
    # Report exactly which nodes are absent instead of a bare False.
    missing = expected - names
    assert not missing, f"rubric tree is missing: {sorted(missing)}"

    # Weights must sum to 1.0 (WeightedSum enforces this at construction but
    # we lock the constant here so weight changes stay intentional).
    assert abs(sum(CASE_DIMENSION_WEIGHTS) - 1.0) < 1e-6
    assert len(CASE_DIMENSION_WEIGHTS) == dimension_count