Spaces:
Sleeping
Sleeping
| from __future__ import annotations | |
| from pathlib import Path | |
| from runners.baseline_runner import _heuristic_pick, _obvious_next_action, candidate_actions | |
| from evaluation.grading import grade_episode | |
| from core.models import ChargebackOpsAction | |
| from server.chargeback_ops_environment import ChargebackOpsEnvironment | |
| from scenarios.simulation import get_task, list_tasks | |
def _run_heuristic_episode(task_id: str) -> tuple[float, float]:
    """Play one full episode of ``task_id`` with the baseline heuristic.

    At every step the candidate actions are derived from the dumped
    observation and the one chosen by ``_heuristic_pick`` is executed.

    Returns:
        ``(total_reward, normalized_score)``: the sum of per-step rewards
        rounded to four decimals, and the normalized score taken from the
        final observation's grader report.
    """
    environment = ChargebackOpsEnvironment()
    obs = environment.reset(task_id=task_id)
    reward_sum = 0.0

    while not obs.done:
        options = candidate_actions(obs.model_dump())
        assert options, f"No candidate actions available for task {task_id}"
        chosen = _heuristic_pick(options)
        obs = environment.step(chosen.action)
        # A step may report no reward; count it as zero.
        reward_sum += obs.reward or 0.0

    report = obs.grader_report
    assert report is not None
    return round(reward_sum, 4), report.normalized_score
def _run_bad_episode(task_id: str) -> tuple[float, float]:
    """Play one full episode of ``task_id`` with a deliberately poor policy.

    Policy, evaluated on every step until the observation reports ``done``:

    * no case selected -> select the first case in the queue whose status
      is ``"open"``;
    * visible case has no strategy -> set strategy ``accept_chargeback``;
    * visible case strategy is ``accept_chargeback`` -> resolve the case
      with that strategy;
    * anything else (including a missing visible case) -> query the
      ``payment`` system.

    Returns:
        ``(total_reward, normalized_score)``: the sum of per-step rewards
        rounded to four decimals, and the normalized score taken from the
        final observation's grader report.

    Raises:
        AssertionError: if the episode is not done, no case is selected and
            the queue holds no open case; or if the final observation has
            no grader report.
    """
    env = ChargebackOpsEnvironment()
    observation = env.reset(task_id=task_id)
    total_reward = 0.0

    while not observation.done:
        case_id = observation.selected_case_id
        if case_id is None:
            # Use a default so an exhausted queue fails with a readable
            # message instead of a bare StopIteration.
            open_case = next(
                (case for case in observation.queue if case.status == "open"),
                None,
            )
            assert open_case is not None, (
                f"Task {task_id} is not done but its queue has no open case"
            )
            action = ChargebackOpsAction(
                action_type="select_case",
                case_id=open_case.case_id,
            )
        else:
            visible_case = observation.visible_case
            if visible_case and visible_case.current_strategy is None:
                action = ChargebackOpsAction(
                    action_type="set_strategy",
                    case_id=case_id,
                    strategy="accept_chargeback",
                )
            elif visible_case and visible_case.current_strategy == "accept_chargeback":
                action = ChargebackOpsAction(
                    action_type="resolve_case",
                    case_id=case_id,
                    strategy="accept_chargeback",
                )
            else:
                action = ChargebackOpsAction(
                    action_type="query_system",
                    case_id=case_id,
                    system_name="payment",
                )
        observation = env.step(action)
        # A step may report no reward; count it as zero.
        total_reward += observation.reward or 0.0

    assert observation.grader_report is not None
    return round(total_reward, 4), observation.grader_report.normalized_score
def test_problem_statement_task_catalog():
    """The catalog lists at least three tasks covering easy, medium and hard."""
    catalog = list_tasks()
    assert len(catalog) >= 3

    difficulties = {entry.difficulty for entry in catalog}
    assert {"easy", "medium", "hard"} <= difficulties
def test_problem_statement_reset_and_state_cleanliness():
    """A second ``reset`` starts a new episode with no state carried over."""
    environment = ChargebackOpsEnvironment()

    # Start one episode and take a step so there is state to be cleared.
    initial_obs = environment.reset(task_id="goods_not_received_easy")
    initial_episode_id = environment.state.episode_id
    environment.step(
        ChargebackOpsAction(action_type="select_case", case_id="CB-E1")
    )

    # Reset onto a different task.
    fresh_obs = environment.reset(task_id="fraud_signal_ambiguity")

    assert initial_obs.done is False
    assert fresh_obs.task_id == "fraud_signal_ambiguity"

    state = environment.state
    assert state.task_id == "fraud_signal_ambiguity"
    assert state.step_count == 0
    assert state.action_history == []
    assert state.selected_case_id is None
    assert state.episode_id != initial_episode_id
def test_problem_statement_grader_is_deterministic():
    """Grading the same freshly reset episode twice gives identical reports.

    Also checks that the normalized score lies in ``[0.0, 1.0]``.
    """
    task_name = "queue_optimization_hard"
    environment = ChargebackOpsEnvironment()
    environment.reset(task_id=task_name)
    task_spec = get_task(task_name)

    def _grade():
        # Re-read every input from the environment on each call.
        return grade_episode(
            task_spec,
            environment._progress_by_case,  # type: ignore[attr-defined]
            environment.state.step_count,
            environment.state.episode_id or "",
            completed=False,
        )

    first_report = _grade()
    second_report = _grade()

    assert first_report.model_dump() == second_report.model_dump()
    assert 0.0 <= first_report.normalized_score <= 1.0
def test_problem_statement_reward_signal_has_partial_progress_and_penalties():
    """Step rewards are positive for progress and negative for missteps.

    On case ``CB-M1`` of ``fraud_signal_ambiguity`` the test expects:
    the first ``orders`` query to earn a positive reward, the repeated
    ``orders`` query to earn a negative reward, and adding evidence
    ``M1-AVS-MISMATCH`` to earn a negative reward.
    """
    target_case = "CB-M1"
    env = ChargebackOpsEnvironment()
    env.reset(task_id="fraud_signal_ambiguity")
    env.step(ChargebackOpsAction(action_type="select_case", case_id=target_case))

    def _query_orders():
        return env.step(
            ChargebackOpsAction(
                action_type="query_system",
                case_id=target_case,
                system_name="orders",
            )
        )

    helpful = _query_orders()
    duplicate = _query_orders()
    harmful = env.step(
        ChargebackOpsAction(
            action_type="add_evidence",
            case_id=target_case,
            evidence_ids=["M1-AVS-MISMATCH"],
        )
    )

    # A missing reward is treated as zero, which fails all three checks.
    assert (helpful.reward or 0.0) > 0
    assert (duplicate.reward or 0.0) < 0
    assert (harmful.reward or 0.0) < 0
def test_problem_statement_agent_signal_distinguishes_good_from_bad():
    """The heuristic policy beats the bad policy on both score and reward."""
    task_name = "queue_optimization_hard"
    good_reward, good_score = _run_heuristic_episode(task_name)
    poor_reward, poor_score = _run_bad_episode(task_name)

    assert good_score > poor_score
    assert good_reward > poor_reward
def test_problem_statement_live_agent_budget_targets_real_branches():
    """Ambiguous decision points stay within a budget of three per task.

    Every task is played to completion, taking the action returned by
    ``_obvious_next_action`` when there is one and the ``_heuristic_pick``
    choice otherwise. A state counts as ambiguous when it offers more than
    one candidate and ``_obvious_next_action`` returns ``None``. The total
    across all tasks must not exceed ``3 * number_of_tasks``.
    """
    # Fetch the catalog once; it is used for both iteration and the budget.
    tasks = list_tasks()
    ambiguous_states = 0

    for task in tasks:
        env = ChargebackOpsEnvironment()
        observation = env.reset(task_id=task.task_id)
        while not observation.done:
            payload = observation.model_dump()
            candidates = candidate_actions(payload)
            assert candidates, (
                f"No candidate actions available for task {task.task_id}"
            )
            # Evaluate once and reuse for both the count and the choice.
            obvious = _obvious_next_action(payload, candidates)
            if obvious is None and len(candidates) > 1:
                ambiguous_states += 1
            candidate = obvious or _heuristic_pick(candidates)
            observation = env.step(candidate.action)

    assert ambiguous_states <= len(tasks) * 3
def test_problem_statement_inference_contract_exists():
    """``runners/inference.py`` mentions the OpenAI client and its settings.

    The file is read relative to the current working directory and must
    contain the OpenAI import line plus the names ``API_BASE_URL``,
    ``MODEL_NAME`` and ``HF_TOKEN``.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale encoding.
    content = Path("runners/inference.py").read_text(encoding="utf-8")

    required_markers = (
        "from openai import OpenAI",
        "API_BASE_URL",
        "MODEL_NAME",
        "HF_TOKEN",
    )
    for marker in required_markers:
        assert marker in content, f"runners/inference.py is missing {marker!r}"