Spaces:
Sleeping
Sleeping
Commit ·
52e9e29
1
Parent(s): 9ae9432
fix: address Codex adversarial review findings
Browse files
- Remove grader answer-key leaks from info/metrics: no longer expose
missing_required_evidence, harmful_evidence_attached, or coverage
percentages derived from hidden labels. Only analyst-observable
signals (deadline warnings, unqueried systems, counts) are returned.
- Remove ISO replay tasks from the default task catalog so scores and task
counts are deterministic across all deployments. ISO tasks remain available
via an explicit list_iso_tasks() call and the /generate endpoint.
- Reject invalid curriculum-reset difficulty values with ValueError
instead of silently falling back to the easiest built-in task.
- README.md +10 -12
- scenarios/simulation.py +18 -12
- server/chargeback_ops_environment.py +25 -25
README.md
CHANGED
|
@@ -323,21 +323,19 @@ Observations are designed to look like an analyst workspace rather than a toy qu
|
|
| 323 |
- masked card numbers
|
| 324 |
- deadline-relative queue summaries
|
| 325 |
|
| 326 |
-
Each step also returns a diagnostic `info` payload with:
|
| 327 |
|
| 328 |
-
- `deadline_warning`
|
| 329 |
-
- `unqueried_systems`
|
| 330 |
-
- `
|
| 331 |
-
- `
|
| 332 |
-
- `episode_metrics`
|
| 333 |
|
| 334 |
-
Episode-level
|
| 335 |
|
| 336 |
-
-
|
| 337 |
-
-
|
| 338 |
-
-
|
| 339 |
-
-
|
| 340 |
-
- open case count
|
| 341 |
|
| 342 |
## Quick Start
|
| 343 |
|
|
|
|
| 323 |
- masked card numbers
|
| 324 |
- deadline-relative queue summaries
|
| 325 |
|
| 326 |
+
Each step also returns a diagnostic `info` payload with analyst-observable signals only (no grader answer-key leakage):
|
| 327 |
|
| 328 |
+
- `deadline_warning` — true when the selected case has ≤2 steps until deadline
|
| 329 |
+
- `unqueried_systems` — which of the 6 merchant systems haven't been queried yet
|
| 330 |
+
- `attached_evidence_count` / `retrieved_evidence_count` — counts without revealing quality labels
|
| 331 |
+
- `steps_until_deadline` — exact steps remaining for the selected case
|
|
|
|
| 332 |
|
| 333 |
+
Episode-level metrics track operational signals:
|
| 334 |
|
| 335 |
+
- deadline pressure index (fraction of cases with ≤2 steps to deadline)
|
| 336 |
+
- triage efficiency (resolved cases per step)
|
| 337 |
+
- open / resolved case counts
|
| 338 |
+
- total evidence attached / retrieved
|
|
|
|
| 339 |
|
| 340 |
## Quick Start
|
| 341 |
|
scenarios/simulation.py
CHANGED
|
@@ -624,11 +624,16 @@ def get_task(task_id: str) -> TaskScenario:
|
|
| 624 |
|
| 625 |
|
| 626 |
def list_tasks() -> list[TaskScenario]:
|
| 627 |
-
"""Return
|
|
|
|
|
|
|
| 628 |
|
| 629 |
- **Showcase** (3): hand-crafted built-in tasks for demos and README.
|
| 630 |
- **Generated holdout** (7): seeded tasks never used for agent tuning.
|
| 631 |
-
|
|
|
|
|
|
|
|
|
|
| 632 |
"""
|
| 633 |
|
| 634 |
try:
|
|
@@ -654,15 +659,16 @@ def list_tasks() -> list[TaskScenario]:
|
|
| 654 |
generate_task(seed=77, difficulty="nightmare"),
|
| 655 |
]
|
| 656 |
|
| 657 |
-
|
| 658 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 659 |
try:
|
| 660 |
-
|
| 661 |
-
|
| 662 |
-
|
| 663 |
-
from iso_adapter import generate_iso_suite
|
| 664 |
-
replay = generate_iso_suite(easy_count=1, medium_count=1, hard_count=1)
|
| 665 |
-
except Exception:
|
| 666 |
-
pass
|
| 667 |
|
| 668 |
-
return
|
|
|
|
| 624 |
|
| 625 |
|
| 626 |
def list_tasks() -> list[TaskScenario]:
|
| 627 |
+
"""Return the fixed benchmark task catalog.
|
| 628 |
+
|
| 629 |
+
The catalog is deterministic and identical across all deployments:
|
| 630 |
|
| 631 |
- **Showcase** (3): hand-crafted built-in tasks for demos and README.
|
| 632 |
- **Generated holdout** (7): seeded tasks never used for agent tuning.
|
| 633 |
+
|
| 634 |
+
ISO replay tasks are available via ``list_iso_tasks()`` and the
|
| 635 |
+
``/generate`` endpoint but are excluded from the default catalog so
|
| 636 |
+
that scores and task counts are always comparable.
|
| 637 |
"""
|
| 638 |
|
| 639 |
try:
|
|
|
|
| 659 |
generate_task(seed=77, difficulty="nightmare"),
|
| 660 |
]
|
| 661 |
|
| 662 |
+
return showcase + holdout
|
| 663 |
+
|
| 664 |
+
|
| 665 |
+
def list_iso_tasks() -> list[TaskScenario]:
|
| 666 |
+
"""Return ISO 20022 replay tasks. Raises on failure instead of
|
| 667 |
+
silently returning an empty list so data/import issues are visible."""
|
| 668 |
+
|
| 669 |
try:
|
| 670 |
+
from .iso_adapter import generate_iso_suite
|
| 671 |
+
except ImportError: # pragma: no cover
|
| 672 |
+
from iso_adapter import generate_iso_suite
|
|
|
|
|
|
|
|
|
|
|
|
|
| 673 |
|
| 674 |
+
return generate_iso_suite(easy_count=1, medium_count=1, hard_count=1)
|
server/chargeback_ops_environment.py
CHANGED
|
@@ -93,7 +93,13 @@ class ChargebackOpsEnvironment(
|
|
| 93 |
) -> ChargebackOpsObservation:
|
| 94 |
task_id = kwargs.get("task_id")
|
| 95 |
difficulty = kwargs.get("difficulty")
|
| 96 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 97 |
resolved_seed = seed if seed is not None else int(kwargs.get("generated_seed", 42))
|
| 98 |
task_id = f"generated_{difficulty}_s{resolved_seed}"
|
| 99 |
if task_id is None:
|
|
@@ -457,21 +463,19 @@ class ChargebackOpsEnvironment(
|
|
| 457 |
}
|
| 458 |
|
| 459 |
def _episode_metrics(self) -> dict[str, float]:
|
| 460 |
-
|
| 461 |
-
|
| 462 |
-
|
| 463 |
-
helpful_attached = 0
|
| 464 |
open_cases = 0
|
| 465 |
urgent_cases = 0
|
| 466 |
resolved_cases = 0
|
|
|
|
|
|
|
| 467 |
|
| 468 |
for case in self._task.cases:
|
| 469 |
progress = self._progress_by_case[case.case_id]
|
| 470 |
-
|
| 471 |
-
|
| 472 |
-
required_attached += len(attached.intersection(case.required_evidence_ids))
|
| 473 |
-
helpful_total += len(case.helpful_evidence_ids)
|
| 474 |
-
helpful_attached += len(attached.intersection(case.helpful_evidence_ids))
|
| 475 |
steps_until_deadline = case.deadline_step - self._state.step_count
|
| 476 |
if progress.resolution_status == "open":
|
| 477 |
open_cases += 1
|
|
@@ -480,41 +484,37 @@ class ChargebackOpsEnvironment(
|
|
| 480 |
else:
|
| 481 |
resolved_cases += 1
|
| 482 |
|
| 483 |
-
evidence_coverage = 1.0 if required_total == 0 else required_attached / required_total
|
| 484 |
-
helpful_coverage = 1.0 if helpful_total == 0 else helpful_attached / helpful_total
|
| 485 |
deadline_pressure = 0.0 if len(self._task.cases) == 0 else urgent_cases / len(self._task.cases)
|
| 486 |
triage_efficiency = resolved_cases / max(1, self._state.step_count)
|
| 487 |
return {
|
| 488 |
-
"
|
| 489 |
-
"
|
| 490 |
"deadline_pressure_index": round(deadline_pressure, 4),
|
| 491 |
"triage_efficiency": round(triage_efficiency, 4),
|
| 492 |
-
"
|
|
|
|
| 493 |
}
|
| 494 |
|
| 495 |
def _selected_case_info(self) -> dict[str, object]:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 496 |
if self._selected_case_id is None:
|
| 497 |
return {
|
| 498 |
"deadline_warning": False,
|
| 499 |
"unqueried_systems": [],
|
| 500 |
-
"missing_required_evidence": [],
|
| 501 |
-
"harmful_evidence_attached": [],
|
| 502 |
}
|
| 503 |
|
| 504 |
case = self._lookup_case(self._selected_case_id)
|
| 505 |
progress = self._progress_by_case[case.case_id]
|
| 506 |
-
attached = set(progress.attached_evidence_ids)
|
| 507 |
all_systems = {"orders", "payment", "shipping", "support", "refunds", "risk"}
|
| 508 |
return {
|
| 509 |
"deadline_warning": (case.deadline_step - self._state.step_count) <= 2,
|
| 510 |
"unqueried_systems": sorted(all_systems.difference(progress.revealed_systems)),
|
| 511 |
-
"
|
| 512 |
-
"
|
| 513 |
-
"
|
| 514 |
-
"attached_evidence_count": len(progress.attached_evidence_ids),
|
| 515 |
-
"retrieved_evidence_count": len(progress.retrieved_evidence_ids),
|
| 516 |
-
"steps_until_deadline": case.deadline_step - self._state.step_count,
|
| 517 |
-
},
|
| 518 |
}
|
| 519 |
|
| 520 |
def _build_queue(self) -> list[CaseQueueItem]:
|
|
|
|
| 93 |
) -> ChargebackOpsObservation:
|
| 94 |
task_id = kwargs.get("task_id")
|
| 95 |
difficulty = kwargs.get("difficulty")
|
| 96 |
+
_VALID_DIFFICULTIES = {"easy", "medium", "hard", "nightmare"}
|
| 97 |
+
if difficulty is not None and difficulty not in _VALID_DIFFICULTIES:
|
| 98 |
+
raise ValueError(
|
| 99 |
+
f"Invalid difficulty {difficulty!r}. "
|
| 100 |
+
f"Must be one of: {', '.join(sorted(_VALID_DIFFICULTIES))}"
|
| 101 |
+
)
|
| 102 |
+
if task_id is None and difficulty in _VALID_DIFFICULTIES:
|
| 103 |
resolved_seed = seed if seed is not None else int(kwargs.get("generated_seed", 42))
|
| 104 |
task_id = f"generated_{difficulty}_s{resolved_seed}"
|
| 105 |
if task_id is None:
|
|
|
|
| 463 |
}
|
| 464 |
|
| 465 |
def _episode_metrics(self) -> dict[str, float]:
|
| 466 |
+
"""User-observable episode metrics. Never exposes grader-internal
|
| 467 |
+
labels such as required/helpful/harmful evidence IDs or coverage
|
| 468 |
+
against the hidden answer key."""
|
|
|
|
| 469 |
open_cases = 0
|
| 470 |
urgent_cases = 0
|
| 471 |
resolved_cases = 0
|
| 472 |
+
total_attached = 0
|
| 473 |
+
total_retrieved = 0
|
| 474 |
|
| 475 |
for case in self._task.cases:
|
| 476 |
progress = self._progress_by_case[case.case_id]
|
| 477 |
+
total_attached += len(progress.attached_evidence_ids)
|
| 478 |
+
total_retrieved += len(progress.retrieved_evidence_ids)
|
|
|
|
|
|
|
|
|
|
| 479 |
steps_until_deadline = case.deadline_step - self._state.step_count
|
| 480 |
if progress.resolution_status == "open":
|
| 481 |
open_cases += 1
|
|
|
|
| 484 |
else:
|
| 485 |
resolved_cases += 1
|
| 486 |
|
|
|
|
|
|
|
| 487 |
deadline_pressure = 0.0 if len(self._task.cases) == 0 else urgent_cases / len(self._task.cases)
|
| 488 |
triage_efficiency = resolved_cases / max(1, self._state.step_count)
|
| 489 |
return {
|
| 490 |
+
"open_case_count": float(open_cases),
|
| 491 |
+
"resolved_case_count": float(resolved_cases),
|
| 492 |
"deadline_pressure_index": round(deadline_pressure, 4),
|
| 493 |
"triage_efficiency": round(triage_efficiency, 4),
|
| 494 |
+
"total_evidence_attached": float(total_attached),
|
| 495 |
+
"total_evidence_retrieved": float(total_retrieved),
|
| 496 |
}
|
| 497 |
|
| 498 |
def _selected_case_info(self) -> dict[str, object]:
|
| 499 |
+
"""Per-case diagnostic info visible to agents. Only exposes
|
| 500 |
+
signals an analyst could observe (deadline proximity, which
|
| 501 |
+
systems haven't been queried, counts). Does NOT expose which
|
| 502 |
+
evidence IDs are required, helpful, or harmful."""
|
| 503 |
if self._selected_case_id is None:
|
| 504 |
return {
|
| 505 |
"deadline_warning": False,
|
| 506 |
"unqueried_systems": [],
|
|
|
|
|
|
|
| 507 |
}
|
| 508 |
|
| 509 |
case = self._lookup_case(self._selected_case_id)
|
| 510 |
progress = self._progress_by_case[case.case_id]
|
|
|
|
| 511 |
all_systems = {"orders", "payment", "shipping", "support", "refunds", "risk"}
|
| 512 |
return {
|
| 513 |
"deadline_warning": (case.deadline_step - self._state.step_count) <= 2,
|
| 514 |
"unqueried_systems": sorted(all_systems.difference(progress.revealed_systems)),
|
| 515 |
+
"attached_evidence_count": len(progress.attached_evidence_ids),
|
| 516 |
+
"retrieved_evidence_count": len(progress.retrieved_evidence_ids),
|
| 517 |
+
"steps_until_deadline": case.deadline_step - self._state.step_count,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 518 |
}
|
| 519 |
|
| 520 |
def _build_queue(self) -> list[CaseQueueItem]:
|