mitudrudutta committed on
Commit
52e9e29
·
1 Parent(s): 9ae9432

fix: address Codex adversarial review findings

Browse files

- Remove grader answer-key leaks from info/metrics: no longer expose
missing_required_evidence, harmful_evidence_attached, or coverage
percentages derived from hidden labels. Only analyst-observable
signals (deadline warnings, unqueried systems, counts) are returned.
- Remove ISO replay from default task catalog so scores and task counts
are deterministic across all deployments. ISO tasks available via
explicit list_iso_tasks() and /generate endpoint.
- Reject invalid curriculum-reset difficulty values with ValueError
instead of silently falling back to the easiest built-in task.

README.md CHANGED
@@ -323,21 +323,19 @@ Observations are designed to look like an analyst workspace rather than a toy qu
323
  - masked card numbers
324
  - deadline-relative queue summaries
325
 
326
- Each step also returns a diagnostic `info` payload with:
327
 
328
- - `deadline_warning`
329
- - `unqueried_systems`
330
- - `missing_required_evidence`
331
- - `harmful_evidence_attached`
332
- - `episode_metrics`
333
 
334
- Episode-level state tracks research-oriented metrics such as:
335
 
336
- - evidence coverage percentage
337
- - helpful evidence coverage percentage
338
- - deadline pressure index
339
- - triage efficiency
340
- - open case count
341
 
342
  ## Quick Start
343
 
 
323
  - masked card numbers
324
  - deadline-relative queue summaries
325
 
326
+ Each step also returns a diagnostic `info` payload with analyst-observable signals only (no grader answer-key leakage):
327
 
328
+ - `deadline_warning` — true when the selected case has ≤2 steps until deadline
329
+ - `unqueried_systems` — which of the 6 merchant systems haven't been queried yet
330
+ - `attached_evidence_count` / `retrieved_evidence_count` — counts without revealing quality labels
331
+ - `steps_until_deadline` — exact steps remaining for the selected case
 
332
 
333
+ Episode-level metrics track operational signals:
334
 
335
+ - deadline pressure index (fraction of cases with ≤2 steps to deadline)
336
+ - triage efficiency (resolved cases per step)
337
+ - open / resolved case counts
338
+ - total evidence attached / retrieved
 
339
 
340
  ## Quick Start
341
 
scenarios/simulation.py CHANGED
@@ -624,11 +624,16 @@ def get_task(task_id: str) -> TaskScenario:
624
 
625
 
626
  def list_tasks() -> list[TaskScenario]:
627
- """Return all benchmark tasks organised into three splits.
 
 
628
 
629
  - **Showcase** (3): hand-crafted built-in tasks for demos and README.
630
  - **Generated holdout** (7): seeded tasks never used for agent tuning.
631
- - **ISO replay** (up to 3): real chargeback data tasks when CSV is present.
 
 
 
632
  """
633
 
634
  try:
@@ -654,15 +659,16 @@ def list_tasks() -> list[TaskScenario]:
654
  generate_task(seed=77, difficulty="nightmare"),
655
  ]
656
 
657
- # --- ISO replay split (real data, when available) ---
658
- replay: list[TaskScenario] = []
 
 
 
 
 
659
  try:
660
- try:
661
- from .iso_adapter import generate_iso_suite
662
- except ImportError: # pragma: no cover
663
- from iso_adapter import generate_iso_suite
664
- replay = generate_iso_suite(easy_count=1, medium_count=1, hard_count=1)
665
- except Exception:
666
- pass
667
 
668
- return showcase + holdout + replay
 
624
 
625
 
626
  def list_tasks() -> list[TaskScenario]:
627
+ """Return the fixed benchmark task catalog.
628
+
629
+ The catalog is deterministic and identical across all deployments:
630
 
631
  - **Showcase** (3): hand-crafted built-in tasks for demos and README.
632
  - **Generated holdout** (7): seeded tasks never used for agent tuning.
633
+
634
+ ISO replay tasks are available via ``list_iso_tasks()`` and the
635
+ ``/generate`` endpoint but are excluded from the default catalog so
636
+ that scores and task counts are always comparable.
637
  """
638
 
639
  try:
 
659
  generate_task(seed=77, difficulty="nightmare"),
660
  ]
661
 
662
+ return showcase + holdout
663
+
664
+
665
+ def list_iso_tasks() -> list[TaskScenario]:
666
+ """Return ISO 20022 replay tasks. Raises on failure instead of
667
+ silently returning an empty list so data/import issues are visible."""
668
+
669
  try:
670
+ from .iso_adapter import generate_iso_suite
671
+ except ImportError: # pragma: no cover
672
+ from iso_adapter import generate_iso_suite
 
 
 
 
673
 
674
+ return generate_iso_suite(easy_count=1, medium_count=1, hard_count=1)
server/chargeback_ops_environment.py CHANGED
@@ -93,7 +93,13 @@ class ChargebackOpsEnvironment(
93
  ) -> ChargebackOpsObservation:
94
  task_id = kwargs.get("task_id")
95
  difficulty = kwargs.get("difficulty")
96
- if task_id is None and difficulty in {"easy", "medium", "hard", "nightmare"}:
 
 
 
 
 
 
97
  resolved_seed = seed if seed is not None else int(kwargs.get("generated_seed", 42))
98
  task_id = f"generated_{difficulty}_s{resolved_seed}"
99
  if task_id is None:
@@ -457,21 +463,19 @@ class ChargebackOpsEnvironment(
457
  }
458
 
459
  def _episode_metrics(self) -> dict[str, float]:
460
- required_total = 0
461
- required_attached = 0
462
- helpful_total = 0
463
- helpful_attached = 0
464
  open_cases = 0
465
  urgent_cases = 0
466
  resolved_cases = 0
 
 
467
 
468
  for case in self._task.cases:
469
  progress = self._progress_by_case[case.case_id]
470
- attached = set(progress.attached_evidence_ids)
471
- required_total += len(case.required_evidence_ids)
472
- required_attached += len(attached.intersection(case.required_evidence_ids))
473
- helpful_total += len(case.helpful_evidence_ids)
474
- helpful_attached += len(attached.intersection(case.helpful_evidence_ids))
475
  steps_until_deadline = case.deadline_step - self._state.step_count
476
  if progress.resolution_status == "open":
477
  open_cases += 1
@@ -480,41 +484,37 @@ class ChargebackOpsEnvironment(
480
  else:
481
  resolved_cases += 1
482
 
483
- evidence_coverage = 1.0 if required_total == 0 else required_attached / required_total
484
- helpful_coverage = 1.0 if helpful_total == 0 else helpful_attached / helpful_total
485
  deadline_pressure = 0.0 if len(self._task.cases) == 0 else urgent_cases / len(self._task.cases)
486
  triage_efficiency = resolved_cases / max(1, self._state.step_count)
487
  return {
488
- "evidence_coverage_pct": round(evidence_coverage * 100, 2),
489
- "helpful_evidence_coverage_pct": round(helpful_coverage * 100, 2),
490
  "deadline_pressure_index": round(deadline_pressure, 4),
491
  "triage_efficiency": round(triage_efficiency, 4),
492
- "open_case_count": float(open_cases),
 
493
  }
494
 
495
  def _selected_case_info(self) -> dict[str, object]:
 
 
 
 
496
  if self._selected_case_id is None:
497
  return {
498
  "deadline_warning": False,
499
  "unqueried_systems": [],
500
- "missing_required_evidence": [],
501
- "harmful_evidence_attached": [],
502
  }
503
 
504
  case = self._lookup_case(self._selected_case_id)
505
  progress = self._progress_by_case[case.case_id]
506
- attached = set(progress.attached_evidence_ids)
507
  all_systems = {"orders", "payment", "shipping", "support", "refunds", "risk"}
508
  return {
509
  "deadline_warning": (case.deadline_step - self._state.step_count) <= 2,
510
  "unqueried_systems": sorted(all_systems.difference(progress.revealed_systems)),
511
- "missing_required_evidence": sorted(set(case.required_evidence_ids).difference(attached)),
512
- "harmful_evidence_attached": sorted(set(case.harmful_evidence_ids).intersection(attached)),
513
- "selected_case_metrics": {
514
- "attached_evidence_count": len(progress.attached_evidence_ids),
515
- "retrieved_evidence_count": len(progress.retrieved_evidence_ids),
516
- "steps_until_deadline": case.deadline_step - self._state.step_count,
517
- },
518
  }
519
 
520
  def _build_queue(self) -> list[CaseQueueItem]:
 
93
  ) -> ChargebackOpsObservation:
94
  task_id = kwargs.get("task_id")
95
  difficulty = kwargs.get("difficulty")
96
+ _VALID_DIFFICULTIES = {"easy", "medium", "hard", "nightmare"}
97
+ if difficulty is not None and difficulty not in _VALID_DIFFICULTIES:
98
+ raise ValueError(
99
+ f"Invalid difficulty {difficulty!r}. "
100
+ f"Must be one of: {', '.join(sorted(_VALID_DIFFICULTIES))}"
101
+ )
102
+ if task_id is None and difficulty in _VALID_DIFFICULTIES:
103
  resolved_seed = seed if seed is not None else int(kwargs.get("generated_seed", 42))
104
  task_id = f"generated_{difficulty}_s{resolved_seed}"
105
  if task_id is None:
 
463
  }
464
 
465
  def _episode_metrics(self) -> dict[str, float]:
466
+ """User-observable episode metrics. Never exposes grader-internal
467
+ labels such as required/helpful/harmful evidence IDs or coverage
468
+ against the hidden answer key."""
 
469
  open_cases = 0
470
  urgent_cases = 0
471
  resolved_cases = 0
472
+ total_attached = 0
473
+ total_retrieved = 0
474
 
475
  for case in self._task.cases:
476
  progress = self._progress_by_case[case.case_id]
477
+ total_attached += len(progress.attached_evidence_ids)
478
+ total_retrieved += len(progress.retrieved_evidence_ids)
 
 
 
479
  steps_until_deadline = case.deadline_step - self._state.step_count
480
  if progress.resolution_status == "open":
481
  open_cases += 1
 
484
  else:
485
  resolved_cases += 1
486
 
 
 
487
  deadline_pressure = 0.0 if len(self._task.cases) == 0 else urgent_cases / len(self._task.cases)
488
  triage_efficiency = resolved_cases / max(1, self._state.step_count)
489
  return {
490
+ "open_case_count": float(open_cases),
491
+ "resolved_case_count": float(resolved_cases),
492
  "deadline_pressure_index": round(deadline_pressure, 4),
493
  "triage_efficiency": round(triage_efficiency, 4),
494
+ "total_evidence_attached": float(total_attached),
495
+ "total_evidence_retrieved": float(total_retrieved),
496
  }
497
 
498
  def _selected_case_info(self) -> dict[str, object]:
499
+ """Per-case diagnostic info visible to agents. Only exposes
500
+ signals an analyst could observe (deadline proximity, which
501
+ systems haven't been queried, counts). Does NOT expose which
502
+ evidence IDs are required, helpful, or harmful."""
503
  if self._selected_case_id is None:
504
  return {
505
  "deadline_warning": False,
506
  "unqueried_systems": [],
 
 
507
  }
508
 
509
  case = self._lookup_case(self._selected_case_id)
510
  progress = self._progress_by_case[case.case_id]
 
511
  all_systems = {"orders", "payment", "shipping", "support", "refunds", "risk"}
512
  return {
513
  "deadline_warning": (case.deadline_step - self._state.step_count) <= 2,
514
  "unqueried_systems": sorted(all_systems.difference(progress.revealed_systems)),
515
+ "attached_evidence_count": len(progress.attached_evidence_ids),
516
+ "retrieved_evidence_count": len(progress.retrieved_evidence_ids),
517
+ "steps_until_deadline": case.deadline_step - self._state.step_count,
 
 
 
 
518
  }
519
 
520
  def _build_queue(self) -> list[CaseQueueItem]: