mitudrudutta committed on
Commit
9e6686d
·
1 Parent(s): 64cb3ce

feat: add adversarial evidence, nightmare difficulty, and benchmark splits

Browse files

- Add adversarial evidence templates where titles sound helpful but
content is harmful (delivery verification, account review, etc.)
- Add nightmare difficulty tier: 5-6 cases, step budget of max(12, 3 × cases)
(3 steps/case at the default case counts), adversarial templates mandatory
- Structure tasks into showcase/holdout/replay benchmark splits
- Wire ISO replay cases into list_tasks() when data is available
- Difficulty curve: easy 0.96, medium 0.80, hard 0.70, nightmare 0.54

core/models.py CHANGED
@@ -26,6 +26,12 @@ class CaseQueueItem(BaseModel):
26
  """Queue-level summary of a chargeback case."""
27
 
28
  case_id: str
 
 
 
 
 
 
29
  amount: float
30
  currency: str
31
  reason_code: str
@@ -57,8 +63,14 @@ class VisibleCase(BaseModel):
57
  """Current workspace for the selected case."""
58
 
59
  case_id: str
 
 
 
60
  order_id: str
61
  customer_id: str
 
 
 
62
  amount: float
63
  currency: str
64
  reason_code: str
@@ -78,7 +90,7 @@ class TaskSummary(BaseModel):
78
 
79
  task_id: str
80
  title: str
81
- difficulty: Literal["easy", "medium", "hard"]
82
  objective: str
83
  description: str
84
  max_steps: int
@@ -193,7 +205,7 @@ class ChargebackOpsObservation(Observation):
193
 
194
  task_id: str
195
  task_title: str
196
- difficulty: Literal["easy", "medium", "hard"]
197
  objective: str
198
  selected_case_id: str | None = None
199
  queue: list[CaseQueueItem] = Field(default_factory=list)
@@ -202,6 +214,7 @@ class ChargebackOpsObservation(Observation):
202
  available_actions: list[str] = Field(default_factory=list)
203
  steps_remaining: int
204
  progress_score: float = 0.0
 
205
  grader_report: GraderReport | None = None
206
 
207
 
@@ -210,11 +223,12 @@ class ChargebackOpsState(State):
210
 
211
  task_id: str
212
  task_title: str
213
- difficulty: Literal["easy", "medium", "hard"]
214
  objective: str
215
  selected_case_id: str | None = None
216
  queue_state: list[CaseResolutionState] = Field(default_factory=list)
217
  action_history: list[ActionTraceItem] = Field(default_factory=list)
 
218
  latest_grade: float | None = None
219
  grader_report: GraderReport | None = None
220
  completed: bool = False
 
26
  """Queue-level summary of a chargeback case."""
27
 
28
  case_id: str
29
+ transaction_id: str
30
+ transaction_timestamp: str
31
+ dispute_opened_at: str
32
+ merchant_name: str
33
+ merchant_mcc: str
34
+ masked_card: str
35
  amount: float
36
  currency: str
37
  reason_code: str
 
63
  """Current workspace for the selected case."""
64
 
65
  case_id: str
66
+ transaction_id: str
67
+ transaction_timestamp: str
68
+ dispute_opened_at: str
69
  order_id: str
70
  customer_id: str
71
+ merchant_name: str
72
+ merchant_mcc: str
73
+ masked_card: str
74
  amount: float
75
  currency: str
76
  reason_code: str
 
90
 
91
  task_id: str
92
  title: str
93
+ difficulty: Literal["easy", "medium", "hard", "nightmare"]
94
  objective: str
95
  description: str
96
  max_steps: int
 
205
 
206
  task_id: str
207
  task_title: str
208
+ difficulty: Literal["easy", "medium", "hard", "nightmare"]
209
  objective: str
210
  selected_case_id: str | None = None
211
  queue: list[CaseQueueItem] = Field(default_factory=list)
 
214
  available_actions: list[str] = Field(default_factory=list)
215
  steps_remaining: int
216
  progress_score: float = 0.0
217
+ info: dict[str, Any] = Field(default_factory=dict)
218
  grader_report: GraderReport | None = None
219
 
220
 
 
223
 
224
  task_id: str
225
  task_title: str
226
+ difficulty: Literal["easy", "medium", "hard", "nightmare"]
227
  objective: str
228
  selected_case_id: str | None = None
229
  queue_state: list[CaseResolutionState] = Field(default_factory=list)
230
  action_history: list[ActionTraceItem] = Field(default_factory=list)
231
+ metrics: dict[str, float] = Field(default_factory=dict)
232
  latest_grade: float | None = None
233
  grader_report: GraderReport | None = None
234
  completed: bool = False
scenarios/case_generator.py CHANGED
@@ -511,6 +511,179 @@ _PRODUCT_NOT_AS_DESCRIBED = _CaseTemplate(
511
  )
512
 
513
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
514
  _SERVICE_NOT_PROVIDED = _CaseTemplate(
515
  reason_code="service_not_provided",
516
  summaries=(
@@ -586,6 +759,11 @@ _CONTESTABLE_TEMPLATES: tuple[_CaseTemplate, ...] = (
586
  _SERVICE_NOT_PROVIDED,
587
  )
588
 
 
 
 
 
 
589
  _CONCEDABLE_TEMPLATES: tuple[_CaseTemplate, ...] = (
590
  _FRAUD_CNP_WEAK,
591
  _CREDIT_NOT_PROCESSED,
@@ -712,7 +890,7 @@ def generate_case(
712
  def generate_task(
713
  seed: int,
714
  *,
715
- difficulty: Literal["easy", "medium", "hard"] = "medium",
716
  case_count: int | None = None,
717
  ) -> TaskScenario:
718
  """Generate a full task scenario from a seed.
@@ -723,17 +901,29 @@ def generate_task(
723
  Deterministic seed — same seed always produces the same task.
724
  difficulty:
725
  Controls step budget, deadline pressure, and case count defaults.
 
726
  case_count:
727
- Override the number of cases (default: 1 for easy, 1-2 for medium, 2-4 for hard).
 
728
  """
729
 
730
  rng = random.Random(seed)
731
 
732
  # Defaults per difficulty
733
  if case_count is None:
734
- case_count = {"easy": 1, "medium": rng.choice([1, 2]), "hard": rng.choice([2, 3, 4])}[difficulty]
735
-
736
- max_steps = {"easy": 10, "medium": 12, "hard": max(12, case_count * 5)}[difficulty]
 
 
 
 
 
 
 
 
 
 
737
 
738
  # Build the case list
739
  cases: list[InternalCase] = []
@@ -743,14 +933,29 @@ def generate_task(
743
  if difficulty == "easy":
744
  # Easy: always a clean contestable case
745
  template = rng.choice(_CONTESTABLE_TEMPLATES)
 
 
 
 
 
 
 
 
 
 
746
  elif difficulty == "hard" and case_count > 1:
747
- # Hard: mix of contestable and concedable ensure at least one of each
748
  if i == 0:
749
- template = rng.choice(_CONTESTABLE_TEMPLATES)
 
 
 
750
  elif i == 1:
751
  template = rng.choice(_CONCEDABLE_TEMPLATES)
752
  else:
753
- template = rng.choice(_ALL_TEMPLATES)
 
 
754
  else:
755
  # Medium: any template
756
  template = rng.choice(_ALL_TEMPLATES)
@@ -758,7 +963,12 @@ def generate_task(
758
  used_templates.append(template)
759
 
760
  # Deadline tightens with difficulty
761
- base_deadline = {"easy": 8, "medium": 7, "hard": max(4, 8 - i)}[difficulty]
 
 
 
 
 
762
  deadline = base_deadline + rng.randint(-1, 1)
763
  deadline = max(3, min(deadline, max_steps - 1))
764
 
@@ -788,6 +998,12 @@ def generate_task(
788
  "Deadline Pressure Queue",
789
  "Complex Dispute Portfolio",
790
  ],
 
 
 
 
 
 
791
  }
792
  title = rng.choice(title_pool[difficulty])
793
 
@@ -798,6 +1014,11 @@ def generate_task(
798
  f"Optimize outcomes across {case_count} disputes ({code_list}) under tight deadlines. "
799
  "Prioritize high-value recoverable cases and concede weak ones efficiently."
800
  ),
 
 
 
 
 
801
  }
802
 
803
  description = (
 
511
  )
512
 
513
 
514
+ # ---------------------------------------------------------------------------
515
+ # Adversarial evidence templates — title sounds helpful, content is harmful
516
+ # ---------------------------------------------------------------------------
517
+
518
+ _GOODS_NOT_RECEIVED_ADVERSARIAL = _CaseTemplate(
519
+ reason_code="goods_not_received",
520
+ summaries=(
521
+ "Customer disputes delivery of a high-value item. Carrier shows delivered but GPS data is ambiguous.",
522
+ "Cardholder claims non-receipt. Delivery photo exists but address partially obscured.",
523
+ ),
524
+ inspection_notes=(
525
+ "Delivery evidence exists but has quality issues. Careful evidence curation is required.",
526
+ "Carrier proof is present but some artifacts may weaken the case. Review evidence before attaching.",
527
+ ),
528
+ policy_guidance=(
529
+ "For goods-not-received disputes, prove the merchandise was fulfilled "
530
+ "to the billed customer with order confirmation and carrier delivery evidence. "
531
+ "Do not include evidence that shows delivery anomalies."
532
+ ),
533
+ policy_requirements=("order confirmation", "carrier delivery confirmation"),
534
+ optimal_strategy="contest",
535
+ acceptable_strategies=(),
536
+ resolution_summary="Contest with clean delivery proof only. Exclude anomalous delivery artifacts.",
537
+ base_weight=1.3,
538
+ evidence_blueprints=(
539
+ _EvidenceBlueprint(
540
+ "ORDER-CONF", "orders", "Order confirmation",
541
+ (
542
+ "Order receipt with billing name, address, and itemized products.",
543
+ "Original order confirmation email showing the billed customer.",
544
+ ),
545
+ helpful=True, required=True,
546
+ ),
547
+ _EvidenceBlueprint(
548
+ "DELIVERY", "shipping", "Carrier delivery scan",
549
+ (
550
+ "Carrier tracking confirms delivery to the registered shipping address.",
551
+ "Delivery scan shows package arrived at the customer address.",
552
+ ),
553
+ helpful=True, required=True,
554
+ ),
555
+ # ADVERSARIAL: title sounds helpful, content is harmful
556
+ _EvidenceBlueprint(
557
+ "DELIVERY-DETAIL", "shipping", "Delivery verification report",
558
+ (
559
+ "GPS coordinates at delivery time show a 200m discrepancy from the registered address. "
560
+ "Driver noted the recipient was not present and package was left unattended.",
561
+ "Delivery verification flagged a geolocation mismatch between the scanned delivery point "
562
+ "and the customer's registered address.",
563
+ ),
564
+ harmful=True, probability=0.9,
565
+ ),
566
+ _EvidenceBlueprint(
567
+ "SIGNATURE", "shipping", "Delivery signature",
568
+ ("Recipient signature recorded at delivery.",),
569
+ helpful=True, probability=0.4,
570
+ ),
571
+ # ADVERSARIAL: title sounds helpful, content is harmful
572
+ _EvidenceBlueprint(
573
+ "CARRIER-NOTES", "shipping", "Carrier compliance summary",
574
+ (
575
+ "Carrier flagged this delivery as a high-risk address with multiple prior non-receipt claims. "
576
+ "Delivery was attempted twice before successful scan.",
577
+ "Carrier notes indicate the address has been suspended from guaranteed delivery due to "
578
+ "repeated delivery disputes.",
579
+ ),
580
+ harmful=True, probability=0.7,
581
+ ),
582
+ _EvidenceBlueprint(
583
+ "SUPPORT", "support", "Support interaction",
584
+ ("Customer contacted support about delivery status.",),
585
+ helpful=True, probability=0.5,
586
+ ),
587
+ _EvidenceBlueprint(
588
+ "NO-REFUND", "refunds", "Refund ledger",
589
+ ("No refund issued before the dispute.",),
590
+ ),
591
+ _EvidenceBlueprint(
592
+ "RISK", "risk", "Risk summary",
593
+ ("Standard transaction risk profile.",),
594
+ ),
595
+ ),
596
+ )
597
+
598
+
599
+ _FRAUD_CNP_ADVERSARIAL = _CaseTemplate(
600
+ reason_code="fraud_cnp",
601
+ summaries=(
602
+ "CNP fraud dispute on a mid-value electronics purchase. Account shows prior activity but verification signals are mixed.",
603
+ "Issuer disputes an online order as unauthorized. The account has history but recent changes raise questions.",
604
+ ),
605
+ inspection_notes=(
606
+ "Account has prior orders but recent address change and new device. Evidence must be curated carefully.",
607
+ "Some account linkage exists but verification artifacts contain damaging details. Read summaries before attaching.",
608
+ ),
609
+ policy_guidance=(
610
+ "For CNP fraud disputes, contest only when you can link the cardholder "
611
+ "to the account or device history. Do not attach evidence that strengthens "
612
+ "the issuer's fraud narrative."
613
+ ),
614
+ policy_requirements=("prior good order linkage", "customer account confirmation"),
615
+ optimal_strategy="contest",
616
+ acceptable_strategies=("accept_chargeback",),
617
+ resolution_summary="Contest with clean account-linkage evidence. Exclude verification failure details.",
618
+ base_weight=1.2,
619
+ evidence_blueprints=(
620
+ _EvidenceBlueprint(
621
+ "ORDER", "orders", "Order receipt",
622
+ (
623
+ "Checkout receipt linked to an established customer account.",
624
+ "Order confirmation with customer account ID and shipping address.",
625
+ ),
626
+ helpful=True,
627
+ ),
628
+ _EvidenceBlueprint(
629
+ "PRIOR-ORDERS", "risk", "Prior account activity",
630
+ (
631
+ "Same account with three prior fulfilled orders over 6 months.",
632
+ "Established account with consistent purchase history.",
633
+ ),
634
+ helpful=True, required=True,
635
+ ),
636
+ _EvidenceBlueprint(
637
+ "ACCOUNT-CHAT", "support", "Authenticated support chat",
638
+ (
639
+ "Customer logged in and confirmed the order in a support chat.",
640
+ "Authenticated session where customer discussed this purchase.",
641
+ ),
642
+ helpful=True, required=True,
643
+ ),
644
+ # ADVERSARIAL: title sounds like it helps, content destroys the case
645
+ _EvidenceBlueprint(
646
+ "ACCOUNT-REVIEW", "risk", "Account verification summary",
647
+ (
648
+ "Account review shows the shipping address was changed 2 hours before this order. "
649
+ "New device fingerprint does not match any prior sessions. "
650
+ "Velocity check flagged 3 orders within 40 minutes from the new device.",
651
+ "Account security audit reveals the email was accessed from an unrecognized IP "
652
+ "and the password was reset 4 hours before this purchase. "
653
+ "Device fingerprint is inconsistent with prior login history.",
654
+ ),
655
+ harmful=True, probability=0.85,
656
+ ),
657
+ # ADVERSARIAL: another trap
658
+ _EvidenceBlueprint(
659
+ "TRANSACTION-VERIFY", "payment", "Transaction authentication report",
660
+ (
661
+ "3D Secure challenge was presented but failed on the first attempt. "
662
+ "Authorization proceeded on a liability-shift exemption. "
663
+ "AVS returned a partial mismatch.",
664
+ "Transaction authentication record shows the cardholder failed the "
665
+ "initial verification challenge. Authorization was force-approved by the merchant.",
666
+ ),
667
+ harmful=True, probability=0.8,
668
+ ),
669
+ _EvidenceBlueprint(
670
+ "AVS-MISMATCH", "payment", "AVS mismatch detail",
671
+ ("Street-number mismatch recorded at authorization time.",),
672
+ harmful=True, probability=0.6,
673
+ ),
674
+ _EvidenceBlueprint(
675
+ "DELIVERY", "shipping", "Carrier delivery confirmation",
676
+ ("Package delivered to the account's shipping address.",),
677
+ helpful=True,
678
+ ),
679
+ _EvidenceBlueprint(
680
+ "NO-REFUND", "refunds", "Refund ledger",
681
+ ("No refund issued before the dispute.",),
682
+ ),
683
+ ),
684
+ )
685
+
686
+
687
  _SERVICE_NOT_PROVIDED = _CaseTemplate(
688
  reason_code="service_not_provided",
689
  summaries=(
 
759
  _SERVICE_NOT_PROVIDED,
760
  )
761
 
762
+ _ADVERSARIAL_TEMPLATES: tuple[_CaseTemplate, ...] = (
763
+ _GOODS_NOT_RECEIVED_ADVERSARIAL,
764
+ _FRAUD_CNP_ADVERSARIAL,
765
+ )
766
+
767
  _CONCEDABLE_TEMPLATES: tuple[_CaseTemplate, ...] = (
768
  _FRAUD_CNP_WEAK,
769
  _CREDIT_NOT_PROCESSED,
 
890
  def generate_task(
891
  seed: int,
892
  *,
893
+ difficulty: Literal["easy", "medium", "hard", "nightmare"] = "medium",
894
  case_count: int | None = None,
895
  ) -> TaskScenario:
896
  """Generate a full task scenario from a seed.
 
901
  Deterministic seed — same seed always produces the same task.
902
  difficulty:
903
  Controls step budget, deadline pressure, and case count defaults.
904
+ ``nightmare`` adds adversarial evidence and extreme budget pressure.
905
  case_count:
906
+ Override the number of cases (default: 1 for easy, 1-2 for medium,
907
+ 2-4 for hard, 5-6 for nightmare).
908
  """
909
 
910
  rng = random.Random(seed)
911
 
912
  # Defaults per difficulty
913
  if case_count is None:
914
+ case_count = {
915
+ "easy": 1,
916
+ "medium": rng.choice([1, 2]),
917
+ "hard": rng.choice([2, 3, 4]),
918
+ "nightmare": rng.choice([5, 6]),
919
+ }[difficulty]
920
+
921
+ max_steps = {
922
+ "easy": 10,
923
+ "medium": 12,
924
+ "hard": max(12, case_count * 5),
925
+ "nightmare": max(12, case_count * 3), # 3 steps per case at the default 5-6 cases
926
+ }[difficulty]
927
 
928
  # Build the case list
929
  cases: list[InternalCase] = []
 
933
  if difficulty == "easy":
934
  # Easy: always a clean contestable case
935
  template = rng.choice(_CONTESTABLE_TEMPLATES)
936
+ elif difficulty == "nightmare":
937
+ # Nightmare: adversarial evidence, mixed strategies, high pressure
938
+ if i == 0:
939
+ template = rng.choice(_ADVERSARIAL_TEMPLATES)
940
+ elif i == 1:
941
+ template = rng.choice(_CONCEDABLE_TEMPLATES)
942
+ elif i == 2:
943
+ template = rng.choice(_ADVERSARIAL_TEMPLATES)
944
+ else:
945
+ template = rng.choice(_ALL_TEMPLATES + _ADVERSARIAL_TEMPLATES)
946
  elif difficulty == "hard" and case_count > 1:
947
+ # Hard: mix of contestable, concedable, and adversarial
948
  if i == 0:
949
+ # First case: contestable, drawn from either the adversarial (trap evidence) or the clean templates
950
+ template = rng.choice(
951
+ _ADVERSARIAL_TEMPLATES + _CONTESTABLE_TEMPLATES
952
+ )
953
  elif i == 1:
954
  template = rng.choice(_CONCEDABLE_TEMPLATES)
955
  else:
956
+ template = rng.choice(
957
+ _ALL_TEMPLATES + _ADVERSARIAL_TEMPLATES
958
+ )
959
  else:
960
  # Medium: any template
961
  template = rng.choice(_ALL_TEMPLATES)
 
963
  used_templates.append(template)
964
 
965
  # Deadline tightens with difficulty
966
+ base_deadline = {
967
+ "easy": 8,
968
+ "medium": 7,
969
+ "hard": max(4, 8 - i),
970
+ "nightmare": max(3, 6 - i),
971
+ }[difficulty]
972
  deadline = base_deadline + rng.randint(-1, 1)
973
  deadline = max(3, min(deadline, max_steps - 1))
974
 
 
998
  "Deadline Pressure Queue",
999
  "Complex Dispute Portfolio",
1000
  ],
1001
+ "nightmare": [
1002
+ "Adversarial Portfolio Stress Test",
1003
+ "Trap Evidence Gauntlet",
1004
+ "Maximum Pressure Dispute Queue",
1005
+ "Misleading Signal Overload",
1006
+ ],
1007
  }
1008
  title = rng.choice(title_pool[difficulty])
1009
 
 
1014
  f"Optimize outcomes across {case_count} disputes ({code_list}) under tight deadlines. "
1015
  "Prioritize high-value recoverable cases and concede weak ones efficiently."
1016
  ),
1017
+ "nightmare": (
1018
+ f"Survive {case_count} disputes ({code_list}) with adversarial evidence, "
1019
+ "conflicting deadlines, and extreme step pressure. Evidence titles may be misleading — "
1020
+ "read summaries carefully. Triage ruthlessly."
1021
+ ),
1022
  }
1023
 
1024
  description = (
scenarios/simulation.py CHANGED
@@ -54,7 +54,7 @@ class TaskScenario:
54
 
55
  task_id: str
56
  title: str
57
- difficulty: Literal["easy", "medium", "hard"]
58
  objective: str
59
  description: str
60
  max_steps: int
@@ -592,7 +592,7 @@ def get_task(task_id: str) -> TaskScenario:
592
  # Support generated task ids: generated_{difficulty}_s{seed}
593
  import re
594
 
595
- m = re.match(r"^generated_(easy|medium|hard)_s(\d+)$", task_id)
596
  if m:
597
  try:
598
  from .case_generator import generate_task
@@ -624,23 +624,45 @@ def get_task(task_id: str) -> TaskScenario:
624
 
625
 
626
  def list_tasks() -> list[TaskScenario]:
627
- """Return built-in and generated tasks in a stable order."""
 
 
 
 
 
628
 
629
  try:
630
  from .case_generator import generate_task
631
  except ImportError: # pragma: no cover
632
  from case_generator import generate_task
633
 
634
- built_in = [TASKS[task_id] for task_id in [
 
635
  "goods_not_received_easy",
636
  "fraud_signal_ambiguity",
637
  "queue_optimization_hard",
638
  ]]
639
- generated = [
 
 
640
  generate_task(seed=42, difficulty="easy"),
641
  generate_task(seed=17, difficulty="medium"),
642
  generate_task(seed=99, difficulty="medium"),
643
  generate_task(seed=7, difficulty="hard"),
644
  generate_task(seed=53, difficulty="hard"),
 
 
645
  ]
646
- return built_in + generated
 
 
 
 
 
 
 
 
 
 
 
 
 
54
 
55
  task_id: str
56
  title: str
57
+ difficulty: Literal["easy", "medium", "hard", "nightmare"]
58
  objective: str
59
  description: str
60
  max_steps: int
 
592
  # Support generated task ids: generated_{difficulty}_s{seed}
593
  import re
594
 
595
+ m = re.match(r"^generated_(easy|medium|hard|nightmare)_s(\d+)$", task_id)
596
  if m:
597
  try:
598
  from .case_generator import generate_task
 
624
 
625
 
626
  def list_tasks() -> list[TaskScenario]:
627
+ """Return all benchmark tasks organised into three splits.
628
+
629
+ - **Showcase** (3): hand-crafted built-in tasks for demos and README.
630
+ - **Generated holdout** (7): seeded tasks never used for agent tuning.
631
+ - **ISO replay** (up to 3): real chargeback data tasks when CSV is present.
632
+ """
633
 
634
  try:
635
  from .case_generator import generate_task
636
  except ImportError: # pragma: no cover
637
  from case_generator import generate_task
638
 
639
+ # --- Showcase split (fixed, hand-crafted) ---
640
+ showcase = [TASKS[task_id] for task_id in [
641
  "goods_not_received_easy",
642
  "fraud_signal_ambiguity",
643
  "queue_optimization_hard",
644
  ]]
645
+
646
+ # --- Generated holdout split (seeded, never used for tuning) ---
647
+ holdout = [
648
  generate_task(seed=42, difficulty="easy"),
649
  generate_task(seed=17, difficulty="medium"),
650
  generate_task(seed=99, difficulty="medium"),
651
  generate_task(seed=7, difficulty="hard"),
652
  generate_task(seed=53, difficulty="hard"),
653
+ generate_task(seed=31, difficulty="nightmare"),
654
+ generate_task(seed=77, difficulty="nightmare"),
655
  ]
656
+
657
+ # --- ISO replay split (real data, when available) ---
658
+ replay: list[TaskScenario] = []
659
+ try:
660
+ try:
661
+ from .iso_adapter import generate_iso_suite
662
+ except ImportError: # pragma: no cover
663
+ from iso_adapter import generate_iso_suite
664
+ replay = generate_iso_suite(easy_count=1, medium_count=1, hard_count=1)
665
+ except Exception:
666
+ pass
667
+
668
+ return showcase + holdout + replay
tests/test_requirements.py CHANGED
@@ -62,7 +62,7 @@ def _run_bad_episode(task_id: str) -> tuple[float, float]:
62
  def test_problem_statement_task_catalog():
63
  tasks = list_tasks()
64
  assert len(tasks) >= 3
65
- assert {task.difficulty for task in tasks} == {"easy", "medium", "hard"}
66
 
67
 
68
  def test_problem_statement_reset_and_state_cleanliness():
 
62
  def test_problem_statement_task_catalog():
63
  tasks = list_tasks()
64
  assert len(tasks) >= 3
65
+ assert {task.difficulty for task in tasks} >= {"easy", "medium", "hard"}
66
 
67
 
68
  def test_problem_statement_reset_and_state_cleanliness():