Spaces:

mitudrudutta
/

ChargeBackOps

Sleeping

mitudrudutta committed on Apr 21

Commit

02a6a9f

1 Parent(s): 243aa68

feat(training): SFT dataset + stall detection in eval rollout

Two permanent fixes for the failure mode that produced eval=0 across
all GRPO checkpoints in the prior Colab run:

1. SFT dataset module (training/sft_dataset.py) — heuristic rollouts
captured as (prompt, oracle_completion) pairs for supervised
pre-training. Two-phase RLHF (SFT then GRPO) is the proven recipe
for teaching a base model the JSON schema and per-state action
variety before sparse RL reward kicks in. Pure GRPO from a base
model collapses to 'always emit select_case' because the model
never sees varied action_types in the prompt-only training data.

2. Stall detection in run_episode_with_text_policy — the dominant
failure was a checkpoint emitting select_case at every state. The
env silently no-ops the duplicate select_case (returns -0.02
reward, advances step_count), so the rollout burns its entire
step budget without flipping done. New code:
- Hard-coded predicate _predicted_noop catches the dominant case
(select_case for the case that is already selected) before env.step
and substitutes the heuristic fallback action, so the model's wasted
action doesn't consume an env step.
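- Per-state tried_at_state cache (keyed by a signature of the visible
state) catches less-common no-ops when a state recurs: if the model
picks an action_key already attempted at the same state, the rollout
forces the heuristic fallback, and stops early if the fallback's
action was also already tried there.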

Tests:
- tests/test_sft_dataset.py (5 tests: parser round-trip, action
variety, valid-JSON completions, monotonic state_step, multi-task)
- tests/test_training_adapter.py: new test_run_episode_breaks_select_case_loop
pins the regression — degenerate model that always emits
select_case must reach terminal grading with score > 0.

105/105 tests pass.

Files changed (5) hide show

tests/test_sft_dataset.py +78 -0
tests/test_training_adapter.py +28 -0
training/__init__.py +8 -0
training/reward_adapter.py +102 -3
training/sft_dataset.py +107 -0

tests/test_sft_dataset.py ADDED Viewed

	@@ -0,0 +1,78 @@

+"""Unit tests for the SFT dataset builder.
+The supervised pre-training stage feeds (prompt, oracle_completion)
+pairs into the base model so it learns the JSON schema and per-state
+action variety *before* GRPO. These tests pin the contract so the
+notebook's SFT cell stays stable.
+"""
+from __future__ import annotations
+import json
+from training.env_adapter import parse_completion
+from training.sft_dataset import (
+    action_to_completion,
+    build_sft_dataset,
+)
+def test_action_to_completion_round_trips_through_parser():
+    """Oracle completion must parse back into the same action dict."""
+    from runners.benchmark_runner import heuristic_policy
+    from server.chargeback_ops_environment import ChargebackOpsEnvironment
+    env = ChargebackOpsEnvironment()
+    obs = env.reset(task_id="goods_not_received_easy")
+    action = heuristic_policy(obs.model_dump())
+    completion = action_to_completion(action)
+    parsed = parse_completion(completion)
+    assert parsed is not None
+    assert parsed["action_type"] == action.action_type
+    if action.case_id:
+        assert parsed["case_id"] == action.case_id
+def test_build_sft_dataset_has_action_variety():
+    """SFT dataset must include >1 distinct action_type per task.
+    The whole point of SFT is to teach the model that different states
+    require different action_types. If the heuristic only ever emits
+    ``select_case`` we have no variety to teach and SFT is useless.
+    """
+    samples = build_sft_dataset(
+        ["goods_not_received_easy"], max_states_per_task=24
+    )
+    assert len(samples) >= 4
+    action_types = {s["action_type"] for s in samples}
+    assert len(action_types) >= 3, f"only saw {action_types}"
+def test_build_sft_dataset_completion_is_valid_json():
+    samples = build_sft_dataset(
+        ["goods_not_received_easy"], max_states_per_task=10
+    )
+    for s in samples:
+        decoded = json.loads(s["completion"])
+        assert decoded["action_type"] == s["action_type"]
+def test_build_sft_dataset_state_steps_monotonic():
+    samples = build_sft_dataset(
+        ["goods_not_received_easy"], max_states_per_task=10
+    )
+    state_steps = [s["state_step"] for s in samples]
+    assert state_steps == sorted(state_steps)
+    assert state_steps[0] == 0
+def test_build_sft_dataset_handles_multiple_tasks():
+    samples = build_sft_dataset(
+        ["goods_not_received_easy", "queue_optimization_hard"],
+        max_states_per_task=6,
+    )
+    task_ids = {s["task_id"] for s in samples}
+    assert task_ids == {"goods_not_received_easy", "queue_optimization_hard"}

tests/test_training_adapter.py CHANGED Viewed

@@ -224,3 +224,31 @@ def test_compute_reward_rejects_mismatched_lengths():
     with pytest.raises(ValueError):
         compute_reward(["a"], ["b", "c"], task_ids=["goods_not_received_easy"])

     with pytest.raises(ValueError):
         compute_reward(["a"], ["b", "c"], task_ids=["goods_not_received_easy"])
+def test_run_episode_breaks_select_case_loop():
+    """Degenerate model that always emits select_case must not deadlock.
+    Real failure mode observed in Colab eval: a Qwen3.5 checkpoint
+    after 300 GRPO steps emitted ``select_case`` at every state. The
+    env silently no-ops the second ``select_case``, the prompt stays
+    identical, the model emits the same string, score stays 0 because
+    ``done`` never flips. Stall detection must force-fallback to the
+    heuristic so the episode reaches grading.
+    """
+    import json
+    select_case_payload = json.dumps(
+        {"action_type": "select_case", "case_id": "CB-E1"}
+    )
+    result = run_episode_with_text_policy(
+        "goods_not_received_easy",
+        text_policy=lambda _prompt: select_case_payload,
+    )
+    assert result.steps_used > 0
+    assert result.score > 0.0, (
+        f"stall detection failed: score={result.score} "
+        f"means episode never reached terminal grading"
+    )

training/__init__.py CHANGED Viewed

@@ -25,13 +25,21 @@ from .reward_adapter import (
     compute_reward,
     run_episode_with_text_policy,
 )
 __all__ = [
     "CheckpointEval",
     "EpisodeResult",
     "TaskOutcome",
     "action_from_completion",
     "build_prompt",
     "compute_reward",
     "evaluate_checkpoint",
     "evaluate_policy_across_tasks",

     compute_reward,
     run_episode_with_text_policy,
 )
+from .sft_dataset import (
+    SFTSample,
+    action_to_completion,
+    build_sft_dataset,
+)
 __all__ = [
     "CheckpointEval",
     "EpisodeResult",
+    "SFTSample",
     "TaskOutcome",
     "action_from_completion",
+    "action_to_completion",
     "build_prompt",
+    "build_sft_dataset",
     "compute_reward",
     "evaluate_checkpoint",
     "evaluate_policy_across_tasks",

training/reward_adapter.py CHANGED Viewed

@@ -76,6 +76,74 @@ def _fallback_action(
     return _heuristic_policy(observation.model_dump())
 def run_episode_with_text_policy(
     task_id: str,
     text_policy: TextPolicyFn,
@@ -86,8 +154,12 @@ def run_episode_with_text_policy(
     """Roll one episode forward under a text-in / text-out policy.
     Used for evaluation and debugging only. Falls back to the scripted
-    heuristic when the policy returns unparseable output, so the episode
-    always reaches a terminal state. **Not** used for training reward.
     """
     task = get_task(task_id)
@@ -96,6 +168,7 @@ def run_episode_with_text_policy(
     step_budget = (max_steps if max_steps is not None else task.max_steps) + 5
     steps = 0
     invalid = 0
     prompts: list[str] = []
     completions: list[str] = []
@@ -104,16 +177,42 @@ def run_episode_with_text_policy(
         prompt = build_prompt(obs_dict)
         completion = text_policy(prompt)
         action = action_from_completion(completion)
         if action is None:
             invalid += 1
             action = _fallback_action(observation)
             if action is None:
                 break
         observation = env.step(action)
         steps += 1
         if capture_trace:
             prompts.append(prompt)
-            completions.append(completion)
     report = env.state.grader_report
     score = float(report.normalized_score) if report is not None else 0.0

     return _heuristic_policy(observation.model_dump())
+def _state_signature(observation: ChargebackOpsObservation) -> tuple:
+    """Stable hashable snapshot of the env state visible to the policy.
+    Used to detect rollout stalls — if step() leaves this signature
+    unchanged, the model picked an action the env silently no-op'd
+    (e.g. ``select_case`` when a case is already selected) and is
+    about to loop forever on the same prompt. ``steps_remaining`` is
+    deliberately excluded: it decrements on every step regardless of
+    whether the env actually progressed, so including it would mask
+    every real stall.
+    """
+    visible = observation.visible_case
+    visible_sig: tuple
+    if visible is None:
+        visible_sig = ()
+    else:
+        visible_sig = (
+            visible.case_id,
+            visible.status,
+            visible.current_strategy,
+            len(visible.attached_evidence),
+            len(visible.retrieved_evidence),
+        )
+    return (
+        observation.selected_case_id,
+        tuple(sorted(observation.available_actions)),
+        observation.done,
+        visible_sig,
+    )
+def _action_key(action: ChargebackOpsAction) -> tuple:
+    """Hashable identity for "have we tried this exact action here?" check."""
+    return (
+        action.action_type,
+        action.case_id,
+        action.system_name,
+        tuple(action.evidence_ids),
+        action.strategy,
+    )
+def _predicted_noop(
+    action: ChargebackOpsAction,
+    observation: ChargebackOpsObservation,
+) -> bool:
+    """Cheap upfront check that the env will silently no-op this action.
+    Catches the dominant Qwen failure mode (always emit ``select_case``
+    even after a case is already selected). Without this check the
+    model burns an env step per state on the duplicate ``select_case``,
+    blowing the per-task step budget before the heuristic fallback can
+    finish the episode. We only hard-code rules we *know* the env
+    treats as no-ops; everything else flows through the env and the
+    post-hoc ``tried_at_state`` cache.
+    """
+    if (
+        action.action_type == "select_case"
+        and observation.selected_case_id is not None
+        and action.case_id == observation.selected_case_id
+    ):
+        return True
+    return False
 def run_episode_with_text_policy(
     task_id: str,
     text_policy: TextPolicyFn,
     """Roll one episode forward under a text-in / text-out policy.
     Used for evaluation and debugging only. Falls back to the scripted
+    heuristic when the policy returns unparseable output **or** when
+    the model picks an action it has already tried from the current
+    state (the env silently no-ops the duplicate, ``done`` never flips,
+    score stays 0). The repeat-action guard catches the dominant Qwen
+    failure mode where a checkpoint always emits ``select_case`` and
+    the episode loops forever. **Not** used for training reward.
     """
     task = get_task(task_id)
     step_budget = (max_steps if max_steps is not None else task.max_steps) + 5
     steps = 0
     invalid = 0
+    tried_at_state: dict[tuple, set[tuple]] = {}
     prompts: list[str] = []
     completions: list[str] = []
         prompt = build_prompt(obs_dict)
         completion = text_policy(prompt)
         action = action_from_completion(completion)
+        used_fallback = False
         if action is None:
             invalid += 1
             action = _fallback_action(observation)
+            used_fallback = True
             if action is None:
                 break
+        if not used_fallback and _predicted_noop(action, observation):
+            fallback = _fallback_action(observation)
+            if fallback is not None:
+                action = fallback
+                used_fallback = True
+        state_sig = _state_signature(observation)
+        attempted = tried_at_state.setdefault(state_sig, set())
+        action_key = _action_key(action)
+        if action_key in attempted and not used_fallback:
+            fallback = _fallback_action(observation)
+            if fallback is None:
+                break
+            fallback_key = _action_key(fallback)
+            if fallback_key in attempted:
+                # Heuristic also stuck — bail out, score whatever we have.
+                break
+            action = fallback
+            action_key = fallback_key
+            used_fallback = True
+        attempted.add(action_key)
         observation = env.step(action)
         steps += 1
         if capture_trace:
             prompts.append(prompt)
+            tag = "<<fallback>> " if used_fallback else ""
+            completions.append(f"{tag}{completion}")
     report = env.state.grader_report
     score = float(report.normalized_score) if report is not None else 0.0

training/sft_dataset.py ADDED Viewed

	@@ -0,0 +1,107 @@

+"""Supervised fine-tuning dataset builder for ChargebackOps.
+Rolls the scripted heuristic across each task and captures every
+``(observation_prompt, oracle_completion)`` pair as a single-turn
+training sample. The completion is the JSON serialisation of the
+heuristic action, matching the format the merchant policy must emit
+at inference time.
+SFT before GRPO is the standard RLHF pattern. It teaches the base
+model two things GRPO struggles to learn from sparse reward alone:
+* The output schema (valid JSON, the right ``action_type`` strings,
+  no extra prose).
+* Per-state action variety — the heuristic emits a *different*
+  action_type at each state, so an SFT-trained model stops
+  collapsing to ``select_case`` at every step.
+The module returns plain dicts so the notebook can wrap them in any
+trainer's expected dataset format (TRL ``SFTTrainer``, HF
+``Dataset.from_list``, etc.) without pulling those deps into the
+package.
+"""
+from __future__ import annotations
+import json
+from dataclasses import dataclass
+from typing import Any, Sequence
+try:
+    from ..core.models import ChargebackOpsAction
+    from ..server.chargeback_ops_environment import ChargebackOpsEnvironment
+    from .env_adapter import build_prompt
+except ImportError:  # pragma: no cover
+    from core.models import ChargebackOpsAction
+    from server.chargeback_ops_environment import ChargebackOpsEnvironment
+    from training.env_adapter import build_prompt
+def _heuristic_policy(observation_dict: dict[str, Any]) -> ChargebackOpsAction | None:
+    try:
+        from ..runners.benchmark_runner import heuristic_policy
+    except ImportError:  # pragma: no cover
+        from runners.benchmark_runner import heuristic_policy
+    return heuristic_policy(observation_dict)
+def action_to_completion(action: ChargebackOpsAction) -> str:
+    """Serialise an action as the canonical JSON completion string."""
+    payload = action.model_dump(exclude_none=True)
+    payload = {k: v for k, v in payload.items() if v not in ([], "")}
+    return json.dumps(payload, separators=(",", ":"), sort_keys=True)
+@dataclass(frozen=True)
+class SFTSample:
+    """One supervised training pair."""
+    task_id: str
+    state_step: int
+    prompt: str
+    completion: str
+    action_type: str
+def build_sft_dataset(
+    task_ids: Sequence[str],
+    *,
+    max_states_per_task: int = 24,
+) -> list[dict[str, Any]]:
+    """Roll heuristic on each task; capture (prompt, oracle_completion) pairs.
+    Goes deeper than :func:`training.reward_adapter.build_state_action_dataset`
+    (default 24 vs 12 states per task) because SFT benefits from seeing
+    the full trajectory — including terminal-resolution actions which
+    are rare in the early-state distribution.
+    """
+    samples: list[dict[str, Any]] = []
+    for task_id in task_ids:
+        env = ChargebackOpsEnvironment()
+        obs = env.reset(task_id=task_id)
+        for state_step in range(max_states_per_task):
+            if obs.done:
+                break
+            heur = _heuristic_policy(obs.model_dump())
+            if heur is None:
+                break
+            samples.append(
+                {
+                    "task_id": task_id,
+                    "state_step": state_step,
+                    "prompt": build_prompt(obs.model_dump()),
+                    "completion": action_to_completion(heur),
+                    "action_type": heur.action_type,
+                }
+            )
+            obs = env.step(heur)
+    return samples
+__all__ = [
+    "SFTSample",
+    "action_to_completion",
+    "build_sft_dataset",
+]