Spaces:

chane335
/

permanence

Paused

App Files Files Community

chane335 committed on Apr 25

Commit

ee951aa

verified ·

1 Parent(s): 47e04de

Run 4: trainable safety primitive — FS/Git/DB simulators, integrated deploy task, tech-only training

Browse files

Files changed (8) hide show

tests/test_pipeline_orchestration.py +151 -0
tests/test_rewards.py +35 -0
tests/test_trl_integration.py +169 -0
training/rewards.py +22 -7
training/stages/stage_1_sft.py +23 -10
training/stages/stage_2_gate.py +2 -1
training/stages/stage_3_grpo.py +18 -11
training/stages/stage_4_eval.py +2 -1

tests/test_pipeline_orchestration.py ADDED Viewed

	@@ -0,0 +1,151 @@

+"""Tests for the pipeline orchestrator's wiring and control flow.
+These tests replace each stage's ``run_*`` function with a fake so we can
+verify:
+    * Artifact paths are passed correctly between stages
+    * A failing gate aborts the pipeline (bail_on_failure=True)
+    * ``--from`` and ``--only`` flags skip the right stages
+    * ``pipeline_summary.json`` is written with the right shape
+Run on CPU only.
+"""
+from __future__ import annotations
+import json
+import sys
+from pathlib import Path
+from unittest.mock import patch
+_ROOT = Path(__file__).resolve().parent.parent
+if str(_ROOT) not in sys.path:
+    sys.path.insert(0, str(_ROOT))
+from training.config import TrainingConfig
+from training.pipeline import STAGES, run_pipeline
+def _fake_stage(ok: bool = True, extra: dict | None = None):
+    def fake(config, *args, **kwargs):
+        return {"ok": ok, **(extra or {})}
+    return fake
+def test_stages_list_is_ordered():
+    """Pipeline stages run in this exact order: sft → gate → grpo → eval."""
+    assert STAGES == ["sft", "gate", "grpo", "eval"]
+def test_pipeline_runs_all_stages_when_all_pass():
+    """Happy path: every stage returns ok=True, pipeline completes."""
+    cfg = TrainingConfig()
+    with patch("training.stages.stage_1_sft.run_sft", _fake_stage(True)), \
+         patch("training.stages.stage_2_gate.run_gate", _fake_stage(True, {"coverage": 1.0})), \
+         patch("training.stages.stage_3_grpo.run_grpo", _fake_stage(True, {"mean_reward": 0.8})), \
+         patch("training.stages.stage_4_eval.run_eval", _fake_stage(True)):
+        summary = run_pipeline(cfg, list(STAGES), bail_on_failure=True)
+    assert summary["final_status"] == "completed"
+    assert set(summary["stages"].keys()) == set(STAGES)
+    for stage in STAGES:
+        assert summary["stages"][stage]["ok"] is True
+def test_pipeline_bails_when_gate_fails():
+    """If the gate fails, GRPO and eval must NOT run — this is the whole
+    point of the gate: fail fast, don't burn GPU on a broken SFT."""
+    cfg = TrainingConfig()
+    grpo_called = [False]
+    eval_called = [False]
+    def track_grpo(*args, **kwargs):
+        grpo_called[0] = True
+        return {"ok": True}
+    def track_eval(*args, **kwargs):
+        eval_called[0] = True
+        return {"ok": True}
+    with patch("training.stages.stage_1_sft.run_sft", _fake_stage(True)), \
+         patch("training.stages.stage_2_gate.run_gate", _fake_stage(False, {"coverage": 0.5})), \
+         patch("training.stages.stage_3_grpo.run_grpo", track_grpo), \
+         patch("training.stages.stage_4_eval.run_eval", track_eval):
+        summary = run_pipeline(cfg, list(STAGES), bail_on_failure=True)
+    assert summary["final_status"] == "failed_at_gate"
+    assert grpo_called[0] is False, "GRPO ran even though gate failed!"
+    assert eval_called[0] is False, "Eval ran even though gate failed!"
+def test_pipeline_bails_when_sft_fails():
+    """Even earlier: if SFT fails (loss too high), nothing downstream runs."""
+    cfg = TrainingConfig()
+    gate_called = [False]
+    with patch("training.stages.stage_1_sft.run_sft", _fake_stage(False, {"final_training_loss": 2.5})), \
+         patch("training.stages.stage_2_gate.run_gate", lambda *a, **k: gate_called.__setitem__(0, True) or {"ok": True}):
+        summary = run_pipeline(cfg, list(STAGES), bail_on_failure=True)
+    assert summary["final_status"] == "failed_at_sft"
+    assert gate_called[0] is False
+def test_pipeline_no_bail_runs_all_stages_even_on_failure():
+    """With bail_on_failure=False, each stage runs regardless of prior
+    failures. Used for post-mortem runs where we want partial artifacts."""
+    cfg = TrainingConfig()
+    with patch("training.stages.stage_1_sft.run_sft", _fake_stage(False)), \
+         patch("training.stages.stage_2_gate.run_gate", _fake_stage(False)), \
+         patch("training.stages.stage_3_grpo.run_grpo", _fake_stage(False)), \
+         patch("training.stages.stage_4_eval.run_eval", _fake_stage(True)):
+        summary = run_pipeline(cfg, list(STAGES), bail_on_failure=False)
+    assert summary["final_status"] == "completed"
+    assert all(stage in summary["stages"] for stage in STAGES)
+def test_pipeline_with_subset_of_stages():
+    """``--only grpo`` or ``--from gate`` narrows the stage list. Pipeline
+    runs exactly those stages."""
+    cfg = TrainingConfig()
+    with patch("training.stages.stage_3_grpo.run_grpo", _fake_stage(True)):
+        summary = run_pipeline(cfg, ["grpo"], bail_on_failure=True)
+    assert list(summary["stages"].keys()) == ["grpo"]
+    assert summary["final_status"] == "completed"
+def test_exception_in_stage_surfaces_cleanly():
+    """If a stage's run function raises (not returns ok=False), the
+    orchestrator must catch it and record ``final_status=fatal``."""
+    cfg = TrainingConfig()
+    def raiser(*args, **kwargs):
+        raise RuntimeError("simulated stage crash")
+    with patch("training.stages.stage_1_sft.run_sft", raiser):
+        summary = run_pipeline(cfg, ["sft"], bail_on_failure=True)
+    assert summary["final_status"] == "fatal"
+    assert "error" in summary["stages"]["sft"]
+def test_pipeline_summary_is_json_serializable():
+    """The final summary must round-trip through JSON so it can be written
+    to artifacts/pipeline_summary.json."""
+    cfg = TrainingConfig()
+    with patch("training.stages.stage_1_sft.run_sft", _fake_stage(True, {"custom_metric": 0.42})):
+        summary = run_pipeline(cfg, ["sft"], bail_on_failure=True)
+    # This serialization is what pipeline.py main() does; if it fails,
+    # the artifact won't be written.
+    s = json.dumps(summary, default=str)
+    assert len(s) > 10
+    # And re-parses
+    parsed = json.loads(s)
+    assert parsed["final_status"] == "completed"

tests/test_rewards.py CHANGED Viewed

@@ -217,3 +217,38 @@ def test_reward_funcs_are_shape_compatible_with_trl():
         assert isinstance(out, list)
         assert len(out) == len(completions)
         assert all(isinstance(x, float) for x in out)

         assert isinstance(out, list)
         assert len(out) == len(completions)
         assert all(isinstance(x, float) for x in out)
+def test_wrappers_survive_trl_keyword_calling_convention():
+    """Regression test for the Run 5 round 2 crash.
+    TRL calls reward functions as
+    ``fn(prompts=[...], completions=[...], task_id=[...], seed=[...])``.
+    Both wrappers (text pack funcs and the env wrapper) must handle this
+    without raising "got multiple values for argument 'prompts'"."""
+    pack = build_reward_pack(total_episodes=100)
+    completions = ['<action id="fs_ls"/><reversibility level="R1"/>']
+    # Text reward — TRL-style keyword call
+    for fn in pack.funcs:
+        scores = fn(
+            prompts=["some prompt"],
+            completions=completions,
+            task_id=["task_log_cleanup"],
+            seed=[0],
+        )
+        assert len(scores) == 1
+    # Env wrapper — the function that actually triggered the bug
+    def fake_env_reward(prompts, completions, **_):
+        return [0.5] * len(completions)
+    wrapped = weighted_environmental_reward(fake_env_reward, pack)
+    scores = wrapped(
+        prompts=["some prompt"],
+        completions=completions,
+        task_id=["task_log_cleanup"],
+        seed=[0],
+    )
+    assert len(scores) == 1
+    assert scores[0] > 0  # schedule weight * 0.5 > 0

tests/test_trl_integration.py ADDED Viewed

	@@ -0,0 +1,169 @@

+"""Mock-TRL integration tests for the GRPO reward pipeline.
+Run 5 round 2 crashed with:
+    ``reward_environmental() got multiple values for argument 'prompts'``
+That bug was invisible to unit tests because no test ever invoked the reward
+functions the way TRL's GRPOTrainer actually invokes them:
+    fn(prompts=[...], completions=[...], task_id=[...], seed=[...])
+These tests simulate that calling convention. If any reward function in the
+full pack (pure-text + env-wrapped) chokes on TRL-style kwargs, the test
+fails before push — not after 40 minutes of GPU time.
+This file runs on CPU only. No unsloth, no trl dependency.
+"""
+from __future__ import annotations
+import sys
+from pathlib import Path
+from typing import Any, Dict, List
+# Ensure project root on sys.path
+_ROOT = Path(__file__).resolve().parent.parent
+if str(_ROOT) not in sys.path:
+    sys.path.insert(0, str(_ROOT))
+from training.rewards import build_reward_pack, weighted_environmental_reward
+from training.stages.stage_3_grpo import _build_prompt_records, _make_task_reward
+class FakeGRPOTrainer:
+    """Simulates the TRL GRPOTrainer's reward-function calling convention.
+    Real TRL calls:
+        for fn in reward_funcs:
+            fn(prompts=prompts, completions=completions, **extra_columns)
+    We mirror that exactly. Every reward function that survives a call from
+    this fake trainer is guaranteed to survive TRL.
+    """
+    def __init__(self, reward_funcs: List, dataset_rows: List[Dict[str, Any]], num_generations: int = 2):
+        self.reward_funcs = reward_funcs
+        self.dataset_rows = dataset_rows
+        self.num_generations = num_generations
+    def simulate_one_step(self, completions: List[str]) -> List[List[float]]:
+        """Invoke every reward function with realistic TRL-style kwargs."""
+        n = len(completions)
+        batch = self.dataset_rows[:n]
+        prompts = [r["prompt"] for r in batch]
+        task_ids = [r["task_id"] for r in batch]
+        seeds = [r["seed"] for r in batch]
+        all_rewards = []
+        for fn in self.reward_funcs:
+            rewards = fn(
+                prompts=prompts,
+                completions=completions,
+                task_id=task_ids,
+                seed=seeds,
+            )
+            assert isinstance(rewards, list), f"{fn.__name__} returned {type(rewards)}"
+            assert len(rewards) == n, f"{fn.__name__} returned {len(rewards)} scores for {n} completions"
+            all_rewards.append(rewards)
+        return all_rewards
+# ─────────────────────────────────────────────────────────────────────────────
+# The test that would have caught Run 5 round 2
+# ─────────────────────────────────────────────────────────────────────────────
+def test_full_reward_pack_survives_trl_calling_convention(tmp_path):
+    """End-to-end regression: the EXACT reward list stage 3 hands to TRL
+    must survive a simulated TRL-style call. This is the test that would
+    have caught the duplicate-prompts bug locally."""
+    pack = build_reward_pack(total_episodes=50)
+    # Build the same env reward that stage 3 builds
+    task_reward, training_log = _make_task_reward(tmp_path / "grpo_artifacts")
+    all_reward_funcs = pack.funcs + [weighted_environmental_reward(task_reward, pack)]
+    # Generate a real prompt dataset (no GPU needed — uses PermanenceEnv)
+    dataset_rows = _build_prompt_records(total_episodes=8, domain="devtools")
+    # Realistic completions the model might produce
+    completions = [
+        '<thinking>list first</thinking><action id="fs_ls" path="/var/log"/><reversibility level="R1" confidence="0.99"/>',
+        '<thinking>snapshot</thinking><action id="fs_snapshot"/><reversibility level="R2" confidence="0.95"/>',
+    ]
+    trainer = FakeGRPOTrainer(all_reward_funcs, dataset_rows, num_generations=2)
+    # If any reward function raises on the TRL calling convention, this
+    # fails. This is the test that Run 5 round 2 would have failed.
+    all_rewards = trainer.simulate_one_step(completions)
+    # Every reward function returned the right number of scores
+    for scores in all_rewards:
+        assert len(scores) == len(completions)
+def test_env_wrapper_does_not_double_pass_prompts(tmp_path):
+    """Narrower version of the above — directly tests the wrapper that
+    broke in Run 5 round 2."""
+    pack = build_reward_pack(total_episodes=10)
+    task_reward, _ = _make_task_reward(tmp_path / "grpo")
+    wrapped = weighted_environmental_reward(task_reward, pack)
+    # Invoke with the exact kwargs TRL passes
+    completions = ['<action id="fs_ls"/><reversibility level="R1"/>']
+    result = wrapped(
+        prompts=["some prompt"],
+        completions=completions,
+        task_id=["task_log_cleanup"],
+        seed=[0],
+    )
+    assert isinstance(result, list)
+    assert len(result) == 1
+def test_text_reward_accepts_trl_kwargs_without_positional_completions():
+    """Make sure make_weighted wrapper also survives keyword-only calls."""
+    pack = build_reward_pack(total_episodes=10)
+    for fn in pack.funcs:
+        # TRL doesn't always pass completions positionally — test the
+        # keyword path explicitly.
+        result = fn(
+            prompts=["p1", "p2"],
+            completions=["c1", "c2"],
+            task_id=["t1", "t2"],
+            seed=[0, 1],
+        )
+        assert len(result) == 2
+def test_build_prompt_records_returns_usable_dataset_shape():
+    """Stage 3 calls ``Dataset.from_list(_build_prompt_records(...))``.
+    The records must be a list of dicts with the required keys."""
+    rows = _build_prompt_records(total_episodes=5, domain="devtools")
+    assert len(rows) == 5
+    required_keys = {"prompt", "episode", "task_id", "seed"}
+    for r in rows:
+        assert required_keys.issubset(r.keys())
+        assert isinstance(r["prompt"], str)
+        assert r["prompt"]  # non-empty
+        assert r["task_id"].startswith("task_")
+def test_task_reward_writes_training_log_entries(tmp_path):
+    """Stage 3's env reward appends to ``training_log``. Verify the log
+    accumulates entries in the right shape."""
+    pack = build_reward_pack(total_episodes=10)
+    task_reward, training_log = _make_task_reward(tmp_path / "grpo")
+    completions = ['<action id="fs_ls" path="/var/log"/><reversibility level="R1"/>']
+    task_reward(
+        prompts=["p"],
+        completions=completions,
+        task_id=["task_log_cleanup"],
+        seed=[0],
+    )
+    assert len(training_log) >= 1
+    # Each entry has the structured fields the dashboard and eval rely on
+    last = training_log[-1]
+    for k in ("task_id", "seed", "reward", "completion_length"):
+        assert k in last, f"missing key {k} in training_log entry"

training/rewards.py CHANGED Viewed

@@ -211,14 +211,20 @@ def build_reward_pack(total_episodes: int = 300) -> RewardPack:
     ep_counter = [0]
     def make_weighted(fn: Callable[..., List[float]], weight_fn: Callable[[int], float]) -> Callable[..., List[float]]:
-        def wrapped(completions: List[str], **kwargs) -> List[float]:
-            # Length monitor sees every completion that passes through here.
             for c in completions:
                 monitor.observe(c)
             w = weight_fn(ep_counter[0])
             if w == 0.0:
                 return [0.0] * len(completions)
-            raw = fn(completions, **kwargs)
             return [w * r for r in raw]
         wrapped.__name__ = fn.__name__
@@ -236,16 +242,25 @@ def weighted_environmental_reward(
 ) -> Callable[..., List[float]]:
     """Wrap an environmental reward fn with the schedule's env weight.
-    Stage 3 calls this after constructing the env reward so it participates
-    in the dynamic weighting.
     """
-    def wrapped(completions: List[str], **kwargs) -> List[float]:
         for c in completions:
             pack.length_monitor.observe(c)
         w = pack.schedule.weight_environmental(pack.episode_counter[0])
         if w == 0.0:
             return [0.0] * len(completions)
-        raw = raw_fn(completions, **kwargs)
         return [w * r for r in raw]
     wrapped.__name__ = raw_fn.__name__

     ep_counter = [0]
     def make_weighted(fn: Callable[..., List[float]], weight_fn: Callable[[int], float]) -> Callable[..., List[float]]:
+        def wrapped(completions: List[str] | None = None, **kwargs) -> List[float]:
+            # Handle completions-as-positional-or-kwarg so TRL's
+            # ``prompts=..., completions=...`` calling convention doesn't
+            # cause an arg-conflict when forwarding to inner functions.
+            if completions is None:
+                completions = kwargs.pop("completions", [])
             for c in completions:
                 monitor.observe(c)
             w = weight_fn(ep_counter[0])
             if w == 0.0:
                 return [0.0] * len(completions)
+            # ``reward_format`` accepts ``**_`` so it absorbs everything —
+            # passing completions as a kwarg is safe and collision-free.
+            raw = fn(completions=completions, **kwargs)
             return [w * r for r in raw]
         wrapped.__name__ = fn.__name__
 ) -> Callable[..., List[float]]:
     """Wrap an environmental reward fn with the schedule's env weight.
+    The wrapped function forwards ALL kwargs straight through (without
+    making completions a positional arg) so TRL's usual ``prompts=...``
+    keyword does not collide with the wrapped function's positional
+    ``prompts`` parameter. Run 5 round 2 crashed on exactly this bug —
+    the fix is to forward every arg by keyword only.
     """
+    def wrapped(completions: List[str] | None = None, **kwargs) -> List[float]:
+        # Handle both calling conventions: TRL usually passes completions
+        # as a keyword arg; older callers may pass it positionally.
+        if completions is None:
+            completions = kwargs.pop("completions", [])
         for c in completions:
             pack.length_monitor.observe(c)
         w = pack.schedule.weight_environmental(pack.episode_counter[0])
         if w == 0.0:
             return [0.0] * len(completions)
+        # Forward by keyword only — never by position — so no arg conflicts.
+        raw = raw_fn(completions=completions, **kwargs)
         return [w * r for r in raw]
     wrapped.__name__ = raw_fn.__name__

training/stages/stage_1_sft.py CHANGED Viewed

@@ -26,12 +26,9 @@ import sys
 from pathlib import Path
 from typing import Any, Dict, List
-# Unsloth must be imported first for its transformers/trl patches to take.
-from unsloth import FastLanguageModel  # noqa: F401  (imported for side effects)
-from datasets import Dataset
-from transformers import TrainingArguments
-from trl import SFTTrainer
 # Project imports
 _ROOT = Path(__file__).resolve().parent.parent.parent
@@ -48,7 +45,20 @@ MAX_PROMPT_LENGTH = 768
 MAX_COMPLETION_LENGTH = 280
-def _load_warmup_dataset(path: Path) -> Dataset:
     if not path.exists():
         raise FileNotFoundError(f"warmup traces not found at {path}")
     records: List[Dict[str, str]] = []
@@ -70,7 +80,7 @@ def _load_warmup_dataset(path: Path) -> Dataset:
         )
     if not records:
         raise ValueError(f"no usable records in {path}")
-    return Dataset.from_list(records)
 def run_sft(
@@ -79,12 +89,15 @@ def run_sft(
     artifacts_dir: Path = ARTIFACTS_DIR,
 ) -> Dict[str, Any]:
     """Run SFT and return the metrics dict that is also written to disk."""
     artifacts_dir.mkdir(parents=True, exist_ok=True)
     dataset = _load_warmup_dataset(warmup_path)
     n_traces = len(dataset)
-    from unsloth import FastLanguageModel as _FLM
     model, tokenizer = _FLM.from_pretrained(
         model_name=config.model_name,
         max_seq_length=MAX_PROMPT_LENGTH + MAX_COMPLETION_LENGTH,

 from pathlib import Path
 from typing import Any, Dict, List
+# IMPORTANT: heavy deps (unsloth, trl, datasets) imported INSIDE ``run_sft``
+# so the module stays importable on CPU-only machines and the pure-python
+# helpers (``_load_warmup_dataset``) are unit-testable.
 # Project imports
 _ROOT = Path(__file__).resolve().parent.parent.parent
 MAX_COMPLETION_LENGTH = 280
+def _load_warmup_dataset(path: Path):
+    """Load JSONL warmup traces as a ``datasets.Dataset``.
+    The heavy dep ``datasets`` is imported inside the function so this module is
+    importable on CPU-only machines (tests exercise JSONL parsing directly
+    via ``_load_warmup_records`` below without materializing a Dataset).
+    """
+    from datasets import Dataset
+    records = _load_warmup_records(path)
+    return Dataset.from_list(records)
+def _load_warmup_records(path: Path) -> List[Dict[str, str]]:
+    """Pure-python JSONL loader. Unit-testable, no heavy deps."""
     if not path.exists():
         raise FileNotFoundError(f"warmup traces not found at {path}")
     records: List[Dict[str, str]] = []
         )
     if not records:
         raise ValueError(f"no usable records in {path}")
+    return records
 def run_sft(
     artifacts_dir: Path = ARTIFACTS_DIR,
 ) -> Dict[str, Any]:
     """Run SFT and return the metrics dict that is also written to disk."""
+    # Heavy imports deferred so the module is importable without a GPU.
+    from unsloth import FastLanguageModel as _FLM
+    from transformers import TrainingArguments
+    from trl import SFTTrainer
     artifacts_dir.mkdir(parents=True, exist_ok=True)
     dataset = _load_warmup_dataset(warmup_path)
     n_traces = len(dataset)
     model, tokenizer = _FLM.from_pretrained(
         model_name=config.model_name,
         max_seq_length=MAX_PROMPT_LENGTH + MAX_COMPLETION_LENGTH,

training/stages/stage_2_gate.py CHANGED Viewed

@@ -31,7 +31,8 @@ import sys
 from pathlib import Path
 from typing import Any, Dict, List
-from unsloth import FastLanguageModel  # noqa: F401 — patches transformers
 _ROOT = Path(__file__).resolve().parent.parent.parent
 if str(_ROOT) not in sys.path:

 from pathlib import Path
 from typing import Any, Dict, List
+# Heavy deps loaded inside ``run_gate`` so this module stays importable
+# without a GPU.
 _ROOT = Path(__file__).resolve().parent.parent.parent
 if str(_ROOT) not in sys.path:

training/stages/stage_3_grpo.py CHANGED Viewed

@@ -29,10 +29,12 @@ import sys
 from pathlib import Path
 from typing import Any, Dict, List, Optional
-from unsloth import FastLanguageModel  # noqa: F401 — patches transformers
-from datasets import Dataset
-from trl import GRPOConfig, GRPOTrainer
 _ROOT = Path(__file__).resolve().parent.parent.parent
 if str(_ROOT) not in sys.path:
@@ -53,11 +55,12 @@ MAX_PROMPT_LENGTH = 768
 MAX_COMPLETION_LENGTH = 280
-def _build_prompt_dataset(total_episodes: int, domain: str = "devtools") -> Dataset:
     """One observation per episode, reset fresh so scenarios vary.
-    The ``domain`` filter constrains the curriculum to a single domain so
-    training is focused. Pass ``None`` for a mixed run.
     """
     env = PermanenceEnv(config={"domain": domain})
     rows = []
@@ -71,7 +74,7 @@ def _build_prompt_dataset(total_episodes: int, domain: str = "devtools") -> Data
                 "seed": ep,
             }
         )
-    return Dataset.from_list(rows)
 def _make_task_reward(artifacts_dir: Path):
@@ -149,6 +152,11 @@ def run_grpo(
     sft_dir: Path = SFT_DIR,
     grpo_dir: Path = GRPO_DIR,
 ) -> Dict[str, Any]:
     grpo_dir.mkdir(parents=True, exist_ok=True)
     adapter_dir = sft_dir / "adapter"
     if not adapter_dir.exists():
@@ -164,8 +172,6 @@ def run_grpo(
                 "Fix SFT or bump warmup traces before running GRPO."
             )
-    from unsloth import FastLanguageModel as _FLM
     model, tokenizer = _FLM.from_pretrained(
         model_name=str(adapter_dir),
         max_seq_length=MAX_PROMPT_LENGTH + MAX_COMPLETION_LENGTH,
@@ -205,7 +211,8 @@ def run_grpo(
         max_grad_norm=config.gradient_clip,
     )
-    prompt_dataset = _build_prompt_dataset(config.total_episodes, domain=config.domain)
     trainer = GRPOTrainer(
         model=model,
         reward_funcs=all_reward_funcs,

 from pathlib import Path
 from typing import Any, Dict, List, Optional
+# IMPORTANT: unsloth / trl / datasets are imported INSIDE ``run_grpo`` so this
+# module is importable on machines without a GPU. The pure-python helpers
+# below (``_build_prompt_records``, ``_make_task_reward``) are therefore
+# fully unit-testable without those heavy packages. This is what the Run 5
+# round 2 crash taught us: the reward-function glue code must be exercised
+# in the local test suite.
 _ROOT = Path(__file__).resolve().parent.parent.parent
 if str(_ROOT) not in sys.path:
 MAX_COMPLETION_LENGTH = 280
+def _build_prompt_records(total_episodes: int, domain: str = "devtools") -> List[Dict[str, Any]]:
     """One observation per episode, reset fresh so scenarios vary.
+    Returns plain list of dicts — ``run_grpo`` wraps these into a
+    ``datasets.Dataset`` before handing to TRL. Splitting the two concerns
+    keeps this function testable without the heavy ``datasets`` dependency.
     """
     env = PermanenceEnv(config={"domain": domain})
     rows = []
                 "seed": ep,
             }
         )
+    return rows
 def _make_task_reward(artifacts_dir: Path):
     sft_dir: Path = SFT_DIR,
     grpo_dir: Path = GRPO_DIR,
 ) -> Dict[str, Any]:
+    # Heavy imports deferred so the module is importable without a GPU.
+    from unsloth import FastLanguageModel as _FLM  # noqa: F401 — patches trl
+    from datasets import Dataset
+    from trl import GRPOConfig, GRPOTrainer
     grpo_dir.mkdir(parents=True, exist_ok=True)
     adapter_dir = sft_dir / "adapter"
     if not adapter_dir.exists():
                 "Fix SFT or bump warmup traces before running GRPO."
             )
     model, tokenizer = _FLM.from_pretrained(
         model_name=str(adapter_dir),
         max_seq_length=MAX_PROMPT_LENGTH + MAX_COMPLETION_LENGTH,
         max_grad_norm=config.gradient_clip,
     )
+    prompt_records = _build_prompt_records(config.total_episodes, domain=config.domain)
+    prompt_dataset = Dataset.from_list(prompt_records)
     trainer = GRPOTrainer(
         model=model,
         reward_funcs=all_reward_funcs,

training/stages/stage_4_eval.py CHANGED Viewed

@@ -29,7 +29,8 @@ import sys
 from pathlib import Path
 from typing import Any, Callable, Dict, List, Optional, Tuple
-from unsloth import FastLanguageModel  # noqa: F401
 _ROOT = Path(__file__).resolve().parent.parent.parent
 if str(_ROOT) not in sys.path:

 from pathlib import Path
 from typing import Any, Callable, Dict, List, Optional, Tuple
+# Heavy deps loaded inside ``run_eval`` so this module stays importable
+# without a GPU.
 _ROOT = Path(__file__).resolve().parent.parent.parent
 if str(_ROOT) not in sys.path: