Spaces:
Running
feat: RFC 005 interactive rollout wrapper + multi-turn GRPO training
rollout_wrapper.py:
- run_episode() runs a full interactive episode via vLLM
- model generates ONE tool call at a time, sees tool result, then decides next
- captures (context, completion, logprobs) per turn as a Trajectory
- true reactive multi-turn — not blind planning
train_rfc005.py:
- collects N_EPISODES in parallel via ThreadPoolExecutor
- re-scores each turn with HF model for differentiable logprobs
- GRPO loss = -advantage * sum(logprobs across all turns in episode)
- Unsloth syncs HF weights → vLLM after each optimizer.step() automatically
Upgrade from train.py:
before: model generates all tool calls at once, never sees results
now: model reacts to each tool result before deciding the next call
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
- training/rollout_wrapper.py +197 -0
- training/train_rfc005.py +160 -0
|
@@ -0,0 +1,197 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
RFC 005 interactive rollout wrapper.
|
| 3 |
+
|
| 4 |
+
Runs a full multi-turn episode where the model sees tool results at each step.
|
| 5 |
+
Unlike the single-completion approach in train.py, the model:
|
| 6 |
+
- generates ONE tool call at a time
|
| 7 |
+
- sees the actual result before deciding the next move
|
| 8 |
+
- is reactive, not planning blind
|
| 9 |
+
|
| 10 |
+
Returns a Trajectory: list of (context, completion, logprobs) per turn + final reward.
|
| 11 |
+
The training loop re-scores each turn with the HF model to get differentiable logprobs
|
| 12 |
+
and computes GRPO loss across the full trajectory.
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
import json
|
| 16 |
+
import os
|
| 17 |
+
import requests
|
| 18 |
+
from dataclasses import dataclass, field
|
| 19 |
+
|
| 20 |
+
ENV_URL = os.environ.get("ENV_URL", "https://http--moa-rl-env--7b2fgcxb6gxp.code.run")
|
| 21 |
+
VLLM_URL = os.environ.get("VLLM_URL", "http://localhost:8001")
|
| 22 |
+
MODEL_NAME = os.environ.get("MODEL_NAME", "unsloth/gpt-oss-20b-instruct")
|
| 23 |
+
MAX_TURNS = 8
|
| 24 |
+
TIMEOUT = 120
|
| 25 |
+
|
| 26 |
+
SYSTEM_PROMPT = """\
|
| 27 |
+
You are a TypeScript coding agent. Fix broken source files using tools.
|
| 28 |
+
|
| 29 |
+
Emit exactly ONE tool call per response as a JSON object on its own line:
|
| 30 |
+
{"tool": "read", "params": {"path": "src/foo.ts"}}
|
| 31 |
+
{"tool": "edit", "params": {"path": "src/foo.ts", "old_string": "...", "new_string": "..."}}
|
| 32 |
+
{"tool": "bash", "params": {"cmd": "npx tsc --noEmit 2>&1 | head -10"}}
|
| 33 |
+
{"tool": "submit", "params": {}}
|
| 34 |
+
|
| 35 |
+
One JSON object. No prose. No markdown fences.\
|
| 36 |
+
"""
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
@dataclass
class Turn:
    """One model generation step within an episode."""
    # Full conversation context fed into this generation (a snapshot taken
    # at generation time, not a live reference to the evolving message list).
    messages: list[dict]
    # Raw text the model generated at this step.
    completion: str
    # Per-token logprobs returned by vLLM (for reference only; training
    # re-scores each turn with the HF model for differentiable logprobs).
    logprobs: list[float]
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
@dataclass
class Trajectory:
    """A complete episode: sequence of turns + final reward."""
    # Turns in generation order; default_factory avoids a shared mutable default.
    turns: list[Turn] = field(default_factory=list)
    # Scalar episode reward set when the episode ends (0.0 when the model
    # fails to produce a valid tool call).
    reward: float = 0.0
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
# ── env helpers ────────────────────────────────────────────────────────────────
|
| 55 |
+
|
| 56 |
+
def _env_reset() -> dict:
    """Reset the remote environment and return the initial observation dict."""
    resp = requests.post(f"{ENV_URL}/reset", json={}, timeout=TIMEOUT)
    resp.raise_for_status()
    payload = resp.json()
    # Some responses wrap the observation under an "observation" key;
    # fall back to the whole payload when they don't.
    return payload.get("observation", payload)
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
def _env_step(tool: str, params: dict) -> dict:
    """
    Execute one tool action against the env.

    Returns the observation dict with the step reward merged in under "reward"
    (0.0 when the response carries no reward).
    """
    body = {"action": {"tool": tool, "params": params}}
    resp = requests.post(f"{ENV_URL}/step", json=body, timeout=TIMEOUT)
    resp.raise_for_status()

    payload = resp.json()
    observation = payload.get("observation", payload)
    # Reward lives at the top level of the response; surface it on the
    # observation so callers only deal with one dict.
    observation["reward"] = payload.get("reward", 0.0)
    return observation
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
# ── vLLM generation ────────────────────────────────────────────────────────────
|
| 77 |
+
|
| 78 |
+
def _vllm_generate(messages: list[dict]) -> tuple[str, list[float]]:
    """
    Request one chat completion from the vLLM server with logprobs enabled.

    Returns (completion_text, per_token_logprobs).
    """
    payload = {
        "model": MODEL_NAME,
        "messages": messages,
        "max_tokens": 256,
        "temperature": 0.7,
        "logprobs": True,
        "top_logprobs": 1,
    }
    resp = requests.post(
        f"{VLLM_URL}/v1/chat/completions",
        json=payload,
        timeout=TIMEOUT,
    )
    resp.raise_for_status()

    choice = resp.json()["choices"][0]
    text = choice["message"]["content"]
    # OpenAI-style schema: choice.logprobs.content is a list of per-token
    # entries, each carrying a "logprob" field.
    entries = choice.get("logprobs", {}).get("content", [])
    token_logprobs = [entry["logprob"] for entry in entries] if entries else []
    return text, token_logprobs
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
# ── prompt helpers ─────────────────────────────────────────────────────────────
|
| 105 |
+
|
| 106 |
+
def _initial_messages(obs: dict) -> list[dict]:
    """Build the system + opening user message from a reset observation."""
    triggers = obs.get("user_messages", [])
    preamble = ""
    if triggers:
        quoted = "\n".join(f" > {m}" for m in triggers)
        preamble = "User messages that triggered this task:\n" + quoted + "\n\n"

    # Test file content is truncated so the opening prompt stays bounded.
    task_brief = (
        f"{preamble}"
        f"Task: {obs['task']}\n\n"
        f"File to fix: {obs['broken_file_path']}\n\n"
        "Tests that must pass:\n"
        f"```ts\n{obs.get('test_file_content', '')[:1500]}\n```\n\n"
        "Start by reading the file."
    )
    return [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": task_brief},
    ]
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
def _parse_tool_call(text: str) -> tuple[str, dict] | None:
|
| 128 |
+
for line in text.splitlines():
|
| 129 |
+
line = line.strip()
|
| 130 |
+
if not line.startswith("{"):
|
| 131 |
+
continue
|
| 132 |
+
try:
|
| 133 |
+
obj = json.loads(line)
|
| 134 |
+
if "tool" in obj and "params" in obj:
|
| 135 |
+
return obj["tool"], obj["params"]
|
| 136 |
+
except json.JSONDecodeError:
|
| 137 |
+
pass
|
| 138 |
+
return None
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
# ── episode runner ─────────────────────────────────────────────────────────────
|
| 142 |
+
|
| 143 |
+
def run_episode() -> Trajectory:
    """
    Run one full interactive episode.

    At each turn the model sees all previous tool results — true reactive multi-turn.
    Captures logprobs at every generation step so GRPO loss can be computed
    across the full trajectory.

    Difference from single-completion train.py:
      Before: model generates ALL tool calls blindly upfront
      Now:    model generates ONE tool call, sees the result, then decides next move

    Returns:
        Trajectory with one Turn per generation and the final episode reward
        (0.0 when the model emits no parseable tool call).
    """
    traj = Trajectory()
    obs = _env_reset()
    messages = _initial_messages(obs)

    for _ in range(MAX_TURNS):
        completion, logprobs = _vllm_generate(messages)

        # Record the turn BEFORE parsing, so even a malformed final
        # completion is kept in the trajectory for training.
        traj.turns.append(Turn(
            messages = list(messages),  # snapshot of context at this step
            # (shallow copy suffices: message dicts are only appended, never mutated)
            completion = completion,
            logprobs = logprobs,
        ))

        parsed = _parse_tool_call(completion)
        if parsed is None:
            # Model produced no valid tool call — end with zero reward
            traj.reward = 0.0
            return traj

        tool, params = parsed

        # Append model turn to conversation
        messages.append({"role": "assistant", "content": completion})

        # Execute against env
        step_obs = _env_step(tool, params)
        done = step_obs.get("done", False)

        if done:
            traj.reward = step_obs.get("reward", 0.0)
            return traj

        # Feed tool result back so model can react to it.
        # Injected as a "user" message since the chat API has no tool role here.
        tool_result = step_obs.get("tool_result", "")
        messages.append({
            "role": "user",
            "content": f"[{tool} result]\n{tool_result}",
        })

    # Max turns hit — force submit so the episode still yields a terminal reward
    obs_final = _env_step("submit", {})
    traj.reward = obs_final.get("reward", 0.0)
    return traj
|
|
@@ -0,0 +1,160 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
RFC 005 training loop β true interactive multi-turn GRPO.
|
| 3 |
+
|
| 4 |
+
The model generates one tool call at a time and sees tool results before
|
| 5 |
+
deciding the next move. This is what train.py can't do with standard GRPOTrainer.
|
| 6 |
+
|
| 7 |
+
How it works:
|
| 8 |
+
1. rollout_wrapper.run_episode() runs N parallel episodes via vLLM
|
| 9 |
+
- at each turn: generate β execute tool β inject result β continue
|
| 10 |
+
- captures (context, completion, vllm_logprobs) per turn
|
| 11 |
+
2. HF model re-scores each turn: forward pass on (context, completion)
|
| 12 |
+
β differentiable token logprobs
|
| 13 |
+
3. GRPO loss:
|
| 14 |
+
advantage_i = (reward_i - mean_reward) / (std_reward + 1e-8)
|
| 15 |
+
loss = -mean( advantage_i * sum(logprob of tokens in turn t, for all t in episode i) )
|
| 16 |
+
4. optimizer.step()
|
| 17 |
+
5. Unsloth syncs updated HF weights β vLLM automatically
|
| 18 |
+
|
| 19 |
+
The key upgrade over train.py:
|
| 20 |
+
train.py β model plans blind (generates all tool calls at once, never sees results)
|
| 21 |
+
this file β model reacts (one call at a time, sees actual output each step)
|
| 22 |
+
"""
|
| 23 |
+
|
| 24 |
+
import os
|
| 25 |
+
import torch
|
| 26 |
+
import torch.nn.functional as F
|
| 27 |
+
from concurrent.futures import ThreadPoolExecutor
|
| 28 |
+
from unsloth import FastLanguageModel
|
| 29 |
+
|
| 30 |
+
from rollout_wrapper import run_episode, Trajectory
|
| 31 |
+
|
| 32 |
+
MODEL_NAME = os.environ.get("MODEL_NAME", "unsloth/gpt-oss-20b-instruct")
|
| 33 |
+
OUTPUT_DIR = os.environ.get("OUTPUT_DIR", "/output/moa-rl-grpo-rfc005")
|
| 34 |
+
N_EPISODES = int(os.environ.get("N_EPISODES", "4")) # episodes per training step (GRPO needs variance)
|
| 35 |
+
MAX_STEPS = int(os.environ.get("MAX_STEPS", "300"))
|
| 36 |
+
LR = float(os.environ.get("LR", "5e-6"))
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
# ── model ──────────────────────────────────────────────────────────────────────
|
| 40 |
+
|
| 41 |
+
print(f"Loading {MODEL_NAME}...")
|
| 42 |
+
model, tokenizer = FastLanguageModel.from_pretrained(
|
| 43 |
+
model_name = MODEL_NAME,
|
| 44 |
+
max_seq_length = 4096,
|
| 45 |
+
load_in_4bit = False,
|
| 46 |
+
dtype = torch.bfloat16,
|
| 47 |
+
)
|
| 48 |
+
model = FastLanguageModel.get_peft_model(
|
| 49 |
+
model,
|
| 50 |
+
r = 16,
|
| 51 |
+
lora_alpha = 16,
|
| 52 |
+
target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
|
| 53 |
+
"gate_proj", "up_proj", "down_proj"],
|
| 54 |
+
use_gradient_checkpointing = "unsloth",
|
| 55 |
+
random_state = 42,
|
| 56 |
+
)
|
| 57 |
+
|
| 58 |
+
# Start vLLM inside Unsloth (syncs weights automatically after each optimizer step)
|
| 59 |
+
from unsloth import PatchFastRL
|
| 60 |
+
PatchFastRL("GRPO", FastLanguageModel)
|
| 61 |
+
|
| 62 |
+
optimizer = torch.optim.AdamW(model.parameters(), lr=LR)
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
# ── GRPO loss over a trajectory ────────────────────────────────────────────────
|
| 66 |
+
|
| 67 |
+
def score_turn(messages: list[dict], completion: str) -> torch.Tensor:
    """
    Re-score one turn with the HF model to get differentiable token logprobs.

    vLLM logprobs are used for episode collection (fast generation).
    HF logprobs are used here for the actual gradient update.

    Args:
        messages: conversation context fed into this generation.
        completion: text the model generated for this turn.

    Returns:
        0-dim tensor — total logprob of the completion tokens, differentiable
        w.r.t. model parameters whenever grad mode is enabled by the caller.
    """
    # Format the context exactly as the model was trained on.
    prompt_text = tokenizer.apply_chat_template(
        messages,
        tokenize = False,
        add_generation_prompt = True,
    )
    full_text = prompt_text + completion

    # BUGFIX: the chat template text already contains any special tokens, so
    # disable add_special_tokens for BOTH encodings. Previously the default
    # tokenizer behavior could prepend e.g. a BOS to each encoding, making
    # prompt_len misaligned with the full sequence and mis-slicing the
    # completion tokens by one.
    inputs = tokenizer(
        full_text, return_tensors="pt", add_special_tokens=False
    ).to(model.device)
    prompt_len = tokenizer(
        prompt_text, return_tensors="pt", add_special_tokens=False
    )["input_ids"].shape[1]

    # Gradients must flow through this forward pass for GRPO. The previous
    # conditional no_grad/enable_grad wrapper was a no-op during training and
    # silently killed gradients in eval mode; the caller controls grad mode.
    logits = model(**inputs).logits  # (1, seq_len, vocab)

    # Only score the completion tokens: logits at position i predict token i+1,
    # hence the -1 shift on the logits slice.
    comp_logits = logits[0, prompt_len - 1 : -1, :]  # (comp_len, vocab)
    comp_ids = inputs["input_ids"][0, prompt_len:]   # (comp_len,)

    log_probs = F.log_softmax(comp_logits, dim=-1)
    token_lps = log_probs.gather(1, comp_ids.unsqueeze(1)).squeeze(1)
    return token_lps.sum()  # scalar: total logprob of this completion
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
def grpo_loss(trajectories: list[Trajectory]) -> torch.Tensor:
    """
    Compute GRPO loss across N trajectories.

    advantage_i = (reward_i - mean) / (std + 1e-8)
    loss = -mean_i( advantage_i * sum_t( logprob(turn t in episode i) ) )

    Args:
        trajectories: episodes collected this step (must be non-empty).

    Returns:
        0-dim loss tensor, differentiable through score_turn's forward passes.
    """
    rewards = torch.tensor([t.reward for t in trajectories], dtype=torch.float32)
    mean_r = rewards.mean()
    # BUGFIX: use the population std (unbiased=False). The default .std()
    # divides by n-1, which is NaN for a single trajectory (N_EPISODES is
    # env-configurable) and does not match the GRPO advantage formulation.
    std_r = rewards.std(unbiased=False) + 1e-8
    advantages = (rewards - mean_r) / std_r

    losses = []
    for traj, adv in zip(trajectories, advantages):
        # Sum (differentiable) logprobs across all turns in this episode.
        total_lp = sum(
            score_turn(turn.messages, turn.completion)
            for turn in traj.turns
        )
        losses.append(-adv * total_lp)

    return torch.stack(losses).mean()
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
# ── training loop ──────────────────────────────────────────────────────────────
|
| 123 |
+
|
| 124 |
+
print(f"RFC 005 training: {N_EPISODES} episodes/step Γ {MAX_STEPS} steps")
|
| 125 |
+
print(f"Model: {MODEL_NAME} β {OUTPUT_DIR}")
|
| 126 |
+
|
| 127 |
+
for step in range(MAX_STEPS):
|
| 128 |
+
model.train()
|
| 129 |
+
|
| 130 |
+
# Collect N episodes in parallel via vLLM
|
| 131 |
+
with ThreadPoolExecutor(max_workers=N_EPISODES) as pool:
|
| 132 |
+
trajectories = list(pool.map(lambda _: run_episode(), range(N_EPISODES)))
|
| 133 |
+
|
| 134 |
+
rewards = [t.reward for t in trajectories]
|
| 135 |
+
mean_r = sum(rewards) / len(rewards)
|
| 136 |
+
|
| 137 |
+
# GRPO loss + optimizer step
|
| 138 |
+
loss = grpo_loss(trajectories)
|
| 139 |
+
optimizer.zero_grad()
|
| 140 |
+
loss.backward()
|
| 141 |
+
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
|
| 142 |
+
optimizer.step()
|
| 143 |
+
|
| 144 |
+
# Unsloth automatically syncs updated weights β vLLM after optimizer.step()
|
| 145 |
+
|
| 146 |
+
print(
|
| 147 |
+
f"step {step+1:4d}/{MAX_STEPS} | "
|
| 148 |
+
f"loss {loss.item():.4f} | "
|
| 149 |
+
f"rewards {[f'{r:.2f}' for r in rewards]} | "
|
| 150 |
+
f"mean {mean_r:.3f}"
|
| 151 |
+
)
|
| 152 |
+
|
| 153 |
+
if (step + 1) % 50 == 0:
|
| 154 |
+
model.save_pretrained(f"{OUTPUT_DIR}/step-{step+1}")
|
| 155 |
+
tokenizer.save_pretrained(f"{OUTPUT_DIR}/step-{step+1}")
|
| 156 |
+
print(f" β checkpoint saved")
|
| 157 |
+
|
| 158 |
+
model.save_pretrained(OUTPUT_DIR)
|
| 159 |
+
tokenizer.save_pretrained(OUTPUT_DIR)
|
| 160 |
+
print(f"Done. Saved to {OUTPUT_DIR}")
|