narcolepticchicken
/

occ-stack

ml-intern

Model card Files Files and versions

xet

Community

narcolepticchicken committed 25 days ago

Commit

83ce3a2

verified ·

1 Parent(s): 2f3b1f2

Upload scripts/grpo_train_occ.py

Browse files

Files changed (1) hide show

scripts/grpo_train_occ.py +185 -0

scripts/grpo_train_occ.py ADDED Viewed

	@@ -0,0 +1,185 @@

+#!/usr/bin/env python3
+"""OCC GRPO Training Demo — Minimal end-to-end GRPO with OCC reward hook.
+This script trains Qwen2.5-0.5B-Instruct with GRPO using a cost-adjusted
+marginal impact reward from OCC. The reward combines:
+  - Accuracy (is the answer correct?)
+  - Format (did the model think before answering?)
+  - Cost penalty (shorter completions are cheaper)
+  - Confident-wrong penalty (overconfident wrong answers are punished)
+Intended as a minimal demonstration: even 10-50 steps on a T4 proves the
+OCC reward integrates with TRL's GRPOTrainer.
+Usage:
+    uv run --with transformers --with trl --with torch --with datasets \
+      --with accelerate scripts/grpo_train_occ.py
+    Or via accelerate launch for multi-GPU:
+    accelerate launch scripts/grpo_train_occ.py
+"""
+import re
+import json
+import torch
+from datasets import Dataset, load_dataset
+from trl import GRPOTrainer, GRPOConfig
+# ═══════════════════════════════════════════════════════════════════
+# OCC Reward Function
+# ═══════════════════════════════════════════════════════════════════
# Pattern for the "answer is X" / "result = X" fallback; applied to lowercased text.
_ANSWER_PHRASE_RE = re.compile(r"(?:answer|result|solution)\s*(?:is|=)\s*([^\s,.]+)")
# Substrings (lowercase) treated as confident language.
_CONFIDENT_MARKERS = ("definitely", "certainly", "obviously", "clearly", "without doubt")
# Substrings (lowercase) treated as an explicit abstention.
_ABSTENTION_MARKERS = ("i don't know", "i do not know", "cannot determine", "uncertain", "not sure")


def _extract_boxed(text):
    r"""Return the stripped contents of the first ``\boxed{...}`` in *text*.

    Braces are matched by depth counting so nested groups such as
    ``\boxed{\frac{1}{2}}`` are returned whole. Returns ``None`` when there
    is no ``\boxed{`` or its braces never balance.
    """
    marker = "\\boxed{"
    start = text.find(marker)
    if start == -1:
        return None
    begin = start + len(marker)
    depth = 1
    for pos in range(begin, len(text)):
        ch = text[pos]
        if ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
            if depth == 0:
                return text[begin:pos].strip()
    return None


def occ_reward(completions, ground_truth=None, completion_ids=None, prompts=None, **kwargs):
    r"""
    OCC cost-adjusted reward function for GRPO.

    Reward components (summed per completion):
      - correctness: +1.0 if the extracted answer equals the ground truth,
        -1.0 if an answer was extracted but differs, -0.5 if no answer
        could be extracted
      - format: +0.1 if the completion contains "think" (case-insensitive;
        this also covers ``<think>`` tags)
      - cost_penalty: -0.001 per token (incentivizes conciseness)
      - confident_wrong_penalty: -0.5 extra if correctness is negative and
        the text uses confident language
      - abstention_bonus: +0.3 if the text contains an abstention phrase

    Args:
        completions: One entry per sample. A non-empty list whose first item
            is a dict is read as a conversational completion and its
            ``"content"`` is used; anything else is converted with ``str``.
        ground_truth: Reference answers aligned with ``completions``. When
            omitted, ``kwargs["solution"]`` is used instead so datasets that
            name the column ``solution`` still work. Entries are converted
            with ``str`` before comparison.
        completion_ids: Optional per-sample token id sequences; when given,
            their lengths are the token counts. Otherwise tokens are
            approximated by whitespace splitting.
        prompts: Accepted for interface compatibility; unused.
        **kwargs: Extra columns/hooks. If a truthy ``log_extra`` callable is
            present it is called once as
            ``log_extra("correctness", per_sample_correctness)``.

    Returns:
        list[float]: Total reward for each completion, in input order.
    """
    if ground_truth is None:
        ground_truth = kwargs.get("solution") or []

    rewards = []
    correctness_values = []
    for i, completion in enumerate(completions):
        # Extract content from conversational format
        if isinstance(completion, list) and completion and isinstance(completion[0], dict):
            content = str(completion[0].get("content") or "")
        else:
            content = str(completion)
        raw_gt = ground_truth[i] if i < len(ground_truth) else ""
        # Ground truths may be ints/floats in some datasets; compare as text.
        gt = "" if raw_gt is None else str(raw_gt).strip()
        content_lower = content.lower()

        # ── Correctness ──────────────────────────────────────
        # Prefer \boxed{...}; otherwise fall back to "answer is X" phrasing.
        boxed = _extract_boxed(content)
        if boxed is not None:
            final_answer = boxed
            expected = gt
        else:
            phrase_match = _ANSWER_PHRASE_RE.search(content_lower)
            final_answer = phrase_match.group(1).strip() if phrase_match else None
            # The fallback answer comes from lowercased text, so the ground
            # truth must be lowercased too or mixed-case answers never match.
            expected = gt.lower()
        if final_answer:
            correctness = 1.0 if final_answer == expected else -1.0
        else:
            # No answer extracted — penalize slightly
            correctness = -0.5

        # ── Format reward ────────────────────────────────────
        # An empty regex alternative (r"|think") matches every string, which
        # would make this bonus unconditional; use a plain substring test.
        has_think = "think" in content_lower
        format_reward = 0.1 if has_think else 0.0

        # ── Cost penalty ─────────────────────────────────────
        n_tokens = len(completion_ids[i]) if completion_ids else len(content.split())
        cost_penalty = -0.001 * n_tokens

        # ── Confident-wrong penalty ──────────────────────────
        is_confident = any(m in content_lower for m in _CONFIDENT_MARKERS)
        is_wrong = correctness < 0
        confident_wrong_penalty = -0.5 if (is_confident and is_wrong) else 0.0

        # ── Abstention bonus ─────────────────────────────────
        is_abstaining = any(m in content_lower for m in _ABSTENTION_MARKERS)
        abstention_bonus = 0.3 if is_abstaining else 0.0

        # ── Assemble total reward ────────────────────────────
        total = correctness + format_reward + cost_penalty + confident_wrong_penalty + abstention_bonus
        rewards.append(total)
        correctness_values.append(correctness)

    # Log the real correctness component rather than the sign of the total,
    # which also folds in format/cost/abstention terms.
    log_extra = kwargs.get("log_extra")
    if log_extra:
        log_extra("correctness", correctness_values)
    return rewards
+# ═══════════════════════════════════════════════════════════════════
+# Main Training
+# ═══════════════════════════════════════════════════════════════════
def main():
    """Run the minimal OCC-GRPO demo end to end.

    Steps:
      1. Load the first 200 examples of ``trl-lib/DeepMath-103K``; if that
         fails, build 200 synthetic addition problems instead.
      2. Configure a 20-step GRPO run and train ``Qwen/Qwen2.5-0.5B-Instruct``
         with ``occ_reward`` as the reward function.
      3. Save the model to ``./occ_grpo_output/final`` and write
         ``./occ_grpo_output/summary.json``.
    """
    model_name = "Qwen/Qwen2.5-0.5B-Instruct"
    output_dir = "./occ_grpo_output"

    print("[OCC-GRPO] Loading dataset...")
    # Using DeepMath-103K — standard conversational format for GRPO
    try:
        dataset = load_dataset("trl-lib/DeepMath-103K", split="train")
        # Take a small subset for quick demo
        dataset = dataset.select(range(min(200, len(dataset))))
        # The reward function takes the reference answer as `ground_truth`;
        # map the dataset's `solution` column onto that name when needed.
        columns = dataset.column_names
        if "ground_truth" not in columns and "solution" in columns:
            dataset = dataset.rename_column("solution", "ground_truth")
        print(f"[OCC-GRPO] Loaded {len(dataset)} examples from DeepMath-103K")
    except Exception as exc:
        # Deliberate best-effort: any load failure (offline, missing repo, ...)
        # falls back to synthetic data, but the cause is reported.
        print("[OCC-GRPO] DeepMath-103K not available, using synthetic dataset")
        print(f"[OCC-GRPO] Load error: {exc!r}")
        # Fallback: synthetic math problems
        pairs = [(i, j) for i in range(1, 20) for j in range(1, 20)][:200]
        data = [
            {
                "prompt": [{"role": "user", "content": f"What is {a} + {b}?"}],
                "ground_truth": str(a + b),
            }
            for a, b in pairs
        ]
        dataset = Dataset.from_list(data)

    # Only query bf16 support when CUDA exists, and never request fp16 on a
    # CPU-only host.
    cuda_available = torch.cuda.is_available()
    use_bf16 = cuda_available and torch.cuda.is_bf16_supported()
    use_fp16 = cuda_available and not use_bf16

    # Training config — minimal for quick demo
    training_args = GRPOConfig(
        output_dir=output_dir,
        per_device_train_batch_size=2,
        per_device_eval_batch_size=2,
        num_train_epochs=1,
        max_steps=20,  # Minimal demo
        logging_steps=5,
        save_steps=20,
        learning_rate=1e-6,
        bf16=use_bf16,
        fp16=use_fp16,
        gradient_checkpointing=True,
        gradient_accumulation_steps=2,
        max_completion_length=256,
        num_generations=4,  # G=4 completions per prompt
        report_to="none",   # Disable wandb/tensorboard for simplicity
        disable_tqdm=False,
        logging_strategy="steps",
        logging_first_step=True,
    )
    print(f"[OCC-GRPO] Model: {model_name}")
    print(f"[OCC-GRPO] Steps: {training_args.max_steps}")
    print(f"[OCC-GRPO] Generations per prompt: {training_args.num_generations}")
    print(f"[OCC-GRPO] Device: {'cuda' if cuda_available else 'cpu'}")

    trainer = GRPOTrainer(
        model=model_name,
        args=training_args,
        reward_funcs=occ_reward,
        train_dataset=dataset,
    )
    print("[OCC-GRPO] Starting training...")
    trainer.train()

    print("[OCC-GRPO] Saving model...")
    trainer.save_model(f"{output_dir}/final")

    # Save a summary
    summary = {
        "method": "GRPO with OCC cost-adjusted reward",
        "model": model_name,
        "reward_components": [
            "correctness (±1.0)",
            "format (+0.1 if thinking markers)",
            "cost_penalty (-0.001/token)",
            "confident_wrong_penalty (-0.5)",
            "abstention_bonus (+0.3)",
        ],
        "steps": training_args.max_steps,
        "generations_per_prompt": training_args.num_generations,
    }
    # Under `accelerate launch` every process runs main(); write the summary
    # from the main process only so ranks do not race on the same file.
    if trainer.is_world_process_zero():
        with open(f"{output_dir}/summary.json", "w", encoding="utf-8") as f:
            json.dump(summary, f, indent=2)
    print(f"[OCC-GRPO] Done. Output in {output_dir}/")


# Run the demo only when executed as a script, not when imported.
if __name__ == "__main__":
    main()