Addyk24 committed on
Commit
ffeaf35
·
1 Parent(s): 4d0182e

Added RL environment training script using Unsloth and HF GPU cloud

Files changed (1)
  1. grpo_train.py +644 -0
grpo_train.py ADDED
@@ -0,0 +1,644 @@
+ """
+ grpo_train.py — State-Based GRPO for Project Polymath
+ ======================================================
+ Trains an LLM to negotiate with expert stakeholders using proper
+ Group Relative Policy Optimization with weight updates.
+
+ THE KEY INSIGHT (State-Based GRPO):
+ TRL's GRPOTrainer is single-turn. Our environment is multi-turn.
+ Solution: treat every (state, next_action) pair as its own training prompt.
+ The model learns: "given THIS game state, what is the best next action?"
+
+ Instead of rolling out full episodes, we:
+ 1. Build a dataset of negotiation states (from oracle + your JSON topics)
+ 2. For each state, sample G=8 completions from the model
+ 3. Run each completion through the environment for ONE step
+ 4. Use GRPO advantage to update weights toward better single-step decisions
+ 5. Repeat across all states — the model learns the full strategy implicitly
+
+ USAGE:
+     # Pre-hackathon: verify the pipeline (no GPU needed)
+     python grpo_train.py --dry-run --states 10
+
+     # On-site Day 1 with HF GPU credits (the real run):
+     python grpo_train.py --use-unsloth --epochs 3 --states 50
+
+     # Without Unsloth (slower but works):
+     python grpo_train.py --model Qwen/Qwen2.5-1.5B-Instruct --epochs 3
+ """
+
+ from __future__ import annotations
+
+ import argparse
+ import json
+ import os
+ import re
+ import time
+ from pathlib import Path
+ from typing import Optional
+
+ from dotenv import load_dotenv
+
+ load_dotenv()
+
+ # ── Deps ───────────────────────────────────────────────────────────────────────
+ try:
+     import matplotlib
+     matplotlib.use("Agg")  # non-interactive backend for servers
+     import matplotlib.pyplot as plt
+     HAS_PLT = True
+ except ImportError:
+     HAS_PLT = False
+
+ try:
+     from unsloth import FastLanguageModel
+     HAS_UNSLOTH = True
+ except ImportError:
+     HAS_UNSLOTH = False
+
+ try:
+     from trl import GRPOConfig, GRPOTrainer
+     HAS_TRL = True
+ except Exception:  # broad on purpose: trl can fail in CPU-only dry-run setups
+     HAS_TRL = False
+
+ try:
+     from datasets import Dataset
+     HAS_DATASETS = True
+ except ImportError:
+     HAS_DATASETS = False
+
+ try:
+     from transformers import AutoModelForCausalLM, AutoTokenizer
+     HAS_TRANSFORMERS = True
+ except ImportError:
+     HAS_TRANSFORMERS = False
+
+ # ── Local imports ──────────────────────────────────────────────────────────────
+ from envs.environment import WorkSpaceEnvironment
+ from models.schemas import WorkSpaceAction
+
+ # ── Constants ──────────────────────────────────────────────────────────────────
+ TOPICS_FILE = Path("ai_pm_prompts.json")
+ OUTPUT_DIR = Path("artifacts/grpo_state_based")
+
+ # The three hidden constraints — static for easy/medium mode
+ HIDDEN_CONSTRAINTS = {
+     "Finance": "Budget must not exceed $50k.",
+     "Security": "Must include biometric 2FA.",
+     "UX": "Checkout must be a single click.",
+ }
+
+ # ── Action templates the model should learn to produce ─────────────────────────
+ ORACLE_ACTIONS = {
+     "ask_finance": json.dumps({
+         "action_type": "message_expert", "target": "Finance",
+         "content": "What is the hard budget ceiling the PRD must respect for launch?"
+     }),
+     "ask_security": json.dumps({
+         "action_type": "message_expert", "target": "Security",
+         "content": "What authentication controls must the PRD include? Is biometric 2FA required?"
+     }),
+     "ask_ux": json.dumps({
+         "action_type": "message_expert", "target": "UX",
+         "content": "What checkout experience is required? Should we target a single-click flow?"
+     }),
+     "propose_draft": json.dumps({
+         "action_type": "propose_draft", "target": "All",
+         "content": (
+             "PRD Draft:\n"
+             "1. Budget: Launch scope capped at $50k.\n"
+             "2. Security: Biometric 2FA required for login and sensitive actions.\n"
+             "3. UX: Single-click checkout flow."
+         ),
+     }),
+     "submit_final": json.dumps({
+         "action_type": "submit_final", "target": None,
+         "content": (
+             "Final PRD:\n"
+             "1. Budget cap: All launch costs must stay at or below $50k.\n"
+             "2. Security: The app must enforce biometric 2FA for all authentication.\n"
+             "3. UX: Checkout must be implemented as a true single-click experience."
+         ),
+     }),
+ }
+
+
+ # ── Utilities ──────────────────────────────────────────────────────────────────
+
+ def load_topics(limit: int = 50) -> list[str]:
+     if TOPICS_FILE.exists():
+         with TOPICS_FILE.open() as f:
+             return json.load(f)[:limit]
+     return [
+         "Draft a Mobile App PRD for a FinTech startup targeting emerging markets.",
+         "Build an AI-driven healthcare platform for enterprise customers.",
+         "Create a SaaS analytics tool for regulatory-heavy industries.",
+         "Design a gaming platform for Gen Z users with real-time features.",
+         "Develop a cross-platform product for low-bandwidth regions.",
+     ]
+
+
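+ # ai_pm_prompts.json is assumed to be a flat JSON array of topic strings, e.g.:
+ #   ["Draft a Mobile App PRD for a FinTech startup targeting emerging markets.",
+ #    "Build an AI-driven healthcare platform for enterprise customers."]
+
+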
+ def parse_action(text: str) -> Optional[WorkSpaceAction]:
+     """Parse a JSON action from model output. Returns None on failure."""
+     try:
+         match = re.search(r'\{[^{}]*"action_type"[^{}]*\}', text, re.DOTALL)
+         if not match:
+             return None
+         return WorkSpaceAction(**json.loads(match.group(0)))
+     except Exception:
+         return None
+
+
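+ # Example (illustrative): given a completion like
+ #   'Sure! {"action_type": "message_expert", "target": "Finance", "content": "Budget?"}'
+ # parse_action extracts the first JSON object containing "action_type" and
+ # validates it as a WorkSpaceAction; anything else (no JSON, nested braces,
+ # schema mismatch) yields None, which the reward function penalizes at -0.5.
+
+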
+ def format_discovered(env: WorkSpaceEnvironment) -> str:
+     lines = []
+     for name, expert in env.state().experts.items():
+         status = "✓ DISCOVERED" if expert.constraint_discovered_by_agent else "? unknown"
+         lines.append(f" {name}: {status}")
+     return "\n".join(lines)
+
+
+ # ── State-Based Prompt Builder ─────────────────────────────────────────────────
+
+ AGENT_SYSTEM_PROMPT = """You are an expert AI Project Manager in a multi-stakeholder negotiation.
+
+ TASK: Produce a final PRD that satisfies ALL three experts — Finance, Security, and UX.
+ Each expert holds a hidden constraint you must discover through targeted questions.
+
+ STRATEGY:
+ 1. Message each expert INDIVIDUALLY (not "All") to discover their constraint.
+ 2. Once all constraints are known, propose a draft.
+ 3. Refine if needed, then submit_final before turn 15.
+
+ ANTI-PATTERNS (will be penalized):
+ - Broadcasting to "All" when gathering requirements → -1.0 penalty
+ - Repeating a question already answered → -0.5 penalty
+ - Submitting without discovering constraints → low harmonic mean score
+
+ CURRENT DISCOVERED CONSTRAINTS:
+ {discovered}
+
+ Respond with ONLY valid JSON, nothing else:
+ {{"action_type": "message_expert" | "propose_draft" | "submit_final",
+   "target": "Finance" | "Security" | "UX" | "All" | null,
+   "content": "your message"}}"""
+
+
+ def build_state_prompt(
+     topic: str,
+     turn: int,
+     feedback_so_far: str,
+     discovered: str,
+     conversation_history: str = "",
+ ) -> str:
+     """
+     Build a prompt representing a specific game state.
+     This is what gets fed to GRPOTrainer as the 'prompt' field.
+     """
+     system = AGENT_SYSTEM_PROMPT.format(discovered=discovered)
+
+     user_content = (
+         f"NEGOTIATION TASK: {topic}\n\n"
+         f"TURN: {turn}/15\n\n"
+     )
+
+     if conversation_history:
+         user_content += f"CONVERSATION SO FAR:\n{conversation_history}\n\n"
+
+     user_content += f"LATEST FEEDBACK:\n{feedback_so_far}\n\nWhat is your next action?"
+
+     # Format as a chat-template string — GRPOTrainer expects a plain string prompt
+     return f"<|system|>\n{system}\n<|user|>\n{user_content}\n<|assistant|>\n"
+
+
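+ # Optional alternative (sketch, not used by this script): instead of the generic
+ # <|system|>/<|user|> tags above, the base model's native chat format could be
+ # produced with the tokenizer itself, e.g.:
+ #   prompt = tokenizer.apply_chat_template(
+ #       [{"role": "system", "content": system},
+ #        {"role": "user", "content": user_content}],
+ #       tokenize=False, add_generation_prompt=True,
+ #   )
+
+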
+ # ── State Dataset Builder ──────────────────────────────────────────────────────
+
+ def build_state_dataset(topics: list[str], states_per_topic: int = 5) -> list[dict]:
+     """
+     Build a dataset of negotiation states using the EASY mode environment.
+     Each record represents one (state → optimal_action) training example.
+
+     We run oracle trajectories through the environment to get realistic
+     expert feedback, then snapshot the state at each turn.
+
+     This is the key fix: instead of hoping the model learns from full episodes,
+     we give it explicit training signal at every decision point.
+     """
+     env = WorkSpaceEnvironment(mode="easy")
+     records = []
+
+     # Oracle action sequence for easy mode
+     oracle_sequence = [
+         ("ask_finance", WorkSpaceAction(
+             action_type="message_expert", target="Finance",
+             content="What budget ceiling must the PRD respect?"
+         )),
+         ("ask_security", WorkSpaceAction(
+             action_type="message_expert", target="Security",
+             content="What authentication requirements must be included?"
+         )),
+         ("ask_ux", WorkSpaceAction(
+             action_type="message_expert", target="UX",
+             content="What checkout flow is required?"
+         )),
+         ("propose_draft", WorkSpaceAction(
+             action_type="propose_draft", target="All",
+             content="PRD: Budget at or below $50k. Biometric 2FA required. Single-click checkout."
+         )),
+         ("submit_final", WorkSpaceAction(
+             action_type="submit_final", target=None,
+             content="Final PRD: Budget capped at $50k. Biometric 2FA for auth. Single-click checkout."
+         )),
+     ]
+
+     for topic in topics:
+         obs = env.reset(topic)
+         conversation_history = ""
+         discovered = " Finance: ? unknown\n Security: ? unknown\n UX: ? unknown"
+
+         for step_idx, (action_key, oracle_action) in enumerate(oracle_sequence):
+             if obs.done:
+                 break
+
+             # Snapshot the state BEFORE taking the action
+             prompt = build_state_prompt(
+                 topic=topic,
+                 turn=obs.current_turn,
+                 feedback_so_far=obs.feedback,
+                 discovered=discovered,
+                 conversation_history=conversation_history,
+             )
+
+             records.append({
+                 "prompt": prompt,
+                 "topic": topic,
+                 "turn": obs.current_turn,
+                 "oracle_action": ORACLE_ACTIONS[action_key],
+                 # These metadata fields help with debugging and post-analysis
+                 "step_idx": step_idx,
+                 "discovered_before": discovered,
+             })
+
+             # Step forward with the oracle action to get the next state
+             obs = env.step(oracle_action)
+             conversation_history += (
+                 f"Turn {step_idx}: {oracle_action.action_type} → {oracle_action.target}\n"
+                 f"Feedback: {obs.feedback[:120]}...\n"
+             )
+             discovered = format_discovered(env)
+
+             if step_idx >= states_per_topic - 1:
+                 break
+
+     # Add negative-pattern states (what NOT to do)
+     records.extend(build_negative_states(topics[:5]))
+
+     print(f"Built {len(records)} training states from {len(topics)} topics")
+     return records
+
+
+ def build_negative_states(topics: list[str]) -> list[dict]:
+     """
+     States where the agent is in a bad situation (repeated question, wrong phase).
+     These teach the model to recover, not just follow the oracle.
+     """
+     negative_records = []
+
+     for topic in topics:
+         # State: Finance already answered, agent is about to repeat itself
+         prompt = build_state_prompt(
+             topic=topic,
+             turn=2,
+             feedback_so_far=(
+                 "Finance: As I mentioned, we have a strict $50k budget cap. "
+                 "This is the same answer I gave before."
+             ),
+             discovered=" Finance: ✓ DISCOVERED\n Security: ? unknown\n UX: ? unknown",
+             conversation_history=(
+                 "Turn 0: message_expert → Finance\n"
+                 "Feedback: Finance: The budget cap is $50k. Don't go over it.\n"
+                 "Turn 1: message_expert → Finance\n"
+                 "Feedback: Finance: I already told you — $50k. Ask someone else.\n"
+             ),
+         )
+         negative_records.append({
+             "prompt": prompt,
+             "topic": topic,
+             "turn": 2,
+             "oracle_action": ORACLE_ACTIONS["ask_security"],  # Should pivot to Security
+             "step_idx": -1,  # Negative example
+             "discovered_before": "Finance: ✓ DISCOVERED",
+         })
+
+     return negative_records
+
+
+ # ── Reward Function ────────────────────────────────────────────────────────────
+
+ def make_reward_fn():
+     """
+     Evaluates the model's actions instantly and locally.
+     No live API calls. No reward-hacking loopholes.
+     """
+     def reward_fn(completions: list[str], prompts: list[str], **kwargs) -> list[float]:
+         rewards = []
+
+         for completion, prompt in zip(completions, prompts):
+             action = parse_action(completion)
+
+             # 1. Formatting penalty
+             if action is None:
+                 rewards.append(-0.5)
+                 continue
+
+             reward = 0.0
+
+             # ── 2. Anti-pattern penalties ──
+
+             # Massive penalty for broadcasting (reward hacking)
+             if action.target == "All":
+                 reward -= 1.0
+
+             # Penalty for empty or trivially short content
+             if len((action.content or "").split()) < 5:
+                 reward -= 0.2
+
+             # ── 3. Heuristic state grading (no API calls!) ──
+
+             if action.action_type == "message_expert" and action.target != "All":
+                 # Did it ask a question it already knows the answer to?
+                 if f"{action.target}: ✓ DISCOVERED" in prompt:
+                     reward -= 0.5
+                 else:
+                     reward += 0.33  # Good job doing research!
+
+             elif action.action_type in ["propose_draft", "submit_final"]:
+                 # Did it try to submit before gathering all constraints?
+                 if "? unknown" in prompt:
+                     reward -= 1.0  # Heavy penalty for guessing
+                 else:
+                     # It did the research. Did it actually include the constraints?
+                     text = (action.content or "").lower()
+                     has_finance = "50" in text
+                     has_security = "biometric" in text
+                     has_ux = "click" in text or "tap" in text
+
+                     if has_finance and has_security and has_ux:
+                         reward += 1.5
+                     else:
+                         reward -= 0.5
+
+             rewards.append(reward)
+
+         return rewards
+     return reward_fn
+
+
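+ # Illustration (a minimal sketch, not called anywhere in the pipeline): how one
+ # group of rewards from reward_fn turns into GRPO advantages. TRL performs this
+ # normalization internally during training.
+ def _demo_group_advantages() -> list[float]:
+     fn = make_reward_fn()
+     group = [
+         ORACLE_ACTIONS["ask_finance"],  # ≈ +0.33 (useful research)
+         '{"action_type": "message_expert", "target": "All", "content": "Hi"}',  # ≈ -1.2
+         "not json at all",  # -0.5 (formatting penalty)
+     ]
+     rewards = fn(completions=group, prompts=[""] * len(group))
+     mean = sum(rewards) / len(rewards)
+     std = (sum((r - mean) ** 2 for r in rewards) / len(rewards)) ** 0.5
+     return [(r - mean) / (std + 1e-6) for r in rewards]  # oracle action wins the group
+
+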
+ # ── Plots ──────────────────────────────────────────────────────────────────────
+
+ def save_training_plots(log_history: list[dict], output_dir: Path):
+     if not HAS_PLT:
+         print("  matplotlib not available — skipping plots")
+         return
+
+     output_dir.mkdir(parents=True, exist_ok=True)
+
+     # Loss curve
+     loss_points = [
+         (e["step"], e["loss"])
+         for e in log_history
+         if "loss" in e and "step" in e
+     ]
+     if loss_points:
+         xs, ys = zip(*loss_points)
+         fig, ax = plt.subplots(figsize=(9, 4))
+         ax.plot(xs, ys, marker="o", linewidth=1.5, color="#4C72B0", markersize=4)
+         ax.set_xlabel("Training Step", fontsize=12)
+         ax.set_ylabel("GRPO Loss", fontsize=12)
+         ax.set_title(
+             "Project Polymath — GRPO Training Loss\n"
+             "(State-Based: each step = one negotiation decision)",
+             fontsize=12,
+         )
+         ax.grid(True, alpha=0.3)
+         plt.tight_layout()
+         plt.savefig(output_dir / "loss_curve.png", dpi=160)
+         plt.close()
+         print(f"  Saved: {output_dir}/loss_curve.png")
+
+     # Reward curve (from log history if available)
+     reward_points = [
+         (e["step"], e.get("reward", e.get("mean_reward", None)))
+         for e in log_history
+         if "step" in e and ("reward" in e or "mean_reward" in e)
+     ]
+     reward_points = [(s, r) for s, r in reward_points if r is not None]
+
+     if reward_points:
+         xs, ys = zip(*reward_points)
+         fig, ax = plt.subplots(figsize=(9, 4))
+         ax.plot(xs, ys, marker="s", linewidth=1.5, color="#55A868", markersize=4)
+         ax.set_xlabel("Training Step", fontsize=12)
+         ax.set_ylabel("Mean Reward", fontsize=12)
+         ax.set_title(
+             "Project Polymath — Mean Reward During GRPO Training\n"
+             "(Heuristic per-step reward for constraint discovery and coverage)",
+             fontsize=12,
+         )
+         ax.grid(True, alpha=0.3)
+         plt.tight_layout()
+         plt.savefig(output_dir / "reward_curve.png", dpi=160)
+         plt.close()
+         print(f"  Saved: {output_dir}/reward_curve.png")
+
+
+ # ── Main ───────────────────────────────────────────────────────────────────────
+
+ def main():
+     parser = argparse.ArgumentParser(description="State-Based GRPO — Project Polymath")
+
+     # Model
+     parser.add_argument("--model", default="unsloth/Qwen2.5-3B-Instruct-bnb-4bit",
+                         help="Base model to train")
+     parser.add_argument("--use-unsloth", action="store_true",
+                         help="Use Unsloth for 2x faster training (recommended on GPU)")
+
+     # Dataset
+     parser.add_argument("--states", type=int, default=40,
+                         help="Number of negotiation states to train on")
+     parser.add_argument("--states-per-topic", type=int, default=5,
+                         help="States to extract per topic (1-5)")
+     parser.add_argument("--topics-limit", type=int, default=20,
+                         help="Max topics to use from ai_pm_prompts.json")
+
+     # GRPO hyperparams
+     parser.add_argument("--group-size", type=int, default=8,
+                         help="G: completions per prompt for GRPO advantage (default: 8)")
+     parser.add_argument("--epochs", type=float, default=3.0)
+     parser.add_argument("--lr", type=float, default=5e-6,
+                         help="Learning rate (lower = safer; 5e-6 recommended for GRPO)")
+     parser.add_argument("--max-new-tokens", type=int, default=300)
+     parser.add_argument("--batch-size", type=int, default=1)
+     parser.add_argument("--grad-accum", type=int, default=4)
+     parser.add_argument("--max-seq-length", type=int, default=2048)
+
+     # Output
+     parser.add_argument("--output-dir", default=str(OUTPUT_DIR))
+     parser.add_argument("--dry-run", action="store_true",
+                         help="Build dataset and verify reward fn, skip actual training")
+
+     args = parser.parse_args()
+
+     output_dir = Path(args.output_dir)
+     output_dir.mkdir(parents=True, exist_ok=True)
+
+     # ── Build dataset ──────────────────────────────────────────────────────────
+     print("\n[1/4] Building state dataset...")
+     topics = load_topics(limit=args.topics_limit)
+     records = build_state_dataset(topics, states_per_topic=args.states_per_topic)
+     records = records[:args.states]
+
+     # Save dataset for inspection / reproducibility
+     dataset_path = output_dir / "state_dataset.jsonl"
+     with dataset_path.open("w") as f:
+         for r in records:
+             f.write(json.dumps(r, ensure_ascii=True) + "\n")
+     print(f"  Saved {len(records)} states → {dataset_path}")
+
+     if not HAS_DATASETS:
+         raise RuntimeError("pip install datasets")
+     dataset = Dataset.from_list([{"prompt": r["prompt"],
+                                   "topic": r["topic"],
+                                   "turn": r["turn"]} for r in records])
+
+     # ── Verify reward function ─────────────────────────────────────────────────
+     print("\n[2/4] Verifying reward function on 3 samples...")
+     reward_fn = make_reward_fn()
+
+     test_completions = [
+         ORACLE_ACTIONS["ask_finance"],  # Should score ~0.33
+         '{"action_type": "message_expert", "target": "All", "content": "Hi"}',  # Should score ~-1.2
+         "this is not JSON at all",  # Should score -0.5
+     ]
+     test_rewards = reward_fn(
+         completions=test_completions,
+         prompts=[""] * 3,
+         topic=[topics[0]] * 3,
+         turn=[0] * 3,
+     )
+     print(f"  Oracle action reward: {test_rewards[0]:.3f} (expected ~0.33)")
+     print(f"  Broadcast to All reward: {test_rewards[1]:.3f} (expected <= -1.0)")
+     print(f"  Malformed JSON reward: {test_rewards[2]:.3f} (expected -0.5)")
+
+     if args.dry_run:
+         print("\n[DRY RUN] Dataset and reward function verified. Skipping training.")
+         print("  Run without --dry-run on GPU to train.")
+         return
+
+     # TRL is only needed past this point, so --dry-run works without it installed
+     if not HAS_TRL:
+         raise RuntimeError("TRL is required for training: pip install trl>=0.8.0")
+
+     # ── Load model ─────────────────────────────────────────────────────────────
+     print(f"\n[3/4] Loading model: {args.model}")
+
+     if args.use_unsloth:
+         if not HAS_UNSLOTH:
+             raise RuntimeError("pip install unsloth OR remove --use-unsloth")
+         model, tokenizer = FastLanguageModel.from_pretrained(
+             model_name=args.model,
+             max_seq_length=args.max_seq_length,
+             load_in_4bit=True,
+             dtype=None,  # Auto-detect
+         )
+         model = FastLanguageModel.get_peft_model(
+             model,
+             r=16,
+             lora_alpha=32,
+             lora_dropout=0.0,
+             target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
+                             "gate_proj", "up_proj", "down_proj"],
+             use_gradient_checkpointing="unsloth",
+         )
+         print("  Unsloth LoRA loaded (4-bit quantization)")
+     else:
+         if not HAS_TRANSFORMERS:
+             raise RuntimeError("pip install transformers")
+         tokenizer = AutoTokenizer.from_pretrained(args.model)
+         if tokenizer.pad_token is None:
+             tokenizer.pad_token = tokenizer.eos_token
+         model = AutoModelForCausalLM.from_pretrained(args.model)
+         print("  Standard transformers model loaded")
+
+     # ── GRPO Training ──────────────────────────────────────────────────────────
+     print("\n[4/4] Starting GRPO training...")
+     print(f"  States: {len(records)} | Group size (G): {args.group_size}")
+     print(f"  Epochs: {args.epochs} | LR: {args.lr}")
+     print(f"  Total updates: ~{int(len(records) * args.epochs / args.batch_size)}")
+
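+     # NOTE (assumption, depends on your TRL version): GRPOTrainer typically
+     # requires the effective batch size (num processes × per-device batch size)
+     # to be divisible by num_generations, so with --group-size 8 on a single
+     # GPU you may need --batch-size 8 rather than the default of 1.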
+     config = GRPOConfig(
+         output_dir=str(output_dir),
+
+         # GRPO-specific
+         num_generations=args.group_size,            # G: sample this many completions per prompt
+         max_completion_length=args.max_new_tokens,  # TRL's name for max new tokens per action
+         temperature=0.8,                            # Exploration during training
+
+         # Standard training
+         learning_rate=args.lr,
+         num_train_epochs=args.epochs,
+         per_device_train_batch_size=args.batch_size,
+         gradient_accumulation_steps=args.grad_accum,
+
+         # Logging
+         logging_steps=1,
+         save_strategy="epoch",
+         report_to=[],  # Set to ["wandb"] if you have it configured
+     )
+
+     trainer = GRPOTrainer(
+         model=model,
+         processing_class=tokenizer,
+         args=config,
+         reward_funcs=reward_fn,  # ← Your environment's reward
+         train_dataset=dataset,
+     )
+
+     trainer.train()
+
+     # ── Save everything ────────────────────────────────────────────────────────
+     trainer.save_model(str(output_dir / "final_model"))
+     tokenizer.save_pretrained(str(output_dir / "final_model"))
+     print(f"\n  Model saved → {output_dir}/final_model")
+
+     # Save metrics
+     metrics_path = output_dir / "grpo_metrics.json"
+     with metrics_path.open("w") as f:
+         json.dump(trainer.state.log_history, f, indent=2)
+     print(f"  Metrics saved → {metrics_path}")
+
+     # Save plots
+     save_training_plots(trainer.state.log_history, output_dir)
+
+     # ── Summary ────────────────────────────────────────────────────────────────
+     log = trainer.state.log_history
+     losses = [e["loss"] for e in log if "loss" in e]
+     if losses:
+         print(f"\n  Initial loss: {losses[0]:.4f}")
+         print(f"  Final loss:   {losses[-1]:.4f}")
+         print(f"  Improvement:  {((losses[0] - losses[-1]) / losses[0] * 100):.1f}%")
+
+     print(f"\n{'='*60}")
+     print("  GRPO TRAINING COMPLETE")
+     print(f"  Model:   {output_dir}/final_model")
+     print(f"  Plots:   {output_dir}/loss_curve.png")
+     print(f"           {output_dir}/reward_curve.png")
+     print(f"  Metrics: {output_dir}/grpo_metrics.json")
+     print(f"{'='*60}")
+
+
+ if __name__ == "__main__":
+     main()