Delete grpo_train.py
grpo_train.py
DELETED
@@ -1,644 +0,0 @@
"""
grpo_train.py — State-Based GRPO for Project Polymath
======================================================
Trains an LLM to negotiate with expert stakeholders using proper
Group Relative Policy Optimization with weight updates.

THE KEY INSIGHT (State-Based GRPO):
TRL's GRPOTrainer is single-turn. Our environment is multi-turn.
Solution: treat every (state, next_action) pair as its own training prompt.
The model learns: "given THIS game state, what is the best next action?"

Instead of rolling out full episodes, we:
1. Build a dataset of negotiation states (from oracle + your JSON topics)
2. For each state, sample G=8 completions from the model
3. Run each completion through the environment for ONE step
4. Use GRPO advantage to update weights toward better single-step decisions
5. Repeat across all states — the model learns the full strategy implicitly

USAGE:
    # Pre-hackathon: verify the pipeline (no GPU needed)
    python grpo_train.py --dry-run --states 10

    # On-site Day 1 with HF GPU credits (the real run):
    python grpo_train.py --use-unsloth --epochs 3 --states 50

    # Without Unsloth (slower but works):
    python grpo_train.py --model Qwen/Qwen2.5-1.5B-Instruct --epochs 3
"""
from __future__ import annotations

import argparse
import json
import re
from pathlib import Path
from typing import Optional

from dotenv import load_dotenv

load_dotenv()

# ── Deps ───────────────────────────────────────────────────────────────────────
try:
    import matplotlib
    matplotlib.use("Agg")  # non-interactive backend for servers
    import matplotlib.pyplot as plt
    HAS_PLT = True
except ImportError:
    HAS_PLT = False

try:
    from unsloth import FastLanguageModel
    HAS_UNSLOTH = True
except ImportError:
    HAS_UNSLOTH = False

try:
    from trl import GRPOConfig, GRPOTrainer
    HAS_TRL = True
except Exception:  # broad on purpose: TRL can fail to import on CPU-only boxes, and dry runs don't need it
    HAS_TRL = False

try:
    from datasets import Dataset
    HAS_DATASETS = True
except ImportError:
    HAS_DATASETS = False

try:
    from transformers import AutoModelForCausalLM, AutoTokenizer
    HAS_TRANSFORMERS = True
except ImportError:
    HAS_TRANSFORMERS = False

# ── Local imports ──────────────────────────────────────────────────────────────
from envs.environment import WorkSpaceEnvironment
from models.schemas import WorkSpaceAction

# ── Constants ──────────────────────────────────────────────────────────────────
TOPICS_FILE = Path("ai_pm_prompts.json")
OUTPUT_DIR = Path("artifacts/grpo_state_based")

# The three hidden constraints — static for easy/medium mode
HIDDEN_CONSTRAINTS = {
    "Finance": "Budget must not exceed $50k.",
    "Security": "Must include biometric 2FA.",
    "UX": "Checkout must be a single click.",
}

# ── Action templates the model should learn to produce ─────────────────────────
ORACLE_ACTIONS = {
    "ask_finance": json.dumps({
        "action_type": "message_expert", "target": "Finance",
        "content": "What is the hard budget ceiling the PRD must respect for launch?"
    }),
    "ask_security": json.dumps({
        "action_type": "message_expert", "target": "Security",
        "content": "What authentication controls must the PRD include? Is biometric 2FA required?"
    }),
    "ask_ux": json.dumps({
        "action_type": "message_expert", "target": "UX",
        "content": "What checkout experience is required? Should we target a single-click flow?"
    }),
    "propose_draft": json.dumps({
        "action_type": "propose_draft", "target": "All",
        "content": (
            "PRD Draft:\n"
            "1. Budget: Launch scope capped at $50k.\n"
            "2. Security: Biometric 2FA required for login and sensitive actions.\n"
            "3. UX: Single-click checkout flow."
        ),
    }),
    "submit_final": json.dumps({
        "action_type": "submit_final", "target": None,
        "content": (
            "Final PRD:\n"
            "1. Budget cap: All launch costs must stay at or below $50k.\n"
            "2. Security: The app must enforce biometric 2FA for all authentication.\n"
            "3. UX: Checkout must be implemented as a true single-click experience."
        ),
    }),
}


# ── Utilities ──────────────────────────────────────────────────────────────────

def load_topics(limit: int = 50) -> list[str]:
    if TOPICS_FILE.exists():
        with TOPICS_FILE.open() as f:
            return json.load(f)[:limit]
    return [
        "Draft a Mobile App PRD for a FinTech startup targeting emerging markets.",
        "Build an AI-driven healthcare platform for enterprise customers.",
        "Create a SaaS analytics tool for regulatory-heavy industries.",
        "Design a gaming platform for Gen Z users with real-time features.",
        "Develop a cross-platform product for low-bandwidth regions.",
    ]
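
# ai_pm_prompts.json is expected to hold a flat JSON array of topic strings,
# in the same shape as the fallback list above.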


def parse_action(text: str) -> Optional[WorkSpaceAction]:
    """Parse a JSON action from model output. Returns None on failure."""
    try:
        match = re.search(r'\{[^{}]*"action_type"[^{}]*\}', text, re.DOTALL)
        if not match:
            return None
        return WorkSpaceAction(**json.loads(match.group(0)))
    except Exception:
        return None
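
# Example (illustrative): parse_action tolerates prose around the JSON, so a
# completion like
#   'Sure! {"action_type": "message_expert", "target": "Finance", "content": "What is the budget cap?"}'
# still parses to a WorkSpaceAction; any output without a flat JSON object
# containing "action_type" yields None.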


def format_discovered(env: WorkSpaceEnvironment) -> str:
    lines = []
    for name, expert in env.state().experts.items():
        status = "✓ DISCOVERED" if expert.constraint_discovered_by_agent else "? unknown"
        lines.append(f" {name}: {status}")
    return "\n".join(lines)


# ── State-Based Prompt Builder ─────────────────────────────────────────────────

AGENT_SYSTEM_PROMPT = """You are an expert AI Project Manager in a multi-stakeholder negotiation.

TASK: Produce a final PRD that satisfies ALL three experts — Finance, Security, and UX.
Each expert holds a hidden constraint you must discover through targeted questions.

STRATEGY:
1. Message each expert INDIVIDUALLY (not "All") to discover their constraint.
2. Once all constraints are known, propose a draft.
3. Refine if needed, then submit_final before turn 15.

ANTI-PATTERNS (will be penalized):
- Broadcasting to "All" when gathering requirements → -1.0 penalty
- Repeating a question already answered → -0.5 penalty
- Submitting without discovering constraints → low harmonic mean score

CURRENT DISCOVERED CONSTRAINTS:
{discovered}

Respond with ONLY valid JSON, nothing else:
{{"action_type": "message_expert" | "propose_draft" | "submit_final",
"target": "Finance" | "Security" | "UX" | "All" | null,
"content": "your message"}}"""


def build_state_prompt(
    topic: str,
    turn: int,
    feedback_so_far: str,
    discovered: str,
    conversation_history: str = "",
) -> str:
    """
    Build a prompt representing a specific game state.
    This is what gets fed to GRPOTrainer as the 'prompt' field.
    """
    system = AGENT_SYSTEM_PROMPT.format(discovered=discovered)

    user_content = (
        f"NEGOTIATION TASK: {topic}\n\n"
        f"TURN: {turn}/15\n\n"
    )

    if conversation_history:
        user_content += f"CONVERSATION SO FAR:\n{conversation_history}\n\n"

    user_content += f"LATEST FEEDBACK:\n{feedback_so_far}\n\nWhat is your next action?"

    # Format as chat template string — GRPOTrainer expects a plain string prompt
    return f"<|system|>\n{system}\n<|user|>\n{user_content}\n<|assistant|>\n"


# ── State Dataset Builder ──────────────────────────────────────────────────────

def build_state_dataset(topics: list[str], states_per_topic: int = 5) -> list[dict]:
    """
    Build a dataset of negotiation states using the EASY mode environment.
    Each record represents one (state → optimal_action) training example.

    We run oracle trajectories through the environment to get realistic
    expert feedback, then snapshot the state at each turn.

    This is the key fix: instead of hoping the model learns from full episodes,
    we give it explicit training signal at every decision point.
    """
    env = WorkSpaceEnvironment(mode="easy")
    records = []

    # Oracle action sequence for easy mode
    oracle_sequence = [
        ("ask_finance", WorkSpaceAction(
            action_type="message_expert", target="Finance",
            content="What budget ceiling must the PRD respect?"
        )),
        ("ask_security", WorkSpaceAction(
            action_type="message_expert", target="Security",
            content="What authentication requirements must be included?"
        )),
        ("ask_ux", WorkSpaceAction(
            action_type="message_expert", target="UX",
            content="What checkout flow is required?"
        )),
        ("propose_draft", WorkSpaceAction(
            action_type="propose_draft", target="All",
            content="PRD: Budget at or below $50k. Biometric 2FA required. Single-click checkout."
        )),
        ("submit_final", WorkSpaceAction(
            action_type="submit_final", target=None,
            content="Final PRD: Budget capped at $50k. Biometric 2FA for auth. Single-click checkout."
        )),
    ]

    for topic in topics:
        obs = env.reset(topic)
        conversation_history = ""
        discovered = " Finance: ? unknown\n Security: ? unknown\n UX: ? unknown"

        for step_idx, (action_key, oracle_action) in enumerate(oracle_sequence):
            if obs.done:
                break

            # Snapshot the state BEFORE taking the action
            prompt = build_state_prompt(
                topic=topic,
                turn=obs.current_turn,
                feedback_so_far=obs.feedback,
                discovered=discovered,
                conversation_history=conversation_history,
            )

            records.append({
                "prompt": prompt,
                "topic": topic,
                "turn": obs.current_turn,
                "oracle_action": ORACLE_ACTIONS[action_key],
                # These metadata fields help with debugging and post-analysis
                "step_idx": step_idx,
                "discovered_before": discovered,
            })

            # Step forward with oracle action to get next state
            obs = env.step(oracle_action)
            conversation_history += (
                f"Turn {step_idx}: {oracle_action.action_type} → {oracle_action.target}\n"
                f"Feedback: {obs.feedback[:120]}...\n"
            )
            discovered = format_discovered(env)

            if step_idx >= states_per_topic - 1:
                break

    # Add negative-pattern states (what NOT to do)
    records.extend(build_negative_states(topics[:5]))

    print(f"Built {len(records)} training states from {len(topics)} topics")
    return records
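
# Shape of one training record (illustrative values):
#
#   {"prompt": "<|system|>\n...You are an expert AI Project Manager...<|assistant|>\n",
#    "topic": "Draft a Mobile App PRD for a FinTech startup...",
#    "turn": 2,
#    "oracle_action": '{"action_type": "message_expert", ...}',
#    "step_idx": 2,
#    "discovered_before": " Finance: ✓ DISCOVERED\n Security: ? unknown\n UX: ? unknown"}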


def build_negative_states(topics: list[str]) -> list[dict]:
    """
    States where the agent is in a bad situation (repeated question, wrong phase).
    These teach the model to recover, not just follow the oracle.
    """
    negative_records = []

    for topic in topics:
        # State: Finance already answered, agent is about to repeat
        prompt = build_state_prompt(
            topic=topic,
            turn=2,
            feedback_so_far=(
                "Finance: As I mentioned, we have a strict $50k budget cap. "
                "This is the same answer I gave before."
            ),
            discovered=" Finance: ✓ DISCOVERED\n Security: ? unknown\n UX: ? unknown",
            conversation_history=(
                "Turn 0: message_expert → Finance\n"
                "Feedback: Finance: The budget cap is $50k. Don't go over it.\n"
                "Turn 1: message_expert → Finance\n"
                "Feedback: Finance: I already told you — $50k. Ask someone else.\n"
            ),
        )
        negative_records.append({
            "prompt": prompt,
            "topic": topic,
            "turn": 2,
            "oracle_action": ORACLE_ACTIONS["ask_security"],  # Should pivot to Security
            "step_idx": -1,  # Negative example
            "discovered_before": "Finance: ✓ DISCOVERED",
        })

    return negative_records


# ── Reward Function ────────────────────────────────────────────────────────────

def make_reward_fn():
    """
    Evaluates the model's actions instantly and locally.
    No live API calls. No reward-hacking loopholes.
    """
    def reward_fn(completions: list[str], prompts: list[str], **kwargs) -> list[float]:
        rewards = []

        for completion, prompt in zip(completions, prompts):
            action = parse_action(completion)

            # 1. Formatting penalty
            if action is None:
                rewards.append(-0.5)
                continue

            reward = 0.0

            # ── 2. Anti-pattern penalties ──

            # Massive penalty for broadcasting (reward hacking)
            if action.target == "All":
                reward -= 1.0

            # Penalty for empty or trivially short content
            if len((action.content or "").split()) < 5:
                reward -= 0.2

            # ── 3. Heuristic state grading (no API calls!) ──

            if action.action_type == "message_expert" and action.target != "All":
                # Did it ask a question it already knows the answer to?
                if f"{action.target}: ✓ DISCOVERED" in prompt:
                    reward -= 0.5
                else:
                    reward += 0.33  # Good job doing research!

            elif action.action_type in ["propose_draft", "submit_final"]:
                # Did it try to submit before gathering all constraints?
                if "? unknown" in prompt:
                    reward -= 1.0  # Heavy penalty for guessing
                else:
                    # It did the research. Did it actually include the constraints?
                    text = (action.content or "").lower()  # content can be None — guard before .lower()
                    has_finance = "50" in text
                    has_security = "biometric" in text
                    has_ux = "click" in text or "tap" in text

                    if has_finance and has_security and has_ux:
                        reward += 1.5
                    else:
                        reward -= 0.5

            rewards.append(reward)

        return rewards
    return reward_fn
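

# Reference sketch (illustrative — defined for documentation, never called):
# GRPO does not use the raw rewards above directly. For each prompt it samples
# G completions, scores them with reward_fn, and normalizes each reward against
# its sampling group; that normalization is the "group relative" part of the
# name. TRL performs the equivalent computation internally.

def group_relative_advantages(rewards: list[float]) -> list[float]:
    """Advantage of each completion relative to its G-sized sampling group."""
    mean = sum(rewards) / len(rewards)
    std = (sum((r - mean) ** 2 for r in rewards) / len(rewards)) ** 0.5
    return [(r - mean) / (std + 1e-8) for r in rewards]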


# ── Plots ──────────────────────────────────────────────────────────────────────

def save_training_plots(log_history: list[dict], output_dir: Path):
    if not HAS_PLT:
        print("  matplotlib not available — skipping plots")
        return

    output_dir.mkdir(parents=True, exist_ok=True)

    # Loss curve
    loss_points = [
        (e["step"], e["loss"])
        for e in log_history
        if "loss" in e and "step" in e
    ]
    if loss_points:
        xs, ys = zip(*loss_points)
        fig, ax = plt.subplots(figsize=(9, 4))
        ax.plot(xs, ys, marker="o", linewidth=1.5, color="#4C72B0", markersize=4)
        ax.set_xlabel("Training Step", fontsize=12)
        ax.set_ylabel("GRPO Loss", fontsize=12)
        ax.set_title(
            "Project Polymath — GRPO Training Loss\n"
            "(State-Based: each step = one negotiation decision)",
            fontsize=12
        )
        ax.grid(True, alpha=0.3)
        plt.tight_layout()
        plt.savefig(output_dir / "loss_curve.png", dpi=160)
        plt.close()
        print(f"  Saved: {output_dir}/loss_curve.png")

    # Reward curve (from log history if available)
    reward_points = [
        (e["step"], e.get("reward", e.get("mean_reward", None)))
        for e in log_history
        if "step" in e and ("reward" in e or "mean_reward" in e)
    ]
    reward_points = [(s, r) for s, r in reward_points if r is not None]

    if reward_points:
        xs, ys = zip(*reward_points)
        fig, ax = plt.subplots(figsize=(9, 4))
        ax.plot(xs, ys, marker="s", linewidth=1.5, color="#55A868", markersize=4)
        ax.set_xlabel("Training Step", fontsize=12)
        ax.set_ylabel("Mean Reward", fontsize=12)
        ax.set_title(
            "Project Polymath — Mean Reward During GRPO Training\n"
            "(Harmonic mean of Finance/Security/UX constraint satisfaction)",
            fontsize=12
        )
        ax.grid(True, alpha=0.3)
        plt.tight_layout()
        plt.savefig(output_dir / "reward_curve.png", dpi=160)
        plt.close()
        print(f"  Saved: {output_dir}/reward_curve.png")


# ── Main ───────────────────────────────────────────────────────────────────────

def main():
    parser = argparse.ArgumentParser(description="State-Based GRPO — Project Polymath")

    # Model
    parser.add_argument("--model", default="unsloth/Qwen2.5-3B-Instruct-bnb-4bit",
                        help="Base model to train")
    parser.add_argument("--use-unsloth", action="store_true",
                        help="Use Unsloth for 2x faster training (recommended on GPU)")

    # Dataset
    parser.add_argument("--states", type=int, default=40,
                        help="Number of negotiation states to train on")
    parser.add_argument("--states-per-topic", type=int, default=5,
                        help="States to extract per topic (1-5)")
    parser.add_argument("--topics-limit", type=int, default=20,
                        help="Max topics to use from ai_pm_prompts.json")

    # GRPO hyperparams
    parser.add_argument("--group-size", type=int, default=8,
                        help="G: completions per prompt for GRPO advantage (default: 8)")
    parser.add_argument("--epochs", type=float, default=3.0)
    parser.add_argument("--lr", type=float, default=5e-6,
                        help="Learning rate (lower = safer, 5e-6 recommended for GRPO)")
    parser.add_argument("--max-new-tokens", type=int, default=300)
    parser.add_argument("--batch-size", type=int, default=1)
    parser.add_argument("--grad-accum", type=int, default=4)
    parser.add_argument("--max-seq-length", type=int, default=2048)

    # Output
    parser.add_argument("--output-dir", default=str(OUTPUT_DIR))
    parser.add_argument("--dry-run", action="store_true",
                        help="Build dataset and verify reward fn, skip actual training")

    args = parser.parse_args()

    # TRL is only needed for the real run; the check happens after --dry-run exits.

    output_dir = Path(args.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # ── Build dataset ──────────────────────────────────────────────────────────
    print("\n[1/4] Building state dataset...")
    topics = load_topics(limit=args.topics_limit)
    records = build_state_dataset(topics, states_per_topic=args.states_per_topic)
    records = records[:args.states]

    # Save dataset for inspection / reproducibility
    dataset_path = output_dir / "state_dataset.jsonl"
    with dataset_path.open("w") as f:
        for r in records:
            f.write(json.dumps(r, ensure_ascii=True) + "\n")
    print(f"  Saved {len(records)} states → {dataset_path}")

    if not HAS_DATASETS:
        raise RuntimeError("pip install datasets")
    dataset = Dataset.from_list([{"prompt": r["prompt"],
                                  "topic": r["topic"],
                                  "turn": r["turn"]} for r in records])

    # ── Verify reward function ─────────────────────────────────────────────────
    print("\n[2/4] Verifying reward function on 3 samples...")
    reward_fn = make_reward_fn()

    test_completions = [
        ORACLE_ACTIONS["ask_finance"],  # Should score ~0.33
        '{"action_type": "message_expert", "target": "All", "content": "Hi"}',  # Should score -1.2 (broadcast -1.0, short content -0.2)
        "this is not JSON at all",  # Should score -0.5
    ]
    test_rewards = reward_fn(
        completions=test_completions,
        prompts=[""] * 3,
        topic=[topics[0]] * 3,
        turn=[0] * 3,
    )
    print(f"  Oracle action reward:    {test_rewards[0]:.3f} (expected ~0.33)")
    print(f"  Broadcast to All reward: {test_rewards[1]:.3f} (expected -1.2)")
    print(f"  Malformed JSON reward:   {test_rewards[2]:.3f} (expected -0.5)")

    if args.dry_run:
        print("\n[DRY RUN] Dataset and reward function verified. Skipping training.")
        print("  Run without --dry-run on GPU to train.")
        return

    # Past this point we actually train, so TRL must be importable.
    if not HAS_TRL:
        raise RuntimeError("TRL is required for actual training on the GPU.")

    # ── Load model ─────────────────────────────────────────────────────────────
    print(f"\n[3/4] Loading model: {args.model}")

    if args.use_unsloth:
        if not HAS_UNSLOTH:
            raise RuntimeError("pip install unsloth OR remove --use-unsloth")
        model, tokenizer = FastLanguageModel.from_pretrained(
            model_name=args.model,
            max_seq_length=args.max_seq_length,
            load_in_4bit=True,
            dtype=None,  # Auto-detect
        )
        model = FastLanguageModel.get_peft_model(
            model,
            r=16,
            lora_alpha=32,
            lora_dropout=0.0,
            target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                            "gate_proj", "up_proj", "down_proj"],
            use_gradient_checkpointing="unsloth",
        )
        print("  Unsloth LoRA loaded (4-bit quantization)")
    else:
        if not HAS_TRANSFORMERS:
            raise RuntimeError("pip install transformers")
        tokenizer = AutoTokenizer.from_pretrained(args.model)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        model = AutoModelForCausalLM.from_pretrained(args.model)
        print("  Standard transformers model loaded")

    # ── GRPO Training ──────────────────────────────────────────────────────────
    print("\n[4/4] Starting GRPO training...")
    print(f"  States: {len(records)} | Group size (G): {args.group_size}")
    print(f"  Epochs: {args.epochs} | LR: {args.lr}")
    print(f"  Optimizer updates: ~{int(len(records) * args.epochs / (args.batch_size * args.grad_accum))}")

    config = GRPOConfig(
        output_dir=str(output_dir),

        # GRPO-specific
        num_generations=args.group_size,            # G: sample this many completions per prompt
        max_completion_length=args.max_new_tokens,  # GRPOConfig's name for max new tokens per action
        temperature=0.8,                            # Exploration during training

        # Standard training
        learning_rate=args.lr,
        num_train_epochs=args.epochs,
        per_device_train_batch_size=args.batch_size,
        gradient_accumulation_steps=args.grad_accum,

        # Logging
        logging_steps=1,
        save_strategy="epoch",
        report_to=[],  # Set to ["wandb"] if you have it configured
    )
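
    # NOTE (assumption about the installed TRL version): GRPOTrainer requires
    # the effective generation batch (per-device batch size × number of
    # processes) to be divisible by num_generations, and raises at startup
    # otherwise. With --batch-size 1 and --group-size 8 on a single GPU,
    # expect to raise the batch size or lower G if that check fires.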

    trainer = GRPOTrainer(
        model=model,
        processing_class=tokenizer,  # GRPOTrainer's tokenizer argument is named processing_class
        args=config,                 # and its config argument is named args
        reward_funcs=reward_fn,      # ← Your environment's reward
        train_dataset=dataset,
    )

    trainer.train()

    # ── Save everything ────────────────────────────────────────────────────────
    trainer.save_model(str(output_dir / "final_model"))
    tokenizer.save_pretrained(str(output_dir / "final_model"))
    print(f"\n  Model saved → {output_dir}/final_model")

    # Save metrics
    metrics_path = output_dir / "grpo_metrics.json"
    with metrics_path.open("w") as f:
        json.dump(trainer.state.log_history, f, indent=2)
    print(f"  Metrics saved → {metrics_path}")

    # Save plots
    save_training_plots(trainer.state.log_history, output_dir)

    # ── Summary ────────────────────────────────────────────────────────────────
    log = trainer.state.log_history
    losses = [e["loss"] for e in log if "loss" in e]
    if losses:
        print(f"\n  Initial loss: {losses[0]:.4f}")
        print(f"  Final loss:   {losses[-1]:.4f}")
        print(f"  Improvement:  {((losses[0] - losses[-1]) / losses[0] * 100):.1f}%")

    print(f"\n{'='*60}")
    print("  GRPO TRAINING COMPLETE")
    print(f"  Model:   {output_dir}/final_model")
    print(f"  Plots:   {output_dir}/loss_curve.png")
    print(f"           {output_dir}/reward_curve.png")
    print(f"  Metrics: {output_dir}/grpo_metrics.json")
    print(f"{'='*60}")


if __name__ == "__main__":
    main()