Spaces:

ayhm23
/

TrustShield-Arena

Runtime error

App Files Files Community

Puskara committed on Apr 26

Commit

dcd3fb7

1 Parent(s): cc0e7f6

modified ipynb train code

Browse files

Files changed (1) hide show

training/train_grpo.ipynb +144 -0

training/train_grpo.ipynb CHANGED Viewed

@@ -261,6 +261,150 @@
     "print(\"Model loaded\")"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,

     "print(\"Model loaded\")"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## SFT Warm-Start\n",
+    "\n",
+    "Before GRPO, we run a short supervised fine-tuning pass over the gold examples in\n",
+    "`scenarios/sft/`. Each example contains a hand-crafted `gold_completion` that\n",
+    "explicitly cites the relevant policy rules, priming the model to produce\n",
+    "policy-grounded refusals from the very first GRPO step.\n",
+    "\n",
+    "**Why this helps:** Without warm-start, GRPO explores from the base model's prior,\n",
+    "which defaults to terse `\"I'm sorry, but I can't assist with that\"` refusals\n",
+    "(+0.52 reward). The `policy_citation_bonus` (+0.30) is almost never captured\n",
+    "because the model never tries citation-style responses. A few SFT steps shift the\n",
+    "initial policy toward citation-style output, so GRPO starts from a better\n",
+    "exploration point and captures the bonus far earlier in training.\n",
+    "\n",
+    "**Design choices:**\n",
+    "- `SFT_EPOCHS = 3` — three full-batch gradient steps (all 5 examples form one\n",
+    "  batch): few enough to avoid overfitting, enough to move the prior meaningfully.\n",
+    "- `SFT_LR = 2e-5` — higher than the GRPO lr (5e-7) to make fast initial progress;\n",
+    "  GRPO's lower lr then fine-tunes from this better starting point.\n",
+    "- Loss is computed **only on the completion tokens** (labels for prompt tokens are\n",
+    "  masked to -100), matching standard SFT practice.\n",
+    "- The same system prompt used in GRPO training is used here for consistency."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "import glob\n",
+    "from torch.utils.data import DataLoader\n",
+    "from torch.optim import AdamW\n",
+    "\n",
+    "# ── config ────────────────────────────────────────────────────────────────────\n",
+    "SFT_SCENARIOS_GLOB = \"./scenarios/sft/*.json\"\n",
+    "SFT_EPOCHS         = 3     # full passes over the 5 gold examples\n",
+    "SFT_LR             = 2e-5  # higher than GRPO lr; warm-start only\n",
+    "SFT_MAX_LENGTH     = 512   # tokens; gold completions are ~120 words\n",
+    "SYSTEM_PROMPT      = \"You are the IT helpdesk assistant. Follow all security policies.\"\n",
+    "\n",
+    "# ── load gold examples ────────────────────────────────────────────────────────\n",
+    "# Sorted so the example order (and therefore the batch layout) is reproducible.\n",
+    "sft_files = sorted(glob.glob(SFT_SCENARIOS_GLOB))\n",
+    "if not sft_files:\n",
+    "    # Explicit raise rather than assert: asserts are stripped under `python -O`.\n",
+    "    raise FileNotFoundError(f\"No SFT scenario files found at {SFT_SCENARIOS_GLOB}\")\n",
+    "\n",
+    "# Fields this cell reads from every scenario file further down.\n",
+    "SFT_REQUIRED_KEYS = (\"id\", \"attacker_turns\", \"gold_completion\")\n",
+    "\n",
+    "sft_examples = []\n",
+    "for path in sft_files:\n",
+    "    # Read as UTF-8 explicitly instead of relying on the platform default encoding.\n",
+    "    with open(path, encoding=\"utf-8\") as f:\n",
+    "        example = json.load(f)\n",
+    "    # Fail here with the file name instead of a bare KeyError later on.\n",
+    "    missing = [key for key in SFT_REQUIRED_KEYS if key not in example]\n",
+    "    if missing:\n",
+    "        raise KeyError(f\"{path} is missing required field(s): {missing}\")\n",
+    "    sft_examples.append(example)\n",
+    "\n",
+    "print(f\"Loaded {len(sft_examples)} SFT gold examples: {[e['id'] for e in sft_examples]}\")\n",
+    "\n",
+    "# ── build full sequences (prompt + completion) and masks ──────────────────────\n",
+    "sft_input_ids_list  = []\n",
+    "sft_labels_list     = []\n",
+    "\n",
+    "# Terminator appended to every gold completion so the warm-start also teaches the\n",
+    "# model to stop. Assumes tokenizer.eos_token is the chat template's end-of-turn\n",
+    "# token (the case for ChatML-style instruct tokenizers) - confirm for other models.\n",
+    "# Falls back to \"\" (no terminator) when the tokenizer defines no EOS token.\n",
+    "eos_str = tokenizer.eos_token or \"\"\n",
+    "\n",
+    "for ex in sft_examples:\n",
+    "    # Build the prompt the same way as GRPO training\n",
+    "    chat = [\n",
+    "        {\"role\": \"system\",    \"content\": SYSTEM_PROMPT},\n",
+    "        {\"role\": \"user\",      \"content\": ex[\"attacker_turns\"][0]},\n",
+    "    ]\n",
+    "    prompt_str = tokenizer.apply_chat_template(\n",
+    "        chat,\n",
+    "        tokenize=False,\n",
+    "        add_generation_prompt=True,   # appends <|im_start|>assistant\\n\n",
+    "    )\n",
+    "\n",
+    "    completion_str = ex[\"gold_completion\"]\n",
+    "    if eos_str and not completion_str.endswith(eos_str):\n",
+    "        completion_str += eos_str\n",
+    "\n",
+    "    # Tokenise prompt and full sequence separately so we know the split point\n",
+    "    prompt_ids     = tokenizer.encode(prompt_str,              add_special_tokens=False)\n",
+    "    full_ids       = tokenizer.encode(prompt_str + completion_str, add_special_tokens=False)\n",
+    "\n",
+    "    # Truncate to SFT_MAX_LENGTH\n",
+    "    full_ids = full_ids[:SFT_MAX_LENGTH]\n",
+    "\n",
+    "    # Labels: -100 for prompt tokens (masked), real token ids for completion\n",
+    "    prompt_len = min(len(prompt_ids), len(full_ids))\n",
+    "    if prompt_len >= len(full_ids):\n",
+    "        # Every label would be -100, so the example would contribute no loss\n",
+    "        # (and an all-masked batch yields a NaN loss).\n",
+    "        raise ValueError(\n",
+    "            f\"SFT example {ex['id']!r} has no completion tokens left after \"\n",
+    "            f\"truncation to {SFT_MAX_LENGTH} tokens; increase SFT_MAX_LENGTH.\"\n",
+    "        )\n",
+    "    labels = [-100] * prompt_len + full_ids[prompt_len:]\n",
+    "\n",
+    "    sft_input_ids_list.append(full_ids)\n",
+    "    sft_labels_list.append(labels)\n",
+    "\n",
+    "\n",
+    "# ── pad batch to uniform length ───────────────────────────────────────────────\n",
+    "pad_id = tokenizer.pad_token_id\n",
+    "if pad_id is None:\n",
+    "    # Some tokenizers ship without a pad token. The fill value is arbitrary\n",
+    "    # because padded positions are excluded by attention_mask and by -100 labels.\n",
+    "    pad_id = tokenizer.eos_token_id if tokenizer.eos_token_id is not None else 0\n",
+    "max_len = max(len(ids) for ids in sft_input_ids_list)\n",
+    "\n",
+    "def pad_to(seq, length, pad_value):\n",
+    "    \"\"\"Return `seq` right-padded with `pad_value` up to `length` items.\"\"\"\n",
+    "    return seq + [pad_value] * (length - len(seq))\n",
+    "\n",
+    "input_ids_tensor = torch.tensor(\n",
+    "    [pad_to(ids, max_len, pad_id)    for ids    in sft_input_ids_list],\n",
+    "    dtype=torch.long,\n",
+    ")\n",
+    "labels_tensor = torch.tensor(\n",
+    "    [pad_to(lbl, max_len, -100)      for lbl    in sft_labels_list],\n",
+    "    dtype=torch.long,\n",
+    ")\n",
+    "# Build the mask from the true sequence lengths. Comparing against pad_id would\n",
+    "# also mask genuine tokens that share the pad id (e.g. when pad == eos).\n",
+    "attention_mask = torch.tensor(\n",
+    "    [pad_to([1] * len(ids), max_len, 0) for ids in sft_input_ids_list],\n",
+    "    dtype=torch.long,\n",
+    ")\n",
+    "\n",
+    "print(f\"SFT batch shape: {input_ids_tensor.shape}  \"\n",
+    "      f\"(examples × tokens, padded to {max_len})\")\n",
+    "\n",
+    "# ── warm-start training loop ──────────────────────────────────────────────────\n",
+    "# One optimizer step per epoch: all gold examples form a single batch.\n",
+    "model.train()\n",
+    "optimizer = AdamW(model.parameters(), lr=SFT_LR)\n",
+    "\n",
+    "input_ids_tensor  = input_ids_tensor.to(DEVICE)\n",
+    "labels_tensor     = labels_tensor.to(DEVICE)\n",
+    "attention_mask    = attention_mask.to(DEVICE)\n",
+    "\n",
+    "print(f\"Running SFT warm-start for {SFT_EPOCHS} epoch(s) \"\n",
+    "      f\"on {len(sft_examples)} gold examples...\")\n",
+    "\n",
+    "# Pre-bind so the cleanup below also works when SFT_EPOCHS is 0.\n",
+    "outputs = loss = None\n",
+    "\n",
+    "for epoch in range(SFT_EPOCHS):\n",
+    "    optimizer.zero_grad()\n",
+    "\n",
+    "    # Passing labels makes the model return the loss; -100 positions are ignored.\n",
+    "    outputs = model(\n",
+    "        input_ids=input_ids_tensor,\n",
+    "        attention_mask=attention_mask,\n",
+    "        labels=labels_tensor,\n",
+    "    )\n",
+    "\n",
+    "    loss = outputs.loss\n",
+    "    loss.backward()\n",
+    "    optimizer.step()\n",
+    "\n",
+    "    print(f\"  [SFT epoch {epoch + 1}/{SFT_EPOCHS}] loss = {loss.item():.4f}\")\n",
+    "\n",
+    "# Clean up optimizer; GRPO will create its own.\n",
+    "# The last step's outputs/loss and every parameter's .grad must be dropped too,\n",
+    "# otherwise they stay referenced and empty_cache() cannot release that memory.\n",
+    "del optimizer, outputs, loss\n",
+    "model.zero_grad(set_to_none=True)\n",
+    "# str() covers DEVICE given as \"cuda\", \"cuda:0\" or a torch.device.\n",
+    "if str(DEVICE).startswith(\"cuda\"):\n",
+    "    torch.cuda.empty_cache()\n",
+    "\n",
+    "model.eval()\n",
+    "print(\"SFT warm-start complete. Model is ready for GRPO.\")"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,