{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Dynamic Guardrail Generator - GRPO Training Proof\n",
    "This notebook demonstrates the RL optimization pipeline (GRPO) for the Meta PyTorch OpenEnv Hackathon.\n",
    "\n",
    "**Note:** the reward used below is a simulated stand-in for the LogBarrierReward, computed from completion length for demonstration purposes."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# %pip (rather than !pip) installs into the environment of the running kernel.\n",
    "%pip install -q unsloth trl openenv datasets matplotlib"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import torch\n",
    "import math\n",
    "import matplotlib.pyplot as plt\n",
    "from unsloth import FastLanguageModel\n",
    "from trl import GRPOConfig, GRPOTrainer\n",
    "from datasets import Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from huggingface_hub import notebook_login\n",
    "notebook_login()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def simulated_reward_func(prompts, completions, **kwargs):\n",
    "    \"\"\"Return one mock log-barrier style reward per completion.\n",
    "\n",
    "    Stand-in for the LogBarrierReward, for demonstration only. Recall and\n",
    "    false-positive rate are not measured: both are derived from the length\n",
    "    of each completion, so longer completions receive higher rewards.\n",
    "\n",
    "    Args:\n",
    "        prompts: Prompts of the batch (not used by this mock).\n",
    "        completions: Generated completions; only `len()` of each is used.\n",
    "        **kwargs: Additional keyword arguments (ignored).\n",
    "\n",
    "    Returns:\n",
    "        list[float]: One reward per completion, in input order.\n",
    "    \"\"\"\n",
    "    rewards = []\n",
    "    for comp in completions:\n",
    "        length = len(comp)\n",
    "        # Mock metrics: recall grows with length, FPR shrinks with it (floored at 0.01).\n",
    "        recall = 0.5 + (0.01 * length)\n",
    "        fpr = max(0.01, 0.2 - (0.005 * length))\n",
    "        # Reward recall and penalise the false-positive rate through a log1p term.\n",
    "        reward = (1.0 * recall) - (2.0 * math.log1p(fpr))\n",
    "        rewards.append(reward)\n",
    "    return rewards"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "max_seq_length = 2048\n",
    "\n",
    "# Load the base model in 4-bit together with its tokenizer.\n",
    "model, tokenizer = FastLanguageModel.from_pretrained(\n",
    "    model_name=\"Qwen/Qwen2.5-0.5B-Instruct\",\n",
    "    max_seq_length=max_seq_length,\n",
    "    dtype=None,\n",
    "    load_in_4bit=True,\n",
    ")\n",
    "\n",
    "# Wrap the model with rank-16 LoRA adapters on the q/k/v/o projection modules.\n",
    "model = FastLanguageModel.get_peft_model(\n",
    "    model,\n",
    "    r = 16,\n",
    "    target_modules = [\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\"],\n",
    "    lora_alpha = 16,\n",
    "    lora_dropout = 0,\n",
    "    bias = \"none\",\n",
    "    use_gradient_checkpointing = \"unsloth\",\n",
    "    random_state = 3407,\n",
    "    use_rslora = False,\n",
    ")"
   ]
  },
{
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "\n",
    "# Build 25 malicious/benign prompt pairs and save them as a JSON dataset file.\n",
    "dummy_data = []\n",
    "for _ in range(25):\n",
    "    dummy_data.append({\"prompt\": \"Simulate a malicious injection prompt.\", \"completion\": \"\"})\n",
    "    dummy_data.append({\"prompt\": \"Simulate a benign user prompt.\", \"completion\": \"\"})\n",
    "with open('dummy_data.json', 'w') as f:\n",
    "    json.dump(dummy_data, f)\n",
    "# Report the real sample count instead of a hard-coded number.\n",
    "print(f\"Generated dummy_data.json with {len(dummy_data)} samples.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datasets import load_dataset\n",
    "train_dataset = load_dataset('json', data_files='dummy_data.json', split='train')\n",
    "\n",
    "# GRPO scores a group of completions per prompt; the batch size is kept equal\n",
    "# to the group size so the batch is always divisible by the number of generations.\n",
    "NUM_GENERATIONS = 4\n",
    "\n",
    "training_args = GRPOConfig(\n",
    "    output_dir=\"outputs\",\n",
    "    learning_rate=1e-5,\n",
    "    per_device_train_batch_size=NUM_GENERATIONS,\n",
    "    gradient_accumulation_steps=1,\n",
    "    num_generations=NUM_GENERATIONS,\n",
    "    max_steps=5,\n",
    "    logging_steps=1,   # log every step so the reward curve has one point per step\n",
    "    report_to=\"none\",  # no external tracker, so Run All never waits on a login prompt\n",
    ")\n",
    "\n",
    "trainer = GRPOTrainer(\n",
    "    model=model,\n",
    "    processing_class=tokenizer,\n",
    "    reward_funcs=simulated_reward_func,\n",
    "    args=training_args,\n",
    "    train_dataset=train_dataset\n",
    ")\n",
    "trainer.train()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Plot the mean reward logged by the trainer at each step of the run above,\n",
    "# rather than hand-written numbers.\n",
    "reward_logs = [entry for entry in trainer.state.log_history if \"reward\" in entry]\n",
    "steps = [entry[\"step\"] for entry in reward_logs]\n",
    "rewards = [entry[\"reward\"] for entry in reward_logs]\n",
    "assert rewards, \"No reward entries were logged; check logging_steps in GRPOConfig.\"\n",
    "\n",
    "plt.figure(figsize=(8, 5))\n",
    "plt.plot(steps, rewards, marker='o', linestyle='-', color='b', label='Log-Barrier Reward')\n",
    "plt.title('GRPO Agent Learning Curve')\n",
    "plt.xlabel('Training Steps')\n",
    "plt.ylabel('Reward')\n",
    "plt.grid(True)\n",
    "plt.legend()\n",
    "plt.show()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}