{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# GRPO Training with CodeArena RL Benchmark\n",
    "\n",
    "This notebook demonstrates how to connect our custom `codearena-rl-benchmark` environment to HuggingFace's `trl.GRPOTrainer`.\n",
    "It uses the `m-a-p/Code-Feedback` dataset to train the LLM to debug code and improve its time complexity."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "> \u26a0\ufe0f **Note for Judges**: This training notebook is designed to be run in **Google Colab (Linux)** with an active GPU.\n",
    "> It uses HuggingFace TRL GRPOTrainer which requires Linux. Do not run locally on Windows.\n",
    "> The code below demonstrates how CodeArena functions as a live environment-in-the-loop reward signal."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Install dependencies"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "!pip install trl transformers datasets httpx fastapi uvicorn pydantic openai\n",
    "# Skip the clone when the checkout already exists so this cell can be re-run\n",
    "![ -d meta ] || git clone https://github.com/havinashpatil/meta.git\n",
    "!cd meta && pip install -r requirements.txt"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Imports and configuration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "import subprocess\n",
    "import time\n",
    "\n",
    "import httpx\n",
    "import torch\n",
    "from datasets import load_dataset\n",
    "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
    "from trl import GRPOConfig, GRPOTrainer\n",
    "\n",
    "# Everything a reader might want to tune lives here\n",
    "ENV_PORT = 7860\n",
    "ENV_URL = f\"http://localhost:{ENV_PORT}\"\n",
    "ENV_STARTUP_TIMEOUT_S = 60.0\n",
    "MODEL_NAME = \"Qwen/Qwen2.5-Coder-1.5B\"\n",
    "DATASET_NAME = \"m-a-p/Code-Feedback\"\n",
    "NUM_TRAIN_PROMPTS = 100  # Limit for demo purposes"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Start the CodeArena environment server\n",
    "\n",
    "The backend runs as a background process (Colab trick). Instead of sleeping for a fixed time, we poll until it accepts HTTP connections and fail loudly if it never comes up."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def start_env_server(timeout_s=ENV_STARTUP_TIMEOUT_S):\n",
    "    \"\"\"Launch the CodeArena server in the background and wait until it answers HTTP.\n",
    "\n",
    "    Returns the `subprocess.Popen` handle so the server can be stopped later.\n",
    "    Raises `RuntimeError` if the process exits early or stays unreachable for `timeout_s` seconds.\n",
    "    \"\"\"\n",
    "    process = subprocess.Popen(\n",
    "        [\"uvicorn\", \"server.app:app\", \"--port\", str(ENV_PORT), \"--app-dir\", \"meta\"]\n",
    "    )\n",
    "    deadline = time.monotonic() + timeout_s\n",
    "    while time.monotonic() < deadline:\n",
    "        if process.poll() is not None:\n",
    "            raise RuntimeError(f\"CodeArena server exited early with code {process.returncode}\")\n",
    "        try:\n",
    "            # Any HTTP response, even a 404, proves the server is accepting connections.\n",
    "            httpx.get(ENV_URL, timeout=1.0)\n",
    "            return process\n",
    "        except httpx.TransportError:\n",
    "            time.sleep(0.5)\n",
    "    process.terminate()\n",
    "    raise RuntimeError(f\"CodeArena server did not start within {timeout_s} seconds\")\n",
    "\n",
    "\n",
    "server_process = start_env_server()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Reward function\n",
    "\n",
    "Each sampled completion is sent to the environment's `/step` endpoint; the `reward` field of the response is the training signal."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Matches the first fenced block, with any language tag; the closing fence is optional\n",
    "# so completions truncated by the generation length limit are still handled.\n",
    "CODE_FENCE_RE = re.compile(r\"```[\\w+-]*[ \\t]*\\n(.*?)(?:```|\\Z)\", re.DOTALL)\n",
    "\n",
    "\n",
    "def extract_code(completion):\n",
    "    \"\"\"Return the code proposed in a single completion.\n",
    "\n",
    "    TRL passes a plain string when the dataset prompt is a string, and a list of\n",
    "    message dicts when the prompt is conversational; both shapes are accepted.\n",
    "    If the text contains a fenced block, the contents of the first one are\n",
    "    returned; otherwise the whole text is returned. The result is stripped.\n",
    "    \"\"\"\n",
    "    if isinstance(completion, str):\n",
    "        text = completion\n",
    "    elif completion:\n",
    "        text = completion[0].get(\"content\") or \"\"\n",
    "    else:\n",
    "        text = \"\"\n",
    "    fenced = CODE_FENCE_RE.search(text)\n",
    "    return (fenced.group(1) if fenced else text).strip()\n",
    "\n",
    "\n",
    "def codearena_reward_func(completions, prompts=None, **kwargs):\n",
    "    \"\"\"Reward function that queries the CodeArena OpenEnv server.\n",
    "\n",
    "    For each proposed fix in `completions`, we step the environment and use the\n",
    "    `reward` field of the JSON response.\n",
    "\n",
    "    Args:\n",
    "        completions: One entry per sampled completion (string or list of message dicts).\n",
    "        prompts: Unused; accepted because the trainer passes it.\n",
    "        **kwargs: Unused; the trainer passes extra keyword arguments\n",
    "            (for example `completion_ids`) that this function does not need.\n",
    "\n",
    "    Returns:\n",
    "        A list of floats, one per completion. A failed request, a malformed\n",
    "        response or a malformed completion is logged and scored 0.0 so a single\n",
    "        bad sample does not abort training.\n",
    "\n",
    "    NOTE(review): assumes the server already has an active episode. If `/step`\n",
    "    requires a prior reset call, issue it before training - confirm against the server API.\n",
    "    \"\"\"\n",
    "    rewards = []\n",
    "    # One client for the whole batch so the HTTP connection is reused\n",
    "    with httpx.Client(base_url=ENV_URL, timeout=10.0) as client:\n",
    "        for completion in completions:\n",
    "            try:\n",
    "                response = client.post(\"/step\", json={\"proposed_fix\": extract_code(completion)})\n",
    "                response.raise_for_status()\n",
    "                # Coerce to float: the trainer needs numeric rewards, and the field may be null\n",
    "                reward = float(response.json().get(\"reward\") or 0.0)\n",
    "            except Exception as exc:  # deliberate best-effort: log and score zero\n",
    "                print(f\"Env Error: {exc}\")\n",
    "                reward = 0.0\n",
    "            rewards.append(reward)\n",
    "    return rewards"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Model and dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load Model\n",
    "model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=torch.bfloat16, device_map=\"auto\")\n",
    "# Left padding keeps every prompt adjacent to its generated tokens\n",
    "tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, padding_side=\"left\")\n",
    "if tokenizer.pad_token is None:\n",
    "    tokenizer.pad_token = tokenizer.eos_token"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def format_prompt(example):\n",
    "    \"\"\"Build the training prompt from the first user message of a Code-Feedback example.\n",
    "\n",
    "    `m-a-p/Code-Feedback` rows contain `messages` with user and assistant roles.\n",
    "    If the first message is missing or is not from the user, the query part is left empty.\n",
    "    \"\"\"\n",
    "    messages = example.get(\"messages\") or []\n",
    "    user_query = \"\"\n",
    "    if messages and messages[0].get(\"role\") == \"user\":\n",
    "        user_query = messages[0].get(\"content\", \"\")\n",
    "    return {\"prompt\": f\"Optimize and debug this code to improve time complexity:\\n{user_query}\"}\n",
    "\n",
    "\n",
    "# Load dataset for Coding Debugging and Time Complexity Optimization\n",
    "dataset_raw = load_dataset(DATASET_NAME, split=\"train\")\n",
    "# Subsample first so only the demo rows are formatted\n",
    "dataset_subset = dataset_raw.select(range(min(NUM_TRAIN_PROMPTS, len(dataset_raw))))\n",
    "# Keep only the prompt column for the trainer\n",
    "train_dataset = dataset_subset.map(format_prompt).select_columns([\"prompt\"])\n",
    "train_dataset"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Initialize GRPO Trainer\n",
    "training_args = GRPOConfig(\n",
    "    output_dir=\"./codearena-grpo\",\n",
    "    learning_rate=1e-5,\n",
    "    max_steps=50,\n",
    "    per_device_train_batch_size=2,\n",
    "    gradient_accumulation_steps=2,\n",
    "    # The batch size must be divisible by the number of generations per prompt;\n",
    "    # the library default of 8 does not divide a batch of 2. Raise both together\n",
    "    # for a stronger group-relative signal if memory allows.\n",
    "    num_generations=2,\n",
    ")\n",
    "\n",
    "trainer = GRPOTrainer(\n",
    "    model=model,\n",
    "    reward_funcs=codearena_reward_func,\n",
    "    args=training_args,\n",
    "    train_dataset=train_dataset,\n",
    "    processing_class=tokenizer,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "trainer.train()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Clean up\n",
    "\n",
    "Stop the background environment server once training has finished."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "server_process.terminate()\n",
    "server_process.wait(timeout=10)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.13.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}