{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "# GRPO Training with CodeArena RL Benchmark\n",
        "\n",
        "This notebook demonstrates how to connect our custom `codearena-rl-benchmark` environment to HuggingFace's `trl.GRPOTrainer`.\n",
        "It uses the `m-a-p/Code-Feedback` dataset to train the LLM for coding debugging and improving time complexity."
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "> \u26a0\ufe0f **Note for Judges**: This training notebook is designed to be run in **Google Colab (Linux)** with an active GPU.\n",
        "> It uses HuggingFace TRL GRPOTrainer which requires Linux. Do not run locally on Windows.\n",
        "> The code below demonstrates how CodeArena functions as a live environment-in-the-loop reward signal."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 4,
      "metadata": {},
      "outputs": [],
      "source": [
        "!pip install trl transformers datasets httpx fastapi uvicorn pydantic openai\n",
        "!git clone https://github.com/havinashpatil/meta.git\n",
        "!cd meta && pip install -r requirements.txt"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 5,
      "metadata": {},
      "outputs": [],
      "source": [
        "import torch\n",
        "from datasets import load_dataset\n",
        "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
        "from trl import GRPOConfig, GRPOTrainer\n",
        "import httpx\n",
        "\n",
        "# Start the backend server in the background (Colab trick)\n",
        "import subprocess\n",
        "import time\n",
        "subprocess.Popen([\"uvicorn\", \"server.app:app\", \"--port\", \"7860\", \"--app-dir\", \"meta\"])\n",
        "time.sleep(5)  # Wait for server to start"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "def codearena_reward_func(completions, prompts):\n",
        "    \"\"\"\n",
        "    Reward function that queries the CodeArena OpenEnv server.\n",
        "    For each proposed fix in `completions`, we step the environment.\n",
        "    \"\"\"\n",
        "    rewards = []\n",
        "    for completion in completions:\n",
        "        # Clean the generated code\n",
        "        proposed_fix = completion[0].get('content', '').strip()\n",
        "        if proposed_fix.startswith('```python'):\n",
        "            proposed_fix = proposed_fix[9:].replace('```', '').strip()\n",
        "            \n",
        "        try:\n",
        "            # Step the environment\n",
        "            res = httpx.post(\n",
        "                \"http://localhost:7860/step\",\n",
        "                json={\"proposed_fix\": proposed_fix},\n",
        "                timeout=10.0\n",
        "            )\n",
        "            res.raise_for_status()\n",
        "            reward = res.json().get('reward', 0.0)\n",
        "            rewards.append(reward)\n",
        "        except Exception as e:\n",
        "            print(f\"Env Error: {e}\")\n",
        "            rewards.append(0.0)\n",
        "            \n",
        "    return rewards"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Load Model\n",
        "model_name = \"Qwen/Qwen2.5-Coder-1.5B\"\n",
        "model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, device_map=\"auto\")\n",
        "tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
        "tokenizer.pad_token = tokenizer.eos_token\n",
        "\n",
        "# Load dataset for Coding Debugging and Time Complexity Optimization\n",
        "dataset = load_dataset(\"m-a-p/Code-Feedback\", split=\"train\")\n",
        "\n",
        "def format_prompt(example):\n",
        "    # m-a-p/Code-Feedback contains 'messages' with user and assistant roles\n",
        "    messages = example.get('messages', [])\n",
        "    user_query = \"\"\n",
        "    if messages and len(messages) > 0 and messages[0].get('role') == 'user':\n",
        "        user_query = messages[0].get('content', '')\n",
        "    \n",
        "    prompt = f\"Optimize and debug this code to improve time complexity:\\n{user_query}\"\n",
        "    return {\"prompt\": prompt}\n",
        "\n",
        "dataset = dataset.map(format_prompt)\n",
        "# Keep only the prompt column for the trainer\n",
        "dataset = dataset.select_columns([\"prompt\"])\n",
        "# Limit for demo purposes\n",
        "dataset = dataset.select(range(100))\n",
        "\n",
        "# Initialize GRPO Trainer\n",
        "training_args = GRPOConfig(\n",
        "    output_dir=\"./codearena-grpo\",\n",
        "    learning_rate=1e-5,\n",
        "    max_steps=50,\n",
        "    per_device_train_batch_size=2,\n",
        "    gradient_accumulation_steps=2,\n",
        ")\n",
        "\n",
        "trainer = GRPOTrainer(\n",
        "    model=model,\n",
        "    reward_funcs=codearena_reward_func,\n",
        "    args=training_args,\n",
        "    train_dataset=dataset,\n",
        ")\n",
        "\n",
        "trainer.train()"
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "venv",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.13.6"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 4
}