File size: 5,887 Bytes
a448db8
 
 
 
 
 
 
 
9204c04
 
a448db8
 
5dffd52
 
 
 
 
 
 
 
 
a448db8
 
03a7eb9
a448db8
5dffd52
a448db8
82e39c9
a448db8
 
 
 
 
 
03a7eb9
a448db8
5dffd52
a448db8
 
9204c04
a448db8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9204c04
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a448db8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82e39c9
a448db8
 
 
 
 
 
 
 
 
 
 
 
 
82e39c9
a448db8
 
 
 
5dffd52
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "# GRPO Training with CodeArena RL Benchmark\n",
        "\n",
        "This notebook demonstrates how to connect our custom `codearena-rl-benchmark` environment to HuggingFace's `trl.GRPOTrainer`.\n",
        "It uses the `m-a-p/Code-Feedback` dataset to train the LLM for coding debugging and improving time complexity."
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "> \u26a0\ufe0f **Note for Judges**: This training notebook is designed to be run in **Google Colab (Linux)** with an active GPU.\n",
        "> It uses HuggingFace TRL GRPOTrainer which requires Linux. Do not run locally on Windows.\n",
        "> The code below demonstrates how CodeArena functions as a live environment-in-the-loop reward signal."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 4,
      "metadata": {},
      "outputs": [],
      "source": [
        "!pip install trl transformers datasets httpx fastapi uvicorn pydantic openai\n",
        "!git clone https://github.com/havinashpatil/meta.git\n",
        "!cd meta && pip install -r requirements.txt"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 5,
      "metadata": {},
      "outputs": [],
      "source": [
        "import torch\n",
        "from datasets import load_dataset\n",
        "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
        "from trl import GRPOConfig, GRPOTrainer\n",
        "import httpx\n",
        "\n",
        "# Start the backend server in the background (Colab trick)\n",
        "import subprocess\n",
        "import time\n",
        "subprocess.Popen([\"uvicorn\", \"server.app:app\", \"--port\", \"7860\", \"--app-dir\", \"meta\"])\n",
        "time.sleep(5)  # Wait for server to start"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "def codearena_reward_func(completions, prompts):\n",
        "    \"\"\"\n",
        "    Reward function that queries the CodeArena OpenEnv server.\n",
        "    For each proposed fix in `completions`, we step the environment.\n",
        "    \"\"\"\n",
        "    rewards = []\n",
        "    for completion in completions:\n",
        "        # Clean the generated code\n",
        "        proposed_fix = completion[0].get('content', '').strip()\n",
        "        if proposed_fix.startswith('```python'):\n",
        "            proposed_fix = proposed_fix[9:].replace('```', '').strip()\n",
        "            \n",
        "        try:\n",
        "            # Step the environment\n",
        "            res = httpx.post(\n",
        "                \"http://localhost:7860/step\",\n",
        "                json={\"proposed_fix\": proposed_fix},\n",
        "                timeout=10.0\n",
        "            )\n",
        "            res.raise_for_status()\n",
        "            reward = res.json().get('reward', 0.0)\n",
        "            rewards.append(reward)\n",
        "        except Exception as e:\n",
        "            print(f\"Env Error: {e}\")\n",
        "            rewards.append(0.0)\n",
        "            \n",
        "    return rewards"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "# Load Model\n",
        "model_name = \"Qwen/Qwen2.5-Coder-1.5B\"\n",
        "model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, device_map=\"auto\")\n",
        "tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
        "tokenizer.pad_token = tokenizer.eos_token\n",
        "\n",
        "# Load dataset for Coding Debugging and Time Complexity Optimization\n",
        "dataset = load_dataset(\"m-a-p/Code-Feedback\", split=\"train\")\n",
        "\n",
        "def format_prompt(example):\n",
        "    # m-a-p/Code-Feedback contains 'messages' with user and assistant roles\n",
        "    messages = example.get('messages', [])\n",
        "    user_query = \"\"\n",
        "    if messages and len(messages) > 0 and messages[0].get('role') == 'user':\n",
        "        user_query = messages[0].get('content', '')\n",
        "    \n",
        "    prompt = f\"Optimize and debug this code to improve time complexity:\\n{user_query}\"\n",
        "    return {\"prompt\": prompt}\n",
        "\n",
        "dataset = dataset.map(format_prompt)\n",
        "# Keep only the prompt column for the trainer\n",
        "dataset = dataset.select_columns([\"prompt\"])\n",
        "# Limit for demo purposes\n",
        "dataset = dataset.select(range(100))\n",
        "\n",
        "# Initialize GRPO Trainer\n",
        "training_args = GRPOConfig(\n",
        "    output_dir=\"./codearena-grpo\",\n",
        "    learning_rate=1e-5,\n",
        "    max_steps=50,\n",
        "    per_device_train_batch_size=2,\n",
        "    gradient_accumulation_steps=2,\n",
        ")\n",
        "\n",
        "trainer = GRPOTrainer(\n",
        "    model=model,\n",
        "    reward_funcs=codearena_reward_func,\n",
        "    args=training_args,\n",
        "    train_dataset=dataset,\n",
        ")\n",
        "\n",
        "trainer.train()"
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "venv",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.13.6"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 4
}