teolm30 commited on
Commit
25337cd
·
verified ·
1 Parent(s): 26463a3

Add Colab notebook for free GPU training

Browse files
Files changed (1) hide show
  1. fox1.3_colab.ipynb +327 -0
fox1.3_colab.ipynb ADDED
@@ -0,0 +1,327 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "nbformat": 4,
3
+ "nbformat_minor": 0,
4
+ "metadata": {
5
+ "colab": {
6
+ "provenance": [],
7
+ "gpuType": "T4",
8
+ "accelerator": "GPU"
9
+ },
10
+ "kernelspec": {
11
+ "name": "python3",
12
+ "display_name": "Python 3"
13
+ },
14
+ "language_info": {
15
+ "name": "python"
16
+ }
17
+ },
18
+ "cells": [
19
+ {
20
+ "cell_type": "markdown",
21
+ "metadata": {},
22
+ "source": [
23
+ "# 🦊 Fox1.3 Training & Evaluation Pipeline\n",
24
+ "\n",
25
+ "**This notebook:**\n",
26
+ "1. Clones Fox1.3 from HuggingFace\n",
27
+ "2. Runs HumanEval + MBPP benchmarks (baseline)\n",
28
+ "3. Fine-tunes with LoRA on CodeAlpaca_20K\n",
29
+ "4. Runs benchmarks again (improved score)\n",
30
+ "5. Pushes back to HuggingFace\n",
31
+ "\n",
32
+ "**Runtime:** Runtime → Change runtime type → GPU (T4 recommended)"
33
+ ]
34
+ },
35
+ {
36
+ "cell_type": "code",
37
+ "execution_count": null,
38
+ "metadata": {},
39
+ "outputs": [],
40
+ "source": [
41
+ "# Install dependencies\n",
42
+ "!pip install -q transformers peft bitsandbytes accelerate datasets scipy torch\n",
43
+ "!pip install -q huggingface_hub"
44
+ ]
45
+ },
46
+ {
47
+ "cell_type": "code",
48
+ "execution_count": null,
49
+ "metadata": {},
50
+ "outputs": [],
51
+ "source": [
52
+ "# Login to HuggingFace (use your token)\n",
53
+ "from huggingface_hub import login\n",
54
+ "import getpass; HF_TOKEN = getpass.getpass(\"Enter your HF token (hf_...): \")\n",
55
+ "login(token=HF_TOKEN)"
56
+ ]
57
+ },
58
+ {
59
+ "cell_type": "code",
60
+ "execution_count": null,
61
+ "metadata": {},
62
+ "outputs": [],
63
+ "source": [
64
+ "# Clone the Fox1.3 repo\n",
65
+ "!git clone https://huggingface.co/teolm30/fox1.3 fox1.3-repo\n",
66
+ "%cd fox1.3-repo\n",
67
+ "!ls"
68
+ ]
69
+ },
70
+ {
71
+ "cell_type": "code",
72
+ "execution_count": null,
73
+ "metadata": {},
74
+ "outputs": [],
75
+ "source": [
76
+ "%%writefile evaluate.py\n",
77
+ "#!/usr/bin/env python3\n",
78
+ "\"\"\"Fox1.3 Evaluation - HumanEval + MBPP\"\"\"\n",
79
+ "import torch\n",
80
+ "from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline\n",
81
+ "from datasets import load_dataset\n",
82
+ "import json\n",
83
+ "\n",
84
+ "MODEL_NAME = \"teolm30/fox1.3\"\n",
85
+ "DEVICE = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
86
+ "print(f\"Using device: {DEVICE}\")\n",
87
+ "\n",
88
+ "def load_model():\n",
89
+ " tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)\n",
90
+ " tokenizer.pad_token = tokenizer.eos_token\n",
91
+ " model = AutoModelForCausalLM.from_pretrained(\n",
92
+ " MODEL_NAME,\n",
93
+ " torch_dtype=torch.float16,\n",
94
+ " device_map=\"auto\",\n",
95
+ " trust_remote_code=True\n",
96
+ " )\n",
97
+ " return model, tokenizer\n",
98
+ "\n",
99
+ "def run_humaneval(model, tokenizer):\n",
100
+ " dataset = load_dataset(\"openai/openai_humaneval\", split=\"test\")\n",
101
+ " pipe = pipeline(\"text-generation\", model=model, tokenizer=tokenizer,\n",
102
+ " max_new_tokens=256, do_sample=False, pad_token_id=tokenizer.eos_token_id)\n",
103
+ " correct = 0\n",
104
+ " for i, item in enumerate(dataset):\n",
105
+ " prompt = item[\"prompt\"]\n",
106
+ " test = item[\"test\"]\n",
107
+ " try:\n",
108
+ " out = pipe(prompt, pad_token_id=tokenizer.eos_token_id)\n",
109
+ " code = out[0][\"generated_text\"][len(prompt):].strip()\n",
110
+ " if \"```python\" in code:\n",
111
+ " code = code.split(\"```python\")[1].split(\"```\")[0].strip()\n",
112
+ " exec_globals = {}\n",
113
+ " exec(code, exec_globals)\n",
114
+ " exec(test, exec_globals)\n",
115
+ " correct += 1\n",
116
+ "        except Exception:\n",
117
+ " pass\n",
118
+ " if (i+1) % 20 == 0:\n",
119
+ " print(f\"HumanEval: {i+1}/{len(dataset)} | Running score: {correct}/{i+1}\")\n",
120
+ " print(f\"HumanEval PASS@1: {correct}/{len(dataset)} = {correct/len(dataset):.4f}\")\n",
121
+ " return correct / len(dataset)\n",
122
+ "\n",
123
+ "def run_mbpp(model, tokenizer):\n",
124
+ "    dataset = load_dataset(\"google-research-datasets/mbpp\", \"sanitized\", split=\"test\")\n",
125
+ " pipe = pipeline(\"text-generation\", model=model, tokenizer=tokenizer,\n",
126
+ " max_new_tokens=256, do_sample=False, pad_token_id=tokenizer.eos_token_id)\n",
127
+ " correct = 0\n",
128
+ "    for i, item in enumerate(dataset):\n",
129
+ " prompt = f\"### Instruction:\\nWrite a Python function.\\n\\n### Input:\\n{item['prompt']}\\n\\n### Response:\\n\"\n",
130
+ " try:\n",
131
+ " out = pipe(prompt, pad_token_id=tokenizer.eos_token_id)\n",
132
+ " code = out[0][\"generated_text\"][len(prompt):].strip()\n",
133
+ " if \"```python\" in code:\n",
134
+ " code = code.split(\"```python\")[1].split(\"```\")[0].strip()\n",
135
+ " exec_globals = {}\n",
136
+ " exec(code, exec_globals)\n",
137
+ " all_pass = True\n",
138
+ " for test in item[\"test_list\"]:\n",
139
+ " try:\n",
140
+ " exec(test, exec_globals)\n",
141
+ "            except Exception:\n",
142
+ " all_pass = False\n",
143
+ " break\n",
144
+ " if all_pass:\n",
145
+ " correct += 1\n",
146
+ "        except Exception:\n",
147
+ " pass\n",
148
+ " if (i+1) % 50 == 0:\n",
149
+ "            print(f\"MBPP: {i+1}/{len(dataset)} | Running score: {correct}/{i+1}\")\n",
150
+ "    print(f\"MBPP PASS@1: {correct}/{len(dataset)} = {correct/len(dataset):.4f}\")\n",
151
+ "    return correct / len(dataset)\n",
152
+ "\n",
153
+ "print(\"Loading model...\")\n",
154
+ "model, tokenizer = load_model()\n",
155
+ "print(\"\\n=== BASELINE BENCHMARK ===\")\n",
156
+ "he_score = run_humaneval(model, tokenizer)\n",
157
+ "mbpp_score = run_mbpp(model, tokenizer)\n",
158
+ "print(f\"\\nBaseline: HumanEval={he_score:.4f}, MBPP={mbpp_score:.4f}\")"
159
+ ]
160
+ },
161
+ {
162
+ "cell_type": "code",
163
+ "execution_count": null,
164
+ "metadata": {},
165
+ "outputs": [],
166
+ "source": [
167
+ "%%writefile train.py\n",
168
+ "#!/usr/bin/env python3\n",
169
+ "\"\"\"Fox1.3 LoRA Fine-tuning\"\"\"\n",
170
+ "import os\n",
171
+ "import torch\n",
172
+ "from datasets import load_dataset\n",
173
+ "from transformers import (\n",
174
+ " AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig,\n",
175
+ " TrainingArguments, Trainer, DataCollatorForLanguageModeling\n",
176
+ ")\n",
177
+ "from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training\n",
178
+ "import logging\n",
179
+ "logging.basicConfig(level=logging.INFO)\n",
180
+ "\n",
181
+ "MODEL_NAME = \"teolm30/fox1.3\"\n",
182
+ "DATASET_NAME = \"HuggingFaceH4/CodeAlpaca_20K\"\n",
183
+ "OUTPUT_DIR = \"/tmp/fox1.3-checkpoints\"\n",
184
+ "\n",
185
+ "def format_instruction(example):\n",
186
+ " inst = example.get(\"instruction\", \"\")\n",
187
+ " inp = example.get(\"input\", \"\")\n",
188
+ " out = example.get(\"output\", \"\")\n",
189
+ " if inp:\n",
190
+ " text = f\"### Instruction:\\n{inst}\\n\\n### Input:\\n{inp}\\n\\n### Response:\\n{out}\"\n",
191
+ " else:\n",
192
+ " text = f\"### Instruction:\\n{inst}\\n\\n### Response:\\n{out}\"\n",
193
+ " return {\"text\": text}\n",
194
+ "\n",
195
+ "print(\"Loading tokenizer...\")\n",
196
+ "tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)\n",
197
+ "tokenizer.pad_token = tokenizer.eos_token\n",
198
+ "\n",
199
+ "print(\"Loading model with 4-bit quantization...\")\n",
200
+ "bnb_config = BitsAndBytesConfig(\n",
201
+ " load_in_4bit=True, bnb_4bit_quant_type=\"nf4\",\n",
202
+ " bnb_4bit_compute_dtype=torch.float16, bnb_4bit_use_double_quant=True,\n",
203
+ ")\n",
204
+ "model = AutoModelForCausalLM.from_pretrained(\n",
205
+ " MODEL_NAME, quantization_config=bnb_config, device_map=\"auto\", trust_remote_code=True\n",
206
+ ")\n",
207
+ "model = prepare_model_for_kbit_training(model)\n",
208
+ "\n",
209
+ "lora_config = LoraConfig(\n",
210
+ " r=8, lora_alpha=16,\n",
211
+ " target_modules=[\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\"],\n",
212
+ " lora_dropout=0.05, bias=\"none\", task_type=\"CAUSAL_LM\"\n",
213
+ ")\n",
214
+ "model = get_peft_model(model, lora_config)\n",
215
+ "model.print_trainable_parameters()\n",
216
+ "\n",
217
+ "print(\"Loading dataset...\")\n",
218
+ "dataset = load_dataset(DATASET_NAME, split=\"train\")\n",
219
+ "dataset = dataset.map(format_instruction, remove_columns=dataset.column_names)\n",
220
+ "dataset = dataset.filter(lambda x: x[\"text\"] is not None)\n",
221
+ "\n",
222
+ "def tokenize(example):\n",
223
+ " return tokenizer(example[\"text\"], truncation=True, max_length=1024, padding=\"max_length\")\n",
224
+ "\n",
225
+ "dataset = dataset.map(tokenize, batched=True, remove_columns=[\"text\"])\n",
226
+ "dataset = dataset.train_test_split(test_size=0.1)\n",
227
+ "train_ds, eval_ds = dataset[\"train\"], dataset[\"test\"]\n",
228
+ "\n",
229
+ "training_args = TrainingArguments(\n",
230
+ " output_dir=OUTPUT_DIR,\n",
231
+ " num_train_epochs=3,\n",
232
+ " per_device_train_batch_size=4,\n",
233
+ " per_device_eval_batch_size=4,\n",
234
+ " learning_rate=2e-4,\n",
235
+ " warmup_steps=50,\n",
236
+ " logging_steps=20,\n",
237
+ " eval_strategy=\"epoch\",\n",
238
+ " save_strategy=\"epoch\",\n",
239
+ "    fp16=True,\n",
240
+ "    tf32=False,\n",
241
+ " optim=\"paged_adamw_8bit\",\n",
242
+ " group_by_length=True,\n",
243
+ " report_to=\"none\",\n",
244
+ ")\n",
245
+ "\n",
246
+ "trainer = Trainer(\n",
247
+ " model=model,\n",
248
+ " args=training_args,\n",
249
+ " train_dataset=train_ds,\n",
250
+ " eval_dataset=eval_ds,\n",
251
+ " data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),\n",
252
+ ")\n",
253
+ "\n",
254
+ "print(\"Starting training... (this will take ~30-60 min on T4)\")\n",
255
+ "trainer.train()\n",
256
+ "print(\"Training complete!\")\n",
257
+ "\n",
258
+ "# Save merged model\n",
259
+ "print(\"Merging and saving...\")\n",
260
+ "merged_model = model.merge_and_unload()\n",
261
+ "merged_model.save_pretrained(\"/tmp/fox1.3-improved\")\n",
262
+ "tokenizer.save_pretrained(\"/tmp/fox1.3-improved\")\n",
263
+ "print(\"Done! Model saved to /tmp/fox1.3-improved\")"
264
+ ]
265
+ },
266
+ {
267
+ "cell_type": "code",
268
+ "execution_count": null,
269
+ "metadata": {},
270
+ "outputs": [],
271
+ "source": [
272
+ "# Run baseline benchmark\n",
273
+ "!python3 evaluate.py 2>&1"
274
+ ]
275
+ },
276
+ {
277
+ "cell_type": "code",
278
+ "execution_count": null,
279
+ "metadata": {},
280
+ "outputs": [],
281
+ "source": [
282
+ "# Run training (this takes 30-60 min on free T4)\n",
283
+ "!python3 train.py 2>&1"
284
+ ]
285
+ },
286
+ {
287
+ "cell_type": "code",
288
+ "execution_count": null,
289
+ "metadata": {},
290
+ "outputs": [],
291
+ "source": [
292
+ "# Push improved model to HF\n",
293
+ "from huggingface_hub import HfApi, create_repo\n",
294
+ "import os\n",
295
+ "\n",
296
+ "api = HfApi()\n",
297
+ "repo_id = \"teolm30/fox1.3\"\n",
298
+ "\n",
299
+ "print(\"Uploading improved model...\")\n",
300
+ "api.upload_folder(\n",
301
+ " folder_path=\"/tmp/fox1.3-improved\",\n",
302
+ " repo_id=repo_id,\n",
303
+ " repo_type=\"model\",\n",
304
+ " commit_message=\"Fine-tuned on CodeAlpaca_20K (LoRA, 3 epochs)\",\n",
305
+ ")\n",
306
+ "print(f\"\\n✅ Uploaded! https://huggingface.co/{repo_id}\")"
307
+ ]
308
+ },
309
+ {
310
+ "cell_type": "code",
311
+ "execution_count": null,
312
+ "metadata": {},
313
+ "outputs": [],
314
+ "source": [
315
+ "# Run benchmark on improved model\n",
316
+ "import torch\n",
317
+ "from transformers import AutoModelForCausalLM, AutoTokenizer\n",
318
+ "MODEL_NAME = \"teolm30/fox1.3\"\n",
319
+ "print(\"Loading improved model...\")\n",
320
+ "tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)\n",
321
+ "tokenizer.pad_token = tokenizer.eos_token\n",
322
+ "model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=torch.float16, device_map=\"auto\", trust_remote_code=True)\n",
323
+ "print(\"Model loaded! Run evaluate.py to get final scores.\")"
324
+ ]
325
+ }
326
+ ]
327
+ }