{ "nbformat": 4, "nbformat_minor": 0, "metadata": { "colab": { "provenance": [], "gpuType": "T4" }, "kernelspec": { "name": "python3", "display_name": "Python 3" }, "language_info": { "name": "python" }, "accelerator": "GPU" }, "cells": [ { "cell_type": "code", "source": [ "%pip install evaluate" ], "metadata": { "id": "aqcbe-No3r2r" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "lOwXY3N4tmbr" }, "outputs": [], "source": [ "import numpy as np\n", "import matplotlib\n", "import torch\n", "import torch.nn as nn\n", "from torch.utils.data import Dataset, DataLoader\n", "from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments\n", "from datasets import load_dataset\n", "import evaluate\n", "from copy import deepcopy\n", "\n", "SEED=42\n", "MODEL=\"gpt2-medium\"\n", "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n" ] }, { "cell_type": "code", "source": [ "def tokenize(x, tokenizer):\n", " output = tokenizer(x[\"text\"], padding=\"max_length\", truncation=True, max_length=512)\n", " output[\"label\"] = output[\"input_ids\"].copy()\n", " return output\n", "\n", "def gen_tokenizer(model_name):\n", " tokenizer = AutoTokenizer.from_pretrained(model_name)\n", " tokenizer.pad_token = tokenizer.eos_token\n", " return tokenizer\n", "\n", "def finetune(config):\n", " ds = config[\"ds\"]\n", " preprocess_function = config[\"datasets_preprocess\"][config[\"dataset\"]]\n", " tokenizer = gen_tokenizer(config[\"model\"])\n", "\n", " train_dataset = ds[\"train\"].select(range(config[\"max_train_size\"])).map(\n", " lambda x: preprocess_function(x, tokenizer),\n", " )\n", "\n", "\n", " train_dataset = train_dataset.map(lambda x: tokenize(x, tokenizer), batched=True)\n", "\n", " model = AutoModelForCausalLM.from_pretrained(config[\"model\"])\n", " orig_model = deepcopy(model)\n", "\n", " trainer = Trainer(\n", " model=model,\n", " args=config[\"training_args\"],\n", " train_dataset=train_dataset,\n", " processing_class=tokenizer,\n", " )\n", "\n", " print(\"Starting training\")\n", " trainer.train()\n", " print(\"Training complete\")\n", "\n", " return orig_model, model" ], "metadata": { "id": "B3XugMEV5vZF" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "def gsm8k_preprocess(x, tokenizer):\n", " return {\"text\": f\"Question: {x['question']}\\nAnswer: {x['answer']}\" + tokenizer.eos_token}\n", "\n", "def svamp_preprocess(x, tokenizer):\n", " return {\"text\": f\"{x['question_concat']}\\nAnswer: {x['Answer']}\" + tokenizer.eos_token}\n", "\n", "def tinystories_preprocess(x, tokenizer):\n", " return {\"text\": x[\"text\"] + tokenizer.eos_token}\n", "\n", "datasets_finetune = {\n", " \"openai/gsm8k\": gsm8k_preprocess,\n", " \"ChilleD/SVAMP\": svamp_preprocess,\n", " \"roneneldan/TinyStories\": tinystories_preprocess\n", "}\n", "\n", "def preprocess_test_gsm8k(x):\n", " return {\"text\": f\"Question: {x['question']}\\nAnswer:\" }\n", "\n", "def preprocess_test_svamp(x):\n", " return {\"text\": f\"{x['question_concat']}\\nAnswer:\"}\n", "\n", "def preprocess_test_tinystories(x):\n", " return {\"text\": x[\"text\"]}\n", "\n", "datasets_finetune_test = {\n", " \"openai/gsm8k\": preprocess_test_gsm8k,\n", " \"ChilleD/SVAMP\": preprocess_test_svamp,\n", " \"roneneldan/TinyStories\": preprocess_test_tinystories\n", "}" ], "metadata": { "id": "Y09qs3FFxwx1" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "def test_finetune(dataset, 
{ "cell_type": "code", "source": [
 "def test_finetune(dataset, ds, orig_model, model, datasets_preprocess, first_x):\n",
 "    tokenizer = gen_tokenizer(MODEL)\n",
 "    preprocess_function = datasets_preprocess[dataset]\n",
 "    # Some datasets (e.g. TinyStories) ship a validation split instead of test.\n",
 "    split = \"validation\" if \"validation\" in ds else \"test\"\n",
 "    test_dataset = ds[split].map(lambda x: preprocess_function(x))\n",
 "\n",
 "    model = model.to(device)\n",
 "    orig_model = orig_model.to(device)\n",
 "    model.eval()\n",
 "    orig_model.eval()\n",
 "\n",
 "    # Compare generations of the base and fine-tuned models on the first\n",
 "    # `first_x` test examples.\n",
 "    with torch.no_grad():\n",
 "        for xi, x in enumerate(test_dataset):\n",
 "            if xi >= first_x:\n",
 "                break\n",
 "            input_tensor = tokenizer(x[\"text\"], return_tensors=\"pt\").to(device)\n",
 "\n",
 "            output = orig_model.generate(**input_tensor, max_new_tokens=512,\n",
 "                                         pad_token_id=tokenizer.eos_token_id)\n",
 "            print(\"Original model output\")\n",
 "            print(tokenizer.decode(output[0], skip_special_tokens=True))\n",
 "\n",
 "            finetuned_output = model.generate(**input_tensor, max_new_tokens=512,\n",
 "                                              pad_token_id=tokenizer.eos_token_id)\n",
 "            print(\"Finetuned model output\")\n",
 "            print(tokenizer.decode(finetuned_output[0], skip_special_tokens=True))\n"
], "metadata": { "id": "zQqD3dHWDj6H" }, "execution_count": null, "outputs": [] },
{ "cell_type": "code", "source": [
 "dataset = \"ChilleD/SVAMP\"\n",
 "ds = load_dataset(dataset, \"default\")\n",
 "ds_name = dataset.split('/')[1]\n",
 "\n",
 "config = {\n",
 "    \"ds\": ds,\n",
 "    \"dataset\": dataset,\n",
 "    \"datasets_preprocess\": datasets_finetune,\n",
 "    \"model\": MODEL,\n",
 "    \"max_train_size\": 700,\n",
 "    \"training_args\": TrainingArguments(\n",
 "        output_dir=f\"./results_{ds_name}\",\n",
 "        report_to=\"none\",\n",
 "        num_train_epochs=10,\n",
 "        per_device_train_batch_size=4,\n",
 "        warmup_steps=200,\n",
 "        learning_rate=5e-5,\n",
 "        weight_decay=0.01,\n",
 "        logging_steps=200,\n",
 "        save_strategy=\"steps\",\n",
 "        seed=SEED,\n",
 "    ),\n",
 "}\n",
 "\n",
 "orig_model, model = finetune(config)\n"
], "metadata": { "id": "WSGsa3Xtx04j" }, "execution_count": null, "outputs": [] },
{ "cell_type": "code", "source": [ "test_finetune(dataset, ds, orig_model, model, datasets_finetune_test, 3)" ], "metadata": { "id": "2Z6kyEGqL7zN" }, "execution_count": null, "outputs": [] },
{ "cell_type": "code", "source": [
 "dataset = \"roneneldan/TinyStories\"\n",
 "ds = load_dataset(dataset, \"default\")\n",
 "ds_name = dataset.split('/')[1]\n",
 "\n",
 "config = {\n",
 "    \"ds\": ds,\n",
 "    \"dataset\": dataset,\n",
 "    \"datasets_preprocess\": datasets_finetune,\n",
 "    \"model\": MODEL,\n",
 "    \"max_train_size\": 7000,\n",
 "    \"training_args\": TrainingArguments(\n",
 "        output_dir=f\"./results_{ds_name}\",\n",
 "        report_to=\"none\",\n",
 "        num_train_epochs=1,\n",
 "        per_device_train_batch_size=4,\n",
 "        warmup_steps=200,\n",
 "        learning_rate=5e-5,\n",
 "        weight_decay=0.01,\n",
 "        logging_steps=200,\n",
 "        save_strategy=\"steps\",\n",
 "        seed=SEED,\n",
 "    ),\n",
 "}\n",
 "\n",
 "orig_model, model = finetune(config)\n"
], "metadata": { "id": "mOzvvJWP_PL1" }, "execution_count": null, "outputs": [] },
{ "cell_type": "code", "source": [ "test_finetune(dataset, ds, orig_model, model, datasets_finetune_test, 3)" ], "metadata": { "id": "X6WryZ6p3xGm" }, "execution_count": null, "outputs": [] },
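{ "cell_type": "code", "source": [
 "# The `evaluate` library is installed and imported above but never used. This\n",
 "# is a minimal, illustrative sketch of one way to use it: compare the mean\n",
 "# perplexity of the base and fine-tuned models on a few test prompts. The\n",
 "# perplexity metric loads models by id or path, so the fine-tuned weights are\n",
 "# first saved to \"./finetuned_model\" (an arbitrary directory name, not from\n",
 "# the original notebook).\n",
 "split = \"validation\" if \"validation\" in ds else \"test\"\n",
 "eval_texts = [datasets_finetune_test[dataset](x)[\"text\"] for x in ds[split].select(range(8))]\n",
 "\n",
 "model.save_pretrained(\"./finetuned_model\")\n",
 "gen_tokenizer(MODEL).save_pretrained(\"./finetuned_model\")\n",
 "\n",
 "perplexity = evaluate.load(\"perplexity\", module_type=\"metric\")\n",
 "base = perplexity.compute(model_id=MODEL, predictions=eval_texts, device=device)\n",
 "tuned = perplexity.compute(model_id=\"./finetuned_model\", predictions=eval_texts, device=device)\n",
 "print(f\"base mean perplexity:  {base['mean_perplexity']:.2f}\")\n",
 "print(f\"tuned mean perplexity: {tuned['mean_perplexity']:.2f}\")\n"
], "metadata": {}, "execution_count": null, "outputs": [] },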
"execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "from google.colab import files\n", "files.download('/content/results_TinyStories/TinyStories-checkpoint-1750.zip')" ], "metadata": { "id": "LBJxFu5oVP29" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [], "metadata": { "id": "xpFlk05UW87Q" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "from google.colab import files\n", "files.download('/content/results_SVAMP/SVAMP-checkpoint-1750.zip')" ], "metadata": { "id": "jxnCHVVDVO6j" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [], "metadata": { "id": "TygO0jjlVWG_" }, "execution_count": null, "outputs": [] } ] }