In [None]:
%pip install evaluate

In [None]:
import numpy as np
import matplotlib
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset
import evaluate
from copy import deepcopy

# Global experiment settings shared by the cells below.
SEED = 42              # passed to TrainingArguments(seed=...)
MODEL = "gpt2-medium"  # model id used for both the model and its tokenizer

# Run on the GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"


In [None]:
def tokenize(x, tokenizer):
    """Tokenize ``x["text"]`` to fixed-length inputs and attach causal-LM labels.

    Args:
        x: Mapping with a "text" entry: a single string, or a list of strings
            when called through ``Dataset.map(..., batched=True)``.
        tokenizer: Callable tokenizer; called with ``padding="max_length"``,
            ``truncation=True`` and ``max_length=512``.

    Returns:
        The tokenizer output with an added "label" entry. It is a copy of
        "input_ids" in which every position whose attention mask is 0
        (i.e. padding) is replaced by -100 so it does not contribute to the
        loss. If the tokenizer returns no "attention_mask", the labels are a
        plain copy of "input_ids".
    """
    # -100 is the label value that PyTorch cross-entropy / HF models skip.
    ignore_index = -100

    output = tokenizer(x["text"], padding="max_length", truncation=True, max_length=512)
    input_ids = output["input_ids"]
    attention_mask = output["attention_mask"] if "attention_mask" in output else None

    def _mask_padding(ids, mask):
        # Padding cannot be detected by token id because gen_tokenizer() reuses
        # EOS as the pad token; the attention mask is the only reliable signal.
        return [tok if keep else ignore_index for tok, keep in zip(ids, mask)]

    is_batched = bool(input_ids) and isinstance(input_ids[0], (list, tuple))
    if attention_mask is None:
        labels = [list(ids) for ids in input_ids] if is_batched else list(input_ids)
    elif is_batched:
        labels = [_mask_padding(ids, mask) for ids, mask in zip(input_ids, attention_mask)]
    else:
        labels = _mask_padding(input_ids, attention_mask)

    # The key stays "label" (not "labels") to keep the existing column name.
    output["label"] = labels
    return output

def gen_tokenizer(model_name):
    """Load the pretrained tokenizer for ``model_name`` with EOS reused as the pad token.

    Args:
        model_name: Identifier passed to ``AutoTokenizer.from_pretrained``.

    Returns:
        The loaded tokenizer, with ``pad_token`` set to its ``eos_token``.
    """
    tok = AutoTokenizer.from_pretrained(model_name)
    # tokenize() pads to a fixed length, which requires a pad token to be set.
    eos = tok.eos_token
    tok.pad_token = eos
    return tok

def finetune(config):
    """Fine-tune a causal language model on the leading rows of a train split.

    Args:
        config: Dict with the keys
            "ds": mapping of split name -> dataset; must contain "train".
            "dataset": key selecting an entry of "datasets_preprocess".
            "datasets_preprocess": mapping dataset id -> callable
                ``fn(example, tokenizer)`` returning ``{"text": ...}``.
            "model": model id for ``AutoModelForCausalLM`` and the tokenizer.
            "max_train_size": maximum number of leading training rows to use.
            "training_args": object passed to ``Trainer(args=...)``.

    Returns:
        ``(orig_model, model)``: a deep copy of the model taken before
        training, and the model object that was passed to the trainer.
    """
    ds = config["ds"]
    preprocess_function = config["datasets_preprocess"][config["dataset"]]
    tokenizer = gen_tokenizer(config["model"])

    # Clamp the requested size to the split length: selecting indices past the
    # end of the split fails, so a small dataset would otherwise abort the run.
    train_split = ds["train"]
    n_train = min(config["max_train_size"], len(train_split))

    # First build the "text" column, then tokenize it in batches.
    train_dataset = train_split.select(range(n_train)).map(
        lambda x: preprocess_function(x, tokenizer),
    )
    train_dataset = train_dataset.map(lambda x: tokenize(x, tokenizer), batched=True)

    model = AutoModelForCausalLM.from_pretrained(config["model"])
    # Snapshot the weights before training so callers can compare both models.
    orig_model = deepcopy(model)

    trainer = Trainer(
        model=model,
        args=config["training_args"],
        train_dataset=train_dataset,
        processing_class=tokenizer,
    )

    print("Starting training")
    trainer.train()
    print("Training complete")

    return orig_model, model

In [None]:
def gsm8k_preprocess(x, tokenizer):
    """Format one GSM8K row as a question/answer training text ending in EOS.

    Args:
        x: Mapping with "question" and "answer" entries.
        tokenizer: Object exposing ``eos_token``.

    Returns:
        ``{"text": ...}`` holding the formatted example.
    """
    question, answer = x["question"], x["answer"]
    prompt = f"Question: {question}\nAnswer: {answer}"
    return {"text": prompt + tokenizer.eos_token}

def svamp_preprocess(x, tokenizer):
    """Format one SVAMP row as a problem/answer training text ending in EOS.

    Args:
        x: Mapping with "question_concat" and "Answer" entries.
        tokenizer: Object exposing ``eos_token``.

    Returns:
        ``{"text": ...}`` holding the formatted example.
    """
    problem, answer = x["question_concat"], x["Answer"]
    body = f"{problem}\nAnswer: {answer}"
    return {"text": body + tokenizer.eos_token}

def tinystories_preprocess(x, tokenizer):
    """Append the tokenizer's EOS token to a TinyStories row's text.

    Args:
        x: Mapping with a "text" entry.
        tokenizer: Object exposing ``eos_token``.

    Returns:
        ``{"text": ...}`` holding the story followed by EOS.
    """
    story = x["text"]
    terminated = story + tokenizer.eos_token
    return {"text": terminated}

# Training-time preprocessors keyed by dataset id. finetune() looks one up via
# config["datasets_preprocess"][config["dataset"]] and calls it with
# (example, tokenizer).
datasets_finetune = dict(
    [
        ("openai/gsm8k", gsm8k_preprocess),
        ("ChilleD/SVAMP", svamp_preprocess),
        ("roneneldan/TinyStories", tinystories_preprocess),
    ]
)

def preprocess_test_gsm8k(x):
    """Build the generation prompt for one GSM8K row (question, empty answer slot).

    Args:
        x: Mapping with a "question" entry.

    Returns:
        ``{"text": ...}`` holding the prompt, which ends with "Answer:".
    """
    question = x["question"]
    prompt = f"Question: {question}\nAnswer:"
    return {"text": prompt}

def preprocess_test_svamp(x):
    """Build the generation prompt for one SVAMP row (problem, empty answer slot).

    Args:
        x: Mapping with a "question_concat" entry.

    Returns:
        ``{"text": ...}`` holding the prompt, which ends with "Answer:".
    """
    problem = x["question_concat"]
    prompt = f"{problem}\nAnswer:"
    return {"text": prompt}

def preprocess_test_tinystories(x):
    """Use a TinyStories row's text unchanged as the generation prompt.

    Args:
        x: Mapping with a "text" entry.

    Returns:
        ``{"text": ...}`` holding the same text.
    """
    prompt = x["text"]
    return {"text": prompt}

# Evaluation-time prompt builders keyed by dataset id. test_finetune() looks
# one up by dataset id and calls it with the example only (no tokenizer).
datasets_finetune_test = dict(
    [
        ("openai/gsm8k", preprocess_test_gsm8k),
        ("ChilleD/SVAMP", preprocess_test_svamp),
        ("roneneldan/TinyStories", preprocess_test_tinystories),
    ]
)

In [None]:
def test_finetune(dataset, ds, orig_model, model, datasets_preprocess, first_x):
    """Print generations of the original and fine-tuned models side by side.

    Args:
        dataset: Key selecting the prompt builder in ``datasets_preprocess``.
        ds: Mapping of split name -> dataset. The "validation" split is used
            when present, otherwise "test". ``ds`` is not modified.
        orig_model: Model snapshot taken before fine-tuning.
        model: Fine-tuned model.
        datasets_preprocess: Mapping dataset id -> callable ``fn(example)``
            returning ``{"text": prompt}``.
        first_x: Number of leading evaluation examples to generate for.

    Returns:
        None. Both models are moved to ``device`` and put in eval mode; the
        decoded generations are printed.
    """
    tokenizer = gen_tokenizer(MODEL)
    preprocess_function = datasets_preprocess[dataset]

    # Pick the evaluation split without writing a copy back into the caller's
    # dataset dict.
    eval_split = ds["validation"] if "validation" in ds else ds["test"]

    # Exactly `first_x` examples (bounded by the split size), not first_x + 1.
    n_examples = max(0, min(first_x, len(eval_split)))

    model = model.to(device)
    orig_model = orig_model.to(device)

    model.eval()
    orig_model.eval()

    if n_examples == 0:
        return

    # Only build prompts for the rows that will actually be generated from,
    # instead of mapping over the whole split.
    test_dataset = eval_split.select(range(n_examples)).map(preprocess_function)

    # Passing pad_token_id explicitly avoids generate() having to fall back to
    # the EOS id on every call; the tokenizer's pad token is its EOS token.
    generation_kwargs = {
        "max_new_tokens": 512,
        "pad_token_id": tokenizer.eos_token_id,
    }

    with torch.no_grad():
        for x in test_dataset:
            input_tensor = tokenizer(x["text"], return_tensors="pt").to(device)

            output = orig_model.generate(**input_tensor, **generation_kwargs)

            print("Original model output")
            print(tokenizer.decode(output[0], skip_special_tokens=True))

            finetuned_output = model.generate(**input_tensor, **generation_kwargs)

            print("Finetuned model output")
            print(tokenizer.decode(finetuned_output[0], skip_special_tokens=True))


In [None]:
def generate_config(dataset):
    """Return the notebook-level ``config`` after checking it targets ``dataset``.

    The configuration is built in a separate cell and stored in the
    module-level name ``config``; this function does not construct one. The
    check guards against fine-tuning with a stale ``config`` left over from a
    cell that was run for a different dataset.

    Args:
        dataset: Dataset id the caller is about to fine-tune on.

    Returns:
        The module-level ``config`` dict (the same object, not a copy).

    Raises:
        NameError: If no module-level ``config`` has been defined yet.
        ValueError: If ``config["dataset"]`` is not equal to ``dataset``.
    """
    active_config = config
    configured_for = active_config.get("dataset")
    if configured_for != dataset:
        raise ValueError(
            f"config was built for dataset {configured_for!r}, not {dataset!r}; "
            "rebuild config before calling generate_config()"
        )
    return active_config


In [None]:
# Fine-tune on SVAMP.
dataset = "ChilleD/SVAMP"
ds = load_dataset(dataset, "default")
# Part of the dataset id after the "/", used to name the output directory.
ds_1 = dataset.split("/")[1]

config = {
    "ds": ds,
    "dataset": dataset,
    "datasets_preprocess": datasets_finetune,
    "model": MODEL,
    # finetune() trains on at most this many leading rows of the train split.
    "max_train_size": 700,
}
config["training_args"] = TrainingArguments(
    # Output / logging / checkpointing
    output_dir=f"./results_{ds_1}",
    report_to="none",
    logging_steps=200,
    save_strategy="steps",
    # Optimisation schedule
    num_train_epochs=10,
    per_device_train_batch_size=4,
    learning_rate=5e-5,
    warmup_steps=200,
    weight_decay=0.01,
    # Model selection settings and reproducibility
    metric_for_best_model="loss",
    greater_is_better=False,
    seed=SEED,
)

orig_model, model = finetune(config)

In [None]:
# Compare the original and fine-tuned models on the current dataset's prompts.
test_finetune(
    dataset=dataset,
    ds=ds,
    orig_model=orig_model,
    model=model,
    datasets_preprocess=datasets_finetune_test,
    first_x=3,
)

In [None]:
# Fine-tune on TinyStories.
dataset = "roneneldan/TinyStories"
ds = load_dataset(dataset, "default")
# Part of the dataset id after the "/", used to name the output directory.
ds_1 = dataset.split("/")[1]

config = {
    "ds": ds,
    "dataset": dataset,
    "datasets_preprocess": datasets_finetune,
    "model": MODEL,
    # finetune() trains on at most this many leading rows of the train split.
    "max_train_size": 7000,
}
config["training_args"] = TrainingArguments(
    # Output / logging / checkpointing
    output_dir=f"./results_{ds_1}",
    report_to="none",
    logging_steps=200,
    save_strategy="steps",
    # Optimisation schedule
    num_train_epochs=1,
    per_device_train_batch_size=4,
    learning_rate=5e-5,
    warmup_steps=200,
    weight_decay=0.01,
    # Model selection settings and reproducibility
    metric_for_best_model="loss",
    greater_is_better=False,
    seed=SEED,
)

# generate_config() hands back the module-level `config` assembled above.
orig_model, model = finetune(generate_config(dataset))

In [None]:
# Compare the original and fine-tuned models on the current dataset's prompts.
test_finetune(
    dataset=dataset,
    ds=ds,
    orig_model=orig_model,
    model=model,
    datasets_preprocess=datasets_finetune_test,
    first_x=3,
)

In [None]:
import shutil
from pathlib import Path

from google.colab import files

# Archive that is sent to the browser. No other cell creates it, so build it
# here when it is missing; an already existing archive is downloaded as-is.
archive_path = Path('/content/results_TinyStories/TinyStories-checkpoint-1750.zip')
# NOTE(review): assumes the trainer wrote its step-1750 checkpoint to
# `checkpoint-1750` inside the results directory — confirm before relying on it.
checkpoint_dir = archive_path.parent / "checkpoint-1750"
if not archive_path.exists() and checkpoint_dir.is_dir():
    # make_archive appends ".zip" itself, so pass the path without the suffix.
    shutil.make_archive(str(archive_path.with_suffix("")), "zip", root_dir=str(checkpoint_dir))

files.download(str(archive_path))

In [None]:
import shutil
from pathlib import Path

from google.colab import files

# Archive that is sent to the browser. No other cell creates it, so build it
# here when it is missing; an already existing archive is downloaded as-is.
archive_path = Path('/content/results_SVAMP/SVAMP-checkpoint-1750.zip')
# NOTE(review): assumes the trainer wrote its step-1750 checkpoint to
# `checkpoint-1750` inside the results directory — confirm before relying on it.
checkpoint_dir = archive_path.parent / "checkpoint-1750"
if not archive_path.exists() and checkpoint_dir.is_dir():
    # make_archive appends ".zip" itself, so pass the path without the suffix.
    shutil.make_archive(str(archive_path.with_suffix("")), "zip", root_dir=str(checkpoint_dir))

files.download(str(archive_path))