In [1]:
from datasets import load_dataset
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

In [2]:
# Hub identifier of the question/answer corpus used throughout the notebook.
DATASET_ID = "higgsfield/school-math-questions"

# Fetch every available split; later cells read the "train" split.
ds = load_dataset(DATASET_ID)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [3]:
# Peek at a single training row to confirm the record layout before building pairs.
sample_record = ds["train"][1]
print(sample_record)

{'prompt': 'Question: Weng earns $12 an hour for babysitting. Yesterday, she just did 50 minutes of babysitting. How much did she earn?\nAnswer: ', 'completion': 'Weng earns 12/60 = $0.2 per minute.\nWorking 50 minutes, she earned 0.2 x 50 = $10.'}


In [4]:
# Flatten the training split into (prompt, completion) tuples, in dataset order.
qa_pairs = []
for row in ds["train"]:
    qa_pairs.append((row["prompt"], row["completion"]))

In [5]:
class MathDataset(torch.utils.data.Dataset):
    """Fixed-length causal-LM training examples built from (question, answer) pairs.

    Each example is ONE token sequence, ``"Q: {question} A: {answer}<eos>"``,
    so that ``labels`` line up position-for-position with ``input_ids`` as a
    decoder-only language model expects (the one-token shift is assumed to be
    applied inside the model's loss, as documented for Hugging Face causal-LM
    heads). Only the answer tokens (and the closing EOS) carry real labels;
    prompt and padding positions are set to ``IGNORE_INDEX`` so they do not
    contribute to the loss.

    Args:
        qa_pairs: Indexable collection of ``(question, answer)`` string pairs.
        tokenizer: Object exposing ``encode(text) -> sequence of int`` plus the
            attributes ``eos_token_id`` and ``pad_token_id`` (either may be
            ``None``).
        max_length: Exact length of every returned tensor. Longer sequences
            are truncated on the right, shorter ones are right-padded.
    """

    # Label value skipped by the cross-entropy loss.
    IGNORE_INDEX = -100

    def __init__(self, qa_pairs, tokenizer, max_length=128):
        self.qa_pairs = qa_pairs
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of (question, answer) pairs."""
        return len(self.qa_pairs)

    def __getitem__(self, idx):
        """Build the training example for pair ``idx``.

        Returns:
            dict with three ``torch.long`` tensors of shape ``(max_length,)``:
            ``input_ids``, ``attention_mask`` (1 = real token, 0 = padding)
            and ``labels`` (answer token ids, ``IGNORE_INDEX`` elsewhere).
        """
        question, answer = self.qa_pairs[idx]
        # Same prompt template the chat bot uses at inference time.
        prompt_text = f"Q: {question} A:"

        prompt_ids = list(self.tokenizer.encode(prompt_text))
        # Leading space separates the answer from the "A:" marker.
        answer_ids = list(self.tokenizer.encode(" " + answer.strip()))

        # Close the answer with EOS so the model learns where to stop.
        eos_id = self.tokenizer.eos_token_id
        if eos_id is not None:
            answer_ids.append(eos_id)

        # Right-truncate to the window. NOTE: if the prompt alone fills the
        # window, every label is IGNORE_INDEX and the example adds no signal.
        token_ids = (prompt_ids + answer_ids)[: self.max_length]
        label_ids = ([self.IGNORE_INDEX] * len(prompt_ids) + answer_ids)[: self.max_length]
        attention = [1] * len(token_ids)

        # Mask padding by POSITION rather than by token id: when pad == eos
        # an id-based mask would also erase the genuine EOS label.
        pad_id = self.tokenizer.pad_token_id
        if pad_id is None:
            pad_id = eos_id if eos_id is not None else 0
        n_pad = self.max_length - len(token_ids)
        token_ids = token_ids + [pad_id] * n_pad
        label_ids = label_ids + [self.IGNORE_INDEX] * n_pad
        attention = attention + [0] * n_pad

        return {
            "input_ids": torch.tensor(token_ids, dtype=torch.long),
            "attention_mask": torch.tensor(attention, dtype=torch.long),
            "labels": torch.tensor(label_ids, dtype=torch.long),
        }

In [None]:
from transformers import Trainer, TrainingArguments

# Directory that receives the final fine-tuned weights and tokenizer files.
# Pass this path as `model_name` to MathChatBot to chat with the tuned model.
FINE_TUNED_DIR = "./results/final"

model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
# Reuse EOS as the pad token so fixed-length padding has an id to use.
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained(model_name)

math_dataset = MathDataset(qa_pairs, tokenizer)

# Set training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=2,
    logging_steps=50,    # report the training loss regularly
    save_steps=500,      # a checkpoint every 10 steps was almost pure disk I/O
    save_total_limit=2,  # keep only the two most recent checkpoints
)

# Create a Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=math_dataset,
)

# Fine-tune the model
trainer.train()

# Persist the result: intermediate checkpoints are rotated away by
# save_total_limit, and the tokenizer is not handed to the Trainer, so both
# are written explicitly to one stable, reloadable directory.
trainer.save_model(FINE_TUNED_DIR)
tokenizer.save_pretrained(FINE_TUNED_DIR)




[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mstar-nguyenanhkiet-2302[0m ([33mstar-nguyenanhkiet-2302-tr-ng-i-h-c-khoa-h-c-t-nhi-n-hqg-hcm[0m). Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss


In [None]:
class MathChatBot:
    """Question-answering wrapper around a GPT-2 language model.

    Args:
        model_name: Identifier handed to ``from_pretrained`` for both the
            tokenizer and the model (e.g. ``"gpt2"``, or a directory that
            holds saved fine-tuned weights and tokenizer files).
    """

    def __init__(self, model_name="gpt2"):
        self.tokenizer = GPT2Tokenizer.from_pretrained(model_name)
        self.model = GPT2LMHeadModel.from_pretrained(model_name)

    def get_response(self, question, max_new_tokens=50):
        """Generate an answer for ``question``.

        Args:
            question: Free-text question from the user.
            max_new_tokens: Upper bound on the number of tokens generated
                AFTER the prompt. (A total-length cap would let a long
                question consume the entire budget and leave no room for
                an answer.)

        Returns:
            The generated continuation with special tokens removed and
            surrounding whitespace stripped; the prompt is not included.
        """
        input_text = f"Q: {question} A:"
        encoded = self.tokenizer(input_text, return_tensors="pt")
        input_ids = encoded["input_ids"]
        prompt_len = input_ids.shape[-1]

        output = self.model.generate(
            input_ids,
            attention_mask=encoded["attention_mask"],
            max_new_tokens=max_new_tokens,
            num_return_sequences=1,
            # No dedicated pad token is configured here; pad with EOS explicitly.
            pad_token_id=self.tokenizer.eos_token_id,
        )

        # Slice the prompt off by token position instead of splitting the
        # decoded text on "A:", which would cut an answer containing "A:".
        completion_ids = output[0][prompt_len:]
        return self.tokenizer.decode(completion_ids, skip_special_tokens=True).strip()

# Usage: interactive chat loop. Type "exit" (or interrupt / send EOF) to quit.
if __name__ == "__main__":
    import os  # local import: only this cell needs it

    # Prefer fine-tuned weights when a saved model directory is present at this
    # path; otherwise fall back to MathChatBot's default base weights.
    fine_tuned_dir = "./results/final"
    if os.path.isdir(fine_tuned_dir):
        bot = MathChatBot(fine_tuned_dir)
    else:
        bot = MathChatBot()

    while True:
        try:
            user_input = input("You: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Interrupting the kernel or closing stdin ends the chat cleanly
            # instead of surfacing a traceback.
            print("\nExiting chat...")
            break
        if user_input.lower() == "exit":
            print("Exiting chat...")
            break
        if not user_input:
            # Nothing to answer; prompt again rather than querying the model.
            continue
        response = bot.get_response(user_input)
        print(f"Bot: {response}")