|
|
from transformers import TrainingArguments, Trainer |
|
|
from datasets import load_dataset |
|
|
import jsonlines |
|
|
import os |
|
|
import torch |
|
|
from model import Transformer, ModelArgs |
|
|
from tokenizer import Tokenizer |
|
|
|
|
|
class MathDataset(torch.utils.data.Dataset):
    """Tokenized math examples read from one or more JSON Lines files.

    Two record layouts are handled:

    * records containing a ``proof_steps`` key are rendered from their
      ``problem`` and ``solution`` fields plus the ``text`` of every step;
    * all other records are rendered from ``question`` and ``answer``.

    Args:
        tokenizer: Callable invoked as ``tokenizer(text, padding="max_length",
            truncation=True, max_length=..., return_tensors="pt")``. It is
            expected to return a mapping with ``input_ids`` and
            ``attention_mask`` tensors whose first dimension has size one.
        data_paths: Paths of the JSON Lines files to load. A single string is
            treated as one path rather than iterated character by character.
        max_length: Sequence length forwarded to the tokenizer.
        label_ignore_index: Value written into ``labels`` wherever
            ``attention_mask`` is 0 so padding is excluded from the loss.
            Pass ``None`` to keep ``labels`` a plain copy of ``input_ids``.
            NOTE(review): -100 is the default ``ignore_index`` of PyTorch's
            cross-entropy loss; confirm the model's loss uses that default.
    """

    # Class-level default so instances built without __init__ still mask padding.
    label_ignore_index = -100

    def __init__(self, tokenizer, data_paths, max_length=512, label_ignore_index=-100):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.label_ignore_index = label_ignore_index
        self.data = []

        if isinstance(data_paths, str):
            data_paths = [data_paths]

        for path in data_paths:
            with jsonlines.open(path) as reader:
                # The reader is iterable; no intermediate list is needed.
                self.data.extend(reader)

    def __len__(self):
        """Return the number of loaded records."""
        return len(self.data)

    @staticmethod
    def _format_example(example):
        """Render one record as the text that is fed to the tokenizer."""
        if "proof_steps" in example:
            steps = "".join(f"- {step['text']}\n" for step in example["proof_steps"])
            header = (
                f"Problem: {example['problem']}\n"
                f"Solution: {example['solution']}\n"
                "Proof Steps:\n"
            )
            return header + steps
        return f"Question: {example['question']}\nAnswer: {example['answer']}"

    def __getitem__(self, idx):
        """Tokenize record ``idx``.

        Returns:
            A dict with ``input_ids``, ``attention_mask`` and ``labels``.
            ``labels`` is an independent copy of ``input_ids`` in which
            positions with ``attention_mask == 0`` are replaced by
            ``label_ignore_index`` (unless that is ``None``).
        """
        encoded = self.tokenizer(
            self._format_example(self.data[idx]),
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        # Drop the size-one leading dimension added by return_tensors="pt".
        encoded = {key: value.squeeze(0) for key, value in encoded.items()}

        input_ids = encoded["input_ids"]
        attention_mask = encoded["attention_mask"]

        # Clone so later in-place edits of labels never touch input_ids.
        labels = input_ids.clone()
        if self.label_ignore_index is not None:
            # The pad token may equal EOS, so mask by attention_mask rather
            # than by token id to keep the real EOS as a training target.
            labels[attention_mask == 0] = self.label_ignore_index

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
        }
|
|
|
|
|
def main():
    """Train the custom ``Transformer`` on the processed math data and save it.

    Steps: build the model and tokenizer, load the two processed JSON Lines
    files from the ``processed_data`` directory next to this script, hold out
    part of the data for evaluation, run the Hugging Face ``Trainer``, then
    write the weights and model configuration to ``./math_expert_model``.
    """
    # One value drives both the model's context size and the padded length.
    max_seq_len = 1024

    model_args = ModelArgs(
        dim=512,
        n_layers=8,
        n_heads=8,
        vocab_size=50000,
        max_seq_len=max_seq_len,
    )
    model = Transformer(model_args)

    tokenizer = Tokenizer()

    # Fixed-length padding needs a pad token; fall back to EOS when unset.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    data_dir = os.path.join(os.path.dirname(__file__), "processed_data")
    data_paths = [
        os.path.join(data_dir, "gsm8k_processed.jsonl"),
        os.path.join(data_dir, "proofnet_processed.jsonl"),
    ]

    dataset = MathDataset(
        tokenizer=tokenizer,
        data_paths=data_paths,
        max_length=max_seq_len,
    )

    # Step-based evaluation and load_best_model_at_end both need an eval set;
    # without one the Trainer cannot evaluate. Hold out 10% (at least one
    # example) with a fixed seed so the split is reproducible across runs.
    total = len(dataset)
    eval_size = max(1, total // 10) if total >= 2 else 0
    if eval_size:
        train_dataset, eval_dataset = torch.utils.data.random_split(
            dataset,
            [total - eval_size, eval_size],
            generator=torch.Generator().manual_seed(42),
        )
    else:
        # Too little data to split: train on everything and skip evaluation.
        train_dataset, eval_dataset = dataset, None
    has_eval = eval_dataset is not None

    training_args = TrainingArguments(
        output_dir="./math_expert_output",
        overwrite_output_dir=True,
        num_train_epochs=3,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        save_steps=1000,
        save_total_limit=2,
        logging_dir="./math_expert_logs",
        logging_steps=100,
        evaluation_strategy="steps" if has_eval else "no",
        eval_steps=1000,
        load_best_model_at_end=has_eval,
        learning_rate=5e-5,
        warmup_steps=500,
        weight_decay=0.01,
        # Mixed precision is only enabled when a CUDA device is present.
        fp16=torch.cuda.is_available(),
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
    )

    print("Starting training with your custom model...")
    trainer.train()

    output_dir = "./math_expert_model"
    os.makedirs(output_dir, exist_ok=True)
    torch.save(model.state_dict(), os.path.join(output_dir, "pytorch_model.bin"))
    model_args.save(os.path.join(output_dir, "config.json"))
    print(f"Model saved to {output_dir}")
|
|
|
|
|
# Script entry point: start training only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
|
|
|