# Uploaded by hollywoodfrancis ("Upload 11 files", commit b8ab4a2, verified).
from transformers import TrainingArguments, Trainer
from datasets import load_dataset
import jsonlines
import os
import torch
from model import Transformer, ModelArgs
from tokenizer import Tokenizer
class MathDataset(torch.utils.data.Dataset):
    """Causal-LM dataset built from one or more JSON Lines files.

    Every record is rendered to a single text string and tokenized to a
    fixed length of ``max_length``. Two record layouts are recognised:

    * records containing a ``proof_steps`` key are rendered from their
      ``problem``, ``solution`` and the ``text`` of each proof step;
    * all other records are rendered from ``question`` and ``answer``.
    """

    # Label value written at padded positions. -100 is the default
    # ``ignore_index`` of PyTorch's cross-entropy loss.
    # NOTE(review): assumes the model's loss keeps that default -- confirm.
    IGNORE_INDEX = -100

    def __init__(self, tokenizer, data_paths, max_length=512):
        """Load all records from ``data_paths`` into memory.

        Args:
            tokenizer: Callable invoked as ``tokenizer(text, padding=...,
                truncation=..., max_length=..., return_tensors="pt")`` that
                returns a mapping with ``input_ids`` and ``attention_mask``.
            data_paths: Iterable of JSON Lines file paths; records from all
                files are concatenated in order.
            max_length: Length every example is padded/truncated to.
        """
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.data = []
        for path in data_paths:
            with jsonlines.open(path) as reader:
                # The reader is iterable; no need for an intermediate list.
                self.data.extend(reader)

    def __len__(self):
        """Return the number of loaded records."""
        return len(self.data)

    @staticmethod
    def _format_example(example):
        """Render one record as the training text for the language model."""
        if "proof_steps" in example:
            # Records with proof steps: header followed by one bullet per step.
            header = (
                f"Problem: {example['problem']}\n"
                f"Solution: {example['solution']}\n"
                "Proof Steps:\n"
            )
            steps = "".join(
                f"- {step['text']}\n" for step in example["proof_steps"]
            )
            return header + steps
        # Question/answer records.
        return f"Question: {example['question']}\nAnswer: {example['answer']}"

    def __getitem__(self, idx):
        """Return the tokenized example at ``idx``.

        Returns:
            A dict with ``input_ids``, ``attention_mask`` and ``labels``.
            ``labels`` is an independent copy of ``input_ids`` in which every
            position whose attention mask is 0 (padding) is replaced by
            ``IGNORE_INDEX`` so padding does not contribute to the loss.
        """
        text = self._format_example(self.data[idx])

        encoded = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        # The tokenizer returns a batch of one; drop the batch dimension.
        encoded = {key: value.squeeze(0) for key, value in encoded.items()}

        input_ids = encoded["input_ids"]
        attention_mask = encoded["attention_mask"]

        # Clone so labels never alias input_ids, then mask out padding. The
        # attention mask is used (not the pad id) because the pad token may
        # be the same id as a real token such as EOS.
        labels = input_ids.clone()
        labels[attention_mask == 0] = self.IGNORE_INDEX

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
        }
def main():
    """Train the custom Transformer on the processed math data and save it.

    Builds the model and tokenizer, loads the two processed JSON Lines files
    from the ``processed_data`` directory next to this script, trains with the
    Hugging Face ``Trainer``, then writes the model weights and config to
    ``./math_expert_model``.

    A small seeded hold-out split is carved from the data for evaluation,
    because step-wise evaluation and ``load_best_model_at_end`` both need an
    evaluation dataset.
    """
    # Initialize the custom model.
    model_args = ModelArgs(
        dim=512,
        n_layers=8,
        n_heads=8,
        vocab_size=50000,  # Adjust based on your tokenizer
        max_seq_len=1024,
    )
    model = Transformer(model_args)

    # Initialize the custom tokenizer; fall back to EOS when no pad token is set.
    tokenizer = Tokenizer()
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Training data lives next to this script.
    data_dir = os.path.join(os.path.dirname(__file__), "processed_data")
    data_paths = [
        os.path.join(data_dir, "gsm8k_processed.jsonl"),
        os.path.join(data_dir, "proofnet_processed.jsonl"),
    ]

    dataset = MathDataset(
        tokenizer=tokenizer,
        data_paths=data_paths,
        max_length=1024,  # Increased max_length for longer proofs
    )

    # Hold out ~5% (at least one example) for evaluation. With fewer than two
    # examples there is nothing to hold out, so evaluation is switched off
    # rather than failing.
    total = len(dataset)
    eval_size = max(1, total // 20) if total > 1 else 0
    if eval_size:
        train_dataset, eval_dataset = torch.utils.data.random_split(
            dataset,
            [total - eval_size, eval_size],
            # Fixed seed keeps the split reproducible across runs.
            generator=torch.Generator().manual_seed(42),
        )
    else:
        train_dataset, eval_dataset = dataset, None
    has_eval = eval_dataset is not None

    # NOTE(review): recent transformers releases rename `evaluation_strategy`
    # to `eval_strategy`; the original keyword is kept here -- confirm against
    # the installed version.
    training_args = TrainingArguments(
        output_dir="./math_expert_output",
        overwrite_output_dir=True,
        num_train_epochs=3,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        save_steps=1000,
        save_total_limit=2,
        logging_dir="./math_expert_logs",
        logging_steps=100,
        evaluation_strategy="steps" if has_eval else "no",
        eval_steps=1000,
        load_best_model_at_end=has_eval,
        learning_rate=5e-5,
        warmup_steps=500,
        weight_decay=0.01,
        fp16=torch.cuda.is_available(),
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
    )

    print("Starting training with your custom model...")
    trainer.train()

    # Save the trained weights and the model configuration.
    output_dir = "./math_expert_model"
    os.makedirs(output_dir, exist_ok=True)
    torch.save(model.state_dict(), os.path.join(output_dir, "pytorch_model.bin"))
    model_args.save(os.path.join(output_dir, "config.json"))
    print(f"Model saved to {output_dir}")
# Run training only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()