from transformers import TrainingArguments, Trainer
from dataclasses import asdict
import jsonlines
import json
import os
import torch
from model import Transformer, ModelArgs
from tokenizer import Tokenizer


class MathDataset(torch.utils.data.Dataset):
    """Combines GSM8K-style and ProofNet-style JSONL records into one dataset."""

    def __init__(self, tokenizer, data_paths, max_length=512):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.data = []
        # Load and combine data from all files
        for path in data_paths:
            with jsonlines.open(path) as reader:
                self.data.extend(list(reader))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        example = self.data[idx]
        # Format the input text
        if "proof_steps" in example:
            # ProofNet-style data
            text = f"Problem: {example['problem']}\nSolution: {example['solution']}\nProof Steps:\n"
            for step in example["proof_steps"]:
                text += f"- {step['text']}\n"
        else:
            # GSM8K-style data
            text = f"Question: {example['question']}\nAnswer: {example['answer']}"

        # Tokenize. This assumes the custom Tokenizer exposes a Hugging Face-style
        # callable interface (padding/truncation kwargs, return_tensors); adapt
        # this call if it only provides encode()/decode().
        inputs = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        # Remove the batch dimension added by return_tensors="pt"
        inputs = {k: v.squeeze(0) for k, v in inputs.items()}

        input_ids = inputs["input_ids"]
        attention_mask = inputs["attention_mask"]
        # For causal LM training the labels are the input ids, but padding
        # positions are set to -100 so the loss ignores them.
        labels = input_ids.clone()
        labels[attention_mask == 0] = -100
        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
        }


def main():
    # Initialize the custom model.
    # NOTE: transformers' Trainer expects model(input_ids=..., attention_mask=...,
    # labels=...) to return an output whose "loss" entry (or first element) is the
    # training loss. If the custom Transformer uses a different forward signature,
    # wrap it or override Trainer.compute_loss.
    model_args = ModelArgs(
        dim=512,
        n_layers=8,
        n_heads=8,
        vocab_size=50000,  # Adjust to match the tokenizer's vocabulary size
        max_seq_len=1024,
    )
    model = Transformer(model_args)

    # Initialize the custom tokenizer and make sure it can pad
    tokenizer = Tokenizer()
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Set up training data paths
    data_dir = os.path.join(os.path.dirname(__file__), "processed_data")
    data_paths = [
        os.path.join(data_dir, "gsm8k_processed.jsonl"),
        os.path.join(data_dir, "proofnet_processed.jsonl"),
    ]

    # Create the dataset (max_length increased to 1024 for longer proofs)
    dataset = MathDataset(
        tokenizer=tokenizer,
        data_paths=data_paths,
        max_length=1024,
    )

    # Hold out ~5% for evaluation: evaluation_strategy="steps" and
    # load_best_model_at_end=True require an eval dataset.
    eval_size = max(1, int(0.05 * len(dataset)))
    train_size = len(dataset) - eval_size
    train_dataset, eval_dataset = torch.utils.data.random_split(
        dataset, [train_size, eval_size]
    )

    # Define training arguments
    training_args = TrainingArguments(
        output_dir="./math_expert_output",
        overwrite_output_dir=True,
        num_train_epochs=3,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        save_steps=1000,
        save_total_limit=2,
        logging_dir="./math_expert_logs",
        logging_steps=100,
        evaluation_strategy="steps",
        eval_steps=1000,
        load_best_model_at_end=True,
        learning_rate=5e-5,
        warmup_steps=500,
        weight_decay=0.01,
        fp16=torch.cuda.is_available(),
    )

    # Create the trainer. The custom tokenizer is deliberately not passed to
    # Trainer: it would call save_pretrained() on it at every checkpoint, which
    # a non-Hugging Face tokenizer may not implement. Tokenization already
    # happens inside MathDataset.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
    )

    # Start training
    print("Starting training with your custom model...")
    trainer.train()

    # Save the final weights and config
    output_dir = "./math_expert_model"
    os.makedirs(output_dir, exist_ok=True)
    torch.save(model.state_dict(), os.path.join(output_dir, "pytorch_model.bin"))
    # Assumes ModelArgs is a dataclass; swap in its own serialization helper
    # if it provides one.
    with open(os.path.join(output_dir, "config.json"), "w") as f:
        json.dump(asdict(model_args), f, indent=2)
    print(f"Model saved to {output_dir}")


if __name__ == "__main__":
    main()
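
# Usage sketch (assumes the preprocessing step has already produced
# gsm8k_processed.jsonl and proofnet_processed.jsonl under ./processed_data
# next to this file):
#
#   python <this script>
#
# Intermediate checkpoints land in ./math_expert_output (at most two kept, per
# save_total_limit); the final state dict and config.json are written to
# ./math_expert_model/. To resume from the latest checkpoint, call
# trainer.train(resume_from_checkpoint=True) instead of trainer.train().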