| from transformers import TrainingArguments, Trainer |
| from datasets import load_dataset |
| import jsonlines |
| import os |
| import torch |
| from model import Transformer, ModelArgs |
| from tokenizer import Tokenizer |
|
|
class MathDataset(torch.utils.data.Dataset):
    """Causal-LM dataset over preprocessed GSM8K / ProofNet JSONL records.

    Each record is either a proof-style example (keys: "problem", "solution",
    "proof_steps") or a plain Q&A example (keys: "question", "answer"); the
    record is flattened to a single text string and tokenized to a fixed
    length with padding/truncation.
    """

    def __init__(self, tokenizer, data_paths, max_length=512):
        """
        Args:
            tokenizer: HF-style callable returning "input_ids" and
                "attention_mask" tensors (assumed — TODO confirm against the
                project's Tokenizer implementation).
            data_paths: iterable of .jsonl file paths to load eagerly.
            max_length: fixed sequence length for padding/truncation.
        """
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.data = []
        for path in data_paths:
            with jsonlines.open(path) as reader:
                self.data.extend(list(reader))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        example = self.data[idx]

        # Two record schemas: proof-style vs. plain Q&A.
        if "proof_steps" in example:
            text = f"Problem: {example['problem']}\nSolution: {example['solution']}\nProof Steps:\n"
            for step in example["proof_steps"]:
                text += f"- {step['text']}\n"
        else:
            text = f"Question: {example['question']}\nAnswer: {example['answer']}"

        inputs = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )

        # Drop the batch dimension added by return_tensors="pt".
        inputs = {k: v.squeeze(0) for k, v in inputs.items()}

        # Bug fix: padding positions in the labels must be -100 so the
        # cross-entropy loss ignores them (HF convention). The original used
        # raw input_ids as labels, training the model to predict pad tokens.
        labels = inputs["input_ids"].clone()
        labels[inputs["attention_mask"] == 0] = -100

        return {
            "input_ids": inputs["input_ids"],
            "attention_mask": inputs["attention_mask"],
            "labels": labels
        }
|
|
def main():
    """Train the custom Transformer on the processed GSM8K + ProofNet data.

    Builds the model and tokenizer, loads the JSONL datasets, splits off a
    small evaluation set, runs HF Trainer, and saves weights + config.
    """
    model_args = ModelArgs(
        dim=512,
        n_layers=8,
        n_heads=8,
        vocab_size=50000,  # NOTE(review): must match the tokenizer's vocab size — confirm
        max_seq_len=1024
    )
    model = Transformer(model_args)

    tokenizer = Tokenizer()

    # Fixed-length batching needs a pad token; reuse EOS if none is set.
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    data_dir = os.path.join(os.path.dirname(__file__), "processed_data")
    data_paths = [
        os.path.join(data_dir, "gsm8k_processed.jsonl"),
        os.path.join(data_dir, "proofnet_processed.jsonl")
    ]

    # max_length matches model_args.max_seq_len above.
    dataset = MathDataset(
        tokenizer=tokenizer,
        data_paths=data_paths,
        max_length=1024
    )

    # Bug fix: evaluation_strategy="steps" and load_best_model_at_end=True
    # require an eval_dataset; the original passed none, so Trainer raises
    # before/when evaluation runs. Hold out 5% (at least one example) as a
    # seeded, reproducible evaluation split.
    eval_size = max(1, int(0.05 * len(dataset)))
    train_size = len(dataset) - eval_size
    train_dataset, eval_dataset = torch.utils.data.random_split(
        dataset,
        [train_size, eval_size],
        generator=torch.Generator().manual_seed(42)
    )

    training_args = TrainingArguments(
        output_dir="./math_expert_output",
        overwrite_output_dir=True,
        num_train_epochs=3,
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        save_steps=1000,
        save_total_limit=2,
        logging_dir="./math_expert_logs",
        logging_steps=100,
        evaluation_strategy="steps",
        eval_steps=1000,
        load_best_model_at_end=True,
        learning_rate=5e-5,
        warmup_steps=500,
        weight_decay=0.01,
        fp16=torch.cuda.is_available()
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
    )

    print("Starting training with your custom model...")
    trainer.train()

    # Persist raw state dict plus the project-specific config
    # (ModelArgs.save is assumed to write JSON — TODO confirm).
    output_dir = "./math_expert_model"
    os.makedirs(output_dir, exist_ok=True)
    torch.save(model.state_dict(), os.path.join(output_dir, "pytorch_model.bin"))
    model_args.save(os.path.join(output_dir, "config.json"))
    print(f"Model saved to {output_dir}")
|
|
# Script entry point: run training only when executed directly, not on import.
if __name__ == "__main__":
    main()
|
|