import os
import argparse
import json

import numpy as np
from tqdm import tqdm
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, random_split
from torch.cuda.amp import GradScaler, autocast
from transformers import GPT2TokenizerFast, get_linear_schedule_with_warmup
from transformers import logging as hf_logging
from datasets import load_dataset

# Suppress unnecessary logging
hf_logging.set_verbosity_error()

# Import your transformer model
# from your_transformer_module import create_transformer_model


class Config:
    """Centralized configuration for training."""

    def __init__(self):
        # Model hyperparameters
        self.vocab_size = 50257  # GPT-2 vocab size
        self.d_model = 512
        self.nhead = 8
        self.num_layers = 6
        self.dim_feedforward = 2048
        self.dropout = 0.1

        # Training hyperparameters
        self.batch_size = 32
        self.num_epochs = 3
        self.learning_rate = 5e-5
        self.weight_decay = 0.01
        self.warmup_steps = 0.1  # Fraction (not count) of total steps used for warmup
        self.max_seq_length = 512
        self.gradient_accumulation_steps = 1
        self.max_grad_norm = 1.0
        self.seed = 42

        # Paths
        self.output_dir = "./checkpoints"
        self.model_save_prefix = "reasoning_model"

        # Device configuration
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.fp16 = torch.cuda.is_available()

    def save(self, path):
        """Save configuration to a JSON file.

        ``torch.device`` is not JSON-serializable, so it is stringified here
        (and restored by :meth:`from_file`); dumping ``self.__dict__`` raw
        would raise ``TypeError``.
        """
        dirname = os.path.dirname(path)
        if dirname:  # os.makedirs("") raises FileNotFoundError
            os.makedirs(dirname, exist_ok=True)
        serializable = {
            k: (str(v) if isinstance(v, torch.device) else v)
            for k, v in self.__dict__.items()
        }
        with open(path, 'w') as f:
            json.dump(serializable, f, indent=2)

    @classmethod
    def from_file(cls, path):
        """Load configuration from a JSON file, overriding the defaults."""
        config = cls()
        with open(path, 'r') as f:
            config.__dict__.update(json.load(f))
        # The device was stringified by save(); turn it back into a torch.device.
        if isinstance(config.device, str):
            config.device = torch.device(config.device)
        return config


def load_and_preprocess_data(config):
    """Load the reasoning dataset, tokenize it, and build data loaders.

    Returns:
        (train_loader, val_loader, tokenizer): DataLoaders over tokenized
        train/validation splits plus the GPT-2 tokenizer used.
    """
    # Load dataset
    dataset = load_dataset("ag2428/reasoningDataV4", split="train")

    # Initialize tokenizer
    tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
    tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no pad token by default

    # Tokenization function
    def tokenize_function(examples):
        # Combine instruction and answer with a separator
        texts = [f"{inst}\n{ans}"
                 for inst, ans in zip(examples["instruction"], examples["answer"])]

        # Tokenize
        tokenized = tokenizer(
            texts,
            max_length=config.max_seq_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt"
        )

        # Create labels (copy of input_ids for language modeling; the model
        # is expected to shift them internally, HF-style).
        # NOTE(review): pad positions are not masked to -100 here, and since
        # pad_token == eos_token they contribute to the loss — confirm this is
        # intended for the final model implementation.
        tokenized["labels"] = tokenized["input_ids"].clone()
        return tokenized

    # Tokenize dataset
    tokenized_datasets = dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=dataset.column_names,
        desc="Tokenizing dataset"
    )

    # Split into train and validation sets
    train_val = tokenized_datasets.train_test_split(test_size=0.1, seed=config.seed)
    train_dataset = train_val["train"]
    val_dataset = train_val["test"]

    # Convert to PyTorch format
    train_dataset.set_format(type='torch',
                             columns=['input_ids', 'attention_mask', 'labels'])
    val_dataset.set_format(type='torch',
                           columns=['input_ids', 'attention_mask', 'labels'])

    # Create data loaders
    train_loader = DataLoader(
        train_dataset,
        batch_size=config.batch_size,
        shuffle=True,
        num_workers=4,
        pin_memory=True
    )
    val_loader = DataLoader(
        val_dataset,
        batch_size=config.batch_size,
        shuffle=False,
        num_workers=4,
        pin_memory=True
    )

    return train_loader, val_loader, tokenizer


def train_epoch(model, train_loader, optimizer, scheduler, scaler, config, epoch):
    """Train for one epoch; returns the mean training loss over the epoch."""
    model.train()
    total_loss = 0

    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch + 1}")
    for step, batch in enumerate(progress_bar):
        # Move batch to device
        input_ids = batch['input_ids'].to(config.device)
        attention_mask = batch['attention_mask'].to(config.device)
        labels = batch['labels'].to(config.device)

        # Forward pass with mixed precision
        with autocast(enabled=config.fp16):
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            # Scale the loss down so gradients accumulate to the true mean
            loss = outputs.loss / config.gradient_accumulation_steps

        # Backward pass (scaled when fp16 to avoid gradient underflow)
        if config.fp16:
            scaler.scale(loss).backward()
        else:
            loss.backward()

        # Gradient clipping and optimization step, once per accumulation window
        if (step + 1) % config.gradient_accumulation_steps == 0:
            if config.fp16:
                # Unscale before clipping so the norm is measured in true units
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), config.max_grad_norm)
                scaler.step(optimizer)
                scaler.update()
            else:
                torch.nn.utils.clip_grad_norm_(model.parameters(), config.max_grad_norm)
                optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

        # Undo the accumulation scaling so the reported loss is per-batch
        total_loss += loss.item() * config.gradient_accumulation_steps

        # Update progress bar
        progress_bar.set_postfix({
            'loss': f"{total_loss / (step + 1):.4f}",
            'lr': f"{scheduler.get_last_lr()[0]:.2e}"
        })

    return total_loss / len(train_loader)


def evaluate(model, val_loader, config):
    """Evaluate the model on the validation set; returns the mean loss."""
    model.eval()
    total_loss = 0

    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(config.device)
            attention_mask = batch['attention_mask'].to(config.device)
            labels = batch['labels'].to(config.device)

            with autocast(enabled=config.fp16):
                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )
                loss = outputs.loss

            total_loss += loss.item()

    return total_loss / len(val_loader)


def save_checkpoint(model, optimizer, scheduler, epoch, config, is_best=False):
    """Save a training checkpoint (model/optimizer/scheduler state + config).

    When ``is_best`` is True the checkpoint overwrites the single
    ``*_best.pt`` file; otherwise it is written per-epoch.
    """
    os.makedirs(config.output_dir, exist_ok=True)

    # Prepare checkpoint
    checkpoint = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler_state_dict': scheduler.state_dict(),
        'config': config.__dict__,
    }

    # Save checkpoint
    if is_best:
        filename = os.path.join(config.output_dir,
                                f"{config.model_save_prefix}_best.pt")
    else:
        filename = os.path.join(config.output_dir,
                                f"{config.model_save_prefix}_epoch_{epoch}.pt")

    torch.save(checkpoint, filename)
    # Bug fix: the original f-string lacked the {filename} placeholder
    print(f"Checkpoint saved to {filename}")


def main():
    """CLI entry point: parse args, build config/data/model, run training."""
    # Parse command line arguments
    parser = argparse.ArgumentParser(description="Train a reasoning model")
    parser.add_argument('--config', type=str, default=None,
                        help="Path to config file")
    parser.add_argument('--output_dir', type=str, default=None,
                        help="Output directory for checkpoints")
    parser.add_argument('--batch_size', type=int, default=None,
                        help="Batch size")
    parser.add_argument('--num_epochs', type=int, default=None,
                        help="Number of epochs")
    parser.add_argument('--learning_rate', type=float, default=None,
                        help="Learning rate")
    parser.add_argument('--fp16', action='store_true',
                        help="Use mixed precision training")
    args = parser.parse_args()

    # Initialize config
    if args.config:
        config = Config.from_file(args.config)
    else:
        config = Config()

    # Override config with command line arguments
    if args.output_dir:
        config.output_dir = args.output_dir
    if args.batch_size:
        config.batch_size = args.batch_size
    if args.num_epochs:
        config.num_epochs = args.num_epochs
    if args.learning_rate:
        config.learning_rate = args.learning_rate
    if args.fp16:
        config.fp16 = True

    # Set random seed for reproducibility
    torch.manual_seed(config.seed)
    np.random.seed(config.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(config.seed)

    # Create output directory
    os.makedirs(config.output_dir, exist_ok=True)

    # Save config
    config.save(os.path.join(config.output_dir, "config.json"))

    # Load data
    print("Loading and preprocessing data...")
    train_loader, val_loader, tokenizer = load_and_preprocess_data(config)

    # Initialize model
    print("Initializing model...")
    # TODO: Replace with your model initialization
    # model = create_transformer_model(
    #     vocab_size=config.vocab_size,
    #     d_model=config.d_model,
    #     nhead=config.nhead,
    #     num_layers=config.num_layers,
    #     dim_feedforward=config.dim_feedforward,
    #     dropout=config.dropout,
    #     max_seq_length=config.max_seq_length
    # )

    # For now, let's use a placeholder that will raise an error
    class PlaceholderModel(nn.Module):
        def __init__(self):
            super().__init__()
            self.loss_fn = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

        def forward(self, input_ids, attention_mask, labels=None):
            # This is a placeholder that will raise an error
            # Replace with your actual model implementation
            raise NotImplementedError(
                "Please implement your transformer model and replace this placeholder. "
                "See the TODO comment in the code for more details."
            )

    model = PlaceholderModel()
    model = model.to(config.device)

    # Initialize optimizer and scheduler.
    # Standard practice: no weight decay on biases and LayerNorm weights.
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {
            'params': [p for n, p in model.named_parameters()
                       if not any(nd in n for nd in no_decay)],
            'weight_decay': config.weight_decay,
        },
        {
            'params': [p for n, p in model.named_parameters()
                       if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0,
        }
    ]
    # Use torch.optim.AdamW: transformers.AdamW is deprecated and has been
    # removed from recent transformers releases; the usage here is identical.
    optimizer = torch.optim.AdamW(optimizer_grouped_parameters,
                                  lr=config.learning_rate)

    # Calculate total training steps (optimizer steps, not micro-batches)
    total_steps = (len(train_loader) * config.num_epochs
                   // config.gradient_accumulation_steps)
    warmup_steps = int(total_steps * config.warmup_steps)

    # Initialize learning rate scheduler
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=total_steps
    )

    # Initialize gradient scaler for mixed precision training
    scaler = GradScaler(enabled=config.fp16)

    # Training loop
    print("Starting training...")
    best_val_loss = float('inf')
    for epoch in range(config.num_epochs):
        # Train for one epoch
        train_loss = train_epoch(model, train_loader, optimizer, scheduler,
                                 scaler, config, epoch)

        # Evaluate on validation set
        val_loss = evaluate(model, val_loader, config)

        print(f"Epoch {epoch + 1}/{config.num_epochs}:")
        print(f"  Train loss: {train_loss:.4f}")
        print(f"  Val loss: {val_loss:.4f}")

        # Save checkpoint
        save_checkpoint(model, optimizer, scheduler, epoch, config)

        # Save best model
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            save_checkpoint(model, optimizer, scheduler, epoch, config,
                            is_best=True)

    print("Training complete!")


if __name__ == "__main__":
    main()