# conscious_Ai-2/data.py
import os
import argparse
import json
import numpy as np
from tqdm import tqdm
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.cuda.amp import GradScaler, autocast
from torch.optim import AdamW
from transformers import GPT2TokenizerFast, get_linear_schedule_with_warmup
from datasets import load_dataset
from transformers import logging as hf_logging
# Suppress unnecessary logging
hf_logging.set_verbosity_error()
# Import your transformer model
# from your_transformer_module import create_transformer_model

class Config:
    """Centralized configuration for training."""
    def __init__(self):
        # Model hyperparameters
        self.vocab_size = 50257  # GPT-2 vocab size
        self.d_model = 512
        self.nhead = 8
        self.num_layers = 6
        self.dim_feedforward = 2048
        self.dropout = 0.1
        # Training hyperparameters
        self.batch_size = 32
        self.num_epochs = 3
        self.learning_rate = 5e-5
        self.weight_decay = 0.01
        self.warmup_ratio = 0.1  # Fraction of total steps used for LR warmup
        self.max_seq_length = 512
        self.gradient_accumulation_steps = 1
        self.max_grad_norm = 1.0
        self.seed = 42
        # Paths
        self.output_dir = "./checkpoints"
        self.model_save_prefix = "reasoning_model"
        # Device configuration
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.fp16 = torch.cuda.is_available()

    def save(self, path):
        """Save configuration to a JSON file."""
        os.makedirs(os.path.dirname(path) or ".", exist_ok=True)
        # torch.device is not JSON-serializable, so store it as a string
        serializable = {k: (str(v) if isinstance(v, torch.device) else v)
                        for k, v in self.__dict__.items()}
        with open(path, 'w') as f:
            json.dump(serializable, f, indent=2)

    @classmethod
    def from_file(cls, path):
        """Load configuration from a JSON file."""
        config = cls()
        with open(path, 'r') as f:
            config.__dict__.update(json.load(f))
        # Restore the device field to a torch.device
        config.device = torch.device(config.device)
        return config
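
# Illustrative round-trip of the config (paths are examples): save() stores the
# torch.device as a string so the dict is JSON-serializable, and from_file()
# converts it back.
#
#     cfg = Config()
#     cfg.save("./checkpoints/config.json")
#     cfg = Config.from_file("./checkpoints/config.json")
#     assert isinstance(cfg.device, torch.device)
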
def load_and_preprocess_data(config):
    """Load and preprocess the dataset."""
    # Load dataset
    dataset = load_dataset("ag2428/reasoningDataV4", split="train")
    # Initialize tokenizer; GPT-2 has no pad token, so reuse EOS for padding
    tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
    tokenizer.pad_token = tokenizer.eos_token

    def tokenize_function(examples):
        # Combine instruction and answer with a separator
        texts = [f"{inst}\n{ans}" for inst, ans in zip(examples["instruction"], examples["answer"])]
        # Tokenize
        tokenized = tokenizer(
            texts,
            max_length=config.max_seq_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt"
        )
        # Labels are an unshifted copy of input_ids; the model is expected to
        # shift them internally for next-token prediction. Padding positions
        # remain in the labels, so the model's loss should ignore the pad
        # token id (which here is also the EOS id).
        tokenized["labels"] = tokenized["input_ids"].clone()
        return tokenized

    # Tokenize dataset
    tokenized_datasets = dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=dataset.column_names,
        desc="Tokenizing dataset"
    )
    # Split into train and validation sets
    train_val = tokenized_datasets.train_test_split(test_size=0.1, seed=config.seed)
    train_dataset = train_val["train"]
    val_dataset = train_val["test"]
    # Convert to PyTorch format
    train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
    val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
    # Create data loaders
    train_loader = DataLoader(
        train_dataset,
        batch_size=config.batch_size,
        shuffle=True,
        num_workers=4,
        pin_memory=True
    )
    val_loader = DataLoader(
        val_dataset,
        batch_size=config.batch_size,
        shuffle=False,
        num_workers=4,
        pin_memory=True
    )
    return train_loader, val_loader, tokenizer
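
# Quick shape check (illustrative): every batch from the loaders should be
# [batch_size, max_seq_length] for input_ids, attention_mask, and labels.
#
#     batch = next(iter(train_loader))
#     print(batch["input_ids"].shape)  # torch.Size([32, 512]) with defaults
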
def train_epoch(model, train_loader, optimizer, scheduler, scaler, config, epoch):
    """Train for one epoch."""
    model.train()
    total_loss = 0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch + 1}")
    for step, batch in enumerate(progress_bar):
        # Move batch to device
        input_ids = batch['input_ids'].to(config.device)
        attention_mask = batch['attention_mask'].to(config.device)
        labels = batch['labels'].to(config.device)
        # Forward pass with mixed precision
        with autocast(enabled=config.fp16):
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            loss = outputs.loss / config.gradient_accumulation_steps
        # Backward pass
        if config.fp16:
            scaler.scale(loss).backward()
        else:
            loss.backward()
        # Gradient clipping and optimization step
        if (step + 1) % config.gradient_accumulation_steps == 0:
            if config.fp16:
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), config.max_grad_norm)
                scaler.step(optimizer)
                scaler.update()
            else:
                torch.nn.utils.clip_grad_norm_(model.parameters(), config.max_grad_norm)
                optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
        total_loss += loss.item() * config.gradient_accumulation_steps
        # Update progress bar
        progress_bar.set_postfix({
            'loss': f"{total_loss / (step + 1):.4f}",
            'lr': f"{scheduler.get_last_lr()[0]:.2e}"
        })
    return total_loss / len(train_loader)
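
# Note on gradient accumulation (informational): the effective batch size is
# batch_size * gradient_accumulation_steps (32 * 1 = 32 with the defaults).
# The loss is divided by the accumulation factor in train_epoch so the
# accumulated gradients match those of a single large batch.
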
def evaluate(model, val_loader, config):
    """Evaluate the model on the validation set."""
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(config.device)
            attention_mask = batch['attention_mask'].to(config.device)
            labels = batch['labels'].to(config.device)
            with autocast(enabled=config.fp16):
                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )
                loss = outputs.loss
            total_loss += loss.item()
    return total_loss / len(val_loader)
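
# The mean validation loss is a cross-entropy in nats, so it can also be read
# as a perplexity (illustrative):
#
#     import math
#     perplexity = math.exp(val_loss)
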
def save_checkpoint(model, optimizer, scheduler, epoch, config, is_best=False):
    """Save model checkpoint."""
    os.makedirs(config.output_dir, exist_ok=True)
    # Prepare checkpoint
    checkpoint = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler_state_dict': scheduler.state_dict(),
        'config': config.__dict__,
    }
    # Save checkpoint
    if is_best:
        filename = os.path.join(config.output_dir, f"{config.model_save_prefix}_best.pt")
    else:
        filename = os.path.join(config.output_dir, f"{config.model_save_prefix}_epoch_{epoch}.pt")
    torch.save(checkpoint, filename)
    print(f"Checkpoint saved to {filename}")
def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(description="Train a reasoning model")
    parser.add_argument('--config', type=str, default=None, help="Path to config file")
    parser.add_argument('--output_dir', type=str, default=None, help="Output directory for checkpoints")
    parser.add_argument('--batch_size', type=int, default=None, help="Batch size")
    parser.add_argument('--num_epochs', type=int, default=None, help="Number of epochs")
    parser.add_argument('--learning_rate', type=float, default=None, help="Learning rate")
    parser.add_argument('--fp16', action='store_true', help="Use mixed precision training")
    args = parser.parse_args()
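    # Example invocation (illustrative):
    #
    #     python data.py --batch_size 16 --num_epochs 5 --learning_rate 3e-5 --fp16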
    # Initialize config
    if args.config:
        config = Config.from_file(args.config)
    else:
        config = Config()
    # Override config with command line arguments (explicit None checks so
    # legitimate falsy values are not silently ignored)
    if args.output_dir is not None:
        config.output_dir = args.output_dir
    if args.batch_size is not None:
        config.batch_size = args.batch_size
    if args.num_epochs is not None:
        config.num_epochs = args.num_epochs
    if args.learning_rate is not None:
        config.learning_rate = args.learning_rate
    if args.fp16:
        config.fp16 = True
    # Set random seed for reproducibility
    torch.manual_seed(config.seed)
    np.random.seed(config.seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(config.seed)
    # Create output directory
    os.makedirs(config.output_dir, exist_ok=True)
    # Save config
    config.save(os.path.join(config.output_dir, "config.json"))
    # Load data
    print("Loading and preprocessing data...")
    train_loader, val_loader, tokenizer = load_and_preprocess_data(config)
    # Initialize model
    print("Initializing model...")
    # TODO: Replace with your model initialization
    # model = create_transformer_model(
    #     vocab_size=config.vocab_size,
    #     d_model=config.d_model,
    #     nhead=config.nhead,
    #     num_layers=config.num_layers,
    #     dim_feedforward=config.dim_feedforward,
    #     dropout=config.dropout,
    #     max_seq_length=config.max_seq_length
    # )
    # For now, let's use a placeholder that will raise an error
    class PlaceholderModel(nn.Module):
        def __init__(self):
            super().__init__()
            self.loss_fn = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

        def forward(self, input_ids, attention_mask, labels=None):
            # This is a placeholder that will raise an error.
            # Replace with your actual model implementation.
            raise NotImplementedError(
                "Please implement your transformer model and replace this placeholder. "
                "See the TODO comment in the code for more details."
            )
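
    # Illustrative sketch only (not the author's model): a minimal decoder-style
    # language model with the interface the training loop expects, i.e.
    # forward(...) returns an object with a .loss attribute. It reuses the
    # Config hyperparameters and could be swapped in for PlaceholderModel
    # while prototyping.
    from types import SimpleNamespace

    class SketchDecoderModel(nn.Module):
        def __init__(self, cfg, pad_token_id):
            super().__init__()
            self.token_emb = nn.Embedding(cfg.vocab_size, cfg.d_model)
            self.pos_emb = nn.Embedding(cfg.max_seq_length, cfg.d_model)
            layer = nn.TransformerEncoderLayer(
                d_model=cfg.d_model,
                nhead=cfg.nhead,
                dim_feedforward=cfg.dim_feedforward,
                dropout=cfg.dropout,
                batch_first=True,
            )
            self.blocks = nn.TransformerEncoder(layer, num_layers=cfg.num_layers)
            self.lm_head = nn.Linear(cfg.d_model, cfg.vocab_size)
            self.loss_fn = nn.CrossEntropyLoss(ignore_index=pad_token_id)

        def forward(self, input_ids, attention_mask, labels=None):
            seq_len = input_ids.size(1)
            positions = torch.arange(seq_len, device=input_ids.device)
            hidden = self.token_emb(input_ids) + self.pos_emb(positions)
            # Causal mask so each position attends only to earlier positions
            causal_mask = nn.Transformer.generate_square_subsequent_mask(seq_len)
            causal_mask = causal_mask.to(input_ids.device)
            hidden = self.blocks(
                hidden,
                mask=causal_mask,
                src_key_padding_mask=(attention_mask == 0),
            )
            logits = self.lm_head(hidden)
            loss = None
            if labels is not None:
                # Shift so position t predicts token t + 1
                loss = self.loss_fn(
                    logits[:, :-1].reshape(-1, logits.size(-1)),
                    labels[:, 1:].reshape(-1),
                )
            return SimpleNamespace(loss=loss, logits=logits)

    # To try the sketch instead of the placeholder (illustrative):
    #     model = SketchDecoderModel(config, tokenizer.pad_token_id)
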
    model = PlaceholderModel()
    model = model.to(config.device)
    # Initialize optimizer and scheduler; no weight decay on biases and
    # LayerNorm weights
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {
            'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            'weight_decay': config.weight_decay,
        },
        {
            'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0,
        }
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=config.learning_rate)
    # Calculate total training steps
    total_steps = len(train_loader) * config.num_epochs // config.gradient_accumulation_steps
    warmup_steps = int(total_steps * config.warmup_ratio)
    # Initialize learning rate scheduler
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=total_steps
    )
    # Initialize gradient scaler for mixed precision training
    scaler = GradScaler(enabled=config.fp16)
    # Training loop
    print("Starting training...")
    best_val_loss = float('inf')
    for epoch in range(config.num_epochs):
        # Train for one epoch
        train_loss = train_epoch(model, train_loader, optimizer, scheduler, scaler, config, epoch)
        # Evaluate on validation set
        val_loss = evaluate(model, val_loader, config)
        print(f"Epoch {epoch + 1}/{config.num_epochs}:")
        print(f"  Train loss: {train_loss:.4f}")
        print(f"  Val loss: {val_loss:.4f}")
        # Save checkpoint
        save_checkpoint(model, optimizer, scheduler, epoch, config)
        # Save best model
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            save_checkpoint(model, optimizer, scheduler, epoch, config, is_best=True)
    print("Training complete!")

if __name__ == "__main__":
    main()