File size: 5,428 Bytes
45bcb9b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
"""

Step 3: STREAMLINED Training - Minimal, Fast

"""

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2LMHeadModel, GPT2Config
import sentencepiece as spm
from tqdm import tqdm
import time

# ===== CONFIG =====
# Central hyperparameter dictionary for the streamlined training run.
CONFIG = {
    # Corpus and tokenizer artifacts produced by the earlier pipeline steps.
    'train_file': './final_corpus/multilingual_corpus_train.txt',
    'val_file': './final_corpus/multilingual_corpus_val.txt',
    'tokenizer_path': './final_corpus/multilingual_spm.model',

    # Tiny GPT-2 architecture: 4 layers x 4 heads, 256-dim embeddings,
    # 512-dim feed-forward, 128-token context window.
    'n_positions': 128,
    'n_embd': 256,
    'n_layer': 4,
    'n_head': 4,
    'n_inner': 512,

    # Optimization: a micro-batch of 2 fits a 4GB GPU; with 8 accumulation
    # steps the effective batch size is 16.
    'batch_size': 2,
    'grad_accum': 8,
    'learning_rate': 2e-4,
    'total_steps': 5000,    # optimizer steps, not micro-batches
    'save_every': 1000,     # checkpoint interval in optimizer steps
}

class SimpleDataset(Dataset):
    """In-memory dataset of fixed-length token blocks.

    Each non-empty line of the input file is tokenized independently, then
    truncated or right-padded (with token id 0) to exactly ``block_size``
    tokens. Lines producing ``min_tokens`` tokens or fewer are discarded.
    """

    def __init__(self, filepath, tokenizer, block_size, max_lines=600_000, min_tokens=10):
        """
        Args:
            filepath: UTF-8 text file, one training example per line.
            tokenizer: object exposing ``encode(str) -> list[int]``
                (e.g. a SentencePieceProcessor).
            block_size: fixed sequence length of every returned example.
            max_lines: cap on the number of corpus lines tokenized; keeps
                startup time and memory bounded. Default preserves the
                previous hard-coded limit of 600,000 lines.
            min_tokens: lines with this many tokens or fewer are skipped.
        """
        self.tokenizer = tokenizer
        self.block_size = block_size

        print("Loading data...")
        with open(filepath, 'r', encoding='utf-8') as f:
            lines = [line.strip() for line in f if line.strip()]

        # Tokenize everything eagerly so __getitem__ is a cheap list lookup.
        self.examples = []
        for line in tqdm(lines[:max_lines], desc="Tokenizing"):
            tokens = tokenizer.encode(line)
            if len(tokens) > min_tokens:
                if len(tokens) > block_size:
                    tokens = tokens[:block_size]  # truncate long lines
                else:
                    # Right-pad with 0, matching the model's pad_token_id.
                    tokens = tokens + [0] * (block_size - len(tokens))
                self.examples.append(tokens)

        print(f"Created {len(self.examples)} examples")

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        # DataLoader's default collate stacks these into (batch, block_size).
        return torch.tensor(self.examples[idx], dtype=torch.long)

def train_streamlined():
    """Train a tiny GPT-2 language model from scratch on the prepared corpus.

    Reads all hyperparameters from the module-level CONFIG dict, trains for
    CONFIG['total_steps'] optimizer steps with gradient accumulation, saves
    periodic checkpoints under ./checkpoints_tiny/, and writes the final
    model to ./checkpoints_tiny/final.

    Raises:
        ValueError: if the training file yields no usable examples.
    """
    print("\n" + "="*60)
    print("STREAMLINED TRAINING - FASTEST POSSIBLE")
    print("="*60)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Device: {device}")

    # Load the SentencePiece tokenizer; its vocab size drives the model config.
    tokenizer = spm.SentencePieceProcessor()
    tokenizer.load(CONFIG['tokenizer_path'])
    vocab_size = tokenizer.get_piece_size()

    # Tiny GPT-2 configuration (all sizes come from CONFIG).
    config = GPT2Config(
        vocab_size=vocab_size,
        n_positions=CONFIG['n_positions'],
        n_embd=CONFIG['n_embd'],
        n_layer=CONFIG['n_layer'],
        n_head=CONFIG['n_head'],
        n_inner=CONFIG['n_inner'],
        pad_token_id=0,
    )

    model = GPT2LMHeadModel(config)
    model.to(device)
    model.train()

    # Trade compute for memory so activations fit alongside the model on 4GB.
    model.gradient_checkpointing_enable()

    dataset = SimpleDataset(CONFIG['train_file'], tokenizer, CONFIG['n_positions'])
    if len(dataset) == 0:
        # Without this guard the while-loop below would spin forever: an
        # empty DataLoader yields no batches, so global_step never advances.
        raise ValueError(f"No usable training examples in {CONFIG['train_file']}")
    dataloader = DataLoader(dataset, batch_size=CONFIG['batch_size'], shuffle=True)

    optimizer = torch.optim.AdamW(model.parameters(), lr=CONFIG['learning_rate'])

    print(f"\nModel: {sum(p.numel() for p in model.parameters())/1e6:.1f}M params")
    print(f"Training steps: {CONFIG['total_steps']}")
    # 0.3 s/step is a rough empirical estimate for this model size on a small GPU.
    print(f"Estimated time: {CONFIG['total_steps']*0.3/3600:.1f} hours\n")

    global_step = 0         # completed optimizer steps
    accumulation_steps = 0  # micro-batches since the last optimizer step
    start_time = time.time()

    while global_step < CONFIG['total_steps']:
        for batch in dataloader:
            batch = batch.to(device)

            # Ignore padding when computing the LM loss: positions whose token
            # id equals pad_token_id (0) get label -100, which the model's
            # cross-entropy loss skips. NOTE(review): id 0 is typically also
            # SentencePiece's <unk>, so rare genuine <unk> tokens are excluded
            # from the loss as well — acceptable for this training setup.
            labels = batch.masked_fill(batch == config.pad_token_id, -100)
            outputs = model(input_ids=batch, labels=labels)
            # Pre-scale so accumulated gradients average over grad_accum
            # micro-batches.
            loss = outputs.loss / CONFIG['grad_accum']

            loss.backward()
            accumulation_steps += 1

            # Step the optimizer once every grad_accum micro-batches.
            if accumulation_steps == CONFIG['grad_accum']:
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                optimizer.zero_grad()

                global_step += 1
                accumulation_steps = 0

                # Print progress every 100 optimizer steps.
                if global_step % 100 == 0:
                    elapsed = time.time() - start_time
                    steps_per_second = global_step / elapsed
                    remaining = (CONFIG['total_steps'] - global_step) / steps_per_second

                    # Rescale the reported loss back to a per-batch value.
                    print(f"Step {global_step}/{CONFIG['total_steps']} | "
                          f"Loss: {loss.item()*CONFIG['grad_accum']:.3f} | "
                          f"Remaining: {remaining/3600:.1f}h")

                # Save checkpoint (save_pretrained creates the directory).
                if global_step % CONFIG['save_every'] == 0:
                    save_path = f"./checkpoints_tiny/step{global_step}"
                    model.save_pretrained(save_path)
                    print(f"Saved checkpoint: {save_path}")

                # Leave the epoch loop once the step budget is exhausted;
                # the while-condition then terminates training.
                if global_step >= CONFIG['total_steps']:
                    break

    print(f"\nTraining completed in {(time.time()-start_time)/3600:.2f} hours")

    # Save final model
    model.save_pretrained("./checkpoints_tiny/final")
    print("Final model saved to ./checkpoints_tiny/final")

# Entry point: training only runs when this file is executed directly,
# so the module can be imported (e.g. for SimpleDataset) without side effects.
if __name__ == "__main__":
    train_streamlined()