File size: 5,428 Bytes
45bcb9b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
"""

Step 3: STREAMLINED Training - Minimal, Fast

"""

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2LMHeadModel, GPT2Config
import sentencepiece as spm
from tqdm import tqdm
import time

# ===== CONFIG =====
# Central hyperparameter dictionary for the streamlined training run.
CONFIG = {
    # Corpus and tokenizer artifacts produced by the earlier pipeline steps.
    'train_file': './final_corpus/multilingual_corpus_train.txt',
    'val_file': './final_corpus/multilingual_corpus_val.txt',
    'tokenizer_path': './final_corpus/multilingual_spm.model',

    # Tiny GPT-2 architecture: 4 layers x 4 heads, 256-dim embeddings,
    # 512-dim feed-forward, 128-token context window.
    'n_positions': 128,
    'n_embd': 256,
    'n_layer': 4,
    'n_head': 4,
    'n_inner': 512,

    # Optimization: a micro-batch of 2 fits a 4GB GPU; with 8 accumulation
    # steps the effective batch size is 16.
    'batch_size': 2,
    'grad_accum': 8,
    'learning_rate': 2e-4,
    'total_steps': 5000,    # optimizer steps, not micro-batches
    'save_every': 1000,     # checkpoint interval in optimizer steps
}

class SimpleDataset(Dataset):
    """In-memory dataset of fixed-length token blocks.

    Each non-empty line of the input file is tokenized independently, then
    truncated or right-padded (with token id 0) to exactly ``block_size``
    tokens. Lines producing ``min_tokens`` tokens or fewer are discarded.
    """

    def __init__(self, filepath, tokenizer, block_size, max_lines=600_000, min_tokens=10):
        """
        Args:
            filepath: UTF-8 text file, one training example per line.
            tokenizer: object exposing ``encode(str) -> list[int]``
                (e.g. a SentencePieceProcessor).
            block_size: fixed sequence length of every returned example.
            max_lines: cap on the number of corpus lines tokenized; keeps
                startup time and memory bounded. Default preserves the
                previous hard-coded limit of 600,000 lines.
            min_tokens: lines with this many tokens or fewer are skipped.
        """
        self.tokenizer = tokenizer
        self.block_size = block_size

        print("Loading data...")
        with open(filepath, 'r', encoding='utf-8') as f:
            lines = [line.strip() for line in f if line.strip()]

        # Tokenize everything eagerly so __getitem__ is a cheap list lookup.
        self.examples = []
        for line in tqdm(lines[:max_lines], desc="Tokenizing"):
            tokens = tokenizer.encode(line)
            if len(tokens) > min_tokens:
                if len(tokens) > block_size:
                    tokens = tokens[:block_size]  # truncate long lines
                else:
                    # Right-pad with 0, matching the model's pad_token_id.
                    tokens = tokens + [0] * (block_size - len(tokens))
                self.examples.append(tokens)

        print(f"Created {len(self.examples)} examples")

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        # DataLoader's default collate stacks these into (batch, block_size).
        return torch.tensor(self.examples[idx], dtype=torch.long)

def train_streamlined():
    """Train a tiny GPT-2 language model from scratch on the prepared corpus.

    Reads all hyperparameters from the module-level CONFIG dict, trains for
    CONFIG['total_steps'] optimizer steps with gradient accumulation, saves
    periodic checkpoints under ./checkpoints_tiny/, and writes the final
    model to ./checkpoints_tiny/final.

    Raises:
        ValueError: if the training file yields no usable examples.
    """
    print("\n" + "="*60)
    print("STREAMLINED TRAINING - FASTEST POSSIBLE")
    print("="*60)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Device: {device}")

    # Load the SentencePiece tokenizer; its vocab size drives the model config.
    tokenizer = spm.SentencePieceProcessor()
    tokenizer.load(CONFIG['tokenizer_path'])
    vocab_size = tokenizer.get_piece_size()

    # Tiny GPT-2 configuration (all sizes come from CONFIG).
    config = GPT2Config(
        vocab_size=vocab_size,
        n_positions=CONFIG['n_positions'],
        n_embd=CONFIG['n_embd'],
        n_layer=CONFIG['n_layer'],
        n_head=CONFIG['n_head'],
        n_inner=CONFIG['n_inner'],
        pad_token_id=0,
    )

    model = GPT2LMHeadModel(config)
    model.to(device)
    model.train()

    # Trade compute for memory so activations fit alongside the model on 4GB.
    model.gradient_checkpointing_enable()

    dataset = SimpleDataset(CONFIG['train_file'], tokenizer, CONFIG['n_positions'])
    if len(dataset) == 0:
        # Without this guard the while-loop below would spin forever: an
        # empty DataLoader yields no batches, so global_step never advances.
        raise ValueError(f"No usable training examples in {CONFIG['train_file']}")
    dataloader = DataLoader(dataset, batch_size=CONFIG['batch_size'], shuffle=True)

    optimizer = torch.optim.AdamW(model.parameters(), lr=CONFIG['learning_rate'])

    print(f"\nModel: {sum(p.numel() for p in model.parameters())/1e6:.1f}M params")
    print(f"Training steps: {CONFIG['total_steps']}")
    # 0.3 s/step is a rough empirical estimate for this model size on a small GPU.
    print(f"Estimated time: {CONFIG['total_steps']*0.3/3600:.1f} hours\n")

    global_step = 0         # completed optimizer steps
    accumulation_steps = 0  # micro-batches since the last optimizer step
    start_time = time.time()

    while global_step < CONFIG['total_steps']:
        for batch in dataloader:
            batch = batch.to(device)

            # Ignore padding when computing the LM loss: positions whose token
            # id equals pad_token_id (0) get label -100, which the model's
            # cross-entropy loss skips. NOTE(review): id 0 is typically also
            # SentencePiece's <unk>, so rare genuine <unk> tokens are excluded
            # from the loss as well — acceptable for this training setup.
            labels = batch.masked_fill(batch == config.pad_token_id, -100)
            outputs = model(input_ids=batch, labels=labels)
            # Pre-scale so accumulated gradients average over grad_accum
            # micro-batches.
            loss = outputs.loss / CONFIG['grad_accum']

            loss.backward()
            accumulation_steps += 1

            # Step the optimizer once every grad_accum micro-batches.
            if accumulation_steps == CONFIG['grad_accum']:
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                optimizer.zero_grad()

                global_step += 1
                accumulation_steps = 0

                # Print progress every 100 optimizer steps.
                if global_step % 100 == 0:
                    elapsed = time.time() - start_time
                    steps_per_second = global_step / elapsed
                    remaining = (CONFIG['total_steps'] - global_step) / steps_per_second

                    # Rescale the reported loss back to a per-batch value.
                    print(f"Step {global_step}/{CONFIG['total_steps']} | "
                          f"Loss: {loss.item()*CONFIG['grad_accum']:.3f} | "
                          f"Remaining: {remaining/3600:.1f}h")

                # Save checkpoint (save_pretrained creates the directory).
                if global_step % CONFIG['save_every'] == 0:
                    save_path = f"./checkpoints_tiny/step{global_step}"
                    model.save_pretrained(save_path)
                    print(f"Saved checkpoint: {save_path}")

                # Leave the epoch loop once the step budget is exhausted;
                # the while-condition then terminates training.
                if global_step >= CONFIG['total_steps']:
                    break

    print(f"\nTraining completed in {(time.time()-start_time)/3600:.2f} hours")

    # Save final model
    model.save_pretrained("./checkpoints_tiny/final")
    print("Final model saved to ./checkpoints_tiny/final")

# Entry point: training only runs when this file is executed directly,
# so the module can be imported (e.g. for SimpleDataset) without side effects.
if __name__ == "__main__":
    train_streamlined()