File size: 2,207 Bytes
45bcb9b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 |
"""
Step 2: Model configuration
"""
from dataclasses import dataclass
from transformers import GPT2Config
@dataclass
class ModelConfig:
    """Hyperparameters for the model, the training loop, data and checkpoints.

    Creating an instance first checks that the hidden size can be split
    evenly across the attention heads, then prints a short summary of the
    configuration to stdout.

    Raises:
        ValueError: if ``n_head`` is not positive, or if ``n_embd`` is not
            a multiple of ``n_head``.
    """

    # Model architecture
    vocab_size: int = 8000  # Updated from tokenizer
    n_positions: int = 256  # Context length
    n_embd: int = 512  # Hidden size
    n_layer: int = 8  # Number of layers
    n_head: int = 8  # Attention heads
    n_inner: int = 1024  # FFN dimension

    # Training - REALISTIC VALUES
    batch_size: int = 8  # Per GPU batch size
    gradient_accumulation: int = 4  # Effective batch = 32
    learning_rate: float = 3e-4
    warmup_steps: int = 1000
    total_steps: int = 20000  # ~8-9 epochs, NOT 50000
    weight_decay: float = 0.1
    max_grad_norm: float = 1.0

    # Data
    train_file: str = "./final_corpus/multilingual_corpus_train.txt"
    val_file: str = "./final_corpus/multilingual_corpus_val.txt"
    tokenizer_path: str = "./final_corpus/multilingual_spm.model"

    # Checkpoints
    output_dir: str = "./checkpoints"
    save_steps: int = 1000
    eval_steps: int = 500
    logging_steps: int = 100

    # Mixed precision
    fp16: bool = True

    def __post_init__(self) -> None:
        """Validate the attention geometry, then print a config summary."""
        # Fail early: each head needs an equal, whole slice of the hidden
        # dimension, so an incompatible pair can never produce a usable model.
        if self.n_head <= 0:
            raise ValueError(f"n_head must be positive, got {self.n_head}")
        if self.n_embd % self.n_head != 0:
            raise ValueError(
                f"n_embd ({self.n_embd}) must be divisible by "
                f"n_head ({self.n_head})"
            )

        print("\nModel Configuration (REALISTIC):")
        print(f" Parameters: ~{self.total_params:.1f}M")
        print(f" Hidden size: {self.n_embd}")
        print(f" Layers: {self.n_layer}")
        print(f" Context length: {self.n_positions}")
        print(f" Effective batch: {self.effective_batch_size}")
        # NOTE: the epoch hint is hard-coded text that matches the default
        # total_steps; it is not recomputed when total_steps is overridden.
        print(f" Total steps: {self.total_steps} (~8-9 epochs)")
        print(f" Learning rate: {self.learning_rate}")

    @property
    def effective_batch_size(self) -> int:
        """Samples per optimizer step: batch size x accumulation steps."""
        return self.batch_size * self.gradient_accumulation

    @property
    def total_params(self) -> float:
        """Approximate number of trainable parameters, in millions.

        Counts the token and position embedding tables, and for every layer
        the four attention projections, the two FFN projections (each with
        biases) and two LayerNorms, plus one final LayerNorm.

        NOTE(review): assumes a GPT-2 style block layout and an output head
        that shares the token embedding matrix (so it is counted once) --
        confirm against the model that is actually instantiated.
        """
        d_model = self.n_embd
        d_ff = self.n_inner

        # Token table plus learned position table.
        embeddings = (self.vocab_size + self.n_positions) * d_model

        # Q, K, V and output projections: four d x d matrices, four d biases.
        attention = 4 * d_model * d_model + 4 * d_model
        # Up- and down-projection matrices with their biases.
        ffn = 2 * d_model * d_ff + d_ff + d_model
        # Each LayerNorm holds a scale and a shift vector.
        layer_norm = 2 * d_model

        per_layer = attention + ffn + 2 * layer_norm
        total = embeddings + self.n_layer * per_layer + layer_norm
        return total / 1e6  # Millions