|
|
"""
|
|
|
Step 2: Model configuration
|
|
|
"""
|
|
|
|
|
|
from dataclasses import dataclass
|
|
|
from transformers import GPT2Config
|
|
|
|
|
|
@dataclass
class ModelConfig:
    """Hyperparameters, file paths and schedule settings for one training run.

    Constructing an instance prints a short summary of the configuration
    (see ``__post_init__``).  Two derived values are exposed as read-only
    properties: ``effective_batch_size`` and ``total_params``.
    """

    # --- Model architecture -------------------------------------------------
    # Field names match the Hugging Face ``GPT2Config`` arguments of the same
    # name; presumably they are forwarded to it elsewhere -- TODO confirm.
    vocab_size: int = 8000
    n_positions: int = 256
    n_embd: int = 512
    n_layer: int = 8
    n_head: int = 8
    n_inner: int = 1024

    # --- Optimisation -------------------------------------------------------
    batch_size: int = 8
    gradient_accumulation: int = 4
    learning_rate: float = 3e-4
    warmup_steps: int = 1000
    total_steps: int = 20000
    weight_decay: float = 0.1
    max_grad_norm: float = 1.0

    # --- Data ---------------------------------------------------------------
    train_file: str = "./final_corpus/multilingual_corpus_train.txt"
    val_file: str = "./final_corpus/multilingual_corpus_val.txt"
    tokenizer_path: str = "./final_corpus/multilingual_spm.model"

    # --- Checkpointing / logging cadence (in steps) -------------------------
    output_dir: str = "./checkpoints"
    save_steps: int = 1000
    eval_steps: int = 500
    logging_steps: int = 100

    # --- Precision ----------------------------------------------------------
    # NOTE(review): presumably toggles half-precision training; the consumer
    # of this flag is not visible here.
    fp16: bool = True

    def __post_init__(self) -> None:
        """Print a human-readable summary of the configuration to stdout."""
        print("\nModel Configuration (REALISTIC):")
        print(f" Parameters: ~{self.total_params:.1f}M")
        print(f" Hidden size: {self.n_embd}")
        print(f" Layers: {self.n_layer}")
        print(f" Context length: {self.n_positions}")
        print(f" Effective batch: {self.effective_batch_size}")
        # NOTE(review): the "~8-9 epochs" text is hard-coded; it is not
        # derived from total_steps or the corpus size.
        print(f" Total steps: {self.total_steps} (~8-9 epochs)")
        print(f" Learning rate: {self.learning_rate}")

    @property
    def effective_batch_size(self) -> int:
        """Return ``batch_size * gradient_accumulation``."""
        return self.batch_size * self.gradient_accumulation

    @property
    def total_params(self) -> float:
        """Return the model's parameter count, in millions.

        The count assumes a standard GPT-2 decoder with the output projection
        tied to the token embedding (the Hugging Face ``GPT2LMHeadModel``
        layout) -- confirm against the code that actually builds the model.
        It includes token and position embeddings, the attention and
        feed-forward weights *and* biases, both LayerNorms of every block,
        and the final LayerNorm.
        """
        d = self.n_embd
        inner = self.n_inner

        token_embedding = self.vocab_size * d
        position_embedding = self.n_positions * d

        # Fused QKV projection (d -> 3d) plus output projection (d -> d),
        # each with a bias vector.
        attention = (d * 3 * d + 3 * d) + (d * d + d)
        # Up projection (d -> inner) and down projection (inner -> d),
        # each with a bias vector.
        ffn = (d * inner + inner) + (inner * d + d)
        # Two LayerNorms per block, each with a weight and a bias of size d.
        layer_norms = 2 * (2 * d)
        per_layer = attention + ffn + layer_norms

        final_layer_norm = 2 * d

        total = (
            token_embedding
            + position_embedding
            + self.n_layer * per_layer
            + final_layer_norm
        )
        return total / 1e6