"""GPT-2 style model configuration for RTX 4070 (12GB VRAM).

Plain module-level constants only; importing this module has no side
effects beyond computing the paths below relative to this file.
"""
import os

# --- Paths -----------------------------------------------------------------
# Everything is anchored at the directory containing this file, so the
# resolved locations do not depend on the current working directory.
PROJECT_DIR = os.path.dirname(os.path.abspath(__file__))
DATA_DIR = os.path.join(PROJECT_DIR, "data")
CHECKPOINT_DIR = os.path.join(PROJECT_DIR, "checkpoints")
LOG_DIR = os.path.join(PROJECT_DIR, "logs")
TOKENIZER_PATH = os.path.join(DATA_DIR, "tokenizer.json")
DATA_TEXT_PATH = os.path.join(DATA_DIR, "russian_dialogues.txt")

# --- Data ------------------------------------------------------------------
MAX_SEQ_LEN = 64
# NOTE(review): presumably the fraction of the data used for training, with
# the remainder held out — confirm against the data loader.
TRAIN_SPLIT = 0.9
VOCAB_SIZE = 16000

# --- Model -----------------------------------------------------------------
EMBED_DIM = 256
NUM_HEADS = 4  # EMBED_DIM / NUM_HEADS = 64, i.e. it divides evenly
NUM_LAYERS = 6
FF_DIM = 1024  # 4 x EMBED_DIM
DROPOUT = 0.15

# --- Training --------------------------------------------------------------
BATCH_SIZE = 32
LEARNING_RATE = 5e-4
WEIGHT_DECAY = 0.01
WARMUP_STEPS = 50
MAX_EPOCHS = 100
# NOTE(review): presumably the early-stopping patience, counted in epochs —
# confirm against the training loop.
PATIENCE = 20
MIXED_PRECISION = False  # float32 — more stable for small models
MAX_GRAD_NORM = 1.0