"""GPT-2 style model configuration for RTX 4070 (12GB VRAM).""" import os PROJECT_DIR = os.path.dirname(os.path.abspath(__file__)) DATA_DIR = os.path.join(PROJECT_DIR, "data") CHECKPOINT_DIR = os.path.join(PROJECT_DIR, "checkpoints") LOG_DIR = os.path.join(PROJECT_DIR, "logs") TOKENIZER_PATH = os.path.join(DATA_DIR, "tokenizer.json") DATA_TEXT_PATH = os.path.join(DATA_DIR, "russian_dialogues.txt") MAX_SEQ_LEN = 64 TRAIN_SPLIT = 0.9 VOCAB_SIZE = 16000 # Model EMBED_DIM = 256 NUM_HEADS = 4 NUM_LAYERS = 6 FF_DIM = 1024 DROPOUT = 0.15 BATCH_SIZE = 32 LEARNING_RATE = 5e-4 WEIGHT_DECAY = 0.01 WARMUP_STEPS = 50 MAX_EPOCHS = 100 PATIENCE = 20 MIXED_PRECISION = False # float32 — more stable for small models MAX_GRAD_NORM = 1.0