import torch
# Model and optimization settings gathered in one module-level dict.
# NOTE(review): the grouping comments below are inferred from the key names
# only; confirm the exact meaning of each entry against the consuming code.
MODEL_CONFIG = {
    # Size-related settings.
    'VOCAB_SIZE': 50000,
    'D_MODEL': 1024,
    'N_HEADS': 32,
    'D_FF': 4096,
    'N_LAYERS': 32,
    'MAX_SEQ_LEN': 512,
    # Training-loop settings.
    'BATCH_SIZE': 32,
    'LEARNING_RATE': 1e-4,
    'NUM_EPOCHS': 20,
    # Evaluated once at import time: 'cuda' when torch reports CUDA as
    # available, otherwise 'cpu'.
    'DEVICE': 'cuda' if torch.cuda.is_available() else 'cpu',
    # Optimizer / regularization settings.
    'WARMUP_STEPS': 4000,
    'ADAM_EPSILON': 1e-8,
    'WEIGHT_DECAY': 0.01,
    'GRADIENT_ACCUMULATION_STEPS': 2,
    'MAX_GRAD_NORM': 1.0,
    'DROPOUT': 0.1,
}
# Step-interval and retention settings for a training run, kept separate
# from the model settings.
# NOTE(review): units are presumably optimizer/training steps, as the key
# names suggest; confirm against the code that reads this dict.
TRAINING_CONFIG = {
    'CHECKPOINT_SAVE_STEPS': 5000,
    'LOGGING_STEPS': 100,
    'EVAL_STEPS': 1000,
    'SAVE_TOTAL_LIMIT': 5,
}