"""Training configuration constants for a decoder-only transformer run."""

import json
import os.path
import re

import torch

# Gradient accumulation steps; effective batch size = BATCH_SIZE * GRADIENT_ACCUM.
GRADIENT_ACCUM = 2

# Mixed-precision flag (accelerate-style): "bf16" enables bfloat16 autocast,
# "no" trains in full float32.
# FIX: the original ternary was inverted — it chose "no" on bf16-capable GPUs
# and "bf16" on hardware WITHOUT bf16 support. Use bf16 only when the GPU
# actually supports it.
MIXED_PRECISION = (
    "bf16" if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else "no"
)

BATCH_SIZE = 12
LR = 1e-3  # peak learning rate

# Model / data hyper-parameters.
training_seq_length = 3072  # tokens per training sample
embedding_dim = 768         # model width; must divide evenly by attention_heads
attention_heads = 8
N_decoder_layers = 8
TRAINING_EPOCHS = 50
# Per-layer-group dropout rates — presumably mapped onto the 8 decoder layers
# by the model builder; TODO confirm the interpolation scheme against the model code.
BASE_LAYER_DROPOUTS = [0.0, 0.0, 0.1, 0.2, 0.4]
TRANSFORMER_BIAS_ENABLE = False  # disable bias terms in transformer linear layers

# Tokenizer / vocabulary settings.
VOCAB_SIZE = 12000
SPECIAL_TOKENS = ["<|startoftext|>", "<|user|>", "<|agent|>", "<|endofturn|>"]

# Checkpoint to resume training from, and its on-disk format version.
RESUME_CHECKPOINT = "75.1 million-params-transformer-12-bf16"
CHECKPOINT_VERSION = "1.1.0"