{ "OUT_DIR": "/content/van_fast_transformer", "TOKENIZER_NAME": "gpt2", "DATASET_NAME": "HuggingFaceFW/fineweb-edu", "DATASET_CONFIG": "sample-10BT", "DATASET_SPLIT": "train", "TEXT_KEY": "text", "VOCAB_SIZE": 50257, "BLOCK_SIZE": 1024, "D_MODEL": 1024, "N_LAYER": 18, "N_HEAD": 16, "N_KV_HEAD": 4, "D_FF": 4096, "DROPOUT": 0.0, "USE_QK_NORM": true, "MAX_STEPS": 5000, "BATCH_SIZE": 1, "GRAD_ACCUM": 4, "LR": 0.0003, "MIN_LR": 3e-05, "WARMUP_STEPS": 300, "WEIGHT_DECAY": 0.1, "BETA1": 0.9, "BETA2": 0.95, "MAX_GRAD_NORM": 1.0, "EARLY_STOP_LOSS": 0.0001, "EARLY_STOP_PATIENCE": 1, "EARLY_STOP_SAVE": true, "EARLY_STOP_ON_EVAL": false, "EARLY_STOP_EVAL_LOSS": 0.0001, "EARLY_STOP_EVAL_PATIENCE": 2, "LOG_EVERY": 10, "EVAL_EVERY": 1000, "SAVE_EVERY": 1000, "EVAL_BATCHES": 4, "GEN_MAX_NEW_TOKENS": 160, "GEN_TEMPERATURE": 0.8, "GEN_TOP_K": 50, "GEN_TOP_P": 0.95, "SEED": 42, "DTYPE": "bf16", "TF32": true, "COMPILE": true, "GRADIENT_CHECKPOINTING": false, "NUM_WORKERS": 2, "PIN_MEMORY": true, "DEBUG_SMALL": false }