summerV2 / training_cfg.json
summerMC's picture
Upload folder using huggingface_hub
ec757bc verified
{
"OUT_DIR": "/content/van_fast_transformer",
"TOKENIZER_NAME": "gpt2",
"DATASET_NAME": "HuggingFaceFW/fineweb-edu",
"DATASET_CONFIG": "sample-10BT",
"DATASET_SPLIT": "train",
"TEXT_KEY": "text",
"VOCAB_SIZE": 50257,
"BLOCK_SIZE": 1024,
"D_MODEL": 1024,
"N_LAYER": 18,
"N_HEAD": 16,
"N_KV_HEAD": 4,
"D_FF": 4096,
"DROPOUT": 0.0,
"USE_QK_NORM": true,
"MAX_STEPS": 5000,
"BATCH_SIZE": 1,
"GRAD_ACCUM": 4,
"LR": 0.0003,
"MIN_LR": 3e-05,
"WARMUP_STEPS": 300,
"WEIGHT_DECAY": 0.1,
"BETA1": 0.9,
"BETA2": 0.95,
"MAX_GRAD_NORM": 1.0,
"EARLY_STOP_LOSS": 0.0001,
"EARLY_STOP_PATIENCE": 1,
"EARLY_STOP_SAVE": true,
"EARLY_STOP_ON_EVAL": false,
"EARLY_STOP_EVAL_LOSS": 0.0001,
"EARLY_STOP_EVAL_PATIENCE": 2,
"LOG_EVERY": 10,
"EVAL_EVERY": 1000,
"SAVE_EVERY": 1000,
"EVAL_BATCHES": 4,
"GEN_MAX_NEW_TOKENS": 160,
"GEN_TEMPERATURE": 0.8,
"GEN_TOP_K": 50,
"GEN_TOP_P": 0.95,
"SEED": 42,
"DTYPE": "bf16",
"TF32": true,
"COMPILE": true,
"GRADIENT_CHECKPOINTING": false,
"NUM_WORKERS": 2,
"PIN_MEMORY": true,
"DEBUG_SMALL": false
}