#!/bin/bash

export SLURM_PARTITION=<slurm partition, used to feed the -p option in slurm>
export SLURM_ACCOUNT=<slurm account, used to feed the -A option in slurm>
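
# A sketch (assumed, not part of the original script) of how these two
# variables would typically feed a Slurm submission; the real launch
# command and its resource flags may differ:
#   srun -p "${SLURM_PARTITION}" -A "${SLURM_ACCOUNT}" --gres=gpu:8 -N 1 ...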

export MEGATRON_CODE_DIR=<megatron source code directory>
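
# Optional fail-fast check (an addition, not in the original script):
# stop before launching anything if the Megatron source tree is missing.
if [ ! -d "${MEGATRON_CODE_DIR}" ]; then
    echo "MEGATRON_CODE_DIR is not a directory: ${MEGATRON_CODE_DIR}" >&2
    exit 1
fi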

# Host path holding the Megatron dataset and the BPE tokenizer files;
# this directory is mounted into the training container.
export DOCKER_MOUNT_DIR=<megatron dataset and bpe tokenizer vocab path>
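
# Hypothetical illustration (the actual container launch lives in the
# launcher, not in this file): the directory is bind-mounted so the
# in-container data paths below resolve to the same files as on the host:
#   docker run --gpus all -v "${DOCKER_MOUNT_DIR}:${DOCKER_MOUNT_DIR}" <image> ...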

# Data inputs: MEGATRON_DATA is the path prefix of the preprocessed
# dataset (.bin/.idx pair); the vocab and merges files drive the BPE
# tokenizer.
MEGATRON_DATA=<path to megatron processed data>
BPE_VOCAB_FILE=<path to bpe vocab file>
BPE_MERGE_FILE=<path to bpe merges file>
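
# Optional sanity check (an addition, not in the original script). Note
# that MEGATRON_DATA is a path prefix, so it is deliberately not tested
# as a plain file here.
for f in "${BPE_VOCAB_FILE}" "${BPE_MERGE_FILE}"; do
    [ -f "${f}" ] || { echo "Missing required file: ${f}" >&2; exit 1; }
done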
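
# Model and parallelism knobs consumed by MEGATRON_PARAMS below. They are
# expected to come from the caller (e.g. a benchmark sweep); the fallback
# values here are illustrative assumptions only, not part of the original
# script. GBS must be a multiple of MBS times the data-parallel size.
MEGATRON_EXTRA_PARAMS=${MEGATRON_EXTRA_PARAMS:-}
TP=${TP:-1}        # --tensor-model-parallel-size
PP=${PP:-1}        # --pipeline-model-parallel-size
MBS=${MBS:-4}      # --micro-batch-size
GBS=${GBS:-32}     # --global-batch-size
NLS=${NLS:-24}     # --num-layers
HS=${HS:-1024}     # --hidden-size
NAH=${NAH:-16}     # --num-attention-heads
DDP=${DDP:-local}  # --DDP-impl (local or torch)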

# Arguments passed to Megatron's pretraining entry point. The knobs above
# and any MEGATRON_EXTRA_PARAMS are supplied by the caller.
export MEGATRON_PARAMS=" ${MEGATRON_EXTRA_PARAMS} \
        --tensor-model-parallel-size ${TP} \
        --pipeline-model-parallel-size ${PP} \
        --micro-batch-size ${MBS} \
        --global-batch-size ${GBS} \
        --num-layers ${NLS} \
        --hidden-size ${HS} \
        --num-attention-heads ${NAH} \
        --DDP-impl ${DDP} \
        --data-path ${MEGATRON_DATA} \
        --vocab-file ${BPE_VOCAB_FILE} \
        --merge-file ${BPE_MERGE_FILE} \
        --log-interval 5 \
        --seq-length 2048 \
        --max-position-embeddings 2048 \
        --train-iters 500 \
        --lr-decay-iters 320 \
        --lr 0.0001 \
        --min-lr 0.00001 \
        --lr-decay-style cosine \
        --lr-warmup-fraction 0.01 \
        --split 969,30,1 \
        --eval-iters 100 \
        --eval-interval 1000 \
        --clip-grad 1.0 \
        --fp16 \
        --loss-scale 8192 "
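
# A minimal single-node launch sketch (assumed; the real run goes through
# the Slurm/Docker setup configured above):
#   python -m torch.distributed.launch --nproc_per_node=8 \
#       "${MEGATRON_CODE_DIR}/pretrain_gpt.py" ${MEGATRON_PARAMS}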