#!/usr/bin/env bash
#
# Launches train.sh (looked up relative to the current working directory)
# with the mask_gdn 1B argument set defined below. The exit status of this
# script is the exit status of train.sh.
#
# Optional environment overrides, currently disabled:
# export PDSH_RCMD_TYPE=ssh
# export NCCL_SOCKET_IFNAME=bond1
# export TRITON_CACHE_DIR=/mnt/jfzn/msj/triton

# key=value arguments forwarded verbatim, in this order, to train.sh.
# Their meaning is defined by train.sh, which is not part of this file.
# An array is used instead of backslash continuations so that individual
# entries can be commented out without breaking the command line.
train_args=(
  type=mask_gdn
  lr=3e-5
  scheduler=cosine_with_min_lr
  batch=8
  update=4
  warmup=1024
  steps=20480
  context=2048
  gpus=8
  nodes=1
  path=/mnt/jfzn/msj/train_exp/mask-gdn-test222
  project=fla
  model=configs/mask_gdn_1B.json
  data=cerebras/SlimPajama-627B
  name=SlimPajama
  cache=/mnt/jfzn/data/SlimPajama-627B/pre_slimp_chunk3/slimp/train
  # config=/mnt/jfzn/msj/flash-linear-attention/legacy/training/configs/deepspeed_2node.yaml
)

bash train.sh "${train_args[@]}"

# Disabled alternative argument set (mask_gdn 340M config, includes a
# checkpoint= argument). To use it, replace the train_args definition
# above with this one.
# train_args=(
#   type=mask_gdn
#   lr=3e-5
#   scheduler=cosine_with_min_lr
#   batch=16
#   update=2
#   warmup=512
#   steps=30720
#   context=2048
#   gpus=8
#   nodes=1
#   path=/mnt/jfzn/msj/train_exp/mask_gdn_340M-rank4-method1_byt
#   project=fla
#   model=configs/mask_gdn_340M.json
#   data=cerebras/SlimPajama-627B
#   name=SlimPajama
#   cache=/mnt/jfzn/msj/slim_chunk1
#   checkpoint=/mnt/jfzn/msj/train_exp/mask_gdn_340M-rank4-method1_byt/checkpoint-14336
#   # cache=/mnt/jfzn/data/SlimPajama-627B/pre_slimp_chunk1/slimp/train
# )