# Launch settings for a training run started through train_node.sh
# (this invocation passes nodes=4 and gpus=8).
# The environment variables below are exported first so that train_node.sh and
# anything it starts inherit them. How each key=value argument is interpreted
# is defined by train_node.sh, not by this file.

export PDSH_RCMD_TYPE=ssh
export NCCL_SOCKET_IFNAME=bond1
export NCCL_IB_DISABLE=0  # explicitly enable IB

# Active run. The final argument intentionally has NO trailing backslash: the
# command ends on that line and cannot absorb whatever line follows it.
bash train_node.sh \
  type=gated_deltanet \
  lr=3e-4 \
  scheduler=cosine_with_min_lr \
  batch=8 \
  update=4 \
  warmup=512 \
  steps=50016 \
  context=2048 \
  gpus=8 \
  nodes=4 \
  path=/mnt/jfzn/msj/train_exp/gdn_1B_a800 \
  project=fla \
  model=configs/gdn_1B.json \
  data=cerebras/SlimPajama-627B \
  name=SlimPajama \
  cache=/mnt/jfzn/data/SlimPajama-627B/pre_slimp_chunk1/slimp/train,/mnt/jfzn/data/SlimPajama-627B/pre_slimp_chunk2/slimp/train \
  checkpoint=/mnt/jfzn/msj/train_exp/gdn_1B_a800/checkpoint-2048

# Alternative invocation kept for reference (type=gdn, nodes=2, no checkpoint
# argument). Uncomment the whole group to use it.
# NOTE(review): the path below contains full-width dashes ("——"); confirm that
# is the intended directory name before enabling this run.
# bash train_node.sh \
#   type=gdn \
#   lr=3e-4 \
#   scheduler=cosine_with_min_lr \
#   batch=8 \
#   update=4 \
#   warmup=512 \
#   steps=50016 \
#   context=2048 \
#   gpus=8 \
#   nodes=2 \
#   path=/mnt/jfzn/msj/train_exp/gdn_1B_hrr4_head10——test \
#   project=fla \
#   model=configs/gdn_1B.json \
#   data=cerebras/SlimPajama-627B \
#   name=SlimPajama \
#   cache=/mnt/jfzn/data/SlimPajama-627B/pre_slimp_chunk1/slimp/train,/mnt/jfzn/data/SlimPajama-627B/pre_slimp_chunk2/slimp/train