# Launcher: invokes train.sh through bash, passing the key=value arguments
# listed below. How each key is interpreted is defined by train.sh, which is
# not part of this file.
#
# Environment overrides, currently commented out (uncomment to apply):
# export PDSH_RCMD_TYPE=ssh
# export NCCL_SOCKET_IFNAME=bond1
# export NCCL_IB_DISABLE=0 # explicitly enable IB
#
# NOTE(review): the final argument line still ends with a continuation
# backslash, so any line placed directly after it becomes part of this
# command. Drop that backslash if `tasks=run` is meant to be the last argument.
bash train.sh \
  type=emgla \
  lr=3e-4 \
  scheduler=cosine_with_min_lr \
  batch=16 \
  update=2 \
  warmup=512 \
  steps=30720 \
  context=2048 \
  gpus=8 \
  nodes=1 \
  path=/mnt/jfzn/msj/train_exp/emgla_340M \
  project=fla \
  model=configs/emgla_340M.json \
  data=cerebras/SlimPajama-627B \
  name=SlimPajama \
  cache=/mnt/jfzn/data/SlimPajama-627B/pre_slimp_chunk1/slimp/train \
  tasks=run \