#!/usr/bin/env bash
# Launcher for m1 training sweeps on an Arnold multi-node cluster.
# Print the cluster topology the scheduler injected, for debugging launches.
# NOTE(review): these are set by the Arnold scheduler; they may be unset when
# run locally, so expand with ${var:-} and label each value.
printf 'ARNOLD_ID=%s\n'            "${ARNOLD_ID:-}"
printf 'ARNOLD_WORKER_GPU=%s\n'    "${ARNOLD_WORKER_GPU:-}"
printf 'ARNOLD_WORKER_0_PORT=%s\n' "${ARNOLD_WORKER_0_PORT:-}"
printf 'ARNOLD_WORKER_0_HOST=%s\n' "${ARNOLD_WORKER_0_HOST:-}"
printf 'ARNOLD_WORKER_NUM=%s\n'    "${ARNOLD_WORKER_NUM:-}"
# Stage training corpora from the HDFS fuse mounts into the local data/ tree.
# Failures are reported but non-fatal so the later runs can still start with
# whatever data did stage successfully.
mkdir -p data/m1_python data/m1_full

# Python-only corpus: two source chunks, renamed to the m1.chunk.N convention
# (cp instead of `cat src > dst` — same result, preserves the source's error
# status without creating an empty destination file on failure).
cp -- /mnt/hdfs/linzheng/data/opencoder_python/opencoder_python.chunk.1.jsonl \
    data/m1_python/m1.chunk.0.jsonl \
  || echo "warning: failed to stage m1_python chunk 0" >&2
cp -- /mnt/hdfs/linzheng/data/opencoder_python/opencoder_python.chunk.2.jsonl \
    data/m1_python/m1.chunk.1.jsonl \
  || echo "warning: failed to stage m1_python chunk 1" >&2

# Full corpus: only the first 20M lines of chunk 1 are used.
head -n 20000000 /mnt/hdfs/user/linzheng/data/opencoder/chunk.1.jsonl \
    > data/m1_full/m1.chunk.0.jsonl \
  || echo "warning: failed to stage m1_full chunk 0" >&2
# SECURITY: a real W&B API key used to be hardcoded on this line and committed
# to version control — that key is compromised and must be rotated in the W&B
# account settings. Read the key from the environment (or a secrets manager)
# instead of embedding it in the script.
if [ -z "${WANDB_API_KEY:-}" ]; then
  echo "warning: WANDB_API_KEY is not set; wandb logging will fail" >&2
fi
export WANDB_API_KEY="${WANDB_API_KEY:-}"
| # EXP_NAME=checkpoints/m1_20M_lr1e-3_steps400k_bs8_seqlen2048_python | |
| # torchrun --nnodes=$ARNOLD_WORKER_NUM \ | |
| # --node_rank=$ARNOLD_ID \ | |
| # --nproc_per_node=$ARNOLD_WORKER_GPU \ | |
| # --master_addr=$ARNOLD_WORKER_0_HOST \ | |
| # --master_port=$(( ARNOLD_WORKER_0_PORT + 1 )) \ | |
| # -m apps.main.train config=apps/main/configs/m1_200M_python.yaml \ | |
| # dump_dir=$EXP_NAME \ | |
| # logging.wandb.name=$EXP_NAME name=$EXP_NAME \ | |
| # model.dim=512 model.n_layers=6 model.n_heads=8 \ | |
| # optim.lr=1e-3 steps=400000 data.batch_size=8 data.seq_len=2048 | |
| # EXP_NAME=checkpoints/m1_20M_lr1e-3_steps400k_bs32_seqlen512_python | |
| # torchrun --nnodes=$ARNOLD_WORKER_NUM \ | |
| # --node_rank=$ARNOLD_ID \ | |
| # --nproc_per_node=$ARNOLD_WORKER_GPU \ | |
| # --master_addr=$ARNOLD_WORKER_0_HOST \ | |
| # --master_port=$(( ARNOLD_WORKER_0_PORT + 1 )) \ | |
| # -m apps.main.train config=apps/main/configs/m1_200M_python.yaml \ | |
| # dump_dir=$EXP_NAME \ | |
| # logging.wandb.name=$EXP_NAME name=$EXP_NAME \ | |
| # model.dim=512 model.n_layers=6 model.n_heads=8 \ | |
| # optim.lr=1e-3 steps=400000 data.batch_size=32 data.seq_len=512 | |
| # EXP_NAME=checkpoints/m1_20M_lr1e-3_steps400k_bs8_seqlen2048_full | |
| # torchrun --nnodes=$ARNOLD_WORKER_NUM \ | |
| # --node_rank=$ARNOLD_ID \ | |
| # --nproc_per_node=$ARNOLD_WORKER_GPU \ | |
| # --master_addr=$ARNOLD_WORKER_0_HOST \ | |
| # --master_port=$(( ARNOLD_WORKER_0_PORT + 1 )) \ | |
| # -m apps.main.train config=apps/main/configs/m1_200M_full.yaml \ | |
| # dump_dir=$EXP_NAME \ | |
| # logging.wandb.name=$EXP_NAME name=$EXP_NAME \ | |
| # model.dim=512 model.n_layers=6 model.n_heads=8 \ | |
| # optim.lr=1e-3 steps=400000 data.batch_size=8 data.seq_len=2048 | |
| # EXP_NAME=checkpoints/m1_20M_lr1e-3_steps400k_bs32_seqlen512_full | |
| # torchrun --nnodes=$ARNOLD_WORKER_NUM \ | |
| # --node_rank=$ARNOLD_ID \ | |
| # --nproc_per_node=$ARNOLD_WORKER_GPU \ | |
| # --master_addr=$ARNOLD_WORKER_0_HOST \ | |
| # --master_port=$(( ARNOLD_WORKER_0_PORT + 1 )) \ | |
| # -m apps.main.train config=apps/main/configs/m1_200M_full.yaml \ | |
| # dump_dir=$EXP_NAME \ | |
| # logging.wandb.name=$EXP_NAME name=$EXP_NAME \ | |
| # model.dim=512 model.n_layers=6 model.n_heads=8 \ | |
| # optim.lr=1e-3 steps=400000 data.batch_size=32 data.seq_len=512 | |
| ################################################## | |
# 85M-parameter sweep (dim=768, 12 layers, 12 heads), 100k steps each, on the
# python-only and full corpora at two (batch_size, seq_len) settings.
#
# launch_run CONFIG EXP_NAME [EXTRA_OVERRIDES...]
#   Launches one multi-node torchrun job using the Arnold cluster topology.
#   All expansions are quoted; missing Arnold vars fall back to a single-node
#   local launch (NOTE(review): confirm the 29500 default port is acceptable).
launch_run() {
  local config=$1 exp_name=$2
  shift 2
  torchrun --nnodes="${ARNOLD_WORKER_NUM:-1}" \
    --node_rank="${ARNOLD_ID:-0}" \
    --nproc_per_node="${ARNOLD_WORKER_GPU:-1}" \
    --master_addr="${ARNOLD_WORKER_0_HOST:-127.0.0.1}" \
    --master_port=$(( ${ARNOLD_WORKER_0_PORT:-29500} + 1 )) \
    -m apps.main.train "config=${config}" \
    "dump_dir=${exp_name}" \
    "logging.wandb.name=${exp_name}" "name=${exp_name}" \
    "$@"
}

launch_run apps/main/configs/m1_200M_python.yaml \
  checkpoints/m1_85M_lr1e-3_steps100k_bs8_seqlen2048_python \
  model.dim=768 model.n_layers=12 model.n_heads=12 \
  optim.lr=1e-3 steps=100000 data.batch_size=8 data.seq_len=2048

launch_run apps/main/configs/m1_200M_python.yaml \
  checkpoints/m1_85M_lr1e-3_steps100k_bs32_seqlen512_python \
  model.dim=768 model.n_layers=12 model.n_heads=12 \
  optim.lr=1e-3 steps=100000 data.batch_size=32 data.seq_len=512

launch_run apps/main/configs/m1_200M_full.yaml \
  checkpoints/m1_85M_lr1e-3_steps100k_bs8_seqlen2048_full \
  model.dim=768 model.n_layers=12 model.n_heads=12 \
  optim.lr=1e-3 steps=100000 data.batch_size=8 data.seq_len=2048
| # EXP_NAME=checkpoints/m1_85M_lr1e-3_steps100k_bs32_seqlen512_full | |
| # torchrun --nnodes=$ARNOLD_WORKER_NUM \ | |
| # --node_rank=$ARNOLD_ID \ | |
| # --nproc_per_node=$ARNOLD_WORKER_GPU \ | |
| # --master_addr=$ARNOLD_WORKER_0_HOST \ | |
| # --master_port=$(( ARNOLD_WORKER_0_PORT + 1 )) \ | |
| # -m apps.main.train config=apps/main/configs/m1_200M_full.yaml \ | |
| # dump_dir=$EXP_NAME \ | |
| # logging.wandb.name=$EXP_NAME name=$EXP_NAME \ | |
| # model.dim=768 model.n_layers=12 model.n_heads=12 \ | |
| # optim.lr=1e-3 steps=100000 data.batch_size=32 data.seq_len=512 | |
| ################################################## | |
# 40M-parameter sweep (dim=512, 12 layers, 8 heads), 400k steps each, on the
# python-only and full corpora at two (batch_size, seq_len) settings.
#
# launch_run CONFIG EXP_NAME [EXTRA_OVERRIDES...]
#   Launches one multi-node torchrun job using the Arnold cluster topology.
#   All expansions are quoted; missing Arnold vars fall back to a single-node
#   local launch (NOTE(review): confirm the 29500 default port is acceptable).
launch_run() {
  local config=$1 exp_name=$2
  shift 2
  torchrun --nnodes="${ARNOLD_WORKER_NUM:-1}" \
    --node_rank="${ARNOLD_ID:-0}" \
    --nproc_per_node="${ARNOLD_WORKER_GPU:-1}" \
    --master_addr="${ARNOLD_WORKER_0_HOST:-127.0.0.1}" \
    --master_port=$(( ${ARNOLD_WORKER_0_PORT:-29500} + 1 )) \
    -m apps.main.train "config=${config}" \
    "dump_dir=${exp_name}" \
    "logging.wandb.name=${exp_name}" "name=${exp_name}" \
    "$@"
}

launch_run apps/main/configs/m1_200M_python.yaml \
  checkpoints/m1_40M_lr1e-3_steps400k_bs16_seqlen2048_python \
  model.dim=512 model.n_layers=12 model.n_heads=8 \
  optim.lr=1e-3 steps=400000 data.batch_size=16 data.seq_len=2048

launch_run apps/main/configs/m1_200M_python.yaml \
  checkpoints/m1_40M_lr1e-3_steps400k_bs64_seqlen512_python \
  model.dim=512 model.n_layers=12 model.n_heads=8 \
  optim.lr=1e-3 steps=400000 data.batch_size=64 data.seq_len=512

launch_run apps/main/configs/m1_200M_full.yaml \
  checkpoints/m1_40M_lr1e-3_steps400k_bs16_seqlen2048_full \
  model.dim=512 model.n_layers=12 model.n_heads=8 \
  optim.lr=1e-3 steps=400000 data.batch_size=16 data.seq_len=2048

launch_run apps/main/configs/m1_200M_full.yaml \
  checkpoints/m1_40M_lr1e-3_steps400k_bs64_seqlen512_full \
  model.dim=512 model.n_layers=12 model.n_heads=8 \
  optim.lr=1e-3 steps=400000 data.batch_size=64 data.seq_len=512
| ################################################## | |
| # EXP_NAME=checkpoints/m1_40M_lr1e-3_steps200k_bs8_seqlen2048_python | |
| # torchrun --nnodes=$ARNOLD_WORKER_NUM \ | |
| # --node_rank=$ARNOLD_ID \ | |
| # --nproc_per_node=$ARNOLD_WORKER_GPU \ | |
| # --master_addr=$ARNOLD_WORKER_0_HOST \ | |
| # --master_port=$(( ARNOLD_WORKER_0_PORT + 1 )) \ | |
| # -m apps.main.train config=apps/main/configs/m1_200M_python.yaml \ | |
| # dump_dir=$EXP_NAME \ | |
| # logging.wandb.name=$EXP_NAME name=$EXP_NAME \ | |
| # model.dim=512 model.n_layers=12 model.n_heads=8 \ | |
| # optim.lr=1e-3 steps=200000 data.batch_size=8 data.seq_len=2048 | |
| # EXP_NAME=checkpoints/m1_40M_lr1e-3_steps200k_bs32_seqlen512_python | |
| # torchrun --nnodes=$ARNOLD_WORKER_NUM \ | |
| # --node_rank=$ARNOLD_ID \ | |
| # --nproc_per_node=$ARNOLD_WORKER_GPU \ | |
| # --master_addr=$ARNOLD_WORKER_0_HOST \ | |
| # --master_port=$(( ARNOLD_WORKER_0_PORT + 1 )) \ | |
| # -m apps.main.train config=apps/main/configs/m1_200M_python.yaml \ | |
| # dump_dir=$EXP_NAME \ | |
| # logging.wandb.name=$EXP_NAME name=$EXP_NAME \ | |
| # model.dim=512 model.n_layers=12 model.n_heads=8 \ | |
| # optim.lr=1e-3 steps=200000 data.batch_size=32 data.seq_len=512 | |
| # EXP_NAME=checkpoints/m1_40M_lr1e-3_steps200k_bs8_seqlen2048_full | |
| # torchrun --nnodes=$ARNOLD_WORKER_NUM \ | |
| # --node_rank=$ARNOLD_ID \ | |
| # --nproc_per_node=$ARNOLD_WORKER_GPU \ | |
| # --master_addr=$ARNOLD_WORKER_0_HOST \ | |
| # --master_port=$(( ARNOLD_WORKER_0_PORT + 1 )) \ | |
| # -m apps.main.train config=apps/main/configs/m1_200M_full.yaml \ | |
| # dump_dir=$EXP_NAME \ | |
| # logging.wandb.name=$EXP_NAME name=$EXP_NAME \ | |
| # model.dim=512 model.n_layers=12 model.n_heads=8 \ | |
| # optim.lr=1e-3 steps=200000 data.batch_size=8 data.seq_len=2048 | |
| # EXP_NAME=checkpoints/m1_40M_lr1e-3_steps200k_bs32_seqlen512_full | |
| # torchrun --nnodes=$ARNOLD_WORKER_NUM \ | |
| # --node_rank=$ARNOLD_ID \ | |
| # --nproc_per_node=$ARNOLD_WORKER_GPU \ | |
| # --master_addr=$ARNOLD_WORKER_0_HOST \ | |
| # --master_port=$(( ARNOLD_WORKER_0_PORT + 1 )) \ | |
| # -m apps.main.train config=apps/main/configs/m1_200M_full.yaml \ | |
| # dump_dir=$EXP_NAME \ | |
| # logging.wandb.name=$EXP_NAME name=$EXP_NAME \ | |
| # model.dim=512 model.n_layers=12 model.n_heads=8 \ | |
| # optim.lr=1e-3 steps=200000 data.batch_size=32 data.seq_len=512 | |
| ################################################## | |
| # EXP_NAME=checkpoints/m1_200M_lr1e-3_steps100k_bs8_seqlen2048_python | |
| # torchrun --nnodes=$ARNOLD_WORKER_NUM \ | |
| # --node_rank=$ARNOLD_ID \ | |
| # --nproc_per_node=$ARNOLD_WORKER_GPU \ | |
| # --master_addr=$ARNOLD_WORKER_0_HOST \ | |
| # --master_port=$(( ARNOLD_WORKER_0_PORT + 1 )) \ | |
| # -m apps.main.train config=apps/main/configs/m1_200M_python.yaml \ | |
| # dump_dir=$EXP_NAME \ | |
| # logging.wandb.name=$EXP_NAME name=$EXP_NAME \ | |
| # optim.lr=1e-3 steps=100000 data.batch_size=8 data.seq_len=2048 | |
| # EXP_NAME=checkpoints/m1_200M_lr1e-3_steps100k_bs32_seqlen512_python | |
| # torchrun --nnodes=$ARNOLD_WORKER_NUM \ | |
| # --node_rank=$ARNOLD_ID \ | |
| # --nproc_per_node=$ARNOLD_WORKER_GPU \ | |
| # --master_addr=$ARNOLD_WORKER_0_HOST \ | |
| # --master_port=$(( ARNOLD_WORKER_0_PORT + 1 )) \ | |
| # -m apps.main.train config=apps/main/configs/m1_200M_python.yaml \ | |
| # dump_dir=$EXP_NAME \ | |
| # logging.wandb.name=$EXP_NAME name=$EXP_NAME \ | |
| # optim.lr=1e-3 steps=100000 data.batch_size=32 data.seq_len=512 | |
| # EXP_NAME=checkpoints/m1_200M_lr1e-3_steps100k_bs8_seqlen2048_full | |
| # torchrun --nnodes=$ARNOLD_WORKER_NUM \ | |
| # --node_rank=$ARNOLD_ID \ | |
| # --nproc_per_node=$ARNOLD_WORKER_GPU \ | |
| # --master_addr=$ARNOLD_WORKER_0_HOST \ | |
| # --master_port=$(( ARNOLD_WORKER_0_PORT + 1 )) \ | |
| # -m apps.main.train config=apps/main/configs/m1_200M_full.yaml \ | |
| # dump_dir=$EXP_NAME \ | |
| # logging.wandb.name=$EXP_NAME name=$EXP_NAME \ | |
| # optim.lr=1e-3 steps=100000 data.batch_size=8 data.seq_len=2048 | |
| # EXP_NAME=checkpoints/m1_200M_lr1e-3_steps100k_bs32_seqlen512_full | |
| # torchrun --nnodes=$ARNOLD_WORKER_NUM \ | |
| # --node_rank=$ARNOLD_ID \ | |
| # --nproc_per_node=$ARNOLD_WORKER_GPU \ | |
| # --master_addr=$ARNOLD_WORKER_0_HOST \ | |
| # --master_port=$(( ARNOLD_WORKER_0_PORT + 1 )) \ | |
| # -m apps.main.train config=apps/main/configs/m1_200M_full.yaml \ | |
| # dump_dir=$EXP_NAME \ | |
| # logging.wandb.name=$EXP_NAME name=$EXP_NAME \ | |
| # optim.lr=1e-3 steps=100000 data.batch_size=32 data.seq_len=512 | |
| ################################################## | |
| # EXP_NAME=m1_6M_lr3e-2_steps50k_bs32_seqlen2048 | |
| # torchrun --nnodes=$ARNOLD_WORKER_NUM \ | |
| # --node_rank=$ARNOLD_ID \ | |
| # --nproc_per_node=$ARNOLD_WORKER_GPU \ | |
| # --master_addr=$ARNOLD_WORKER_0_HOST \ | |
| # --master_port=$(( ARNOLD_WORKER_0_PORT + 1 )) \ | |
| # -m apps.main.train config=apps/main/configs/m1_6M.yaml \ | |
| # dump_dir=/mnt/bn/tiktok-mm-5/aiic/users/xinyu/m1/checkpoints/$EXP_NAME \ | |
| # logging.wandb.name=$EXP_NAME name=$EXP_NAME \ | |
| # optim.lr=3e-2 steps=50000 data.batch_size=8 data.seq_len=1024 | |
| # EXP_NAME=m1_6M_lr5e-2_steps10k_bs32_seqlen2048 | |
| # torchrun --nnodes=$ARNOLD_WORKER_NUM \ | |
| # --node_rank=$ARNOLD_ID \ | |
| # --nproc_per_node=$ARNOLD_WORKER_GPU \ | |
| # --master_addr=$ARNOLD_WORKER_0_HOST \ | |
| # --master_port=$(( ARNOLD_WORKER_0_PORT + 1 )) \ | |
| # -m apps.main.train config=apps/main/configs/m1_6M.yaml \ | |
| # dump_dir=/mnt/bn/tiktok-mm-5/aiic/users/linzheng/artifacts/m1_checkpoints/$EXP_NAME \ | |
| # logging.wandb.name=$EXP_NAME name=$EXP_NAME \ | |
| # optim.lr=5e-2 steps=50000 data.batch_size=32 data.seq_len=2048 | |
| # EXP_NAME=m1_6M_lr5e-2_steps50k_bs32_seqlen2048 | |
| # torchrun --nnodes=$ARNOLD_WORKER_NUM \ | |
| # --node_rank=$ARNOLD_ID \ | |
| # --nproc_per_node=$ARNOLD_WORKER_GPU \ | |
| # --master_addr=$ARNOLD_WORKER_0_HOST \ | |
| # --master_port=$(( ARNOLD_WORKER_0_PORT + 1 )) \ | |
| # -m apps.main.train config=apps/main/configs/m1_6M.yaml \ | |
| # dump_dir=/mnt/bn/tiktok-mm-5/aiic/users/linzheng/artifacts/m1_checkpoints/ \ | |
| # logging.wandb.name=$EXP_NAME name=$EXP_NAME \ | |
| # optim.lr=5e-2 steps=50000 data.batch_size=32 data.seq_len=2048 | |
| # EXP_NAME=m1_6M_lr5e-2_steps50k_bs512_seqlen128 | |
| # torchrun --nnodes=$ARNOLD_WORKER_NUM \ | |
| # --node_rank=$ARNOLD_ID \ | |
| # --nproc_per_node=$ARNOLD_WORKER_GPU \ | |
| # --master_addr=$ARNOLD_WORKER_0_HOST \ | |
| # --master_port=$(( ARNOLD_WORKER_0_PORT + 1 )) \ | |
| # -m apps.main.train config=apps/main/configs/m1_6M.yaml \ | |
| # dump_dir=/mnt/bn/tiktok-mm-5/aiic/users/linzheng/artifacts/m1_checkpoints/ \ | |
| # logging.wandb.name=$EXP_NAME name=$EXP_NAME \ | |
| # optim.lr=5e-2 steps=50000 data.batch_size=512 data.seq_len=128 | |
| # EXP_NAME=m1_6M_lr1e-2_steps50k_bs128_seqlen512 | |
| # torchrun --nnodes=$ARNOLD_WORKER_NUM \ | |
| # --node_rank=$ARNOLD_ID \ | |
| # --nproc_per_node=$ARNOLD_WORKER_GPU \ | |
| # --master_addr=$ARNOLD_WORKER_0_HOST \ | |
| # --master_port=$(( ARNOLD_WORKER_0_PORT + 1 )) \ | |
| # -m apps.main.train config=apps/main/configs/m1_6M.yaml \ | |
| # dump_dir=/mnt/bn/tiktok-mm-5/aiic/users/linzheng/artifacts/m1_checkpoints/$EXP_NAME \ | |
| # logging.wandb.name=$EXP_NAME name=$EXP_NAME \ | |
| # optim.lr=1e-2 steps=50000 data.batch_size=128 data.seq_len=512 | |
| # EXP_NAME=m1_6M_lr3e-2_steps50k_bs128_seqlen512 | |
| # torchrun --nnodes=$ARNOLD_WORKER_NUM \ | |
| # --node_rank=$ARNOLD_ID \ | |
| # --nproc_per_node=$ARNOLD_WORKER_GPU \ | |
| # --master_addr=$ARNOLD_WORKER_0_HOST \ | |
| # --master_port=$(( ARNOLD_WORKER_0_PORT + 1 )) \ | |
| # -m apps.main.train config=apps/main/configs/m1_6M.yaml \ | |
| # dump_dir=/mnt/bn/tiktok-mm-5/aiic/users/linzheng/artifacts/m1_checkpoints/$EXP_NAME \ | |
| # logging.wandb.name=$EXP_NAME name=$EXP_NAME \ | |
| # optim.lr=3e-2 steps=50000 data.batch_size=128 data.seq_len=512 | |
| # EXP_NAME=m1_6M_lr5e-2_steps50k_bs128_seqlen512 | |
| # torchrun --nnodes=$ARNOLD_WORKER_NUM \ | |
| # --node_rank=$ARNOLD_ID \ | |
| # --nproc_per_node=$ARNOLD_WORKER_GPU \ | |
| # --master_addr=$ARNOLD_WORKER_0_HOST \ | |
| # --master_port=$(( ARNOLD_WORKER_0_PORT + 1 )) \ | |
| # -m apps.main.train config=apps/main/configs/m1_6M.yaml \ | |
| # dump_dir=/mnt/bn/tiktok-mm-5/aiic/users/linzheng/artifacts/m1_checkpoints/$EXP_NAME \ | |
| # logging.wandb.name=$EXP_NAME name=$EXP_NAME \ | |
| # optim.lr=5e-2 steps=50000 data.batch_size=128 data.seq_len=512 | |