#!/usr/bin/env bash
# Byte-lingua-code — m1_run.sh
# Launcher for m1 training experiments on the Arnold cluster.
# (Removed non-shell web-page residue that previously headed this file.)
# Fail fast: if data staging below fails, do not launch training on bad data.
set -eo pipefail

# --- Cluster topology (set by the Arnold scheduler) ---------------------------
# Printed for debugging the multi-node torchrun launches further down.
# Defaulted to empty so a local dry-run does not trip on unset variables.
echo "${ARNOLD_ID:-}"
echo "${ARNOLD_WORKER_GPU:-}"
echo "${ARNOLD_WORKER_0_PORT:-}"
echo "${ARNOLD_WORKER_0_HOST:-}"
echo "${ARNOLD_WORKER_NUM:-}"

# --- Data staging -------------------------------------------------------------
# Copy training shards from the HDFS mount into local data/ directories.
mkdir -p data/m1_python data/m1_full
cp -- /mnt/hdfs/linzheng/data/opencoder_python/opencoder_python.chunk.1.jsonl data/m1_python/m1.chunk.0.jsonl
cp -- /mnt/hdfs/linzheng/data/opencoder_python/opencoder_python.chunk.2.jsonl data/m1_python/m1.chunk.1.jsonl
# Cap the "full" corpus at 20M JSONL records.
head -n 20000000 /mnt/hdfs/user/linzheng/data/opencoder/chunk.1.jsonl > data/m1_full/m1.chunk.0.jsonl

# SECURITY: a live W&B API key was previously hardcoded on this line; that key
# is compromised and must be rotated. The key now comes from the environment.
: "${WANDB_API_KEY:?set WANDB_API_KEY in the environment (hardcoded key removed)}"
export WANDB_API_KEY
# EXP_NAME=checkpoints/m1_20M_lr1e-3_steps400k_bs8_seqlen2048_python
# torchrun --nnodes=$ARNOLD_WORKER_NUM \
# --node_rank=$ARNOLD_ID \
# --nproc_per_node=$ARNOLD_WORKER_GPU \
# --master_addr=$ARNOLD_WORKER_0_HOST \
# --master_port=$(( ARNOLD_WORKER_0_PORT + 1 )) \
# -m apps.main.train config=apps/main/configs/m1_200M_python.yaml \
# dump_dir=$EXP_NAME \
# logging.wandb.name=$EXP_NAME name=$EXP_NAME \
# model.dim=512 model.n_layers=6 model.n_heads=8 \
# optim.lr=1e-3 steps=400000 data.batch_size=8 data.seq_len=2048
# EXP_NAME=checkpoints/m1_20M_lr1e-3_steps400k_bs32_seqlen512_python
# torchrun --nnodes=$ARNOLD_WORKER_NUM \
# --node_rank=$ARNOLD_ID \
# --nproc_per_node=$ARNOLD_WORKER_GPU \
# --master_addr=$ARNOLD_WORKER_0_HOST \
# --master_port=$(( ARNOLD_WORKER_0_PORT + 1 )) \
# -m apps.main.train config=apps/main/configs/m1_200M_python.yaml \
# dump_dir=$EXP_NAME \
# logging.wandb.name=$EXP_NAME name=$EXP_NAME \
# model.dim=512 model.n_layers=6 model.n_heads=8 \
# optim.lr=1e-3 steps=400000 data.batch_size=32 data.seq_len=512
# EXP_NAME=checkpoints/m1_20M_lr1e-3_steps400k_bs8_seqlen2048_full
# torchrun --nnodes=$ARNOLD_WORKER_NUM \
# --node_rank=$ARNOLD_ID \
# --nproc_per_node=$ARNOLD_WORKER_GPU \
# --master_addr=$ARNOLD_WORKER_0_HOST \
# --master_port=$(( ARNOLD_WORKER_0_PORT + 1 )) \
# -m apps.main.train config=apps/main/configs/m1_200M_full.yaml \
# dump_dir=$EXP_NAME \
# logging.wandb.name=$EXP_NAME name=$EXP_NAME \
# model.dim=512 model.n_layers=6 model.n_heads=8 \
# optim.lr=1e-3 steps=400000 data.batch_size=8 data.seq_len=2048
# EXP_NAME=checkpoints/m1_20M_lr1e-3_steps400k_bs32_seqlen512_full
# torchrun --nnodes=$ARNOLD_WORKER_NUM \
# --node_rank=$ARNOLD_ID \
# --nproc_per_node=$ARNOLD_WORKER_GPU \
# --master_addr=$ARNOLD_WORKER_0_HOST \
# --master_port=$(( ARNOLD_WORKER_0_PORT + 1 )) \
# -m apps.main.train config=apps/main/configs/m1_200M_full.yaml \
# dump_dir=$EXP_NAME \
# logging.wandb.name=$EXP_NAME name=$EXP_NAME \
# model.dim=512 model.n_layers=6 model.n_heads=8 \
# optim.lr=1e-3 steps=400000 data.batch_size=32 data.seq_len=512
##################################################
##################################################
#######################################
# Launch one multi-node training job via torchrun on the Arnold cluster.
# Globals:   ARNOLD_WORKER_NUM, ARNOLD_ID, ARNOLD_WORKER_GPU,
#            ARNOLD_WORKER_0_HOST, ARNOLD_WORKER_0_PORT (read)
# Arguments: $1 - path to the training YAML config
#            $2 - experiment name; used as dump_dir, run name, and W&B name
#            $@ - remaining args: extra config overrides (key=value)
# Returns:   torchrun's exit status
#######################################
run_train() {
  local config=$1
  local exp_name=$2
  shift 2
  # Port is offset by 1 from the scheduler-assigned worker port to avoid
  # colliding with whatever service already listens on it.
  torchrun --nnodes="$ARNOLD_WORKER_NUM" \
    --node_rank="$ARNOLD_ID" \
    --nproc_per_node="$ARNOLD_WORKER_GPU" \
    --master_addr="$ARNOLD_WORKER_0_HOST" \
    --master_port=$(( ARNOLD_WORKER_0_PORT + 1 )) \
    -m apps.main.train "config=$config" \
    "dump_dir=$exp_name" \
    "logging.wandb.name=$exp_name" "name=$exp_name" \
    "$@"
}

# ---- 85M models (dim=768, 12 layers, 12 heads), 100k steps -------------------
run_train apps/main/configs/m1_200M_python.yaml \
  checkpoints/m1_85M_lr1e-3_steps100k_bs8_seqlen2048_python \
  model.dim=768 model.n_layers=12 model.n_heads=12 \
  optim.lr=1e-3 steps=100000 data.batch_size=8 data.seq_len=2048
run_train apps/main/configs/m1_200M_python.yaml \
  checkpoints/m1_85M_lr1e-3_steps100k_bs32_seqlen512_python \
  model.dim=768 model.n_layers=12 model.n_heads=12 \
  optim.lr=1e-3 steps=100000 data.batch_size=32 data.seq_len=512
run_train apps/main/configs/m1_200M_full.yaml \
  checkpoints/m1_85M_lr1e-3_steps100k_bs8_seqlen2048_full \
  model.dim=768 model.n_layers=12 model.n_heads=12 \
  optim.lr=1e-3 steps=100000 data.batch_size=8 data.seq_len=2048
# Disabled: 85M full-corpus bs32/seqlen512 variant.
# run_train apps/main/configs/m1_200M_full.yaml \
#   checkpoints/m1_85M_lr1e-3_steps100k_bs32_seqlen512_full \
#   model.dim=768 model.n_layers=12 model.n_heads=12 \
#   optim.lr=1e-3 steps=100000 data.batch_size=32 data.seq_len=512
##################################################
# ---- 40M models (dim=512, 12 layers, 8 heads), 400k steps --------------------
run_train apps/main/configs/m1_200M_python.yaml \
  checkpoints/m1_40M_lr1e-3_steps400k_bs16_seqlen2048_python \
  model.dim=512 model.n_layers=12 model.n_heads=8 \
  optim.lr=1e-3 steps=400000 data.batch_size=16 data.seq_len=2048
run_train apps/main/configs/m1_200M_python.yaml \
  checkpoints/m1_40M_lr1e-3_steps400k_bs64_seqlen512_python \
  model.dim=512 model.n_layers=12 model.n_heads=8 \
  optim.lr=1e-3 steps=400000 data.batch_size=64 data.seq_len=512
run_train apps/main/configs/m1_200M_full.yaml \
  checkpoints/m1_40M_lr1e-3_steps400k_bs16_seqlen2048_full \
  model.dim=512 model.n_layers=12 model.n_heads=8 \
  optim.lr=1e-3 steps=400000 data.batch_size=16 data.seq_len=2048
run_train apps/main/configs/m1_200M_full.yaml \
  checkpoints/m1_40M_lr1e-3_steps400k_bs64_seqlen512_full \
  model.dim=512 model.n_layers=12 model.n_heads=8 \
  optim.lr=1e-3 steps=400000 data.batch_size=64 data.seq_len=512
##################################################
# EXP_NAME=checkpoints/m1_40M_lr1e-3_steps200k_bs8_seqlen2048_python
# torchrun --nnodes=$ARNOLD_WORKER_NUM \
# --node_rank=$ARNOLD_ID \
# --nproc_per_node=$ARNOLD_WORKER_GPU \
# --master_addr=$ARNOLD_WORKER_0_HOST \
# --master_port=$(( ARNOLD_WORKER_0_PORT + 1 )) \
# -m apps.main.train config=apps/main/configs/m1_200M_python.yaml \
# dump_dir=$EXP_NAME \
# logging.wandb.name=$EXP_NAME name=$EXP_NAME \
# model.dim=512 model.n_layers=12 model.n_heads=8 \
# optim.lr=1e-3 steps=200000 data.batch_size=8 data.seq_len=2048
# EXP_NAME=checkpoints/m1_40M_lr1e-3_steps200k_bs32_seqlen512_python
# torchrun --nnodes=$ARNOLD_WORKER_NUM \
# --node_rank=$ARNOLD_ID \
# --nproc_per_node=$ARNOLD_WORKER_GPU \
# --master_addr=$ARNOLD_WORKER_0_HOST \
# --master_port=$(( ARNOLD_WORKER_0_PORT + 1 )) \
# -m apps.main.train config=apps/main/configs/m1_200M_python.yaml \
# dump_dir=$EXP_NAME \
# logging.wandb.name=$EXP_NAME name=$EXP_NAME \
# model.dim=512 model.n_layers=12 model.n_heads=8 \
# optim.lr=1e-3 steps=200000 data.batch_size=32 data.seq_len=512
# EXP_NAME=checkpoints/m1_40M_lr1e-3_steps200k_bs8_seqlen2048_full
# torchrun --nnodes=$ARNOLD_WORKER_NUM \
# --node_rank=$ARNOLD_ID \
# --nproc_per_node=$ARNOLD_WORKER_GPU \
# --master_addr=$ARNOLD_WORKER_0_HOST \
# --master_port=$(( ARNOLD_WORKER_0_PORT + 1 )) \
# -m apps.main.train config=apps/main/configs/m1_200M_full.yaml \
# dump_dir=$EXP_NAME \
# logging.wandb.name=$EXP_NAME name=$EXP_NAME \
# model.dim=512 model.n_layers=12 model.n_heads=8 \
# optim.lr=1e-3 steps=200000 data.batch_size=8 data.seq_len=2048
# EXP_NAME=checkpoints/m1_40M_lr1e-3_steps200k_bs32_seqlen512_full
# torchrun --nnodes=$ARNOLD_WORKER_NUM \
# --node_rank=$ARNOLD_ID \
# --nproc_per_node=$ARNOLD_WORKER_GPU \
# --master_addr=$ARNOLD_WORKER_0_HOST \
# --master_port=$(( ARNOLD_WORKER_0_PORT + 1 )) \
# -m apps.main.train config=apps/main/configs/m1_200M_full.yaml \
# dump_dir=$EXP_NAME \
# logging.wandb.name=$EXP_NAME name=$EXP_NAME \
# model.dim=512 model.n_layers=12 model.n_heads=8 \
# optim.lr=1e-3 steps=200000 data.batch_size=32 data.seq_len=512
##################################################
# EXP_NAME=checkpoints/m1_200M_lr1e-3_steps100k_bs8_seqlen2048_python
# torchrun --nnodes=$ARNOLD_WORKER_NUM \
# --node_rank=$ARNOLD_ID \
# --nproc_per_node=$ARNOLD_WORKER_GPU \
# --master_addr=$ARNOLD_WORKER_0_HOST \
# --master_port=$(( ARNOLD_WORKER_0_PORT + 1 )) \
# -m apps.main.train config=apps/main/configs/m1_200M_python.yaml \
# dump_dir=$EXP_NAME \
# logging.wandb.name=$EXP_NAME name=$EXP_NAME \
# optim.lr=1e-3 steps=100000 data.batch_size=8 data.seq_len=2048
# EXP_NAME=checkpoints/m1_200M_lr1e-3_steps100k_bs32_seqlen512_python
# torchrun --nnodes=$ARNOLD_WORKER_NUM \
# --node_rank=$ARNOLD_ID \
# --nproc_per_node=$ARNOLD_WORKER_GPU \
# --master_addr=$ARNOLD_WORKER_0_HOST \
# --master_port=$(( ARNOLD_WORKER_0_PORT + 1 )) \
# -m apps.main.train config=apps/main/configs/m1_200M_python.yaml \
# dump_dir=$EXP_NAME \
# logging.wandb.name=$EXP_NAME name=$EXP_NAME \
# optim.lr=1e-3 steps=100000 data.batch_size=32 data.seq_len=512
# EXP_NAME=checkpoints/m1_200M_lr1e-3_steps100k_bs8_seqlen2048_full
# torchrun --nnodes=$ARNOLD_WORKER_NUM \
# --node_rank=$ARNOLD_ID \
# --nproc_per_node=$ARNOLD_WORKER_GPU \
# --master_addr=$ARNOLD_WORKER_0_HOST \
# --master_port=$(( ARNOLD_WORKER_0_PORT + 1 )) \
# -m apps.main.train config=apps/main/configs/m1_200M_full.yaml \
# dump_dir=$EXP_NAME \
# logging.wandb.name=$EXP_NAME name=$EXP_NAME \
# optim.lr=1e-3 steps=100000 data.batch_size=8 data.seq_len=2048
# EXP_NAME=checkpoints/m1_200M_lr1e-3_steps100k_bs32_seqlen512_full
# torchrun --nnodes=$ARNOLD_WORKER_NUM \
# --node_rank=$ARNOLD_ID \
# --nproc_per_node=$ARNOLD_WORKER_GPU \
# --master_addr=$ARNOLD_WORKER_0_HOST \
# --master_port=$(( ARNOLD_WORKER_0_PORT + 1 )) \
# -m apps.main.train config=apps/main/configs/m1_200M_full.yaml \
# dump_dir=$EXP_NAME \
# logging.wandb.name=$EXP_NAME name=$EXP_NAME \
# optim.lr=1e-3 steps=100000 data.batch_size=32 data.seq_len=512
##################################################
# EXP_NAME=m1_6M_lr3e-2_steps50k_bs32_seqlen2048
# torchrun --nnodes=$ARNOLD_WORKER_NUM \
# --node_rank=$ARNOLD_ID \
# --nproc_per_node=$ARNOLD_WORKER_GPU \
# --master_addr=$ARNOLD_WORKER_0_HOST \
# --master_port=$(( ARNOLD_WORKER_0_PORT + 1 )) \
# -m apps.main.train config=apps/main/configs/m1_6M.yaml \
# dump_dir=/mnt/bn/tiktok-mm-5/aiic/users/xinyu/m1/checkpoints/$EXP_NAME \
# logging.wandb.name=$EXP_NAME name=$EXP_NAME \
# optim.lr=3e-2 steps=50000 data.batch_size=8 data.seq_len=1024
# EXP_NAME=m1_6M_lr5e-2_steps10k_bs32_seqlen2048
# torchrun --nnodes=$ARNOLD_WORKER_NUM \
# --node_rank=$ARNOLD_ID \
# --nproc_per_node=$ARNOLD_WORKER_GPU \
# --master_addr=$ARNOLD_WORKER_0_HOST \
# --master_port=$(( ARNOLD_WORKER_0_PORT + 1 )) \
# -m apps.main.train config=apps/main/configs/m1_6M.yaml \
# dump_dir=/mnt/bn/tiktok-mm-5/aiic/users/linzheng/artifacts/m1_checkpoints/$EXP_NAME \
# logging.wandb.name=$EXP_NAME name=$EXP_NAME \
# optim.lr=5e-2 steps=50000 data.batch_size=32 data.seq_len=2048
# EXP_NAME=m1_6M_lr5e-2_steps50k_bs32_seqlen2048
# torchrun --nnodes=$ARNOLD_WORKER_NUM \
# --node_rank=$ARNOLD_ID \
# --nproc_per_node=$ARNOLD_WORKER_GPU \
# --master_addr=$ARNOLD_WORKER_0_HOST \
# --master_port=$(( ARNOLD_WORKER_0_PORT + 1 )) \
# -m apps.main.train config=apps/main/configs/m1_6M.yaml \
# dump_dir=/mnt/bn/tiktok-mm-5/aiic/users/linzheng/artifacts/m1_checkpoints/ \
# logging.wandb.name=$EXP_NAME name=$EXP_NAME \
# optim.lr=5e-2 steps=50000 data.batch_size=32 data.seq_len=2048
# EXP_NAME=m1_6M_lr5e-2_steps50k_bs512_seqlen128
# torchrun --nnodes=$ARNOLD_WORKER_NUM \
# --node_rank=$ARNOLD_ID \
# --nproc_per_node=$ARNOLD_WORKER_GPU \
# --master_addr=$ARNOLD_WORKER_0_HOST \
# --master_port=$(( ARNOLD_WORKER_0_PORT + 1 )) \
# -m apps.main.train config=apps/main/configs/m1_6M.yaml \
# dump_dir=/mnt/bn/tiktok-mm-5/aiic/users/linzheng/artifacts/m1_checkpoints/ \
# logging.wandb.name=$EXP_NAME name=$EXP_NAME \
# optim.lr=5e-2 steps=50000 data.batch_size=512 data.seq_len=128
# EXP_NAME=m1_6M_lr1e-2_steps50k_bs128_seqlen512
# torchrun --nnodes=$ARNOLD_WORKER_NUM \
# --node_rank=$ARNOLD_ID \
# --nproc_per_node=$ARNOLD_WORKER_GPU \
# --master_addr=$ARNOLD_WORKER_0_HOST \
# --master_port=$(( ARNOLD_WORKER_0_PORT + 1 )) \
# -m apps.main.train config=apps/main/configs/m1_6M.yaml \
# dump_dir=/mnt/bn/tiktok-mm-5/aiic/users/linzheng/artifacts/m1_checkpoints/$EXP_NAME \
# logging.wandb.name=$EXP_NAME name=$EXP_NAME \
# optim.lr=1e-2 steps=50000 data.batch_size=128 data.seq_len=512
# EXP_NAME=m1_6M_lr3e-2_steps50k_bs128_seqlen512
# torchrun --nnodes=$ARNOLD_WORKER_NUM \
# --node_rank=$ARNOLD_ID \
# --nproc_per_node=$ARNOLD_WORKER_GPU \
# --master_addr=$ARNOLD_WORKER_0_HOST \
# --master_port=$(( ARNOLD_WORKER_0_PORT + 1 )) \
# -m apps.main.train config=apps/main/configs/m1_6M.yaml \
# dump_dir=/mnt/bn/tiktok-mm-5/aiic/users/linzheng/artifacts/m1_checkpoints/$EXP_NAME \
# logging.wandb.name=$EXP_NAME name=$EXP_NAME \
# optim.lr=3e-2 steps=50000 data.batch_size=128 data.seq_len=512
# EXP_NAME=m1_6M_lr5e-2_steps50k_bs128_seqlen512
# torchrun --nnodes=$ARNOLD_WORKER_NUM \
# --node_rank=$ARNOLD_ID \
# --nproc_per_node=$ARNOLD_WORKER_GPU \
# --master_addr=$ARNOLD_WORKER_0_HOST \
# --master_port=$(( ARNOLD_WORKER_0_PORT + 1 )) \
# -m apps.main.train config=apps/main/configs/m1_6M.yaml \
# dump_dir=/mnt/bn/tiktok-mm-5/aiic/users/linzheng/artifacts/m1_checkpoints/$EXP_NAME \
# logging.wandb.name=$EXP_NAME name=$EXP_NAME \
# optim.lr=5e-2 steps=50000 data.batch_size=128 data.seq_len=512