#!/bin/bash
# Clean up any leftover processes from a previous run so the task can be rerun.
pkill -9 sglang
sleep 3
ray stop --force
pkill -9 ray
pkill -9 python
sleep 3
pkill -9 ray
pkill -9 python
set -ex
export PYTHONUNBUFFERED=1  # any non-empty value disables Python's output buffering
# DeepSeek-R1-Distill-Qwen-7B
MODEL_ARGS=(
--swiglu
--num-layers 28
--hidden-size 3584
--ffn-hidden-size 18944
--num-attention-heads 28
--group-query-attention
--num-query-groups 4
--max-position-embeddings 131072
--seq-length 4096
--use-rotary-position-embeddings
--disable-bias-linear
--add-qkv-bias
--normalization "RMSNorm"
--norm-epsilon 1e-06
--rotary-base 10000
--vocab-size 152064
--accumulate-allreduce-grads-in-fp32
--attention-softmax-in-fp32
--attention-backend flash
--moe-token-dispatcher-type alltoall
--untie-embeddings-and-output-weights
--attention-dropout 0.0
--hidden-dropout 0.0
)
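# Sanity arithmetic for the geometry above (derived from the flags alone):
# head_dim = 3584 / 28 = 128, and grouped-query attention shares each of the
# 4 KV groups across 28 / 4 = 7 query heads.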
CKPT_ARGS=(
--hf-checkpoint /root/DeepSeek-R1-Distill-Qwen-7B
--ref-load /root/DeepSeek-R1-Distill-Qwen-7B_torch_dist
--save-interval 100
--save /root/DeepSeek-R1-Distill-Qwen-7B_slime
)
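# The --ref-load path above is a Megatron torch_dist checkpoint converted from
# the HF weights. A minimal sketch of that conversion, assuming slime's
# tools/convert_hf_to_torch_dist.py converter (the script path is an
# assumption; this is not part of the run itself):
#   PYTHONPATH=/root/Megatron-LM python tools/convert_hf_to_torch_dist.py \
#     ${MODEL_ARGS[@]} \
#     --hf-checkpoint /root/DeepSeek-R1-Distill-Qwen-7B \
#     --save /root/DeepSeek-R1-Distill-Qwen-7B_torch_dist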
ROLLOUT_ARGS=(
--rollout-function-path slime_plugins.rollout_buffer.rollout_buffer_example.generate_rollout
--rm-type deepscaler
--prompt-data /root/dapo-math-17k/dapo-math-17k.jsonl
--input-key prompt
--label-key label
--num-rollout 3000
--rollout-batch-size 128
--rollout-max-response-len 8192
--rollout-temperature 0.8
--rollout-shuffle
--n-samples-per-prompt 8
--global-batch-size 1024
--micro-batch-size 8
--ref-micro-batch-size 8
--use-dynamic-batch-size
--max-tokens-per-gpu 9216
--balance-data
)
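# Batch bookkeeping implied by the flags above: each rollout step draws
# 128 prompts x 8 samples = 1024 trajectories, exactly one
# --global-batch-size, so every rollout step feeds one optimizer step. With
# --use-dynamic-batch-size, micro-batches are packed by token count up to
# --max-tokens-per-gpu 9216 rather than the fixed --micro-batch-size 8.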
DISTRIBUTED_ARGS=(
--tensor-model-parallel-size 2
--pipeline-model-parallel-size 1
--context-parallel-size 1
--sequence-parallel
)
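# With the 4 actor GPUs requested below and TP=2, PP=1, CP=1, training runs
# 4 / (2*1*1) = 2 data-parallel replicas; --sequence-parallel additionally
# shards LayerNorm/dropout activations across each TP=2 group.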
PERF_ARGS=(
--recompute-granularity full
--recompute-method uniform
--recompute-num-layers 1
)
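# Full recomputation in uniform one-layer chunks: activations are dropped in
# the forward pass and each transformer layer is recomputed during backward,
# trading roughly one extra forward pass of compute for minimal activation
# memory.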
GRPO_ARGS=(
--advantage-estimator grpo
--use-kl-loss
--kl-loss-coef 0.001
--kl-loss-type low_var_kl
--entropy-coef 0.00
)
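# --kl-loss-type low_var_kl typically refers to the low-variance "k3" KL
# estimator (http://joschu.net/blog/kl-approx.html); that mapping is an
# assumption about slime's naming. With d = logprob_ref - logprob:
#   kl ~= exp(d) - d - 1
# which is non-negative and unbiased for KL(pi || pi_ref).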
OPTIMIZER_ARGS=(
--lr 1e-6
--lr-decay-style constant
--weight-decay 0.1
--adam-beta1 0.9
--adam-beta2 0.98
)
WANDB_ARGS=(
# Uncomment to enable Weights & Biases logging.
# --use-wandb
)
# Launch the Ray head node inside the container.
export MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"}
ray start --head --node-ip-address ${MASTER_ADDR} --num-gpus 8 --disable-usage-stats
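# This script runs single-node (8 GPUs: 4 actor + 4 rollout). On a multi-node
# cluster, additional workers would join the head before the job submit, e.g.:
#   ray start --address=${MASTER_ADDR}:6379 --num-gpus 8 --disable-usage-stats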
ray job submit --address="http://127.0.0.1:8265" \
--runtime-env-json='{
"env_vars": {
"PYTHONPATH": "/root/Megatron-LM/",
"CUDA_DEVICE_MAX_CONNECTIONS": "1",
"NCCL_CUMEM_ENABLE": "0"
}
}' \
-- python3 train_async.py \
--actor-num-nodes 1 \
--actor-num-gpus-per-node 4 \
--rollout-num-gpus 4 \
--rollout-num-gpus-per-engine 1 \
"${MODEL_ARGS[@]}" \
"${CKPT_ARGS[@]}" \
"${ROLLOUT_ARGS[@]}" \
"${OPTIMIZER_ARGS[@]}" \
"${GRPO_ARGS[@]}" \
"${DISTRIBUTED_ARGS[@]}" \
"${WANDB_ARGS[@]}" \
"${PERF_ARGS[@]}" \
--rollout-buffer-url http://${MASTER_ADDR}:8889 \
--keep-old-actor \
--disable-rewards-normalization \
--loss-mask-type distill_qwen \
--log-passrate
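# Note: train_async.py exchanges trajectories with the rollout buffer service
# expected at http://${MASTER_ADDR}:8889 (--rollout-buffer-url above), which
# must be started separately. A sketch, assuming slime's bundled
# rollout_buffer example (the path and entrypoint are assumptions):
#   (cd /root/slime/slime_plugins/rollout_buffer && python buffer.py)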