#!/bin/bash
# Listing metadata from the file viewer: file size 3,061 Bytes, d7b3a74
# Tear down leftovers from a previous run so the task can be rerun.
# These commands intentionally run BEFORE `set -e`: pkill exits non-zero when
# no process matches, and that must not abort the script.
# CAUTION: `pkill -9 python` SIGKILLs every process whose name matches
# "python" that this user may signal, not only the ones started by this script.
pkill -9 sglang
sleep 3
ray stop --force
pkill -9 ray
pkill -9 python
sleep 3
# Second pass after the pause (presumably to catch processes that were
# respawned or slow to exit — confirm).
pkill -9 ray
pkill -9 python

# From here on, stop at the first failing command and trace every command.
set -ex

# PYTHONBUFFERED is not a variable the Python interpreter reads; it is kept
# only in case something else in the stack looks at it. The documented
# variable for unbuffered stdout/stderr is PYTHONUNBUFFERED (any non-empty
# value enables it), so export that as well.
export PYTHONBUFFERED=16
export PYTHONUNBUFFERED=1
# Model flags for DeepSeek-R1-Distill-Qwen-7B.
# NOTE(review): the numbers presumably mirror the checkpoint's config.json —
# re-check them when pointing the script at a different model.
MODEL_ARGS=(
  # MLP activation and layer/width sizes
  --swiglu
  --num-layers 28
  --hidden-size 3584
  --ffn-hidden-size 18944

  # attention head layout
  --num-attention-heads 28
  --group-query-attention
  --num-query-groups 4

  # sequence / position settings
  --max-position-embeddings 131072
  --seq-length 4096
  --use-rotary-position-embeddings

  # bias, normalization, rotary base, vocabulary
  --disable-bias-linear
  --add-qkv-bias
  --normalization RMSNorm
  --norm-epsilon 1e-06
  --rotary-base 10000
  --vocab-size 152064

  # numerics and backend selection
  --accumulate-allreduce-grads-in-fp32
  --attention-softmax-in-fp32
  --attention-backend flash
  --moe-token-dispatcher-type alltoall
  --untie-embeddings-and-output-weights

  # dropout disabled
  --attention-dropout 0.0
  --hidden-dropout 0.0
)
# Checkpoint locations and save cadence. All paths are absolute under /root
# and must exist on the machine that runs the job.
CKPT_ARGS=(
  # inputs: HF-format checkpoint and the reference checkpoint to load
  --hf-checkpoint /root/DeepSeek-R1-Distill-Qwen-7B
  --ref-load /root/DeepSeek-R1-Distill-Qwen-7B_torch_dist

  # outputs: save interval and destination directory
  --save-interval 100
  --save /root/DeepSeek-R1-Distill-Qwen-7B_slime
)
# Rollout (sample generation) and batching flags.
# With the values below, 128 prompts x 8 samples per prompt = 1024, which
# equals --global-batch-size.
ROLLOUT_ARGS=(
  # rollout function and reward type
  --rollout-function-path slime_plugins.rollout_buffer.rollout_buffer_example.generate_rollout
  --rm-type deepscaler

  # prompt dataset and the JSONL keys read from it
  --prompt-data /root/dapo-math-17k/dapo-math-17k.jsonl
  --input-key prompt
  --label-key label

  # rollout count, size, and sampling settings
  --num-rollout 3000
  --rollout-batch-size 128
  --rollout-max-response-len 8192
  --rollout-temperature 0.8
  --rollout-shuffle
  --n-samples-per-prompt 8

  # training-side batch sizes
  --global-batch-size 1024
  --micro-batch-size 8
  --ref-micro-batch-size 8
  --use-dynamic-batch-size
  --max-tokens-per-gpu 9216
  --balance-data
)
# Parallelism layout: tensor-parallel size 2, pipeline- and context-parallel
# size 1, with the sequence-parallel flag turned on.
DISTRIBUTED_ARGS=(
  --tensor-model-parallel-size 2
  --pipeline-model-parallel-size 1
  --context-parallel-size 1
  --sequence-parallel
)
# Activation-recompute flags: granularity "full", method "uniform",
# one layer per recompute unit.
PERF_ARGS=(
  --recompute-granularity full
  --recompute-method uniform
  --recompute-num-layers 1
)
# Algorithm flags: GRPO advantage estimator, a KL loss term
# (coefficient 0.001, type low_var_kl), and an entropy coefficient of 0.00.
GRPO_ARGS=(
  --advantage-estimator grpo

  # KL loss term
  --use-kl-loss
  --kl-loss-coef 0.001
  --kl-loss-type low_var_kl

  # entropy coefficient
  --entropy-coef 0.00
)
# Optimizer flags: constant learning rate of 1e-6, weight decay 0.1,
# Adam betas 0.9 / 0.98.
OPTIMIZER_ARGS=(
  # learning-rate schedule
  --lr 1e-6
  --lr-decay-style constant

  # regularization and Adam moments
  --weight-decay 0.1
  --adam-beta1 0.9
  --adam-beta2 0.98
)
# Optional wandb flags. The array is empty as shipped; uncomment the line
# below to have --use-wandb passed to the training command.
WANDB_ARGS=(
  # --use-wandb
)
# Start the Ray head node inside the container, then submit the training job.
#
# MASTER_ADDR may be provided by the caller; it falls back to loopback.
# The 8 GPUs registered with Ray match the job's request below:
# 1 actor node x 4 GPUs plus 4 rollout GPUs (one GPU per rollout engine).
#
# NOTE(review): the job is always submitted to http://127.0.0.1:8265, even
# when MASTER_ADDR is overridden — presumably because this script is only run
# on the head node itself; confirm before using it from another host.
#
# All expansions are quoted so that each array element reaches train_async.py
# as exactly one argument (no re-splitting or globbing), and an empty array
# such as WANDB_ARGS contributes no argument at all.
export MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"}
ray start --head --node-ip-address "${MASTER_ADDR}" --num-gpus 8 --disable-usage-stats
ray job submit --address="http://127.0.0.1:8265" \
--runtime-env-json='{
"env_vars": {
"PYTHONPATH": "/root/Megatron-LM/",
"CUDA_DEVICE_MAX_CONNECTIONS": "1",
"NCCL_CUMEM_ENABLE": "0"
}
}' \
-- python3 train_async.py \
--actor-num-nodes 1 \
--actor-num-gpus-per-node 4 \
--rollout-num-gpus 4 \
--rollout-num-gpus-per-engine 1 \
"${MODEL_ARGS[@]}" \
"${CKPT_ARGS[@]}" \
"${ROLLOUT_ARGS[@]}" \
"${OPTIMIZER_ARGS[@]}" \
"${GRPO_ARGS[@]}" \
"${DISTRIBUTED_ARGS[@]}" \
"${WANDB_ARGS[@]}" \
"${PERF_ARGS[@]}" \
--rollout-buffer-url "http://${MASTER_ADDR}:8889" \
--keep-old-actor \
--disable-rewards-normalization \
--loss-mask-type distill_qwen \
--log-passrate
# End of script.