Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- code/RL_model/verl/verl_train/examples/cispo_trainer/run_cispo_qwen2_5_0_5b_gsm8k.sh +51 -0
- code/RL_model/verl/verl_train/examples/generation/run_deepseek7b_mutli_node.sh +22 -0
- code/RL_model/verl/verl_train/examples/generation/run_deepseek_v2_lite_math.sh +22 -0
- code/RL_model/verl/verl_train/examples/gpg_trainer/gpg.md +34 -0
- code/RL_model/verl/verl_train/examples/gpg_trainer/run_qwen2-7b_math.sh +52 -0
- code/RL_model/verl/verl_train/examples/gpg_trainer/run_qwen2-7b_math_megatron.sh +53 -0
- code/RL_model/verl/verl_train/examples/grpo_trainer/README.md +70 -0
- code/RL_model/verl/verl_train/examples/grpo_trainer/run_deepseek671b_math_megatron_80gb.sh +118 -0
- code/RL_model/verl/verl_train/examples/grpo_trainer/run_deepseek671b_math_megatron_96gb.sh +179 -0
- code/RL_model/verl/verl_train/examples/grpo_trainer/run_deepseek7b_llm.sh +40 -0
- code/RL_model/verl/verl_train/examples/grpo_trainer/run_deepseek7b_llm_math.sh +49 -0
- code/RL_model/verl/verl_train/examples/grpo_trainer/run_deepseek7b_llm_seq_balance.sh +39 -0
- code/RL_model/verl/verl_train/examples/grpo_trainer/run_gptoss_20b.sh +79 -0
- code/RL_model/verl/verl_train/examples/grpo_trainer/run_mistral13b_skyworkrm_hhrlhf.sh +54 -0
- code/RL_model/verl/verl_train/examples/grpo_trainer/run_moonlight16b_math_megatron.sh +58 -0
- code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2-32b_sglang_fsdp_npu.sh +182 -0
- code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2-7b.sh +41 -0
- code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2-7b_math.sh +49 -0
- code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2-7b_math_megatron.sh +59 -0
- code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2-7b_math_megatron_lora.sh +122 -0
- code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2-7b_math_megatron_trtllm.sh +91 -0
- code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2-7b_math_trtllm.sh +89 -0
- code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2-7b_seq_balance.sh +52 -0
- code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2-7b_seq_balance_math_megatron.sh +57 -0
- code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2_5-3b_gsm8k_grpo_lora.sh +51 -0
- code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2_5-7b_math_megatron_diff_tp.sh +50 -0
- code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2_5_32b_grpo_npu.sh +40 -0
- code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2_5_7b_grpo_discrete_prof_npu.sh +71 -0
- code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2_5_7b_grpo_npu.sh +41 -0
- code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2_5_vl-7b-megatron.sh +88 -0
- code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2_5_vl-7b-sglang.sh +53 -0
- code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2_5_vl-7b_freeze_vision.sh +47 -0
- code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2_5_vl-7b_lora.sh +52 -0
- code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2_5_vl-7b_seq_balance.sh +45 -0
- code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2_5_vl_32b_npu.sh +51 -0
- code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2_5_vl_3b_npu.sh +52 -0
- code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2_5_vl_7b_npu.sh +51 -0
- code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen3-235b_megatron_96gb.sh +181 -0
- code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen3-8b_npu.sh +58 -0
- code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen3_4b_grpo_vllm_1k_npu.sh +81 -0
- code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen3_8b_grpo_sglang_32k_spmd_npu.sh +71 -0
- code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen3_vl-235b-megatron.sh +84 -0
- code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen3_vl-30b-megatron.sh +85 -0
- code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen3moe-30b_megatron_96gb.sh +195 -0
- code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen3moe-30b_megatron_lora.sh +133 -0
- code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen3moe-30b_sglang_megatron_npu.sh +236 -0
- code/RL_model/verl/verl_train/examples/grpo_trainer/run_seed_oss_36b.sh +48 -0
- code/RL_model/verl/verl_train/examples/gspo_trainer/run_qwen30b_gspo.sh +197 -0
- code/RL_model/verl/verl_train/examples/gspo_trainer/run_qwen3_32b_gspo_npu.sh +199 -0
- code/RL_model/verl/verl_train/examples/gspo_trainer/test_gspo_3b_math.sh +195 -0
code/RL_model/verl/verl_train/examples/cispo_trainer/run_cispo_qwen2_5_0_5b_gsm8k.sh
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
set -x
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
gsm8k_train_path=$HOME/data/gsm8k/train.parquet
|
| 5 |
+
gsm8k_test_path=$HOME/data/gsm8k/test.parquet
|
| 6 |
+
|
| 7 |
+
train_files="['$gsm8k_train_path']"
|
| 8 |
+
test_files="['$gsm8k_test_path']"
|
| 9 |
+
|
| 10 |
+
python3 -m verl.trainer.main_ppo \
|
| 11 |
+
algorithm.adv_estimator=grpo \
|
| 12 |
+
actor_rollout_ref.actor.policy_loss.loss_mode=cispo \
|
| 13 |
+
actor_rollout_ref.actor.clip_ratio_low=10 \
|
| 14 |
+
actor_rollout_ref.actor.clip_ratio_high=0.2 \
|
| 15 |
+
data.train_files="$train_files" \
|
| 16 |
+
data.val_files="$test_files" \
|
| 17 |
+
data.train_batch_size=256 \
|
| 18 |
+
data.max_prompt_length=1024 \
|
| 19 |
+
data.max_response_length=1024 \
|
| 20 |
+
data.filter_overlong_prompts=True \
|
| 21 |
+
data.truncation='error' \
|
| 22 |
+
actor_rollout_ref.model.path=Qwen/Qwen2.5-0.5B-Instruct \
|
| 23 |
+
actor_rollout_ref.model.torch_dtype=bfloat16 \
|
| 24 |
+
actor_rollout_ref.actor.optim.lr=1e-6 \
|
| 25 |
+
actor_rollout_ref.model.use_remove_padding=True \
|
| 26 |
+
actor_rollout_ref.actor.ppo_mini_batch_size=128 \
|
| 27 |
+
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \
|
| 28 |
+
actor_rollout_ref.actor.use_kl_loss=True \
|
| 29 |
+
actor_rollout_ref.actor.kl_loss_coef=0.001 \
|
| 30 |
+
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
|
| 31 |
+
actor_rollout_ref.actor.entropy_coeff=0 \
|
| 32 |
+
actor_rollout_ref.model.enable_gradient_checkpointing=True \
|
| 33 |
+
actor_rollout_ref.actor.fsdp_config.param_offload=False \
|
| 34 |
+
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
|
| 35 |
+
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \
|
| 36 |
+
actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
|
| 37 |
+
actor_rollout_ref.rollout.name=vllm \
|
| 38 |
+
actor_rollout_ref.rollout.gpu_memory_utilization=0.8 \
|
| 39 |
+
actor_rollout_ref.rollout.n=5 \
|
| 40 |
+
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \
|
| 41 |
+
actor_rollout_ref.ref.fsdp_config.param_offload=True \
|
| 42 |
+
algorithm.use_kl_in_reward=False \
|
| 43 |
+
trainer.critic_warmup=0 \
|
| 44 |
+
trainer.logger='["console","wandb"]' \
|
| 45 |
+
trainer.project_name='verl_cispo_example_gsm8k' \
|
| 46 |
+
trainer.experiment_name='qwen2_5_0_5b_cispo' \
|
| 47 |
+
trainer.n_gpus_per_node=1 \
|
| 48 |
+
trainer.nnodes=1 \
|
| 49 |
+
trainer.save_freq=5 \
|
| 50 |
+
trainer.test_freq=5 \
|
| 51 |
+
trainer.total_epochs=3 $@
|
code/RL_model/verl/verl_train/examples/generation/run_deepseek7b_mutli_node.sh
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
set -x
|
| 2 |
+
|
| 3 |
+
data_path=$HOME/data/rlhf/gsm8k/test.parquet
|
| 4 |
+
save_path=$HOME/data/rlhf/math/deepseek_v2_lite_gen_test.parquet
|
| 5 |
+
model_path=deepseek-ai/deepseek-llm-7b-chat
|
| 6 |
+
|
| 7 |
+
python3 -m verl.trainer.main_generation \
|
| 8 |
+
trainer.nnodes=2 \
|
| 9 |
+
trainer.n_gpus_per_node=8 \
|
| 10 |
+
data.path=$data_path \
|
| 11 |
+
data.prompt_key=prompt \
|
| 12 |
+
data.n_samples=1 \
|
| 13 |
+
data.output_path=$save_path \
|
| 14 |
+
model.path=$model_path\
|
| 15 |
+
+model.trust_remote_code=True \
|
| 16 |
+
rollout.temperature=1.0 \
|
| 17 |
+
rollout.top_k=50 \
|
| 18 |
+
rollout.top_p=0.7 \
|
| 19 |
+
rollout.prompt_length=2048 \
|
| 20 |
+
rollout.response_length=1024 \
|
| 21 |
+
rollout.tensor_model_parallel_size=16 \
|
| 22 |
+
rollout.gpu_memory_utilization=0.8
|
code/RL_model/verl/verl_train/examples/generation/run_deepseek_v2_lite_math.sh
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
set -x
|
| 2 |
+
|
| 3 |
+
data_path=$HOME/data/gsm8k/test.parquet
|
| 4 |
+
save_path=$HOME/data/gsm8k/deepseek_v2_lite_gen_test.parquet
|
| 5 |
+
model_path=deepseek-ai/deepseek-llm-7b-chat
|
| 6 |
+
|
| 7 |
+
python3 -m verl.trainer.main_generation \
|
| 8 |
+
trainer.nnodes=1 \
|
| 9 |
+
trainer.n_gpus_per_node=8 \
|
| 10 |
+
data.path=$data_path \
|
| 11 |
+
data.prompt_key=prompt \
|
| 12 |
+
data.n_samples=1 \
|
| 13 |
+
data.output_path=$save_path \
|
| 14 |
+
model.path=$model_path \
|
| 15 |
+
+model.trust_remote_code=True \
|
| 16 |
+
rollout.temperature=1.0 \
|
| 17 |
+
rollout.top_k=50 \
|
| 18 |
+
rollout.top_p=0.7 \
|
| 19 |
+
rollout.prompt_length=2048 \
|
| 20 |
+
rollout.response_length=1024 \
|
| 21 |
+
rollout.tensor_model_parallel_size=2 \
|
| 22 |
+
rollout.gpu_memory_utilization=0.8
|
code/RL_model/verl/verl_train/examples/gpg_trainer/gpg.md
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# GPG: Group Policy Gradient
|
| 2 |
+
|
| 3 |
+
Group Policy Gradient (GPG) is a minimalist reinforcement learning (RL) method that enhances the reasoning ability of large language models without relying on supervised fine-tuning or complex tricks. GPG revisits traditional policy gradients and directly optimizes the RL objective—no surrogate losses, no KL penalties, no critic, and no reference model. Compared to GRPO, GPG is simpler, more efficient, and achieves better results on many tasks. For more details, please refer to the original paper [GPG: A Simple and Strong Reinforcement Learning Baseline for Model Reasoning
|
| 4 |
+
](https://arxiv.org/abs/2504.02546).
|
| 5 |
+
|
| 6 |
+
## Key Components
|
| 7 |
+
- Use a corrected advantage function to improve policy gradient accuracy and training efficiency.
|
| 8 |
+
- By eliminating the critic and reference models, avoiding KL divergence constraints, significantly simplifies the training process compared to Group Relative Policy Optimization (GRPO)
|
| 9 |
+
|
| 10 |
+
## Configuration
|
| 11 |
+
To configure GPG within the framework, use the following YAML settings.
|
| 12 |
+
|
| 13 |
+
```yaml
|
| 14 |
+
algorithm:
|
| 15 |
+
adv_estimator: gpg
|
| 16 |
+
actor_rollout_ref:
|
| 17 |
+
actor:
|
| 18 |
+
policy_loss:
|
| 19 |
+
loss_mode: "gpg"
|
| 20 |
+
```
|
| 21 |
+
|
| 22 |
+
## Advanced Extensions
|
| 23 |
+
GPG is a simple and strong baseline for model reasoning. Although it avoids using KL loss in its original form, you can still use KL loss to further improve the performance.
|
| 24 |
+
|
| 25 |
+
```yaml
|
| 26 |
+
algorithm:
|
| 27 |
+
adv_estimator: gpg
|
| 28 |
+
actor_rollout_ref:
|
| 29 |
+
actor:
|
| 30 |
+
use_kl_loss: True # enable kl regularization
|
| 31 |
+
kl_loss_coef: 0.01
|
| 32 |
+
policy_loss:
|
| 33 |
+
loss_mode: "gpg"
|
| 34 |
+
```
|
code/RL_model/verl/verl_train/examples/gpg_trainer/run_qwen2-7b_math.sh
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
set -x
|
| 2 |
+
|
| 3 |
+
# If you are using vllm<=0.6.3, you might need to set the following environment variable to avoid bugs:
|
| 4 |
+
# export VLLM_ATTENTION_BACKEND=XFORMERS
|
| 5 |
+
|
| 6 |
+
gsm8k_train_path=$HOME/data/gsm8k/train.parquet
|
| 7 |
+
gsm8k_test_path=$HOME/data/gsm8k/test.parquet
|
| 8 |
+
math_train_path=$HOME/data/math/train.parquet
|
| 9 |
+
math_test_path=$HOME/data/math/test.parquet
|
| 10 |
+
|
| 11 |
+
train_files="['$gsm8k_train_path', '$math_train_path']"
|
| 12 |
+
test_files="['$gsm8k_test_path', '$math_test_path']"
|
| 13 |
+
|
| 14 |
+
python3 -m verl.trainer.main_ppo \
|
| 15 |
+
algorithm.adv_estimator=gpg \
|
| 16 |
+
data.train_files="$train_files" \
|
| 17 |
+
data.val_files="$test_files" \
|
| 18 |
+
data.train_batch_size=1024 \
|
| 19 |
+
data.max_prompt_length=1024 \
|
| 20 |
+
data.max_response_length=1024 \
|
| 21 |
+
data.filter_overlong_prompts=True \
|
| 22 |
+
data.truncation='error' \
|
| 23 |
+
actor_rollout_ref.model.path=Qwen/Qwen2-7B-Instruct \
|
| 24 |
+
actor_rollout_ref.actor.optim.lr=1e-6 \
|
| 25 |
+
actor_rollout_ref.model.use_remove_padding=True \
|
| 26 |
+
actor_rollout_ref.actor.ppo_mini_batch_size=256 \
|
| 27 |
+
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \
|
| 28 |
+
actor_rollout_ref.actor.use_kl_loss=False \
|
| 29 |
+
actor_rollout_ref.actor.policy_loss.loss_mode=gpg \
|
| 30 |
+
actor_rollout_ref.actor.kl_loss_coef=0.001 \
|
| 31 |
+
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
|
| 32 |
+
actor_rollout_ref.actor.entropy_coeff=0 \
|
| 33 |
+
actor_rollout_ref.model.enable_gradient_checkpointing=True \
|
| 34 |
+
actor_rollout_ref.actor.fsdp_config.param_offload=False \
|
| 35 |
+
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
|
| 36 |
+
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \
|
| 37 |
+
actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
|
| 38 |
+
actor_rollout_ref.rollout.name=vllm \
|
| 39 |
+
actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
|
| 40 |
+
actor_rollout_ref.rollout.n=5 \
|
| 41 |
+
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \
|
| 42 |
+
actor_rollout_ref.ref.fsdp_config.param_offload=True \
|
| 43 |
+
algorithm.use_kl_in_reward=False \
|
| 44 |
+
trainer.critic_warmup=0 \
|
| 45 |
+
trainer.logger='["console","wandb"]' \
|
| 46 |
+
trainer.project_name='verl_gpg_example_gsm8k_math' \
|
| 47 |
+
trainer.experiment_name='qwen2_7b_function_rm' \
|
| 48 |
+
trainer.n_gpus_per_node=8 \
|
| 49 |
+
trainer.nnodes=1 \
|
| 50 |
+
trainer.save_freq=20 \
|
| 51 |
+
trainer.test_freq=5 \
|
| 52 |
+
trainer.total_epochs=15 $@
|
code/RL_model/verl/verl_train/examples/gpg_trainer/run_qwen2-7b_math_megatron.sh
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
set -x
|
| 2 |
+
|
| 3 |
+
# If you are using vllm<=0.6.3, you might need to set the following environment variable to avoid bugs:
|
| 4 |
+
# export VLLM_ATTENTION_BACKEND=XFORMERS
|
| 5 |
+
export CUDA_DEVICE_MAX_CONNECTIONS=1 # For megatron communication/computation overlapping
|
| 6 |
+
|
| 7 |
+
gsm8k_train_path=$HOME/data/gsm8k/train.parquet
|
| 8 |
+
gsm8k_test_path=$HOME/data/gsm8k/test.parquet
|
| 9 |
+
math_train_path=$HOME/data/math/train.parquet
|
| 10 |
+
math_test_path=$HOME/data/math/test.parquet
|
| 11 |
+
|
| 12 |
+
train_files="['$gsm8k_train_path', '$math_train_path']"
|
| 13 |
+
test_files="['$gsm8k_test_path', '$math_test_path']"
|
| 14 |
+
|
| 15 |
+
python3 -m verl.trainer.main_ppo --config-path=config \
|
| 16 |
+
--config-name='ppo_megatron_trainer.yaml'\
|
| 17 |
+
algorithm.adv_estimator=gpg \
|
| 18 |
+
data.train_files="$train_files" \
|
| 19 |
+
data.val_files="$test_files" \
|
| 20 |
+
data.train_batch_size=1024 \
|
| 21 |
+
data.max_prompt_length=1024 \
|
| 22 |
+
data.max_response_length=1024 \
|
| 23 |
+
data.filter_overlong_prompts=True \
|
| 24 |
+
data.truncation='error' \
|
| 25 |
+
actor_rollout_ref.model.path=Qwen/Qwen2-7B-Instruct \
|
| 26 |
+
actor_rollout_ref.actor.optim.lr=1e-6 \
|
| 27 |
+
actor_rollout_ref.actor.ppo_mini_batch_size=256 \
|
| 28 |
+
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
|
| 29 |
+
actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=2 \
|
| 30 |
+
actor_rollout_ref.actor.megatron.tensor_model_parallel_size=2 \
|
| 31 |
+
actor_rollout_ref.actor.policy_loss.loss_mode=gpg \
|
| 32 |
+
actor_rollout_ref.actor.use_kl_loss=False \
|
| 33 |
+
actor_rollout_ref.actor.kl_loss_coef=0.001 \
|
| 34 |
+
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
|
| 35 |
+
actor_rollout_ref.actor.entropy_coeff=0 \
|
| 36 |
+
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
|
| 37 |
+
actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
|
| 38 |
+
actor_rollout_ref.rollout.name=vllm \
|
| 39 |
+
actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
|
| 40 |
+
actor_rollout_ref.rollout.n=5 \
|
| 41 |
+
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
|
| 42 |
+
actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=2 \
|
| 43 |
+
actor_rollout_ref.ref.megatron.tensor_model_parallel_size=2 \
|
| 44 |
+
algorithm.use_kl_in_reward=False \
|
| 45 |
+
trainer.critic_warmup=0 \
|
| 46 |
+
trainer.logger='["console","wandb"]' \
|
| 47 |
+
trainer.project_name='verl_gpg_example_gsm8k_math' \
|
| 48 |
+
trainer.experiment_name='qwen2_7b_megatron' \
|
| 49 |
+
trainer.n_gpus_per_node=8 \
|
| 50 |
+
trainer.nnodes=1 \
|
| 51 |
+
trainer.save_freq=20 \
|
| 52 |
+
trainer.test_freq=5 \
|
| 53 |
+
trainer.total_epochs=15 $@
|
code/RL_model/verl/verl_train/examples/grpo_trainer/README.md
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Group Relative Policy Optimization (GRPO)
|
| 2 |
+
|
| 3 |
+
In reinforcement learning, classic algorithms like PPO rely on a "critic" model to estimate the value of actions, guiding the learning process. However, training this critic model can be resource-intensive.
|
| 4 |
+
|
| 5 |
+
GRPO simplifies this process by eliminating the need for a separate critic model. Instead, it operates as follows:
|
| 6 |
+
- Group Sampling: For a given problem, the model generates multiple possible solutions, forming a "group" of outputs.
|
| 7 |
+
- Reward Assignment: Each solution is evaluated and assigned a reward based on its correctness or quality.
|
| 8 |
+
- Baseline Calculation: The average reward of the group serves as a baseline.
|
| 9 |
+
- Policy Update: The model updates its parameters by comparing each solution's reward to the group baseline, reinforcing better-than-average solutions and discouraging worse-than-average ones.
|
| 10 |
+
|
| 11 |
+
This approach reduces computational overhead by avoiding the training of a separate value estimation model, making the learning process more efficient. For more details, refer to the original paper [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://arxiv.org/pdf/2402.03300)
|
| 12 |
+
|
| 13 |
+
## Key Components
|
| 14 |
+
|
| 15 |
+
- No Value Function (Critic-less): unlike PPO, GRPO does not train a separate value network (critic)
|
| 16 |
+
- Group Sampling (Grouped Rollouts): instead of evaluating one rollout per input, GRPO generates multiple completions (responses) from the current policy for each prompt. This set of completions is referred to as a group.
|
| 17 |
+
- Relative Rewards: within each group, completions are scored (e.g., based on correctness), and rewards are normalized relative to the group.
|
| 18 |
+
|
| 19 |
+
## Configuration
|
| 20 |
+
|
| 21 |
+
Note that all configs containing `micro_batch_size` are used to configure the maximum sample or token count per forward or backward pass to avoid GPU OOMs, whose value should not change algorithmic/convergence behavior.
|
| 22 |
+
|
| 23 |
+
Despite that many configurations start with the `ppo_` prefix, they work across different RL algorithms in verl, as the GRPO training loop is similar to that of PPO (without critic).
|
| 24 |
+
|
| 25 |
+

|
| 26 |
+
|
| 27 |
+
- `actor_rollout.ref.rollout.n`: For each prompt, sample n times. Default to 1. For GRPO, please set it to a value larger than 1 for group sampling.
|
| 28 |
+
|
| 29 |
+
- `data.train_batch_size`: The global batch size of prompts used to generate a set of sampled trajectories/rollouts. The number of responses/trajectories is `data.train_batch_size * actor_rollout.ref.rollout.n`
|
| 30 |
+
|
| 31 |
+
- `actor_rollout_ref.actor.ppo_mini_batch_size`: The set of sampled trajectories is split into multiple mini-batches with batch_size=ppo_mini_batch_size for PPO actor updates. The ppo_mini_batch_size is a global size across all workers.
|
| 32 |
+
|
| 33 |
+
- `actor_rollout_ref.actor.ppo_epochs`: Number of epochs for GRPO updates on one set of sampled trajectories for actor
|
| 34 |
+
|
| 35 |
+
- `actor_rollout_ref.actor.clip_ratio`: The GRPO clip range. Default to 0.2
|
| 36 |
+
|
| 37 |
+
- `algorithm.adv_estimator`: Default is gae. Please set it to grpo instead
|
| 38 |
+
|
| 39 |
+
- `actor_rollout_ref.actor.loss_agg_mode`: Default is "token-mean". Options include "token-mean", "seq-mean-token-sum", "seq-mean-token-mean". The original GRPO paper takes the sample-level loss (seq-mean-token-mean), which may be unstable in long-CoT scenarios. All GRPO example scripts provided in verl uses the default configuration "token-mean" for loss aggregation instead.
|
| 40 |
+
|
| 41 |
+
Instead of adding KL penalty in the reward, GRPO regularizes by directly adding the KL divergence between the trained policy and the reference policy to the loss:
|
| 42 |
+
|
| 43 |
+
- `actor_rollout_ref.actor.use_kl_loss`: To use kl loss in the actor. When used, we are not applying KL in the reward function. Default is False. Please set it to True for GRPO.
|
| 44 |
+
|
| 45 |
+
- `actor_rollout_ref.actor.kl_loss_coef`: The coefficient of kl loss. Default is 0.001.
|
| 46 |
+
|
| 47 |
+
- `actor_rollout_ref.actor.kl_loss_type`: Support kl(k1), abs, mse(k2), low_var_kl(k3) and full. Appending "+" in the end (e.g., 'k1+' and 'k3+') would apply straight through to employ k2 for unbiased gradient estimation, regardless of the kl value estimation (see https://github.com/volcengine/verl/pull/2953#issuecomment-3162113848 for more details). How to calculate the kl divergence between actor and reference policy. See this blog post for detailed analysis: http://joschu.net/blog/kl-approx.html
|
| 48 |
+
|
| 49 |
+
## Advanced Extensions
|
| 50 |
+
|
| 51 |
+
### DrGRPO
|
| 52 |
+
|
| 53 |
+
The work [Understanding R1-Zero-Like Training: A Critical Perspective](https://arxiv.org/pdf/2503.20783) claims there's optimization bias in GRPO, that leads to artificially longer responses, especially for incorrect outputs. This inefficiency stems from the way GRPO calculates advantages using group-based reward normalization, which can inadvertently favor longer, less accurate responses. Instead, DrGRPO aggregates token-level losses by normalizing with a global constant to eliminate length bias.
|
| 54 |
+
|
| 55 |
+
Configure the following to enable DrGRPO, with all other parameters the same as GRPO's:
|
| 56 |
+
|
| 57 |
+
- `actor_rollout_ref.actor.loss_agg_mode`: "seq-mean-token-sum-norm", which turns off seq-dim averaging
|
| 58 |
+
- `actor_rollout_ref.actor.loss_scale_factor`: (Optional) Set to a constant integer (e.g., max response length) to ensure consistent normalization throughout training. If not set, uses the current batch's response length.
|
| 59 |
+
- `actor_rollout_ref.actor.use_kl_loss`: Please set it to False for DrGRPO
|
| 60 |
+
- `algorithm.norm_adv_by_std_in_grpo`: False, which turns off standard deviation norm
|
| 61 |
+
|
| 62 |
+
## Reference Example
|
| 63 |
+
|
| 64 |
+
Qwen2.5 GRPO training log and commands: [link](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/qwen2-7b-fsdp2.log)
|
| 65 |
+
|
| 66 |
+
```bash
|
| 67 |
+
bash examples/grpo_trainer/run_qwen3-8b.sh
|
| 68 |
+
```
|
| 69 |
+
|
| 70 |
+
For more reference performance, please see https://verl.readthedocs.io/en/latest/algo/baseline.html
|
code/RL_model/verl/verl_train/examples/grpo_trainer/run_deepseek671b_math_megatron_80gb.sh
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
set -x
|
| 2 |
+
|
| 3 |
+
# # 0. download HF checkpoint
|
| 4 |
+
# # remove the `quantization_config` in the `config.json`
|
| 5 |
+
# # set `num_nextn_predict_layers=0` to disable MTP, which is not currently supported
|
| 6 |
+
# hf download deepseek-ai/DeepSeek-V3-0324
|
| 7 |
+
|
| 8 |
+
# no offline dist checkpoint needed, now with mbridge>=0.13.0, we can directly init model from huggingface downloaded fp8 weights
|
| 9 |
+
# tested on docker://verlai/verl:app-verl0.5-transformers4.55.4-vllm0.10.0-mcore0.13.0-te2.2
|
| 10 |
+
LLM="<path_to_dsv3_config>"
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
# 2. run the script
|
| 14 |
+
gsm8k_train_path=/root/data/gsm8k/train.parquet
|
| 15 |
+
gsm8k_test_path=/root/data/gsm8k/test.parquet
|
| 16 |
+
train_files=$gsm8k_train_path
|
| 17 |
+
test_files=$gsm8k_test_path
|
| 18 |
+
|
| 19 |
+
ALL_OFFLOAD=${ALL_OFFLOAD:-True}
|
| 20 |
+
COMMON_PARAM_OFFLOAD=${COMMON_PARAM_OFFLOAD:-$ALL_OFFLOAD}
|
| 21 |
+
COMMON_GRAD_OFFLOAD=${COMMON_GRAD_OFFLOAD:-$ALL_OFFLOAD}
|
| 22 |
+
COMMON_OPTIMIZER_OFFLOAD=${COMMON_OPTIMIZER_OFFLOAD:-$ALL_OFFLOAD}
|
| 23 |
+
|
| 24 |
+
ACTOR_PARAM_OFFLOAD=${ACTOR_PARAM_OFFLOAD:-$COMMON_PARAM_OFFLOAD}
|
| 25 |
+
ACTOR_GRAD_OFFLOAD=${ACTOR_GRAD_OFFLOAD:-$COMMON_GRAD_OFFLOAD}
|
| 26 |
+
ACTOR_OPTIMIZER_OFFLOAD=${ACTOR_OPTIMIZER_OFFLOAD:-$COMMON_OPTIMIZER_OFFLOAD}
|
| 27 |
+
REF_PARAM_OFFLOAD=${REF_PARAM_OFFLOAD:-$COMMON_PARAM_OFFLOAD}
|
| 28 |
+
CRITIC_PARAM_OFFLOAD=${CRITIC_PARAM_OFFLOAD:-$COMMON_PARAM_OFFLOAD}
|
| 29 |
+
CRITIC_GRAD_OFFLOAD=${CRITIC_GRAD_OFFLOAD:-$COMMON_GRAD_OFFLOAD}
|
| 30 |
+
CRITIC_OPTIMIZER_OFFLOAD=${CRITIC_OPTIMIZER_OFFLOAD:-$COMMON_OPTIMIZER_OFFLOAD}
|
| 31 |
+
RM_PARAM_OFFLOAD=${RM_PARAM_OFFLOAD:-$COMMON_PARAM_OFFLOAD}
|
| 32 |
+
|
| 33 |
+
# 256 H100(80GB)
|
| 34 |
+
NODES=32
|
| 35 |
+
PP=16
|
| 36 |
+
TP=1
|
| 37 |
+
EP=16
|
| 38 |
+
ETP=1
|
| 39 |
+
INFER_TP=32
|
| 40 |
+
# consider TP/ETP, and enable recompute if short of memory
|
| 41 |
+
|
| 42 |
+
# full recompute
|
| 43 |
+
|
| 44 |
+
n_resp_per_prompt=4
|
| 45 |
+
max_prompt_length=2048
|
| 46 |
+
max_response_length=4096
|
| 47 |
+
use_dynamic_bsz=True
|
| 48 |
+
actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 1))
|
| 49 |
+
infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
|
| 50 |
+
|
| 51 |
+
use_kl_in_reward=False
|
| 52 |
+
kl_coef=0.0
|
| 53 |
+
use_kl_loss=True
|
| 54 |
+
kl_loss_coef=0.001
|
| 55 |
+
|
| 56 |
+
# RAY_ADDRESS='auto' ray job submit --working-dir . --
|
| 57 |
+
python3 -m verl.trainer.main_ppo --config-path=./config --config-name='ppo_megatron_trainer'\
|
| 58 |
+
algorithm.adv_estimator=grpo \
|
| 59 |
+
algorithm.use_kl_in_reward=${use_kl_in_reward} \
|
| 60 |
+
algorithm.kl_ctrl.kl_coef=${kl_coef} \
|
| 61 |
+
data.train_files="$train_files" \
|
| 62 |
+
data.val_files="$test_files" \
|
| 63 |
+
data.train_batch_size=512 \
|
| 64 |
+
data.max_prompt_length=$max_prompt_length \
|
| 65 |
+
data.max_response_length=$max_response_length \
|
| 66 |
+
data.filter_overlong_prompts=True \
|
| 67 |
+
data.truncation='error' \
|
| 68 |
+
actor_rollout_ref.model.path=$LLM \
|
| 69 |
+
actor_rollout_ref.actor.optim.lr=1e-6 \
|
| 70 |
+
actor_rollout_ref.actor.ppo_mini_batch_size=128 \
|
| 71 |
+
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \
|
| 72 |
+
actor_rollout_ref.actor.use_torch_compile=False \
|
| 73 |
+
actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
|
| 74 |
+
actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
|
| 75 |
+
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \
|
| 76 |
+
actor_rollout_ref.rollout.name=vllm \
|
| 77 |
+
actor_rollout_ref.rollout.gpu_memory_utilization=0.8 \
|
| 78 |
+
actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
|
| 79 |
+
actor_rollout_ref.rollout.temperature=1.0 \
|
| 80 |
+
actor_rollout_ref.rollout.top_p=1.0 \
|
| 81 |
+
actor_rollout_ref.rollout.top_k=-1 \
|
| 82 |
+
actor_rollout_ref.rollout.tensor_model_parallel_size=$INFER_TP \
|
| 83 |
+
trainer.logger='["console","tensorboard"]' \
|
| 84 |
+
trainer.project_name='verl_megatron_gsm8k_examples' \
|
| 85 |
+
trainer.experiment_name='dsv3-32nodes' \
|
| 86 |
+
trainer.n_gpus_per_node=8 \
|
| 87 |
+
trainer.nnodes=$NODES \
|
| 88 |
+
trainer.save_freq=-1 \
|
| 89 |
+
trainer.test_freq=5 \
|
| 90 |
+
actor_rollout_ref.model.use_fused_kernels=True \
|
| 91 |
+
actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
|
| 92 |
+
actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
|
| 93 |
+
actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
|
| 94 |
+
actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
|
| 95 |
+
actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
|
| 96 |
+
actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
|
| 97 |
+
actor_rollout_ref.actor.megatron.override_transformer_config.attention_backend='fused' \
|
| 98 |
+
+actor_rollout_ref.actor.megatron.override_transformer_config.num_layers_in_first_pipeline_stage=4 \
|
| 99 |
+
+actor_rollout_ref.actor.megatron.override_transformer_config.num_layers_in_last_pipeline_stage=1 \
|
| 100 |
+
actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=$PP \
|
| 101 |
+
actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=$PP \
|
| 102 |
+
actor_rollout_ref.actor.megatron.tensor_model_parallel_size=$TP \
|
| 103 |
+
actor_rollout_ref.ref.megatron.tensor_model_parallel_size=$TP \
|
| 104 |
+
actor_rollout_ref.actor.megatron.expert_model_parallel_size=$EP \
|
| 105 |
+
actor_rollout_ref.ref.megatron.expert_model_parallel_size=$EP \
|
| 106 |
+
actor_rollout_ref.actor.megatron.expert_tensor_parallel_size=$ETP \
|
| 107 |
+
actor_rollout_ref.ref.megatron.expert_tensor_parallel_size=$ETP \
|
| 108 |
+
actor_rollout_ref.actor.megatron.param_offload=${ACTOR_PARAM_OFFLOAD} \
|
| 109 |
+
actor_rollout_ref.actor.megatron.optimizer_offload=${ACTOR_OPTIMIZER_OFFLOAD} \
|
| 110 |
+
actor_rollout_ref.actor.megatron.grad_offload=${ACTOR_GRAD_OFFLOAD} \
|
| 111 |
+
actor_rollout_ref.ref.megatron.param_offload=${REF_PARAM_OFFLOAD} \
|
| 112 |
+
+actor_rollout_ref.actor.megatron.override_transformer_config.recompute_method=uniform \
|
| 113 |
+
+actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=full \
|
| 114 |
+
+actor_rollout_ref.actor.megatron.override_transformer_config.recompute_num_layers=1 \
|
| 115 |
+
actor_rollout_ref.actor.megatron.use_mbridge=True \
|
| 116 |
+
trainer.default_local_dir=$CKPT_DIR \
|
| 117 |
+
trainer.val_before_train=False \
|
| 118 |
+
trainer.total_epochs=100 $@
|
code/RL_model/verl/verl_train/examples/grpo_trainer/run_deepseek671b_math_megatron_96gb.sh
ADDED
|
@@ -0,0 +1,179 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
set -xeuo pipefail
|
| 3 |
+
|
| 4 |
+
## !!!!!!!important!!!!!!
|
| 5 |
+
# 1. set the following environment variables on all your nodes
|
| 6 |
+
# env_vars:
|
| 7 |
+
# CUDA_DEVICE_MAX_CONNECTIONS: "1"
|
| 8 |
+
# NCCL_NVLS_ENABLE: "0"
|
| 9 |
+
# VLLM_USE_V1: 1
|
| 10 |
+
# 2. install mbridge=0.1.13 on all your node with the following command:
|
| 11 |
+
# pip3 install git+https://github.com/ISEEKYAN/mbridge
|
| 12 |
+
# 3. remove the `quantization_config` in the DeepSeek-V3's `config.json` and
|
| 13 |
+
# set `num_nextn_predict_layers=0` to disable MTP, which is not currently supported
|
| 14 |
+
|
| 15 |
+
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
| 16 |
+
[ -f "${SCRIPT_DIR}/env.sh" ] && source "${SCRIPT_DIR}/env.sh"
|
| 17 |
+
|
| 18 |
+
adv_estimator=grpo
|
| 19 |
+
|
| 20 |
+
use_kl_in_reward=False
|
| 21 |
+
kl_coef=0.0
|
| 22 |
+
use_kl_loss=True
|
| 23 |
+
kl_loss_coef=0.001
|
| 24 |
+
|
| 25 |
+
clip_ratio_low=0.2
|
| 26 |
+
clip_ratio_high=0.28
|
| 27 |
+
|
| 28 |
+
max_prompt_length=$((1024 * 2))
|
| 29 |
+
max_response_length=$((1204 * 8))
|
| 30 |
+
enable_overlong_buffer=True
|
| 31 |
+
overlong_buffer_len=$((1024 * 4))
|
| 32 |
+
overlong_penalty_factor=1.0
|
| 33 |
+
|
| 34 |
+
loss_agg_mode="token-mean"
|
| 35 |
+
|
| 36 |
+
train_prompt_bsz=96
|
| 37 |
+
n_resp_per_prompt=8
|
| 38 |
+
train_prompt_mini_bsz=32
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
# minimum nodes for DeepSeek-V3: 12 nodes
|
| 42 |
+
NNODES=${NNODES:-12}
|
| 43 |
+
|
| 44 |
+
RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
|
| 45 |
+
|
| 46 |
+
MODEL_PATH=$RAY_DATA_HOME/models/DeepSeek-V3-config-verl
|
| 47 |
+
|
| 48 |
+
TRAIN_FILE=$RAY_DATA_HOME/dataset/dapo-math-17k.parquet
|
| 49 |
+
TEST_FILE=$RAY_DATA_HOME/dataset/aime-2024.parquet
|
| 50 |
+
|
| 51 |
+
# Algorithm
|
| 52 |
+
temperature=1.0
|
| 53 |
+
top_p=1.0
|
| 54 |
+
top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
|
| 55 |
+
val_top_p=0.7
|
| 56 |
+
|
| 57 |
+
# Performance Related Parameter
|
| 58 |
+
use_dynamic_bsz=True
|
| 59 |
+
actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 10 / 10))
|
| 60 |
+
infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 1))
|
| 61 |
+
offload=True
|
| 62 |
+
optim_offload=${OFFLOAD_OPTIM:-True}
|
| 63 |
+
gen_tp=32
|
| 64 |
+
train_tp=${TP:-8}
|
| 65 |
+
train_pp=${PP:-12}
|
| 66 |
+
|
| 67 |
+
EP=${EP:-8}
|
| 68 |
+
ETP=1
|
| 69 |
+
CP=1
|
| 70 |
+
optimizer_offload_fraction=${OFFLOAD_FRACTION:-1.}
|
| 71 |
+
LAST_LAYER=${LAST_LAYER:-6}
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
project_name='verl-deepseek-v3'
|
| 75 |
+
exp_name="671B-${NNODES}-pp${train_pp}-tp${train_tp}-ep${EP}-actor-length${actor_ppo_max_token_len}"
|
| 76 |
+
CKPTS_DIR=$RAY_DATA_HOME/ckpt/${project_name}/${exp_name}
|
| 77 |
+
|
| 78 |
+
python3 -m verl.trainer.main_ppo \
|
| 79 |
+
--config-path=config \
|
| 80 |
+
--config-name='ppo_megatron_trainer.yaml' \
|
| 81 |
+
data.train_files="${TRAIN_FILE}" \
|
| 82 |
+
data.val_files="${TEST_FILE}" \
|
| 83 |
+
data.prompt_key=prompt \
|
| 84 |
+
data.truncation='left' \
|
| 85 |
+
data.max_prompt_length=${max_prompt_length} \
|
| 86 |
+
data.max_response_length=${max_response_length} \
|
| 87 |
+
data.train_batch_size=${train_prompt_bsz} \
|
| 88 |
+
actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
|
| 89 |
+
actor_rollout_ref.rollout.name=vllm \
|
| 90 |
+
algorithm.adv_estimator=${adv_estimator} \
|
| 91 |
+
algorithm.use_kl_in_reward=${use_kl_in_reward} \
|
| 92 |
+
algorithm.kl_ctrl.kl_coef=${kl_coef} \
|
| 93 |
+
actor_rollout_ref.model.use_fused_kernels=True \
|
| 94 |
+
actor_rollout_ref.actor.megatron.use_mbridge=True \
|
| 95 |
+
actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
|
| 96 |
+
actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
|
| 97 |
+
actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
|
| 98 |
+
actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
|
| 99 |
+
actor_rollout_ref.actor.clip_ratio_c=10.0 \
|
| 100 |
+
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \
|
| 101 |
+
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
|
| 102 |
+
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
|
| 103 |
+
actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
|
| 104 |
+
actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
|
| 105 |
+
actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
|
| 106 |
+
actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
|
| 107 |
+
actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
|
| 108 |
+
actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
|
| 109 |
+
actor_rollout_ref.model.path="${MODEL_PATH}" \
|
| 110 |
+
actor_rollout_ref.actor.optim.lr=1e-6 \
|
| 111 |
+
actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
|
| 112 |
+
actor_rollout_ref.actor.optim.weight_decay=0.1 \
|
| 113 |
+
+actor_rollout_ref.actor.optim.override_optimizer_config.optimizer_offload_fraction=${optimizer_offload_fraction} \
|
| 114 |
+
+actor_rollout_ref.actor.optim.override_optimizer_config.overlap_cpu_optimizer_d2h_h2d=True \
|
| 115 |
+
+actor_rollout_ref.actor.optim.override_optimizer_config.use_precision_aware_optimizer=True \
|
| 116 |
+
+actor_rollout_ref.actor.optim.override_optimizer_config.optimizer_cpu_offload=True \
|
| 117 |
+
actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
|
| 118 |
+
actor_rollout_ref.actor.megatron.param_offload=${offload} \
|
| 119 |
+
actor_rollout_ref.actor.megatron.optimizer_offload=${optim_offload} \
|
| 120 |
+
actor_rollout_ref.actor.megatron.grad_offload=${offload} \
|
| 121 |
+
actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${train_pp} \
|
| 122 |
+
actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${train_tp} \
|
| 123 |
+
actor_rollout_ref.actor.megatron.expert_model_parallel_size=$EP \
|
| 124 |
+
actor_rollout_ref.actor.megatron.expert_tensor_parallel_size=$ETP \
|
| 125 |
+
actor_rollout_ref.actor.megatron.context_parallel_size=${CP} \
|
| 126 |
+
actor_rollout_ref.actor.entropy_coeff=0 \
|
| 127 |
+
actor_rollout_ref.actor.optim.clip_grad=1.0 \
|
| 128 |
+
actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
|
| 129 |
+
actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
|
| 130 |
+
actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
|
| 131 |
+
actor_rollout_ref.rollout.enable_chunked_prefill=True \
|
| 132 |
+
actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
|
| 133 |
+
actor_rollout_ref.rollout.temperature=${temperature} \
|
| 134 |
+
actor_rollout_ref.rollout.top_p=${top_p} \
|
| 135 |
+
actor_rollout_ref.rollout.top_k=${top_k} \
|
| 136 |
+
actor_rollout_ref.nccl_timeout=1200 \
|
| 137 |
+
actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
|
| 138 |
+
actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
|
| 139 |
+
actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
|
| 140 |
+
actor_rollout_ref.rollout.val_kwargs.do_sample=True \
|
| 141 |
+
actor_rollout_ref.rollout.val_kwargs.n=1 \
|
| 142 |
+
actor_rollout_ref.rollout.enforce_eager=True \
|
| 143 |
+
actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=${train_pp} \
|
| 144 |
+
actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${train_tp} \
|
| 145 |
+
actor_rollout_ref.ref.megatron.expert_model_parallel_size=$EP \
|
| 146 |
+
actor_rollout_ref.ref.megatron.expert_tensor_parallel_size=$ETP \
|
| 147 |
+
actor_rollout_ref.ref.megatron.context_parallel_size=${CP} \
|
| 148 |
+
actor_rollout_ref.ref.megatron.param_offload=${offload} \
|
| 149 |
+
+actor_rollout_ref.actor.megatron.override_transformer_config.apply_rope_fusion=False \
|
| 150 |
+
+actor_rollout_ref.actor.megatron.override_transformer_config.moe_router_dtype=fp32 \
|
| 151 |
+
+actor_rollout_ref.actor.megatron.override_transformer_config.moe_shared_expert_overlap=False \
|
| 152 |
+
+actor_rollout_ref.actor.megatron.override_transformer_config.moe_enable_deepep=True \
|
| 153 |
+
+actor_rollout_ref.actor.megatron.override_transformer_config.moe_token_dispatcher_type=flex \
|
| 154 |
+
+actor_rollout_ref.actor.megatron.override_transformer_config.recompute_method=uniform \
|
| 155 |
+
+actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=full \
|
| 156 |
+
+actor_rollout_ref.actor.megatron.override_transformer_config.recompute_num_layers=1 \
|
| 157 |
+
+actor_rollout_ref.actor.megatron.override_transformer_config.gradient_accumulation_fusion=True \
|
| 158 |
+
+actor_rollout_ref.actor.megatron.override_transformer_config.moe_permute_fusion=True \
|
| 159 |
+
+actor_rollout_ref.actor.megatron.override_transformer_config.account_for_embedding_in_pipeline_split=False \
|
| 160 |
+
+actor_rollout_ref.actor.megatron.override_transformer_config.account_for_loss_in_pipeline_split=False \
|
| 161 |
+
+actor_rollout_ref.actor.megatron.override_transformer_config.num_layers_in_last_pipeline_stage=${LAST_LAYER} \
|
| 162 |
+
reward_model.reward_manager=dapo \
|
| 163 |
+
+reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
|
| 164 |
+
+reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
|
| 165 |
+
+reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
|
| 166 |
+
+reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
|
| 167 |
+
+reward_model.reward_kwargs.max_resp_len=${max_response_length} \
|
| 168 |
+
trainer.logger=['console','wandb'] \
|
| 169 |
+
trainer.project_name="${project_name}" \
|
| 170 |
+
trainer.experiment_name="${exp_name}" \
|
| 171 |
+
trainer.n_gpus_per_node=8 \
|
| 172 |
+
trainer.nnodes="${NNODES}" \
|
| 173 |
+
trainer.val_before_train=False \
|
| 174 |
+
trainer.test_freq=10 \
|
| 175 |
+
trainer.save_freq=100 \
|
| 176 |
+
trainer.total_epochs=10 \
|
| 177 |
+
trainer.default_local_dir="${CKPTS_DIR}" \
|
| 178 |
+
trainer.resume_mode=auto \
|
| 179 |
+
trainer.log_val_generations=10
|
code/RL_model/verl/verl_train/examples/grpo_trainer/run_deepseek7b_llm.sh
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
set -x
|
| 2 |
+
|
| 3 |
+
python3 -m verl.trainer.main_ppo \
|
| 4 |
+
algorithm.adv_estimator=grpo \
|
| 5 |
+
data.train_files=$HOME/data/gsm8k/train.parquet \
|
| 6 |
+
data.val_files=$HOME/data/gsm8k/test.parquet \
|
| 7 |
+
data.train_batch_size=1024 \
|
| 8 |
+
data.max_prompt_length=512 \
|
| 9 |
+
data.max_response_length=1024 \
|
| 10 |
+
data.filter_overlong_prompts=True \
|
| 11 |
+
data.truncation='error' \
|
| 12 |
+
actor_rollout_ref.model.path=deepseek-ai/deepseek-llm-7b-chat \
|
| 13 |
+
actor_rollout_ref.actor.optim.lr=1e-6 \
|
| 14 |
+
actor_rollout_ref.model.use_remove_padding=True \
|
| 15 |
+
actor_rollout_ref.actor.ppo_mini_batch_size=256 \
|
| 16 |
+
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=80 \
|
| 17 |
+
actor_rollout_ref.actor.use_kl_loss=True \
|
| 18 |
+
actor_rollout_ref.actor.kl_loss_coef=0.001 \
|
| 19 |
+
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
|
| 20 |
+
actor_rollout_ref.actor.entropy_coeff=0 \
|
| 21 |
+
actor_rollout_ref.model.enable_gradient_checkpointing=True \
|
| 22 |
+
actor_rollout_ref.actor.fsdp_config.param_offload=False \
|
| 23 |
+
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
|
| 24 |
+
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=160 \
|
| 25 |
+
actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
|
| 26 |
+
actor_rollout_ref.rollout.name=vllm \
|
| 27 |
+
actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
|
| 28 |
+
actor_rollout_ref.rollout.n=5 \
|
| 29 |
+
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=160 \
|
| 30 |
+
actor_rollout_ref.ref.fsdp_config.param_offload=True \
|
| 31 |
+
algorithm.use_kl_in_reward=False \
|
| 32 |
+
trainer.critic_warmup=0 \
|
| 33 |
+
trainer.logger=console \
|
| 34 |
+
trainer.project_name='verl_grpo_example_gsm8k' \
|
| 35 |
+
trainer.experiment_name='deepseek_llm_7b_function_rm' \
|
| 36 |
+
trainer.n_gpus_per_node=8 \
|
| 37 |
+
trainer.nnodes=1 \
|
| 38 |
+
trainer.save_freq=20 \
|
| 39 |
+
trainer.test_freq=5 \
|
| 40 |
+
trainer.total_epochs=15 $@
|
code/RL_model/verl/verl_train/examples/grpo_trainer/run_deepseek7b_llm_math.sh
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
set -x
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
gsm8k_train_path=$HOME/data/gsm8k/train.parquet
|
| 5 |
+
gsm8k_test_path=$HOME/data/gsm8k/test.parquet
|
| 6 |
+
math_train_path=$HOME/data/math/train.parquet
|
| 7 |
+
math_test_path=$HOME/data/math/test.parquet
|
| 8 |
+
|
| 9 |
+
train_files="['$gsm8k_train_path', '$math_train_path']"
|
| 10 |
+
test_files="['$gsm8k_test_path', '$math_test_path']"
|
| 11 |
+
|
| 12 |
+
python3 -m verl.trainer.main_ppo \
|
| 13 |
+
algorithm.adv_estimator=grpo \
|
| 14 |
+
data.train_files="$train_files" \
|
| 15 |
+
data.val_files="$test_files" \
|
| 16 |
+
data.train_batch_size=1024 \
|
| 17 |
+
data.max_prompt_length=1024 \
|
| 18 |
+
data.max_response_length=1024 \
|
| 19 |
+
data.filter_overlong_prompts=True \
|
| 20 |
+
data.truncation='error' \
|
| 21 |
+
actor_rollout_ref.model.path=deepseek-ai/deepseek-llm-7b-chat \
|
| 22 |
+
actor_rollout_ref.actor.optim.lr=1e-6 \
|
| 23 |
+
actor_rollout_ref.model.use_remove_padding=True \
|
| 24 |
+
actor_rollout_ref.actor.ppo_mini_batch_size=256 \
|
| 25 |
+
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=40 \
|
| 26 |
+
actor_rollout_ref.actor.use_kl_loss=True \
|
| 27 |
+
actor_rollout_ref.actor.kl_loss_coef=0.001 \
|
| 28 |
+
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
|
| 29 |
+
actor_rollout_ref.actor.entropy_coeff=0 \
|
| 30 |
+
actor_rollout_ref.model.enable_gradient_checkpointing=True \
|
| 31 |
+
actor_rollout_ref.actor.fsdp_config.param_offload=False \
|
| 32 |
+
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
|
| 33 |
+
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=40 \
|
| 34 |
+
actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
|
| 35 |
+
actor_rollout_ref.rollout.name=vllm \
|
| 36 |
+
actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
|
| 37 |
+
actor_rollout_ref.rollout.n=5 \
|
| 38 |
+
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=40 \
|
| 39 |
+
actor_rollout_ref.ref.fsdp_config.param_offload=True \
|
| 40 |
+
algorithm.use_kl_in_reward=False \
|
| 41 |
+
trainer.critic_warmup=0 \
|
| 42 |
+
trainer.logger='["console","wandb"]' \
|
| 43 |
+
trainer.project_name='verl_grpo_example_gsm8k_math' \
|
| 44 |
+
trainer.experiment_name='deepseek_llm_7b_function_rm_math' \
|
| 45 |
+
trainer.n_gpus_per_node=8 \
|
| 46 |
+
trainer.nnodes=1 \
|
| 47 |
+
trainer.save_freq=20 \
|
| 48 |
+
trainer.test_freq=5 \
|
| 49 |
+
trainer.total_epochs=15 $@
|
code/RL_model/verl/verl_train/examples/grpo_trainer/run_deepseek7b_llm_seq_balance.sh
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
set -x
|
| 2 |
+
|
| 3 |
+
python3 -m verl.trainer.main_ppo \
|
| 4 |
+
algorithm.adv_estimator=grpo \
|
| 5 |
+
data.train_files=$HOME/data/gsm8k/train.parquet \
|
| 6 |
+
data.val_files=$HOME/data/gsm8k/test.parquet \
|
| 7 |
+
data.train_batch_size=1024 \
|
| 8 |
+
data.max_prompt_length=512 \
|
| 9 |
+
data.max_response_length=512 \
|
| 10 |
+
data.filter_overlong_prompts=True \
|
| 11 |
+
data.truncation='error' \
|
| 12 |
+
actor_rollout_ref.model.path=deepseek-ai/deepseek-llm-7b-chat \
|
| 13 |
+
actor_rollout_ref.actor.optim.lr=1e-6 \
|
| 14 |
+
actor_rollout_ref.model.use_remove_padding=True \
|
| 15 |
+
actor_rollout_ref.actor.ppo_mini_batch_size=256 \
|
| 16 |
+
actor_rollout_ref.actor.use_dynamic_bsz=True \
|
| 17 |
+
actor_rollout_ref.actor.ppo_max_token_len_per_gpu=24000 \
|
| 18 |
+
actor_rollout_ref.actor.use_kl_loss=True \
|
| 19 |
+
actor_rollout_ref.actor.kl_loss_coef=0.001 \
|
| 20 |
+
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
|
| 21 |
+
actor_rollout_ref.actor.entropy_coeff=0 \
|
| 22 |
+
actor_rollout_ref.model.enable_gradient_checkpointing=True \
|
| 23 |
+
actor_rollout_ref.actor.fsdp_config.param_offload=False \
|
| 24 |
+
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
|
| 25 |
+
actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
|
| 26 |
+
actor_rollout_ref.rollout.name=vllm \
|
| 27 |
+
actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
|
| 28 |
+
actor_rollout_ref.rollout.n=5 \
|
| 29 |
+
actor_rollout_ref.ref.fsdp_config.param_offload=True \
|
| 30 |
+
algorithm.use_kl_in_reward=False \
|
| 31 |
+
trainer.critic_warmup=0 \
|
| 32 |
+
trainer.logger='["console","wandb"]' \
|
| 33 |
+
trainer.project_name='verl_grpo_example_gsm8k' \
|
| 34 |
+
trainer.experiment_name='deepseek_llm_7b_function_rm_seq_packing' \
|
| 35 |
+
trainer.n_gpus_per_node=8 \
|
| 36 |
+
trainer.nnodes=1 \
|
| 37 |
+
trainer.save_freq=20 \
|
| 38 |
+
trainer.test_freq=5 \
|
| 39 |
+
trainer.total_epochs=15 $@
|
code/RL_model/verl/verl_train/examples/grpo_trainer/run_gptoss_20b.sh
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
|
| 3 |
+
cat > get_model.py << EOF
|
| 4 |
+
import torch
|
| 5 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer, Mxfp4Config
|
| 6 |
+
|
| 7 |
+
model_id = "openai/gpt-oss-20b"
|
| 8 |
+
output_dir = "$HOME/models/gpt-oss-20b-bf16"
|
| 9 |
+
|
| 10 |
+
quantization_config = Mxfp4Config(dequantize=True)
|
| 11 |
+
model_kwargs = dict(
|
| 12 |
+
attn_implementation="eager",
|
| 13 |
+
torch_dtype=torch.bfloat16,
|
| 14 |
+
quantization_config=quantization_config,
|
| 15 |
+
use_cache=False,
|
| 16 |
+
device_map="auto",
|
| 17 |
+
)
|
| 18 |
+
|
| 19 |
+
model = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs)
|
| 20 |
+
|
| 21 |
+
# Patch config with custom attribute before saving
|
| 22 |
+
model.config.attn_implementation = "eager"
|
| 23 |
+
|
| 24 |
+
model.save_pretrained(output_dir)
|
| 25 |
+
tokenizer = AutoTokenizer.from_pretrained(model_id)
|
| 26 |
+
tokenizer.save_pretrained(output_dir)
|
| 27 |
+
EOF
|
| 28 |
+
|
| 29 |
+
python get_model.py
|
| 30 |
+
# or you can use lmsys/gpt-oss-20b-bf16
|
| 31 |
+
# recommend to use same value for train_batch_size and ppo_mini_batch_size
|
| 32 |
+
# to avoid MOE training instability
|
| 33 |
+
# use large value for max_response_length if you want to use reasoning effort high.
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
model_dir=$HOME/models/gpt-oss-20b-bf16
|
| 37 |
+
python3 -m verl.trainer.main_ppo \
|
| 38 |
+
algorithm.adv_estimator=grpo \
|
| 39 |
+
data.train_files="$gsm8k_train_path" \
|
| 40 |
+
data.val_files="$gsm8k_test_path" \
|
| 41 |
+
data.train_batch_size=256 \
|
| 42 |
+
data.max_prompt_length=512 \
|
| 43 |
+
data.max_response_length=8192 \
|
| 44 |
+
data.filter_overlong_prompts=True \
|
| 45 |
+
data.truncation='error' \
|
| 46 |
+
+data.apply_chat_template_kwargs.reasoning_effort=medium \
|
| 47 |
+
actor_rollout_ref.model.path=${model_dir} \
|
| 48 |
+
actor_rollout_ref.actor.optim.lr=1e-6 \
|
| 49 |
+
actor_rollout_ref.model.use_remove_padding=True \
|
| 50 |
+
actor_rollout_ref.actor.ppo_mini_batch_size=256 \
|
| 51 |
+
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 \
|
| 52 |
+
actor_rollout_ref.actor.use_kl_loss=True \
|
| 53 |
+
actor_rollout_ref.actor.kl_loss_coef=0.001 \
|
| 54 |
+
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
|
| 55 |
+
actor_rollout_ref.actor.entropy_coeff=0 \
|
| 56 |
+
actor_rollout_ref.model.enable_gradient_checkpointing=True \
|
| 57 |
+
actor_rollout_ref.actor.fsdp_config.param_offload=False \
|
| 58 |
+
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
|
| 59 |
+
+actor_rollout_ref.actor.fsdp_config.model_dtype=bfloat16 \
|
| 60 |
+
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 \
|
| 61 |
+
actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
|
| 62 |
+
actor_rollout_ref.rollout.name=sglang \
|
| 63 |
+
actor_rollout_ref.rollout.mode=async \
|
| 64 |
+
actor_rollout_ref.rollout.engine_kwargs.sglang.attention_backend=triton \
|
| 65 |
+
actor_rollout_ref.rollout.gpu_memory_utilization=0.7 \
|
| 66 |
+
actor_rollout_ref.rollout.n=5 \
|
| 67 |
+
actor_rollout_ref.rollout.load_format=safetensors \
|
| 68 |
+
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 \
|
| 69 |
+
actor_rollout_ref.ref.fsdp_config.param_offload=True \
|
| 70 |
+
algorithm.use_kl_in_reward=False \
|
| 71 |
+
trainer.critic_warmup=0 \
|
| 72 |
+
trainer.logger='["console","wandb"]' \
|
| 73 |
+
trainer.project_name='verl_grpo_example_gsm8k_math' \
|
| 74 |
+
trainer.experiment_name='oai_oss_20b_function_rm' \
|
| 75 |
+
trainer.n_gpus_per_node=8 \
|
| 76 |
+
trainer.nnodes=1 \
|
| 77 |
+
trainer.save_freq=50 \
|
| 78 |
+
trainer.test_freq=10 \
|
| 79 |
+
trainer.total_epochs=15 $@
|
code/RL_model/verl/verl_train/examples/grpo_trainer/run_mistral13b_skyworkrm_hhrlhf.sh
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
train_files=data/full_hh_rlhf/rl/train.parquet
|
| 2 |
+
test_files=data/full_hh_rlhf/rl/train.parquet # no use
|
| 3 |
+
|
| 4 |
+
max_prompt_length=4096
|
| 5 |
+
max_response_length=2048
|
| 6 |
+
|
| 7 |
+
gen_tp=4
|
| 8 |
+
n_per_prompt=5
|
| 9 |
+
adv_estimator="grpo"
|
| 10 |
+
|
| 11 |
+
project_name=verl_full_hh_rlhf_examples
|
| 12 |
+
exp_name="grpo_mistral13B-skyworkLlama8b-hhrlhf"
|
| 13 |
+
|
| 14 |
+
python3 -m verl.trainer.main_ppo \
|
| 15 |
+
algorithm.adv_estimator=$adv_estimator \
|
| 16 |
+
data.train_files="$train_files" \
|
| 17 |
+
data.val_files="$test_files" \
|
| 18 |
+
data.train_batch_size=512 \
|
| 19 |
+
data.prompt_key="prompt" \
|
| 20 |
+
data.return_raw_chat=True \
|
| 21 |
+
data.max_prompt_length=$max_prompt_length \
|
| 22 |
+
data.max_response_length=$max_response_length \
|
| 23 |
+
data.filter_overlong_prompts=True \
|
| 24 |
+
data.truncation='error' \
|
| 25 |
+
actor_rollout_ref.model.path=mistralai/Mistral-Nemo-Instruct-2407 \
|
| 26 |
+
actor_rollout_ref.actor.optim.lr=1e-6 \
|
| 27 |
+
actor_rollout_ref.actor.ppo_mini_batch_size=128 \
|
| 28 |
+
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=10 \
|
| 29 |
+
actor_rollout_ref.actor.use_kl_loss=False \
|
| 30 |
+
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=10 \
|
| 31 |
+
actor_rollout_ref.rollout.name=vllm \
|
| 32 |
+
actor_rollout_ref.rollout.n=$n_per_prompt \
|
| 33 |
+
actor_rollout_ref.rollout.gpu_memory_utilization=0.7 \
|
| 34 |
+
actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
|
| 35 |
+
actor_rollout_ref.model.enable_gradient_checkpointing=True \
|
| 36 |
+
reward_model.enable=True \
|
| 37 |
+
reward_model.model.path=Skywork/Skywork-Reward-Llama-3.1-8B \
|
| 38 |
+
reward_model.use_reward_loop=True \
|
| 39 |
+
reward_model.rollout.name=vllm \
|
| 40 |
+
reward_model.rollout.gpu_memory_utilization=0.8 \
|
| 41 |
+
reward_model.rollout.tensor_model_parallel_size=1 \
|
| 42 |
+
reward_model.rollout.prompt_length=8192 \
|
| 43 |
+
reward_model.rollout.response_length=4096 \
|
| 44 |
+
reward_model.num_workers=8 \
|
| 45 |
+
algorithm.use_kl_in_reward=False \
|
| 46 |
+
trainer.logger='["console","wandb"]' \
|
| 47 |
+
trainer.val_before_train=False \
|
| 48 |
+
trainer.project_name=$project_name \
|
| 49 |
+
trainer.experiment_name=$exp_name \
|
| 50 |
+
trainer.n_gpus_per_node=8 \
|
| 51 |
+
trainer.nnodes=1 \
|
| 52 |
+
trainer.save_freq=10 \
|
| 53 |
+
trainer.test_freq=-1 \
|
| 54 |
+
trainer.total_epochs=5 $@
|
code/RL_model/verl/verl_train/examples/grpo_trainer/run_moonlight16b_math_megatron.sh
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
set -x
|
| 2 |
+
|
| 3 |
+
export CUDA_DEVICE_MAX_CONNECTIONS=1 # For megatron communication/computation overlapping
|
| 4 |
+
|
| 5 |
+
HF_MODEL_PATH=moonshotai/Moonlight-16B-A3B
|
| 6 |
+
DIST_CKPT_PATH=${DIST_CKPT_PATH}
|
| 7 |
+
|
| 8 |
+
train_path=$HOME/data/gsm8k/train.parquet
|
| 9 |
+
test_path=$HOME/data/gsm8k/test.parquet
|
| 10 |
+
|
| 11 |
+
python3 -m verl.trainer.main_ppo --config-path=config \
|
| 12 |
+
--config-name='ppo_megatron_trainer.yaml'\
|
| 13 |
+
algorithm.adv_estimator=grpo \
|
| 14 |
+
data.train_files="$train_path" \
|
| 15 |
+
data.val_files="$test_path" \
|
| 16 |
+
data.train_batch_size=192 \
|
| 17 |
+
data.max_prompt_length=1024 \
|
| 18 |
+
data.max_response_length=2048 \
|
| 19 |
+
data.filter_overlong_prompts=True \
|
| 20 |
+
data.truncation='error' \
|
| 21 |
+
data.trust_remote_code=True \
|
| 22 |
+
actor_rollout_ref.model.path=$HF_MODEL_PATH \
|
| 23 |
+
actor_rollout_ref.model.trust_remote_code=True \
|
| 24 |
+
actor_rollout_ref.actor.optim.lr=1e-6 \
|
| 25 |
+
actor_rollout_ref.actor.ppo_mini_batch_size=64 \
|
| 26 |
+
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \
|
| 27 |
+
actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=3 \
|
| 28 |
+
actor_rollout_ref.actor.megatron.tensor_model_parallel_size=4 \
|
| 29 |
+
actor_rollout_ref.actor.megatron.expert_model_parallel_size=4 \
|
| 30 |
+
actor_rollout_ref.actor.megatron.expert_tensor_parallel_size=1 \
|
| 31 |
+
actor_rollout_ref.actor.megatron.use_dist_checkpointing=True \
|
| 32 |
+
actor_rollout_ref.actor.megatron.dist_checkpointing_path=$DIST_CKPT_PATH \
|
| 33 |
+
actor_rollout_ref.actor.use_kl_loss=True \
|
| 34 |
+
actor_rollout_ref.actor.kl_loss_coef=0.001 \
|
| 35 |
+
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
|
| 36 |
+
actor_rollout_ref.actor.entropy_coeff=0 \
|
| 37 |
+
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \
|
| 38 |
+
actor_rollout_ref.rollout.tensor_model_parallel_size=4 \
|
| 39 |
+
actor_rollout_ref.rollout.name=vllm \
|
| 40 |
+
actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
|
| 41 |
+
actor_rollout_ref.rollout.n=5 \
|
| 42 |
+
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \
|
| 43 |
+
actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=3 \
|
| 44 |
+
actor_rollout_ref.ref.megatron.tensor_model_parallel_size=4 \
|
| 45 |
+
actor_rollout_ref.ref.megatron.expert_model_parallel_size=4 \
|
| 46 |
+
actor_rollout_ref.ref.megatron.expert_tensor_parallel_size=1 \
|
| 47 |
+
actor_rollout_ref.ref.megatron.use_dist_checkpointing=True \
|
| 48 |
+
actor_rollout_ref.ref.megatron.dist_checkpointing_path=$DIST_CKPT_PATH \
|
| 49 |
+
algorithm.use_kl_in_reward=False \
|
| 50 |
+
trainer.critic_warmup=0 \
|
| 51 |
+
trainer.logger='["console","wandb"]' \
|
| 52 |
+
trainer.project_name='verl_grpo_example_gsm8k_math' \
|
| 53 |
+
trainer.experiment_name='moonlight_megatron_ep' \
|
| 54 |
+
trainer.n_gpus_per_node=8 \
|
| 55 |
+
trainer.nnodes=3 \
|
| 56 |
+
trainer.save_freq=20 \
|
| 57 |
+
trainer.test_freq=5 \
|
| 58 |
+
trainer.total_epochs=15 $@
|
code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2-32b_sglang_fsdp_npu.sh
ADDED
|
@@ -0,0 +1,182 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
set -xeuo pipefail
|
| 3 |
+
mkdir -p logs
|
| 4 |
+
|
| 5 |
+
# Project Configuration
|
| 6 |
+
project_name='GRPO-Qwen2.5-32B-BASE-SGLang'
|
| 7 |
+
exp_name='GRPO-Qwen2.5-32B-BASE-FSDP-SGLang'
|
| 8 |
+
|
| 9 |
+
# Necessary env
|
| 10 |
+
export HCCL_CONNECT_TIMEOUT=1500
|
| 11 |
+
export HCCL_HOST_SOCKET_PORT_RANGE=60000-60050
|
| 12 |
+
export HCCL_NPU_SOCKET_PORT_RANGE=61000-61050
|
| 13 |
+
|
| 14 |
+
export RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES=1
|
| 15 |
+
# If the number of nodes is 16, ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
|
| 16 |
+
export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
|
| 17 |
+
|
| 18 |
+
export DISABLE_L2_CACHE=1
|
| 19 |
+
export TASK_QUEUE_ENABLE=1
|
| 20 |
+
|
| 21 |
+
# Node Info
|
| 22 |
+
NNODES=${NNODES:-2}
|
| 23 |
+
NPUS_PER_NODE=${NPUS_PER_NODE:-8}
|
| 24 |
+
|
| 25 |
+
# Model Weights Paths
|
| 26 |
+
MODEL_PATH=Qwen/Qwen2.5-32B
|
| 27 |
+
RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
|
| 28 |
+
CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}
|
| 29 |
+
|
| 30 |
+
# File System Paths
|
| 31 |
+
TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/datasets/deepscaler/train.parquet"}
|
| 32 |
+
TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/datasets/deepscaler/test.parquet"}
|
| 33 |
+
|
| 34 |
+
# Data Configuration
|
| 35 |
+
max_prompt_length=$((1024 * 2))
|
| 36 |
+
max_response_length=$((1024 * 8))
|
| 37 |
+
|
| 38 |
+
# Training Batch Configuration
|
| 39 |
+
train_prompt_bsz=32
|
| 40 |
+
train_prompt_mini_bsz=32
|
| 41 |
+
n_resp_per_prompt=8
|
| 42 |
+
|
| 43 |
+
# Algorithm Configuration
|
| 44 |
+
adv_estimator=grpo
|
| 45 |
+
use_kl_in_reward=False
|
| 46 |
+
kl_coef=0.0
|
| 47 |
+
use_kl_loss=True
|
| 48 |
+
kl_loss_coef=0.001
|
| 49 |
+
|
| 50 |
+
# Performance and Memory Management Configuration
|
| 51 |
+
all_offload=True
|
| 52 |
+
use_dynamic_bsz=False
|
| 53 |
+
|
| 54 |
+
# SGLang Configuration
|
| 55 |
+
gen_tp=4
|
| 56 |
+
gen_sp=1
|
| 57 |
+
gen_dp=1
|
| 58 |
+
gen_ep=1
|
| 59 |
+
gpu_memory_utilization=0.5
|
| 60 |
+
|
| 61 |
+
# Data Configuration
|
| 62 |
+
DATA_CONFIG=(
|
| 63 |
+
# File Paths
|
| 64 |
+
data.train_files="${TRAIN_FILE}"
|
| 65 |
+
data.val_files="${TEST_FILE}"
|
| 66 |
+
# Data Structure
|
| 67 |
+
data.prompt_key=prompt
|
| 68 |
+
# Batch and Length Configuration
|
| 69 |
+
data.train_batch_size=${train_prompt_bsz}
|
| 70 |
+
data.max_prompt_length=${max_prompt_length}
|
| 71 |
+
data.max_response_length=${max_response_length}
|
| 72 |
+
# Preprocessing
|
| 73 |
+
data.filter_overlong_prompts=False
|
| 74 |
+
data.truncation='left'
|
| 75 |
+
)
|
| 76 |
+
|
| 77 |
+
# Model Configuration
|
| 78 |
+
MODEL_CONFIG=(
|
| 79 |
+
# Model Path
|
| 80 |
+
actor_rollout_ref.model.path="${MODEL_PATH}"
|
| 81 |
+
# Model Processing
|
| 82 |
+
actor_rollout_ref.model.use_remove_padding=True
|
| 83 |
+
actor_rollout_ref.model.enable_gradient_checkpointing=True
|
| 84 |
+
)
|
| 85 |
+
|
| 86 |
+
# Reinforcement Learning Algorithm Configuration
|
| 87 |
+
ALGORITHM_CONFIG=(
|
| 88 |
+
# Advantage Estimation
|
| 89 |
+
algorithm.adv_estimator=${adv_estimator}
|
| 90 |
+
# KL Divergence Control
|
| 91 |
+
algorithm.use_kl_in_reward=${use_kl_in_reward}
|
| 92 |
+
)
|
| 93 |
+
|
| 94 |
+
# Actor Model Configuration
|
| 95 |
+
ACTOR_CONFIG=(
|
| 96 |
+
# Core Runtime Settings
|
| 97 |
+
actor_rollout_ref.actor.use_torch_compile=False
|
| 98 |
+
actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz}
|
| 99 |
+
# Loss Function Configuration
|
| 100 |
+
actor_rollout_ref.actor.use_kl_loss=${use_kl_loss}
|
| 101 |
+
actor_rollout_ref.actor.kl_loss_type=low_var_kl
|
| 102 |
+
actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef}
|
| 103 |
+
actor_rollout_ref.actor.entropy_coeff=0
|
| 104 |
+
# PPO Training Parameters
|
| 105 |
+
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1
|
| 106 |
+
actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz}
|
| 107 |
+
# Optimizer Settings
|
| 108 |
+
actor_rollout_ref.actor.optim.lr=1e-6
|
| 109 |
+
actor_rollout_ref.actor.fsdp_config.param_offload=${all_offload}
|
| 110 |
+
actor_rollout_ref.actor.fsdp_config.optimizer_offload=${all_offload}
|
| 111 |
+
)
|
| 112 |
+
|
| 113 |
+
# Reference Model Configuration
|
| 114 |
+
REF_CONFIG=(
|
| 115 |
+
# Core Runtime Settings
|
| 116 |
+
actor_rollout_ref.ref.use_torch_compile=False
|
| 117 |
+
# Log Probability Inference
|
| 118 |
+
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1
|
| 119 |
+
actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz}
|
| 120 |
+
# Memory Optimization
|
| 121 |
+
actor_rollout_ref.ref.fsdp_config.param_offload=${all_offload}
|
| 122 |
+
)
|
| 123 |
+
|
| 124 |
+
# Rollout Configuration
|
| 125 |
+
ROLLOUT_CONFIG=(
|
| 126 |
+
# Rollout Engine
|
| 127 |
+
actor_rollout_ref.rollout.name=sglang
|
| 128 |
+
+actor_rollout_ref.rollout.engine_kwargs.sglang.attention_backend="ascend"
|
| 129 |
+
# Generation Parameters
|
| 130 |
+
actor_rollout_ref.rollout.n=${n_resp_per_prompt}
|
| 131 |
+
actor_rollout_ref.rollout.top_p=1.0
|
| 132 |
+
actor_rollout_ref.rollout.top_k=-1
|
| 133 |
+
actor_rollout_ref.rollout.temperature=1.0
|
| 134 |
+
# Log Probability Inference
|
| 135 |
+
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1
|
| 136 |
+
actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz}
|
| 137 |
+
# Memory Management
|
| 138 |
+
actor_rollout_ref.rollout.gpu_memory_utilization=${gpu_memory_utilization}
|
| 139 |
+
actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp}
|
| 140 |
+
actor_rollout_ref.rollout.data_parallel_size=${gen_dp}
|
| 141 |
+
actor_rollout_ref.rollout.expert_parallel_size=${gen_ep}
|
| 142 |
+
actor_rollout_ref.rollout.enable_chunked_prefill=False
|
| 143 |
+
actor_rollout_ref.rollout.multi_stage_wake_up=True
|
| 144 |
+
# Validation Generation
|
| 145 |
+
actor_rollout_ref.rollout.val_kwargs.n=1
|
| 146 |
+
actor_rollout_ref.rollout.val_kwargs.do_sample=True
|
| 147 |
+
actor_rollout_ref.rollout.val_kwargs.top_p=1.0
|
| 148 |
+
actor_rollout_ref.rollout.val_kwargs.top_k=-1
|
| 149 |
+
actor_rollout_ref.rollout.val_kwargs.temperature=1.0
|
| 150 |
+
actor_rollout_ref.nccl_timeout=1800
|
| 151 |
+
)
|
| 152 |
+
|
| 153 |
+
# Trainer Configuration
|
| 154 |
+
TRAINER_CONFIG=(
|
| 155 |
+
trainer.logger='["console"]'
|
| 156 |
+
trainer.project_name="${project_name}"
|
| 157 |
+
trainer.experiment_name="${exp_name}"
|
| 158 |
+
trainer.nnodes="${NNODES}"
|
| 159 |
+
trainer.n_gpus_per_node="${NPUS_PER_NODE}"
|
| 160 |
+
trainer.total_epochs=5
|
| 161 |
+
trainer.val_before_train=False
|
| 162 |
+
trainer.test_freq=-1
|
| 163 |
+
trainer.save_freq=100
|
| 164 |
+
trainer.default_local_dir="${CKPTS_DIR}"
|
| 165 |
+
trainer.critic_warmup=0
|
| 166 |
+
)
|
| 167 |
+
|
| 168 |
+
# Main GRPO Training Command
|
| 169 |
+
# Add the reward function processing for the DeepScaler dataset here
|
| 170 |
+
python3 -m verl.trainer.main_ppo \
|
| 171 |
+
--config-path=config \
|
| 172 |
+
--config-name='ppo_trainer.yaml' \
|
| 173 |
+
custom_reward_function.path=recipe/r1_ascend/deepscaler.py \
|
| 174 |
+
custom_reward_function.name=compute_score \
|
| 175 |
+
"${DATA_CONFIG[@]}" \
|
| 176 |
+
"${MODEL_CONFIG[@]}" \
|
| 177 |
+
"${ACTOR_CONFIG[@]}" \
|
| 178 |
+
"${REF_CONFIG[@]}" \
|
| 179 |
+
"${ROLLOUT_CONFIG[@]}" \
|
| 180 |
+
"${ALGORITHM_CONFIG[@]}" \
|
| 181 |
+
"${TRAINER_CONFIG[@]}" \
|
| 182 |
+
"$@" | tee logs/run_qwen2_5-32b_grpo_fsdp_sglang_npu.log
|
code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2-7b.sh
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
set -x
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
python3 -m verl.trainer.main_ppo \
|
| 5 |
+
algorithm.adv_estimator=grpo \
|
| 6 |
+
data.train_files=$HOME/data/gsm8k/train.parquet \
|
| 7 |
+
data.val_files=$HOME/data/gsm8k/test.parquet \
|
| 8 |
+
data.train_batch_size=1024 \
|
| 9 |
+
data.max_prompt_length=512 \
|
| 10 |
+
data.max_response_length=1024 \
|
| 11 |
+
data.filter_overlong_prompts=True \
|
| 12 |
+
data.truncation='error' \
|
| 13 |
+
actor_rollout_ref.model.path=Qwen/Qwen2-7B-Instruct \
|
| 14 |
+
actor_rollout_ref.actor.optim.lr=1e-6 \
|
| 15 |
+
actor_rollout_ref.model.use_remove_padding=True \
|
| 16 |
+
actor_rollout_ref.actor.ppo_mini_batch_size=256 \
|
| 17 |
+
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=40 \
|
| 18 |
+
actor_rollout_ref.actor.use_kl_loss=True \
|
| 19 |
+
actor_rollout_ref.actor.kl_loss_coef=0.001 \
|
| 20 |
+
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
|
| 21 |
+
actor_rollout_ref.actor.entropy_coeff=0 \
|
| 22 |
+
actor_rollout_ref.model.enable_gradient_checkpointing=True \
|
| 23 |
+
actor_rollout_ref.actor.fsdp_config.param_offload=False \
|
| 24 |
+
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
|
| 25 |
+
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=40 \
|
| 26 |
+
actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
|
| 27 |
+
actor_rollout_ref.rollout.name=vllm \
|
| 28 |
+
actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
|
| 29 |
+
actor_rollout_ref.rollout.n=5 \
|
| 30 |
+
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=40 \
|
| 31 |
+
actor_rollout_ref.ref.fsdp_config.param_offload=True \
|
| 32 |
+
algorithm.use_kl_in_reward=False \
|
| 33 |
+
trainer.critic_warmup=0 \
|
| 34 |
+
trainer.logger='["console","wandb"]' \
|
| 35 |
+
trainer.project_name='verl_grpo_example_gsm8k' \
|
| 36 |
+
trainer.experiment_name='qwen2_7b_function_rm' \
|
| 37 |
+
trainer.n_gpus_per_node=8 \
|
| 38 |
+
trainer.nnodes=1 \
|
| 39 |
+
trainer.save_freq=20 \
|
| 40 |
+
trainer.test_freq=5 \
|
| 41 |
+
trainer.total_epochs=15 $@
|
code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2-7b_math.sh
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
set -x
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
gsm8k_train_path=$HOME/data/gsm8k/train.parquet
|
| 5 |
+
gsm8k_test_path=$HOME/data/gsm8k/test.parquet
|
| 6 |
+
math_train_path=$HOME/data/math/train.parquet
|
| 7 |
+
math_test_path=$HOME/data/math/test.parquet
|
| 8 |
+
|
| 9 |
+
train_files="['$gsm8k_train_path', '$math_train_path']"
|
| 10 |
+
test_files="['$gsm8k_test_path', '$math_test_path']"
|
| 11 |
+
|
| 12 |
+
python3 -m verl.trainer.main_ppo \
|
| 13 |
+
algorithm.adv_estimator=grpo \
|
| 14 |
+
data.train_files="$train_files" \
|
| 15 |
+
data.val_files="$test_files" \
|
| 16 |
+
data.train_batch_size=1024 \
|
| 17 |
+
data.max_prompt_length=1024 \
|
| 18 |
+
data.max_response_length=1024 \
|
| 19 |
+
data.filter_overlong_prompts=True \
|
| 20 |
+
data.truncation='error' \
|
| 21 |
+
actor_rollout_ref.model.path=Qwen/Qwen2-7B-Instruct \
|
| 22 |
+
actor_rollout_ref.actor.optim.lr=1e-6 \
|
| 23 |
+
actor_rollout_ref.model.use_remove_padding=True \
|
| 24 |
+
actor_rollout_ref.actor.ppo_mini_batch_size=256 \
|
| 25 |
+
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \
|
| 26 |
+
actor_rollout_ref.actor.use_kl_loss=True \
|
| 27 |
+
actor_rollout_ref.actor.kl_loss_coef=0.001 \
|
| 28 |
+
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
|
| 29 |
+
actor_rollout_ref.actor.entropy_coeff=0 \
|
| 30 |
+
actor_rollout_ref.model.enable_gradient_checkpointing=True \
|
| 31 |
+
actor_rollout_ref.actor.fsdp_config.param_offload=False \
|
| 32 |
+
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
|
| 33 |
+
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \
|
| 34 |
+
actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
|
| 35 |
+
actor_rollout_ref.rollout.name=vllm \
|
| 36 |
+
actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
|
| 37 |
+
actor_rollout_ref.rollout.n=5 \
|
| 38 |
+
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \
|
| 39 |
+
actor_rollout_ref.ref.fsdp_config.param_offload=True \
|
| 40 |
+
algorithm.use_kl_in_reward=False \
|
| 41 |
+
trainer.critic_warmup=0 \
|
| 42 |
+
trainer.logger='["console","wandb"]' \
|
| 43 |
+
trainer.project_name='verl_grpo_example_gsm8k_math' \
|
| 44 |
+
trainer.experiment_name='qwen2_7b_function_rm' \
|
| 45 |
+
trainer.n_gpus_per_node=8 \
|
| 46 |
+
trainer.nnodes=1 \
|
| 47 |
+
trainer.save_freq=20 \
|
| 48 |
+
trainer.test_freq=5 \
|
| 49 |
+
trainer.total_epochs=15 $@
|
code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2-7b_math_megatron.sh
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
set -x
|
| 2 |
+
|
| 3 |
+
export CUDA_DEVICE_MAX_CONNECTIONS=1 # For megatron communication/computation overlapping
|
| 4 |
+
|
| 5 |
+
rollout_mode="async"
|
| 6 |
+
export VLLM_USE_V1=1
|
| 7 |
+
return_raw_chat="True"
|
| 8 |
+
|
| 9 |
+
gsm8k_train_path=$HOME/data/gsm8k/train.parquet
|
| 10 |
+
gsm8k_test_path=$HOME/data/gsm8k/test.parquet
|
| 11 |
+
math_train_path=$HOME/data/math/train.parquet
|
| 12 |
+
math_test_path=$HOME/data/math/test.parquet
|
| 13 |
+
|
| 14 |
+
train_files="['$gsm8k_train_path', '$math_train_path']"
|
| 15 |
+
test_files="['$gsm8k_test_path', '$math_test_path']"
|
| 16 |
+
|
| 17 |
+
USE_FUSED_KERNELS=True
|
| 18 |
+
|
| 19 |
+
python3 -m verl.trainer.main_ppo --config-path=config \
|
| 20 |
+
--config-name='ppo_megatron_trainer.yaml'\
|
| 21 |
+
algorithm.adv_estimator=grpo \
|
| 22 |
+
data.train_files="$train_files" \
|
| 23 |
+
data.val_files="$test_files" \
|
| 24 |
+
data.return_raw_chat=$return_raw_chat \
|
| 25 |
+
data.train_batch_size=1024 \
|
| 26 |
+
data.max_prompt_length=1024 \
|
| 27 |
+
data.max_response_length=1024 \
|
| 28 |
+
data.filter_overlong_prompts=True \
|
| 29 |
+
data.truncation='error' \
|
| 30 |
+
actor_rollout_ref.model.path=Qwen/Qwen2-7B-Instruct \
|
| 31 |
+
actor_rollout_ref.model.use_fused_kernels=$USE_FUSED_KERNELS \
|
| 32 |
+
actor_rollout_ref.actor.optim.lr=1e-6 \
|
| 33 |
+
actor_rollout_ref.actor.ppo_mini_batch_size=256 \
|
| 34 |
+
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
|
| 35 |
+
actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=2 \
|
| 36 |
+
actor_rollout_ref.actor.megatron.tensor_model_parallel_size=2 \
|
| 37 |
+
actor_rollout_ref.actor.use_kl_loss=True \
|
| 38 |
+
actor_rollout_ref.actor.kl_loss_coef=0.001 \
|
| 39 |
+
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
|
| 40 |
+
actor_rollout_ref.actor.entropy_coeff=0 \
|
| 41 |
+
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
|
| 42 |
+
actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
|
| 43 |
+
actor_rollout_ref.rollout.name=vllm \
|
| 44 |
+
actor_rollout_ref.rollout.mode=$rollout_mode \
|
| 45 |
+
actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
|
| 46 |
+
actor_rollout_ref.rollout.n=5 \
|
| 47 |
+
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
|
| 48 |
+
actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=2 \
|
| 49 |
+
actor_rollout_ref.ref.megatron.tensor_model_parallel_size=2 \
|
| 50 |
+
algorithm.use_kl_in_reward=False \
|
| 51 |
+
trainer.critic_warmup=0 \
|
| 52 |
+
trainer.logger='["console","wandb"]' \
|
| 53 |
+
trainer.project_name='verl_grpo_example_gsm8k_math' \
|
| 54 |
+
trainer.experiment_name='qwen2_7b_megatron' \
|
| 55 |
+
trainer.n_gpus_per_node=8 \
|
| 56 |
+
trainer.nnodes=1 \
|
| 57 |
+
trainer.save_freq=20 \
|
| 58 |
+
trainer.test_freq=5 \
|
| 59 |
+
trainer.total_epochs=15 $@
|
code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2-7b_math_megatron_lora.sh
ADDED
|
@@ -0,0 +1,122 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
set -xeuo pipefail
|
| 3 |
+
|
| 4 |
+
# Need to install Megatron-Bridge
|
| 5 |
+
# NOTE: Make sure you use Megatron-Bridge later than 0.2.0
|
| 6 |
+
# (Recommend https://github.com/NVIDIA-NeMo/Megatron-Bridge/commit/83a7c1134c562d8c6decd10a1f0a6e6a7a8a3a44 or later)
|
| 7 |
+
# for proper MoE LoRA support.
|
| 8 |
+
|
| 9 |
+
# For Megatron communication/computation overlapping
|
| 10 |
+
export CUDA_DEVICE_MAX_CONNECTIONS=1
|
| 11 |
+
|
| 12 |
+
############################ Quick Config ############################
|
| 13 |
+
|
| 14 |
+
rollout_name="vllm" # sglang or vllm
|
| 15 |
+
project_name='verl_grpo_example_gsm8k_math'
|
| 16 |
+
exp_name='qwen2_7b_megatron_lora'
|
| 17 |
+
|
| 18 |
+
adv_estimator=grpo
|
| 19 |
+
|
| 20 |
+
max_prompt_length=1024
|
| 21 |
+
max_response_length=1024
|
| 22 |
+
train_prompt_bsz=128
|
| 23 |
+
|
| 24 |
+
############################ Paths ############################
|
| 25 |
+
|
| 26 |
+
gsm8k_train_path=$HOME/data/gsm8k/train.parquet
|
| 27 |
+
gsm8k_test_path=$HOME/data/gsm8k/test.parquet
|
| 28 |
+
math_train_path=$HOME/data/math/train.parquet
|
| 29 |
+
math_test_path=$HOME/data/math/test.parquet
|
| 30 |
+
|
| 31 |
+
train_files="['$gsm8k_train_path', '$math_train_path']"
|
| 32 |
+
test_files="['$gsm8k_test_path', '$math_test_path']"
|
| 33 |
+
|
| 34 |
+
############################ Parameter Groups ############################
|
| 35 |
+
|
| 36 |
+
DATA=(
|
| 37 |
+
data.train_files="$train_files"
|
| 38 |
+
data.val_files="$test_files"
|
| 39 |
+
data.max_prompt_length=$max_prompt_length
|
| 40 |
+
data.max_response_length=$max_response_length
|
| 41 |
+
data.train_batch_size=$train_prompt_bsz
|
| 42 |
+
data.filter_overlong_prompts=True
|
| 43 |
+
data.truncation='error'
|
| 44 |
+
data.shuffle=False
|
| 45 |
+
)
|
| 46 |
+
|
| 47 |
+
MODEL=(
|
| 48 |
+
actor_rollout_ref.model.path=Qwen/Qwen2-7B-Instruct
|
| 49 |
+
actor_rollout_ref.model.lora.rank=256
|
| 50 |
+
actor_rollout_ref.model.lora.alpha=512
|
| 51 |
+
actor_rollout_ref.model.lora.lora_A_init_method=kaiming
|
| 52 |
+
# # Optional: Use canonical LoRA
|
| 53 |
+
# actor_rollout_ref.model.lora.type="canonical_lora"
|
| 54 |
+
# actor_rollout_ref.model.lora.target_modules='["linear_q","linear_k","linear_v","linear_proj","linear_fc1_up","linear_fc1_gate","linear_fc2"]'
|
| 55 |
+
|
| 56 |
+
# # Optional: Add dropout to LoRA layers
|
| 57 |
+
# actor_rollout_ref.model.lora.dropout=0.05
|
| 58 |
+
# actor_rollout_ref.model.lora.dropout_position=pre
|
| 59 |
+
)
|
| 60 |
+
|
| 61 |
+
ACTOR=(
|
| 62 |
+
actor_rollout_ref.actor.optim.lr=1e-6
|
| 63 |
+
actor_rollout_ref.actor.ppo_mini_batch_size=16
|
| 64 |
+
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2
|
| 65 |
+
actor_rollout_ref.actor.use_dynamic_bsz=True
|
| 66 |
+
actor_rollout_ref.actor.megatron.use_mbridge=True
|
| 67 |
+
actor_rollout_ref.actor.megatron.vanilla_mbridge=False
|
| 68 |
+
actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=1
|
| 69 |
+
actor_rollout_ref.actor.megatron.tensor_model_parallel_size=4
|
| 70 |
+
actor_rollout_ref.actor.use_kl_loss=True
|
| 71 |
+
actor_rollout_ref.actor.kl_loss_coef=0.001
|
| 72 |
+
actor_rollout_ref.actor.kl_loss_type=low_var_kl
|
| 73 |
+
actor_rollout_ref.actor.entropy_coeff=0
|
| 74 |
+
+actor_rollout_ref.actor.megatron.override_transformer_config.recompute_method=uniform
|
| 75 |
+
+actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=full
|
| 76 |
+
+actor_rollout_ref.actor.megatron.override_transformer_config.recompute_num_layers=1
|
| 77 |
+
)
|
| 78 |
+
|
| 79 |
+
ROLLOUT=(
|
| 80 |
+
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4
|
| 81 |
+
actor_rollout_ref.rollout.tensor_model_parallel_size=2
|
| 82 |
+
actor_rollout_ref.rollout.name=$rollout_name
|
| 83 |
+
actor_rollout_ref.rollout.gpu_memory_utilization=0.6
|
| 84 |
+
actor_rollout_ref.rollout.n=4
|
| 85 |
+
)
|
| 86 |
+
|
| 87 |
+
REF=(
|
| 88 |
+
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4
|
| 89 |
+
actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=1
|
| 90 |
+
actor_rollout_ref.ref.megatron.tensor_model_parallel_size=4
|
| 91 |
+
)
|
| 92 |
+
|
| 93 |
+
ALGORITHM=(
|
| 94 |
+
algorithm.adv_estimator=$adv_estimator
|
| 95 |
+
algorithm.use_kl_in_reward=False
|
| 96 |
+
)
|
| 97 |
+
|
| 98 |
+
TRAINER=(
|
| 99 |
+
trainer.logger='["console","wandb"]'
|
| 100 |
+
trainer.project_name=$project_name
|
| 101 |
+
trainer.experiment_name=$exp_name
|
| 102 |
+
trainer.n_gpus_per_node=8
|
| 103 |
+
trainer.nnodes=1
|
| 104 |
+
trainer.save_freq=20
|
| 105 |
+
trainer.test_freq=5
|
| 106 |
+
trainer.total_epochs=15
|
| 107 |
+
trainer.val_before_train=False
|
| 108 |
+
)
|
| 109 |
+
|
| 110 |
+
############################ Launch ############################
|
| 111 |
+
|
| 112 |
+
python3 -m verl.trainer.main_ppo \
|
| 113 |
+
--config-path=config \
|
| 114 |
+
--config-name='ppo_megatron_trainer.yaml' \
|
| 115 |
+
"${DATA[@]}" \
|
| 116 |
+
"${ALGORITHM[@]}" \
|
| 117 |
+
"${MODEL[@]}" \
|
| 118 |
+
"${ROLLOUT[@]}" \
|
| 119 |
+
"${ACTOR[@]}" \
|
| 120 |
+
"${REF[@]}" \
|
| 121 |
+
"${TRAINER[@]}" \
|
| 122 |
+
"$@"
|
code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2-7b_math_megatron_trtllm.sh
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
set -x
|
| 2 |
+
|
| 3 |
+
export CUDA_DEVICE_MAX_CONNECTIONS=1 # For megatron communication/computation overlapping
|
| 4 |
+
|
| 5 |
+
# Clean all slurm / MPI / PMIx env to avoid pmix mismatch error
|
| 6 |
+
for v in $(env | awk -F= '/^(PMI|PMIX|MPI|OMPI|SLURM)_/{print $1}'); do
|
| 7 |
+
unset "$v"
|
| 8 |
+
done
|
| 9 |
+
|
| 10 |
+
export RAY_DEDUP_LOGS=0
|
| 11 |
+
|
| 12 |
+
# -----
|
| 13 |
+
# Config
|
| 14 |
+
# -----
|
| 15 |
+
TP=${1:-4}
|
| 16 |
+
ACTOR_TP=${ACTOR_TP:-4}
|
| 17 |
+
PROJECT_NAME=${PROJECT_NAME:-"verl_grpo_example_gsm8k_math"}
|
| 18 |
+
EXP_NAME=megatron-trtllm-qwen2-7b-tp${TP}-8gpus
|
| 19 |
+
|
| 20 |
+
if [ $TP -eq 4 ]; then
|
| 21 |
+
MAX_BATCH_SIZE=1024
|
| 22 |
+
else
|
| 23 |
+
MAX_BATCH_SIZE=384
|
| 24 |
+
fi
|
| 25 |
+
|
| 26 |
+
# -----
|
| 27 |
+
# Data
|
| 28 |
+
# -----
|
| 29 |
+
DATADIR=${DATADIR:-$PWD/data}
|
| 30 |
+
|
| 31 |
+
GSM8K_TRAIN_PATH=${DATADIR}/gsm8k/train.parquet
|
| 32 |
+
GSM8K_TEST_PATH=${DATADIR}/gsm8k/test.parquet
|
| 33 |
+
MATH_TRAIN_PATH=${DATADIR}/math/train.parquet
|
| 34 |
+
MATH_TEST_PATH=${DATADIR}/math/test.parquet
|
| 35 |
+
|
| 36 |
+
TRAIN_FILES="['$GSM8K_TRAIN_PATH', '$MATH_TRAIN_PATH']"
|
| 37 |
+
TEST_FILES="['$GSM8K_TEST_PATH', '$MATH_TEST_PATH']"
|
| 38 |
+
|
| 39 |
+
USE_FUSED_KERNELS=True
|
| 40 |
+
|
| 41 |
+
# -----
|
| 42 |
+
# Launch
|
| 43 |
+
# -----
|
| 44 |
+
python3 -m verl.trainer.main_ppo --config-path=config \
|
| 45 |
+
--config-name='ppo_megatron_trainer.yaml' \
|
| 46 |
+
algorithm.adv_estimator=grpo \
|
| 47 |
+
data.train_files="$TRAIN_FILES" \
|
| 48 |
+
data.val_files="$TEST_FILES" \
|
| 49 |
+
data.return_raw_chat=True \
|
| 50 |
+
data.train_batch_size=1024 \
|
| 51 |
+
data.max_prompt_length=2048 \
|
| 52 |
+
data.max_response_length=1024 \
|
| 53 |
+
data.filter_overlong_prompts=True \
|
| 54 |
+
data.truncation='error' \
|
| 55 |
+
actor_rollout_ref.model.path=Qwen/Qwen2-7B-Instruct \
|
| 56 |
+
actor_rollout_ref.model.use_fused_kernels=$USE_FUSED_KERNELS \
|
| 57 |
+
actor_rollout_ref.actor.optim.lr=1e-6 \
|
| 58 |
+
actor_rollout_ref.actor.ppo_mini_batch_size=256 \
|
| 59 |
+
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=8 \
|
| 60 |
+
actor_rollout_ref.actor.megatron.use_mbridge=True \
|
| 61 |
+
actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${ACTOR_TP} \
|
| 62 |
+
actor_rollout_ref.actor.use_kl_loss=True \
|
| 63 |
+
actor_rollout_ref.actor.kl_loss_coef=0.001 \
|
| 64 |
+
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
|
| 65 |
+
actor_rollout_ref.actor.entropy_coeff=0 \
|
| 66 |
+
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8 \
|
| 67 |
+
actor_rollout_ref.rollout.tensor_model_parallel_size=${TP} \
|
| 68 |
+
actor_rollout_ref.rollout.name=trtllm \
|
| 69 |
+
actor_rollout_ref.rollout.mode="async" \
|
| 70 |
+
actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
|
| 71 |
+
actor_rollout_ref.rollout.n=5 \
|
| 72 |
+
actor_rollout_ref.rollout.max_num_seqs=${MAX_BATCH_SIZE} \
|
| 73 |
+
actor_rollout_ref.rollout.max_num_batched_tokens=32768 \
|
| 74 |
+
actor_rollout_ref.rollout.checkpoint_engine.update_weights_bucket_megabytes=4096 \
|
| 75 |
+
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=8 \
|
| 76 |
+
actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${ACTOR_TP} \
|
| 77 |
+
+actor_rollout_ref.rollout.engine_kwargs.trtllm.batch_wait_timeout_iters=32 \
|
| 78 |
+
+actor_rollout_ref.rollout.engine_kwargs.trtllm.batch_wait_max_tokens_ratio=0.5 \
|
| 79 |
+
actor_rollout_ref.rollout.calculate_log_probs=True \
|
| 80 |
+
algorithm.use_kl_in_reward=False \
|
| 81 |
+
trainer.critic_warmup=0 \
|
| 82 |
+
trainer.logger='["console","wandb"]' \
|
| 83 |
+
trainer.project_name="${PROJECT_NAME}" \
|
| 84 |
+
trainer.experiment_name=${EXP_NAME} \
|
| 85 |
+
trainer.n_gpus_per_node=8 \
|
| 86 |
+
trainer.nnodes=1 \
|
| 87 |
+
trainer.save_freq=-1 \
|
| 88 |
+
trainer.test_freq=5 \
|
| 89 |
+
trainer.resume_mode=disable \
|
| 90 |
+
trainer.total_epochs=15 \
|
| 91 |
+
"${@:2}"
|
code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2-7b_math_trtllm.sh
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
set -x
|
| 2 |
+
|
| 3 |
+
# Clean all slurm / MPI / PMIx env to avoid pmix mismatch error
|
| 4 |
+
for v in $(env | awk -F= '/^(PMI|PMIX|MPI|OMPI|SLURM)_/{print $1}'); do
|
| 5 |
+
unset "$v"
|
| 6 |
+
done
|
| 7 |
+
|
| 8 |
+
export RAY_DEDUP_LOGS=0
|
| 9 |
+
|
| 10 |
+
# -----
|
| 11 |
+
# Config
|
| 12 |
+
# -----
|
| 13 |
+
TP=${1:-4}
|
| 14 |
+
PROJECT_NAME=${PROJECT_NAME:-"verl_grpo_example_gsm8k_math"}
|
| 15 |
+
EXP_NAME=trtllm-qwen2-7b-tp${TP}-8gpus${EXP_NAME_SUFFIX:+"-"}${EXP_NAME_SUFFIX}
|
| 16 |
+
|
| 17 |
+
if [ $TP -eq 4 ]; then
|
| 18 |
+
MAX_BATCH_SIZE=1024
|
| 19 |
+
else
|
| 20 |
+
MAX_BATCH_SIZE=384
|
| 21 |
+
fi
|
| 22 |
+
|
| 23 |
+
# -----
|
| 24 |
+
# Data
|
| 25 |
+
# -----
|
| 26 |
+
DATADIR=${DATADIR:-$PWD/data}
|
| 27 |
+
MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen2-7B-Instruct"}
|
| 28 |
+
|
| 29 |
+
GSM8K_TRAIN_PATH=${DATADIR}/gsm8k/train.parquet
|
| 30 |
+
GSM8K_TEST_PATH=${DATADIR}/gsm8k/test.parquet
|
| 31 |
+
MATH_TRAIN_PATH=${DATADIR}/math/train.parquet
|
| 32 |
+
MATH_TEST_PATH=${DATADIR}/math/test.parquet
|
| 33 |
+
|
| 34 |
+
TRAIN_FILES="['$GSM8K_TRAIN_PATH', '$MATH_TRAIN_PATH']"
|
| 35 |
+
TEST_FILES="['$GSM8K_TEST_PATH', '$MATH_TEST_PATH']"
|
| 36 |
+
|
| 37 |
+
# -----
|
| 38 |
+
# Launch
|
| 39 |
+
# -----
|
| 40 |
+
python3 -m verl.trainer.main_ppo \
|
| 41 |
+
algorithm.adv_estimator=grpo \
|
| 42 |
+
algorithm.rollout_correction.rollout_is_threshold=2.0 \
|
| 43 |
+
data.train_files="$TRAIN_FILES" \
|
| 44 |
+
data.val_files="$TEST_FILES" \
|
| 45 |
+
data.train_batch_size=1024 \
|
| 46 |
+
data.max_prompt_length=2048 \
|
| 47 |
+
data.max_response_length=1024 \
|
| 48 |
+
data.return_raw_chat=True \
|
| 49 |
+
data.filter_overlong_prompts=True \
|
| 50 |
+
data.truncation='error' \
|
| 51 |
+
actor_rollout_ref.hybrid_engine=True \
|
| 52 |
+
actor_rollout_ref.model.path=${MODEL_PATH} \
|
| 53 |
+
actor_rollout_ref.actor.optim.lr=1e-6 \
|
| 54 |
+
actor_rollout_ref.model.use_remove_padding=True \
|
| 55 |
+
actor_rollout_ref.actor.ppo_mini_batch_size=256 \
|
| 56 |
+
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \
|
| 57 |
+
actor_rollout_ref.actor.use_kl_loss=True \
|
| 58 |
+
actor_rollout_ref.actor.kl_loss_coef=0.001 \
|
| 59 |
+
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
|
| 60 |
+
actor_rollout_ref.actor.entropy_coeff=0 \
|
| 61 |
+
actor_rollout_ref.model.enable_gradient_checkpointing=True \
|
| 62 |
+
actor_rollout_ref.actor.fsdp_config.param_offload=False \
|
| 63 |
+
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
|
| 64 |
+
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \
|
| 65 |
+
actor_rollout_ref.rollout.tensor_model_parallel_size=${TP} \
|
| 66 |
+
actor_rollout_ref.rollout.name=trtllm \
|
| 67 |
+
actor_rollout_ref.rollout.mode="async" \
|
| 68 |
+
actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
|
| 69 |
+
actor_rollout_ref.rollout.n=5 \
|
| 70 |
+
actor_rollout_ref.rollout.max_num_seqs=${MAX_BATCH_SIZE} \
|
| 71 |
+
actor_rollout_ref.rollout.max_num_batched_tokens=32768 \
|
| 72 |
+
+actor_rollout_ref.rollout.engine_kwargs.trtllm.batch_wait_timeout_iters=32 \
|
| 73 |
+
+actor_rollout_ref.rollout.engine_kwargs.trtllm.batch_wait_max_tokens_ratio=0.5 \
|
| 74 |
+
actor_rollout_ref.rollout.calculate_log_probs=True \
|
| 75 |
+
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \
|
| 76 |
+
actor_rollout_ref.ref.fsdp_config.param_offload=True \
|
| 77 |
+
actor_rollout_ref.rollout.checkpoint_engine.update_weights_bucket_megabytes=4096 \
|
| 78 |
+
algorithm.use_kl_in_reward=False \
|
| 79 |
+
trainer.critic_warmup=0 \
|
| 80 |
+
trainer.logger='["console","wandb"]' \
|
| 81 |
+
trainer.project_name="${PROJECT_NAME}" \
|
| 82 |
+
trainer.experiment_name=${EXP_NAME} \
|
| 83 |
+
trainer.n_gpus_per_node=8 \
|
| 84 |
+
trainer.nnodes=1 \
|
| 85 |
+
trainer.save_freq=-1 \
|
| 86 |
+
trainer.test_freq=5 \
|
| 87 |
+
trainer.resume_mode=disable \
|
| 88 |
+
trainer.total_epochs=15 \
|
| 89 |
+
"${@:2}"
|
code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2-7b_seq_balance.sh
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
set -x
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
# For async rollout mode, dataset should return raw chat.
|
| 5 |
+
rollout_mode="async"
|
| 6 |
+
rollout_name="sglang" # sglang or vllm
|
| 7 |
+
return_raw_chat="True"
|
| 8 |
+
if [ "$rollout_name" = "vllm" ]; then
|
| 9 |
+
export VLLM_USE_V1=1
|
| 10 |
+
fi
|
| 11 |
+
|
| 12 |
+
python3 -m verl.trainer.main_ppo \
|
| 13 |
+
algorithm.adv_estimator=grpo \
|
| 14 |
+
data.train_files=$HOME/data/gsm8k/train.parquet \
|
| 15 |
+
data.val_files=$HOME/data/gsm8k/test.parquet \
|
| 16 |
+
data.return_raw_chat=$return_raw_chat \
|
| 17 |
+
data.train_batch_size=1024 \
|
| 18 |
+
data.max_prompt_length=512 \
|
| 19 |
+
data.max_response_length=1024 \
|
| 20 |
+
data.filter_overlong_prompts=True \
|
| 21 |
+
data.truncation='error' \
|
| 22 |
+
actor_rollout_ref.model.path=Qwen/Qwen2-7B-Instruct \
|
| 23 |
+
actor_rollout_ref.actor.optim.lr=1e-6 \
|
| 24 |
+
actor_rollout_ref.model.use_remove_padding=True \
|
| 25 |
+
actor_rollout_ref.actor.ppo_mini_batch_size=256 \
|
| 26 |
+
actor_rollout_ref.actor.use_dynamic_bsz=True \
|
| 27 |
+
actor_rollout_ref.actor.ppo_max_token_len_per_gpu=24000 \
|
| 28 |
+
actor_rollout_ref.actor.use_kl_loss=True \
|
| 29 |
+
actor_rollout_ref.actor.kl_loss_coef=0.001 \
|
| 30 |
+
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
|
| 31 |
+
actor_rollout_ref.actor.entropy_coeff=0 \
|
| 32 |
+
actor_rollout_ref.model.enable_gradient_checkpointing=True \
|
| 33 |
+
actor_rollout_ref.actor.fsdp_config.param_offload=False \
|
| 34 |
+
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
|
| 35 |
+
actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
|
| 36 |
+
actor_rollout_ref.rollout.name=$rollout_name \
|
| 37 |
+
actor_rollout_ref.rollout.mode=$rollout_mode \
|
| 38 |
+
actor_rollout_ref.rollout.multi_turn.format=hermes \
|
| 39 |
+
actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
|
| 40 |
+
actor_rollout_ref.rollout.n=5 \
|
| 41 |
+
actor_rollout_ref.ref.fsdp_config.param_offload=True \
|
| 42 |
+
algorithm.use_kl_in_reward=False \
|
| 43 |
+
trainer.critic_warmup=0 \
|
| 44 |
+
trainer.logger='["console","wandb"]' \
|
| 45 |
+
trainer.project_name='verl_grpo_example_gsm8k' \
|
| 46 |
+
trainer.experiment_name='qwen2_7b_function_rm_kl1e-3' \
|
| 47 |
+
trainer.val_before_train=False \
|
| 48 |
+
trainer.n_gpus_per_node=8 \
|
| 49 |
+
trainer.nnodes=1 \
|
| 50 |
+
trainer.save_freq=20 \
|
| 51 |
+
trainer.test_freq=5 \
|
| 52 |
+
trainer.total_epochs=15 $@
|
code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2-7b_seq_balance_math_megatron.sh
ADDED
|
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
set -x
|
| 2 |
+
|
| 3 |
+
export CUDA_DEVICE_MAX_CONNECTIONS=1 # For megatron communication/computation overlapping
|
| 4 |
+
|
| 5 |
+
gsm8k_train_path=$HOME/data/gsm8k/train.parquet
|
| 6 |
+
gsm8k_test_path=$HOME/data/gsm8k/test.parquet
|
| 7 |
+
math_train_path=$HOME/data/math/train.parquet
|
| 8 |
+
math_test_path=$HOME/data/math/test.parquet
|
| 9 |
+
|
| 10 |
+
train_files="['$gsm8k_train_path', '$math_train_path']"
|
| 11 |
+
test_files="['$gsm8k_test_path', '$math_test_path']"
|
| 12 |
+
|
| 13 |
+
offload=True
|
| 14 |
+
|
| 15 |
+
python3 -m verl.trainer.main_ppo --config-path=config \
|
| 16 |
+
--config-name='ppo_megatron_trainer.yaml'\
|
| 17 |
+
algorithm.adv_estimator=grpo \
|
| 18 |
+
data.train_files="$train_files" \
|
| 19 |
+
data.val_files="$test_files" \
|
| 20 |
+
data.train_batch_size=1024 \
|
| 21 |
+
data.max_prompt_length=1024 \
|
| 22 |
+
data.max_response_length=1024 \
|
| 23 |
+
data.filter_overlong_prompts=True \
|
| 24 |
+
data.truncation='error' \
|
| 25 |
+
actor_rollout_ref.model.path=Qwen/Qwen2-7B-Instruct \
|
| 26 |
+
actor_rollout_ref.actor.optim.lr=1e-6 \
|
| 27 |
+
actor_rollout_ref.actor.ppo_mini_batch_size=256 \
|
| 28 |
+
actor_rollout_ref.actor.use_dynamic_bsz=True \
|
| 29 |
+
actor_rollout_ref.actor.ppo_max_token_len_per_gpu=12000 \
|
| 30 |
+
actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=2 \
|
| 31 |
+
actor_rollout_ref.actor.megatron.tensor_model_parallel_size=2 \
|
| 32 |
+
actor_rollout_ref.actor.use_kl_loss=True \
|
| 33 |
+
actor_rollout_ref.actor.kl_loss_coef=0.001 \
|
| 34 |
+
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
|
| 35 |
+
actor_rollout_ref.actor.entropy_coeff=0 \
|
| 36 |
+
actor_rollout_ref.actor.megatron.param_offload=${offload} \
|
| 37 |
+
actor_rollout_ref.actor.megatron.optimizer_offload=${offload} \
|
| 38 |
+
actor_rollout_ref.actor.megatron.grad_offload=${offload} \
|
| 39 |
+
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
|
| 40 |
+
actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
|
| 41 |
+
actor_rollout_ref.ref.megatron.param_offload=${offload} \
|
| 42 |
+
actor_rollout_ref.rollout.name=vllm \
|
| 43 |
+
actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
|
| 44 |
+
actor_rollout_ref.rollout.n=5 \
|
| 45 |
+
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
|
| 46 |
+
actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=2 \
|
| 47 |
+
actor_rollout_ref.ref.megatron.tensor_model_parallel_size=2 \
|
| 48 |
+
algorithm.use_kl_in_reward=False \
|
| 49 |
+
trainer.critic_warmup=0 \
|
| 50 |
+
trainer.logger='["console","wandb"]' \
|
| 51 |
+
trainer.project_name='verl_grpo_example_gsm8k_math' \
|
| 52 |
+
trainer.experiment_name='qwen2_7b_megatron' \
|
| 53 |
+
trainer.n_gpus_per_node=8 \
|
| 54 |
+
trainer.nnodes=1 \
|
| 55 |
+
trainer.save_freq=20 \
|
| 56 |
+
trainer.test_freq=5 \
|
| 57 |
+
trainer.total_epochs=15 $@
|
code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2_5-3b_gsm8k_grpo_lora.sh
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
set -x
|
| 2 |
+
|
| 3 |
+
python3 -m verl.trainer.main_ppo \
|
| 4 |
+
algorithm.adv_estimator=grpo \
|
| 5 |
+
trainer.val_before_train=False \
|
| 6 |
+
data.train_files=$HOME/data/gsm8k/train.parquet \
|
| 7 |
+
data.val_files=$HOME/data/gsm8k/test.parquet \
|
| 8 |
+
data.train_batch_size=16 \
|
| 9 |
+
data.max_prompt_length=512 \
|
| 10 |
+
data.max_response_length=1024 \
|
| 11 |
+
data.filter_overlong_prompts=True \
|
| 12 |
+
data.truncation='error' \
|
| 13 |
+
data.shuffle=False \
|
| 14 |
+
actor_rollout_ref.model.path=Qwen/Qwen2.5-3B-Instruct \
|
| 15 |
+
actor_rollout_ref.model.lora_rank=64 \
|
| 16 |
+
actor_rollout_ref.model.lora_alpha=32 \
|
| 17 |
+
actor_rollout_ref.actor.optim.lr=3e-6 \
|
| 18 |
+
actor_rollout_ref.model.use_remove_padding=True \
|
| 19 |
+
actor_rollout_ref.actor.ppo_mini_batch_size=16 \
|
| 20 |
+
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=40 \
|
| 21 |
+
actor_rollout_ref.actor.use_kl_loss=True \
|
| 22 |
+
actor_rollout_ref.actor.kl_loss_coef=0.001 \
|
| 23 |
+
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
|
| 24 |
+
actor_rollout_ref.actor.entropy_coeff=0 \
|
| 25 |
+
actor_rollout_ref.model.enable_gradient_checkpointing=True \
|
| 26 |
+
actor_rollout_ref.actor.fsdp_config.param_offload=False \
|
| 27 |
+
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
|
| 28 |
+
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=40 \
|
| 29 |
+
actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
|
| 30 |
+
actor_rollout_ref.rollout.name=vllm \
|
| 31 |
+
actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
|
| 32 |
+
actor_rollout_ref.rollout.n=5 \
|
| 33 |
+
actor_rollout_ref.rollout.load_format=safetensors \
|
| 34 |
+
actor_rollout_ref.rollout.layered_summon=True \
|
| 35 |
+
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=40 \
|
| 36 |
+
actor_rollout_ref.ref.fsdp_config.param_offload=True \
|
| 37 |
+
algorithm.use_kl_in_reward=False \
|
| 38 |
+
trainer.critic_warmup=0 \
|
| 39 |
+
trainer.logger='["console","wandb"]' \
|
| 40 |
+
trainer.project_name='verl_grpo_example_gsm8k' \
|
| 41 |
+
trainer.experiment_name='qwen2.5_3b_grpo_lora' \
|
| 42 |
+
trainer.n_gpus_per_node=2 \
|
| 43 |
+
trainer.nnodes=1 \
|
| 44 |
+
trainer.save_freq=20 \
|
| 45 |
+
trainer.test_freq=5 \
|
| 46 |
+
trainer.total_epochs=15 $@
|
| 47 |
+
|
| 48 |
+
# actor_rollout_ref.actor.ppo_mini_batch_size=256 \
|
| 49 |
+
# data.train_batch_size=1024 \
|
| 50 |
+
# trainer.n_gpus_per_node=8 \
|
| 51 |
+
# actor_rollout_ref.model.use_shm=True \
|
code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2_5-7b_math_megatron_diff_tp.sh
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
set -x
|
| 2 |
+
|
| 3 |
+
export CUDA_DEVICE_MAX_CONNECTIONS=1 # For megatron communication/computation overlapping
|
| 4 |
+
|
| 5 |
+
gsm8k_train_path=$HOME/data/gsm8k/train.parquet
|
| 6 |
+
gsm8k_test_path=$HOME/data/gsm8k/test.parquet
|
| 7 |
+
math_train_path=$HOME/data/math/train.parquet
|
| 8 |
+
math_test_path=$HOME/data/math/test.parquet
|
| 9 |
+
|
| 10 |
+
train_files="['$gsm8k_train_path', '$math_train_path']"
|
| 11 |
+
test_files="['$gsm8k_test_path', '$math_test_path']"
|
| 12 |
+
|
| 13 |
+
python3 -m verl.trainer.main_ppo --config-path=config \
|
| 14 |
+
--config-name='ppo_megatron_trainer.yaml'\
|
| 15 |
+
algorithm.adv_estimator=grpo \
|
| 16 |
+
data.train_files="$train_files" \
|
| 17 |
+
data.val_files="$test_files" \
|
| 18 |
+
data.train_batch_size=1024 \
|
| 19 |
+
data.max_prompt_length=1024 \
|
| 20 |
+
data.max_response_length=1024 \
|
| 21 |
+
data.filter_overlong_prompts=True \
|
| 22 |
+
data.truncation='error' \
|
| 23 |
+
actor_rollout_ref.model.path=Qwen/Qwen2.5-7B-Instruct \
|
| 24 |
+
actor_rollout_ref.actor.optim.lr=1e-6 \
|
| 25 |
+
actor_rollout_ref.actor.ppo_mini_batch_size=256 \
|
| 26 |
+
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
|
| 27 |
+
actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=2 \
|
| 28 |
+
actor_rollout_ref.actor.megatron.tensor_model_parallel_size=2 \
|
| 29 |
+
actor_rollout_ref.actor.use_kl_loss=True \
|
| 30 |
+
actor_rollout_ref.actor.kl_loss_coef=0.001 \
|
| 31 |
+
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
|
| 32 |
+
actor_rollout_ref.actor.entropy_coeff=0 \
|
| 33 |
+
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
|
| 34 |
+
actor_rollout_ref.rollout.tensor_model_parallel_size=4 \
|
| 35 |
+
actor_rollout_ref.rollout.name=vllm \
|
| 36 |
+
actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
|
| 37 |
+
actor_rollout_ref.rollout.n=5 \
|
| 38 |
+
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
|
| 39 |
+
actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=2 \
|
| 40 |
+
actor_rollout_ref.ref.megatron.tensor_model_parallel_size=2 \
|
| 41 |
+
algorithm.use_kl_in_reward=False \
|
| 42 |
+
trainer.critic_warmup=0 \
|
| 43 |
+
trainer.logger='["console","wandb"]' \
|
| 44 |
+
trainer.project_name='verl_grpo_example_gsm8k_math' \
|
| 45 |
+
trainer.experiment_name='qwen2_7b_megatron' \
|
| 46 |
+
trainer.n_gpus_per_node=8 \
|
| 47 |
+
trainer.nnodes=1 \
|
| 48 |
+
trainer.save_freq=20 \
|
| 49 |
+
trainer.test_freq=5 \
|
| 50 |
+
trainer.total_epochs=15 $@
|
code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2_5_32b_grpo_npu.sh
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
set -x
|
| 2 |
+
|
| 3 |
+
python3 -m verl.trainer.main_ppo \
|
| 4 |
+
algorithm.adv_estimator=grpo \
|
| 5 |
+
data.train_files=$HOME/data/gsm8k/train.parquet \
|
| 6 |
+
data.val_files=$HOME/data/gsm8k/test.parquet \
|
| 7 |
+
data.train_batch_size=1024 \
|
| 8 |
+
data.max_prompt_length=1024 \
|
| 9 |
+
data.max_response_length=1024 \
|
| 10 |
+
data.filter_overlong_prompts=True \
|
| 11 |
+
data.truncation='error' \
|
| 12 |
+
actor_rollout_ref.model.path=Qwen/Qwen2.5-32B-Instruct \
|
| 13 |
+
actor_rollout_ref.actor.optim.lr=1e-6\
|
| 14 |
+
actor_rollout_ref.model.use_remove_padding=False \
|
| 15 |
+
actor_rollout_ref.actor.ppo_mini_batch_size=128 \
|
| 16 |
+
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \
|
| 17 |
+
actor_rollout_ref.actor.use_kl_loss=True \
|
| 18 |
+
actor_rollout_ref.actor.entropy_coeff=0 \
|
| 19 |
+
actor_rollout_ref.actor.kl_loss_coef=0.001 \
|
| 20 |
+
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
|
| 21 |
+
actor_rollout_ref.model.enable_gradient_checkpointing=True \
|
| 22 |
+
actor_rollout_ref.actor.fsdp_config.param_offload=False \
|
| 23 |
+
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
|
| 24 |
+
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2 \
|
| 25 |
+
actor_rollout_ref.rollout.tensor_model_parallel_size=8 \
|
| 26 |
+
actor_rollout_ref.rollout.name=vllm \
|
| 27 |
+
actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
|
| 28 |
+
actor_rollout_ref.rollout.n=5 \
|
| 29 |
+
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=2 \
|
| 30 |
+
actor_rollout_ref.ref.fsdp_config.param_offload=True \
|
| 31 |
+
algorithm.use_kl_in_reward=False \
|
| 32 |
+
trainer.critic_warmup=0 \
|
| 33 |
+
trainer.logger=console \
|
| 34 |
+
trainer.project_name='verl_grpo_example_gsm8k' \
|
| 35 |
+
trainer.experiment_name='qwen2_5_32b_function_rm' \
|
| 36 |
+
trainer.n_gpus_per_node=16 \
|
| 37 |
+
trainer.nnodes=2 \
|
| 38 |
+
trainer.save_freq=-1 \
|
| 39 |
+
trainer.test_freq=10 \
|
| 40 |
+
trainer.total_epochs=15 $@
|
code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2_5_7b_grpo_discrete_prof_npu.sh
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
set -x
|
| 2 |
+
|
| 3 |
+
# profiling configuration
|
| 4 |
+
PROFILE_STEPS="[2,4]"
|
| 5 |
+
PROFILE_RANKS_ALL=False
|
| 6 |
+
DISCRETE=True
|
| 7 |
+
PROFILE_RANKS="[1,2]"
|
| 8 |
+
|
| 9 |
+
# profiling NPU options
|
| 10 |
+
SAVE_PATH="$HOME/profile_data"
|
| 11 |
+
LEVEL="level0"
|
| 12 |
+
CONTENTS=['npu','cpu']
|
| 13 |
+
ANALYSIS=True
|
| 14 |
+
|
| 15 |
+
python3 -m verl.trainer.main_ppo \
|
| 16 |
+
algorithm.adv_estimator=grpo \
|
| 17 |
+
data.train_files=$HOME/data/gsm8k/train.parquet \
|
| 18 |
+
data.val_files=$HOME/data/gsm8k/test.parquet \
|
| 19 |
+
data.train_batch_size=32 \
|
| 20 |
+
data.max_prompt_length=1024 \
|
| 21 |
+
data.max_response_length=1024 \
|
| 22 |
+
data.filter_overlong_prompts=True \
|
| 23 |
+
data.truncation='error' \
|
| 24 |
+
actor_rollout_ref.model.path=Qwen/Qwen2.5-7B-Instruct \
|
| 25 |
+
actor_rollout_ref.model.enable_gradient_checkpointing=True \
|
| 26 |
+
actor_rollout_ref.model.use_remove_padding=False \
|
| 27 |
+
actor_rollout_ref.actor.optim.lr=5e-8 \
|
| 28 |
+
actor_rollout_ref.actor.ppo_mini_batch_size=2 \
|
| 29 |
+
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \
|
| 30 |
+
actor_rollout_ref.actor.use_kl_loss=True \
|
| 31 |
+
actor_rollout_ref.actor.entropy_coeff=0 \
|
| 32 |
+
actor_rollout_ref.actor.kl_loss_coef=0.001 \
|
| 33 |
+
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
|
| 34 |
+
actor_rollout_ref.actor.fsdp_config.param_offload=False \
|
| 35 |
+
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
|
| 36 |
+
actor_rollout_ref.actor.profiler.enable=True \
|
| 37 |
+
actor_rollout_ref.actor.profiler.ranks=$PROFILE_RANKS \
|
| 38 |
+
actor_rollout_ref.actor.profiler.all_ranks=$PROFILE_RANKS_ALL \
|
| 39 |
+
actor_rollout_ref.actor.profiler.tool_config.npu.discrete=$DISCRETE \
|
| 40 |
+
actor_rollout_ref.actor.profiler.tool_config.npu.contents=$CONTENTS \
|
| 41 |
+
actor_rollout_ref.actor.profiler.tool_config.npu.level=$LEVEL \
|
| 42 |
+
actor_rollout_ref.actor.profiler.tool_config.npu.analysis=$ANALYSIS \
|
| 43 |
+
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \
|
| 44 |
+
actor_rollout_ref.rollout.tensor_model_parallel_size=4 \
|
| 45 |
+
actor_rollout_ref.rollout.name=vllm \
|
| 46 |
+
actor_rollout_ref.rollout.gpu_memory_utilization=0.3 \
|
| 47 |
+
actor_rollout_ref.rollout.n=4 \
|
| 48 |
+
actor_rollout_ref.rollout.enable_chunked_prefill=False \
|
| 49 |
+
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \
|
| 50 |
+
actor_rollout_ref.ref.fsdp_config.param_offload=True \
|
| 51 |
+
actor_rollout_ref.ref.profiler.enable=True \
|
| 52 |
+
actor_rollout_ref.ref.profiler.ranks=$PROFILE_RANKS \
|
| 53 |
+
actor_rollout_ref.ref.profiler.all_ranks=$PROFILE_RANKS_ALL \
|
| 54 |
+
actor_rollout_ref.ref.profiler.tool_config.npu.discrete=$DISCRETE \
|
| 55 |
+
actor_rollout_ref.ref.profiler.tool_config.npu.contents=$CONTENTS \
|
| 56 |
+
actor_rollout_ref.ref.profiler.tool_config.npu.level=$LEVEL \
|
| 57 |
+
actor_rollout_ref.ref.profiler.tool_config.npu.analysis=$ANALYSIS \
|
| 58 |
+
algorithm.use_kl_in_reward=False \
|
| 59 |
+
trainer.critic_warmup=0 \
|
| 60 |
+
trainer.logger=console \
|
| 61 |
+
trainer.project_name='verl_grpo_example_gsm8k' \
|
| 62 |
+
trainer.experiment_name='qwen2_5_7b_function_rm' \
|
| 63 |
+
trainer.n_gpus_per_node=8 \
|
| 64 |
+
trainer.nnodes=1 \
|
| 65 |
+
trainer.save_freq=-1 \
|
| 66 |
+
trainer.test_freq=5 \
|
| 67 |
+
trainer.total_epochs=5 \
|
| 68 |
+
global_profiler.tool=npu \
|
| 69 |
+
global_profiler.steps=$PROFILE_STEPS \
|
| 70 |
+
global_profiler.save_path=$SAVE_PATH
|
| 71 |
+
$@
|
code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2_5_7b_grpo_npu.sh
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
set -x
|
| 2 |
+
|
| 3 |
+
python3 -m verl.trainer.main_ppo \
|
| 4 |
+
algorithm.adv_estimator=grpo \
|
| 5 |
+
data.train_files=$HOME/data/gsm8k/train.parquet \
|
| 6 |
+
data.val_files=$HOME/data/gsm8k/test.parquet \
|
| 7 |
+
data.train_batch_size=1024 \
|
| 8 |
+
data.max_prompt_length=1024 \
|
| 9 |
+
data.max_response_length=1024 \
|
| 10 |
+
data.filter_overlong_prompts=True \
|
| 11 |
+
data.truncation='error' \
|
| 12 |
+
actor_rollout_ref.model.path=Qwen/Qwen2.5-7B-Instruct \
|
| 13 |
+
actor_rollout_ref.actor.optim.lr=5e-8 \
|
| 14 |
+
actor_rollout_ref.model.use_remove_padding=False \
|
| 15 |
+
actor_rollout_ref.actor.ppo_mini_batch_size=32 \
|
| 16 |
+
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \
|
| 17 |
+
actor_rollout_ref.actor.use_kl_loss=True \
|
| 18 |
+
actor_rollout_ref.actor.entropy_coeff=0 \
|
| 19 |
+
actor_rollout_ref.actor.kl_loss_coef=0.001 \
|
| 20 |
+
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
|
| 21 |
+
actor_rollout_ref.model.enable_gradient_checkpointing=True \
|
| 22 |
+
actor_rollout_ref.actor.fsdp_config.param_offload=False \
|
| 23 |
+
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
|
| 24 |
+
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2 \
|
| 25 |
+
actor_rollout_ref.rollout.tensor_model_parallel_size=4 \
|
| 26 |
+
actor_rollout_ref.rollout.name=vllm \
|
| 27 |
+
actor_rollout_ref.rollout.gpu_memory_utilization=0.3 \
|
| 28 |
+
actor_rollout_ref.rollout.n=5 \
|
| 29 |
+
actor_rollout_ref.rollout.enable_chunked_prefill=False \
|
| 30 |
+
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=2 \
|
| 31 |
+
actor_rollout_ref.ref.fsdp_config.param_offload=True \
|
| 32 |
+
algorithm.use_kl_in_reward=False \
|
| 33 |
+
trainer.critic_warmup=0 \
|
| 34 |
+
trainer.logger=console \
|
| 35 |
+
trainer.project_name='verl_grpo_example_gsm8k' \
|
| 36 |
+
trainer.experiment_name='qwen2_5_7b_function_rm' \
|
| 37 |
+
trainer.n_gpus_per_node=16 \
|
| 38 |
+
trainer.nnodes=1 \
|
| 39 |
+
trainer.save_freq=-1 \
|
| 40 |
+
trainer.test_freq=5 \
|
| 41 |
+
trainer.total_epochs=5 $@
|
code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2_5_vl-7b-megatron.sh
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
set -x
|
| 2 |
+
ENGINE=${1:-vllm}
|
| 3 |
+
export CUDA_DEVICE_MAX_CONNECTIONS=1 # For megatron communication/computation overlapping
|
| 4 |
+
|
| 5 |
+
HF_MODEL_PATH=Qwen/Qwen2.5-VL-7B-Instruct
|
| 6 |
+
DIST_CKPT_PATH=${DIST_CKPT_PATH}
|
| 7 |
+
|
| 8 |
+
# convert HF model to megatron format offlinely
|
| 9 |
+
# python scripts/converter_hf_to_mcore.py --hf_model_path $HF_MODEL_PATH --output_path $DIST_CKPT_PATH
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
# megatron tuning guide:
|
| 13 |
+
# 1. recommend to offload all states by setting ALL_OFFLOAD=True
|
| 14 |
+
# 2. enable dynamic batch size by setting actor_rollout_ref.actor.use_dynamic_bsz=True ref.log_prob_use_dynamic_bsz=True rollout.log_prob_use_dynamic_bsz=True
|
| 15 |
+
# 3. set ppo_max_token_len_per_gpu and log_prob_max_token_len_per_gpu as large as possible for better MFU (limited by GPU memory). assure ppo_max_token_len_per_gpu > max_prompt_length+max_response_length, if sequence length is too long, you can increase the TP/PP size
|
| 16 |
+
# 4. if memory is very limited, enable full recompute, but the mfu will be 30% lower
|
| 17 |
+
# full recompute settings:
|
| 18 |
+
# +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_method=uniform \
|
| 19 |
+
# +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=full \
|
| 20 |
+
# +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_num_layers=1 \
|
| 21 |
+
|
| 22 |
+
ALL_OFFLOAD=${ALL_OFFLOAD:-True}
|
| 23 |
+
COMMON_PARAM_OFFLOAD=${COMMON_PARAM_OFFLOAD:-$ALL_OFFLOAD}
|
| 24 |
+
COMMON_GRAD_OFFLOAD=${COMMON_GRAD_OFFLOAD:-$ALL_OFFLOAD}
|
| 25 |
+
COMMON_OPTIMIZER_OFFLOAD=${COMMON_OPTIMIZER_OFFLOAD:-$ALL_OFFLOAD}
|
| 26 |
+
|
| 27 |
+
ACTOR_PARAM_OFFLOAD=${ACTOR_PARAM_OFFLOAD:-$COMMON_PARAM_OFFLOAD}
|
| 28 |
+
ACTOR_GRAD_OFFLOAD=${ACTOR_GRAD_OFFLOAD:-$COMMON_GRAD_OFFLOAD}
|
| 29 |
+
ACTOR_OPTIMIZER_OFFLOAD=${ACTOR_OPTIMIZER_OFFLOAD:-$COMMON_OPTIMIZER_OFFLOAD}
|
| 30 |
+
REF_PARAM_OFFLOAD=${REF_PARAM_OFFLOAD:-$COMMON_PARAM_OFFLOAD}
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
train_path=$HOME/data/geo3k/train.parquet
|
| 34 |
+
test_path=$HOME/data/geo3k/test.parquet
|
| 35 |
+
|
| 36 |
+
python3 -m verl.trainer.main_ppo --config-path=config \
|
| 37 |
+
--config-name='ppo_megatron_trainer.yaml'\
|
| 38 |
+
algorithm.adv_estimator=grpo \
|
| 39 |
+
data.train_files="$train_path" \
|
| 40 |
+
data.val_files="$test_path" \
|
| 41 |
+
data.train_batch_size=512 \
|
| 42 |
+
data.max_prompt_length=1024 \
|
| 43 |
+
data.max_response_length=2048 \
|
| 44 |
+
data.filter_overlong_prompts=True \
|
| 45 |
+
data.truncation='error' \
|
| 46 |
+
actor_rollout_ref.model.path=$HF_MODEL_PATH \
|
| 47 |
+
actor_rollout_ref.actor.optim.lr=1e-6 \
|
| 48 |
+
actor_rollout_ref.actor.ppo_mini_batch_size=128 \
|
| 49 |
+
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
|
| 50 |
+
actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=1 \
|
| 51 |
+
actor_rollout_ref.actor.megatron.tensor_model_parallel_size=2 \
|
| 52 |
+
actor_rollout_ref.actor.use_kl_loss=True \
|
| 53 |
+
actor_rollout_ref.actor.kl_loss_coef=0.01 \
|
| 54 |
+
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
|
| 55 |
+
actor_rollout_ref.actor.entropy_coeff=0 \
|
| 56 |
+
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=20 \
|
| 57 |
+
actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
|
| 58 |
+
actor_rollout_ref.actor.use_dynamic_bsz=True \
|
| 59 |
+
actor_rollout_ref.actor.ppo_max_token_len_per_gpu=5120 \
|
| 60 |
+
actor_rollout_ref.ref.log_prob_use_dynamic_bsz=True \
|
| 61 |
+
actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=20480 \
|
| 62 |
+
actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=True \
|
| 63 |
+
actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=20480 \
|
| 64 |
+
actor_rollout_ref.rollout.name=$ENGINE \
|
| 65 |
+
+actor_rollout_ref.rollout.engine_kwargs.vllm.disable_mm_preprocessor_cache=True \
|
| 66 |
+
actor_rollout_ref.rollout.gpu_memory_utilization=0.8 \
|
| 67 |
+
actor_rollout_ref.rollout.n=5 \
|
| 68 |
+
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=20 \
|
| 69 |
+
actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=1 \
|
| 70 |
+
actor_rollout_ref.ref.megatron.tensor_model_parallel_size=2 \
|
| 71 |
+
actor_rollout_ref.actor.megatron.use_dist_checkpointing=True \
|
| 72 |
+
actor_rollout_ref.ref.megatron.use_dist_checkpointing=True \
|
| 73 |
+
actor_rollout_ref.actor.megatron.dist_checkpointing_path=$DIST_CKPT_PATH \
|
| 74 |
+
actor_rollout_ref.ref.megatron.dist_checkpointing_path=$DIST_CKPT_PATH \
|
| 75 |
+
actor_rollout_ref.actor.megatron.param_offload=${ACTOR_PARAM_OFFLOAD} \
|
| 76 |
+
actor_rollout_ref.actor.megatron.optimizer_offload=${ACTOR_OPTIMIZER_OFFLOAD} \
|
| 77 |
+
actor_rollout_ref.actor.megatron.grad_offload=${ACTOR_GRAD_OFFLOAD} \
|
| 78 |
+
actor_rollout_ref.ref.megatron.param_offload=${REF_PARAM_OFFLOAD} \
|
| 79 |
+
algorithm.use_kl_in_reward=False \
|
| 80 |
+
trainer.critic_warmup=0 \
|
| 81 |
+
trainer.logger='["console","wandb"]' \
|
| 82 |
+
trainer.project_name='verl_grpo_example_geo3k' \
|
| 83 |
+
trainer.experiment_name='qwen2_5_vl_7b_megatron' \
|
| 84 |
+
trainer.n_gpus_per_node=8 \
|
| 85 |
+
trainer.nnodes=1 \
|
| 86 |
+
trainer.save_freq=20 \
|
| 87 |
+
trainer.test_freq=5 \
|
| 88 |
+
trainer.total_epochs=15 $@
|
code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2_5_vl-7b-sglang.sh
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
set -x
|
| 2 |
+
|
| 3 |
+
# python examples/data_preprocess/geo3k.py --local_dir ~/data/geo3k
|
| 4 |
+
|
| 5 |
+
python3 -m verl.trainer.main_ppo \
|
| 6 |
+
algorithm.adv_estimator=grpo \
|
| 7 |
+
data.train_files=$HOME/data/geo3k/train.parquet \
|
| 8 |
+
data.val_files=$HOME/data/geo3k/test.parquet \
|
| 9 |
+
data.train_batch_size=512 \
|
| 10 |
+
data.max_prompt_length=1024 \
|
| 11 |
+
data.max_response_length=2048 \
|
| 12 |
+
data.filter_overlong_prompts=True \
|
| 13 |
+
data.truncation='error' \
|
| 14 |
+
data.image_key=images \
|
| 15 |
+
actor_rollout_ref.model.path=Qwen/Qwen2.5-VL-7B-Instruct \
|
| 16 |
+
actor_rollout_ref.actor.optim.lr=1e-6 \
|
| 17 |
+
actor_rollout_ref.model.use_remove_padding=True \
|
| 18 |
+
actor_rollout_ref.actor.ppo_mini_batch_size=128 \
|
| 19 |
+
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=10 \
|
| 20 |
+
actor_rollout_ref.actor.use_kl_loss=True \
|
| 21 |
+
actor_rollout_ref.actor.kl_loss_coef=0.01 \
|
| 22 |
+
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
|
| 23 |
+
actor_rollout_ref.actor.entropy_coeff=0 \
|
| 24 |
+
actor_rollout_ref.model.enable_gradient_checkpointing=True \
|
| 25 |
+
actor_rollout_ref.actor.fsdp_config.param_offload=False \
|
| 26 |
+
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
|
| 27 |
+
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=20 \
|
| 28 |
+
actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
|
| 29 |
+
actor_rollout_ref.rollout.name=sglang \
|
| 30 |
+
+actor_rollout_ref.rollout.engine_kwargs.vllm.disable_mm_preprocessor_cache=True \
|
| 31 |
+
actor_rollout_ref.rollout.gpu_memory_utilization=0.85 \
|
| 32 |
+
actor_rollout_ref.rollout.multi_stage_wake_up=True \
|
| 33 |
+
global_profiler.tool=torch_memory \
|
| 34 |
+
global_profiler.save_path=./mem_snapshots \
|
| 35 |
+
global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries=100000 \
|
| 36 |
+
global_profiler.global_tool_config.torch_memory.stack_depth=32 \
|
| 37 |
+
actor_rollout_ref.rollout.enable_chunked_prefill=False \
|
| 38 |
+
actor_rollout_ref.rollout.enforce_eager=False \
|
| 39 |
+
actor_rollout_ref.rollout.free_cache_engine=True \
|
| 40 |
+
actor_rollout_ref.rollout.n=5 \
|
| 41 |
+
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=20 \
|
| 42 |
+
actor_rollout_ref.ref.fsdp_config.param_offload=True \
|
| 43 |
+
actor_rollout_ref.rollout.mode=async \
|
| 44 |
+
algorithm.use_kl_in_reward=False \
|
| 45 |
+
trainer.critic_warmup=0 \
|
| 46 |
+
trainer.logger='["console","wandb"]' \
|
| 47 |
+
trainer.project_name='verl_grpo_example_geo3k' \
|
| 48 |
+
trainer.experiment_name='qwen2_5_vl_7b_function_rm' \
|
| 49 |
+
trainer.n_gpus_per_node=8 \
|
| 50 |
+
trainer.nnodes=1 \
|
| 51 |
+
trainer.save_freq=20 \
|
| 52 |
+
trainer.test_freq=5 \
|
| 53 |
+
trainer.total_epochs=15 $@
|
code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2_5_vl-7b_freeze_vision.sh
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
set -x
|
| 2 |
+
ENGINE=${1:-vllm}
|
| 3 |
+
|
| 4 |
+
python3 -m verl.trainer.main_ppo \
|
| 5 |
+
algorithm.adv_estimator=grpo \
|
| 6 |
+
data.train_files=$HOME/data/geo3k/train.parquet \
|
| 7 |
+
data.val_files=$HOME/data/geo3k/test.parquet \
|
| 8 |
+
data.train_batch_size=512 \
|
| 9 |
+
data.max_prompt_length=1024 \
|
| 10 |
+
data.max_response_length=2048 \
|
| 11 |
+
data.filter_overlong_prompts=True \
|
| 12 |
+
data.truncation='error' \
|
| 13 |
+
data.image_key=images \
|
| 14 |
+
actor_rollout_ref.model.path=Qwen/Qwen2.5-VL-7B-Instruct \
|
| 15 |
+
actor_rollout_ref.actor.optim.lr=1e-6 \
|
| 16 |
+
actor_rollout_ref.actor.freeze_vision_tower=True \
|
| 17 |
+
actor_rollout_ref.model.use_remove_padding=True \
|
| 18 |
+
actor_rollout_ref.actor.ppo_mini_batch_size=128 \
|
| 19 |
+
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=10 \
|
| 20 |
+
actor_rollout_ref.actor.use_kl_loss=True \
|
| 21 |
+
actor_rollout_ref.actor.kl_loss_coef=0.01 \
|
| 22 |
+
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
|
| 23 |
+
actor_rollout_ref.actor.entropy_coeff=0 \
|
| 24 |
+
actor_rollout_ref.model.enable_gradient_checkpointing=True \
|
| 25 |
+
actor_rollout_ref.actor.fsdp_config.param_offload=False \
|
| 26 |
+
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
|
| 27 |
+
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=20 \
|
| 28 |
+
actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
|
| 29 |
+
actor_rollout_ref.rollout.name=$ENGINE \
|
| 30 |
+
+actor_rollout_ref.rollout.engine_kwargs.vllm.disable_mm_preprocessor_cache=True \
|
| 31 |
+
actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
|
| 32 |
+
actor_rollout_ref.rollout.enable_chunked_prefill=False \
|
| 33 |
+
actor_rollout_ref.rollout.enforce_eager=False \
|
| 34 |
+
actor_rollout_ref.rollout.free_cache_engine=True \
|
| 35 |
+
actor_rollout_ref.rollout.n=5 \
|
| 36 |
+
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=20 \
|
| 37 |
+
actor_rollout_ref.ref.fsdp_config.param_offload=True \
|
| 38 |
+
algorithm.use_kl_in_reward=False \
|
| 39 |
+
trainer.critic_warmup=0 \
|
| 40 |
+
trainer.logger='["console","wandb"]' \
|
| 41 |
+
trainer.project_name='verl_grpo_example_geo3k' \
|
| 42 |
+
trainer.experiment_name='qwen2_5_vl_7b_function_rm' \
|
| 43 |
+
trainer.n_gpus_per_node=8 \
|
| 44 |
+
trainer.nnodes=1 \
|
| 45 |
+
trainer.save_freq=20 \
|
| 46 |
+
trainer.test_freq=5 \
|
| 47 |
+
trainer.total_epochs=15 $@
|
code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2_5_vl-7b_lora.sh
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
set -x
|
| 2 |
+
ENGINE=${1:-vllm}
|
| 3 |
+
# If you are using vllm<=0.6.3, you might need to set the following environment variable to avoid bugs:
|
| 4 |
+
# export VLLM_ATTENTION_BACKEND=XFORMERS
|
| 5 |
+
|
| 6 |
+
python3 -m verl.trainer.main_ppo \
|
| 7 |
+
algorithm.adv_estimator=grpo \
|
| 8 |
+
data.train_files=$HOME/data/geo3k/train.parquet \
|
| 9 |
+
data.val_files=$HOME/data/geo3k/test.parquet \
|
| 10 |
+
data.train_batch_size=512 \
|
| 11 |
+
data.max_prompt_length=1024 \
|
| 12 |
+
data.max_response_length=2048 \
|
| 13 |
+
data.filter_overlong_prompts=True \
|
| 14 |
+
data.truncation='error' \
|
| 15 |
+
data.image_key=images \
|
| 16 |
+
actor_rollout_ref.model.path=Qwen/Qwen2.5-VL-7B-Instruct \
|
| 17 |
+
actor_rollout_ref.actor.optim.lr=3e-6 \
|
| 18 |
+
actor_rollout_ref.model.use_remove_padding=True \
|
| 19 |
+
actor_rollout_ref.actor.ppo_mini_batch_size=128 \
|
| 20 |
+
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=10 \
|
| 21 |
+
actor_rollout_ref.model.lora_rank=64 \
|
| 22 |
+
actor_rollout_ref.model.lora_alpha=32 \
|
| 23 |
+
actor_rollout_ref.model.target_modules=all-linear \
|
| 24 |
+
actor_rollout_ref.model.exclude_modules='.*visual.*' \
|
| 25 |
+
actor_rollout_ref.actor.use_kl_loss=True \
|
| 26 |
+
actor_rollout_ref.actor.kl_loss_coef=0.01 \
|
| 27 |
+
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
|
| 28 |
+
actor_rollout_ref.actor.entropy_coeff=0 \
|
| 29 |
+
actor_rollout_ref.model.enable_gradient_checkpointing=True \
|
| 30 |
+
actor_rollout_ref.actor.fsdp_config.param_offload=False \
|
| 31 |
+
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
|
| 32 |
+
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=20 \
|
| 33 |
+
actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
|
| 34 |
+
actor_rollout_ref.rollout.name=$ENGINE \
|
| 35 |
+
+actor_rollout_ref.rollout.engine_kwargs.vllm.disable_mm_preprocessor_cache=True \
|
| 36 |
+
actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
|
| 37 |
+
actor_rollout_ref.rollout.enable_chunked_prefill=False \
|
| 38 |
+
actor_rollout_ref.rollout.enforce_eager=False \
|
| 39 |
+
actor_rollout_ref.rollout.free_cache_engine=False \
|
| 40 |
+
actor_rollout_ref.rollout.n=5 \
|
| 41 |
+
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=20 \
|
| 42 |
+
actor_rollout_ref.ref.fsdp_config.param_offload=True \
|
| 43 |
+
algorithm.use_kl_in_reward=False \
|
| 44 |
+
trainer.critic_warmup=0 \
|
| 45 |
+
trainer.logger='["console","wandb"]' \
|
| 46 |
+
trainer.project_name='verl_grpo_example_geo3k' \
|
| 47 |
+
trainer.experiment_name='qwen2_5_vl_7b_function_rm' \
|
| 48 |
+
trainer.n_gpus_per_node=8 \
|
| 49 |
+
trainer.nnodes=1 \
|
| 50 |
+
trainer.save_freq=20 \
|
| 51 |
+
trainer.test_freq=5 \
|
| 52 |
+
trainer.total_epochs=15 $@
|
code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2_5_vl-7b_seq_balance.sh
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
set -x
|
| 2 |
+
ENGINE=${1:-vllm}
|
| 3 |
+
|
| 4 |
+
python3 -m verl.trainer.main_ppo \
|
| 5 |
+
algorithm.adv_estimator=grpo \
|
| 6 |
+
data.train_files=$HOME/data/geo3k/train.parquet \
|
| 7 |
+
data.val_files=$HOME/data/geo3k/test.parquet \
|
| 8 |
+
data.train_batch_size=512 \
|
| 9 |
+
data.max_prompt_length=1024 \
|
| 10 |
+
data.max_response_length=2048 \
|
| 11 |
+
data.filter_overlong_prompts=True \
|
| 12 |
+
data.truncation='error' \
|
| 13 |
+
data.image_key=images \
|
| 14 |
+
actor_rollout_ref.model.path=Qwen/Qwen2.5-VL-7B-Instruct \
|
| 15 |
+
actor_rollout_ref.actor.optim.lr=1e-6 \
|
| 16 |
+
actor_rollout_ref.model.use_remove_padding=True \
|
| 17 |
+
actor_rollout_ref.actor.ppo_mini_batch_size=128 \
|
| 18 |
+
actor_rollout_ref.actor.use_dynamic_bsz=True \
|
| 19 |
+
actor_rollout_ref.actor.ppo_max_token_len_per_gpu=6144 \
|
| 20 |
+
actor_rollout_ref.actor.use_kl_loss=True \
|
| 21 |
+
actor_rollout_ref.actor.kl_loss_coef=0.01 \
|
| 22 |
+
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
|
| 23 |
+
actor_rollout_ref.actor.entropy_coeff=0 \
|
| 24 |
+
actor_rollout_ref.model.enable_gradient_checkpointing=True \
|
| 25 |
+
actor_rollout_ref.actor.fsdp_config.param_offload=False \
|
| 26 |
+
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
|
| 27 |
+
actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
|
| 28 |
+
actor_rollout_ref.rollout.name=$ENGINE \
|
| 29 |
+
actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
|
| 30 |
+
actor_rollout_ref.rollout.enable_chunked_prefill=False \
|
| 31 |
+
actor_rollout_ref.rollout.enforce_eager=False \
|
| 32 |
+
actor_rollout_ref.rollout.free_cache_engine=False \
|
| 33 |
+
actor_rollout_ref.rollout.n=5 \
|
| 34 |
+
actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=6144 \
|
| 35 |
+
actor_rollout_ref.ref.fsdp_config.param_offload=True \
|
| 36 |
+
algorithm.use_kl_in_reward=False \
|
| 37 |
+
trainer.critic_warmup=0 \
|
| 38 |
+
trainer.logger='["console","wandb"]' \
|
| 39 |
+
trainer.project_name='verl_grpo_example_geo3k' \
|
| 40 |
+
trainer.experiment_name='qwen2_5_vl_7b_function_rm' \
|
| 41 |
+
trainer.n_gpus_per_node=8 \
|
| 42 |
+
trainer.nnodes=1 \
|
| 43 |
+
trainer.save_freq=20 \
|
| 44 |
+
trainer.test_freq=5 \
|
| 45 |
+
trainer.total_epochs=15 $@
|
code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2_5_vl_32b_npu.sh
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
set -x
|
| 2 |
+
ENGINE=${1:-vllm}
|
| 3 |
+
|
| 4 |
+
# Some models are optimized by vllm ascend. While in some case, e.g. rlhf training,
|
| 5 |
+
# the optimized model may not be suitable. In this case, set this value to 0 to disable the optimized model.
|
| 6 |
+
export USE_OPTIMIZED_MODEL=0
|
| 7 |
+
|
| 8 |
+
python3 -m verl.trainer.main_ppo \
|
| 9 |
+
algorithm.adv_estimator=grpo \
|
| 10 |
+
data.train_files=$HOME/data/geo3k/train.parquet \
|
| 11 |
+
data.val_files=$HOME/data/geo3k/test.parquet \
|
| 12 |
+
data.train_batch_size=512 \
|
| 13 |
+
data.max_prompt_length=1024 \
|
| 14 |
+
data.max_response_length=2048 \
|
| 15 |
+
data.filter_overlong_prompts=True \
|
| 16 |
+
data.truncation='error' \
|
| 17 |
+
data.image_key=images \
|
| 18 |
+
actor_rollout_ref.model.path=Qwen/Qwen2.5-VL-32B-Instruct \
|
| 19 |
+
actor_rollout_ref.actor.optim.lr=1e-6 \
|
| 20 |
+
actor_rollout_ref.model.use_remove_padding=True \
|
| 21 |
+
actor_rollout_ref.actor.ppo_mini_batch_size=32 \
|
| 22 |
+
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \
|
| 23 |
+
actor_rollout_ref.actor.use_kl_loss=True \
|
| 24 |
+
actor_rollout_ref.actor.kl_loss_coef=0.01 \
|
| 25 |
+
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
|
| 26 |
+
actor_rollout_ref.actor.entropy_coeff=0 \
|
| 27 |
+
actor_rollout_ref.actor.use_torch_compile=False \
|
| 28 |
+
actor_rollout_ref.model.enable_gradient_checkpointing=True \
|
| 29 |
+
actor_rollout_ref.actor.fsdp_config.param_offload=False \
|
| 30 |
+
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
|
| 31 |
+
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \
|
| 32 |
+
actor_rollout_ref.rollout.tensor_model_parallel_size=8 \
|
| 33 |
+
actor_rollout_ref.rollout.name=$ENGINE \
|
| 34 |
+
+actor_rollout_ref.rollout.engine_kwargs.vllm.disable_mm_preprocessor_cache=True \
|
| 35 |
+
actor_rollout_ref.rollout.gpu_memory_utilization=0.3 \
|
| 36 |
+
actor_rollout_ref.rollout.enable_chunked_prefill=False \
|
| 37 |
+
actor_rollout_ref.rollout.enforce_eager=True \
|
| 38 |
+
actor_rollout_ref.rollout.free_cache_engine=True \
|
| 39 |
+
actor_rollout_ref.rollout.n=5 \
|
| 40 |
+
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \
|
| 41 |
+
actor_rollout_ref.ref.fsdp_config.param_offload=True \
|
| 42 |
+
algorithm.use_kl_in_reward=False \
|
| 43 |
+
trainer.critic_warmup=0 \
|
| 44 |
+
trainer.logger=console \
|
| 45 |
+
trainer.project_name='verl_grpo_example_geo3k' \
|
| 46 |
+
trainer.experiment_name='qwen2_5_vl_32b_function_rm' \
|
| 47 |
+
trainer.n_gpus_per_node=16 \
|
| 48 |
+
trainer.nnodes=2 \
|
| 49 |
+
trainer.save_freq=-1 \
|
| 50 |
+
trainer.test_freq=-1 \
|
| 51 |
+
trainer.total_epochs=15 $@
|
code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2_5_vl_3b_npu.sh
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
set -x
|
| 2 |
+
ENGINE=${1:-vllm}
|
| 3 |
+
|
| 4 |
+
# Some models are optimized by vllm ascend. While in some case, e.g. rlhf training,
|
| 5 |
+
# the optimized model may not be suitable. In this case, set this value to 0 to disable the optimized model.
|
| 6 |
+
export USE_OPTIMIZED_MODEL=0
|
| 7 |
+
|
| 8 |
+
python3 -m verl.trainer.main_ppo \
|
| 9 |
+
algorithm.adv_estimator=grpo \
|
| 10 |
+
data.train_files=$HOME/data/geo3k/train.parquet \
|
| 11 |
+
data.val_files=$HOME/data/geo3k/test.parquet \
|
| 12 |
+
data.train_batch_size=512 \
|
| 13 |
+
data.max_prompt_length=1024 \
|
| 14 |
+
data.max_response_length=2048 \
|
| 15 |
+
data.filter_overlong_prompts=True \
|
| 16 |
+
data.truncation='error' \
|
| 17 |
+
data.image_key=images \
|
| 18 |
+
actor_rollout_ref.model.path=Qwen/Qwen2.5-VL-3B-Instruct \
|
| 19 |
+
actor_rollout_ref.actor.optim.lr=1e-6 \
|
| 20 |
+
actor_rollout_ref.model.use_remove_padding=True \
|
| 21 |
+
actor_rollout_ref.actor.ppo_mini_batch_size=16 \
|
| 22 |
+
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \
|
| 23 |
+
actor_rollout_ref.actor.use_kl_loss=True \
|
| 24 |
+
actor_rollout_ref.actor.kl_loss_coef=0.01 \
|
| 25 |
+
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
|
| 26 |
+
actor_rollout_ref.actor.entropy_coeff=0 \
|
| 27 |
+
actor_rollout_ref.actor.use_torch_compile=False \
|
| 28 |
+
actor_rollout_ref.model.enable_gradient_checkpointing=True \
|
| 29 |
+
actor_rollout_ref.actor.fsdp_config.param_offload=False \
|
| 30 |
+
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
|
| 31 |
+
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
|
| 32 |
+
actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
|
| 33 |
+
actor_rollout_ref.rollout.name=$ENGINE \
|
| 34 |
+
+actor_rollout_ref.rollout.engine_kwargs.vllm.disable_mm_preprocessor_cache=True \
|
| 35 |
+
actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
|
| 36 |
+
actor_rollout_ref.rollout.enable_chunked_prefill=False \
|
| 37 |
+
actor_rollout_ref.rollout.enforce_eager=True \
|
| 38 |
+
actor_rollout_ref.rollout.free_cache_engine=True \
|
| 39 |
+
actor_rollout_ref.rollout.n=5 \
|
| 40 |
+
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
|
| 41 |
+
actor_rollout_ref.ref.fsdp_config.param_offload=True \
|
| 42 |
+
algorithm.use_kl_in_reward=False \
|
| 43 |
+
trainer.use_legacy_worker_impl=disable \
|
| 44 |
+
trainer.critic_warmup=0 \
|
| 45 |
+
trainer.logger=console \
|
| 46 |
+
trainer.project_name='verl_grpo_example_geo3k' \
|
| 47 |
+
trainer.experiment_name='qwen2_5_vl_3b_function_rm' \
|
| 48 |
+
trainer.n_gpus_per_node=8 \
|
| 49 |
+
trainer.nnodes=1 \
|
| 50 |
+
trainer.save_freq=-1 \
|
| 51 |
+
trainer.test_freq=-1 \
|
| 52 |
+
trainer.total_epochs=15 $@
|
code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2_5_vl_7b_npu.sh
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
set -x
|
| 2 |
+
ENGINE=${1:-vllm}
|
| 3 |
+
|
| 4 |
+
# Some models are optimized by vllm ascend. While in some case, e.g. rlhf training,
|
| 5 |
+
# the optimized model may not be suitable. In this case, set this value to 0 to disable the optimized model.
|
| 6 |
+
export USE_OPTIMIZED_MODEL=0
|
| 7 |
+
|
| 8 |
+
python3 -m verl.trainer.main_ppo \
|
| 9 |
+
algorithm.adv_estimator=grpo \
|
| 10 |
+
data.train_files=$HOME/data/geo3k/train.parquet \
|
| 11 |
+
data.val_files=$HOME/data/geo3k/test.parquet \
|
| 12 |
+
data.train_batch_size=512 \
|
| 13 |
+
data.max_prompt_length=1024 \
|
| 14 |
+
data.max_response_length=2048 \
|
| 15 |
+
data.filter_overlong_prompts=True \
|
| 16 |
+
data.truncation='error' \
|
| 17 |
+
data.image_key=images \
|
| 18 |
+
actor_rollout_ref.model.path=Qwen/Qwen2.5-VL-7B-Instruct \
|
| 19 |
+
actor_rollout_ref.actor.optim.lr=1e-6 \
|
| 20 |
+
actor_rollout_ref.model.use_remove_padding=True \
|
| 21 |
+
actor_rollout_ref.actor.ppo_mini_batch_size=32 \
|
| 22 |
+
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \
|
| 23 |
+
actor_rollout_ref.actor.use_kl_loss=True \
|
| 24 |
+
actor_rollout_ref.actor.kl_loss_coef=0.01 \
|
| 25 |
+
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
|
| 26 |
+
actor_rollout_ref.actor.entropy_coeff=0 \
|
| 27 |
+
actor_rollout_ref.actor.use_torch_compile=False \
|
| 28 |
+
actor_rollout_ref.model.enable_gradient_checkpointing=True \
|
| 29 |
+
actor_rollout_ref.actor.fsdp_config.param_offload=False \
|
| 30 |
+
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
|
| 31 |
+
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
|
| 32 |
+
actor_rollout_ref.rollout.tensor_model_parallel_size=4 \
|
| 33 |
+
actor_rollout_ref.rollout.name=$ENGINE \
|
| 34 |
+
+actor_rollout_ref.rollout.engine_kwargs.vllm.disable_mm_preprocessor_cache=True \
|
| 35 |
+
actor_rollout_ref.rollout.gpu_memory_utilization=0.5 \
|
| 36 |
+
actor_rollout_ref.rollout.enable_chunked_prefill=False \
|
| 37 |
+
actor_rollout_ref.rollout.enforce_eager=True \
|
| 38 |
+
actor_rollout_ref.rollout.free_cache_engine=True \
|
| 39 |
+
actor_rollout_ref.rollout.n=5 \
|
| 40 |
+
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
|
| 41 |
+
actor_rollout_ref.ref.fsdp_config.param_offload=True \
|
| 42 |
+
algorithm.use_kl_in_reward=False \
|
| 43 |
+
trainer.critic_warmup=0 \
|
| 44 |
+
trainer.logger=console \
|
| 45 |
+
trainer.project_name='verl_grpo_example_geo3k' \
|
| 46 |
+
trainer.experiment_name='qwen2_5_vl_7b_function_rm' \
|
| 47 |
+
trainer.n_gpus_per_node=16 \
|
| 48 |
+
trainer.nnodes=1 \
|
| 49 |
+
trainer.save_freq=-1 \
|
| 50 |
+
trainer.test_freq=-1 \
|
| 51 |
+
trainer.total_epochs=15 $@
|
code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen3-235b_megatron_96gb.sh
ADDED
|
@@ -0,0 +1,181 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
set -xeuo pipefail
|
| 3 |
+
|
| 4 |
+
## !!!!!!!important!!!!!!
|
| 5 |
+
## set the following environment variables on all your nodes
|
| 6 |
+
# env_vars:
|
| 7 |
+
# CUDA_DEVICE_MAX_CONNECTIONS: "1"
|
| 8 |
+
# NCCL_NVLS_ENABLE: "0"
|
| 9 |
+
# VLLM_USE_V1: 1
|
| 10 |
+
# install mbridge=0.1.13 on all your node with the following command:
|
| 11 |
+
# pip3 install git+https://github.com/ISEEKYAN/mbridge
|
| 12 |
+
|
| 13 |
+
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
| 14 |
+
[ -f "${SCRIPT_DIR}/env.sh" ] && source "${SCRIPT_DIR}/env.sh"
|
| 15 |
+
|
| 16 |
+
adv_estimator=grpo
|
| 17 |
+
|
| 18 |
+
use_kl_in_reward=False
|
| 19 |
+
kl_coef=0.0
|
| 20 |
+
use_kl_loss=True
|
| 21 |
+
kl_loss_coef=0.001
|
| 22 |
+
|
| 23 |
+
clip_ratio_low=0.2
|
| 24 |
+
clip_ratio_high=0.28
|
| 25 |
+
|
| 26 |
+
max_prompt_length=$((1024 * 2))
|
| 27 |
+
max_response_length=$((1204 * 8))
|
| 28 |
+
enable_overlong_buffer=True
|
| 29 |
+
overlong_buffer_len=$((1024 * 1))
|
| 30 |
+
overlong_penalty_factor=1.0
|
| 31 |
+
|
| 32 |
+
loss_agg_mode="token-mean"
|
| 33 |
+
|
| 34 |
+
train_prompt_bsz=${TRAIN_BS:-32}
|
| 35 |
+
n_resp_per_prompt=8
|
| 36 |
+
train_prompt_mini_bsz=16
|
| 37 |
+
|
| 38 |
+
# minimum nodes need for qwen3-235B-A22B
|
| 39 |
+
NNODES=${NNODES:-4}
|
| 40 |
+
# Paths
|
| 41 |
+
|
| 42 |
+
RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
|
| 43 |
+
|
| 44 |
+
MODEL_PATH=$RAY_DATA_HOME/models/Qwen3-235B-A22B
|
| 45 |
+
|
| 46 |
+
TRAIN_FILE=$RAY_DATA_HOME/dataset/dapo-math-17k.parquet
|
| 47 |
+
TEST_FILE=$RAY_DATA_HOME/dataset/aime-2024.parquet
|
| 48 |
+
|
| 49 |
+
# Algorithm
|
| 50 |
+
temperature=1.0
|
| 51 |
+
top_p=1.0
|
| 52 |
+
top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
|
| 53 |
+
val_top_p=0.7
|
| 54 |
+
# Performance Related Parameter
|
| 55 |
+
use_dynamic_bsz=True
|
| 56 |
+
actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 10 / 10))
|
| 57 |
+
infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 1))
|
| 58 |
+
offload=True
|
| 59 |
+
OPTIM_OFFLOAD=${OPTIM_OFFLOAD:-True}
|
| 60 |
+
gen_tp=8
|
| 61 |
+
train_tp=${TP:-4}
|
| 62 |
+
train_pp=${PP:-8}
|
| 63 |
+
|
| 64 |
+
EP=${EP:-4}
|
| 65 |
+
ETP=1
|
| 66 |
+
CP=1
|
| 67 |
+
optimizer_offload_fraction=${OFFLOAD_FRACTION:-1.}
|
| 68 |
+
last_layer=${LAST_LAYER:-10}
|
| 69 |
+
|
| 70 |
+
project_name='verl-qwen3'
|
| 71 |
+
exp_name="235B-${NNODES}-pp${train_pp}-tp${train_tp}-ep${EP}-actor-length${actor_ppo_max_token_len}"
|
| 72 |
+
CKPTS_DIR=$RAY_DATA_HOME/ckpt/${project_name}/${exp_name}
|
| 73 |
+
|
| 74 |
+
# TODO: support cuda graph for rollout by setting the following config
|
| 75 |
+
# actor_rollout_ref.rollout.cudagraph_capture_sizes=[1,2,4,8,16,32]
|
| 76 |
+
# actor_rollout_ref.rollout.enforce_eager=False
|
| 77 |
+
|
| 78 |
+
python3 -m verl.trainer.main_ppo \
|
| 79 |
+
--config-path=config \
|
| 80 |
+
--config-name='ppo_megatron_trainer.yaml' \
|
| 81 |
+
data.train_files="${TRAIN_FILE}" \
|
| 82 |
+
data.val_files="${TEST_FILE}" \
|
| 83 |
+
data.prompt_key=prompt \
|
| 84 |
+
data.truncation='left' \
|
| 85 |
+
data.max_prompt_length=${max_prompt_length} \
|
| 86 |
+
data.max_response_length=${max_response_length} \
|
| 87 |
+
data.train_batch_size=${train_prompt_bsz} \
|
| 88 |
+
actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
|
| 89 |
+
actor_rollout_ref.rollout.name=vllm \
|
| 90 |
+
actor_rollout_ref.rollout.enforce_eager=True \
|
| 91 |
+
actor_rollout_ref.rollout.free_cache_engine=True \
|
| 92 |
+
algorithm.adv_estimator=${adv_estimator} \
|
| 93 |
+
algorithm.use_kl_in_reward=${use_kl_in_reward} \
|
| 94 |
+
algorithm.kl_ctrl.kl_coef=${kl_coef} \
|
| 95 |
+
actor_rollout_ref.model.use_fused_kernels=True \
|
| 96 |
+
actor_rollout_ref.actor.megatron.use_mbridge=True \
|
| 97 |
+
actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
|
| 98 |
+
actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
|
| 99 |
+
actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
|
| 100 |
+
actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
|
| 101 |
+
actor_rollout_ref.actor.clip_ratio_c=10.0 \
|
| 102 |
+
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \
|
| 103 |
+
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
|
| 104 |
+
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
|
| 105 |
+
actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
|
| 106 |
+
actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
|
| 107 |
+
actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
|
| 108 |
+
actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
|
| 109 |
+
actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
|
| 110 |
+
actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
|
| 111 |
+
actor_rollout_ref.model.path="${MODEL_PATH}" \
|
| 112 |
+
actor_rollout_ref.actor.optim.lr=1e-6 \
|
| 113 |
+
actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
|
| 114 |
+
actor_rollout_ref.actor.optim.weight_decay=0.1 \
|
| 115 |
+
+actor_rollout_ref.actor.optim.override_optimizer_config.optimizer_offload_fraction=${optimizer_offload_fraction} \
|
| 116 |
+
+actor_rollout_ref.actor.optim.override_optimizer_config.overlap_cpu_optimizer_d2h_h2d=True \
|
| 117 |
+
+actor_rollout_ref.actor.optim.override_optimizer_config.use_precision_aware_optimizer=True \
|
| 118 |
+
+actor_rollout_ref.actor.optim.override_optimizer_config.optimizer_cpu_offload=True \
|
| 119 |
+
actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
|
| 120 |
+
actor_rollout_ref.actor.megatron.param_offload=${offload} \
|
| 121 |
+
actor_rollout_ref.actor.megatron.optimizer_offload=${OPTIM_OFFLOAD} \
|
| 122 |
+
actor_rollout_ref.actor.megatron.grad_offload=${offload} \
|
| 123 |
+
actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${train_pp} \
|
| 124 |
+
actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${train_tp} \
|
| 125 |
+
actor_rollout_ref.actor.megatron.expert_model_parallel_size=$EP \
|
| 126 |
+
actor_rollout_ref.actor.megatron.expert_tensor_parallel_size=$ETP \
|
| 127 |
+
actor_rollout_ref.actor.megatron.context_parallel_size=${CP} \
|
| 128 |
+
actor_rollout_ref.actor.entropy_coeff=0 \
|
| 129 |
+
actor_rollout_ref.actor.optim.clip_grad=1.0 \
|
| 130 |
+
actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
|
| 131 |
+
actor_rollout_ref.rollout.gpu_memory_utilization=0.85 \
|
| 132 |
+
actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
|
| 133 |
+
actor_rollout_ref.rollout.enable_chunked_prefill=True \
|
| 134 |
+
actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
|
| 135 |
+
actor_rollout_ref.rollout.temperature=${temperature} \
|
| 136 |
+
actor_rollout_ref.rollout.top_p=${top_p} \
|
| 137 |
+
actor_rollout_ref.rollout.top_k=${top_k} \
|
| 138 |
+
actor_rollout_ref.nccl_timeout=1200 \
|
| 139 |
+
actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
|
| 140 |
+
actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
|
| 141 |
+
actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
|
| 142 |
+
actor_rollout_ref.rollout.val_kwargs.do_sample=True \
|
| 143 |
+
actor_rollout_ref.rollout.val_kwargs.n=1 \
|
| 144 |
+
actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=${train_pp} \
|
| 145 |
+
actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${train_tp} \
|
| 146 |
+
actor_rollout_ref.ref.megatron.expert_model_parallel_size=$EP \
|
| 147 |
+
actor_rollout_ref.ref.megatron.expert_tensor_parallel_size=$ETP \
|
| 148 |
+
actor_rollout_ref.ref.megatron.context_parallel_size=${CP} \
|
| 149 |
+
actor_rollout_ref.ref.megatron.param_offload=${offload} \
|
| 150 |
+
+actor_rollout_ref.actor.megatron.override_transformer_config.apply_rope_fusion=True \
|
| 151 |
+
+actor_rollout_ref.actor.megatron.override_transformer_config.masked_softmax_fusion=True \
|
| 152 |
+
+actor_rollout_ref.actor.megatron.override_transformer_config.bias_activation_fusion=True \
|
| 153 |
+
+actor_rollout_ref.actor.megatron.override_transformer_config.bias_dropout_fusion=True \
|
| 154 |
+
+actor_rollout_ref.actor.megatron.override_transformer_config.gradient_accumulation_fusion=True \
|
| 155 |
+
+actor_rollout_ref.actor.megatron.override_transformer_config.deallocate_pipeline_outputs=True \
|
| 156 |
+
+actor_rollout_ref.actor.megatron.override_transformer_config.persist_layer_norm=True \
|
| 157 |
+
+actor_rollout_ref.actor.megatron.override_transformer_config.moe_grouped_gemm=True \
|
| 158 |
+
+actor_rollout_ref.actor.megatron.override_transformer_config.moe_permute_fusion=True \
|
| 159 |
+
+actor_rollout_ref.actor.megatron.override_transformer_config.moe_token_dispatcher_type="flex" \
|
| 160 |
+
+actor_rollout_ref.actor.megatron.override_transformer_config.moe_router_dtype=fp32 \
|
| 161 |
+
+actor_rollout_ref.actor.megatron.override_transformer_config.moe_enable_deepep=True \
|
| 162 |
+
+actor_rollout_ref.actor.megatron.override_transformer_config.account_for_loss_in_pipeline_split=True \
|
| 163 |
+
+actor_rollout_ref.actor.megatron.override_transformer_config.account_for_embedding_in_pipeline_split=True \
|
| 164 |
+
reward_model.reward_manager=dapo \
|
| 165 |
+
+reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
|
| 166 |
+
+reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
|
| 167 |
+
+reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
|
| 168 |
+
+reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
|
| 169 |
+
+reward_model.reward_kwargs.max_resp_len=${max_response_length} \
|
| 170 |
+
trainer.logger=['console','wandb'] \
|
| 171 |
+
trainer.project_name="${project_name}" \
|
| 172 |
+
trainer.experiment_name="${exp_name}" \
|
| 173 |
+
trainer.n_gpus_per_node=8 \
|
| 174 |
+
trainer.nnodes="${NNODES}" \
|
| 175 |
+
trainer.val_before_train=False \
|
| 176 |
+
trainer.test_freq=10 \
|
| 177 |
+
trainer.save_freq=100 \
|
| 178 |
+
trainer.total_epochs=10 \
|
| 179 |
+
trainer.default_local_dir="${CKPTS_DIR}" \
|
| 180 |
+
trainer.resume_mode=auto \
|
| 181 |
+
trainer.log_val_generations=10
|
code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen3-8b_npu.sh
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
set -x
|
| 2 |
+
|
| 3 |
+
project_name='GRPO-Qwen3'
|
| 4 |
+
exp_name='GRPO-Qwen3-8B-npu'
|
| 5 |
+
gen_tp=2
|
| 6 |
+
RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
|
| 7 |
+
MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen3-8B"}
|
| 8 |
+
CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}
|
| 9 |
+
TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"}
|
| 10 |
+
TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"}
|
| 11 |
+
|
| 12 |
+
python3 -m verl.trainer.main_ppo \
|
| 13 |
+
algorithm.adv_estimator=grpo \
|
| 14 |
+
data.train_files="${TRAIN_FILE}" \
|
| 15 |
+
data.val_files="${TEST_FILE}" \
|
| 16 |
+
data.train_batch_size=256 \
|
| 17 |
+
data.max_prompt_length=512 \
|
| 18 |
+
data.max_response_length=1024 \
|
| 19 |
+
data.filter_overlong_prompts=True \
|
| 20 |
+
data.truncation='error' \
|
| 21 |
+
actor_rollout_ref.model.path=${MODEL_PATH} \
|
| 22 |
+
actor_rollout_ref.actor.optim.lr=1e-6 \
|
| 23 |
+
actor_rollout_ref.model.use_remove_padding=True \
|
| 24 |
+
actor_rollout_ref.actor.ppo_mini_batch_size=64 \
|
| 25 |
+
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=10 \
|
| 26 |
+
actor_rollout_ref.actor.use_kl_loss=True \
|
| 27 |
+
actor_rollout_ref.actor.kl_loss_coef=0.001 \
|
| 28 |
+
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
|
| 29 |
+
actor_rollout_ref.actor.entropy_coeff=0 \
|
| 30 |
+
actor_rollout_ref.actor.use_torch_compile=False \
|
| 31 |
+
actor_rollout_ref.ref.use_torch_compile=False \
|
| 32 |
+
actor_rollout_ref.model.enable_gradient_checkpointing=True \
|
| 33 |
+
actor_rollout_ref.actor.fsdp_config.param_offload=False \
|
| 34 |
+
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
|
| 35 |
+
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 \
|
| 36 |
+
actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
|
| 37 |
+
actor_rollout_ref.rollout.name=vllm \
|
| 38 |
+
actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
|
| 39 |
+
actor_rollout_ref.rollout.n=5 \
|
| 40 |
+
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 \
|
| 41 |
+
actor_rollout_ref.ref.fsdp_config.param_offload=True \
|
| 42 |
+
algorithm.use_kl_in_reward=False \
|
| 43 |
+
trainer.critic_warmup=0 \
|
| 44 |
+
trainer.logger='["console","wandb"]' \
|
| 45 |
+
trainer.project_name="${project_name}" \
|
| 46 |
+
trainer.experiment_name="${exp_name}" \
|
| 47 |
+
trainer.n_gpus_per_node=8 \
|
| 48 |
+
trainer.nnodes=1 \
|
| 49 |
+
trainer.default_local_dir=${CKPTS_DIR} \
|
| 50 |
+
trainer.resume_mode=auto \
|
| 51 |
+
actor_rollout_ref.actor.fsdp_config.forward_prefetch=True \
|
| 52 |
+
actor_rollout_ref.ref.fsdp_config.forward_prefetch=True \
|
| 53 |
+
++actor_rollout_ref.actor.entropy_from_logits_with_chunking=True \
|
| 54 |
+
++actor_rollout_ref.ref.entropy_from_logits_with_chunking=True \
|
| 55 |
+
trainer.val_before_train=True \
|
| 56 |
+
trainer.save_freq=5 \
|
| 57 |
+
trainer.test_freq=5 \
|
| 58 |
+
trainer.total_epochs=15
|
code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen3_4b_grpo_vllm_1k_npu.sh
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
set -xeuo pipefail
|
| 2 |
+
source /usr/local/Ascend/ascend-toolkit/set_env.sh
|
| 3 |
+
source /usr/local/Ascend/nnal/atb/set_env.sh
|
| 4 |
+
|
| 5 |
+
# 使用v1引擎
|
| 6 |
+
export VLLM_USE_V1=1
|
| 7 |
+
# 指定vllm 版本
|
| 8 |
+
export VLLM_VERSION=0.9.1
|
| 9 |
+
|
| 10 |
+
# 开启二级流水
|
| 11 |
+
export TASK_QUEUE_ENABLE=2
|
| 12 |
+
# 开启细绑核
|
| 13 |
+
export CPU_AFFINITY_CONF=1
|
| 14 |
+
# 使用jemalloc优化内存访问(依赖安装jemalloc)
|
| 15 |
+
export LD_PRELOAD="/usr/lib/aarch64-linux-gnu/libjemalloc.so.2${LD_PRELOAD:+:$LD_PRELOAD}"
|
| 16 |
+
|
| 17 |
+
# A3 机器单机8卡
|
| 18 |
+
trainer_n_gpus_per_node=16
|
| 19 |
+
trainer_nnodes=1
|
| 20 |
+
trainer_project_name='verl_grpo_example_gsm8k'
|
| 21 |
+
trainer_experiment_name="qwen3_4b_grpo_8npu}"
|
| 22 |
+
|
| 23 |
+
RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
|
| 24 |
+
MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen3-4B"}
|
| 25 |
+
CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${trainer_project_name}/${trainer_experiment_name}"}
|
| 26 |
+
TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/gsm8k/train.parquet"}
|
| 27 |
+
TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/gsm8k/test.parquet"}
|
| 28 |
+
|
| 29 |
+
export TENSORBOARD_DIR="${RAY_DATA_HOME}/tensorboard_dir/${trainer_project_name}/${trainer_experiment_name}"
|
| 30 |
+
mkdir -p "${RAY_DATA_HOME}/logs/${trainer_project_name}"
|
| 31 |
+
LOG_PATH="${RAY_DATA_HOME}/logs/${trainer_project_name}/${trainer_experiment_name}.log"
|
| 32 |
+
|
| 33 |
+
use_dynamic_bsz=True
|
| 34 |
+
|
| 35 |
+
python3 -m verl.trainer.main_ppo \
|
| 36 |
+
algorithm.adv_estimator=grpo \
|
| 37 |
+
data.train_files=${TRAIN_FILE} \
|
| 38 |
+
data.val_files=${TEST_FILE} \
|
| 39 |
+
data.train_batch_size=512 \
|
| 40 |
+
data.max_prompt_length=1024 \
|
| 41 |
+
data.max_response_length=1024 \
|
| 42 |
+
data.filter_overlong_prompts=True \
|
| 43 |
+
data.truncation='error' \
|
| 44 |
+
actor_rollout_ref.model.path=${MODEL_PATH} \
|
| 45 |
+
actor_rollout_ref.actor.optim.lr=5e-7 \
|
| 46 |
+
actor_rollout_ref.model.use_remove_padding=True \
|
| 47 |
+
actor_rollout_ref.actor.entropy_coeff=0.001 \
|
| 48 |
+
actor_rollout_ref.actor.ppo_mini_batch_size=256 \
|
| 49 |
+
actor_rollout_ref.actor.use_kl_loss=True \
|
| 50 |
+
actor_rollout_ref.actor.kl_loss_coef=0.001 \
|
| 51 |
+
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
|
| 52 |
+
actor_rollout_ref.actor.use_torch_compile=False \
|
| 53 |
+
actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
|
| 54 |
+
actor_rollout_ref.actor.ppo_max_token_len_per_gpu=3000 \
|
| 55 |
+
actor_rollout_ref.model.enable_gradient_checkpointing=True \
|
| 56 |
+
actor_rollout_ref.actor.fsdp_config.param_offload=True \
|
| 57 |
+
actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
|
| 58 |
+
actor_rollout_ref.rollout.enforce_eager=True \
|
| 59 |
+
actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
|
| 60 |
+
actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=4096 \
|
| 61 |
+
actor_rollout_ref.rollout.enable_chunked_prefill=False \
|
| 62 |
+
actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
|
| 63 |
+
actor_rollout_ref.rollout.name=vllm \
|
| 64 |
+
actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
|
| 65 |
+
actor_rollout_ref.rollout.n=5 \
|
| 66 |
+
actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
|
| 67 |
+
actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=8192 \
|
| 68 |
+
actor_rollout_ref.ref.fsdp_config.param_offload=True \
|
| 69 |
+
actor_rollout_ref.ref.use_torch_compile=True \
|
| 70 |
+
algorithm.kl_ctrl.kl_coef=0.001 \
|
| 71 |
+
trainer.critic_warmup=0 \
|
| 72 |
+
trainer.project_name=${trainer_project_name} \
|
| 73 |
+
trainer.experiment_name=${trainer_experiment_name} \
|
| 74 |
+
trainer.logger=['console','tensorboard'] \
|
| 75 |
+
trainer.default_local_dir=${CKPTS_DIR} \
|
| 76 |
+
trainer.n_gpus_per_node=$trainer_n_gpus_per_node \
|
| 77 |
+
trainer.nnodes=$trainer_nnodes \
|
| 78 |
+
trainer.save_freq=-1 \
|
| 79 |
+
trainer.test_freq=5 \
|
| 80 |
+
trainer.total_epochs=15 \
|
| 81 |
+
trainer.val_before_train=False 2>&1 | tee ${LOG_PATH}
|
code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen3_8b_grpo_sglang_32k_spmd_npu.sh
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
set -x
|
| 2 |
+
export HCCL_CONNECT_TIMEOUT=1500
|
| 3 |
+
export HCCL_HOST_SOCKET_PORT_RANGE=60000-60050
|
| 4 |
+
export HCCL_NPU_SOCKET_PORT_RANGE=61000-61050
|
| 5 |
+
export RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES=1
|
| 6 |
+
export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
|
| 7 |
+
# WORKSPACE_HOME and DATA_HOME support custom path configuration.
|
| 8 |
+
WORKSPACE_HOME=$(pwd)
|
| 9 |
+
DATA_HOME=$(pwd)
|
| 10 |
+
|
| 11 |
+
sp_size=4
|
| 12 |
+
num_gpu=8
|
| 13 |
+
tp_size=4
|
| 14 |
+
train_prompt_bsz=16
|
| 15 |
+
train_prompt_mini_bsz=16
|
| 16 |
+
|
| 17 |
+
max_prompt_length=$((1024 * 2))
|
| 18 |
+
max_response_length=$((1024 * 32))
|
| 19 |
+
|
| 20 |
+
CKPTS_DIR=$WORKSPACE_HOME/logs/ckpt/qwen3_8b
|
| 21 |
+
model_path=$DATA_HOME/models/Qwen3-8B
|
| 22 |
+
train_data=$DATA_HOME/datasets/dapo/dapo-math-17k.parquet
|
| 23 |
+
valid_data=$DATA_HOME/datasets/dapo/aime-2024.parquet
|
| 24 |
+
|
| 25 |
+
python3 -m verl.trainer.main_ppo \
|
| 26 |
+
algorithm.adv_estimator=grpo \
|
| 27 |
+
data.train_files=$train_data \
|
| 28 |
+
data.val_files=$valid_data \
|
| 29 |
+
data.train_batch_size=$train_prompt_bsz \
|
| 30 |
+
data.max_prompt_length=$max_prompt_length \
|
| 31 |
+
data.max_response_length=$max_response_length \
|
| 32 |
+
data.filter_overlong_prompts=False \
|
| 33 |
+
data.truncation='error' \
|
| 34 |
+
actor_rollout_ref.model.path=$model_path \
|
| 35 |
+
actor_rollout_ref.actor.optim.lr=1e-6 \
|
| 36 |
+
actor_rollout_ref.model.use_remove_padding=True \
|
| 37 |
+
actor_rollout_ref.actor.ppo_mini_batch_size=$train_prompt_mini_bsz \
|
| 38 |
+
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \
|
| 39 |
+
actor_rollout_ref.actor.use_kl_loss=True \
|
| 40 |
+
actor_rollout_ref.actor.entropy_coeff=0 \
|
| 41 |
+
actor_rollout_ref.actor.kl_loss_coef=0.001 \
|
| 42 |
+
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
|
| 43 |
+
actor_rollout_ref.actor.use_torch_compile=False \
|
| 44 |
+
actor_rollout_ref.model.enable_gradient_checkpointing=True \
|
| 45 |
+
actor_rollout_ref.actor.fsdp_config.param_offload=True \
|
| 46 |
+
actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
|
| 47 |
+
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \
|
| 48 |
+
actor_rollout_ref.rollout.tensor_model_parallel_size=$tp_size \
|
| 49 |
+
actor_rollout_ref.rollout.name=sglang \
|
| 50 |
+
actor_rollout_ref.rollout.gpu_memory_utilization=0.3 \
|
| 51 |
+
actor_rollout_ref.rollout.n=5 \
|
| 52 |
+
+actor_rollout_ref.rollout.engine_kwargs.sglang.attention_backend="ascend" \
|
| 53 |
+
actor_rollout_ref.ref.fsdp_config.param_offload=True \
|
| 54 |
+
actor_rollout_ref.rollout.enable_chunked_prefill=False \
|
| 55 |
+
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \
|
| 56 |
+
actor_rollout_ref.ref.fsdp_config.param_offload=True \
|
| 57 |
+
actor_rollout_ref.nccl_timeout=3600 \
|
| 58 |
+
algorithm.use_kl_in_reward=False \
|
| 59 |
+
trainer.critic_warmup=0 \
|
| 60 |
+
trainer.logger=console \
|
| 61 |
+
trainer.val_before_train=False \
|
| 62 |
+
trainer.project_name='verl_grpo_example_2k_32k' \
|
| 63 |
+
trainer.experiment_name='qwen3_8b_function_rm' \
|
| 64 |
+
trainer.n_gpus_per_node=$num_gpu \
|
| 65 |
+
trainer.nnodes=1 \
|
| 66 |
+
trainer.save_freq=1000 \
|
| 67 |
+
trainer.test_freq=10000 \
|
| 68 |
+
trainer.total_epochs=5 \
|
| 69 |
+
trainer.default_local_dir="${CKPTS_DIR}" \
|
| 70 |
+
actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \
|
| 71 |
+
actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} $@
|
code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen3_vl-235b-megatron.sh
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
set -x
|
| 2 |
+
ENGINE=${1:-vllm}
|
| 3 |
+
export CUDA_DEVICE_MAX_CONNECTIONS=1 # For megatron communication/computation overlapping
|
| 4 |
+
|
| 5 |
+
export VLLM_ALLREDUCE_USE_SYMM_MEM=0 # for vllm0.11.0 with TP
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
HF_MODEL_PATH=${HF_MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen3-VL-235B-A22B-Instruct"}
|
| 9 |
+
|
| 10 |
+
GEN_TP=${GEN_TP:-16}
|
| 11 |
+
CP=${CP:-2}
|
| 12 |
+
TP=${TP:-4}
|
| 13 |
+
PP=${PP:-8}
|
| 14 |
+
EP=${EP:-8}
|
| 15 |
+
ETP=${ETP:-1}
|
| 16 |
+
|
| 17 |
+
train_path=$HOME/data/geo3k/train.parquet
|
| 18 |
+
test_path=$HOME/data/geo3k/test.parquet
|
| 19 |
+
|
| 20 |
+
python3 -m verl.trainer.main_ppo --config-path=config \
|
| 21 |
+
--config-name='ppo_megatron_trainer.yaml'\
|
| 22 |
+
algorithm.adv_estimator=grpo \
|
| 23 |
+
data.train_files="$train_path" \
|
| 24 |
+
data.val_files="$test_path" \
|
| 25 |
+
data.train_batch_size=512 \
|
| 26 |
+
data.max_prompt_length=1024 \
|
| 27 |
+
data.max_response_length=2048 \
|
| 28 |
+
data.filter_overlong_prompts=True \
|
| 29 |
+
data.truncation='error' \
|
| 30 |
+
actor_rollout_ref.model.path=$HF_MODEL_PATH \
|
| 31 |
+
actor_rollout_ref.actor.optim.lr=1e-6 \
|
| 32 |
+
actor_rollout_ref.actor.ppo_mini_batch_size=128 \
|
| 33 |
+
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \
|
| 34 |
+
actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=$PP \
|
| 35 |
+
actor_rollout_ref.actor.megatron.tensor_model_parallel_size=$TP \
|
| 36 |
+
actor_rollout_ref.actor.megatron.context_parallel_size=$CP \
|
| 37 |
+
actor_rollout_ref.actor.megatron.expert_model_parallel_size=$EP \
|
| 38 |
+
actor_rollout_ref.actor.megatron.expert_tensor_parallel_size=$ETP \
|
| 39 |
+
actor_rollout_ref.actor.use_kl_loss=True \
|
| 40 |
+
actor_rollout_ref.actor.kl_loss_coef=0.01 \
|
| 41 |
+
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
|
| 42 |
+
actor_rollout_ref.actor.entropy_coeff=0 \
|
| 43 |
+
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \
|
| 44 |
+
actor_rollout_ref.rollout.tensor_model_parallel_size=$GEN_TP \
|
| 45 |
+
actor_rollout_ref.actor.use_dynamic_bsz=True \
|
| 46 |
+
actor_rollout_ref.actor.ppo_max_token_len_per_gpu=4096 \
|
| 47 |
+
actor_rollout_ref.ref.log_prob_use_dynamic_bsz=True \
|
| 48 |
+
actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=4096 \
|
| 49 |
+
actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=True \
|
| 50 |
+
actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=4096 \
|
| 51 |
+
actor_rollout_ref.rollout.name=$ENGINE \
|
| 52 |
+
+actor_rollout_ref.rollout.engine_kwargs.vllm.disable_mm_preprocessor_cache=True \
|
| 53 |
+
actor_rollout_ref.rollout.gpu_memory_utilization=0.7 \
|
| 54 |
+
actor_rollout_ref.rollout.n=5 \
|
| 55 |
+
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \
|
| 56 |
+
actor_rollout_ref.actor.megatron.use_mbridge=True \
|
| 57 |
+
actor_rollout_ref.actor.megatron.param_offload=True \
|
| 58 |
+
actor_rollout_ref.actor.megatron.optimizer_offload=True \
|
| 59 |
+
actor_rollout_ref.actor.megatron.grad_offload=True \
|
| 60 |
+
actor_rollout_ref.ref.megatron.param_offload=True \
|
| 61 |
+
+actor_rollout_ref.actor.optim.override_optimizer_config.optimizer_offload_fraction=1 \
|
| 62 |
+
+actor_rollout_ref.actor.optim.override_optimizer_config.overlap_cpu_optimizer_d2h_h2d=True \
|
| 63 |
+
+actor_rollout_ref.actor.optim.override_optimizer_config.use_precision_aware_optimizer=True \
|
| 64 |
+
+actor_rollout_ref.actor.optim.override_optimizer_config.optimizer_cpu_offload=True \
|
| 65 |
+
+actor_rollout_ref.actor.megatron.override_transformer_config.moe_router_dtype=fp32 \
|
| 66 |
+
+actor_rollout_ref.actor.megatron.override_transformer_config.moe_enable_deepep=True \
|
| 67 |
+
+actor_rollout_ref.actor.megatron.override_transformer_config.moe_token_dispatcher_type=flex \
|
| 68 |
+
+actor_rollout_ref.actor.megatron.override_transformer_config.recompute_method=uniform \
|
| 69 |
+
+actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=full \
|
| 70 |
+
+actor_rollout_ref.actor.megatron.override_transformer_config.recompute_num_layers=1 \
|
| 71 |
+
+actor_rollout_ref.actor.megatron.override_transformer_config.gradient_accumulation_fusion=True \
|
| 72 |
+
+actor_rollout_ref.actor.megatron.override_transformer_config.moe_permute_fusion=True \
|
| 73 |
+
+actor_rollout_ref.actor.megatron.override_transformer_config.account_for_loss_in_pipeline_split=True \
|
| 74 |
+
+actor_rollout_ref.actor.megatron.override_transformer_config.account_for_embedding_in_pipeline_split=True \
|
| 75 |
+
algorithm.use_kl_in_reward=False \
|
| 76 |
+
trainer.critic_warmup=0 \
|
| 77 |
+
trainer.logger='["console","wandb"]' \
|
| 78 |
+
trainer.project_name='verl_grpo_example_geo3k' \
|
| 79 |
+
trainer.experiment_name='qwen3_vl_235b_megatron' \
|
| 80 |
+
trainer.n_gpus_per_node=8 \
|
| 81 |
+
trainer.nnodes=8 \
|
| 82 |
+
trainer.save_freq=20 \
|
| 83 |
+
trainer.test_freq=5 \
|
| 84 |
+
trainer.total_epochs=15 $@
|
code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen3_vl-30b-megatron.sh
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
set -x
|
| 2 |
+
ENGINE=${1:-vllm}
|
| 3 |
+
export CUDA_DEVICE_MAX_CONNECTIONS=1 # For megatron communication/computation overlapping
|
| 4 |
+
|
| 5 |
+
export VLLM_ALLREDUCE_USE_SYMM_MEM=0 # for vllm0.11.0 with TP
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
HF_MODEL_PATH=${HF_MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen3-VL-30B-A3B-Instruct"}
|
| 9 |
+
|
| 10 |
+
GEN_TP=${GEN_TP:-4}
|
| 11 |
+
CP=${CP:-2}
|
| 12 |
+
TP=${TP:-2}
|
| 13 |
+
PP=${PP:-1}
|
| 14 |
+
EP=${EP:-8}
|
| 15 |
+
ETP=${ETP:-1}
|
| 16 |
+
|
| 17 |
+
train_path=$HOME/data/geo3k/train.parquet
|
| 18 |
+
test_path=$HOME/data/geo3k/test.parquet
|
| 19 |
+
|
| 20 |
+
python3 -m verl.trainer.main_ppo --config-path=config \
|
| 21 |
+
--config-name='ppo_megatron_trainer.yaml'\
|
| 22 |
+
algorithm.adv_estimator=grpo \
|
| 23 |
+
data.train_files="$train_path" \
|
| 24 |
+
data.val_files="$test_path" \
|
| 25 |
+
data.train_batch_size=512 \
|
| 26 |
+
data.max_prompt_length=1024 \
|
| 27 |
+
data.max_response_length=2048 \
|
| 28 |
+
data.filter_overlong_prompts=True \
|
| 29 |
+
data.truncation='error' \
|
| 30 |
+
actor_rollout_ref.model.path=$HF_MODEL_PATH \
|
| 31 |
+
actor_rollout_ref.actor.optim.lr=1e-6 \
|
| 32 |
+
actor_rollout_ref.actor.ppo_mini_batch_size=128 \
|
| 33 |
+
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \
|
| 34 |
+
actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=$PP \
|
| 35 |
+
actor_rollout_ref.actor.megatron.tensor_model_parallel_size=$TP \
|
| 36 |
+
actor_rollout_ref.actor.megatron.context_parallel_size=$CP \
|
| 37 |
+
actor_rollout_ref.actor.megatron.expert_model_parallel_size=$EP \
|
| 38 |
+
actor_rollout_ref.actor.megatron.expert_tensor_parallel_size=$ETP \
|
| 39 |
+
actor_rollout_ref.actor.use_kl_loss=True \
|
| 40 |
+
actor_rollout_ref.actor.kl_loss_coef=0.01 \
|
| 41 |
+
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
|
| 42 |
+
actor_rollout_ref.actor.entropy_coeff=0 \
|
| 43 |
+
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \
|
| 44 |
+
actor_rollout_ref.rollout.tensor_model_parallel_size=$GEN_TP \
|
| 45 |
+
actor_rollout_ref.actor.use_dynamic_bsz=True \
|
| 46 |
+
actor_rollout_ref.actor.ppo_max_token_len_per_gpu=4096 \
|
| 47 |
+
actor_rollout_ref.ref.log_prob_use_dynamic_bsz=True \
|
| 48 |
+
actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=4096 \
|
| 49 |
+
actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=True \
|
| 50 |
+
actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=4096 \
|
| 51 |
+
actor_rollout_ref.rollout.name=$ENGINE \
|
| 52 |
+
+actor_rollout_ref.rollout.engine_kwargs.vllm.disable_mm_preprocessor_cache=True \
|
| 53 |
+
actor_rollout_ref.rollout.gpu_memory_utilization=0.7 \
|
| 54 |
+
actor_rollout_ref.rollout.n=5 \
|
| 55 |
+
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \
|
| 56 |
+
actor_rollout_ref.actor.megatron.use_mbridge=True \
|
| 57 |
+
actor_rollout_ref.actor.megatron.param_offload=True \
|
| 58 |
+
actor_rollout_ref.actor.megatron.optimizer_offload=True \
|
| 59 |
+
actor_rollout_ref.actor.megatron.grad_offload=True \
|
| 60 |
+
actor_rollout_ref.ref.megatron.param_offload=True \
|
| 61 |
+
+actor_rollout_ref.actor.optim.override_optimizer_config.optimizer_offload_fraction=1 \
|
| 62 |
+
+actor_rollout_ref.actor.optim.override_optimizer_config.overlap_cpu_optimizer_d2h_h2d=True \
|
| 63 |
+
+actor_rollout_ref.actor.optim.override_optimizer_config.use_precision_aware_optimizer=True \
|
| 64 |
+
+actor_rollout_ref.actor.optim.override_optimizer_config.optimizer_cpu_offload=True \
|
| 65 |
+
+actor_rollout_ref.actor.megatron.override_transformer_config.moe_router_dtype=fp32 \
|
| 66 |
+
+actor_rollout_ref.actor.megatron.override_transformer_config.moe_enable_deepep=True \
|
| 67 |
+
+actor_rollout_ref.actor.megatron.override_transformer_config.moe_token_dispatcher_type=flex \
|
| 68 |
+
+actor_rollout_ref.actor.megatron.override_transformer_config.recompute_method=uniform \
|
| 69 |
+
+actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=full \
|
| 70 |
+
+actor_rollout_ref.actor.megatron.override_transformer_config.recompute_num_layers=1 \
|
| 71 |
+
+actor_rollout_ref.actor.megatron.override_transformer_config.gradient_accumulation_fusion=True \
|
| 72 |
+
+actor_rollout_ref.actor.megatron.override_transformer_config.moe_permute_fusion=True \
|
| 73 |
+
`# Use aux_loss and z_loss to mitigate expert load imbalance when training MoE models` \
|
| 74 |
+
+actor_rollout_ref.actor.megatron.override_transformer_config.moe_aux_loss_coeff=0.01 \
|
| 75 |
+
+actor_rollout_ref.actor.megatron.override_transformer_config.moe_z_loss_coeff=0.001 \
|
| 76 |
+
algorithm.use_kl_in_reward=False \
|
| 77 |
+
trainer.critic_warmup=0 \
|
| 78 |
+
trainer.logger='["console","wandb"]' \
|
| 79 |
+
trainer.project_name='verl_grpo_example_geo3k' \
|
| 80 |
+
trainer.experiment_name='qwen3_vl_30b_megatron' \
|
| 81 |
+
trainer.n_gpus_per_node=8 \
|
| 82 |
+
trainer.nnodes=1 \
|
| 83 |
+
trainer.save_freq=20 \
|
| 84 |
+
trainer.test_freq=5 \
|
| 85 |
+
trainer.total_epochs=15 $@
|
code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen3moe-30b_megatron_96gb.sh
ADDED
|
@@ -0,0 +1,195 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
set -x
|
| 2 |
+
|
| 3 |
+
# tested in NNODES=1~4 * 96G H20 GPU
|
| 4 |
+
NNODES=${NNODES:-1}
|
| 5 |
+
NGPUS_PER_NODES=${NGPUS_PER_NODES:-8}
|
| 6 |
+
|
| 7 |
+
project_name='DAPO-Qwen3-30b-MATH'
|
| 8 |
+
exp_name='DAPO-Qwen3-30b-MATH-megatron'
|
| 9 |
+
|
| 10 |
+
adv_estimator=grpo
|
| 11 |
+
|
| 12 |
+
use_kl_in_reward=False
|
| 13 |
+
kl_coef=0.0
|
| 14 |
+
use_kl_loss=False
|
| 15 |
+
kl_loss_coef=0.0
|
| 16 |
+
|
| 17 |
+
clip_ratio_low=0.2
|
| 18 |
+
clip_ratio_high=0.28
|
| 19 |
+
max_prompt_length=$((1024 * 2))
|
| 20 |
+
max_response_length=$((1024 * 8))
|
| 21 |
+
enable_overlong_buffer=True
|
| 22 |
+
overlong_buffer_len=$((1024 * 4))
|
| 23 |
+
overlong_penalty_factor=1.0
|
| 24 |
+
|
| 25 |
+
loss_agg_mode="token-mean"
|
| 26 |
+
|
| 27 |
+
train_prompt_bsz=512
|
| 28 |
+
n_resp_per_prompt=16
|
| 29 |
+
train_prompt_mini_bsz=128
|
| 30 |
+
train_ppo_micro_batch_size_per_gpu=2
|
| 31 |
+
infer_ppo_micro_batch_size_per_gpu=2
|
| 32 |
+
# Paths
|
| 33 |
+
MODEL_PATH=Qwen/Qwen3-30B-A3B-Base
|
| 34 |
+
|
| 35 |
+
RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
|
| 36 |
+
TRAIN_FILE=$RAY_DATA_HOME/dataset/dapo-math-17k.parquet
|
| 37 |
+
TEST_FILE=$RAY_DATA_HOME/dataset/aime-2024.parquet
|
| 38 |
+
TEST_FILE="['$TEST_FILE']"
|
| 39 |
+
|
| 40 |
+
# Algorithm
|
| 41 |
+
temperature=1.0
|
| 42 |
+
top_p=1.0
|
| 43 |
+
top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
|
| 44 |
+
val_top_p=0.7
|
| 45 |
+
|
| 46 |
+
# Performance Related Parameter
|
| 47 |
+
use_dynamic_bsz=True
|
| 48 |
+
actor_ppo_max_token_len=$(((max_prompt_length + max_response_length)))
|
| 49 |
+
infer_ppo_max_token_len=$(((max_prompt_length + max_response_length)))
|
| 50 |
+
offload=True
|
| 51 |
+
|
| 52 |
+
optimizer_offload_fraction=${OFFLOAD_FRACTION:-1.}
|
| 53 |
+
|
| 54 |
+
COMMON_PP=${COMMON_PP:-1}
|
| 55 |
+
COMMON_VPP=${COMMON_VPP:-null}
|
| 56 |
+
COMMON_CP=${COMMON_CP:-1}
|
| 57 |
+
COMMON_TP=${COMMON_TP:-1}
|
| 58 |
+
COMMON_EP=${COMMON_EP:-8}
|
| 59 |
+
COMMON_ETP=${COMMON_ETP:-1}
|
| 60 |
+
|
| 61 |
+
TRAIN_TP=${TRAIN_TP:-$COMMON_TP}
|
| 62 |
+
INFER_TP=${INFER_TP:-4}
|
| 63 |
+
|
| 64 |
+
ACTOR_PP=${ACTOR_PP:-$COMMON_PP}
|
| 65 |
+
ACTOR_VPP=${ACTOR_VPP:-$COMMON_VPP}
|
| 66 |
+
ACTOR_CP=${ACTOR_CP:-$COMMON_CP}
|
| 67 |
+
ACTOR_TP=${ACTOR_TP:-$TRAIN_TP}
|
| 68 |
+
ACTOR_EP=${ACTOR_EP:-$COMMON_EP}
|
| 69 |
+
ACTOR_ETP=${ACTOR_ETP:-$COMMON_ETP}
|
| 70 |
+
ROLLOUT_TP=${ROLLOUT_TP:-$INFER_TP}
|
| 71 |
+
REF_PP=${REF_PP:-$COMMON_PP}
|
| 72 |
+
REF_VPP=${REF_VPP:-$COMMON_VPP}
|
| 73 |
+
REF_CP=${REF_CP:-$COMMON_CP}
|
| 74 |
+
REF_TP=${REF_TP:-$TRAIN_TP}
|
| 75 |
+
REF_EP=${REF_EP:-$COMMON_EP}
|
| 76 |
+
REF_ETP=${REF_ETP:-$COMMON_ETP}
|
| 77 |
+
CRITIC_PP=${CRITIC_PP:-$COMMON_PP}
|
| 78 |
+
CRITIC_VPP=${CRITIC_VPP:-$COMMON_VPP}
|
| 79 |
+
CRITIC_CP=${CRITIC_CP:-$COMMON_CP}
|
| 80 |
+
CRITIC_TP=${CRITIC_TP:-$TRAIN_TP}
|
| 81 |
+
CRITIC_EP=${CRITIC_EP:-$COMMON_EP}
|
| 82 |
+
CRITIC_ETP=${CRITIC_ETP:-$COMMON_ETP}
|
| 83 |
+
RM_PP=${RM_PP:-$COMMON_PP}
|
| 84 |
+
RM_VPP=${RM_VPP:-$COMMON_VPP}
|
| 85 |
+
RM_CP=${RM_CP:-$COMMON_CP}
|
| 86 |
+
RM_TP=${RM_TP:-$TRAIN_TP}
|
| 87 |
+
RM_EP=${RM_EP:-$COMMON_EP}
|
| 88 |
+
RM_ETP=${RM_ETP:-$COMMON_ETP}
|
| 89 |
+
|
| 90 |
+
# install mbridge
|
| 91 |
+
# pip3 install git+https://github.com/ISEEKYAN/mbridge
|
| 92 |
+
USE_MBRIDGE=True
|
| 93 |
+
USE_DIST_CKPT=False
|
| 94 |
+
|
| 95 |
+
python3 -m verl.trainer.main_ppo --config-path=./config --config-name='ppo_megatron_trainer'\
|
| 96 |
+
data.train_files="${TRAIN_FILE}" \
|
| 97 |
+
data.val_files="${TEST_FILE}" \
|
| 98 |
+
data.prompt_key=prompt \
|
| 99 |
+
data.truncation='left' \
|
| 100 |
+
data.max_prompt_length=${max_prompt_length} \
|
| 101 |
+
data.max_response_length=${max_response_length} \
|
| 102 |
+
data.train_batch_size=${train_prompt_bsz} \
|
| 103 |
+
actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
|
| 104 |
+
algorithm.adv_estimator=${adv_estimator} \
|
| 105 |
+
algorithm.use_kl_in_reward=${use_kl_in_reward} \
|
| 106 |
+
algorithm.kl_ctrl.kl_coef=${kl_coef} \
|
| 107 |
+
actor_rollout_ref.model.path="${MODEL_PATH}" \
|
| 108 |
+
actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
|
| 109 |
+
actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
|
| 110 |
+
actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
|
| 111 |
+
actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
|
| 112 |
+
actor_rollout_ref.actor.clip_ratio_c=10.0 \
|
| 113 |
+
+actor_rollout_ref.model.override_config.model_config.max_position_embeddings=$((max_prompt_length + max_response_length)) \
|
| 114 |
+
actor_rollout_ref.model.use_fused_kernels=False \
|
| 115 |
+
actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
|
| 116 |
+
actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
|
| 117 |
+
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=${train_ppo_micro_batch_size_per_gpu} \
|
| 118 |
+
actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
|
| 119 |
+
actor_rollout_ref.actor.optim.lr=1e-6 \
|
| 120 |
+
actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
|
| 121 |
+
actor_rollout_ref.actor.optim.lr_decay_style='constant' \
|
| 122 |
+
actor_rollout_ref.actor.optim.weight_decay=0.1 \
|
| 123 |
+
+actor_rollout_ref.actor.optim.override_optimizer_config.optimizer_offload_fraction=${optimizer_offload_fraction} \
|
| 124 |
+
+actor_rollout_ref.actor.optim.override_optimizer_config.overlap_cpu_optimizer_d2h_h2d=True \
|
| 125 |
+
+actor_rollout_ref.actor.optim.override_optimizer_config.use_precision_aware_optimizer=True \
|
| 126 |
+
+actor_rollout_ref.actor.optim.override_optimizer_config.optimizer_cpu_offload=True \
|
| 127 |
+
actor_rollout_ref.actor.megatron.use_mbridge=$USE_MBRIDGE \
|
| 128 |
+
actor_rollout_ref.actor.megatron.use_dist_checkpointing=$USE_DIST_CKPT \
|
| 129 |
+
actor_rollout_ref.actor.megatron.param_offload=${offload} \
|
| 130 |
+
actor_rollout_ref.actor.megatron.grad_offload=${offload} \
|
| 131 |
+
actor_rollout_ref.actor.megatron.optimizer_offload=${offload} \
|
| 132 |
+
actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${ACTOR_TP} \
|
| 133 |
+
actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${ACTOR_PP} \
|
| 134 |
+
actor_rollout_ref.actor.megatron.virtual_pipeline_model_parallel_size=${ACTOR_VPP} \
|
| 135 |
+
actor_rollout_ref.actor.megatron.context_parallel_size=${ACTOR_CP} \
|
| 136 |
+
actor_rollout_ref.actor.megatron.expert_model_parallel_size=${ACTOR_EP} \
|
| 137 |
+
actor_rollout_ref.actor.megatron.expert_tensor_parallel_size=${ACTOR_ETP} \
|
| 138 |
+
+actor_rollout_ref.actor.megatron.override_transformer_config.apply_rope_fusion=True \
|
| 139 |
+
+actor_rollout_ref.actor.megatron.override_transformer_config.masked_softmax_fusion=True \
|
| 140 |
+
+actor_rollout_ref.actor.megatron.override_transformer_config.bias_activation_fusion=True \
|
| 141 |
+
+actor_rollout_ref.actor.megatron.override_transformer_config.bias_dropout_fusion=True \
|
| 142 |
+
+actor_rollout_ref.actor.megatron.override_transformer_config.gradient_accumulation_fusion=True \
|
| 143 |
+
+actor_rollout_ref.actor.megatron.override_transformer_config.deallocate_pipeline_outputs=True \
|
| 144 |
+
+actor_rollout_ref.actor.megatron.override_transformer_config.persist_layer_norm=True \
|
| 145 |
+
+actor_rollout_ref.actor.megatron.override_transformer_config.moe_grouped_gemm=True \
|
| 146 |
+
+actor_rollout_ref.actor.megatron.override_transformer_config.moe_permute_fusion=True \
|
| 147 |
+
+actor_rollout_ref.actor.megatron.override_transformer_config.moe_token_dispatcher_type="flex" \
|
| 148 |
+
+actor_rollout_ref.actor.megatron.override_transformer_config.moe_router_dtype=fp32 \
|
| 149 |
+
+actor_rollout_ref.actor.megatron.override_transformer_config.moe_enable_deepep=True \
|
| 150 |
+
actor_rollout_ref.actor.entropy_coeff=0 \
|
| 151 |
+
actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
|
| 152 |
+
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=${infer_ppo_micro_batch_size_per_gpu} \
|
| 153 |
+
actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
|
| 154 |
+
actor_rollout_ref.rollout.gpu_memory_utilization=0.7 \
|
| 155 |
+
actor_rollout_ref.rollout.tensor_model_parallel_size=${INFER_TP} \
|
| 156 |
+
actor_rollout_ref.rollout.enable_chunked_prefill=True \
|
| 157 |
+
actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
|
| 158 |
+
actor_rollout_ref.rollout.temperature=${temperature} \
|
| 159 |
+
actor_rollout_ref.rollout.top_p=${top_p} \
|
| 160 |
+
actor_rollout_ref.rollout.top_k=${top_k} \
|
| 161 |
+
actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
|
| 162 |
+
actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
|
| 163 |
+
actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
|
| 164 |
+
actor_rollout_ref.rollout.val_kwargs.do_sample=True \
|
| 165 |
+
actor_rollout_ref.rollout.val_kwargs.n=1 \
|
| 166 |
+
actor_rollout_ref.rollout.name=vllm \
|
| 167 |
+
actor_rollout_ref.rollout.enforce_eager=True \
|
| 168 |
+
actor_rollout_ref.rollout.free_cache_engine=True \
|
| 169 |
+
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=${infer_ppo_micro_batch_size_per_gpu} \
|
| 170 |
+
actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
|
| 171 |
+
actor_rollout_ref.ref.megatron.use_dist_checkpointing=${USE_DIST_CKPT} \
|
| 172 |
+
actor_rollout_ref.ref.megatron.param_offload=${offload} \
|
| 173 |
+
actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${REF_TP} \
|
| 174 |
+
actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=${REF_PP} \
|
| 175 |
+
actor_rollout_ref.ref.megatron.virtual_pipeline_model_parallel_size=${REF_VPP} \
|
| 176 |
+
actor_rollout_ref.ref.megatron.context_parallel_size=${REF_CP} \
|
| 177 |
+
actor_rollout_ref.ref.megatron.expert_model_parallel_size=${REF_EP} \
|
| 178 |
+
actor_rollout_ref.ref.megatron.expert_tensor_parallel_size=${REF_ETP} \
|
| 179 |
+
reward_model.reward_manager=dapo \
|
| 180 |
+
+reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
|
| 181 |
+
+reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
|
| 182 |
+
+reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
|
| 183 |
+
+reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
|
| 184 |
+
+reward_model.reward_kwargs.max_resp_len=${max_response_length} \
|
| 185 |
+
trainer.logger=['console','wandb'] \
|
| 186 |
+
trainer.project_name="${project_name}" \
|
| 187 |
+
trainer.experiment_name="${exp_name}" \
|
| 188 |
+
trainer.n_gpus_per_node="${NGPUS_PER_NODES}" \
|
| 189 |
+
trainer.nnodes="${NNODES}" \
|
| 190 |
+
trainer.val_before_train=False \
|
| 191 |
+
trainer.test_freq=10 \
|
| 192 |
+
trainer.save_freq=100 \
|
| 193 |
+
trainer.total_epochs=10 \
|
| 194 |
+
trainer.resume_mode=auto \
|
| 195 |
+
trainer.log_val_generations=10
|
code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen3moe-30b_megatron_lora.sh
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
set -xeuo pipefail
|
| 3 |
+
|
| 4 |
+
# Need to install Megatron-Bridge
|
| 5 |
+
# NOTE: Make sure you use Megatron-Bridge later than 0.2.0
|
| 6 |
+
# (Recommend https://github.com/NVIDIA-NeMo/Megatron-Bridge/commit/83a7c1134c562d8c6decd10a1f0a6e6a7a8a3a44 or later)
|
| 7 |
+
# for proper MoE LoRA support.
|
| 8 |
+
|
| 9 |
+
# For Megatron communication/computation overlapping
|
| 10 |
+
export CUDA_DEVICE_MAX_CONNECTIONS=1
|
| 11 |
+
|
| 12 |
+
########################### Quick Config ###########################
|
| 13 |
+
|
| 14 |
+
TP=${TP:-2}
|
| 15 |
+
PP=${PP:-2}
|
| 16 |
+
CP=${CP:-2}
|
| 17 |
+
EP=${EP:-4}
|
| 18 |
+
ETP=${ETP:-1}
|
| 19 |
+
|
| 20 |
+
ALL_OFFLOAD=${ALL_OFFLOAD:-True}
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
rollout_name="vllm"
|
| 24 |
+
project_name='verl_grpo_example_gsm8k_math'
|
| 25 |
+
exp_name='qwen3_30b_a3b_megatron_lora'
|
| 26 |
+
adv_estimator=grpo
|
| 27 |
+
|
| 28 |
+
gsm8k_train_path=$HOME/data/gsm8k/train.parquet
|
| 29 |
+
gsm8k_test_path=$HOME/data/gsm8k/test.parquet
|
| 30 |
+
|
| 31 |
+
########################### Parameter Arrays ###########################
|
| 32 |
+
|
| 33 |
+
DATA=(
|
| 34 |
+
data.train_files=${gsm8k_train_path}
|
| 35 |
+
data.val_files=${gsm8k_test_path}
|
| 36 |
+
data.train_batch_size=128
|
| 37 |
+
data.max_prompt_length=1024
|
| 38 |
+
data.max_response_length=1024
|
| 39 |
+
data.truncation='error'
|
| 40 |
+
data.filter_overlong_prompts=True
|
| 41 |
+
data.shuffle=False
|
| 42 |
+
)
|
| 43 |
+
|
| 44 |
+
MODEL=(
|
| 45 |
+
actor_rollout_ref.model.path=Qwen/Qwen3-30B-A3B-Instruct-2507
|
| 46 |
+
actor_rollout_ref.model.use_fused_kernels=True
|
| 47 |
+
actor_rollout_ref.model.lora.rank=32
|
| 48 |
+
actor_rollout_ref.model.lora.alpha=64
|
| 49 |
+
actor_rollout_ref.model.lora.lora_A_init_method=kaiming
|
| 50 |
+
# # Optional: Use canonical LoRA
|
| 51 |
+
# actor_rollout_ref.model.lora.type="canonical_lora"
|
| 52 |
+
# actor_rollout_ref.model.lora.target_modules='["linear_q","linear_k","linear_v","linear_proj","linear_fc1_up","linear_fc1_gate","linear_fc2"]'
|
| 53 |
+
|
| 54 |
+
# # Optional: Add dropout to LoRA layers
|
| 55 |
+
# actor_rollout_ref.model.lora.dropout=0.05
|
| 56 |
+
# actor_rollout_ref.model.lora.dropout_position=pre
|
| 57 |
+
)
|
| 58 |
+
|
| 59 |
+
ACTOR=(
|
| 60 |
+
actor_rollout_ref.actor.optim.lr=3e-6
|
| 61 |
+
actor_rollout_ref.actor.ppo_mini_batch_size=16
|
| 62 |
+
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2
|
| 63 |
+
actor_rollout_ref.actor.megatron.use_mbridge=True
|
| 64 |
+
actor_rollout_ref.actor.megatron.vanilla_mbridge=False
|
| 65 |
+
actor_rollout_ref.actor.use_dynamic_bsz=True
|
| 66 |
+
actor_rollout_ref.actor.use_kl_loss=True
|
| 67 |
+
actor_rollout_ref.actor.kl_loss_coef=0.001
|
| 68 |
+
actor_rollout_ref.actor.kl_loss_type=low_var_kl
|
| 69 |
+
actor_rollout_ref.actor.entropy_coeff=0
|
| 70 |
+
actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${TP}
|
| 71 |
+
actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${PP}
|
| 72 |
+
actor_rollout_ref.actor.megatron.expert_model_parallel_size=${EP}
|
| 73 |
+
actor_rollout_ref.actor.megatron.context_parallel_size=${CP}
|
| 74 |
+
actor_rollout_ref.actor.megatron.expert_tensor_parallel_size=${ETP}
|
| 75 |
+
actor_rollout_ref.actor.megatron.param_offload=${ALL_OFFLOAD}
|
| 76 |
+
actor_rollout_ref.actor.megatron.optimizer_offload=${ALL_OFFLOAD}
|
| 77 |
+
actor_rollout_ref.actor.megatron.grad_offload=${ALL_OFFLOAD}
|
| 78 |
+
+actor_rollout_ref.actor.megatron.override_transformer_config.recompute_method=uniform
|
| 79 |
+
+actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=full
|
| 80 |
+
+actor_rollout_ref.actor.megatron.override_transformer_config.recompute_num_layers=1
|
| 81 |
+
)
|
| 82 |
+
|
| 83 |
+
ROLLOUT=(
|
| 84 |
+
actor_rollout_ref.rollout.tensor_model_parallel_size=8
|
| 85 |
+
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4
|
| 86 |
+
actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=True
|
| 87 |
+
actor_rollout_ref.rollout.name=${rollout_name}
|
| 88 |
+
actor_rollout_ref.rollout.gpu_memory_utilization=0.25
|
| 89 |
+
actor_rollout_ref.rollout.enforce_eager=True
|
| 90 |
+
actor_rollout_ref.rollout.free_cache_engine=True
|
| 91 |
+
actor_rollout_ref.rollout.n=4
|
| 92 |
+
)
|
| 93 |
+
|
| 94 |
+
REF=(
|
| 95 |
+
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4
|
| 96 |
+
actor_rollout_ref.ref.log_prob_use_dynamic_bsz=True
|
| 97 |
+
actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${TP}
|
| 98 |
+
actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=${PP}
|
| 99 |
+
actor_rollout_ref.ref.megatron.expert_model_parallel_size=${EP}
|
| 100 |
+
actor_rollout_ref.ref.megatron.context_parallel_size=${CP}
|
| 101 |
+
actor_rollout_ref.ref.megatron.expert_tensor_parallel_size=${ETP}
|
| 102 |
+
actor_rollout_ref.ref.megatron.param_offload=${ALL_OFFLOAD}
|
| 103 |
+
)
|
| 104 |
+
|
| 105 |
+
ALGORITHM=(
|
| 106 |
+
algorithm.adv_estimator=${adv_estimator}
|
| 107 |
+
)
|
| 108 |
+
|
| 109 |
+
TRAINER=(
|
| 110 |
+
trainer.critic_warmup=0
|
| 111 |
+
trainer.logger='["console","wandb"]'
|
| 112 |
+
trainer.project_name=${project_name}
|
| 113 |
+
trainer.experiment_name=${exp_name}
|
| 114 |
+
trainer.n_gpus_per_node=8
|
| 115 |
+
trainer.nnodes=1
|
| 116 |
+
trainer.save_freq=20
|
| 117 |
+
trainer.test_freq=5
|
| 118 |
+
trainer.total_epochs=15
|
| 119 |
+
)
|
| 120 |
+
|
| 121 |
+
########################### Launch ###########################
|
| 122 |
+
|
| 123 |
+
python3 -m verl.trainer.main_ppo \
|
| 124 |
+
--config-path=config \
|
| 125 |
+
--config-name='ppo_megatron_trainer.yaml' \
|
| 126 |
+
"${DATA[@]}" \
|
| 127 |
+
"${ALGORITHM[@]}" \
|
| 128 |
+
"${MODEL[@]}" \
|
| 129 |
+
"${ROLLOUT[@]}" \
|
| 130 |
+
"${ACTOR[@]}" \
|
| 131 |
+
"${REF[@]}" \
|
| 132 |
+
"${TRAINER[@]}" \
|
| 133 |
+
"$@"
|
code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen3moe-30b_sglang_megatron_npu.sh
ADDED
|
@@ -0,0 +1,236 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
set -xeuo pipefail
|
| 3 |
+
# Project Configuration
|
| 4 |
+
project_name='DAPO-Qwen3-30b-A3B-BASE-MATH'
|
| 5 |
+
exp_name='DAPO-Qwen3-30B-A3B-BASE-Megatron-SGLang'
|
| 6 |
+
|
| 7 |
+
# Necessary env
|
| 8 |
+
export HCCL_CONNECT_TIMEOUT=1500
|
| 9 |
+
export HCCL_HOST_SOCKET_PORT_RANGE=60000-60050
|
| 10 |
+
export HCCL_NPU_SOCKET_PORT_RANGE=61000-61050
|
| 11 |
+
|
| 12 |
+
export RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES=1
|
| 13 |
+
export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
|
| 14 |
+
|
| 15 |
+
export DISABLE_L2_CACHE=1
|
| 16 |
+
export TASK_QUEUE_ENABLE=1
|
| 17 |
+
|
| 18 |
+
# Node Info
|
| 19 |
+
NNODES=${NNODES:-1}
|
| 20 |
+
NPUS_PER_NODE=${NPUS_PER_NODE:-16}
|
| 21 |
+
|
| 22 |
+
# Model Weights Paths
|
| 23 |
+
MODEL_PATH=Qwen/Qwen3-30B-A3B
|
| 24 |
+
MCORE_MODEL_PATH=Qwen/Qwen3-30B-A3B-mcore
|
| 25 |
+
RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
|
| 26 |
+
CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}
|
| 27 |
+
|
| 28 |
+
# File System Paths
|
| 29 |
+
TRAIN_FILE=$RAY_DATA_HOME/dataset/dapo-math-17k.parquet
|
| 30 |
+
TEST_FILE=$RAY_DATA_HOME/dataset/aime-2024.parquet
|
| 31 |
+
# Data Length Configuration
|
| 32 |
+
max_prompt_length=$((1024 * 2))
|
| 33 |
+
max_response_length=$((1024 * 8))
|
| 34 |
+
|
| 35 |
+
# Training Batch Configuration
|
| 36 |
+
train_prompt_bsz=16
|
| 37 |
+
train_prompt_mini_bsz=16
|
| 38 |
+
n_resp_per_prompt=8
|
| 39 |
+
|
| 40 |
+
# Algorithm Configuration
|
| 41 |
+
adv_estimator=grpo
|
| 42 |
+
use_kl_in_reward=False
|
| 43 |
+
kl_coef=0.0
|
| 44 |
+
use_kl_loss=True
|
| 45 |
+
kl_loss_coef=0.001
|
| 46 |
+
|
| 47 |
+
# Performance and Memory Management Configuration
|
| 48 |
+
all_offload=True
|
| 49 |
+
use_dynamic_bsz=False
|
| 50 |
+
actor_ppo_max_token_len=$(((max_prompt_length + max_response_length)))
|
| 51 |
+
infer_ppo_max_token_len=$(((max_prompt_length + max_response_length)))
|
| 52 |
+
|
| 53 |
+
# Megatron Parallelism Configuration
|
| 54 |
+
train_tp=4
|
| 55 |
+
train_ep=4
|
| 56 |
+
train_etp=4
|
| 57 |
+
train_pp=1
|
| 58 |
+
train_cp=1
|
| 59 |
+
|
| 60 |
+
# SGLang Generation Configuration
|
| 61 |
+
gen_tp=4
|
| 62 |
+
gen_dp=1
|
| 63 |
+
gen_ep=1
|
| 64 |
+
gpu_memory_utilization=0.5
|
| 65 |
+
max_model_len=$((max_prompt_length + max_response_length))
|
| 66 |
+
max_num_batched_tokens=$(((max_prompt_length + max_response_length) * 1))
|
| 67 |
+
|
| 68 |
+
# Data Configuration
|
| 69 |
+
DATA_CONFIG=(
|
| 70 |
+
# File Paths
|
| 71 |
+
data.train_files="${TRAIN_FILE}"
|
| 72 |
+
data.val_files="${TEST_FILE}"
|
| 73 |
+
# Data Structure
|
| 74 |
+
data.prompt_key=prompt
|
| 75 |
+
# Batch and Length Configuration
|
| 76 |
+
data.train_batch_size=${train_prompt_bsz}
|
| 77 |
+
data.max_prompt_length=${max_prompt_length}
|
| 78 |
+
data.max_response_length=${max_response_length}
|
| 79 |
+
# Preprocessing
|
| 80 |
+
data.filter_overlong_prompts=False
|
| 81 |
+
data.truncation='left'
|
| 82 |
+
)
|
| 83 |
+
|
| 84 |
+
# Model Configuration
|
| 85 |
+
MODEL_CONFIG=(
|
| 86 |
+
# Model Path
|
| 87 |
+
actor_rollout_ref.model.path="${MODEL_PATH}"
|
| 88 |
+
# Model Processing
|
| 89 |
+
actor_rollout_ref.model.use_remove_padding=True
|
| 90 |
+
)
|
| 91 |
+
|
| 92 |
+
# Reinforcement Learning Algorithm Configuration
|
| 93 |
+
ALGORITHM_CONFIG=(
|
| 94 |
+
# Advantage Estimation
|
| 95 |
+
algorithm.adv_estimator=${adv_estimator}
|
| 96 |
+
# KL Divergence Control
|
| 97 |
+
algorithm.use_kl_in_reward=${use_kl_in_reward}
|
| 98 |
+
algorithm.kl_ctrl.kl_coef=${kl_coef}
|
| 99 |
+
)
|
| 100 |
+
|
| 101 |
+
ACTOR_CONFIG=(
|
| 102 |
+
# Core Runtime Settings
|
| 103 |
+
actor_rollout_ref.actor.use_torch_compile=False
|
| 104 |
+
actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz}
|
| 105 |
+
# Loss Function Configuration
|
| 106 |
+
actor_rollout_ref.actor.use_kl_loss=${use_kl_loss}
|
| 107 |
+
actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef}
|
| 108 |
+
actor_rollout_ref.actor.entropy_coeff=0
|
| 109 |
+
# PPO Training Parameters
|
| 110 |
+
actor_rollout_ref.actor.ppo_epochs=1
|
| 111 |
+
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1
|
| 112 |
+
actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len}
|
| 113 |
+
actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz}
|
| 114 |
+
# Optimizer Settings
|
| 115 |
+
actor_rollout_ref.actor.optim.lr=1e-6
|
| 116 |
+
# Megatron Parallelism Strategy
|
| 117 |
+
actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${train_tp}
|
| 118 |
+
actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${train_pp}
|
| 119 |
+
actor_rollout_ref.actor.megatron.context_parallel_size=${train_cp}
|
| 120 |
+
actor_rollout_ref.actor.megatron.expert_model_parallel_size=${train_ep}
|
| 121 |
+
actor_rollout_ref.actor.megatron.expert_tensor_parallel_size=${train_etp}
|
| 122 |
+
# Memory Optimization
|
| 123 |
+
actor_rollout_ref.actor.megatron.param_offload=${all_offload}
|
| 124 |
+
actor_rollout_ref.actor.megatron.optimizer_offload=${all_offload}
|
| 125 |
+
actor_rollout_ref.actor.megatron.grad_offload=${all_offload}
|
| 126 |
+
# Model Weights Management
|
| 127 |
+
actor_rollout_ref.actor.megatron.dist_checkpointing_path=${MCORE_MODEL_PATH}
|
| 128 |
+
actor_rollout_ref.actor.megatron.use_dist_checkpointing=True
|
| 129 |
+
actor_rollout_ref.actor.megatron.use_mbridge=False
|
| 130 |
+
# Transformer Architecture Optimizations
|
| 131 |
+
+actor_rollout_ref.actor.megatron.override_transformer_config.use_flash_attn=True
|
| 132 |
+
+actor_rollout_ref.actor.megatron.override_transformer_config.recompute_method=uniform
|
| 133 |
+
+actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=full
|
| 134 |
+
+actor_rollout_ref.actor.megatron.override_transformer_config.recompute_num_layers=1
|
| 135 |
+
)
|
| 136 |
+
|
| 137 |
+
REF_CONFIG=(
|
| 138 |
+
# Core Runtime Settings
|
| 139 |
+
actor_rollout_ref.ref.use_torch_compile=False
|
| 140 |
+
# Log Probability Inference
|
| 141 |
+
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1
|
| 142 |
+
actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz}
|
| 143 |
+
actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len}
|
| 144 |
+
# Megatron Parallelism Strategy
|
| 145 |
+
actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${train_tp}
|
| 146 |
+
actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=${train_pp}
|
| 147 |
+
actor_rollout_ref.ref.megatron.context_parallel_size=${train_cp}
|
| 148 |
+
actor_rollout_ref.ref.megatron.expert_model_parallel_size=${train_ep}
|
| 149 |
+
actor_rollout_ref.ref.megatron.expert_tensor_parallel_size=${train_etp}
|
| 150 |
+
# Memory Optimization
|
| 151 |
+
actor_rollout_ref.ref.megatron.param_offload=${all_offload}
|
| 152 |
+
# Model Weights Management
|
| 153 |
+
actor_rollout_ref.ref.megatron.dist_checkpointing_path=${MCORE_MODEL_PATH}
|
| 154 |
+
actor_rollout_ref.ref.megatron.use_dist_checkpointing=True
|
| 155 |
+
actor_rollout_ref.ref.megatron.use_mbridge=False
|
| 156 |
+
)
|
| 157 |
+
|
| 158 |
+
ROLLOUT_CONFIG=(
|
| 159 |
+
# Rollout Engine
|
| 160 |
+
actor_rollout_ref.rollout.name=sglang
|
| 161 |
+
+actor_rollout_ref.rollout.engine_kwargs.sglang.attention_backend="ascend"
|
| 162 |
+
# Generation Parameters
|
| 163 |
+
actor_rollout_ref.rollout.n=${n_resp_per_prompt}
|
| 164 |
+
actor_rollout_ref.rollout.top_p=1.0
|
| 165 |
+
actor_rollout_ref.rollout.top_k=-1
|
| 166 |
+
actor_rollout_ref.rollout.temperature=1.0
|
| 167 |
+
# Log Probability Inference
|
| 168 |
+
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1
|
| 169 |
+
actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz}
|
| 170 |
+
actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len}
|
| 171 |
+
# Memory Management
|
| 172 |
+
actor_rollout_ref.rollout.gpu_memory_utilization=${gpu_memory_utilization}
|
| 173 |
+
# Parallelism Strategy
|
| 174 |
+
actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp}
|
| 175 |
+
actor_rollout_ref.rollout.data_parallel_size=${gen_dp}
|
| 176 |
+
actor_rollout_ref.rollout.expert_parallel_size=${gen_ep}
|
| 177 |
+
+actor_rollout_ref.rollout.engine_kwargs.sglang.enable_dp_attention=False
|
| 178 |
+
# Performance Optimization
|
| 179 |
+
+actor_rollout_ref.rollout.engine_kwargs.sglang.chunked_prefill_size=-1
|
| 180 |
+
actor_rollout_ref.rollout.enforce_eager=False
|
| 181 |
+
# Validation Generation
|
| 182 |
+
actor_rollout_ref.rollout.val_kwargs.n=1
|
| 183 |
+
actor_rollout_ref.rollout.val_kwargs.do_sample=True
|
| 184 |
+
actor_rollout_ref.rollout.val_kwargs.top_p=1.0
|
| 185 |
+
actor_rollout_ref.rollout.val_kwargs.top_k=-1
|
| 186 |
+
actor_rollout_ref.rollout.val_kwargs.temperature=1.0
|
| 187 |
+
)
|
| 188 |
+
|
| 189 |
+
TRAINER_CONFIG=(
|
| 190 |
+
# Logger Configuration
|
| 191 |
+
trainer.logger='["console"]'
|
| 192 |
+
# Project Settings
|
| 193 |
+
trainer.project_name="${project_name}"
|
| 194 |
+
trainer.experiment_name="${exp_name}"
|
| 195 |
+
# Hardware Configuration
|
| 196 |
+
trainer.nnodes="${NNODES}"
|
| 197 |
+
trainer.n_gpus_per_node="${NPUS_PER_NODE}"
|
| 198 |
+
trainer.device='npu'
|
| 199 |
+
# Training Schedule
|
| 200 |
+
trainer.total_epochs=15
|
| 201 |
+
trainer.val_before_train=False
|
| 202 |
+
trainer.test_freq=-1
|
| 203 |
+
trainer.save_freq=-1
|
| 204 |
+
# Checkpoint Directory
|
| 205 |
+
trainer.default_local_dir="${CKPTS_DIR}"
|
| 206 |
+
)
|
| 207 |
+
|
| 208 |
+
# profiling configuration
|
| 209 |
+
PROF_CONFIG=(
|
| 210 |
+
global_profiler.tool=npu
|
| 211 |
+
global_profiler.steps=null
|
| 212 |
+
global_profiler.save_path=/profpath
|
| 213 |
+
actor_rollout_ref.actor.profiler.enable=True
|
| 214 |
+
actor_rollout_ref.actor.profiler.ranks="[0]"
|
| 215 |
+
actor_rollout_ref.actor.profiler.all_ranks=False
|
| 216 |
+
actor_rollout_ref.actor.profiler.tool_config.npu.discrete=True
|
| 217 |
+
actor_rollout_ref.actor.profiler.tool_config.npu.contents=['npu','cpu']
|
| 218 |
+
actor_rollout_ref.actor.profiler.tool_config.npu.level=level0
|
| 219 |
+
actor_rollout_ref.actor.profiler.tool_config.npu.analysis=True
|
| 220 |
+
actor_rollout_ref.rollout.profiler.enable=True
|
| 221 |
+
actor_rollout_ref.rollout.profiler.ranks="[0]"
|
| 222 |
+
actor_rollout_ref.rollout.profiler.all_ranks=False
|
| 223 |
+
)
|
| 224 |
+
|
| 225 |
+
python3 -m verl.trainer.main_ppo \
|
| 226 |
+
--config-path=config \
|
| 227 |
+
--config-name='ppo_megatron_trainer.yaml' \
|
| 228 |
+
"${DATA_CONFIG[@]}" \
|
| 229 |
+
"${MODEL_CONFIG[@]}" \
|
| 230 |
+
"${ACTOR_CONFIG[@]}" \
|
| 231 |
+
"${REF_CONFIG[@]}" \
|
| 232 |
+
"${ROLLOUT_CONFIG[@]}" \
|
| 233 |
+
"${ALGORITHM_CONFIG[@]}" \
|
| 234 |
+
"${TRAINER_CONFIG[@]}" \
|
| 235 |
+
"${PROF_CONFIG[@]}" \
|
| 236 |
+
"$@"
|
code/RL_model/verl/verl_train/examples/grpo_trainer/run_seed_oss_36b.sh
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
set -x
|
| 2 |
+
|
| 3 |
+
python3 -m verl.trainer.main_ppo \
|
| 4 |
+
algorithm.adv_estimator=grpo \
|
| 5 |
+
data.train_files=$HOME/data/gsm8k/train.parquet \
|
| 6 |
+
data.val_files=$HOME/data/gsm8k/test.parquet \
|
| 7 |
+
data.train_batch_size=64 \
|
| 8 |
+
data.max_prompt_length=512 \
|
| 9 |
+
data.max_response_length=1024 \
|
| 10 |
+
data.filter_overlong_prompts=True \
|
| 11 |
+
data.truncation='error' \
|
| 12 |
+
actor_rollout_ref.model.path=ByteDance-Seed/Seed-OSS-36B-Base \
|
| 13 |
+
actor_rollout_ref.actor.optim.lr=1e-6 \
|
| 14 |
+
actor_rollout_ref.model.use_remove_padding=True \
|
| 15 |
+
actor_rollout_ref.model.enable_gradient_checkpointing=True \
|
| 16 |
+
actor_rollout_ref.model.use_fused_kernels=True \
|
| 17 |
+
actor_rollout_ref.actor.ppo_mini_batch_size=8 \
|
| 18 |
+
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \
|
| 19 |
+
actor_rollout_ref.actor.use_kl_loss=True \
|
| 20 |
+
actor_rollout_ref.actor.kl_loss_coef=0.001 \
|
| 21 |
+
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
|
| 22 |
+
actor_rollout_ref.actor.entropy_coeff=0 \
|
| 23 |
+
actor_rollout_ref.actor.use_dynamic_bsz=True \
|
| 24 |
+
actor_rollout_ref.actor.strategy=fsdp2 \
|
| 25 |
+
actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=True \
|
| 26 |
+
actor_rollout_ref.actor.fsdp_config.param_offload=True \
|
| 27 |
+
actor_rollout_ref.actor.fsdp_config.param_offload=True \
|
| 28 |
+
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2 \
|
| 29 |
+
actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
|
| 30 |
+
actor_rollout_ref.rollout.name=vllm \
|
| 31 |
+
actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
|
| 32 |
+
actor_rollout_ref.rollout.n=2 \
|
| 33 |
+
actor_rollout_ref.rollout.free_cache_engine=True \
|
| 34 |
+
actor_rollout_ref.ref.log_prob_use_dynamic_bsz=True \
|
| 35 |
+
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=2 \
|
| 36 |
+
actor_rollout_ref.ref.fsdp_config.param_offload=True \
|
| 37 |
+
actor_rollout_ref.ref.strategy=fsdp2 \
|
| 38 |
+
algorithm.use_kl_in_reward=False \
|
| 39 |
+
trainer.critic_warmup=0 \
|
| 40 |
+
trainer.logger='["console"]' \
|
| 41 |
+
trainer.project_name='verl_grpo_seed_oss_36b' \
|
| 42 |
+
trainer.experiment_name='seed_oss_36b' \
|
| 43 |
+
trainer.val_before_train=False \
|
| 44 |
+
trainer.n_gpus_per_node=8 \
|
| 45 |
+
trainer.nnodes=1 \
|
| 46 |
+
trainer.save_freq=20 \
|
| 47 |
+
trainer.test_freq=5 \
|
| 48 |
+
trainer.total_epochs=15 $@
|
code/RL_model/verl/verl_train/examples/gspo_trainer/run_qwen30b_gspo.sh
ADDED
|
@@ -0,0 +1,197 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# run Qwen3-30B GSPO with new model engine
|
| 2 |
+
set -x
|
| 3 |
+
|
| 4 |
+
HDFS_ROOT=${HDFS_ROOT:-$PWD}
|
| 5 |
+
DATA_ROOT=${DATA_ROOT:-$PWD}
|
| 6 |
+
|
| 7 |
+
# wandb
|
| 8 |
+
backend=megatron # fsdp, fsdp2, megatron
|
| 9 |
+
project_name=wuxibin_gspo
|
| 10 |
+
experiment_name=qwen3-30B-base-grpo-$backend
|
| 11 |
+
default_local_dir=$DATA_ROOT/checkpoint/$project_name/$experiment_name
|
| 12 |
+
|
| 13 |
+
# ===================================== Algorithm =====================================
|
| 14 |
+
adv_estimator=grpo
|
| 15 |
+
loss_mode=gspo
|
| 16 |
+
|
| 17 |
+
# reference policy
|
| 18 |
+
use_kl_in_reward=False
|
| 19 |
+
kl_coef=0.001
|
| 20 |
+
use_kl_loss=False
|
| 21 |
+
kl_loss_coef=0.001
|
| 22 |
+
|
| 23 |
+
clip_ratio_low=3e-4
|
| 24 |
+
clip_ratio_high=4e-4
|
| 25 |
+
|
| 26 |
+
actor_lr=1e-6
|
| 27 |
+
critic_lr=2e-6
|
| 28 |
+
gae_gamma=1.0
|
| 29 |
+
gae_lam=0.95
|
| 30 |
+
critic_warmup=0
|
| 31 |
+
|
| 32 |
+
# ===================================== Data/Model =====================================
|
| 33 |
+
train_files=$DATA_ROOT/dataset/BytedTsinghua-SIA/DAPO-Math-17k/data/dapo-math-17k.parquet
|
| 34 |
+
test_files=$DATA_ROOT/dataset/aime-2024.parquet
|
| 35 |
+
|
| 36 |
+
actor_model_path=$HDFS_ROOT/model/Qwen3-30B-A3B-Base
|
| 37 |
+
critic_model_path=$actor_model_path
|
| 38 |
+
|
| 39 |
+
max_prompt_length=$((1024 * 2))
|
| 40 |
+
max_response_length=$((1024 * 8))
|
| 41 |
+
enable_overlong_buffer=True
|
| 42 |
+
overlong_buffer_len=$((1024 * 4))
|
| 43 |
+
overlong_penalty_factor=1.0
|
| 44 |
+
|
| 45 |
+
train_batch_size=256
|
| 46 |
+
ppo_mini_batch_size=32
|
| 47 |
+
n_resp_per_prompt=16
|
| 48 |
+
n_resp_per_prompt_val=1
|
| 49 |
+
|
| 50 |
+
# ===================================== Training =====================================
|
| 51 |
+
actor_max_token_len_per_gpu=$(((max_prompt_length + max_response_length) * 3))
|
| 52 |
+
critic_max_token_len_per_gpu=$(((max_prompt_length + max_response_length) * 4))
|
| 53 |
+
|
| 54 |
+
# FSDP parallelism config
|
| 55 |
+
USP_SIZE=4
|
| 56 |
+
ACTOR_FSDP_CONFIG="
|
| 57 |
+
actor_rollout_ref.actor.fsdp_config.strategy=$backend \
|
| 58 |
+
actor_rollout_ref.actor.fsdp_config.param_offload=True \
|
| 59 |
+
actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
|
| 60 |
+
actor_rollout_ref.actor.ulysses_sequence_parallel_size=$USP_SIZE"
|
| 61 |
+
|
| 62 |
+
# Megatron parallelism config
|
| 63 |
+
TP_SIZE=2
|
| 64 |
+
CP_SIZE=1
|
| 65 |
+
PP_SIZE=1
|
| 66 |
+
VPP_SIZE=null
|
| 67 |
+
EP_SIZE=8
|
| 68 |
+
ETP_SIZE=1
|
| 69 |
+
ACTOR_MEGATRON_CONFIG="
|
| 70 |
+
actor_rollout_ref.actor.megatron.tensor_model_parallel_size=$TP_SIZE \
|
| 71 |
+
actor_rollout_ref.actor.megatron.context_parallel_size=$CP_SIZE \
|
| 72 |
+
actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=$PP_SIZE \
|
| 73 |
+
actor_rollout_ref.actor.megatron.virtual_pipeline_model_parallel_size=$VPP_SIZE \
|
| 74 |
+
actor_rollout_ref.actor.megatron.expert_model_parallel_size=$EP_SIZE \
|
| 75 |
+
actor_rollout_ref.actor.megatron.expert_tensor_parallel_size=$ETP_SIZE \
|
| 76 |
+
actor_rollout_ref.actor.megatron.param_offload=True \
|
| 77 |
+
actor_rollout_ref.actor.megatron.grad_offload=True \
|
| 78 |
+
actor_rollout_ref.actor.megatron.optimizer_offload=True \
|
| 79 |
+
+actor_rollout_ref.actor.megatron.override_transformer_config.moe_router_dtype=fp32 \
|
| 80 |
+
+actor_rollout_ref.actor.megatron.override_transformer_config.moe_permute_fusion=True \
|
| 81 |
+
+actor_rollout_ref.actor.megatron.override_transformer_config.recompute_method=uniform \
|
| 82 |
+
+actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=full \
|
| 83 |
+
+actor_rollout_ref.actor.megatron.override_transformer_config.recompute_num_layers=1 \
|
| 84 |
+
+actor_rollout_ref.actor.megatron.override_transformer_config.apply_rope_fusion=True \
|
| 85 |
+
+actor_rollout_ref.actor.megatron.override_transformer_config.gradient_accumulation_fusion=True \
|
| 86 |
+
actor_rollout_ref.actor.megatron.use_mbridge=True"
|
| 87 |
+
|
| 88 |
+
# Actor model config
|
| 89 |
+
ACTOR_CONFIG="
|
| 90 |
+
actor_rollout_ref.actor.optim.lr=$actor_lr \
|
| 91 |
+
actor_rollout_ref.model.path=$actor_model_path \
|
| 92 |
+
actor_rollout_ref.model.use_remove_padding=True \
|
| 93 |
+
actor_rollout_ref.actor.use_kl_loss=$use_kl_loss \
|
| 94 |
+
actor_rollout_ref.actor.kl_loss_coef=$kl_loss_coef \
|
| 95 |
+
actor_rollout_ref.actor.clip_ratio_low=$clip_ratio_low \
|
| 96 |
+
actor_rollout_ref.actor.clip_ratio_high=$clip_ratio_high \
|
| 97 |
+
actor_rollout_ref.actor.clip_ratio_c=10.0 \
|
| 98 |
+
actor_rollout_ref.actor.policy_loss.loss_mode=${loss_mode}
|
| 99 |
+
actor_rollout_ref.actor.use_dynamic_bsz=True \
|
| 100 |
+
actor_rollout_ref.actor.ppo_mini_batch_size=$ppo_mini_batch_size \
|
| 101 |
+
actor_rollout_ref.actor.ppo_max_token_len_per_gpu=$actor_max_token_len_per_gpu"
|
| 102 |
+
|
| 103 |
+
# Critic model config
|
| 104 |
+
CIRITC_CONFIG="
|
| 105 |
+
critic.optim.lr=$critic_lr \
|
| 106 |
+
critic.model.path=$critic_model_path \
|
| 107 |
+
critic.model.use_remove_padding=True \
|
| 108 |
+
critic.ppo_max_token_len_per_gpu=$critic_max_token_len_per_gpu \
|
| 109 |
+
critic.ulysses_sequence_parallel_size=$USP_SIZE"
|
| 110 |
+
|
| 111 |
+
CRITIC_FSDP_CONFIG="${ACTOR_FSDP_CONFIG//actor_rollout_ref.actor/critic.model}"
|
| 112 |
+
CRITIC_MEGATRON_CONFIG="${ACTOR_MEGATRON_CONFIG//actor_rollout_ref.actor/critic}"
|
| 113 |
+
|
| 114 |
+
if [[ $backend == "megatron" ]]; then
|
| 115 |
+
CONFIG_NAME=ppo_megatron_trainer
|
| 116 |
+
ACTOR_CONFIG="$ACTOR_CONFIG $ACTOR_MEGATRON_CONFIG"
|
| 117 |
+
if [[ $adv_estimator == "gae" ]]; then
|
| 118 |
+
CIRITC_CONFIG="$CIRITC_CONFIG $CRITIC_MEGATRON_CONFIG"
|
| 119 |
+
else
|
| 120 |
+
CIRITC_CONFIG=""
|
| 121 |
+
fi
|
| 122 |
+
else # fsdp, fsdp2
|
| 123 |
+
CONFIG_NAME=ppo_trainer
|
| 124 |
+
ACTOR_CONFIG="$ACTOR_CONFIG $ACTOR_FSDP_CONFIG"
|
| 125 |
+
if [[ $adv_estimator == "gae" ]]; then
|
| 126 |
+
CIRITC_CONFIG="$CIRITC_CONFIG $CRITIC_FSDP_CONFIG"
|
| 127 |
+
else
|
| 128 |
+
CIRITC_CONFIG=""
|
| 129 |
+
fi
|
| 130 |
+
fi
|
| 131 |
+
|
| 132 |
+
# ===================================== Inference =====================================
|
| 133 |
+
rollout_name=vllm
|
| 134 |
+
if [ "$rollout_name" = "vllm" ]; then
|
| 135 |
+
export VLLM_USE_V1=1
|
| 136 |
+
fi
|
| 137 |
+
infer_tp=4
|
| 138 |
+
infer_dp=1
|
| 139 |
+
infer_ep=1
|
| 140 |
+
gpu_memory_utilization=0.8
|
| 141 |
+
|
| 142 |
+
ROLLOUT_CONFIG="
|
| 143 |
+
actor_rollout_ref.rollout.name=$rollout_name \
|
| 144 |
+
actor_rollout_ref.rollout.mode=async \
|
| 145 |
+
actor_rollout_ref.rollout.tensor_model_parallel_size=$infer_tp \
|
| 146 |
+
actor_rollout_ref.rollout.data_parallel_size=$infer_dp \
|
| 147 |
+
actor_rollout_ref.rollout.expert_parallel_size=$infer_ep \
|
| 148 |
+
actor_rollout_ref.rollout.gpu_memory_utilization=$gpu_memory_utilization \
|
| 149 |
+
actor_rollout_ref.rollout.n=$n_resp_per_prompt \
|
| 150 |
+
actor_rollout_ref.rollout.val_kwargs.top_p=0.7 \
|
| 151 |
+
actor_rollout_ref.rollout.val_kwargs.temperature=1.0 \
|
| 152 |
+
actor_rollout_ref.rollout.val_kwargs.n=$n_resp_per_prompt_val"
|
| 153 |
+
|
| 154 |
+
# ===================================== Reward =====================================
|
| 155 |
+
REWARD_CONFIG="
|
| 156 |
+
reward_model.reward_manager=dapo \
|
| 157 |
+
+reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
|
| 158 |
+
+reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
|
| 159 |
+
+reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
|
| 160 |
+
+reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
|
| 161 |
+
+reward_model.reward_kwargs.max_resp_len=${max_response_length}"
|
| 162 |
+
|
| 163 |
+
python3 -m verl.trainer.main_ppo \
|
| 164 |
+
--config-path=./config \
|
| 165 |
+
--config-name=$CONFIG_NAME \
|
| 166 |
+
algorithm.adv_estimator=$adv_estimator \
|
| 167 |
+
algorithm.use_kl_in_reward=$use_kl_in_reward \
|
| 168 |
+
algorithm.kl_ctrl.kl_coef=$kl_coef \
|
| 169 |
+
algorithm.gamma=$gae_gamma \
|
| 170 |
+
algorithm.lam=$gae_lam \
|
| 171 |
+
data.train_files="$train_files" \
|
| 172 |
+
data.val_files="$test_files" \
|
| 173 |
+
data.return_raw_chat=True \
|
| 174 |
+
data.train_batch_size=$train_batch_size \
|
| 175 |
+
data.max_prompt_length=$max_prompt_length \
|
| 176 |
+
data.max_response_length=$max_response_length \
|
| 177 |
+
data.filter_overlong_prompts=True \
|
| 178 |
+
data.filter_overlong_prompts_workers=64 \
|
| 179 |
+
data.truncation='error' \
|
| 180 |
+
trainer.use_legacy_worker_impl=disable \
|
| 181 |
+
trainer.critic_warmup=$critic_warmup \
|
| 182 |
+
trainer.logger=['console','wandb'] \
|
| 183 |
+
trainer.project_name=$project_name \
|
| 184 |
+
trainer.experiment_name=$experiment_name \
|
| 185 |
+
trainer.default_local_dir=$default_local_dir \
|
| 186 |
+
trainer.n_gpus_per_node=$ARNOLD_WORKER_GPU \
|
| 187 |
+
trainer.nnodes=$ARNOLD_WORKER_NUM \
|
| 188 |
+
trainer.val_before_train=False \
|
| 189 |
+
trainer.log_val_generations=100 \
|
| 190 |
+
trainer.save_freq=-1 \
|
| 191 |
+
trainer.test_freq=10 \
|
| 192 |
+
trainer.total_epochs=10 \
|
| 193 |
+
trainer.total_training_steps=500 \
|
| 194 |
+
$ACTOR_CONFIG \
|
| 195 |
+
$CIRITC_CONFIG \
|
| 196 |
+
$ROLLOUT_CONFIG \
|
| 197 |
+
$REWARD_CONFIG
|
code/RL_model/verl/verl_train/examples/gspo_trainer/run_qwen3_32b_gspo_npu.sh
ADDED
|
@@ -0,0 +1,199 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
# Launch GSPO training of Qwen3-32B on Ascend NPUs (vLLM rollout, FSDP2 actor/ref).
set -xeuo pipefail
mkdir -p logs
ulimit -n 32768

## Basic Environment Settings
export RAY_DEDUP_LOGS=0
export HYDRA_FULL_ERROR=1
export TASK_QUEUE_ENABLE=1
export HCCL_EXEC_TIMEOUT=3600
export HCCL_CONNECT_TIMEOUT=3600
export HCCL_ASYNC_ERROR_HANDLING=0
export CPU_AFFINITY_CONF=1
export VLLM_USE_V1=1
export VLLM_ATTENTION_BACKEND=XFORMERS
export VLLM_ASCEND_ENABLE_FLASHCOMM=1
export VLLM_ASCEND_ENABLE_PREFETCH_MLP=1
export VLLM_ASCEND_ENABLE_DENSE_OPTIMIZE=1
export LD_PRELOAD=/usr/local/lib/libjemalloc.so.2

# Project Configuration
project_name='GSPO-Qwen3-32B-BASE-MATH'
exp_name='GSPO-Qwen3-32B-BASE-Megatron-vLLM'

# Node Info
NNODES=${NNODES:-4}
NPUS_PER_NODE=${NPUS_PER_NODE:-16}

# Model Weights Paths
MODEL_PATH=Qwen/Qwen3-32B
RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}

# File System Paths
TRAIN_FILE=$RAY_DATA_HOME/dataset/dapo-math-17k.parquet
TEST_FILE=$RAY_DATA_HOME/dataset/aime-2024.parquet

# Ray Configuration
WORKING_DIR=${WORKING_DIR:-"${PWD}"}
RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}

# Data Length Configuration
max_prompt_length=$((1024 * 16))
max_response_length=$((1024 * 16))

# Training Batch Configuration
train_prompt_bsz=256
gen_prompt_bsz=$((train_prompt_bsz * 1))
train_prompt_mini_bsz=64
n_resp_per_prompt=16

# GSPO Loss Configuration
adv_estimator=grpo
loss_mode=gspo
use_kl_in_reward=False
kl_coef=0.0
use_kl_loss=False
kl_loss_coef=0.0
clip_ratio_low=0.0003
clip_ratio_high=0.0004
loss_agg_mode="seq-mean-token-mean"

# FSDP Parallelism Configuration
# NOTE: sp_size MUST be set before the token-length computations below use it;
# with `set -u` an unset sp_size inside $(( ... )) aborts the script.
actor_strategy=fsdp2
ref_strategy=fsdp2
sp_size=4
fsdp_size=-1

# Performance and Memory Management Configuration
offload=True
use_dynamic_bsz=True
actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) / sp_size))
infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) / sp_size))

# vLLM Configuration
gen_tp=4
gpu_memory_utilization=0.9
max_model_len=$((max_prompt_length + max_response_length))
max_num_batched_tokens=$((max_prompt_length + max_response_length))


# Data Configuration
DATA_CONFIG=(
    data.train_files="${TRAIN_FILE}"
    data.val_files="${TEST_FILE}"
    data.prompt_key=prompt
    data.train_batch_size=${train_prompt_bsz}
    +data.gen_batch_size=${gen_prompt_bsz}
    data.max_prompt_length=${max_prompt_length}
    data.max_response_length=${max_response_length}
    data.truncation='left'
)

# Model Configuration
MODEL_CONFIG=(
    actor_rollout_ref.model.path="${MODEL_PATH}"
    actor_rollout_ref.model.use_remove_padding=True
    actor_rollout_ref.model.enable_gradient_checkpointing=True
)

# Algorithm Configuration
ALGORITHM_CONFIG=(
    algorithm.adv_estimator=${adv_estimator}
    algorithm.use_kl_in_reward=${use_kl_in_reward}
    algorithm.kl_ctrl.kl_coef=${kl_coef}
)

# Actor Model Configuration
ACTOR_CONFIG=(
    actor_rollout_ref.actor.use_torch_compile=False
    actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz}
    actor_rollout_ref.actor.strategy=${actor_strategy}
    actor_rollout_ref.actor.policy_loss.loss_mode=${loss_mode}
    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss}
    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef}
    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low}
    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high}
    actor_rollout_ref.actor.clip_ratio_c=10.0
    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode}
    actor_rollout_ref.actor.entropy_coeff=0
    actor_rollout_ref.actor.grad_clip=1.0
    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len}
    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz}
    actor_rollout_ref.actor.optim.lr=1e-6
    actor_rollout_ref.actor.optim.lr_warmup_steps=10
    actor_rollout_ref.actor.optim.weight_decay=0.1
    actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size}
    actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size}
    actor_rollout_ref.actor.fsdp_config.param_offload=${offload}
    actor_rollout_ref.actor.fsdp_config.optimizer_offload=${offload}
    actor_rollout_ref.actor.fsdp_config.forward_prefetch=True
    actor_rollout_ref.actor.entropy_checkpointing=True
    actor_rollout_ref.actor.entropy_from_logits_with_chunking=True
)

# Reference Model Configuration
REF_CONFIG=(
    actor_rollout_ref.ref.use_torch_compile=False
    actor_rollout_ref.ref.strategy=${ref_strategy}
    actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz}
    actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len}
    actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size}
    actor_rollout_ref.ref.fsdp_config.param_offload=${offload}
    actor_rollout_ref.ref.fsdp_config.forward_prefetch=True
    actor_rollout_ref.ref.entropy_checkpointing=True
    actor_rollout_ref.ref.entropy_from_logits_with_chunking=True
)

# Rollout Configuration
ROLLOUT_CONFIG=(
    actor_rollout_ref.rollout.name=vllm
    actor_rollout_ref.rollout.calculate_log_probs=True
    actor_rollout_ref.rollout.n=${n_resp_per_prompt}
    actor_rollout_ref.rollout.top_p=1.0
    actor_rollout_ref.rollout.top_k=-1
    actor_rollout_ref.rollout.temperature=1.0
    actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz}
    actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len}
    actor_rollout_ref.rollout.gpu_memory_utilization=${gpu_memory_utilization}
    actor_rollout_ref.rollout.max_num_batched_tokens=${max_num_batched_tokens}
    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp}
    actor_rollout_ref.rollout.enable_chunked_prefill=True
    actor_rollout_ref.rollout.enforce_eager=False
    actor_rollout_ref.rollout.free_cache_engine=True
    +actor_rollout_ref.rollout.engine_kwargs.vllm.compilation_config.cudagraph_capture_sizes="[8, 16, 32, 64, 128, 192, 256, 384]"
    +actor_rollout_ref.rollout.engine_kwargs.vllm.compilation_config.cudagraph_mode="FULL_DECODE_ONLY"
    actor_rollout_ref.rollout.val_kwargs.n=1
    actor_rollout_ref.rollout.val_kwargs.do_sample=True
    actor_rollout_ref.rollout.val_kwargs.top_p=0.7
    actor_rollout_ref.rollout.val_kwargs.top_k=-1
    actor_rollout_ref.rollout.val_kwargs.temperature=1.0
)

# Trainer Configuration
TRAINER_CONFIG=(
    trainer.logger='["console"]'
    trainer.project_name="${project_name}"
    trainer.experiment_name="${exp_name}"
    trainer.nnodes="${NNODES}"
    trainer.n_gpus_per_node="${NPUS_PER_NODE}"
    trainer.device='npu'
    trainer.total_epochs=10
    trainer.val_before_train=False
    trainer.test_freq=-1
    trainer.save_freq=100
    trainer.default_local_dir="${CKPTS_DIR}"
    trainer.resume_mode=auto
    trainer.balance_batch=True
)

# Main GSPO Training Command
python3 -m verl.trainer.main_ppo \
    "${DATA_CONFIG[@]}" \
    "${MODEL_CONFIG[@]}" \
    "${ACTOR_CONFIG[@]}" \
    "${REF_CONFIG[@]}" \
    "${ROLLOUT_CONFIG[@]}" \
    "${ALGORITHM_CONFIG[@]}" \
    "${TRAINER_CONFIG[@]}" \
    "$@" | tee logs/run_qwen3_32b_gspo_megatron_vllm_npu.log
|
code/RL_model/verl/verl_train/examples/gspo_trainer/test_gspo_3b_math.sh
ADDED
|
@@ -0,0 +1,195 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
#SBATCH --job-name=rl-gspo-3B
#SBATCH --partition=main
#SBATCH --nodes=1              # Number of nodes
#SBATCH --ntasks-per-node=1    # One task per node
#SBATCH --cpus-per-task=128    # cpu-cores per task
#SBATCH --gres=gpu:8
#SBATCH --mem=0
#SBATCH --exclusive
#SBATCH --time=500:00:00
#SBATCH --output=/rl/logs/Qwen2.5-3B/gspo/math/vllm_%x_%j.out
#SBATCH --error=/rl/logs/Qwen2.5-3B/gspo/math/vllm_%x_%j.err

# Launch GSPO training of Qwen2.5-3B-Instruct on GSM8K under Slurm.
set -xeuo pipefail

# activate the venv
echo "Activating verl environment..."
eval "$(conda shell.bash hook)"
conda deactivate
conda activate verl

# can make training faster, depends on your infrastructure
export NCCL_IBEXT_DISABLE=1
export NCCL_NVLS_ENABLE=1
export NCCL_IB_HCA=mlx5
export UCX_NET_DEVICES=mlx5_0:1,mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1

# Set how many GPUs we actually have on this node.
export GPUS_PER_NODE=8

NNODES=${SLURM_JOB_NUM_NODES}
export NNODES

export VLLM_ATTENTION_BACKEND=FLASH_ATTN
export RAY_LOGGING_LEVEL=DEBUG
export HYDRA_FULL_ERROR=1
export WANDB_API_KEY=... # your wandb API key

echo "Using $NNODES nodes for training..."

# ------------------------------------- Setup xp params ---------------------------------------
project_name='RL-GSPO'

adv_estimator=grpo
loss_mode=gspo
loss_agg_mode="seq-mean-token-mean"
MODEL_PATH=Qwen/Qwen2.5-3B-Instruct
rollout_engine=vllm
rollout_mode=async
return_raw_chat="True"
if [ "$rollout_engine" = "vllm" ]; then
    export VLLM_USE_V1=1
fi
gpu_memory_utilization=0.8
reward_manager=dapo
shuffle_dataset=true
first_time_dataset_prep=true # prepare dataset

test_freq=10
save_freq=10
total_epochs=10
total_training_steps=500
val_before_train=false

use_kl_in_reward=false
kl_coef=0.0
use_kl_loss=false
kl_loss_coef=0.0

clip_ratio_low=0.0003  # as recommended by the paper, see Sec. 5.1
clip_ratio_high=0.0004 # as recommended by the paper, see Sec. 5.1
train_batch_size=512
ppo_mini_batch_size=128        # maintain 4 mini-batches as recommended by the paper, see Sec. 5.1
ppo_micro_batch_size_per_gpu=8 # setup depending on your GPU memory
n_resp_per_prompt=16

max_prompt_length=$((1024 * 2))
max_response_length=$((1024 * 8))
# dapo reward manager params
enable_overlong_buffer=false # true
overlong_buffer_len=$((1024 * 4))
overlong_penalty_factor=1.0

# Paths and namings
SFT_MODEL=$(basename $MODEL_PATH)
exp_name="${loss_mode}-epslow-${clip_ratio_low}-epshigh-${clip_ratio_high}-${SFT_MODEL}-RL"
CKPTS_DIR=/rl/checkpoints/experimental/4b/${loss_mode}/${exp_name}

# Sampling params at rollouts
temperature=1.0
top_p=1.0
top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
val_top_p=0.7

# Performance Related Parameter
sp_size=1
use_dynamic_bsz=true
actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
# Offload actor/ref params to CPU to save GPU memory (single assignment; the
# original script set offload=false first with a contradictory comment, then
# silently overrode it here — offload=true is the value that takes effect).
offload=true
gen_tp=1
entropy_checkpointing=true # This enables entropy recomputation specifically for the entropy calculation, lowering memory usage during training.

# ------------------------------------- train/val data preparation ---------------------------------------
if [ "$first_time_dataset_prep" = true ]; then
    echo "Preprocessing GSM8K dataset..."
    python examples/data_preprocess/gsm8k.py --local_save_dir /data/gsm8k/
fi

gsm8k_train_path=/data/gsm8k/train.parquet
gsm8k_test_path=/data/gsm8k/test.parquet

# set the paths
train_files="['$gsm8k_train_path']"
test_files="['$gsm8k_test_path']"

python3 -m verl.trainer.main_ppo \
    algorithm.adv_estimator=${adv_estimator} \
    actor_rollout_ref.actor.policy_loss.loss_mode=${loss_mode} \
    data.train_files="${train_files}" \
    data.val_files="${test_files}" \
    data.shuffle=$shuffle_dataset \
    data.prompt_key=prompt \
    data.truncation='error' \
    data.filter_overlong_prompts=true \
    data.return_raw_chat=${return_raw_chat} \
    data.train_batch_size=${train_batch_size} \
    data.max_prompt_length=${max_prompt_length} \
    data.max_response_length=${max_response_length} \
    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
    algorithm.use_kl_in_reward=${use_kl_in_reward} \
    algorithm.kl_ctrl.kl_coef=${kl_coef} \
    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
    actor_rollout_ref.model.use_remove_padding=true \
    actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
    actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
    actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
    actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
    actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
    actor_rollout_ref.rollout.name=${rollout_engine} \
    actor_rollout_ref.rollout.mode=${rollout_mode} \
    actor_rollout_ref.model.path="${MODEL_PATH}" \
    actor_rollout_ref.model.enable_gradient_checkpointing=true \
    actor_rollout_ref.actor.optim.lr=1e-6 \
    actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.05 \
    actor_rollout_ref.actor.optim.weight_decay=0.1 \
    actor_rollout_ref.actor.ppo_mini_batch_size=${ppo_mini_batch_size} \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=${ppo_micro_batch_size_per_gpu} \
    actor_rollout_ref.actor.fsdp_config.param_offload=${offload} \
    actor_rollout_ref.actor.fsdp_config.optimizer_offload=${offload} \
    actor_rollout_ref.actor.entropy_coeff=0 \
    actor_rollout_ref.actor.grad_clip=1.0 \
    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
    actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \
    actor_rollout_ref.rollout.gpu_memory_utilization=${gpu_memory_utilization} \
    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
    actor_rollout_ref.rollout.enable_chunked_prefill=true \
    actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
    actor_rollout_ref.rollout.temperature=${temperature} \
    actor_rollout_ref.rollout.top_p=${top_p} \
    actor_rollout_ref.rollout.top_k=${top_k} \
    actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
    actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
    actor_rollout_ref.rollout.val_kwargs.do_sample=true \
    actor_rollout_ref.rollout.val_kwargs.n=1 \
    actor_rollout_ref.ref.fsdp_config.param_offload=${offload} \
    actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \
    actor_rollout_ref.actor.entropy_checkpointing=${entropy_checkpointing} \
    reward_model.reward_manager=${reward_manager} \
    +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
    +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
    +reward_model.reward_kwargs.overlong_buffer_cfg.log=false \
    +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
    trainer.logger='["console","wandb"]' \
    trainer.project_name="${project_name}" \
    trainer.experiment_name="${exp_name}" \
    trainer.n_gpus_per_node="${GPUS_PER_NODE}" \
    trainer.nnodes="${NNODES}" \
    trainer.val_before_train=${val_before_train} \
    trainer.test_freq=${test_freq} \
    trainer.save_freq=${save_freq} \
    trainer.total_epochs=${total_epochs} \
    trainer.total_training_steps=${total_training_steps} \
    trainer.default_local_dir="${CKPTS_DIR}" \
    trainer.resume_mode=auto \
    trainer.log_val_generations=2 \
    "$@"
|