shahidul034 committed (verified)
Commit 1267481 · 1 Parent(s): d0f96bf

Add files using upload-large-folder tool

This view is limited to 50 files because the commit contains too many changes.
Files changed (50)
  1. code/RL_model/verl/verl_train/examples/cispo_trainer/run_cispo_qwen2_5_0_5b_gsm8k.sh +51 -0
  2. code/RL_model/verl/verl_train/examples/generation/run_deepseek7b_mutli_node.sh +22 -0
  3. code/RL_model/verl/verl_train/examples/generation/run_deepseek_v2_lite_math.sh +22 -0
  4. code/RL_model/verl/verl_train/examples/gpg_trainer/gpg.md +34 -0
  5. code/RL_model/verl/verl_train/examples/gpg_trainer/run_qwen2-7b_math.sh +52 -0
  6. code/RL_model/verl/verl_train/examples/gpg_trainer/run_qwen2-7b_math_megatron.sh +53 -0
  7. code/RL_model/verl/verl_train/examples/grpo_trainer/README.md +70 -0
  8. code/RL_model/verl/verl_train/examples/grpo_trainer/run_deepseek671b_math_megatron_80gb.sh +118 -0
  9. code/RL_model/verl/verl_train/examples/grpo_trainer/run_deepseek671b_math_megatron_96gb.sh +179 -0
  10. code/RL_model/verl/verl_train/examples/grpo_trainer/run_deepseek7b_llm.sh +40 -0
  11. code/RL_model/verl/verl_train/examples/grpo_trainer/run_deepseek7b_llm_math.sh +49 -0
  12. code/RL_model/verl/verl_train/examples/grpo_trainer/run_deepseek7b_llm_seq_balance.sh +39 -0
  13. code/RL_model/verl/verl_train/examples/grpo_trainer/run_gptoss_20b.sh +79 -0
  14. code/RL_model/verl/verl_train/examples/grpo_trainer/run_mistral13b_skyworkrm_hhrlhf.sh +54 -0
  15. code/RL_model/verl/verl_train/examples/grpo_trainer/run_moonlight16b_math_megatron.sh +58 -0
  16. code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2-32b_sglang_fsdp_npu.sh +182 -0
  17. code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2-7b.sh +41 -0
  18. code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2-7b_math.sh +49 -0
  19. code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2-7b_math_megatron.sh +59 -0
  20. code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2-7b_math_megatron_lora.sh +122 -0
  21. code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2-7b_math_megatron_trtllm.sh +91 -0
  22. code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2-7b_math_trtllm.sh +89 -0
  23. code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2-7b_seq_balance.sh +52 -0
  24. code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2-7b_seq_balance_math_megatron.sh +57 -0
  25. code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2_5-3b_gsm8k_grpo_lora.sh +51 -0
  26. code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2_5-7b_math_megatron_diff_tp.sh +50 -0
  27. code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2_5_32b_grpo_npu.sh +40 -0
  28. code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2_5_7b_grpo_discrete_prof_npu.sh +71 -0
  29. code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2_5_7b_grpo_npu.sh +41 -0
  30. code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2_5_vl-7b-megatron.sh +88 -0
  31. code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2_5_vl-7b-sglang.sh +53 -0
  32. code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2_5_vl-7b_freeze_vision.sh +47 -0
  33. code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2_5_vl-7b_lora.sh +52 -0
  34. code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2_5_vl-7b_seq_balance.sh +45 -0
  35. code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2_5_vl_32b_npu.sh +51 -0
  36. code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2_5_vl_3b_npu.sh +52 -0
  37. code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2_5_vl_7b_npu.sh +51 -0
  38. code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen3-235b_megatron_96gb.sh +181 -0
  39. code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen3-8b_npu.sh +58 -0
  40. code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen3_4b_grpo_vllm_1k_npu.sh +81 -0
  41. code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen3_8b_grpo_sglang_32k_spmd_npu.sh +71 -0
  42. code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen3_vl-235b-megatron.sh +84 -0
  43. code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen3_vl-30b-megatron.sh +85 -0
  44. code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen3moe-30b_megatron_96gb.sh +195 -0
  45. code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen3moe-30b_megatron_lora.sh +133 -0
  46. code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen3moe-30b_sglang_megatron_npu.sh +236 -0
  47. code/RL_model/verl/verl_train/examples/grpo_trainer/run_seed_oss_36b.sh +48 -0
  48. code/RL_model/verl/verl_train/examples/gspo_trainer/run_qwen30b_gspo.sh +197 -0
  49. code/RL_model/verl/verl_train/examples/gspo_trainer/run_qwen3_32b_gspo_npu.sh +199 -0
  50. code/RL_model/verl/verl_train/examples/gspo_trainer/test_gspo_3b_math.sh +195 -0
code/RL_model/verl/verl_train/examples/cispo_trainer/run_cispo_qwen2_5_0_5b_gsm8k.sh ADDED
@@ -0,0 +1,51 @@
set -x


gsm8k_train_path=$HOME/data/gsm8k/train.parquet
gsm8k_test_path=$HOME/data/gsm8k/test.parquet

train_files="['$gsm8k_train_path']"
test_files="['$gsm8k_test_path']"

python3 -m verl.trainer.main_ppo \
    algorithm.adv_estimator=grpo \
    actor_rollout_ref.actor.policy_loss.loss_mode=cispo \
    actor_rollout_ref.actor.clip_ratio_low=10 \
    actor_rollout_ref.actor.clip_ratio_high=0.2 \
    data.train_files="$train_files" \
    data.val_files="$test_files" \
    data.train_batch_size=256 \
    data.max_prompt_length=1024 \
    data.max_response_length=1024 \
    data.filter_overlong_prompts=True \
    data.truncation='error' \
    actor_rollout_ref.model.path=Qwen/Qwen2.5-0.5B-Instruct \
    actor_rollout_ref.model.torch_dtype=bfloat16 \
    actor_rollout_ref.actor.optim.lr=1e-6 \
    actor_rollout_ref.model.use_remove_padding=True \
    actor_rollout_ref.actor.ppo_mini_batch_size=128 \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \
    actor_rollout_ref.actor.use_kl_loss=True \
    actor_rollout_ref.actor.kl_loss_coef=0.001 \
    actor_rollout_ref.actor.kl_loss_type=low_var_kl \
    actor_rollout_ref.actor.entropy_coeff=0 \
    actor_rollout_ref.model.enable_gradient_checkpointing=True \
    actor_rollout_ref.actor.fsdp_config.param_offload=False \
    actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \
    actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
    actor_rollout_ref.rollout.name=vllm \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.8 \
    actor_rollout_ref.rollout.n=5 \
    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \
    actor_rollout_ref.ref.fsdp_config.param_offload=True \
    algorithm.use_kl_in_reward=False \
    trainer.critic_warmup=0 \
    trainer.logger='["console","wandb"]' \
    trainer.project_name='verl_cispo_example_gsm8k' \
    trainer.experiment_name='qwen2_5_0_5b_cispo' \
    trainer.n_gpus_per_node=1 \
    trainer.nnodes=1 \
    trainer.save_freq=5 \
    trainer.test_freq=5 \
    trainer.total_epochs=3 $@
code/RL_model/verl/verl_train/examples/generation/run_deepseek7b_mutli_node.sh ADDED
@@ -0,0 +1,22 @@
set -x

data_path=$HOME/data/rlhf/gsm8k/test.parquet
save_path=$HOME/data/rlhf/math/deepseek_v2_lite_gen_test.parquet
model_path=deepseek-ai/deepseek-llm-7b-chat

python3 -m verl.trainer.main_generation \
    trainer.nnodes=2 \
    trainer.n_gpus_per_node=8 \
    data.path=$data_path \
    data.prompt_key=prompt \
    data.n_samples=1 \
    data.output_path=$save_path \
    model.path=$model_path \
    +model.trust_remote_code=True \
    rollout.temperature=1.0 \
    rollout.top_k=50 \
    rollout.top_p=0.7 \
    rollout.prompt_length=2048 \
    rollout.response_length=1024 \
    rollout.tensor_model_parallel_size=16 \
    rollout.gpu_memory_utilization=0.8
code/RL_model/verl/verl_train/examples/generation/run_deepseek_v2_lite_math.sh ADDED
@@ -0,0 +1,22 @@
set -x

data_path=$HOME/data/gsm8k/test.parquet
save_path=$HOME/data/gsm8k/deepseek_v2_lite_gen_test.parquet
model_path=deepseek-ai/deepseek-llm-7b-chat

python3 -m verl.trainer.main_generation \
    trainer.nnodes=1 \
    trainer.n_gpus_per_node=8 \
    data.path=$data_path \
    data.prompt_key=prompt \
    data.n_samples=1 \
    data.output_path=$save_path \
    model.path=$model_path \
    +model.trust_remote_code=True \
    rollout.temperature=1.0 \
    rollout.top_k=50 \
    rollout.top_p=0.7 \
    rollout.prompt_length=2048 \
    rollout.response_length=1024 \
    rollout.tensor_model_parallel_size=2 \
    rollout.gpu_memory_utilization=0.8
code/RL_model/verl/verl_train/examples/gpg_trainer/gpg.md ADDED
@@ -0,0 +1,34 @@
# GPG: Group Policy Gradient

Group Policy Gradient (GPG) is a minimalist reinforcement learning (RL) method that enhances the reasoning ability of large language models without relying on supervised fine-tuning or complex tricks. GPG revisits traditional policy gradients and directly optimizes the RL objective: no surrogate losses, no KL penalties, no critic, and no reference model. Compared to GRPO, GPG is simpler, more efficient, and achieves better results on many tasks. For more details, please refer to the original paper [GPG: A Simple and Strong Reinforcement Learning Baseline for Model Reasoning](https://arxiv.org/abs/2504.02546).

## Key Components
- Uses a corrected advantage function to improve policy gradient accuracy and training efficiency.
- Eliminates the critic and reference models and avoids KL divergence constraints, which significantly simplifies the training process compared to Group Relative Policy Optimization (GRPO). A minimal loss sketch follows this list.
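For intuition, here is a minimal sketch of a GPG-style loss under hypothetical tensor names (`logprobs`, `rewards`, `mask`). It shows only the group-baseline policy gradient and omits the paper's additional advantage correction; it is not verl's actual implementation:

```python
import torch

# Minimal GPG-style loss sketch (illustrative only, not verl's implementation).
# logprobs: (groups, group_size, seq_len) token log-probs of sampled responses
# rewards:  (groups, group_size) scalar reward per response
# mask:     (groups, group_size, seq_len) 1 on response tokens, 0 on padding
def gpg_loss(logprobs, rewards, mask):
    # Advantage of each response relative to its group's mean reward.
    adv = rewards - rewards.mean(dim=1, keepdim=True)
    # Plain policy gradient on token log-probs: no ratio, no clipping, no critic.
    per_token = -adv.unsqueeze(-1) * logprobs
    # Aggregate over all valid response tokens.
    return (per_token * mask).sum() / mask.sum()
```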
## Configuration
To configure GPG within the framework, use the following YAML settings.

```yaml
algorithm:
  adv_estimator: gpg
actor_rollout_ref:
  actor:
    policy_loss:
      loss_mode: "gpg"
```

## Advanced Extensions
GPG is a simple and strong baseline for model reasoning. Although its original form avoids a KL loss, you can still add one to further improve performance.

```yaml
algorithm:
  adv_estimator: gpg
actor_rollout_ref:
  actor:
    use_kl_loss: True # enable kl regularization
    kl_loss_coef: 0.01
    policy_loss:
      loss_mode: "gpg"
```
code/RL_model/verl/verl_train/examples/gpg_trainer/run_qwen2-7b_math.sh ADDED
@@ -0,0 +1,52 @@
set -x

# If you are using vllm<=0.6.3, you might need to set the following environment variable to avoid bugs:
# export VLLM_ATTENTION_BACKEND=XFORMERS

gsm8k_train_path=$HOME/data/gsm8k/train.parquet
gsm8k_test_path=$HOME/data/gsm8k/test.parquet
math_train_path=$HOME/data/math/train.parquet
math_test_path=$HOME/data/math/test.parquet

train_files="['$gsm8k_train_path', '$math_train_path']"
test_files="['$gsm8k_test_path', '$math_test_path']"

python3 -m verl.trainer.main_ppo \
    algorithm.adv_estimator=gpg \
    data.train_files="$train_files" \
    data.val_files="$test_files" \
    data.train_batch_size=1024 \
    data.max_prompt_length=1024 \
    data.max_response_length=1024 \
    data.filter_overlong_prompts=True \
    data.truncation='error' \
    actor_rollout_ref.model.path=Qwen/Qwen2-7B-Instruct \
    actor_rollout_ref.actor.optim.lr=1e-6 \
    actor_rollout_ref.model.use_remove_padding=True \
    actor_rollout_ref.actor.ppo_mini_batch_size=256 \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \
    actor_rollout_ref.actor.use_kl_loss=False \
    actor_rollout_ref.actor.policy_loss.loss_mode=gpg \
    actor_rollout_ref.actor.kl_loss_coef=0.001 \
    actor_rollout_ref.actor.kl_loss_type=low_var_kl \
    actor_rollout_ref.actor.entropy_coeff=0 \
    actor_rollout_ref.model.enable_gradient_checkpointing=True \
    actor_rollout_ref.actor.fsdp_config.param_offload=False \
    actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \
    actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
    actor_rollout_ref.rollout.name=vllm \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
    actor_rollout_ref.rollout.n=5 \
    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \
    actor_rollout_ref.ref.fsdp_config.param_offload=True \
    algorithm.use_kl_in_reward=False \
    trainer.critic_warmup=0 \
    trainer.logger='["console","wandb"]' \
    trainer.project_name='verl_gpg_example_gsm8k_math' \
    trainer.experiment_name='qwen2_7b_function_rm' \
    trainer.n_gpus_per_node=8 \
    trainer.nnodes=1 \
    trainer.save_freq=20 \
    trainer.test_freq=5 \
    trainer.total_epochs=15 $@
code/RL_model/verl/verl_train/examples/gpg_trainer/run_qwen2-7b_math_megatron.sh ADDED
@@ -0,0 +1,53 @@
set -x

# If you are using vllm<=0.6.3, you might need to set the following environment variable to avoid bugs:
# export VLLM_ATTENTION_BACKEND=XFORMERS
export CUDA_DEVICE_MAX_CONNECTIONS=1 # For megatron communication/computation overlapping

gsm8k_train_path=$HOME/data/gsm8k/train.parquet
gsm8k_test_path=$HOME/data/gsm8k/test.parquet
math_train_path=$HOME/data/math/train.parquet
math_test_path=$HOME/data/math/test.parquet

train_files="['$gsm8k_train_path', '$math_train_path']"
test_files="['$gsm8k_test_path', '$math_test_path']"

python3 -m verl.trainer.main_ppo --config-path=config \
    --config-name='ppo_megatron_trainer.yaml' \
    algorithm.adv_estimator=gpg \
    data.train_files="$train_files" \
    data.val_files="$test_files" \
    data.train_batch_size=1024 \
    data.max_prompt_length=1024 \
    data.max_response_length=1024 \
    data.filter_overlong_prompts=True \
    data.truncation='error' \
    actor_rollout_ref.model.path=Qwen/Qwen2-7B-Instruct \
    actor_rollout_ref.actor.optim.lr=1e-6 \
    actor_rollout_ref.actor.ppo_mini_batch_size=256 \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
    actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=2 \
    actor_rollout_ref.actor.megatron.tensor_model_parallel_size=2 \
    actor_rollout_ref.actor.policy_loss.loss_mode=gpg \
    actor_rollout_ref.actor.use_kl_loss=False \
    actor_rollout_ref.actor.kl_loss_coef=0.001 \
    actor_rollout_ref.actor.kl_loss_type=low_var_kl \
    actor_rollout_ref.actor.entropy_coeff=0 \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
    actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
    actor_rollout_ref.rollout.name=vllm \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
    actor_rollout_ref.rollout.n=5 \
    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
    actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=2 \
    actor_rollout_ref.ref.megatron.tensor_model_parallel_size=2 \
    algorithm.use_kl_in_reward=False \
    trainer.critic_warmup=0 \
    trainer.logger='["console","wandb"]' \
    trainer.project_name='verl_gpg_example_gsm8k_math' \
    trainer.experiment_name='qwen2_7b_megatron' \
    trainer.n_gpus_per_node=8 \
    trainer.nnodes=1 \
    trainer.save_freq=20 \
    trainer.test_freq=5 \
    trainer.total_epochs=15 $@
code/RL_model/verl/verl_train/examples/grpo_trainer/README.md ADDED
@@ -0,0 +1,70 @@
# Group Relative Policy Optimization (GRPO)

In reinforcement learning, classic algorithms like PPO rely on a "critic" model to estimate the value of actions, guiding the learning process. However, training this critic model can be resource-intensive.

GRPO simplifies this process by eliminating the need for a separate critic model. Instead, it operates as follows:
- Group Sampling: For a given problem, the model generates multiple possible solutions, forming a "group" of outputs.
- Reward Assignment: Each solution is evaluated and assigned a reward based on its correctness or quality.
- Baseline Calculation: The average reward of the group serves as a baseline.
- Policy Update: The model updates its parameters by comparing each solution's reward to the group baseline, reinforcing better-than-average solutions and discouraging worse-than-average ones.

This approach reduces computational overhead by avoiding the training of a separate value estimation model, making the learning process more efficient. For more details, refer to the original paper [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://arxiv.org/pdf/2402.03300). A minimal sketch of the group-relative advantage computation follows.
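To make the baseline and update steps concrete, here is a minimal sketch of the group-relative advantage under hypothetical tensor names (this is not verl's actual implementation):

```python
import torch

# Group-relative advantage sketch (illustrative only).
# rewards: (num_prompts, n) scalar reward per sampled response,
# where n = actor_rollout_ref.rollout.n responses per prompt.
def grpo_advantages(rewards, norm_by_std=True, eps=1e-6):
    baseline = rewards.mean(dim=1, keepdim=True)   # average reward of each group
    adv = rewards - baseline                       # better/worse than the group average
    if norm_by_std:                                # cf. algorithm.norm_adv_by_std_in_grpo
        adv = adv / (rewards.std(dim=1, keepdim=True) + eps)
    return adv  # broadcast over each response's tokens during the policy update
```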

## Key Components

- No Value Function (Critic-less): unlike PPO, GRPO does not train a separate value network (critic).
- Group Sampling (Grouped Rollouts): instead of evaluating one rollout per input, GRPO generates multiple completions (responses) from the current policy for each prompt. This set of completions is referred to as a group.
- Relative Rewards: within each group, completions are scored (e.g., based on correctness), and rewards are normalized relative to the group.

## Configuration

Note that all configs containing `micro_batch_size` set the maximum sample or token count per forward or backward pass to avoid GPU OOMs; their values should not change algorithmic/convergence behavior.

Although many configurations start with the `ppo_` prefix, they work across different RL algorithms in verl, as the GRPO training loop is similar to that of PPO (without the critic).

![image](https://github.com/user-attachments/assets/16aebad1-0da6-4eb3-806d-54a74e712c2d)

- `actor_rollout_ref.rollout.n`: For each prompt, sample n times. Defaults to 1. For GRPO, please set it to a value larger than 1 for group sampling.

- `data.train_batch_size`: The global batch size of prompts used to generate a set of sampled trajectories/rollouts. The number of responses/trajectories is `data.train_batch_size * actor_rollout_ref.rollout.n` (e.g., 1024 prompts with n=5 yield 5120 responses per step).

- `actor_rollout_ref.actor.ppo_mini_batch_size`: The set of sampled trajectories is split into multiple mini-batches with batch_size=ppo_mini_batch_size for PPO actor updates. The ppo_mini_batch_size is a global size across all workers.

- `actor_rollout_ref.actor.ppo_epochs`: Number of epochs for GRPO updates on one set of sampled trajectories for the actor.

- `actor_rollout_ref.actor.clip_ratio`: The GRPO clip range. Defaults to 0.2.

- `algorithm.adv_estimator`: Defaults to gae. Please set it to grpo instead.

- `actor_rollout_ref.actor.loss_agg_mode`: Defaults to "token-mean". Options include "token-mean", "seq-mean-token-sum", and "seq-mean-token-mean". The original GRPO paper takes the sample-level loss (seq-mean-token-mean), which may be unstable in long-CoT scenarios. All GRPO example scripts provided in verl use the default "token-mean" loss aggregation instead.

Instead of adding a KL penalty to the reward, GRPO regularizes by directly adding the KL divergence between the trained policy and the reference policy to the loss:

- `actor_rollout_ref.actor.use_kl_loss`: Whether to use a KL loss in the actor. When enabled, KL is not applied in the reward function. Defaults to False. Please set it to True for GRPO.

- `actor_rollout_ref.actor.kl_loss_coef`: The coefficient of the KL loss. Defaults to 0.001.

- `actor_rollout_ref.actor.kl_loss_type`: How to calculate the KL divergence between the actor and the reference policy. Supports kl (k1), abs, mse (k2), low_var_kl (k3), and full. Appending "+" at the end (e.g., 'k1+' and 'k3+') applies a straight-through estimator that employs k2 for unbiased gradient estimation, regardless of the KL value estimate (see https://github.com/volcengine/verl/pull/2953#issuecomment-3162113848 for more details). See this blog post for a detailed analysis: http://joschu.net/blog/kl-approx.html. A rough sketch of the k1/k2/k3 estimators follows this list.
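As a rough guide to the estimator names above, here is a minimal sketch of the k1/k2/k3 approximations from the linked blog post, written over per-token log-probabilities (tensor names are hypothetical; this is not verl's exact code):

```python
import torch

# Per-token approximations of KL(pi || pi_ref), following
# http://joschu.net/blog/kl-approx.html (illustrative sketch only).
# logp, ref_logp: per-token log-probs under the current and reference policies.
def kl_estimate(logp, ref_logp, kind="low_var_kl"):
    log_ratio = ref_logp - logp                 # log(pi_ref / pi), sampled under pi
    if kind == "kl":                            # k1: unbiased but high variance
        return -log_ratio
    if kind == "mse":                           # k2: 0.5 * (log ratio)^2
        return 0.5 * log_ratio.pow(2)
    if kind == "low_var_kl":                    # k3: unbiased, low variance, >= 0
        return log_ratio.exp() - 1.0 - log_ratio
    raise ValueError(f"unknown kl type: {kind}")
```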

## Advanced Extensions

### DrGRPO

The work [Understanding R1-Zero-Like Training: A Critical Perspective](https://arxiv.org/pdf/2503.20783) claims there is an optimization bias in GRPO that leads to artificially longer responses, especially for incorrect outputs. This inefficiency stems from the way GRPO calculates advantages using group-based reward normalization, which can inadvertently favor longer, less accurate responses. Instead, DrGRPO aggregates token-level losses by normalizing with a global constant to eliminate length bias.

Configure the following to enable DrGRPO, with all other parameters the same as GRPO's (a sketch of the aggregation difference follows this list):

- `actor_rollout_ref.actor.loss_agg_mode`: "seq-mean-token-sum-norm", which turns off seq-dim averaging
- `actor_rollout_ref.actor.loss_scale_factor`: (Optional) Set to a constant integer (e.g., max response length) to ensure consistent normalization throughout training. If not set, the current batch's response length is used.
- `actor_rollout_ref.actor.use_kl_loss`: Please set it to False for DrGRPO
- `algorithm.norm_adv_by_std_in_grpo`: False, which turns off standard-deviation normalization
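To see what the aggregation modes change, here is a minimal sketch under hypothetical tensor names (not verl's exact implementation):

```python
import torch

# Sketch of loss aggregation modes (illustrative only).
# token_loss: (batch, seq_len) per-token policy loss
# mask:       (batch, seq_len) 1 on valid response tokens, 0 elsewhere
def aggregate(token_loss, mask, mode, scale=None):
    if mode == "token-mean":               # verl default: mean over all valid tokens
        return (token_loss * mask).sum() / mask.sum()
    if mode == "seq-mean-token-mean":      # original GRPO: per-sequence mean, then batch mean
        return ((token_loss * mask).sum(-1) / mask.sum(-1)).mean()
    if mode == "seq-mean-token-sum-norm":  # DrGRPO: per-sequence token sum, fixed-constant norm
        scale = scale if scale is not None else mask.shape[-1]  # e.g., max response length
        return (token_loss * mask).sum(-1).mean() / scale
    raise ValueError(f"unknown loss_agg_mode: {mode}")
```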

## Reference Example

Qwen2.5 GRPO training log and commands: [link](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/qwen2-7b-fsdp2.log)

```bash
bash examples/grpo_trainer/run_qwen3-8b.sh
```

For more reference performance numbers, please see https://verl.readthedocs.io/en/latest/algo/baseline.html
code/RL_model/verl/verl_train/examples/grpo_trainer/run_deepseek671b_math_megatron_80gb.sh ADDED
@@ -0,0 +1,118 @@
set -x

# # 0. download the HF checkpoint
# # remove the `quantization_config` in the `config.json`
# # set `num_nextn_predict_layers=0` to disable MTP, which is not currently supported
# hf download deepseek-ai/DeepSeek-V3-0324

# 1. no offline dist checkpoint needed: with mbridge>=0.13.0 we can init the model directly from the downloaded Hugging Face fp8 weights
# tested on docker://verlai/verl:app-verl0.5-transformers4.55.4-vllm0.10.0-mcore0.13.0-te2.2
LLM="<path_to_dsv3_config>"


# 2. run the script
gsm8k_train_path=/root/data/gsm8k/train.parquet
gsm8k_test_path=/root/data/gsm8k/test.parquet
train_files=$gsm8k_train_path
test_files=$gsm8k_test_path

ALL_OFFLOAD=${ALL_OFFLOAD:-True}
COMMON_PARAM_OFFLOAD=${COMMON_PARAM_OFFLOAD:-$ALL_OFFLOAD}
COMMON_GRAD_OFFLOAD=${COMMON_GRAD_OFFLOAD:-$ALL_OFFLOAD}
COMMON_OPTIMIZER_OFFLOAD=${COMMON_OPTIMIZER_OFFLOAD:-$ALL_OFFLOAD}

ACTOR_PARAM_OFFLOAD=${ACTOR_PARAM_OFFLOAD:-$COMMON_PARAM_OFFLOAD}
ACTOR_GRAD_OFFLOAD=${ACTOR_GRAD_OFFLOAD:-$COMMON_GRAD_OFFLOAD}
ACTOR_OPTIMIZER_OFFLOAD=${ACTOR_OPTIMIZER_OFFLOAD:-$COMMON_OPTIMIZER_OFFLOAD}
REF_PARAM_OFFLOAD=${REF_PARAM_OFFLOAD:-$COMMON_PARAM_OFFLOAD}
CRITIC_PARAM_OFFLOAD=${CRITIC_PARAM_OFFLOAD:-$COMMON_PARAM_OFFLOAD}
CRITIC_GRAD_OFFLOAD=${CRITIC_GRAD_OFFLOAD:-$COMMON_GRAD_OFFLOAD}
CRITIC_OPTIMIZER_OFFLOAD=${CRITIC_OPTIMIZER_OFFLOAD:-$COMMON_OPTIMIZER_OFFLOAD}
RM_PARAM_OFFLOAD=${RM_PARAM_OFFLOAD:-$COMMON_PARAM_OFFLOAD}

# 256 H100(80GB)
NODES=32
PP=16
TP=1
EP=16
ETP=1
INFER_TP=32
# consider TP/ETP, and enable recompute if short of memory

# full recompute

n_resp_per_prompt=4
max_prompt_length=2048
max_response_length=4096
use_dynamic_bsz=True
actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 1))
infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))

use_kl_in_reward=False
kl_coef=0.0
use_kl_loss=True
kl_loss_coef=0.001

# RAY_ADDRESS='auto' ray job submit --working-dir . --
python3 -m verl.trainer.main_ppo --config-path=./config --config-name='ppo_megatron_trainer' \
    algorithm.adv_estimator=grpo \
    algorithm.use_kl_in_reward=${use_kl_in_reward} \
    algorithm.kl_ctrl.kl_coef=${kl_coef} \
    data.train_files="$train_files" \
    data.val_files="$test_files" \
    data.train_batch_size=512 \
    data.max_prompt_length=$max_prompt_length \
    data.max_response_length=$max_response_length \
    data.filter_overlong_prompts=True \
    data.truncation='error' \
    actor_rollout_ref.model.path=$LLM \
    actor_rollout_ref.actor.optim.lr=1e-6 \
    actor_rollout_ref.actor.ppo_mini_batch_size=128 \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \
    actor_rollout_ref.actor.use_torch_compile=False \
    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \
    actor_rollout_ref.rollout.name=vllm \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.8 \
    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
    actor_rollout_ref.rollout.temperature=1.0 \
    actor_rollout_ref.rollout.top_p=1.0 \
    actor_rollout_ref.rollout.top_k=-1 \
    actor_rollout_ref.rollout.tensor_model_parallel_size=$INFER_TP \
    trainer.logger='["console","tensorboard"]' \
    trainer.project_name='verl_megatron_gsm8k_examples' \
    trainer.experiment_name='dsv3-32nodes' \
    trainer.n_gpus_per_node=8 \
    trainer.nnodes=$NODES \
    trainer.save_freq=-1 \
    trainer.test_freq=5 \
    actor_rollout_ref.model.use_fused_kernels=True \
    actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
    actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
    actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
    actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
    actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
    actor_rollout_ref.actor.megatron.override_transformer_config.attention_backend='fused' \
    +actor_rollout_ref.actor.megatron.override_transformer_config.num_layers_in_first_pipeline_stage=4 \
    +actor_rollout_ref.actor.megatron.override_transformer_config.num_layers_in_last_pipeline_stage=1 \
    actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=$PP \
    actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=$PP \
    actor_rollout_ref.actor.megatron.tensor_model_parallel_size=$TP \
    actor_rollout_ref.ref.megatron.tensor_model_parallel_size=$TP \
    actor_rollout_ref.actor.megatron.expert_model_parallel_size=$EP \
    actor_rollout_ref.ref.megatron.expert_model_parallel_size=$EP \
    actor_rollout_ref.actor.megatron.expert_tensor_parallel_size=$ETP \
    actor_rollout_ref.ref.megatron.expert_tensor_parallel_size=$ETP \
    actor_rollout_ref.actor.megatron.param_offload=${ACTOR_PARAM_OFFLOAD} \
    actor_rollout_ref.actor.megatron.optimizer_offload=${ACTOR_OPTIMIZER_OFFLOAD} \
    actor_rollout_ref.actor.megatron.grad_offload=${ACTOR_GRAD_OFFLOAD} \
    actor_rollout_ref.ref.megatron.param_offload=${REF_PARAM_OFFLOAD} \
    +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_method=uniform \
    +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=full \
    +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_num_layers=1 \
    actor_rollout_ref.actor.megatron.use_mbridge=True \
    trainer.default_local_dir=$CKPT_DIR \
    trainer.val_before_train=False \
    trainer.total_epochs=100 $@
code/RL_model/verl/verl_train/examples/grpo_trainer/run_deepseek671b_math_megatron_96gb.sh ADDED
@@ -0,0 +1,179 @@
#!/usr/bin/env bash
set -xeuo pipefail

## !!!!!!!important!!!!!!
# 1. set the following environment variables on all your nodes
#    env_vars:
#      CUDA_DEVICE_MAX_CONNECTIONS: "1"
#      NCCL_NVLS_ENABLE: "0"
#      VLLM_USE_V1: 1
# 2. install mbridge 0.1.13 on all your nodes with the following command:
#    pip3 install git+https://github.com/ISEEKYAN/mbridge
# 3. remove the `quantization_config` in the DeepSeek-V3's `config.json` and
#    set `num_nextn_predict_layers=0` to disable MTP, which is not currently supported

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
[ -f "${SCRIPT_DIR}/env.sh" ] && source "${SCRIPT_DIR}/env.sh"

adv_estimator=grpo

use_kl_in_reward=False
kl_coef=0.0
use_kl_loss=True
kl_loss_coef=0.001

clip_ratio_low=0.2
clip_ratio_high=0.28

max_prompt_length=$((1024 * 2))
max_response_length=$((1024 * 8))
enable_overlong_buffer=True
overlong_buffer_len=$((1024 * 4))
overlong_penalty_factor=1.0

loss_agg_mode="token-mean"

train_prompt_bsz=96
n_resp_per_prompt=8
train_prompt_mini_bsz=32


# minimum nodes for DeepSeek-V3: 12 nodes
NNODES=${NNODES:-12}

RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}

MODEL_PATH=$RAY_DATA_HOME/models/DeepSeek-V3-config-verl

TRAIN_FILE=$RAY_DATA_HOME/dataset/dapo-math-17k.parquet
TEST_FILE=$RAY_DATA_HOME/dataset/aime-2024.parquet

# Algorithm
temperature=1.0
top_p=1.0
top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
val_top_p=0.7

# Performance Related Parameter
use_dynamic_bsz=True
actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 10 / 10))
infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 1))
offload=True
optim_offload=${OFFLOAD_OPTIM:-True}
gen_tp=32
train_tp=${TP:-8}
train_pp=${PP:-12}

EP=${EP:-8}
ETP=1
CP=1
optimizer_offload_fraction=${OFFLOAD_FRACTION:-1.}
LAST_LAYER=${LAST_LAYER:-6}


project_name='verl-deepseek-v3'
exp_name="671B-${NNODES}-pp${train_pp}-tp${train_tp}-ep${EP}-actor-length${actor_ppo_max_token_len}"
CKPTS_DIR=$RAY_DATA_HOME/ckpt/${project_name}/${exp_name}

python3 -m verl.trainer.main_ppo \
    --config-path=config \
    --config-name='ppo_megatron_trainer.yaml' \
    data.train_files="${TRAIN_FILE}" \
    data.val_files="${TEST_FILE}" \
    data.prompt_key=prompt \
    data.truncation='left' \
    data.max_prompt_length=${max_prompt_length} \
    data.max_response_length=${max_response_length} \
    data.train_batch_size=${train_prompt_bsz} \
    actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
    actor_rollout_ref.rollout.name=vllm \
    algorithm.adv_estimator=${adv_estimator} \
    algorithm.use_kl_in_reward=${use_kl_in_reward} \
    algorithm.kl_ctrl.kl_coef=${kl_coef} \
    actor_rollout_ref.model.use_fused_kernels=True \
    actor_rollout_ref.actor.megatron.use_mbridge=True \
    actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
    actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
    actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
    actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
    actor_rollout_ref.actor.clip_ratio_c=10.0 \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \
    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
    actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
    actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
    actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
    actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
    actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
    actor_rollout_ref.model.path="${MODEL_PATH}" \
    actor_rollout_ref.actor.optim.lr=1e-6 \
    actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
    actor_rollout_ref.actor.optim.weight_decay=0.1 \
    +actor_rollout_ref.actor.optim.override_optimizer_config.optimizer_offload_fraction=${optimizer_offload_fraction} \
    +actor_rollout_ref.actor.optim.override_optimizer_config.overlap_cpu_optimizer_d2h_h2d=True \
    +actor_rollout_ref.actor.optim.override_optimizer_config.use_precision_aware_optimizer=True \
    +actor_rollout_ref.actor.optim.override_optimizer_config.optimizer_cpu_offload=True \
    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
    actor_rollout_ref.actor.megatron.param_offload=${offload} \
    actor_rollout_ref.actor.megatron.optimizer_offload=${optim_offload} \
    actor_rollout_ref.actor.megatron.grad_offload=${offload} \
    actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${train_pp} \
    actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${train_tp} \
    actor_rollout_ref.actor.megatron.expert_model_parallel_size=$EP \
    actor_rollout_ref.actor.megatron.expert_tensor_parallel_size=$ETP \
    actor_rollout_ref.actor.megatron.context_parallel_size=${CP} \
    actor_rollout_ref.actor.entropy_coeff=0 \
    actor_rollout_ref.actor.optim.clip_grad=1.0 \
    actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
    actor_rollout_ref.rollout.enable_chunked_prefill=True \
    actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
    actor_rollout_ref.rollout.temperature=${temperature} \
    actor_rollout_ref.rollout.top_p=${top_p} \
    actor_rollout_ref.rollout.top_k=${top_k} \
    actor_rollout_ref.nccl_timeout=1200 \
    actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
    actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
    actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
    actor_rollout_ref.rollout.val_kwargs.n=1 \
    actor_rollout_ref.rollout.enforce_eager=True \
    actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=${train_pp} \
    actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${train_tp} \
    actor_rollout_ref.ref.megatron.expert_model_parallel_size=$EP \
    actor_rollout_ref.ref.megatron.expert_tensor_parallel_size=$ETP \
    actor_rollout_ref.ref.megatron.context_parallel_size=${CP} \
    actor_rollout_ref.ref.megatron.param_offload=${offload} \
    +actor_rollout_ref.actor.megatron.override_transformer_config.apply_rope_fusion=False \
    +actor_rollout_ref.actor.megatron.override_transformer_config.moe_router_dtype=fp32 \
    +actor_rollout_ref.actor.megatron.override_transformer_config.moe_shared_expert_overlap=False \
    +actor_rollout_ref.actor.megatron.override_transformer_config.moe_enable_deepep=True \
    +actor_rollout_ref.actor.megatron.override_transformer_config.moe_token_dispatcher_type=flex \
    +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_method=uniform \
    +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=full \
    +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_num_layers=1 \
    +actor_rollout_ref.actor.megatron.override_transformer_config.gradient_accumulation_fusion=True \
    +actor_rollout_ref.actor.megatron.override_transformer_config.moe_permute_fusion=True \
    +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_embedding_in_pipeline_split=False \
    +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_loss_in_pipeline_split=False \
    +actor_rollout_ref.actor.megatron.override_transformer_config.num_layers_in_last_pipeline_stage=${LAST_LAYER} \
    reward_model.reward_manager=dapo \
    +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
    +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
    +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
    +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
    +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
    trainer.logger=['console','wandb'] \
    trainer.project_name="${project_name}" \
    trainer.experiment_name="${exp_name}" \
    trainer.n_gpus_per_node=8 \
    trainer.nnodes="${NNODES}" \
    trainer.val_before_train=False \
    trainer.test_freq=10 \
    trainer.save_freq=100 \
    trainer.total_epochs=10 \
    trainer.default_local_dir="${CKPTS_DIR}" \
    trainer.resume_mode=auto \
    trainer.log_val_generations=10
code/RL_model/verl/verl_train/examples/grpo_trainer/run_deepseek7b_llm.sh ADDED
@@ -0,0 +1,40 @@
set -x

python3 -m verl.trainer.main_ppo \
    algorithm.adv_estimator=grpo \
    data.train_files=$HOME/data/gsm8k/train.parquet \
    data.val_files=$HOME/data/gsm8k/test.parquet \
    data.train_batch_size=1024 \
    data.max_prompt_length=512 \
    data.max_response_length=1024 \
    data.filter_overlong_prompts=True \
    data.truncation='error' \
    actor_rollout_ref.model.path=deepseek-ai/deepseek-llm-7b-chat \
    actor_rollout_ref.actor.optim.lr=1e-6 \
    actor_rollout_ref.model.use_remove_padding=True \
    actor_rollout_ref.actor.ppo_mini_batch_size=256 \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=80 \
    actor_rollout_ref.actor.use_kl_loss=True \
    actor_rollout_ref.actor.kl_loss_coef=0.001 \
    actor_rollout_ref.actor.kl_loss_type=low_var_kl \
    actor_rollout_ref.actor.entropy_coeff=0 \
    actor_rollout_ref.model.enable_gradient_checkpointing=True \
    actor_rollout_ref.actor.fsdp_config.param_offload=False \
    actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=160 \
    actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
    actor_rollout_ref.rollout.name=vllm \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
    actor_rollout_ref.rollout.n=5 \
    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=160 \
    actor_rollout_ref.ref.fsdp_config.param_offload=True \
    algorithm.use_kl_in_reward=False \
    trainer.critic_warmup=0 \
    trainer.logger=console \
    trainer.project_name='verl_grpo_example_gsm8k' \
    trainer.experiment_name='deepseek_llm_7b_function_rm' \
    trainer.n_gpus_per_node=8 \
    trainer.nnodes=1 \
    trainer.save_freq=20 \
    trainer.test_freq=5 \
    trainer.total_epochs=15 $@
code/RL_model/verl/verl_train/examples/grpo_trainer/run_deepseek7b_llm_math.sh ADDED
@@ -0,0 +1,49 @@
set -x


gsm8k_train_path=$HOME/data/gsm8k/train.parquet
gsm8k_test_path=$HOME/data/gsm8k/test.parquet
math_train_path=$HOME/data/math/train.parquet
math_test_path=$HOME/data/math/test.parquet

train_files="['$gsm8k_train_path', '$math_train_path']"
test_files="['$gsm8k_test_path', '$math_test_path']"

python3 -m verl.trainer.main_ppo \
    algorithm.adv_estimator=grpo \
    data.train_files="$train_files" \
    data.val_files="$test_files" \
    data.train_batch_size=1024 \
    data.max_prompt_length=1024 \
    data.max_response_length=1024 \
    data.filter_overlong_prompts=True \
    data.truncation='error' \
    actor_rollout_ref.model.path=deepseek-ai/deepseek-llm-7b-chat \
    actor_rollout_ref.actor.optim.lr=1e-6 \
    actor_rollout_ref.model.use_remove_padding=True \
    actor_rollout_ref.actor.ppo_mini_batch_size=256 \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=40 \
    actor_rollout_ref.actor.use_kl_loss=True \
    actor_rollout_ref.actor.kl_loss_coef=0.001 \
    actor_rollout_ref.actor.kl_loss_type=low_var_kl \
    actor_rollout_ref.actor.entropy_coeff=0 \
    actor_rollout_ref.model.enable_gradient_checkpointing=True \
    actor_rollout_ref.actor.fsdp_config.param_offload=False \
    actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=40 \
    actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
    actor_rollout_ref.rollout.name=vllm \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
    actor_rollout_ref.rollout.n=5 \
    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=40 \
    actor_rollout_ref.ref.fsdp_config.param_offload=True \
    algorithm.use_kl_in_reward=False \
    trainer.critic_warmup=0 \
    trainer.logger='["console","wandb"]' \
    trainer.project_name='verl_grpo_example_gsm8k_math' \
    trainer.experiment_name='deepseek_llm_7b_function_rm_math' \
    trainer.n_gpus_per_node=8 \
    trainer.nnodes=1 \
    trainer.save_freq=20 \
    trainer.test_freq=5 \
    trainer.total_epochs=15 $@
code/RL_model/verl/verl_train/examples/grpo_trainer/run_deepseek7b_llm_seq_balance.sh ADDED
@@ -0,0 +1,39 @@
set -x

python3 -m verl.trainer.main_ppo \
    algorithm.adv_estimator=grpo \
    data.train_files=$HOME/data/gsm8k/train.parquet \
    data.val_files=$HOME/data/gsm8k/test.parquet \
    data.train_batch_size=1024 \
    data.max_prompt_length=512 \
    data.max_response_length=512 \
    data.filter_overlong_prompts=True \
    data.truncation='error' \
    actor_rollout_ref.model.path=deepseek-ai/deepseek-llm-7b-chat \
    actor_rollout_ref.actor.optim.lr=1e-6 \
    actor_rollout_ref.model.use_remove_padding=True \
    actor_rollout_ref.actor.ppo_mini_batch_size=256 \
    actor_rollout_ref.actor.use_dynamic_bsz=True \
    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=24000 \
    actor_rollout_ref.actor.use_kl_loss=True \
    actor_rollout_ref.actor.kl_loss_coef=0.001 \
    actor_rollout_ref.actor.kl_loss_type=low_var_kl \
    actor_rollout_ref.actor.entropy_coeff=0 \
    actor_rollout_ref.model.enable_gradient_checkpointing=True \
    actor_rollout_ref.actor.fsdp_config.param_offload=False \
    actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
    actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
    actor_rollout_ref.rollout.name=vllm \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
    actor_rollout_ref.rollout.n=5 \
    actor_rollout_ref.ref.fsdp_config.param_offload=True \
    algorithm.use_kl_in_reward=False \
    trainer.critic_warmup=0 \
    trainer.logger='["console","wandb"]' \
    trainer.project_name='verl_grpo_example_gsm8k' \
    trainer.experiment_name='deepseek_llm_7b_function_rm_seq_packing' \
    trainer.n_gpus_per_node=8 \
    trainer.nnodes=1 \
    trainer.save_freq=20 \
    trainer.test_freq=5 \
    trainer.total_epochs=15 $@
code/RL_model/verl/verl_train/examples/grpo_trainer/run_gptoss_20b.sh ADDED
@@ -0,0 +1,79 @@
#!/bin/bash

cat > get_model.py << EOF
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, Mxfp4Config

model_id = "openai/gpt-oss-20b"
output_dir = "$HOME/models/gpt-oss-20b-bf16"

quantization_config = Mxfp4Config(dequantize=True)
model_kwargs = dict(
    attn_implementation="eager",
    torch_dtype=torch.bfloat16,
    quantization_config=quantization_config,
    use_cache=False,
    device_map="auto",
)

model = AutoModelForCausalLM.from_pretrained(model_id, **model_kwargs)

# Patch config with custom attribute before saving
model.config.attn_implementation = "eager"

model.save_pretrained(output_dir)
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.save_pretrained(output_dir)
EOF

python get_model.py
# or you can use lmsys/gpt-oss-20b-bf16
# recommend using the same value for train_batch_size and ppo_mini_batch_size
# to avoid MoE training instability
# use a large value for max_response_length if you want to use reasoning effort high.

# gsm8k dataset paths (assumed to match the other examples; the original script used these variables without defining them)
gsm8k_train_path=$HOME/data/gsm8k/train.parquet
gsm8k_test_path=$HOME/data/gsm8k/test.parquet

model_dir=$HOME/models/gpt-oss-20b-bf16
python3 -m verl.trainer.main_ppo \
    algorithm.adv_estimator=grpo \
    data.train_files="$gsm8k_train_path" \
    data.val_files="$gsm8k_test_path" \
    data.train_batch_size=256 \
    data.max_prompt_length=512 \
    data.max_response_length=8192 \
    data.filter_overlong_prompts=True \
    data.truncation='error' \
    +data.apply_chat_template_kwargs.reasoning_effort=medium \
    actor_rollout_ref.model.path=${model_dir} \
    actor_rollout_ref.actor.optim.lr=1e-6 \
    actor_rollout_ref.model.use_remove_padding=True \
    actor_rollout_ref.actor.ppo_mini_batch_size=256 \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 \
    actor_rollout_ref.actor.use_kl_loss=True \
    actor_rollout_ref.actor.kl_loss_coef=0.001 \
    actor_rollout_ref.actor.kl_loss_type=low_var_kl \
    actor_rollout_ref.actor.entropy_coeff=0 \
    actor_rollout_ref.model.enable_gradient_checkpointing=True \
    actor_rollout_ref.actor.fsdp_config.param_offload=False \
    actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
    +actor_rollout_ref.actor.fsdp_config.model_dtype=bfloat16 \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 \
    actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
    actor_rollout_ref.rollout.name=sglang \
    actor_rollout_ref.rollout.mode=async \
    actor_rollout_ref.rollout.engine_kwargs.sglang.attention_backend=triton \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.7 \
    actor_rollout_ref.rollout.n=5 \
    actor_rollout_ref.rollout.load_format=safetensors \
    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 \
    actor_rollout_ref.ref.fsdp_config.param_offload=True \
    algorithm.use_kl_in_reward=False \
    trainer.critic_warmup=0 \
    trainer.logger='["console","wandb"]' \
    trainer.project_name='verl_grpo_example_gsm8k_math' \
    trainer.experiment_name='oai_oss_20b_function_rm' \
    trainer.n_gpus_per_node=8 \
    trainer.nnodes=1 \
    trainer.save_freq=50 \
    trainer.test_freq=10 \
    trainer.total_epochs=15 $@
code/RL_model/verl/verl_train/examples/grpo_trainer/run_mistral13b_skyworkrm_hhrlhf.sh ADDED
@@ -0,0 +1,54 @@
train_files=data/full_hh_rlhf/rl/train.parquet
test_files=data/full_hh_rlhf/rl/train.parquet # no use

max_prompt_length=4096
max_response_length=2048

gen_tp=4
n_per_prompt=5
adv_estimator="grpo"

project_name=verl_full_hh_rlhf_examples
exp_name="grpo_mistral13B-skyworkLlama8b-hhrlhf"

python3 -m verl.trainer.main_ppo \
    algorithm.adv_estimator=$adv_estimator \
    data.train_files="$train_files" \
    data.val_files="$test_files" \
    data.train_batch_size=512 \
    data.prompt_key="prompt" \
    data.return_raw_chat=True \
    data.max_prompt_length=$max_prompt_length \
    data.max_response_length=$max_response_length \
    data.filter_overlong_prompts=True \
    data.truncation='error' \
    actor_rollout_ref.model.path=mistralai/Mistral-Nemo-Instruct-2407 \
    actor_rollout_ref.actor.optim.lr=1e-6 \
    actor_rollout_ref.actor.ppo_mini_batch_size=128 \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=10 \
    actor_rollout_ref.actor.use_kl_loss=False \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=10 \
    actor_rollout_ref.rollout.name=vllm \
    actor_rollout_ref.rollout.n=$n_per_prompt \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.7 \
    actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
    actor_rollout_ref.model.enable_gradient_checkpointing=True \
    reward_model.enable=True \
    reward_model.model.path=Skywork/Skywork-Reward-Llama-3.1-8B \
    reward_model.use_reward_loop=True \
    reward_model.rollout.name=vllm \
    reward_model.rollout.gpu_memory_utilization=0.8 \
    reward_model.rollout.tensor_model_parallel_size=1 \
    reward_model.rollout.prompt_length=8192 \
    reward_model.rollout.response_length=4096 \
    reward_model.num_workers=8 \
    algorithm.use_kl_in_reward=False \
    trainer.logger='["console","wandb"]' \
    trainer.val_before_train=False \
    trainer.project_name=$project_name \
    trainer.experiment_name=$exp_name \
    trainer.n_gpus_per_node=8 \
    trainer.nnodes=1 \
    trainer.save_freq=10 \
    trainer.test_freq=-1 \
    trainer.total_epochs=5 $@
code/RL_model/verl/verl_train/examples/grpo_trainer/run_moonlight16b_math_megatron.sh ADDED
@@ -0,0 +1,58 @@
set -x

export CUDA_DEVICE_MAX_CONNECTIONS=1 # For megatron communication/computation overlapping

HF_MODEL_PATH=moonshotai/Moonlight-16B-A3B
DIST_CKPT_PATH=${DIST_CKPT_PATH}

train_path=$HOME/data/gsm8k/train.parquet
test_path=$HOME/data/gsm8k/test.parquet

python3 -m verl.trainer.main_ppo --config-path=config \
    --config-name='ppo_megatron_trainer.yaml' \
    algorithm.adv_estimator=grpo \
    data.train_files="$train_path" \
    data.val_files="$test_path" \
    data.train_batch_size=192 \
    data.max_prompt_length=1024 \
    data.max_response_length=2048 \
    data.filter_overlong_prompts=True \
    data.truncation='error' \
    data.trust_remote_code=True \
    actor_rollout_ref.model.path=$HF_MODEL_PATH \
    actor_rollout_ref.model.trust_remote_code=True \
    actor_rollout_ref.actor.optim.lr=1e-6 \
    actor_rollout_ref.actor.ppo_mini_batch_size=64 \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \
    actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=3 \
    actor_rollout_ref.actor.megatron.tensor_model_parallel_size=4 \
    actor_rollout_ref.actor.megatron.expert_model_parallel_size=4 \
    actor_rollout_ref.actor.megatron.expert_tensor_parallel_size=1 \
    actor_rollout_ref.actor.megatron.use_dist_checkpointing=True \
    actor_rollout_ref.actor.megatron.dist_checkpointing_path=$DIST_CKPT_PATH \
    actor_rollout_ref.actor.use_kl_loss=True \
    actor_rollout_ref.actor.kl_loss_coef=0.001 \
    actor_rollout_ref.actor.kl_loss_type=low_var_kl \
    actor_rollout_ref.actor.entropy_coeff=0 \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \
    actor_rollout_ref.rollout.tensor_model_parallel_size=4 \
    actor_rollout_ref.rollout.name=vllm \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
    actor_rollout_ref.rollout.n=5 \
    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \
    actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=3 \
    actor_rollout_ref.ref.megatron.tensor_model_parallel_size=4 \
    actor_rollout_ref.ref.megatron.expert_model_parallel_size=4 \
    actor_rollout_ref.ref.megatron.expert_tensor_parallel_size=1 \
    actor_rollout_ref.ref.megatron.use_dist_checkpointing=True \
    actor_rollout_ref.ref.megatron.dist_checkpointing_path=$DIST_CKPT_PATH \
    algorithm.use_kl_in_reward=False \
    trainer.critic_warmup=0 \
    trainer.logger='["console","wandb"]' \
    trainer.project_name='verl_grpo_example_gsm8k_math' \
    trainer.experiment_name='moonlight_megatron_ep' \
    trainer.n_gpus_per_node=8 \
    trainer.nnodes=3 \
    trainer.save_freq=20 \
    trainer.test_freq=5 \
    trainer.total_epochs=15 $@
code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2-32b_sglang_fsdp_npu.sh ADDED
@@ -0,0 +1,182 @@
+ #!/bin/bash
+ set -xeuo pipefail
+ mkdir -p logs
+
+ # Project Configuration
+ project_name='GRPO-Qwen2.5-32B-BASE-SGLang'
+ exp_name='GRPO-Qwen2.5-32B-BASE-FSDP-SGLang'
+
+ # Necessary env
+ export HCCL_CONNECT_TIMEOUT=1500
+ export HCCL_HOST_SOCKET_PORT_RANGE=60000-60050
+ export HCCL_NPU_SOCKET_PORT_RANGE=61000-61050
+
+ export RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES=1
+ # If each node has 16 NPUs, set ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
+ export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+
+ export DISABLE_L2_CACHE=1
+ export TASK_QUEUE_ENABLE=1
+
+ # Node Info
+ NNODES=${NNODES:-2}
+ NPUS_PER_NODE=${NPUS_PER_NODE:-8}
+
+ # Model Weights Paths
+ MODEL_PATH=Qwen/Qwen2.5-32B
+ RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
+ CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}
+
+ # File System Paths
+ TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/datasets/deepscaler/train.parquet"}
+ TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/datasets/deepscaler/test.parquet"}
+
+ # Data Configuration
+ max_prompt_length=$((1024 * 2))
+ max_response_length=$((1024 * 8))
+
+ # Training Batch Configuration
+ train_prompt_bsz=32
+ train_prompt_mini_bsz=32
+ n_resp_per_prompt=8
+
+ # Algorithm Configuration
+ adv_estimator=grpo
+ use_kl_in_reward=False
+ kl_coef=0.0
+ use_kl_loss=True
+ kl_loss_coef=0.001
+
+ # Performance and Memory Management Configuration
+ all_offload=True
+ use_dynamic_bsz=False
+
+ # SGLang Configuration
+ gen_tp=4
+ gen_sp=1
+ gen_dp=1
+ gen_ep=1
+ gpu_memory_utilization=0.5
+
+ # Data Configuration
+ DATA_CONFIG=(
+     # File Paths
+     data.train_files="${TRAIN_FILE}"
+     data.val_files="${TEST_FILE}"
+     # Data Structure
+     data.prompt_key=prompt
+     # Batch and Length Configuration
+     data.train_batch_size=${train_prompt_bsz}
+     data.max_prompt_length=${max_prompt_length}
+     data.max_response_length=${max_response_length}
+     # Preprocessing
+     data.filter_overlong_prompts=False
+     data.truncation='left'
+ )
+
+ # Model Configuration
+ MODEL_CONFIG=(
+     # Model Path
+     actor_rollout_ref.model.path="${MODEL_PATH}"
+     # Model Processing
+     actor_rollout_ref.model.use_remove_padding=True
+     actor_rollout_ref.model.enable_gradient_checkpointing=True
+ )
+
+ # Reinforcement Learning Algorithm Configuration
+ ALGORITHM_CONFIG=(
+     # Advantage Estimation
+     algorithm.adv_estimator=${adv_estimator}
+     # KL Divergence Control
+     algorithm.use_kl_in_reward=${use_kl_in_reward}
+ )
+
+ # Actor Model Configuration
+ ACTOR_CONFIG=(
+     # Core Runtime Settings
+     actor_rollout_ref.actor.use_torch_compile=False
+     actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz}
+     # Loss Function Configuration
+     actor_rollout_ref.actor.use_kl_loss=${use_kl_loss}
+     actor_rollout_ref.actor.kl_loss_type=low_var_kl
+     actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef}
+     actor_rollout_ref.actor.entropy_coeff=0
+     # PPO Training Parameters
+     actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1
+     actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz}
+     # Optimizer Settings
+     actor_rollout_ref.actor.optim.lr=1e-6
+     actor_rollout_ref.actor.fsdp_config.param_offload=${all_offload}
+     actor_rollout_ref.actor.fsdp_config.optimizer_offload=${all_offload}
+ )
+
+ # Reference Model Configuration
+ REF_CONFIG=(
+     # Core Runtime Settings
+     actor_rollout_ref.ref.use_torch_compile=False
+     # Log Probability Inference
+     actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1
+     actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz}
+     # Memory Optimization
+     actor_rollout_ref.ref.fsdp_config.param_offload=${all_offload}
+ )
+
+ # Rollout Configuration
+ ROLLOUT_CONFIG=(
+     # Rollout Engine
+     actor_rollout_ref.rollout.name=sglang
+     +actor_rollout_ref.rollout.engine_kwargs.sglang.attention_backend="ascend"
+     # Generation Parameters
+     actor_rollout_ref.rollout.n=${n_resp_per_prompt}
+     actor_rollout_ref.rollout.top_p=1.0
+     actor_rollout_ref.rollout.top_k=-1
+     actor_rollout_ref.rollout.temperature=1.0
+     # Log Probability Inference
+     actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1
+     actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz}
+     # Memory Management
+     actor_rollout_ref.rollout.gpu_memory_utilization=${gpu_memory_utilization}
+     actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp}
+     actor_rollout_ref.rollout.data_parallel_size=${gen_dp}
+     actor_rollout_ref.rollout.expert_parallel_size=${gen_ep}
+     actor_rollout_ref.rollout.enable_chunked_prefill=False
+     actor_rollout_ref.rollout.multi_stage_wake_up=True
+     # Validation Generation
+     actor_rollout_ref.rollout.val_kwargs.n=1
+     actor_rollout_ref.rollout.val_kwargs.do_sample=True
+     actor_rollout_ref.rollout.val_kwargs.top_p=1.0
+     actor_rollout_ref.rollout.val_kwargs.top_k=-1
+     actor_rollout_ref.rollout.val_kwargs.temperature=1.0
+     actor_rollout_ref.nccl_timeout=1800
+ )
+
+ # Trainer Configuration
+ TRAINER_CONFIG=(
+     trainer.logger='["console"]'
+     trainer.project_name="${project_name}"
+     trainer.experiment_name="${exp_name}"
+     trainer.nnodes="${NNODES}"
+     trainer.n_gpus_per_node="${NPUS_PER_NODE}"
+     trainer.total_epochs=5
+     trainer.val_before_train=False
+     trainer.test_freq=-1
+     trainer.save_freq=100
+     trainer.default_local_dir="${CKPTS_DIR}"
+     trainer.critic_warmup=0
+ )
+
+ # Main GRPO Training Command
+ # Add the reward function processing for the DeepScaler dataset here
+ python3 -m verl.trainer.main_ppo \
+     --config-path=config \
+     --config-name='ppo_trainer.yaml' \
+     custom_reward_function.path=recipe/r1_ascend/deepscaler.py \
+     custom_reward_function.name=compute_score \
+     "${DATA_CONFIG[@]}" \
+     "${MODEL_CONFIG[@]}" \
+     "${ACTOR_CONFIG[@]}" \
+     "${REF_CONFIG[@]}" \
+     "${ROLLOUT_CONFIG[@]}" \
+     "${ALGORITHM_CONFIG[@]}" \
+     "${TRAINER_CONFIG[@]}" \
+     "$@" | tee logs/run_qwen2_5-32b_grpo_fsdp_sglang_npu.log
code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2-7b.sh ADDED
@@ -0,0 +1,41 @@
+ set -x
+
+ python3 -m verl.trainer.main_ppo \
+     algorithm.adv_estimator=grpo \
+     data.train_files=$HOME/data/gsm8k/train.parquet \
+     data.val_files=$HOME/data/gsm8k/test.parquet \
+     data.train_batch_size=1024 \
+     data.max_prompt_length=512 \
+     data.max_response_length=1024 \
+     data.filter_overlong_prompts=True \
+     data.truncation='error' \
+     actor_rollout_ref.model.path=Qwen/Qwen2-7B-Instruct \
+     actor_rollout_ref.actor.optim.lr=1e-6 \
+     actor_rollout_ref.model.use_remove_padding=True \
+     actor_rollout_ref.actor.ppo_mini_batch_size=256 \
+     actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=40 \
+     actor_rollout_ref.actor.use_kl_loss=True \
+     actor_rollout_ref.actor.kl_loss_coef=0.001 \
+     actor_rollout_ref.actor.kl_loss_type=low_var_kl \
+     actor_rollout_ref.actor.entropy_coeff=0 \
+     actor_rollout_ref.model.enable_gradient_checkpointing=True \
+     actor_rollout_ref.actor.fsdp_config.param_offload=False \
+     actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
+     actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=40 \
+     actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
+     actor_rollout_ref.rollout.name=vllm \
+     actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
+     actor_rollout_ref.rollout.n=5 \
+     actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=40 \
+     actor_rollout_ref.ref.fsdp_config.param_offload=True \
+     algorithm.use_kl_in_reward=False \
+     trainer.critic_warmup=0 \
+     trainer.logger='["console","wandb"]' \
+     trainer.project_name='verl_grpo_example_gsm8k' \
+     trainer.experiment_name='qwen2_7b_function_rm' \
+     trainer.n_gpus_per_node=8 \
+     trainer.nnodes=1 \
+     trainer.save_freq=20 \
+     trainer.test_freq=5 \
+     trainer.total_epochs=15 $@
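+
+ # The trailing "$@" forwards extra Hydra overrides, so any setting above can be
+ # changed at launch time, e.g. (illustrative values):
+ #   bash run_qwen2-7b.sh trainer.total_epochs=1 actor_rollout_ref.rollout.n=8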
code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2-7b_math.sh ADDED
@@ -0,0 +1,49 @@
+ set -x
+
+ gsm8k_train_path=$HOME/data/gsm8k/train.parquet
+ gsm8k_test_path=$HOME/data/gsm8k/test.parquet
+ math_train_path=$HOME/data/math/train.parquet
+ math_test_path=$HOME/data/math/test.parquet
+
+ train_files="['$gsm8k_train_path', '$math_train_path']"
+ test_files="['$gsm8k_test_path', '$math_test_path']"
+
+ python3 -m verl.trainer.main_ppo \
+     algorithm.adv_estimator=grpo \
+     data.train_files="$train_files" \
+     data.val_files="$test_files" \
+     data.train_batch_size=1024 \
+     data.max_prompt_length=1024 \
+     data.max_response_length=1024 \
+     data.filter_overlong_prompts=True \
+     data.truncation='error' \
+     actor_rollout_ref.model.path=Qwen/Qwen2-7B-Instruct \
+     actor_rollout_ref.actor.optim.lr=1e-6 \
+     actor_rollout_ref.model.use_remove_padding=True \
+     actor_rollout_ref.actor.ppo_mini_batch_size=256 \
+     actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \
+     actor_rollout_ref.actor.use_kl_loss=True \
+     actor_rollout_ref.actor.kl_loss_coef=0.001 \
+     actor_rollout_ref.actor.kl_loss_type=low_var_kl \
+     actor_rollout_ref.actor.entropy_coeff=0 \
+     actor_rollout_ref.model.enable_gradient_checkpointing=True \
+     actor_rollout_ref.actor.fsdp_config.param_offload=False \
+     actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
+     actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \
+     actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
+     actor_rollout_ref.rollout.name=vllm \
+     actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
+     actor_rollout_ref.rollout.n=5 \
+     actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \
+     actor_rollout_ref.ref.fsdp_config.param_offload=True \
+     algorithm.use_kl_in_reward=False \
+     trainer.critic_warmup=0 \
+     trainer.logger='["console","wandb"]' \
+     trainer.project_name='verl_grpo_example_gsm8k_math' \
+     trainer.experiment_name='qwen2_7b_function_rm' \
+     trainer.n_gpus_per_node=8 \
+     trainer.nnodes=1 \
+     trainer.save_freq=20 \
+     trainer.test_freq=5 \
+     trainer.total_epochs=15 $@
code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2-7b_math_megatron.sh ADDED
@@ -0,0 +1,59 @@
+ set -x
+
+ export CUDA_DEVICE_MAX_CONNECTIONS=1 # For megatron communication/computation overlapping
+
+ rollout_mode="async"
+ export VLLM_USE_V1=1
+ return_raw_chat="True"
+
+ gsm8k_train_path=$HOME/data/gsm8k/train.parquet
+ gsm8k_test_path=$HOME/data/gsm8k/test.parquet
+ math_train_path=$HOME/data/math/train.parquet
+ math_test_path=$HOME/data/math/test.parquet
+
+ train_files="['$gsm8k_train_path', '$math_train_path']"
+ test_files="['$gsm8k_test_path', '$math_test_path']"
+
+ USE_FUSED_KERNELS=True
+
+ python3 -m verl.trainer.main_ppo --config-path=config \
+     --config-name='ppo_megatron_trainer.yaml' \
+     algorithm.adv_estimator=grpo \
+     data.train_files="$train_files" \
+     data.val_files="$test_files" \
+     data.return_raw_chat=$return_raw_chat \
+     data.train_batch_size=1024 \
+     data.max_prompt_length=1024 \
+     data.max_response_length=1024 \
+     data.filter_overlong_prompts=True \
+     data.truncation='error' \
+     actor_rollout_ref.model.path=Qwen/Qwen2-7B-Instruct \
+     actor_rollout_ref.model.use_fused_kernels=$USE_FUSED_KERNELS \
+     actor_rollout_ref.actor.optim.lr=1e-6 \
+     actor_rollout_ref.actor.ppo_mini_batch_size=256 \
+     actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
+     actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=2 \
+     actor_rollout_ref.actor.megatron.tensor_model_parallel_size=2 \
+     actor_rollout_ref.actor.use_kl_loss=True \
+     actor_rollout_ref.actor.kl_loss_coef=0.001 \
+     actor_rollout_ref.actor.kl_loss_type=low_var_kl \
+     actor_rollout_ref.actor.entropy_coeff=0 \
+     actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
+     actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
+     actor_rollout_ref.rollout.name=vllm \
+     actor_rollout_ref.rollout.mode=$rollout_mode \
+     actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
+     actor_rollout_ref.rollout.n=5 \
+     actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
+     actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=2 \
+     actor_rollout_ref.ref.megatron.tensor_model_parallel_size=2 \
+     algorithm.use_kl_in_reward=False \
+     trainer.critic_warmup=0 \
+     trainer.logger='["console","wandb"]' \
+     trainer.project_name='verl_grpo_example_gsm8k_math' \
+     trainer.experiment_name='qwen2_7b_megatron' \
+     trainer.n_gpus_per_node=8 \
+     trainer.nnodes=1 \
+     trainer.save_freq=20 \
+     trainer.test_freq=5 \
+     trainer.total_epochs=15 $@
code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2-7b_math_megatron_lora.sh ADDED
@@ -0,0 +1,122 @@
+ #!/usr/bin/env bash
+ set -xeuo pipefail
+
+ # Megatron-Bridge must be installed.
+ # NOTE: Make sure you use a Megatron-Bridge release later than 0.2.0
+ # (https://github.com/NVIDIA-NeMo/Megatron-Bridge/commit/83a7c1134c562d8c6decd10a1f0a6e6a7a8a3a44 or later is recommended)
+ # for proper MoE LoRA support.
+
+ # For Megatron communication/computation overlapping
+ export CUDA_DEVICE_MAX_CONNECTIONS=1
+
+ ############################ Quick Config ############################
+
+ rollout_name="vllm" # sglang or vllm
+ project_name='verl_grpo_example_gsm8k_math'
+ exp_name='qwen2_7b_megatron_lora'
+
+ adv_estimator=grpo
+
+ max_prompt_length=1024
+ max_response_length=1024
+ train_prompt_bsz=128
+
+ ############################ Paths ############################
+
+ gsm8k_train_path=$HOME/data/gsm8k/train.parquet
+ gsm8k_test_path=$HOME/data/gsm8k/test.parquet
+ math_train_path=$HOME/data/math/train.parquet
+ math_test_path=$HOME/data/math/test.parquet
+
+ train_files="['$gsm8k_train_path', '$math_train_path']"
+ test_files="['$gsm8k_test_path', '$math_test_path']"
+
+ ############################ Parameter Groups ############################
+
+ DATA=(
+     data.train_files="$train_files"
+     data.val_files="$test_files"
+     data.max_prompt_length=$max_prompt_length
+     data.max_response_length=$max_response_length
+     data.train_batch_size=$train_prompt_bsz
+     data.filter_overlong_prompts=True
+     data.truncation='error'
+     data.shuffle=False
+ )
+
+ MODEL=(
+     actor_rollout_ref.model.path=Qwen/Qwen2-7B-Instruct
+     actor_rollout_ref.model.lora.rank=256
+     actor_rollout_ref.model.lora.alpha=512
+     actor_rollout_ref.model.lora.lora_A_init_method=kaiming
+     # # Optional: Use canonical LoRA
+     # actor_rollout_ref.model.lora.type="canonical_lora"
+     # actor_rollout_ref.model.lora.target_modules='["linear_q","linear_k","linear_v","linear_proj","linear_fc1_up","linear_fc1_gate","linear_fc2"]'
+
+     # # Optional: Add dropout to LoRA layers
+     # actor_rollout_ref.model.lora.dropout=0.05
+     # actor_rollout_ref.model.lora.dropout_position=pre
+ )
+
+ ACTOR=(
+     actor_rollout_ref.actor.optim.lr=1e-6
+     actor_rollout_ref.actor.ppo_mini_batch_size=16
+     actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2
+     actor_rollout_ref.actor.use_dynamic_bsz=True
+     actor_rollout_ref.actor.megatron.use_mbridge=True
+     actor_rollout_ref.actor.megatron.vanilla_mbridge=False
+     actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=1
+     actor_rollout_ref.actor.megatron.tensor_model_parallel_size=4
+     actor_rollout_ref.actor.use_kl_loss=True
+     actor_rollout_ref.actor.kl_loss_coef=0.001
+     actor_rollout_ref.actor.kl_loss_type=low_var_kl
+     actor_rollout_ref.actor.entropy_coeff=0
+     +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_method=uniform
+     +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=full
+     +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_num_layers=1
+ )
+
+ ROLLOUT=(
+     actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4
+     actor_rollout_ref.rollout.tensor_model_parallel_size=2
+     actor_rollout_ref.rollout.name=$rollout_name
+     actor_rollout_ref.rollout.gpu_memory_utilization=0.6
+     actor_rollout_ref.rollout.n=4
+ )
+
+ REF=(
+     actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4
+     actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=1
+     actor_rollout_ref.ref.megatron.tensor_model_parallel_size=4
+ )
+
+ ALGORITHM=(
+     algorithm.adv_estimator=$adv_estimator
+     algorithm.use_kl_in_reward=False
+ )
+
+ TRAINER=(
+     trainer.logger='["console","wandb"]'
+     trainer.project_name=$project_name
+     trainer.experiment_name=$exp_name
+     trainer.n_gpus_per_node=8
+     trainer.nnodes=1
+     trainer.save_freq=20
+     trainer.test_freq=5
+     trainer.total_epochs=15
+     trainer.val_before_train=False
+ )
+
+ ############################ Launch ############################
+
+ python3 -m verl.trainer.main_ppo \
+     --config-path=config \
+     --config-name='ppo_megatron_trainer.yaml' \
+     "${DATA[@]}" \
+     "${ALGORITHM[@]}" \
+     "${MODEL[@]}" \
+     "${ROLLOUT[@]}" \
+     "${ACTOR[@]}" \
+     "${REF[@]}" \
+     "${TRAINER[@]}" \
+     "$@"
code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2-7b_math_megatron_trtllm.sh ADDED
@@ -0,0 +1,91 @@
+ set -x
+
+ export CUDA_DEVICE_MAX_CONNECTIONS=1 # For megatron communication/computation overlapping
+
+ # Clean all Slurm / MPI / PMIx env vars to avoid PMIx mismatch errors
+ for v in $(env | awk -F= '/^(PMI|PMIX|MPI|OMPI|SLURM)_/{print $1}'); do
+     unset "$v"
+ done
+
+ export RAY_DEDUP_LOGS=0
+
+ # -----
+ # Config
+ # -----
+ TP=${1:-4}
+ ACTOR_TP=${ACTOR_TP:-4}
+ PROJECT_NAME=${PROJECT_NAME:-"verl_grpo_example_gsm8k_math"}
+ EXP_NAME=megatron-trtllm-qwen2-7b-tp${TP}-8gpus
+
+ if [ "$TP" -eq 4 ]; then
+     MAX_BATCH_SIZE=1024
+ else
+     MAX_BATCH_SIZE=384
+ fi
+
+ # -----
+ # Data
+ # -----
+ DATADIR=${DATADIR:-$PWD/data}
+
+ GSM8K_TRAIN_PATH=${DATADIR}/gsm8k/train.parquet
+ GSM8K_TEST_PATH=${DATADIR}/gsm8k/test.parquet
+ MATH_TRAIN_PATH=${DATADIR}/math/train.parquet
+ MATH_TEST_PATH=${DATADIR}/math/test.parquet
+
+ TRAIN_FILES="['$GSM8K_TRAIN_PATH', '$MATH_TRAIN_PATH']"
+ TEST_FILES="['$GSM8K_TEST_PATH', '$MATH_TEST_PATH']"
+
+ USE_FUSED_KERNELS=True
+
+ # -----
+ # Launch
+ # -----
+ python3 -m verl.trainer.main_ppo --config-path=config \
+     --config-name='ppo_megatron_trainer.yaml' \
+     algorithm.adv_estimator=grpo \
+     data.train_files="$TRAIN_FILES" \
+     data.val_files="$TEST_FILES" \
+     data.return_raw_chat=True \
+     data.train_batch_size=1024 \
+     data.max_prompt_length=2048 \
+     data.max_response_length=1024 \
+     data.filter_overlong_prompts=True \
+     data.truncation='error' \
+     actor_rollout_ref.model.path=Qwen/Qwen2-7B-Instruct \
+     actor_rollout_ref.model.use_fused_kernels=$USE_FUSED_KERNELS \
+     actor_rollout_ref.actor.optim.lr=1e-6 \
+     actor_rollout_ref.actor.ppo_mini_batch_size=256 \
+     actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=8 \
+     actor_rollout_ref.actor.megatron.use_mbridge=True \
+     actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${ACTOR_TP} \
+     actor_rollout_ref.actor.use_kl_loss=True \
+     actor_rollout_ref.actor.kl_loss_coef=0.001 \
+     actor_rollout_ref.actor.kl_loss_type=low_var_kl \
+     actor_rollout_ref.actor.entropy_coeff=0 \
+     actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8 \
+     actor_rollout_ref.rollout.tensor_model_parallel_size=${TP} \
+     actor_rollout_ref.rollout.name=trtllm \
+     actor_rollout_ref.rollout.mode="async" \
+     actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
+     actor_rollout_ref.rollout.n=5 \
+     actor_rollout_ref.rollout.max_num_seqs=${MAX_BATCH_SIZE} \
+     actor_rollout_ref.rollout.max_num_batched_tokens=32768 \
+     actor_rollout_ref.rollout.checkpoint_engine.update_weights_bucket_megabytes=4096 \
+     actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=8 \
+     actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${ACTOR_TP} \
+     +actor_rollout_ref.rollout.engine_kwargs.trtllm.batch_wait_timeout_iters=32 \
+     +actor_rollout_ref.rollout.engine_kwargs.trtllm.batch_wait_max_tokens_ratio=0.5 \
+     actor_rollout_ref.rollout.calculate_log_probs=True \
+     algorithm.use_kl_in_reward=False \
+     trainer.critic_warmup=0 \
+     trainer.logger='["console","wandb"]' \
+     trainer.project_name="${PROJECT_NAME}" \
+     trainer.experiment_name=${EXP_NAME} \
+     trainer.n_gpus_per_node=8 \
+     trainer.nnodes=1 \
+     trainer.save_freq=-1 \
+     trainer.test_freq=5 \
+     trainer.resume_mode=disable \
+     trainer.total_epochs=15 \
+     "${@:2}"
code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2-7b_math_trtllm.sh ADDED
@@ -0,0 +1,89 @@
+ set -x
+
+ # Clean all Slurm / MPI / PMIx env vars to avoid PMIx mismatch errors
+ for v in $(env | awk -F= '/^(PMI|PMIX|MPI|OMPI|SLURM)_/{print $1}'); do
+     unset "$v"
+ done
+
+ export RAY_DEDUP_LOGS=0
+
+ # -----
+ # Config
+ # -----
+ TP=${1:-4}
+ PROJECT_NAME=${PROJECT_NAME:-"verl_grpo_example_gsm8k_math"}
+ EXP_NAME=trtllm-qwen2-7b-tp${TP}-8gpus${EXP_NAME_SUFFIX:+"-"}${EXP_NAME_SUFFIX}
+
+ if [ "$TP" -eq 4 ]; then
+     MAX_BATCH_SIZE=1024
+ else
+     MAX_BATCH_SIZE=384
+ fi
+
+ # -----
+ # Data
+ # -----
+ DATADIR=${DATADIR:-$PWD/data}
+ MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen2-7B-Instruct"}
+
+ GSM8K_TRAIN_PATH=${DATADIR}/gsm8k/train.parquet
+ GSM8K_TEST_PATH=${DATADIR}/gsm8k/test.parquet
+ MATH_TRAIN_PATH=${DATADIR}/math/train.parquet
+ MATH_TEST_PATH=${DATADIR}/math/test.parquet
+
+ TRAIN_FILES="['$GSM8K_TRAIN_PATH', '$MATH_TRAIN_PATH']"
+ TEST_FILES="['$GSM8K_TEST_PATH', '$MATH_TEST_PATH']"
+
+ # -----
+ # Launch
+ # -----
+ python3 -m verl.trainer.main_ppo \
+     algorithm.adv_estimator=grpo \
+     algorithm.rollout_correction.rollout_is_threshold=2.0 \
+     data.train_files="$TRAIN_FILES" \
+     data.val_files="$TEST_FILES" \
+     data.train_batch_size=1024 \
+     data.max_prompt_length=2048 \
+     data.max_response_length=1024 \
+     data.return_raw_chat=True \
+     data.filter_overlong_prompts=True \
+     data.truncation='error' \
+     actor_rollout_ref.hybrid_engine=True \
+     actor_rollout_ref.model.path=${MODEL_PATH} \
+     actor_rollout_ref.actor.optim.lr=1e-6 \
+     actor_rollout_ref.model.use_remove_padding=True \
+     actor_rollout_ref.actor.ppo_mini_batch_size=256 \
+     actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \
+     actor_rollout_ref.actor.use_kl_loss=True \
+     actor_rollout_ref.actor.kl_loss_coef=0.001 \
+     actor_rollout_ref.actor.kl_loss_type=low_var_kl \
+     actor_rollout_ref.actor.entropy_coeff=0 \
+     actor_rollout_ref.model.enable_gradient_checkpointing=True \
+     actor_rollout_ref.actor.fsdp_config.param_offload=False \
+     actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
+     actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \
+     actor_rollout_ref.rollout.tensor_model_parallel_size=${TP} \
+     actor_rollout_ref.rollout.name=trtllm \
+     actor_rollout_ref.rollout.mode="async" \
+     actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
+     actor_rollout_ref.rollout.n=5 \
+     actor_rollout_ref.rollout.max_num_seqs=${MAX_BATCH_SIZE} \
+     actor_rollout_ref.rollout.max_num_batched_tokens=32768 \
+     +actor_rollout_ref.rollout.engine_kwargs.trtllm.batch_wait_timeout_iters=32 \
+     +actor_rollout_ref.rollout.engine_kwargs.trtllm.batch_wait_max_tokens_ratio=0.5 \
+     actor_rollout_ref.rollout.calculate_log_probs=True \
+     actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \
+     actor_rollout_ref.ref.fsdp_config.param_offload=True \
+     actor_rollout_ref.rollout.checkpoint_engine.update_weights_bucket_megabytes=4096 \
+     algorithm.use_kl_in_reward=False \
+     trainer.critic_warmup=0 \
+     trainer.logger='["console","wandb"]' \
+     trainer.project_name="${PROJECT_NAME}" \
+     trainer.experiment_name=${EXP_NAME} \
+     trainer.n_gpus_per_node=8 \
+     trainer.nnodes=1 \
+     trainer.save_freq=-1 \
+     trainer.test_freq=5 \
+     trainer.resume_mode=disable \
+     trainer.total_epochs=15 \
+     "${@:2}"
code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2-7b_seq_balance.sh ADDED
@@ -0,0 +1,52 @@
+ set -x
+
+ # For async rollout mode, the dataset must return the raw chat.
+ rollout_mode="async"
+ rollout_name="sglang" # sglang or vllm
+ return_raw_chat="True"
+ if [ "$rollout_name" = "vllm" ]; then
+     export VLLM_USE_V1=1
+ fi
+
+ python3 -m verl.trainer.main_ppo \
+     algorithm.adv_estimator=grpo \
+     data.train_files=$HOME/data/gsm8k/train.parquet \
+     data.val_files=$HOME/data/gsm8k/test.parquet \
+     data.return_raw_chat=$return_raw_chat \
+     data.train_batch_size=1024 \
+     data.max_prompt_length=512 \
+     data.max_response_length=1024 \
+     data.filter_overlong_prompts=True \
+     data.truncation='error' \
+     actor_rollout_ref.model.path=Qwen/Qwen2-7B-Instruct \
+     actor_rollout_ref.actor.optim.lr=1e-6 \
+     actor_rollout_ref.model.use_remove_padding=True \
+     actor_rollout_ref.actor.ppo_mini_batch_size=256 \
+     actor_rollout_ref.actor.use_dynamic_bsz=True \
+     actor_rollout_ref.actor.ppo_max_token_len_per_gpu=24000 \
+     actor_rollout_ref.actor.use_kl_loss=True \
+     actor_rollout_ref.actor.kl_loss_coef=0.001 \
+     actor_rollout_ref.actor.kl_loss_type=low_var_kl \
+     actor_rollout_ref.actor.entropy_coeff=0 \
+     actor_rollout_ref.model.enable_gradient_checkpointing=True \
+     actor_rollout_ref.actor.fsdp_config.param_offload=False \
+     actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
+     actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
+     actor_rollout_ref.rollout.name=$rollout_name \
+     actor_rollout_ref.rollout.mode=$rollout_mode \
+     actor_rollout_ref.rollout.multi_turn.format=hermes \
+     actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
+     actor_rollout_ref.rollout.n=5 \
+     actor_rollout_ref.ref.fsdp_config.param_offload=True \
+     algorithm.use_kl_in_reward=False \
+     trainer.critic_warmup=0 \
+     trainer.logger='["console","wandb"]' \
+     trainer.project_name='verl_grpo_example_gsm8k' \
+     trainer.experiment_name='qwen2_7b_function_rm_kl1e-3' \
+     trainer.val_before_train=False \
+     trainer.n_gpus_per_node=8 \
+     trainer.nnodes=1 \
+     trainer.save_freq=20 \
+     trainer.test_freq=5 \
+     trainer.total_epochs=15 $@
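+
+ # Sanity-check sketch (assumes dynamic batching packs whole sequences): the
+ # per-GPU token budget must cover at least one prompt+response; here
+ # 24000 / (512 + 1024) gives roughly 15 full-length sequences per micro batch.
+ #   max_len=$((512 + 1024))   # max_prompt_length + max_response_length
+ #   [ 24000 -ge "$max_len" ] || echo "increase ppo_max_token_len_per_gpu"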
code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2-7b_seq_balance_math_megatron.sh ADDED
@@ -0,0 +1,57 @@
+ set -x
+
+ export CUDA_DEVICE_MAX_CONNECTIONS=1 # For megatron communication/computation overlapping
+
+ gsm8k_train_path=$HOME/data/gsm8k/train.parquet
+ gsm8k_test_path=$HOME/data/gsm8k/test.parquet
+ math_train_path=$HOME/data/math/train.parquet
+ math_test_path=$HOME/data/math/test.parquet
+
+ train_files="['$gsm8k_train_path', '$math_train_path']"
+ test_files="['$gsm8k_test_path', '$math_test_path']"
+
+ offload=True
+
+ python3 -m verl.trainer.main_ppo --config-path=config \
+     --config-name='ppo_megatron_trainer.yaml' \
+     algorithm.adv_estimator=grpo \
+     data.train_files="$train_files" \
+     data.val_files="$test_files" \
+     data.train_batch_size=1024 \
+     data.max_prompt_length=1024 \
+     data.max_response_length=1024 \
+     data.filter_overlong_prompts=True \
+     data.truncation='error' \
+     actor_rollout_ref.model.path=Qwen/Qwen2-7B-Instruct \
+     actor_rollout_ref.actor.optim.lr=1e-6 \
+     actor_rollout_ref.actor.ppo_mini_batch_size=256 \
+     actor_rollout_ref.actor.use_dynamic_bsz=True \
+     actor_rollout_ref.actor.ppo_max_token_len_per_gpu=12000 \
+     actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=2 \
+     actor_rollout_ref.actor.megatron.tensor_model_parallel_size=2 \
+     actor_rollout_ref.actor.use_kl_loss=True \
+     actor_rollout_ref.actor.kl_loss_coef=0.001 \
+     actor_rollout_ref.actor.kl_loss_type=low_var_kl \
+     actor_rollout_ref.actor.entropy_coeff=0 \
+     actor_rollout_ref.actor.megatron.param_offload=${offload} \
+     actor_rollout_ref.actor.megatron.optimizer_offload=${offload} \
+     actor_rollout_ref.actor.megatron.grad_offload=${offload} \
+     actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
+     actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
+     actor_rollout_ref.ref.megatron.param_offload=${offload} \
+     actor_rollout_ref.rollout.name=vllm \
+     actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
+     actor_rollout_ref.rollout.n=5 \
+     actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
+     actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=2 \
+     actor_rollout_ref.ref.megatron.tensor_model_parallel_size=2 \
+     algorithm.use_kl_in_reward=False \
+     trainer.critic_warmup=0 \
+     trainer.logger='["console","wandb"]' \
+     trainer.project_name='verl_grpo_example_gsm8k_math' \
+     trainer.experiment_name='qwen2_7b_megatron' \
+     trainer.n_gpus_per_node=8 \
+     trainer.nnodes=1 \
+     trainer.save_freq=20 \
+     trainer.test_freq=5 \
+     trainer.total_epochs=15 $@
code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2_5-3b_gsm8k_grpo_lora.sh ADDED
@@ -0,0 +1,51 @@
+ set -x
+
+ python3 -m verl.trainer.main_ppo \
+     algorithm.adv_estimator=grpo \
+     trainer.val_before_train=False \
+     data.train_files=$HOME/data/gsm8k/train.parquet \
+     data.val_files=$HOME/data/gsm8k/test.parquet \
+     data.train_batch_size=16 \
+     data.max_prompt_length=512 \
+     data.max_response_length=1024 \
+     data.filter_overlong_prompts=True \
+     data.truncation='error' \
+     data.shuffle=False \
+     actor_rollout_ref.model.path=Qwen/Qwen2.5-3B-Instruct \
+     actor_rollout_ref.model.lora_rank=64 \
+     actor_rollout_ref.model.lora_alpha=32 \
+     actor_rollout_ref.actor.optim.lr=3e-6 \
+     actor_rollout_ref.model.use_remove_padding=True \
+     actor_rollout_ref.actor.ppo_mini_batch_size=16 \
+     actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=40 \
+     actor_rollout_ref.actor.use_kl_loss=True \
+     actor_rollout_ref.actor.kl_loss_coef=0.001 \
+     actor_rollout_ref.actor.kl_loss_type=low_var_kl \
+     actor_rollout_ref.actor.entropy_coeff=0 \
+     actor_rollout_ref.model.enable_gradient_checkpointing=True \
+     actor_rollout_ref.actor.fsdp_config.param_offload=False \
+     actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
+     actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=40 \
+     actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
+     actor_rollout_ref.rollout.name=vllm \
+     actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
+     actor_rollout_ref.rollout.n=5 \
+     actor_rollout_ref.rollout.load_format=safetensors \
+     actor_rollout_ref.rollout.layered_summon=True \
+     actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=40 \
+     actor_rollout_ref.ref.fsdp_config.param_offload=True \
+     algorithm.use_kl_in_reward=False \
+     trainer.critic_warmup=0 \
+     trainer.logger='["console","wandb"]' \
+     trainer.project_name='verl_grpo_example_gsm8k' \
+     trainer.experiment_name='qwen2.5_3b_grpo_lora' \
+     trainer.n_gpus_per_node=2 \
+     trainer.nnodes=1 \
+     trainer.save_freq=20 \
+     trainer.test_freq=5 \
+     trainer.total_epochs=15 $@
+
+ # Example overrides for scaling up to an 8-GPU node (kept for reference):
+ # actor_rollout_ref.actor.ppo_mini_batch_size=256 \
+ # data.train_batch_size=1024 \
+ # trainer.n_gpus_per_node=8 \
+ # actor_rollout_ref.model.use_shm=True \
code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2_5-7b_math_megatron_diff_tp.sh ADDED
@@ -0,0 +1,50 @@
+ set -x
+
+ export CUDA_DEVICE_MAX_CONNECTIONS=1 # For megatron communication/computation overlapping
+
+ gsm8k_train_path=$HOME/data/gsm8k/train.parquet
+ gsm8k_test_path=$HOME/data/gsm8k/test.parquet
+ math_train_path=$HOME/data/math/train.parquet
+ math_test_path=$HOME/data/math/test.parquet
+
+ train_files="['$gsm8k_train_path', '$math_train_path']"
+ test_files="['$gsm8k_test_path', '$math_test_path']"
+
+ # Note: the rollout TP size (4) intentionally differs from the actor/ref training TP size (2).
+ python3 -m verl.trainer.main_ppo --config-path=config \
+     --config-name='ppo_megatron_trainer.yaml' \
+     algorithm.adv_estimator=grpo \
+     data.train_files="$train_files" \
+     data.val_files="$test_files" \
+     data.train_batch_size=1024 \
+     data.max_prompt_length=1024 \
+     data.max_response_length=1024 \
+     data.filter_overlong_prompts=True \
+     data.truncation='error' \
+     actor_rollout_ref.model.path=Qwen/Qwen2.5-7B-Instruct \
+     actor_rollout_ref.actor.optim.lr=1e-6 \
+     actor_rollout_ref.actor.ppo_mini_batch_size=256 \
+     actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
+     actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=2 \
+     actor_rollout_ref.actor.megatron.tensor_model_parallel_size=2 \
+     actor_rollout_ref.actor.use_kl_loss=True \
+     actor_rollout_ref.actor.kl_loss_coef=0.001 \
+     actor_rollout_ref.actor.kl_loss_type=low_var_kl \
+     actor_rollout_ref.actor.entropy_coeff=0 \
+     actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
+     actor_rollout_ref.rollout.tensor_model_parallel_size=4 \
+     actor_rollout_ref.rollout.name=vllm \
+     actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
+     actor_rollout_ref.rollout.n=5 \
+     actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
+     actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=2 \
+     actor_rollout_ref.ref.megatron.tensor_model_parallel_size=2 \
+     algorithm.use_kl_in_reward=False \
+     trainer.critic_warmup=0 \
+     trainer.logger='["console","wandb"]' \
+     trainer.project_name='verl_grpo_example_gsm8k_math' \
+     trainer.experiment_name='qwen2_7b_megatron' \
+     trainer.n_gpus_per_node=8 \
+     trainer.nnodes=1 \
+     trainer.save_freq=20 \
+     trainer.test_freq=5 \
+     trainer.total_epochs=15 $@
code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2_5_32b_grpo_npu.sh ADDED
@@ -0,0 +1,40 @@
+ set -x
+
+ python3 -m verl.trainer.main_ppo \
+     algorithm.adv_estimator=grpo \
+     data.train_files=$HOME/data/gsm8k/train.parquet \
+     data.val_files=$HOME/data/gsm8k/test.parquet \
+     data.train_batch_size=1024 \
+     data.max_prompt_length=1024 \
+     data.max_response_length=1024 \
+     data.filter_overlong_prompts=True \
+     data.truncation='error' \
+     actor_rollout_ref.model.path=Qwen/Qwen2.5-32B-Instruct \
+     actor_rollout_ref.actor.optim.lr=1e-6 \
+     actor_rollout_ref.model.use_remove_padding=False \
+     actor_rollout_ref.actor.ppo_mini_batch_size=128 \
+     actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \
+     actor_rollout_ref.actor.use_kl_loss=True \
+     actor_rollout_ref.actor.entropy_coeff=0 \
+     actor_rollout_ref.actor.kl_loss_coef=0.001 \
+     actor_rollout_ref.actor.kl_loss_type=low_var_kl \
+     actor_rollout_ref.model.enable_gradient_checkpointing=True \
+     actor_rollout_ref.actor.fsdp_config.param_offload=False \
+     actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
+     actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2 \
+     actor_rollout_ref.rollout.tensor_model_parallel_size=8 \
+     actor_rollout_ref.rollout.name=vllm \
+     actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
+     actor_rollout_ref.rollout.n=5 \
+     actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=2 \
+     actor_rollout_ref.ref.fsdp_config.param_offload=True \
+     algorithm.use_kl_in_reward=False \
+     trainer.critic_warmup=0 \
+     trainer.logger=console \
+     trainer.project_name='verl_grpo_example_gsm8k' \
+     trainer.experiment_name='qwen2_5_32b_function_rm' \
+     trainer.n_gpus_per_node=16 \
+     trainer.nnodes=2 \
+     trainer.save_freq=-1 \
+     trainer.test_freq=10 \
+     trainer.total_epochs=15 $@
code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2_5_7b_grpo_discrete_prof_npu.sh ADDED
@@ -0,0 +1,71 @@
+ set -x
+
+ # profiling configuration
+ PROFILE_STEPS="[2,4]"
+ PROFILE_RANKS_ALL=False
+ DISCRETE=True
+ PROFILE_RANKS="[1,2]"
+
+ # profiling NPU options
+ SAVE_PATH="$HOME/profile_data"
+ LEVEL="level0"
+ CONTENTS="['npu','cpu']"
+ ANALYSIS=True
+
+ python3 -m verl.trainer.main_ppo \
+     algorithm.adv_estimator=grpo \
+     data.train_files=$HOME/data/gsm8k/train.parquet \
+     data.val_files=$HOME/data/gsm8k/test.parquet \
+     data.train_batch_size=32 \
+     data.max_prompt_length=1024 \
+     data.max_response_length=1024 \
+     data.filter_overlong_prompts=True \
+     data.truncation='error' \
+     actor_rollout_ref.model.path=Qwen/Qwen2.5-7B-Instruct \
+     actor_rollout_ref.model.enable_gradient_checkpointing=True \
+     actor_rollout_ref.model.use_remove_padding=False \
+     actor_rollout_ref.actor.optim.lr=5e-8 \
+     actor_rollout_ref.actor.ppo_mini_batch_size=2 \
+     actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \
+     actor_rollout_ref.actor.use_kl_loss=True \
+     actor_rollout_ref.actor.entropy_coeff=0 \
+     actor_rollout_ref.actor.kl_loss_coef=0.001 \
+     actor_rollout_ref.actor.kl_loss_type=low_var_kl \
+     actor_rollout_ref.actor.fsdp_config.param_offload=False \
+     actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
+     actor_rollout_ref.actor.profiler.enable=True \
+     actor_rollout_ref.actor.profiler.ranks=$PROFILE_RANKS \
+     actor_rollout_ref.actor.profiler.all_ranks=$PROFILE_RANKS_ALL \
+     actor_rollout_ref.actor.profiler.tool_config.npu.discrete=$DISCRETE \
+     actor_rollout_ref.actor.profiler.tool_config.npu.contents=$CONTENTS \
+     actor_rollout_ref.actor.profiler.tool_config.npu.level=$LEVEL \
+     actor_rollout_ref.actor.profiler.tool_config.npu.analysis=$ANALYSIS \
+     actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \
+     actor_rollout_ref.rollout.tensor_model_parallel_size=4 \
+     actor_rollout_ref.rollout.name=vllm \
+     actor_rollout_ref.rollout.gpu_memory_utilization=0.3 \
+     actor_rollout_ref.rollout.n=4 \
+     actor_rollout_ref.rollout.enable_chunked_prefill=False \
+     actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \
+     actor_rollout_ref.ref.fsdp_config.param_offload=True \
+     actor_rollout_ref.ref.profiler.enable=True \
+     actor_rollout_ref.ref.profiler.ranks=$PROFILE_RANKS \
+     actor_rollout_ref.ref.profiler.all_ranks=$PROFILE_RANKS_ALL \
+     actor_rollout_ref.ref.profiler.tool_config.npu.discrete=$DISCRETE \
+     actor_rollout_ref.ref.profiler.tool_config.npu.contents=$CONTENTS \
+     actor_rollout_ref.ref.profiler.tool_config.npu.level=$LEVEL \
+     actor_rollout_ref.ref.profiler.tool_config.npu.analysis=$ANALYSIS \
+     algorithm.use_kl_in_reward=False \
+     trainer.critic_warmup=0 \
+     trainer.logger=console \
+     trainer.project_name='verl_grpo_example_gsm8k' \
+     trainer.experiment_name='qwen2_5_7b_function_rm' \
+     trainer.n_gpus_per_node=8 \
+     trainer.nnodes=1 \
+     trainer.save_freq=-1 \
+     trainer.test_freq=5 \
+     trainer.total_epochs=5 \
+     global_profiler.tool=npu \
+     global_profiler.steps=$PROFILE_STEPS \
+     global_profiler.save_path=$SAVE_PATH \
+     $@
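+
+ # Usage sketch (illustrative): the profiled steps and ranks can also be
+ # overridden at launch instead of editing the variables above; traces are
+ # written under $SAVE_PATH:
+ #   bash run_qwen2_5_7b_grpo_discrete_prof_npu.sh \
+ #       global_profiler.steps="[1,3]" actor_rollout_ref.actor.profiler.ranks="[0]"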
code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2_5_7b_grpo_npu.sh ADDED
@@ -0,0 +1,41 @@
+ set -x
+
+ python3 -m verl.trainer.main_ppo \
+     algorithm.adv_estimator=grpo \
+     data.train_files=$HOME/data/gsm8k/train.parquet \
+     data.val_files=$HOME/data/gsm8k/test.parquet \
+     data.train_batch_size=1024 \
+     data.max_prompt_length=1024 \
+     data.max_response_length=1024 \
+     data.filter_overlong_prompts=True \
+     data.truncation='error' \
+     actor_rollout_ref.model.path=Qwen/Qwen2.5-7B-Instruct \
+     actor_rollout_ref.actor.optim.lr=5e-8 \
+     actor_rollout_ref.model.use_remove_padding=False \
+     actor_rollout_ref.actor.ppo_mini_batch_size=32 \
+     actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \
+     actor_rollout_ref.actor.use_kl_loss=True \
+     actor_rollout_ref.actor.entropy_coeff=0 \
+     actor_rollout_ref.actor.kl_loss_coef=0.001 \
+     actor_rollout_ref.actor.kl_loss_type=low_var_kl \
+     actor_rollout_ref.model.enable_gradient_checkpointing=True \
+     actor_rollout_ref.actor.fsdp_config.param_offload=False \
+     actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
+     actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2 \
+     actor_rollout_ref.rollout.tensor_model_parallel_size=4 \
+     actor_rollout_ref.rollout.name=vllm \
+     actor_rollout_ref.rollout.gpu_memory_utilization=0.3 \
+     actor_rollout_ref.rollout.n=5 \
+     actor_rollout_ref.rollout.enable_chunked_prefill=False \
+     actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=2 \
+     actor_rollout_ref.ref.fsdp_config.param_offload=True \
+     algorithm.use_kl_in_reward=False \
+     trainer.critic_warmup=0 \
+     trainer.logger=console \
+     trainer.project_name='verl_grpo_example_gsm8k' \
+     trainer.experiment_name='qwen2_5_7b_function_rm' \
+     trainer.n_gpus_per_node=16 \
+     trainer.nnodes=1 \
+     trainer.save_freq=-1 \
+     trainer.test_freq=5 \
+     trainer.total_epochs=5 $@
code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2_5_vl-7b-megatron.sh ADDED
@@ -0,0 +1,88 @@
+ set -x
+ ENGINE=${1:-vllm}
+ export CUDA_DEVICE_MAX_CONNECTIONS=1 # For megatron communication/computation overlapping
+
+ HF_MODEL_PATH=Qwen/Qwen2.5-VL-7B-Instruct
+ DIST_CKPT_PATH=${DIST_CKPT_PATH}
+
+ # Convert the HF model to Megatron format offline first:
+ # python scripts/converter_hf_to_mcore.py --hf_model_path $HF_MODEL_PATH --output_path $DIST_CKPT_PATH
+
+ # Megatron tuning guide:
+ # 1. Offloading all states by setting ALL_OFFLOAD=True is recommended.
+ # 2. Enable dynamic batch size by setting actor_rollout_ref.actor.use_dynamic_bsz=True, ref.log_prob_use_dynamic_bsz=True, and rollout.log_prob_use_dynamic_bsz=True.
+ # 3. Set ppo_max_token_len_per_gpu and log_prob_max_token_len_per_gpu as large as possible for better MFU (limited by GPU memory). Ensure ppo_max_token_len_per_gpu > max_prompt_length + max_response_length; if the sequence length is too long, increase the TP/PP size.
+ # 4. If memory is very limited, enable full recompute, at the cost of roughly 30% lower MFU.
+ #    Full recompute settings:
+ #    +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_method=uniform \
+ #    +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=full \
+ #    +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_num_layers=1 \
+
+ ALL_OFFLOAD=${ALL_OFFLOAD:-True}
+ COMMON_PARAM_OFFLOAD=${COMMON_PARAM_OFFLOAD:-$ALL_OFFLOAD}
+ COMMON_GRAD_OFFLOAD=${COMMON_GRAD_OFFLOAD:-$ALL_OFFLOAD}
+ COMMON_OPTIMIZER_OFFLOAD=${COMMON_OPTIMIZER_OFFLOAD:-$ALL_OFFLOAD}
+
+ ACTOR_PARAM_OFFLOAD=${ACTOR_PARAM_OFFLOAD:-$COMMON_PARAM_OFFLOAD}
+ ACTOR_GRAD_OFFLOAD=${ACTOR_GRAD_OFFLOAD:-$COMMON_GRAD_OFFLOAD}
+ ACTOR_OPTIMIZER_OFFLOAD=${ACTOR_OPTIMIZER_OFFLOAD:-$COMMON_OPTIMIZER_OFFLOAD}
+ REF_PARAM_OFFLOAD=${REF_PARAM_OFFLOAD:-$COMMON_PARAM_OFFLOAD}
+
+ train_path=$HOME/data/geo3k/train.parquet
+ test_path=$HOME/data/geo3k/test.parquet
+
+ python3 -m verl.trainer.main_ppo --config-path=config \
+     --config-name='ppo_megatron_trainer.yaml' \
+     algorithm.adv_estimator=grpo \
+     data.train_files="$train_path" \
+     data.val_files="$test_path" \
+     data.train_batch_size=512 \
+     data.max_prompt_length=1024 \
+     data.max_response_length=2048 \
+     data.filter_overlong_prompts=True \
+     data.truncation='error' \
+     actor_rollout_ref.model.path=$HF_MODEL_PATH \
+     actor_rollout_ref.actor.optim.lr=1e-6 \
+     actor_rollout_ref.actor.ppo_mini_batch_size=128 \
+     actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
+     actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=1 \
+     actor_rollout_ref.actor.megatron.tensor_model_parallel_size=2 \
+     actor_rollout_ref.actor.use_kl_loss=True \
+     actor_rollout_ref.actor.kl_loss_coef=0.01 \
+     actor_rollout_ref.actor.kl_loss_type=low_var_kl \
+     actor_rollout_ref.actor.entropy_coeff=0 \
+     actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=20 \
+     actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
+     actor_rollout_ref.actor.use_dynamic_bsz=True \
+     actor_rollout_ref.actor.ppo_max_token_len_per_gpu=5120 \
+     actor_rollout_ref.ref.log_prob_use_dynamic_bsz=True \
+     actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=20480 \
+     actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=True \
+     actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=20480 \
+     actor_rollout_ref.rollout.name=$ENGINE \
+     +actor_rollout_ref.rollout.engine_kwargs.vllm.disable_mm_preprocessor_cache=True \
+     actor_rollout_ref.rollout.gpu_memory_utilization=0.8 \
+     actor_rollout_ref.rollout.n=5 \
+     actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=20 \
+     actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=1 \
+     actor_rollout_ref.ref.megatron.tensor_model_parallel_size=2 \
+     actor_rollout_ref.actor.megatron.use_dist_checkpointing=True \
+     actor_rollout_ref.ref.megatron.use_dist_checkpointing=True \
+     actor_rollout_ref.actor.megatron.dist_checkpointing_path=$DIST_CKPT_PATH \
+     actor_rollout_ref.ref.megatron.dist_checkpointing_path=$DIST_CKPT_PATH \
+     actor_rollout_ref.actor.megatron.param_offload=${ACTOR_PARAM_OFFLOAD} \
+     actor_rollout_ref.actor.megatron.optimizer_offload=${ACTOR_OPTIMIZER_OFFLOAD} \
+     actor_rollout_ref.actor.megatron.grad_offload=${ACTOR_GRAD_OFFLOAD} \
+     actor_rollout_ref.ref.megatron.param_offload=${REF_PARAM_OFFLOAD} \
+     algorithm.use_kl_in_reward=False \
+     trainer.critic_warmup=0 \
+     trainer.logger='["console","wandb"]' \
+     trainer.project_name='verl_grpo_example_geo3k' \
+     trainer.experiment_name='qwen2_5_vl_7b_megatron' \
+     trainer.n_gpus_per_node=8 \
+     trainer.nnodes=1 \
+     trainer.save_freq=20 \
+     trainer.test_freq=5 \
+     trainer.total_epochs=15 $@
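+
+ # Usage sketch (illustrative path): run the offline HF -> Megatron conversion
+ # once, then point DIST_CKPT_PATH at the converted weights:
+ #   python scripts/converter_hf_to_mcore.py \
+ #       --hf_model_path Qwen/Qwen2.5-VL-7B-Instruct --output_path /tmp/qwen2_5_vl_mcore
+ #   DIST_CKPT_PATH=/tmp/qwen2_5_vl_mcore bash run_qwen2_5_vl-7b-megatron.sh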
code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2_5_vl-7b-sglang.sh ADDED
@@ -0,0 +1,53 @@
+ set -x
+
+ # python examples/data_preprocess/geo3k.py --local_dir ~/data/geo3k
+
+ python3 -m verl.trainer.main_ppo \
+     algorithm.adv_estimator=grpo \
+     data.train_files=$HOME/data/geo3k/train.parquet \
+     data.val_files=$HOME/data/geo3k/test.parquet \
+     data.train_batch_size=512 \
+     data.max_prompt_length=1024 \
+     data.max_response_length=2048 \
+     data.filter_overlong_prompts=True \
+     data.truncation='error' \
+     data.image_key=images \
+     actor_rollout_ref.model.path=Qwen/Qwen2.5-VL-7B-Instruct \
+     actor_rollout_ref.actor.optim.lr=1e-6 \
+     actor_rollout_ref.model.use_remove_padding=True \
+     actor_rollout_ref.actor.ppo_mini_batch_size=128 \
+     actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=10 \
+     actor_rollout_ref.actor.use_kl_loss=True \
+     actor_rollout_ref.actor.kl_loss_coef=0.01 \
+     actor_rollout_ref.actor.kl_loss_type=low_var_kl \
+     actor_rollout_ref.actor.entropy_coeff=0 \
+     actor_rollout_ref.model.enable_gradient_checkpointing=True \
+     actor_rollout_ref.actor.fsdp_config.param_offload=False \
+     actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
+     actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=20 \
+     actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
+     actor_rollout_ref.rollout.name=sglang \
+     +actor_rollout_ref.rollout.engine_kwargs.vllm.disable_mm_preprocessor_cache=True \
+     actor_rollout_ref.rollout.gpu_memory_utilization=0.85 \
+     actor_rollout_ref.rollout.multi_stage_wake_up=True \
+     global_profiler.tool=torch_memory \
+     global_profiler.save_path=./mem_snapshots \
+     global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries=100000 \
+     global_profiler.global_tool_config.torch_memory.stack_depth=32 \
+     actor_rollout_ref.rollout.enable_chunked_prefill=False \
+     actor_rollout_ref.rollout.enforce_eager=False \
+     actor_rollout_ref.rollout.free_cache_engine=True \
+     actor_rollout_ref.rollout.n=5 \
+     actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=20 \
+     actor_rollout_ref.ref.fsdp_config.param_offload=True \
+     actor_rollout_ref.rollout.mode=async \
+     algorithm.use_kl_in_reward=False \
+     trainer.critic_warmup=0 \
+     trainer.logger='["console","wandb"]' \
+     trainer.project_name='verl_grpo_example_geo3k' \
+     trainer.experiment_name='qwen2_5_vl_7b_function_rm' \
+     trainer.n_gpus_per_node=8 \
+     trainer.nnodes=1 \
+     trainer.save_freq=20 \
+     trainer.test_freq=5 \
+     trainer.total_epochs=15 $@
code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2_5_vl-7b_freeze_vision.sh ADDED
@@ -0,0 +1,47 @@
+ set -x
+ ENGINE=${1:-vllm}
+
+ python3 -m verl.trainer.main_ppo \
+     algorithm.adv_estimator=grpo \
+     data.train_files=$HOME/data/geo3k/train.parquet \
+     data.val_files=$HOME/data/geo3k/test.parquet \
+     data.train_batch_size=512 \
+     data.max_prompt_length=1024 \
+     data.max_response_length=2048 \
+     data.filter_overlong_prompts=True \
+     data.truncation='error' \
+     data.image_key=images \
+     actor_rollout_ref.model.path=Qwen/Qwen2.5-VL-7B-Instruct \
+     actor_rollout_ref.actor.optim.lr=1e-6 \
+     actor_rollout_ref.actor.freeze_vision_tower=True \
+     actor_rollout_ref.model.use_remove_padding=True \
+     actor_rollout_ref.actor.ppo_mini_batch_size=128 \
+     actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=10 \
+     actor_rollout_ref.actor.use_kl_loss=True \
+     actor_rollout_ref.actor.kl_loss_coef=0.01 \
+     actor_rollout_ref.actor.kl_loss_type=low_var_kl \
+     actor_rollout_ref.actor.entropy_coeff=0 \
+     actor_rollout_ref.model.enable_gradient_checkpointing=True \
+     actor_rollout_ref.actor.fsdp_config.param_offload=False \
+     actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
+     actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=20 \
+     actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
+     actor_rollout_ref.rollout.name=$ENGINE \
+     +actor_rollout_ref.rollout.engine_kwargs.vllm.disable_mm_preprocessor_cache=True \
+     actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
+     actor_rollout_ref.rollout.enable_chunked_prefill=False \
+     actor_rollout_ref.rollout.enforce_eager=False \
+     actor_rollout_ref.rollout.free_cache_engine=True \
+     actor_rollout_ref.rollout.n=5 \
+     actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=20 \
+     actor_rollout_ref.ref.fsdp_config.param_offload=True \
+     algorithm.use_kl_in_reward=False \
+     trainer.critic_warmup=0 \
+     trainer.logger='["console","wandb"]' \
+     trainer.project_name='verl_grpo_example_geo3k' \
+     trainer.experiment_name='qwen2_5_vl_7b_function_rm' \
+     trainer.n_gpus_per_node=8 \
+     trainer.nnodes=1 \
+     trainer.save_freq=20 \
+     trainer.test_freq=5 \
+     trainer.total_epochs=15 $@
code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2_5_vl-7b_lora.sh ADDED
@@ -0,0 +1,52 @@
+ set -x
+ ENGINE=${1:-vllm}
+ # If you are using vllm<=0.6.3, you might need to set the following environment variable to avoid bugs:
+ # export VLLM_ATTENTION_BACKEND=XFORMERS
+
+ python3 -m verl.trainer.main_ppo \
+     algorithm.adv_estimator=grpo \
+     data.train_files=$HOME/data/geo3k/train.parquet \
+     data.val_files=$HOME/data/geo3k/test.parquet \
+     data.train_batch_size=512 \
+     data.max_prompt_length=1024 \
+     data.max_response_length=2048 \
+     data.filter_overlong_prompts=True \
+     data.truncation='error' \
+     data.image_key=images \
+     actor_rollout_ref.model.path=Qwen/Qwen2.5-VL-7B-Instruct \
+     actor_rollout_ref.actor.optim.lr=3e-6 \
+     actor_rollout_ref.model.use_remove_padding=True \
+     actor_rollout_ref.actor.ppo_mini_batch_size=128 \
+     actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=10 \
+     actor_rollout_ref.model.lora_rank=64 \
+     actor_rollout_ref.model.lora_alpha=32 \
+     actor_rollout_ref.model.target_modules=all-linear \
+     actor_rollout_ref.model.exclude_modules='.*visual.*' \
+     actor_rollout_ref.actor.use_kl_loss=True \
+     actor_rollout_ref.actor.kl_loss_coef=0.01 \
+     actor_rollout_ref.actor.kl_loss_type=low_var_kl \
+     actor_rollout_ref.actor.entropy_coeff=0 \
+     actor_rollout_ref.model.enable_gradient_checkpointing=True \
+     actor_rollout_ref.actor.fsdp_config.param_offload=False \
+     actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
+     actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=20 \
+     actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
+     actor_rollout_ref.rollout.name=$ENGINE \
+     +actor_rollout_ref.rollout.engine_kwargs.vllm.disable_mm_preprocessor_cache=True \
+     actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
+     actor_rollout_ref.rollout.enable_chunked_prefill=False \
+     actor_rollout_ref.rollout.enforce_eager=False \
+     actor_rollout_ref.rollout.free_cache_engine=False \
+     actor_rollout_ref.rollout.n=5 \
+     actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=20 \
+     actor_rollout_ref.ref.fsdp_config.param_offload=True \
+     algorithm.use_kl_in_reward=False \
+     trainer.critic_warmup=0 \
+     trainer.logger='["console","wandb"]' \
+     trainer.project_name='verl_grpo_example_geo3k' \
+     trainer.experiment_name='qwen2_5_vl_7b_function_rm' \
+     trainer.n_gpus_per_node=8 \
+     trainer.nnodes=1 \
+     trainer.save_freq=20 \
+     trainer.test_freq=5 \
+     trainer.total_epochs=15 $@
code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2_5_vl-7b_seq_balance.sh ADDED
@@ -0,0 +1,45 @@
+ set -x
+ ENGINE=${1:-vllm}
+
+ python3 -m verl.trainer.main_ppo \
+     algorithm.adv_estimator=grpo \
+     data.train_files=$HOME/data/geo3k/train.parquet \
+     data.val_files=$HOME/data/geo3k/test.parquet \
+     data.train_batch_size=512 \
+     data.max_prompt_length=1024 \
+     data.max_response_length=2048 \
+     data.filter_overlong_prompts=True \
+     data.truncation='error' \
+     data.image_key=images \
+     actor_rollout_ref.model.path=Qwen/Qwen2.5-VL-7B-Instruct \
+     actor_rollout_ref.actor.optim.lr=1e-6 \
+     actor_rollout_ref.model.use_remove_padding=True \
+     actor_rollout_ref.actor.ppo_mini_batch_size=128 \
+     actor_rollout_ref.actor.use_dynamic_bsz=True \
+     actor_rollout_ref.actor.ppo_max_token_len_per_gpu=6144 \
+     actor_rollout_ref.actor.use_kl_loss=True \
+     actor_rollout_ref.actor.kl_loss_coef=0.01 \
+     actor_rollout_ref.actor.kl_loss_type=low_var_kl \
+     actor_rollout_ref.actor.entropy_coeff=0 \
+     actor_rollout_ref.model.enable_gradient_checkpointing=True \
+     actor_rollout_ref.actor.fsdp_config.param_offload=False \
+     actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
+     actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
+     actor_rollout_ref.rollout.name=$ENGINE \
+     actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
+     actor_rollout_ref.rollout.enable_chunked_prefill=False \
+     actor_rollout_ref.rollout.enforce_eager=False \
+     actor_rollout_ref.rollout.free_cache_engine=False \
+     actor_rollout_ref.rollout.n=5 \
+     actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=6144 \
+     actor_rollout_ref.ref.fsdp_config.param_offload=True \
+     algorithm.use_kl_in_reward=False \
+     trainer.critic_warmup=0 \
+     trainer.logger='["console","wandb"]' \
+     trainer.project_name='verl_grpo_example_geo3k' \
+     trainer.experiment_name='qwen2_5_vl_7b_function_rm' \
+     trainer.n_gpus_per_node=8 \
+     trainer.nnodes=1 \
+     trainer.save_freq=20 \
+     trainer.test_freq=5 \
+     trainer.total_epochs=15 $@
code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2_5_vl_32b_npu.sh ADDED
@@ -0,0 +1,51 @@
+ set -x
+ ENGINE=${1:-vllm}
+
+ # Some models are optimized by vLLM Ascend. In some cases, e.g. RLHF training,
+ # the optimized model may not be suitable; set this value to 0 to disable it.
+ export USE_OPTIMIZED_MODEL=0
+
+ python3 -m verl.trainer.main_ppo \
+     algorithm.adv_estimator=grpo \
+     data.train_files=$HOME/data/geo3k/train.parquet \
+     data.val_files=$HOME/data/geo3k/test.parquet \
+     data.train_batch_size=512 \
+     data.max_prompt_length=1024 \
+     data.max_response_length=2048 \
+     data.filter_overlong_prompts=True \
+     data.truncation='error' \
+     data.image_key=images \
+     actor_rollout_ref.model.path=Qwen/Qwen2.5-VL-32B-Instruct \
+     actor_rollout_ref.actor.optim.lr=1e-6 \
+     actor_rollout_ref.model.use_remove_padding=True \
+     actor_rollout_ref.actor.ppo_mini_batch_size=32 \
+     actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \
+     actor_rollout_ref.actor.use_kl_loss=True \
+     actor_rollout_ref.actor.kl_loss_coef=0.01 \
+     actor_rollout_ref.actor.kl_loss_type=low_var_kl \
+     actor_rollout_ref.actor.entropy_coeff=0 \
+     actor_rollout_ref.actor.use_torch_compile=False \
+     actor_rollout_ref.model.enable_gradient_checkpointing=True \
+     actor_rollout_ref.actor.fsdp_config.param_offload=False \
+     actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
+     actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \
+     actor_rollout_ref.rollout.tensor_model_parallel_size=8 \
+     actor_rollout_ref.rollout.name=$ENGINE \
+     +actor_rollout_ref.rollout.engine_kwargs.vllm.disable_mm_preprocessor_cache=True \
+     actor_rollout_ref.rollout.gpu_memory_utilization=0.3 \
+     actor_rollout_ref.rollout.enable_chunked_prefill=False \
+     actor_rollout_ref.rollout.enforce_eager=True \
+     actor_rollout_ref.rollout.free_cache_engine=True \
+     actor_rollout_ref.rollout.n=5 \
+     actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \
+     actor_rollout_ref.ref.fsdp_config.param_offload=True \
+     algorithm.use_kl_in_reward=False \
+     trainer.critic_warmup=0 \
+     trainer.logger=console \
+     trainer.project_name='verl_grpo_example_geo3k' \
+     trainer.experiment_name='qwen2_5_vl_32b_function_rm' \
+     trainer.n_gpus_per_node=16 \
+     trainer.nnodes=2 \
+     trainer.save_freq=-1 \
+     trainer.test_freq=-1 \
+     trainer.total_epochs=15 $@
code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2_5_vl_3b_npu.sh ADDED
@@ -0,0 +1,52 @@
+ set -x
+ ENGINE=${1:-vllm}
+
+ # Some models are optimized by vLLM Ascend. In some cases, e.g. RLHF training,
+ # the optimized model may not be suitable; set this value to 0 to disable it.
+ export USE_OPTIMIZED_MODEL=0
+
+ python3 -m verl.trainer.main_ppo \
+ algorithm.adv_estimator=grpo \
+ data.train_files=$HOME/data/geo3k/train.parquet \
+ data.val_files=$HOME/data/geo3k/test.parquet \
+ data.train_batch_size=512 \
+ data.max_prompt_length=1024 \
+ data.max_response_length=2048 \
+ data.filter_overlong_prompts=True \
+ data.truncation='error' \
+ data.image_key=images \
+ actor_rollout_ref.model.path=Qwen/Qwen2.5-VL-3B-Instruct \
+ actor_rollout_ref.actor.optim.lr=1e-6 \
+ actor_rollout_ref.model.use_remove_padding=True \
+ actor_rollout_ref.actor.ppo_mini_batch_size=16 \
+ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \
+ actor_rollout_ref.actor.use_kl_loss=True \
+ actor_rollout_ref.actor.kl_loss_coef=0.01 \
+ actor_rollout_ref.actor.kl_loss_type=low_var_kl \
+ actor_rollout_ref.actor.entropy_coeff=0 \
+ actor_rollout_ref.actor.use_torch_compile=False \
+ actor_rollout_ref.model.enable_gradient_checkpointing=True \
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
+ actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
+ actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
+ actor_rollout_ref.rollout.name=$ENGINE \
+ +actor_rollout_ref.rollout.engine_kwargs.vllm.disable_mm_preprocessor_cache=True \
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
+ actor_rollout_ref.rollout.enable_chunked_prefill=False \
+ actor_rollout_ref.rollout.enforce_eager=True \
+ actor_rollout_ref.rollout.free_cache_engine=True \
+ actor_rollout_ref.rollout.n=5 \
+ actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
+ algorithm.use_kl_in_reward=False \
+ trainer.use_legacy_worker_impl=disable \
+ trainer.critic_warmup=0 \
+ trainer.logger=console \
+ trainer.project_name='verl_grpo_example_geo3k' \
+ trainer.experiment_name='qwen2_5_vl_3b_function_rm' \
+ trainer.n_gpus_per_node=8 \
+ trainer.nnodes=1 \
+ trainer.save_freq=-1 \
+ trainer.test_freq=-1 \
+ trainer.total_epochs=15 $@
code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2_5_vl_7b_npu.sh ADDED
@@ -0,0 +1,51 @@
+ set -x
+ ENGINE=${1:-vllm}
+
+ # Some models are optimized by vLLM Ascend. In some cases, e.g. RLHF training,
+ # the optimized model may not be suitable; set this value to 0 to disable it.
+ export USE_OPTIMIZED_MODEL=0
+
+ python3 -m verl.trainer.main_ppo \
+ algorithm.adv_estimator=grpo \
+ data.train_files=$HOME/data/geo3k/train.parquet \
+ data.val_files=$HOME/data/geo3k/test.parquet \
+ data.train_batch_size=512 \
+ data.max_prompt_length=1024 \
+ data.max_response_length=2048 \
+ data.filter_overlong_prompts=True \
+ data.truncation='error' \
+ data.image_key=images \
+ actor_rollout_ref.model.path=Qwen/Qwen2.5-VL-7B-Instruct \
+ actor_rollout_ref.actor.optim.lr=1e-6 \
+ actor_rollout_ref.model.use_remove_padding=True \
+ actor_rollout_ref.actor.ppo_mini_batch_size=32 \
+ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \
+ actor_rollout_ref.actor.use_kl_loss=True \
+ actor_rollout_ref.actor.kl_loss_coef=0.01 \
+ actor_rollout_ref.actor.kl_loss_type=low_var_kl \
+ actor_rollout_ref.actor.entropy_coeff=0 \
+ actor_rollout_ref.actor.use_torch_compile=False \
+ actor_rollout_ref.model.enable_gradient_checkpointing=True \
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
+ actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
+ actor_rollout_ref.rollout.tensor_model_parallel_size=4 \
+ actor_rollout_ref.rollout.name=$ENGINE \
+ +actor_rollout_ref.rollout.engine_kwargs.vllm.disable_mm_preprocessor_cache=True \
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.5 \
+ actor_rollout_ref.rollout.enable_chunked_prefill=False \
+ actor_rollout_ref.rollout.enforce_eager=True \
+ actor_rollout_ref.rollout.free_cache_engine=True \
+ actor_rollout_ref.rollout.n=5 \
+ actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
+ algorithm.use_kl_in_reward=False \
+ trainer.critic_warmup=0 \
+ trainer.logger=console \
+ trainer.project_name='verl_grpo_example_geo3k' \
+ trainer.experiment_name='qwen2_5_vl_7b_function_rm' \
+ trainer.n_gpus_per_node=16 \
+ trainer.nnodes=1 \
+ trainer.save_freq=-1 \
+ trainer.test_freq=-1 \
+ trainer.total_epochs=15 $@
code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen3-235b_megatron_96gb.sh ADDED
@@ -0,0 +1,189 @@
+ #!/usr/bin/env bash
+ set -xeuo pipefail
+
+ ## !!!!!!!important!!!!!!
+ ## set the following environment variables on all your nodes
+ # env_vars:
+ # CUDA_DEVICE_MAX_CONNECTIONS: "1"
+ # NCCL_NVLS_ENABLE: "0"
+ # VLLM_USE_V1: 1
+ # install mbridge 0.1.13 on all your nodes with the following command:
+ # pip3 install git+https://github.com/ISEEKYAN/mbridge
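+ # A minimal sketch of the above (assumption: plain per-node shell exports rather
+ # than a Ray runtime-env YAML):
+ # export CUDA_DEVICE_MAX_CONNECTIONS=1
+ # export NCCL_NVLS_ENABLE=0
+ # export VLLM_USE_V1=1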
+
+ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+ [ -f "${SCRIPT_DIR}/env.sh" ] && source "${SCRIPT_DIR}/env.sh"
+
+ adv_estimator=grpo
+
+ use_kl_in_reward=False
+ kl_coef=0.0
+ use_kl_loss=True
+ kl_loss_coef=0.001
+
+ clip_ratio_low=0.2
+ clip_ratio_high=0.28
+
+ max_prompt_length=$((1024 * 2))
+ max_response_length=$((1024 * 8))
+ enable_overlong_buffer=True
+ overlong_buffer_len=$((1024 * 1))
+ overlong_penalty_factor=1.0
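+ # Hedged reading of these knobs (consumed by the dapo reward_kwargs below): responses
+ # longer than (max_response_length - overlong_buffer_len) are penalized in proportion
+ # to the excess length, scaled by overlong_penalty_factor.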
+
+ loss_agg_mode="token-mean"
+
+ train_prompt_bsz=${TRAIN_BS:-32}
+ n_resp_per_prompt=8
+ train_prompt_mini_bsz=16
+
+ # minimum number of nodes needed for Qwen3-235B-A22B
+ NNODES=${NNODES:-4}
+ # Paths
+
+ RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
+
+ MODEL_PATH=$RAY_DATA_HOME/models/Qwen3-235B-A22B
+
+ TRAIN_FILE=$RAY_DATA_HOME/dataset/dapo-math-17k.parquet
+ TEST_FILE=$RAY_DATA_HOME/dataset/aime-2024.parquet
+
+ # Algorithm
+ temperature=1.0
+ top_p=1.0
+ top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
+ val_top_p=0.7
+ # Performance Related Parameter
+ use_dynamic_bsz=True
+ actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 10 / 10))
+ infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 1))
+ offload=True
+ OPTIM_OFFLOAD=${OPTIM_OFFLOAD:-True}
+ gen_tp=8
+ train_tp=${TP:-4}
+ train_pp=${PP:-8}
+
+ EP=${EP:-4}
+ ETP=1
+ CP=1
+ optimizer_offload_fraction=${OFFLOAD_FRACTION:-1.}
+ last_layer=${LAST_LAYER:-10}
+
+ project_name='verl-qwen3'
+ exp_name="235B-${NNODES}-pp${train_pp}-tp${train_tp}-ep${EP}-actor-length${actor_ppo_max_token_len}"
+ CKPTS_DIR=$RAY_DATA_HOME/ckpt/${project_name}/${exp_name}
+
+ # TODO: support cuda graph for rollout by setting the following config
+ # actor_rollout_ref.rollout.cudagraph_capture_sizes=[1,2,4,8,16,32]
+ # actor_rollout_ref.rollout.enforce_eager=False
+
+ python3 -m verl.trainer.main_ppo \
+ --config-path=config \
+ --config-name='ppo_megatron_trainer.yaml' \
+ data.train_files="${TRAIN_FILE}" \
+ data.val_files="${TEST_FILE}" \
+ data.prompt_key=prompt \
+ data.truncation='left' \
+ data.max_prompt_length=${max_prompt_length} \
+ data.max_response_length=${max_response_length} \
+ data.train_batch_size=${train_prompt_bsz} \
+ actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
+ actor_rollout_ref.rollout.name=vllm \
+ actor_rollout_ref.rollout.enforce_eager=True \
+ actor_rollout_ref.rollout.free_cache_engine=True \
+ algorithm.adv_estimator=${adv_estimator} \
+ algorithm.use_kl_in_reward=${use_kl_in_reward} \
+ algorithm.kl_ctrl.kl_coef=${kl_coef} \
+ actor_rollout_ref.model.use_fused_kernels=True \
+ actor_rollout_ref.actor.megatron.use_mbridge=True \
+ actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
+ actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
+ actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
+ actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
+ actor_rollout_ref.actor.clip_ratio_c=10.0 \
+ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \
+ actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
+ actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
+ actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
+ actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+ actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+ actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
+ actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+ actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+ actor_rollout_ref.model.path="${MODEL_PATH}" \
+ actor_rollout_ref.actor.optim.lr=1e-6 \
+ actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
+ actor_rollout_ref.actor.optim.weight_decay=0.1 \
+ +actor_rollout_ref.actor.optim.override_optimizer_config.optimizer_offload_fraction=${optimizer_offload_fraction} \
+ +actor_rollout_ref.actor.optim.override_optimizer_config.overlap_cpu_optimizer_d2h_h2d=True \
+ +actor_rollout_ref.actor.optim.override_optimizer_config.use_precision_aware_optimizer=True \
+ +actor_rollout_ref.actor.optim.override_optimizer_config.optimizer_cpu_offload=True \
+ actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
+ actor_rollout_ref.actor.megatron.param_offload=${offload} \
+ actor_rollout_ref.actor.megatron.optimizer_offload=${OPTIM_OFFLOAD} \
+ actor_rollout_ref.actor.megatron.grad_offload=${offload} \
+ actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${train_pp} \
+ actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${train_tp} \
+ actor_rollout_ref.actor.megatron.expert_model_parallel_size=$EP \
+ actor_rollout_ref.actor.megatron.expert_tensor_parallel_size=$ETP \
+ actor_rollout_ref.actor.megatron.context_parallel_size=${CP} \
+ actor_rollout_ref.actor.entropy_coeff=0 \
+ actor_rollout_ref.actor.optim.clip_grad=1.0 \
+ actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.85 \
+ actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
+ actor_rollout_ref.rollout.enable_chunked_prefill=True \
+ actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
+ actor_rollout_ref.rollout.temperature=${temperature} \
+ actor_rollout_ref.rollout.top_p=${top_p} \
+ actor_rollout_ref.rollout.top_k=${top_k} \
+ actor_rollout_ref.nccl_timeout=1200 \
+ actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
+ actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
+ actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
+ actor_rollout_ref.rollout.val_kwargs.do_sample=True \
+ actor_rollout_ref.rollout.val_kwargs.n=1 \
+ actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=${train_pp} \
+ actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${train_tp} \
+ actor_rollout_ref.ref.megatron.expert_model_parallel_size=$EP \
+ actor_rollout_ref.ref.megatron.expert_tensor_parallel_size=$ETP \
+ actor_rollout_ref.ref.megatron.context_parallel_size=${CP} \
+ actor_rollout_ref.ref.megatron.param_offload=${offload} \
+ +actor_rollout_ref.actor.megatron.override_transformer_config.apply_rope_fusion=True \
+ +actor_rollout_ref.actor.megatron.override_transformer_config.masked_softmax_fusion=True \
+ +actor_rollout_ref.actor.megatron.override_transformer_config.bias_activation_fusion=True \
+ +actor_rollout_ref.actor.megatron.override_transformer_config.bias_dropout_fusion=True \
+ +actor_rollout_ref.actor.megatron.override_transformer_config.gradient_accumulation_fusion=True \
+ +actor_rollout_ref.actor.megatron.override_transformer_config.deallocate_pipeline_outputs=True \
+ +actor_rollout_ref.actor.megatron.override_transformer_config.persist_layer_norm=True \
+ +actor_rollout_ref.actor.megatron.override_transformer_config.moe_grouped_gemm=True \
+ +actor_rollout_ref.actor.megatron.override_transformer_config.moe_permute_fusion=True \
+ +actor_rollout_ref.actor.megatron.override_transformer_config.moe_token_dispatcher_type="flex" \
+ +actor_rollout_ref.actor.megatron.override_transformer_config.moe_router_dtype=fp32 \
+ +actor_rollout_ref.actor.megatron.override_transformer_config.moe_enable_deepep=True \
+ +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_loss_in_pipeline_split=True \
+ +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_embedding_in_pipeline_split=True \
+ reward_model.reward_manager=dapo \
+ +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
+ +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
+ +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
+ +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
+ +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
+ trainer.logger=['console','wandb'] \
+ trainer.project_name="${project_name}" \
+ trainer.experiment_name="${exp_name}" \
+ trainer.n_gpus_per_node=8 \
+ trainer.nnodes="${NNODES}" \
+ trainer.val_before_train=False \
+ trainer.test_freq=10 \
+ trainer.save_freq=100 \
+ trainer.total_epochs=10 \
+ trainer.default_local_dir="${CKPTS_DIR}" \
+ trainer.resume_mode=auto \
+ trainer.log_val_generations=10
code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen3-8b_npu.sh ADDED
@@ -0,0 +1,60 @@
+ set -x
+
+ project_name='GRPO-Qwen3'
+ exp_name='GRPO-Qwen3-8B-npu'
+ gen_tp=2
+ RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
+ MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen3-8B"}
+ CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}
+ TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"}
+ TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"}
+
+ python3 -m verl.trainer.main_ppo \
+ algorithm.adv_estimator=grpo \
+ data.train_files="${TRAIN_FILE}" \
+ data.val_files="${TEST_FILE}" \
+ data.train_batch_size=256 \
+ data.max_prompt_length=512 \
+ data.max_response_length=1024 \
+ data.filter_overlong_prompts=True \
+ data.truncation='error' \
+ actor_rollout_ref.model.path=${MODEL_PATH} \
+ actor_rollout_ref.actor.optim.lr=1e-6 \
+ actor_rollout_ref.model.use_remove_padding=True \
+ actor_rollout_ref.actor.ppo_mini_batch_size=64 \
+ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=10 \
+ actor_rollout_ref.actor.use_kl_loss=True \
+ actor_rollout_ref.actor.kl_loss_coef=0.001 \
+ actor_rollout_ref.actor.kl_loss_type=low_var_kl \
+ actor_rollout_ref.actor.entropy_coeff=0 \
+ actor_rollout_ref.actor.use_torch_compile=False \
+ actor_rollout_ref.ref.use_torch_compile=False \
+ actor_rollout_ref.model.enable_gradient_checkpointing=True \
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
+ actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 \
+ actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
+ actor_rollout_ref.rollout.name=vllm \
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
+ actor_rollout_ref.rollout.n=5 \
+ actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 \
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
+ algorithm.use_kl_in_reward=False \
+ trainer.critic_warmup=0 \
+ trainer.logger='["console","wandb"]' \
+ trainer.project_name="${project_name}" \
+ trainer.experiment_name="${exp_name}" \
+ trainer.n_gpus_per_node=8 \
+ trainer.nnodes=1 \
+ trainer.default_local_dir=${CKPTS_DIR} \
+ trainer.resume_mode=auto \
+ actor_rollout_ref.actor.fsdp_config.forward_prefetch=True \
+ actor_rollout_ref.ref.fsdp_config.forward_prefetch=True \
+ ++actor_rollout_ref.actor.entropy_from_logits_with_chunking=True \
+ ++actor_rollout_ref.ref.entropy_from_logits_with_chunking=True \
+ trainer.val_before_train=True \
+ trainer.save_freq=5 \
+ trainer.test_freq=5 \
+ trainer.total_epochs=15
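+ # Note on the "++" prefix above: in Hydra override syntax "+key" adds a new key while
+ # "++key" adds-or-overrides, so the flag applies whether or not it exists in the base config.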
code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen3_4b_grpo_vllm_1k_npu.sh ADDED
@@ -0,0 +1,83 @@
+ set -xeuo pipefail
+ source /usr/local/Ascend/ascend-toolkit/set_env.sh
+ source /usr/local/Ascend/nnal/atb/set_env.sh
+
+ # Use the vLLM v1 engine
+ export VLLM_USE_V1=1
+ # Pin the vLLM version
+ export VLLM_VERSION=0.9.1
+
+ # Enable level-2 task-queue pipelining
+ export TASK_QUEUE_ENABLE=2
+ # Enable fine-grained CPU core binding
+ export CPU_AFFINITY_CONF=1
+ # Use jemalloc to optimize memory access (requires jemalloc to be installed)
+ export LD_PRELOAD="/usr/lib/aarch64-linux-gnu/libjemalloc.so.2${LD_PRELOAD:+:$LD_PRELOAD}"
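+ # (Assumption: on Debian/Ubuntu aarch64 images the libjemalloc2 package provides this
+ # .so path; adjust LD_PRELOAD if jemalloc is installed elsewhere.)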
+
+ # A3 machine: single node, 8 cards
+ trainer_n_gpus_per_node=16
+ trainer_nnodes=1
+ trainer_project_name='verl_grpo_example_gsm8k'
+ trainer_experiment_name="qwen3_4b_grpo_8npu"
+
+ RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
+ MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen3-4B"}
+ CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${trainer_project_name}/${trainer_experiment_name}"}
+ TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/gsm8k/train.parquet"}
+ TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/gsm8k/test.parquet"}
+
+ export TENSORBOARD_DIR="${RAY_DATA_HOME}/tensorboard_dir/${trainer_project_name}/${trainer_experiment_name}"
+ mkdir -p "${RAY_DATA_HOME}/logs/${trainer_project_name}"
+ LOG_PATH="${RAY_DATA_HOME}/logs/${trainer_project_name}/${trainer_experiment_name}.log"
+
+ use_dynamic_bsz=True
+
+ python3 -m verl.trainer.main_ppo \
+ algorithm.adv_estimator=grpo \
+ data.train_files=${TRAIN_FILE} \
+ data.val_files=${TEST_FILE} \
+ data.train_batch_size=512 \
+ data.max_prompt_length=1024 \
+ data.max_response_length=1024 \
+ data.filter_overlong_prompts=True \
+ data.truncation='error' \
+ actor_rollout_ref.model.path=${MODEL_PATH} \
+ actor_rollout_ref.actor.optim.lr=5e-7 \
+ actor_rollout_ref.model.use_remove_padding=True \
+ actor_rollout_ref.actor.entropy_coeff=0.001 \
+ actor_rollout_ref.actor.ppo_mini_batch_size=256 \
+ actor_rollout_ref.actor.use_kl_loss=True \
+ actor_rollout_ref.actor.kl_loss_coef=0.001 \
+ actor_rollout_ref.actor.kl_loss_type=low_var_kl \
+ actor_rollout_ref.actor.use_torch_compile=False \
+ actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
+ actor_rollout_ref.actor.ppo_max_token_len_per_gpu=3000 \
+ actor_rollout_ref.model.enable_gradient_checkpointing=True \
+ actor_rollout_ref.actor.fsdp_config.param_offload=True \
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
+ actor_rollout_ref.rollout.enforce_eager=True \
+ actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+ actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=4096 \
+ actor_rollout_ref.rollout.enable_chunked_prefill=False \
+ actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
+ actor_rollout_ref.rollout.name=vllm \
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
+ actor_rollout_ref.rollout.n=5 \
+ actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+ actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=8192 \
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
+ actor_rollout_ref.ref.use_torch_compile=True \
+ algorithm.kl_ctrl.kl_coef=0.001 \
+ trainer.critic_warmup=0 \
+ trainer.project_name=${trainer_project_name} \
+ trainer.experiment_name=${trainer_experiment_name} \
+ trainer.logger=['console','tensorboard'] \
+ trainer.default_local_dir=${CKPTS_DIR} \
+ trainer.n_gpus_per_node=$trainer_n_gpus_per_node \
+ trainer.nnodes=$trainer_nnodes \
+ trainer.save_freq=-1 \
+ trainer.test_freq=5 \
+ trainer.total_epochs=15 \
+ trainer.val_before_train=False 2>&1 | tee ${LOG_PATH}
code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen3_8b_grpo_sglang_32k_spmd_npu.sh ADDED
@@ -0,0 +1,70 @@
+ set -x
+ export HCCL_CONNECT_TIMEOUT=1500
+ export HCCL_HOST_SOCKET_PORT_RANGE=60000-60050
+ export HCCL_NPU_SOCKET_PORT_RANGE=61000-61050
+ export RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES=1
+ export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+ # WORKSPACE_HOME and DATA_HOME support custom path configuration.
+ WORKSPACE_HOME=${WORKSPACE_HOME:-$(pwd)}
+ DATA_HOME=${DATA_HOME:-$(pwd)}
+
+ sp_size=4
+ num_gpu=8
+ tp_size=4
+ train_prompt_bsz=16
+ train_prompt_mini_bsz=16
+
+ max_prompt_length=$((1024 * 2))
+ max_response_length=$((1024 * 32))
+
+ CKPTS_DIR=$WORKSPACE_HOME/logs/ckpt/qwen3_8b
+ model_path=$DATA_HOME/models/Qwen3-8B
+ train_data=$DATA_HOME/datasets/dapo/dapo-math-17k.parquet
+ valid_data=$DATA_HOME/datasets/dapo/aime-2024.parquet
+
+ python3 -m verl.trainer.main_ppo \
+ algorithm.adv_estimator=grpo \
+ data.train_files=$train_data \
+ data.val_files=$valid_data \
+ data.train_batch_size=$train_prompt_bsz \
+ data.max_prompt_length=$max_prompt_length \
+ data.max_response_length=$max_response_length \
+ data.filter_overlong_prompts=False \
+ data.truncation='error' \
+ actor_rollout_ref.model.path=$model_path \
+ actor_rollout_ref.actor.optim.lr=1e-6 \
+ actor_rollout_ref.model.use_remove_padding=True \
+ actor_rollout_ref.actor.ppo_mini_batch_size=$train_prompt_mini_bsz \
+ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \
+ actor_rollout_ref.actor.use_kl_loss=True \
+ actor_rollout_ref.actor.entropy_coeff=0 \
+ actor_rollout_ref.actor.kl_loss_coef=0.001 \
+ actor_rollout_ref.actor.kl_loss_type=low_var_kl \
+ actor_rollout_ref.actor.use_torch_compile=False \
+ actor_rollout_ref.model.enable_gradient_checkpointing=True \
+ actor_rollout_ref.actor.fsdp_config.param_offload=True \
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
+ actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \
+ actor_rollout_ref.rollout.tensor_model_parallel_size=$tp_size \
+ actor_rollout_ref.rollout.name=sglang \
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.3 \
+ actor_rollout_ref.rollout.n=5 \
+ +actor_rollout_ref.rollout.engine_kwargs.sglang.attention_backend="ascend" \
+ actor_rollout_ref.rollout.enable_chunked_prefill=False \
+ actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
+ actor_rollout_ref.nccl_timeout=3600 \
+ algorithm.use_kl_in_reward=False \
+ trainer.critic_warmup=0 \
+ trainer.logger=console \
+ trainer.val_before_train=False \
+ trainer.project_name='verl_grpo_example_2k_32k' \
+ trainer.experiment_name='qwen3_8b_function_rm' \
+ trainer.n_gpus_per_node=$num_gpu \
+ trainer.nnodes=1 \
+ trainer.save_freq=1000 \
+ trainer.test_freq=10000 \
+ trainer.total_epochs=5 \
+ trainer.default_local_dir="${CKPTS_DIR}" \
+ actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \
+ actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} $@
code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen3_vl-235b-megatron.sh ADDED
@@ -0,0 +1,84 @@
+ set -x
+ ENGINE=${1:-vllm}
+ export CUDA_DEVICE_MAX_CONNECTIONS=1 # For megatron communication/computation overlapping
+
+ export VLLM_ALLREDUCE_USE_SYMM_MEM=0 # for vLLM 0.11.0 with TP
+
+
+ HF_MODEL_PATH=${HF_MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen3-VL-235B-A22B-Instruct"}
+
+ GEN_TP=${GEN_TP:-16}
+ CP=${CP:-2}
+ TP=${TP:-4}
+ PP=${PP:-8}
+ EP=${EP:-8}
+ ETP=${ETP:-1}
+
+ train_path=$HOME/data/geo3k/train.parquet
+ test_path=$HOME/data/geo3k/test.parquet
+
+ python3 -m verl.trainer.main_ppo --config-path=config \
+ --config-name='ppo_megatron_trainer.yaml' \
+ algorithm.adv_estimator=grpo \
+ data.train_files="$train_path" \
+ data.val_files="$test_path" \
+ data.train_batch_size=512 \
+ data.max_prompt_length=1024 \
+ data.max_response_length=2048 \
+ data.filter_overlong_prompts=True \
+ data.truncation='error' \
+ actor_rollout_ref.model.path=$HF_MODEL_PATH \
+ actor_rollout_ref.actor.optim.lr=1e-6 \
+ actor_rollout_ref.actor.ppo_mini_batch_size=128 \
+ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \
+ actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=$PP \
+ actor_rollout_ref.actor.megatron.tensor_model_parallel_size=$TP \
+ actor_rollout_ref.actor.megatron.context_parallel_size=$CP \
+ actor_rollout_ref.actor.megatron.expert_model_parallel_size=$EP \
+ actor_rollout_ref.actor.megatron.expert_tensor_parallel_size=$ETP \
+ actor_rollout_ref.actor.use_kl_loss=True \
+ actor_rollout_ref.actor.kl_loss_coef=0.01 \
+ actor_rollout_ref.actor.kl_loss_type=low_var_kl \
+ actor_rollout_ref.actor.entropy_coeff=0 \
+ actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \
+ actor_rollout_ref.rollout.tensor_model_parallel_size=$GEN_TP \
+ actor_rollout_ref.actor.use_dynamic_bsz=True \
+ actor_rollout_ref.actor.ppo_max_token_len_per_gpu=4096 \
+ actor_rollout_ref.ref.log_prob_use_dynamic_bsz=True \
+ actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=4096 \
+ actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=True \
+ actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=4096 \
+ actor_rollout_ref.rollout.name=$ENGINE \
+ +actor_rollout_ref.rollout.engine_kwargs.vllm.disable_mm_preprocessor_cache=True \
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.7 \
+ actor_rollout_ref.rollout.n=5 \
+ actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \
+ actor_rollout_ref.actor.megatron.use_mbridge=True \
+ actor_rollout_ref.actor.megatron.param_offload=True \
+ actor_rollout_ref.actor.megatron.optimizer_offload=True \
+ actor_rollout_ref.actor.megatron.grad_offload=True \
+ actor_rollout_ref.ref.megatron.param_offload=True \
+ +actor_rollout_ref.actor.optim.override_optimizer_config.optimizer_offload_fraction=1 \
+ +actor_rollout_ref.actor.optim.override_optimizer_config.overlap_cpu_optimizer_d2h_h2d=True \
+ +actor_rollout_ref.actor.optim.override_optimizer_config.use_precision_aware_optimizer=True \
+ +actor_rollout_ref.actor.optim.override_optimizer_config.optimizer_cpu_offload=True \
+ +actor_rollout_ref.actor.megatron.override_transformer_config.moe_router_dtype=fp32 \
+ +actor_rollout_ref.actor.megatron.override_transformer_config.moe_enable_deepep=True \
+ +actor_rollout_ref.actor.megatron.override_transformer_config.moe_token_dispatcher_type=flex \
+ +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_method=uniform \
+ +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=full \
+ +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_num_layers=1 \
+ +actor_rollout_ref.actor.megatron.override_transformer_config.gradient_accumulation_fusion=True \
+ +actor_rollout_ref.actor.megatron.override_transformer_config.moe_permute_fusion=True \
+ +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_loss_in_pipeline_split=True \
+ +actor_rollout_ref.actor.megatron.override_transformer_config.account_for_embedding_in_pipeline_split=True \
+ algorithm.use_kl_in_reward=False \
+ trainer.critic_warmup=0 \
+ trainer.logger='["console","wandb"]' \
+ trainer.project_name='verl_grpo_example_geo3k' \
+ trainer.experiment_name='qwen3_vl_235b_megatron' \
+ trainer.n_gpus_per_node=8 \
+ trainer.nnodes=8 \
+ trainer.save_freq=20 \
+ trainer.test_freq=5 \
+ trainer.total_epochs=15 $@
code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen3_vl-30b-megatron.sh ADDED
@@ -0,0 +1,85 @@
+ set -x
+ ENGINE=${1:-vllm}
+ export CUDA_DEVICE_MAX_CONNECTIONS=1 # For megatron communication/computation overlapping
+
+ export VLLM_ALLREDUCE_USE_SYMM_MEM=0 # for vLLM 0.11.0 with TP
+
+
+ HF_MODEL_PATH=${HF_MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen3-VL-30B-A3B-Instruct"}
+
+ GEN_TP=${GEN_TP:-4}
+ CP=${CP:-2}
+ TP=${TP:-2}
+ PP=${PP:-1}
+ EP=${EP:-8}
+ ETP=${ETP:-1}
+
+ train_path=$HOME/data/geo3k/train.parquet
+ test_path=$HOME/data/geo3k/test.parquet
+
+ # Use aux_loss and z_loss (moe_aux_loss_coeff / moe_z_loss_coeff below) to mitigate
+ # expert load imbalance when training MoE models.
+ python3 -m verl.trainer.main_ppo --config-path=config \
+ --config-name='ppo_megatron_trainer.yaml' \
+ algorithm.adv_estimator=grpo \
+ data.train_files="$train_path" \
+ data.val_files="$test_path" \
+ data.train_batch_size=512 \
+ data.max_prompt_length=1024 \
+ data.max_response_length=2048 \
+ data.filter_overlong_prompts=True \
+ data.truncation='error' \
+ actor_rollout_ref.model.path=$HF_MODEL_PATH \
+ actor_rollout_ref.actor.optim.lr=1e-6 \
+ actor_rollout_ref.actor.ppo_mini_batch_size=128 \
+ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \
+ actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=$PP \
+ actor_rollout_ref.actor.megatron.tensor_model_parallel_size=$TP \
+ actor_rollout_ref.actor.megatron.context_parallel_size=$CP \
+ actor_rollout_ref.actor.megatron.expert_model_parallel_size=$EP \
+ actor_rollout_ref.actor.megatron.expert_tensor_parallel_size=$ETP \
+ actor_rollout_ref.actor.use_kl_loss=True \
+ actor_rollout_ref.actor.kl_loss_coef=0.01 \
+ actor_rollout_ref.actor.kl_loss_type=low_var_kl \
+ actor_rollout_ref.actor.entropy_coeff=0 \
+ actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \
+ actor_rollout_ref.rollout.tensor_model_parallel_size=$GEN_TP \
+ actor_rollout_ref.actor.use_dynamic_bsz=True \
+ actor_rollout_ref.actor.ppo_max_token_len_per_gpu=4096 \
+ actor_rollout_ref.ref.log_prob_use_dynamic_bsz=True \
+ actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=4096 \
+ actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=True \
+ actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=4096 \
+ actor_rollout_ref.rollout.name=$ENGINE \
+ +actor_rollout_ref.rollout.engine_kwargs.vllm.disable_mm_preprocessor_cache=True \
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.7 \
+ actor_rollout_ref.rollout.n=5 \
+ actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \
+ actor_rollout_ref.actor.megatron.use_mbridge=True \
+ actor_rollout_ref.actor.megatron.param_offload=True \
+ actor_rollout_ref.actor.megatron.optimizer_offload=True \
+ actor_rollout_ref.actor.megatron.grad_offload=True \
+ actor_rollout_ref.ref.megatron.param_offload=True \
+ +actor_rollout_ref.actor.optim.override_optimizer_config.optimizer_offload_fraction=1 \
+ +actor_rollout_ref.actor.optim.override_optimizer_config.overlap_cpu_optimizer_d2h_h2d=True \
+ +actor_rollout_ref.actor.optim.override_optimizer_config.use_precision_aware_optimizer=True \
+ +actor_rollout_ref.actor.optim.override_optimizer_config.optimizer_cpu_offload=True \
+ +actor_rollout_ref.actor.megatron.override_transformer_config.moe_router_dtype=fp32 \
+ +actor_rollout_ref.actor.megatron.override_transformer_config.moe_enable_deepep=True \
+ +actor_rollout_ref.actor.megatron.override_transformer_config.moe_token_dispatcher_type=flex \
+ +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_method=uniform \
+ +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=full \
+ +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_num_layers=1 \
+ +actor_rollout_ref.actor.megatron.override_transformer_config.gradient_accumulation_fusion=True \
+ +actor_rollout_ref.actor.megatron.override_transformer_config.moe_permute_fusion=True \
+ +actor_rollout_ref.actor.megatron.override_transformer_config.moe_aux_loss_coeff=0.01 \
+ +actor_rollout_ref.actor.megatron.override_transformer_config.moe_z_loss_coeff=0.001 \
+ algorithm.use_kl_in_reward=False \
+ trainer.critic_warmup=0 \
+ trainer.logger='["console","wandb"]' \
+ trainer.project_name='verl_grpo_example_geo3k' \
+ trainer.experiment_name='qwen3_vl_30b_megatron' \
+ trainer.n_gpus_per_node=8 \
+ trainer.nnodes=1 \
+ trainer.save_freq=20 \
+ trainer.test_freq=5 \
+ trainer.total_epochs=15 $@
code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen3moe-30b_megatron_96gb.sh ADDED
@@ -0,0 +1,197 @@
+ set -x
+
+ # tested with NNODES=1~4 nodes of 96GB H20 GPUs
+ NNODES=${NNODES:-1}
+ NGPUS_PER_NODES=${NGPUS_PER_NODES:-8}
+
+ project_name='DAPO-Qwen3-30b-MATH'
+ exp_name='DAPO-Qwen3-30b-MATH-megatron'
+
+ adv_estimator=grpo
+
+ use_kl_in_reward=False
+ kl_coef=0.0
+ use_kl_loss=False
+ kl_loss_coef=0.0
+
+ clip_ratio_low=0.2
+ clip_ratio_high=0.28
+ max_prompt_length=$((1024 * 2))
+ max_response_length=$((1024 * 8))
+ enable_overlong_buffer=True
+ overlong_buffer_len=$((1024 * 4))
+ overlong_penalty_factor=1.0
+
+ loss_agg_mode="token-mean"
+
+ train_prompt_bsz=512
+ n_resp_per_prompt=16
+ train_prompt_mini_bsz=128
+ train_ppo_micro_batch_size_per_gpu=2
+ infer_ppo_micro_batch_size_per_gpu=2
+ # Paths
+ MODEL_PATH=Qwen/Qwen3-30B-A3B-Base
+
+ RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
+ TRAIN_FILE=$RAY_DATA_HOME/dataset/dapo-math-17k.parquet
+ TEST_FILE=$RAY_DATA_HOME/dataset/aime-2024.parquet
+ TEST_FILE="['${TEST_FILE}']"
+
+ # Algorithm
+ temperature=1.0
+ top_p=1.0
+ top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
+ val_top_p=0.7
+
+ # Performance Related Parameter
+ use_dynamic_bsz=True
+ actor_ppo_max_token_len=$((max_prompt_length + max_response_length))
+ infer_ppo_max_token_len=$((max_prompt_length + max_response_length))
+ offload=True
+
+ optimizer_offload_fraction=${OFFLOAD_FRACTION:-1.}
+
+ COMMON_PP=${COMMON_PP:-1}
+ COMMON_VPP=${COMMON_VPP:-null}
+ COMMON_CP=${COMMON_CP:-1}
+ COMMON_TP=${COMMON_TP:-1}
+ COMMON_EP=${COMMON_EP:-8}
+ COMMON_ETP=${COMMON_ETP:-1}
+
+ TRAIN_TP=${TRAIN_TP:-$COMMON_TP}
+ INFER_TP=${INFER_TP:-4}
+
+ ACTOR_PP=${ACTOR_PP:-$COMMON_PP}
+ ACTOR_VPP=${ACTOR_VPP:-$COMMON_VPP}
+ ACTOR_CP=${ACTOR_CP:-$COMMON_CP}
+ ACTOR_TP=${ACTOR_TP:-$TRAIN_TP}
+ ACTOR_EP=${ACTOR_EP:-$COMMON_EP}
+ ACTOR_ETP=${ACTOR_ETP:-$COMMON_ETP}
+ ROLLOUT_TP=${ROLLOUT_TP:-$INFER_TP}
+ REF_PP=${REF_PP:-$COMMON_PP}
+ REF_VPP=${REF_VPP:-$COMMON_VPP}
+ REF_CP=${REF_CP:-$COMMON_CP}
+ REF_TP=${REF_TP:-$TRAIN_TP}
+ REF_EP=${REF_EP:-$COMMON_EP}
+ REF_ETP=${REF_ETP:-$COMMON_ETP}
+ CRITIC_PP=${CRITIC_PP:-$COMMON_PP}
+ CRITIC_VPP=${CRITIC_VPP:-$COMMON_VPP}
+ CRITIC_CP=${CRITIC_CP:-$COMMON_CP}
+ CRITIC_TP=${CRITIC_TP:-$TRAIN_TP}
+ CRITIC_EP=${CRITIC_EP:-$COMMON_EP}
+ CRITIC_ETP=${CRITIC_ETP:-$COMMON_ETP}
+ RM_PP=${RM_PP:-$COMMON_PP}
+ RM_VPP=${RM_VPP:-$COMMON_VPP}
+ RM_CP=${RM_CP:-$COMMON_CP}
+ RM_TP=${RM_TP:-$TRAIN_TP}
+ RM_EP=${RM_EP:-$COMMON_EP}
+ RM_ETP=${RM_ETP:-$COMMON_ETP}
+
+ # install mbridge
+ # pip3 install git+https://github.com/ISEEKYAN/mbridge
+ USE_MBRIDGE=True
+ USE_DIST_CKPT=False
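+ # (Hedged note: with USE_MBRIDGE=True, HF weights are bridged to Megatron online, so
+ # no offline mcore dist-checkpoint is required, hence USE_DIST_CKPT=False.)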
+
+ python3 -m verl.trainer.main_ppo --config-path=./config --config-name='ppo_megatron_trainer' \
+ data.train_files="${TRAIN_FILE}" \
+ data.val_files="${TEST_FILE}" \
+ data.prompt_key=prompt \
+ data.truncation='left' \
+ data.max_prompt_length=${max_prompt_length} \
+ data.max_response_length=${max_response_length} \
+ data.train_batch_size=${train_prompt_bsz} \
+ actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
+ algorithm.adv_estimator=${adv_estimator} \
+ algorithm.use_kl_in_reward=${use_kl_in_reward} \
+ algorithm.kl_ctrl.kl_coef=${kl_coef} \
+ actor_rollout_ref.model.path="${MODEL_PATH}" \
+ actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
+ actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
+ actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
+ actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
+ actor_rollout_ref.actor.clip_ratio_c=10.0 \
+ +actor_rollout_ref.model.override_config.model_config.max_position_embeddings=$((max_prompt_length + max_response_length)) \
+ actor_rollout_ref.model.use_fused_kernels=False \
+ actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
+ actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
+ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=${train_ppo_micro_batch_size_per_gpu} \
+ actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
+ actor_rollout_ref.actor.optim.lr=1e-6 \
+ actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
+ actor_rollout_ref.actor.optim.lr_decay_style='constant' \
+ actor_rollout_ref.actor.optim.weight_decay=0.1 \
+ +actor_rollout_ref.actor.optim.override_optimizer_config.optimizer_offload_fraction=${optimizer_offload_fraction} \
+ +actor_rollout_ref.actor.optim.override_optimizer_config.overlap_cpu_optimizer_d2h_h2d=True \
+ +actor_rollout_ref.actor.optim.override_optimizer_config.use_precision_aware_optimizer=True \
+ +actor_rollout_ref.actor.optim.override_optimizer_config.optimizer_cpu_offload=True \
+ actor_rollout_ref.actor.megatron.use_mbridge=$USE_MBRIDGE \
+ actor_rollout_ref.actor.megatron.use_dist_checkpointing=$USE_DIST_CKPT \
+ actor_rollout_ref.actor.megatron.param_offload=${offload} \
+ actor_rollout_ref.actor.megatron.grad_offload=${offload} \
+ actor_rollout_ref.actor.megatron.optimizer_offload=${offload} \
+ actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${ACTOR_TP} \
+ actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${ACTOR_PP} \
+ actor_rollout_ref.actor.megatron.virtual_pipeline_model_parallel_size=${ACTOR_VPP} \
+ actor_rollout_ref.actor.megatron.context_parallel_size=${ACTOR_CP} \
+ actor_rollout_ref.actor.megatron.expert_model_parallel_size=${ACTOR_EP} \
+ actor_rollout_ref.actor.megatron.expert_tensor_parallel_size=${ACTOR_ETP} \
+ +actor_rollout_ref.actor.megatron.override_transformer_config.apply_rope_fusion=True \
+ +actor_rollout_ref.actor.megatron.override_transformer_config.masked_softmax_fusion=True \
+ +actor_rollout_ref.actor.megatron.override_transformer_config.bias_activation_fusion=True \
+ +actor_rollout_ref.actor.megatron.override_transformer_config.bias_dropout_fusion=True \
+ +actor_rollout_ref.actor.megatron.override_transformer_config.gradient_accumulation_fusion=True \
+ +actor_rollout_ref.actor.megatron.override_transformer_config.deallocate_pipeline_outputs=True \
+ +actor_rollout_ref.actor.megatron.override_transformer_config.persist_layer_norm=True \
+ +actor_rollout_ref.actor.megatron.override_transformer_config.moe_grouped_gemm=True \
+ +actor_rollout_ref.actor.megatron.override_transformer_config.moe_permute_fusion=True \
+ +actor_rollout_ref.actor.megatron.override_transformer_config.moe_token_dispatcher_type="flex" \
+ +actor_rollout_ref.actor.megatron.override_transformer_config.moe_router_dtype=fp32 \
+ +actor_rollout_ref.actor.megatron.override_transformer_config.moe_enable_deepep=True \
+ actor_rollout_ref.actor.entropy_coeff=0 \
+ actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
+ actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=${infer_ppo_micro_batch_size_per_gpu} \
+ actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.7 \
+ actor_rollout_ref.rollout.tensor_model_parallel_size=${INFER_TP} \
+ actor_rollout_ref.rollout.enable_chunked_prefill=True \
+ actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
+ actor_rollout_ref.rollout.temperature=${temperature} \
+ actor_rollout_ref.rollout.top_p=${top_p} \
+ actor_rollout_ref.rollout.top_k=${top_k} \
+ actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
+ actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
+ actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
+ actor_rollout_ref.rollout.val_kwargs.do_sample=True \
+ actor_rollout_ref.rollout.val_kwargs.n=1 \
+ actor_rollout_ref.rollout.name=vllm \
+ actor_rollout_ref.rollout.enforce_eager=True \
+ actor_rollout_ref.rollout.free_cache_engine=True \
+ actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=${infer_ppo_micro_batch_size_per_gpu} \
+ actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+ actor_rollout_ref.ref.megatron.use_dist_checkpointing=${USE_DIST_CKPT} \
+ actor_rollout_ref.ref.megatron.param_offload=${offload} \
+ actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${REF_TP} \
+ actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=${REF_PP} \
+ actor_rollout_ref.ref.megatron.virtual_pipeline_model_parallel_size=${REF_VPP} \
+ actor_rollout_ref.ref.megatron.context_parallel_size=${REF_CP} \
+ actor_rollout_ref.ref.megatron.expert_model_parallel_size=${REF_EP} \
+ actor_rollout_ref.ref.megatron.expert_tensor_parallel_size=${REF_ETP} \
+ reward_model.reward_manager=dapo \
+ +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
+ +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
+ +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
+ +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
+ +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
+ trainer.logger=['console','wandb'] \
+ trainer.project_name="${project_name}" \
+ trainer.experiment_name="${exp_name}" \
+ trainer.n_gpus_per_node="${NGPUS_PER_NODES}" \
+ trainer.nnodes="${NNODES}" \
+ trainer.val_before_train=False \
+ trainer.test_freq=10 \
+ trainer.save_freq=100 \
+ trainer.total_epochs=10 \
+ trainer.resume_mode=auto \
+ trainer.log_val_generations=10
code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen3moe-30b_megatron_lora.sh ADDED
@@ -0,0 +1,136 @@
+ #!/usr/bin/env bash
+ set -xeuo pipefail
+
+ # Need to install Megatron-Bridge
+ # NOTE: Make sure you use Megatron-Bridge later than 0.2.0
+ # (Recommend https://github.com/NVIDIA-NeMo/Megatron-Bridge/commit/83a7c1134c562d8c6decd10a1f0a6e6a7a8a3a44 or later)
+ # for proper MoE LoRA support.
+
+ # For Megatron communication/computation overlapping
+ export CUDA_DEVICE_MAX_CONNECTIONS=1
+
+ ########################### Quick Config ###########################
+
+ TP=${TP:-2}
+ PP=${PP:-2}
+ CP=${CP:-2}
+ EP=${EP:-4}
+ ETP=${ETP:-1}
+
+ ALL_OFFLOAD=${ALL_OFFLOAD:-True}
+
+
+ rollout_name="vllm"
+ project_name='verl_grpo_example_gsm8k_math'
+ exp_name='qwen3_30b_a3b_megatron_lora'
+ adv_estimator=grpo
+
+ gsm8k_train_path=$HOME/data/gsm8k/train.parquet
+ gsm8k_test_path=$HOME/data/gsm8k/test.parquet
+
+ ########################### Parameter Arrays ###########################
+
+ DATA=(
+ data.train_files=${gsm8k_train_path}
+ data.val_files=${gsm8k_test_path}
+ data.train_batch_size=128
+ data.max_prompt_length=1024
+ data.max_response_length=1024
+ data.truncation='error'
+ data.filter_overlong_prompts=True
+ data.shuffle=False
+ )
+
+ MODEL=(
+ actor_rollout_ref.model.path=Qwen/Qwen3-30B-A3B-Instruct-2507
+ actor_rollout_ref.model.use_fused_kernels=True
+ actor_rollout_ref.model.lora.rank=32
+ actor_rollout_ref.model.lora.alpha=64
+ actor_rollout_ref.model.lora.lora_A_init_method=kaiming
+ # # Optional: Use canonical LoRA
+ # actor_rollout_ref.model.lora.type="canonical_lora"
+ # actor_rollout_ref.model.lora.target_modules='["linear_q","linear_k","linear_v","linear_proj","linear_fc1_up","linear_fc1_gate","linear_fc2"]'
+
+ # # Optional: Add dropout to LoRA layers
+ # actor_rollout_ref.model.lora.dropout=0.05
+ # actor_rollout_ref.model.lora.dropout_position=pre
+ )
+
+ ACTOR=(
+ actor_rollout_ref.actor.optim.lr=3e-6
+ actor_rollout_ref.actor.ppo_mini_batch_size=16
+ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2
+ actor_rollout_ref.actor.megatron.use_mbridge=True
+ actor_rollout_ref.actor.megatron.vanilla_mbridge=False
+ actor_rollout_ref.actor.use_dynamic_bsz=True
+ actor_rollout_ref.actor.use_kl_loss=True
+ actor_rollout_ref.actor.kl_loss_coef=0.001
+ actor_rollout_ref.actor.kl_loss_type=low_var_kl
+ actor_rollout_ref.actor.entropy_coeff=0
+ actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${TP}
+ actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${PP}
+ actor_rollout_ref.actor.megatron.expert_model_parallel_size=${EP}
+ actor_rollout_ref.actor.megatron.context_parallel_size=${CP}
+ actor_rollout_ref.actor.megatron.expert_tensor_parallel_size=${ETP}
+ actor_rollout_ref.actor.megatron.param_offload=${ALL_OFFLOAD}
+ actor_rollout_ref.actor.megatron.optimizer_offload=${ALL_OFFLOAD}
+ actor_rollout_ref.actor.megatron.grad_offload=${ALL_OFFLOAD}
+ +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_method=uniform
+ +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=full
+ +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_num_layers=1
+ )
+
+ ROLLOUT=(
+ actor_rollout_ref.rollout.tensor_model_parallel_size=8
+ actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4
+ actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=True
+ actor_rollout_ref.rollout.name=${rollout_name}
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.25
+ actor_rollout_ref.rollout.enforce_eager=True
+ actor_rollout_ref.rollout.free_cache_engine=True
+ actor_rollout_ref.rollout.n=4
+ )
+
+ REF=(
+ actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4
+ actor_rollout_ref.ref.log_prob_use_dynamic_bsz=True
+ actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${TP}
+ actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=${PP}
+ actor_rollout_ref.ref.megatron.expert_model_parallel_size=${EP}
+ actor_rollout_ref.ref.megatron.context_parallel_size=${CP}
+ actor_rollout_ref.ref.megatron.expert_tensor_parallel_size=${ETP}
+ actor_rollout_ref.ref.megatron.param_offload=${ALL_OFFLOAD}
+ )
+
+ ALGORITHM=(
+ algorithm.adv_estimator=${adv_estimator}
+ )
+
+ TRAINER=(
+ trainer.critic_warmup=0
+ trainer.logger='["console","wandb"]'
+ trainer.project_name=${project_name}
+ trainer.experiment_name=${exp_name}
+ trainer.n_gpus_per_node=8
+ trainer.nnodes=1
+ trainer.save_freq=20
+ trainer.test_freq=5
+ trainer.total_epochs=15
+ )
+
+ ########################### Launch ###########################
+
+ python3 -m verl.trainer.main_ppo \
+ --config-path=config \
+ --config-name='ppo_megatron_trainer.yaml' \
+ "${DATA[@]}" \
+ "${ALGORITHM[@]}" \
+ "${MODEL[@]}" \
+ "${ROLLOUT[@]}" \
+ "${ACTOR[@]}" \
+ "${REF[@]}" \
+ "${TRAINER[@]}" \
+ "$@"
code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen3moe-30b_sglang_megatron_npu.sh ADDED
@@ -0,0 +1,238 @@
+ #!/bin/bash
+ set -xeuo pipefail
+ # Project Configuration
+ project_name='DAPO-Qwen3-30b-A3B-BASE-MATH'
+ exp_name='DAPO-Qwen3-30B-A3B-BASE-Megatron-SGLang'
+
+ # Necessary env
+ export HCCL_CONNECT_TIMEOUT=1500
+ export HCCL_HOST_SOCKET_PORT_RANGE=60000-60050
+ export HCCL_NPU_SOCKET_PORT_RANGE=61000-61050
+
+ export RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES=1
+ export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
+
+ export DISABLE_L2_CACHE=1
+ export TASK_QUEUE_ENABLE=1
+
+ # Node Info
+ NNODES=${NNODES:-1}
+ NPUS_PER_NODE=${NPUS_PER_NODE:-16}
+
+ # Model Weights Paths
+ MODEL_PATH=Qwen/Qwen3-30B-A3B
+ MCORE_MODEL_PATH=Qwen/Qwen3-30B-A3B-mcore
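+ # (Assumption: MCORE_MODEL_PATH points to a Megatron-Core dist-checkpoint converted
+ # offline from MODEL_PATH; use_dist_checkpointing=True below loads it directly.)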
+ RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
+ CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}
+
+ # File System Paths
+ TRAIN_FILE=$RAY_DATA_HOME/dataset/dapo-math-17k.parquet
+ TEST_FILE=$RAY_DATA_HOME/dataset/aime-2024.parquet
+ # Data Length Configuration
+ max_prompt_length=$((1024 * 2))
+ max_response_length=$((1024 * 8))
+
+ # Training Batch Configuration
+ train_prompt_bsz=16
+ train_prompt_mini_bsz=16
+ n_resp_per_prompt=8
+
+ # Algorithm Configuration
+ adv_estimator=grpo
+ use_kl_in_reward=False
+ kl_coef=0.0
+ use_kl_loss=True
+ kl_loss_coef=0.001
+
+ # Performance and Memory Management Configuration
+ all_offload=True
+ use_dynamic_bsz=False
+ actor_ppo_max_token_len=$((max_prompt_length + max_response_length))
+ infer_ppo_max_token_len=$((max_prompt_length + max_response_length))
+
+ # Megatron Parallelism Configuration
+ train_tp=4
+ train_ep=4
+ train_etp=4
+ train_pp=1
+ train_cp=1
+
+ # SGLang Generation Configuration
+ gen_tp=4
+ gen_dp=1
+ gen_ep=1
+ gpu_memory_utilization=0.5
+ max_model_len=$((max_prompt_length + max_response_length))
+ max_num_batched_tokens=$(((max_prompt_length + max_response_length) * 1))
+
+ # Data Configuration
+ DATA_CONFIG=(
+ # File Paths
+ data.train_files="${TRAIN_FILE}"
+ data.val_files="${TEST_FILE}"
+ # Data Structure
+ data.prompt_key=prompt
+ # Batch and Length Configuration
+ data.train_batch_size=${train_prompt_bsz}
+ data.max_prompt_length=${max_prompt_length}
+ data.max_response_length=${max_response_length}
+ # Preprocessing
+ data.filter_overlong_prompts=False
+ data.truncation='left'
+ )
+
+ # Model Configuration
+ MODEL_CONFIG=(
+ # Model Path
+ actor_rollout_ref.model.path="${MODEL_PATH}"
+ # Model Processing
+ actor_rollout_ref.model.use_remove_padding=True
+ )
+
+ # Reinforcement Learning Algorithm Configuration
+ ALGORITHM_CONFIG=(
+ # Advantage Estimation
+ algorithm.adv_estimator=${adv_estimator}
+ # KL Divergence Control
+ algorithm.use_kl_in_reward=${use_kl_in_reward}
+ algorithm.kl_ctrl.kl_coef=${kl_coef}
+ )
+
+ ACTOR_CONFIG=(
+ # Core Runtime Settings
+ actor_rollout_ref.actor.use_torch_compile=False
+ actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz}
+ # Loss Function Configuration
+ actor_rollout_ref.actor.use_kl_loss=${use_kl_loss}
+ actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef}
+ actor_rollout_ref.actor.entropy_coeff=0
+ # PPO Training Parameters
+ actor_rollout_ref.actor.ppo_epochs=1
+ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1
+ actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len}
+ actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz}
+ # Optimizer Settings
+ actor_rollout_ref.actor.optim.lr=1e-6
+ # Megatron Parallelism Strategy
+ actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${train_tp}
+ actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${train_pp}
+ actor_rollout_ref.actor.megatron.context_parallel_size=${train_cp}
+ actor_rollout_ref.actor.megatron.expert_model_parallel_size=${train_ep}
+ actor_rollout_ref.actor.megatron.expert_tensor_parallel_size=${train_etp}
+ # Memory Optimization
+ actor_rollout_ref.actor.megatron.param_offload=${all_offload}
+ actor_rollout_ref.actor.megatron.optimizer_offload=${all_offload}
+ actor_rollout_ref.actor.megatron.grad_offload=${all_offload}
+ # Model Weights Management
+ actor_rollout_ref.actor.megatron.dist_checkpointing_path=${MCORE_MODEL_PATH}
+ actor_rollout_ref.actor.megatron.use_dist_checkpointing=True
+ actor_rollout_ref.actor.megatron.use_mbridge=False
+ # Transformer Architecture Optimizations
+ +actor_rollout_ref.actor.megatron.override_transformer_config.use_flash_attn=True
+ +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_method=uniform
+ +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=full
+ +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_num_layers=1
+ )
+
+ REF_CONFIG=(
+ # Core Runtime Settings
+ actor_rollout_ref.ref.use_torch_compile=False
+ # Log Probability Inference
+ actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1
+ actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz}
+ actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len}
+ # Megatron Parallelism Strategy
+ actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${train_tp}
+ actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=${train_pp}
+ actor_rollout_ref.ref.megatron.context_parallel_size=${train_cp}
+ actor_rollout_ref.ref.megatron.expert_model_parallel_size=${train_ep}
+ actor_rollout_ref.ref.megatron.expert_tensor_parallel_size=${train_etp}
+ # Memory Optimization
+ actor_rollout_ref.ref.megatron.param_offload=${all_offload}
+ # Model Weights Management
+ actor_rollout_ref.ref.megatron.dist_checkpointing_path=${MCORE_MODEL_PATH}
+ actor_rollout_ref.ref.megatron.use_dist_checkpointing=True
+ actor_rollout_ref.ref.megatron.use_mbridge=False
+ )
+
+ ROLLOUT_CONFIG=(
+ # Rollout Engine
+ actor_rollout_ref.rollout.name=sglang
+ +actor_rollout_ref.rollout.engine_kwargs.sglang.attention_backend="ascend"
+ # Generation Parameters
+ actor_rollout_ref.rollout.n=${n_resp_per_prompt}
+ actor_rollout_ref.rollout.top_p=1.0
+ actor_rollout_ref.rollout.top_k=-1
+ actor_rollout_ref.rollout.temperature=1.0
+ # Log Probability Inference
+ actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1
+ actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz}
+ actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len}
+ # Memory Management
+ actor_rollout_ref.rollout.gpu_memory_utilization=${gpu_memory_utilization}
+ # Parallelism Strategy
+ actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp}
+ actor_rollout_ref.rollout.data_parallel_size=${gen_dp}
+ actor_rollout_ref.rollout.expert_parallel_size=${gen_ep}
+ +actor_rollout_ref.rollout.engine_kwargs.sglang.enable_dp_attention=False
+ # Performance Optimization
+ +actor_rollout_ref.rollout.engine_kwargs.sglang.chunked_prefill_size=-1
+ actor_rollout_ref.rollout.enforce_eager=False
+ # Validation Generation
+ actor_rollout_ref.rollout.val_kwargs.n=1
+ actor_rollout_ref.rollout.val_kwargs.do_sample=True
+ actor_rollout_ref.rollout.val_kwargs.top_p=1.0
+ actor_rollout_ref.rollout.val_kwargs.top_k=-1
+ actor_rollout_ref.rollout.val_kwargs.temperature=1.0
+ )
+
+ TRAINER_CONFIG=(
+ # Logger Configuration
+ trainer.logger='["console"]'
+ # Project Settings
+ trainer.project_name="${project_name}"
+ trainer.experiment_name="${exp_name}"
+ # Hardware Configuration
+ trainer.nnodes="${NNODES}"
+ trainer.n_gpus_per_node="${NPUS_PER_NODE}"
+ trainer.device='npu'
+ # Training Schedule
+ # Training Schedule
200
+ trainer.total_epochs=15
201
+ trainer.val_before_train=False
202
+ trainer.test_freq=-1
203
+ trainer.save_freq=-1
204
+ # Checkpoint Directory
205
+ trainer.default_local_dir="${CKPTS_DIR}"
206
+ )
207
+
208
+ # profiling configuration
209
+ PROF_CONFIG=(
210
+ global_profiler.tool=npu
211
+ global_profiler.steps=null
212
+ global_profiler.save_path=/profpath
213
+ actor_rollout_ref.actor.profiler.enable=True
214
+ actor_rollout_ref.actor.profiler.ranks="[0]"
215
+ actor_rollout_ref.actor.profiler.all_ranks=False
216
+ actor_rollout_ref.actor.profiler.tool_config.npu.discrete=True
217
+ actor_rollout_ref.actor.profiler.tool_config.npu.contents=['npu','cpu']
218
+ actor_rollout_ref.actor.profiler.tool_config.npu.level=level0
219
+ actor_rollout_ref.actor.profiler.tool_config.npu.analysis=True
220
+ actor_rollout_ref.rollout.profiler.enable=True
221
+ actor_rollout_ref.rollout.profiler.ranks="[0]"
222
+ actor_rollout_ref.rollout.profiler.all_ranks=False
223
+ )
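+ # The NPU profiler writes traces under global_profiler.save_path (/profpath
+ # above); to run without profiling, drop "${PROF_CONFIG[@]}" from the launch
+ # command below.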
+
+ python3 -m verl.trainer.main_ppo \
+     --config-path=config \
+     --config-name='ppo_megatron_trainer.yaml' \
+     "${DATA_CONFIG[@]}" \
+     "${MODEL_CONFIG[@]}" \
+     "${ACTOR_CONFIG[@]}" \
+     "${REF_CONFIG[@]}" \
+     "${ROLLOUT_CONFIG[@]}" \
+     "${ALGORITHM_CONFIG[@]}" \
+     "${TRAINER_CONFIG[@]}" \
+     "${PROF_CONFIG[@]}" \
+     "$@"
code/RL_model/verl/verl_train/examples/grpo_trainer/run_seed_oss_36b.sh ADDED
@@ -0,0 +1,48 @@
+ set -x
+
+ python3 -m verl.trainer.main_ppo \
+     algorithm.adv_estimator=grpo \
+     data.train_files=$HOME/data/gsm8k/train.parquet \
+     data.val_files=$HOME/data/gsm8k/test.parquet \
+     data.train_batch_size=64 \
+     data.max_prompt_length=512 \
+     data.max_response_length=1024 \
+     data.filter_overlong_prompts=True \
+     data.truncation='error' \
+     actor_rollout_ref.model.path=ByteDance-Seed/Seed-OSS-36B-Base \
+     actor_rollout_ref.actor.optim.lr=1e-6 \
+     actor_rollout_ref.model.use_remove_padding=True \
+     actor_rollout_ref.model.enable_gradient_checkpointing=True \
+     actor_rollout_ref.model.use_fused_kernels=True \
+     actor_rollout_ref.actor.ppo_mini_batch_size=8 \
+     actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \
+     actor_rollout_ref.actor.use_kl_loss=True \
+     actor_rollout_ref.actor.kl_loss_coef=0.001 \
+     actor_rollout_ref.actor.kl_loss_type=low_var_kl \
+     actor_rollout_ref.actor.entropy_coeff=0 \
+     actor_rollout_ref.actor.use_dynamic_bsz=True \
+     actor_rollout_ref.actor.strategy=fsdp2 \
+     actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=True \
+     actor_rollout_ref.actor.fsdp_config.param_offload=True \
+     actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2 \
+     actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
+     actor_rollout_ref.rollout.name=vllm \
+     actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
+     actor_rollout_ref.rollout.n=2 \
+     actor_rollout_ref.rollout.free_cache_engine=True \
+     actor_rollout_ref.ref.log_prob_use_dynamic_bsz=True \
+     actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=2 \
+     actor_rollout_ref.ref.fsdp_config.param_offload=True \
+     actor_rollout_ref.ref.strategy=fsdp2 \
+     algorithm.use_kl_in_reward=False \
+     trainer.critic_warmup=0 \
+     trainer.logger='["console"]' \
+     trainer.project_name='verl_grpo_seed_oss_36b' \
+     trainer.experiment_name='seed_oss_36b' \
+     trainer.val_before_train=False \
+     trainer.n_gpus_per_node=8 \
+     trainer.nnodes=1 \
+     trainer.save_freq=20 \
+     trainer.test_freq=5 \
+     trainer.total_epochs=15 "$@"
code/RL_model/verl/verl_train/examples/gspo_trainer/run_qwen30b_gspo.sh ADDED
@@ -0,0 +1,197 @@
+ # Run Qwen3-30B GSPO with the new model engine
+ set -x
+
+ HDFS_ROOT=${HDFS_ROOT:-$PWD}
+ DATA_ROOT=${DATA_ROOT:-$PWD}
+
+ # wandb
+ backend=megatron # fsdp, fsdp2, megatron
+ project_name=wuxibin_gspo
+ experiment_name=qwen3-30B-base-grpo-$backend
+ default_local_dir=$DATA_ROOT/checkpoint/$project_name/$experiment_name
+
+ # ===================================== Algorithm =====================================
+ adv_estimator=grpo
+ loss_mode=gspo
+
+ # reference policy
+ use_kl_in_reward=False
+ kl_coef=0.001
+ use_kl_loss=False
+ kl_loss_coef=0.001
+
+ clip_ratio_low=3e-4
+ clip_ratio_high=4e-4
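+ # GSPO clips a sequence-level importance ratio, so the range is far tighter
+ # than token-level PPO clipping: roughly [1 - 3e-4, 1 + 4e-4] with the values
+ # above, per the GSPO paper's recommended settings.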
+
+ actor_lr=1e-6
+ critic_lr=2e-6
+ gae_gamma=1.0
+ gae_lam=0.95
+ critic_warmup=0
+
+ # ===================================== Data/Model =====================================
+ train_files=$DATA_ROOT/dataset/BytedTsinghua-SIA/DAPO-Math-17k/data/dapo-math-17k.parquet
+ test_files=$DATA_ROOT/dataset/aime-2024.parquet
+
+ actor_model_path=$HDFS_ROOT/model/Qwen3-30B-A3B-Base
+ critic_model_path=$actor_model_path
+
+ max_prompt_length=$((1024 * 2))
+ max_response_length=$((1024 * 8))
+ enable_overlong_buffer=True
+ overlong_buffer_len=$((1024 * 4))
+ overlong_penalty_factor=1.0
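+ # Sketch of the soft overlong penalty these knobs imply: responses longer than
+ # max_response_length - overlong_buffer_len (8192 - 4096 = 4096 tokens) are
+ # penalized linearly, reaching -overlong_penalty_factor at the full 8192 tokens.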
+
+ train_batch_size=256
+ ppo_mini_batch_size=32
+ n_resp_per_prompt=16
+ n_resp_per_prompt_val=1
+
+ # ===================================== Training =====================================
+ actor_max_token_len_per_gpu=$(((max_prompt_length + max_response_length) * 3))
+ critic_max_token_len_per_gpu=$(((max_prompt_length + max_response_length) * 4))
+
+ # FSDP parallelism config
+ USP_SIZE=4
+ ACTOR_FSDP_CONFIG="
+     actor_rollout_ref.actor.fsdp_config.strategy=$backend \
+     actor_rollout_ref.actor.fsdp_config.param_offload=True \
+     actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
+     actor_rollout_ref.actor.ulysses_sequence_parallel_size=$USP_SIZE"
+
+ # Megatron parallelism config
+ TP_SIZE=2
+ CP_SIZE=1
+ PP_SIZE=1
+ VPP_SIZE=null
+ EP_SIZE=8
+ ETP_SIZE=1
+ ACTOR_MEGATRON_CONFIG="
+     actor_rollout_ref.actor.megatron.tensor_model_parallel_size=$TP_SIZE \
+     actor_rollout_ref.actor.megatron.context_parallel_size=$CP_SIZE \
+     actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=$PP_SIZE \
+     actor_rollout_ref.actor.megatron.virtual_pipeline_model_parallel_size=$VPP_SIZE \
+     actor_rollout_ref.actor.megatron.expert_model_parallel_size=$EP_SIZE \
+     actor_rollout_ref.actor.megatron.expert_tensor_parallel_size=$ETP_SIZE \
+     actor_rollout_ref.actor.megatron.param_offload=True \
+     actor_rollout_ref.actor.megatron.grad_offload=True \
+     actor_rollout_ref.actor.megatron.optimizer_offload=True \
+     +actor_rollout_ref.actor.megatron.override_transformer_config.moe_router_dtype=fp32 \
+     +actor_rollout_ref.actor.megatron.override_transformer_config.moe_permute_fusion=True \
+     +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_method=uniform \
+     +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=full \
+     +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_num_layers=1 \
+     +actor_rollout_ref.actor.megatron.override_transformer_config.apply_rope_fusion=True \
+     +actor_rollout_ref.actor.megatron.override_transformer_config.gradient_accumulation_fusion=True \
+     actor_rollout_ref.actor.megatron.use_mbridge=True"
+
+ # Actor model config
+ ACTOR_CONFIG="
+     actor_rollout_ref.actor.optim.lr=$actor_lr \
+     actor_rollout_ref.model.path=$actor_model_path \
+     actor_rollout_ref.model.use_remove_padding=True \
+     actor_rollout_ref.actor.use_kl_loss=$use_kl_loss \
+     actor_rollout_ref.actor.kl_loss_coef=$kl_loss_coef \
+     actor_rollout_ref.actor.clip_ratio_low=$clip_ratio_low \
+     actor_rollout_ref.actor.clip_ratio_high=$clip_ratio_high \
+     actor_rollout_ref.actor.clip_ratio_c=10.0 \
+     actor_rollout_ref.actor.policy_loss.loss_mode=${loss_mode} \
+     actor_rollout_ref.actor.use_dynamic_bsz=True \
+     actor_rollout_ref.actor.ppo_mini_batch_size=$ppo_mini_batch_size \
+     actor_rollout_ref.actor.ppo_max_token_len_per_gpu=$actor_max_token_len_per_gpu"
+
+ # Critic model config
+ CRITIC_CONFIG="
+     critic.optim.lr=$critic_lr \
+     critic.model.path=$critic_model_path \
+     critic.model.use_remove_padding=True \
+     critic.ppo_max_token_len_per_gpu=$critic_max_token_len_per_gpu \
+     critic.ulysses_sequence_parallel_size=$USP_SIZE"
+
+ CRITIC_FSDP_CONFIG="${ACTOR_FSDP_CONFIG//actor_rollout_ref.actor/critic.model}"
+ CRITIC_MEGATRON_CONFIG="${ACTOR_MEGATRON_CONFIG//actor_rollout_ref.actor/critic}"
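+ # The two lines above derive the critic flags from the actor flags with bash
+ # pattern substitution ("${var//pattern/replacement}"); illustratively:
+ #   s="actor_rollout_ref.actor.megatron.param_offload=True"
+ #   echo "${s//actor_rollout_ref.actor/critic}"  # -> critic.megatron.param_offload=True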
+
+ if [[ $backend == "megatron" ]]; then
+     CONFIG_NAME=ppo_megatron_trainer
+     ACTOR_CONFIG="$ACTOR_CONFIG $ACTOR_MEGATRON_CONFIG"
+     if [[ $adv_estimator == "gae" ]]; then
+         CRITIC_CONFIG="$CRITIC_CONFIG $CRITIC_MEGATRON_CONFIG"
+     else
+         CRITIC_CONFIG=""
+     fi
+ else # fsdp, fsdp2
+     CONFIG_NAME=ppo_trainer
+     ACTOR_CONFIG="$ACTOR_CONFIG $ACTOR_FSDP_CONFIG"
+     if [[ $adv_estimator == "gae" ]]; then
+         CRITIC_CONFIG="$CRITIC_CONFIG $CRITIC_FSDP_CONFIG"
+     else
+         CRITIC_CONFIG=""
+     fi
+ fi
+
+ # ===================================== Inference =====================================
+ rollout_name=vllm
+ if [ "$rollout_name" = "vllm" ]; then
+     export VLLM_USE_V1=1
+ fi
+ infer_tp=4
+ infer_dp=1
+ infer_ep=1
+ gpu_memory_utilization=0.8
+
+ ROLLOUT_CONFIG="
+     actor_rollout_ref.rollout.name=$rollout_name \
+     actor_rollout_ref.rollout.mode=async \
+     actor_rollout_ref.rollout.tensor_model_parallel_size=$infer_tp \
+     actor_rollout_ref.rollout.data_parallel_size=$infer_dp \
+     actor_rollout_ref.rollout.expert_parallel_size=$infer_ep \
+     actor_rollout_ref.rollout.gpu_memory_utilization=$gpu_memory_utilization \
+     actor_rollout_ref.rollout.n=$n_resp_per_prompt \
+     actor_rollout_ref.rollout.val_kwargs.top_p=0.7 \
+     actor_rollout_ref.rollout.val_kwargs.temperature=1.0 \
+     actor_rollout_ref.rollout.val_kwargs.n=$n_resp_per_prompt_val"
+
+ # ===================================== Reward =====================================
+ REWARD_CONFIG="
+     reward_model.reward_manager=dapo \
+     +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
+     +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
+     +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
+     +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
+     +reward_model.reward_kwargs.max_resp_len=${max_response_length}"
+
+ python3 -m verl.trainer.main_ppo \
+     --config-path=./config \
+     --config-name=$CONFIG_NAME \
+     algorithm.adv_estimator=$adv_estimator \
+     algorithm.use_kl_in_reward=$use_kl_in_reward \
+     algorithm.kl_ctrl.kl_coef=$kl_coef \
+     algorithm.gamma=$gae_gamma \
+     algorithm.lam=$gae_lam \
+     data.train_files="$train_files" \
+     data.val_files="$test_files" \
+     data.return_raw_chat=True \
+     data.train_batch_size=$train_batch_size \
+     data.max_prompt_length=$max_prompt_length \
+     data.max_response_length=$max_response_length \
+     data.filter_overlong_prompts=True \
+     data.filter_overlong_prompts_workers=64 \
+     data.truncation='error' \
+     trainer.use_legacy_worker_impl=disable \
+     trainer.critic_warmup=$critic_warmup \
+     trainer.logger=['console','wandb'] \
+     trainer.project_name=$project_name \
+     trainer.experiment_name=$experiment_name \
+     trainer.default_local_dir=$default_local_dir \
+     trainer.n_gpus_per_node=$ARNOLD_WORKER_GPU \
+     trainer.nnodes=$ARNOLD_WORKER_NUM \
+     trainer.val_before_train=False \
+     trainer.log_val_generations=100 \
+     trainer.save_freq=-1 \
+     trainer.test_freq=10 \
+     trainer.total_epochs=10 \
+     trainer.total_training_steps=500 \
+     $ACTOR_CONFIG \
+     $CRITIC_CONFIG \
+     $ROLLOUT_CONFIG \
+     $REWARD_CONFIG
code/RL_model/verl/verl_train/examples/gspo_trainer/run_qwen3_32b_gspo_npu.sh ADDED
@@ -0,0 +1,199 @@
+ #!/usr/bin/env bash
+ set -xeuo pipefail
+ mkdir -p logs
+ ulimit -n 32768
+
+ ## Basic Environment Settings
+ export RAY_DEDUP_LOGS=0
+ export HYDRA_FULL_ERROR=1
+ export TASK_QUEUE_ENABLE=1
+ export HCCL_EXEC_TIMEOUT=3600
+ export HCCL_CONNECT_TIMEOUT=3600
+ export HCCL_ASYNC_ERROR_HANDLING=0
+ export CPU_AFFINITY_CONF=1
+ export VLLM_USE_V1=1
+ export VLLM_ATTENTION_BACKEND=XFORMERS
+ export VLLM_ASCEND_ENABLE_FLASHCOMM=1
+ export VLLM_ASCEND_ENABLE_PREFETCH_MLP=1
+ export VLLM_ASCEND_ENABLE_DENSE_OPTIMIZE=1
+ export LD_PRELOAD=/usr/local/lib/libjemalloc.so.2
+
+ # Project Configuration
+ project_name='GSPO-Qwen3-32B-BASE-MATH'
+ exp_name='GSPO-Qwen3-32B-BASE-Megatron-vLLM'
+
+ # Node Info
+ NNODES=${NNODES:-4}
+ NPUS_PER_NODE=${NPUS_PER_NODE:-16}
+
+ # Model Weights Paths
+ MODEL_PATH=Qwen/Qwen3-32B
+ RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
+ CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}
+
+ # File System Paths
+ TRAIN_FILE=$RAY_DATA_HOME/dataset/dapo-math-17k.parquet
+ TEST_FILE=$RAY_DATA_HOME/dataset/aime-2024.parquet
+
+ # Ray Configuration
+ WORKING_DIR=${WORKING_DIR:-"${PWD}"}
+ RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
+
+ # Data Length Configuration
+ max_prompt_length=$((1024 * 16))
+ max_response_length=$((1024 * 16))
+
+ # Training Batch Configuration
+ train_prompt_bsz=256
+ gen_prompt_bsz=$((train_prompt_bsz * 1))
+ train_prompt_mini_bsz=64
+ n_resp_per_prompt=16
+
+ # GSPO Loss Configuration
+ adv_estimator=grpo
+ loss_mode=gspo
+ use_kl_in_reward=False
+ kl_coef=0.0
+ use_kl_loss=False
+ kl_loss_coef=0.0
+ clip_ratio_low=0.0003
+ clip_ratio_high=0.0004
+ loss_agg_mode="seq-mean-token-mean"
+
+ # FSDP Parallelism Configuration
+ actor_strategy=fsdp2
+ ref_strategy=fsdp2
+ sp_size=4
+ fsdp_size=-1
+
+ # Performance and Memory Management Configuration
+ # (sp_size must be set before these divisions; an unset name evaluates to 0
+ # in bash arithmetic and would trigger a division-by-zero error)
+ offload=True
+ use_dynamic_bsz=True
+ actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) / sp_size))
+ infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) / sp_size))
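+ # With sp_size=4 these budgets resolve to (16384 + 16384) / 4 = 8192 tokens
+ # per rank for both the training and log-prob passes.
+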
+ # vLLM Configuration
+ gen_tp=4
+ gpu_memory_utilization=0.9
+ max_model_len=$((max_prompt_length + max_response_length))
+ max_num_batched_tokens=$((max_prompt_length + max_response_length))
+
+ # Data Configuration
+ DATA_CONFIG=(
+     data.train_files="${TRAIN_FILE}"
+     data.val_files="${TEST_FILE}"
+     data.prompt_key=prompt
+     data.train_batch_size=${train_prompt_bsz}
+     +data.gen_batch_size=${gen_prompt_bsz}
+     data.max_prompt_length=${max_prompt_length}
+     data.max_response_length=${max_response_length}
+     data.truncation='left'
+ )
+
+ # Model Configuration
+ MODEL_CONFIG=(
+     actor_rollout_ref.model.path="${MODEL_PATH}"
+     actor_rollout_ref.model.use_remove_padding=True
+     actor_rollout_ref.model.enable_gradient_checkpointing=True
+ )
+
+ # Algorithm Configuration
+ ALGORITHM_CONFIG=(
+     algorithm.adv_estimator=${adv_estimator}
+     algorithm.use_kl_in_reward=${use_kl_in_reward}
+     algorithm.kl_ctrl.kl_coef=${kl_coef}
+ )
+
+ # Actor Model Configuration
+ ACTOR_CONFIG=(
+     actor_rollout_ref.actor.use_torch_compile=False
+     actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz}
+     actor_rollout_ref.actor.strategy=${actor_strategy}
+     actor_rollout_ref.actor.policy_loss.loss_mode=${loss_mode}
+     actor_rollout_ref.actor.use_kl_loss=${use_kl_loss}
+     actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef}
+     actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low}
+     actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high}
+     actor_rollout_ref.actor.clip_ratio_c=10.0
+     actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode}
+     actor_rollout_ref.actor.entropy_coeff=0
+     actor_rollout_ref.actor.grad_clip=1.0
+     actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len}
+     actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz}
+     actor_rollout_ref.actor.optim.lr=1e-6
+     actor_rollout_ref.actor.optim.lr_warmup_steps=10
+     actor_rollout_ref.actor.optim.weight_decay=0.1
+     actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size}
+     actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size}
+     actor_rollout_ref.actor.fsdp_config.param_offload=${offload}
+     actor_rollout_ref.actor.fsdp_config.optimizer_offload=${offload}
+     actor_rollout_ref.actor.fsdp_config.forward_prefetch=True
+     actor_rollout_ref.actor.entropy_checkpointing=True
+     actor_rollout_ref.actor.entropy_from_logits_with_chunking=True
+ )
+
+ # Reference Model Configuration
+ REF_CONFIG=(
+     actor_rollout_ref.ref.use_torch_compile=False
+     actor_rollout_ref.ref.strategy=${ref_strategy}
+     actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz}
+     actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len}
+     actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size}
+     actor_rollout_ref.ref.fsdp_config.param_offload=${offload}
+     actor_rollout_ref.ref.fsdp_config.forward_prefetch=True
+     actor_rollout_ref.ref.entropy_checkpointing=True
+     actor_rollout_ref.ref.entropy_from_logits_with_chunking=True
+ )
+
+ # Rollout Configuration
+ ROLLOUT_CONFIG=(
+     actor_rollout_ref.rollout.name=vllm
+     actor_rollout_ref.rollout.calculate_log_probs=True
+     actor_rollout_ref.rollout.n=${n_resp_per_prompt}
+     actor_rollout_ref.rollout.top_p=1.0
+     actor_rollout_ref.rollout.top_k=-1
+     actor_rollout_ref.rollout.temperature=1.0
+     actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz}
+     actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len}
+     actor_rollout_ref.rollout.gpu_memory_utilization=${gpu_memory_utilization}
+     actor_rollout_ref.rollout.max_num_batched_tokens=${max_num_batched_tokens}
+     actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp}
+     actor_rollout_ref.rollout.enable_chunked_prefill=True
+     actor_rollout_ref.rollout.enforce_eager=False
+     actor_rollout_ref.rollout.free_cache_engine=True
+     +actor_rollout_ref.rollout.engine_kwargs.vllm.compilation_config.cudagraph_capture_sizes="[8, 16, 32, 64, 128, 192, 256, 384]"
+     +actor_rollout_ref.rollout.engine_kwargs.vllm.compilation_config.cudagraph_mode="FULL_DECODE_ONLY"
+     actor_rollout_ref.rollout.val_kwargs.n=1
+     actor_rollout_ref.rollout.val_kwargs.do_sample=True
+     actor_rollout_ref.rollout.val_kwargs.top_p=0.7
+     actor_rollout_ref.rollout.val_kwargs.top_k=-1
+     actor_rollout_ref.rollout.val_kwargs.temperature=1.0
+ )
+
+ # Trainer Configuration
+ TRAINER_CONFIG=(
+     trainer.logger='["console"]'
+     trainer.project_name="${project_name}"
+     trainer.experiment_name="${exp_name}"
+     trainer.nnodes="${NNODES}"
+     trainer.n_gpus_per_node="${NPUS_PER_NODE}"
+     trainer.device='npu'
+     trainer.total_epochs=10
+     trainer.val_before_train=False
+     trainer.test_freq=-1
+     trainer.save_freq=100
+     trainer.default_local_dir="${CKPTS_DIR}"
+     trainer.resume_mode=auto
+     trainer.balance_batch=True
+ )
+
+ # Main GSPO Training Command
+ python3 -m verl.trainer.main_ppo \
+     "${DATA_CONFIG[@]}" \
+     "${MODEL_CONFIG[@]}" \
+     "${ACTOR_CONFIG[@]}" \
+     "${REF_CONFIG[@]}" \
+     "${ROLLOUT_CONFIG[@]}" \
+     "${ALGORITHM_CONFIG[@]}" \
+     "${TRAINER_CONFIG[@]}" \
+     "$@" | tee logs/run_qwen3_32b_gspo_megatron_vllm_npu.log
code/RL_model/verl/verl_train/examples/gspo_trainer/test_gspo_3b_math.sh ADDED
@@ -0,0 +1,195 @@
+ #!/usr/bin/env bash
+ #SBATCH --job-name=rl-gspo-3B
+ #SBATCH --partition=main
+ #SBATCH --nodes=1 # Number of nodes
+ #SBATCH --ntasks-per-node=1 # One task per node
+ #SBATCH --cpus-per-task=128 # cpu-cores per task
+ #SBATCH --gres=gpu:8
+ #SBATCH --mem=0
+ #SBATCH --exclusive
+ #SBATCH --time=500:00:00
+ #SBATCH --output=/rl/logs/Qwen2.5-3B/gspo/math/vllm_%x_%j.out
+ #SBATCH --error=/rl/logs/Qwen2.5-3B/gspo/math/vllm_%x_%j.err
+
+ set -xeuo pipefail
+
+ # Activate the conda environment
+ echo "Activating verl environment..."
+ eval "$(conda shell.bash hook)"
+ conda deactivate
+ conda activate verl
+
+ # These can make training faster, depending on your infrastructure
+ export NCCL_IBEXT_DISABLE=1
+ export NCCL_NVLS_ENABLE=1
+ export NCCL_IB_HCA=mlx5
+ export UCX_NET_DEVICES=mlx5_0:1,mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1
+
+ # Set how many GPUs we actually have on this node.
+ export GPUS_PER_NODE=8
+
+ NNODES=${SLURM_JOB_NUM_NODES}
+ export NNODES
+
+ export VLLM_ATTENTION_BACKEND=FLASH_ATTN
+ export RAY_LOGGING_LEVEL=DEBUG
+ export HYDRA_FULL_ERROR=1
+ export WANDB_API_KEY=... # your wandb API key
+
+ echo "Using $NNODES nodes for training..."
+
+ # ------------------------------------- Experiment parameters ---------------------------------------
+ project_name='RL-GSPO'
+
+ adv_estimator=grpo
+ loss_mode=gspo
+ loss_agg_mode="seq-mean-token-mean"
+ MODEL_PATH=Qwen/Qwen2.5-3B-Instruct
+ offload=false # it's a small model, offloading would just slow down training
+ rollout_engine=vllm
+ rollout_mode=async
+ return_raw_chat="True"
+ if [ "$rollout_engine" = "vllm" ]; then
+     export VLLM_USE_V1=1
+ fi
+ gpu_memory_utilization=0.8
+ reward_manager=dapo
+ shuffle_dataset=true
+ first_time_dataset_prep=true # prepare the dataset on the first run
+
+ test_freq=10
+ save_freq=10
+ total_epochs=10
+ total_training_steps=500
+ val_before_train=false
+
+ use_kl_in_reward=false
+ kl_coef=0.0
+ use_kl_loss=false
+ kl_loss_coef=0.0
+
+ clip_ratio_low=0.0003 # as recommended by the paper, see Sec. 5.1
+ clip_ratio_high=0.0004 # as recommended by the paper, see Sec. 5.1
+ train_batch_size=512
+ ppo_mini_batch_size=128 # maintain 4 mini-batches as recommended by the paper, see Sec. 5.1
+ ppo_micro_batch_size_per_gpu=8 # set depending on your GPU memory
+ n_resp_per_prompt=16
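+ # Worked out: train_batch_size / ppo_mini_batch_size = 512 / 128 = 4 PPO
+ # mini-batches per rollout batch, matching the paper recommendation noted above.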
+
+ max_prompt_length=$((1024 * 2))
+ max_response_length=$((1024 * 8))
+ # dapo reward manager params
+ enable_overlong_buffer=false # set to true to enable the DAPO overlong penalty
+ overlong_buffer_len=$((1024 * 4))
+ overlong_penalty_factor=1.0
+
+ # Paths and naming
+ SFT_MODEL=$(basename "$MODEL_PATH")
+ exp_name="${loss_mode}-epslow-${clip_ratio_low}-epshigh-${clip_ratio_high}-${SFT_MODEL}-RL"
+ CKPTS_DIR=/rl/checkpoints/experimental/4b/${loss_mode}/${exp_name}
+
+ # Sampling params for rollouts
+ temperature=1.0
+ top_p=1.0
+ top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
+ val_top_p=0.7
+
+ # Performance-related parameters
+ sp_size=1
+ use_dynamic_bsz=true
+ actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
+ infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
+ gen_tp=1
+ entropy_checkpointing=true # recompute the entropy term instead of caching it, lowering training memory usage
+
+ # ------------------------------------- train/val data preparation ---------------------------------------
+ if [ "$first_time_dataset_prep" = true ]; then
+     echo "Preprocessing GSM8K dataset..."
+     python examples/data_preprocess/gsm8k.py --local_save_dir /data/gsm8k/
+ fi
+
+ gsm8k_train_path=/data/gsm8k/train.parquet
+ gsm8k_test_path=/data/gsm8k/test.parquet
+
+ # set the paths
+ train_files="['$gsm8k_train_path']"
+ test_files="['$gsm8k_test_path']"
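+ # The Hydra-style list can hold several datasets; illustrative sketch with a
+ # hypothetical second file:
+ #   train_files="['$gsm8k_train_path','/data/math/train.parquet']"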
+
+ python3 -m verl.trainer.main_ppo \
+     algorithm.adv_estimator=${adv_estimator} \
+     actor_rollout_ref.actor.policy_loss.loss_mode=${loss_mode} \
+     data.train_files="${train_files}" \
+     data.val_files="${test_files}" \
+     data.shuffle=$shuffle_dataset \
+     data.prompt_key=prompt \
+     data.truncation='error' \
+     data.filter_overlong_prompts=true \
+     data.return_raw_chat=${return_raw_chat} \
+     data.train_batch_size=${train_batch_size} \
+     data.max_prompt_length=${max_prompt_length} \
+     data.max_response_length=${max_response_length} \
+     actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
+     algorithm.use_kl_in_reward=${use_kl_in_reward} \
+     algorithm.kl_ctrl.kl_coef=${kl_coef} \
+     actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
+     actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
+     actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
+     actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
+     actor_rollout_ref.model.use_remove_padding=true \
+     actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
+     actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+     actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+     actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
+     actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+     actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+     actor_rollout_ref.rollout.name=${rollout_engine} \
+     actor_rollout_ref.rollout.mode=${rollout_mode} \
+     actor_rollout_ref.model.path="${MODEL_PATH}" \
+     actor_rollout_ref.model.enable_gradient_checkpointing=true \
+     actor_rollout_ref.actor.optim.lr=1e-6 \
+     actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.05 \
+     actor_rollout_ref.actor.optim.weight_decay=0.1 \
+     actor_rollout_ref.actor.ppo_mini_batch_size=${ppo_mini_batch_size} \
+     actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=${ppo_micro_batch_size_per_gpu} \
+     actor_rollout_ref.actor.fsdp_config.param_offload=${offload} \
+     actor_rollout_ref.actor.fsdp_config.optimizer_offload=${offload} \
+     actor_rollout_ref.actor.entropy_coeff=0 \
+     actor_rollout_ref.actor.grad_clip=1.0 \
+     actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
+     actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \
+     actor_rollout_ref.rollout.gpu_memory_utilization=${gpu_memory_utilization} \
+     actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
+     actor_rollout_ref.rollout.enable_chunked_prefill=true \
+     actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
+     actor_rollout_ref.rollout.temperature=${temperature} \
+     actor_rollout_ref.rollout.top_p=${top_p} \
+     actor_rollout_ref.rollout.top_k=${top_k} \
+     actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
+     actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
+     actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
+     actor_rollout_ref.rollout.val_kwargs.do_sample=true \
+     actor_rollout_ref.rollout.val_kwargs.n=1 \
+     actor_rollout_ref.ref.fsdp_config.param_offload=${offload} \
+     actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \
+     actor_rollout_ref.actor.entropy_checkpointing=${entropy_checkpointing} \
+     reward_model.reward_manager=${reward_manager} \
+     +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
+     +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
+     +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
+     +reward_model.reward_kwargs.overlong_buffer_cfg.log=false \
+     +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
+     trainer.logger='["console","wandb"]' \
+     trainer.project_name="${project_name}" \
+     trainer.experiment_name="${exp_name}" \
+     trainer.n_gpus_per_node="${GPUS_PER_NODE}" \
+     trainer.nnodes="${NNODES}" \
+     trainer.val_before_train=${val_before_train} \
+     trainer.test_freq=${test_freq} \
+     trainer.save_freq=${save_freq} \
+     trainer.total_epochs=${total_epochs} \
+     trainer.total_training_steps=${total_training_steps} \
+     trainer.default_local_dir="${CKPTS_DIR}" \
+     trainer.resume_mode=auto \
+     trainer.log_val_generations=2 \
+     "$@"