shahidul034 committed
Commit d0f96bf · verified · 1 Parent(s): ff8fd11

Add files using upload-large-folder tool

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. code/RL_model/verl/verl_train/examples/data_preprocess/aime2024_multiturn_w_tool.py +79 -0
  2. code/RL_model/verl/verl_train/examples/data_preprocess/dapo_multiturn_w_tool.py +79 -0
  3. code/RL_model/verl/verl_train/examples/data_preprocess/full_hh_rlhf.py +161 -0
  4. code/RL_model/verl/verl_train/examples/data_preprocess/geo3k.py +102 -0
  5. code/RL_model/verl/verl_train/examples/data_preprocess/geo3k_multiturn_w_tool.py +120 -0
  6. code/RL_model/verl/verl_train/examples/data_preprocess/gsm8k.py +105 -0
  7. code/RL_model/verl/verl_train/examples/data_preprocess/gsm8k_multiturn_sft.py +102 -0
  8. code/RL_model/verl/verl_train/examples/data_preprocess/gsm8k_multiturn_w_interaction.py +119 -0
  9. code/RL_model/verl/verl_train/examples/data_preprocess/gsm8k_multiturn_w_tool.py +129 -0
  10. code/RL_model/verl/verl_train/examples/data_preprocess/gsm8k_tool_agent_loop.py +130 -0
  11. code/RL_model/verl/verl_train/examples/data_preprocess/hellaswag.py +108 -0
  12. code/RL_model/verl/verl_train/examples/data_preprocess/math_dataset.py +106 -0
  13. code/RL_model/verl/verl_train/examples/data_preprocess/multiturn.py +125 -0
  14. code/RL_model/verl/verl_train/examples/data_preprocess/pokemon.py +75 -0
  15. code/RL_model/verl/verl_train/examples/data_preprocess/preprocess_search_r1_dataset.py +178 -0
  16. code/RL_model/verl/verl_train/examples/gmpo_trainer/README.md +59 -0
  17. code/RL_model/verl/verl_train/examples/gmpo_trainer/run_qwen2_5-7b_math.sh +60 -0
  18. code/RL_model/verl/verl_train/examples/gmpo_trainer/test_dapo_7b_math.sh +138 -0
  19. code/RL_model/verl/verl_train/examples/gmpo_trainer/test_dapo_qwen3_30b_math.sh +134 -0
  20. code/RL_model/verl/verl_train/examples/grpo_trainer/run_deepseek7b_llm_math_megatron.sh +50 -0
  21. code/RL_model/verl/verl_train/examples/grpo_trainer/run_glm41v_9b.sh +46 -0
  22. code/RL_model/verl/verl_train/examples/grpo_trainer/run_minicpmo2_6.sh +49 -0
  23. code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2-7b_sgl_megatron.sh +47 -0
  24. code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2_5-32b_grpo_megatron_vllm_npu.sh +185 -0
  25. code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2_5-3b_gsm8k_grpo_lora_from_adapter.sh +47 -0
  26. code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2_5_7b_grpo_e2e_prof_npu.sh +68 -0
  27. code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2_5_vl-7b.sh +47 -0
  28. code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen3-32b_npu.sh +58 -0
  29. code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen3-8b.sh +43 -0
  30. code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen3_8b_grpo_sglang_1k_spmd_npu.sh +71 -0
  31. code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen3_vl-8b-megatron.sh +86 -0
  32. code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen3moe-30b_grpo_megatron_vllm_npu.sh +188 -0
  33. code/RL_model/verl/verl_train/examples/mtp_trainer/runtime_env.yaml +17 -0
  34. code/RL_model/verl/verl_train/examples/mtp_trainer/test_dapo_mimo_7b_with_mtp_math_megatron.sh +144 -0
  35. code/RL_model/verl/verl_train/examples/ppo_trainer/README.md +103 -0
  36. code/RL_model/verl/verl_train/examples/ppo_trainer/run_deepseek7b_llm.sh +42 -0
  37. code/RL_model/verl/verl_train/examples/ppo_trainer/run_deepseek7b_llm_modelscope.sh +42 -0
  38. code/RL_model/verl/verl_train/examples/ppo_trainer/run_deepseek7b_llm_pfppo.sh +45 -0
  39. code/RL_model/verl/verl_train/examples/ppo_trainer/run_deepseek7b_llm_sandbox_fusion.sh +44 -0
  40. code/RL_model/verl/verl_train/examples/ppo_trainer/run_deepseek7b_llm_sp2.sh +43 -0
  41. code/RL_model/verl/verl_train/examples/ppo_trainer/run_deepseek_full_hh_rlhf.sh +45 -0
  42. code/RL_model/verl/verl_train/examples/ppo_trainer/run_deepseek_math_gsm8k_megatron.sh +49 -0
  43. code/RL_model/verl/verl_train/examples/ppo_trainer/run_deepseek_math_gsm8k_megatron_nsys.sh +65 -0
  44. code/RL_model/verl/verl_train/examples/ppo_trainer/run_gemma.sh +40 -0
  45. code/RL_model/verl/verl_train/examples/ppo_trainer/run_moonlight16b_a3b_gsm8k_megatron.sh +106 -0
  46. code/RL_model/verl/verl_train/examples/ppo_trainer/run_qwen1.5_moe_a2.7b-gsm8k_megatron.sh +73 -0
  47. code/RL_model/verl/verl_train/examples/ppo_trainer/run_qwen2-7b_math_gsm8k_megatron.sh +47 -0
  48. code/RL_model/verl/verl_train/examples/ppo_trainer/run_qwen2-7b_rm.sh +75 -0
  49. code/RL_model/verl/verl_train/examples/ppo_trainer/run_qwen2-7b_rm_legacy.sh +63 -0
  50. code/RL_model/verl/verl_train/examples/ppo_trainer/run_qwen2-7b_rm_reward_loop_colocate.sh +69 -0
code/RL_model/verl/verl_train/examples/data_preprocess/aime2024_multiturn_w_tool.py ADDED
@@ -0,0 +1,79 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2023-2024 SGLang Team
# Copyright 2025 ModelBest Inc. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Preprocess the AIME-2024 dataset to multiturn format
"""

import argparse
import os

import datasets

from verl.utils.hdfs_io import copy, makedirs

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--local_dir", default=None, help="The save directory for the preprocessed dataset.")
    parser.add_argument("--hdfs_dir", default=None)
    parser.add_argument("--local_dataset_path", default=None, help="The local path to the raw dataset, if it exists.")
    parser.add_argument(
        "--local_save_dir", default="~/data/retool_aime2024", help="The save directory for the preprocessed dataset."
    )

    args = parser.parse_args()
    local_dataset_path = args.local_dataset_path

    data_path = "BytedTsinghua-SIA/AIME-2024"

    if local_dataset_path is not None:
        dataset = datasets.load_dataset(local_dataset_path, "default")
    else:
        dataset = datasets.load_dataset(data_path, "default")

    train_dataset = dataset["train"]

    # attach the tool kwargs each example needs for multiturn rollout
    def make_map_fn(split):
        def process_fn(example, idx):
            orig_extra_info = example.pop("extra_info")
            extra_info = orig_extra_info.copy()
            extra_info["need_tools_kwargs"] = True
            extra_info["tools_kwargs"] = {
                "code_interpreter": {
                    "create_kwargs": {
                        "ground_truth": example["reward_model"]["ground_truth"],
                    },
                },
            }
            example["extra_info"] = extra_info
            return example

        return process_fn

    train_dataset = train_dataset.map(function=make_map_fn("train"), with_indices=True)

    hdfs_dir = args.hdfs_dir
    local_save_dir = args.local_dir
    if local_save_dir is not None:
        print("Warning: Argument 'local_dir' is deprecated. Please use 'local_save_dir' instead.")
    else:
        local_save_dir = args.local_save_dir

    train_dataset.to_parquet(os.path.join(local_save_dir, "train.parquet"))

    if hdfs_dir is not None:
        makedirs(hdfs_dir)
        copy(src=local_save_dir, dst=hdfs_dir)
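The retool-style preprocessors in this commit all share one transformation: copy `extra_info`, flag `need_tools_kwargs`, and attach per-tool `create_kwargs`. A minimal sketch of that step on a hypothetical record (field values are invented for illustration; real rows come from the Hugging Face dataset above):

# Sketch only: `record` is a hypothetical row, not taken from the dataset.
record = {
    "reward_model": {"style": "rule", "ground_truth": "204"},
    "extra_info": {"split": "train", "index": 0},
}

extra_info = record.pop("extra_info").copy()
extra_info["need_tools_kwargs"] = True  # mark that this sample carries tool-creation kwargs
extra_info["tools_kwargs"] = {
    "code_interpreter": {
        # passed when the tool instance is created for this sample
        "create_kwargs": {"ground_truth": record["reward_model"]["ground_truth"]},
    },
}
record["extra_info"] = extra_info
assert record["extra_info"]["tools_kwargs"]["code_interpreter"]["create_kwargs"]["ground_truth"] == "204"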
code/RL_model/verl/verl_train/examples/data_preprocess/dapo_multiturn_w_tool.py ADDED
@@ -0,0 +1,79 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2023-2024 SGLang Team
# Copyright 2025 ModelBest Inc. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Preprocess the DAPO-Math-17k dataset to multiturn format
"""

import argparse
import os

import datasets

from verl.utils.hdfs_io import copy, makedirs

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--local_dir", default=None, help="The save directory for the preprocessed dataset.")
    parser.add_argument("--hdfs_dir", default=None)
    parser.add_argument("--local_dataset_path", default=None, help="The local path to the raw dataset, if it exists.")
    parser.add_argument(
        "--local_save_dir", default="~/data/retool_dapo", help="The save directory for the preprocessed dataset."
    )

    args = parser.parse_args()
    local_dataset_path = args.local_dataset_path

    data_path = "BytedTsinghua-SIA/DAPO-Math-17k"

    if local_dataset_path is not None:
        dataset = datasets.load_dataset(local_dataset_path, "default")
    else:
        dataset = datasets.load_dataset(data_path, "default")

    train_dataset = dataset["train"]

    # attach the tool kwargs each example needs for multiturn rollout
    def make_map_fn(split):
        def process_fn(example, idx):
            orig_extra_info = example.pop("extra_info")
            extra_info = orig_extra_info.copy()
            extra_info["need_tools_kwargs"] = True
            extra_info["tools_kwargs"] = {
                "code_interpreter": {
                    "create_kwargs": {
                        "ground_truth": example["reward_model"]["ground_truth"],
                    },
                },
            }
            example["extra_info"] = extra_info
            return example

        return process_fn

    train_dataset = train_dataset.map(function=make_map_fn("train"), with_indices=True)

    hdfs_dir = args.hdfs_dir
    local_save_dir = args.local_dir
    if local_save_dir is not None:
        print("Warning: Argument 'local_dir' is deprecated. Please use 'local_save_dir' instead.")
    else:
        local_save_dir = args.local_save_dir

    train_dataset.to_parquet(os.path.join(local_save_dir, "train.parquet"))

    if hdfs_dir is not None:
        makedirs(hdfs_dir)
        copy(src=local_save_dir, dst=hdfs_dir)
code/RL_model/verl/verl_train/examples/data_preprocess/full_hh_rlhf.py ADDED
@@ -0,0 +1,161 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
- Preprocess data and split the training set into 75% for training the RM and 25% for validating the RM.
- All the training data is used to train SFT and RL.
- Both chosen and rejected responses are used to train SFT.
"""

import argparse
import os

import pandas as pd
from datasets import load_dataset
from tqdm.auto import tqdm

from verl.utils.fs import copy, makedirs


def generate_sft_dataset(target_hdfs_path_dir, local_dir="~/data/full_hh_rlh/sft", local_dataset_path=None):
    if local_dataset_path is not None:
        dataset = load_dataset(local_dataset_path)
    else:
        dataset = load_dataset("Dahoas/full-hh-rlhf")
    output = {"prompt": [], "response": []}
    for data in tqdm(dataset["train"]):
        # add chosen
        output["prompt"].append(data["prompt"])
        output["response"].append(data["chosen"])

        # add rejected
        output["prompt"].append(data["prompt"])
        output["response"].append(data["rejected"])

    df = pd.DataFrame(output)

    local_dir = os.path.expanduser(local_dir)
    os.makedirs(local_dir, exist_ok=True)

    local_path = os.path.join(local_dir, "train.parquet")

    df.to_parquet(path=local_path)

    if target_hdfs_path_dir is not None:
        hdfs_dir = target_hdfs_path_dir + "/" + "train.parquet"
        makedirs(hdfs_dir)

        copy(local_path, hdfs_dir)


def generate_rm_dataset(target_hdfs_path_dir, local_dir="~/data/full_hh_rlh/rm", local_dataset_path=None):
    if local_dataset_path is not None:
        train_dataset = load_dataset(local_dataset_path, split="train[:75%]")
        test_dataset = load_dataset(local_dataset_path, split="train[-25%:]")
    else:
        train_dataset = load_dataset("Dahoas/full-hh-rlhf", split="train[:75%]")
        test_dataset = load_dataset("Dahoas/full-hh-rlhf", split="train[-25%:]")

    local_dir = os.path.expanduser(local_dir)
    os.makedirs(local_dir, exist_ok=True)

    for dataset, name in zip([train_dataset, test_dataset], ["train", "test"], strict=True):
        output = {"prompt": [], "chosen": [], "rejected": []}
        for data in tqdm(dataset):
            # add the chosen/rejected pair for this prompt
            output["prompt"].append(data["prompt"])
            output["chosen"].append(data["chosen"])
            output["rejected"].append(data["rejected"])

        df = pd.DataFrame(output)

        local_path = os.path.join(local_dir, name + ".parquet")

        df.to_parquet(path=local_path)

        if target_hdfs_path_dir is not None:
            hdfs_dir = target_hdfs_path_dir + "/" + name + ".parquet"
            makedirs(hdfs_dir)

            copy(local_path, hdfs_dir)


def generate_rl_dataset(target_hdfs_path_dir, local_dir="~/data/full_hh_rlhf/rl", local_dataset_path=None):
    if local_dataset_path is not None:
        dataset = load_dataset(local_dataset_path)
    else:
        dataset = load_dataset("Dahoas/full-hh-rlhf")
    train_dataset = dataset["train"]

    data_source = "Dahoas/full-hh-rlhf"

    # add a row to each data item that represents a unique id
    def make_map_fn(split):
        def process_fn(example, idx):
            prompt = example.pop("prompt")
            response = example.pop("response")

            data = {
                "data_source": data_source,
                "prompt": [{"role": "user", "content": prompt}],
                "ability": "alignment",
                "reward_model": {
                    "style": "model",
                    "ground_truth": response,  # should not be used
                },
                "extra_info": {"split": split, "index": idx},
            }
            return data

        return process_fn

    train_dataset = train_dataset.map(function=make_map_fn("train"), with_indices=True)
    local_dir = os.path.expanduser(local_dir)
    local_path = os.path.join(local_dir, "train.parquet")
    train_dataset.to_parquet(local_path)

    if target_hdfs_path_dir is not None:
        hdfs_dir = target_hdfs_path_dir + "/" + "train.parquet"
        makedirs(hdfs_dir)

        copy(local_path, hdfs_dir)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--split", type=str, choices=["sft", "rm", "rl"], required=True)
    parser.add_argument("--local_dir", default=None, help="The save directory for the preprocessed dataset.")
    parser.add_argument("--hdfs_dir", type=str, required=False, default=None)
    parser.add_argument("--local_dataset_path", default=None, help="The local path to the raw dataset, if it exists.")
    parser.add_argument(
        "--local_save_dir",
        type=str,
        default="~/data/full_hh_rlhf",
        help="The save directory for the preprocessed dataset.",
    )

    args = parser.parse_args()
    local_save_dir = args.local_dir
    if local_save_dir is not None:
        print("Warning: Argument 'local_dir' is deprecated. Please use 'local_save_dir' instead.")
    else:
        local_save_dir = args.local_save_dir

    if args.split == "sft":
        generate_sft_dataset(args.hdfs_dir, os.path.join(local_save_dir, args.split), args.local_dataset_path)
    elif args.split == "rm":
        generate_rm_dataset(args.hdfs_dir, os.path.join(local_save_dir, args.split), args.local_dataset_path)
    elif args.split == "rl":
        generate_rl_dataset(args.hdfs_dir, os.path.join(local_save_dir, args.split), args.local_dataset_path)
    else:
        raise NotImplementedError
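For reference, a sketch of one RL-mode row as emitted by `process_fn` above; the prompt text is invented for illustration, only the structure is taken from the code:

# Hypothetical RL-mode row (values illustrative):
row = {
    "data_source": "Dahoas/full-hh-rlhf",
    "prompt": [{"role": "user", "content": "Human: How do I learn to bake bread?\n\nAssistant:"}],
    "ability": "alignment",
    "reward_model": {"style": "model", "ground_truth": "..."},  # carried along but not used for scoring
    "extra_info": {"split": "train", "index": 0},
}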
code/RL_model/verl/verl_train/examples/data_preprocess/geo3k.py ADDED
@@ -0,0 +1,102 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Preprocess the Geometry3k dataset to parquet format
"""

import argparse
import os

import datasets

from verl.utils.hdfs_io import copy, makedirs

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--local_dir", default=None)
    parser.add_argument("--hdfs_dir", default=None)
    parser.add_argument("--local_dataset_path", default=None, help="The local path to the raw dataset, if it exists.")
    parser.add_argument(
        "--local_save_dir", default="~/data/geo3k", help="The save directory for the preprocessed dataset."
    )

    args = parser.parse_args()
    local_dataset_path = args.local_dataset_path

    data_source = "hiyouga/geometry3k"

    if local_dataset_path is not None:
        dataset = datasets.load_dataset(
            local_dataset_path,
        )
    else:
        dataset = datasets.load_dataset(
            data_source,
        )

    train_dataset = dataset["train"]
    test_dataset = dataset["test"]

    instruction_following = (
        r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. "
        r"The reasoning process MUST BE enclosed within <think> </think> tags. "
        r"The final answer MUST BE put in \boxed{}."
    )

    # add a row to each data item that represents a unique id
    def make_map_fn(split):
        def process_fn(example, idx):
            problem = example.pop("problem")
            prompt = problem + " " + instruction_following
            answer = example.pop("answer")
            images = example.pop("images")

            data = {
                "data_source": data_source,
                "prompt": [
                    {
                        "role": "user",
                        "content": prompt,
                    }
                ],
                "images": images,
                "ability": "math",
                "reward_model": {"style": "rule", "ground_truth": answer},
                "extra_info": {
                    "split": split,
                    "index": idx,
                    "answer": answer,
                    "question": problem,
                },
            }
            return data

        return process_fn

    train_dataset = train_dataset.map(function=make_map_fn("train"), with_indices=True, num_proc=8)
    test_dataset = test_dataset.map(function=make_map_fn("test"), with_indices=True, num_proc=8)

    hdfs_dir = args.hdfs_dir
    local_save_dir = args.local_dir
    if local_save_dir is not None:
        print("Warning: Argument 'local_dir' is deprecated. Please use 'local_save_dir' instead.")
    else:
        local_save_dir = args.local_save_dir

    train_dataset.to_parquet(os.path.join(local_save_dir, "train.parquet"))
    test_dataset.to_parquet(os.path.join(local_save_dir, "test.parquet"))

    if hdfs_dir is not None:
        makedirs(hdfs_dir)
        copy(src=local_save_dir, dst=hdfs_dir)
code/RL_model/verl/verl_train/examples/data_preprocess/geo3k_multiturn_w_tool.py ADDED
@@ -0,0 +1,120 @@
# Copyright 2023-2025 SGLang Team
# Copyright Amazon.com, Inc. or its affiliates.
# Copyright 2025 Reallm Labs Ltd. or its affiliates
# Copyright 2025 ModelBest Inc. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Preprocess the Geometry3k dataset to parquet format
"""

import argparse
import os

import datasets

from verl.utils.hdfs_io import copy, makedirs

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--local_dir", default=None, help="The save directory for the preprocessed dataset.")
    parser.add_argument("--hdfs_dir", default=None)
    parser.add_argument("--local_dataset_path", default=None, help="The local path to the raw dataset, if it exists.")
    parser.add_argument(
        "--local_save_dir",
        default="~/data/geo3k_multiturn_w_tool",
        help="The save directory for the preprocessed dataset.",
    )

    args = parser.parse_args()
    local_dataset_path = args.local_dataset_path

    data_source = "hiyouga/geometry3k"

    if local_dataset_path is not None:
        dataset = datasets.load_dataset(local_dataset_path)
    else:
        dataset = datasets.load_dataset(data_source)

    train_dataset = dataset["train"]
    test_dataset = dataset["test"]

    instruction_following = (
        r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. "
        r"The reasoning process MUST BE enclosed within <think> </think> tags. "
        r"The final answer MUST BE put in \boxed{}."
    )

    # add a row to each data item that represents a unique id
    def make_map_fn(split):
        def process_fn(example, idx):
            problem = example.pop("problem")
            prompt = problem + " " + instruction_following
            answer = example.pop("answer")
            images = example.pop("images")
            data = {
                "data_source": data_source,
                "prompt": [
                    {
                        "role": "system",
                        "content": (
                            "You are a math expert. You are given a question and you need to solve it step by step. "
                            "Reasoning step by step before any tool call. "
                            "You should use the `calc_geo3k_reward` tool after step by step solving the question, "
                            "before generate final answer at least once and refine your answer if necessary. "
                        ),
                    },
                    {
                        "role": "user",
                        "content": prompt,
                    },
                ],
                "images": images,
                "ability": "math",
                "reward_model": {"style": "rule", "ground_truth": answer},
                "extra_info": {
                    "split": split,
                    "index": idx,
                    "answer": answer,
                    "question": problem,
                    "need_tools_kwargs": True,
                    "tools_kwargs": {
                        "calc_geo3k_reward": {
                            "create_kwargs": {"ground_truth": answer},
                            # "execute_kwargs": {},
                            # "calc_reward_kwargs": {},
                            # "release_kwargs": {},
                        },
                    },
                },
            }
            return data

        return process_fn

    train_dataset = train_dataset.map(function=make_map_fn("train"), with_indices=True, num_proc=8)
    test_dataset = test_dataset.map(function=make_map_fn("test"), with_indices=True, num_proc=8)

    hdfs_dir = args.hdfs_dir
    local_save_dir = args.local_dir
    if local_save_dir is not None:
        print("Warning: Argument 'local_dir' is deprecated. Please use 'local_save_dir' instead.")
    else:
        local_save_dir = args.local_save_dir

    train_dataset.to_parquet(os.path.join(local_save_dir, "train.parquet"))
    test_dataset.to_parquet(os.path.join(local_save_dir, "test.parquet"))
    if hdfs_dir is not None:
        makedirs(hdfs_dir)
        copy(src=local_save_dir, dst=hdfs_dir)
code/RL_model/verl/verl_train/examples/data_preprocess/gsm8k.py ADDED
@@ -0,0 +1,105 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Preprocess the GSM8k dataset to parquet format
"""

import argparse
import os
import re

import datasets

from verl.utils.hdfs_io import copy, makedirs


def extract_solution(solution_str):
    solution = re.search("#### (\\-?[0-9\\.\\,]+)", solution_str)
    assert solution is not None
    final_solution = solution.group(0)
    final_solution = final_solution.split("#### ")[1].replace(",", "")
    return final_solution


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--local_dir", default=None, help="The save directory for the preprocessed dataset.")
    parser.add_argument("--hdfs_dir", default=None)
    parser.add_argument("--local_dataset_path", default=None, help="The local path to the raw dataset, if it exists.")
    parser.add_argument(
        "--local_save_dir", default="~/data/gsm8k", help="The save directory for the preprocessed dataset."
    )

    args = parser.parse_args()
    local_dataset_path = args.local_dataset_path

    data_source = "openai/gsm8k"

    if local_dataset_path is not None:
        dataset = datasets.load_dataset(local_dataset_path, "main")
    else:
        dataset = datasets.load_dataset(data_source, "main")

    train_dataset = dataset["train"]
    test_dataset = dataset["test"]

    instruction_following = 'Let\'s think step by step and output the final answer after "####".'

    # add a row to each data item that represents a unique id
    def make_map_fn(split):
        def process_fn(example, idx):
            question_raw = example.pop("question")

            question = question_raw + " " + instruction_following

            answer_raw = example.pop("answer")
            solution = extract_solution(answer_raw)
            data = {
                "data_source": data_source,
                "prompt": [
                    {
                        "role": "user",
                        "content": question,
                    }
                ],
                "ability": "math",
                "reward_model": {"style": "rule", "ground_truth": solution},
                "extra_info": {
                    "split": split,
                    "index": idx,
                    "answer": answer_raw,
                    "question": question_raw,
                },
            }
            return data

        return process_fn

    train_dataset = train_dataset.map(function=make_map_fn("train"), with_indices=True)
    test_dataset = test_dataset.map(function=make_map_fn("test"), with_indices=True)

    hdfs_dir = args.hdfs_dir
    local_save_dir = args.local_dir
    if local_save_dir is not None:
        print("Warning: Argument 'local_dir' is deprecated. Please use 'local_save_dir' instead.")
    else:
        local_save_dir = args.local_save_dir

    train_dataset.to_parquet(os.path.join(local_save_dir, "train.parquet"))
    test_dataset.to_parquet(os.path.join(local_save_dir, "test.parquet"))

    if hdfs_dir is not None:
        makedirs(hdfs_dir)

        copy(src=local_save_dir, dst=hdfs_dir)
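A quick sanity check of `extract_solution`, which pulls the number after the `####` marker and strips thousands separators; this assumes the definition above, and the answer strings are illustrative:

answer = "Natalia sold 48 clips in April and half as many in May.\n#### 72"
print(extract_solution(answer))        # -> "72"
print(extract_solution("#### 1,234"))  # -> "1234" (comma removed)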
code/RL_model/verl/verl_train/examples/data_preprocess/gsm8k_multiturn_sft.py ADDED
@@ -0,0 +1,102 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Preprocess the GSM8k dataset to parquet format
"""

import argparse
import os
import re

import datasets

from verl.utils.hdfs_io import copy, makedirs


def extract_solution(solution_str):
    solution = re.search("#### (\\-?[0-9\\.\\,]+)", solution_str)
    assert solution is not None
    final_solution = solution.group(0)
    final_solution = final_solution.split("#### ")[1].replace(",", "")
    return final_solution


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--local_dir", default=None)
    parser.add_argument("--local_dataset_path", default=None, help="The local path to the raw dataset, if it exists.")
    parser.add_argument(
        "--local_save_dir", default="~/data/gsm8k_sft", help="The save directory for the preprocessed dataset."
    )
    parser.add_argument("--hdfs_dir", default=None)

    args = parser.parse_args()
    local_dataset_path = args.local_dataset_path

    data_source = "openai/gsm8k"

    if local_dataset_path is not None:
        dataset = datasets.load_dataset(local_dataset_path, "main")
    else:
        dataset = datasets.load_dataset(data_source, "main")

    train_dataset = dataset["train"]
    test_dataset = dataset["test"]

    instruction_following = 'Let\'s think step by step and output the final answer after "####".'

    # convert each example into chat-message format for SFT
    def make_map_fn(split):
        def process_fn(example, idx):
            question_raw = example.pop("question")

            question = question_raw + " " + instruction_following

            answer_raw = example.pop("answer")
            data = {
                "messages": [
                    {
                        "role": "user",
                        "content": question,
                    },
                    {
                        "role": "assistant",
                        "content": answer_raw,
                    },
                ],
            }
            return data

        return process_fn

    train_dataset = train_dataset.map(function=make_map_fn("train"), with_indices=True)
    test_dataset = test_dataset.map(function=make_map_fn("test"), with_indices=True)

    hdfs_dir = args.hdfs_dir

    local_save_dir = args.local_dir
    if local_save_dir is not None:
        print("Warning: Argument 'local_dir' is deprecated. Please use 'local_save_dir' instead.")
    else:
        local_save_dir = args.local_save_dir

    local_save_dir = os.path.expanduser(local_save_dir)

    train_dataset.to_parquet(os.path.join(local_save_dir, "train.parquet"))
    test_dataset.to_parquet(os.path.join(local_save_dir, "test.parquet"))

    if hdfs_dir is not None:
        makedirs(hdfs_dir)

        copy(src=local_save_dir, dst=hdfs_dir)
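Unlike the RL-oriented scripts, the SFT variant keeps only a chat-format `messages` column; a sketch of one output row, with the question and answer abridged (values illustrative):

row = {
    "messages": [
        {
            "role": "user",
            "content": 'Janet has ... How many ...? Let\'s think step by step and output the final answer after "####".',
        },
        {"role": "assistant", "content": "Janet has ... so the answer is 72.\n#### 72"},
    ]
}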
code/RL_model/verl/verl_train/examples/data_preprocess/gsm8k_multiturn_w_interaction.py ADDED
@@ -0,0 +1,119 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2023-2024 SGLang Team
# Copyright 2025 ModelBest Inc. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Preprocess the GSM8k dataset to parquet format
"""

import argparse
import os
import re

import datasets

from verl.utils.hdfs_io import copy, makedirs


def extract_solution(solution_str):
    solution = re.search("#### (\\-?[0-9\\.\\,]+)", solution_str)
    assert solution is not None
    final_solution = solution.group(0)
    final_solution = final_solution.split("#### ")[1].replace(",", "")
    return final_solution


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--local_dir", default=None, help="The save directory for the preprocessed dataset.")
    parser.add_argument("--hdfs_dir", default=None)
    parser.add_argument("--local_dataset_path", default=None, help="The local path to the raw dataset, if it exists.")
    parser.add_argument(
        "--local_save_dir", default="~/data/gsm8k", help="The save directory for the preprocessed dataset."
    )

    args = parser.parse_args()
    local_dataset_path = args.local_dataset_path

    data_source = "openai/gsm8k"

    if local_dataset_path is not None:
        dataset = datasets.load_dataset(local_dataset_path, "main")
    else:
        dataset = datasets.load_dataset(data_source, "main")

    train_dataset = dataset["train"]
    test_dataset = dataset["test"]

    instruction_following = "Let's think step by step and output the final answer after `####`."

    # add a row to each data item that represents a unique id
    def make_map_fn(split):
        def process_fn(example, idx):
            question_raw = example.pop("question")

            question = question_raw + " " + instruction_following

            answer_raw = example.pop("answer")
            solution = extract_solution(answer_raw)
            data = {
                "data_source": data_source,
                "prompt": [
                    {
                        "role": "system",
                        "content": (
                            "You are a math expert. You are given a question and you need to solve it step by step. "
                            "You should rethinking carefully if user point out your answer is wrong. "
                            "Put your final answer in the format of `#### <answer>`."
                        ),
                    },
                    {
                        "role": "user",
                        "content": question,
                    },
                ],
                "ability": "math",
                "reward_model": {"style": "rule", "ground_truth": solution},
                "extra_info": {
                    "split": split,
                    "index": idx,
                    "answer": answer_raw,
                    "question": question_raw,
                    "interaction_kwargs": {
                        "name": "gsm8k",
                        "query": question,
                        "ground_truth": solution,
                    },
                },
            }
            return data

        return process_fn

    train_dataset = train_dataset.map(function=make_map_fn("train"), with_indices=True)
    test_dataset = test_dataset.map(function=make_map_fn("test"), with_indices=True)

    hdfs_dir = args.hdfs_dir
    local_save_dir = args.local_dir
    if local_save_dir is not None:
        print("Warning: Argument 'local_dir' is deprecated. Please use 'local_save_dir' instead.")
    else:
        local_save_dir = args.local_save_dir

    train_dataset.to_parquet(os.path.join(local_save_dir, "train.parquet"))
    test_dataset.to_parquet(os.path.join(local_save_dir, "test.parquet"))

    if hdfs_dir is not None:
        makedirs(hdfs_dir)
        copy(src=local_save_dir, dst=hdfs_dir)
code/RL_model/verl/verl_train/examples/data_preprocess/gsm8k_multiturn_w_tool.py ADDED
@@ -0,0 +1,129 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2023-2024 SGLang Team
# Copyright 2025 ModelBest Inc. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Preprocess the GSM8k dataset to parquet format
"""

import argparse
import os
import re

import datasets

from verl.utils.hdfs_io import copy, makedirs


def extract_solution(solution_str):
    solution = re.search("#### (\\-?[0-9\\.\\,]+)", solution_str)
    assert solution is not None
    final_solution = solution.group(0)
    final_solution = final_solution.split("#### ")[1].replace(",", "")
    return final_solution


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--local_dir", default=None, help="The save directory for the preprocessed dataset.")
    parser.add_argument("--hdfs_dir", default=None)
    parser.add_argument("--local_dataset_path", default=None, help="The local path to the raw dataset, if it exists.")
    parser.add_argument(
        "--local_save_dir", default="~/data/gsm8k", help="The save directory for the preprocessed dataset."
    )

    args = parser.parse_args()
    local_dataset_path = args.local_dataset_path

    data_source = "openai/gsm8k"

    if local_dataset_path is not None:
        dataset = datasets.load_dataset(local_dataset_path, "main")
    else:
        dataset = datasets.load_dataset(data_source, "main")

    train_dataset = dataset["train"]
    test_dataset = dataset["test"]

    instruction_following = "Let's think step by step and output the final answer after `####`."

    # add a row to each data item that represents a unique id
    def make_map_fn(split):
        def process_fn(example, idx):
            question_raw = example.pop("question")

            question = question_raw + " " + instruction_following

            answer_raw = example.pop("answer")
            solution = extract_solution(answer_raw)
            data = {
                "data_source": data_source,
                "prompt": [
                    {
                        "role": "system",
                        "content": (
                            "You are a math expert. You are given a question and you need to solve it step by step. "
                            "Reasoning step by step before any tool call. "
                            "You should use the `calc_gsm8k_reward` tool after step by step solving the question, "
                            "before generate final answer at least once and refine your answer if necessary. "
                            "Put your final answer in the format of `#### <answer>`."
                        ),
                    },
                    {
                        "role": "user",
                        "content": question,
                    },
                ],
                "ability": "math",
                "reward_model": {"style": "rule", "ground_truth": solution},
                "extra_info": {
                    "split": split,
                    "index": idx,
                    "answer": answer_raw,
                    "question": question_raw,
                    "need_tools_kwargs": True,
                    "tools_kwargs": {
                        "calc_gsm8k_reward": {
                            "create_kwargs": {"ground_truth": solution},
                            # "execute_kwargs": {},
                            # "calc_reward_kwargs": {},
                            # "release_kwargs": {},
                        },
                    },
                    "interaction_kwargs": {
                        "query": question,
                        "ground_truth": solution,
                    },
                },
            }
            return data

        return process_fn

    train_dataset = train_dataset.map(function=make_map_fn("train"), with_indices=True)
    test_dataset = test_dataset.map(function=make_map_fn("test"), with_indices=True)

    hdfs_dir = args.hdfs_dir
    local_save_dir = args.local_dir
    if local_save_dir is not None:
        print("Warning: Argument 'local_dir' is deprecated. Please use 'local_save_dir' instead.")
    else:
        local_save_dir = args.local_save_dir

    train_dataset.to_parquet(os.path.join(local_save_dir, "train.parquet"))
    test_dataset.to_parquet(os.path.join(local_save_dir, "test.parquet"))

    if hdfs_dir is not None:
        makedirs(hdfs_dir)
        copy(src=local_save_dir, dst=hdfs_dir)
code/RL_model/verl/verl_train/examples/data_preprocess/gsm8k_tool_agent_loop.py ADDED
@@ -0,0 +1,130 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2023-2024 SGLang Team
# Copyright 2025 ModelBest Inc. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Preprocess the GSM8k dataset to parquet format
"""

import argparse
import os
import re

import datasets

from verl.utils.hdfs_io import copy, makedirs


def extract_solution(solution_str):
    solution = re.search("#### (\\-?[0-9\\.\\,]+)", solution_str)
    assert solution is not None
    final_solution = solution.group(0)
    final_solution = final_solution.split("#### ")[1].replace(",", "")
    return final_solution


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--local_dir", default=None, help="The save directory for the preprocessed dataset.")
    parser.add_argument("--hdfs_dir", default=None)
    parser.add_argument("--local_dataset_path", default=None, help="The local path to the raw dataset, if it exists.")
    parser.add_argument(
        "--local_save_dir", default="~/data/gsm8k", help="The save directory for the preprocessed dataset."
    )

    args = parser.parse_args()
    local_dataset_path = args.local_dataset_path

    data_source = "openai/gsm8k"

    if local_dataset_path is not None:
        dataset = datasets.load_dataset(local_dataset_path, "main")
    else:
        dataset = datasets.load_dataset(data_source, "main")

    train_dataset = dataset["train"]
    test_dataset = dataset["test"]

    instruction_following = "Let's think step by step and output the final answer after `####`."

    # add a row to each data item that represents a unique id
    def make_map_fn(split):
        def process_fn(example, idx):
            question_raw = example.pop("question")

            question = question_raw + " " + instruction_following

            answer_raw = example.pop("answer")
            solution = extract_solution(answer_raw)
            data = {
                "data_source": data_source,
                "agent_name": "tool_agent",
                "prompt": [
                    {
                        "role": "system",
                        "content": (
                            "You are a math expert. You are given a question and you need to solve it step by step. "
                            "Reasoning step by step before any tool call. "
                            "You should use the `calc_gsm8k_reward` tool after step by step solving the question, "
                            "before generate final answer at least once and refine your answer if necessary. "
                            "Put your final answer in the format of `#### <answer>`."
                        ),
                    },
                    {
                        "role": "user",
                        "content": question,
                    },
                ],
                "ability": "math",
                "reward_model": {"style": "rule", "ground_truth": solution},
                "extra_info": {
                    "split": split,
                    "index": idx,
                    "answer": answer_raw,
                    "question": question_raw,
                    "need_tools_kwargs": True,
                    "tools_kwargs": {
                        "calc_gsm8k_reward": {
                            "create_kwargs": {"ground_truth": solution},
                            # "execute_kwargs": {},
                            # "calc_reward_kwargs": {},
                            # "release_kwargs": {},
                        },
                    },
                    "interaction_kwargs": {
                        "query": question,
                        "ground_truth": solution,
                    },
                },
            }
            return data

        return process_fn

    train_dataset = train_dataset.map(function=make_map_fn("train"), with_indices=True)
    test_dataset = test_dataset.map(function=make_map_fn("test"), with_indices=True)

    hdfs_dir = args.hdfs_dir
    local_save_dir = args.local_dir
    if local_save_dir is not None:
        print("Warning: Argument 'local_dir' is deprecated. Please use 'local_save_dir' instead.")
    else:
        local_save_dir = args.local_save_dir

    train_dataset.to_parquet(os.path.join(local_save_dir, "train.parquet"))
    test_dataset.to_parquet(os.path.join(local_save_dir, "test.parquet"))

    if hdfs_dir is not None:
        makedirs(hdfs_dir)
        copy(src=local_save_dir, dst=hdfs_dir)
code/RL_model/verl/verl_train/examples/data_preprocess/hellaswag.py ADDED
@@ -0,0 +1,108 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Preprocess Hellaswag dataset.
"""

import argparse
import os
import re

import datasets

from verl.utils.hdfs_io import copy, makedirs


def preprocess(text):
    text = text.strip()
    # NOTE: Brackets are artifacts of the WikiHow dataset portion of HellaSwag.
    text = text.replace(" [title]", ". ")
    text = re.sub("\\[.*?\\]", "", text)
    text = text.replace("  ", " ")  # collapse double spaces left by the removals
    return text


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--local_dir", default=None, help="The save directory for the preprocessed dataset.")
    parser.add_argument("--hdfs_dir", default=None)
    parser.add_argument("--local_dataset_path", default=None, help="The local path to the raw dataset, if it exists.")
    parser.add_argument(
        "--local_save_dir", default="~/data/hellaswag", help="The save directory for the preprocessed dataset."
    )

    args = parser.parse_args()
    local_dataset_path = args.local_dataset_path

    data_source = "Rowan/hellaswag"

    if local_dataset_path is not None:
        dataset = datasets.load_dataset(local_dataset_path)
    else:
        dataset = datasets.load_dataset(data_source, trust_remote_code=True)

    train_dataset = dataset["train"]
    val_dataset = dataset["validation"]
    test_dataset = dataset["test"]

    instruction = "Please complete the following sentence.\n"

    def make_map_fn(split):
        def process_fn(doc, idx):
            ctx = doc["ctx_a"] + " " + doc["ctx_b"].capitalize()
            query = preprocess(doc["activity_label"] + ": " + ctx)
            choices = [preprocess(ending) for ending in doc["endings"]]
            gold = int(doc["label"])

            data = {
                "data_source": data_source,
                "prompt": [{"role": "user", "content": query}],
                "ability": "nlp",
                "reward_model": {
                    "style": "model",
                    "eval": "multiple_choice",  # using loglikelihood
                    "ground_truth": gold,
                    "choices": choices,
                },
                "extra_info": {"split": split, "index": idx},
            }
            return data

        return process_fn

    # filter data that doesn't have a label
    train_dataset = train_dataset.filter(lambda x: len(x["label"]) > 0)
    val_dataset = val_dataset.filter(lambda x: len(x["label"]) > 0)
    test_dataset = test_dataset.filter(lambda x: len(x["label"]) > 0)

    train_dataset = train_dataset.map(function=make_map_fn("train"), with_indices=True)
    val_dataset = val_dataset.map(function=make_map_fn("validation"), with_indices=True)
    test_dataset = test_dataset.map(function=make_map_fn("test"), with_indices=True)

    hdfs_dir = args.hdfs_dir
    local_save_dir = args.local_dir
    if local_save_dir is not None:
        print("Warning: Argument 'local_dir' is deprecated. Please use 'local_save_dir' instead.")
    else:
        local_save_dir = args.local_save_dir

    train_dataset.to_parquet(os.path.join(local_save_dir, "train.parquet"))
    val_dataset.to_parquet(os.path.join(local_save_dir, "validation.parquet"))
    test_dataset.to_parquet(os.path.join(local_save_dir, "test.parquet"))

    if hdfs_dir is not None:
        makedirs(hdfs_dir)

        copy(src=local_save_dir, dst=hdfs_dir)
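Illustrative behaviour of `preprocess` on a WikiHow-flavored string; this assumes the definition above, and the input is invented:

s = "How to make tea [title] Boil the water. [step] Add the leaves."
print(preprocess(s))  # -> "How to make tea. Boil the water. Add the leaves."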
code/RL_model/verl/verl_train/examples/data_preprocess/math_dataset.py ADDED
@@ -0,0 +1,106 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Preprocess the MATH-lighteval dataset to parquet format
"""

import argparse
import json
import os

import datasets

from verl.utils.hdfs_io import copy, makedirs
from verl.utils.reward_score.math_reward import last_boxed_only_string, remove_boxed


def extract_solution(solution_str):
    return remove_boxed(last_boxed_only_string(solution_str))


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--local_dir", default=None)
    parser.add_argument("--hdfs_dir", default=None)
    parser.add_argument("--local_dataset_path", default=None, help="The local path to the raw dataset, if it exists.")
    parser.add_argument(
        "--local_save_dir", default="~/data/math", help="The save directory for the preprocessed dataset."
    )

    args = parser.parse_args()
    local_dataset_path = args.local_dataset_path

    # 'lighteval/MATH' is no longer available on huggingface.
    # Use mirror repo: DigitalLearningGmbH/MATH-lighteval
    data_source = "DigitalLearningGmbH/MATH-lighteval"
    print(f"Loading the {data_source} dataset from huggingface...", flush=True)
    if local_dataset_path is not None:
        dataset = datasets.load_dataset(
            local_dataset_path,
        )
    else:
        dataset = datasets.load_dataset(
            data_source,
        )

    train_dataset = dataset["train"]
    test_dataset = dataset["test"]

    instruction_following = "Let's think step by step and output the final answer within \\boxed{}."

    # add a row to each data item that represents a unique id
    def make_map_fn(split):
        def process_fn(example, idx):
            question = example.pop("problem")

            question = question + " " + instruction_following

            answer = example.pop("solution")
            solution = extract_solution(answer)
            data = {
                "data_source": data_source,
                "prompt": [{"role": "user", "content": question}],
                "ability": "math",
                "reward_model": {"style": "rule", "ground_truth": solution},
                "extra_info": {"split": split, "index": idx},
            }
            return data

        return process_fn

    train_dataset = train_dataset.map(function=make_map_fn("train"), with_indices=True)
    test_dataset = test_dataset.map(function=make_map_fn("test"), with_indices=True)

    local_save_dir = args.local_dir
    if local_save_dir is not None:
        print("Warning: Argument 'local_dir' is deprecated. Please use 'local_save_dir' instead.")
    else:
        local_save_dir = args.local_save_dir

    local_dir = os.path.expanduser(local_save_dir)
    hdfs_dir = args.hdfs_dir

    train_dataset.to_parquet(os.path.join(local_dir, "train.parquet"))
    test_dataset.to_parquet(os.path.join(local_dir, "test.parquet"))
    # Save one example as JSON for reference
    example = train_dataset[0]
    with open(os.path.join(local_dir, "train_example.json"), "w") as f:
        json.dump(example, f, indent=2)
    example = test_dataset[0]
    with open(os.path.join(local_dir, "test_example.json"), "w") as f:
        json.dump(example, f, indent=2)
    if hdfs_dir is not None:
        makedirs(hdfs_dir)

        copy(src=local_dir, dst=hdfs_dir)
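A small check of `extract_solution` for the MATH format. This assumes verl's `last_boxed_only_string` and `remove_boxed` behave like the upstream MATH utilities, i.e. they locate the last `\boxed{...}` span and strip the wrapper:

sol = r"Comparing coefficients gives $k = \boxed{42}$."
print(extract_solution(sol))  # -> "42"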
code/RL_model/verl/verl_train/examples/data_preprocess/multiturn.py ADDED
@@ -0,0 +1,125 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

#     http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Create a simple multi-turn dataset for testing
"""

import argparse
import os

import pandas as pd


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--local_dir", default="~/data/multiturn")
    parser.add_argument("--hdfs_dir", default=None)
    args = parser.parse_args()

    # Create example conversations
    conversations = []

    # Conversation 1
    conversations.append(
        {
            "messages": [
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": "What is the capital of France?"},
                {"role": "assistant", "content": "The capital of France is Paris."},
                {"role": "user", "content": "And what about Germany?"},
                {"role": "assistant", "content": "The capital of Germany is Berlin."},
            ]
        }
    )

    # Conversation 2
    conversations.append(
        {
            "messages": [
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": "Can you explain quantum computing?"},
                {
                    "role": "assistant",
                    "content": "Quantum computing is a type of computing that uses quantum-mechanical phenomena, "
                    "such as superposition and entanglement, to perform operations on data.",
                },
                {"role": "user", "content": "How is it different from classical computing?"},
                {
                    "role": "assistant",
                    "content": "Classical computing uses bits that are either 0 or 1, while quantum computing uses "
                    "quantum bits or qubits that can exist in multiple states simultaneously due to superposition.",
                },
            ]
        }
    )

    # Conversation 3
    conversations.append(
        {
            "messages": [
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": "Write a simple Python function to calculate factorial."},
                {
                    "role": "assistant",
                    "content": (
                        "```python\ndef factorial(n):\n    if n == 0 or n == 1:\n        return 1\n    else:\n        "
                        "return n * factorial(n-1)\n```\n\nThis is a recursive function to calculate the "
                        "factorial of a number."
                    ),
                },
                {"role": "user", "content": "Can you make it iterative instead?"},
                {
                    "role": "assistant",
                    "content": (
                        "```python\ndef factorial(n):\n    result = 1\n    for i in range(1, n+1):\n        "
                        "result *= i\n    return result\n```\n\nThis is an iterative version of the factorial function."
                    ),
                },
            ]
        }
    )

    # Create train and test datasets
    train_data = conversations[:2]  # First 2 conversations for training
    test_data = conversations[2:]  # Last conversation for testing

    # Create output directory
    local_dir = os.path.expanduser(args.local_dir)
    os.makedirs(local_dir, exist_ok=True)

    # Save to parquet files
    train_df = pd.DataFrame(train_data)
    test_df = pd.DataFrame(test_data)

    train_df.to_parquet(os.path.join(local_dir, "train.parquet"))
    test_df.to_parquet(os.path.join(local_dir, "test.parquet"))

    # Handle HDFS if specified
    if args.hdfs_dir is not None:
        try:
            from verl.utils.hdfs_io import copy, makedirs

            makedirs(args.hdfs_dir)
            copy(src=local_dir, dst=args.hdfs_dir)
115
+ except ImportError:
116
+ print("Warning: HDFS support not available. Skipping HDFS copy.")
117
+
118
+ # Print statistics
119
+ print(f"Train dataset size: {len(train_df)}")
120
+ print(f"Test dataset size: {len(test_df)}")
121
+ print(f"Data saved to {local_dir}")
122
+
123
+
124
+ if __name__ == "__main__":
125
+ main()
code/RL_model/verl/verl_train/examples/data_preprocess/pokemon.py ADDED
@@ -0,0 +1,75 @@
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
+ """
+ Preprocess the llamafactory/pokemon-gpt4o-captions dataset to parquet format
+ """
+
+ import argparse
+ import os
+
+ import datasets
+
+ from verl.utils.hdfs_io import copy, makedirs
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--local_dir", default=None)
+     parser.add_argument("--hdfs_dir", default=None)
+     parser.add_argument("--local_dataset_path", default=None, help="The local path to the raw dataset, if it exists.")
+     parser.add_argument(
+         "--local_save_dir",
+         default="~/data/pokemon-gpt4o-captions",
+         help="The save directory for the preprocessed dataset.",
+     )
+
+     args = parser.parse_args()
+     local_dataset_path = args.local_dataset_path
+
+     data_source = "llamafactory/pokemon-gpt4o-captions"
+
+     if local_dataset_path is not None:
+         dataset = datasets.load_dataset(
+             local_dataset_path,
+         )
+     else:
+         dataset = datasets.load_dataset(
+             data_source,
+         )
+
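+     # map_fn converts ShareGPT-style turns into chat messages, e.g.
+     #   {"from": "human", "value": "..."}  ->  {"role": "user", "content": "..."}
+     #   {"from": "gpt", "value": "..."}    ->  {"role": "assistant", "content": "..."}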
+     def map_fn(row: dict):
+         messages = []
+         conversation = row.pop("conversations")
+         for conv in conversation:
+             if conv["from"] == "gpt":
+                 role = "assistant"
+             elif conv["from"] == "human":
+                 role = "user"
+             else:
+                 raise ValueError(f"Unknown role: {conv['from']}")
+             messages.append(
+                 {
+                     "role": role,
+                     "content": conv["value"],
+                 }
+             )
+
+         row["messages"] = messages
+         return row
+
+     dataset = dataset["train"].map(map_fn, num_proc=16)
+     dataset = dataset.train_test_split(test_size=0.1)
+     train_dataset = dataset["train"]
+     test_dataset = dataset["test"]
+
+     hdfs_dir = args.hdfs_dir
+     local_save_dir = args.local_dir
+     if local_save_dir is not None:
+         print("Warning: Argument 'local_dir' is deprecated. Please use 'local_save_dir' instead.")
+     else:
+         local_save_dir = args.local_save_dir
+     local_save_dir = os.path.expanduser(local_save_dir)  # expand "~" so the default path works
+     os.makedirs(local_save_dir, exist_ok=True)  # make sure the output directory exists
+
+     train_dataset.to_parquet(os.path.join(local_save_dir, "train.parquet"))
+     test_dataset.to_parquet(os.path.join(local_save_dir, "test.parquet"))
+
+     if hdfs_dir is not None:
+         makedirs(hdfs_dir)
+         copy(src=local_save_dir, dst=hdfs_dir)
code/RL_model/verl/verl_train/examples/data_preprocess/preprocess_search_r1_dataset.py ADDED
@@ -0,0 +1,178 @@
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
+ # Copyright 2023-2024 SGLang Team
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import argparse
+ import logging
+ import os
+ import tempfile
+
+ import pandas as pd
+ from huggingface_hub import hf_hub_download
+ from huggingface_hub.utils import EntryNotFoundError
+
+ from verl.utils.hdfs_io import copy, makedirs
+
+ # Setup logging
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
+ logger = logging.getLogger(__name__)
+
+ # Configuration constants
+ DEFAULT_SYSTEM_CONTENT = "You are a helpful and harmless assistant."
+ DEFAULT_USER_CONTENT_PREFIX = (
+     "Answer the given question. You must conduct reasoning inside <think> and </think> "
+     "first every time you get new information. After reasoning, if you find you lack "
+     "some knowledge, you can call a search engine by <tool_call> query </tool_call> "
+     "and it will return the top searched results between <tool_response> and "
+     "</tool_response>. You can search as many times as you want. If you find no "
+     "further external knowledge needed, you can directly provide the answer inside "
+     "<answer> and </answer>, without detailed illustrations. For example, "
+     "<answer> Beijing </answer>. Question: "
+ )
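+
+ # A rollout that follows this protocol looks like (illustrative):
+ #   <think> ... reasoning about what to look up ... </think>
+ #   <tool_call> capital of France </tool_call>
+ #   <tool_response> ... top search results ... </tool_response>
+ #   <answer> Paris </answer>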
+
+
+ def process_single_row(row, current_split_name, row_index):
+     """
+     Process a single row of data for SearchR1-like format.
+
+     Args:
+         row: DataFrame row containing the original data
+         current_split_name: Name of the current split (train/test)
+         row_index: Index of the row in the DataFrame
+
+     Returns:
+         pd.Series: Processed row data in the required format
+     """
+     question = row.get("question", "")
+
+     # Build prompt structure
+     user_content = user_content_prefix.rstrip("\n") + question
+     prompt = [{"role": "system", "content": system_content}, {"role": "user", "content": user_content}]
+
+     # Extract ground truth from reward_model or fallback to golden_answers
+     reward_model_data = row.get("reward_model")
+     if isinstance(reward_model_data, dict) and "ground_truth" in reward_model_data:
+         ground_truth = reward_model_data.get("ground_truth")
+     else:
+         ground_truth = row.get("golden_answers", [])
+
+     # Process data source
+     data_source_tagged = "searchR1_" + str(row.get("data_source", ""))
+
+     # Build tools kwargs structure
+     tools_kwargs = {
+         "search": {
+             "create_kwargs": {"ground_truth": ground_truth, "question": question, "data_source": data_source_tagged}
+         }
+     }
+
+     # Build complete extra_info structure
+     extra_info = {
+         "index": row_index,
+         "need_tools_kwargs": True,
+         "question": question,
+         "split": current_split_name,
+         "tools_kwargs": tools_kwargs,
+     }
+
+     return pd.Series(
+         {
+             "data_source": data_source_tagged,
+             "prompt": prompt,
+             "ability": row.get("ability"),
+             "reward_model": reward_model_data,
+             "extra_info": extra_info,
+             "metadata": row.get("metadata"),
+         }
+     )
+
+
+ def main():
+     local_save_dir = os.path.expanduser(args.local_dir)
+     os.makedirs(local_save_dir, exist_ok=True)
+
+     processed_files = []
+
+     # Download and process files using temporary directory
+     with tempfile.TemporaryDirectory() as tmp_download_dir:
+         for split in ["train", "test"]:
+             parquet_filename = f"{split}.parquet"
+             logger.info(f"Processing {split} split...")
+
+             try:
+                 # Download Parquet file from HuggingFace
+                 logger.info(f"Downloading {parquet_filename} from {args.hf_repo_id}")
+                 local_parquet_filepath = hf_hub_download(
+                     repo_id=args.hf_repo_id,
+                     filename=parquet_filename,
+                     repo_type="dataset",
+                     local_dir=tmp_download_dir,
+                     local_dir_use_symlinks=False,
+                 )
+
+                 # Load and process Parquet file
+                 df_raw = pd.read_parquet(local_parquet_filepath)
+                 logger.info(f"Loaded {len(df_raw)} rows from {parquet_filename}")
+
+                 def apply_process_row(row, split_name=split):
+                     return process_single_row(row, current_split_name=split_name, row_index=row.name)
+
+                 df_processed = df_raw.apply(apply_process_row, axis=1)
+
+                 # Save processed DataFrame
+                 output_file_path = os.path.join(local_save_dir, f"{split}.parquet")
+                 df_processed.to_parquet(output_file_path, index=False)
+                 logger.info(f"Saved {len(df_processed)} processed rows to {output_file_path}")
+                 processed_files.append(output_file_path)
+
+             except EntryNotFoundError:
+                 logger.warning(f"{parquet_filename} not found in repository {args.hf_repo_id}")
+             except Exception as e:
+                 logger.error(f"Error processing {split} split: {e}")
+
+     if not processed_files:
+         logger.warning("No data was processed or saved")
+         return
+
+     logger.info(f"Successfully processed {len(processed_files)} files to {local_save_dir}")
+
+     # Copy to HDFS if specified
+     if args.hdfs_dir:
+         try:
+             makedirs(args.hdfs_dir)
+             copy(src=local_save_dir, dst=args.hdfs_dir)
+             logger.info(f"Successfully copied files to HDFS: {args.hdfs_dir}")
+         except Exception as e:
+             logger.error(f"Error copying files to HDFS: {e}")
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(description="Download Search-R1 from HuggingFace, process, and save to Parquet.")
+     parser.add_argument(
+         "--hf_repo_id", default="PeterJinGo/nq_hotpotqa_train", help="HuggingFace dataset repository ID."
+     )
+     parser.add_argument(
+         "--local_dir",
+         default="~/data/searchR1_processed_direct",
+         help="Local directory to save the processed Parquet files.",
+     )
+     parser.add_argument("--hdfs_dir", default=None, help="Optional HDFS directory to copy the Parquet files to.")
+
+     args = parser.parse_args()
+
+     # System and user content configuration
+     system_content = DEFAULT_SYSTEM_CONTENT
+     user_content_prefix = DEFAULT_USER_CONTENT_PREFIX
+
+     main()
code/RL_model/verl/verl_train/examples/gmpo_trainer/README.md ADDED
@@ -0,0 +1,59 @@
+ <div align=center>
+
+ # Geometric-Mean Policy Optimization
+ </div>
+
+ This is the official implementation of the paper [***Geometric-Mean Policy Optimization***](https://arxiv.org/abs/2507.20673).
+
+ <div align=center>
+ <img width="3092" height="864" alt="image" src="https://github.com/user-attachments/assets/20b04c4e-7ee8-4775-9af8-33c0158336e2" />
+ </div>
+
+ ## 1. Contents
+ - Geometric-Mean Policy Optimization
+   - [1. Contents](#1-contents)
+   - [2. Introduction](#2-introduction)
+   - [3. Code Usage](#3-code-usage)
+   - [4. Contacts](#4-contacts)
+   - [5. Citation](#5-citation)
+
+ ## 2. Introduction
+
+ Group Relative Policy Optimization (GRPO) has significantly enhanced the reasoning capability of large language models by optimizing the arithmetic mean of token-level rewards. Unfortunately, GRPO is observed to suffer from unstable policy updates when facing tokens with outlier importance-weighted rewards, which manifest as extreme importance sampling ratios during training. In this study, we propose Geometric-Mean Policy Optimization (GMPO), which aims to improve the stability of GRPO by suppressing token reward outliers. Instead of optimizing the arithmetic mean, GMPO maximizes the geometric mean of token-level rewards, which is inherently less sensitive to outliers and maintains a more stable range of importance sampling ratios. GMPO is plug-and-play: it simply replaces GRPO's arithmetic mean with the geometric mean of token-level rewards. GMPO is also theoretically plausible: analysis reveals that both GMPO and GRPO are weighted forms of the policy gradient, while the former enjoys more stable weights, which consequently benefits policy optimization and performance. Experiments on multiple mathematical reasoning benchmarks show that GMPO-7B improves the average Pass@1 of GRPO by up to 4.1%, outperforming many state-of-the-art approaches.
+
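+ For intuition, here is a minimal sketch of the objective in PyTorch. It is illustrative only (function name and tensor shapes are assumed, and the PPO-style min with the unclipped term is folded into a single log-space clamp for brevity); the actual implementation in verl is selected via `loss_mode=geo_mean`:
+
+ ```python
+ import math
+
+ import torch
+
+
+ def geo_mean_policy_loss(logp, old_logp, advantages, response_mask, clip_low=0.4, clip_high=0.4):
+     """Sketch of a geometric-mean surrogate: weight each sequence by (prod_t r_t)^(1/T).
+
+     Working in log space keeps the token-ratio product numerically stable and
+     makes the outlier damping explicit: one extreme token ratio only shifts the
+     mean of the log-ratios by ~1/T, instead of dominating an arithmetic mean.
+     """
+     log_ratio = logp - old_logp  # token-level log importance ratios, shape (B, T)
+     # clip in log space; bounds correspond to ratios in [1 - clip_low, 1 + clip_high]
+     log_ratio = log_ratio.clamp(math.log(1 - clip_low), math.log(1 + clip_high))
+     # masked mean over response tokens = log of the geometric mean of token ratios
+     seq_log_ratio = (log_ratio * response_mask).sum(-1) / response_mask.sum(-1).clamp(min=1)
+     geo_ratio = seq_log_ratio.exp()  # shape (B,)
+     return -(geo_ratio * advantages).mean()  # advantages: one scalar per sequence
+ ```
+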
+ ## 3. Code Usage
+
+ The key configurations are:
+ ```
+ clip_ratio_low=0.4
+ clip_ratio_high=0.4
+ loss_mode=geo_mean
+ ```
+ We observed that using a large clip ratio during Mixture-of-Experts (MoE) model training often leads to optimization instability. When training MoE models, consider lowering the clip ratio to achieve more stable convergence.
+ To get started quickly, run:
+ ```
+ bash examples/gmpo_trainer/run_qwen2_5-7b_math.sh
+ ```
+
+ GMPO can be combined with other methods such as DAPO (experimental - not fully tested):
+ ```
+ bash examples/gmpo_trainer/test_dapo_7b_math.sh
+ bash examples/gmpo_trainer/test_dapo_qwen3_30b_math.sh
+ ```
+
+ ## 4. Contacts
+ If you have any questions about our work or this repository, please don't hesitate to contact us by email or open an issue under this project.
+ - [zhaoyuzhong20@mails.ucas.ac.cn](mailto:zhaoyuzhong20@mails.ucas.ac.cn)
+ - [liuyue171@mails.ucas.ac.cn](mailto:liuyue171@mails.ucas.ac.cn)
+ - [lecu@microsoft.com](mailto:lecu@microsoft.com)
+ - [wanfang@ucas.ac.cn](mailto:wanfang@ucas.ac.cn)
+
+ ## 5. Citation
+ ```
+ @article{zhao2025geometric,
+   title={Geometric-mean policy optimization},
+   author={Zhao, Yuzhong and Liu, Yue and Liu, Junpeng and Chen, Jingye and Wu, Xun and Hao, Yaru and Lv, Tengchao and Huang, Shaohan and Cui, Lei and Ye, Qixiang and others},
+   journal={arXiv preprint arXiv:2507.20673},
+   year={2025}
+ }
+ ```
code/RL_model/verl/verl_train/examples/gmpo_trainer/run_qwen2_5-7b_math.sh ADDED
@@ -0,0 +1,60 @@
+ set -x
+
+ gsm8k_train_path=$HOME/data/gsm8k/train.parquet
+ gsm8k_test_path=$HOME/data/gsm8k/test.parquet
+ math_train_path=$HOME/data/math/train.parquet
+ math_test_path=$HOME/data/math/test.parquet
+
+ train_files="['$gsm8k_train_path', '$math_train_path']"
+ test_files="['$gsm8k_test_path', '$math_test_path']"
+
+ use_kl_loss=False
+ loss_mode=geo_mean
+ clip_ratio=0.4
+ # save_contents="['model', 'optimizer', 'extra']"  # alternative: save sharded model plus optimizer state
+
+ export WANDB_MODE=offline
+ save_contents="['hf_model']"
+
+ python3 -m verl.trainer.main_ppo \
+     algorithm.adv_estimator=grpo \
+     data.train_files="$train_files" \
+     data.val_files="$test_files" \
+     data.train_batch_size=1024 \
+     data.max_prompt_length=1024 \
+     data.max_response_length=1024 \
+     data.filter_overlong_prompts=True \
+     data.truncation='error' \
+     actor_rollout_ref.model.path=Qwen/Qwen2.5-Math-7B \
+     actor_rollout_ref.actor.optim.lr=1e-6 \
+     actor_rollout_ref.model.use_remove_padding=True \
+     actor_rollout_ref.actor.ppo_mini_batch_size=256 \
+     actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \
+     actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
+     actor_rollout_ref.actor.kl_loss_coef=0.001 \
+     actor_rollout_ref.actor.kl_loss_type=low_var_kl \
+     actor_rollout_ref.actor.policy_loss.loss_mode=${loss_mode} \
+     actor_rollout_ref.actor.clip_ratio_low=${clip_ratio} \
+     actor_rollout_ref.actor.clip_ratio_high=${clip_ratio} \
+     actor_rollout_ref.actor.entropy_coeff=0 \
+     actor_rollout_ref.model.enable_gradient_checkpointing=True \
+     actor_rollout_ref.actor.fsdp_config.param_offload=False \
+     actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
+     actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \
+     actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
+     actor_rollout_ref.rollout.name=vllm \
+     actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
+     actor_rollout_ref.rollout.n=5 \
+     actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \
+     actor_rollout_ref.ref.fsdp_config.param_offload=True \
+     actor_rollout_ref.actor.checkpoint.save_contents=${save_contents} \
+     algorithm.use_kl_in_reward=False \
+     trainer.critic_warmup=0 \
+     trainer.logger='["console","wandb"]' \
+     trainer.project_name='verl_gmpo_example_gsm8k_math' \
+     trainer.experiment_name='qwen2_5_7b_function_rm' \
+     trainer.n_gpus_per_node=8 \
+     trainer.nnodes=1 \
+     trainer.save_freq=20 \
+     trainer.test_freq=5 \
+     trainer.total_epochs=15 $@
code/RL_model/verl/verl_train/examples/gmpo_trainer/test_dapo_7b_math.sh ADDED
@@ -0,0 +1,138 @@
+ #!/usr/bin/env bash
+ set -xeuo pipefail
+
+ project_name='DAPO'
+ exp_name='DAPO-Qwen2.5-7b-MATH-0527a1'
+
+ adv_estimator=grpo
+
+ use_kl_in_reward=False
+ kl_coef=0.0
+ use_kl_loss=False
+ kl_loss_coef=0.0
+
+ clip_ratio_low=0.4
+ clip_ratio_high=0.4
+
+ max_prompt_length=$((1024 * 2))
+ max_response_length=$((1024 * 8))
+ enable_overlong_buffer=True
+ overlong_buffer_len=$((1024 * 4))
+ overlong_penalty_factor=1.0
+
+ loss_agg_mode="token-mean"
+
+ train_prompt_bsz=512
+ n_resp_per_prompt=16
+ train_prompt_mini_bsz=32
+
+ # Ray
+ # RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
+ # WORKING_DIR=${WORKING_DIR:-"${PWD}"}
+ # RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
+ NNODES=${NNODES:-8}
+ NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
+ # Paths
+ RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
+ # very important! please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface
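+ #   (i.e. edit the downloaded config.json so that it contains "max_position_embeddings": 32768)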
+ MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen2.5-Math-7B"}
+ CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}
+ TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"}
+ TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"}
+
+ # Algorithm
+ temperature=1.0
+ top_p=1.0
+ top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
+ val_top_p=0.7
+
+ # Performance Related Parameter
+ sp_size=4
+ use_dynamic_bsz=True
+ actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
+ infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
+ offload=True
+ gen_tp=4
+ fsdp_size=32
+
+ loss_mode=geo_mean
+
+ # export WANDB_MODE=offline
+ save_contents="['model', 'optimizer', 'extra']"
+ # save_contents="['hf_model']"
+
+ # reference run wandb: https://wandb.ai/verl-org/DAPO%20Reproduction%20on%20verl/runs/ow47vvon?nw=nwusertongyuxuan361
+
+ python3 -m verl.trainer.main_ppo \
+     data.train_files="${TRAIN_FILE}" \
+     data.val_files="${TEST_FILE}" \
+     data.prompt_key=prompt \
+     data.truncation='left' \
+     data.max_prompt_length=${max_prompt_length} \
+     data.max_response_length=${max_response_length} \
+     data.train_batch_size=${train_prompt_bsz} \
+     actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
+     algorithm.adv_estimator=${adv_estimator} \
+     algorithm.use_kl_in_reward=${use_kl_in_reward} \
+     algorithm.kl_ctrl.kl_coef=${kl_coef} \
+     actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
+     actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
+     actor_rollout_ref.actor.policy_loss.loss_mode=${loss_mode} \
+     actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
+     actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
+     actor_rollout_ref.model.use_remove_padding=True \
+     +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \
+     actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
+     actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+     actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+     actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
+     actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+     actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+     actor_rollout_ref.rollout.name=vllm \
+     actor_rollout_ref.model.path="${MODEL_PATH}" \
+     actor_rollout_ref.model.enable_gradient_checkpointing=True \
+     actor_rollout_ref.actor.optim.lr=1e-6 \
+     actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
+     actor_rollout_ref.actor.optim.weight_decay=0.1 \
+     actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
+     actor_rollout_ref.actor.fsdp_config.param_offload=${offload} \
+     actor_rollout_ref.actor.fsdp_config.optimizer_offload=${offload} \
+     actor_rollout_ref.actor.entropy_coeff=0 \
+     actor_rollout_ref.actor.grad_clip=1.0 \
+     actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
+     actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \
+     actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
+     actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
+     actor_rollout_ref.rollout.enable_chunked_prefill=True \
+     actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
+     actor_rollout_ref.rollout.temperature=${temperature} \
+     actor_rollout_ref.rollout.top_p=${top_p} \
+     actor_rollout_ref.rollout.top_k=${top_k} \
+     actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
+     actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
+     actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
+     actor_rollout_ref.rollout.val_kwargs.do_sample=True \
+     actor_rollout_ref.rollout.val_kwargs.n=1 \
+     actor_rollout_ref.ref.fsdp_config.param_offload=${offload} \
+     actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \
+     actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \
+     actor_rollout_ref.actor.checkpoint.save_contents="${save_contents}" \
+     reward_model.reward_manager=dapo \
+     +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
+     +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
+     +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
+     +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
+     +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
+     trainer.logger='["console","wandb"]' \
+     trainer.project_name="${project_name}" \
+     trainer.experiment_name="${exp_name}" \
+     trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
+     trainer.nnodes="${NNODES}" \
+     trainer.val_before_train=True \
+     trainer.test_freq=10 \
+     trainer.save_freq=10 \
+     trainer.total_epochs=10 \
+     trainer.total_training_steps=200 \
+     trainer.default_local_dir="${CKPTS_DIR}" \
+     trainer.resume_mode=auto \
+     trainer.log_val_generations=10
code/RL_model/verl/verl_train/examples/gmpo_trainer/test_dapo_qwen3_30b_math.sh ADDED
@@ -0,0 +1,134 @@
+ #!/usr/bin/env bash
+ set -xeuo pipefail
+
+ project_name='DAPO'
+ exp_name='DAPO-Qwen3-30B-A3B-Base-MATH-0527a1'
+
+ adv_estimator=grpo
+
+ use_kl_in_reward=False
+ kl_coef=0.0
+ use_kl_loss=False
+ kl_loss_coef=0.0
+
+ clip_ratio_low=0.4
+ clip_ratio_high=0.4
+
+ max_prompt_length=$((1024 * 2))
+ max_response_length=$((1024 * 8))
+ enable_overlong_buffer=True
+ overlong_buffer_len=$((1024 * 4))
+ overlong_penalty_factor=1.0
+
+ loss_agg_mode="token-mean"
+
+ train_prompt_bsz=512
+ n_resp_per_prompt=16
+ train_prompt_mini_bsz=32
+
+ loss_mode=geo_mean
+
+ # export WANDB_MODE=offline
+ save_contents="['model', 'optimizer', 'extra']"
+ # save_contents="['hf_model']"
+
+ # Ray
+ # RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
+ # WORKING_DIR=${WORKING_DIR:-"${PWD}"}
+ # RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
+ NNODES=${NNODES:-8}
+ NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
+ # Paths
+ RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
+ MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen3-30B-A3B-Base"}
+ CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}
+ TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"}
+ TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"}
+
+ # Algorithm
+ temperature=1.0
+ top_p=1.0
+ top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
+ val_top_p=0.7
+
+ # Performance Related Parameter
+ sp_size=4
+ use_dynamic_bsz=True
+ actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
+ infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
+ offload=True
+ gen_tp=4
+ fsdp_size=32
+
+ python3 -m verl.trainer.main_ppo \
+     data.train_files="${TRAIN_FILE}" \
+     data.val_files="${TEST_FILE}" \
+     data.prompt_key=prompt \
+     data.truncation='left' \
+     data.max_prompt_length=${max_prompt_length} \
+     data.max_response_length=${max_response_length} \
+     data.train_batch_size=${train_prompt_bsz} \
+     actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
+     algorithm.adv_estimator=${adv_estimator} \
+     algorithm.use_kl_in_reward=${use_kl_in_reward} \
+     algorithm.kl_ctrl.kl_coef=${kl_coef} \
+     actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
+     actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
+     actor_rollout_ref.actor.policy_loss.loss_mode=${loss_mode} \
+     actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
+     actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
+     actor_rollout_ref.model.use_remove_padding=True \
+     actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
+     actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+     actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+     actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
+     actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+     actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+     actor_rollout_ref.model.path="${MODEL_PATH}" \
+     actor_rollout_ref.model.enable_gradient_checkpointing=True \
+     actor_rollout_ref.actor.optim.lr=1e-6 \
+     actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
+     actor_rollout_ref.actor.optim.weight_decay=0.1 \
+     actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
+     actor_rollout_ref.actor.fsdp_config.param_offload=${offload} \
+     actor_rollout_ref.actor.fsdp_config.optimizer_offload=${offload} \
+     actor_rollout_ref.actor.entropy_coeff=0 \
+     actor_rollout_ref.actor.grad_clip=1.0 \
+     actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
+     actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \
+     actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
+     actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
+     actor_rollout_ref.rollout.enable_chunked_prefill=True \
+     actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
+     actor_rollout_ref.rollout.temperature=${temperature} \
+     actor_rollout_ref.rollout.top_p=${top_p} \
+     actor_rollout_ref.rollout.top_k=${top_k} \
+     actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
+     actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
+     actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
+     actor_rollout_ref.rollout.val_kwargs.do_sample=True \
+     actor_rollout_ref.rollout.val_kwargs.n=1 \
+     actor_rollout_ref.rollout.name=vllm \
+     actor_rollout_ref.ref.fsdp_config.param_offload=${offload} \
+     actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \
+     actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \
+     actor_rollout_ref.actor.checkpoint.save_contents="${save_contents}" \
+     reward_model.reward_manager=dapo \
+     +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
+     +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
+     +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
+     +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
+     +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
+     trainer.logger='["console","wandb"]' \
+     trainer.project_name="${project_name}" \
+     trainer.experiment_name="${exp_name}" \
+     trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
+     trainer.nnodes="${NNODES}" \
+     trainer.val_before_train=True \
+     trainer.test_freq=10 \
+     trainer.save_freq=10 \
+     trainer.total_epochs=10 \
+     trainer.total_training_steps=300 \
+     trainer.default_local_dir="${CKPTS_DIR}" \
+     trainer.resume_mode=auto \
+     trainer.log_val_generations=10
code/RL_model/verl/verl_train/examples/grpo_trainer/run_deepseek7b_llm_math_megatron.sh ADDED
@@ -0,0 +1,50 @@
+ set -x
+
+ export CUDA_DEVICE_MAX_CONNECTIONS=1 # For megatron communication/computation overlapping
+
+ gsm8k_train_path=$HOME/data/gsm8k/train.parquet
+ gsm8k_test_path=$HOME/data/gsm8k/test.parquet
+ math_train_path=$HOME/data/math/train.parquet
+ math_test_path=$HOME/data/math/test.parquet
+
+ train_files="['$gsm8k_train_path', '$math_train_path']"
+ test_files="['$gsm8k_test_path', '$math_test_path']"
+
+ python3 -m verl.trainer.main_ppo --config-path=config \
+     --config-name='ppo_megatron_trainer.yaml' \
+     algorithm.adv_estimator=grpo \
+     data.train_files="$train_files" \
+     data.val_files="$test_files" \
+     data.train_batch_size=1024 \
+     data.max_prompt_length=1024 \
+     data.max_response_length=1024 \
+     data.filter_overlong_prompts=True \
+     data.truncation='error' \
+     actor_rollout_ref.model.path=deepseek-ai/deepseek-llm-7b-chat \
+     actor_rollout_ref.actor.optim.lr=1e-6 \
+     actor_rollout_ref.actor.ppo_mini_batch_size=256 \
+     actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
+     actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=2 \
+     actor_rollout_ref.actor.megatron.tensor_model_parallel_size=2 \
+     actor_rollout_ref.actor.use_kl_loss=True \
+     actor_rollout_ref.actor.kl_loss_coef=0.001 \
+     actor_rollout_ref.actor.kl_loss_type=low_var_kl \
+     actor_rollout_ref.actor.entropy_coeff=0 \
+     actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
+     actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
+     actor_rollout_ref.rollout.name=vllm \
+     actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
+     actor_rollout_ref.rollout.n=5 \
+     actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
+     actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=2 \
+     actor_rollout_ref.ref.megatron.tensor_model_parallel_size=2 \
+     algorithm.use_kl_in_reward=False \
+     trainer.critic_warmup=0 \
+     trainer.logger='["console","wandb"]' \
+     trainer.project_name='verl_grpo_example_gsm8k_math' \
+     trainer.experiment_name='deepseek_llm_7b_math_megatron' \
+     trainer.n_gpus_per_node=16 \
+     trainer.nnodes=1 \
+     trainer.save_freq=20 \
+     trainer.test_freq=5 \
+     trainer.total_epochs=15 $@
code/RL_model/verl/verl_train/examples/grpo_trainer/run_glm41v_9b.sh ADDED
@@ -0,0 +1,46 @@
+ set -x
+ ENGINE=${1:-vllm}
+
+ python3 -m verl.trainer.main_ppo \
+     algorithm.adv_estimator=grpo \
+     data.train_files=$HOME/data/geo3k/train.parquet \
+     data.val_files=$HOME/data/geo3k/test.parquet \
+     data.train_batch_size=512 \
+     data.max_prompt_length=1024 \
+     data.max_response_length=2048 \
+     data.filter_overlong_prompts=True \
+     data.truncation='error' \
+     data.image_key=images \
+     actor_rollout_ref.model.path=zai-org/GLM-4.1V-9B-Thinking \
+     actor_rollout_ref.actor.optim.lr=1e-6 \
+     actor_rollout_ref.model.use_remove_padding=True \
+     actor_rollout_ref.actor.ppo_mini_batch_size=128 \
+     actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=10 \
+     actor_rollout_ref.actor.use_kl_loss=True \
+     actor_rollout_ref.actor.kl_loss_coef=0.01 \
+     actor_rollout_ref.actor.kl_loss_type=low_var_kl \
+     actor_rollout_ref.actor.entropy_coeff=0 \
+     actor_rollout_ref.model.enable_gradient_checkpointing=True \
+     actor_rollout_ref.actor.fsdp_config.param_offload=False \
+     actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
+     actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=20 \
+     actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
+     actor_rollout_ref.rollout.name=$ENGINE \
+     +actor_rollout_ref.rollout.engine_kwargs.vllm.disable_mm_preprocessor_cache=True \
+     actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
+     actor_rollout_ref.rollout.enable_chunked_prefill=False \
+     actor_rollout_ref.rollout.enforce_eager=False \
+     actor_rollout_ref.rollout.free_cache_engine=True \
+     actor_rollout_ref.rollout.n=5 \
+     actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=20 \
+     actor_rollout_ref.ref.fsdp_config.param_offload=True \
+     algorithm.use_kl_in_reward=False \
+     trainer.critic_warmup=0 \
+     trainer.logger='["console","wandb"]' \
+     trainer.project_name='verl_grpo_example_geo3k' \
+     trainer.experiment_name='glm41v_9b_function_rm' \
+     trainer.n_gpus_per_node=8 \
+     trainer.nnodes=1 \
+     trainer.save_freq=20 \
+     trainer.test_freq=5 \
+     trainer.total_epochs=15 $@
code/RL_model/verl/verl_train/examples/grpo_trainer/run_minicpmo2_6.sh ADDED
@@ -0,0 +1,49 @@
+ set -x
+
+ python3 -m verl.trainer.main_ppo \
+     algorithm.adv_estimator=grpo \
+     data.train_files=$HOME/data/geo3k/train.parquet \
+     data.val_files=$HOME/data/geo3k/test.parquet \
+     data.train_batch_size=128 \
+     data.max_prompt_length=1024 \
+     data.max_response_length=2048 \
+     data.filter_overlong_prompts=False \
+     data.truncation='error' \
+     data.image_key=images \
+     data.trust_remote_code=True \
+     data.custom_cls.path=recipe/minicpmo/rl_dataset.py \
+     data.custom_cls.name=RLHFDataset \
+     actor_rollout_ref.model.path=openbmb/MiniCPM-o-2_6 \
+     actor_rollout_ref.model.trust_remote_code=True \
+     actor_rollout_ref.model.use_remove_padding=True \
+     actor_rollout_ref.model.enable_gradient_checkpointing=True \
+     actor_rollout_ref.actor.optim.lr=1e-6 \
+     actor_rollout_ref.actor.ppo_mini_batch_size=32 \
+     actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \
+     actor_rollout_ref.actor.use_kl_loss=True \
+     actor_rollout_ref.actor.use_dynamic_bsz=False \
+     actor_rollout_ref.actor.kl_loss_coef=0.001 \
+     actor_rollout_ref.actor.kl_loss_type=low_var_kl \
+     actor_rollout_ref.actor.fsdp_config.param_offload=False \
+     actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
+     actor_rollout_ref.actor.fsdp_config.use_orig_params=True \
+     actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \
+     actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
+     actor_rollout_ref.rollout.name=vllm \
+     actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
+     actor_rollout_ref.rollout.enable_chunked_prefill=False \
+     actor_rollout_ref.rollout.enforce_eager=False \
+     actor_rollout_ref.rollout.free_cache_engine=False \
+     actor_rollout_ref.rollout.n=8 \
+     actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \
+     actor_rollout_ref.ref.fsdp_config.param_offload=True \
+     algorithm.kl_ctrl.kl_coef=0.001 \
+     trainer.critic_warmup=0 \
+     trainer.logger='["console","wandb"]' \
+     trainer.project_name='verl_grpo_example_geo3k' \
+     trainer.experiment_name='minicpmo2_6_function_rm' \
+     trainer.n_gpus_per_node=8 \
+     trainer.nnodes=1 \
+     trainer.save_freq=-1 \
+     trainer.test_freq=5 \
+     trainer.total_epochs=15 $@
code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2-7b_sgl_megatron.sh ADDED
@@ -0,0 +1,47 @@
+ set -x
+
+ gsm8k_train_path=$HOME/data/gsm8k/train.parquet
+ gsm8k_test_path=$HOME/data/gsm8k/test.parquet
+ math_train_path=$HOME/data/math/train.parquet
+ math_test_path=$HOME/data/math/test.parquet
+
+ train_files="['$gsm8k_train_path', '$math_train_path']"
+ test_files="['$gsm8k_test_path', '$math_test_path']"
+
+ python3 -m verl.trainer.main_ppo --config-path=config \
+     --config-name='ppo_megatron_trainer.yaml' \
+     algorithm.adv_estimator=grpo \
+     data.train_files="$train_files" \
+     data.val_files="$test_files" \
+     data.train_batch_size=1024 \
+     data.max_prompt_length=1024 \
+     data.max_response_length=1024 \
+     data.filter_overlong_prompts=True \
+     data.truncation='error' \
+     actor_rollout_ref.model.path=Qwen/Qwen2-7B-Instruct \
+     actor_rollout_ref.actor.optim.lr=1e-6 \
+     actor_rollout_ref.actor.ppo_mini_batch_size=256 \
+     actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
+     actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=2 \
+     actor_rollout_ref.actor.megatron.virtual_pipeline_model_parallel_size=2 \
+     actor_rollout_ref.actor.megatron.tensor_model_parallel_size=4 \
+     actor_rollout_ref.actor.use_kl_loss=True \
+     actor_rollout_ref.actor.kl_loss_coef=0.001 \
+     actor_rollout_ref.actor.kl_loss_type=low_var_kl \
+     actor_rollout_ref.actor.entropy_coeff=0 \
+     actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
+     actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
+     actor_rollout_ref.rollout.name=sglang \
+     actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
+     actor_rollout_ref.rollout.n=5 \
+     actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
+     algorithm.use_kl_in_reward=False \
+     trainer.critic_warmup=0 \
+     trainer.logger='["console","wandb"]' \
+     trainer.project_name='verl_grpo_example_gsm8k' \
+     trainer.experiment_name='qwen2_7b_function_rm_megatron' \
+     trainer.n_gpus_per_node=8 \
+     trainer.nnodes=1 \
+     trainer.save_freq=-1 \
+     trainer.test_freq=5 \
+     trainer.total_epochs=15 $@
code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2_5-32b_grpo_megatron_vllm_npu.sh ADDED
@@ -0,0 +1,185 @@
+ #!/bin/bash
+ set -xeuo pipefail
+ mkdir -p logs
+
+ # Project Configuration
+ project_name='GRPO-Qwen2.5-32B-BASE-MATH'
+ exp_name='GRPO-Qwen2.5-32B-BASE-Megatron-vLLM'
+
+ # Node Info
+ NNODES=${NNODES:-1}
+ NPUS_PER_NODE=${NPUS_PER_NODE:-16}
+
+ # Model Weights Paths
+ MODEL_PATH=Qwen/Qwen2.5-32B
+ MCORE_MODEL_PATH=Qwen/Qwen2.5-32B-dist
+ RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
+ CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}
+
+ # File System Paths
+ TRAIN_FILE=$RAY_DATA_HOME/dataset/gsm8k/train.parquet
+ TEST_FILE=$RAY_DATA_HOME/dataset/gsm8k/test.parquet
+
+ # Data Configuration
+ max_prompt_length=$((1024 * 1))
+ max_response_length=$((1024 * 1))
+
+ # Training Batch Configuration
+ train_prompt_bsz=128
+ train_prompt_mini_bsz=32
+ n_resp_per_prompt=16
+
+ # Algorithm Configuration
+ adv_estimator=grpo
+ use_kl_in_reward=False
+ kl_coef=0.0
+ use_kl_loss=True
+ kl_loss_coef=0.001
+
+ # Performance and Memory Management Configuration
+ all_offload=True
+ use_dynamic_bsz=True
+ actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
+ infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 8))
+ optimizer_offload_fraction=1
+
+ # Megatron Configuration
+ train_tp=4
+ train_ep=1
+ train_etp=1
+ train_pp=4
+ train_cp=1
+
+ # vLLM Configuration
+ gen_tp=2
+ gen_dp=1
+ gen_ep=1
+ gpu_memory_utilization=0.8
+ max_model_len=$((max_prompt_length + max_response_length))
+ max_num_batched_tokens=$(((max_prompt_length + max_response_length) * 1))
+
+ # Data Configuration
+ DATA_CONFIG=(
+     data.train_files="${TRAIN_FILE}"
+     data.val_files="${TEST_FILE}"
+     data.prompt_key=prompt
+     data.train_batch_size=${train_prompt_bsz}
+     data.max_prompt_length=${max_prompt_length}
+     data.max_response_length=${max_response_length}
+     data.filter_overlong_prompts=False
+     data.truncation='left'
+ )
+
+ # Model Configuration
+ MODEL_CONFIG=(
+     actor_rollout_ref.model.path="${MODEL_PATH}"
+     actor_rollout_ref.model.use_remove_padding=True
+ )
+
+ # Algorithm Configuration
+ ALGORITHM_CONFIG=(
+     algorithm.adv_estimator=${adv_estimator}
+     algorithm.use_kl_in_reward=${use_kl_in_reward}
+     algorithm.kl_ctrl.kl_coef=${kl_coef}
+ )
+
+ # Actor Model Configuration
+ ACTOR_CONFIG=(
+     actor_rollout_ref.actor.use_torch_compile=False
+     actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz}
+     actor_rollout_ref.actor.use_kl_loss=${use_kl_loss}
+     actor_rollout_ref.actor.kl_loss_type=low_var_kl
+     actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef}
+     actor_rollout_ref.actor.entropy_coeff=0
+     actor_rollout_ref.actor.ppo_epochs=1
+     actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1
+     actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len}
+     actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz}
+     # actor_rollout_ref.actor.kl_loss_type=low_var_kl  # duplicate of the entry above, kept commented out
+     actor_rollout_ref.actor.optim.lr=1e-6
+     +actor_rollout_ref.actor.optim.override_optimizer_config.optimizer_offload_fraction=${optimizer_offload_fraction}
+     +actor_rollout_ref.actor.optim.override_optimizer_config.use_precision_aware_optimizer=True
+     +actor_rollout_ref.actor.optim.override_optimizer_config.optimizer_cpu_offload=True
+     actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${train_tp}
+     actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${train_pp}
+     actor_rollout_ref.actor.megatron.context_parallel_size=${train_cp}
+     actor_rollout_ref.actor.megatron.expert_model_parallel_size=${train_ep}
+     actor_rollout_ref.actor.megatron.expert_tensor_parallel_size=${train_etp}
+     actor_rollout_ref.actor.megatron.param_offload=${all_offload}
+     actor_rollout_ref.actor.megatron.optimizer_offload=${all_offload}
+     actor_rollout_ref.actor.megatron.grad_offload=${all_offload}
+     actor_rollout_ref.actor.megatron.dist_checkpointing_path=${MCORE_MODEL_PATH}
+     actor_rollout_ref.actor.megatron.use_dist_checkpointing=False
+     +actor_rollout_ref.actor.megatron.override_transformer_config.use_flash_attn=True
+ )
+
+ # Reference Model Configuration
+ REF_CONFIG=(
+     actor_rollout_ref.ref.use_torch_compile=False
+     actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1
+     actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz}
+     actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len}
+     actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${train_tp}
+     actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=${train_pp}
+     actor_rollout_ref.ref.megatron.context_parallel_size=${train_cp}
+     actor_rollout_ref.ref.megatron.expert_model_parallel_size=${train_ep}
+     actor_rollout_ref.ref.megatron.expert_tensor_parallel_size=${train_etp}
+     actor_rollout_ref.ref.megatron.param_offload=${all_offload}
+     actor_rollout_ref.ref.megatron.dist_checkpointing_path=${MCORE_MODEL_PATH}
+     actor_rollout_ref.ref.megatron.use_dist_checkpointing=False
+ )
+
+ # Rollout Configuration
+ ROLLOUT_CONFIG=(
+     actor_rollout_ref.rollout.name=vllm
+     actor_rollout_ref.rollout.n=${n_resp_per_prompt}
+     actor_rollout_ref.rollout.top_p=1.0
+     actor_rollout_ref.rollout.top_k=-1
+     actor_rollout_ref.rollout.temperature=1.0
+     actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1
+     actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz}
+     actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len}
+     actor_rollout_ref.rollout.gpu_memory_utilization=${gpu_memory_utilization}
+     actor_rollout_ref.rollout.max_num_batched_tokens=${max_num_batched_tokens}
+     actor_rollout_ref.rollout.max_model_len=${max_model_len}
+     actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp}
+     actor_rollout_ref.rollout.data_parallel_size=${gen_dp}
+     actor_rollout_ref.rollout.expert_parallel_size=${gen_ep}
+     actor_rollout_ref.rollout.enable_chunked_prefill=True
+     actor_rollout_ref.rollout.enable_prefix_caching=True
+     actor_rollout_ref.rollout.enforce_eager=True
+     actor_rollout_ref.rollout.free_cache_engine=True
+     actor_rollout_ref.rollout.val_kwargs.n=1
+     actor_rollout_ref.rollout.val_kwargs.do_sample=True
+     actor_rollout_ref.rollout.val_kwargs.top_p=1.0
+     actor_rollout_ref.rollout.val_kwargs.top_k=-1
+     actor_rollout_ref.rollout.val_kwargs.temperature=1.0
+ )
+
+ # Trainer Configuration
+ TRAINER_CONFIG=(
+     trainer.logger='["console","tensorboard"]'
+     trainer.project_name="${project_name}"
+     trainer.experiment_name="${exp_name}"
+     trainer.nnodes="${NNODES}"
+     trainer.n_gpus_per_node="${NPUS_PER_NODE}"
+     trainer.device='npu'
+     trainer.total_epochs=15
+     trainer.val_before_train=False
+     trainer.test_freq=-1
+     trainer.save_freq=-1
+     trainer.default_local_dir="${CKPTS_DIR}"
+ )
+
+ # Main GRPO Training Command
+ python3 -m verl.trainer.main_ppo \
+     --config-path=config \
+     --config-name='ppo_megatron_trainer.yaml' \
+     "${DATA_CONFIG[@]}" \
+     "${MODEL_CONFIG[@]}" \
+     "${ACTOR_CONFIG[@]}" \
+     "${REF_CONFIG[@]}" \
+     "${ROLLOUT_CONFIG[@]}" \
+     "${ALGORITHM_CONFIG[@]}" \
+     "${TRAINER_CONFIG[@]}" \
+     "$@" | tee logs/run_qwen2_5-32b_grpo_megatron_vllm_npu.log
code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2_5-3b_gsm8k_grpo_lora_from_adapter.sh ADDED
@@ -0,0 +1,47 @@
+ set -x
+
+ lora_adapter_path=${lora_adapter_path:-/path/saved/lora_adapter}
+
+ python3 -m verl.trainer.main_ppo \
+     algorithm.adv_estimator=grpo \
+     data.train_files=$HOME/data/gsm8k/train.parquet \
+     data.val_files=$HOME/data/gsm8k/test.parquet \
+     data.train_batch_size=1024 \
+     data.max_prompt_length=512 \
+     data.max_response_length=1024 \
+     data.filter_overlong_prompts=True \
+     data.truncation='error' \
+     data.shuffle=False \
+     actor_rollout_ref.model.path=Qwen/Qwen2.5-3B-Instruct \
+     actor_rollout_ref.model.use_shm=True \
+     actor_rollout_ref.model.lora_adapter_path=${lora_adapter_path} \
+     actor_rollout_ref.actor.optim.lr=3e-6 \
+     actor_rollout_ref.model.use_remove_padding=True \
+     actor_rollout_ref.actor.ppo_mini_batch_size=256 \
+     actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=40 \
+     actor_rollout_ref.actor.use_kl_loss=True \
+     actor_rollout_ref.actor.kl_loss_coef=0.001 \
+     actor_rollout_ref.actor.kl_loss_type=low_var_kl \
+     actor_rollout_ref.actor.entropy_coeff=0 \
+     actor_rollout_ref.model.enable_gradient_checkpointing=True \
+     actor_rollout_ref.actor.fsdp_config.param_offload=False \
+     actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
+     actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=40 \
+     actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
+     actor_rollout_ref.rollout.name=vllm \
+     actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
+     actor_rollout_ref.rollout.n=5 \
+     actor_rollout_ref.rollout.load_format=safetensors \
+     actor_rollout_ref.rollout.layered_summon=True \
+     actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=40 \
+     actor_rollout_ref.ref.fsdp_config.param_offload=True \
+     algorithm.use_kl_in_reward=False \
+     trainer.critic_warmup=0 \
+     trainer.logger='["console","wandb"]' \
+     trainer.project_name='verl_grpo_example_gsm8k' \
+     trainer.experiment_name='qwen2.5_3b_grpo_lora' \
+     trainer.n_gpus_per_node=8 \
+     trainer.nnodes=1 \
+     trainer.save_freq=20 \
+     trainer.test_freq=5 \
+     trainer.total_epochs=15 $@
code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2_5_7b_grpo_e2e_prof_npu.sh ADDED
@@ -0,0 +1,68 @@
+ set -x
+
+ # profiling configuration
+ PROFILE_STEPS="[2,4]"
+ PROFILE_RANKS_ALL=True
+ DISCRETE=False
+
+ # profiling NPU options
+ SAVE_PATH="$HOME/profile_data"
+ LEVEL="level0"
+ CONTENTS=['npu','cpu']
+ ANALYSIS=True
+
+ python3 -m verl.trainer.main_ppo \
+     algorithm.adv_estimator=grpo \
+     data.train_files=$HOME/data/gsm8k/train.parquet \
+     data.val_files=$HOME/data/gsm8k/test.parquet \
+     data.train_batch_size=32 \
+     data.max_prompt_length=1024 \
+     data.max_response_length=1024 \
+     data.filter_overlong_prompts=True \
+     data.truncation='error' \
+     actor_rollout_ref.model.path=Qwen/Qwen2.5-7B-Instruct \
+     actor_rollout_ref.actor.optim.lr=5e-8 \
+     actor_rollout_ref.model.use_remove_padding=False \
+     actor_rollout_ref.model.enable_gradient_checkpointing=True \
+     actor_rollout_ref.actor.ppo_mini_batch_size=2 \
+     actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \
+     actor_rollout_ref.actor.use_kl_loss=True \
+     actor_rollout_ref.actor.entropy_coeff=0 \
+     actor_rollout_ref.actor.kl_loss_coef=0.001 \
+     actor_rollout_ref.actor.kl_loss_type=low_var_kl \
+     actor_rollout_ref.actor.profiler.enable=True \
+     actor_rollout_ref.actor.profiler.all_ranks=$PROFILE_RANKS_ALL \
+     actor_rollout_ref.actor.profiler.tool_config.npu.discrete=$DISCRETE \
+     actor_rollout_ref.actor.profiler.tool_config.npu.contents=$CONTENTS \
+     actor_rollout_ref.actor.profiler.tool_config.npu.level=$LEVEL \
+     actor_rollout_ref.actor.profiler.tool_config.npu.analysis=$ANALYSIS \
+     actor_rollout_ref.actor.fsdp_config.param_offload=False \
+     actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
+     actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \
+     actor_rollout_ref.rollout.tensor_model_parallel_size=4 \
+     actor_rollout_ref.rollout.name=vllm \
+     actor_rollout_ref.rollout.gpu_memory_utilization=0.3 \
+     actor_rollout_ref.rollout.n=4 \
+     actor_rollout_ref.rollout.enable_chunked_prefill=False \
+     actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \
+     actor_rollout_ref.ref.fsdp_config.param_offload=True \
+     actor_rollout_ref.ref.profiler.enable=True \
+     actor_rollout_ref.ref.profiler.all_ranks=$PROFILE_RANKS_ALL \
+     actor_rollout_ref.ref.profiler.tool_config.npu.discrete=$DISCRETE \
+     actor_rollout_ref.ref.profiler.tool_config.npu.contents=$CONTENTS \
+     actor_rollout_ref.ref.profiler.tool_config.npu.level=$LEVEL \
+     actor_rollout_ref.ref.profiler.tool_config.npu.analysis=$ANALYSIS \
+     algorithm.use_kl_in_reward=False \
+     trainer.critic_warmup=0 \
+     trainer.logger=console \
+     trainer.project_name='verl_grpo_example_gsm8k' \
+     trainer.experiment_name='qwen2_5_7b_function_rm' \
+     trainer.n_gpus_per_node=8 \
+     trainer.nnodes=1 \
+     trainer.save_freq=-1 \
+     trainer.test_freq=5 \
+     trainer.total_epochs=5 \
+     global_profiler.tool=npu \
+     global_profiler.steps=$PROFILE_STEPS \
+     global_profiler.save_path=$SAVE_PATH \
+     $@
code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2_5_vl-7b.sh ADDED
@@ -0,0 +1,47 @@
+ set -x
+ ENGINE=${1:-vllm}
+
+ python3 -m verl.trainer.main_ppo \
+     algorithm.adv_estimator=grpo \
+     data.train_files=$HOME/data/geo3k/train.parquet \
+     data.val_files=$HOME/data/geo3k/test.parquet \
+     data.train_batch_size=512 \
+     data.max_prompt_length=1024 \
+     data.max_response_length=2048 \
+     data.filter_overlong_prompts=True \
+     data.truncation='error' \
+     data.image_key=images \
+     actor_rollout_ref.model.path=Qwen/Qwen2.5-VL-7B-Instruct \
+     actor_rollout_ref.actor.optim.lr=1e-6 \
+     actor_rollout_ref.model.use_remove_padding=True \
+     actor_rollout_ref.model.use_fused_kernels=True \
+     actor_rollout_ref.actor.ppo_mini_batch_size=128 \
+     actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=10 \
+     actor_rollout_ref.actor.use_kl_loss=True \
+     actor_rollout_ref.actor.kl_loss_coef=0.01 \
+     actor_rollout_ref.actor.kl_loss_type=low_var_kl \
+     actor_rollout_ref.actor.entropy_coeff=0 \
+     actor_rollout_ref.model.enable_gradient_checkpointing=True \
+     actor_rollout_ref.actor.fsdp_config.param_offload=False \
+     actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
+     actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=20 \
+     actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
+     actor_rollout_ref.rollout.name=$ENGINE \
+     +actor_rollout_ref.rollout.engine_kwargs.vllm.disable_mm_preprocessor_cache=True \
+     actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
+     actor_rollout_ref.rollout.enable_chunked_prefill=False \
+     actor_rollout_ref.rollout.enforce_eager=False \
+     actor_rollout_ref.rollout.free_cache_engine=True \
+     actor_rollout_ref.rollout.n=5 \
+     actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=20 \
+     actor_rollout_ref.ref.fsdp_config.param_offload=True \
+     algorithm.use_kl_in_reward=False \
+     trainer.critic_warmup=0 \
+     trainer.logger='["console","wandb"]' \
+     trainer.project_name='verl_grpo_example_geo3k' \
+     trainer.experiment_name='qwen2_5_vl_7b_function_rm' \
+     trainer.n_gpus_per_node=8 \
+     trainer.nnodes=1 \
+     trainer.save_freq=20 \
+     trainer.test_freq=5 \
+     trainer.total_epochs=15 $@
code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen3-32b_npu.sh ADDED
@@ -0,0 +1,58 @@
1
+ set -x
2
+
3
+ project_name='GRPO-Qwen3'
4
+ exp_name='GRPO-Qwen3-32b-npu'
5
+ gen_tp=4
6
+ RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
7
+ MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen3-32B"}
8
+ TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/gsm8k/train.parquet"}
9
+ TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/gsm8k/test.parquet"}
10
+
11
+ python3 -m verl.trainer.main_ppo \
12
+ algorithm.adv_estimator=grpo \
13
+ data.train_files="${TRAIN_FILE}" \
14
+ data.val_files="${TEST_FILE}" \
15
+ data.train_batch_size=1024 \
16
+ data.max_prompt_length=2048 \
17
+ data.max_response_length=2048 \
18
+ data.filter_overlong_prompts=True \
19
+ data.truncation='error' \
20
+ data.shuffle=False \
21
+ actor_rollout_ref.model.path=${MODEL_PATH} \
22
+ actor_rollout_ref.actor.optim.lr=1e-6 \
23
+ actor_rollout_ref.model.use_remove_padding=True \
24
+ actor_rollout_ref.actor.ulysses_sequence_parallel_size=4 \
25
+ +actor_rollout_ref.actor.fsdp_config.mixed_precision.param_dtype=bf16 \
26
+ +actor_rollout_ref.actor.fsdp_config.mixed_precision.reduce_dtype=bf16 \
27
+ +actor_rollout_ref.actor.fsdp_config.mixed_precision.buffer_dtype=fp32 \
28
+ actor_rollout_ref.actor.ppo_mini_batch_size=64 \
29
+ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=8 \
30
+ actor_rollout_ref.actor.use_kl_loss=True \
31
+ actor_rollout_ref.actor.entropy_coeff=0 \
32
+ actor_rollout_ref.actor.kl_loss_coef=0.001 \
33
+ actor_rollout_ref.actor.kl_loss_type=low_var_kl \
34
+ actor_rollout_ref.model.enable_gradient_checkpointing=True \
35
+ actor_rollout_ref.actor.fsdp_config.param_offload=True \
36
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
37
+ actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8 \
38
+ actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
39
+ actor_rollout_ref.rollout.name=vllm \
40
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.7 \
41
+ actor_rollout_ref.rollout.n=4 \
42
+ actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=8 \
43
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
44
+ actor_rollout_ref.actor.use_torch_compile=False \
45
+ actor_rollout_ref.ref.use_torch_compile=False \
46
+ actor_rollout_ref.rollout.enable_chunked_prefill=True \
47
+ actor_rollout_ref.rollout.max_num_batched_tokens=32768 \
48
+ algorithm.use_kl_in_reward=False \
49
+ trainer.critic_warmup=0 \
50
+ trainer.logger=['console','tensorboard'] \
51
+ trainer.project_name="${project_name}" \
52
+ trainer.experiment_name="${exp_name}" \
53
+ trainer.n_gpus_per_node=8 \
54
+ trainer.nnodes=4 \
55
+ trainer.resume_from_path=checkpoints/ \
56
+ trainer.save_freq=500 \
57
+ trainer.test_freq=50 \
58
+ trainer.total_epochs=50 $@
code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen3-8b.sh ADDED
@@ -0,0 +1,43 @@
1
+ # Tested successfully on the hiyouga/verl:ngc-th2.6.0-cu126-vllm0.8.4-flashinfer0.2.2-cxx11abi0 image.
2
+ # It outperforms the Qwen2 7B base model by two percentage points on the test set of GSM8K.
3
+
4
+ set -x
5
+
6
+ python3 -m verl.trainer.main_ppo \
7
+ algorithm.adv_estimator=grpo \
8
+ data.train_files=$HOME/data/gsm8k/train.parquet \
9
+ data.val_files=$HOME/data/gsm8k/test.parquet \
10
+ data.train_batch_size=1024 \
11
+ data.max_prompt_length=512 \
12
+ data.max_response_length=1024 \
13
+ data.filter_overlong_prompts=True \
14
+ data.truncation='error' \
15
+ actor_rollout_ref.model.path=Qwen/Qwen3-8B \
16
+ actor_rollout_ref.actor.optim.lr=1e-6 \
17
+ actor_rollout_ref.model.use_remove_padding=True \
18
+ actor_rollout_ref.actor.ppo_mini_batch_size=256 \
19
+ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 \
20
+ actor_rollout_ref.actor.use_kl_loss=True \
21
+ actor_rollout_ref.actor.kl_loss_coef=0.001 \
22
+ actor_rollout_ref.actor.kl_loss_type=low_var_kl \
23
+ actor_rollout_ref.actor.entropy_coeff=0 \
24
+ actor_rollout_ref.model.enable_gradient_checkpointing=True \
25
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
26
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
27
+ actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 \
28
+ actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
29
+ actor_rollout_ref.rollout.name=vllm \
30
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
31
+ actor_rollout_ref.rollout.n=5 \
32
+ actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 \
33
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
34
+ algorithm.use_kl_in_reward=False \
35
+ trainer.critic_warmup=0 \
36
+ trainer.logger='["console","wandb"]' \
37
+ trainer.project_name='verl_grpo_example_gsm8k' \
38
+ trainer.experiment_name='qwen3_8b_function_rm' \
39
+ trainer.n_gpus_per_node=8 \
40
+ trainer.nnodes=1 \
41
+ trainer.save_freq=20 \
42
+ trainer.test_freq=5 \
43
+ trainer.total_epochs=15 $@
code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen3_8b_grpo_sglang_1k_spmd_npu.sh ADDED
@@ -0,0 +1,71 @@
1
+ set -x
2
+ export HCCL_CONNECT_TIMEOUT=1500
3
+ export HCCL_HOST_SOCKET_PORT_RANGE=60000-60050
4
+ export HCCL_NPU_SOCKET_PORT_RANGE=61000-61050
5
+ export RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES=1
6
+ export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3
7
+ # WORKSPACE_HOME and DATA_HOME support custom path configuration.
8
+ WORKSPACE_HOME=$PWD
9
+ DATA_HOME=$PWD
10
+
11
+ sp_size=4
12
+ num_npu=4
13
+ tp_size=4
14
+ train_prompt_bsz=16
15
+ train_prompt_mini_bsz=16
16
+
17
+ max_prompt_length=512
18
+ max_response_length=1024
19
+
20
+ CKPTS_DIR=$WORKSPACE_HOME/logs/ckpt/qwen3_8b
21
+ model_path=$DATA_HOME/models/Qwen3-8B
22
+ train_data=$DATA_HOME/datasets/processed_gsm8k/train.parquet
23
+ valid_data=$DATA_HOME/datasets/processed_gsm8k/test.parquet
24
+
25
+ python3 -m verl.trainer.main_ppo \
26
+ algorithm.adv_estimator=grpo \
27
+ data.train_files=$train_data \
28
+ data.val_files=$valid_data \
29
+ data.train_batch_size=$train_prompt_bsz \
30
+ data.max_prompt_length=$max_prompt_length \
31
+ data.max_response_length=$max_response_length \
32
+ data.filter_overlong_prompts=True \
33
+ data.truncation='error' \
34
+ actor_rollout_ref.model.path=$model_path \
35
+ actor_rollout_ref.actor.optim.lr=1e-6 \
36
+ actor_rollout_ref.model.use_remove_padding=True \
37
+ actor_rollout_ref.actor.ppo_mini_batch_size=$train_prompt_mini_bsz \
38
+ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \
39
+ actor_rollout_ref.actor.use_kl_loss=True \
40
+ actor_rollout_ref.actor.entropy_coeff=0 \
41
+ actor_rollout_ref.actor.kl_loss_coef=0.001 \
42
+ actor_rollout_ref.actor.kl_loss_type=low_var_kl \
43
+ actor_rollout_ref.actor.use_torch_compile=False \
44
+ actor_rollout_ref.model.enable_gradient_checkpointing=True \
45
+ actor_rollout_ref.actor.fsdp_config.param_offload=True \
46
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
47
+ actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2 \
48
+ actor_rollout_ref.rollout.tensor_model_parallel_size=$tp_size \
49
+ actor_rollout_ref.rollout.name=sglang \
50
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.3 \
51
+ actor_rollout_ref.rollout.n=5 \
52
+ +actor_rollout_ref.rollout.engine_kwargs.sglang.attention_backend="ascend" \
53
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
54
+ actor_rollout_ref.rollout.enable_chunked_prefill=False \
55
+ actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=2 \
56
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
57
+ actor_rollout_ref.nccl_timeout=1800 \
58
+ algorithm.use_kl_in_reward=False \
59
+ trainer.critic_warmup=0 \
60
+ trainer.logger=console \
61
+ trainer.val_before_train=False \
62
+ trainer.project_name='verl_grpo_example_512_1024_gsm8k' \
63
+ trainer.experiment_name='qwen3_8b_function_rm' \
64
+ trainer.n_gpus_per_node=$num_npu \
65
+ trainer.nnodes=1 \
66
+ trainer.save_freq=1000 \
67
+ trainer.test_freq=10000 \
68
+ trainer.total_epochs=5 \
69
+ trainer.default_local_dir="${CKPTS_DIR}" \
70
+ actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \
71
+ actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} $@
code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen3_vl-8b-megatron.sh ADDED
@@ -0,0 +1,86 @@
1
+ set -x
2
+ ENGINE=${1:-vllm}
3
+ export CUDA_DEVICE_MAX_CONNECTIONS=1 # For megatron communication/computation overlapping
4
+
5
+ # dependency: vllm>=0.11.0, megatron-lm>=0.13, mbridge with qwen3vl_cp branch
6
+ # environment option 1: use a stable container image newer than docker://verlai/verl:vllm011.dev6
7
+ # and install mbridge in it by following the instruction in the container
8
+ # pip uninstall mbridge if you have installed it
9
+ # pip install git+https://github.com/ISEEKYAN/mbridge.git@qwen3vl_cp # for correct mbridge
10
+ # environment option 2: use container docker://verlai/verl:vllm011.dev_qwenvl_cp
11
+
12
+
13
+ export VLLM_ALLREDUCE_USE_SYMM_MEM=0 # for vllm0.11.0 with TP
14
+
15
+
16
+ HF_MODEL_PATH=${HF_MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen3-VL-8B-Instruct"}
17
+
18
+ GEN_TP=${GEN_TP:-4}
19
+ CP=${CP:-2}
20
+ TP=${TP:-2}
21
+ PP=${PP:-2}
22
+
23
+ train_path=$HOME/data/geo3k/train.parquet
24
+ test_path=$HOME/data/geo3k/test.parquet
25
+
26
+ python3 -m verl.trainer.main_ppo --config-path=config \
27
+ --config-name='ppo_megatron_trainer.yaml'\
28
+ algorithm.adv_estimator=grpo \
29
+ data.train_files="$train_path" \
30
+ data.val_files="$test_path" \
31
+ data.train_batch_size=512 \
32
+ data.max_prompt_length=1024 \
33
+ data.max_response_length=2048 \
34
+ data.filter_overlong_prompts=True \
35
+ data.truncation='error' \
36
+ actor_rollout_ref.model.path=$HF_MODEL_PATH \
37
+ actor_rollout_ref.actor.optim.lr=1e-6 \
38
+ actor_rollout_ref.actor.ppo_mini_batch_size=128 \
39
+ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \
40
+ actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=$PP \
41
+ actor_rollout_ref.actor.megatron.tensor_model_parallel_size=$TP \
42
+ actor_rollout_ref.actor.megatron.context_parallel_size=$CP \
43
+ actor_rollout_ref.actor.use_kl_loss=True \
44
+ actor_rollout_ref.actor.kl_loss_coef=0.01 \
45
+ actor_rollout_ref.actor.kl_loss_type=low_var_kl \
46
+ actor_rollout_ref.actor.entropy_coeff=0 \
47
+ actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \
48
+ actor_rollout_ref.rollout.tensor_model_parallel_size=$GEN_TP \
49
+ actor_rollout_ref.actor.use_dynamic_bsz=True \
50
+ actor_rollout_ref.actor.ppo_max_token_len_per_gpu=4096 \
51
+ actor_rollout_ref.ref.log_prob_use_dynamic_bsz=True \
52
+ actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=4096 \
53
+ actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=True \
54
+ actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=4096 \
55
+ actor_rollout_ref.rollout.name=$ENGINE \
56
+ +actor_rollout_ref.rollout.engine_kwargs.vllm.disable_mm_preprocessor_cache=True \
57
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.7 \
58
+ actor_rollout_ref.rollout.n=5 \
59
+ actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \
60
+ actor_rollout_ref.actor.megatron.use_mbridge=True \
61
+ actor_rollout_ref.actor.megatron.param_offload=True \
62
+ actor_rollout_ref.actor.megatron.optimizer_offload=True \
63
+ actor_rollout_ref.actor.megatron.grad_offload=True \
64
+ actor_rollout_ref.ref.megatron.param_offload=True \
65
+ +actor_rollout_ref.actor.optim.override_optimizer_config.optimizer_offload_fraction=1 \
66
+ +actor_rollout_ref.actor.optim.override_optimizer_config.overlap_cpu_optimizer_d2h_h2d=True \
67
+ +actor_rollout_ref.actor.optim.override_optimizer_config.use_precision_aware_optimizer=True \
68
+ +actor_rollout_ref.actor.optim.override_optimizer_config.optimizer_cpu_offload=True \
69
+ +actor_rollout_ref.actor.megatron.override_transformer_config.moe_router_dtype=fp32 \
70
+ +actor_rollout_ref.actor.megatron.override_transformer_config.moe_enable_deepep=True \
71
+ +actor_rollout_ref.actor.megatron.override_transformer_config.moe_token_dispatcher_type=flex \
72
+ +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_method=uniform \
73
+ +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=full \
74
+ +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_num_layers=1 \
75
+ +actor_rollout_ref.actor.megatron.override_transformer_config.gradient_accumulation_fusion=True \
76
+ +actor_rollout_ref.actor.megatron.override_transformer_config.moe_permute_fusion=True \
77
+ algorithm.use_kl_in_reward=False \
78
+ trainer.critic_warmup=0 \
79
+ trainer.logger='["console","wandb"]' \
80
+ trainer.project_name='verl_grpo_example_geo3k' \
81
+ trainer.experiment_name='qwen3_vl_8b_megatron' \
82
+ trainer.n_gpus_per_node=8 \
83
+ trainer.nnodes=1 \
84
+ trainer.save_freq=20 \
85
+ trainer.test_freq=5 \
86
+ trainer.total_epochs=15 $@
code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen3moe-30b_grpo_megatron_vllm_npu.sh ADDED
@@ -0,0 +1,188 @@
1
+ #!/bin/bash
2
+ set -xeuo pipefail
3
+ mkdir -p logs
4
+
5
+ # Project Configuration
6
+ project_name='GRPO-Qwen3-30b-A3B-BASE-MATH'
7
+ exp_name='GRPO-Qwen3-30B-A3B-BASE-Megatron-vLLM'
8
+
9
+ # Node Info
10
+ NNODES=${NNODES:-1}
11
+ NPUS_PER_NODE=${NPUS_PER_NODE:-16}
12
+
13
+ # Model Weights Paths
14
+ MODEL_PATH=Qwen/Qwen3-30B-A3B-Base
15
+ MCORE_MODEL_PATH=Qwen/Qwen3-30B-A3B-Base-dist
16
+ RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
17
+ CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}
18
+
19
+ # File System Paths
20
+ TRAIN_FILE=$RAY_DATA_HOME/dataset/gsm8k/train.parquet
21
+ TEST_FILE=$RAY_DATA_HOME/dataset/gsm8k/test.parquet
22
+
23
+ # Data Configuration
24
+ max_prompt_length=$((1024 * 1))
25
+ max_response_length=$((1024 * 1))
26
+
27
+ # Training Batch Configuration
28
+ train_prompt_bsz=128
29
+ train_prompt_mini_bsz=32
30
+ n_resp_per_prompt=16
31
+
32
+ # Algorithm Configuration
33
+ adv_estimator=grpo
34
+ use_kl_in_reward=False
35
+ kl_coef=0.0
36
+ use_kl_loss=True
37
+ kl_loss_coef=0.001
38
+
39
+ # Performance and Memory Management Configuration
40
+ all_offload=True
41
+ use_dynamic_bsz=True
42
+ actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 4))
43
+ infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 8))
44
+ optimizer_offload_fraction=1
45
+
46
+ # Megatron Configuration
47
+ train_tp=2
48
+ train_ep=8
49
+ train_etp=1
50
+ train_pp=2
51
+ train_cp=1
52
+
53
+ # vLLM Configuration
54
+ gen_tp=2
55
+ gen_dp=1
56
+ gen_ep=1
57
+ gpu_memory_utilization=0.8
58
+ max_model_len=$((max_prompt_length + max_response_length))
59
+ max_num_batched_tokens=$(((max_prompt_length + max_response_length) * 1))
60
+
61
+ # Data Configuration
62
+ DATA_CONFIG=(
63
+ data.train_files="${TRAIN_FILE}"
64
+ data.val_files="${TEST_FILE}"
65
+ data.prompt_key=prompt
66
+ data.train_batch_size=${train_prompt_bsz}
67
+ data.max_prompt_length=${max_prompt_length}
68
+ data.max_response_length=${max_response_length}
69
+ data.filter_overlong_prompts=False
70
+ data.truncation='left'
71
+ )
72
+
73
+ # Model Configuration
74
+ MODEL_CONFIG=(
75
+ actor_rollout_ref.model.path="${MODEL_PATH}"
76
+ actor_rollout_ref.model.use_remove_padding=True
77
+ )
78
+
79
+ # Algorithm Configuration
80
+ ALGORITHM_CONFIG=(
81
+ algorithm.adv_estimator=${adv_estimator}
82
+ algorithm.use_kl_in_reward=${use_kl_in_reward}
83
+ algorithm.kl_ctrl.kl_coef=${kl_coef}
84
+ )
85
+
86
+ # Actor Model Configuration
87
+ ACTOR_CONFIG=(
88
+ actor_rollout_ref.actor.use_torch_compile=False
89
+ actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz}
90
+ actor_rollout_ref.actor.use_kl_loss=${use_kl_loss}
91
+ actor_rollout_ref.actor.kl_loss_type=low_var_kl
92
+ actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef}
93
+ actor_rollout_ref.actor.entropy_coeff=0
94
+ actor_rollout_ref.actor.ppo_epochs=1
95
+ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1
96
+ actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len}
97
+ actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz}
99
+ actor_rollout_ref.actor.optim.lr=1e-6
100
+ +actor_rollout_ref.actor.optim.override_optimizer_config.optimizer_offload_fraction=${optimizer_offload_fraction}
101
+ +actor_rollout_ref.actor.optim.override_optimizer_config.use_precision_aware_optimizer=True
102
+ +actor_rollout_ref.actor.optim.override_optimizer_config.optimizer_cpu_offload=True
103
+ actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${train_tp}
104
+ actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${train_pp}
105
+ actor_rollout_ref.actor.megatron.context_parallel_size=${train_cp}
106
+ actor_rollout_ref.actor.megatron.expert_model_parallel_size=${train_ep}
107
+ actor_rollout_ref.actor.megatron.expert_tensor_parallel_size=${train_etp}
108
+ actor_rollout_ref.actor.megatron.param_offload=${all_offload}
109
+ actor_rollout_ref.actor.megatron.optimizer_offload=${all_offload}
110
+ actor_rollout_ref.actor.megatron.grad_offload=${all_offload}
111
+ actor_rollout_ref.actor.megatron.dist_checkpointing_path=${MCORE_MODEL_PATH}
112
+ actor_rollout_ref.actor.megatron.use_dist_checkpointing=False
113
+ +actor_rollout_ref.actor.megatron.override_transformer_config.use_flash_attn=True
114
+ +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_method=uniform
115
+ +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=full
116
+ +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_num_layers=1
117
+ )
118
+
119
+ # Reference Model Configuration
120
+ REF_CONFIG=(
121
+ actor_rollout_ref.ref.use_torch_compile=False
122
+ actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1
123
+ actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz}
124
+ actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len}
125
+ actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${train_tp}
126
+ actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=${train_pp}
127
+ actor_rollout_ref.ref.megatron.context_parallel_size=${train_cp}
128
+ actor_rollout_ref.ref.megatron.expert_model_parallel_size=${train_ep}
129
+ actor_rollout_ref.ref.megatron.expert_tensor_parallel_size=${train_etp}
130
+ actor_rollout_ref.ref.megatron.param_offload=${all_offload}
131
+ actor_rollout_ref.ref.megatron.dist_checkpointing_path=${MCORE_MODEL_PATH}
132
+ actor_rollout_ref.ref.megatron.use_dist_checkpointing=False
133
+ )
134
+
135
+ # Rollout Configuration
136
+ ROLLOUT_CONFIG=(
137
+ actor_rollout_ref.rollout.name=vllm
138
+ actor_rollout_ref.rollout.n=${n_resp_per_prompt}
139
+ actor_rollout_ref.rollout.top_p=1.0
140
+ actor_rollout_ref.rollout.top_k=-1
141
+ actor_rollout_ref.rollout.temperature=1.0
142
+ actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1
143
+ actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz}
144
+ actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len}
145
+ actor_rollout_ref.rollout.gpu_memory_utilization=${gpu_memory_utilization}
146
+ actor_rollout_ref.rollout.max_num_batched_tokens=${max_num_batched_tokens}
147
+ actor_rollout_ref.rollout.max_model_len=${max_model_len}
148
+ actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp}
149
+ actor_rollout_ref.rollout.data_parallel_size=${gen_dp}
150
+ actor_rollout_ref.rollout.expert_parallel_size=${gen_ep}
151
+ actor_rollout_ref.rollout.enable_chunked_prefill=True
152
+ actor_rollout_ref.rollout.enable_prefix_caching=True
153
+ actor_rollout_ref.rollout.enforce_eager=True
154
+ actor_rollout_ref.rollout.free_cache_engine=True
155
+ actor_rollout_ref.rollout.val_kwargs.n=1
156
+ actor_rollout_ref.rollout.val_kwargs.do_sample=True
157
+ actor_rollout_ref.rollout.val_kwargs.top_p=1.0
158
+ actor_rollout_ref.rollout.val_kwargs.top_k=-1
159
+ actor_rollout_ref.rollout.val_kwargs.temperature=1.0
160
+ )
161
+
162
+ # Trainer Configuration
163
+ TRAINER_CONFIG=(
164
+ trainer.logger='["console","tensorboard"]'
165
+ trainer.project_name="${project_name}"
166
+ trainer.experiment_name="${exp_name}"
167
+ trainer.nnodes="${NNODES}"
168
+ trainer.n_gpus_per_node="${NPUS_PER_NODE}"
169
+ trainer.device='npu'
170
+ trainer.total_epochs=15
171
+ trainer.val_before_train=False
172
+ trainer.test_freq=-1
173
+ trainer.save_freq=-1
174
+ trainer.default_local_dir="${CKPTS_DIR}"
175
+ )
176
+
177
+ # Main GRPO Training Command
178
+ python3 -m verl.trainer.main_ppo \
179
+ --config-path=config \
180
+ --config-name='ppo_megatron_trainer.yaml' \
181
+ "${DATA_CONFIG[@]}" \
182
+ "${MODEL_CONFIG[@]}" \
183
+ "${ACTOR_CONFIG[@]}" \
184
+ "${REF_CONFIG[@]}" \
185
+ "${ROLLOUT_CONFIG[@]}" \
186
+ "${ALGORITHM_CONFIG[@]}" \
187
+ "${TRAINER_CONFIG[@]}" \
188
+ "$@" | tee logs/run_qwen3moe-30b_grpo_megatron_vllm_npu.log
code/RL_model/verl/verl_train/examples/mtp_trainer/runtime_env.yaml ADDED
@@ -0,0 +1,17 @@
1
+ working_dir: ./
2
+
3
+ excludes:
4
+ - ".git/"
5
+
6
+ env_vars:
7
+ VLLM_USE_V1: "1"
8
+ HYDRA_FULL_ERROR: "1"
9
+ NCCL_NVLS_ENABLE: "0"
10
+ NCCL_SOCKET_IFNAME: "eth0"
11
+ TMPDIR: "/tmp"
12
+ CUDA_HOME: "/usr/local/cuda"
13
+ CUDA_TMPDIR: "/tmp"
14
+ CUDA_CACHE_PATH: "/tmp/cuda_cache"
15
+ # For distributed training, the path must be set on a distributed file system (DFS) to ensure visibility across all nodes.
16
+ HF_HOME: "/tmp/hf_home_mimo"
17
+ PYTHONPATH: "/tmp/hf_home_mimo/modules/"
code/RL_model/verl/verl_train/examples/mtp_trainer/test_dapo_mimo_7b_with_mtp_math_megatron.sh ADDED
@@ -0,0 +1,144 @@
1
+ #!/usr/bin/env bash
2
+
3
+ set -xeuo pipefail
4
+
5
+ project_name='DAPO'
6
+ exp_name='DAPO-mimo-7b-rl-megatron'
7
+
8
+ adv_estimator=grpo
9
+
10
+ use_kl_in_reward=False
11
+ kl_coef=0.0
12
+ use_kl_loss=False
13
+ kl_loss_coef=0.0
14
+
15
+ clip_ratio_low=0.2
16
+ clip_ratio_high=0.28
17
+
18
+ max_prompt_length=$((1024 * 2))
19
+ max_response_length=$((1024 * 8))
20
+ enable_overlong_buffer=True
21
+ overlong_buffer_len=$((1024 * 4))
22
+ overlong_penalty_factor=1.0
23
+
24
+ loss_agg_mode="token-mean"
25
+
26
+ train_prompt_bsz=128
27
+ n_resp_per_prompt=16
28
+ train_prompt_mini_bsz=32
29
+
30
+ # Ray
31
+ # RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
32
+ # WORKING_DIR=${WORKING_DIR:-"${PWD}"}
33
+ # RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/examples/mtp_trainer/runtime_env.yaml"}
34
+ NNODES=${NNODES:-16}
35
+ NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
36
+ # Paths
37
+ RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
38
+ # very important! please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface
39
+ MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/MiMo-7B-RL"}
40
+ CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}
41
+ TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"}
42
+ TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"}
43
+
44
+ # Algorithm
45
+ temperature=1.0
46
+ top_p=1.0
47
+ top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
48
+ val_top_p=0.7
49
+
50
+ # Performance Related Parameter
51
+ use_dynamic_bsz=True
52
+ actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
53
+ infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
54
+ offload=True
55
+ gen_tp=4
56
+ train_tp=2
57
+ train_pp=2
58
+ train_cp=2
59
+
60
+ common_params=(
61
+ actor_rollout_ref.model.mtp.enable=True
62
+ actor_rollout_ref.model.mtp.enable_train=True
63
+ actor_rollout_ref.model.mtp.mtp_loss_scaling_factor=0.1
64
+ actor_rollout_ref.model.mtp.detach_encoder=True
65
+ )
66
+
67
+ python -m verl.trainer.main_ppo \
68
+ --config-path=config \
69
+ --config-name='ppo_megatron_trainer.yaml' \
70
+ data.train_files="${TRAIN_FILE}" \
71
+ data.val_files="${TEST_FILE}" \
72
+ data.prompt_key=prompt \
73
+ data.truncation='left' \
74
+ data.max_prompt_length=${max_prompt_length} \
75
+ data.max_response_length=${max_response_length} \
76
+ data.train_batch_size=${train_prompt_bsz} \
77
+ actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
78
+ algorithm.adv_estimator=${adv_estimator} \
79
+ algorithm.use_kl_in_reward=${use_kl_in_reward} \
80
+ algorithm.kl_ctrl.kl_coef=${kl_coef} \
81
+ actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
82
+ actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
83
+ actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
84
+ actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
85
+ actor_rollout_ref.actor.clip_ratio_c=10.0 \
86
+ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \
87
+ actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
88
+ actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
89
+ actor_rollout_ref.model.path="${MODEL_PATH}" \
90
+ actor_rollout_ref.actor.optim.lr=1e-6 \
91
+ actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
92
+ actor_rollout_ref.actor.optim.weight_decay=0.1 \
93
+ actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
94
+ actor_rollout_ref.actor.megatron.param_offload=${offload} \
95
+ actor_rollout_ref.actor.megatron.optimizer_offload=${offload} \
96
+ actor_rollout_ref.actor.megatron.grad_offload=${offload} \
97
+ actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${train_pp} \
98
+ actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${train_tp} \
99
+ actor_rollout_ref.actor.megatron.context_parallel_size=${train_cp} \
100
+ actor_rollout_ref.actor.entropy_coeff=0 \
101
+ actor_rollout_ref.actor.optim.clip_grad=1.0 \
102
+ actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
103
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
104
+ actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
105
+ actor_rollout_ref.rollout.enable_chunked_prefill=True \
106
+ actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
107
+ actor_rollout_ref.rollout.temperature=${temperature} \
108
+ actor_rollout_ref.rollout.top_p=${top_p} \
109
+ actor_rollout_ref.rollout.top_k=${top_k} \
110
+ actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
111
+ actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
112
+ actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
113
+ actor_rollout_ref.rollout.val_kwargs.do_sample=True \
114
+ actor_rollout_ref.rollout.val_kwargs.n=1 \
115
+ actor_rollout_ref.rollout.name=sglang \
116
+ actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=${train_pp} \
117
+ actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${train_tp} \
118
+ actor_rollout_ref.ref.megatron.context_parallel_size=${train_cp} \
119
+ actor_rollout_ref.ref.megatron.param_offload=${offload} \
120
+ reward_model.reward_manager=dapo \
121
+ +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
122
+ +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
123
+ +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
124
+ +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
125
+ +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
126
+ trainer.logger='["console","tensorboard"]' \
127
+ trainer.project_name="${project_name}" \
128
+ trainer.experiment_name="${exp_name}" \
129
+ trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
130
+ trainer.nnodes="${NNODES}" \
131
+ trainer.val_before_train=False \
132
+ trainer.test_freq=10 \
133
+ trainer.save_freq=-1 \
134
+ trainer.total_epochs=10 \
135
+ trainer.resume_mode=auto \
136
+ trainer.log_val_generations=10 \
137
+ actor_rollout_ref.rollout.disable_log_stats=False \
138
+ actor_rollout_ref.rollout.prometheus.enable=True \
139
+ actor_rollout_ref.rollout.prometheus.port=44398 \
140
+ actor_rollout_ref.model.trust_remote_code=True \
141
+ data.trust_remote_code=True \
142
+ trainer.total_training_steps=400 \
143
+ actor_rollout_ref.actor.megatron.use_mbridge=True \
144
+ "${common_params[@]}"
code/RL_model/verl/verl_train/examples/ppo_trainer/README.md ADDED
@@ -0,0 +1,103 @@
1
+ # Proximal Policy Optimization (PPO)
2
+
3
+ Proximal Policy Optimization (PPO) is a family of policy gradient methods for reinforcement learning, proposed by OpenAI in 2017. PPO strikes a balance between simplicity, stability, and performance, making it one of the most widely used algorithms in modern RL applications, including large-scale language model fine-tuning.
4
+
5
+ Traditional policy gradient methods like REINFORCE or Vanilla Policy Gradient suffer from:
6
+
7
+ - High variance and sample inefficiency.
8
+ - Instability due to large policy updates.
9
+
10
+ PPO addresses these problems with a clipped surrogate objective that avoids overly large policy updates without requiring second-order derivatives.
11
+
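+ For reference, the clipped surrogate objective from the PPO paper is
+
+ ```math
+ L^{\mathrm{CLIP}}(\theta) = \mathbb{E}_t\Big[\min\big(r_t(\theta)\,\hat{A}_t,\ \mathrm{clip}\big(r_t(\theta),\,1-\epsilon,\,1+\epsilon\big)\,\hat{A}_t\big)\Big],
+ \qquad r_t(\theta) = \frac{\pi_\theta(a_t \mid s_t)}{\pi_{\theta_{\mathrm{old}}}(a_t \mid s_t)}
+ ```
+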
12
+ For more technical details regarding PPO, we suggest reading the introduction in the [OpenAI spinning up tutorial](https://spinningup.openai.com/en/latest/algorithms/ppo.html), and the paper [Proximal Policy Optimization Algorithms](https://arxiv.org/abs/1707.06347).
13
+
14
+ ## Key Components
15
+
16
+ - Actor-Critic Architecture: PPO requires both an actor model (policy) and a critic model (value function). This differs from other algorithms like GRPO and RLOO that don't require a critic model.
17
+
18
+ - Generalized Advantage Estimation (GAE): PPO uses GAE for computing advantage values, which helps reduce variance in policy gradient estimates while maintaining low bias (a minimal sketch of the recursion follows this list).
19
+
20
+ - Clipped Surrogate Objective: The core of PPO is implemented through the clipped surrogate objective function that limits policy updates.
21
+
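+ The GAE recursion itself is short; below is a minimal NumPy sketch of the textbook recursion (illustrative only, termination masking omitted; the function name and signature are ours, not verl's implementation):
+
+ ```python
+ import numpy as np
+
+ def gae_advantages(rewards, values, last_value, gamma=0.99, lam=0.95):
+     """Textbook GAE, computed backwards in time:
+     delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
+     A_t     = delta_t + gamma * lam * A_{t+1}
+     """
+     values = np.append(values, last_value)  # V(s_0) ... V(s_T)
+     advantages = np.zeros(len(rewards))
+     gae = 0.0
+     for t in reversed(range(len(rewards))):
+         delta = rewards[t] + gamma * values[t + 1] - values[t]
+         gae = delta + gamma * lam * gae
+         advantages[t] = gae
+     return advantages  # critic regression targets are advantages + values[:-1]
+ ```
+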
22
+ ## Configuration
23
+
24
+ Note that all configs containing `micro_batch_size` set the maximum sample or token count per forward or backward pass to avoid GPU OOMs; changing their values should not affect algorithmic/convergence behavior.
25
+
26
+ Most critic configs are similar to those of actors. Note that the critic model is omitted from the figure below.
27
+
28
+ ![image](https://github.com/user-attachments/assets/16aebad1-0da6-4eb3-806d-54a74e712c2d)
29
+
30
+ - `data.train_batch_size`: The global batch size of prompts used to generate a set of sampled trajectories/rollouts. The number of responses/trajectories is `data.train_batch_size * actor_rollout_ref.rollout.n` (see the worked example after this list).
31
+
32
+ - `actor_rollout_ref.actor.ppo_mini_batch_size`: The set of sampled trajectories is split into multiple mini-batches with batch_size=ppo_mini_batch_size for PPO actor updates. The ppo_mini_batch_size is a global size across all workers
33
+
34
+ - `critic.ppo_mini_batch_size`: The set of sampled trajectories is split into multiple mini-batches with batch_size=ppo_mini_batch_size for PPO critic updates. The ppo_mini_batch_size is a global size across all workers
35
+
36
+ - `actor_rollout_ref.actor.clip_ratio`: The PPO clip range. Defaults to 0.2.
37
+
38
+ - `actor_rollout_ref.actor.ppo_epochs`: Number of epochs for PPO updates on one set of sampled trajectories for actor
39
+
40
+ - `critic.ppo_epochs`: Number of epochs for PPO updates on one set of sampled trajectories for critic. Defaults to `actor_rollout_ref.actor.ppo_epochs`
41
+
42
+ - `algorithm.gamma`: discount factor
43
+
44
+ - `algorithm.lam`: The lambda term that trades off between bias and variance in the GAE estimator
45
+
46
+ - `algorithm.adv_estimator`: Supported values: gae, grpo, reinforce_plus_plus, reinforce_plus_plus_baseline, rloo, rloo_vectorized
47
+
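+ As a concrete, purely illustrative example of how these sizes interact: with `data.train_batch_size=1024` and `actor_rollout_ref.rollout.n=5`, each rollout step produces `1024 * 5 = 5120` trajectories. These trajectories are split into mini-batches of `ppo_mini_batch_size` for the optimizer updates of each PPO epoch, and `ppo_micro_batch_size_per_gpu` only controls how each mini-batch is chunked for gradient accumulation, so it affects memory and throughput but not the training result.
+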
48
+ ## Advanced Extensions
49
+
50
+ ### KL Divergence Control
51
+
52
+ These options keep the policy from diverging too far from a reference policy. Two mechanisms are available: a KL reward penalty and a KL loss. For more technical details, see [Training language models to follow instructions with human feedback](https://arxiv.org/abs/2203.02155).
53
+
54
+ Options to use KL loss for KL divergence control:
55
+
56
+ - `actor_rollout_ref.actor.use_kl_loss`: Whether to use a KL loss in the actor. When enabled, KL is not applied in the reward function. Default is False.
57
+
58
+ - `actor_rollout_ref.actor.kl_loss_coef`: The coefficient of kl loss. Default is 0.001.
59
+
60
+ - `actor_rollout_ref.actor.kl_loss_type`: How to calculate the KL divergence between the actor and the reference policy. Supported values: kl (k1), abs, mse (k2), low_var_kl (k3), and full. Appending "+" (e.g., 'k1+' and 'k3+') applies a straight-through trick that uses k2 for unbiased gradient estimation regardless of the KL value estimate (see https://github.com/volcengine/verl/pull/2953#issuecomment-3162113848 for more details). See this blog post for a detailed analysis: http://joschu.net/blog/kl-approx.html (a minimal sketch of the k1/k2/k3 estimators follows this list).
61
+
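+ A minimal sketch of the k1/k2/k3 estimators from that blog post, assuming per-token log-probs sampled from the actor (illustrative; the function is ours, not verl's exact code):
+
+ ```python
+ import torch
+
+ def kl_estimate(logp_actor: torch.Tensor, logp_ref: torch.Tensor, kind: str = "low_var_kl"):
+     """Monte-Carlo estimators of KL(pi_actor || pi_ref) on samples from pi_actor."""
+     log_ratio = logp_ref - logp_actor  # log(pi_ref / pi_actor)
+     if kind == "kl":           # k1: unbiased, high variance
+         return -log_ratio
+     if kind == "mse":          # k2: 0.5 * (log r)^2, biased, low variance
+         return 0.5 * log_ratio ** 2
+     if kind == "low_var_kl":   # k3: (r - 1) - log r, unbiased, non-negative
+         return torch.exp(log_ratio) - 1.0 - log_ratio
+     raise ValueError(f"unknown kl estimator: {kind}")
+ ```
+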
62
+ Options to use KL penalty in the reward:
63
+
64
+ - `algorithm.use_kl_in_reward`: Whether to enable in-reward kl penalty. Default is False.
65
+
66
+ - `algorithm.kl_penalty`: Supported values: kl (k1), abs, mse (k2), low_var_kl (k3), and full. This defines how the KL divergence between the actor and the reference policy is calculated. For specific options, refer to `kl_penalty` in core_algos.py. See this blog post for a detailed analysis: http://joschu.net/blog/kl-approx.html
67
+
68
+ - `algorithm.kl_ctrl.kl_coef`: The (initial) coefficient of in-reward kl_penalty. Default is 0.001.
69
+ - `algorithm.kl_ctrl.type`: 'fixed' for FixedKLController and 'adaptive' for AdaptiveKLController.
70
+ - `algorithm.kl_ctrl.horizon`: See source code of AdaptiveKLController for details.
71
+ - `algorithm.kl_ctrl.target_kl`: See source code of AdaptiveKLController for details.
72
+
73
+ ### Dual-clip PPO
74
+
75
+ Dual-Clip PPO introduces an additional bound for the case where the advantage is less than zero: the clipped objective, which could otherwise become arbitrarily negative when multiplied by a very large ratio, is bounded from below by a fixed multiple of the advantage (the exact form is given below).
76
+
77
+ ![image](https://github.com/user-attachments/assets/fc232181-d8b0-4307-8dd2-4dc0a4c1c139)
78
+
79
+ - `actor_rollout_ref.actor.clip_ratio_c`: the lower-bound constant c for Dual-clip PPO. Defaults to 3.0.
80
+
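+ Concretely, following the Dual-Clip PPO paper, for $\hat{A}_t < 0$ the per-token objective becomes
+
+ ```math
+ L^{\mathrm{DC}}(\theta) = \max\Big(\min\big(r_t(\theta)\,\hat{A}_t,\ \mathrm{clip}\big(r_t(\theta),\,1-\epsilon,\,1+\epsilon\big)\,\hat{A}_t\big),\ c\,\hat{A}_t\Big), \qquad c > 1,
+ ```
+
+ so a single sample with a very large ratio and a negative advantage can pull the objective down to at most `clip_ratio_c` times that advantage.
+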
81
+ ## Reference Example
82
+
83
+ Qwen2.5 training log and commands: [link](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/Qwen2.5-0.5B-bsz256_2-prompt1024-resp512-0.567.log)
84
+
85
+ ```bash
86
+ bash run_gemma.sh \
87
+ trainer.n_gpus_per_node=1 \
88
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
89
+ trainer.logger=console \
90
+ critic.model.path=Qwen/Qwen2.5-0.5B-Instruct \
91
+ actor_rollout_ref.model.path=Qwen/Qwen2.5-0.5B-Instruct \
92
+ data.train_batch_size=256 \
93
+ actor_rollout_ref.actor.ppo_mini_batch_size=64 \
94
+ actor_rollout_ref.actor.ppo_micro_batch_size=2 \
95
+ critic.ppo_micro_batch_size=2
96
+ ```
97
+
98
+ Reference performance with verl v0.2:
99
+
100
+ | Model | Method | Score | Link |
101
+ |-------------------------------|------------------|-------|------------------------------------------------------------------------------------------------|
102
+ | Qwen/Qwen2.5-0.5B-Instruct | pretrained model | 36.4 | [Qwen Blog](https://qwenlm.github.io/blog/qwen2.5-llm/) |
103
+ | Qwen/Qwen2.5-0.5B-Instruct | PPO | 56.7 | [PPO Command and Logs](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/Qwen2.5-0.5B-bsz256_2-prompt1024-resp512-0.567.log) |
code/RL_model/verl/verl_train/examples/ppo_trainer/run_deepseek7b_llm.sh ADDED
@@ -0,0 +1,42 @@
1
+ set -x
2
+
3
+ python3 -m verl.trainer.main_ppo \
4
+ algorithm.adv_estimator=gae \
5
+ data.train_files=$HOME/data/gsm8k/train.parquet \
6
+ data.val_files=$HOME/data/gsm8k/test.parquet \
7
+ data.train_batch_size=1024 \
8
+ data.max_prompt_length=512 \
9
+ data.max_response_length=512 \
10
+ data.filter_overlong_prompts=True \
11
+ data.truncation='error' \
12
+ actor_rollout_ref.model.path=deepseek-ai/deepseek-llm-7b-chat \
13
+ actor_rollout_ref.actor.optim.lr=1e-6 \
14
+ actor_rollout_ref.model.use_remove_padding=True \
15
+ actor_rollout_ref.actor.ppo_mini_batch_size=256 \
16
+ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \
17
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
18
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
19
+ actor_rollout_ref.actor.use_kl_loss=False \
20
+ actor_rollout_ref.model.enable_gradient_checkpointing=True \
21
+ actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 \
22
+ actor_rollout_ref.rollout.tensor_model_parallel_size=4 \
23
+ actor_rollout_ref.rollout.name=vllm \
24
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
25
+ critic.optim.lr=1e-5 \
26
+ critic.model.use_remove_padding=True \
27
+ critic.model.path=deepseek-ai/deepseek-llm-7b-chat \
28
+ critic.model.enable_gradient_checkpointing=True \
29
+ critic.ppo_micro_batch_size_per_gpu=32 \
30
+ critic.model.fsdp_config.param_offload=False \
31
+ critic.model.fsdp_config.optimizer_offload=False \
32
+ algorithm.use_kl_in_reward=False \
33
+ trainer.critic_warmup=0 \
34
+ trainer.logger='["console","wandb"]' \
35
+ trainer.project_name='verl_example_gsm8k' \
36
+ trainer.experiment_name='deepseek_llm_7b_function_rm' \
37
+ trainer.n_gpus_per_node=8 \
38
+ trainer.nnodes=1 \
39
+ trainer.save_freq=20 \
40
+ trainer.test_freq=1 \
41
+ trainer.use_legacy_worker_impl=auto \
42
+ trainer.total_epochs=15 $@
code/RL_model/verl/verl_train/examples/ppo_trainer/run_deepseek7b_llm_modelscope.sh ADDED
@@ -0,0 +1,42 @@
1
+ set -x
2
+
3
+ VERL_USE_MODELSCOPE=True \
4
+ python3 -m verl.trainer.main_ppo \
5
+ algorithm.adv_estimator=gae \
6
+ data.train_files=$HOME/data/gsm8k/train.parquet \
7
+ data.val_files=$HOME/data/gsm8k/test.parquet \
8
+ data.train_batch_size=1024 \
9
+ data.max_prompt_length=512 \
10
+ data.max_response_length=512 \
11
+ data.filter_overlong_prompts=True \
12
+ data.truncation='error' \
13
+ actor_rollout_ref.model.path=deepseek-ai/deepseek-llm-7b-chat \
14
+ actor_rollout_ref.actor.optim.lr=1e-6 \
15
+ actor_rollout_ref.model.use_remove_padding=True \
16
+ actor_rollout_ref.actor.ppo_mini_batch_size=256 \
17
+ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \
18
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
19
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
20
+ actor_rollout_ref.actor.use_kl_loss=False \
21
+ actor_rollout_ref.model.enable_gradient_checkpointing=True \
22
+ actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 \
23
+ actor_rollout_ref.rollout.tensor_model_parallel_size=4 \
24
+ actor_rollout_ref.rollout.name=vllm \
25
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
26
+ critic.optim.lr=1e-5 \
27
+ critic.model.use_remove_padding=True \
28
+ critic.model.path=deepseek-ai/deepseek-llm-7b-chat \
29
+ critic.model.enable_gradient_checkpointing=True \
30
+ critic.ppo_micro_batch_size_per_gpu=32 \
31
+ critic.model.fsdp_config.param_offload=False \
32
+ critic.model.fsdp_config.optimizer_offload=False \
33
+ algorithm.use_kl_in_reward=False \
34
+ trainer.critic_warmup=0 \
35
+ trainer.logger='["console","wandb"]' \
36
+ trainer.project_name='verl_example_gsm8k' \
37
+ trainer.experiment_name='deepseek_llm_7b_function_rm' \
38
+ trainer.n_gpus_per_node=8 \
39
+ trainer.nnodes=1 \
40
+ trainer.save_freq=20 \
41
+ trainer.test_freq=1 \
42
+ trainer.total_epochs=15 $@
code/RL_model/verl/verl_train/examples/ppo_trainer/run_deepseek7b_llm_pfppo.sh ADDED
@@ -0,0 +1,45 @@
1
+ set -x
2
+
3
+ # pf_ppo.reweight_method options: "pow", "max_min", "max_random"
+ python3 -m verl.trainer.main_ppo \
4
+ algorithm.adv_estimator=gae \
5
+ algorithm.use_pf_ppo=True \
6
+ algorithm.pf_ppo.reweight_method=pow \
7
+ algorithm.pf_ppo.weight_pow=2.0 \
8
+ data.train_files=$HOME/data/gsm8k/train.parquet \
9
+ data.val_files=$HOME/data/gsm8k/test.parquet \
10
+ data.train_batch_size=1024 \
11
+ data.max_prompt_length=512 \
12
+ data.max_response_length=512 \
13
+ data.filter_overlong_prompts=True \
14
+ data.truncation='error' \
15
+ actor_rollout_ref.model.path=deepseek-ai/deepseek-llm-7b-chat \
16
+ actor_rollout_ref.actor.optim.lr=1e-6 \
17
+ actor_rollout_ref.model.use_remove_padding=True \
18
+ actor_rollout_ref.actor.ppo_mini_batch_size=256 \
19
+ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \
20
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
21
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
22
+ actor_rollout_ref.actor.use_kl_loss=False \
23
+ actor_rollout_ref.model.enable_gradient_checkpointing=True \
24
+ actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 \
25
+ actor_rollout_ref.rollout.tensor_model_parallel_size=4 \
26
+ actor_rollout_ref.rollout.name=vllm \
27
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
28
+ actor_rollout_ref.rollout.n=5 \
29
+ critic.optim.lr=1e-5 \
30
+ critic.model.use_remove_padding=True \
31
+ critic.model.path=deepseek-ai/deepseek-llm-7b-chat \
32
+ critic.model.enable_gradient_checkpointing=True \
33
+ critic.ppo_micro_batch_size_per_gpu=32 \
34
+ critic.model.fsdp_config.param_offload=False \
35
+ critic.model.fsdp_config.optimizer_offload=False \
36
+ algorithm.use_kl_in_reward=False \
37
+ trainer.critic_warmup=0 \
38
+ trainer.logger='["console","wandb"]' \
39
+ trainer.project_name='verl_example_gsm8k' \
40
+ trainer.experiment_name='deepseek_llm_7b_function_rm' \
41
+ trainer.n_gpus_per_node=8 \
42
+ trainer.nnodes=1 \
43
+ trainer.save_freq=20 \
44
+ trainer.test_freq=1 \
45
+ trainer.total_epochs=15 $@
code/RL_model/verl/verl_train/examples/ppo_trainer/run_deepseek7b_llm_sandbox_fusion.sh ADDED
@@ -0,0 +1,44 @@
1
+ set -x
2
+
3
+ python3 -m verl.trainer.main_ppo \
4
+ reward_model.sandbox_fusion.url='https://xxxxxxxxx.apigateway-cn-beijing.volceapi.com/run_code' \
5
+ reward_model.sandbox_fusion.max_concurrent=128 \
6
+ reward_model.reward_manager=prime \
7
+ algorithm.adv_estimator=gae \
8
+ data.train_files=$HOME/data/Eurus-2-RL-Data/train.parquet \
9
+ data.val_files=$HOME/data/Eurus-2-RL-Data/validation.parquet \
10
+ data.train_batch_size=1024 \
11
+ data.max_prompt_length=512 \
12
+ data.max_response_length=512 \
13
+ data.filter_overlong_prompts=True \
14
+ data.truncation='error' \
15
+ actor_rollout_ref.model.path=deepseek-ai/deepseek-llm-7b-chat \
16
+ actor_rollout_ref.actor.optim.lr=1e-6 \
17
+ actor_rollout_ref.model.use_remove_padding=True \
18
+ actor_rollout_ref.actor.ppo_mini_batch_size=256 \
19
+ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \
20
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
21
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
22
+ actor_rollout_ref.actor.use_kl_loss=False \
23
+ actor_rollout_ref.model.enable_gradient_checkpointing=True \
24
+ actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 \
25
+ actor_rollout_ref.rollout.tensor_model_parallel_size=4 \
26
+ actor_rollout_ref.rollout.name=vllm \
27
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
28
+ critic.optim.lr=1e-5 \
29
+ critic.model.use_remove_padding=True \
30
+ critic.model.path=deepseek-ai/deepseek-llm-7b-chat \
31
+ critic.model.enable_gradient_checkpointing=True \
32
+ critic.ppo_micro_batch_size_per_gpu=32 \
33
+ critic.model.fsdp_config.param_offload=False \
34
+ critic.model.fsdp_config.optimizer_offload=False \
35
+ algorithm.use_kl_in_reward=False \
36
+ trainer.critic_warmup=0 \
37
+ trainer.logger='["console","wandb"]' \
38
+ trainer.project_name='verl_example_sandbox_fusion' \
39
+ trainer.experiment_name='deepseek_llm_7b_function_sandbox_fusion' \
40
+ trainer.n_gpus_per_node=8 \
41
+ trainer.nnodes=1 \
42
+ trainer.save_freq=20 \
43
+ trainer.test_freq=1 \
44
+ trainer.total_epochs=15 $@
code/RL_model/verl/verl_train/examples/ppo_trainer/run_deepseek7b_llm_sp2.sh ADDED
1
+ set -x
2
+
3
+ python3 -m verl.trainer.main_ppo \
4
+ algorithm.adv_estimator=gae \
5
+ data.train_files=$HOME/data/gsm8k/train.parquet \
6
+ data.val_files=$HOME/data/gsm8k/test.parquet \
7
+ data.train_batch_size=1024 \
8
+ data.max_prompt_length=512 \
9
+ data.max_response_length=512 \
10
+ data.filter_overlong_prompts=True \
11
+ data.truncation='error' \
12
+ actor_rollout_ref.model.path=deepseek-ai/deepseek-llm-7b-chat \
13
+ actor_rollout_ref.actor.optim.lr=1e-6 \
14
+ actor_rollout_ref.model.use_remove_padding=True \
15
+ actor_rollout_ref.actor.ppo_mini_batch_size=256 \
16
+ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 \
17
+ actor_rollout_ref.actor.ulysses_sequence_parallel_size=2 \
18
+ actor_rollout_ref.model.enable_gradient_checkpointing=True \
19
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
20
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
21
+ actor_rollout_ref.actor.use_kl_loss=False \
22
+ actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=64 \
23
+ actor_rollout_ref.rollout.tensor_model_parallel_size=4 \
24
+ actor_rollout_ref.rollout.name=vllm \
25
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
26
+ critic.optim.lr=1e-5 \
27
+ critic.ulysses_sequence_parallel_size=2 \
28
+ critic.model.use_remove_padding=True \
29
+ critic.model.path=deepseek-ai/deepseek-llm-7b-chat \
30
+ critic.model.enable_gradient_checkpointing=True \
31
+ critic.ppo_micro_batch_size_per_gpu=64 \
32
+ critic.model.fsdp_config.param_offload=False \
33
+ critic.model.fsdp_config.optimizer_offload=False \
34
+ algorithm.use_kl_in_reward=False \
35
+ trainer.critic_warmup=0 \
36
+ trainer.logger='["console","wandb"]' \
37
+ trainer.project_name='verl_example_gsm8k' \
38
+ trainer.experiment_name='deepseek_llm_7b_function_rm_sp2' \
39
+ trainer.n_gpus_per_node=8 \
40
+ trainer.nnodes=1 \
41
+ trainer.save_freq=20 \
42
+ trainer.test_freq=5 \
43
+ trainer.total_epochs=15 $@
code/RL_model/verl/verl_train/examples/ppo_trainer/run_deepseek_full_hh_rlhf.sh ADDED
@@ -0,0 +1,45 @@
1
+ set -x
2
+
3
+ train_files=$HOME/data/full_hh_rlhf/rl/train.parquet
4
+ test_files=$HOME/data/full_hh_rlhf/rl/train.parquet # not used
5
+
6
+ python3 -m verl.trainer.main_ppo --config-path=./config --config-name='ppo_megatron_trainer'\
7
+ algorithm.adv_estimator=gae \
8
+ data.train_files="$train_files" \
9
+ data.val_files="$test_files" \
10
+ data.train_batch_size=512 \
11
+ data.max_prompt_length=128 \
12
+ data.max_response_length=128 \
13
+ data.filter_overlong_prompts=True \
14
+ data.truncation='error' \
15
+ actor_rollout_ref.model.path=deepseek-ai/deepseek-llm-7b-chat \
16
+ actor_rollout_ref.actor.optim.lr=1e-6 \
17
+ actor_rollout_ref.actor.ppo_mini_batch_size=128 \
18
+ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
19
+ actor_rollout_ref.actor.use_kl_loss=False \
20
+ actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
21
+ actor_rollout_ref.rollout.tensor_model_parallel_size=4 \
22
+ actor_rollout_ref.rollout.name=vllm \
23
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
24
+ critic.optim.lr=1e-5 \
25
+ critic.model.path=deepseek-ai/deepseek-llm-7b-chat \
26
+ critic.ppo_micro_batch_size_per_gpu=4 \
27
+ reward_model.enable=True \
28
+ reward_model.model.path=deepseek-ai/deepseek-llm-7b-chat \
29
+ reward_model.use_reward_loop=True \
30
+ reward_model.rollout.name=vllm \
31
+ reward_model.rollout.gpu_memory_utilization=0.8 \
32
+ reward_model.rollout.tensor_model_parallel_size=4 \
33
+ reward_model.rollout.prompt_length=256 \
34
+ reward_model.rollout.response_length=128 \
35
+ reward_model.num_workers=8 \
36
+ algorithm.use_kl_in_reward=False \
37
+ trainer.critic_warmup=0 \
38
+ trainer.logger='["console","wandb"]' \
39
+ trainer.project_name='verl_megatron_full_hh_rlhf_examples' \
40
+ trainer.experiment_name='deepseek_llm_7b_model_rm' \
41
+ trainer.n_gpus_per_node=8 \
42
+ trainer.nnodes=1 \
43
+ trainer.save_freq=20 \
44
+ trainer.test_freq=5 \
45
+ trainer.total_epochs=100 $@
code/RL_model/verl/verl_train/examples/ppo_trainer/run_deepseek_math_gsm8k_megatron.sh ADDED
@@ -0,0 +1,49 @@
1
+ set -x
2
+
3
+ # Example runnable on H20 * 8
4
+
5
+ export CUDA_DEVICE_MAX_CONNECTIONS=1 # For megatron communication/computation overlapping
6
+
7
+ gsm8k_train_path=$HOME/data/gsm8k/train.parquet
8
+ gsm8k_test_path=$HOME/data/gsm8k/test.parquet
9
+ math_train_path=$HOME/data/math/train.parquet
10
+ math_test_path=$HOME/data/math/test.parquet
11
+
12
+ train_files="['$gsm8k_train_path', '$math_train_path']"
13
+ test_files="['$gsm8k_test_path', '$math_test_path']"
14
+
15
+ python3 -m verl.trainer.main_ppo --config-path=./config --config-name='ppo_megatron_trainer'\
16
+ algorithm.adv_estimator=gae \
17
+ data.train_files="$train_files" \
18
+ data.val_files="$test_files" \
19
+ data.train_batch_size=1024 \
20
+ data.max_prompt_length=1024 \
21
+ data.max_response_length=512 \
22
+ data.filter_overlong_prompts=True \
23
+ data.truncation='error' \
24
+ actor_rollout_ref.model.path=deepseek-ai/deepseek-llm-7b-chat \
25
+ actor_rollout_ref.actor.optim.lr=1e-6 \
26
+ actor_rollout_ref.actor.ppo_mini_batch_size=256 \
27
+ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
28
+ actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=2 \
29
+ actor_rollout_ref.actor.megatron.tensor_model_parallel_size=2 \
30
+ actor_rollout_ref.actor.use_kl_loss=False \
31
+ actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
32
+ actor_rollout_ref.rollout.tensor_model_parallel_size=4 \
33
+ actor_rollout_ref.rollout.name=vllm \
34
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
35
+ actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=2 \
36
+ actor_rollout_ref.ref.megatron.tensor_model_parallel_size=2 \
37
+ critic.optim.lr=1e-5 \
38
+ critic.model.path=deepseek-ai/deepseek-llm-7b-chat \
39
+ critic.ppo_micro_batch_size_per_gpu=4 \
40
+ algorithm.use_kl_in_reward=False \
41
+ trainer.critic_warmup=0 \
42
+ trainer.logger='["console","wandb"]' \
43
+ trainer.project_name='verl_ppo_gsm8k_math_examples' \
44
+ trainer.experiment_name='deepseek_llm_7b_megatron' \
45
+ trainer.n_gpus_per_node=8 \
46
+ trainer.nnodes=1 \
47
+ trainer.save_freq=20 \
48
+ trainer.test_freq=5 \
49
+ trainer.total_epochs=100 $@
code/RL_model/verl/verl_train/examples/ppo_trainer/run_deepseek_math_gsm8k_megatron_nsys.sh ADDED
@@ -0,0 +1,65 @@
1
+ set -x
2
+
3
+ # Example runnable on H20 * 8
4
+
5
+ export CUDA_DEVICE_MAX_CONNECTIONS=1 # For megatron communication/computation overlapping
6
+
7
+ gsm8k_train_path=$HOME/data/gsm8k/train.parquet
8
+ gsm8k_test_path=$HOME/data/gsm8k/test.parquet
9
+ math_train_path=$HOME/data/math/train.parquet
10
+ math_test_path=$HOME/data/math/test.parquet
11
+
12
+ train_files=${train_files:-"$gsm8k_train_path"}
13
+ test_files=${test_files:-"$gsm8k_test_path"}
14
+
15
+ # Nsight profiling configuration
16
+ PROFILE_STEPS="[1]" # or [] or null
17
+ PROFILE_RANKS_ALL=False # or True
18
+ PROFILE_RANKS=[0,4]
19
+ DISCRETE=True # or False
20
+
21
+ python3 -m verl.trainer.main_ppo --config-path=./config --config-name='ppo_megatron_trainer'\
22
+ algorithm.adv_estimator=gae \
23
+ data.train_files="$train_files" \
24
+ data.val_files="$test_files" \
25
+ data.train_batch_size=256 \
26
+ data.max_prompt_length=1024 \
27
+ data.max_response_length=512 \
28
+ data.filter_overlong_prompts=True \
29
+ data.truncation='error' \
30
+ actor_rollout_ref.model.path=deepseek-ai/deepseek-llm-7b-chat \
31
+ actor_rollout_ref.actor.optim.lr=1e-6 \
32
+ actor_rollout_ref.actor.ppo_mini_batch_size=64 \
33
+ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
34
+ actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=2 \
35
+ actor_rollout_ref.actor.megatron.tensor_model_parallel_size=2 \
36
+ actor_rollout_ref.actor.use_kl_loss=False \
37
+ actor_rollout_ref.actor.profiler.enable=True \
38
+ actor_rollout_ref.actor.profiler.ranks=$PROFILE_RANKS \
39
+ actor_rollout_ref.actor.profiler.all_ranks=$PROFILE_RANKS_ALL \
40
+ actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
41
+ actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
42
+ actor_rollout_ref.rollout.name=vllm \
43
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.8 \
44
+ actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=2 \
45
+ actor_rollout_ref.ref.megatron.tensor_model_parallel_size=2 \
46
+ critic.optim.lr=1e-5 \
47
+ critic.model.path=deepseek-ai/deepseek-llm-7b-chat \
48
+ critic.ppo_micro_batch_size_per_gpu=4 \
49
+ critic.profiler.enable=True \
50
+ critic.profiler.ranks=$PROFILE_RANKS \
51
+ critic.profiler.all_ranks=$PROFILE_RANKS_ALL \
52
+ algorithm.use_kl_in_reward=False \
53
+ trainer.critic_warmup=0 \
54
+ trainer.logger='["console","wandb"]' \
55
+ trainer.project_name='verl_ppo_gsm8k_math_examples' \
56
+ trainer.experiment_name='deepseek_llm_7b_megatron' \
57
+ trainer.n_gpus_per_node=8 \
58
+ trainer.nnodes=1 \
59
+ trainer.save_freq=-1 \
60
+ trainer.test_freq=-1 \
61
+ trainer.total_epochs=100 \
62
+ trainer.total_training_steps=1 \
63
+ global_profiler.tool=nsys \
64
+ global_profiler.steps=$PROFILE_STEPS \
65
+ global_profiler.global_tool_config.nsys.discrete=$DISCRETE $@
code/RL_model/verl/verl_train/examples/ppo_trainer/run_gemma.sh ADDED
@@ -0,0 +1,40 @@
+ set -x
+
+ python3 -m verl.trainer.main_ppo \
+ algorithm.adv_estimator=gae \
+ data.train_files=$HOME/data/gsm8k/train.parquet \
+ data.val_files=$HOME/data/gsm8k/test.parquet \
+ data.train_batch_size=512 \
+ data.max_prompt_length=1024 \
+ data.max_response_length=512 \
+ data.filter_overlong_prompts=True \
+ data.truncation='error' \
+ actor_rollout_ref.model.path=google/gemma-2-2b-it \
+ actor_rollout_ref.actor.optim.lr=1e-6 \
+ actor_rollout_ref.model.use_remove_padding=False \
+ actor_rollout_ref.actor.ppo_mini_batch_size=128 \
+ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
+ actor_rollout_ref.actor.use_kl_loss=False \
+ actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
+ actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
+ actor_rollout_ref.rollout.name=vllm \
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
+ critic.optim.lr=1e-5 \
+ critic.model.use_remove_padding=False \
+ critic.model.path=google/gemma-2-2b-it \
+ critic.model.enable_gradient_checkpointing=False \
+ critic.ppo_micro_batch_size_per_gpu=4 \
+ critic.model.fsdp_config.param_offload=False \
+ critic.model.fsdp_config.optimizer_offload=False \
+ algorithm.use_kl_in_reward=False \
+ trainer.critic_warmup=0 \
+ trainer.logger='["console","wandb"]' \
+ trainer.project_name='verl_example' \
+ trainer.experiment_name='gemma2b_function_rm' \
+ trainer.n_gpus_per_node=2 \
+ trainer.nnodes=1 \
+ trainer.save_freq=20 \
+ trainer.test_freq=10 \
+ trainer.total_epochs=15 $@
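On sizing the batch knobs above: ppo_mini_batch_size is a global batch that is sharded across data-parallel ranks and then split into micro-batches, so the per-rank gradient-accumulation depth falls out of the three numbers. A minimal sanity-check sketch under that assumption:

    NNODES=1; NGPUS=2; MINI=128; MICRO=4
    DP=$((NNODES * NGPUS))
    echo "grad accumulation steps per rank: $((MINI / (MICRO * DP)))"   # -> 16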
code/RL_model/verl/verl_train/examples/ppo_trainer/run_moonlight16b_a3b_gsm8k_megatron.sh ADDED
@@ -0,0 +1,106 @@
+ set -x
+
+ export CUDA_DEVICE_MAX_CONNECTIONS=1 # For megatron communication/computation overlapping
+
+
+ # 0. download the model
+ hf download moonshotai/Moonlight-16B-A3B-Instruct
+
+ # 1. convert the model to mcore format
+ # change the HF_MODEL_PATH and DIST_CKPT_PATH to your own path
+ HF_MODEL_PATH=/data/models/moonshotai/Moonlight-16B-A3B-Instruct
+ DIST_CKPT_PATH=/data/mcore_ckpt/Moonlight-16B-A3B-Instruct
+ python scripts/converter_hf_to_mcore.py --hf_model_path $HF_MODEL_PATH --output_path $DIST_CKPT_PATH
+
+
+ # 2. run the script
+ gsm8k_train_path=$HOME/data/gsm8k/train.parquet
+ gsm8k_test_path=$HOME/data/gsm8k/test.parquet
+ train_files=$gsm8k_train_path
+ test_files=$gsm8k_test_path
+
+ ALL_OFFLOAD=${ALL_OFFLOAD:-False}
+ COMMON_PARAM_OFFLOAD=${COMMON_PARAM_OFFLOAD:-$ALL_OFFLOAD}
+ COMMON_GRAD_OFFLOAD=${COMMON_GRAD_OFFLOAD:-$ALL_OFFLOAD}
+ COMMON_OPTIMIZER_OFFLOAD=${COMMON_OPTIMIZER_OFFLOAD:-$ALL_OFFLOAD}
+
+ ACTOR_PARAM_OFFLOAD=${ACTOR_PARAM_OFFLOAD:-$COMMON_PARAM_OFFLOAD}
+ ACTOR_GRAD_OFFLOAD=${ACTOR_GRAD_OFFLOAD:-$COMMON_GRAD_OFFLOAD}
+ ACTOR_OPTIMIZER_OFFLOAD=${ACTOR_OPTIMIZER_OFFLOAD:-$COMMON_OPTIMIZER_OFFLOAD}
+ REF_PARAM_OFFLOAD=${REF_PARAM_OFFLOAD:-$COMMON_PARAM_OFFLOAD}
+ CRITIC_PARAM_OFFLOAD=${CRITIC_PARAM_OFFLOAD:-$COMMON_PARAM_OFFLOAD}
+ CRITIC_GRAD_OFFLOAD=${CRITIC_GRAD_OFFLOAD:-$COMMON_GRAD_OFFLOAD}
+ CRITIC_OPTIMIZER_OFFLOAD=${CRITIC_OPTIMIZER_OFFLOAD:-$COMMON_OPTIMIZER_OFFLOAD}
+ RM_PARAM_OFFLOAD=${RM_PARAM_OFFLOAD:-$COMMON_PARAM_OFFLOAD}
+
+
+ NODES=4
+ PP=2
+ TP=8
+ EP=8
+ ETP=1
+ VLLM_TP=4
+
+ # RAY_ADDRESS='auto' ray job submit --working-dir . --
+ python3 -m verl.trainer.main_ppo --config-path=./config --config-name='ppo_megatron_trainer' \
+ algorithm.adv_estimator=gae \
+ data.train_files="$train_files" \
+ data.val_files="$test_files" \
+ data.train_batch_size=1024 \
+ data.max_prompt_length=1024 \
+ data.max_response_length=512 \
+ data.filter_overlong_prompts=True \
+ data.truncation='error' \
+ data.trust_remote_code=True \
+ actor_rollout_ref.model.path=$HF_MODEL_PATH \
+ actor_rollout_ref.actor.optim.lr=1e-6 \
+ actor_rollout_ref.actor.ppo_mini_batch_size=256 \
+ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
+ actor_rollout_ref.actor.use_kl_loss=False \
+ actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2 \
+ actor_rollout_ref.rollout.name=vllm \
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.7 \
+ critic.optim.lr=1e-5 \
+ critic.model.path=$HF_MODEL_PATH \
+ critic.ppo_micro_batch_size_per_gpu=4 \
+ algorithm.use_kl_in_reward=False \
+ trainer.critic_warmup=0 \
+ trainer.logger='["console","wandb"]' \
+ trainer.project_name='verl_megatron_gsm8k_examples' \
+ trainer.experiment_name='moonlight_16b_a3b_instruct_4nodes' \
+ trainer.n_gpus_per_node=8 \
+ trainer.nnodes=$NODES \
+ trainer.save_freq=-1 \
+ trainer.test_freq=5 \
+ actor_rollout_ref.model.trust_remote_code=True \
+ critic.model.trust_remote_code=True \
+ +actor_rollout_ref.actor.megatron.override_transformer_config.num_layers_in_last_pipeline_stage=13 \
+ actor_rollout_ref.rollout.tensor_model_parallel_size=$VLLM_TP \
+ actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=$PP \
+ actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=$PP \
+ critic.megatron.pipeline_model_parallel_size=$PP \
+ actor_rollout_ref.actor.megatron.tensor_model_parallel_size=$TP \
+ actor_rollout_ref.ref.megatron.tensor_model_parallel_size=$TP \
+ critic.megatron.tensor_model_parallel_size=$TP \
+ actor_rollout_ref.actor.megatron.expert_model_parallel_size=$EP \
+ actor_rollout_ref.ref.megatron.expert_model_parallel_size=$EP \
+ critic.megatron.expert_model_parallel_size=$EP \
+ actor_rollout_ref.actor.megatron.expert_tensor_parallel_size=$ETP \
+ actor_rollout_ref.ref.megatron.expert_tensor_parallel_size=$ETP \
+ critic.megatron.expert_tensor_parallel_size=$ETP \
+ actor_rollout_ref.actor.megatron.param_offload=${ACTOR_PARAM_OFFLOAD} \
+ actor_rollout_ref.actor.megatron.optimizer_offload=${ACTOR_OPTIMIZER_OFFLOAD} \
+ actor_rollout_ref.actor.megatron.grad_offload=${ACTOR_GRAD_OFFLOAD} \
+ actor_rollout_ref.ref.megatron.param_offload=${REF_PARAM_OFFLOAD} \
+ critic.megatron.param_offload=${CRITIC_PARAM_OFFLOAD} \
+ critic.megatron.optimizer_offload=${CRITIC_OPTIMIZER_OFFLOAD} \
+ critic.megatron.grad_offload=${CRITIC_GRAD_OFFLOAD} \
+ actor_rollout_ref.actor.megatron.use_dist_checkpointing=True \
+ actor_rollout_ref.ref.megatron.use_dist_checkpointing=True \
+ critic.megatron.use_dist_checkpointing=True \
+ actor_rollout_ref.actor.megatron.dist_checkpointing_path=$DIST_CKPT_PATH \
+ actor_rollout_ref.ref.megatron.dist_checkpointing_path=$DIST_CKPT_PATH \
+ critic.megatron.dist_checkpointing_path=$DIST_CKPT_PATH \
+ trainer.val_before_train=False \
+ trainer.total_epochs=100 $@
+
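Because every offload knob in this script defaults through ${VAR:-default}, the whole cascade can be flipped from the environment without editing the file, e.g.:

    # offload params, grads and optimizer state for all roles
    ALL_OFFLOAD=True bash run_moonlight16b_a3b_gsm8k_megatron.sh

    # or offload only the critic's optimizer state
    CRITIC_OPTIMIZER_OFFLOAD=True bash run_moonlight16b_a3b_gsm8k_megatron.sh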
code/RL_model/verl/verl_train/examples/ppo_trainer/run_qwen1.5_moe_a2.7b-gsm8k_megatron.sh ADDED
@@ -0,0 +1,73 @@
+ set -x
+
+ export CUDA_DEVICE_MAX_CONNECTIONS=1 # For megatron communication/computation overlapping
+
+ # 0. download the model
+ # hf download Qwen/Qwen1.5-MoE-A2.7B-Chat
+
+ # 1. convert the model to mcore format
+ # change the HF_MODEL_PATH and DIST_CKPT_PATH to your own path
+ HF_MODEL_PATH=/data/models/Qwen/Qwen1.5-MoE-A2.7B-Chat
+ DIST_CKPT_PATH=/data/mcore_ckpt/Qwen1.5-MoE-A2.7B-Chat
+ python scripts/converter_hf_to_mcore.py --hf_model_path $HF_MODEL_PATH --output_path $DIST_CKPT_PATH
+
+ # 2. run the script
+ gsm8k_train_path=$HOME/data/gsm8k/train.parquet
+ gsm8k_test_path=$HOME/data/gsm8k/test.parquet
+ train_files=$gsm8k_train_path
+ test_files=$gsm8k_test_path
+
+ NODES=4
+ PP=2
+ TP=4
+ CP=1
+ VLLM_TP=4
+
+ # RAY_ADDRESS='auto' ray job submit --working-dir . --
+ python3 -m verl.trainer.main_ppo --config-path=./config --config-name='ppo_megatron_trainer' \
+ algorithm.adv_estimator=gae \
+ data.train_files="$train_files" \
+ data.val_files="$test_files" \
+ data.train_batch_size=1024 \
+ data.max_prompt_length=1024 \
+ data.max_response_length=512 \
+ data.filter_overlong_prompts=True \
+ data.truncation='error' \
+ actor_rollout_ref.model.path=$HF_MODEL_PATH \
+ actor_rollout_ref.actor.optim.lr=1e-6 \
+ actor_rollout_ref.actor.ppo_mini_batch_size=256 \
+ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
+ actor_rollout_ref.actor.use_kl_loss=False \
+ actor_rollout_ref.actor.megatron.tensor_model_parallel_size=$TP \
+ actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=$PP \
+ actor_rollout_ref.actor.megatron.context_parallel_size=$CP \
+ actor_rollout_ref.actor.megatron.use_dist_checkpointing=True \
+ actor_rollout_ref.actor.megatron.dist_checkpointing_path=$DIST_CKPT_PATH \
+ actor_rollout_ref.ref.megatron.tensor_model_parallel_size=$TP \
+ actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=$PP \
+ actor_rollout_ref.ref.megatron.context_parallel_size=$CP \
+ actor_rollout_ref.ref.megatron.use_dist_checkpointing=True \
+ actor_rollout_ref.ref.megatron.dist_checkpointing_path=$DIST_CKPT_PATH \
+ actor_rollout_ref.rollout.name=vllm \
+ actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2 \
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.7 \
+ actor_rollout_ref.rollout.tensor_model_parallel_size=$VLLM_TP \
+ critic.optim.lr=1e-5 \
+ critic.model.path=$HF_MODEL_PATH \
+ critic.ppo_micro_batch_size_per_gpu=4 \
+ critic.megatron.tensor_model_parallel_size=$TP \
+ critic.megatron.pipeline_model_parallel_size=$PP \
+ critic.megatron.context_parallel_size=$CP \
+ critic.megatron.use_dist_checkpointing=True \
+ critic.megatron.dist_checkpointing_path=$DIST_CKPT_PATH \
+ algorithm.use_kl_in_reward=False \
+ trainer.critic_warmup=0 \
+ trainer.logger='["console","wandb"]' \
+ trainer.project_name='verl_megatron_gsm8k_examples' \
+ trainer.experiment_name='qwen1.5_moe_nochat' \
+ trainer.n_gpus_per_node=8 \
+ trainer.nnodes=$NODES \
+ trainer.save_freq=20 \
+ trainer.test_freq=5 \
+ trainer.total_epochs=100 $@
+
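With NODES=4 and 8 GPUs per node, the training world size is 32, and TP * PP * CP = 8 must divide it; the remainder becomes data parallelism. A quick check, assuming the standard Megatron decomposition DP = world_size / (TP * PP * CP):

    NODES=4; GPUS_PER_NODE=8; TP=4; PP=2; CP=1
    WORLD=$((NODES * GPUS_PER_NODE))
    echo "data-parallel size: $((WORLD / (TP * PP * CP)))"   # -> 4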
code/RL_model/verl/verl_train/examples/ppo_trainer/run_qwen2-7b_math_gsm8k_megatron.sh ADDED
@@ -0,0 +1,47 @@
+ set -x
+
+ export CUDA_DEVICE_MAX_CONNECTIONS=1 # For megatron communication/computation overlapping
+
+ gsm8k_train_path=$HOME/data/gsm8k/train.parquet
+ gsm8k_test_path=$HOME/data/gsm8k/test.parquet
+ math_train_path=$HOME/data/math/train.parquet
+ math_test_path=$HOME/data/math/test.parquet
+
+ train_files="['$gsm8k_train_path', '$math_train_path']"
+ test_files="['$gsm8k_test_path', '$math_test_path']"
+
+ python3 -m verl.trainer.main_ppo --config-path=./config --config-name='ppo_megatron_trainer' \
+ algorithm.adv_estimator=gae \
+ data.train_files="$train_files" \
+ data.val_files="$test_files" \
+ data.train_batch_size=1024 \
+ data.max_prompt_length=1024 \
+ data.max_response_length=512 \
+ data.filter_overlong_prompts=True \
+ data.truncation='error' \
+ actor_rollout_ref.model.path=Qwen/Qwen2-7B-Instruct \
+ actor_rollout_ref.actor.optim.lr=1e-6 \
+ actor_rollout_ref.actor.ppo_mini_batch_size=256 \
+ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
+ actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=2 \
+ actor_rollout_ref.actor.megatron.tensor_model_parallel_size=2 \
+ actor_rollout_ref.actor.use_kl_loss=False \
+ actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
+ actor_rollout_ref.rollout.tensor_model_parallel_size=4 \
+ actor_rollout_ref.rollout.name=vllm \
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
+ actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=2 \
+ actor_rollout_ref.ref.megatron.tensor_model_parallel_size=2 \
+ critic.optim.lr=1e-5 \
+ critic.model.path=Qwen/Qwen2-7B-Instruct \
+ critic.ppo_micro_batch_size_per_gpu=4 \
+ algorithm.use_kl_in_reward=False \
+ trainer.critic_warmup=0 \
+ trainer.logger='["console","wandb"]' \
+ trainer.project_name='verl_ppo_gsm8k_math_examples' \
+ trainer.experiment_name='qwen2_7b_megatron' \
+ trainer.n_gpus_per_node=8 \
+ trainer.nnodes=1 \
+ trainer.save_freq=20 \
+ trainer.test_freq=5 \
+ trainer.total_epochs=100 $@
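data.train_files and data.val_files accept a stringified list, so mixing in another dataset only requires extending it. A sketch with a hypothetical extra parquet path:

    custom_train_path=$HOME/data/custom/train.parquet
    train_files="['$gsm8k_train_path', '$math_train_path', '$custom_train_path']"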
code/RL_model/verl/verl_train/examples/ppo_trainer/run_qwen2-7b_rm.sh ADDED
@@ -0,0 +1,75 @@
+ # Disclaimer: the model used in this script is for academic purposes only.
+ set -x
+
+ # Data preparation scripts are available in ``examples/data_preprocess``.
+ # Example usage:
+ #
+ # python3 examples/data_preprocess/math_dataset.py --local_dir ~/data/math
+ # python3 examples/data_preprocess/gsm8k.py --local_save_dir ~/data/gsm8k
+
+ gsm8k_train_path=$HOME/data/gsm8k/train.parquet
+ gsm8k_test_path=$HOME/data/gsm8k/test.parquet
+ math_train_path=$HOME/data/math/train.parquet
+ math_test_path=$HOME/data/math/test.parquet
+
+ train_files="['$gsm8k_train_path', '$math_train_path']"
+ test_files="['$gsm8k_test_path', '$math_test_path']"
+
+
+ # prepare model ckpt
+ hf download Qwen/Qwen2-7B-Instruct --local-dir $HOME/models/Qwen2-7B-Instruct &
+ hf download sfairXC/FsfairX-LLaMA3-RM-v0.1 --local-dir $HOME/models/FsfairX-LLaMA3-RM-v0.1 &
+ wait
+
+ python3 -m verl.trainer.main_ppo \
+ algorithm.adv_estimator=gae \
+ data.train_files="$train_files" \
+ data.val_files="$test_files" \
+ data.train_batch_size=1024 \
+ data.max_prompt_length=1024 \
+ data.max_response_length=512 \
+ data.filter_overlong_prompts=True \
+ data.truncation='error' \
+ data.return_raw_chat=True \
+ actor_rollout_ref.model.path="$HOME/models/Qwen2-7B-Instruct" \
+ actor_rollout_ref.actor.optim.lr=1e-6 \
+ actor_rollout_ref.model.use_remove_padding=True \
+ actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.1 \
+ actor_rollout_ref.actor.ppo_mini_batch_size=256 \
+ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \
+ actor_rollout_ref.actor.use_kl_loss=False \
+ actor_rollout_ref.model.enable_gradient_checkpointing=True \
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
+ actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
+ actor_rollout_ref.rollout.name=vllm \
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
+ critic.optim.lr=1e-5 \
+ critic.model.use_remove_padding=True \
+ critic.optim.lr_warmup_steps_ratio=0.05 \
+ critic.model.path="$HOME/models/Qwen2-7B-Instruct" \
+ critic.model.enable_gradient_checkpointing=True \
+ critic.ppo_micro_batch_size_per_gpu=32 \
+ critic.model.fsdp_config.param_offload=False \
+ critic.model.fsdp_config.optimizer_offload=False \
+ reward_model.enable=True \
+ reward_model.model.path="$HOME/models/FsfairX-LLaMA3-RM-v0.1" \
+ reward_model.use_reward_loop=True \
+ reward_model.rollout.name=vllm \
+ reward_model.rollout.gpu_memory_utilization=0.8 \
+ reward_model.rollout.tensor_model_parallel_size=1 \
+ reward_model.rollout.prompt_length=2048 \
+ reward_model.rollout.response_length=1024 \
+ reward_model.num_workers=8 \
+ algorithm.use_kl_in_reward=False \
+ trainer.critic_warmup=0 \
+ trainer.logger='["console","wandb"]' \
+ trainer.project_name='verl_example' \
+ trainer.val_before_train=False \
+ trainer.experiment_name='Qwen2-7B-Instruct_hybrid_rm' \
+ trainer.n_gpus_per_node=8 \
+ trainer.nnodes=1 \
+ trainer.save_freq=20 \
+ trainer.test_freq=5 \
+ trainer.total_epochs=15 $@
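Note that the reward model's rollout window (prompt_length=2048, response_length=1024) covers the policy's data.max_prompt_length + data.max_response_length (1024 + 512), presumably so complete prompt-plus-response trajectories fit when scored. A quick arithmetic check under that assumption:

    MAX_PROMPT=1024; MAX_RESP=512; RM_PROMPT=2048
    (( RM_PROMPT >= MAX_PROMPT + MAX_RESP )) && echo "RM prompt window covers policy outputs"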
code/RL_model/verl/verl_train/examples/ppo_trainer/run_qwen2-7b_rm_legacy.sh ADDED
@@ -0,0 +1,63 @@
+ # download datasets and models
+ # python3 examples/data_preprocess/gsm8k.py
+ # python3 examples/data_preprocess/math_dataset.py
+ # hf download Skywork/Skywork-Reward-V2-Llama-3.2-3B --local-dir $HOME/models/Skywork-Reward-V2-Llama-3.2-3B
+ # hf download Qwen/Qwen2.5-3B-Instruct --local-dir $HOME/models/Qwen2.5-3B-Instruct
+
+ gsm8k_train_path=$HOME/data/gsm8k/train.parquet
+ gsm8k_test_path=$HOME/data/gsm8k/test.parquet
+ math_train_path=$HOME/data/math/train.parquet
+ math_test_path=$HOME/data/math/test.parquet
+
+ train_files="['$gsm8k_train_path', '$math_train_path']"
+ test_files="['$gsm8k_test_path', '$math_test_path']"
+
+ python3 -m verl.trainer.main_ppo \
+ algorithm.adv_estimator=gae \
+ data.train_files="$train_files" \
+ data.val_files="$test_files" \
+ data.train_batch_size=1024 \
+ data.max_prompt_length=1024 \
+ data.max_response_length=2048 \
+ data.filter_overlong_prompts=True \
+ data.truncation='error' \
+ data.return_raw_chat=True \
+ actor_rollout_ref.model.path="$HOME/models/Qwen2.5-3B-Instruct" \
+ actor_rollout_ref.actor.optim.lr=1e-6 \
+ actor_rollout_ref.model.use_remove_padding=True \
+ actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.1 \
+ actor_rollout_ref.actor.ppo_mini_batch_size=256 \
+ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \
+ actor_rollout_ref.actor.use_kl_loss=False \
+ actor_rollout_ref.model.enable_gradient_checkpointing=True \
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
+ actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
+ actor_rollout_ref.rollout.name=vllm \
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
+ critic.optim.lr=1e-5 \
+ critic.model.use_remove_padding=True \
+ critic.optim.lr_warmup_steps_ratio=0.05 \
+ critic.model.path="$HOME/models/Qwen2.5-3B-Instruct" \
+ critic.model.enable_gradient_checkpointing=True \
+ critic.ppo_micro_batch_size_per_gpu=32 \
+ critic.model.fsdp_config.param_offload=False \
+ critic.model.fsdp_config.optimizer_offload=False \
+ reward_model.enable=True \
+ reward_model.model.path="$HOME/models/Skywork-Reward-V2-Llama-3.2-3B" \
+ reward_model.use_reward_loop=False \
+ reward_model.model.use_remove_padding=True \
+ reward_model.model.fsdp_config.param_offload=True \
+ reward_model.micro_batch_size_per_gpu=32 \
+ algorithm.use_kl_in_reward=False \
+ trainer.critic_warmup=0 \
+ trainer.logger='["console","wandb"]' \
+ trainer.project_name='verl_test_qwen25_rm' \
+ trainer.val_before_train=True \
+ trainer.experiment_name='legacy_fsdp_reward_model' \
+ trainer.n_gpus_per_node=8 \
+ trainer.nnodes=1 \
+ trainer.save_freq=-1 \
+ trainer.test_freq=10 \
+ trainer.total_epochs=15 $@
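As with the other examples, trailing overrides can reconfigure the run at launch; a hypothetical invocation that swaps in a different sequence-classification reward model and halves the RM scoring batch (both values illustrative):

    bash run_qwen2-7b_rm_legacy.sh \
        reward_model.model.path="$HOME/models/another-rm" \
        reward_model.micro_batch_size_per_gpu=16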
code/RL_model/verl/verl_train/examples/ppo_trainer/run_qwen2-7b_rm_reward_loop_colocate.sh ADDED
@@ -0,0 +1,69 @@
+ # download datasets and models
+ # python3 examples/data_preprocess/gsm8k.py
+ # python3 examples/data_preprocess/math_dataset.py
+ # hf download Skywork/Skywork-Reward-V2-Llama-3.2-3B --local-dir $HOME/models/Skywork-Reward-V2-Llama-3.2-3B
+ # hf download Qwen/Qwen2.5-3B-Instruct --local-dir $HOME/models/Qwen2.5-3B-Instruct
+
+ gsm8k_train_path=$HOME/data/gsm8k/train.parquet
+ gsm8k_test_path=$HOME/data/gsm8k/test.parquet
+ math_train_path=$HOME/data/math/train.parquet
+ math_test_path=$HOME/data/math/test.parquet
+
+ train_files="['$gsm8k_train_path', '$math_train_path']"
+ test_files="['$gsm8k_test_path', '$math_test_path']"
+
+ python3 -m verl.trainer.main_ppo \
+ algorithm.adv_estimator=gae \
+ data.train_files="$train_files" \
+ data.val_files="$test_files" \
+ data.train_batch_size=1024 \
+ data.max_prompt_length=1024 \
+ data.max_response_length=2048 \
+ data.filter_overlong_prompts=True \
+ data.truncation='error' \
+ data.return_raw_chat=True \
+ actor_rollout_ref.model.path="$HOME/models/Qwen2.5-3B-Instruct" \
+ actor_rollout_ref.actor.optim.lr=1e-6 \
+ actor_rollout_ref.model.use_remove_padding=True \
+ actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.1 \
+ actor_rollout_ref.actor.ppo_mini_batch_size=256 \
+ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \
+ actor_rollout_ref.actor.use_kl_loss=False \
+ actor_rollout_ref.model.enable_gradient_checkpointing=True \
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
+ actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
+ actor_rollout_ref.rollout.name=vllm \
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
+ critic.optim.lr=1e-5 \
+ critic.model.use_remove_padding=True \
+ critic.optim.lr_warmup_steps_ratio=0.05 \
+ critic.model.path="$HOME/models/Qwen2.5-3B-Instruct" \
+ critic.model.enable_gradient_checkpointing=True \
+ critic.ppo_micro_batch_size_per_gpu=32 \
+ critic.model.fsdp_config.param_offload=False \
+ critic.model.fsdp_config.optimizer_offload=False \
+ reward_model.enable=True \
+ reward_model.model.path="$HOME/models/Skywork-Reward-V2-Llama-3.2-3B" \
+ reward_model.use_reward_loop=True \
+ reward_model.rollout.name=vllm \
+ reward_model.rollout.gpu_memory_utilization=0.8 \
+ reward_model.rollout.prompt_length=4096 \
+ reward_model.rollout.response_length=4096 \
+ reward_model.rollout.tensor_model_parallel_size=1 \
+ reward_model.num_workers=8 \
+ reward_model.model.use_remove_padding=True \
+ reward_model.model.fsdp_config.param_offload=True \
+ reward_model.micro_batch_size_per_gpu=32 \
+ algorithm.use_kl_in_reward=False \
+ trainer.critic_warmup=0 \
+ trainer.logger='["console","wandb"]' \
+ trainer.project_name='verl_test_qwen25_rm' \
+ trainer.val_before_train=False \
+ trainer.experiment_name='reward_loop_colocate_reward_model' \
+ trainer.n_gpus_per_node=8 \
+ trainer.nnodes=1 \
+ trainer.save_freq=-1 \
+ trainer.test_freq=10 \
+ trainer.total_epochs=15 $@
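Compared with the legacy script above, the substantive difference in this variant is the reward-loop path: judging from the two configs, the reward model is apparently served through a colocated vllm rollout engine rather than the legacy FSDP forward pass. The switches, as they appear in the two scripts:

    reward_model.use_reward_loop=True \
    reward_model.rollout.name=vllm \
    reward_model.rollout.gpu_memory_utilization=0.8 \
    reward_model.rollout.prompt_length=4096 \
    reward_model.rollout.response_length=4096 \
    reward_model.num_workers=8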