shahidul034 committed
Commit d0f96bf · verified · 1 Parent(s): ff8fd11

Add files using upload-large-folder tool

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. code/RL_model/verl/verl_train/examples/data_preprocess/aime2024_multiturn_w_tool.py +79 -0
  2. code/RL_model/verl/verl_train/examples/data_preprocess/dapo_multiturn_w_tool.py +79 -0
  3. code/RL_model/verl/verl_train/examples/data_preprocess/full_hh_rlhf.py +161 -0
  4. code/RL_model/verl/verl_train/examples/data_preprocess/geo3k.py +102 -0
  5. code/RL_model/verl/verl_train/examples/data_preprocess/geo3k_multiturn_w_tool.py +120 -0
  6. code/RL_model/verl/verl_train/examples/data_preprocess/gsm8k.py +105 -0
  7. code/RL_model/verl/verl_train/examples/data_preprocess/gsm8k_multiturn_sft.py +102 -0
  8. code/RL_model/verl/verl_train/examples/data_preprocess/gsm8k_multiturn_w_interaction.py +119 -0
  9. code/RL_model/verl/verl_train/examples/data_preprocess/gsm8k_multiturn_w_tool.py +129 -0
  10. code/RL_model/verl/verl_train/examples/data_preprocess/gsm8k_tool_agent_loop.py +130 -0
  11. code/RL_model/verl/verl_train/examples/data_preprocess/hellaswag.py +108 -0
  12. code/RL_model/verl/verl_train/examples/data_preprocess/math_dataset.py +106 -0
  13. code/RL_model/verl/verl_train/examples/data_preprocess/multiturn.py +125 -0
  14. code/RL_model/verl/verl_train/examples/data_preprocess/pokemon.py +75 -0
  15. code/RL_model/verl/verl_train/examples/data_preprocess/preprocess_search_r1_dataset.py +178 -0
  16. code/RL_model/verl/verl_train/examples/gmpo_trainer/README.md +59 -0
  17. code/RL_model/verl/verl_train/examples/gmpo_trainer/run_qwen2_5-7b_math.sh +60 -0
  18. code/RL_model/verl/verl_train/examples/gmpo_trainer/test_dapo_7b_math.sh +138 -0
  19. code/RL_model/verl/verl_train/examples/gmpo_trainer/test_dapo_qwen3_30b_math.sh +134 -0
  20. code/RL_model/verl/verl_train/examples/grpo_trainer/run_deepseek7b_llm_math_megatron.sh +50 -0
  21. code/RL_model/verl/verl_train/examples/grpo_trainer/run_glm41v_9b.sh +46 -0
  22. code/RL_model/verl/verl_train/examples/grpo_trainer/run_minicpmo2_6.sh +49 -0
  23. code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2-7b_sgl_megatron.sh +47 -0
  24. code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2_5-32b_grpo_megatron_vllm_npu.sh +185 -0
  25. code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2_5-3b_gsm8k_grpo_lora_from_adapter.sh +47 -0
  26. code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2_5_7b_grpo_e2e_prof_npu.sh +68 -0
  27. code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2_5_vl-7b.sh +47 -0
  28. code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen3-32b_npu.sh +58 -0
  29. code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen3-8b.sh +43 -0
  30. code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen3_8b_grpo_sglang_1k_spmd_npu.sh +71 -0
  31. code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen3_vl-8b-megatron.sh +86 -0
  32. code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen3moe-30b_grpo_megatron_vllm_npu.sh +188 -0
  33. code/RL_model/verl/verl_train/examples/mtp_trainer/runtime_env.yaml +17 -0
  34. code/RL_model/verl/verl_train/examples/mtp_trainer/test_dapo_mimo_7b_with_mtp_math_megatron.sh +144 -0
  35. code/RL_model/verl/verl_train/examples/ppo_trainer/README.md +103 -0
  36. code/RL_model/verl/verl_train/examples/ppo_trainer/run_deepseek7b_llm.sh +42 -0
  37. code/RL_model/verl/verl_train/examples/ppo_trainer/run_deepseek7b_llm_modelscope.sh +42 -0
  38. code/RL_model/verl/verl_train/examples/ppo_trainer/run_deepseek7b_llm_pfppo.sh +45 -0
  39. code/RL_model/verl/verl_train/examples/ppo_trainer/run_deepseek7b_llm_sandbox_fusion.sh +44 -0
  40. code/RL_model/verl/verl_train/examples/ppo_trainer/run_deepseek7b_llm_sp2.sh +43 -0
  41. code/RL_model/verl/verl_train/examples/ppo_trainer/run_deepseek_full_hh_rlhf.sh +45 -0
  42. code/RL_model/verl/verl_train/examples/ppo_trainer/run_deepseek_math_gsm8k_megatron.sh +49 -0
  43. code/RL_model/verl/verl_train/examples/ppo_trainer/run_deepseek_math_gsm8k_megatron_nsys.sh +65 -0
  44. code/RL_model/verl/verl_train/examples/ppo_trainer/run_gemma.sh +40 -0
  45. code/RL_model/verl/verl_train/examples/ppo_trainer/run_moonlight16b_a3b_gsm8k_megatron.sh +106 -0
  46. code/RL_model/verl/verl_train/examples/ppo_trainer/run_qwen1.5_moe_a2.7b-gsm8k_megatron.sh +73 -0
  47. code/RL_model/verl/verl_train/examples/ppo_trainer/run_qwen2-7b_math_gsm8k_megatron.sh +47 -0
  48. code/RL_model/verl/verl_train/examples/ppo_trainer/run_qwen2-7b_rm.sh +75 -0
  49. code/RL_model/verl/verl_train/examples/ppo_trainer/run_qwen2-7b_rm_legacy.sh +63 -0
  50. code/RL_model/verl/verl_train/examples/ppo_trainer/run_qwen2-7b_rm_reward_loop_colocate.sh +69 -0
code/RL_model/verl/verl_train/examples/data_preprocess/aime2024_multiturn_w_tool.py ADDED
@@ -0,0 +1,79 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2023-2024 SGLang Team
# Copyright 2025 ModelBest Inc. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Preprocess the AIME-2024 dataset to multiturn format
"""

import argparse
import os

import datasets

from verl.utils.hdfs_io import copy, makedirs

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--local_dir", default=None, help="The save directory for the preprocessed dataset.")
    parser.add_argument("--hdfs_dir", default=None)
    parser.add_argument("--local_dataset_path", default=None, help="The local path to the raw dataset, if it exists.")
    parser.add_argument(
        "--local_save_dir", default="~/data/retool_aime2024", help="The save directory for the preprocessed dataset."
    )

    args = parser.parse_args()
    local_dataset_path = args.local_dataset_path

    data_path = "BytedTsinghua-SIA/AIME-2024"

    if local_dataset_path is not None:
        dataset = datasets.load_dataset(local_dataset_path, "default")
    else:
        dataset = datasets.load_dataset(data_path, "default")

    train_dataset = dataset["train"]

    # attach the tool kwargs each example needs for multiturn rollout
    def make_map_fn(split):
        def process_fn(example, idx):
            orig_extra_info = example.pop("extra_info")
            extra_info = orig_extra_info.copy()
            extra_info["need_tools_kwargs"] = True
            extra_info["tools_kwargs"] = {
                "code_interpreter": {
                    "create_kwargs": {
                        "ground_truth": example["reward_model"]["ground_truth"],
                    },
                },
            }
            example["extra_info"] = extra_info
            return example

        return process_fn

    train_dataset = train_dataset.map(function=make_map_fn("train"), with_indices=True)

    hdfs_dir = args.hdfs_dir
    local_save_dir = args.local_dir
    if local_save_dir is not None:
        print("Warning: Argument 'local_dir' is deprecated. Please use 'local_save_dir' instead.")
    else:
        local_save_dir = args.local_save_dir

    train_dataset.to_parquet(os.path.join(local_save_dir, "train.parquet"))

    if hdfs_dir is not None:
        makedirs(hdfs_dir)
        copy(src=local_save_dir, dst=hdfs_dir)
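The retool-style preprocessors in this commit all share one transformation: copy `extra_info`, flag `need_tools_kwargs`, and attach per-tool `create_kwargs`. A minimal sketch of that step on a hypothetical record (field values are invented for illustration; real rows come from the Hugging Face dataset above):

# Sketch only: `record` is a hypothetical row, not taken from the dataset.
record = {
    "reward_model": {"style": "rule", "ground_truth": "204"},
    "extra_info": {"split": "train", "index": 0},
}

extra_info = record.pop("extra_info").copy()
extra_info["need_tools_kwargs"] = True  # mark that this sample carries tool-creation kwargs
extra_info["tools_kwargs"] = {
    "code_interpreter": {
        # passed when the tool instance is created for this sample
        "create_kwargs": {"ground_truth": record["reward_model"]["ground_truth"]},
    },
}
record["extra_info"] = extra_info
assert record["extra_info"]["tools_kwargs"]["code_interpreter"]["create_kwargs"]["ground_truth"] == "204"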
code/RL_model/verl/verl_train/examples/data_preprocess/dapo_multiturn_w_tool.py ADDED
@@ -0,0 +1,79 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2023-2024 SGLang Team
# Copyright 2025 ModelBest Inc. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Preprocess the DAPO-Math-17k dataset to multiturn format
"""

import argparse
import os

import datasets

from verl.utils.hdfs_io import copy, makedirs

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--local_dir", default=None, help="The save directory for the preprocessed dataset.")
    parser.add_argument("--hdfs_dir", default=None)
    parser.add_argument("--local_dataset_path", default=None, help="The local path to the raw dataset, if it exists.")
    parser.add_argument(
        "--local_save_dir", default="~/data/retool_dapo", help="The save directory for the preprocessed dataset."
    )

    args = parser.parse_args()
    local_dataset_path = args.local_dataset_path

    data_path = "BytedTsinghua-SIA/DAPO-Math-17k"

    if local_dataset_path is not None:
        dataset = datasets.load_dataset(local_dataset_path, "default")
    else:
        dataset = datasets.load_dataset(data_path, "default")

    train_dataset = dataset["train"]

    # attach the tool kwargs each example needs for multiturn rollout
    def make_map_fn(split):
        def process_fn(example, idx):
            orig_extra_info = example.pop("extra_info")
            extra_info = orig_extra_info.copy()
            extra_info["need_tools_kwargs"] = True
            extra_info["tools_kwargs"] = {
                "code_interpreter": {
                    "create_kwargs": {
                        "ground_truth": example["reward_model"]["ground_truth"],
                    },
                },
            }
            example["extra_info"] = extra_info
            return example

        return process_fn

    train_dataset = train_dataset.map(function=make_map_fn("train"), with_indices=True)

    hdfs_dir = args.hdfs_dir
    local_save_dir = args.local_dir
    if local_save_dir is not None:
        print("Warning: Argument 'local_dir' is deprecated. Please use 'local_save_dir' instead.")
    else:
        local_save_dir = args.local_save_dir

    train_dataset.to_parquet(os.path.join(local_save_dir, "train.parquet"))

    if hdfs_dir is not None:
        makedirs(hdfs_dir)
        copy(src=local_save_dir, dst=hdfs_dir)
code/RL_model/verl/verl_train/examples/data_preprocess/full_hh_rlhf.py ADDED
@@ -0,0 +1,161 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
- Preprocess data and split the training set into 75% for training the RM and 25% for validating the RM.
- All the training data is used to train SFT and RL.
- Both chosen and rejected responses are used to train SFT.
"""

import argparse
import os

import pandas as pd
from datasets import load_dataset
from tqdm.auto import tqdm

from verl.utils.fs import copy, makedirs


def generate_sft_dataset(target_hdfs_path_dir, local_dir="~/data/full_hh_rlh/sft", local_dataset_path=None):
    if local_dataset_path is not None:
        dataset = load_dataset(local_dataset_path)
    else:
        dataset = load_dataset("Dahoas/full-hh-rlhf")
    output = {"prompt": [], "response": []}
    for data in tqdm(dataset["train"]):
        # add chosen
        output["prompt"].append(data["prompt"])
        output["response"].append(data["chosen"])

        # add rejected
        output["prompt"].append(data["prompt"])
        output["response"].append(data["rejected"])

    df = pd.DataFrame(output)

    local_dir = os.path.expanduser(local_dir)
    os.makedirs(local_dir, exist_ok=True)

    local_path = os.path.join(local_dir, "train.parquet")

    df.to_parquet(path=local_path)

    if target_hdfs_path_dir is not None:
        hdfs_dir = target_hdfs_path_dir + "/" + "train.parquet"
        makedirs(hdfs_dir)

        copy(local_path, hdfs_dir)


def generate_rm_dataset(target_hdfs_path_dir, local_dir="~/data/full_hh_rlh/rm", local_dataset_path=None):
    if local_dataset_path is not None:
        train_dataset = load_dataset(local_dataset_path, split="train[:75%]")
        test_dataset = load_dataset(local_dataset_path, split="train[-25%:]")
    else:
        train_dataset = load_dataset("Dahoas/full-hh-rlhf", split="train[:75%]")
        test_dataset = load_dataset("Dahoas/full-hh-rlhf", split="train[-25%:]")

    local_dir = os.path.expanduser(local_dir)
    os.makedirs(local_dir, exist_ok=True)

    for dataset, name in zip([train_dataset, test_dataset], ["train", "test"], strict=True):
        output = {"prompt": [], "chosen": [], "rejected": []}
        for data in tqdm(dataset):
            # add the chosen/rejected pair for this prompt
            output["prompt"].append(data["prompt"])
            output["chosen"].append(data["chosen"])
            output["rejected"].append(data["rejected"])

        df = pd.DataFrame(output)

        local_path = os.path.join(local_dir, name + ".parquet")

        df.to_parquet(path=local_path)

        if target_hdfs_path_dir is not None:
            hdfs_dir = target_hdfs_path_dir + "/" + name + ".parquet"
            makedirs(hdfs_dir)

            copy(local_path, hdfs_dir)


def generate_rl_dataset(target_hdfs_path_dir, local_dir="~/data/full_hh_rlhf/rl", local_dataset_path=None):
    if local_dataset_path is not None:
        dataset = load_dataset(local_dataset_path)
    else:
        dataset = load_dataset("Dahoas/full-hh-rlhf")
    train_dataset = dataset["train"]

    data_source = "Dahoas/full-hh-rlhf"

    # add a row to each data item that represents a unique id
    def make_map_fn(split):
        def process_fn(example, idx):
            prompt = example.pop("prompt")
            response = example.pop("response")

            data = {
                "data_source": data_source,
                "prompt": [{"role": "user", "content": prompt}],
                "ability": "alignment",
                "reward_model": {
                    "style": "model",
                    "ground_truth": response,  # should not be used
                },
                "extra_info": {"split": split, "index": idx},
            }
            return data

        return process_fn

    train_dataset = train_dataset.map(function=make_map_fn("train"), with_indices=True)
    local_dir = os.path.expanduser(local_dir)
    local_path = os.path.join(local_dir, "train.parquet")
    train_dataset.to_parquet(local_path)

    if target_hdfs_path_dir is not None:
        hdfs_dir = target_hdfs_path_dir + "/" + "train.parquet"
        makedirs(hdfs_dir)

        copy(local_path, hdfs_dir)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--split", type=str, choices=["sft", "rm", "rl"], required=True)
    parser.add_argument("--local_dir", default=None, help="The save directory for the preprocessed dataset.")
    parser.add_argument("--hdfs_dir", type=str, required=False, default=None)
    parser.add_argument("--local_dataset_path", default=None, help="The local path to the raw dataset, if it exists.")
    parser.add_argument(
        "--local_save_dir",
        type=str,
        default="~/data/full_hh_rlhf",
        help="The save directory for the preprocessed dataset.",
    )

    args = parser.parse_args()
    local_save_dir = args.local_dir
    if local_save_dir is not None:
        print("Warning: Argument 'local_dir' is deprecated. Please use 'local_save_dir' instead.")
    else:
        local_save_dir = args.local_save_dir

    if args.split == "sft":
        generate_sft_dataset(args.hdfs_dir, os.path.join(local_save_dir, args.split), args.local_dataset_path)
    elif args.split == "rm":
        generate_rm_dataset(args.hdfs_dir, os.path.join(local_save_dir, args.split), args.local_dataset_path)
    elif args.split == "rl":
        generate_rl_dataset(args.hdfs_dir, os.path.join(local_save_dir, args.split), args.local_dataset_path)
    else:
        raise NotImplementedError
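For reference, a sketch of one RL-mode row as emitted by `process_fn` above; the prompt text is invented for illustration, only the structure is taken from the code:

# Hypothetical RL-mode row (values illustrative):
row = {
    "data_source": "Dahoas/full-hh-rlhf",
    "prompt": [{"role": "user", "content": "Human: How do I learn to bake bread?\n\nAssistant:"}],
    "ability": "alignment",
    "reward_model": {"style": "model", "ground_truth": "..."},  # carried along but not used for scoring
    "extra_info": {"split": "train", "index": 0},
}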
code/RL_model/verl/verl_train/examples/data_preprocess/geo3k.py ADDED
@@ -0,0 +1,102 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Preprocess the Geometry3k dataset to parquet format
"""

import argparse
import os

import datasets

from verl.utils.hdfs_io import copy, makedirs

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--local_dir", default=None)
    parser.add_argument("--hdfs_dir", default=None)
    parser.add_argument("--local_dataset_path", default=None, help="The local path to the raw dataset, if it exists.")
    parser.add_argument(
        "--local_save_dir", default="~/data/geo3k", help="The save directory for the preprocessed dataset."
    )

    args = parser.parse_args()
    local_dataset_path = args.local_dataset_path

    data_source = "hiyouga/geometry3k"

    if local_dataset_path is not None:
        dataset = datasets.load_dataset(
            local_dataset_path,
        )
    else:
        dataset = datasets.load_dataset(
            data_source,
        )

    train_dataset = dataset["train"]
    test_dataset = dataset["test"]

    instruction_following = (
        r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. "
        r"The reasoning process MUST BE enclosed within <think> </think> tags. "
        r"The final answer MUST BE put in \boxed{}."
    )

    # add a row to each data item that represents a unique id
    def make_map_fn(split):
        def process_fn(example, idx):
            problem = example.pop("problem")
            prompt = problem + " " + instruction_following
            answer = example.pop("answer")
            images = example.pop("images")

            data = {
                "data_source": data_source,
                "prompt": [
                    {
                        "role": "user",
                        "content": prompt,
                    }
                ],
                "images": images,
                "ability": "math",
                "reward_model": {"style": "rule", "ground_truth": answer},
                "extra_info": {
                    "split": split,
                    "index": idx,
                    "answer": answer,
                    "question": problem,
                },
            }
            return data

        return process_fn

    train_dataset = train_dataset.map(function=make_map_fn("train"), with_indices=True, num_proc=8)
    test_dataset = test_dataset.map(function=make_map_fn("test"), with_indices=True, num_proc=8)

    hdfs_dir = args.hdfs_dir
    local_save_dir = args.local_dir
    if local_save_dir is not None:
        print("Warning: Argument 'local_dir' is deprecated. Please use 'local_save_dir' instead.")
    else:
        local_save_dir = args.local_save_dir

    train_dataset.to_parquet(os.path.join(local_save_dir, "train.parquet"))
    test_dataset.to_parquet(os.path.join(local_save_dir, "test.parquet"))

    if hdfs_dir is not None:
        makedirs(hdfs_dir)
        copy(src=local_save_dir, dst=hdfs_dir)
code/RL_model/verl/verl_train/examples/data_preprocess/geo3k_multiturn_w_tool.py ADDED
@@ -0,0 +1,120 @@
# Copyright 2023-2025 SGLang Team
# Copyright Amazon.com, Inc. or its affiliates.
# Copyright 2025 Reallm Labs Ltd. or its affiliates
# Copyright 2025 ModelBest Inc. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Preprocess the Geometry3k dataset to parquet format
"""

import argparse
import os

import datasets

from verl.utils.hdfs_io import copy, makedirs

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--local_dir", default=None, help="The save directory for the preprocessed dataset.")
    parser.add_argument("--hdfs_dir", default=None)
    parser.add_argument("--local_dataset_path", default=None, help="The local path to the raw dataset, if it exists.")
    parser.add_argument(
        "--local_save_dir",
        default="~/data/geo3k_multiturn_w_tool",
        help="The save directory for the preprocessed dataset.",
    )

    args = parser.parse_args()
    local_dataset_path = args.local_dataset_path

    data_source = "hiyouga/geometry3k"

    if local_dataset_path is not None:
        dataset = datasets.load_dataset(local_dataset_path)
    else:
        dataset = datasets.load_dataset(data_source)

    train_dataset = dataset["train"]
    test_dataset = dataset["test"]

    instruction_following = (
        r"You FIRST think about the reasoning process as an internal monologue and then provide the final answer. "
        r"The reasoning process MUST BE enclosed within <think> </think> tags. "
        r"The final answer MUST BE put in \boxed{}."
    )

    # add a row to each data item that represents a unique id
    def make_map_fn(split):
        def process_fn(example, idx):
            problem = example.pop("problem")
            prompt = problem + " " + instruction_following
            answer = example.pop("answer")
            images = example.pop("images")
            data = {
                "data_source": data_source,
                "prompt": [
                    {
                        "role": "system",
                        "content": (
                            "You are a math expert. You are given a question and you need to solve it step by step. "
                            "Reasoning step by step before any tool call. "
                            "You should use the `calc_geo3k_reward` tool after step by step solving the question, "
                            "before generate final answer at least once and refine your answer if necessary. "
                        ),
                    },
                    {
                        "role": "user",
                        "content": prompt,
                    },
                ],
                "images": images,
                "ability": "math",
                "reward_model": {"style": "rule", "ground_truth": answer},
                "extra_info": {
                    "split": split,
                    "index": idx,
                    "answer": answer,
                    "question": problem,
                    "need_tools_kwargs": True,
                    "tools_kwargs": {
                        "calc_geo3k_reward": {
                            "create_kwargs": {"ground_truth": answer},
                            # "execute_kwargs": {},
                            # "calc_reward_kwargs": {},
                            # "release_kwargs": {},
                        },
                    },
                },
            }
            return data

        return process_fn

    train_dataset = train_dataset.map(function=make_map_fn("train"), with_indices=True, num_proc=8)
    test_dataset = test_dataset.map(function=make_map_fn("test"), with_indices=True, num_proc=8)

    hdfs_dir = args.hdfs_dir
    local_save_dir = args.local_dir
    if local_save_dir is not None:
        print("Warning: Argument 'local_dir' is deprecated. Please use 'local_save_dir' instead.")
    else:
        local_save_dir = args.local_save_dir

    train_dataset.to_parquet(os.path.join(local_save_dir, "train.parquet"))
    test_dataset.to_parquet(os.path.join(local_save_dir, "test.parquet"))
    if hdfs_dir is not None:
        makedirs(hdfs_dir)
        copy(src=local_save_dir, dst=hdfs_dir)
code/RL_model/verl/verl_train/examples/data_preprocess/gsm8k.py ADDED
@@ -0,0 +1,105 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Preprocess the GSM8k dataset to parquet format
"""

import argparse
import os
import re

import datasets

from verl.utils.hdfs_io import copy, makedirs


def extract_solution(solution_str):
    solution = re.search("#### (\\-?[0-9\\.\\,]+)", solution_str)
    assert solution is not None
    final_solution = solution.group(0)
    final_solution = final_solution.split("#### ")[1].replace(",", "")
    return final_solution


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--local_dir", default=None, help="The save directory for the preprocessed dataset.")
    parser.add_argument("--hdfs_dir", default=None)
    parser.add_argument("--local_dataset_path", default=None, help="The local path to the raw dataset, if it exists.")
    parser.add_argument(
        "--local_save_dir", default="~/data/gsm8k", help="The save directory for the preprocessed dataset."
    )

    args = parser.parse_args()
    local_dataset_path = args.local_dataset_path

    data_source = "openai/gsm8k"

    if local_dataset_path is not None:
        dataset = datasets.load_dataset(local_dataset_path, "main")
    else:
        dataset = datasets.load_dataset(data_source, "main")

    train_dataset = dataset["train"]
    test_dataset = dataset["test"]

    instruction_following = 'Let\'s think step by step and output the final answer after "####".'

    # add a row to each data item that represents a unique id
    def make_map_fn(split):
        def process_fn(example, idx):
            question_raw = example.pop("question")

            question = question_raw + " " + instruction_following

            answer_raw = example.pop("answer")
            solution = extract_solution(answer_raw)
            data = {
                "data_source": data_source,
                "prompt": [
                    {
                        "role": "user",
                        "content": question,
                    }
                ],
                "ability": "math",
                "reward_model": {"style": "rule", "ground_truth": solution},
                "extra_info": {
                    "split": split,
                    "index": idx,
                    "answer": answer_raw,
                    "question": question_raw,
                },
            }
            return data

        return process_fn

    train_dataset = train_dataset.map(function=make_map_fn("train"), with_indices=True)
    test_dataset = test_dataset.map(function=make_map_fn("test"), with_indices=True)

    hdfs_dir = args.hdfs_dir
    local_save_dir = args.local_dir
    if local_save_dir is not None:
        print("Warning: Argument 'local_dir' is deprecated. Please use 'local_save_dir' instead.")
    else:
        local_save_dir = args.local_save_dir

    train_dataset.to_parquet(os.path.join(local_save_dir, "train.parquet"))
    test_dataset.to_parquet(os.path.join(local_save_dir, "test.parquet"))

    if hdfs_dir is not None:
        makedirs(hdfs_dir)

        copy(src=local_save_dir, dst=hdfs_dir)
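A quick sanity check of `extract_solution`, which pulls the number after the `####` marker and strips thousands separators; this assumes the definition above, and the answer strings are illustrative:

answer = "Natalia sold 48 clips in April and half as many in May.\n#### 72"
print(extract_solution(answer))        # -> "72"
print(extract_solution("#### 1,234"))  # -> "1234" (comma removed)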
code/RL_model/verl/verl_train/examples/data_preprocess/gsm8k_multiturn_sft.py ADDED
@@ -0,0 +1,102 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Preprocess the GSM8k dataset to parquet format
"""

import argparse
import os
import re

import datasets

from verl.utils.hdfs_io import copy, makedirs


def extract_solution(solution_str):
    solution = re.search("#### (\\-?[0-9\\.\\,]+)", solution_str)
    assert solution is not None
    final_solution = solution.group(0)
    final_solution = final_solution.split("#### ")[1].replace(",", "")
    return final_solution


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--local_dir", default=None)
    parser.add_argument("--local_dataset_path", default=None, help="The local path to the raw dataset, if it exists.")
    parser.add_argument(
        "--local_save_dir", default="~/data/gsm8k_sft", help="The save directory for the preprocessed dataset."
    )
    parser.add_argument("--hdfs_dir", default=None)

    args = parser.parse_args()
    local_dataset_path = args.local_dataset_path

    data_source = "openai/gsm8k"

    if local_dataset_path is not None:
        dataset = datasets.load_dataset(local_dataset_path, "main")
    else:
        dataset = datasets.load_dataset(data_source, "main")

    train_dataset = dataset["train"]
    test_dataset = dataset["test"]

    instruction_following = 'Let\'s think step by step and output the final answer after "####".'

    # convert each example into chat-message format for SFT
    def make_map_fn(split):
        def process_fn(example, idx):
            question_raw = example.pop("question")

            question = question_raw + " " + instruction_following

            answer_raw = example.pop("answer")
            data = {
                "messages": [
                    {
                        "role": "user",
                        "content": question,
                    },
                    {
                        "role": "assistant",
                        "content": answer_raw,
                    },
                ],
            }
            return data

        return process_fn

    train_dataset = train_dataset.map(function=make_map_fn("train"), with_indices=True)
    test_dataset = test_dataset.map(function=make_map_fn("test"), with_indices=True)

    hdfs_dir = args.hdfs_dir

    local_save_dir = args.local_dir
    if local_save_dir is not None:
        print("Warning: Argument 'local_dir' is deprecated. Please use 'local_save_dir' instead.")
    else:
        local_save_dir = args.local_save_dir

    local_save_dir = os.path.expanduser(local_save_dir)

    train_dataset.to_parquet(os.path.join(local_save_dir, "train.parquet"))
    test_dataset.to_parquet(os.path.join(local_save_dir, "test.parquet"))

    if hdfs_dir is not None:
        makedirs(hdfs_dir)

        copy(src=local_save_dir, dst=hdfs_dir)
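Unlike the RL-oriented scripts, the SFT variant keeps only a chat-format `messages` column; a sketch of one output row, with the question and answer abridged (values illustrative):

row = {
    "messages": [
        {
            "role": "user",
            "content": 'Janet has ... How many ...? Let\'s think step by step and output the final answer after "####".',
        },
        {"role": "assistant", "content": "Janet has ... so the answer is 72.\n#### 72"},
    ]
}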
code/RL_model/verl/verl_train/examples/data_preprocess/gsm8k_multiturn_w_interaction.py ADDED
@@ -0,0 +1,119 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2023-2024 SGLang Team
# Copyright 2025 ModelBest Inc. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Preprocess the GSM8k dataset to parquet format
"""

import argparse
import os
import re

import datasets

from verl.utils.hdfs_io import copy, makedirs


def extract_solution(solution_str):
    solution = re.search("#### (\\-?[0-9\\.\\,]+)", solution_str)
    assert solution is not None
    final_solution = solution.group(0)
    final_solution = final_solution.split("#### ")[1].replace(",", "")
    return final_solution


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--local_dir", default=None, help="The save directory for the preprocessed dataset.")
    parser.add_argument("--hdfs_dir", default=None)
    parser.add_argument("--local_dataset_path", default=None, help="The local path to the raw dataset, if it exists.")
    parser.add_argument(
        "--local_save_dir", default="~/data/gsm8k", help="The save directory for the preprocessed dataset."
    )

    args = parser.parse_args()
    local_dataset_path = args.local_dataset_path

    data_source = "openai/gsm8k"

    if local_dataset_path is not None:
        dataset = datasets.load_dataset(local_dataset_path, "main")
    else:
        dataset = datasets.load_dataset(data_source, "main")

    train_dataset = dataset["train"]
    test_dataset = dataset["test"]

    instruction_following = "Let's think step by step and output the final answer after `####`."

    # add a row to each data item that represents a unique id
    def make_map_fn(split):
        def process_fn(example, idx):
            question_raw = example.pop("question")

            question = question_raw + " " + instruction_following

            answer_raw = example.pop("answer")
            solution = extract_solution(answer_raw)
            data = {
                "data_source": data_source,
                "prompt": [
                    {
                        "role": "system",
                        "content": (
                            "You are a math expert. You are given a question and you need to solve it step by step. "
                            "You should rethinking carefully if user point out your answer is wrong. "
                            "Put your final answer in the format of `#### <answer>`."
                        ),
                    },
                    {
                        "role": "user",
                        "content": question,
                    },
                ],
                "ability": "math",
                "reward_model": {"style": "rule", "ground_truth": solution},
                "extra_info": {
                    "split": split,
                    "index": idx,
                    "answer": answer_raw,
                    "question": question_raw,
                    "interaction_kwargs": {
                        "name": "gsm8k",
                        "query": question,
                        "ground_truth": solution,
                    },
                },
            }
            return data

        return process_fn

    train_dataset = train_dataset.map(function=make_map_fn("train"), with_indices=True)
    test_dataset = test_dataset.map(function=make_map_fn("test"), with_indices=True)

    hdfs_dir = args.hdfs_dir
    local_save_dir = args.local_dir
    if local_save_dir is not None:
        print("Warning: Argument 'local_dir' is deprecated. Please use 'local_save_dir' instead.")
    else:
        local_save_dir = args.local_save_dir

    train_dataset.to_parquet(os.path.join(local_save_dir, "train.parquet"))
    test_dataset.to_parquet(os.path.join(local_save_dir, "test.parquet"))

    if hdfs_dir is not None:
        makedirs(hdfs_dir)
        copy(src=local_save_dir, dst=hdfs_dir)
code/RL_model/verl/verl_train/examples/data_preprocess/gsm8k_multiturn_w_tool.py ADDED
@@ -0,0 +1,129 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2023-2024 SGLang Team
# Copyright 2025 ModelBest Inc. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Preprocess the GSM8k dataset to parquet format
"""

import argparse
import os
import re

import datasets

from verl.utils.hdfs_io import copy, makedirs


def extract_solution(solution_str):
    solution = re.search("#### (\\-?[0-9\\.\\,]+)", solution_str)
    assert solution is not None
    final_solution = solution.group(0)
    final_solution = final_solution.split("#### ")[1].replace(",", "")
    return final_solution


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--local_dir", default=None, help="The save directory for the preprocessed dataset.")
    parser.add_argument("--hdfs_dir", default=None)
    parser.add_argument("--local_dataset_path", default=None, help="The local path to the raw dataset, if it exists.")
    parser.add_argument(
        "--local_save_dir", default="~/data/gsm8k", help="The save directory for the preprocessed dataset."
    )

    args = parser.parse_args()
    local_dataset_path = args.local_dataset_path

    data_source = "openai/gsm8k"

    if local_dataset_path is not None:
        dataset = datasets.load_dataset(local_dataset_path, "main")
    else:
        dataset = datasets.load_dataset(data_source, "main")

    train_dataset = dataset["train"]
    test_dataset = dataset["test"]

    instruction_following = "Let's think step by step and output the final answer after `####`."

    # add a row to each data item that represents a unique id
    def make_map_fn(split):
        def process_fn(example, idx):
            question_raw = example.pop("question")

            question = question_raw + " " + instruction_following

            answer_raw = example.pop("answer")
            solution = extract_solution(answer_raw)
            data = {
                "data_source": data_source,
                "prompt": [
                    {
                        "role": "system",
                        "content": (
                            "You are a math expert. You are given a question and you need to solve it step by step. "
                            "Reasoning step by step before any tool call. "
                            "You should use the `calc_gsm8k_reward` tool after step by step solving the question, "
                            "before generate final answer at least once and refine your answer if necessary. "
                            "Put your final answer in the format of `#### <answer>`."
                        ),
                    },
                    {
                        "role": "user",
                        "content": question,
                    },
                ],
                "ability": "math",
                "reward_model": {"style": "rule", "ground_truth": solution},
                "extra_info": {
                    "split": split,
                    "index": idx,
                    "answer": answer_raw,
                    "question": question_raw,
                    "need_tools_kwargs": True,
                    "tools_kwargs": {
                        "calc_gsm8k_reward": {
                            "create_kwargs": {"ground_truth": solution},
                            # "execute_kwargs": {},
                            # "calc_reward_kwargs": {},
                            # "release_kwargs": {},
                        },
                    },
                    "interaction_kwargs": {
                        "query": question,
                        "ground_truth": solution,
                    },
                },
            }
            return data

        return process_fn

    train_dataset = train_dataset.map(function=make_map_fn("train"), with_indices=True)
    test_dataset = test_dataset.map(function=make_map_fn("test"), with_indices=True)

    hdfs_dir = args.hdfs_dir
    local_save_dir = args.local_dir
    if local_save_dir is not None:
        print("Warning: Argument 'local_dir' is deprecated. Please use 'local_save_dir' instead.")
    else:
        local_save_dir = args.local_save_dir

    train_dataset.to_parquet(os.path.join(local_save_dir, "train.parquet"))
    test_dataset.to_parquet(os.path.join(local_save_dir, "test.parquet"))

    if hdfs_dir is not None:
        makedirs(hdfs_dir)
        copy(src=local_save_dir, dst=hdfs_dir)
code/RL_model/verl/verl_train/examples/data_preprocess/gsm8k_tool_agent_loop.py ADDED
@@ -0,0 +1,130 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2023-2024 SGLang Team
# Copyright 2025 ModelBest Inc. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Preprocess the GSM8k dataset to parquet format
"""

import argparse
import os
import re

import datasets

from verl.utils.hdfs_io import copy, makedirs


def extract_solution(solution_str):
    solution = re.search("#### (\\-?[0-9\\.\\,]+)", solution_str)
    assert solution is not None
    final_solution = solution.group(0)
    final_solution = final_solution.split("#### ")[1].replace(",", "")
    return final_solution


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--local_dir", default=None, help="The save directory for the preprocessed dataset.")
    parser.add_argument("--hdfs_dir", default=None)
    parser.add_argument("--local_dataset_path", default=None, help="The local path to the raw dataset, if it exists.")
    parser.add_argument(
        "--local_save_dir", default="~/data/gsm8k", help="The save directory for the preprocessed dataset."
    )

    args = parser.parse_args()
    local_dataset_path = args.local_dataset_path

    data_source = "openai/gsm8k"

    if local_dataset_path is not None:
        dataset = datasets.load_dataset(local_dataset_path, "main")
    else:
        dataset = datasets.load_dataset(data_source, "main")

    train_dataset = dataset["train"]
    test_dataset = dataset["test"]

    instruction_following = "Let's think step by step and output the final answer after `####`."

    # add a row to each data item that represents a unique id
    def make_map_fn(split):
        def process_fn(example, idx):
            question_raw = example.pop("question")

            question = question_raw + " " + instruction_following

            answer_raw = example.pop("answer")
            solution = extract_solution(answer_raw)
            data = {
                "data_source": data_source,
                "agent_name": "tool_agent",
                "prompt": [
                    {
                        "role": "system",
                        "content": (
                            "You are a math expert. You are given a question and you need to solve it step by step. "
                            "Reasoning step by step before any tool call. "
                            "You should use the `calc_gsm8k_reward` tool after step by step solving the question, "
                            "before generate final answer at least once and refine your answer if necessary. "
                            "Put your final answer in the format of `#### <answer>`."
                        ),
                    },
                    {
                        "role": "user",
                        "content": question,
                    },
                ],
                "ability": "math",
                "reward_model": {"style": "rule", "ground_truth": solution},
                "extra_info": {
                    "split": split,
                    "index": idx,
                    "answer": answer_raw,
                    "question": question_raw,
                    "need_tools_kwargs": True,
                    "tools_kwargs": {
                        "calc_gsm8k_reward": {
                            "create_kwargs": {"ground_truth": solution},
                            # "execute_kwargs": {},
                            # "calc_reward_kwargs": {},
                            # "release_kwargs": {},
                        },
                    },
                    "interaction_kwargs": {
                        "query": question,
                        "ground_truth": solution,
                    },
                },
            }
            return data

        return process_fn

    train_dataset = train_dataset.map(function=make_map_fn("train"), with_indices=True)
    test_dataset = test_dataset.map(function=make_map_fn("test"), with_indices=True)

    hdfs_dir = args.hdfs_dir
    local_save_dir = args.local_dir
    if local_save_dir is not None:
        print("Warning: Argument 'local_dir' is deprecated. Please use 'local_save_dir' instead.")
    else:
        local_save_dir = args.local_save_dir

    train_dataset.to_parquet(os.path.join(local_save_dir, "train.parquet"))
    test_dataset.to_parquet(os.path.join(local_save_dir, "test.parquet"))

    if hdfs_dir is not None:
        makedirs(hdfs_dir)
        copy(src=local_save_dir, dst=hdfs_dir)
code/RL_model/verl/verl_train/examples/data_preprocess/hellaswag.py ADDED
@@ -0,0 +1,108 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Preprocess Hellaswag dataset.
"""

import argparse
import os
import re

import datasets

from verl.utils.hdfs_io import copy, makedirs


def preprocess(text):
    text = text.strip()
    # NOTE: Brackets are artifacts of the WikiHow dataset portion of HellaSwag.
    text = text.replace(" [title]", ". ")
    text = re.sub("\\[.*?\\]", "", text)
    text = text.replace("  ", " ")  # collapse double spaces left by the removals
    return text


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--local_dir", default=None, help="The save directory for the preprocessed dataset.")
    parser.add_argument("--hdfs_dir", default=None)
    parser.add_argument("--local_dataset_path", default=None, help="The local path to the raw dataset, if it exists.")
    parser.add_argument(
        "--local_save_dir", default="~/data/hellaswag", help="The save directory for the preprocessed dataset."
    )

    args = parser.parse_args()
    local_dataset_path = args.local_dataset_path

    data_source = "Rowan/hellaswag"

    if local_dataset_path is not None:
        dataset = datasets.load_dataset(local_dataset_path)
    else:
        dataset = datasets.load_dataset(data_source, trust_remote_code=True)

    train_dataset = dataset["train"]
    val_dataset = dataset["validation"]
    test_dataset = dataset["test"]

    instruction = "Please complete the following sentence.\n"

    def make_map_fn(split):
        def process_fn(doc, idx):
            ctx = doc["ctx_a"] + " " + doc["ctx_b"].capitalize()
            query = preprocess(doc["activity_label"] + ": " + ctx)
            choices = [preprocess(ending) for ending in doc["endings"]]
            gold = int(doc["label"])

            data = {
                "data_source": data_source,
                "prompt": [{"role": "user", "content": query}],
                "ability": "nlp",
                "reward_model": {
                    "style": "model",
                    "eval": "multiple_choice",  # using loglikelihood
                    "ground_truth": gold,
                    "choices": choices,
                },
                "extra_info": {"split": split, "index": idx},
            }
            return data

        return process_fn

    # filter data that doesn't have a label
    train_dataset = train_dataset.filter(lambda x: len(x["label"]) > 0)
    val_dataset = val_dataset.filter(lambda x: len(x["label"]) > 0)
    test_dataset = test_dataset.filter(lambda x: len(x["label"]) > 0)

    train_dataset = train_dataset.map(function=make_map_fn("train"), with_indices=True)
    val_dataset = val_dataset.map(function=make_map_fn("validation"), with_indices=True)
    test_dataset = test_dataset.map(function=make_map_fn("test"), with_indices=True)

    hdfs_dir = args.hdfs_dir
    local_save_dir = args.local_dir
    if local_save_dir is not None:
        print("Warning: Argument 'local_dir' is deprecated. Please use 'local_save_dir' instead.")
    else:
        local_save_dir = args.local_save_dir

    train_dataset.to_parquet(os.path.join(local_save_dir, "train.parquet"))
    val_dataset.to_parquet(os.path.join(local_save_dir, "validation.parquet"))
    test_dataset.to_parquet(os.path.join(local_save_dir, "test.parquet"))

    if hdfs_dir is not None:
        makedirs(hdfs_dir)

        copy(src=local_save_dir, dst=hdfs_dir)
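Illustrative behaviour of `preprocess` on a WikiHow-flavored string; this assumes the definition above, and the input is invented:

s = "How to make tea [title] Boil the water. [step] Add the leaves."
print(preprocess(s))  # -> "How to make tea. Boil the water. Add the leaves."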
code/RL_model/verl/verl_train/examples/data_preprocess/math_dataset.py ADDED
@@ -0,0 +1,106 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Preprocess the MATH-lighteval dataset to parquet format
"""

import argparse
import json
import os

import datasets

from verl.utils.hdfs_io import copy, makedirs
from verl.utils.reward_score.math_reward import last_boxed_only_string, remove_boxed


def extract_solution(solution_str):
    return remove_boxed(last_boxed_only_string(solution_str))


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--local_dir", default=None)
    parser.add_argument("--hdfs_dir", default=None)
    parser.add_argument("--local_dataset_path", default=None, help="The local path to the raw dataset, if it exists.")
    parser.add_argument(
        "--local_save_dir", default="~/data/math", help="The save directory for the preprocessed dataset."
    )

    args = parser.parse_args()
    local_dataset_path = args.local_dataset_path

    # 'lighteval/MATH' is no longer available on huggingface.
    # Use mirror repo: DigitalLearningGmbH/MATH-lighteval
    data_source = "DigitalLearningGmbH/MATH-lighteval"
    print(f"Loading the {data_source} dataset from huggingface...", flush=True)
    if local_dataset_path is not None:
        dataset = datasets.load_dataset(
            local_dataset_path,
        )
    else:
        dataset = datasets.load_dataset(
            data_source,
        )

    train_dataset = dataset["train"]
    test_dataset = dataset["test"]

    instruction_following = "Let's think step by step and output the final answer within \\boxed{}."

    # add a row to each data item that represents a unique id
    def make_map_fn(split):
        def process_fn(example, idx):
            question = example.pop("problem")

            question = question + " " + instruction_following

            answer = example.pop("solution")
            solution = extract_solution(answer)
            data = {
                "data_source": data_source,
                "prompt": [{"role": "user", "content": question}],
                "ability": "math",
                "reward_model": {"style": "rule", "ground_truth": solution},
                "extra_info": {"split": split, "index": idx},
            }
            return data

        return process_fn

    train_dataset = train_dataset.map(function=make_map_fn("train"), with_indices=True)
    test_dataset = test_dataset.map(function=make_map_fn("test"), with_indices=True)

    local_save_dir = args.local_dir
    if local_save_dir is not None:
        print("Warning: Argument 'local_dir' is deprecated. Please use 'local_save_dir' instead.")
    else:
        local_save_dir = args.local_save_dir

    local_dir = os.path.expanduser(local_save_dir)
    hdfs_dir = args.hdfs_dir

    train_dataset.to_parquet(os.path.join(local_dir, "train.parquet"))
    test_dataset.to_parquet(os.path.join(local_dir, "test.parquet"))
    # Save one example as JSON for reference
    example = train_dataset[0]
    with open(os.path.join(local_dir, "train_example.json"), "w") as f:
        json.dump(example, f, indent=2)
    example = test_dataset[0]
    with open(os.path.join(local_dir, "test_example.json"), "w") as f:
        json.dump(example, f, indent=2)
    if hdfs_dir is not None:
        makedirs(hdfs_dir)

        copy(src=local_dir, dst=hdfs_dir)
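A small check of `extract_solution` for the MATH format. This assumes verl's `last_boxed_only_string` and `remove_boxed` behave like the upstream MATH utilities, i.e. they locate the last `\boxed{...}` span and strip the wrapper:

sol = r"Comparing coefficients gives $k = \boxed{42}$."
print(extract_solution(sol))  # -> "42"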
code/RL_model/verl/verl_train/examples/data_preprocess/multiturn.py ADDED
@@ -0,0 +1,125 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at

#     http://www.apache.org/licenses/LICENSE-2.0

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Create a simple multi-turn dataset for testing
"""

import argparse
import os

import pandas as pd


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--local_dir", default="~/data/multiturn")
    parser.add_argument("--hdfs_dir", default=None)
    args = parser.parse_args()

    # Create example conversations
    conversations = []

    # Conversation 1
    conversations.append(
        {
            "messages": [
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": "What is the capital of France?"},
                {"role": "assistant", "content": "The capital of France is Paris."},
                {"role": "user", "content": "And what about Germany?"},
                {"role": "assistant", "content": "The capital of Germany is Berlin."},
            ]
        }
    )

    # Conversation 2
    conversations.append(
        {
            "messages": [
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": "Can you explain quantum computing?"},
                {
                    "role": "assistant",
                    "content": "Quantum computing is a type of computing that uses quantum-mechanical phenomena, "
                    "such as superposition and entanglement, to perform operations on data.",
                },
                {"role": "user", "content": "How is it different from classical computing?"},
                {
                    "role": "assistant",
                    "content": "Classical computing uses bits that are either 0 or 1, while quantum computing uses "
                    "quantum bits or qubits that can exist in multiple states simultaneously due to superposition.",
                },
            ]
        }
    )

    # Conversation 3
    conversations.append(
        {
            "messages": [
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": "Write a simple Python function to calculate factorial."},
                {
                    "role": "assistant",
                    "content": (
                        "```python\ndef factorial(n):\n    if n == 0 or n == 1:\n        return 1\n    else:\n        "
                        "return n * factorial(n-1)\n```\n\nThis is a recursive function to calculate the "
                        "factorial of a number."
                    ),
                },
                {"role": "user", "content": "Can you make it iterative instead?"},
                {
                    "role": "assistant",
                    "content": (
                        "```python\ndef factorial(n):\n    result = 1\n    for i in range(1, n+1):\n        "
                        "result *= i\n    return result\n```\n\nThis is an iterative version of the factorial function."
                    ),
                },
            ]
        }
    )

    # Create train and test datasets
    train_data = conversations[:2]  # First 2 conversations for training
    test_data = conversations[2:]  # Last conversation for testing

    # Create output directory
    local_dir = os.path.expanduser(args.local_dir)
    os.makedirs(local_dir, exist_ok=True)

    # Save to parquet files
    train_df = pd.DataFrame(train_data)
    test_df = pd.DataFrame(test_data)

    train_df.to_parquet(os.path.join(local_dir, "train.parquet"))
    test_df.to_parquet(os.path.join(local_dir, "test.parquet"))

    # Handle HDFS if specified
    if args.hdfs_dir is not None:
        try:
            from verl.utils.hdfs_io import copy, makedirs

            makedirs(args.hdfs_dir)
            copy(src=local_dir, dst=args.hdfs_dir)
115
+ except ImportError:
116
+ print("Warning: HDFS support not available. Skipping HDFS copy.")
117
+
118
+ # Print statistics
119
+ print(f"Train dataset size: {len(train_df)}")
120
+ print(f"Test dataset size: {len(test_df)}")
121
+ print(f"Data saved to {local_dir}")
122
+
123
+
124
+ if __name__ == "__main__":
125
+ main()
code/RL_model/verl/verl_train/examples/data_preprocess/pokemon.py ADDED
@@ -0,0 +1,75 @@
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
+ """
+ Preprocess the llamafactory/pokemon-gpt4o-captions dataset to parquet format
+ """
+
+ import argparse
+ import os
+
+ import datasets
+
+ from verl.utils.hdfs_io import copy, makedirs
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--local_dir", default=None)
+     parser.add_argument("--hdfs_dir", default=None)
+     parser.add_argument("--local_dataset_path", default=None, help="The local path to the raw dataset, if it exists.")
+     parser.add_argument(
+         "--local_save_dir",
+         default="~/data/pokemon-gpt4o-captions",
+         help="The save directory for the preprocessed dataset.",
+     )
+
+     args = parser.parse_args()
+     local_dataset_path = args.local_dataset_path
+
+     data_source = "llamafactory/pokemon-gpt4o-captions"
+
+     if local_dataset_path is not None:
+         dataset = datasets.load_dataset(
+             local_dataset_path,
+         )
+     else:
+         dataset = datasets.load_dataset(
+             data_source,
+         )
+
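+     # map_fn converts ShareGPT-style turns into chat messages, e.g.
+     #   {"from": "human", "value": "..."}  ->  {"role": "user", "content": "..."}
+     #   {"from": "gpt", "value": "..."}    ->  {"role": "assistant", "content": "..."}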
+     def map_fn(row: dict):
+         messages = []
+         conversation = row.pop("conversations")
+         for conv in conversation:
+             if conv["from"] == "gpt":
+                 role = "assistant"
+             elif conv["from"] == "human":
+                 role = "user"
+             else:
+                 raise ValueError(f"Unknown role: {conv['from']}")
+             messages.append(
+                 {
+                     "role": role,
+                     "content": conv["value"],
+                 }
+             )
+
+         row["messages"] = messages
+         return row
+
+     dataset = dataset["train"].map(map_fn, num_proc=16)
+     dataset = dataset.train_test_split(test_size=0.1)
+     train_dataset = dataset["train"]
+     test_dataset = dataset["test"]
+
+     hdfs_dir = args.hdfs_dir
+     local_save_dir = args.local_dir
+     if local_save_dir is not None:
+         print("Warning: Argument 'local_dir' is deprecated. Please use 'local_save_dir' instead.")
+     else:
+         local_save_dir = args.local_save_dir
+     local_save_dir = os.path.expanduser(local_save_dir)  # expand "~" so the default path works
+     os.makedirs(local_save_dir, exist_ok=True)  # make sure the output directory exists
+
+     train_dataset.to_parquet(os.path.join(local_save_dir, "train.parquet"))
+     test_dataset.to_parquet(os.path.join(local_save_dir, "test.parquet"))
+
+     if hdfs_dir is not None:
+         makedirs(hdfs_dir)
+         copy(src=local_save_dir, dst=hdfs_dir)
code/RL_model/verl/verl_train/examples/data_preprocess/preprocess_search_r1_dataset.py ADDED
@@ -0,0 +1,178 @@
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
+ # Copyright 2023-2024 SGLang Team
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import argparse
+ import logging
+ import os
+ import tempfile
+
+ import pandas as pd
+ from huggingface_hub import hf_hub_download
+ from huggingface_hub.utils import EntryNotFoundError
+
+ from verl.utils.hdfs_io import copy, makedirs
+
+ # Setup logging
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
+ logger = logging.getLogger(__name__)
+
+ # Configuration constants
+ DEFAULT_SYSTEM_CONTENT = "You are a helpful and harmless assistant."
+ DEFAULT_USER_CONTENT_PREFIX = (
+     "Answer the given question. You must conduct reasoning inside <think> and </think> "
+     "first every time you get new information. After reasoning, if you find you lack "
+     "some knowledge, you can call a search engine by <tool_call> query </tool_call> "
+     "and it will return the top searched results between <tool_response> and "
+     "</tool_response>. You can search as many times as you want. If you find no "
+     "further external knowledge needed, you can directly provide the answer inside "
+     "<answer> and </answer>, without detailed illustrations. For example, "
+     "<answer> Beijing </answer>. Question: "
+ )
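+
+ # A rollout that follows this protocol looks like (illustrative):
+ #   <think> ... reasoning about what to look up ... </think>
+ #   <tool_call> capital of France </tool_call>
+ #   <tool_response> ... top search results ... </tool_response>
+ #   <answer> Paris </answer>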
+
+
+ def process_single_row(row, current_split_name, row_index):
+     """
+     Process a single row of data for SearchR1-like format.
+
+     Args:
+         row: DataFrame row containing the original data
+         current_split_name: Name of the current split (train/test)
+         row_index: Index of the row in the DataFrame
+
+     Returns:
+         pd.Series: Processed row data in the required format
+     """
+     question = row.get("question", "")
+
+     # Build prompt structure
+     user_content = user_content_prefix.rstrip("\n") + question
+     prompt = [{"role": "system", "content": system_content}, {"role": "user", "content": user_content}]
+
+     # Extract ground truth from reward_model or fallback to golden_answers
+     reward_model_data = row.get("reward_model")
+     if isinstance(reward_model_data, dict) and "ground_truth" in reward_model_data:
+         ground_truth = reward_model_data.get("ground_truth")
+     else:
+         ground_truth = row.get("golden_answers", [])
+
+     # Process data source
+     data_source_tagged = "searchR1_" + str(row.get("data_source", ""))
+
+     # Build tools kwargs structure
+     tools_kwargs = {
+         "search": {
+             "create_kwargs": {"ground_truth": ground_truth, "question": question, "data_source": data_source_tagged}
+         }
+     }
+
+     # Build complete extra_info structure
+     extra_info = {
+         "index": row_index,
+         "need_tools_kwargs": True,
+         "question": question,
+         "split": current_split_name,
+         "tools_kwargs": tools_kwargs,
+     }
+
+     return pd.Series(
+         {
+             "data_source": data_source_tagged,
+             "prompt": prompt,
+             "ability": row.get("ability"),
+             "reward_model": reward_model_data,
+             "extra_info": extra_info,
+             "metadata": row.get("metadata"),
+         }
+     )
+
+
+ def main():
+     local_save_dir = os.path.expanduser(args.local_dir)
+     os.makedirs(local_save_dir, exist_ok=True)
+
+     processed_files = []
+
+     # Download and process files using temporary directory
+     with tempfile.TemporaryDirectory() as tmp_download_dir:
+         for split in ["train", "test"]:
+             parquet_filename = f"{split}.parquet"
+             logger.info(f"Processing {split} split...")
+
+             try:
+                 # Download Parquet file from HuggingFace
+                 logger.info(f"Downloading {parquet_filename} from {args.hf_repo_id}")
+                 local_parquet_filepath = hf_hub_download(
+                     repo_id=args.hf_repo_id,
+                     filename=parquet_filename,
+                     repo_type="dataset",
+                     local_dir=tmp_download_dir,
+                     local_dir_use_symlinks=False,
+                 )
+
+                 # Load and process Parquet file
+                 df_raw = pd.read_parquet(local_parquet_filepath)
+                 logger.info(f"Loaded {len(df_raw)} rows from {parquet_filename}")
+
+                 def apply_process_row(row, split_name=split):
+                     return process_single_row(row, current_split_name=split_name, row_index=row.name)
+
+                 df_processed = df_raw.apply(apply_process_row, axis=1)
+
+                 # Save processed DataFrame
+                 output_file_path = os.path.join(local_save_dir, f"{split}.parquet")
+                 df_processed.to_parquet(output_file_path, index=False)
+                 logger.info(f"Saved {len(df_processed)} processed rows to {output_file_path}")
+                 processed_files.append(output_file_path)
+
+             except EntryNotFoundError:
+                 logger.warning(f"{parquet_filename} not found in repository {args.hf_repo_id}")
+             except Exception as e:
+                 logger.error(f"Error processing {split} split: {e}")
+
+     if not processed_files:
+         logger.warning("No data was processed or saved")
+         return
+
+     logger.info(f"Successfully processed {len(processed_files)} files to {local_save_dir}")
+
+     # Copy to HDFS if specified
+     if args.hdfs_dir:
+         try:
+             makedirs(args.hdfs_dir)
+             copy(src=local_save_dir, dst=args.hdfs_dir)
+             logger.info(f"Successfully copied files to HDFS: {args.hdfs_dir}")
+         except Exception as e:
+             logger.error(f"Error copying files to HDFS: {e}")
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser(description="Download Search-R1 from HuggingFace, process, and save to Parquet.")
+     parser.add_argument(
+         "--hf_repo_id", default="PeterJinGo/nq_hotpotqa_train", help="HuggingFace dataset repository ID."
+     )
+     parser.add_argument(
+         "--local_dir",
+         default="~/data/searchR1_processed_direct",
+         help="Local directory to save the processed Parquet files.",
+     )
+     parser.add_argument("--hdfs_dir", default=None, help="Optional HDFS directory to copy the Parquet files to.")
+
+     args = parser.parse_args()
+
+     # System and user content configuration
+     system_content = DEFAULT_SYSTEM_CONTENT
+     user_content_prefix = DEFAULT_USER_CONTENT_PREFIX
+
+     main()
code/RL_model/verl/verl_train/examples/gmpo_trainer/README.md ADDED
@@ -0,0 +1,59 @@
+ <div align=center>
+
+ # Geometric-Mean Policy Optimization
+ </div>
+
+ This is the official implementation of the paper [***Geometric-Mean Policy Optimization***](https://arxiv.org/abs/2507.20673).
+
+ <div align=center>
+ <img width="3092" height="864" alt="image" src="https://github.com/user-attachments/assets/20b04c4e-7ee8-4775-9af8-33c0158336e2" />
+ </div>
+
+ ## 1. Contents
+ - Geometric-Mean Policy Optimization
+   - [1. Contents](#1-contents)
+   - [2. Introduction](#2-introduction)
+   - [3. Code Usage](#3-code-usage)
+   - [4. Contacts](#4-contacts)
+   - [5. Citation](#5-citation)
+
+ ## 2. Introduction
+
+ Group Relative Policy Optimization (GRPO) has significantly enhanced the reasoning capability of large language models by optimizing the arithmetic mean of token-level rewards. Unfortunately, GRPO is observed to suffer from unstable policy updates when facing tokens with outlier importance-weighted rewards, which manifest as extreme importance sampling ratios during training. In this study, we propose Geometric-Mean Policy Optimization (GMPO), which aims to improve the stability of GRPO by suppressing token reward outliers. Instead of optimizing the arithmetic mean, GMPO maximizes the geometric mean of token-level rewards, which is inherently less sensitive to outliers and maintains a more stable range of importance sampling ratios. GMPO is plug-and-play: it simply replaces GRPO's arithmetic mean with the geometric mean of token-level rewards. GMPO is also theoretically plausible: analysis reveals that both GMPO and GRPO are weighted forms of the policy gradient, while the former enjoys more stable weights, which consequently benefits policy optimization and performance. Experiments on multiple mathematical reasoning benchmarks show that GMPO-7B improves the average Pass@1 of GRPO by up to 4.1%, outperforming many state-of-the-art approaches.
+
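+ For intuition, here is a minimal sketch of the objective in PyTorch. It is illustrative only (function name and tensor shapes are assumed, and the PPO-style min with the unclipped term is folded into a single log-space clamp for brevity); the actual implementation in verl is selected via `loss_mode=geo_mean`:
+
+ ```python
+ import math
+
+ import torch
+
+
+ def geo_mean_policy_loss(logp, old_logp, advantages, response_mask, clip_low=0.4, clip_high=0.4):
+     """Sketch of a geometric-mean surrogate: weight each sequence by (prod_t r_t)^(1/T).
+
+     Working in log space keeps the token-ratio product numerically stable and
+     makes the outlier damping explicit: one extreme token ratio only shifts the
+     mean of the log-ratios by ~1/T, instead of dominating an arithmetic mean.
+     """
+     log_ratio = logp - old_logp  # token-level log importance ratios, shape (B, T)
+     # clip in log space; bounds correspond to ratios in [1 - clip_low, 1 + clip_high]
+     log_ratio = log_ratio.clamp(math.log(1 - clip_low), math.log(1 + clip_high))
+     # masked mean over response tokens = log of the geometric mean of token ratios
+     seq_log_ratio = (log_ratio * response_mask).sum(-1) / response_mask.sum(-1).clamp(min=1)
+     geo_ratio = seq_log_ratio.exp()  # shape (B,)
+     return -(geo_ratio * advantages).mean()  # advantages: one scalar per sequence
+ ```
+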
+ ## 3. Code Usage
+
+ The key configurations are:
+ ```
+ clip_ratio_low=0.4
+ clip_ratio_high=0.4
+ loss_mode=geo_mean
+ ```
+ We observed that using a large clip ratio during Mixture-of-Experts (MoE) model training often leads to optimization instability. When training MoE models, consider lowering the clip ratio to achieve more stable convergence.
+ To get started quickly, run:
+ ```
+ bash examples/gmpo_trainer/run_qwen2_5-7b_math.sh
+ ```
+
+ GMPO can be combined with other methods such as DAPO (experimental - not fully tested):
+ ```
+ bash examples/gmpo_trainer/test_dapo_7b_math.sh
+ bash examples/gmpo_trainer/test_dapo_qwen3_30b_math.sh
+ ```
+
+ ## 4. Contacts
+ If you have any questions about our work or this repository, please don't hesitate to contact us by email or open an issue under this project.
+ - [zhaoyuzhong20@mails.ucas.ac.cn](mailto:zhaoyuzhong20@mails.ucas.ac.cn)
+ - [liuyue171@mails.ucas.ac.cn](mailto:liuyue171@mails.ucas.ac.cn)
+ - [lecu@microsoft.com](mailto:lecu@microsoft.com)
+ - [wanfang@ucas.ac.cn](mailto:wanfang@ucas.ac.cn)
+
+ ## 5. Citation
+ ```
+ @article{zhao2025geometric,
+   title={Geometric-mean policy optimization},
+   author={Zhao, Yuzhong and Liu, Yue and Liu, Junpeng and Chen, Jingye and Wu, Xun and Hao, Yaru and Lv, Tengchao and Huang, Shaohan and Cui, Lei and Ye, Qixiang and others},
+   journal={arXiv preprint arXiv:2507.20673},
+   year={2025}
+ }
+ ```
code/RL_model/verl/verl_train/examples/gmpo_trainer/run_qwen2_5-7b_math.sh ADDED
@@ -0,0 +1,60 @@
+ set -x
+
+ gsm8k_train_path=$HOME/data/gsm8k/train.parquet
+ gsm8k_test_path=$HOME/data/gsm8k/test.parquet
+ math_train_path=$HOME/data/math/train.parquet
+ math_test_path=$HOME/data/math/test.parquet
+
+ train_files="['$gsm8k_train_path', '$math_train_path']"
+ test_files="['$gsm8k_test_path', '$math_test_path']"
+
+ use_kl_loss=False
+ loss_mode=geo_mean
+ clip_ratio=0.4
+ # save_contents="['model', 'optimizer', 'extra']"  # alternative: save sharded model plus optimizer state
+
+ export WANDB_MODE=offline
+ save_contents="['hf_model']"
+
+ python3 -m verl.trainer.main_ppo \
+     algorithm.adv_estimator=grpo \
+     data.train_files="$train_files" \
+     data.val_files="$test_files" \
+     data.train_batch_size=1024 \
+     data.max_prompt_length=1024 \
+     data.max_response_length=1024 \
+     data.filter_overlong_prompts=True \
+     data.truncation='error' \
+     actor_rollout_ref.model.path=Qwen/Qwen2.5-Math-7B \
+     actor_rollout_ref.actor.optim.lr=1e-6 \
+     actor_rollout_ref.model.use_remove_padding=True \
+     actor_rollout_ref.actor.ppo_mini_batch_size=256 \
+     actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \
+     actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
+     actor_rollout_ref.actor.kl_loss_coef=0.001 \
+     actor_rollout_ref.actor.kl_loss_type=low_var_kl \
+     actor_rollout_ref.actor.policy_loss.loss_mode=${loss_mode} \
+     actor_rollout_ref.actor.clip_ratio_low=${clip_ratio} \
+     actor_rollout_ref.actor.clip_ratio_high=${clip_ratio} \
+     actor_rollout_ref.actor.entropy_coeff=0 \
+     actor_rollout_ref.model.enable_gradient_checkpointing=True \
+     actor_rollout_ref.actor.fsdp_config.param_offload=False \
+     actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
+     actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \
+     actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
+     actor_rollout_ref.rollout.name=vllm \
+     actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
+     actor_rollout_ref.rollout.n=5 \
+     actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \
+     actor_rollout_ref.ref.fsdp_config.param_offload=True \
+     actor_rollout_ref.actor.checkpoint.save_contents=${save_contents} \
+     algorithm.use_kl_in_reward=False \
+     trainer.critic_warmup=0 \
+     trainer.logger='["console","wandb"]' \
+     trainer.project_name='verl_gmpo_example_gsm8k_math' \
+     trainer.experiment_name='qwen2_5_7b_function_rm' \
+     trainer.n_gpus_per_node=8 \
+     trainer.nnodes=1 \
+     trainer.save_freq=20 \
+     trainer.test_freq=5 \
+     trainer.total_epochs=15 $@
code/RL_model/verl/verl_train/examples/gmpo_trainer/test_dapo_7b_math.sh ADDED
@@ -0,0 +1,138 @@
+ #!/usr/bin/env bash
+ set -xeuo pipefail
+
+ project_name='DAPO'
+ exp_name='DAPO-Qwen2.5-7b-MATH-0527a1'
+
+ adv_estimator=grpo
+
+ use_kl_in_reward=False
+ kl_coef=0.0
+ use_kl_loss=False
+ kl_loss_coef=0.0
+
+ clip_ratio_low=0.4
+ clip_ratio_high=0.4
+
+ max_prompt_length=$((1024 * 2))
+ max_response_length=$((1024 * 8))
+ enable_overlong_buffer=True
+ overlong_buffer_len=$((1024 * 4))
+ overlong_penalty_factor=1.0
+
+ loss_agg_mode="token-mean"
+
+ train_prompt_bsz=512
+ n_resp_per_prompt=16
+ train_prompt_mini_bsz=32
+
+ # Ray
+ # RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
+ # WORKING_DIR=${WORKING_DIR:-"${PWD}"}
+ # RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
+ NNODES=${NNODES:-8}
+ NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
+ # Paths
+ RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
+ # very important! please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface
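+ #   (i.e. edit the downloaded config.json so that it contains "max_position_embeddings": 32768)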
+ MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen2.5-Math-7B"}
+ CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}
+ TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"}
+ TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"}
+
+ # Algorithm
+ temperature=1.0
+ top_p=1.0
+ top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
+ val_top_p=0.7
+
+ # Performance Related Parameter
+ sp_size=4
+ use_dynamic_bsz=True
+ actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
+ infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
+ offload=True
+ gen_tp=4
+ fsdp_size=32
+
+ loss_mode=geo_mean
+
+ # export WANDB_MODE=offline
+ save_contents="['model', 'optimizer', 'extra']"
+ # save_contents="['hf_model']"
+
+ # reference run wandb: https://wandb.ai/verl-org/DAPO%20Reproduction%20on%20verl/runs/ow47vvon?nw=nwusertongyuxuan361
+
+ python3 -m verl.trainer.main_ppo \
+     data.train_files="${TRAIN_FILE}" \
+     data.val_files="${TEST_FILE}" \
+     data.prompt_key=prompt \
+     data.truncation='left' \
+     data.max_prompt_length=${max_prompt_length} \
+     data.max_response_length=${max_response_length} \
+     data.train_batch_size=${train_prompt_bsz} \
+     actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
+     algorithm.adv_estimator=${adv_estimator} \
+     algorithm.use_kl_in_reward=${use_kl_in_reward} \
+     algorithm.kl_ctrl.kl_coef=${kl_coef} \
+     actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
+     actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
+     actor_rollout_ref.actor.policy_loss.loss_mode=${loss_mode} \
+     actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
+     actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
+     actor_rollout_ref.model.use_remove_padding=True \
+     +actor_rollout_ref.model.override_config.max_position_embeddings=32768 \
+     actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
+     actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+     actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+     actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
+     actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+     actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+     actor_rollout_ref.rollout.name=vllm \
+     actor_rollout_ref.model.path="${MODEL_PATH}" \
+     actor_rollout_ref.model.enable_gradient_checkpointing=True \
+     actor_rollout_ref.actor.optim.lr=1e-6 \
+     actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
+     actor_rollout_ref.actor.optim.weight_decay=0.1 \
+     actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
+     actor_rollout_ref.actor.fsdp_config.param_offload=${offload} \
+     actor_rollout_ref.actor.fsdp_config.optimizer_offload=${offload} \
+     actor_rollout_ref.actor.entropy_coeff=0 \
+     actor_rollout_ref.actor.grad_clip=1.0 \
+     actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
+     actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \
+     actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
+     actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
+     actor_rollout_ref.rollout.enable_chunked_prefill=True \
+     actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
+     actor_rollout_ref.rollout.temperature=${temperature} \
+     actor_rollout_ref.rollout.top_p=${top_p} \
+     actor_rollout_ref.rollout.top_k=${top_k} \
+     actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
+     actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
+     actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
+     actor_rollout_ref.rollout.val_kwargs.do_sample=True \
+     actor_rollout_ref.rollout.val_kwargs.n=1 \
+     actor_rollout_ref.ref.fsdp_config.param_offload=${offload} \
+     actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \
+     actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \
+     actor_rollout_ref.actor.checkpoint.save_contents="${save_contents}" \
+     reward_model.reward_manager=dapo \
+     +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
+     +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
+     +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
+     +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
+     +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
+     trainer.logger='["console","wandb"]' \
+     trainer.project_name="${project_name}" \
+     trainer.experiment_name="${exp_name}" \
+     trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
+     trainer.nnodes="${NNODES}" \
+     trainer.val_before_train=True \
+     trainer.test_freq=10 \
+     trainer.save_freq=10 \
+     trainer.total_epochs=10 \
+     trainer.total_training_steps=200 \
+     trainer.default_local_dir="${CKPTS_DIR}" \
+     trainer.resume_mode=auto \
+     trainer.log_val_generations=10
code/RL_model/verl/verl_train/examples/gmpo_trainer/test_dapo_qwen3_30b_math.sh ADDED
@@ -0,0 +1,134 @@
+ #!/usr/bin/env bash
+ set -xeuo pipefail
+
+ project_name='DAPO'
+ exp_name='DAPO-Qwen3-30B-A3B-Base-MATH-0527a1'
+
+ adv_estimator=grpo
+
+ use_kl_in_reward=False
+ kl_coef=0.0
+ use_kl_loss=False
+ kl_loss_coef=0.0
+
+ clip_ratio_low=0.4
+ clip_ratio_high=0.4
+
+ max_prompt_length=$((1024 * 2))
+ max_response_length=$((1024 * 8))
+ enable_overlong_buffer=True
+ overlong_buffer_len=$((1024 * 4))
+ overlong_penalty_factor=1.0
+
+ loss_agg_mode="token-mean"
+
+ train_prompt_bsz=512
+ n_resp_per_prompt=16
+ train_prompt_mini_bsz=32
+
+ loss_mode=geo_mean
+
+ # export WANDB_MODE=offline
+ save_contents="['model', 'optimizer', 'extra']"
+ # save_contents="['hf_model']"
+
+ # Ray
+ # RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
+ # WORKING_DIR=${WORKING_DIR:-"${PWD}"}
+ # RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
+ NNODES=${NNODES:-8}
+ NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
+ # Paths
+ RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
+ MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen3-30B-A3B-Base"}
+ CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}
+ TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"}
+ TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"}
+
+ # Algorithm
+ temperature=1.0
+ top_p=1.0
+ top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
+ val_top_p=0.7
+
+ # Performance Related Parameter
+ sp_size=4
+ use_dynamic_bsz=True
+ actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
+ infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
+ offload=True
+ gen_tp=4
+ fsdp_size=32
+
+ python3 -m verl.trainer.main_ppo \
+     data.train_files="${TRAIN_FILE}" \
+     data.val_files="${TEST_FILE}" \
+     data.prompt_key=prompt \
+     data.truncation='left' \
+     data.max_prompt_length=${max_prompt_length} \
+     data.max_response_length=${max_response_length} \
+     data.train_batch_size=${train_prompt_bsz} \
+     actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
+     algorithm.adv_estimator=${adv_estimator} \
+     algorithm.use_kl_in_reward=${use_kl_in_reward} \
+     algorithm.kl_ctrl.kl_coef=${kl_coef} \
+     actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
+     actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
+     actor_rollout_ref.actor.policy_loss.loss_mode=${loss_mode} \
+     actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
+     actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
+     actor_rollout_ref.model.use_remove_padding=True \
+     actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
+     actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+     actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+     actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
+     actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+     actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
+     actor_rollout_ref.model.path="${MODEL_PATH}" \
+     actor_rollout_ref.model.enable_gradient_checkpointing=True \
+     actor_rollout_ref.actor.optim.lr=1e-6 \
+     actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
+     actor_rollout_ref.actor.optim.weight_decay=0.1 \
+     actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
+     actor_rollout_ref.actor.fsdp_config.param_offload=${offload} \
+     actor_rollout_ref.actor.fsdp_config.optimizer_offload=${offload} \
+     actor_rollout_ref.actor.entropy_coeff=0 \
+     actor_rollout_ref.actor.grad_clip=1.0 \
+     actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
+     actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \
+     actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
+     actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
+     actor_rollout_ref.rollout.enable_chunked_prefill=True \
+     actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
+     actor_rollout_ref.rollout.temperature=${temperature} \
+     actor_rollout_ref.rollout.top_p=${top_p} \
+     actor_rollout_ref.rollout.top_k=${top_k} \
+     actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
+     actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
+     actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
+     actor_rollout_ref.rollout.val_kwargs.do_sample=True \
+     actor_rollout_ref.rollout.val_kwargs.n=1 \
+     actor_rollout_ref.rollout.name=vllm \
+     actor_rollout_ref.ref.fsdp_config.param_offload=${offload} \
+     actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \
+     actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \
+     actor_rollout_ref.actor.checkpoint.save_contents="${save_contents}" \
+     reward_model.reward_manager=dapo \
+     +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
+     +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
+     +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
+     +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
+     +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
+     trainer.logger='["console","wandb"]' \
+     trainer.project_name="${project_name}" \
+     trainer.experiment_name="${exp_name}" \
+     trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
+     trainer.nnodes="${NNODES}" \
+     trainer.val_before_train=True \
+     trainer.test_freq=10 \
+     trainer.save_freq=10 \
+     trainer.total_epochs=10 \
+     trainer.total_training_steps=300 \
+     trainer.default_local_dir="${CKPTS_DIR}" \
+     trainer.resume_mode=auto \
+     trainer.log_val_generations=10
code/RL_model/verl/verl_train/examples/grpo_trainer/run_deepseek7b_llm_math_megatron.sh ADDED
@@ -0,0 +1,50 @@
+ set -x
+
+ export CUDA_DEVICE_MAX_CONNECTIONS=1 # For megatron communication/computation overlapping
+
+ gsm8k_train_path=$HOME/data/gsm8k/train.parquet
+ gsm8k_test_path=$HOME/data/gsm8k/test.parquet
+ math_train_path=$HOME/data/math/train.parquet
+ math_test_path=$HOME/data/math/test.parquet
+
+ train_files="['$gsm8k_train_path', '$math_train_path']"
+ test_files="['$gsm8k_test_path', '$math_test_path']"
+
+ python3 -m verl.trainer.main_ppo --config-path=config \
+     --config-name='ppo_megatron_trainer.yaml' \
+     algorithm.adv_estimator=grpo \
+     data.train_files="$train_files" \
+     data.val_files="$test_files" \
+     data.train_batch_size=1024 \
+     data.max_prompt_length=1024 \
+     data.max_response_length=1024 \
+     data.filter_overlong_prompts=True \
+     data.truncation='error' \
+     actor_rollout_ref.model.path=deepseek-ai/deepseek-llm-7b-chat \
+     actor_rollout_ref.actor.optim.lr=1e-6 \
+     actor_rollout_ref.actor.ppo_mini_batch_size=256 \
+     actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
+     actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=2 \
+     actor_rollout_ref.actor.megatron.tensor_model_parallel_size=2 \
+     actor_rollout_ref.actor.use_kl_loss=True \
+     actor_rollout_ref.actor.kl_loss_coef=0.001 \
+     actor_rollout_ref.actor.kl_loss_type=low_var_kl \
+     actor_rollout_ref.actor.entropy_coeff=0 \
+     actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
+     actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
+     actor_rollout_ref.rollout.name=vllm \
+     actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
+     actor_rollout_ref.rollout.n=5 \
+     actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
+     actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=2 \
+     actor_rollout_ref.ref.megatron.tensor_model_parallel_size=2 \
+     algorithm.use_kl_in_reward=False \
+     trainer.critic_warmup=0 \
+     trainer.logger='["console","wandb"]' \
+     trainer.project_name='verl_grpo_example_gsm8k_math' \
+     trainer.experiment_name='deepseek_llm_7b_math_megatron' \
+     trainer.n_gpus_per_node=16 \
+     trainer.nnodes=1 \
+     trainer.save_freq=20 \
+     trainer.test_freq=5 \
+     trainer.total_epochs=15 $@
code/RL_model/verl/verl_train/examples/grpo_trainer/run_glm41v_9b.sh ADDED
@@ -0,0 +1,46 @@
+ set -x
+ ENGINE=${1:-vllm}
+
+ python3 -m verl.trainer.main_ppo \
+     algorithm.adv_estimator=grpo \
+     data.train_files=$HOME/data/geo3k/train.parquet \
+     data.val_files=$HOME/data/geo3k/test.parquet \
+     data.train_batch_size=512 \
+     data.max_prompt_length=1024 \
+     data.max_response_length=2048 \
+     data.filter_overlong_prompts=True \
+     data.truncation='error' \
+     data.image_key=images \
+     actor_rollout_ref.model.path=zai-org/GLM-4.1V-9B-Thinking \
+     actor_rollout_ref.actor.optim.lr=1e-6 \
+     actor_rollout_ref.model.use_remove_padding=True \
+     actor_rollout_ref.actor.ppo_mini_batch_size=128 \
+     actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=10 \
+     actor_rollout_ref.actor.use_kl_loss=True \
+     actor_rollout_ref.actor.kl_loss_coef=0.01 \
+     actor_rollout_ref.actor.kl_loss_type=low_var_kl \
+     actor_rollout_ref.actor.entropy_coeff=0 \
+     actor_rollout_ref.model.enable_gradient_checkpointing=True \
+     actor_rollout_ref.actor.fsdp_config.param_offload=False \
+     actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
+     actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=20 \
+     actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
+     actor_rollout_ref.rollout.name=$ENGINE \
+     +actor_rollout_ref.rollout.engine_kwargs.vllm.disable_mm_preprocessor_cache=True \
+     actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
+     actor_rollout_ref.rollout.enable_chunked_prefill=False \
+     actor_rollout_ref.rollout.enforce_eager=False \
+     actor_rollout_ref.rollout.free_cache_engine=True \
+     actor_rollout_ref.rollout.n=5 \
+     actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=20 \
+     actor_rollout_ref.ref.fsdp_config.param_offload=True \
+     algorithm.use_kl_in_reward=False \
+     trainer.critic_warmup=0 \
+     trainer.logger='["console","wandb"]' \
+     trainer.project_name='verl_grpo_example_geo3k' \
+     trainer.experiment_name='glm41v_9b_function_rm' \
+     trainer.n_gpus_per_node=8 \
+     trainer.nnodes=1 \
+     trainer.save_freq=20 \
+     trainer.test_freq=5 \
+     trainer.total_epochs=15 $@
code/RL_model/verl/verl_train/examples/grpo_trainer/run_minicpmo2_6.sh ADDED
@@ -0,0 +1,49 @@
+ set -x
+
+ python3 -m verl.trainer.main_ppo \
+     algorithm.adv_estimator=grpo \
+     data.train_files=$HOME/data/geo3k/train.parquet \
+     data.val_files=$HOME/data/geo3k/test.parquet \
+     data.train_batch_size=128 \
+     data.max_prompt_length=1024 \
+     data.max_response_length=2048 \
+     data.filter_overlong_prompts=False \
+     data.truncation='error' \
+     data.image_key=images \
+     data.trust_remote_code=True \
+     data.custom_cls.path=recipe/minicpmo/rl_dataset.py \
+     data.custom_cls.name=RLHFDataset \
+     actor_rollout_ref.model.path=openbmb/MiniCPM-o-2_6 \
+     actor_rollout_ref.model.trust_remote_code=True \
+     actor_rollout_ref.model.use_remove_padding=True \
+     actor_rollout_ref.model.enable_gradient_checkpointing=True \
+     actor_rollout_ref.actor.optim.lr=1e-6 \
+     actor_rollout_ref.actor.ppo_mini_batch_size=32 \
+     actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \
+     actor_rollout_ref.actor.use_kl_loss=True \
+     actor_rollout_ref.actor.use_dynamic_bsz=False \
+     actor_rollout_ref.actor.kl_loss_coef=0.001 \
+     actor_rollout_ref.actor.kl_loss_type=low_var_kl \
+     actor_rollout_ref.actor.fsdp_config.param_offload=False \
+     actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
+     actor_rollout_ref.actor.fsdp_config.use_orig_params=True \
+     actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \
+     actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
+     actor_rollout_ref.rollout.name=vllm \
+     actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
+     actor_rollout_ref.rollout.enable_chunked_prefill=False \
+     actor_rollout_ref.rollout.enforce_eager=False \
+     actor_rollout_ref.rollout.free_cache_engine=False \
+     actor_rollout_ref.rollout.n=8 \
+     actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \
+     actor_rollout_ref.ref.fsdp_config.param_offload=True \
+     algorithm.kl_ctrl.kl_coef=0.001 \
+     trainer.critic_warmup=0 \
+     trainer.logger='["console","wandb"]' \
+     trainer.project_name='verl_grpo_example_geo3k' \
+     trainer.experiment_name='minicpmo2_6_function_rm' \
+     trainer.n_gpus_per_node=8 \
+     trainer.nnodes=1 \
+     trainer.save_freq=-1 \
+     trainer.test_freq=5 \
+     trainer.total_epochs=15 $@
code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2-7b_sgl_megatron.sh ADDED
@@ -0,0 +1,47 @@
+ set -x
+
+ gsm8k_train_path=$HOME/data/gsm8k/train.parquet
+ gsm8k_test_path=$HOME/data/gsm8k/test.parquet
+ math_train_path=$HOME/data/math/train.parquet
+ math_test_path=$HOME/data/math/test.parquet
+
+ train_files="['$gsm8k_train_path', '$math_train_path']"
+ test_files="['$gsm8k_test_path', '$math_test_path']"
+
+ python3 -m verl.trainer.main_ppo --config-path=config \
+     --config-name='ppo_megatron_trainer.yaml' \
+     algorithm.adv_estimator=grpo \
+     data.train_files="$train_files" \
+     data.val_files="$test_files" \
+     data.train_batch_size=1024 \
+     data.max_prompt_length=1024 \
+     data.max_response_length=1024 \
+     data.filter_overlong_prompts=True \
+     data.truncation='error' \
+     actor_rollout_ref.model.path=Qwen/Qwen2-7B-Instruct \
+     actor_rollout_ref.actor.optim.lr=1e-6 \
+     actor_rollout_ref.actor.ppo_mini_batch_size=256 \
+     actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
+     actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=2 \
+     actor_rollout_ref.actor.megatron.virtual_pipeline_model_parallel_size=2 \
+     actor_rollout_ref.actor.megatron.tensor_model_parallel_size=4 \
+     actor_rollout_ref.actor.use_kl_loss=True \
+     actor_rollout_ref.actor.kl_loss_coef=0.001 \
+     actor_rollout_ref.actor.kl_loss_type=low_var_kl \
+     actor_rollout_ref.actor.entropy_coeff=0 \
+     actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
+     actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
+     actor_rollout_ref.rollout.name=sglang \
+     actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
+     actor_rollout_ref.rollout.n=5 \
+     actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
+     algorithm.use_kl_in_reward=False \
+     trainer.critic_warmup=0 \
+     trainer.logger='["console","wandb"]' \
+     trainer.project_name='verl_grpo_example_gsm8k' \
+     trainer.experiment_name='qwen2_7b_function_rm_megatron' \
+     trainer.n_gpus_per_node=8 \
+     trainer.nnodes=1 \
+     trainer.save_freq=-1 \
+     trainer.test_freq=5 \
+     trainer.total_epochs=15 $@
code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2_5-32b_grpo_megatron_vllm_npu.sh ADDED
@@ -0,0 +1,185 @@
+ #!/bin/bash
+ set -xeuo pipefail
+ mkdir -p logs
+
+ # Project Configuration
+ project_name='GRPO-Qwen2.5-32B-BASE-MATH'
+ exp_name='GRPO-Qwen2.5-32B-BASE-Megatron-vLLM'
+
+ # Node Info
+ NNODES=${NNODES:-1}
+ NPUS_PER_NODE=${NPUS_PER_NODE:-16}
+
+ # Model Weights Paths
+ MODEL_PATH=Qwen/Qwen2.5-32B
+ MCORE_MODEL_PATH=Qwen/Qwen2.5-32B-dist
+ RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
+ CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}
+
+ # File System Paths
+ TRAIN_FILE=$RAY_DATA_HOME/dataset/gsm8k/train.parquet
+ TEST_FILE=$RAY_DATA_HOME/dataset/gsm8k/test.parquet
+
+ # Data Configuration
+ max_prompt_length=$((1024 * 1))
+ max_response_length=$((1024 * 1))
+
+ # Training Batch Configuration
+ train_prompt_bsz=128
+ train_prompt_mini_bsz=32
+ n_resp_per_prompt=16
+
+ # Algorithm Configuration
+ adv_estimator=grpo
+ use_kl_in_reward=False
+ kl_coef=0.0
+ use_kl_loss=True
+ kl_loss_coef=0.001
+
+ # Performance and Memory Management Configuration
+ all_offload=True
+ use_dynamic_bsz=True
+ actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
+ infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 8))
+ optimizer_offload_fraction=1
+
+ # Megatron Configuration
+ train_tp=4
+ train_ep=1
+ train_etp=1
+ train_pp=4
+ train_cp=1
+
+ # vLLM Configuration
+ gen_tp=2
+ gen_dp=1
+ gen_ep=1
+ gpu_memory_utilization=0.8
+ max_model_len=$((max_prompt_length + max_response_length))
+ max_num_batched_tokens=$(((max_prompt_length + max_response_length) * 1))
+
+ # Data Configuration
+ DATA_CONFIG=(
+     data.train_files="${TRAIN_FILE}"
+     data.val_files="${TEST_FILE}"
+     data.prompt_key=prompt
+     data.train_batch_size=${train_prompt_bsz}
+     data.max_prompt_length=${max_prompt_length}
+     data.max_response_length=${max_response_length}
+     data.filter_overlong_prompts=False
+     data.truncation='left'
+ )
+
+ # Model Configuration
+ MODEL_CONFIG=(
+     actor_rollout_ref.model.path="${MODEL_PATH}"
+     actor_rollout_ref.model.use_remove_padding=True
+ )
+
+ # Algorithm Configuration
+ ALGORITHM_CONFIG=(
+     algorithm.adv_estimator=${adv_estimator}
+     algorithm.use_kl_in_reward=${use_kl_in_reward}
+     algorithm.kl_ctrl.kl_coef=${kl_coef}
+ )
+
+ # Actor Model Configuration
+ ACTOR_CONFIG=(
+     actor_rollout_ref.actor.use_torch_compile=False
+     actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz}
+     actor_rollout_ref.actor.use_kl_loss=${use_kl_loss}
+     actor_rollout_ref.actor.kl_loss_type=low_var_kl
+     actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef}
+     actor_rollout_ref.actor.entropy_coeff=0
+     actor_rollout_ref.actor.ppo_epochs=1
+     actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1
+     actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len}
+     actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz}
+     # actor_rollout_ref.actor.kl_loss_type=low_var_kl  # duplicate of the entry above, kept commented out
+     actor_rollout_ref.actor.optim.lr=1e-6
+     +actor_rollout_ref.actor.optim.override_optimizer_config.optimizer_offload_fraction=${optimizer_offload_fraction}
+     +actor_rollout_ref.actor.optim.override_optimizer_config.use_precision_aware_optimizer=True
+     +actor_rollout_ref.actor.optim.override_optimizer_config.optimizer_cpu_offload=True
+     actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${train_tp}
+     actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${train_pp}
+     actor_rollout_ref.actor.megatron.context_parallel_size=${train_cp}
+     actor_rollout_ref.actor.megatron.expert_model_parallel_size=${train_ep}
+     actor_rollout_ref.actor.megatron.expert_tensor_parallel_size=${train_etp}
+     actor_rollout_ref.actor.megatron.param_offload=${all_offload}
+     actor_rollout_ref.actor.megatron.optimizer_offload=${all_offload}
+     actor_rollout_ref.actor.megatron.grad_offload=${all_offload}
+     actor_rollout_ref.actor.megatron.dist_checkpointing_path=${MCORE_MODEL_PATH}
+     actor_rollout_ref.actor.megatron.use_dist_checkpointing=False
+     +actor_rollout_ref.actor.megatron.override_transformer_config.use_flash_attn=True
+ )
+
+ # Reference Model Configuration
+ REF_CONFIG=(
+     actor_rollout_ref.ref.use_torch_compile=False
+     actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1
+     actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz}
+     actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len}
+     actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${train_tp}
+     actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=${train_pp}
+     actor_rollout_ref.ref.megatron.context_parallel_size=${train_cp}
+     actor_rollout_ref.ref.megatron.expert_model_parallel_size=${train_ep}
+     actor_rollout_ref.ref.megatron.expert_tensor_parallel_size=${train_etp}
+     actor_rollout_ref.ref.megatron.param_offload=${all_offload}
+     actor_rollout_ref.ref.megatron.dist_checkpointing_path=${MCORE_MODEL_PATH}
+     actor_rollout_ref.ref.megatron.use_dist_checkpointing=False
+ )
+
+ # Rollout Configuration
+ ROLLOUT_CONFIG=(
+     actor_rollout_ref.rollout.name=vllm
+     actor_rollout_ref.rollout.n=${n_resp_per_prompt}
+     actor_rollout_ref.rollout.top_p=1.0
+     actor_rollout_ref.rollout.top_k=-1
+     actor_rollout_ref.rollout.temperature=1.0
+     actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1
+     actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz}
+     actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len}
+     actor_rollout_ref.rollout.gpu_memory_utilization=${gpu_memory_utilization}
+     actor_rollout_ref.rollout.max_num_batched_tokens=${max_num_batched_tokens}
+     actor_rollout_ref.rollout.max_model_len=${max_model_len}
+     actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp}
+     actor_rollout_ref.rollout.data_parallel_size=${gen_dp}
+     actor_rollout_ref.rollout.expert_parallel_size=${gen_ep}
+     actor_rollout_ref.rollout.enable_chunked_prefill=True
+     actor_rollout_ref.rollout.enable_prefix_caching=True
+     actor_rollout_ref.rollout.enforce_eager=True
+     actor_rollout_ref.rollout.free_cache_engine=True
+     actor_rollout_ref.rollout.val_kwargs.n=1
+     actor_rollout_ref.rollout.val_kwargs.do_sample=True
+     actor_rollout_ref.rollout.val_kwargs.top_p=1.0
+     actor_rollout_ref.rollout.val_kwargs.top_k=-1
+     actor_rollout_ref.rollout.val_kwargs.temperature=1.0
+ )
+
+ # Trainer Configuration
+ TRAINER_CONFIG=(
+     trainer.logger='["console","tensorboard"]'
+     trainer.project_name="${project_name}"
+     trainer.experiment_name="${exp_name}"
+     trainer.nnodes="${NNODES}"
+     trainer.n_gpus_per_node="${NPUS_PER_NODE}"
+     trainer.device='npu'
+     trainer.total_epochs=15
+     trainer.val_before_train=False
+     trainer.test_freq=-1
+     trainer.save_freq=-1
+     trainer.default_local_dir="${CKPTS_DIR}"
+ )
+
+ # Main GRPO Training Command
+ python3 -m verl.trainer.main_ppo \
+     --config-path=config \
+     --config-name='ppo_megatron_trainer.yaml' \
+     "${DATA_CONFIG[@]}" \
+     "${MODEL_CONFIG[@]}" \
+     "${ACTOR_CONFIG[@]}" \
+     "${REF_CONFIG[@]}" \
+     "${ROLLOUT_CONFIG[@]}" \
+     "${ALGORITHM_CONFIG[@]}" \
+     "${TRAINER_CONFIG[@]}" \
+     "$@" | tee logs/run_qwen2_5-32b_grpo_megatron_vllm_npu.log
code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2_5-3b_gsm8k_grpo_lora_from_adapter.sh ADDED
@@ -0,0 +1,47 @@
+ set -x
+
+ lora_adapter_path=${lora_adapter_path:-/path/saved/lora_adapter}
+
+ python3 -m verl.trainer.main_ppo \
+     algorithm.adv_estimator=grpo \
+     data.train_files=$HOME/data/gsm8k/train.parquet \
+     data.val_files=$HOME/data/gsm8k/test.parquet \
+     data.train_batch_size=1024 \
+     data.max_prompt_length=512 \
+     data.max_response_length=1024 \
+     data.filter_overlong_prompts=True \
+     data.truncation='error' \
+     data.shuffle=False \
+     actor_rollout_ref.model.path=Qwen/Qwen2.5-3B-Instruct \
+     actor_rollout_ref.model.use_shm=True \
+     actor_rollout_ref.model.lora_adapter_path=${lora_adapter_path} \
+     actor_rollout_ref.actor.optim.lr=3e-6 \
+     actor_rollout_ref.model.use_remove_padding=True \
+     actor_rollout_ref.actor.ppo_mini_batch_size=256 \
+     actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=40 \
+     actor_rollout_ref.actor.use_kl_loss=True \
+     actor_rollout_ref.actor.kl_loss_coef=0.001 \
+     actor_rollout_ref.actor.kl_loss_type=low_var_kl \
+     actor_rollout_ref.actor.entropy_coeff=0 \
+     actor_rollout_ref.model.enable_gradient_checkpointing=True \
+     actor_rollout_ref.actor.fsdp_config.param_offload=False \
+     actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
+     actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=40 \
+     actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
+     actor_rollout_ref.rollout.name=vllm \
+     actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
+     actor_rollout_ref.rollout.n=5 \
+     actor_rollout_ref.rollout.load_format=safetensors \
+     actor_rollout_ref.rollout.layered_summon=True \
+     actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=40 \
+     actor_rollout_ref.ref.fsdp_config.param_offload=True \
+     algorithm.use_kl_in_reward=False \
+     trainer.critic_warmup=0 \
+     trainer.logger='["console","wandb"]' \
+     trainer.project_name='verl_grpo_example_gsm8k' \
+     trainer.experiment_name='qwen2.5_3b_grpo_lora' \
+     trainer.n_gpus_per_node=8 \
+     trainer.nnodes=1 \
+     trainer.save_freq=20 \
+     trainer.test_freq=5 \
+     trainer.total_epochs=15 $@
code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2_5_7b_grpo_e2e_prof_npu.sh ADDED
@@ -0,0 +1,68 @@
+ set -x
+
+ # profiling configuration
+ PROFILE_STEPS="[2,4]"
+ PROFILE_RANKS_ALL=True
+ DISCRETE=False
+
+ # profiling NPU options
+ SAVE_PATH="$HOME/profile_data"
+ LEVEL="level0"
+ CONTENTS=['npu','cpu']
+ ANALYSIS=True
+
+ python3 -m verl.trainer.main_ppo \
+     algorithm.adv_estimator=grpo \
+     data.train_files=$HOME/data/gsm8k/train.parquet \
+     data.val_files=$HOME/data/gsm8k/test.parquet \
+     data.train_batch_size=32 \
+     data.max_prompt_length=1024 \
+     data.max_response_length=1024 \
+     data.filter_overlong_prompts=True \
+     data.truncation='error' \
+     actor_rollout_ref.model.path=Qwen/Qwen2.5-7B-Instruct \
+     actor_rollout_ref.actor.optim.lr=5e-8 \
+     actor_rollout_ref.model.use_remove_padding=False \
+     actor_rollout_ref.model.enable_gradient_checkpointing=True \
+     actor_rollout_ref.actor.ppo_mini_batch_size=2 \
+     actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \
+     actor_rollout_ref.actor.use_kl_loss=True \
+     actor_rollout_ref.actor.entropy_coeff=0 \
+     actor_rollout_ref.actor.kl_loss_coef=0.001 \
+     actor_rollout_ref.actor.kl_loss_type=low_var_kl \
+     actor_rollout_ref.actor.profiler.enable=True \
+     actor_rollout_ref.actor.profiler.all_ranks=$PROFILE_RANKS_ALL \
+     actor_rollout_ref.actor.profiler.tool_config.npu.discrete=$DISCRETE \
+     actor_rollout_ref.actor.profiler.tool_config.npu.contents=$CONTENTS \
+     actor_rollout_ref.actor.profiler.tool_config.npu.level=$LEVEL \
+     actor_rollout_ref.actor.profiler.tool_config.npu.analysis=$ANALYSIS \
+     actor_rollout_ref.actor.fsdp_config.param_offload=False \
+     actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
+     actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \
+     actor_rollout_ref.rollout.tensor_model_parallel_size=4 \
+     actor_rollout_ref.rollout.name=vllm \
+     actor_rollout_ref.rollout.gpu_memory_utilization=0.3 \
+     actor_rollout_ref.rollout.n=4 \
+     actor_rollout_ref.rollout.enable_chunked_prefill=False \
+     actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \
+     actor_rollout_ref.ref.fsdp_config.param_offload=True \
+     actor_rollout_ref.ref.profiler.enable=True \
+     actor_rollout_ref.ref.profiler.all_ranks=$PROFILE_RANKS_ALL \
+     actor_rollout_ref.ref.profiler.tool_config.npu.discrete=$DISCRETE \
+     actor_rollout_ref.ref.profiler.tool_config.npu.contents=$CONTENTS \
+     actor_rollout_ref.ref.profiler.tool_config.npu.level=$LEVEL \
+     actor_rollout_ref.ref.profiler.tool_config.npu.analysis=$ANALYSIS \
+     algorithm.use_kl_in_reward=False \
+     trainer.critic_warmup=0 \
+     trainer.logger=console \
+     trainer.project_name='verl_grpo_example_gsm8k' \
+     trainer.experiment_name='qwen2_5_7b_function_rm' \
+     trainer.n_gpus_per_node=8 \
+     trainer.nnodes=1 \
+     trainer.save_freq=-1 \
+     trainer.test_freq=5 \
+     trainer.total_epochs=5 \
+     global_profiler.tool=npu \
+     global_profiler.steps=$PROFILE_STEPS \
+     global_profiler.save_path=$SAVE_PATH \
+     $@
code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen2_5_vl-7b.sh ADDED
@@ -0,0 +1,47 @@
+ set -x
+ ENGINE=${1:-vllm}
+
+ python3 -m verl.trainer.main_ppo \
+     algorithm.adv_estimator=grpo \
+     data.train_files=$HOME/data/geo3k/train.parquet \
+     data.val_files=$HOME/data/geo3k/test.parquet \
+     data.train_batch_size=512 \
+     data.max_prompt_length=1024 \
+     data.max_response_length=2048 \
+     data.filter_overlong_prompts=True \
+     data.truncation='error' \
+     data.image_key=images \
+     actor_rollout_ref.model.path=Qwen/Qwen2.5-VL-7B-Instruct \
+     actor_rollout_ref.actor.optim.lr=1e-6 \
+     actor_rollout_ref.model.use_remove_padding=True \
+     actor_rollout_ref.model.use_fused_kernels=True \
+     actor_rollout_ref.actor.ppo_mini_batch_size=128 \
+     actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=10 \
+     actor_rollout_ref.actor.use_kl_loss=True \
+     actor_rollout_ref.actor.kl_loss_coef=0.01 \
+     actor_rollout_ref.actor.kl_loss_type=low_var_kl \
+     actor_rollout_ref.actor.entropy_coeff=0 \
+     actor_rollout_ref.model.enable_gradient_checkpointing=True \
+     actor_rollout_ref.actor.fsdp_config.param_offload=False \
+     actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
+     actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=20 \
+     actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
+     actor_rollout_ref.rollout.name=$ENGINE \
+     +actor_rollout_ref.rollout.engine_kwargs.vllm.disable_mm_preprocessor_cache=True \
+     actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
+     actor_rollout_ref.rollout.enable_chunked_prefill=False \
+     actor_rollout_ref.rollout.enforce_eager=False \
+     actor_rollout_ref.rollout.free_cache_engine=True \
+     actor_rollout_ref.rollout.n=5 \
+     actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=20 \
+     actor_rollout_ref.ref.fsdp_config.param_offload=True \
+     algorithm.use_kl_in_reward=False \
+     trainer.critic_warmup=0 \
+     trainer.logger='["console","wandb"]' \
+     trainer.project_name='verl_grpo_example_geo3k' \
+     trainer.experiment_name='qwen2_5_vl_7b_function_rm' \
+     trainer.n_gpus_per_node=8 \
+     trainer.nnodes=1 \
+     trainer.save_freq=20 \
+     trainer.test_freq=5 \
+     trainer.total_epochs=15 $@
code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen3-32b_npu.sh ADDED
@@ -0,0 +1,58 @@
1
+ set -x
2
+
3
+ project_name='GRPO-Qwen3'
4
+ exp_name='GRPO-Qwen3-32b-npu'
5
+ gen_tp=4
6
+ RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
7
+ MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen3-32B"}
8
+ TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/gsm8k/train.parquet"}
9
+ TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/gsm8k/test.parquet"}
10
+
11
+ python3 -m verl.trainer.main_ppo \
12
+ algorithm.adv_estimator=grpo \
13
+ data.train_files="${TRAIN_FILE}" \
14
+ data.val_files="${TEST_FILE}" \
15
+ data.train_batch_size=1024 \
16
+ data.max_prompt_length=2048 \
17
+ data.max_response_length=2048 \
18
+ data.filter_overlong_prompts=True \
19
+ data.truncation='error' \
20
+ data.shuffle=False \
21
+ actor_rollout_ref.model.path=${MODEL_PATH} \
22
+ actor_rollout_ref.actor.optim.lr=1e-6 \
23
+ actor_rollout_ref.model.use_remove_padding=True \
24
+ actor_rollout_ref.actor.ulysses_sequence_parallel_size=4 \
25
+ +actor_rollout_ref.actor.fsdp_config.mixed_precision.param_dtype=bf16 \
26
+ +actor_rollout_ref.actor.fsdp_config.mixed_precision.reduce_dtype=bf16 \
27
+ +actor_rollout_ref.actor.fsdp_config.mixed_precision.buffer_dtype=fp32 \
28
+ actor_rollout_ref.actor.ppo_mini_batch_size=64 \
29
+ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=8 \
30
+ actor_rollout_ref.actor.use_kl_loss=True \
31
+ actor_rollout_ref.actor.entropy_coeff=0 \
32
+ actor_rollout_ref.actor.kl_loss_coef=0.001 \
33
+ actor_rollout_ref.actor.kl_loss_type=low_var_kl \
34
+ actor_rollout_ref.model.enable_gradient_checkpointing=True \
35
+ actor_rollout_ref.actor.fsdp_config.param_offload=True \
36
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
37
+ actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8 \
38
+ actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
39
+ actor_rollout_ref.rollout.name=vllm \
40
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.7 \
41
+ actor_rollout_ref.rollout.n=4 \
42
+ actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=8 \
43
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
44
+ actor_rollout_ref.actor.use_torch_compile=False \
45
+ actor_rollout_ref.ref.use_torch_compile=False \
46
+ actor_rollout_ref.rollout.enable_chunked_prefill=True \
47
+ actor_rollout_ref.rollout.max_num_batched_tokens=32768 \
48
+ algorithm.use_kl_in_reward=False \
49
+ trainer.critic_warmup=0 \
50
+ trainer.logger=['console','tensorboard'] \
51
+ trainer.project_name="${project_name}" \
52
+ trainer.experiment_name="${exp_name}" \
53
+ trainer.n_gpus_per_node=8 \
54
+ trainer.nnodes=4 \
55
+ trainer.resume_from_path=checkpoints/ \
56
+ trainer.save_freq=500 \
57
+ trainer.test_freq=50 \
58
+ trainer.total_epochs=50 $@
code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen3-8b.sh ADDED
@@ -0,0 +1,43 @@
1
+ # Tested successfully on the hiyouga/verl:ngc-th2.6.0-cu126-vllm0.8.4-flashinfer0.2.2-cxx11abi0 image.
2
+ # It outperforms the Qwen2 7B base model by two percentage points on the test set of GSM8K.
3
+
4
+ set -x
5
+
6
+ python3 -m verl.trainer.main_ppo \
7
+ algorithm.adv_estimator=grpo \
8
+ data.train_files=$HOME/data/gsm8k/train.parquet \
9
+ data.val_files=$HOME/data/gsm8k/test.parquet \
10
+ data.train_batch_size=1024 \
11
+ data.max_prompt_length=512 \
12
+ data.max_response_length=1024 \
13
+ data.filter_overlong_prompts=True \
14
+ data.truncation='error' \
15
+ actor_rollout_ref.model.path=Qwen/Qwen3-8B \
16
+ actor_rollout_ref.actor.optim.lr=1e-6 \
17
+ actor_rollout_ref.model.use_remove_padding=True \
18
+ actor_rollout_ref.actor.ppo_mini_batch_size=256 \
19
+ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 \
20
+ actor_rollout_ref.actor.use_kl_loss=True \
21
+ actor_rollout_ref.actor.kl_loss_coef=0.001 \
22
+ actor_rollout_ref.actor.kl_loss_type=low_var_kl \
23
+ actor_rollout_ref.actor.entropy_coeff=0 \
24
+ actor_rollout_ref.model.enable_gradient_checkpointing=True \
25
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
26
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
27
+ actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 \
28
+ actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
29
+ actor_rollout_ref.rollout.name=vllm \
30
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
31
+ actor_rollout_ref.rollout.n=5 \
32
+ actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 \
33
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
34
+ algorithm.use_kl_in_reward=False \
35
+ trainer.critic_warmup=0 \
36
+ trainer.logger='["console","wandb"]' \
37
+ trainer.project_name='verl_grpo_example_gsm8k' \
38
+ trainer.experiment_name='qwen3_8b_function_rm' \
39
+ trainer.n_gpus_per_node=8 \
40
+ trainer.nnodes=1 \
41
+ trainer.save_freq=20 \
42
+ trainer.test_freq=5 \
43
+ trainer.total_epochs=15 $@
code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen3_8b_grpo_sglang_1k_spmd_npu.sh ADDED
@@ -0,0 +1,71 @@
1
+ set -x
2
+ export HCCL_CONNECT_TIMEOUT=1500
3
+ export HCCL_HOST_SOCKET_PORT_RANGE=60000-60050
4
+ export HCCL_NPU_SOCKET_PORT_RANGE=61000-61050
5
+ export RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES=1
6
+ export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3
7
+ # WORKSPACE_HOME and DATA_HOME support custom path configuration.
8
+ WORKSPACE_HOME=$PWD
9
+ DATA_HOME=$PWD
10
+
11
+ sp_size=4
12
+ num_npu=4
13
+ tp_size=4
14
+ train_prompt_bsz=16
15
+ train_prompt_mini_bsz=16
16
+
17
+ max_prompt_length=512
18
+ max_response_length=1024
19
+
20
+ CKPTS_DIR=$WORKSPACE_HOME/logs/ckpt/qwen3_8b
21
+ model_path=$DATA_HOME/models/Qwen3-8B
22
+ train_data=$DATA_HOME/datasets/processed_gsm8k/train.parquet
23
+ valid_data=$DATA_HOME/datasets/processed_gsm8k/test.parquet
24
+
25
+ python3 -m verl.trainer.main_ppo \
26
+ algorithm.adv_estimator=grpo \
27
+ data.train_files=$train_data \
28
+ data.val_files=$valid_data \
29
+ data.train_batch_size=$train_prompt_bsz \
30
+ data.max_prompt_length=$max_prompt_length \
31
+ data.max_response_length=$max_response_length \
32
+ data.filter_overlong_prompts=True \
33
+ data.truncation='error' \
34
+ actor_rollout_ref.model.path=$model_path \
35
+ actor_rollout_ref.actor.optim.lr=1e-6 \
36
+ actor_rollout_ref.model.use_remove_padding=True \
37
+ actor_rollout_ref.actor.ppo_mini_batch_size=$train_prompt_mini_bsz \
38
+ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \
39
+ actor_rollout_ref.actor.use_kl_loss=True \
40
+ actor_rollout_ref.actor.entropy_coeff=0 \
41
+ actor_rollout_ref.actor.kl_loss_coef=0.001 \
42
+ actor_rollout_ref.actor.kl_loss_type=low_var_kl \
43
+ actor_rollout_ref.actor.use_torch_compile=False \
44
+ actor_rollout_ref.model.enable_gradient_checkpointing=True \
45
+ actor_rollout_ref.actor.fsdp_config.param_offload=True \
46
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
47
+ actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2 \
48
+ actor_rollout_ref.rollout.tensor_model_parallel_size=$tp_size \
49
+ actor_rollout_ref.rollout.name=sglang \
50
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.3 \
51
+ actor_rollout_ref.rollout.n=5 \
52
+ +actor_rollout_ref.rollout.engine_kwargs.sglang.attention_backend="ascend" \
53
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
54
+ actor_rollout_ref.rollout.enable_chunked_prefill=False \
55
+ actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=2 \
56
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
57
+ actor_rollout_ref.nccl_timeout=1800 \
58
+ algorithm.use_kl_in_reward=False \
59
+ trainer.critic_warmup=0 \
60
+ trainer.logger=console \
61
+ trainer.val_before_train=False \
62
+ trainer.project_name='verl_grpo_example_512_1024_gsm8k' \
63
+ trainer.experiment_name='qwen3_8b_function_rm' \
64
+ trainer.n_gpus_per_node=$num_npu \
65
+ trainer.nnodes=1 \
66
+ trainer.save_freq=1000 \
67
+ trainer.test_freq=10000 \
68
+ trainer.total_epochs=5 \
69
+ trainer.default_local_dir="${CKPTS_DIR}" \
70
+ actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \
71
+ actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} $@
code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen3_vl-8b-megatron.sh ADDED
@@ -0,0 +1,86 @@
1
+ set -x
2
+ ENGINE=${1:-vllm}
3
+ export CUDA_DEVICE_MAX_CONNECTIONS=1 # For megatron communication/computation overlapping
4
+
5
+ # dependency: vllm>=0.11.0, megatron-lm>=0.13, mbridge with qwen3vl_cp branch
6
+ # environment option 1: use a stable container image newer than docker://verlai/verl:vllm011.dev6
7
+ # and install mbridge in it by following the instruction in the container
8
+ # pip uninstall mbridge if you have installed it
9
+ # pip install git+https://github.com/ISEEKYAN/mbridge.git@qwen3vl_cp # for correct mbridge
10
+ # environment option 2: use container docker://verlai/verl:vllm011.dev_qwenvl_cp
11
+
12
+
13
+ export VLLM_ALLREDUCE_USE_SYMM_MEM=0 # for vllm0.11.0 with TP
14
+
15
+
16
+ HF_MODEL_PATH=${HF_MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen3-VL-8B-Instruct"}
17
+
18
+ GEN_TP=${GEN_TP:-4}
19
+ CP=${CP:-2}
20
+ TP=${TP:-2}
21
+ PP=${PP:-2}
22
+
23
+ train_path=$HOME/data/geo3k/train.parquet
24
+ test_path=$HOME/data/geo3k/test.parquet
25
+
26
+ python3 -m verl.trainer.main_ppo --config-path=config \
27
+ --config-name='ppo_megatron_trainer.yaml'\
28
+ algorithm.adv_estimator=grpo \
29
+ data.train_files="$train_path" \
30
+ data.val_files="$test_path" \
31
+ data.train_batch_size=512 \
32
+ data.max_prompt_length=1024 \
33
+ data.max_response_length=2048 \
34
+ data.filter_overlong_prompts=True \
35
+ data.truncation='error' \
36
+ actor_rollout_ref.model.path=$HF_MODEL_PATH \
37
+ actor_rollout_ref.actor.optim.lr=1e-6 \
38
+ actor_rollout_ref.actor.ppo_mini_batch_size=128 \
39
+ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \
40
+ actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=$PP \
41
+ actor_rollout_ref.actor.megatron.tensor_model_parallel_size=$TP \
42
+ actor_rollout_ref.actor.megatron.context_parallel_size=$CP \
43
+ actor_rollout_ref.actor.use_kl_loss=True \
44
+ actor_rollout_ref.actor.kl_loss_coef=0.01 \
45
+ actor_rollout_ref.actor.kl_loss_type=low_var_kl \
46
+ actor_rollout_ref.actor.entropy_coeff=0 \
47
+ actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \
48
+ actor_rollout_ref.rollout.tensor_model_parallel_size=$GEN_TP \
49
+ actor_rollout_ref.actor.use_dynamic_bsz=True \
50
+ actor_rollout_ref.actor.ppo_max_token_len_per_gpu=4096 \
51
+ actor_rollout_ref.ref.log_prob_use_dynamic_bsz=True \
52
+ actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=4096 \
53
+ actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=True \
54
+ actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=4096 \
55
+ actor_rollout_ref.rollout.name=$ENGINE \
56
+ +actor_rollout_ref.rollout.engine_kwargs.vllm.disable_mm_preprocessor_cache=True \
57
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.7 \
58
+ actor_rollout_ref.rollout.n=5 \
59
+ actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \
60
+ actor_rollout_ref.actor.megatron.use_mbridge=True \
61
+ actor_rollout_ref.actor.megatron.param_offload=True \
62
+ actor_rollout_ref.actor.megatron.optimizer_offload=True \
63
+ actor_rollout_ref.actor.megatron.grad_offload=True \
64
+ actor_rollout_ref.ref.megatron.param_offload=True \
65
+ +actor_rollout_ref.actor.optim.override_optimizer_config.optimizer_offload_fraction=1 \
66
+ +actor_rollout_ref.actor.optim.override_optimizer_config.overlap_cpu_optimizer_d2h_h2d=True \
67
+ +actor_rollout_ref.actor.optim.override_optimizer_config.use_precision_aware_optimizer=True \
68
+ +actor_rollout_ref.actor.optim.override_optimizer_config.optimizer_cpu_offload=True \
69
+ +actor_rollout_ref.actor.megatron.override_transformer_config.moe_router_dtype=fp32 \
70
+ +actor_rollout_ref.actor.megatron.override_transformer_config.moe_enable_deepep=True \
71
+ +actor_rollout_ref.actor.megatron.override_transformer_config.moe_token_dispatcher_type=flex \
72
+ +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_method=uniform \
73
+ +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=full \
74
+ +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_num_layers=1 \
75
+ +actor_rollout_ref.actor.megatron.override_transformer_config.gradient_accumulation_fusion=True \
76
+ +actor_rollout_ref.actor.megatron.override_transformer_config.moe_permute_fusion=True \
77
+ algorithm.use_kl_in_reward=False \
78
+ trainer.critic_warmup=0 \
79
+ trainer.logger='["console","wandb"]' \
80
+ trainer.project_name='verl_grpo_example_geo3k' \
81
+ trainer.experiment_name='qwen3_vl_8b_megatron' \
82
+ trainer.n_gpus_per_node=8 \
83
+ trainer.nnodes=1 \
84
+ trainer.save_freq=20 \
85
+ trainer.test_freq=5 \
86
+ trainer.total_epochs=15 $@
code/RL_model/verl/verl_train/examples/grpo_trainer/run_qwen3moe-30b_grpo_megatron_vllm_npu.sh ADDED
@@ -0,0 +1,188 @@
1
+ #!/bin/bash
2
+ set -xeuo pipefail
3
+ mkdir -p logs
4
+
5
+ # Project Configuration
6
+ project_name='GRPO-Qwen3-30b-A3B-BASE-MATH'
7
+ exp_name='GRPO-Qwen3-30B-A3B-BASE-Megatron-vLLM'
8
+
9
+ # Node Info
10
+ NNODES=${NNODES:-1}
11
+ NPUS_PER_NODE=${NPUS_PER_NODE:-16}
12
+
13
+ # Model Weights Paths
14
+ MODEL_PATH=Qwen/Qwen3-30B-A3B-Base
15
+ MCORE_MODEL_PATH=Qwen/Qwen3-30B-A3B-Base-dist
16
+ RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
17
+ CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}
18
+
19
+ # File System Paths
20
+ TRAIN_FILE=$RAY_DATA_HOME/dataset/gsm8k/train.parquet
21
+ TEST_FILE=$RAY_DATA_HOME/dataset/gsm8k/test.parquet
22
+
23
+ # Data Configuration
24
+ max_prompt_length=$((1024 * 1))
25
+ max_response_length=$((1024 * 1))
26
+
27
+ # Training Batch Configuration
28
+ train_prompt_bsz=128
29
+ train_prompt_mini_bsz=32
30
+ n_resp_per_prompt=16
31
+
32
+ # Algorithm Configuration
33
+ adv_estimator=grpo
34
+ use_kl_in_reward=False
35
+ kl_coef=0.0
36
+ use_kl_loss=True
37
+ kl_loss_coef=0.001
38
+
39
+ # Performance and Memory Management Configuration
40
+ all_offload=True
41
+ use_dynamic_bsz=True
42
+ actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 4))
43
+ infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 8))
44
+ optimizer_offload_fraction=1
45
+
46
+ # Megatron Configuration
47
+ train_tp=2
48
+ train_ep=8
49
+ train_etp=1
50
+ train_pp=2
51
+ train_cp=1
52
+
53
+ # vLLM Configuration
54
+ gen_tp=2
55
+ gen_dp=1
56
+ gen_ep=1
57
+ gpu_memory_utilization=0.8
58
+ max_model_len=$((max_prompt_length + max_response_length))
59
+ max_num_batched_tokens=$(((max_prompt_length + max_response_length) * 1))
60
+
61
+ # Data Configuration
62
+ DATA_CONFIG=(
63
+ data.train_files="${TRAIN_FILE}"
64
+ data.val_files="${TEST_FILE}"
65
+ data.prompt_key=prompt
66
+ data.train_batch_size=${train_prompt_bsz}
67
+ data.max_prompt_length=${max_prompt_length}
68
+ data.max_response_length=${max_response_length}
69
+ data.filter_overlong_prompts=False
70
+ data.truncation='left'
71
+ )
72
+
73
+ # Model Configuration
74
+ MODEL_CONFIG=(
75
+ actor_rollout_ref.model.path="${MODEL_PATH}"
76
+ actor_rollout_ref.model.use_remove_padding=True
77
+ )
78
+
79
+ # Algorithm Configuration
80
+ ALGORITHM_CONFIG=(
81
+ algorithm.adv_estimator=${adv_estimator}
82
+ algorithm.use_kl_in_reward=${use_kl_in_reward}
83
+ algorithm.kl_ctrl.kl_coef=${kl_coef}
84
+ )
85
+
86
+ # Actor Model Configuration
87
+ ACTOR_CONFIG=(
88
+ actor_rollout_ref.actor.use_torch_compile=False
89
+ actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz}
90
+ actor_rollout_ref.actor.use_kl_loss=${use_kl_loss}
91
+ actor_rollout_ref.actor.kl_loss_type=low_var_kl
92
+ actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef}
93
+ actor_rollout_ref.actor.entropy_coeff=0
94
+ actor_rollout_ref.actor.ppo_epochs=1
95
+ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1
96
+ actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len}
97
+ actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz}
99
+ actor_rollout_ref.actor.optim.lr=1e-6
100
+ +actor_rollout_ref.actor.optim.override_optimizer_config.optimizer_offload_fraction=${optimizer_offload_fraction}
101
+ +actor_rollout_ref.actor.optim.override_optimizer_config.use_precision_aware_optimizer=True
102
+ +actor_rollout_ref.actor.optim.override_optimizer_config.optimizer_cpu_offload=True
103
+ actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${train_tp}
104
+ actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${train_pp}
105
+ actor_rollout_ref.actor.megatron.context_parallel_size=${train_cp}
106
+ actor_rollout_ref.actor.megatron.expert_model_parallel_size=${train_ep}
107
+ actor_rollout_ref.actor.megatron.expert_tensor_parallel_size=${train_etp}
108
+ actor_rollout_ref.actor.megatron.param_offload=${all_offload}
109
+ actor_rollout_ref.actor.megatron.optimizer_offload=${all_offload}
110
+ actor_rollout_ref.actor.megatron.grad_offload=${all_offload}
111
+ actor_rollout_ref.actor.megatron.dist_checkpointing_path=${MCORE_MODEL_PATH}
112
+ actor_rollout_ref.actor.megatron.use_dist_checkpointing=False
113
+ +actor_rollout_ref.actor.megatron.override_transformer_config.use_flash_attn=True
114
+ +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_method=uniform
115
+ +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=full
116
+ +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_num_layers=1
117
+ )
118
+
119
+ # Reference Model Configuration
120
+ REF_CONFIG=(
121
+ actor_rollout_ref.ref.use_torch_compile=False
122
+ actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1
123
+ actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz}
124
+ actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len}
125
+ actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${train_tp}
126
+ actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=${train_pp}
127
+ actor_rollout_ref.ref.megatron.context_parallel_size=${train_cp}
128
+ actor_rollout_ref.ref.megatron.expert_model_parallel_size=${train_ep}
129
+ actor_rollout_ref.ref.megatron.expert_tensor_parallel_size=${train_etp}
130
+ actor_rollout_ref.ref.megatron.param_offload=${all_offload}
131
+ actor_rollout_ref.ref.megatron.dist_checkpointing_path=${MCORE_MODEL_PATH}
132
+ actor_rollout_ref.ref.megatron.use_dist_checkpointing=False
133
+ )
134
+
135
+ # Rollout Configuration
136
+ ROLLOUT_CONFIG=(
137
+ actor_rollout_ref.rollout.name=vllm
138
+ actor_rollout_ref.rollout.n=${n_resp_per_prompt}
139
+ actor_rollout_ref.rollout.top_p=1.0
140
+ actor_rollout_ref.rollout.top_k=-1
141
+ actor_rollout_ref.rollout.temperature=1.0
142
+ actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1
143
+ actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz}
144
+ actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len}
145
+ actor_rollout_ref.rollout.gpu_memory_utilization=${gpu_memory_utilization}
146
+ actor_rollout_ref.rollout.max_num_batched_tokens=${max_num_batched_tokens}
147
+ actor_rollout_ref.rollout.max_model_len=${max_model_len}
148
+ actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp}
149
+ actor_rollout_ref.rollout.data_parallel_size=${gen_dp}
150
+ actor_rollout_ref.rollout.expert_parallel_size=${gen_ep}
151
+ actor_rollout_ref.rollout.enable_chunked_prefill=True
152
+ actor_rollout_ref.rollout.enable_prefix_caching=True
153
+ actor_rollout_ref.rollout.enforce_eager=True
154
+ actor_rollout_ref.rollout.free_cache_engine=True
155
+ actor_rollout_ref.rollout.val_kwargs.n=1
156
+ actor_rollout_ref.rollout.val_kwargs.do_sample=True
157
+ actor_rollout_ref.rollout.val_kwargs.top_p=1.0
158
+ actor_rollout_ref.rollout.val_kwargs.top_k=-1
159
+ actor_rollout_ref.rollout.val_kwargs.temperature=1.0
160
+ )
161
+
162
+ # Trainer Configuration
163
+ TRAINER_CONFIG=(
164
+ trainer.logger='["console","tensorboard"]'
165
+ trainer.project_name="${project_name}"
166
+ trainer.experiment_name="${exp_name}"
167
+ trainer.nnodes="${NNODES}"
168
+ trainer.n_gpus_per_node="${NPUS_PER_NODE}"
169
+ trainer.device='npu'
170
+ trainer.total_epochs=15
171
+ trainer.val_before_train=False
172
+ trainer.test_freq=-1
173
+ trainer.save_freq=-1
174
+ trainer.default_local_dir="${CKPTS_DIR}"
175
+ )
176
+
177
+ # Main GRPO Training Command
178
+ python3 -m verl.trainer.main_ppo \
179
+ --config-path=config \
180
+ --config-name='ppo_megatron_trainer.yaml' \
181
+ "${DATA_CONFIG[@]}" \
182
+ "${MODEL_CONFIG[@]}" \
183
+ "${ACTOR_CONFIG[@]}" \
184
+ "${REF_CONFIG[@]}" \
185
+ "${ROLLOUT_CONFIG[@]}" \
186
+ "${ALGORITHM_CONFIG[@]}" \
187
+ "${TRAINER_CONFIG[@]}" \
188
+ "$@" | tee logs/run_qwen3moe-30b_grpo_megatron_vllm_npu.log
code/RL_model/verl/verl_train/examples/mtp_trainer/runtime_env.yaml ADDED
@@ -0,0 +1,17 @@
1
+ working_dir: ./
2
+
3
+ excludes:
4
+ - ".git/"
5
+
6
+ env_vars:
7
+ VLLM_USE_V1: "1"
8
+ HYDRA_FULL_ERROR: "1"
9
+ NCCL_NVLS_ENABLE: "0"
10
+ NCCL_SOCKET_IFNAME: "eth0"
11
+ TMPDIR: "/tmp"
12
+ CUDA_HOME: "/usr/local/cuda"
13
+ CUDA_TMPDIR: "/tmp"
14
+ CUDA_CACHE_PATH: "/tmp/cuda_cache"
15
+ # For distributed training, the path must be set on a distributed file system (DFS) to ensure visibility across all nodes.
16
+ HF_HOME: "/tmp/hf_home_mimo"
17
+ PYTHONPATH: "/tmp/hf_home_mimo/modules/"
code/RL_model/verl/verl_train/examples/mtp_trainer/test_dapo_mimo_7b_with_mtp_math_megatron.sh ADDED
@@ -0,0 +1,144 @@
1
+ #!/usr/bin/env bash
2
+
3
+ set -xeuo pipefail
4
+
5
+ project_name='DAPO'
6
+ exp_name='DAPO-mimo-7b-rl-megatron'
7
+
8
+ adv_estimator=grpo
9
+
10
+ use_kl_in_reward=False
11
+ kl_coef=0.0
12
+ use_kl_loss=False
13
+ kl_loss_coef=0.0
14
+
15
+ clip_ratio_low=0.2
16
+ clip_ratio_high=0.28
17
+
18
+ max_prompt_length=$((1024 * 2))
19
+ max_response_length=$((1024 * 8))
20
+ enable_overlong_buffer=True
21
+ overlong_buffer_len=$((1024 * 4))
22
+ overlong_penalty_factor=1.0
23
+
24
+ loss_agg_mode="token-mean"
25
+
26
+ train_prompt_bsz=128
27
+ n_resp_per_prompt=16
28
+ train_prompt_mini_bsz=32
29
+
30
+ # Ray
31
+ # RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
32
+ # WORKING_DIR=${WORKING_DIR:-"${PWD}"}
33
+ # RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/examples/mtp_trainer/runtime_env.yaml"}
34
+ NNODES=${NNODES:-16}
35
+ NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
36
+ # Paths
37
+ RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
38
+ # very important! please modify the max_position_embeddings in config.json to 32768 after downloading from huggingface
39
+ MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/MiMo-7B-RL"}
40
+ CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}
41
+ TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"}
42
+ TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"}
43
+
44
+ # Algorithm
45
+ temperature=1.0
46
+ top_p=1.0
47
+ top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
48
+ val_top_p=0.7
49
+
50
+ # Performance Related Parameter
51
+ use_dynamic_bsz=True
52
+ actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 2))
53
+ infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 3))
54
+ offload=True
55
+ gen_tp=4
56
+ train_tp=2
57
+ train_pp=2
58
+ train_cp=2
59
+
60
+ common_params=(
61
+ actor_rollout_ref.model.mtp.enable=True
62
+ actor_rollout_ref.model.mtp.enable_train=True
63
+ actor_rollout_ref.model.mtp.mtp_loss_scaling_factor=0.1
64
+ actor_rollout_ref.model.mtp.detach_encoder=True
65
+ )
66
+
67
+ python -m verl.trainer.main_ppo \
68
+ --config-path=config \
69
+ --config-name='ppo_megatron_trainer.yaml' \
70
+ data.train_files="${TRAIN_FILE}" \
71
+ data.val_files="${TEST_FILE}" \
72
+ data.prompt_key=prompt \
73
+ data.truncation='left' \
74
+ data.max_prompt_length=${max_prompt_length} \
75
+ data.max_response_length=${max_response_length} \
76
+ data.train_batch_size=${train_prompt_bsz} \
77
+ actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
78
+ algorithm.adv_estimator=${adv_estimator} \
79
+ algorithm.use_kl_in_reward=${use_kl_in_reward} \
80
+ algorithm.kl_ctrl.kl_coef=${kl_coef} \
81
+ actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
82
+ actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
83
+ actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
84
+ actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
85
+ actor_rollout_ref.actor.clip_ratio_c=10.0 \
86
+ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \
87
+ actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
88
+ actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
89
+ actor_rollout_ref.model.path="${MODEL_PATH}" \
90
+ actor_rollout_ref.actor.optim.lr=1e-6 \
91
+ actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
92
+ actor_rollout_ref.actor.optim.weight_decay=0.1 \
93
+ actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
94
+ actor_rollout_ref.actor.megatron.param_offload=${offload} \
95
+ actor_rollout_ref.actor.megatron.optimizer_offload=${offload} \
96
+ actor_rollout_ref.actor.megatron.grad_offload=${offload} \
97
+ actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=${train_pp} \
98
+ actor_rollout_ref.actor.megatron.tensor_model_parallel_size=${train_tp} \
99
+ actor_rollout_ref.actor.megatron.context_parallel_size=${train_cp} \
100
+ actor_rollout_ref.actor.entropy_coeff=0 \
101
+ actor_rollout_ref.actor.optim.clip_grad=1.0 \
102
+ actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
103
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.80 \
104
+ actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
105
+ actor_rollout_ref.rollout.enable_chunked_prefill=True \
106
+ actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
107
+ actor_rollout_ref.rollout.temperature=${temperature} \
108
+ actor_rollout_ref.rollout.top_p=${top_p} \
109
+ actor_rollout_ref.rollout.top_k=${top_k} \
110
+ actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
111
+ actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
112
+ actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
113
+ actor_rollout_ref.rollout.val_kwargs.do_sample=True \
114
+ actor_rollout_ref.rollout.val_kwargs.n=1 \
115
+ actor_rollout_ref.rollout.name=sglang \
116
+ actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=${train_pp} \
117
+ actor_rollout_ref.ref.megatron.tensor_model_parallel_size=${train_tp} \
118
+ actor_rollout_ref.ref.megatron.context_parallel_size=${train_cp} \
119
+ actor_rollout_ref.ref.megatron.param_offload=${offload} \
120
+ reward_model.reward_manager=dapo \
121
+ +reward_model.reward_kwargs.overlong_buffer_cfg.enable=${enable_overlong_buffer} \
122
+ +reward_model.reward_kwargs.overlong_buffer_cfg.len=${overlong_buffer_len} \
123
+ +reward_model.reward_kwargs.overlong_buffer_cfg.penalty_factor=${overlong_penalty_factor} \
124
+ +reward_model.reward_kwargs.overlong_buffer_cfg.log=False \
125
+ +reward_model.reward_kwargs.max_resp_len=${max_response_length} \
126
+ trainer.logger='["console","tensorboard"]' \
127
+ trainer.project_name="${project_name}" \
128
+ trainer.experiment_name="${exp_name}" \
129
+ trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
130
+ trainer.nnodes="${NNODES}" \
131
+ trainer.val_before_train=False \
132
+ trainer.test_freq=10 \
133
+ trainer.save_freq=-1 \
134
+ trainer.total_epochs=10 \
135
+ trainer.resume_mode=auto \
136
+ trainer.log_val_generations=10 \
137
+ actor_rollout_ref.rollout.disable_log_stats=False \
138
+ actor_rollout_ref.rollout.prometheus.enable=True \
139
+ actor_rollout_ref.rollout.prometheus.port=44398 \
140
+ actor_rollout_ref.model.trust_remote_code=True \
141
+ data.trust_remote_code=True \
142
+ trainer.total_training_steps=400 \
143
+ actor_rollout_ref.actor.megatron.use_mbridge=True \
144
+ "${common_params[@]}"
code/RL_model/verl/verl_train/examples/ppo_trainer/README.md ADDED
@@ -0,0 +1,103 @@
1
+ # Proximal Policy Optimization (PPO)
2
+
3
+ Proximal Policy Optimization (PPO) is a family of policy gradient methods for reinforcement learning, proposed by OpenAI in 2017. PPO strikes a balance between simplicity, stability, and performance, making it one of the most widely used algorithms in modern RL applications, including large-scale language model fine-tuning.
4
+
5
+ Traditional policy gradient methods like REINFORCE or Vanilla Policy Gradient suffer from:
6
+
7
+ - High variance and sample inefficiency.
8
+ - Instability due to large policy updates.
9
+
10
+ PPO addresses these problems with a clipped surrogate objective that avoids overly large policy updates without requiring second-order derivatives.
11
+
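+ For reference, the clipped surrogate objective from the PPO paper is
+
+ ```math
+ L^{\mathrm{CLIP}}(\theta) = \mathbb{E}_t\Big[\min\big(r_t(\theta)\,\hat{A}_t,\ \mathrm{clip}\big(r_t(\theta),\,1-\epsilon,\,1+\epsilon\big)\,\hat{A}_t\big)\Big],
+ \qquad r_t(\theta) = \frac{\pi_\theta(a_t \mid s_t)}{\pi_{\theta_{\mathrm{old}}}(a_t \mid s_t)}
+ ```
+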
12
+ For more technical details regarding PPO, we suggest reading the introduction in the [OpenAI spinning up tutorial](https://spinningup.openai.com/en/latest/algorithms/ppo.html), and the paper [Proximal Policy Optimization Algorithms](https://arxiv.org/abs/1707.06347).
13
+
14
+ ## Key Components
15
+
16
+ - Actor-Critic Architecture: PPO requires both an actor model (policy) and a critic model (value function). This differs from other algorithms like GRPO and RLOO that don't require a critic model.
17
+
18
+ - Generalized Advantage Estimation (GAE): PPO uses GAE for computing advantage values, which helps reduce variance in policy gradient estimates while maintaining low bias (a minimal sketch of the recursion follows this list).
19
+
20
+ - Clipped Surrogate Objective: The core of PPO is implemented through the clipped surrogate objective function that limits policy updates.
21
+
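+ The GAE recursion itself is short; below is a minimal NumPy sketch of the textbook recursion (illustrative only, termination masking omitted; the function name and signature are ours, not verl's implementation):
+
+ ```python
+ import numpy as np
+
+ def gae_advantages(rewards, values, last_value, gamma=0.99, lam=0.95):
+     """Textbook GAE, computed backwards in time:
+     delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
+     A_t     = delta_t + gamma * lam * A_{t+1}
+     """
+     values = np.append(values, last_value)  # V(s_0) ... V(s_T)
+     advantages = np.zeros(len(rewards))
+     gae = 0.0
+     for t in reversed(range(len(rewards))):
+         delta = rewards[t] + gamma * values[t + 1] - values[t]
+         gae = delta + gamma * lam * gae
+         advantages[t] = gae
+     return advantages  # critic regression targets are advantages + values[:-1]
+ ```
+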
22
+ ## Configuration
23
+
24
+ Note that all configs containing `micro_batch_size` set the maximum sample or token count per forward or backward pass to avoid GPU OOMs; changing their values should not affect algorithmic/convergence behavior.
25
+
26
+ Most critic configs are similar to those of actors. Note that the critic model is omitted from the figure below.
27
+
28
+ ![image](https://github.com/user-attachments/assets/16aebad1-0da6-4eb3-806d-54a74e712c2d)
29
+
30
+ - `data.train_batch_size`: The global batch size of prompts used to generate a set of sampled trajectories/rollouts. The number of responses/trajectories is `data.train_batch_size * actor_rollout_ref.rollout.n` (see the worked example after this list).
31
+
32
+ - `actor_rollout_ref.actor.ppo_mini_batch_size`: The set of sampled trajectories is split into multiple mini-batches with batch_size=ppo_mini_batch_size for PPO actor updates. The ppo_mini_batch_size is a global size across all workers
33
+
34
+ - `critic.ppo_mini_batch_size`: The set of sampled trajectories is split into multiple mini-batches with batch_size=ppo_mini_batch_size for PPO critic updates. The ppo_mini_batch_size is a global size across all workers
35
+
36
+ - `actor_rollout_ref.actor.clip_ratio`: The PPO clip range. Defaults to 0.2.
37
+
38
+ - `actor_rollout_ref.actor.ppo_epochs`: Number of epochs for PPO updates on one set of sampled trajectories for actor
39
+
40
+ - `critic.ppo_epochs`: Number of epochs for PPO updates on one set of sampled trajectories for critic. Defaults to `actor_rollout_ref.actor.ppo_epochs`
41
+
42
+ - `algorithm.gamma`: discount factor
43
+
44
+ - `algorithm.lam`: The lambda term that trades off between bias and variance in the GAE estimator
45
+
46
+ - `algorithm.adv_estimator`: Supported values: gae, grpo, reinforce_plus_plus, reinforce_plus_plus_baseline, rloo, rloo_vectorized
47
+
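+ As a concrete, purely illustrative example of how these sizes interact: with `data.train_batch_size=1024` and `actor_rollout_ref.rollout.n=5`, each rollout step produces `1024 * 5 = 5120` trajectories. These trajectories are split into mini-batches of `ppo_mini_batch_size` for the optimizer updates of each PPO epoch, and `ppo_micro_batch_size_per_gpu` only controls how each mini-batch is chunked for gradient accumulation, so it affects memory and throughput but not the training result.
+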
48
+ ## Advanced Extensions
49
+
50
+ ### KL Divergence Control
51
+
52
+ These options keep the policy from diverging too far from a reference policy. Two mechanisms are available: a KL reward penalty and a KL loss. For more technical details, see [Training language models to follow instructions with human feedback](https://arxiv.org/abs/2203.02155).
53
+
54
+ Options to use KL loss for KL divergence control:
55
+
56
+ - `actor_rollout_ref.actor.use_kl_loss`: Whether to use a KL loss in the actor. When enabled, KL is not applied in the reward function. Default is False.
57
+
58
+ - `actor_rollout_ref.actor.kl_loss_coef`: The coefficient of kl loss. Default is 0.001.
59
+
60
+ - `actor_rollout_ref.actor.kl_loss_type`: How to calculate the KL divergence between the actor and the reference policy. Supported values: kl (k1), abs, mse (k2), low_var_kl (k3), and full. Appending "+" (e.g., 'k1+' and 'k3+') applies a straight-through trick that uses k2 for unbiased gradient estimation regardless of the KL value estimate (see https://github.com/volcengine/verl/pull/2953#issuecomment-3162113848 for more details). See this blog post for a detailed analysis: http://joschu.net/blog/kl-approx.html (a minimal sketch of the k1/k2/k3 estimators follows this list).
61
+
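+ A minimal sketch of the k1/k2/k3 estimators from that blog post, assuming per-token log-probs sampled from the actor (illustrative; the function is ours, not verl's exact code):
+
+ ```python
+ import torch
+
+ def kl_estimate(logp_actor: torch.Tensor, logp_ref: torch.Tensor, kind: str = "low_var_kl"):
+     """Monte-Carlo estimators of KL(pi_actor || pi_ref) on samples from pi_actor."""
+     log_ratio = logp_ref - logp_actor  # log(pi_ref / pi_actor)
+     if kind == "kl":           # k1: unbiased, high variance
+         return -log_ratio
+     if kind == "mse":          # k2: 0.5 * (log r)^2, biased, low variance
+         return 0.5 * log_ratio ** 2
+     if kind == "low_var_kl":   # k3: (r - 1) - log r, unbiased, non-negative
+         return torch.exp(log_ratio) - 1.0 - log_ratio
+     raise ValueError(f"unknown kl estimator: {kind}")
+ ```
+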
62
+ Options to use KL penalty in the reward:
63
+
64
+ - `algorithm.use_kl_in_reward`: Whether to enable in-reward kl penalty. Default is False.
65
+
66
+ - `algorithm.kl_penalty`: Supported values: kl (k1), abs, mse (k2), low_var_kl (k3), and full. This defines how the KL divergence between the actor and the reference policy is calculated. For specific options, refer to `kl_penalty` in core_algos.py. See this blog post for a detailed analysis: http://joschu.net/blog/kl-approx.html
67
+
68
+ - `algorithm.kl_ctrl.kl_coef`: The (initial) coefficient of in-reward kl_penalty. Default is 0.001.
69
+ - `algorithm.kl_ctrl.type`: 'fixed' for FixedKLController and 'adaptive' for AdaptiveKLController.
70
+ - `algorithm.kl_ctrl.horizon`: See source code of AdaptiveKLController for details.
71
+ - `algorithm.kl_ctrl.target_kl`: See source code of AdaptiveKLController for details.
72
+
73
+ ### Dual-clip PPO
74
+
75
+ Dual-Clip PPO introduces an additional bound for the case where the advantage is less than zero: the clipped objective, which could otherwise become arbitrarily negative when multiplied by a very large ratio, is bounded from below by a fixed multiple of the advantage (the exact form is given below).
76
+
77
+ ![image](https://github.com/user-attachments/assets/fc232181-d8b0-4307-8dd2-4dc0a4c1c139)
78
+
79
+ - `actor_rollout_ref.actor.clip_ratio_c`: the lower-bound constant c for Dual-clip PPO. Defaults to 3.0.
80
+
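+ Concretely, following the Dual-Clip PPO paper, for $\hat{A}_t < 0$ the per-token objective becomes
+
+ ```math
+ L^{\mathrm{DC}}(\theta) = \max\Big(\min\big(r_t(\theta)\,\hat{A}_t,\ \mathrm{clip}\big(r_t(\theta),\,1-\epsilon,\,1+\epsilon\big)\,\hat{A}_t\big),\ c\,\hat{A}_t\Big), \qquad c > 1,
+ ```
+
+ so a single sample with a very large ratio and a negative advantage can pull the objective down to at most `clip_ratio_c` times that advantage.
+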
81
+ ## Reference Example
82
+
83
+ Qwen2.5 training log and commands: [link](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/Qwen2.5-0.5B-bsz256_2-prompt1024-resp512-0.567.log)
84
+
85
+ ```bash
86
+ bash run_gemma.sh \
87
+ trainer.n_gpus_per_node=1 \
88
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
89
+ trainer.logger=console \
90
+ critic.model.path=Qwen/Qwen2.5-0.5B-Instruct \
91
+ actor_rollout_ref.model.path=Qwen/Qwen2.5-0.5B-Instruct \
92
+ data.train_batch_size=256 \
93
+ actor_rollout_ref.actor.ppo_mini_batch_size=64 \
94
+ actor_rollout_ref.actor.ppo_micro_batch_size=2 \
95
+ critic.ppo_micro_batch_size=2
96
+ ```
97
+
98
+ Reference performance with verl v0.2:
99
+
100
+ | Model | Method | Score | Link |
101
+ |-------------------------------|------------------|-------|------------------------------------------------------------------------------------------------|
102
+ | Qwen/Qwen2.5-0.5B-Instruct | pretrained model | 36.4 | [Qwen Blog](https://qwenlm.github.io/blog/qwen2.5-llm/) |
103
+ | Qwen/Qwen2.5-0.5B-Instruct | PPO | 56.7 | [PPO Command and Logs](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/Qwen2.5-0.5B-bsz256_2-prompt1024-resp512-0.567.log) |
code/RL_model/verl/verl_train/examples/ppo_trainer/run_deepseek7b_llm.sh ADDED
@@ -0,0 +1,42 @@
1
+ set -x
2
+
3
+ python3 -m verl.trainer.main_ppo \
4
+ algorithm.adv_estimator=gae \
5
+ data.train_files=$HOME/data/gsm8k/train.parquet \
6
+ data.val_files=$HOME/data/gsm8k/test.parquet \
7
+ data.train_batch_size=1024 \
8
+ data.max_prompt_length=512 \
9
+ data.max_response_length=512 \
10
+ data.filter_overlong_prompts=True \
11
+ data.truncation='error' \
12
+ actor_rollout_ref.model.path=deepseek-ai/deepseek-llm-7b-chat \
13
+ actor_rollout_ref.actor.optim.lr=1e-6 \
14
+ actor_rollout_ref.model.use_remove_padding=True \
15
+ actor_rollout_ref.actor.ppo_mini_batch_size=256 \
16
+ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \
17
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
18
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
19
+ actor_rollout_ref.actor.use_kl_loss=False \
20
+ actor_rollout_ref.model.enable_gradient_checkpointing=True \
21
+ actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 \
22
+ actor_rollout_ref.rollout.tensor_model_parallel_size=4 \
23
+ actor_rollout_ref.rollout.name=vllm \
24
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
25
+ critic.optim.lr=1e-5 \
26
+ critic.model.use_remove_padding=True \
27
+ critic.model.path=deepseek-ai/deepseek-llm-7b-chat \
28
+ critic.model.enable_gradient_checkpointing=True \
29
+ critic.ppo_micro_batch_size_per_gpu=32 \
30
+ critic.model.fsdp_config.param_offload=False \
31
+ critic.model.fsdp_config.optimizer_offload=False \
32
+ algorithm.use_kl_in_reward=False \
33
+ trainer.critic_warmup=0 \
34
+ trainer.logger='["console","wandb"]' \
35
+ trainer.project_name='verl_example_gsm8k' \
36
+ trainer.experiment_name='deepseek_llm_7b_function_rm' \
37
+ trainer.n_gpus_per_node=8 \
38
+ trainer.nnodes=1 \
39
+ trainer.save_freq=20 \
40
+ trainer.test_freq=1 \
41
+ trainer.use_legacy_worker_impl=auto \
42
+ trainer.total_epochs=15 $@
code/RL_model/verl/verl_train/examples/ppo_trainer/run_deepseek7b_llm_modelscope.sh ADDED
@@ -0,0 +1,42 @@
1
+ set -x
2
+
3
+ VERL_USE_MODELSCOPE=True \
4
+ python3 -m verl.trainer.main_ppo \
5
+ algorithm.adv_estimator=gae \
6
+ data.train_files=$HOME/data/gsm8k/train.parquet \
7
+ data.val_files=$HOME/data/gsm8k/test.parquet \
8
+ data.train_batch_size=1024 \
9
+ data.max_prompt_length=512 \
10
+ data.max_response_length=512 \
11
+ data.filter_overlong_prompts=True \
12
+ data.truncation='error' \
13
+ actor_rollout_ref.model.path=deepseek-ai/deepseek-llm-7b-chat \
14
+ actor_rollout_ref.actor.optim.lr=1e-6 \
15
+ actor_rollout_ref.model.use_remove_padding=True \
16
+ actor_rollout_ref.actor.ppo_mini_batch_size=256 \
17
+ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \
18
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
19
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
20
+ actor_rollout_ref.actor.use_kl_loss=False \
21
+ actor_rollout_ref.model.enable_gradient_checkpointing=True \
22
+ actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 \
23
+ actor_rollout_ref.rollout.tensor_model_parallel_size=4 \
24
+ actor_rollout_ref.rollout.name=vllm \
25
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
26
+ critic.optim.lr=1e-5 \
27
+ critic.model.use_remove_padding=True \
28
+ critic.model.path=deepseek-ai/deepseek-llm-7b-chat \
29
+ critic.model.enable_gradient_checkpointing=True \
30
+ critic.ppo_micro_batch_size_per_gpu=32 \
31
+ critic.model.fsdp_config.param_offload=False \
32
+ critic.model.fsdp_config.optimizer_offload=False \
33
+ algorithm.use_kl_in_reward=False \
34
+ trainer.critic_warmup=0 \
35
+ trainer.logger='["console","wandb"]' \
36
+ trainer.project_name='verl_example_gsm8k' \
37
+ trainer.experiment_name='deepseek_llm_7b_function_rm' \
38
+ trainer.n_gpus_per_node=8 \
39
+ trainer.nnodes=1 \
40
+ trainer.save_freq=20 \
41
+ trainer.test_freq=1 \
42
+ trainer.total_epochs=15 $@
code/RL_model/verl/verl_train/examples/ppo_trainer/run_deepseek7b_llm_pfppo.sh ADDED
@@ -0,0 +1,45 @@
1
+ set -x
2
+
3
+ # pf_ppo.reweight_method options: "pow", "max_min", "max_random"
+ python3 -m verl.trainer.main_ppo \
4
+ algorithm.adv_estimator=gae \
5
+ algorithm.use_pf_ppo=True \
6
+ algorithm.pf_ppo.reweight_method=pow \
7
+ algorithm.pf_ppo.weight_pow=2.0 \
8
+ data.train_files=$HOME/data/gsm8k/train.parquet \
9
+ data.val_files=$HOME/data/gsm8k/test.parquet \
10
+ data.train_batch_size=1024 \
11
+ data.max_prompt_length=512 \
12
+ data.max_response_length=512 \
13
+ data.filter_overlong_prompts=True \
14
+ data.truncation='error' \
15
+ actor_rollout_ref.model.path=deepseek-ai/deepseek-llm-7b-chat \
16
+ actor_rollout_ref.actor.optim.lr=1e-6 \
17
+ actor_rollout_ref.model.use_remove_padding=True \
18
+ actor_rollout_ref.actor.ppo_mini_batch_size=256 \
19
+ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \
20
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
21
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
22
+ actor_rollout_ref.actor.use_kl_loss=False \
23
+ actor_rollout_ref.model.enable_gradient_checkpointing=True \
24
+ actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 \
25
+ actor_rollout_ref.rollout.tensor_model_parallel_size=4 \
26
+ actor_rollout_ref.rollout.name=vllm \
27
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
28
+ actor_rollout_ref.rollout.n=5 \
29
+ critic.optim.lr=1e-5 \
30
+ critic.model.use_remove_padding=True \
31
+ critic.model.path=deepseek-ai/deepseek-llm-7b-chat \
32
+ critic.model.enable_gradient_checkpointing=True \
33
+ critic.ppo_micro_batch_size_per_gpu=32 \
34
+ critic.model.fsdp_config.param_offload=False \
35
+ critic.model.fsdp_config.optimizer_offload=False \
36
+ algorithm.use_kl_in_reward=False \
37
+ trainer.critic_warmup=0 \
38
+ trainer.logger='["console","wandb"]' \
39
+ trainer.project_name='verl_example_gsm8k' \
40
+ trainer.experiment_name='deepseek_llm_7b_function_rm' \
41
+ trainer.n_gpus_per_node=8 \
42
+ trainer.nnodes=1 \
43
+ trainer.save_freq=20 \
44
+ trainer.test_freq=1 \
45
+ trainer.total_epochs=15 $@
code/RL_model/verl/verl_train/examples/ppo_trainer/run_deepseek7b_llm_sandbox_fusion.sh ADDED
@@ -0,0 +1,44 @@
1
+ set -x
2
+
3
+ python3 -m verl.trainer.main_ppo \
4
+ reward_model.sandbox_fusion.url='https://xxxxxxxxx.apigateway-cn-beijing.volceapi.com/run_code' \
5
+ reward_model.sandbox_fusion.max_concurrent=128 \
6
+ reward_model.reward_manager=prime \
7
+ algorithm.adv_estimator=gae \
8
+ data.train_files=$HOME/data/Eurus-2-RL-Data/train.parquet \
9
+ data.val_files=$HOME/data/Eurus-2-RL-Data/validation.parquet \
10
+ data.train_batch_size=1024 \
11
+ data.max_prompt_length=512 \
12
+ data.max_response_length=512 \
13
+ data.filter_overlong_prompts=True \
14
+ data.truncation='error' \
15
+ actor_rollout_ref.model.path=deepseek-ai/deepseek-llm-7b-chat \
16
+ actor_rollout_ref.actor.optim.lr=1e-6 \
17
+ actor_rollout_ref.model.use_remove_padding=True \
18
+ actor_rollout_ref.actor.ppo_mini_batch_size=256 \
19
+ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \
20
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
21
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
22
+ actor_rollout_ref.actor.use_kl_loss=False \
23
+ actor_rollout_ref.model.enable_gradient_checkpointing=True \
24
+ actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 \
25
+ actor_rollout_ref.rollout.tensor_model_parallel_size=4 \
26
+ actor_rollout_ref.rollout.name=vllm \
27
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
28
+ critic.optim.lr=1e-5 \
29
+ critic.model.use_remove_padding=True \
30
+ critic.model.path=deepseek-ai/deepseek-llm-7b-chat \
31
+ critic.model.enable_gradient_checkpointing=True \
32
+ critic.ppo_micro_batch_size_per_gpu=32 \
33
+ critic.model.fsdp_config.param_offload=False \
34
+ critic.model.fsdp_config.optimizer_offload=False \
35
+ algorithm.use_kl_in_reward=False \
36
+ trainer.critic_warmup=0 \
37
+ trainer.logger='["console","wandb"]' \
38
+ trainer.project_name='verl_example_sandbox_fusion' \
39
+ trainer.experiment_name='deepseek_llm_7b_function_sandbox_fusion' \
40
+ trainer.n_gpus_per_node=8 \
41
+ trainer.nnodes=1 \
42
+ trainer.save_freq=20 \
43
+ trainer.test_freq=1 \
44
+ trainer.total_epochs=15 $@
code/RL_model/verl/verl_train/examples/ppo_trainer/run_deepseek7b_llm_sp2.sh ADDED
1
+ set -x
2
+
3
+ python3 -m verl.trainer.main_ppo \
4
+ algorithm.adv_estimator=gae \
5
+ data.train_files=$HOME/data/gsm8k/train.parquet \
6
+ data.val_files=$HOME/data/gsm8k/test.parquet \
7
+ data.train_batch_size=1024 \
8
+ data.max_prompt_length=512 \
9
+ data.max_response_length=512 \
10
+ data.filter_overlong_prompts=True \
11
+ data.truncation='error' \
12
+ actor_rollout_ref.model.path=deepseek-ai/deepseek-llm-7b-chat \
13
+ actor_rollout_ref.actor.optim.lr=1e-6 \
14
+ actor_rollout_ref.model.use_remove_padding=True \
15
+ actor_rollout_ref.actor.ppo_mini_batch_size=256 \
16
+ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 \
17
+ actor_rollout_ref.actor.ulysses_sequence_parallel_size=2 \
18
+ actor_rollout_ref.model.enable_gradient_checkpointing=True \
19
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
20
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
21
+ actor_rollout_ref.actor.use_kl_loss=False \
22
+ actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=64 \
23
+ actor_rollout_ref.rollout.tensor_model_parallel_size=4 \
24
+ actor_rollout_ref.rollout.name=vllm \
25
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
26
+ critic.optim.lr=1e-5 \
27
+ critic.ulysses_sequence_parallel_size=2 \
28
+ critic.model.use_remove_padding=True \
29
+ critic.model.path=deepseek-ai/deepseek-llm-7b-chat \
30
+ critic.model.enable_gradient_checkpointing=True \
31
+ critic.ppo_micro_batch_size_per_gpu=64 \
32
+ critic.model.fsdp_config.param_offload=False \
33
+ critic.model.fsdp_config.optimizer_offload=False \
34
+ algorithm.use_kl_in_reward=False \
35
+ trainer.critic_warmup=0 \
36
+ trainer.logger='["console","wandb"]' \
37
+ trainer.project_name='verl_example_gsm8k' \
38
+ trainer.experiment_name='deepseek_llm_7b_function_rm_sp2' \
39
+ trainer.n_gpus_per_node=8 \
40
+ trainer.nnodes=1 \
41
+ trainer.save_freq=20 \
42
+ trainer.test_freq=5 \
43
+ trainer.total_epochs=15 $@
code/RL_model/verl/verl_train/examples/ppo_trainer/run_deepseek_full_hh_rlhf.sh ADDED
@@ -0,0 +1,45 @@
1
+ set -x
2
+
3
+ train_files=$HOME/data/full_hh_rlhf/rl/train.parquet
4
+ test_files=$HOME/data/full_hh_rlhf/rl/train.parquet # not used
5
+
6
+ python3 -m verl.trainer.main_ppo --config-path=./config --config-name='ppo_megatron_trainer'\
7
+ algorithm.adv_estimator=gae \
8
+ data.train_files="$train_files" \
9
+ data.val_files="$test_files" \
10
+ data.train_batch_size=512 \
11
+ data.max_prompt_length=128 \
12
+ data.max_response_length=128 \
13
+ data.filter_overlong_prompts=True \
14
+ data.truncation='error' \
15
+ actor_rollout_ref.model.path=deepseek-ai/deepseek-llm-7b-chat \
16
+ actor_rollout_ref.actor.optim.lr=1e-6 \
17
+ actor_rollout_ref.actor.ppo_mini_batch_size=128 \
18
+ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
19
+ actor_rollout_ref.actor.use_kl_loss=False \
20
+ actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
21
+ actor_rollout_ref.rollout.tensor_model_parallel_size=4 \
22
+ actor_rollout_ref.rollout.name=vllm \
23
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
24
+ critic.optim.lr=1e-5 \
25
+ critic.model.path=deepseek-ai/deepseek-llm-7b-chat \
26
+ critic.ppo_micro_batch_size_per_gpu=4 \
27
+ reward_model.enable=True \
28
+ reward_model.model.path=deepseek-ai/deepseek-llm-7b-chat \
29
+ reward_model.use_reward_loop=True \
30
+ reward_model.rollout.name=vllm \
31
+ reward_model.rollout.gpu_memory_utilization=0.8 \
32
+ reward_model.rollout.tensor_model_parallel_size=4 \
33
+ reward_model.rollout.prompt_length=256 \
34
+ reward_model.rollout.response_length=128 \
35
+ reward_model.num_workers=8 \
36
+ algorithm.use_kl_in_reward=False \
37
+ trainer.critic_warmup=0 \
38
+ trainer.logger='["console","wandb"]' \
39
+ trainer.project_name='verl_megatron_full_hh_rlhf_examples' \
40
+ trainer.experiment_name='deepseek_llm_7b_model_rm' \
41
+ trainer.n_gpus_per_node=8 \
42
+ trainer.nnodes=1 \
43
+ trainer.save_freq=20 \
44
+ trainer.test_freq=5 \
45
+ trainer.total_epochs=100 $@
code/RL_model/verl/verl_train/examples/ppo_trainer/run_deepseek_math_gsm8k_megatron.sh ADDED
@@ -0,0 +1,49 @@
1
+ set -x
2
+
3
+ # Example runnable on H20 * 8
4
+
5
+ export CUDA_DEVICE_MAX_CONNECTIONS=1 # For megatron communication/computation overlapping
6
+
7
+ gsm8k_train_path=$HOME/data/gsm8k/train.parquet
8
+ gsm8k_test_path=$HOME/data/gsm8k/test.parquet
9
+ math_train_path=$HOME/data/math/train.parquet
10
+ math_test_path=$HOME/data/math/test.parquet
11
+
12
+ train_files="['$gsm8k_train_path', '$math_train_path']"
13
+ test_files="['$gsm8k_test_path', '$math_test_path']"
14
+
15
+ python3 -m verl.trainer.main_ppo --config-path=./config --config-name='ppo_megatron_trainer'\
16
+ algorithm.adv_estimator=gae \
17
+ data.train_files="$train_files" \
18
+ data.val_files="$test_files" \
19
+ data.train_batch_size=1024 \
20
+ data.max_prompt_length=1024 \
21
+ data.max_response_length=512 \
22
+ data.filter_overlong_prompts=True \
23
+ data.truncation='error' \
24
+ actor_rollout_ref.model.path=deepseek-ai/deepseek-llm-7b-chat \
25
+ actor_rollout_ref.actor.optim.lr=1e-6 \
26
+ actor_rollout_ref.actor.ppo_mini_batch_size=256 \
27
+ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
28
+ actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=2 \
29
+ actor_rollout_ref.actor.megatron.tensor_model_parallel_size=2 \
30
+ actor_rollout_ref.actor.use_kl_loss=False \
31
+ actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
32
+ actor_rollout_ref.rollout.tensor_model_parallel_size=4 \
33
+ actor_rollout_ref.rollout.name=vllm \
34
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
35
+ actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=2 \
36
+ actor_rollout_ref.ref.megatron.tensor_model_parallel_size=2 \
37
+ critic.optim.lr=1e-5 \
38
+ critic.model.path=deepseek-ai/deepseek-llm-7b-chat \
39
+ critic.ppo_micro_batch_size_per_gpu=4 \
40
+ algorithm.use_kl_in_reward=False \
41
+ trainer.critic_warmup=0 \
42
+ trainer.logger='["console","wandb"]' \
43
+ trainer.project_name='verl_ppo_gsm8k_math_examples' \
44
+ trainer.experiment_name='deepseek_llm_7b_megatron' \
45
+ trainer.n_gpus_per_node=8 \
46
+ trainer.nnodes=1 \
47
+ trainer.save_freq=20 \
48
+ trainer.test_freq=5 \
49
+ trainer.total_epochs=100 $@
code/RL_model/verl/verl_train/examples/ppo_trainer/run_deepseek_math_gsm8k_megatron_nsys.sh ADDED
@@ -0,0 +1,65 @@
1
+ set -x
2
+
3
+ # Example runnable on H20 * 8
4
+
5
+ export CUDA_DEVICE_MAX_CONNECTIONS=1 # For megatron communication/computation overlapping
6
+
7
+ gsm8k_train_path=$HOME/data/gsm8k/train.parquet
8
+ gsm8k_test_path=$HOME/data/gsm8k/test.parquet
9
+ math_train_path=$HOME/data/math/train.parquet
10
+ math_test_path=$HOME/data/math/test.parquet
11
+
12
+ train_files=${train_files:-"$gsm8k_train_path"}
13
+ test_files=${test_files:-"$gsm8k_test_path"}
14
+
15
+ # Nsight profiling configuration
16
+ PROFILE_STEPS="[1]" # or [] or null
17
+ PROFILE_RANKS_ALL=False # or True
18
+ PROFILE_RANKS=[0,4]
19
+ DISCRETE=True # or False
20
+
21
+ python3 -m verl.trainer.main_ppo --config-path=./config --config-name='ppo_megatron_trainer'\
22
+ algorithm.adv_estimator=gae \
23
+ data.train_files="$train_files" \
24
+ data.val_files="$test_files" \
25
+ data.train_batch_size=256 \
26
+ data.max_prompt_length=1024 \
27
+ data.max_response_length=512 \
28
+ data.filter_overlong_prompts=True \
29
+ data.truncation='error' \
30
+ actor_rollout_ref.model.path=deepseek-ai/deepseek-llm-7b-chat \
31
+ actor_rollout_ref.actor.optim.lr=1e-6 \
32
+ actor_rollout_ref.actor.ppo_mini_batch_size=64 \
33
+ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
34
+ actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=2 \
35
+ actor_rollout_ref.actor.megatron.tensor_model_parallel_size=2 \
36
+ actor_rollout_ref.actor.use_kl_loss=False \
37
+ actor_rollout_ref.actor.profiler.enable=True \
38
+ actor_rollout_ref.actor.profiler.ranks=$PROFILE_RANKS \
39
+ actor_rollout_ref.actor.profiler.all_ranks=$PROFILE_RANKS_ALL \
40
+ actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
41
+ actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
42
+ actor_rollout_ref.rollout.name=vllm \
43
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.8 \
44
+ actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=2 \
45
+ actor_rollout_ref.ref.megatron.tensor_model_parallel_size=2 \
46
+ critic.optim.lr=1e-5 \
47
+ critic.model.path=deepseek-ai/deepseek-llm-7b-chat \
48
+ critic.ppo_micro_batch_size_per_gpu=4 \
49
+ critic.profiler.enable=True \
50
+ critic.profiler.ranks=$PROFILE_RANKS \
51
+ critic.profiler.all_ranks=$PROFILE_RANKS_ALL \
52
+ algorithm.use_kl_in_reward=False \
53
+ trainer.critic_warmup=0 \
54
+ trainer.logger='["console","wandb"]' \
55
+ trainer.project_name='verl_ppo_gsm8k_math_examples' \
56
+ trainer.experiment_name='deepseek_llm_7b_megatron' \
57
+ trainer.n_gpus_per_node=8 \
58
+ trainer.nnodes=1 \
59
+ trainer.save_freq=-1 \
60
+ trainer.test_freq=-1 \
61
+ trainer.total_epochs=100 \
62
+ trainer.total_training_steps=1 \
63
+ global_profiler.tool=nsys \
64
+ global_profiler.steps=$PROFILE_STEPS \
65
+ global_profiler.global_tool_config.nsys.discrete=$DISCRETE $@
code/RL_model/verl/verl_train/examples/ppo_trainer/run_gemma.sh ADDED
@@ -0,0 +1,40 @@
+ set -x
+
+ python3 -m verl.trainer.main_ppo \
+ algorithm.adv_estimator=gae \
+ data.train_files=$HOME/data/gsm8k/train.parquet \
+ data.val_files=$HOME/data/gsm8k/test.parquet \
+ data.train_batch_size=512 \
+ data.max_prompt_length=1024 \
+ data.max_response_length=512 \
+ data.filter_overlong_prompts=True \
+ data.truncation='error' \
+ actor_rollout_ref.model.path=google/gemma-2-2b-it \
+ actor_rollout_ref.actor.optim.lr=1e-6 \
+ actor_rollout_ref.model.use_remove_padding=False \
+ actor_rollout_ref.actor.ppo_mini_batch_size=128 \
+ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
+ actor_rollout_ref.actor.use_kl_loss=False \
+ actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
+ actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
+ actor_rollout_ref.rollout.name=vllm \
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
+ critic.optim.lr=1e-5 \
+ critic.model.use_remove_padding=False \
+ critic.model.path=google/gemma-2-2b-it \
+ critic.model.enable_gradient_checkpointing=False \
+ critic.ppo_micro_batch_size_per_gpu=4 \
+ critic.model.fsdp_config.param_offload=False \
+ critic.model.fsdp_config.optimizer_offload=False \
+ algorithm.use_kl_in_reward=False \
+ trainer.critic_warmup=0 \
+ trainer.logger='["console","wandb"]' \
+ trainer.project_name='verl_example' \
+ trainer.experiment_name='gemma2b_function_rm' \
+ trainer.n_gpus_per_node=2 \
+ trainer.nnodes=1 \
+ trainer.save_freq=20 \
+ trainer.test_freq=10 \
+ trainer.total_epochs=15 $@
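On sizing the batch knobs above: ppo_mini_batch_size is a global batch that is sharded across data-parallel ranks and then split into micro-batches, so the per-rank gradient-accumulation depth falls out of the three numbers. A minimal sanity-check sketch under that assumption:

    NNODES=1; NGPUS=2; MINI=128; MICRO=4
    DP=$((NNODES * NGPUS))
    echo "grad accumulation steps per rank: $((MINI / (MICRO * DP)))"   # -> 16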
code/RL_model/verl/verl_train/examples/ppo_trainer/run_moonlight16b_a3b_gsm8k_megatron.sh ADDED
@@ -0,0 +1,106 @@
+ set -x
+
+ export CUDA_DEVICE_MAX_CONNECTIONS=1 # For megatron communication/computation overlapping
+
+
+ # 0. download the model
+ hf download moonshotai/Moonlight-16B-A3B-Instruct
+
+ # 1. convert the model to mcore format
+ # change the HF_MODEL_PATH and DIST_CKPT_PATH to your own path
+ HF_MODEL_PATH=/data/models/moonshotai/Moonlight-16B-A3B-Instruct
+ DIST_CKPT_PATH=/data/mcore_ckpt/Moonlight-16B-A3B-Instruct
+ python scripts/converter_hf_to_mcore.py --hf_model_path $HF_MODEL_PATH --output_path $DIST_CKPT_PATH
+
+
+ # 2. run the script
+ gsm8k_train_path=$HOME/data/gsm8k/train.parquet
+ gsm8k_test_path=$HOME/data/gsm8k/test.parquet
+ train_files=$gsm8k_train_path
+ test_files=$gsm8k_test_path
+
+ ALL_OFFLOAD=${ALL_OFFLOAD:-False}
+ COMMON_PARAM_OFFLOAD=${COMMON_PARAM_OFFLOAD:-$ALL_OFFLOAD}
+ COMMON_GRAD_OFFLOAD=${COMMON_GRAD_OFFLOAD:-$ALL_OFFLOAD}
+ COMMON_OPTIMIZER_OFFLOAD=${COMMON_OPTIMIZER_OFFLOAD:-$ALL_OFFLOAD}
+
+ ACTOR_PARAM_OFFLOAD=${ACTOR_PARAM_OFFLOAD:-$COMMON_PARAM_OFFLOAD}
+ ACTOR_GRAD_OFFLOAD=${ACTOR_GRAD_OFFLOAD:-$COMMON_GRAD_OFFLOAD}
+ ACTOR_OPTIMIZER_OFFLOAD=${ACTOR_OPTIMIZER_OFFLOAD:-$COMMON_OPTIMIZER_OFFLOAD}
+ REF_PARAM_OFFLOAD=${REF_PARAM_OFFLOAD:-$COMMON_PARAM_OFFLOAD}
+ CRITIC_PARAM_OFFLOAD=${CRITIC_PARAM_OFFLOAD:-$COMMON_PARAM_OFFLOAD}
+ CRITIC_GRAD_OFFLOAD=${CRITIC_GRAD_OFFLOAD:-$COMMON_GRAD_OFFLOAD}
+ CRITIC_OPTIMIZER_OFFLOAD=${CRITIC_OPTIMIZER_OFFLOAD:-$COMMON_OPTIMIZER_OFFLOAD}
+ RM_PARAM_OFFLOAD=${RM_PARAM_OFFLOAD:-$COMMON_PARAM_OFFLOAD}
+
+
+ NODES=4
+ PP=2
+ TP=8
+ EP=8
+ ETP=1
+ VLLM_TP=4
+
+ # RAY_ADDRESS='auto' ray job submit --working-dir . --
+ python3 -m verl.trainer.main_ppo --config-path=./config --config-name='ppo_megatron_trainer' \
+ algorithm.adv_estimator=gae \
+ data.train_files="$train_files" \
+ data.val_files="$test_files" \
+ data.train_batch_size=1024 \
+ data.max_prompt_length=1024 \
+ data.max_response_length=512 \
+ data.filter_overlong_prompts=True \
+ data.truncation='error' \
+ data.trust_remote_code=True \
+ actor_rollout_ref.model.path=$HF_MODEL_PATH \
+ actor_rollout_ref.actor.optim.lr=1e-6 \
+ actor_rollout_ref.actor.ppo_mini_batch_size=256 \
+ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
+ actor_rollout_ref.actor.use_kl_loss=False \
+ actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2 \
+ actor_rollout_ref.rollout.name=vllm \
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.7 \
+ critic.optim.lr=1e-5 \
+ critic.model.path=$HF_MODEL_PATH \
+ critic.ppo_micro_batch_size_per_gpu=4 \
+ algorithm.use_kl_in_reward=False \
+ trainer.critic_warmup=0 \
+ trainer.logger='["console","wandb"]' \
+ trainer.project_name='verl_megatron_gsm8k_examples' \
+ trainer.experiment_name='moonlight_16b_a3b_instruct_4nodes' \
+ trainer.n_gpus_per_node=8 \
+ trainer.nnodes=$NODES \
+ trainer.save_freq=-1 \
+ trainer.test_freq=5 \
+ actor_rollout_ref.model.trust_remote_code=True \
+ critic.model.trust_remote_code=True \
+ +actor_rollout_ref.actor.megatron.override_transformer_config.num_layers_in_last_pipeline_stage=13 \
+ actor_rollout_ref.rollout.tensor_model_parallel_size=$VLLM_TP \
+ actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=$PP \
+ actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=$PP \
+ critic.megatron.pipeline_model_parallel_size=$PP \
+ actor_rollout_ref.actor.megatron.tensor_model_parallel_size=$TP \
+ actor_rollout_ref.ref.megatron.tensor_model_parallel_size=$TP \
+ critic.megatron.tensor_model_parallel_size=$TP \
+ actor_rollout_ref.actor.megatron.expert_model_parallel_size=$EP \
+ actor_rollout_ref.ref.megatron.expert_model_parallel_size=$EP \
+ critic.megatron.expert_model_parallel_size=$EP \
+ actor_rollout_ref.actor.megatron.expert_tensor_parallel_size=$ETP \
+ actor_rollout_ref.ref.megatron.expert_tensor_parallel_size=$ETP \
+ critic.megatron.expert_tensor_parallel_size=$ETP \
+ actor_rollout_ref.actor.megatron.param_offload=${ACTOR_PARAM_OFFLOAD} \
+ actor_rollout_ref.actor.megatron.optimizer_offload=${ACTOR_OPTIMIZER_OFFLOAD} \
+ actor_rollout_ref.actor.megatron.grad_offload=${ACTOR_GRAD_OFFLOAD} \
+ actor_rollout_ref.ref.megatron.param_offload=${REF_PARAM_OFFLOAD} \
+ critic.megatron.param_offload=${CRITIC_PARAM_OFFLOAD} \
+ critic.megatron.optimizer_offload=${CRITIC_OPTIMIZER_OFFLOAD} \
+ critic.megatron.grad_offload=${CRITIC_GRAD_OFFLOAD} \
+ actor_rollout_ref.actor.megatron.use_dist_checkpointing=True \
+ actor_rollout_ref.ref.megatron.use_dist_checkpointing=True \
+ critic.megatron.use_dist_checkpointing=True \
+ actor_rollout_ref.actor.megatron.dist_checkpointing_path=$DIST_CKPT_PATH \
+ actor_rollout_ref.ref.megatron.dist_checkpointing_path=$DIST_CKPT_PATH \
+ critic.megatron.dist_checkpointing_path=$DIST_CKPT_PATH \
+ trainer.val_before_train=False \
+ trainer.total_epochs=100 $@
+
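Because every offload knob in this script defaults through ${VAR:-default}, the whole cascade can be flipped from the environment without editing the file, e.g.:

    # offload params, grads and optimizer state for all roles
    ALL_OFFLOAD=True bash run_moonlight16b_a3b_gsm8k_megatron.sh

    # or offload only the critic's optimizer state
    CRITIC_OPTIMIZER_OFFLOAD=True bash run_moonlight16b_a3b_gsm8k_megatron.sh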
code/RL_model/verl/verl_train/examples/ppo_trainer/run_qwen1.5_moe_a2.7b-gsm8k_megatron.sh ADDED
@@ -0,0 +1,73 @@
+ set -x
+
+ export CUDA_DEVICE_MAX_CONNECTIONS=1 # For megatron communication/computation overlapping
+
+ # 0. download the model
+ # hf download Qwen/Qwen1.5-MoE-A2.7B-Chat
+
+ # 1. convert the model to mcore format
+ # change the HF_MODEL_PATH and DIST_CKPT_PATH to your own path
+ HF_MODEL_PATH=/data/models/Qwen/Qwen1.5-MoE-A2.7B-Chat
+ DIST_CKPT_PATH=/data/mcore_ckpt/Qwen1.5-MoE-A2.7B-Chat
+ python scripts/converter_hf_to_mcore.py --hf_model_path $HF_MODEL_PATH --output_path $DIST_CKPT_PATH
+
+ # 2. run the script
+ gsm8k_train_path=$HOME/data/gsm8k/train.parquet
+ gsm8k_test_path=$HOME/data/gsm8k/test.parquet
+ train_files=$gsm8k_train_path
+ test_files=$gsm8k_test_path
+
+ NODES=4
+ PP=2
+ TP=4
+ CP=1
+ VLLM_TP=4
+
+ # RAY_ADDRESS='auto' ray job submit --working-dir . --
+ python3 -m verl.trainer.main_ppo --config-path=./config --config-name='ppo_megatron_trainer' \
+ algorithm.adv_estimator=gae \
+ data.train_files="$train_files" \
+ data.val_files="$test_files" \
+ data.train_batch_size=1024 \
+ data.max_prompt_length=1024 \
+ data.max_response_length=512 \
+ data.filter_overlong_prompts=True \
+ data.truncation='error' \
+ actor_rollout_ref.model.path=$HF_MODEL_PATH \
+ actor_rollout_ref.actor.optim.lr=1e-6 \
+ actor_rollout_ref.actor.ppo_mini_batch_size=256 \
+ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
+ actor_rollout_ref.actor.use_kl_loss=False \
+ actor_rollout_ref.actor.megatron.tensor_model_parallel_size=$TP \
+ actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=$PP \
+ actor_rollout_ref.actor.megatron.context_parallel_size=$CP \
+ actor_rollout_ref.actor.megatron.use_dist_checkpointing=True \
+ actor_rollout_ref.actor.megatron.dist_checkpointing_path=$DIST_CKPT_PATH \
+ actor_rollout_ref.ref.megatron.tensor_model_parallel_size=$TP \
+ actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=$PP \
+ actor_rollout_ref.ref.megatron.context_parallel_size=$CP \
+ actor_rollout_ref.ref.megatron.use_dist_checkpointing=True \
+ actor_rollout_ref.ref.megatron.dist_checkpointing_path=$DIST_CKPT_PATH \
+ actor_rollout_ref.rollout.name=vllm \
+ actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2 \
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.7 \
+ actor_rollout_ref.rollout.tensor_model_parallel_size=$VLLM_TP \
+ critic.optim.lr=1e-5 \
+ critic.model.path=$HF_MODEL_PATH \
+ critic.ppo_micro_batch_size_per_gpu=4 \
+ critic.megatron.tensor_model_parallel_size=$TP \
+ critic.megatron.pipeline_model_parallel_size=$PP \
+ critic.megatron.context_parallel_size=$CP \
+ critic.megatron.use_dist_checkpointing=True \
+ critic.megatron.dist_checkpointing_path=$DIST_CKPT_PATH \
+ algorithm.use_kl_in_reward=False \
+ trainer.critic_warmup=0 \
+ trainer.logger='["console","wandb"]' \
+ trainer.project_name='verl_megatron_gsm8k_examples' \
+ trainer.experiment_name='qwen1.5_moe_nochat' \
+ trainer.n_gpus_per_node=8 \
+ trainer.nnodes=$NODES \
+ trainer.save_freq=20 \
+ trainer.test_freq=5 \
+ trainer.total_epochs=100 $@
+
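With NODES=4 and 8 GPUs per node, the training world size is 32, and TP * PP * CP = 8 must divide it; the remainder becomes data parallelism. A quick check, assuming the standard Megatron decomposition DP = world_size / (TP * PP * CP):

    NODES=4; GPUS_PER_NODE=8; TP=4; PP=2; CP=1
    WORLD=$((NODES * GPUS_PER_NODE))
    echo "data-parallel size: $((WORLD / (TP * PP * CP)))"   # -> 4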
code/RL_model/verl/verl_train/examples/ppo_trainer/run_qwen2-7b_math_gsm8k_megatron.sh ADDED
@@ -0,0 +1,47 @@
+ set -x
+
+ export CUDA_DEVICE_MAX_CONNECTIONS=1 # For megatron communication/computation overlapping
+
+ gsm8k_train_path=$HOME/data/gsm8k/train.parquet
+ gsm8k_test_path=$HOME/data/gsm8k/test.parquet
+ math_train_path=$HOME/data/math/train.parquet
+ math_test_path=$HOME/data/math/test.parquet
+
+ train_files="['$gsm8k_train_path', '$math_train_path']"
+ test_files="['$gsm8k_test_path', '$math_test_path']"
+
+ python3 -m verl.trainer.main_ppo --config-path=./config --config-name='ppo_megatron_trainer' \
+ algorithm.adv_estimator=gae \
+ data.train_files="$train_files" \
+ data.val_files="$test_files" \
+ data.train_batch_size=1024 \
+ data.max_prompt_length=1024 \
+ data.max_response_length=512 \
+ data.filter_overlong_prompts=True \
+ data.truncation='error' \
+ actor_rollout_ref.model.path=Qwen/Qwen2-7B-Instruct \
+ actor_rollout_ref.actor.optim.lr=1e-6 \
+ actor_rollout_ref.actor.ppo_mini_batch_size=256 \
+ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
+ actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=2 \
+ actor_rollout_ref.actor.megatron.tensor_model_parallel_size=2 \
+ actor_rollout_ref.actor.use_kl_loss=False \
+ actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
+ actor_rollout_ref.rollout.tensor_model_parallel_size=4 \
+ actor_rollout_ref.rollout.name=vllm \
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
+ actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=2 \
+ actor_rollout_ref.ref.megatron.tensor_model_parallel_size=2 \
+ critic.optim.lr=1e-5 \
+ critic.model.path=Qwen/Qwen2-7B-Instruct \
+ critic.ppo_micro_batch_size_per_gpu=4 \
+ algorithm.use_kl_in_reward=False \
+ trainer.critic_warmup=0 \
+ trainer.logger='["console","wandb"]' \
+ trainer.project_name='verl_ppo_gsm8k_math_examples' \
+ trainer.experiment_name='qwen2_7b_megatron' \
+ trainer.n_gpus_per_node=8 \
+ trainer.nnodes=1 \
+ trainer.save_freq=20 \
+ trainer.test_freq=5 \
+ trainer.total_epochs=100 $@
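data.train_files and data.val_files accept a stringified list, so mixing in another dataset only requires extending it. A sketch with a hypothetical extra parquet path:

    custom_train_path=$HOME/data/custom/train.parquet
    train_files="['$gsm8k_train_path', '$math_train_path', '$custom_train_path']"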
code/RL_model/verl/verl_train/examples/ppo_trainer/run_qwen2-7b_rm.sh ADDED
@@ -0,0 +1,75 @@
+ # Disclaimer: the model used in this script is for academic purposes only.
+ set -x
+
+ # Data preparation scripts are available in ``examples/data_preprocess``.
+ # Example usage:
+ #
+ # python3 examples/data_preprocess/math_dataset.py --local_dir ~/data/math
+ # python3 examples/data_preprocess/gsm8k.py --local_save_dir ~/data/gsm8k
+
+ gsm8k_train_path=$HOME/data/gsm8k/train.parquet
+ gsm8k_test_path=$HOME/data/gsm8k/test.parquet
+ math_train_path=$HOME/data/math/train.parquet
+ math_test_path=$HOME/data/math/test.parquet
+
+ train_files="['$gsm8k_train_path', '$math_train_path']"
+ test_files="['$gsm8k_test_path', '$math_test_path']"
+
+
+ # prepare model ckpt
+ hf download Qwen/Qwen2-7B-Instruct --local-dir $HOME/models/Qwen2-7B-Instruct &
+ hf download sfairXC/FsfairX-LLaMA3-RM-v0.1 --local-dir $HOME/models/FsfairX-LLaMA3-RM-v0.1 &
+ wait
+
+ python3 -m verl.trainer.main_ppo \
+ algorithm.adv_estimator=gae \
+ data.train_files="$train_files" \
+ data.val_files="$test_files" \
+ data.train_batch_size=1024 \
+ data.max_prompt_length=1024 \
+ data.max_response_length=512 \
+ data.filter_overlong_prompts=True \
+ data.truncation='error' \
+ data.return_raw_chat=True \
+ actor_rollout_ref.model.path="$HOME/models/Qwen2-7B-Instruct" \
+ actor_rollout_ref.actor.optim.lr=1e-6 \
+ actor_rollout_ref.model.use_remove_padding=True \
+ actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.1 \
+ actor_rollout_ref.actor.ppo_mini_batch_size=256 \
+ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \
+ actor_rollout_ref.actor.use_kl_loss=False \
+ actor_rollout_ref.model.enable_gradient_checkpointing=True \
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
+ actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
+ actor_rollout_ref.rollout.name=vllm \
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
+ critic.optim.lr=1e-5 \
+ critic.model.use_remove_padding=True \
+ critic.optim.lr_warmup_steps_ratio=0.05 \
+ critic.model.path="$HOME/models/Qwen2-7B-Instruct" \
+ critic.model.enable_gradient_checkpointing=True \
+ critic.ppo_micro_batch_size_per_gpu=32 \
+ critic.model.fsdp_config.param_offload=False \
+ critic.model.fsdp_config.optimizer_offload=False \
+ reward_model.enable=True \
+ reward_model.model.path="$HOME/models/FsfairX-LLaMA3-RM-v0.1" \
+ reward_model.use_reward_loop=True \
+ reward_model.rollout.name=vllm \
+ reward_model.rollout.gpu_memory_utilization=0.8 \
+ reward_model.rollout.tensor_model_parallel_size=1 \
+ reward_model.rollout.prompt_length=2048 \
+ reward_model.rollout.response_length=1024 \
+ reward_model.num_workers=8 \
+ algorithm.use_kl_in_reward=False \
+ trainer.critic_warmup=0 \
+ trainer.logger='["console","wandb"]' \
+ trainer.project_name='verl_example' \
+ trainer.val_before_train=False \
+ trainer.experiment_name='Qwen2-7B-Instruct_hybrid_rm' \
+ trainer.n_gpus_per_node=8 \
+ trainer.nnodes=1 \
+ trainer.save_freq=20 \
+ trainer.test_freq=5 \
+ trainer.total_epochs=15 $@
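Note that the reward model's rollout window (prompt_length=2048, response_length=1024) covers the policy's data.max_prompt_length + data.max_response_length (1024 + 512), presumably so complete prompt-plus-response trajectories fit when scored. A quick arithmetic check under that assumption:

    MAX_PROMPT=1024; MAX_RESP=512; RM_PROMPT=2048
    (( RM_PROMPT >= MAX_PROMPT + MAX_RESP )) && echo "RM prompt window covers policy outputs"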
code/RL_model/verl/verl_train/examples/ppo_trainer/run_qwen2-7b_rm_legacy.sh ADDED
@@ -0,0 +1,63 @@
+ # download datasets and models
+ # python3 examples/data_preprocess/gsm8k.py
+ # python3 examples/data_preprocess/math_dataset.py
+ # hf download Skywork/Skywork-Reward-V2-Llama-3.2-3B --local-dir $HOME/models/Skywork-Reward-V2-Llama-3.2-3B
+ # hf download Qwen/Qwen2.5-3B-Instruct --local-dir $HOME/models/Qwen2.5-3B-Instruct
+
+ gsm8k_train_path=$HOME/data/gsm8k/train.parquet
+ gsm8k_test_path=$HOME/data/gsm8k/test.parquet
+ math_train_path=$HOME/data/math/train.parquet
+ math_test_path=$HOME/data/math/test.parquet
+
+ train_files="['$gsm8k_train_path', '$math_train_path']"
+ test_files="['$gsm8k_test_path', '$math_test_path']"
+
+ python3 -m verl.trainer.main_ppo \
+ algorithm.adv_estimator=gae \
+ data.train_files="$train_files" \
+ data.val_files="$test_files" \
+ data.train_batch_size=1024 \
+ data.max_prompt_length=1024 \
+ data.max_response_length=2048 \
+ data.filter_overlong_prompts=True \
+ data.truncation='error' \
+ data.return_raw_chat=True \
+ actor_rollout_ref.model.path="$HOME/models/Qwen2.5-3B-Instruct" \
+ actor_rollout_ref.actor.optim.lr=1e-6 \
+ actor_rollout_ref.model.use_remove_padding=True \
+ actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.1 \
+ actor_rollout_ref.actor.ppo_mini_batch_size=256 \
+ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \
+ actor_rollout_ref.actor.use_kl_loss=False \
+ actor_rollout_ref.model.enable_gradient_checkpointing=True \
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
+ actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
+ actor_rollout_ref.rollout.name=vllm \
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
+ critic.optim.lr=1e-5 \
+ critic.model.use_remove_padding=True \
+ critic.optim.lr_warmup_steps_ratio=0.05 \
+ critic.model.path="$HOME/models/Qwen2.5-3B-Instruct" \
+ critic.model.enable_gradient_checkpointing=True \
+ critic.ppo_micro_batch_size_per_gpu=32 \
+ critic.model.fsdp_config.param_offload=False \
+ critic.model.fsdp_config.optimizer_offload=False \
+ reward_model.enable=True \
+ reward_model.model.path="$HOME/models/Skywork-Reward-V2-Llama-3.2-3B" \
+ reward_model.use_reward_loop=False \
+ reward_model.model.use_remove_padding=True \
+ reward_model.model.fsdp_config.param_offload=True \
+ reward_model.micro_batch_size_per_gpu=32 \
+ algorithm.use_kl_in_reward=False \
+ trainer.critic_warmup=0 \
+ trainer.logger='["console","wandb"]' \
+ trainer.project_name='verl_test_qwen25_rm' \
+ trainer.val_before_train=True \
+ trainer.experiment_name='legacy_fsdp_reward_model' \
+ trainer.n_gpus_per_node=8 \
+ trainer.nnodes=1 \
+ trainer.save_freq=-1 \
+ trainer.test_freq=10 \
+ trainer.total_epochs=15 $@
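As with the other examples, trailing overrides can reconfigure the run at launch; a hypothetical invocation that swaps in a different sequence-classification reward model and halves the RM scoring batch (both values illustrative):

    bash run_qwen2-7b_rm_legacy.sh \
        reward_model.model.path="$HOME/models/another-rm" \
        reward_model.micro_batch_size_per_gpu=16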
code/RL_model/verl/verl_train/examples/ppo_trainer/run_qwen2-7b_rm_reward_loop_colocate.sh ADDED
@@ -0,0 +1,69 @@
+ # download datasets and models
+ # python3 examples/data_preprocess/gsm8k.py
+ # python3 examples/data_preprocess/math_dataset.py
+ # hf download Skywork/Skywork-Reward-V2-Llama-3.2-3B --local-dir $HOME/models/Skywork-Reward-V2-Llama-3.2-3B
+ # hf download Qwen/Qwen2.5-3B-Instruct --local-dir $HOME/models/Qwen2.5-3B-Instruct
+
+ gsm8k_train_path=$HOME/data/gsm8k/train.parquet
+ gsm8k_test_path=$HOME/data/gsm8k/test.parquet
+ math_train_path=$HOME/data/math/train.parquet
+ math_test_path=$HOME/data/math/test.parquet
+
+ train_files="['$gsm8k_train_path', '$math_train_path']"
+ test_files="['$gsm8k_test_path', '$math_test_path']"
+
+ python3 -m verl.trainer.main_ppo \
+ algorithm.adv_estimator=gae \
+ data.train_files="$train_files" \
+ data.val_files="$test_files" \
+ data.train_batch_size=1024 \
+ data.max_prompt_length=1024 \
+ data.max_response_length=2048 \
+ data.filter_overlong_prompts=True \
+ data.truncation='error' \
+ data.return_raw_chat=True \
+ actor_rollout_ref.model.path="$HOME/models/Qwen2.5-3B-Instruct" \
+ actor_rollout_ref.actor.optim.lr=1e-6 \
+ actor_rollout_ref.model.use_remove_padding=True \
+ actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.1 \
+ actor_rollout_ref.actor.ppo_mini_batch_size=256 \
+ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \
+ actor_rollout_ref.actor.use_kl_loss=False \
+ actor_rollout_ref.model.enable_gradient_checkpointing=True \
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
+ actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
+ actor_rollout_ref.rollout.name=vllm \
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
+ critic.optim.lr=1e-5 \
+ critic.model.use_remove_padding=True \
+ critic.optim.lr_warmup_steps_ratio=0.05 \
+ critic.model.path="$HOME/models/Qwen2.5-3B-Instruct" \
+ critic.model.enable_gradient_checkpointing=True \
+ critic.ppo_micro_batch_size_per_gpu=32 \
+ critic.model.fsdp_config.param_offload=False \
+ critic.model.fsdp_config.optimizer_offload=False \
+ reward_model.enable=True \
+ reward_model.model.path="$HOME/models/Skywork-Reward-V2-Llama-3.2-3B" \
+ reward_model.use_reward_loop=True \
+ reward_model.rollout.name=vllm \
+ reward_model.rollout.gpu_memory_utilization=0.8 \
+ reward_model.rollout.prompt_length=4096 \
+ reward_model.rollout.response_length=4096 \
+ reward_model.rollout.tensor_model_parallel_size=1 \
+ reward_model.num_workers=8 \
+ reward_model.model.use_remove_padding=True \
+ reward_model.model.fsdp_config.param_offload=True \
+ reward_model.micro_batch_size_per_gpu=32 \
+ algorithm.use_kl_in_reward=False \
+ trainer.critic_warmup=0 \
+ trainer.logger='["console","wandb"]' \
+ trainer.project_name='verl_test_qwen25_rm' \
+ trainer.val_before_train=False \
+ trainer.experiment_name='reward_loop_colocate_reward_model' \
+ trainer.n_gpus_per_node=8 \
+ trainer.nnodes=1 \
+ trainer.save_freq=-1 \
+ trainer.test_freq=10 \
+ trainer.total_epochs=15 $@
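Compared with the legacy script above, the substantive difference in this variant is the reward-loop path: judging from the two configs, the reward model is apparently served through a colocated vllm rollout engine rather than the legacy FSDP forward pass. The switches, as they appear in the two scripts:

    reward_model.use_reward_loop=True \
    reward_model.rollout.name=vllm \
    reward_model.rollout.gpu_memory_utilization=0.8 \
    reward_model.rollout.prompt_length=4096 \
    reward_model.rollout.response_length=4096 \
    reward_model.num_workers=8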