shahidul034 commited on
Commit
d76c61c
·
verified ·
1 Parent(s): e0a9278

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. code/RL_model/verl/Search-R1/dataset/data_prep.py +88 -0
  2. code/RL_model/verl/Search-R1/dataset/prompt +58 -0
  3. code/RL_model/verl/Search-R1/outputs/2026-02-01/20-26-44/main_ppo.log +0 -0
  4. code/RL_model/verl/Search-R1/search_r1/__init__.py +0 -0
  5. code/RL_model/verl/Search-R1/verl.egg-info/PKG-INFO +507 -0
  6. code/RL_model/verl/Search-R1/verl.egg-info/dependency_links.txt +1 -0
  7. code/RL_model/verl/Search-R1/verl.egg-info/requires.txt +15 -0
  8. code/RL_model/verl/Search-R1/verl.egg-info/top_level.txt +2 -0
  9. code/RL_model/verl/Search-R1/verl/__init__.py +27 -0
  10. code/RL_model/verl/Search-R1/verl/protocol.py +639 -0
  11. code/RL_model/verl/Search-R1/wandb/debug-internal.log +6 -0
  12. code/RL_model/verl/Search-R1/wandb/debug.log +21 -0
  13. code/RL_model/verl/verl_train/tests/experimental/agent_loop/agent_utils.py +92 -0
  14. code/RL_model/verl/verl_train/tests/experimental/agent_loop/qwen_vl_tool_chat_template.jinja2 +150 -0
  15. code/RL_model/verl/verl_train/tests/experimental/agent_loop/test_basic_agent_loop.py +454 -0
  16. code/RL_model/verl/verl_train/tests/experimental/agent_loop/test_gpt_oss_tool_parser.py +34 -0
  17. code/RL_model/verl/verl_train/tests/experimental/agent_loop/test_multi_modal.py +570 -0
  18. code/RL_model/verl/verl_train/tests/experimental/agent_loop/test_standalone_rollout.py +157 -0
  19. code/RL_model/verl/verl_train/tests/experimental/reward_loop/test_agent_loop_reward_manager.py +111 -0
  20. code/RL_model/verl/verl_train/tests/experimental/reward_loop/test_agent_reward_loop_colocate.py +168 -0
  21. code/RL_model/verl/verl_train/tests/experimental/reward_loop/test_async_token_bucket_on_cpu.py +267 -0
  22. code/RL_model/verl/verl_train/tests/experimental/reward_loop/test_math_verify.py +100 -0
  23. code/RL_model/verl/verl_train/tests/experimental/reward_loop/test_rate_limited_reward_manager_on_cpu.py +528 -0
  24. code/RL_model/verl/verl_train/tests/experimental/reward_loop/test_reward_model_disrm.py +153 -0
  25. code/RL_model/verl/verl_train/tests/experimental/vla/test_sim_envs.py +101 -0
  26. code/RL_model/verl/verl_train/tests/single_controller/base/test_decorator.py +76 -0
  27. code/RL_model/verl/verl_train/tests/single_controller/check_worker_alive/main.py +64 -0
  28. code/RL_model/verl/verl_train/tests/single_controller/detached_worker/README.md +14 -0
  29. code/RL_model/verl/verl_train/tests/single_controller/detached_worker/client.py +56 -0
  30. code/RL_model/verl/verl_train/tests/single_controller/detached_worker/run.sh +5 -0
  31. code/RL_model/verl/verl_train/tests/single_controller/detached_worker/server.py +152 -0
  32. code/RL_model/verl/verl_train/tests/special_e2e/envs/__init__.py +17 -0
  33. code/RL_model/verl/verl_train/tests/special_e2e/envs/digit_completion/__init__.py +22 -0
  34. code/RL_model/verl/verl_train/tests/special_e2e/envs/digit_completion/task.py +179 -0
  35. code/RL_model/verl/verl_train/tests/special_e2e/envs/digit_completion/tokenizer.py +155 -0
  36. code/RL_model/verl/verl_train/tests/special_e2e/generation/run_gen_qwen05.sh +26 -0
  37. code/RL_model/verl/verl_train/tests/special_e2e/generation/run_gen_qwen05_server.sh +26 -0
  38. code/RL_model/verl/verl_train/tests/special_e2e/ppo_trainer/expert_parallel/qwen2moe_minimal.json +4 -0
  39. code/RL_model/verl/verl_train/tests/special_e2e/ppo_trainer/expert_parallel/qwen3moe_minimal.json +4 -0
  40. code/RL_model/verl/verl_train/tests/special_e2e/ppo_trainer/run_function_reward.sh +165 -0
  41. code/RL_model/verl/verl_train/tests/special_e2e/ppo_trainer/run_model_reward.sh +101 -0
  42. code/RL_model/verl/verl_train/tests/special_e2e/ppo_trainer/run_single_gpu.sh +24 -0
  43. code/RL_model/verl/verl_train/tests/special_e2e/ppo_trainer/run_single_gpu_with_engine.sh +25 -0
  44. code/RL_model/verl/verl_train/tests/special_e2e/sft/compare_sft_engine_results.py +58 -0
  45. code/RL_model/verl/verl_train/tests/special_e2e/sft/run_sft.sh +63 -0
  46. code/RL_model/verl/verl_train/tests/special_e2e/sft/run_sft_engine.sh +134 -0
  47. code/RL_model/verl/verl_train/tests/special_e2e/sft/test_sft_engine_all.sh +42 -0
  48. code/RL_model/verl/verl_train/tests/special_e2e/sft/test_sp_loss_match.py +150 -0
  49. code/RL_model/verl/verl_train/tests/trainer/config/__init__.py +13 -0
  50. code/RL_model/verl/verl_train/tests/utils/ckpt/test_checkpoint_cleanup_on_cpu.py +139 -0
code/RL_model/verl/Search-R1/dataset/data_prep.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import datasets
4
+ import argparse
5
+ from verl.utils.hdfs_io import copy, makedirs
6
+
7
# 1. Define the exact Prompt Template from your requirements
# NOTE(review): hard-coded absolute path — consider promoting this to a CLI
# argument. It also points at verl_train/dataset/prompt rather than the
# Search-R1/dataset/prompt shipped alongside this script — TODO confirm the
# intended template file.
# Explicit encoding avoids locale-dependent decoding of the template text.
with open("/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/prompt", 'r', encoding='utf-8') as f:
    PROMPT_TEMPLATE = f.read()
11
+
12
def make_map_fn(split, data_source):
    """Return a ``datasets.map`` callable converting one raw record to the verl schema.

    Args:
        split: Split name ('train' or 'test'); recorded in ``extra_info``.
        data_source: Dataset identifier recorded in the ``data_source`` field.

    Returns:
        A function ``(example, idx) -> dict`` emitting the fields verl expects:
        ``data_source``, ``prompt``, ``ability``, ``reward_model``, ``extra_info``.
    """
    def process_fn(example, idx):
        # Raw records carry ['id', 'fulltext', 'summary']; pop the two text
        # fields so they are not duplicated in the mapped output.
        full_text = example.pop('fulltext')
        gold_summary = example.pop('summary')

        # Keep the original .format() call: it unescapes the {{...}} JSON
        # skeleton in the template and fills any {field}-style slots.
        # Note: 'English' is the default source language for this *_en dataset.
        prompt_content = PROMPT_TEMPLATE.format(
            source_lang="English",
            gold_summary=gold_summary,
            full_text=full_text
        )

        # BUG FIX: the companion prompt file marks its insertion points with
        # <<<SOURCE_LANGUAGE>>> / <<<GOLD_SUMMARY>>> / <<<FULL_TEXT>>> sentinels,
        # which str.format(**kwargs) silently ignores — so the original code
        # never injected the texts into the prompt. Substitute the sentinels
        # explicitly after formatting (safe: the payload texts may contain
        # braces, so they must not pass through .format themselves).
        prompt_content = (
            prompt_content
            .replace("<<<SOURCE_LANGUAGE>>>", "English")
            .replace("<<<GOLD_SUMMARY>>>", gold_summary)
            .replace("<<<FULL_TEXT>>>", full_text)
        )

        return {
            "data_source": data_source,
            "prompt": [{
                "role": "user",
                "content": prompt_content
            }],
            "ability": "summarization",
            "reward_model": {
                "style": "rule",
                "ground_truth": gold_summary
            },
            "extra_info": {
                "split": split,
                "index": idx,
                # Fall back to the row index when the raw record lacks an 'id'.
                "original_id": example.get('id', idx)
            }
        }
    return process_fn
44
+
45
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    # Path to the raw input JSON (a list of {'id', 'fulltext', 'summary'} records).
    parser.add_argument('--input_path', default='/home/mshahidul/readctrl/data/processed_test_raw_data/multiclinsum_test_en.json')
    # Destination directory for the generated parquet files.
    parser.add_argument('--local_dir', default='/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset')
    args = parser.parse_args()

    data_source = 'multiclinsum'

    # Load the local JSON file; explicit encoding avoids locale-dependent decoding.
    with open(args.input_path, 'r', encoding='utf-8') as f:
        raw_data = json.load(f)

    # Convert to a HuggingFace Dataset.
    dataset = datasets.Dataset.from_list(raw_data)

    # Deterministic 95/5 train/test split (fixed seed for reproducibility).
    split_dataset = dataset.train_test_split(test_size=0.05, seed=42)

    # Apply the mapping transformation for each split.
    processed_train = split_dataset["train"].map(
        function=make_map_fn('train', data_source),
        with_indices=True
    )
    processed_test = split_dataset["test"].map(
        function=make_map_fn('test', data_source),
        with_indices=True
    )

    # Create the output directory if it doesn't exist.
    os.makedirs(args.local_dir, exist_ok=True)

    # Save to Parquet in the specified location.
    train_output_path = os.path.join(args.local_dir, 'train.parquet')
    test_output_path = os.path.join(args.local_dir, 'test.parquet')
    processed_train.to_parquet(train_output_path)
    processed_test.to_parquet(test_output_path)

    print("--- Dataset Preparation Complete ---")
    print(f"Train file saved to: {train_output_path}")
    print(f"Test file saved to: {test_output_path}")
    print(f"Total train records: {len(processed_train)}")
    print(f"Total test records: {len(processed_test)}")
code/RL_model/verl/Search-R1/dataset/prompt ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ **System Role:**
2
+
3
+ You are an expert medical editor and Health Literacy specialist. Your task is to transform complex medical text into three distinct versions based on the reader's health literacy level. You must maintain the source language of the input while adjusting the linguistic complexity. Use the provided Gold Summary as the factual anchor to ensure the simplified versions remain accurate and focused on the most important information.
4
+
5
+ **User Prompt:**
6
+
7
+ Please process the following medical Source Text and its corresponding Gold Summary to generate three versions tailored to different health literacy levels.
8
+ ### Instructions for Each Level:
9
+
10
+ 1. Level: Low Health Literacy (High Readability)
11
+
12
+ Target: Individuals needing the simplest terms for immediate action.
13
+
14
+ Linguistic Goal: Use "living room" language. Replace all medical jargon with functional descriptions (e.g., "renal" becomes "kidney").
15
+
16
+ Information Density: Focus strictly on the "need-to-know" info found in the Gold Summary.
17
+
18
+ Strategy: High paraphrasing using analogies. One idea per sentence.
19
+
20
+ Faithfulness: Must align perfectly with the Gold Summary.
21
+
22
+ 2. Level: Intermediate Health Literacy (Medium Readability)
23
+
24
+ Target: The general public (news-reading level).
25
+
26
+ Linguistic Goal: Standard vocabulary. Common medical terms are okay, but technical "doctor-speak" must be simplified.
27
+
28
+ Information Density: Balanced. Use the Gold Summary as the lead, supplemented by necessary context from the Source Text.
29
+
30
+ Strategy: Moderate paraphrasing. Remove minor technical details to avoid information overload.
31
+
32
+ Faithfulness: Maintains the main narrative of the Gold Summary.
33
+
34
+ 3. Level: Proficient Health Literacy (Low Readability)
35
+
36
+ Target: Researchers, clinicians, or highly informed patients.
37
+
38
+ Linguistic Goal: Technical and academic language. Prioritize clinical nuance and medical accuracy.
39
+
40
+ Information Density: High. Use the Full Source Text to include data, physiological mechanisms, and statistics.
41
+
42
+ Strategy: Minimal paraphrasing. Retain all original technical terminology.
43
+
44
+ Faithfulness: Adhere to the Source Text; you may add related subclaims that provide deeper scientific context.
45
+
46
+
47
+ I will provide the following information:
48
+
49
+ - Input Language: <<<SOURCE_LANGUAGE>>>
50
+ - Gold Summary (the anchor reference summary): <<<GOLD_SUMMARY>>>
51
+ - Source Text (detailed content): <<<FULL_TEXT>>>
52
+
53
+ **Output Format (JSON only):**
54
+ {{
55
+ "low_health_literacy": "...",
56
+ "intermediate_health_literacy": "...",
57
+ "proficient_health_literacy": "..."
58
+ }}
code/RL_model/verl/Search-R1/outputs/2026-02-01/20-26-44/main_ppo.log ADDED
File without changes
code/RL_model/verl/Search-R1/search_r1/__init__.py ADDED
File without changes
code/RL_model/verl/Search-R1/verl.egg-info/PKG-INFO ADDED
@@ -0,0 +1,507 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Metadata-Version: 2.4
2
+ Name: verl
3
+ Version: 0.1
4
+ Summary: veRL: Volcano Engine Reinforcement Learning for LLM
5
+ Home-page: https://github.com/volcengine/verl
6
+ Author: Bytedance - Seed - MLSys
7
+ Author-email: Bytedance - Seed - MLSys <zhangchi.usc1992@bytedance.com>, Bytedance - Seed - MLSys <gmsheng@connect.hku.hk>
8
+ License:
9
+ Apache License
10
+ Version 2.0, January 2004
11
+ http://www.apache.org/licenses/
12
+
13
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
14
+
15
+ 1. Definitions.
16
+
17
+ "License" shall mean the terms and conditions for use, reproduction,
18
+ and distribution as defined by Sections 1 through 9 of this document.
19
+
20
+ "Licensor" shall mean the copyright owner or entity authorized by
21
+ the copyright owner that is granting the License.
22
+
23
+ "Legal Entity" shall mean the union of the acting entity and all
24
+ other entities that control, are controlled by, or are under common
25
+ control with that entity. For the purposes of this definition,
26
+ "control" means (i) the power, direct or indirect, to cause the
27
+ direction or management of such entity, whether by contract or
28
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
29
+ outstanding shares, or (iii) beneficial ownership of such entity.
30
+
31
+ "You" (or "Your") shall mean an individual or Legal Entity
32
+ exercising permissions granted by this License.
33
+
34
+ "Source" form shall mean the preferred form for making modifications,
35
+ including but not limited to software source code, documentation
36
+ source, and configuration files.
37
+
38
+ "Object" form shall mean any form resulting from mechanical
39
+ transformation or translation of a Source form, including but
40
+ not limited to compiled object code, generated documentation,
41
+ and conversions to other media types.
42
+
43
+ "Work" shall mean the work of authorship, whether in Source or
44
+ Object form, made available under the License, as indicated by a
45
+ copyright notice that is included in or attached to the work
46
+ (an example is provided in the Appendix below).
47
+
48
+ "Derivative Works" shall mean any work, whether in Source or Object
49
+ form, that is based on (or derived from) the Work and for which the
50
+ editorial revisions, annotations, elaborations, or other modifications
51
+ represent, as a whole, an original work of authorship. For the purposes
52
+ of this License, Derivative Works shall not include works that remain
53
+ separable from, or merely link (or bind by name) to the interfaces of,
54
+ the Work and Derivative Works thereof.
55
+
56
+ "Contribution" shall mean any work of authorship, including
57
+ the original version of the Work and any modifications or additions
58
+ to that Work or Derivative Works thereof, that is intentionally
59
+ submitted to Licensor for inclusion in the Work by the copyright owner
60
+ or by an individual or Legal Entity authorized to submit on behalf of
61
+ the copyright owner. For the purposes of this definition, "submitted"
62
+ means any form of electronic, verbal, or written communication sent
63
+ to the Licensor or its representatives, including but not limited to
64
+ communication on electronic mailing lists, source code control systems,
65
+ and issue tracking systems that are managed by, or on behalf of, the
66
+ Licensor for the purpose of discussing and improving the Work, but
67
+ excluding communication that is conspicuously marked or otherwise
68
+ designated in writing by the copyright owner as "Not a Contribution."
69
+
70
+ "Contributor" shall mean Licensor and any individual or Legal Entity
71
+ on behalf of whom a Contribution has been received by Licensor and
72
+ subsequently incorporated within the Work.
73
+
74
+ 2. Grant of Copyright License. Subject to the terms and conditions of
75
+ this License, each Contributor hereby grants to You a perpetual,
76
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
77
+ copyright license to reproduce, prepare Derivative Works of,
78
+ publicly display, publicly perform, sublicense, and distribute the
79
+ Work and such Derivative Works in Source or Object form.
80
+
81
+ 3. Grant of Patent License. Subject to the terms and conditions of
82
+ this License, each Contributor hereby grants to You a perpetual,
83
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
84
+ (except as stated in this section) patent license to make, have made,
85
+ use, offer to sell, sell, import, and otherwise transfer the Work,
86
+ where such license applies only to those patent claims licensable
87
+ by such Contributor that are necessarily infringed by their
88
+ Contribution(s) alone or by combination of their Contribution(s)
89
+ with the Work to which such Contribution(s) was submitted. If You
90
+ institute patent litigation against any entity (including a
91
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
92
+ or a Contribution incorporated within the Work constitutes direct
93
+ or contributory patent infringement, then any patent licenses
94
+ granted to You under this License for that Work shall terminate
95
+ as of the date such litigation is filed.
96
+
97
+ 4. Redistribution. You may reproduce and distribute copies of the
98
+ Work or Derivative Works thereof in any medium, with or without
99
+ modifications, and in Source or Object form, provided that You
100
+ meet the following conditions:
101
+
102
+ (a) You must give any other recipients of the Work or
103
+ Derivative Works a copy of this License; and
104
+
105
+ (b) You must cause any modified files to carry prominent notices
106
+ stating that You changed the files; and
107
+
108
+ (c) You must retain, in the Source form of any Derivative Works
109
+ that You distribute, all copyright, patent, trademark, and
110
+ attribution notices from the Source form of the Work,
111
+ excluding those notices that do not pertain to any part of
112
+ the Derivative Works; and
113
+
114
+ (d) If the Work includes a "NOTICE" text file as part of its
115
+ distribution, then any Derivative Works that You distribute must
116
+ include a readable copy of the attribution notices contained
117
+ within such NOTICE file, excluding those notices that do not
118
+ pertain to any part of the Derivative Works, in at least one
119
+ of the following places: within a NOTICE text file distributed
120
+ as part of the Derivative Works; within the Source form or
121
+ documentation, if provided along with the Derivative Works; or,
122
+ within a display generated by the Derivative Works, if and
123
+ wherever such third-party notices normally appear. The contents
124
+ of the NOTICE file are for informational purposes only and
125
+ do not modify the License. You may add Your own attribution
126
+ notices within Derivative Works that You distribute, alongside
127
+ or as an addendum to the NOTICE text from the Work, provided
128
+ that such additional attribution notices cannot be construed
129
+ as modifying the License.
130
+
131
+ You may add Your own copyright statement to Your modifications and
132
+ may provide additional or different license terms and conditions
133
+ for use, reproduction, or distribution of Your modifications, or
134
+ for any such Derivative Works as a whole, provided Your use,
135
+ reproduction, and distribution of the Work otherwise complies with
136
+ the conditions stated in this License.
137
+
138
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
139
+ any Contribution intentionally submitted for inclusion in the Work
140
+ by You to the Licensor shall be under the terms and conditions of
141
+ this License, without any additional terms or conditions.
142
+ Notwithstanding the above, nothing herein shall supersede or modify
143
+ the terms of any separate license agreement you may have executed
144
+ with Licensor regarding such Contributions.
145
+
146
+ 6. Trademarks. This License does not grant permission to use the trade
147
+ names, trademarks, service marks, or product names of the Licensor,
148
+ except as required for reasonable and customary use in describing the
149
+ origin of the Work and reproducing the content of the NOTICE file.
150
+
151
+ 7. Disclaimer of Warranty. Unless required by applicable law or
152
+ agreed to in writing, Licensor provides the Work (and each
153
+ Contributor provides its Contributions) on an "AS IS" BASIS,
154
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
155
+ implied, including, without limitation, any warranties or conditions
156
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
157
+ PARTICULAR PURPOSE. You are solely responsible for determining the
158
+ appropriateness of using or redistributing the Work and assume any
159
+ risks associated with Your exercise of permissions under this License.
160
+
161
+ 8. Limitation of Liability. In no event and under no legal theory,
162
+ whether in tort (including negligence), contract, or otherwise,
163
+ unless required by applicable law (such as deliberate and grossly
164
+ negligent acts) or agreed to in writing, shall any Contributor be
165
+ liable to You for damages, including any direct, indirect, special,
166
+ incidental, or consequential damages of any character arising as a
167
+ result of this License or out of the use or inability to use the
168
+ Work (including but not limited to damages for loss of goodwill,
169
+ work stoppage, computer failure or malfunction, or any and all
170
+ other commercial damages or losses), even if such Contributor
171
+ has been advised of the possibility of such damages.
172
+
173
+ 9. Accepting Warranty or Additional Liability. While redistributing
174
+ the Work or Derivative Works thereof, You may choose to offer,
175
+ and charge a fee for, acceptance of support, warranty, indemnity,
176
+ or other liability obligations and/or rights consistent with this
177
+ License. However, in accepting such obligations, You may act only
178
+ on Your own behalf and on Your sole responsibility, not on behalf
179
+ of any other Contributor, and only if You agree to indemnify,
180
+ defend, and hold each Contributor harmless for any liability
181
+ incurred by, or claims asserted against, such Contributor by reason
182
+ of your accepting any such warranty or additional liability.
183
+
184
+ END OF TERMS AND CONDITIONS
185
+
186
+ APPENDIX: How to apply the Apache License to your work.
187
+
188
+ To apply the Apache License to your work, attach the following
189
+ boilerplate notice, with the fields enclosed by brackets "[]"
190
+ replaced with your own identifying information. (Don't include
191
+ the brackets!) The text should be enclosed in the appropriate
192
+ comment syntax for the file format. We also recommend that a
193
+ file or class name and description of purpose be included on the
194
+ same "printed page" as the copyright notice for easier
195
+ identification within third-party archives.
196
+
197
+ Copyright [yyyy] [name of copyright owner]
198
+
199
+ Licensed under the Apache License, Version 2.0 (the "License");
200
+ you may not use this file except in compliance with the License.
201
+ You may obtain a copy of the License at
202
+
203
+ http://www.apache.org/licenses/LICENSE-2.0
204
+
205
+ Unless required by applicable law or agreed to in writing, software
206
+ distributed under the License is distributed on an "AS IS" BASIS,
207
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
208
+ See the License for the specific language governing permissions and
209
+ limitations under the License.
210
+
211
+ Project-URL: Homepage, https://github.com/volcengine/verl
212
+ Requires-Python: >=3.8
213
+ Description-Content-Type: text/markdown
214
+ License-File: LICENSE
215
+ Requires-Dist: accelerate
216
+ Requires-Dist: codetiming
217
+ Requires-Dist: datasets
218
+ Requires-Dist: dill
219
+ Requires-Dist: hydra-core
220
+ Requires-Dist: numpy
221
+ Requires-Dist: pybind11
222
+ Requires-Dist: ray
223
+ Requires-Dist: tensordict
224
+ Requires-Dist: transformers<4.48
225
+ Requires-Dist: vllm<=0.6.3
226
+ Provides-Extra: test
227
+ Requires-Dist: pytest; extra == "test"
228
+ Requires-Dist: yapf; extra == "test"
229
+ Dynamic: author
230
+ Dynamic: home-page
231
+ Dynamic: license-file
232
+
233
+ # Search-R1: Train your LLMs to reason and call a search engine with reinforcement learning
234
+
235
+ <div align="center">
236
+ <img src="https://raw.githubusercontent.com/PeterGriffinJin/Search-R1/main/public/logo.png" alt="logo" width="300"/>
237
+ </div>
238
+
239
+ <p align="center">
240
+ <a href="https://arxiv.org/abs/2503.09516">
241
+ <img src="https://img.shields.io/badge/Paper1-blue?style=for-the-badge" alt="Button1"/>
242
+ </a>
243
+ <a href="https://arxiv.org/abs/2505.15117">
244
+ <img src="https://img.shields.io/badge/Paper2-green?style=for-the-badge" alt="Button2"/>
245
+ </a>
246
+ <a href="https://huggingface.co/collections/PeterJinGo/search-r1-67d1a021202731cb065740f5">
247
+ <img src="https://img.shields.io/badge/Resources-orange?style=for-the-badge" alt="Button3"/>
248
+ </a>
249
+ <a href="https://x.com/BowenJin13/status/1895544294473109889">
250
+ <img src="https://img.shields.io/badge/Tweet-red?style=for-the-badge" alt="Button4"/>
251
+ </a>
252
+ <a href="https://wandb.ai/peterjin/Search-R1-v0.2">
253
+ <img src="https://img.shields.io/badge/Logs-purple?style=for-the-badge" alt="Button5"/>
254
+ </a>
255
+ </p>
256
+
257
+
258
+ <!-- <strong>Search-R1</strong> is a reinforcement learning framework for <em>training reasoning and searching (tool-call) interleaved LLMs</em>. -->
259
+ <!-- We built upon [veRL](https://github.com/volcengine/verl). -->
260
+ **Search-R1** is a reinforcement learning framework designed for training **reasoning-and-searching interleaved LLMs**—language models that learn to reason and make tool calls (e.g., to search engines) in a coordinated manner.
261
+
262
+ <!-- It can be seen as an extension of <strong>DeepSeek-R1(-Zero)</strong> with interleaved search engine calling and an opensource RL training-based solution for <strong>OpenAI DeepResearch</strong>. -->
263
+ Built upon [veRL](https://github.com/volcengine/verl), Search-R1 extends the ideas of **DeepSeek-R1(-Zero)** by incorporating interleaved search engine access and provides a fully open-source RL training pipeline. It serves as an alternative and open solution to **OpenAI DeepResearch**, enabling research and development in tool-augmented LLM reasoning.
264
+
265
+ <!-- Through RL (rule-based outcome reward), the 3B **base** LLM (both Qwen2.5-3b-base and Llama3.2-3b-base) develops reasoning and search engine calling abilities all on its own. -->
266
+
267
+ We support different RL methods (e.g., PPO, GRPO, reinforce), different LLMs (e.g., llama3, Qwen2.5, etc) and different search engines (e.g., local sparse/dense retrievers and online search engines).
268
+
269
+ Paper: [link1](https://arxiv.org/pdf/2503.09516), [link2](https://arxiv.org/abs/2505.15117); Model and data: [link](https://huggingface.co/collections/PeterJinGo/search-r1-67d1a021202731cb065740f5); Twitter thread: [link](https://x.com/BowenJin13/status/1895544294473109889); Full experiment log: [prelim](https://wandb.ai/peterjin/Search-R1-open); [v0.1](https://wandb.ai/peterjin/Search-R1-nq_hotpotqa_train); [v0.2](https://wandb.ai/peterjin/Search-R1-v0.2); [v0.3](https://wandb.ai/peterjin/Search-R1-v0.3). Details about these logs and methods can be found [here](https://github.com/PeterGriffinJin/Search-R1/blob/main/docs/experiment_log.md).
270
+
271
+
272
+ ![single-turn](public/main.png)
273
+
274
+ ## News
275
+
276
+ - [2025.10] Search-R1 is featured by Thinking Machines Lab's first product [Tinker](https://github.com/thinking-machines-lab/tinker-cookbook)! Details: [Document](https://github.com/thinking-machines-lab/tinker-cookbook/tree/main/tinker_cookbook/recipes/tool_use/search).
277
+ - [2025.7] Search-R1 is supported by [SkyRL](https://github.com/NovaSky-AI/SkyRL)! Detailed instructions: [code](https://github.com/NovaSky-AI/SkyRL/tree/main/skyrl-train/examples/search), [Document](https://novasky-ai.notion.site/skyrl-searchr1).
278
+ - [2025.6] Search-R1 is now integrated into the latest version of veRL and can take advantage of its most up-to-date features! Detailed instructions: [veRL](https://verl.readthedocs.io/en/latest/sglang_multiturn/search_tool_example.html), [English Document](https://github.com/zhaochenyang20/Awesome-ML-SYS-Tutorial/blob/main/rlhf/verl/multi-turn/tool_examples/verl-multiturn-searchR1-like.md), [Chinese Document](https://github.com/zhaochenyang20/Awesome-ML-SYS-Tutorial/blob/main/rlhf/verl/multi-turn/tool_examples/verl-multiturn-searchR1-like_ZH.md).
279
+ - [2025.5] The second [paper](https://arxiv.org/abs/2505.15117) conducting detailed empirical studies is published with logs: [v0.3](https://wandb.ai/peterjin/Search-R1-v0.3).
280
+ - [2025.4] We support [multinode](https://github.com/PeterGriffinJin/Search-R1/blob/main/docs/multinode.md) training for 30B+ LLMs!
281
+ - [2025.4] We support [different search engines](https://github.com/PeterGriffinJin/Search-R1/blob/main/docs/retriever.md) including sparse local retriever, dense local retriever with ANN indexing and online search engines!
282
+ - [2025.3] The first Search-R1 [paper](https://arxiv.org/pdf/2503.09516) is published with the logs: [v0.1](https://wandb.ai/peterjin/Search-R1-nq_hotpotqa_train); [v0.2](https://wandb.ai/peterjin/Search-R1-v0.2).
283
+ - [2025.2] We opensource Search-R1 codebase with [preliminary results](https://wandb.ai/peterjin/Search-R1-open).
284
+
285
+ ## Links
286
+
287
+ - [Installation](#installation)
288
+ - [Quick start](#quick-start)
289
+ - [Preliminary results](#preliminary-results)
290
+ - [Inference](#inference)
291
+ - [Use your own dataset](#use-your-own-dataset)
292
+ - [Use your own search engine](#use-your-own-search-engine)
293
+ - [Features](#features)
294
+ - [Acknowledge](#acknowledge)
295
+ - [Citations](#citations)
296
+
297
+ ## Installation
298
+
299
+ ### Search-r1 environment
300
+ ```bash
301
+ conda create -n searchr1 python=3.9
302
+ conda activate searchr1
303
+ # install torch [or you can skip this step and let vllm to install the correct version for you]
304
+ pip install torch==2.4.0 --index-url https://download.pytorch.org/whl/cu121
305
+ # install vllm
306
+ pip3 install vllm==0.6.3 # or you can install 0.5.4, 0.4.2 and 0.3.1
307
+
308
+ # verl
309
+ pip install -e .
310
+
311
+ # flash attention 2
312
+ pip3 install flash-attn --no-build-isolation
313
+ pip install wandb
314
+ ```
315
+
316
+ ### Retriever environment (optional)
317
+ If you would like to call a local retriever as the search engine, you can install the environment as follows. (We recommend using a separate environment.)
318
+ ```bash
319
+ conda create -n retriever python=3.10
320
+ conda activate retriever
321
+
322
+ # we recommend installing torch with conda for faiss-gpu
323
+ conda install pytorch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 pytorch-cuda=12.1 -c pytorch -c nvidia
324
+ pip install transformers datasets pyserini
325
+
326
+ ## install the gpu version faiss to guarantee efficient RL rollout
327
+ conda install -c pytorch -c nvidia faiss-gpu=1.8.0
328
+
329
+ ## API function
330
+ pip install uvicorn fastapi
331
+ ```
332
+
333
+
334
+ ## Quick start
335
+
336
+ Train a reasoning + search LLM on NQ dataset with e5 as the retriever and wikipedia as the corpus.
337
+
338
+ (1) Download the indexing and corpus.
339
+ ```bash
340
+ save_path=/the/path/to/save
341
+ python scripts/download.py --save_path $save_path
342
+ cat $save_path/part_* > $save_path/e5_Flat.index
343
+ gzip -d $save_path/wiki-18.jsonl.gz
344
+ ```
345
+
346
+ (2) Process the NQ dataset.
347
+ ```bash
348
+ python scripts/data_process/nq_search.py
349
+ ```
350
+
351
+ (3) Launch a local retrieval server.
352
+ ```bash
353
+ conda activate retriever
354
+ bash retrieval_launch.sh
355
+ ```
356
+
357
+ (4) Run RL training (PPO) with Llama-3.2-3b-base.
358
+ ```bash
359
+ conda activate searchr1
360
+ bash train_ppo.sh
361
+ ```
362
+
363
+ ## Preliminary results
364
+
365
+ (1) The base model (llama3.2-3b-base) learns to call the search engine and obtain improved performance.
366
+
367
+ ![llama-3b](public/llama32-3b.png)
368
+
369
+
370
+ (2) The base model (Qwen2.5-7b-base) can learn to conduct multi-turn search engine calling and reasoning with RL.
371
+
372
+ ![multi-turn](public/multi-turn.png)
373
+
374
+ ## Inference
375
+ #### You can play with the trained Search-R1 model with your own question.
376
+ (1) Launch a local retrieval server.
377
+ ```bash
378
+ conda activate retriever
379
+ bash retrieval_launch.sh
380
+ ```
381
+
382
+ (2) Run inference.
383
+ ```bash
384
+ conda activate searchr1
385
+ python infer.py
386
+ ```
387
+ You can modify the ```question``` on line 7 to something you're interested in.
388
+
389
+ ## Use your own dataset
390
+
391
+ ### QA data
392
+ For each question-answer sample, it should be a dictionary containing the desired content as below:
393
+
394
+ ```
395
+ data = {
396
+ "data_source": data_source,
397
+ "prompt": [{
398
+ "role": "user",
399
+ "content": question,
400
+ }],
401
+ "ability": "fact-reasoning",
402
+ "reward_model": {
403
+ "style": "rule",
404
+ "ground_truth": solution
405
+ },
406
+ "extra_info": {
407
+ 'split': split,
408
+ 'index': idx,
409
+ }
410
+ }
411
+ ```
412
+
413
+ You can refer to ```scripts/data_process/nq_search.py``` for a concrete data processing example.
414
+
415
+ ### Corpora
416
+
417
+ It is recommended to make your corpus a jsonl file, where each line (a dictionary with "id" key and "contents" key) corresponds to one passage. You can refer to ```example/corpus.jsonl``` for an example.
418
+
419
+ The "id" key corresponds to the passage id, while the "contents" key corresponds to the passage content ('"' + title + '"\n' + text).
420
+ For example:
421
+ ```
422
+ {"id": "0", "contents": "Evan Morris Evan L. Morris (January 26, 1977 \u2013 July 9, 2015) was a lobbyist for Genentech and its parent corporation Roche in Washington."}
423
+ ...
424
+ {"id": "100", "contents": "Three years later, when the United States Exploring Expedition to little-known portions of the globe was organised under Charles Wilkes, Hale was recommended, while yet an undergraduate."}
425
+ ...
426
+ ```
427
+
428
+ **Index your corpora (optional).**
429
+ If you would like to use a local retriever as the search engine, you can index your own corpus by:
430
+ ```
431
+ bash search_r1/search/build_index.sh
432
+ ```
433
+ You can change ```retriever_name``` and ```retriever_model``` to your interested off-the-shelf retriever.
434
+
435
+ ## Use your own search engine
436
+
437
+ Our codebase supports local sparse retriever (e.g., BM25), local dense retriever (both flat indexing with GPUs and ANN indexing with CPUs) and online search engine (e.g., Google, Bing, etc). More details can be found [here](https://github.com/PeterGriffinJin/Search-R1/tree/main/docs/retriever.md).
438
+
439
+ The main philosophy is to launch a local or remote search engine server separately from the main RL training pipeline.
440
+
441
+ The LLM can call the search engine by calling the search API (e.g., "http://127.0.0.1:8000/retrieve").
442
+
443
+ You can refer to ```search_r1/search/retriever_server.py``` for an example of launching a local retriever server.
444
+
445
+ ## Features
446
+ - Support local sparse retrievers (e.g., BM25). ✔️
447
+ - Support local dense retrievers (both flat indexing and ANN indexing) ✔️
448
+ - Support google search / bing search / brave search API and others. ✔️
449
+ - Support off-the-shelf neural rerankers. ✔️
450
+ - Support different RL methods (e.g., PPO, GRPO, reinforce). ✔️
451
+ - Support different LLMs (e.g., llama3, Qwen2.5, etc). ✔️
452
+
453
+ ## Acknowledge
454
+
455
+ The concept of Search-R1 is inspired by [Deepseek-R1](https://github.com/deepseek-ai/DeepSeek-R1) and [TinyZero](https://github.com/Jiayi-Pan/TinyZero/tree/main).
456
+ Its implementation is built upon [veRL](https://github.com/volcengine/verl) and [RAGEN](https://github.com/ZihanWang314/RAGEN/tree/main).
457
+ We sincerely appreciate the efforts of these teams for their contributions to open-source research and development.
458
+
459
+ ## Awesome work powered or inspired by Search-R1
460
+
461
+ - [DeepResearcher](https://github.com/GAIR-NLP/DeepResearcher): Scaling Deep Research via Reinforcement Learning in Real-world Environments. [![[code]](https://img.shields.io/github/stars/GAIR-NLP/DeepResearcher)](https://github.com/GAIR-NLP/DeepResearcher)
462
+ - [Multimodal-Search-R1](https://github.com/EvolvingLMMs-Lab/multimodal-search-r1): Incentivizing LMMs to Search. [![[code]](https://img.shields.io/github/stars/EvolvingLMMs-Lab/multimodal-search-r1)](https://github.com/EvolvingLMMs-Lab/multimodal-search-r1)
463
+ - [OTC](https://arxiv.org/pdf/2504.14870): Optimal Tool Calls via Reinforcement Learning.
464
+ - [ZeroSearch](https://github.com/Alibaba-NLP/ZeroSearch): Incentivize the Search Capability of LLMs without Searching. [![[code]](https://img.shields.io/github/stars/Alibaba-NLP/ZeroSearch)](https://github.com/Alibaba-NLP/ZeroSearch)
465
+ - [IKEA](https://github.com/hzy312/knowledge-r1): Reinforced Internal-External Knowledge Synergistic Reasoning for Efficient Adaptive Search Agent. [![[code]](https://img.shields.io/github/stars/hzy312/knowledge-r1)](https://github.com/hzy312/knowledge-r1)
466
+ - [Scent of Knowledge](https://arxiv.org/abs/2505.09316): Optimizing Search-Enhanced Reasoning with Information Foraging.
467
+ - [AutoRefine](https://www.arxiv.org/pdf/2505.11277): Search and Refine During Think. [![[code]](https://img.shields.io/github/stars/syr-cn/AutoRefine)](https://github.com/syr-cn/AutoRefine)
468
+ - [O^2-Searcher](https://arxiv.org/pdf/2505.16582): A Searching-based Agent Model for Open-Domain Open-Ended Question Answering. [![[code]](https://img.shields.io/github/stars/Acade-Mate/O2-Searcher)](https://github.com/Acade-Mate/O2-Searcher)
469
+ - [MaskSearch](https://arxiv.org/pdf/2505.20285): A Universal Pre-Training Framework to Enhance Agentic Search Capability. [![[code]](https://img.shields.io/github/stars/Alibaba-NLP/MaskSearch)](https://github.com/Alibaba-NLP/MaskSearch)
470
+ - [VRAG-RL](https://arxiv.org/abs/2505.22019): Vision-Perception-Based RAG for Visually Rich Information Understanding. [![[code]](https://img.shields.io/github/stars/Alibaba-NLP/VRAG)](https://github.com/Alibaba-NLP/VRAG)
471
+ - [R1-Code-Interpreter](https://arxiv.org/abs/2505.21668): Training LLMs to Reason with Code via SFT and RL. [![[code]](https://img.shields.io/github/stars/yongchao98/R1-Code-Interpreter)](https://github.com/yongchao98/R1-Code-Interpreter)
472
+ - [R-Search](https://arxiv.org/abs/2506.04185): Empowering LLM Reasoning with Search via Multi-Reward Reinforcement Learning. [![[code]](https://img.shields.io/github/stars/QingFei1/R-Search)](https://github.com/QingFei1/R-Search)
473
+ - [StepSearch](https://arxiv.org/pdf/2505.15107): Igniting LLMs Search Ability via Step-Wise Proximal Policy Optimization. [![[code]](https://img.shields.io/github/stars/Zillwang/StepSearch)](https://github.com/Zillwang/StepSearch)
474
+ - [SimpleTIR](https://simpletir.notion.site/report): Stable End-to-End Reinforcement Learning for Multi-Turn Tool-Integrated Reasoning. [![[code]](https://img.shields.io/github/stars/ltzheng/SimpleTIR)](https://github.com/ltzheng/SimpleTIR)
475
+ - [Router-R1](https://arxiv.org/pdf/2506.09033): Teaching LLMs Multi-Round Routing and Aggregation via Reinforcement Learning. [![[code]](https://img.shields.io/github/stars/ulab-uiuc/Router-R1)](https://github.com/ulab-uiuc/Router-R1)
476
+ - [SkyRL](https://skyrl.readthedocs.io/en/latest/): A Modular Full-stack RL Library for LLMs. [![[code]](https://img.shields.io/github/stars/NovaSky-AI/SkyRL)](https://github.com/NovaSky-AI/SkyRL)
477
+ - [ASearcher](https://arxiv.org/abs/2508.07976): Large-Scale RL for Search Agents. [![[code]](https://img.shields.io/github/stars/inclusionAI/ASearcher)](https://github.com/inclusionAI/ASearcher)
478
+ - [ParallelSearch](https://www.arxiv.org/abs/2508.09303): Decompose Query and Search Sub-queries in Parallel with RL. [![[code]](https://img.shields.io/github/stars/Tree-Shu-Zhao/ParallelSearch)](https://github.com/Tree-Shu-Zhao/ParallelSearch)
479
+ - [AutoTIR](https://arxiv.org/pdf/2507.21836): Autonomous Tools Integrated Reasoning via Reinforcement Learning. [![[code]](https://img.shields.io/github/stars/weiyifan1023/AutoTIR)](https://github.com/weiyifan1023/AutoTIR)
480
+ - [verl-tool](https://arxiv.org/pdf/2509.01055): A version of verl to support diverse tool use. [![[code]](https://img.shields.io/github/stars/TIGER-AI-Lab/verl-tool)](https://github.com/TIGER-AI-Lab/verl-tool)
481
+ - [Tree-GRPO](https://arxiv.org/abs/2509.21240): Tree Search for LLM Agent Reinforcement Learning. [![[code]](https://img.shields.io/github/stars/AMAP-ML/Tree-GRPO)](https://github.com/AMAP-ML/Tree-GRPO)
482
+ - [EviNote-RAG](https://arxiv.org/abs/2509.00877): Enhancing RAG Models via Answer-Supportive Evidence Notes. [![[code]](https://img.shields.io/github/stars/Da1yuqin/EviNoteRAG)](https://github.com/Da1yuqin/EviNoteRAG)
483
+ - [GlobalRAG](https://arxiv.org/pdf/2510.20548v1): GlobalRAG: Enhancing Global Reasoning in Multi-hop Question Answering via Reinforcement Learning. [![[code]](https://img.shields.io/github/stars/CarnegieBin/GlobalRAG)](https://github.com/CarnegieBin/GlobalRAG)
484
+
485
+
486
+
487
+
488
+
489
+ ## Citations
490
+
491
+ ```bibtex
492
+ @article{jin2025search,
493
+ title={Search-r1: Training llms to reason and leverage search engines with reinforcement learning},
494
+ author={Jin, Bowen and Zeng, Hansi and Yue, Zhenrui and Yoon, Jinsung and Arik, Sercan and Wang, Dong and Zamani, Hamed and Han, Jiawei},
495
+ journal={arXiv preprint arXiv:2503.09516},
496
+ year={2025}
497
+ }
498
+ ```
499
+
500
+ ```bibtex
501
+ @article{jin2025empirical,
502
+ title={An Empirical Study on Reinforcement Learning for Reasoning-Search Interleaved LLM Agents},
503
+ author={Jin, Bowen and Yoon, Jinsung and Kargupta, Priyanka and Arik, Sercan O and Han, Jiawei},
504
+ journal={arXiv preprint arXiv:2505.15117},
505
+ year={2025}
506
+ }
507
+ ```
code/RL_model/verl/Search-R1/verl.egg-info/dependency_links.txt ADDED
@@ -0,0 +1 @@
 
 
1
+
code/RL_model/verl/Search-R1/verl.egg-info/requires.txt ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ accelerate
2
+ codetiming
3
+ datasets
4
+ dill
5
+ hydra-core
6
+ numpy
7
+ pybind11
8
+ ray
9
+ tensordict
10
+ transformers<4.48
11
+ vllm<=0.6.3
12
+
13
+ [test]
14
+ pytest
15
+ yapf
code/RL_model/verl/Search-R1/verl.egg-info/top_level.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ search_r1
2
+ verl
code/RL_model/verl/Search-R1/verl/__init__.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Package init: read the package version, expose DataProto, configure logging."""

import os

# Directory containing this file; the version string lives at version/version.
# (The original wrapped abspath in a redundant single-argument os.path.join,
# which is a no-op.)
version_folder = os.path.dirname(os.path.abspath(__file__))

with open(os.path.join(version_folder, 'version/version')) as f:
    __version__ = f.read().strip()

from .protocol import DataProto

from .utils.logging_utils import set_basic_config
import logging

# Default the package logger to WARNING; downstream entry points may override.
set_basic_config(level=logging.WARNING)
code/RL_model/verl/Search-R1/verl/protocol.py ADDED
@@ -0,0 +1,639 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """
15
+ Implement base data transfer protocol between any two functions, modules.
16
+ We can subclass Protocol to define more detailed batch info with specific keys
17
+ """
18
+
19
+ import pickle
20
+ import numpy as np
21
+ import copy
22
+ from dataclasses import dataclass, field
23
+ from typing import Callable, Dict, List, Union
24
+
25
+ import torch
26
+ import tensordict
27
+ from tensordict import TensorDict
28
+ from torch.utils.data import DataLoader, Dataset
29
+
30
+ from verl.utils.py_functional import union_two_dict
31
+
32
+ __all__ = ['DataProto', 'union_tensor_dict']
33
+
34
# Best-effort: disable tensordict's lazy-legacy behavior on versions that
# expose the toggle; older versions lack `set_lazy_legacy`, hence the guard.
# Catch Exception only -- a bare `except:` would also swallow
# KeyboardInterrupt and SystemExit.
try:
    tensordict.set_lazy_legacy(False).set()
except Exception:
    pass
38
+
39
+
40
def pad_dataproto_to_divisor(data: 'DataProto', size_divisor: int):
    """Pad ``data`` by repeating its leading rows until its length is a
    multiple of ``size_divisor``.

    Args:
        data (DataProto): the batch to pad.
        size_divisor (int): divisor the padded length must satisfy.

    Returns:
        tuple: ``(padded_data, pad_size)`` where ``pad_size`` rows were
        appended (0 when the length already divides evenly).
    """
    assert isinstance(data, DataProto), 'data must be a DataProto'
    remainder = len(data) % size_divisor
    if remainder == 0:
        return data, 0
    pad_size = size_divisor - remainder
    padded = DataProto.concat([data, data[:pad_size]])
    return padded, pad_size
58
+
59
+
60
def unpad_dataproto(data: 'DataProto', pad_size):
    """Drop the trailing ``pad_size`` rows added by ``pad_dataproto_to_divisor``.

    A ``pad_size`` of 0 returns ``data`` unchanged.
    """
    return data[:-pad_size] if pad_size != 0 else data
64
+
65
+
66
def union_tensor_dict(tensor_dict1: TensorDict, tensor_dict2: TensorDict) -> TensorDict:
    """Merge ``tensor_dict2`` into ``tensor_dict1`` in place and return it.

    Keys unique to ``tensor_dict2`` are copied over; keys present in both
    sides must hold equal tensors, otherwise an AssertionError is raised.
    """
    assert tensor_dict1.batch_size == tensor_dict2.batch_size, \
        f'Two tensor dict must have identical batch size. Got {tensor_dict1.batch_size} and {tensor_dict2.batch_size}'
    for key in tensor_dict2.keys():
        if key in tensor_dict1.keys():
            # Conflicting keys are only tolerated when the payloads match.
            assert tensor_dict1[key].equal(tensor_dict2[key]), \
                f'{key} in tensor_dict1 and tensor_dict2 are not the same object'
        else:
            tensor_dict1[key] = tensor_dict2[key]

    return tensor_dict1
78
+
79
+
80
def union_numpy_dict(tensor_dict1: Dict[str, np.ndarray], tensor_dict2: Dict[str, np.ndarray]) -> Dict[str, np.ndarray]:
    """Merge ``tensor_dict2`` into ``tensor_dict1`` in place and return it.

    Overlapping keys must hold element-wise-equal numpy arrays, otherwise an
    AssertionError is raised. (The original annotated the parameters as
    ``dict[np.ndarray]``, a malformed generic -- a dict takes key and value
    types; fixed to ``Dict[str, np.ndarray]``.)

    Args:
        tensor_dict1: destination mapping (mutated in place).
        tensor_dict2: source mapping of numpy arrays.

    Returns:
        ``tensor_dict1`` with all entries of ``tensor_dict2`` merged in.
    """
    for key, val in tensor_dict2.items():
        if key in tensor_dict1:
            assert isinstance(tensor_dict2[key], np.ndarray)
            assert isinstance(tensor_dict1[key], np.ndarray)
            # Conflicting keys must agree element-wise before being overwritten.
            assert np.all(tensor_dict2[key] == tensor_dict1[key]), \
                f'{key} in tensor_dict1 and tensor_dict2 are not the same object'
        tensor_dict1[key] = val

    return tensor_dict1
90
+
91
+
92
def list_of_dict_to_dict_of_list(list_of_dict: list[dict]):
    """Transpose a list of dicts into a dict of lists, preserving order.

    All entries are expected to share the keys of the first dict; an empty
    input yields ``{}``.
    """
    if not list_of_dict:
        return {}
    transposed = {key: [] for key in list_of_dict[0]}
    for entry in list_of_dict:
        for key, item in entry.items():
            assert key in transposed
            transposed[key].append(item)
    return transposed
102
+
103
+
104
def fold_batch_dim(data: 'DataProto', new_batch_size):
    """
    Fold a batch dim from [bsz, xxx] into [new_bsz, bsz // new_bsz, xxx].

    Args:
        data (DataProto): the batch whose leading dim is folded.
        new_batch_size (int): the new leading dim; must divide the current one.

    Returns:
        DataProto: a new DataProto viewing the same storage with the folded shape.
    """
    batch_size = data.batch.batch_size[0]

    assert batch_size % new_batch_size == 0

    tensor: TensorDict = data.batch
    non_tensor = data.non_tensor_batch

    tensor = tensor.view(new_batch_size, -1)
    tensor.auto_batch_size_(batch_dims=1)

    for key, val in non_tensor.items():
        # Pass the shape positionally: the `newshape=` keyword was removed
        # from np.reshape in NumPy 2.1 (renamed to `shape`).
        non_tensor[key] = np.reshape(val, (new_batch_size, -1, *val.shape[1:]))

    return DataProto(batch=tensor, non_tensor_batch=non_tensor, meta_info=data.meta_info)
122
+
123
+
124
def unfold_batch_dim(data: 'DataProto', batch_dims=2):
    """
    Unfold the first ``batch_dims`` dims into a single new batch dim.

    Args:
        data (DataProto): the batch whose leading dims are flattened.
        batch_dims (int): how many leading dims to merge (default 2).

    Returns:
        DataProto: a new DataProto with a single flattened batch dim.
    """
    tensor: TensorDict = data.batch
    non_tensor = data.non_tensor_batch
    tensor.auto_batch_size_(batch_dims=batch_dims)
    tensor = tensor.view(-1)

    batch_size = tensor.batch_size[0]

    non_tensor_new = {}

    for key, val in non_tensor.items():
        # Positional shape argument: `newshape=` was removed from np.reshape
        # in NumPy 2.1 (renamed to `shape`).
        non_tensor_new[key] = np.reshape(val, (batch_size, *val.shape[batch_dims:]))

    return DataProto(batch=tensor, non_tensor_batch=non_tensor_new, meta_info=data.meta_info)
141
+
142
+
143
def collate_fn(x: list['DataProtoItem']):
    """Collate a list of DataProtoItem into one DataProto: tensor parts are
    stacked along a new dim 0, non-tensor parts become object ndarrays."""
    tensor_parts = [item.batch for item in x]
    non_tensor_parts = [item.non_tensor_batch for item in x]
    stacked = torch.stack(tensor_parts).contiguous()
    merged = list_of_dict_to_dict_of_list(non_tensor_parts)
    merged = {key: np.array(val, dtype=object) for key, val in merged.items()}
    return DataProto(batch=stacked, non_tensor_batch=merged)
154
+
155
+
156
@dataclass
class DataProtoItem:
    """A single row sliced out of a DataProto (see DataProto.__getitem__)."""
    # TODO(zhangchi.usc1992) add consistency check
    # batch: per-item tensor data; non_tensor_batch: per-item non-tensor data;
    # meta_info: metadata shared by reference with the parent DataProto.
    batch: TensorDict = None
    non_tensor_batch: Dict = field(default_factory=dict)
    meta_info: Dict = field(default_factory=dict)
162
+
163
+
164
@dataclass
class DataProto:
    """
    A DataProto is a data structure that aims to provide a standard protocol for data exchange between functions.
    It contains a batch (TensorDict) and a meta_info (Dict). The batch is a TensorDict https://pytorch.org/tensordict/.
    TensorDict allows you to manipulate a dictionary of Tensors like a single Tensor. Ideally, the tensors with the
    same batch size should be put inside batch.
    """
    # batch: tensor data sharing one leading batch dim; non_tensor_batch:
    # numpy object arrays aligned with that dim; meta_info: free-form metadata.
    batch: TensorDict = None
    non_tensor_batch: Dict = field(default_factory=dict)
    meta_info: Dict = field(default_factory=dict)
175
+
176
    def __post_init__(self):
        """Run the consistency check right after dataclass construction."""
        # perform necessary checking
        self.check_consistency()
179
+
180
+ def __len__(self):
181
+ if self.batch is not None:
182
+ return self.batch.batch_size[0]
183
+ elif self.non_tensor_batch is not None and len(self.non_tensor_batch) > 0:
184
+ random_key = list(self.non_tensor_batch.keys())[0]
185
+ return self.non_tensor_batch[random_key].shape[0]
186
+ else:
187
+ return 0
188
+
189
+ def __getitem__(self, item):
190
+ tensor_data = self.batch[item]
191
+ non_tensor_data = {key: val[item] for key, val in self.non_tensor_batch.items()}
192
+ return DataProtoItem(batch=tensor_data, non_tensor_batch=non_tensor_data, meta_info=self.meta_info)
193
+
194
+ def __getstate__(self):
195
+ import io
196
+ buffer = io.BytesIO()
197
+ if tensordict.__version__ >= '0.5.0' and self.batch is not None:
198
+ self.batch = self.batch.contiguous()
199
+ self.batch = self.batch.consolidate()
200
+ torch.save(self.batch, buffer)
201
+ buffer_bytes = buffer.getvalue()
202
+ return buffer_bytes, self.non_tensor_batch, self.meta_info
203
+
204
    def __setstate__(self, data):
        """Restore from the 3-tuple produced by ``__getstate__``.

        The TensorDict bytes are re-loaded via torch.load; tensors are forced
        to CPU when CUDA is unavailable (map_location=None keeps the saved
        device placement otherwise).
        """
        import io
        batch_deserialized_bytes, non_tensor_batch, meta_info = data
        batch_deserialized = io.BytesIO(initial_bytes=batch_deserialized_bytes)
        # weights_only=False: a full TensorDict object (not just raw tensors)
        # is being unpickled here.
        batch = torch.load(batch_deserialized,
                           weights_only=False,
                           map_location='cpu' if not torch.cuda.is_available() else None)
        self.batch = batch
        self.non_tensor_batch = non_tensor_batch
        self.meta_info = meta_info
214
+
215
+ def save_to_disk(self, filepath):
216
+ with open(filepath, 'wb') as f:
217
+ pickle.dump(self, f)
218
+
219
+ @staticmethod
220
+ def load_from_disk(filepath) -> 'DataProto':
221
+ with open(filepath, 'rb') as f:
222
+ data = pickle.load(f)
223
+ return data
224
+
225
+ def print_size(self, prefix=""):
226
+ size_of_tensordict = 0
227
+ for key, tensor in self.batch.items():
228
+ size_of_tensordict += tensor.element_size() * tensor.numel()
229
+ size_of_numpy_array = 0
230
+ for key, numpy_array in self.non_tensor_batch.items():
231
+ size_of_numpy_array += numpy_array.nbytes
232
+
233
+ size_of_numpy_array /= 1024**3
234
+ size_of_tensordict /= 1024**3
235
+
236
+ message = f'Size of tensordict: {size_of_tensordict} GB, size of non_tensor_batch: {size_of_numpy_array} GB'
237
+
238
+ if prefix:
239
+ message = f'{prefix}, ' + message
240
+ print(message)
241
+
242
    def check_consistency(self):
        """Check the consistency of the DataProto. Mainly for batch and non_tensor_batch
        We expose this function as a public one so that user can call themselves directly
        """
        if self.batch is not None:
            assert len(self.batch.batch_size) == 1, 'only support num_batch_dims=1'

        if self.non_tensor_batch is not None:
            # Every non-tensor entry must be a numpy array.
            for key, val in self.non_tensor_batch.items():
                assert isinstance(val, np.ndarray)

        if self.batch is not None and len(self.non_tensor_batch) != 0:
            # TODO: we can actually lift this restriction if needed
            assert len(self.batch.batch_size) == 1, 'only support num_batch_dims=1 when non_tensor_batch is not empty.'

            batch_size = self.batch.batch_size[0]
            # Non-tensor arrays must be dtype=object and row-aligned with the
            # tensor batch size.
            for key, val in self.non_tensor_batch.items():
                assert isinstance(
                    val, np.ndarray
                ) and val.dtype == object, 'data in the non_tensor_batch must be a numpy.array with dtype=object'
                assert val.shape[
                    0] == batch_size, f'key {key} length {len(val)} is not equal to batch size {batch_size}'
264
+
265
+ @classmethod
266
+ def from_single_dict(cls, data: Dict[str, Union[torch.Tensor, np.ndarray]], meta_info=None):
267
+ tensors = {}
268
+ non_tensors = {}
269
+
270
+ for key, val in data.items():
271
+ if isinstance(val, torch.Tensor):
272
+ tensors[key] = val
273
+ elif isinstance(val, np.ndarray):
274
+ non_tensors[key] = val
275
+ else:
276
+ raise ValueError(f'Unsupported type in data {type(val)}')
277
+
278
+ return DataProto.from_dict(tensors=tensors, non_tensors=non_tensors, meta_info=meta_info)
279
+
280
    @classmethod
    def from_dict(cls, tensors: Dict[str, torch.Tensor], non_tensors=None, meta_info=None, num_batch_dims=1):
        """Create a DataProto from a dict of tensors. This assumes that
        1. All the tensor in tensors have the same dim0
        2. Only dim0 is the batch dim

        Args:
            tensors: non-empty mapping of name -> tensor, all sharing the same
                leading ``num_batch_dims`` shape.
            non_tensors (dict, optional): values are coerced to object ndarrays.
            meta_info (dict, optional): free-form metadata.
            num_batch_dims (int): number of leading batch dims (must be 1 when
                non_tensors is given).
        """
        assert len(tensors) > 0, 'tensors must not be empty'
        assert num_batch_dims > 0, 'num_batch_dims must be greater than zero'
        if non_tensors is not None:
            assert num_batch_dims == 1, 'only support num_batch_dims=1 when non_tensors is not None.'

        if meta_info is None:
            meta_info = {}
        if non_tensors is None:
            non_tensors = {}

        assert isinstance(non_tensors, dict)

        # get and check batch size
        # The first tensor seen ("pivot") defines the expected batch shape;
        # every other tensor must match it.
        batch_size = None
        pivot_key = None
        for key, tensor in tensors.items():
            if batch_size is None:
                batch_size = tensor.shape[:num_batch_dims]
                pivot_key = key
            else:
                current_batch = tensor.shape[:num_batch_dims]
                assert batch_size == current_batch, \
                    f'Not all the tensor in tensors have the same batch size with batch_dims={num_batch_dims}. Got {pivot_key} has {batch_size}, {key} has {current_batch}'

        # Coerce every non-tensor value to an object ndarray. Reassigning
        # existing keys while iterating is safe: no keys are added or removed.
        for key, val in non_tensors.items():
            non_tensors[key] = np.array(val, dtype=object)

        tensor_dict = TensorDict(source=tensors, batch_size=batch_size)
        return cls(batch=tensor_dict, non_tensor_batch=non_tensors, meta_info=meta_info)
315
+
316
+ def to(self, device) -> 'DataProto':
317
+ """move the batch to device
318
+
319
+ Args:
320
+ device (torch.device, str): torch device
321
+
322
+ Returns:
323
+ DataProto: the current DataProto
324
+
325
+ """
326
+ if self.batch is not None:
327
+ self.batch = self.batch.to(device)
328
+ return self
329
+
330
+ def select(self, batch_keys=None, non_tensor_batch_keys=None, meta_info_keys=None, deepcopy=False) -> 'DataProto':
331
+ """Select a subset of the DataProto via batch_keys and meta_info_keys
332
+
333
+ Args:
334
+ batch_keys (list, optional): a list of strings indicating the keys in batch to select
335
+ meta_info_keys (list, optional): a list of keys indicating the meta info to select
336
+
337
+ Returns:
338
+ DataProto: the DataProto with the selected batch_keys and meta_info_keys
339
+ """
340
+ # TODO (zhangchi.usc1992) whether to copy
341
+ if batch_keys is not None:
342
+ batch_keys = tuple(batch_keys)
343
+ sub_batch = self.batch.select(*batch_keys)
344
+ else:
345
+ sub_batch = self.batch
346
+
347
+ if non_tensor_batch_keys is not None:
348
+ non_tensor_batch = {key: val for key, val in self.non_tensor_batch.items() if key in non_tensor_batch_keys}
349
+ else:
350
+ non_tensor_batch = self.non_tensor_batch
351
+
352
+ if deepcopy:
353
+ non_tensor_batch = copy.deepcopy(non_tensor_batch)
354
+
355
+ if meta_info_keys is not None:
356
+ sub_meta_info = {key: val for key, val in self.meta_info.items() if key in meta_info_keys}
357
+ else:
358
+ sub_meta_info = self.meta_info
359
+
360
+ if deepcopy:
361
+ sub_meta_info = copy.deepcopy(sub_meta_info)
362
+
363
+ return DataProto(batch=sub_batch, non_tensor_batch=non_tensor_batch, meta_info=sub_meta_info)
364
+
365
+ def pop(self, batch_keys=None, non_tensor_batch_keys=None, meta_info_keys=None) -> 'DataProto':
366
+ """Pop a subset of the DataProto via `batch_keys` and `meta_info_keys`
367
+
368
+ Args:
369
+ batch_keys (list, optional): a list of strings indicating the keys in batch to pop
370
+ meta_info_keys (list, optional): a list of keys indicating the meta info to pop
371
+
372
+ Returns:
373
+ DataProto: the DataProto with the poped batch_keys and meta_info_keys
374
+ """
375
+ assert batch_keys is not None
376
+ if meta_info_keys is None:
377
+ meta_info_keys = []
378
+ if non_tensor_batch_keys is None:
379
+ non_tensor_batch_keys = []
380
+
381
+ tensors = {}
382
+ # tensor batch
383
+ for key in batch_keys:
384
+ assert key in self.batch.keys()
385
+ tensors[key] = self.batch.pop(key)
386
+ non_tensors = {}
387
+ # non tensor batch
388
+ for key in non_tensor_batch_keys:
389
+ assert key in self.non_tensor_batch.keys()
390
+ non_tensors[key] = self.non_tensor_batch.pop(key)
391
+ meta_info = {}
392
+ for key in meta_info_keys:
393
+ assert key in self.meta_info.keys()
394
+ meta_info[key] = self.meta_info.pop(key)
395
+ return DataProto.from_dict(tensors=tensors, non_tensors=non_tensors, meta_info=meta_info)
396
+
397
    def rename(self, old_keys=None, new_keys=None) -> 'DataProto':
        """
        Note that this function only rename the key in the batch

        Args:
            old_keys (str or list): existing key name(s) in the batch.
            new_keys (str or list): replacement name(s), same length as old_keys.

        Returns:
            DataProto: self, renamed in place.

        NOTE(review): calling with old_keys=None or new_keys=None reaches
        ``len(None)`` below and raises TypeError -- confirm whether that is
        the intended failure mode.
        """

        def validate_input(keys):
            # Normalize a single string to a one-element list; reject other types.
            if keys is not None:
                if isinstance(keys, str):
                    keys = [keys]
                elif isinstance(keys, list):
                    pass
                else:
                    raise TypeError(f'keys must be a list or a string, but got {type(keys)}')
            return keys

        old_keys = validate_input(old_keys)
        new_keys = validate_input(new_keys)

        if len(new_keys) != len(old_keys):
            raise ValueError(
                f'new_keys and old_keys must have the same length, but got {len(new_keys)} and {len(old_keys)}')

        self.batch.rename_key_(tuple(old_keys), tuple(new_keys))

        return self
422
+
423
    def union(self, other: 'DataProto') -> 'DataProto':
        """Union with another DataProto. Union batch and meta_info separately.
        Throw an error if
        - there are conflict keys in batch and they are not equal
        - the batch size of two data batch is not the same
        - there are conflict keys in meta_info and they are not the same.

        Args:
            other (DataProto): another DataProto to union

        Returns:
            DataProto: the DataProto after union
        """
        # Each component has its own union helper; all three mutate self in place.
        self.batch = union_tensor_dict(self.batch, other.batch)
        self.non_tensor_batch = union_numpy_dict(self.non_tensor_batch, other.non_tensor_batch)
        self.meta_info = union_two_dict(self.meta_info, other.meta_info)
        return self
440
+
441
    def make_iterator(self, mini_batch_size, epochs, seed=None, dataloader_kwargs=None):
        """Make an iterator from the DataProto. This is built upon that TensorDict can be used as a normal Pytorch
        dataset. See https://pytorch.org/tensordict/tutorials/data_fashion for more details.

        Args:
            mini_batch_size (int): mini-batch size when iterating the dataset. We require that
                ``batch.batch_size[0] % mini_batch_size == 0``
            epochs (int): number of epochs when iterating the dataset.
            seed (int, optional): seed for the shuffle generator, for a reproducible iteration order.
            dataloader_kwargs: internally, it returns a DataLoader over the batch.
                The dataloader_kwargs is the kwargs passed to the DataLoader

        Returns:
            Iterator: an iterator that yields a mini-batch data at a time. The total number of iteration steps is
            ``self.batch.batch_size * epochs // mini_batch_size``
        """
        assert self.batch.batch_size[0] % mini_batch_size == 0, f"{self.batch.batch_size[0]} % {mini_batch_size} != 0"
        # we can directly create a dataloader from TensorDict
        if dataloader_kwargs is None:
            dataloader_kwargs = {}

        # A seeded generator makes any shuffling in the DataLoader deterministic.
        if seed is not None:
            generator = torch.Generator()
            generator.manual_seed(seed)
        else:
            generator = None

        assert isinstance(dataloader_kwargs, Dict)
        train_dataloader = DataLoader(dataset=self,
                                      batch_size=mini_batch_size,
                                      collate_fn=collate_fn,
                                      generator=generator,
                                      **dataloader_kwargs)

        def get_data():
            # NOTE: meta_info is attached by reference to every yielded
            # mini-batch, so mutating it on one batch affects all of them.
            for _ in range(epochs):
                for d in train_dataloader:
                    d.meta_info = self.meta_info
                    yield d

        return iter(get_data())
481
+
482
def chunk(self, chunks: int) -> List['DataProto']:
    """Split the batch along dim=0 into ``chunks`` equally sized pieces.

    The ``meta_info`` dict is shared (not copied) by every resulting DataProto.

    Args:
        chunks (int): the number of chunks to split on dim=0

    Returns:
        List[DataProto]: a list of DataProto after splitting
    """
    assert len(
        self) % chunks == 0, f'only support equal chunk. Got size of DataProto {len(self)} and chunk {chunks}.'

    # Tensor part: TensorDict supports chunking along dim=0 natively.
    if self.batch is not None:
        tensor_chunks = self.batch.chunk(chunks=chunks, dim=0)
    else:
        tensor_chunks = [None] * chunks

    # Non-tensor part: split every numpy array, then regroup the pieces per chunk.
    non_tensor_chunks = [{} for _ in range(chunks)]
    for name, array in self.non_tensor_batch.items():
        assert isinstance(array, np.ndarray)
        pieces = np.array_split(array, chunks)
        assert len(pieces) == chunks
        for idx, piece in enumerate(pieces):
            non_tensor_chunks[idx][name] = piece

    return [
        DataProto(batch=tensor_chunks[i], non_tensor_batch=non_tensor_chunks[i], meta_info=self.meta_info)
        for i in range(chunks)
    ]
513
+
514
@staticmethod
def concat(data: List['DataProto']) -> 'DataProto':
    """Concat a list of DataProto. The batch is concatenated among dim=0.
    The meta_info is assumed to be identical and will use the first one.

    Args:
        data (List[DataProto]): list of DataProto

    Returns:
        DataProto: concatenated DataProto
    """
    # Tensor part: concatenate every TensorDict along the batch dimension.
    tensor_parts = [item.batch for item in data]
    new_batch = torch.cat(tensor_parts, dim=0) if tensor_parts[0] is not None else None

    # Non-tensor part: gather per-key lists across items, then concatenate each.
    merged = list_of_dict_to_dict_of_list(list_of_dict=[d.non_tensor_batch for d in data])
    non_tensor_batch = {key: np.concatenate(arrays, axis=0) for key, arrays in merged.items()}

    return DataProto(batch=new_batch, non_tensor_batch=non_tensor_batch, meta_info=data[0].meta_info)
538
+
539
def reorder(self, indices):
    """Reorder rows along dim=0 according to ``indices``.

    Note that this operation is in-place: both ``self.batch`` and
    ``self.non_tensor_batch`` are re-indexed.

    Args:
        indices (torch.Tensor): 1-D integer index tensor; may live on any device.
    """
    # Move to CPU before converting: Tensor.numpy() raises on CUDA tensors.
    indices_np = indices.detach().cpu().numpy()
    self.batch = self.batch[indices]
    self.non_tensor_batch = {key: val[indices_np] for key, val in self.non_tensor_batch.items()}
546
+
547
def repeat(self, repeat_times=2, interleave=True):
    """
    Repeat the batch data a specified number of times.

    Args:
        repeat_times (int): Number of times to repeat the data.
        interleave (bool): If True, each row is repeated back-to-back
            (a, a, b, b); if False, the whole batch is tiled as a unit
            (a, b, a, b).

    Returns:
        DataProto: A new DataProto with repeated data. ``meta_info`` is
        shared (not copied) with the original.
    """
    if self.batch is not None:
        if interleave:
            # Interleave the data: row i appears repeat_times times consecutively.
            repeated_tensors = {
                key: tensor.repeat_interleave(repeat_times, dim=0) for key, tensor in self.batch.items()
            }
        else:
            # Stack the data: tile the full batch repeat_times times.
            # expand() creates a broadcast view; reshape() then materializes the copy.
            repeated_tensors = {
                key: tensor.unsqueeze(0).expand(repeat_times, *tensor.shape).reshape(-1, *tensor.shape[1:])
                for key, tensor in self.batch.items()
            }

        repeated_batch = TensorDict(
            source=repeated_tensors,
            batch_size=(self.batch.batch_size[0] * repeat_times,),
        )
    else:
        repeated_batch = None

    # Mirror the same repetition on the numpy (non-tensor) arrays.
    repeated_non_tensor_batch = {}
    for key, val in self.non_tensor_batch.items():
        if interleave:
            repeated_non_tensor_batch[key] = np.repeat(val, repeat_times, axis=0)
        else:
            # Tile only along axis 0; trailing dims are repeated once each.
            repeated_non_tensor_batch[key] = np.tile(val, (repeat_times,) + (1,) * (val.ndim - 1))

    return DataProto(
        batch=repeated_batch,
        non_tensor_batch=repeated_non_tensor_batch,
        meta_info=self.meta_info,
    )
590
+
591
+
592
+ import ray
593
+
594
+
595
@dataclass
class DataProtoFuture:
    """
    DataProtoFuture aims to eliminate actual data fetching on driver. By doing so, the driver doesn't have to wait
    for data so that asynchronous execution becomes possible.
    DataProtoFuture contains a list of futures from another WorkerGroup of size world_size.
    - collect_fn is a Callable that reduces the list of futures to a DataProto
    - dispatch_fn is a Callable that partitions the DataProto into a list of DataProto of size world_size and then select

    Potential issue: we can optimize dispatch_fn(collect_fn) such that only needed data is fetched on destination
    - DataProtoFuture only supports directly passing from the output of a method to another input. You can't perform any
    operation on the DataProtoFuture in driver.
    """
    # Reduces the list of resolved futures into a single DataProto (e.g. DataProto.concat).
    collect_fn: Callable
    # One Ray object ref per worker of the producing WorkerGroup.
    futures: List[ray.ObjectRef]
    # Optional per-destination selection applied after collect_fn; None means "take everything".
    dispatch_fn: Callable = None

    @staticmethod
    def concat(data: List[ray.ObjectRef]) -> 'DataProtoFuture':
        """Wrap a list of object refs into a future whose get() concatenates them along dim=0."""
        output = DataProtoFuture(collect_fn=DataProto.concat, futures=data)
        return output

    def chunk(self, chunks: int) -> List['DataProtoFuture']:
        """Lazily split this future into ``chunks`` futures; no data is fetched here."""
        from functools import partial

        arg_future_lst = []
        for i in range(chunks):
            # note that we can't directly pass i and chunks:
            # a plain closure would late-bind the loop variable, so every future
            # would see the final value of i. partial() freezes the current values.
            def dispatch_fn(x, i, chunks):
                return x.chunk(chunks=chunks)[i]

            arg_future = DataProtoFuture(collect_fn=self.collect_fn,
                                         dispatch_fn=partial(dispatch_fn, i=i, chunks=chunks),
                                         futures=self.futures)
            arg_future_lst.append(arg_future)
        return arg_future_lst

    def get(self):
        """Block until all futures resolve, then collect and (optionally) dispatch.

        Returns:
            DataProto: the collected (and possibly chunk-selected) data.
        """
        output = ray.get(self.futures)  # dp_size.
        for o in output:
            assert isinstance(o, DataProto)
        output = self.collect_fn(output)  # select dp, concat
        if self.dispatch_fn is not None:
            output = self.dispatch_fn(output)  # split in batch dim, select using dp
        return output
code/RL_model/verl/Search-R1/wandb/debug-internal.log ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {"time":"2026-02-01T20:27:26.269116545-05:00","level":"INFO","msg":"stream: starting","core version":"0.23.1"}
2
+ {"time":"2026-02-01T20:27:27.692526697-05:00","level":"INFO","msg":"stream: created new stream","id":"lly0j9zs"}
3
+ {"time":"2026-02-01T20:27:27.692680073-05:00","level":"INFO","msg":"handler: started","stream_id":"lly0j9zs"}
4
+ {"time":"2026-02-01T20:27:27.695494454-05:00","level":"INFO","msg":"stream: started","id":"lly0j9zs"}
5
+ {"time":"2026-02-01T20:27:27.69557747-05:00","level":"INFO","msg":"writer: started","stream_id":"lly0j9zs"}
6
+ {"time":"2026-02-01T20:27:27.695701035-05:00","level":"INFO","msg":"sender: started","stream_id":"lly0j9zs"}
code/RL_model/verl/Search-R1/wandb/debug.log ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 2026-02-01 20:27:25,874 INFO MainThread:1578907 [wandb_setup.py:_flush():80] Current SDK version is 0.23.1
2
+ 2026-02-01 20:27:25,874 INFO MainThread:1578907 [wandb_setup.py:_flush():80] Configure stats pid to 1578907
3
+ 2026-02-01 20:27:25,875 INFO MainThread:1578907 [wandb_setup.py:_flush():80] Loading settings from /home/mshahidul/.config/wandb/settings
4
+ 2026-02-01 20:27:25,875 INFO MainThread:1578907 [wandb_setup.py:_flush():80] Loading settings from /data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/wandb/settings
5
+ 2026-02-01 20:27:25,875 INFO MainThread:1578907 [wandb_setup.py:_flush():80] Loading settings from environment variables
6
+ 2026-02-01 20:27:25,875 INFO MainThread:1578907 [wandb_init.py:setup_run_log_directory():714] Logging user logs to /data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/wandb/run-20260201_202725-lly0j9zs/logs/debug.log
7
+ 2026-02-01 20:27:25,875 INFO MainThread:1578907 [wandb_init.py:setup_run_log_directory():715] Logging internal logs to /data/home_beta/mshahidul/readctrl/code/RL_model/verl/Search-R1/wandb/run-20260201_202725-lly0j9zs/logs/debug-internal.log
8
+ 2026-02-01 20:27:25,876 INFO MainThread:1578907 [wandb_init.py:init():841] calling init triggers
9
+ 2026-02-01 20:27:25,876 INFO MainThread:1578907 [wandb_init.py:init():846] wandb.init called with sweep_config: {}
10
+ config: {'data': {'tokenizer': None, 'train_files': '/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/train.parquet', 'val_files': '/home/mshahidul/readctrl/code/RL_model/verl/Search-R1/dataset/test.parquet', 'train_data_num': None, 'val_data_num': None, 'prompt_key': 'prompt', 'max_prompt_length': 4096, 'max_response_length': 1024, 'max_start_length': 256, 'max_obs_length': 512, 'train_batch_size': 128, 'val_batch_size': 64, 'return_raw_input_ids': False, 'return_raw_chat': False, 'shuffle_train_dataloader': True}, 'actor_rollout_ref': {'hybrid_engine': True, 'model': {'path': 'Qwen/Qwen3-4B-Instruct-2507', 'external_lib': None, 'override_config': {}, 'enable_gradient_checkpointing': True, 'use_remove_padding': False}, 'actor': {'strategy': 'fsdp', 'ppo_mini_batch_size': 64, 'ppo_micro_batch_size': 64, 'use_dynamic_bsz': False, 'ppo_max_token_len_per_gpu': 16384, 'grad_clip': 1.0, 'state_masking': False, 'clip_ratio': 0.2, 'entropy_coeff': 0.001, 'use_kl_loss': False, 'kl_loss_coef': 0.001, 'kl_loss_type': 'low_var_kl', 'ppo_epochs': 1, 'shuffle': False, 'ulysses_sequence_parallel_size': 1, 'optim': {'lr': 1e-06, 'lr_warmup_steps_ratio': 0.0, 'min_lr_ratio': None, 'warmup_style': 'constant', 'total_training_steps': 1005}, 'fsdp_config': {'wrap_policy': {'min_num_params': 0}, 'param_offload': True, 'grad_offload': False, 'optimizer_offload': True, 'fsdp_size': -1}, 'ppo_micro_batch_size_per_gpu': 16}, 'ref': {'fsdp_config': {'param_offload': True, 'wrap_policy': {'min_num_params': 0}, 'fsdp_size': -1}, 'log_prob_micro_batch_size': 64, 'log_prob_use_dynamic_bsz': False, 'log_prob_max_token_len_per_gpu': 16384, 'ulysses_sequence_parallel_size': 1}, 'rollout': {'name': 'vllm', 'temperature': 1.0, 'top_k': -1, 'top_p': 0.95, 'prompt_length': 4096, 'response_length': 1024, 'dtype': 'bfloat16', 'gpu_memory_utilization': 0.4, 'ignore_eos': False, 'enforce_eager': True, 'free_cache_engine': True, 'load_format': 'dummy_dtensor', 'tensor_model_parallel_size': 
1, 'max_num_batched_tokens': 8192, 'max_num_seqs': 1024, 'log_prob_micro_batch_size': 64, 'log_prob_use_dynamic_bsz': False, 'log_prob_max_token_len_per_gpu': 16384, 'do_sample': True, 'n': 1, 'n_agent': 1}}, 'critic': {'strategy': 'fsdp', 'optim': {'lr': 1e-05, 'lr_warmup_steps_ratio': 0.0, 'min_lr_ratio': None, 'warmup_style': 'constant', 'total_training_steps': 1005}, 'model': {'path': '~/models/deepseek-llm-7b-chat', 'tokenizer_path': 'Qwen/Qwen3-4B-Instruct-2507', 'override_config': {}, 'external_lib': None, 'enable_gradient_checkpointing': False, 'use_remove_padding': False, 'fsdp_config': {'param_offload': False, 'grad_offload': False, 'optimizer_offload': False, 'wrap_policy': {'min_num_params': 0}, 'fsdp_size': -1}}, 'ppo_mini_batch_size': 64, 'ppo_micro_batch_size': 64, 'forward_micro_batch_size': 64, 'use_dynamic_bsz': False, 'ppo_max_token_len_per_gpu': 32768, 'forward_max_token_len_per_gpu': 32768, 'ulysses_sequence_parallel_size': 1, 'ppo_epochs': 1, 'shuffle': False, 'grad_clip': 1.0, 'cliprange_value': 0.5}, 'reward_model': {'enable': False, 'strategy': 'fsdp', 'model': {'input_tokenizer': 'Qwen/Qwen3-4B-Instruct-2507', 'path': '~/models/FsfairX-LLaMA3-RM-v0.1', 'external_lib': None, 'use_remove_padding': False, 'fsdp_config': {'min_num_params': 0, 'param_offload': False}}, 'micro_batch_size': 64, 'max_length': None, 'ulysses_sequence_parallel_size': 1, 'use_dynamic_bsz': False, 'forward_max_token_len_per_gpu': 32768, 'structure_format_score': 0, 'final_format_score': 0, 'retrieval_score': 0}, 'retriever': {'url': 'http://127.0.0.1:8000/retrieve', 'topk': 3}, 'algorithm': {'gamma': 1.0, 'lam': 1.0, 'adv_estimator': 'grpo', 'no_think_rl': False, 'kl_penalty': 'kl', 'kl_ctrl': {'type': 'fixed', 'kl_coef': 0.001}, 'state_masking': {'start_state_marker': '<information>', 'end_state_marker': '</information>'}}, 'trainer': {'total_epochs': 15, 'total_training_steps': 1005, 'project_name': '', 'experiment_name': 'llm_guard_3B_10k_v2', 'logger': ['wandb'], 
'nnodes': 1, 'n_gpus_per_node': 2, 'save_freq': 100, 'test_freq': 50, 'critic_warmup': 0, 'default_hdfs_dir': '~/experiments/gsm8k/ppo/llm_guard_3B_10k_v2', 'default_local_dir': 'verl_checkpoints/llm_guard_3B_10k_v2'}, 'max_turns': 1, 'do_search': False, '_wandb': {}}
11
+ 2026-02-01 20:27:25,876 INFO MainThread:1578907 [wandb_init.py:init():889] starting backend
12
+ 2026-02-01 20:27:26,251 INFO MainThread:1578907 [wandb_init.py:init():892] sending inform_init request
13
+ 2026-02-01 20:27:26,261 INFO MainThread:1578907 [wandb_init.py:init():900] backend started and connected
14
+ 2026-02-01 20:27:26,270 INFO MainThread:1578907 [wandb_init.py:init():970] updated telemetry
15
+ 2026-02-01 20:27:26,293 INFO MainThread:1578907 [wandb_init.py:init():994] communicating run to backend with 90.0 second timeout
16
+ 2026-02-01 20:27:27,908 INFO MainThread:1578907 [wandb_init.py:init():1041] starting run threads in backend
17
+ 2026-02-01 20:27:28,715 INFO MainThread:1578907 [wandb_run.py:_console_start():2521] atexit reg
18
+ 2026-02-01 20:27:28,716 INFO MainThread:1578907 [wandb_run.py:_redirect():2369] redirect: wrap_raw
19
+ 2026-02-01 20:27:28,716 INFO MainThread:1578907 [wandb_run.py:_redirect():2438] Wrapping output streams.
20
+ 2026-02-01 20:27:28,716 INFO MainThread:1578907 [wandb_run.py:_redirect():2461] Redirects installed.
21
+ 2026-02-01 20:27:28,726 INFO MainThread:1578907 [wandb_init.py:init():1081] run started, returning control to user process
code/RL_model/verl/verl_train/tests/experimental/agent_loop/agent_utils.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import ray
16
+ from omegaconf import DictConfig
17
+
18
+ from verl.experimental.agent_loop import AgentLoopManager
19
+ from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup
20
+ from verl.single_controller.ray.base import create_colocated_worker_cls
21
+ from verl.trainer.ppo.ray_trainer import ResourcePoolManager, Role
22
+ from verl.workers.fsdp_workers import ActorRolloutRefWorker, AsyncActorRolloutRefWorker, RewardModelWorker
23
+
24
+
25
def init_agent_loop_manager(config: DictConfig) -> AgentLoopManager | RayWorkerGroup:
    """Build the colocated worker groups and wrap them in an AgentLoopManager for tests.

    Mirrors the resource-pool wiring done by the PPO trainer: a global pool hosts the
    hybrid actor/rollout workers, and (optionally) a dedicated pool hosts the reward model.

    Args:
        config: full trainer config (hydra DictConfig); ``rollout.mode`` must be ``"async"``.

    Returns:
        AgentLoopManager driving the initialized actor/rollout worker group.

    Raises:
        ValueError: if rollout.mode is "sync", or the dedicated reward pool is misconfigured.
    """
    # =========================== 1. Create hybrid ActorRollout workers ===========================
    actor_rollout_cls = (
        AsyncActorRolloutRefWorker if config.actor_rollout_ref.rollout.mode == "async" else ActorRolloutRefWorker
    )
    role_worker_mapping = {
        Role.ActorRollout: ray.remote(actor_rollout_cls),
    }
    if config.reward_model.enable:
        role_worker_mapping[Role.RewardModel] = ray.remote(RewardModelWorker)

    global_pool_id = "global_pool"
    resource_pool_spec = {
        global_pool_id: [config.trainer.n_gpus_per_node] * config.trainer.nnodes,
    }
    mapping = {
        Role.ActorRollout: global_pool_id,
    }
    # Optionally place the reward model on its own resource pool; validate its sizing first.
    if config.reward_model.enable_resource_pool:
        mapping[Role.RewardModel] = "reward_pool"
        if config.reward_model.n_gpus_per_node <= 0:
            raise ValueError("config.reward_model.n_gpus_per_node must be greater than 0")
        if config.reward_model.nnodes <= 0:
            raise ValueError("config.reward_model.nnodes must be greater than 0")

        reward_pool = [config.reward_model.n_gpus_per_node] * config.reward_model.nnodes
        resource_pool_spec["reward_pool"] = reward_pool
    resource_pool_manager = ResourcePoolManager(resource_pool_spec=resource_pool_spec, mapping=mapping)
    resource_pool_manager.create_resource_pool()
    resource_pool_to_cls = {pool: {} for pool in resource_pool_manager.resource_pool_dict.values()}

    # create actor and rollout
    resource_pool = resource_pool_manager.get_resource_pool(Role.ActorRollout)
    actor_rollout_cls = RayClassWithInitArgs(
        cls=role_worker_mapping[Role.ActorRollout], config=config.actor_rollout_ref, role="actor_rollout"
    )
    resource_pool_to_cls[resource_pool]["actor_rollout"] = actor_rollout_cls

    if config.reward_model.enable:
        # we create a RM here
        resource_pool = resource_pool_manager.get_resource_pool(Role.RewardModel)
        rm_cls = RayClassWithInitArgs(role_worker_mapping[Role.RewardModel], config=config.reward_model)
        resource_pool_to_cls[resource_pool]["rm"] = rm_cls

    # Colocate all roles assigned to the same pool inside one worker process each,
    # then spawn per-role worker-group views from the combined group.
    all_wg = {}
    for resource_pool, class_dict in resource_pool_to_cls.items():
        worker_dict_cls = create_colocated_worker_cls(class_dict=class_dict)
        wg_dict = RayWorkerGroup(resource_pool=resource_pool, ray_cls_with_init=worker_dict_cls)
        spawn_wg = wg_dict.spawn(prefix_set=class_dict.keys())
        all_wg.update(spawn_wg)
    actor_rollout_wg = all_wg["actor_rollout"]
    actor_rollout_wg.init_model()

    if config.actor_rollout_ref.rollout.mode == "sync":
        raise ValueError("Agent loop tests require async rollout mode. Please set rollout.mode=async.")

    # The AgentLoopManager only needs the RM pool when the RM is both enabled and isolated.
    if config.reward_model.enable_resource_pool and config.reward_model.enable:
        rm_resource_pool = resource_pool_manager.get_resource_pool(Role.RewardModel)
    else:
        rm_resource_pool = None
    # =========================== 2. Create AgentLoopManager ===========================
    agent_loop_manager = AgentLoopManager(
        config=config,
        worker_group=actor_rollout_wg,
        rm_resource_pool=rm_resource_pool,
    )

    return agent_loop_manager
code/RL_model/verl/verl_train/tests/experimental/agent_loop/qwen_vl_tool_chat_template.jinja2 ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {% set image_count = namespace(value=0) %}
2
+ {% set video_count = namespace(value=0) %}
3
+ {%- if tools %}
4
+ {{- '<|im_start|>system\n' }}
5
+ {%- if messages[0]['role'] == 'system' %}
6
+ {%- if messages[0]['content'] is string %}
7
+ {{- messages[0]['content'] }}
8
+ {%- else %}
9
+ {{- messages[0]['content'][0]['text'] }}
10
+ {%- endif %}
11
+ {%- else %}
12
+ {{- 'You are a helpful assistant.' }}
13
+ {%- endif %}
14
+ {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
15
+ {%- for tool in tools %}
16
+ {{- "\n" }}
17
+ {{- tool | tojson }}
18
+ {%- endfor %}
19
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
20
+ {% for message in messages %}
21
+ {% if message['role'] != 'system' or loop.first == false %}
22
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
23
+ <|im_start|>{{ message['role'] }}
24
+ {% if message['content'] is string %}
25
+ {{ message['content'] }}<|im_end|>
26
+ {% else %}
27
+ {% for content in message['content'] %}
28
+ {% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}
29
+ {% set image_count.value = image_count.value + 1 %}
30
+ {% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>
31
+ {% elif content['type'] == 'video' or 'video' in content %}
32
+ {% set video_count.value = video_count.value + 1 %}
33
+ {% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>
34
+ {% elif 'text' in content %}
35
+ {{ content['text'] }}
36
+ {% endif %}
37
+ {% endfor %}<|im_end|>
38
+ {% endif %}
39
+ {%- elif message.role == "assistant" %}
40
+ {{- '<|im_start|>' + message.role }}
41
+ {%- if message.content %}
42
+ {{- '\n' + message.content }}
43
+ {%- endif %}
44
+ {%- for tool_call in message.tool_calls %}
45
+ {%- if tool_call.function is defined %}
46
+ {%- set tool_call = tool_call.function %}
47
+ {%- endif %}
48
+ {{- '\n<tool_call>\n{"name": "' }}
49
+ {{- tool_call.name }}
50
+ {{- '", "arguments": ' }}
51
+ {{- tool_call.arguments | tojson }}
52
+ {{- '}\n</tool_call>' }}
53
+ {%- endfor %}
54
+ {{- '<|im_end|>\n' }}
55
+ {%- elif message.role == "tool" %}
56
+ {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
57
+ {{- '<|im_start|>user' }}
58
+ {%- endif %}
59
+ {{- '\n<tool_response>\n' }}
60
+ {% if message['content'] is string %}
61
+ {{ message.content }}
62
+ {% else %}
63
+ {% for content in message['content'] %}
64
+ {% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}
65
+ {% set image_count.value = image_count.value + 1 %}
66
+ {% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>
67
+ {% elif content['type'] == 'video' or 'video' in content %}
68
+ {% set video_count.value = video_count.value + 1 %}
69
+ {% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>
70
+ {% elif content['type'] == 'text' or 'text' in content %}
71
+ {{ content['text'] }}
72
+ {% endif %}
73
+ {% endfor %}
74
+ {% endif %}
75
+ {{- '\n</tool_response>' }}
76
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
77
+ {{- '<|im_end|>\n' }}
78
+ {%- endif %}
79
+ {%- endif %}
80
+ {% endif %}
81
+ {% endfor %}
82
+ {%- else %}
83
+ {% for message in messages %}
84
+ {% if loop.first and message['role'] != 'system' %}
85
+ <|im_start|>system
86
+ You are a helpful assistant.<|im_end|>
87
+ {% endif %}
88
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
89
+ <|im_start|>{{ message['role'] }}
90
+ {% if message['content'] is string %}
91
+ {{ message['content'] }}<|im_end|>
92
+ {% else %}
93
+ {% for content in message['content'] %}
94
+ {% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}
95
+ {% set image_count.value = image_count.value + 1 %}
96
+ {% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>
97
+ {% elif content['type'] == 'video' or 'video' in content %}
98
+ {% set video_count.value = video_count.value + 1 %}
99
+ {% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>
100
+ {% elif 'text' in content %}
101
+ {{ content['text'] }}
102
+ {% endif %}
103
+ {% endfor %}<|im_end|>
104
+ {% endif %}
105
+ {%- elif message.role == "assistant" %}
106
+ {{- '<|im_start|>' + message.role }}
107
+ {%- if message.content %}
108
+ {{- '\n' + message.content }}
109
+ {%- endif %}
110
+ {%- for tool_call in message.tool_calls %}
111
+ {%- if tool_call.function is defined %}
112
+ {%- set tool_call = tool_call.function %}
113
+ {%- endif %}
114
+ {{- '\n<tool_call>\n{"name": "' }}
115
+ {{- tool_call.name }}
116
+ {{- '", "arguments": ' }}
117
+ {{- tool_call.arguments | tojson }}
118
+ {{- '}\n</tool_call>' }}
119
+ {%- endfor %}
120
+ {{- '<|im_end|>\n' }}
121
+ {%- elif message.role == "tool" %}
122
+ {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
123
+ {{- '<|im_start|>user' }}
124
+ {%- endif %}
125
+ {{- '\n<tool_response>\n' }}
126
+ {% if message['content'] is string %}
127
+ {{ message.content }}
128
+ {% else %}
129
+ {% for content in message['content'] %}
130
+ {% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}
131
+ {% set image_count.value = image_count.value + 1 %}
132
+ {% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>
133
+ {% elif content['type'] == 'video' or 'video' in content %}
134
+ {% set video_count.value = video_count.value + 1 %}
135
+ {% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>
136
+ {% elif content['type'] == 'text' or 'text' in content %}
137
+ {{ content['text'] }}
138
+ {% endif %}
139
+ {% endfor %}
140
+ {% endif %}
141
+ {{- '\n</tool_response>' }}
142
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
143
+ {{- '<|im_end|>\n' }}
144
+ {%- endif %}
145
+ {%- endif %}
146
+ {% endfor %}
147
+ {%- endif %}
148
+ {% if add_generation_prompt %}
149
+ <|im_start|>assistant
150
+ {% endif %}
code/RL_model/verl/verl_train/tests/experimental/agent_loop/test_basic_agent_loop.py ADDED
@@ -0,0 +1,454 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import json
15
+ import os
16
+ from typing import Any
17
+
18
+ import numpy as np
19
+ import pytest
20
+ import ray
21
+ from omegaconf import DictConfig
22
+ from transformers.utils import get_json_schema
23
+
24
+ from tests.experimental.agent_loop.agent_utils import init_agent_loop_manager
25
+ from verl.checkpoint_engine import CheckpointEngineManager
26
+ from verl.experimental.agent_loop import AgentLoopManager
27
+ from verl.experimental.agent_loop.agent_loop import get_trajectory_info
28
+ from verl.protocol import DataProto
29
+ from verl.tools.base_tool import BaseTool, OpenAIFunctionToolSchema
30
+ from verl.tools.schemas import ToolResponse
31
+ from verl.trainer.ppo.reward import compute_reward, load_reward_manager
32
+ from verl.utils import hf_tokenizer
33
+
34
+
35
@pytest.fixture
def init_config() -> DictConfig:
    """Compose the ``ppo_trainer`` hydra config and override it for async agent-loop tests."""
    from hydra import compose, initialize_config_dir

    with initialize_config_dir(config_dir=os.path.abspath("verl/trainer/config")):
        config = compose(
            config_name="ppo_trainer",
            overrides=[
                "actor_rollout_ref.actor.use_dynamic_bsz=true",
                # test sleep/wake_up with fsdp offload
                "actor_rollout_ref.actor.fsdp_config.param_offload=True",
                "actor_rollout_ref.actor.fsdp_config.optimizer_offload=True",
                "reward_model.reward_manager=dapo",
                "+reward_model.reward_kwargs.overlong_buffer_cfg.enable=False",
                "+reward_model.reward_kwargs.overlong_buffer_cfg.len=3072",
                "+reward_model.reward_kwargs.max_resp_len=4096",
            ],
        )

    # Local model checkpoint expected by the CI environment.
    model_path = os.path.expanduser("~/models/Qwen/Qwen2.5-1.5B-Instruct")
    config.actor_rollout_ref.model.path = model_path
    # Rollout backend (e.g. vllm / sglang) is selected via the environment.
    config.actor_rollout_ref.rollout.name = os.environ["ROLLOUT_NAME"]
    config.actor_rollout_ref.rollout.mode = "async"
    config.actor_rollout_ref.rollout.enforce_eager = True
    config.actor_rollout_ref.rollout.prompt_length = 4096
    config.actor_rollout_ref.rollout.response_length = 4096
    config.actor_rollout_ref.rollout.n = 4
    config.actor_rollout_ref.rollout.agent.num_workers = 2
    config.actor_rollout_ref.rollout.skip_tokenizer_init = True

    return config
66
+
67
+
68
def test_single_turn(init_config):
    """Single-turn agent loop: generate sequences for a small prompt batch and
    validate output shapes, reward computation, and turn counts.

    Args:
        init_config: trainer config produced by the ``init_config`` fixture.
    """
    ray.init(
        runtime_env={
            "env_vars": {
                "TOKENIZERS_PARALLELISM": "true",
                "NCCL_DEBUG": "WARN",
                "VLLM_LOGGING_LEVEL": "INFO",
                "VLLM_USE_V1": "1",
            }
        },
        # Consistent with test_tool_agent in this file: tolerate a Ray cluster
        # left running by a previously failed test instead of erroring out.
        ignore_reinit_error=True,
    )

    try:
        agent_loop_manager = AgentLoopManager(init_config)
        tokenizer = hf_tokenizer(init_config.actor_rollout_ref.model.path)
        reward_fn = load_reward_manager(
            init_config, tokenizer, num_examine=0, **init_config.reward_model.get("reward_kwargs", {})
        )

        raw_prompts = [
            [
                {
                    "role": "user",
                    "content": "Let's play a role playing game. Your name is Alice, your favorite color is blue.",
                }
            ],
            [{"role": "user", "content": "Let's play a role playing game. Your name is Bob, your favorite color is red."}],
        ]
        batch = DataProto(
            non_tensor_batch={
                "raw_prompt": np.array(raw_prompts),
                "agent_name": np.array(["single_turn_agent"] * len(raw_prompts)),
                "data_source": np.array(["openai/gsm8k"] * len(raw_prompts)),
                "reward_model": np.array([{"style": "rule", "ground_truth": "1.0"}] * len(raw_prompts)),
            },
        )
        # Expand the batch n-fold to match rollout.n samples per prompt.
        n = init_config.actor_rollout_ref.rollout.n
        batch = batch.repeat(n)
        result = agent_loop_manager.generate_sequences(prompts=batch)
        assert len(result) == len(raw_prompts) * n

        # check result: input_ids/attention_mask/position_ids cover prompt + response
        seq_len = result.batch["prompts"].size(1) + result.batch["responses"].size(1)
        assert result.batch["input_ids"].size(1) == seq_len
        assert result.batch["attention_mask"].size(1) == seq_len
        assert result.batch["position_ids"].size(1) == seq_len

        if init_config.actor_rollout_ref.rollout.calculate_log_probs:
            assert result.batch["rollout_log_probs"].size(1) == result.batch["responses"].size(1)

        # check compute score
        assert result.batch["rm_scores"].shape == result.batch["responses"].shape
        reward_tensor, reward_extra_info = compute_reward(result, reward_fn)
        assert reward_tensor.shape == result.batch["responses"].shape
        assert "acc" in reward_extra_info, f"reward_extra_info {reward_extra_info} should contain 'acc'"
        assert reward_extra_info["acc"].shape == (len(result),), f"invalid acc: {reward_extra_info['acc']}"

        # check turns: single-turn agent -> exactly one user + one assistant turn
        num_turns = result.non_tensor_batch["__num_turns__"]
        assert np.all(num_turns == 2)

        print("Test passed!")
    finally:
        # Shut down Ray even when an assertion fails so later tests start clean.
        ray.shutdown()
130
+
131
+
132
class WeatherTool(BaseTool):
    """Toy weather tool returning a fixed temperature; exercises tool-calling in agent-loop tests."""

    def get_current_temperature(self, location: str, unit: str = "celsius"):
        # NOTE: this docstring is parsed by get_json_schema() in get_openai_tool_schema()
        # and becomes the tool's OpenAI function schema — keep wording/format unchanged.
        """Get current temperature at a location.

        Args:
            location: The location to get the temperature for, in the format "City, State, Country".
            unit: The unit to return the temperature in. Defaults to "celsius". (choices: ["celsius", "fahrenheit"])

        Returns:
            the temperature, the location, and the unit in a dict
        """
        print(f"[DEBUG] get_current_temperature: {location}, {unit}")
        # Deterministic canned value so tests never depend on an external service.
        return {
            "temperature": 26.1,
            "location": location,
            "unit": unit,
        }

    def get_openai_tool_schema(self) -> OpenAIFunctionToolSchema:
        # Derive the tool schema directly from the method signature + docstring.
        schema = get_json_schema(self.get_current_temperature)
        return OpenAIFunctionToolSchema(**schema)

    async def execute(self, instance_id: str, parameters: dict[str, Any], **kwargs) -> tuple[ToolResponse, float, dict]:
        # Returns (tool_response, reward, metrics); errors are reported as response
        # text rather than raised, so a bad model call can't crash the agent loop.
        try:
            result = self.get_current_temperature(**parameters)
            return ToolResponse(text=json.dumps(result)), 0, {}
        except Exception as e:
            return ToolResponse(text=str(e)), 0, {}
160
+
161
+
162
class WeatherToolWithData(BaseTool):
    """Mock weather tool returning a fixed temperature for a given date.

    NOTE: the docstring of `get_temperature_date` is parsed by
    `get_json_schema` to build the OpenAI tool schema, so its
    Args/Returns layout must be preserved verbatim.
    """

    def get_temperature_date(self, location: str, date: str, unit: str = "celsius"):
        """Get temperature at a location and date.

        Args:
            location: The location to get the temperature for, in the format "City, State, Country".
            date: The date to get the temperature for, in the format "Year-Month-Day".
            unit: The unit to return the temperature in. Defaults to "celsius". (choices: ["celsius", "fahrenheit"])

        Returns:
            the temperature, the location, the date and the unit in a dict
        """
        print(f"[DEBUG] get_temperature_date: {location}, {date}, {unit}")
        # Fixed value: the tests only exercise the call/response plumbing.
        return {"temperature": 25.9, "location": location, "date": date, "unit": unit}

    def get_openai_tool_schema(self) -> OpenAIFunctionToolSchema:
        """Derive the OpenAI tool schema from the method docstring."""
        raw_schema = get_json_schema(self.get_temperature_date)
        return OpenAIFunctionToolSchema(**raw_schema)

    async def execute(self, instance_id: str, parameters: dict[str, Any], **kwargs) -> tuple[ToolResponse, float, dict]:
        """Run the tool; on failure return the error text instead of raising."""
        try:
            payload = self.get_temperature_date(**parameters)
        except Exception as exc:
            return ToolResponse(text=str(exc)), 0, {}
        return ToolResponse(text=json.dumps(payload)), 0, {}
192
+
193
+
194
def test_tool_agent(init_config):
    """End-to-end multi-turn rollout test with the mock weather tools.

    Starts a local Ray cluster, registers WeatherTool and WeatherToolWithData
    by import path, generates n rollouts per prompt via AgentLoopManager, then
    checks turn counts, tensor shapes, and that tool observations are masked
    out of the trainable `response_mask`.
    """
    ray.init(
        runtime_env={
            "env_vars": {
                "TOKENIZERS_PARALLELISM": "true",
                "NCCL_DEBUG": "WARN",
                "VLLM_LOGGING_LEVEL": "INFO",
                "VLLM_USE_V1": "1",
            }
        },
        ignore_reinit_error=True,
    )

    # =========================== 1. Init rollout manager ===========================
    # Tools are referenced by dotted import path so rollout workers can load them.
    tool_config = {
        "tools": [
            {
                "class_name": "tests.experimental.agent_loop.test_basic_agent_loop.WeatherTool",
                "config": {"type": "native"},
            },
            {
                "class_name": "tests.experimental.agent_loop.test_basic_agent_loop.WeatherToolWithData",
                "config": {"type": "native"},
            },
        ]
    }
    tool_config_path = "/tmp/tool_config.json"
    with open(tool_config_path, "w") as f:
        json.dump(tool_config, f)

    n = 2  # rollouts per prompt
    init_config.actor_rollout_ref.rollout.n = n
    init_config.actor_rollout_ref.rollout.multi_turn.tool_config_path = tool_config_path
    init_config.actor_rollout_ref.rollout.multi_turn.max_parallel_calls = 2
    init_config.actor_rollout_ref.rollout.calculate_log_probs = True
    agent_loop_manager = AgentLoopManager(init_config)

    # =========================== 2. Generate sequences ===========================
    # Prompt 0 needs no tool; prompts 1-3 are expected to trigger a tool call.
    raw_prompts = [
        [
            {"role": "user", "content": "How are you?"},
        ],
        [
            {"role": "user", "content": "What's the temperature in Los Angeles now?"},
        ],
        [
            {"role": "user", "content": "What's the temperature in New York now?"},
        ],
        [
            {
                "role": "system",
                "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant.\n\n"
                "Current Date: 2024-09-30",
            },
            {"role": "user", "content": "What's the temperature in San Francisco now? How about tomorrow?"},
        ],
    ]
    batch = DataProto(
        non_tensor_batch={
            "raw_prompt": np.array([np.array(prompt) for prompt in raw_prompts], dtype=object),
            "agent_name": np.array(["tool_agent"] * len(raw_prompts)),
            "data_source": np.array(["openai/gsm8k"] * len(raw_prompts)),
            "reward_model": np.array([{"style": "rule", "ground_truth": "1.0"}] * len(raw_prompts)),
        },
    )
    batch = batch.repeat(n)
    result = agent_loop_manager.generate_sequences(prompts=batch)
    assert len(result) == len(raw_prompts) * n

    # Check turns: i // n maps each rollout back to its original prompt index.
    num_turns = result.non_tensor_batch["__num_turns__"]
    print(f"num_turns: {num_turns}")
    for i in range(len(num_turns)):
        if i // n == 0:
            # [user, assistant]
            assert num_turns[i] == 2
        else:
            # [user, assistant, tool, assistant]
            assert num_turns[i] == 4

    # Check response_mask and per-sample tensor shapes.
    tokenizer = hf_tokenizer(init_config.actor_rollout_ref.model.path)
    responses = result.batch["responses"]
    response_mask = result.batch["response_mask"]
    attention_mask = result.batch["attention_mask"]
    assert result.batch["rm_scores"].size(1) == responses.size(1)
    assert responses.size() == response_mask.size(), f"{responses.size()} != {response_mask.size()}"
    assert result.batch["rollout_log_probs"].size(1) == result.batch["responses"].size(1)

    response_length = response_mask.size(1)
    for i in range(len(responses)):
        # response with tool response (attention_mask keeps observation tokens)
        valid_tokens = responses[i][attention_mask[i][-response_length:].bool()]
        response_with_obs = tokenizer.decode(valid_tokens)

        # response without tool response (response_mask drops observation tokens)
        valid_tokens = responses[i][response_mask[i].bool()]
        response_without_obs = tokenizer.decode(valid_tokens)

        # Tool observations must not be trained on, so they must be masked out.
        assert "<tool_response>" not in response_without_obs, (
            f"found <tool_response> in response: {response_without_obs}"
        )
        assert "</tool_response>" not in response_without_obs, (
            f"found </tool_response> in response: {response_without_obs}"
        )
        print("=========================")
        print(response_with_obs)
        print("---")
        print(response_without_obs)

    print("Test passed!")
    ray.shutdown()
306
+
307
+
308
def test_tool_agent_with_interaction(init_config):
    """Multi-turn rollout with both tools and a user-simulating interaction.

    Like `test_tool_agent`, but additionally wires a `WeatherInteraction`
    (which appends a follow-up user turn, hence +1 expected turn per sample)
    and exercises the checkpoint-engine sleep/weight-sync path before
    generation.
    """
    ray.init(
        runtime_env={
            "env_vars": {
                "TOKENIZERS_PARALLELISM": "true",
                "NCCL_DEBUG": "WARN",
                "VLLM_LOGGING_LEVEL": "INFO",
                "VLLM_USE_V1": "1",
            }
        }
    )

    # =========================== 1. Init rollout manager ===========================
    tool_config = {
        "tools": [
            {
                "class_name": "tests.experimental.agent_loop.test_basic_agent_loop.WeatherTool",
                "config": {"type": "native"},
            },
            {
                "class_name": "tests.experimental.agent_loop.test_basic_agent_loop.WeatherToolWithData",
                "config": {"type": "native"},
            },
        ]
    }
    tool_config_path = "/tmp/tool_config.json"
    with open(tool_config_path, "w") as f:
        json.dump(tool_config, f)

    interaction_config = {
        "interaction": [
            {"name": "weather", "class_name": "verl.interactions.weather_interaction.WeatherInteraction", "config": {}}
        ]
    }
    interaction_config_path = "/tmp/interaction_config.json"
    with open(interaction_config_path, "w") as f:
        json.dump(interaction_config, f)

    n = 2  # rollouts per prompt
    init_config.actor_rollout_ref.rollout.n = n
    init_config.actor_rollout_ref.rollout.multi_turn.tool_config_path = tool_config_path
    init_config.actor_rollout_ref.rollout.multi_turn.interaction_config_path = interaction_config_path
    init_config.actor_rollout_ref.rollout.multi_turn.max_parallel_calls = 2
    agent_loop_manager = init_agent_loop_manager(init_config)
    checkpoint_manager = CheckpointEngineManager(
        backend=init_config.actor_rollout_ref.rollout.checkpoint_engine.backend,
        trainer=agent_loop_manager.worker_group,
        replicas=agent_loop_manager.rollout_replicas,
    )
    # Offload rollout replicas, then push trainer weights before generating.
    checkpoint_manager.sleep_replicas()
    checkpoint_manager.update_weights()

    # =========================== 2. Generate sequences ===========================
    raw_prompts = [
        [
            {"role": "user", "content": "How are you?"},
        ],
        [
            {"role": "user", "content": "What's the temperature in Los Angeles now?"},
        ],
        [
            {"role": "user", "content": "What's the temperature in New York now?"},
        ],
        [
            {
                "role": "system",
                "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant.\n\n"
                "Current Date: 2024-09-30",
            },
            {"role": "user", "content": "What's the temperature in San Francisco now? How about tomorrow?"},
        ],
    ]
    batch = DataProto(
        non_tensor_batch={
            "raw_prompt": np.array([np.array(prompt) for prompt in raw_prompts], dtype=object),
            "agent_name": np.array(["tool_agent"] * len(raw_prompts)),
            "data_source": np.array(["openai/gsm8k"] * len(raw_prompts)),
            "reward_model": np.array([{"style": "rule", "ground_truth": "1.0"}] * len(raw_prompts)),
            # One interaction spec per prompt; selects the "weather" interaction.
            "extra_info": np.array(
                [
                    {"interaction_kwargs": {"name": "weather"}},
                    {"interaction_kwargs": {"name": "weather"}},
                    {"interaction_kwargs": {"name": "weather"}},
                    {"interaction_kwargs": {"name": "weather"}},
                ]
            ),
        },
    )
    batch = batch.repeat(n)
    result = agent_loop_manager.generate_sequences(prompts=batch)
    assert len(result) == len(raw_prompts) * n

    # Check turns: the interaction adds one trailing user turn vs test_tool_agent.
    num_turns = result.non_tensor_batch["__num_turns__"]
    print(f"num_turns: {num_turns}")
    for i in range(len(num_turns)):
        if i // n == 0:
            # [user, assistant, user]
            assert num_turns[i] == 3
        else:
            # [user, assistant, tool, assistant, user]
            assert num_turns[i] == 5

    # Check response_mask
    tokenizer = hf_tokenizer(init_config.actor_rollout_ref.model.path)
    responses = result.batch["responses"]
    response_mask = result.batch["response_mask"]
    attention_mask = result.batch["attention_mask"]
    assert responses.size() == response_mask.size(), f"{responses.size()} != {response_mask.size()}"
    response_length = response_mask.size(1)

    for i in range(len(responses)):
        # response with tool response
        valid_tokens = responses[i][attention_mask[i][-response_length:].bool()]
        response_with_obs = tokenizer.decode(valid_tokens)

        # response without tool response
        valid_tokens = responses[i][response_mask[i].bool()]
        response_without_obs = tokenizer.decode(valid_tokens)

        # NOTE(review): these surrogate-pair literals presumably decode the
        # model's special observation-delimiter tokens — confirm against the
        # tokenizer's added-token table.
        assert "\udb82\udc89" not in response_without_obs, f"found \udb82\udc89 in response: {response_without_obs}"
        assert "\udb82\udc8a" not in response_without_obs, f"found \udb82\udc8a in response: {response_without_obs}"
        print("=========================")
        print(response_with_obs)
        print("---")
        print(response_without_obs)

    print("Test passed!")
    ray.shutdown()
437
+
438
+
439
@pytest.mark.asyncio
async def test_get_trajectory_info():
    """Tests the get_trajectory_info method."""
    step = 10
    sample_indices = [1, 1, 3, 3]
    # Repeated sample indices get an increasing rollout_n counter (0, 1, ...).
    expected_info = [
        {"step": step, "sample_index": sample, "rollout_n": rollout, "validate": False}
        for sample, rollout in ((1, 0), (1, 1), (3, 0), (3, 1))
    ]

    actual_info = await get_trajectory_info(step, sample_indices, validate=False)

    assert actual_info == expected_info
code/RL_model/verl/verl_train/tests/experimental/agent_loop/test_gpt_oss_tool_parser.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import pytest
15
+ from transformers import AutoTokenizer
16
+
17
+ from verl.experimental.agent_loop.tool_parser import GptOssToolParser
18
+
19
+
20
@pytest.mark.asyncio
@pytest.mark.skip(reason="local test only")
async def test_gpt_oss_tool_parser():
    """Extract a single function call from a GPT-OSS style transcript."""
    example_text = """
<|start|>assistant<|channel|>commentary to=functions.get_current_weather \
<|constrain|>json<|message|>{"location": "Tokyo"}<|call|>
<|start|>functions.get_current_weather to=assistant<|channel|>commentary<|message|>\
{ "temperature": 20, "sunny": true }<|end|>"""
    tokenizer = AutoTokenizer.from_pretrained("openai/gpt-oss-20b")
    token_ids = tokenizer.encode(example_text)
    parser = GptOssToolParser(tokenizer)
    _, calls = await parser.extract_tool_calls(token_ids)
    assert len(calls) == 1
    assert calls[0].name == "get_current_weather"
    assert calls[0].arguments == '{"location": "Tokyo"}'
code/RL_model/verl/verl_train/tests/experimental/agent_loop/test_multi_modal.py ADDED
@@ -0,0 +1,570 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import json
15
+ import os
16
+ from typing import Any
17
+
18
+ import numpy as np
19
+ import pytest
20
+ import ray
21
+ from omegaconf import DictConfig
22
+ from PIL import Image
23
+ from transformers.utils import get_json_schema
24
+
25
+ from verl.experimental.agent_loop import AgentLoopManager
26
+ from verl.protocol import DataProto
27
+ from verl.tools.base_tool import BaseTool, OpenAIFunctionToolSchema
28
+ from verl.tools.schemas import ToolResponse
29
+ from verl.utils import hf_tokenizer
30
+
31
+
32
def parse_multi_modal_type(messages: list[dict]) -> str:
    """Classify the last chat message as "text", "image", or "video".

    A plain-string content is "text"; otherwise the first "image" or "video"
    part (in content order) decides the type, falling back to "text".
    """
    last_content = messages[-1]["content"]
    if isinstance(last_content, str):
        return "text"

    for part in last_content:
        kind = part["type"]
        if kind in ("image", "video"):
            return kind

    return "text"
44
+
45
+
46
@pytest.fixture
def init_config() -> DictConfig:
    """Build a ppo_trainer hydra config tuned for Qwen2.5-VL async rollout tests.

    NOTE(review): requires the ROLLOUT_NAME env var to be set and the model to
    be downloaded locally at ~/models/Qwen/Qwen2.5-VL-3B-Instruct — confirm in CI.
    """
    from hydra import compose, initialize_config_dir

    with initialize_config_dir(config_dir=os.path.abspath("verl/trainer/config")):
        config = compose(
            config_name="ppo_trainer",
            overrides=[
                "actor_rollout_ref.actor.use_dynamic_bsz=true",
                # test sleep/wake_up with fsdp offload
                "actor_rollout_ref.actor.fsdp_config.param_offload=True",
                "actor_rollout_ref.actor.fsdp_config.optimizer_offload=True",
            ],
        )

    model_path = os.path.expanduser("~/models/Qwen/Qwen2.5-VL-3B-Instruct")
    config.actor_rollout_ref.model.path = model_path
    config.actor_rollout_ref.rollout.name = os.environ["ROLLOUT_NAME"]
    config.actor_rollout_ref.rollout.mode = "async"
    config.actor_rollout_ref.rollout.enforce_eager = True
    # Large prompt budget: image/video tokens can dominate the prompt length.
    config.actor_rollout_ref.rollout.prompt_length = 10240
    config.actor_rollout_ref.rollout.response_length = 4096
    config.actor_rollout_ref.rollout.n = 4
    config.actor_rollout_ref.rollout.agent.num_workers = 2
    config.actor_rollout_ref.rollout.skip_tokenizer_init = True

    return config
73
+
74
+
75
class ImageGeneratorTool(BaseTool):
    """Mock image-generation tool producing patterned solid-color test images.

    NOTE: the docstring of `generate_image` is parsed by `get_json_schema`
    to build the OpenAI tool schema, so its Args/Returns layout must be
    preserved verbatim.
    """

    def generate_image(self, description: str, size: str = "256x256"):
        """Generate a simple image based on description.

        Args:
            description: The description of the image to generate.
            size: The size of the image. Defaults to "256x256". (choices: ["256x256", "512x512"])

        Returns:
            A generated image
        """
        print(f"[DEBUG] generate_image: {description}, {size}")
        width, height = map(int, size.split("x"))

        # First matching color name wins; anything else falls back to gray.
        palette = (("red", (255, 0, 0)), ("blue", (0, 0, 255)), ("green", (0, 255, 0)))
        lowered = description.lower()
        color = next((rgb for name, rgb in palette if name in lowered), (128, 128, 128))

        image = Image.new("RGB", (width, height), color)

        # Stamp 20x20 white squares on a 50-pixel grid so the image is not uniform.
        for x0 in range(0, width, 50):
            for y0 in range(0, height, 50):
                for px in range(x0, min(x0 + 20, width)):
                    for py in range(y0, min(y0 + 20, height)):
                        image.putpixel((px, py), (255, 255, 255))

        return image

    def get_openai_tool_schema(self) -> OpenAIFunctionToolSchema:
        """Derive the OpenAI tool schema from the method docstring."""
        raw_schema = get_json_schema(self.generate_image)
        return OpenAIFunctionToolSchema(**raw_schema)

    async def execute(self, instance_id: str, parameters: dict[str, Any], **kwargs) -> tuple[ToolResponse, float, dict]:
        """Run the tool; on success return the PIL image, otherwise the error text."""
        try:
            generated = self.generate_image(**parameters)
        except Exception as exc:
            return ToolResponse(text=str(exc)), 0, {}
        # Return the PIL Image directly - the framework handles the conversion.
        return ToolResponse(image=[generated]), 0, {}
124
+
125
+
126
@pytest.mark.flaky(reruns=3)
def test_multimodal_tool_agent(init_config):
    """Test agent loop with multimodal tool that returns images using Qwen VL model."""
    ray.shutdown()
    ray.init(
        runtime_env={
            "env_vars": {
                "TOKENIZERS_PARALLELISM": "true",
                "NCCL_DEBUG": "WARN",
                "VLLM_LOGGING_LEVEL": "INFO",
                "VLLM_USE_V1": "1",
            }
        },
        ignore_reinit_error=True,
    )

    # Add custom chat template to enable tool calling support (same as recipe/deepeyes)
    template_path = os.path.join(os.path.dirname(__file__), "qwen_vl_tool_chat_template.jinja2")
    with open(template_path, encoding="utf-8") as f:
        custom_chat_template = f.read()

    init_config.actor_rollout_ref.model.custom_chat_template = custom_chat_template

    # =========================== 1. Init rollout manager with image tool ===========================
    tool_config = {
        "tools": [
            {
                "class_name": "tests.experimental.agent_loop.test_multi_modal.ImageGeneratorTool",
                "config": {"type": "native"},
            },
        ]
    }
    tool_config_path = "/tmp/multimodal_tool_config.json"
    with open(tool_config_path, "w") as f:
        json.dump(tool_config, f)

    n = 2  # rollouts per prompt
    init_config.actor_rollout_ref.rollout.n = n
    init_config.actor_rollout_ref.rollout.multi_turn.tool_config_path = tool_config_path
    init_config.actor_rollout_ref.rollout.multi_turn.max_parallel_calls = 1
    init_config.actor_rollout_ref.rollout.multi_turn.max_user_turns = 1
    agent_loop_manager = AgentLoopManager(init_config)

    # =========================== 2. Generate sequences with multimodal prompts ===========================
    # Prompt 0: text-only; prompt 1: video input; prompts 2-4: should trigger tool calls.
    raw_prompts = [
        [
            {"role": "user", "content": "How are you?"},
        ],
        [
            {
                "role": "user",
                "content": [
                    {
                        "type": "video",
                        "video": os.path.expanduser("~/models/hf_data/test-videos/space_woaudio.mp4"),
                        "min_pixels": 4 * 32 * 32,
                        "max_pixels": 256 * 32 * 32,
                        "total_pixels": 4096 * 32 * 32,
                    },
                    {
                        "type": "text",
                        "text": "Describe this video. Then you must call the "
                        "image generator tool to generate a green image for me.",
                    },
                ],
            },
        ],
        [
            {"role": "user", "content": "Please generate a red image for me."},
        ],
        [
            {"role": "user", "content": "Can you create a blue picture with size 512x512?"},
        ],
        [
            {
                "role": "system",
                "content": (
                    "You are Qwen VL, created by Alibaba Cloud. You are a helpful "
                    "assistant that can generate and analyze images."
                ),
            },
            {"role": "user", "content": "Generate a green landscape image and describe what you see in it."},
        ],
    ]

    batch = DataProto(
        non_tensor_batch={
            "raw_prompt": np.array([np.array(prompt) for prompt in raw_prompts], dtype=object),
            "agent_name": np.array(["tool_agent"] * len(raw_prompts)),
            "data_source": np.array(["openai/gsm8k"] * len(raw_prompts)),
            "reward_model": np.array([{"style": "rule", "ground_truth": "1.0"}] * len(raw_prompts)),
        },
    )
    batch = batch.repeat(n)
    result = agent_loop_manager.generate_sequences(prompts=batch)
    assert len(result) == len(raw_prompts) * n

    # Check turns; i // n maps each rollout back to its original prompt index.
    num_turns = result.non_tensor_batch["__num_turns__"]
    multi_modal_inputs = result.non_tensor_batch["multi_modal_inputs"]
    print(f"num_turns: {num_turns}")
    for i in range(len(num_turns)):
        multi_modal_type = parse_multi_modal_type(raw_prompts[i // n])
        if multi_modal_type == "video":
            assert "pixel_values_videos" in multi_modal_inputs[i], f"Sample {i} should have pixel_values_videos"
            assert "video_grid_thw" in multi_modal_inputs[i], f"Sample {i} should have video_grid_thw"

        if i // n <= 1:
            # TODO: prompt with video not generate tool call as expected
            # First prompt: "How are you?" - should have 2 turns [user, assistant]
            assert num_turns[i] == 2, f"Expected 2 turns but got {num_turns[i]} for sample {i}"
        else:
            # Tool-calling prompts should have 4 turns [user, assistant, tool, assistant]
            assert num_turns[i] == 4, f"Expected 4 turns but got {num_turns[i]} for sample {i}"
            assert "pixel_values" in multi_modal_inputs[i], f"Sample {i} should have pixel_values"
            assert "image_grid_thw" in multi_modal_inputs[i], f"Sample {i} should have image_grid_thw"

    # Check that images were properly returned in the tool responses
    tokenizer = hf_tokenizer(init_config.actor_rollout_ref.model.path)
    responses = result.batch["responses"]
    response_mask = result.batch["response_mask"]
    attention_mask = result.batch["attention_mask"]
    assert responses.size() == response_mask.size(), f"{responses.size()} != {response_mask.size()}"
    response_length = response_mask.size(1)

    image_found_count = 0
    for i in range(len(responses)):
        # response with tool response (including images)
        valid_tokens = responses[i][attention_mask[i][-response_length:].bool()]
        response_with_obs = tokenizer.decode(valid_tokens)

        # response without tool response
        valid_tokens = responses[i][response_mask[i].bool()]
        response_without_obs = tokenizer.decode(valid_tokens)

        # Check that tool responses were properly masked out from training
        assert "<tool_response>" not in response_without_obs, (
            f"found <tool_response> in response: {response_without_obs}"
        )
        assert "</tool_response>" not in response_without_obs, (
            f"found </tool_response> in response: {response_without_obs}"
        )

        # Check that images were included in the full response
        if "<image>" in response_with_obs or "image" in response_with_obs.lower():
            image_found_count += 1

        print("=========================")
        print("Response with tool observations:")
        print(response_with_obs)
        print("---")
        print("Response without tool observations:")
        print(response_without_obs)

    # Verify that tool-calling responses contained image-related content
    print(f"Found {image_found_count} responses with image content out of {len(responses)}")
    # We should have at least some image content from the tool-calling prompts
    # Note: First prompt might not use tools, so we don't expect 100% image content
    expected_tool_calls = sum(1 for i in range(len(num_turns)) if num_turns[i] == 4)
    # NOTE(review): `>= 0` is vacuously true, so this assert can never fail —
    # presumably `> 0` (or `>= expected_tool_calls`) was intended; confirm
    # whether it was deliberately relaxed to keep the flaky test stable.
    assert image_found_count >= 0, (
        f"No image-related content found, but expected at least some from {expected_tool_calls} tool calls"
    )

    print("Multimodal tool test passed!")
    ray.shutdown()
291
+
292
+
293
def test_multimodal_single_turn_agent(init_config):
    """Test single turn agent loop with multimodal inputs using Qwen VL model.

    Covers text-only, image, system+image, and video prompts; every sample
    must complete in a single turn, and the multi-modal inputs must carry the
    modality-appropriate tensors (pixel_values / pixel_values_videos).

    Fix: `image_pad_count` was initialized and reported but never incremented
    (`has_image_pad` was computed and then discarded), so the summary always
    printed 0; it is now counted per sample.
    """
    ray.init(
        runtime_env={
            "env_vars": {
                "TOKENIZERS_PARALLELISM": "true",
                "NCCL_DEBUG": "WARN",
                "VLLM_LOGGING_LEVEL": "INFO",
                "VLLM_USE_V1": "1",
            }
        },
        ignore_reinit_error=True,
    )

    # =========================== 1. Init rollout manager ===========================
    n = 2  # rollouts per prompt
    init_config.actor_rollout_ref.rollout.n = n
    init_config.actor_rollout_ref.rollout.multi_turn.max_parallel_calls = 1
    init_config.actor_rollout_ref.rollout.multi_turn.max_user_turns = 1
    agent_loop_manager = AgentLoopManager(init_config)

    # =========================== 2. Generate sequences with multimodal prompts ===========================
    # Create a simple test image
    test_image = Image.new("RGB", (256, 256), (100, 150, 200))
    test_image2 = Image.new("RGB", (512, 512), (100, 150, 200))

    raw_prompts = [
        # text
        [
            {"role": "user", "content": "Hello, how are you?"},
        ],
        # image
        [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": test_image},
                    {"type": "text", "text": "What color is this image?"},
                ],
            },
        ],
        # system + image
        [
            {
                "role": "system",
                "content": "You are Qwen VL, created by Alibaba Cloud. You are a helpful assistant.",
            },
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": test_image2},
                    {"type": "text", "text": "Describe this image in detail."},
                ],
            },
        ],
        # video
        [
            {
                "role": "user",
                "content": [
                    {
                        "type": "video",
                        "video": os.path.expanduser("~/models/hf_data/test-videos/space_woaudio.mp4"),
                        "min_pixels": 4 * 32 * 32,
                        "max_pixels": 256 * 32 * 32,
                        "total_pixels": 4096 * 32 * 32,
                    },
                    {"type": "text", "text": "Describe this video."},
                ],
            },
        ],
    ]

    batch = DataProto(
        non_tensor_batch={
            "raw_prompt": np.array([np.array(prompt) for prompt in raw_prompts], dtype=object),
            "agent_name": np.array(["single_turn_agent"] * len(raw_prompts)),
            "data_source": np.array(["openai/gsm8k"] * len(raw_prompts)),
            "reward_model": np.array([{"style": "rule", "ground_truth": "1.0"}] * len(raw_prompts)),
        },
    )

    batch = batch.repeat(n)
    result = agent_loop_manager.generate_sequences(prompts=batch)
    assert len(result) == len(raw_prompts) * n

    # Check turns - all should be single turn (2: user + assistant)
    num_turns = result.non_tensor_batch["__num_turns__"]
    print(f"num_turns: {num_turns}")
    for i in range(len(num_turns)):
        assert num_turns[i] == 2, f"Expected 2 turns but got {num_turns[i]} for sample {i}"

    # Verify responses
    tokenizer = hf_tokenizer(init_config.actor_rollout_ref.model.path)
    prompts = result.batch["prompts"]
    responses = result.batch["responses"]
    response_mask = result.batch["response_mask"]
    input_ids = result.batch["input_ids"]
    position_ids = result.batch["position_ids"]
    multi_modal_inputs = result.non_tensor_batch["multi_modal_inputs"]
    assert responses.size() == response_mask.size(), f"{responses.size()} != {response_mask.size()}"
    assert position_ids.size() == (input_ids.size(0), 4, input_ids.size(1))  # (batch_size, 4, seq_len)

    # Check for image pads in prompts
    image_pad_count = 0
    for i in range(len(prompts)):
        prompt_ids = prompts[i][prompts[i] != tokenizer.pad_token_id].tolist()
        prompt_text = tokenizer.decode(prompt_ids)

        # Check if this sample should have image pads (samples with index 1 and 2 in each repeat have images)
        sample_idx = i // n
        has_image_pad = "<|image_pad|>" in prompt_text or "<|vision_start|>" in prompt_text
        if has_image_pad:
            # Fix: previously never incremented, so the summary always printed 0.
            image_pad_count += 1

        print("=========================")
        print(f"Sample {i} (original prompt index: {sample_idx}):")
        print(f"Prompt length: {len(prompt_ids)} tokens")
        print(f"Has image_pad: {has_image_pad}")

        # Check multi-modal type
        multi_modal_type = parse_multi_modal_type(raw_prompts[sample_idx])

        if multi_modal_type == "text":
            assert len(multi_modal_inputs[i]) == 0, f"Sample {i} should not have multi-modal inputs"
        elif multi_modal_type == "image":
            assert "pixel_values" in multi_modal_inputs[i], f"Sample {i} should have pixel_values"
            assert "image_grid_thw" in multi_modal_inputs[i], f"Sample {i} should have image_grid_thw"
        else:
            assert "pixel_values_videos" in multi_modal_inputs[i], f"Sample {i} should have pixel_values_videos"
            assert "video_grid_thw" in multi_modal_inputs[i], f"Sample {i} should have video_grid_thw"

        # Show first 200 chars of prompt
        print(f"Prompt text (first 200 chars): {prompt_text[:200]}...")

    for i in range(len(responses)):
        valid_tokens = responses[i][response_mask[i].bool()]
        response_text = tokenizer.decode(valid_tokens)
        print(f"Sample {i} response: {response_text[:100]}...")

    # Verify that we found image pads in multimodal samples
    expected_multimodal_samples = 2 * n  # 2 prompts with images, repeated n times
    print(f"\nFound {image_pad_count} samples with image_pad out of {expected_multimodal_samples} expected")

    print("Single turn multimodal test passed!")
    ray.shutdown()
437
+
438
+
439
+ def test_multimodal_partial_single_turn_agent(init_config):
440
+ """Test partial single turn agent loop with multimodal inputs using Qwen VL model."""
441
+
442
+ # TODO(baiyan):
443
+ # see verl/recipe/fully_async_policy/agent_loop/partial_single_turn_agent_loop.py for more details.
444
+ # if use_correct_processor=True, the test will pass but the async training will hang, so I disable this test
445
+ # for now
446
+
447
+ return
448
+
449
+ ray.init(
450
+ runtime_env={
451
+ "env_vars": {
452
+ "TOKENIZERS_PARALLELISM": "true",
453
+ "NCCL_DEBUG": "WARN",
454
+ "VLLM_LOGGING_LEVEL": "INFO",
455
+ "VLLM_USE_V1": "1",
456
+ }
457
+ },
458
+ ignore_reinit_error=True,
459
+ )
460
+ from verl.experimental.fully_async_policy.agent_loop import FullyAsyncAgentLoopManager
461
+
462
+ # =========================== 1. Init rollout manager ===========================
463
+ n = 2
464
+ init_config.actor_rollout_ref.rollout.n = n
465
+ init_config.actor_rollout_ref.rollout.multi_turn.max_parallel_calls = 1
466
+ init_config.actor_rollout_ref.rollout.multi_turn.max_user_turns = 1
467
+ import asyncio
468
+
469
+ loop = asyncio.new_event_loop()
470
+ asyncio.set_event_loop(loop)
471
+ agent_loop_manager = loop.run_until_complete(FullyAsyncAgentLoopManager.create(init_config))
472
+
473
+ # =========================== 2. Generate sequences with multimodal prompts ===========================
474
+ # Create a simple test image
475
+ test_image = Image.new("RGB", (256, 256), (200, 100, 50))
476
+ test_image2 = Image.new("RGB", (512, 512), (100, 150, 200))
477
+
478
+ raw_prompts = [
479
+ [
480
+ {"role": "user", "content": "What is the capital of France?"},
481
+ ],
482
+ [
483
+ {
484
+ "role": "user",
485
+ "content": [
486
+ {"type": "image", "image": test_image},
487
+ {"type": "text", "text": "What do you see in this image?"},
488
+ ],
489
+ },
490
+ ],
491
+ [
492
+ {
493
+ "role": "system",
494
+ "content": "You are Qwen VL, a helpful multimodal assistant.",
495
+ },
496
+ {
497
+ "role": "user",
498
+ "content": [
499
+ {"type": "image", "image": test_image2},
500
+ {"type": "text", "text": "Analyze the colors in this image."},
501
+ ],
502
+ },
503
+ ],
504
+ ]
505
+
506
+ batch = DataProto(
507
+ non_tensor_batch={
508
+ "raw_prompt": np.array([np.array(prompt) for prompt in raw_prompts], dtype=object),
509
+ "agent_name": np.array(["partial_single_turn_agent"] * len(raw_prompts)),
510
+ "data_source": np.array(["openai/gsm8k"] * len(raw_prompts)),
511
+ "reward_model": np.array([{"style": "rule", "ground_truth": "1.0"}] * len(raw_prompts)),
512
+ },
513
+ )
514
+
515
+ batch = batch.repeat(n)
516
+ result = agent_loop_manager.generate_sequences(prompts=batch)
517
+ assert len(result) == len(raw_prompts) * n
518
+
519
+ # Check turns - all should be single turn (2: user + assistant)
520
+ num_turns = result.non_tensor_batch["__num_turns__"]
521
+ print(f"num_turns: {num_turns}")
522
+ for i in range(len(num_turns)):
523
+ assert num_turns[i] == 2, f"Expected 2 turns but got {num_turns[i]} for sample {i}"
524
+
525
+ # Verify responses
526
+ tokenizer = hf_tokenizer(init_config.actor_rollout_ref.model.path)
527
+ prompts = result.batch["prompts"]
528
+ responses = result.batch["responses"]
529
+ response_mask = result.batch["response_mask"]
530
+ assert responses.size() == response_mask.size(), f"{responses.size()} != {response_mask.size()}"
531
+
532
+ # Check for image pads in prompts
533
+ image_pad_count = 0
534
+ for i in range(len(prompts)):
535
+ prompt_ids = prompts[i][prompts[i] != tokenizer.pad_token_id].tolist()
536
+ prompt_text = tokenizer.decode(prompt_ids)
537
+
538
+ # Check if this sample should have image pads (samples with index 1 and 2 in each repeat have images)
539
+ sample_idx = i // n
540
+ has_image_pad = "<|image_pad|>" in prompt_text or "<|vision_start|>" in prompt_text
541
+
542
+ print("=========================")
543
+ print(f"Sample {i} (original prompt index: {sample_idx}):")
544
+ print(f"Prompt length: {len(prompt_ids)} tokens")
545
+ print(f"Has image_pad: {has_image_pad}")
546
+
547
+ if sample_idx != 0: # Samples 1 and 2 should have images
548
+ if has_image_pad:
549
+ image_pad_count += 1
550
+ # Count the number of image_pad tokens
551
+ num_image_pads = prompt_text.count("<|image_pad|>")
552
+ print(f"Number of <|image_pad|> tokens: {num_image_pads}")
553
+ else:
554
+ print("WARNING: Expected image_pad but not found!")
555
+
556
+ # Show first 200 chars of prompt
557
+ print(f"Prompt text (first 200 chars): {prompt_text[:200]}...")
558
+
559
+ for i in range(len(responses)):
560
+ valid_tokens = responses[i][response_mask[i].bool()]
561
+ response_text = tokenizer.decode(valid_tokens)
562
+ print(f"Sample {i} response: {response_text[:100]}...")
563
+
564
+ # Verify that we found image pads in multimodal samples
565
+ expected_multimodal_samples = 2 * n # 2 prompts with images, repeated n times
566
+ print(f"\nFound {image_pad_count} samples with image_pad out of {expected_multimodal_samples} expected")
567
+ assert image_pad_count > 0, "No image_pad tokens found in multimodal samples!"
568
+
569
+ print("Partial single turn multimodal test passed!")
570
+ ray.shutdown()
code/RL_model/verl/verl_train/tests/experimental/agent_loop/test_standalone_rollout.py ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import asyncio
15
+ import os
16
+
17
+ import pytest
18
+ import ray
19
+ from omegaconf import DictConfig
20
+ from openai import AsyncOpenAI, OpenAI
21
+
22
+ from tests.experimental.agent_loop.agent_utils import init_agent_loop_manager
23
+ from verl.checkpoint_engine import CheckpointEngineManager
24
+ from verl.workers.rollout.replica import get_rollout_replica_class
25
+
26
+
27
@pytest.fixture
def init_config() -> DictConfig:
    """Build the ``ppo_trainer`` Hydra config used by the standalone rollout tests.

    Configures a 2-node x 4-GPU layout with async rollout mode for a local
    Qwen2.5-1.5B-Instruct checkpoint.

    Returns:
        DictConfig: the composed trainer configuration.
    """
    from hydra import compose, initialize_config_dir

    with initialize_config_dir(config_dir=os.path.abspath("verl/trainer/config")):
        config = compose(config_name="ppo_trainer")

    config.trainer.n_gpus_per_node = 4
    config.trainer.nnodes = 2
    config.actor_rollout_ref.actor.use_dynamic_bsz = True
    config.actor_rollout_ref.model.path = os.path.expanduser("~/models/Qwen/Qwen2.5-1.5B-Instruct")
    # Default to vllm when ROLLOUT_NAME is unset, consistent with the other
    # experimental tests (previously os.environ["ROLLOUT_NAME"] raised a hard
    # KeyError when the variable was missing).
    config.actor_rollout_ref.rollout.name = os.getenv("ROLLOUT_NAME", "vllm")
    config.actor_rollout_ref.rollout.mode = "async"
    config.actor_rollout_ref.rollout.skip_tokenizer_init = False

    return config
43
+
44
+
45
@pytest.mark.asyncio
@pytest.mark.parametrize("tp_size", [2, 4])
async def test_standalone_rollout(init_config, tp_size):
    """Test standalone rollout single node and multi nodes.

    Brings up ``num_replicas`` standalone rollout servers (no trainer attached)
    and smoke-tests one of them through the OpenAI-compatible HTTP endpoint.
    """
    ray.init(
        runtime_env={
            "env_vars": {
                "TOKENIZERS_PARALLELISM": "true",
                "NCCL_DEBUG": "WARN",
                "VLLM_LOGGING_LEVEL": "INFO",
                "VLLM_USE_V1": "1",
                "NCCL_P2P_DISABLE": "1",  # disable p2p in L20
            }
        }
    )

    init_config.actor_rollout_ref.rollout.tensor_model_parallel_size = tp_size
    # One replica per TP group across the whole cluster (8 GPUs total here).
    num_replicas = (init_config.trainer.n_gpus_per_node * init_config.trainer.nnodes) // tp_size
    rollout_config = init_config.actor_rollout_ref.rollout
    model_config = init_config.actor_rollout_ref.model

    # create standalone rollout server
    rollout_server_class = get_rollout_replica_class(init_config.actor_rollout_ref.rollout.name)
    rollout_servers = [
        rollout_server_class(
            # NOTE(review): gpus_per_node=2 is hard-coded while the fixture sets
            # trainer.n_gpus_per_node=4 -- presumably intentional to exercise
            # multi-node placement of a replica; confirm against replica impl.
            replica_rank=replica_rank, config=rollout_config, model_config=model_config, gpus_per_node=2
        )
        for replica_rank in range(num_replicas)
    ]
    # Boot all replicas concurrently.
    await asyncio.gather(*[server.init_standalone() for server in rollout_servers])

    server_handles = [server._server_handle for server in rollout_servers]
    server_addresses = [server._server_address for server in rollout_servers]
    assert len(server_handles) == num_replicas
    assert len(server_addresses) == num_replicas

    # Clear proxy settings so the local HTTP request below goes direct.
    # NOTE(review): popping NO_PROXY (rather than setting it) is unusual -- verify.
    os.environ.pop("HTTPS_PROXY", None)
    os.environ.pop("HTTP_PROXY", None)
    os.environ.pop("NO_PROXY", None)

    # Any non-empty key works; the local server does not authenticate.
    client = AsyncOpenAI(
        api_key="123-abc",
        base_url=f"http://{server_addresses[0]}/v1",
    )

    completion = await client.chat.completions.create(
        model=init_config.actor_rollout_ref.model.path,
        messages=[{"role": "user", "content": "What can you do?"}],
    )
    print(completion.choices[0].message.content)

    ray.shutdown()
97
+
98
+
99
@pytest.mark.skip(reason="local test only")
def test_hybrid_rollout_with_ep(init_config):
    """Test hybrid rollout with expert parallelism, DP=2, TP=4, EP=8.

    Spins up a colocated FSDP + rollout worker group for a Qwen3 MoE model,
    syncs weights into the rollout replicas via the checkpoint engine, then
    issues one synchronous OpenAI-compatible chat completion against the
    first replica server. Skipped by default: requires local model weights.
    """
    ray.init(
        runtime_env={
            "env_vars": {
                "TOKENIZERS_PARALLELISM": "true",
                "NCCL_DEBUG": "WARN",
                "VLLM_LOGGING_LEVEL": "INFO",
                "VLLM_USE_V1": "1",
            }
        }
    )

    model_path = os.path.expanduser("~/models/Qwen/Qwen3-30B-A3B-Instruct-2507")
    init_config.actor_rollout_ref.model.path = model_path

    # parallelism config
    init_config.actor_rollout_ref.rollout.tensor_model_parallel_size = 2
    init_config.actor_rollout_ref.rollout.data_parallel_size = 4
    init_config.actor_rollout_ref.rollout.expert_parallel_size = 8

    # 1. init hybrid worker: FSDP+rollout
    # - build FSDP model and optimizer
    # - offload FSDP model and optimizer, build rollout
    # - sleep rollout and load FSDP model and optimizer
    agent_loop_manager = init_agent_loop_manager(init_config)
    checkpoint_manager = CheckpointEngineManager(
        backend=init_config.actor_rollout_ref.rollout.checkpoint_engine.backend,
        trainer=agent_loop_manager.worker_group,
        replicas=agent_loop_manager.rollout_replicas,
    )
    checkpoint_manager.sleep_replicas()
    checkpoint_manager.update_weights()

    # 3. test synchronous openai call against the first rollout server
    server_address = agent_loop_manager.server_addresses[0]
    client = OpenAI(
        api_key="123-abc",  # any non-empty key; the local server does not authenticate
        base_url=f"http://{server_address}/v1",
    )

    # Fixed typo: was "smapling_params".
    sampling_params = {
        "temperature": 1.0,
        "top_p": 1.0,
        "max_tokens": 512,
    }

    response = client.chat.completions.create(
        model=model_path,
        messages=[{"role": "user", "content": "What can you do?"}],
        **sampling_params,
    )

    completion = response.choices[0].message.content
    print(f"response: {completion}")

    print("Test passed!")
    ray.shutdown()
code/RL_model/verl/verl_train/tests/experimental/reward_loop/test_agent_loop_reward_manager.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import os
15
+
16
+ import ray
17
+ from hydra import compose, initialize_config_dir
18
+ from torchdata.stateful_dataloader import StatefulDataLoader
19
+ from transformers import AutoTokenizer
20
+
21
+ from verl.experimental.agent_loop import AgentLoopManager
22
+ from verl.protocol import DataProto
23
+ from verl.trainer.main_ppo import create_rl_sampler
24
+ from verl.utils.dataset.rl_dataset import RLHFDataset, collate_fn
25
+
26
+
27
def test_agent_loop_reward_manager():
    """End-to-end check of AgentLoopManager with a dedicated reward-model resource pool.

    Rolls out one GSM8K batch with a 0.5B actor and scores it with a 1.5B
    reward model ("dapo" manager) on a separate GPU pool; only verifies that
    rm_scores is produced (prints per-sample sums, no numeric assertions).
    """
    ray.init(
        runtime_env={
            "env_vars": {
                "TOKENIZERS_PARALLELISM": "true",
                "NCCL_DEBUG": "WARN",
                "VLLM_LOGGING_LEVEL": "INFO",
                "VLLM_USE_V1": "1",
            }
        }
    )
    with initialize_config_dir(config_dir=os.path.abspath("verl/trainer/config")):
        config = compose(config_name="ppo_trainer")

    rollout_model_path = os.path.expanduser("~/models/Qwen/Qwen2.5-0.5B-Instruct")
    reward_model_path = os.path.expanduser("~/models/Qwen/Qwen2.5-1.5B-Instruct")

    # actor_rollout_ref config
    config.data.return_raw_chat = True
    config.data.max_prompt_length = 1024
    config.data.max_response_length = 4096
    config.actor_rollout_ref.model.path = rollout_model_path
    config.actor_rollout_ref.actor.use_dynamic_bsz = True
    config.actor_rollout_ref.rollout.name = os.getenv("ROLLOUT_NAME", "vllm")
    config.actor_rollout_ref.rollout.mode = "async"
    config.actor_rollout_ref.rollout.tensor_model_parallel_size = 2
    config.actor_rollout_ref.rollout.gpu_memory_utilization = 0.9
    config.actor_rollout_ref.rollout.enforce_eager = True
    config.actor_rollout_ref.rollout.prompt_length = 1024
    config.actor_rollout_ref.rollout.response_length = 4096
    config.actor_rollout_ref.rollout.skip_tokenizer_init = True
    config.trainer.n_gpus_per_node = 4
    config.trainer.nnodes = 1

    # Reward model runs on its own 4-GPU resource pool (enable_resource_pool=True).
    config.reward_model.reward_manager = "dapo"
    config.reward_model.enable = True
    config.reward_model.enable_resource_pool = True
    config.reward_model.n_gpus_per_node = 4
    config.reward_model.nnodes = 1
    config.reward_model.model.path = reward_model_path
    config.reward_model.rollout.name = os.getenv("ROLLOUT_NAME", "vllm")
    config.reward_model.rollout.gpu_memory_utilization = 0.9
    config.reward_model.rollout.tensor_model_parallel_size = 2
    config.reward_model.rollout.skip_tokenizer_init = False
    # RM prompt must fit actor prompt (1024) + actor response (4096).
    config.reward_model.rollout.prompt_length = 5120
    config.reward_model.rollout.response_length = 4096
    config.custom_reward_function.path = "tests/experimental/reward_loop/reward_fn.py"
    config.custom_reward_function.name = "compute_score_gsm8k"

    # 1. init reward model manager
    agent_loop_manager = AgentLoopManager(config)

    # 2. init test data
    local_folder = os.path.expanduser("~/data/gsm8k/")
    data_files = [os.path.join(local_folder, "train.parquet")]
    tokenizer = AutoTokenizer.from_pretrained(rollout_model_path)

    dataset = RLHFDataset(
        data_files=data_files,
        tokenizer=tokenizer,
        config=config.data,
        processor=None,
    )

    batch_size = 64
    sampler = create_rl_sampler(config.data, dataset)
    dataloader = StatefulDataLoader(
        dataset=dataset,
        batch_size=batch_size,
        num_workers=config.data.dataloader_num_workers,
        drop_last=True,
        collate_fn=collate_fn,
        sampler=sampler,
    )

    # 3. generate responses (single batch only)
    batch_dict = next(iter(dataloader))
    batch = DataProto.from_single_dict(batch_dict)
    gen_batch = agent_loop_manager.generate_sequences(prompts=batch)

    rm_scores = gen_batch.batch["rm_scores"]
    sample_scores = rm_scores.sum(dim=1)
    print(sample_scores)

    ray.shutdown()
code/RL_model/verl/verl_train/tests/experimental/reward_loop/test_agent_reward_loop_colocate.py ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import os
15
+
16
+ import ray
17
+ from hydra import compose, initialize_config_dir
18
+ from torchdata.stateful_dataloader import StatefulDataLoader
19
+ from transformers import AutoTokenizer
20
+
21
+ from verl.checkpoint_engine import CheckpointEngineManager
22
+ from verl.experimental.agent_loop import AgentLoopManager
23
+ from verl.experimental.reward_loop import RewardLoopManager
24
+ from verl.protocol import DataProto
25
+ from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup
26
+ from verl.trainer.main_ppo import create_rl_sampler
27
+ from verl.trainer.ppo.ray_trainer import ResourcePoolManager
28
+ from verl.utils.dataset.rl_dataset import RLHFDataset, collate_fn
29
+ from verl.utils.device import get_device_name
30
+ from verl.workers.fsdp_workers import ActorRolloutRefWorker, AsyncActorRolloutRefWorker
31
+
32
+
33
+ def test_agent_loop_reward_manager():
34
+ ray.init(
35
+ runtime_env={
36
+ "env_vars": {
37
+ "TOKENIZERS_PARALLELISM": "true",
38
+ "NCCL_DEBUG": "WARN",
39
+ "VLLM_LOGGING_LEVEL": "INFO",
40
+ "VLLM_USE_V1": "1",
41
+ }
42
+ }
43
+ )
44
+ with initialize_config_dir(config_dir=os.path.abspath("verl/trainer/config")):
45
+ config = compose(config_name="ppo_trainer")
46
+
47
+ rollout_model_path = os.path.expanduser("~/models/Qwen/Qwen2.5-0.5B-Instruct")
48
+ reward_model_path = os.path.expanduser("~/models/Qwen/Qwen2.5-1.5B-Instruct")
49
+
50
+ # actor_rollout_ref config
51
+ config.data.return_raw_chat = True
52
+ config.data.max_prompt_length = 1024
53
+ config.data.max_response_length = 4096
54
+ config.actor_rollout_ref.model.path = rollout_model_path
55
+ config.actor_rollout_ref.actor.use_dynamic_bsz = True
56
+ config.actor_rollout_ref.rollout.name = os.getenv("ROLLOUT_NAME", "vllm")
57
+ config.actor_rollout_ref.rollout.mode = "async"
58
+ config.actor_rollout_ref.rollout.tensor_model_parallel_size = 2
59
+ config.actor_rollout_ref.rollout.gpu_memory_utilization = 0.8
60
+ config.actor_rollout_ref.rollout.enforce_eager = True
61
+ config.actor_rollout_ref.rollout.prompt_length = 1024
62
+ config.actor_rollout_ref.rollout.response_length = 4096
63
+ config.actor_rollout_ref.rollout.skip_tokenizer_init = True
64
+ config.trainer.n_gpus_per_node = 8
65
+ config.trainer.nnodes = 1
66
+
67
+ config.reward_model.reward_manager = "dapo"
68
+ config.reward_model.enable = True
69
+ config.reward_model.enable_resource_pool = False
70
+ config.reward_model.n_gpus_per_node = 8
71
+ config.reward_model.model.path = reward_model_path
72
+ config.reward_model.rollout.name = os.getenv("ROLLOUT_NAME", "vllm")
73
+ config.reward_model.rollout.gpu_memory_utilization = 0.8
74
+ config.reward_model.rollout.tensor_model_parallel_size = 2
75
+ config.reward_model.rollout.skip_tokenizer_init = False
76
+ config.reward_model.rollout.prompt_length = 5120
77
+ config.reward_model.rollout.response_length = 4096
78
+ config.custom_reward_function.path = "tests/experimental/reward_loop/reward_fn.py"
79
+ config.custom_reward_function.name = "compute_score_gsm8k"
80
+
81
+ # 1. init reward model manager
82
+ actor_rollout_cls = (
83
+ AsyncActorRolloutRefWorker if config.actor_rollout_ref.rollout.mode == "async" else ActorRolloutRefWorker
84
+ )
85
+ global_pool_id = "global_pool"
86
+ resource_pool_spec = {
87
+ global_pool_id: [config.trainer.n_gpus_per_node] * config.trainer.nnodes,
88
+ }
89
+ resource_pool_manager = ResourcePoolManager(resource_pool_spec=resource_pool_spec, mapping=None)
90
+ resource_pool_manager.create_resource_pool()
91
+ resource_pool = resource_pool_manager.resource_pool_dict[global_pool_id]
92
+ actor_rollout_cls = RayClassWithInitArgs(
93
+ cls=ray.remote(actor_rollout_cls), config=config.actor_rollout_ref, role="actor_rollout"
94
+ )
95
+ actor_rollout_wg = RayWorkerGroup(
96
+ resource_pool=resource_pool, ray_cls_with_init=actor_rollout_cls, device_name=get_device_name()
97
+ )
98
+ actor_rollout_wg.init_model()
99
+
100
+ agent_loop_manager = AgentLoopManager(config, worker_group=actor_rollout_wg)
101
+ # sleep rollout replicas
102
+ checkpoint_manager = CheckpointEngineManager(
103
+ backend=config.actor_rollout_ref.rollout.checkpoint_engine.backend,
104
+ trainer=actor_rollout_wg,
105
+ replicas=agent_loop_manager.rollout_replicas,
106
+ )
107
+ checkpoint_manager.sleep_replicas()
108
+ reward_loop_manager = RewardLoopManager(config, rm_resource_pool=resource_pool)
109
+
110
+ # 2. init test data
111
+ local_folder = os.path.expanduser("~/data/gsm8k/")
112
+
113
+ data_files = [os.path.join(local_folder, "train.parquet")]
114
+ tokenizer = AutoTokenizer.from_pretrained(rollout_model_path)
115
+
116
+ dataset = RLHFDataset(
117
+ data_files=data_files,
118
+ tokenizer=tokenizer,
119
+ config=config.data,
120
+ processor=None,
121
+ )
122
+
123
+ batch_size = 64
124
+ sampler = create_rl_sampler(config.data, dataset)
125
+ dataloader = StatefulDataLoader(
126
+ dataset=dataset,
127
+ batch_size=batch_size,
128
+ num_workers=config.data.dataloader_num_workers,
129
+ drop_last=True,
130
+ collate_fn=collate_fn,
131
+ sampler=sampler,
132
+ )
133
+
134
+ # 3. generate responses
135
+ batch_dict = next(iter(dataloader))
136
+ batch = DataProto.from_single_dict(batch_dict)
137
+
138
+ def _get_gen_batch(batch: DataProto) -> DataProto:
139
+ reward_model_keys = set({"data_source", "reward_model", "extra_info", "uid"}) & batch.non_tensor_batch.keys()
140
+
141
+ # pop those keys for generation
142
+ batch_keys_to_pop = []
143
+ non_tensor_batch_keys_to_pop = set(batch.non_tensor_batch.keys()) - reward_model_keys
144
+ gen_batch = batch.pop(
145
+ batch_keys=batch_keys_to_pop,
146
+ non_tensor_batch_keys=list(non_tensor_batch_keys_to_pop),
147
+ )
148
+
149
+ # For agent loop, we need reward model keys to compute score.
150
+ gen_batch.non_tensor_batch.update(batch.non_tensor_batch)
151
+
152
+ return gen_batch
153
+
154
+ # wake up rollout replicas via update_weight
155
+ checkpoint_manager.update_weights()
156
+ gen_batch = _get_gen_batch(batch)
157
+ gen_batch = agent_loop_manager.generate_sequences(gen_batch)
158
+ checkpoint_manager.sleep_replicas()
159
+
160
+ batch = batch.union(gen_batch)
161
+ rm_outputs = reward_loop_manager.compute_rm_score(batch)
162
+
163
+ for output in rm_outputs[:5]:
164
+ print(output.non_tensor_batch)
165
+
166
+ print("done")
167
+
168
+ ray.shutdown()
code/RL_model/verl/verl_train/tests/experimental/reward_loop/test_async_token_bucket_on_cpu.py ADDED
@@ -0,0 +1,267 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import asyncio
16
+ import time
17
+
18
+ import pytest
19
+
20
+ from verl.experimental.reward_loop.reward_manager.limited import AsyncTokenBucket
21
+
22
+
23
+ class TestAsyncTokenBucket:
24
+ """Unit tests for AsyncTokenBucket rate limiter."""
25
+
26
+ @pytest.mark.asyncio
27
+ async def test_basic_acquire(self):
28
+ """Test basic token acquisition."""
29
+ bucket = AsyncTokenBucket(rate_limit=10.0, max_tokens=10.0)
30
+
31
+ # Should be able to acquire tokens immediately when bucket is full
32
+ start = time.time()
33
+ await bucket.acquire(5.0)
34
+ elapsed = time.time() - start
35
+
36
+ assert elapsed < 0.1, "Initial acquire should be immediate"
37
+ assert bucket.tokens == pytest.approx(5.0, abs=0.1)
38
+
39
+ @pytest.mark.asyncio
40
+ async def test_refill_mechanism(self):
41
+ """Test that tokens refill over time."""
42
+ bucket = AsyncTokenBucket(rate_limit=10.0, max_tokens=10.0)
43
+
44
+ # Consume all tokens
45
+ await bucket.acquire(10.0)
46
+ assert bucket.tokens == pytest.approx(0.0, abs=0.1)
47
+
48
+ # Wait for refill (should get ~5 tokens in 0.5 seconds at 10 tokens/sec)
49
+ await asyncio.sleep(0.5)
50
+
51
+ # Try to acquire 4 tokens (should succeed without waiting)
52
+ start = time.time()
53
+ await bucket.acquire(4.0)
54
+ elapsed = time.time() - start
55
+
56
+ assert elapsed < 0.1, "Acquire should be quick after refill"
57
+
58
+ @pytest.mark.asyncio
59
+ async def test_waiting_for_tokens(self):
60
+ """Test that acquire waits when insufficient tokens available."""
61
+ bucket = AsyncTokenBucket(rate_limit=10.0, max_tokens=10.0)
62
+
63
+ # Consume all tokens
64
+ await bucket.acquire(10.0)
65
+
66
+ # Try to acquire more tokens (should wait ~0.5 seconds for 5 tokens)
67
+ start = time.time()
68
+ await bucket.acquire(5.0)
69
+ elapsed = time.time() - start
70
+
71
+ # Should wait approximately 0.5 seconds (5 tokens / 10 tokens per second)
72
+ assert 0.4 < elapsed < 0.7, f"Expected ~0.5s wait, got {elapsed:.3f}s"
73
+
74
+ @pytest.mark.asyncio
75
+ async def test_max_tokens_cap(self):
76
+ """Test that tokens don't exceed max_tokens capacity."""
77
+ bucket = AsyncTokenBucket(rate_limit=10.0, max_tokens=5.0)
78
+
79
+ # Wait for potential overflow
80
+ await asyncio.sleep(1.0)
81
+
82
+ # Tokens should be capped at max_tokens
83
+ await bucket.acquire(1.0)
84
+
85
+ # After 1 second at 10 tokens/sec, should have max_tokens (5.0)
86
+ # After acquiring 1, should have 4.0 remaining
87
+ assert bucket.tokens <= 5.0, "Tokens should not exceed max_tokens"
88
+
89
+ @pytest.mark.asyncio
90
+ async def test_fractional_tokens(self):
91
+ """Test acquiring fractional tokens."""
92
+ bucket = AsyncTokenBucket(rate_limit=100.0, max_tokens=100.0)
93
+
94
+ # Acquire fractional amounts
95
+ await bucket.acquire(0.5)
96
+ await bucket.acquire(1.5)
97
+ await bucket.acquire(2.3)
98
+
99
+ assert bucket.tokens == pytest.approx(100.0 - 0.5 - 1.5 - 2.3, abs=0.1)
100
+
101
+ @pytest.mark.asyncio
102
+ async def test_concurrent_acquires(self):
103
+ """Test multiple concurrent acquire operations."""
104
+ bucket = AsyncTokenBucket(rate_limit=10.0, max_tokens=10.0)
105
+
106
+ async def acquire_task(num_tokens: float, task_id: int):
107
+ await bucket.acquire(num_tokens)
108
+ return task_id
109
+
110
+ # Launch 5 concurrent tasks, each acquiring 3 tokens (15 total)
111
+ # Bucket only has 10, so some will need to wait
112
+ start = time.time()
113
+ tasks = [acquire_task(3.0, i) for i in range(5)]
114
+ results = await asyncio.gather(*tasks)
115
+ elapsed = time.time() - start
116
+
117
+ # Should take at least 0.5 seconds to refill 5 tokens
118
+ # (15 needed - 10 available) / 10 tokens per second = 0.5 seconds
119
+ assert elapsed >= 0.4, f"Expected >=0.4s for concurrent acquires, got {elapsed:.3f}s"
120
+ assert len(results) == 5, "All tasks should complete"
121
+
122
+ @pytest.mark.asyncio
123
+ async def test_high_rate_limit(self):
124
+ """Test with high rate limit (simulating high-throughput scenarios)."""
125
+ bucket = AsyncTokenBucket(rate_limit=1000.0, max_tokens=1000.0)
126
+
127
+ # Rapidly acquire tokens
128
+ start = time.time()
129
+ for _ in range(100):
130
+ await bucket.acquire(10.0) # 1000 tokens total
131
+ elapsed = time.time() - start
132
+
133
+ # Should complete in approximately 1 second
134
+ assert elapsed < 1.5, f"High rate limit test took too long: {elapsed:.3f}s"
135
+
136
+ @pytest.mark.asyncio
137
+ async def test_zero_initial_state(self):
138
+ """Test that bucket starts with full tokens."""
139
+ bucket = AsyncTokenBucket(rate_limit=10.0, max_tokens=10.0)
140
+
141
+ assert bucket.tokens == 10.0, "Bucket should start full"
142
+ assert bucket.last_update is None, "last_update should be None initially"
143
+
144
+ # After first acquire, last_update should be set
145
+ await bucket.acquire(1.0)
146
+ assert bucket.last_update is not None, "last_update should be set after acquire"
147
+
148
+ @pytest.mark.asyncio
149
+ async def test_rate_limit_accuracy(self):
150
+ """Test rate limit accuracy over time."""
151
+ rate = 50.0 # 50 tokens per second
152
+ bucket = AsyncTokenBucket(rate_limit=rate, max_tokens=rate)
153
+
154
+ # Consume all tokens and measure refill time for 25 tokens
155
+ await bucket.acquire(50.0)
156
+
157
+ start = time.time()
158
+ await bucket.acquire(25.0)
159
+ elapsed = time.time() - start
160
+
161
+ expected_time = 25.0 / rate # 0.5 seconds
162
+ # Allow 20% margin for timing inaccuracy
163
+ assert abs(elapsed - expected_time) < expected_time * 0.2, f"Expected ~{expected_time:.3f}s, got {elapsed:.3f}s"
164
+
165
+ @pytest.mark.asyncio
166
+ async def test_sequential_acquires(self):
167
+ """Test sequential acquire operations."""
168
+ bucket = AsyncTokenBucket(rate_limit=20.0, max_tokens=20.0)
169
+
170
+ # Sequential acquires without waiting
171
+ await bucket.acquire(5.0)
172
+ await bucket.acquire(5.0)
173
+ await bucket.acquire(5.0)
174
+ await bucket.acquire(5.0)
175
+
176
+ # Bucket should be empty
177
+ assert bucket.tokens == pytest.approx(0.0, abs=0.1)
178
+
179
+ # Next acquire should wait
180
+ start = time.time()
181
+ await bucket.acquire(10.0)
182
+ elapsed = time.time() - start
183
+
184
+ assert elapsed >= 0.4, "Should wait for token refill"
185
+
186
+ @pytest.mark.asyncio
187
+ async def test_default_max_tokens(self):
188
+ """Test that max_tokens defaults to rate_limit."""
189
+ bucket = AsyncTokenBucket(rate_limit=15.0)
190
+
191
+ assert bucket.max_tokens == 15.0, "max_tokens should default to rate_limit"
192
+ assert bucket.tokens == 15.0, "Initial tokens should equal max_tokens"
193
+
194
+ @pytest.mark.asyncio
195
+ async def test_single_token_acquire(self):
196
+ """Test default acquire of 1 token."""
197
+ bucket = AsyncTokenBucket(rate_limit=10.0, max_tokens=10.0)
198
+
199
+ await bucket.acquire() # Default num_tokens=1.0
200
+
201
+ assert bucket.tokens == pytest.approx(9.0, abs=0.1)
202
+
203
+ @pytest.mark.asyncio
204
+ async def test_large_token_acquire(self):
205
+ """Test acquiring more tokens than bucket capacity."""
206
+ bucket = AsyncTokenBucket(rate_limit=10.0, max_tokens=10.0)
207
+
208
+ # Try to acquire 50 tokens (5x capacity)
209
+ start = time.time()
210
+ await bucket.acquire(50.0)
211
+ elapsed = time.time() - start
212
+
213
+ # Should wait for: (50 - 10) / 10 = 4 seconds
214
+ assert 3.5 < elapsed < 5.0, f"Expected ~4s wait for large acquire, got {elapsed:.3f}s"
215
+
216
+ @pytest.mark.asyncio
217
+ async def test_thread_safety_with_lock(self):
218
+ """Test that lock prevents race conditions."""
219
+ bucket = AsyncTokenBucket(rate_limit=100.0, max_tokens=100.0)
220
+ results = []
221
+
222
+ async def acquire_and_record():
223
+ await bucket.acquire(10.0)
224
+ results.append(1)
225
+
226
+ # Launch many concurrent tasks
227
+ tasks = [acquire_and_record() for _ in range(10)]
228
+ await asyncio.gather(*tasks)
229
+
230
+ # All tasks should complete
231
+ assert len(results) == 10, "All tasks should complete successfully"
232
+
233
+ # Bucket should have consumed exactly 100 tokens
234
+ assert bucket.tokens == pytest.approx(0.0, abs=0.5)
235
+
236
+ @pytest.mark.asyncio
237
+ async def test_multiple_wait_cycles(self):
238
+ """Test multiple wait cycles in the acquire loop."""
239
+ bucket = AsyncTokenBucket(rate_limit=10.0, max_tokens=10.0)
240
+
241
+ # Consume all tokens
242
+ await bucket.acquire(10.0)
243
+
244
+ # Acquire tokens that require multiple refill cycles
245
+ start = time.time()
246
+ await bucket.acquire(15.0)
247
+ elapsed = time.time() - start
248
+
249
+ # Should wait for 15 tokens / 10 tokens per second = 1.5 seconds
250
+ assert 1.3 < elapsed < 1.8, f"Expected ~1.5s for multiple refill cycles, got {elapsed:.3f}s"
251
+
252
+ @pytest.mark.asyncio
253
+ async def test_rapid_small_acquires(self):
254
+ """Test many rapid small acquisitions."""
255
+ bucket = AsyncTokenBucket(rate_limit=100.0, max_tokens=100.0)
256
+
257
+ start = time.time()
258
+ for _ in range(50):
259
+ await bucket.acquire(2.0) # 100 tokens total
260
+ elapsed = time.time() - start
261
+
262
+ # Should complete quickly since we're within capacity
263
+ assert elapsed < 0.5, f"Rapid small acquires took too long: {elapsed:.3f}s"
264
+
265
+
if __name__ == "__main__":
    # Allow running this module directly: hand this file to pytest, verbose.
    cli_args = [__file__, "-v"]
    pytest.main(cli_args)
code/RL_model/verl/verl_train/tests/experimental/reward_loop/test_math_verify.py ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import os
15
+
16
+ import ray
17
+ from hydra import compose, initialize_config_dir
18
+ from torchdata.stateful_dataloader import StatefulDataLoader
19
+ from transformers import AutoTokenizer
20
+
21
+ from verl.experimental.agent_loop import AgentLoopManager
22
+ from verl.protocol import DataProto
23
+ from verl.trainer.main_ppo import create_rl_sampler
24
+ from verl.utils.dataset.rl_dataset import RLHFDataset, collate_fn
25
+
26
+
27
+ def test_agent_loop_reward_manager():
28
+ ray.init(
29
+ runtime_env={
30
+ "env_vars": {
31
+ "TOKENIZERS_PARALLELISM": "true",
32
+ "NCCL_DEBUG": "WARN",
33
+ "VLLM_LOGGING_LEVEL": "INFO",
34
+ "VLLM_USE_V1": "1",
35
+ }
36
+ }
37
+ )
38
+ with initialize_config_dir(config_dir=os.path.abspath("verl/trainer/config")):
39
+ config = compose(config_name="ppo_trainer")
40
+
41
+ rollout_model_path = os.path.expanduser("~/models/Qwen/Qwen2.5-3B-Instruct")
42
+
43
+ # actor_rollout_ref config
44
+ config.data.return_raw_chat = True
45
+ config.data.max_prompt_length = 1024
46
+ config.data.max_response_length = 4096
47
+ config.actor_rollout_ref.model.path = rollout_model_path
48
+ config.actor_rollout_ref.actor.use_dynamic_bsz = True
49
+ config.actor_rollout_ref.rollout.name = os.getenv("ROLLOUT_NAME", "vllm")
50
+ config.actor_rollout_ref.rollout.mode = "async"
51
+ config.actor_rollout_ref.rollout.tensor_model_parallel_size = 2
52
+ config.actor_rollout_ref.rollout.gpu_memory_utilization = 0.9
53
+ config.actor_rollout_ref.rollout.enforce_eager = True
54
+ config.actor_rollout_ref.rollout.prompt_length = 2048
55
+ config.actor_rollout_ref.rollout.response_length = 4096
56
+ config.actor_rollout_ref.rollout.skip_tokenizer_init = True
57
+ config.trainer.n_gpus_per_node = 8
58
+ config.trainer.nnodes = 1
59
+
60
+ config.reward_model.reward_manager = "remote"
61
+ config.reward_model.num_workers = 2
62
+ config.custom_reward_function.path = "tests/experimental/reward_loop/reward_fn.py"
63
+ config.custom_reward_function.name = "compute_score_math_verify"
64
+
65
+ # 1. init reward model manager
66
+ agent_loop_manager = AgentLoopManager(config)
67
+
68
+ # 2. init test data
69
+ local_folder = os.path.expanduser("~/data/math/")
70
+ data_files = [os.path.join(local_folder, "train.parquet")]
71
+ tokenizer = AutoTokenizer.from_pretrained(rollout_model_path)
72
+
73
+ dataset = RLHFDataset(
74
+ data_files=data_files,
75
+ tokenizer=tokenizer,
76
+ config=config.data,
77
+ processor=None,
78
+ )
79
+
80
+ batch_size = 64
81
+ sampler = create_rl_sampler(config.data, dataset)
82
+ dataloader = StatefulDataLoader(
83
+ dataset=dataset,
84
+ batch_size=batch_size,
85
+ num_workers=config.data.dataloader_num_workers,
86
+ drop_last=True,
87
+ collate_fn=collate_fn,
88
+ sampler=sampler,
89
+ )
90
+
91
+ # 3. generate responses
92
+ batch_dict = next(iter(dataloader))
93
+ batch = DataProto.from_single_dict(batch_dict)
94
+ gen_batch = agent_loop_manager.generate_sequences(prompts=batch)
95
+
96
+ rm_scores = gen_batch.batch["rm_scores"]
97
+ accuracy = rm_scores.sum(dim=-1).mean()
98
+ print(accuracy)
99
+
100
+ ray.shutdown()
code/RL_model/verl/verl_train/tests/experimental/reward_loop/test_rate_limited_reward_manager_on_cpu.py ADDED
@@ -0,0 +1,528 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import asyncio
16
+ import logging
17
+ import os.path
18
+ import time
19
+
20
+ import pytest
21
+ import torch
22
+ from omegaconf import DictConfig
23
+ from transformers import AutoTokenizer
24
+
25
+ from verl import DataProto
26
+ from verl.experimental.reward_loop.reward_manager.limited import RateLimitedRewardManager
27
+
28
+
29
+ # Mock API reward functions for testing
30
+ class MockAPICounter:
31
+ """Shared counter to track API calls across tests."""
32
+
33
+ def __init__(self):
34
+ self.call_count = 0
35
+ self.call_times = []
36
+ self.lock = asyncio.Lock()
37
+
38
+ async def record_call(self):
39
+ async with self.lock:
40
+ self.call_count += 1
41
+ self.call_times.append(time.time())
42
+
43
+ def reset(self):
44
+ self.call_count = 0
45
+ self.call_times.clear()
46
+
47
+ def get_rate_per_second(self, window_start: float = None):
48
+ """Calculate API call rate over a time window."""
49
+ if window_start is None:
50
+ if not self.call_times:
51
+ return 0.0
52
+ window_start = self.call_times[0]
53
+
54
+ if not self.call_times:
55
+ return 0.0
56
+
57
+ window_end = self.call_times[-1]
58
+ duration = window_end - window_start
59
+
60
+ if duration <= 0:
61
+ return 0.0
62
+
63
+ calls_in_window = sum(1 for t in self.call_times if t >= window_start)
64
+ return calls_in_window / duration
65
+
66
+
67
+ # Global counter instance
68
+ api_counter = MockAPICounter()
69
+
70
+
71
+ def mock_sync_reward_function(
72
+ data_source: str, solution_str: str, ground_truth: str, extra_info: dict, **kwargs
73
+ ) -> float:
74
+ """Synchronous mock reward function that simulates API call."""
75
+ # Simulate API processing time
76
+ time.sleep(0.01)
77
+
78
+ # Simple scoring logic
79
+ score = 1.0 if solution_str.strip() == ground_truth.strip() else 0.0
80
+ return score
81
+
82
+
83
+ async def mock_async_reward_function(
84
+ data_source: str, solution_str: str, ground_truth: str, extra_info: dict, **kwargs
85
+ ) -> float:
86
+ """Asynchronous mock reward function that simulates API call."""
87
+ # Record API call for rate tracking
88
+ await api_counter.record_call()
89
+
90
+ # Simulate async API call (e.g., HTTP request)
91
+ await asyncio.sleep(0.01)
92
+
93
+ # Simple scoring logic
94
+ score = 1.0 if solution_str.strip() == ground_truth.strip() else 0.0
95
+ return score
96
+
97
+
98
+ async def mock_slow_api_function(
99
+ data_source: str, solution_str: str, ground_truth: str, extra_info: dict, **kwargs
100
+ ) -> float:
101
+ """Slow mock API function for timeout testing."""
102
+ await asyncio.sleep(2.0) # Simulate slow API
103
+ return 0.5
104
+
105
+
106
+ async def mock_failing_api_function(
107
+ data_source: str, solution_str: str, ground_truth: str, extra_info: dict, **kwargs
108
+ ) -> float:
109
+ """Mock API function that raises an exception."""
110
+ await api_counter.record_call()
111
+ raise ValueError("Simulated API error")
112
+
113
+
114
+ async def mock_dict_result_function(
115
+ data_source: str, solution_str: str, ground_truth: str, extra_info: dict, **kwargs
116
+ ) -> dict:
117
+ """Mock API function that returns dict result."""
118
+ await api_counter.record_call()
119
+ await asyncio.sleep(0.01)
120
+
121
+ correct = solution_str.strip() == ground_truth.strip()
122
+ return {"score": 1.0 if correct else 0.0, "correct": correct, "reasoning": "Mock reasoning"}
123
+
124
+
125
+ def create_test_data_proto(tokenizer, response_text: str, ground_truth: str, data_source: str = "test"):
126
+ """Helper to create DataProto for testing."""
127
+ response_ids = tokenizer.encode(response_text, add_special_tokens=False)
128
+ response_tensor = torch.tensor([response_ids], dtype=torch.long)
129
+ attention_mask = torch.ones_like(response_tensor)
130
+
131
+ data = DataProto.from_dict(
132
+ {
133
+ "responses": response_tensor,
134
+ "attention_mask": attention_mask,
135
+ }
136
+ )
137
+
138
+ # Wrap non-tensor values in lists to match batch dimension
139
+ data.non_tensor_batch = {"data_source": [data_source], "reward_model": [{"ground_truth": ground_truth}]}
140
+
141
+ return data
142
+
143
+
144
+ class TestRateLimitedRewardManager:
145
+ """Integration tests for RateLimitedRewardManager with mock API functions."""
146
+
147
+ @pytest.fixture(autouse=True)
148
+ def setup_and_teardown(self):
149
+ """Reset global state before each test."""
150
+ api_counter.reset()
151
+ # Reset class state
152
+ RateLimitedRewardManager._class_initialized = False
153
+ RateLimitedRewardManager._semaphore = None
154
+ RateLimitedRewardManager._rpm_limiter = None
155
+ RateLimitedRewardManager._tpm_limiter = None
156
+ yield
157
+ # Cleanup
158
+ api_counter.reset()
159
+
160
+ @pytest.fixture
161
+ def tokenizer(self):
162
+ """Load a simple tokenizer for testing."""
163
+ return AutoTokenizer.from_pretrained(os.path.expanduser("~/models/Qwen/Qwen2.5-0.5B-Instruct"))
164
+
165
+ @pytest.mark.asyncio
166
+ async def test_basic_reward_computation(self, tokenizer):
167
+ """Test basic reward computation without rate limiting."""
168
+ config = DictConfig({"reward_model": {"max_concurrent": 10, "timeout": 10.0}})
169
+
170
+ RateLimitedRewardManager.init_class(config, tokenizer)
171
+ manager = RateLimitedRewardManager(config=config, tokenizer=tokenizer, compute_score=mock_async_reward_function)
172
+
173
+ # Create test data
174
+ data = create_test_data_proto(tokenizer, "correct answer", "correct answer")
175
+
176
+ # Compute reward
177
+ result = await manager.run_single(data)
178
+
179
+ assert "reward_score" in result
180
+ assert result["reward_score"] == 1.0
181
+ assert api_counter.call_count == 1
182
+
183
+ @pytest.mark.asyncio
184
+ async def test_rpm_rate_limiting(self, tokenizer):
185
+ """Test request per minute (RPM) rate limiting."""
186
+ # Set RPM limit to 60 (1 request per second)
187
+ config = DictConfig(
188
+ {
189
+ "reward_model": {
190
+ "max_concurrent": 10,
191
+ "max_rpm": 60, # 1 request per second
192
+ "timeout": 10.0,
193
+ }
194
+ }
195
+ )
196
+
197
+ RateLimitedRewardManager.init_class(config, tokenizer)
198
+ manager = RateLimitedRewardManager(config=config, tokenizer=tokenizer, compute_score=mock_async_reward_function)
199
+
200
+ # Create test data
201
+ data = create_test_data_proto(tokenizer, "answer", "answer")
202
+
203
+ # Make 3 requests - should be rate limited
204
+ start_time = time.time()
205
+
206
+ results = []
207
+ for _ in range(3):
208
+ result = await manager.run_single(data)
209
+ results.append(result)
210
+
211
+ elapsed = time.time() - start_time
212
+
213
+ # Should take at least ~2 seconds for 3 requests at 1 req/sec
214
+ assert elapsed >= 1.8, f"RPM limiting failed: {elapsed:.3f}s for 3 requests"
215
+ assert all(r["reward_score"] == 1.0 for r in results)
216
+ assert api_counter.call_count == 3
217
+
218
+ @pytest.mark.asyncio
219
+ async def test_tpm_rate_limiting(self, tokenizer):
220
+ """Test tokens per minute (TPM) rate limiting."""
221
+ # Set TPM limit to 6000 (100 tokens per second)
222
+ # With 2000 tokens per request, that's 0.05 req/sec or 20 seconds per request
223
+ config = DictConfig(
224
+ {
225
+ "reward_model": {
226
+ "max_concurrent": 10,
227
+ "max_tpm": 6000, # 100 tokens per second
228
+ "estimated_tokens_per_request": 2000, # Each request = 2000 tokens
229
+ "timeout": 30.0,
230
+ }
231
+ }
232
+ )
233
+
234
+ RateLimitedRewardManager.init_class(config, tokenizer)
235
+ manager = RateLimitedRewardManager(config=config, tokenizer=tokenizer, compute_score=mock_async_reward_function)
236
+
237
+ data = create_test_data_proto(tokenizer, "answer", "answer")
238
+
239
+ # Make 2 requests
240
+ start_time = time.time()
241
+
242
+ result1 = await manager.run_single(data)
243
+ result2 = await manager.run_single(data)
244
+
245
+ elapsed = time.time() - start_time
246
+
247
+ # First request: consumes 2000 tokens (immediate)
248
+ # Second request: needs 2000 tokens, waits for refill
249
+ # Wait time: 2000 tokens / 100 tokens per second = 20 seconds
250
+ assert elapsed >= 18.0, f"TPM limiting failed: {elapsed:.3f}s for 2 requests"
251
+ assert result1["reward_score"] == 1.0
252
+ assert result2["reward_score"] == 1.0
253
+
254
+ @pytest.mark.asyncio
255
+ async def test_concurrency_limiting(self, tokenizer):
256
+ """Test concurrent request limiting."""
257
+ config = DictConfig(
258
+ {
259
+ "reward_model": {
260
+ "max_concurrent": 2, # Only 2 concurrent requests
261
+ "timeout": 10.0,
262
+ }
263
+ }
264
+ )
265
+
266
+ RateLimitedRewardManager.init_class(config, tokenizer)
267
+ manager = RateLimitedRewardManager(config=config, tokenizer=tokenizer, compute_score=mock_async_reward_function)
268
+
269
+ data = create_test_data_proto(tokenizer, "answer", "answer")
270
+
271
+ # Launch 5 concurrent requests
272
+ start_time = time.time()
273
+
274
+ tasks = [manager.run_single(data) for _ in range(5)]
275
+ results = await asyncio.gather(*tasks)
276
+
277
+ elapsed = time.time() - start_time
278
+
279
+ # All should succeed
280
+ assert len(results) == 5
281
+ assert all(r["reward_score"] == 1.0 for r in results)
282
+
283
+ # With concurrency=2 and 0.01s per request, should take at least 0.03s
284
+ # (3 batches: 2+2+1)
285
+ assert elapsed >= 0.02, f"Concurrency limiting may not be working: {elapsed:.3f}s"
286
+
287
+ @pytest.mark.asyncio
288
+ async def test_timeout_handling(self, tokenizer):
289
+ """Test timeout handling for slow API."""
290
+ config = DictConfig(
291
+ {
292
+ "reward_model": {
293
+ "max_concurrent": 10,
294
+ "timeout": 0.5, # 500ms timeout
295
+ }
296
+ }
297
+ )
298
+
299
+ RateLimitedRewardManager.init_class(config, tokenizer)
300
+ manager = RateLimitedRewardManager(config=config, tokenizer=tokenizer, compute_score=mock_slow_api_function)
301
+
302
+ data = create_test_data_proto(tokenizer, "answer", "answer")
303
+
304
+ # Should timeout and return 0.0
305
+ result = await manager.run_single(data)
306
+
307
+ assert result["reward_score"] == 0.0
308
+ assert result["reward_extra_info"].get("timeout") is True
309
+ assert result["reward_extra_info"].get("acc") == 0.0
310
+
311
+ @pytest.mark.asyncio
312
+ async def test_error_handling(self, tokenizer):
313
+ """Test error handling for failing API."""
314
+ config = DictConfig({"reward_model": {"max_concurrent": 10, "timeout": 10.0}})
315
+
316
+ RateLimitedRewardManager.init_class(config, tokenizer)
317
+ manager = RateLimitedRewardManager(config=config, tokenizer=tokenizer, compute_score=mock_failing_api_function)
318
+
319
+ data = create_test_data_proto(tokenizer, "answer", "answer")
320
+
321
+ # Should catch exception and return 0.0
322
+ result = await manager.run_single(data)
323
+
324
+ assert result["reward_score"] == 0.0
325
+ assert "error" in result["reward_extra_info"]
326
+ assert "Simulated API error" in result["reward_extra_info"]["error"]
327
+ assert result["reward_extra_info"].get("acc") == 0.0
328
+ assert api_counter.call_count == 1
329
+
330
+ @pytest.mark.asyncio
331
+ async def test_dict_result_format(self, tokenizer):
332
+ """Test handling of dict return format from reward function."""
333
+ config = DictConfig({"reward_model": {"max_concurrent": 10, "timeout": 10.0}})
334
+
335
+ RateLimitedRewardManager.init_class(config, tokenizer)
336
+ manager = RateLimitedRewardManager(config=config, tokenizer=tokenizer, compute_score=mock_dict_result_function)
337
+
338
+ data = create_test_data_proto(tokenizer, "correct", "correct")
339
+
340
+ result = await manager.run_single(data)
341
+
342
+ assert result["reward_score"] == 1.0
343
+ assert result["reward_extra_info"]["score"] == 1.0
344
+ assert result["reward_extra_info"]["correct"] is True
345
+ assert result["reward_extra_info"]["reasoning"] == "Mock reasoning"
346
+
347
+ @pytest.mark.asyncio
348
+ async def test_sync_reward_function(self, tokenizer):
349
+ """Test that synchronous reward functions work correctly."""
350
+ config = DictConfig({"reward_model": {"max_concurrent": 10, "timeout": 10.0}})
351
+
352
+ RateLimitedRewardManager.init_class(config, tokenizer)
353
+ manager = RateLimitedRewardManager(config=config, tokenizer=tokenizer, compute_score=mock_sync_reward_function)
354
+
355
+ data = create_test_data_proto(tokenizer, "answer", "answer")
356
+
357
+ result = await manager.run_single(data)
358
+
359
+ assert result["reward_score"] == 1.0
360
+ assert manager.is_async_reward_score is False
361
+
362
+ @pytest.mark.asyncio
363
+ async def test_combined_rate_limits(self, tokenizer):
364
+ """Test all three rate limiting layers together."""
365
+ config = DictConfig(
366
+ {
367
+ "reward_model": {
368
+ "max_concurrent": 2,
369
+ "max_rpm": 120, # 2 requests per second
370
+ "max_tpm": 12000, # 200 tokens per second
371
+ "estimated_tokens_per_request": 100, # 0.5 seconds per request
372
+ "timeout": 10.0,
373
+ }
374
+ }
375
+ )
376
+
377
+ RateLimitedRewardManager.init_class(config, tokenizer)
378
+ manager = RateLimitedRewardManager(config=config, tokenizer=tokenizer, compute_score=mock_async_reward_function)
379
+
380
+ data = create_test_data_proto(tokenizer, "answer", "answer")
381
+
382
+ # Make 6 requests to exceed burst capacity (RPM bucket starts with 2 tokens)
383
+ start_time = time.time()
384
+
385
+ tasks = [manager.run_single(data) for _ in range(6)]
386
+ results = await asyncio.gather(*tasks)
387
+
388
+ elapsed = time.time() - start_time
389
+
390
+ # Bucket starts with 2 RPM tokens and 200 TPM tokens
391
+ # First 2 requests: use burst capacity (2 RPM tokens, 200 TPM tokens)
392
+ # Next 4 requests: need 4 RPM tokens (wait 2 seconds) and 400 TPM tokens (wait 2 seconds)
393
+ # Limiting factor: RPM at 2 seconds
394
+ assert elapsed >= 1.8, f"Combined rate limiting: {elapsed:.3f}s"
395
+ assert all(r["reward_score"] == 1.0 for r in results)
396
+ assert api_counter.call_count == 6
397
+
398
+ @pytest.mark.asyncio
399
+ async def test_correct_vs_incorrect_answers(self, tokenizer):
400
+ """Test scoring of correct vs incorrect answers."""
401
+ config = DictConfig({"reward_model": {"max_concurrent": 10, "timeout": 10.0}})
402
+
403
+ RateLimitedRewardManager.init_class(config, tokenizer)
404
+ manager = RateLimitedRewardManager(config=config, tokenizer=tokenizer, compute_score=mock_async_reward_function)
405
+
406
+ # Test correct answer
407
+ data_correct = create_test_data_proto(tokenizer, "right answer", "right answer")
408
+ result_correct = await manager.run_single(data_correct)
409
+
410
+ # Test incorrect answer
411
+ data_incorrect = create_test_data_proto(tokenizer, "wrong answer", "right answer")
412
+ result_incorrect = await manager.run_single(data_incorrect)
413
+
414
+ assert result_correct["reward_score"] == 1.0
415
+ assert result_incorrect["reward_score"] == 0.0
416
+
417
+ @pytest.mark.asyncio
418
+ async def test_high_throughput(self, tokenizer):
419
+ """Test high throughput with many concurrent requests."""
420
+ config = DictConfig(
421
+ {
422
+ "reward_model": {
423
+ "max_concurrent": 20,
424
+ "max_rpm": 6000, # 100 requests per second
425
+ "timeout": 10.0,
426
+ }
427
+ }
428
+ )
429
+
430
+ RateLimitedRewardManager.init_class(config, tokenizer)
431
+ manager = RateLimitedRewardManager(config=config, tokenizer=tokenizer, compute_score=mock_async_reward_function)
432
+
433
+ data = create_test_data_proto(tokenizer, "answer", "answer")
434
+
435
+ # Launch 200 concurrent requests (more than burst capacity of 100)
436
+ start_time = time.time()
437
+
438
+ tasks = [manager.run_single(data) for _ in range(200)]
439
+ results = await asyncio.gather(*tasks)
440
+
441
+ elapsed = time.time() - start_time
442
+
443
+ assert len(results) == 200
444
+ assert all(r["reward_score"] == 1.0 for r in results)
445
+
446
+ # Bucket starts with 100 tokens (burst capacity)
447
+ # First 100 requests: use burst capacity instantly
448
+ # Next 100 requests: need to wait for refill at 100 tokens/sec = 1 second minimum
449
+ # Total time should be at least 1 second
450
+ assert elapsed >= 0.9, f"Should take at least 0.9s for rate limiting, took {elapsed:.3f}s"
451
+
452
+ # Calculate actual rate over the time window
453
+ actual_rate = api_counter.call_count / elapsed
454
+
455
+ # Average rate should not significantly exceed 100 req/sec
456
+ # Allow some burst overhead due to initial capacity
457
+ assert actual_rate <= 200, f"Rate limiting failed: {actual_rate:.1f} req/sec (max 200)"
458
+
459
+ @pytest.mark.asyncio
460
+ async def test_class_initialization_once(self, tokenizer):
461
+ """Test that class initialization only happens once."""
462
+ config = DictConfig({"reward_model": {"max_concurrent": 5, "timeout": 10.0}})
463
+
464
+ # Initialize multiple times
465
+ RateLimitedRewardManager.init_class(config, tokenizer)
466
+ first_semaphore = RateLimitedRewardManager._semaphore
467
+
468
+ RateLimitedRewardManager.init_class(config, tokenizer)
469
+ second_semaphore = RateLimitedRewardManager._semaphore
470
+
471
+ # Should be the same object
472
+ assert first_semaphore is second_semaphore
473
+
474
+ def test_warn_when_rate_limits_are_ignored_due_to_prior_init(self, tokenizer, caplog):
475
+ """Warn when a new config attempts to change global RPM/TPM after the class has been initialized."""
476
+ caplog.set_level(logging.WARNING)
477
+
478
+ # First instantiation without a config (legacy signature) initializes global limiters with defaults.
479
+ _ = RateLimitedRewardManager(
480
+ tokenizer=tokenizer,
481
+ compute_score=mock_async_reward_function,
482
+ num_examine=0,
483
+ reward_fn_key="data_source",
484
+ )
485
+
486
+ # Second instantiation attempts to set RPM limits, but will be ignored due to global initialization.
487
+ config = DictConfig({"reward_model": {"max_concurrent": 10, "max_rpm": 60, "timeout": 10.0}})
488
+ _ = RateLimitedRewardManager(
489
+ config=config,
490
+ tokenizer=tokenizer,
491
+ compute_score=mock_async_reward_function,
492
+ )
493
+
494
+ assert any(
495
+ "RateLimitedRewardManager has already been initialized" in record.getMessage()
496
+ and "ignored" in record.getMessage()
497
+ for record in caplog.records
498
+ ), "Expected a warning when attempting to change global rate limits after initialization."
499
+
500
+ @pytest.mark.asyncio
501
+ async def test_extra_info_handling(self, tokenizer):
502
+ """Test that extra_info is properly passed to reward function."""
503
+ received_extra_info = {}
504
+
505
+ async def mock_reward_with_extra_info(
506
+ data_source: str, solution_str: str, ground_truth: str, extra_info: dict, **kwargs
507
+ ):
508
+ received_extra_info.update(extra_info)
509
+ return 1.0
510
+
511
+ config = DictConfig({"reward_model": {"max_concurrent": 10, "timeout": 10.0}})
512
+
513
+ RateLimitedRewardManager.init_class(config, tokenizer)
514
+ manager = RateLimitedRewardManager(
515
+ config=config, tokenizer=tokenizer, compute_score=mock_reward_with_extra_info
516
+ )
517
+
518
+ data = create_test_data_proto(tokenizer, "answer", "answer")
519
+ data.non_tensor_batch["extra_info"] = [{"custom_field": "test_value"}]
520
+
521
+ await manager.run_single(data)
522
+
523
+ assert "custom_field" in received_extra_info
524
+ assert received_extra_info["custom_field"] == "test_value"
525
+
526
+
527
+ if __name__ == "__main__":
528
+ pytest.main([__file__, "-v", "-s"])
code/RL_model/verl/verl_train/tests/experimental/reward_loop/test_reward_model_disrm.py ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import os
15
+
16
+ import ray
17
+ import torch
18
+ from hydra import compose, initialize_config_dir
19
+
20
+ from verl.experimental.reward_loop import RewardLoopManager
21
+ from verl.protocol import DataProto
22
+ from verl.utils import hf_tokenizer
23
+ from verl.utils.model import compute_position_id_with_mask
24
+
25
+
26
+ def create_data_samples(tokenizer) -> DataProto:
27
+ convs = [
28
+ [
29
+ {
30
+ "role": "user",
31
+ "content": "What is the range of the numeric output of a sigmoid node in a neural network?",
32
+ },
33
+ {"role": "assistant", "content": "Between -1 and 1."},
34
+ ],
35
+ [
36
+ {
37
+ "role": "user",
38
+ "content": "What is the range of the numeric output of a sigmoid node in a neural network?",
39
+ },
40
+ {"role": "assistant", "content": "Between 0 and 1."},
41
+ ],
42
+ [
43
+ {"role": "user", "content": "What is the capital of Australia?"},
44
+ {
45
+ "role": "assistant",
46
+ "content": "Canberra is the capital city of Australia.",
47
+ },
48
+ ],
49
+ [
50
+ {"role": "user", "content": "What is the capital of Australia?"},
51
+ {
52
+ "role": "assistant",
53
+ "content": "Sydney is the capital of Australia.",
54
+ },
55
+ ],
56
+ ]
57
+ raw_prompt = [conv[:1] for conv in convs]
58
+ data_source = ["gsm8k"] * len(convs)
59
+ reward_info = [{"ground_truth": "Not Used"}] * len(convs)
60
+ extra_info = [{"question": conv[0]["content"]} for conv in convs]
61
+
62
+ prompt_length, response_length = 1024, 4096
63
+ pad_token_id = tokenizer.pad_token_id
64
+ prompts, responses, input_ids, attention_masks = [], [], [], []
65
+ for conv in convs:
66
+ prompt_tokens = tokenizer.apply_chat_template(conv[:1], tokenize=True)
67
+ response_tokens = tokenizer.apply_chat_template(conv, tokenize=True)[len(prompt_tokens) :]
68
+
69
+ padded_prompt = [pad_token_id] * (prompt_length - len(prompt_tokens)) + prompt_tokens
70
+ padded_response = response_tokens + [pad_token_id] * (response_length - len(response_tokens))
71
+ attention_mask = (
72
+ [0] * (prompt_length - len(prompt_tokens))
73
+ + [1] * len(prompt_tokens)
74
+ + [1] * len(response_tokens)
75
+ + [0] * (response_length - len(response_tokens))
76
+ )
77
+ prompts.append(torch.tensor(padded_prompt))
78
+ responses.append(torch.tensor(padded_response))
79
+ input_ids.append(torch.tensor(padded_prompt + padded_response))
80
+ attention_masks.append(torch.tensor(attention_mask))
81
+
82
+ prompts = torch.stack(prompts)
83
+ responses = torch.stack(responses)
84
+ input_ids = torch.stack(input_ids)
85
+ attention_masks = torch.stack(attention_masks)
86
+ position_ids = compute_position_id_with_mask(attention_masks)
87
+
88
+ data = DataProto.from_dict(
89
+ tensors={
90
+ "prompts": prompts,
91
+ "responses": responses,
92
+ "input_ids": input_ids,
93
+ "attention_mask": attention_masks,
94
+ "position_ids": position_ids,
95
+ },
96
+ non_tensors={
97
+ "data_source": data_source,
98
+ "reward_model": reward_info,
99
+ "raw_prompt": raw_prompt,
100
+ "extra_info": extra_info,
101
+ },
102
+ )
103
+ return data, convs
104
+
105
+
106
+ def test_reward_model_manager():
107
+ ray.init(
108
+ runtime_env={
109
+ "env_vars": {
110
+ "TOKENIZERS_PARALLELISM": "true",
111
+ "NCCL_DEBUG": "WARN",
112
+ "VLLM_LOGGING_LEVEL": "INFO",
113
+ "VLLM_USE_V1": "1",
114
+ }
115
+ }
116
+ )
117
+ with initialize_config_dir(config_dir=os.path.abspath("verl/trainer/config")):
118
+ config = compose(config_name="ppo_trainer")
119
+
120
+ rollout_model_name = os.path.expanduser("~/models/Qwen/Qwen2.5-1.5B-Instruct")
121
+ reward_model_name = os.path.expanduser("~/models/Skywork/Skywork-Reward-V2-Llama-3.2-1B")
122
+
123
+ config.actor_rollout_ref.model.path = rollout_model_name
124
+ config.reward_model.reward_manager = "dapo"
125
+ config.reward_model.enable = True
126
+ config.reward_model.enable_resource_pool = True
127
+ config.reward_model.n_gpus_per_node = 8
128
+ config.reward_model.nnodes = 1
129
+ config.reward_model.model.path = reward_model_name
130
+ config.reward_model.rollout.name = os.getenv("ROLLOUT_NAME", "vllm")
131
+ config.reward_model.rollout.gpu_memory_utilization = 0.9
132
+ config.reward_model.rollout.tensor_model_parallel_size = 2
133
+ config.reward_model.rollout.skip_tokenizer_init = False
134
+ config.reward_model.rollout.prompt_length = 2048
135
+ config.reward_model.rollout.response_length = 4096
136
+
137
+ # 1. init reward model manager
138
+ reward_loop_manager = RewardLoopManager(config)
139
+
140
+ # 2. init test data
141
+ rollout_tokenizer = hf_tokenizer(rollout_model_name)
142
+ data, convs = create_data_samples(rollout_tokenizer)
143
+
144
+ # 3. generate responses
145
+ outputs = reward_loop_manager.compute_rm_score(data)
146
+
147
+ for idx, (conv, output) in enumerate(zip(convs, outputs, strict=True)):
148
+ print(f"Problem {idx}:\n{conv[0]['content']}\n")
149
+ print(f"AI Solution {idx}:\n{conv[1]['content']}\n")
150
+ print(f"DisRM Score {idx}:\n{output.batch['rm_scores'].sum(dim=-1).item()}\n")
151
+ print("=" * 50 + "\n")
152
+
153
+ ray.shutdown()
code/RL_model/verl/verl_train/tests/experimental/vla/test_sim_envs.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import os
16
+ import unittest
17
+
18
+ import numpy as np
19
+ import pytest
20
+ from omegaconf import OmegaConf
21
+
22
+
23
# @pytest.mark.parametrize("simulator_type", ["libero", "isaac"])
@pytest.mark.parametrize("simulator_type", ["isaac"])
def test_sim_env_creation_and_step(simulator_type):
    """Smoke test a vectorized simulator env: create it, reset to known state
    ids, step a short pre-recorded action sequence, validate batched output
    shapes, and check that video flushing writes an mp4 to disk."""
    num_envs = 8
    # Pre-recorded trajectory; each row appears to be a 7-dim action
    # (presumably end-effector deltas plus a gripper command — TODO confirm
    # the exact action layout against the env implementation).
    actions = np.array(
        [
            [5.59112417e-01, 8.06460073e-02, 1.36817226e-02, -4.64279854e-04, -1.72158767e-02, -6.57548380e-04, -1],
            [2.12711899e-03, -3.13366604e-01, 3.41386353e-04, -4.64279854e-04, -8.76528812e-03, -6.57548380e-04, -1],
            [7.38182960e-02, -4.64548351e-02, -6.63602950e-02, -4.64279854e-04, -2.32520114e-02, -6.57548380e-04, -1],
            [7.38182960e-02, -1.60845593e-01, 3.41386353e-04, -4.64279854e-04, 1.05503430e-02, -6.57548380e-04, -1],
            [7.38182960e-02, -3.95982152e-01, -7.97006313e-02, -5.10713711e-03, 3.22804279e-02, -6.57548380e-04, -1],
            [2.41859427e-02, -3.64206941e-01, -6.63602950e-02, -4.64279854e-04, 1.05503430e-02, -6.57548380e-04, -1],
            [4.62447664e-02, -5.16727952e-01, -7.97006313e-02, -4.64279854e-04, 1.05503430e-02, 8.73740975e-03, -1],
            [4.62447664e-02, -5.73923331e-01, 3.41386353e-04, -4.64279854e-04, 6.92866212e-03, -6.57548380e-04, -1],
        ]
    )
    # Minimal env config; all envs form a single group of size num_envs.
    cfg = OmegaConf.create(
        {
            "max_episode_steps": 512,
            "only_eval": False,
            "reward_coef": 1.0,
            "init_params": {
                "camera_names": ["agentview"],
            },
            "video_cfg": {
                "save_video": True,
                "video_base_dir": "/tmp/test_sim_env_creation_and_step",
            },
            "task_suite_name": "libero_10",
            "num_envs": num_envs,
            "num_group": 1,
            "group_size": num_envs,
            "seed": 0,
        },
    )

    # Import lazily so only the simulator under test needs to be installed.
    sim_env = None
    if simulator_type == "isaac":
        from verl.experimental.vla.envs.isaac_env.isaac_env import IsaacEnv

        sim_env = IsaacEnv(cfg, rank=0, world_size=1)
    elif simulator_type == "libero":
        from verl.experimental.vla.envs.libero_env.libero_env import LiberoEnv

        sim_env = LiberoEnv(cfg, rank=0, world_size=1)
    else:
        raise ValueError(f"simulator_type {simulator_type} is not supported")

    video_count = 0
    for i in [0]:
        # The first call to step with actions=None will reset the environment
        step = 0
        sim_env.reset_envs_to_state_ids([0] * num_envs, [i] * num_envs)
        for action in actions:
            # Broadcast the same action to every env in the batch.
            obs_venv, reward_venv, terminated_venv, truncated_venv, info_venv = sim_env.step(
                np.array([action] * num_envs)
            )

            # Outputs must be batched with one entry per env.
            assert isinstance(obs_venv, dict)
            assert reward_venv.shape == (num_envs,)
            assert terminated_venv.shape == (num_envs,)
            assert truncated_venv.shape == (num_envs,)
            assert isinstance(info_venv, dict)

            if terminated_venv.any() or truncated_venv.any():
                break
            step += 1

        # Flushing must materialize the recorded rollout as an mp4 on disk.
        sim_env.flush_video(video_sub_dir=f"task_{i}")
        assert os.path.exists(os.path.join(cfg.video_cfg.video_base_dir, f"rank_0/task_{i}/{video_count}.mp4"))
        os.remove(os.path.join(cfg.video_cfg.video_base_dir, f"rank_0/task_{i}/{video_count}.mp4"))
        video_count += 1

    print("test passed")
    sim_env.close()
98
+
99
+
100
if __name__ == "__main__":
    # The tests in this module are plain pytest-style parametrized functions,
    # not unittest.TestCase subclasses, so unittest.main() would discover
    # nothing and silently report "Ran 0 tests".  Invoke pytest on this file
    # instead so running the script actually executes the tests.
    import sys

    sys.exit(pytest.main([__file__]))
code/RL_model/verl/verl_train/tests/single_controller/base/test_decorator.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import pytest
16
+
17
+ import verl.single_controller.base.decorator as decorator_module
18
+ from verl.single_controller.base.decorator import (
19
+ DISPATCH_MODE_FN_REGISTRY,
20
+ Dispatch,
21
+ _check_dispatch_mode,
22
+ get_predefined_dispatch_fn,
23
+ register_dispatch_mode,
24
+ update_dispatch_mode,
25
+ )
26
+
27
+
28
@pytest.fixture
def reset_dispatch_registry():
    """Snapshot the dispatch-mode registry before the test and restore it
    afterwards, so tests that mutate it cannot leak state into each other."""
    snapshot = dict(DISPATCH_MODE_FN_REGISTRY)
    yield
    # Restore the module-level registry in place after the test finishes.
    live_registry = decorator_module.DISPATCH_MODE_FN_REGISTRY
    live_registry.clear()
    live_registry.update(snapshot)
36
+
37
+
38
def test_register_new_dispatch_mode(reset_dispatch_registry):
    """Registering a new mode must extend the Dispatch enum and the registry.

    The enum member is removed in a ``finally`` block: the fixture restores
    the registry, but it cannot undo the enum extension, so without the
    guaranteed cleanup a failing assertion would leak ``TEST_MODE`` into
    every subsequent test.
    """

    def dummy_dispatch(worker_group, *args, **kwargs):
        return args, kwargs

    def dummy_collect(worker_group, output):
        return output

    register_dispatch_mode("TEST_MODE", dummy_dispatch, dummy_collect)
    try:
        # Verify enum extension
        _check_dispatch_mode(Dispatch.TEST_MODE)

        # Verify registry update
        assert get_predefined_dispatch_fn(Dispatch.TEST_MODE) == {
            "dispatch_fn": dummy_dispatch,
            "collect_fn": dummy_collect,
        }
    finally:
        # Clean up the enum member even if an assertion above failed.
        Dispatch.remove("TEST_MODE")
58
+
59
+
60
def test_update_existing_dispatch_mode(reset_dispatch_registry):
    """update_dispatch_mode must swap an existing mode's dispatch/collect fns
    in place; the fixture restores the registry afterwards."""
    # Store original implementation
    original_mode = Dispatch.ONE_TO_ALL

    # New implementations
    def new_dispatch(worker_group, *args, **kwargs):
        return args, kwargs

    def new_collect(worker_group, output):
        return output

    # Test update
    update_dispatch_mode(original_mode, new_dispatch, new_collect)

    # Verify update
    assert get_predefined_dispatch_fn(original_mode)["dispatch_fn"] == new_dispatch
    assert get_predefined_dispatch_fn(original_mode)["collect_fn"] == new_collect
code/RL_model/verl/verl_train/tests/single_controller/check_worker_alive/main.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import os
16
+ import sys
17
+ import time
18
+
19
+ import ray
20
+
21
+ from verl.single_controller.base.decorator import Dispatch, register
22
+ from verl.single_controller.base.worker import Worker
23
+ from verl.single_controller.ray.base import RayClassWithInitArgs, RayResourcePool, RayWorkerGroup
24
+
25
+
26
@ray.remote
class TestActor(Worker):
    """Worker that sleeps for a while and then kills its own process, used to
    exercise the worker-group aliveness check."""

    def __init__(self) -> None:
        super().__init__()

    @register(dispatch_mode=Dispatch.ONE_TO_ALL, blocking=False)
    def foo(self, wait_time):
        # Non-blocking dispatch: the driver gets futures back immediately
        # while each worker sleeps and then exits with a non-zero status.
        time.sleep(wait_time)
        sys.exit(1)
35
+
36
+
37
if __name__ == "__main__":
    # Seconds each worker sleeps before deliberately dying (configurable).
    wait_time = int(os.getenv("WAIT_TIME", "10"))

    ray.init()

    # test single-node-no-partition
    print("test single-node-no-partition")
    # Two CPU-only workers on a single node.
    resource_pool = RayResourcePool([2], use_gpu=False)
    class_with_args = RayClassWithInitArgs(cls=TestActor)

    print("create worker group")
    wg = RayWorkerGroup(resource_pool, class_with_args, name_prefix="test")

    # Poll worker liveness every second so the deaths are detected quickly.
    wg.start_worker_aliveness_check(1)
    time.sleep(1)

    print(time.time(), "start foo")

    # Non-blocking call: each worker sleeps wait_time seconds, then exits(1),
    # which the aliveness check should notice.
    _ = wg.foo(wait_time)
    print("foo started")

    print(
        time.time(),
        f"wait 6x wait time {wait_time * 6} to let signal returned to process but still not exceed process wait time",
    )
    time.sleep(wait_time * 6)

    ray.shutdown()
code/RL_model/verl/verl_train/tests/single_controller/detached_worker/README.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Detached Worker
2
+ ## How to run (Only on a single node)
3
+ - Start a local ray cluster:
4
+ ```bash
5
+ ray start --head --port=6379
6
+ ```
7
+ - Run the server
8
+ ```bash
9
+ python3 server.py
10
+ ```
11
+ - On another terminal, Run the client
12
+ ```bash
13
+ python3 client.py
14
+ ```
code/RL_model/verl/verl_train/tests/single_controller/detached_worker/client.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """
15
+ In client, we can get the server handler and send RPC request
16
+ """
17
+
18
+ import ray
19
+ import torch
20
+ from server import Trainer
21
+ from tensordict import TensorDict
22
+
23
+ from verl import DataProto
24
+ from verl.single_controller.ray import RayClassWithInitArgs, RayWorkerGroup
25
+
26
+
27
def compute_position_id_with_mask(mask):
    """Derive position ids from an attention mask.

    Each position counts the valid tokens seen so far (0-based); the result is
    clamped at 0 so any left-padding positions all map to position 0.
    """
    running_count = mask.cumsum(dim=-1)
    return (running_count - 1).clamp(min=0)
29
+
30
+
31
if __name__ == "__main__":
    # Attach to the already-running cluster started by server.py; the workers
    # live in the shared "verl" namespace.
    ray.init(address="auto", namespace="verl")
    # get the worker group using names
    worker_names = ["trainerTrainer_0:0", "trainerTrainer_0:1"]
    cls_with_init_args = RayClassWithInitArgs(cls=Trainer)
    worker_group = RayWorkerGroup.from_detached(worker_names=worker_names, ray_cls_with_init=cls_with_init_args)

    batch_size = 16
    sequence_length = 1024

    # give Trainer some data to train
    input_ids = torch.randint(low=0, high=256, size=(batch_size, sequence_length), dtype=torch.int64, device="cuda")
    attention_mask = torch.ones_like(input_ids)
    position_ids = compute_position_id_with_mask(attention_mask)

    data = DataProto(
        batch=TensorDict(
            {"input_ids": input_ids, "attention_mask": attention_mask, "position_ids": position_ids},
            batch_size=batch_size,
        ),
        meta_info={},
    )

    # RPC into the detached workers; results are gathered back to the driver.
    output = worker_group.train_model(data)

    print(output)
code/RL_model/verl/verl_train/tests/single_controller/detached_worker/run.sh ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
#!/bin/bash
# Start a local Ray head node, launch the detached Trainer workers, then run
# the client against them.
#
# set -euo pipefail makes the script stop at the first failing step (the
# original kept going, e.g. running the client even when the server failed),
# and the EXIT trap guarantees the Ray cluster is torn down even on error so
# repeated runs don't collide with a stale cluster.
set -euo pipefail

ray start --head --port=6379
trap 'ray stop --force' EXIT

python3 server.py
python3 client.py
code/RL_model/verl/verl_train/tests/single_controller/detached_worker/server.py ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """
15
+ Server starts a Trainer. Client sends data to the server to train.
16
+ """
17
+
18
+ import os
19
+
20
+ os.environ["MEGATRON_USE_CUDA_TIMER"] = "0"
21
+ os.environ["MEGATRON_START_PROCESS_TIMER"] = "False"
22
+ os.environ["NCCL_DEBUG"] = "WARN"
23
+
24
+ import ray
25
+ import torch
26
+ from megatron.core import parallel_state as mpu
27
+ from megatron.core import tensor_parallel
28
+ from megatron.core.models.gpt.gpt_model import ModelType
29
+ from omegaconf import OmegaConf
30
+ from tensordict import TensorDict
31
+ from torch import nn
32
+ from transformers import LlamaConfig
33
+
34
+ from verl import DataProto
35
+ from verl.models.llama.megatron import ParallelLlamaForCausalLMRmPadPP
36
+ from verl.single_controller.base import Worker
37
+ from verl.single_controller.base.decorator import Dispatch, make_nd_compute_dataproto_dispatch_fn, register
38
+ from verl.single_controller.ray import RayClassWithInitArgs, RayResourcePool, RayWorkerGroup
39
+ from verl.utils.megatron.optimizer import get_megatron_optimizer, init_megatron_optim_config
40
+ from verl.utils.megatron_utils import get_model, mcore_model_parallel_config
41
+
42
+
43
+ @ray.remote
44
+ class Trainer(Worker):
45
+ def __init__(self):
46
+ super().__init__()
47
+
48
+ if not torch.distributed.is_initialized():
49
+ rank = int(os.environ["LOCAL_RANK"])
50
+ torch.distributed.init_process_group(backend="nccl")
51
+ torch.cuda.set_device(rank)
52
+
53
+ mpu.initialize_model_parallel(
54
+ tensor_model_parallel_size=2,
55
+ pipeline_model_parallel_size=1,
56
+ virtual_pipeline_model_parallel_size=None,
57
+ use_sharp=False,
58
+ context_parallel_size=1,
59
+ expert_model_parallel_size=1,
60
+ nccl_communicator_config_path=None,
61
+ )
62
+ tensor_parallel.model_parallel_cuda_manual_seed(10)
63
+
64
+ is_collect = (
65
+ mpu.get_tensor_model_parallel_rank() == 0
66
+ and mpu.get_pipeline_model_parallel_rank() == mpu.get_pipeline_model_parallel_world_size() - 1
67
+ and mpu.get_context_parallel_rank() == 0
68
+ )
69
+ self._register_dispatch_collect_info(
70
+ mesh_name="train", dp_rank=mpu.get_data_parallel_rank(), is_collect=is_collect
71
+ )
72
+
73
+ @register(dispatch_mode=Dispatch.ONE_TO_ALL)
74
+ def init_model(self):
75
+ actor_model_config = LlamaConfig(
76
+ vocab_size=256,
77
+ hidden_size=2048,
78
+ intermediate_size=5504,
79
+ num_hidden_layers=24,
80
+ num_attention_heads=16,
81
+ num_key_value_heads=16,
82
+ )
83
+
84
+ megatron_config = mcore_model_parallel_config(sequence_parallel=True, params_dtype=torch.bfloat16)
85
+ self.megatron_config = megatron_config
86
+
87
+ def megatron_actor_model_provider(pre_process, post_process):
88
+ # vpp is not supported yet because it will hang for some reason. Need debugging
89
+ # this_megatron_config = copy.deepcopy(megatron_config)
90
+ # this_megatron_config.virtual_pipeline_model_parallel_rank = vpp_rank
91
+ parallel_model = ParallelLlamaForCausalLMRmPadPP(
92
+ config=actor_model_config,
93
+ megatron_config=megatron_config,
94
+ pre_process=pre_process,
95
+ post_process=post_process,
96
+ )
97
+ parallel_model.cuda()
98
+ return parallel_model
99
+
100
+ actor_module = get_model(
101
+ model_provider_func=megatron_actor_model_provider,
102
+ model_type=ModelType.encoder_or_decoder,
103
+ wrap_with_ddp=True,
104
+ )
105
+ actor_module = nn.ModuleList(actor_module)
106
+
107
+ optim_config = OmegaConf.create({"lr": 1e-6, "clip_grad": 1.0})
108
+
109
+ optim_config = init_megatron_optim_config(optim_config)
110
+ self.optimizer_config = optim_config
111
+ actor_optimizer = get_megatron_optimizer(model=actor_module, config=optim_config)
112
+
113
+ self.model = actor_module[0]
114
+ self.optimizer = actor_optimizer
115
+
116
+ @register(dispatch_mode=make_nd_compute_dataproto_dispatch_fn(mesh_name="train"))
117
+ def train_model(self, data: DataProto) -> DataProto:
118
+ input_ids = data.batch["input_ids"]
119
+ attention_mask = data.batch["attention_mask"]
120
+ position_ids = data.batch["position_ids"]
121
+
122
+ self.optimizer.zero_grad()
123
+ self.model.zero_grad_buffer(
124
+ zero_buffer=(not self.optimizer_config.use_distributed_optimizer)
125
+ ) # use use_contiguous_buffers_in_local_ddp and no overlap_dp_param_comm
126
+ # update for 1 iteration
127
+ output = self.model(input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids).logits
128
+ output.mean().backward()
129
+
130
+ update_successful, grad_norm, num_zeros_in_grad = self.optimizer.step(
131
+ self.megatron_config, self.megatron_config.timers
132
+ )
133
+
134
+ return DataProto(batch=TensorDict({"loss": output.detach()}, batch_size=output.shape[0]))
135
+
136
+
137
if __name__ == "__main__":
    ray.init(address="auto", namespace="verl")

    # detached=True keeps the workers alive after this driver exits so that
    # client.py can later re-attach to them by name.
    resource_pool = RayResourcePool(process_on_nodes=[2], detached=True)
    cls_with_init_args = RayClassWithInitArgs(cls=Trainer)
    worker_group = RayWorkerGroup(
        resource_pool=resource_pool,
        ray_cls_with_init=cls_with_init_args,
        name_prefix="trainer",
        detached=True,
    )

    worker_group.init_model()

    # Print the worker names a client needs for RayWorkerGroup.from_detached.
    worker_names = worker_group.worker_names
    print(worker_names)
code/RL_model/verl/verl_train/tests/special_e2e/envs/__init__.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from .digit_completion import DigitCompletion
16
+
17
+ __all__ = ["DigitCompletion"]
code/RL_model/verl/verl_train/tests/special_e2e/envs/digit_completion/__init__.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from transformers import AutoTokenizer, LlamaConfig
16
+
17
+ from .task import DigitCompletion, generate_ground_truth_response
18
+ from .tokenizer import CharTokenizer
19
+
20
# Make the char-level CharTokenizer loadable via AutoTokenizer for the
# LlamaConfig-based test models used in these e2e tests.
AutoTokenizer.register(LlamaConfig, CharTokenizer, exist_ok=True)

__all__ = ["DigitCompletion", "generate_ground_truth_response", "CharTokenizer"]
code/RL_model/verl/verl_train/tests/special_e2e/envs/digit_completion/task.py ADDED
@@ -0,0 +1,179 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """Task and environment definition for digit completion."""
15
+
16
+ import numpy as np
17
+
18
+
19
class DigitCompletion:
    """
    The implementation of a simple digit completion task.
    The prompt is a sequence of numbers with fixed difference. The task is to complete the next N numbers.
    If the max number is reached, the next number should be modulo with max number.

    For example,
    - prompt = [1, 2, 3]
    - N = 5
    - max_number = 6

    the response should be [4, 5, 6, 7%6, 8%6] = [4, 5, 6, 0, 1]

    Note that the tokenizer is char-level to increase the difficulty.
    """

    def __init__(self, max_number: int, max_diff: int, max_num_in_response: int, seed=0):
        """

        Args:
            max_number: the maximum number allowed in the arithmetic sequence
            max_diff: the maximum diff. The actual common diff will be sampled from [0, max_diff]
            max_num_in_response: the maximum number in the response
            seed: seed for the internal numpy RNG, making prompt sampling reproducible
        """
        super().__init__()
        self.max_number = max_number
        self.max_diff = max_diff
        self.max_num_in_response = max_num_in_response
        # num_to_complete is encoded as a single character in the prompt,
        # so it must stay below 10.
        assert self.max_num_in_response < 10
        assert self.max_number > 0
        assert self.max_diff > 0
        self.max_number_length = len(str(max_number))
        # Prompt format: {num1},{num2}:{max_number},{num_to_complete}
        # (see sample_str_prompts / get_all_prompts below)
        self._prompt_length = self.max_number_length * 2 + 4 + self.max_number_length  # no negative is allowed

        self.np_rng = np.random.default_rng(seed=seed)

    def __str__(self):
        return (
            f"Prompt length: {self.prompt_length}. Response length: {self.response_length}, "
            f"Max number: {self.max_number}. Max diff: {self.max_diff}, "
            f"Max number in response: {self.max_num_in_response}"
        )

    def get_state(self):
        # Only the RNG is mutable state; everything else is fixed at init.
        return {"rng": self.np_rng}

    def set_state(self, state):
        assert "rng" in state, "rng must be inside state"
        self.np_rng = state["rng"]

    @property
    def prompt_length(self) -> int:
        # Upper bound on the char-level prompt length.
        return self._prompt_length

    @property
    def response_length(self) -> int:
        # number length + comma length + [EOS]
        # The actual number times 1.5 to allow 'U'
        return (self.max_num_in_response * self.max_number_length + (self.max_num_in_response - 1) + 1) * 2

    def add(self, a, b):
        # Addition wraps around at max_number.
        return (a + b) % self.max_number

    def get_all_prompts(self):
        # Enumerate every (first_num, diff, num_to_complete) combination.
        all_prompts = []
        for first_num in range(self.max_number + 1):
            for diff in range(0, self.max_diff + 1):
                second_num = self.add(first_num, diff)
                for num_to_complete in range(self.max_num_in_response + 1):
                    prompt = str(first_num) + "," + str(second_num) + f":{self.max_number},{num_to_complete}"
                    all_prompts.append(prompt)
        return all_prompts

    def sample_str_prompts(self):
        # step 1: sample initial numbers
        first_num = self.np_rng.integers(self.max_number + 1)
        diff = self.np_rng.integers(self.max_diff + 1)
        second_num = self.add(first_num, diff)
        num_to_complete = self.np_rng.integers(self.max_num_in_response + 1)
        prompt = str(first_num) + "," + str(second_num) + f":{self.max_number},{num_to_complete}"
        return prompt

    def sample_batch_str_prompts(self, batch_size: int):
        str_prompts = []
        for _ in range(batch_size):
            str_prompts.append(self.sample_str_prompts())
        return str_prompts
107
+
108
+
109
def compute_attention_mask(prompts, pad_token_id):
    """Build an attention mask for a batch of token-id arrays.

    Entries equal to ``pad_token_id`` become 0; everything else is 1.  The
    returned array has the same shape and dtype as ``prompts``.
    """
    return (prompts != pad_token_id).astype(prompts.dtype)
113
+
114
+
115
def compute_position_id_with_mask(mask):
    """Turn an attention mask into 0-based position ids.

    Each position counts valid tokens seen so far; leading padded positions
    (where the running count would be negative) are clamped to 0.
    """
    positions = mask.cumsum(axis=-1) - 1
    positions[positions < 0] = 0
    return positions
117
+
118
+
119
def generate_ground_truth_response(prompt: str):
    """Produce the expected completion for a digit-completion prompt.

    The prompt has the form ``"{num1},{num2}:{max_number},{num_to_gen}"``;
    the answer continues the arithmetic sequence modulo ``max_number`` for
    ``num_to_gen`` terms, joined by commas.
    """
    numbers, info = prompt.split(":")
    first, second = (int(part) for part in numbers.split(","))
    max_number, num_to_gen = (int(part) for part in info.split(","))

    step = (second - first) % max_number
    terms = []
    current = second
    for _ in range(num_to_gen):
        current = (current + step) % max_number
        terms.append(str(current))
    return ",".join(terms)


def compute_reward(prompt: str, response: str, sequence_reward=1.0):
    """We compute dense reward here so that we can directly train RL without SFT"""
    ground_truth_response = generate_ground_truth_response(prompt)
    # Spread the sequence reward evenly over the ground-truth chars plus [EOS].
    per_token_reward = sequence_reward / (len(ground_truth_response) + 1)

    # One slot per response char; this assumes each char is a single token.
    reward = np.zeros(len(response), dtype=np.float32)

    # Walk the response and ground truth in lockstep, rewarding each matching
    # character; stop at the first mismatch or once the ground truth is spent.
    matched = 0
    for pos, char in enumerate(response):
        if matched == len(ground_truth_response) or char != ground_truth_response[matched]:
            break
        reward[pos] = per_token_reward
        matched += 1

    return reward, {"ground_truth_response": ground_truth_response}
163
+
164
+
165
if __name__ == "__main__":
    # Quick manual sanity check of the task and the dense reward function.
    task = DigitCompletion(max_number=20, max_diff=3, max_num_in_response=5)
    print(task.sample_str_prompts())

    # num_to_gen == 0: ground truth is empty, so any response earns 0 reward.
    prompt = "7,8:20,0"
    response = ""
    print(compute_reward(prompt, response))

    prompt = "7,8:20,0"
    response = "E000"
    print(compute_reward(prompt, response))

    # Partially/fully correct continuation of the sequence 9, 10, ...
    prompt = "9,10:20,2"
    response = "11,12,13"
    print(compute_reward(prompt, response))
code/RL_model/verl/verl_train/tests/special_e2e/envs/digit_completion/tokenizer.py ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """Copied from https://github.com/dariush-bahrami/character-tokenizer/blob/master/charactertokenizer/core.py
15
+
16
+ CharacterTokenzier for Hugging Face Transformers.
17
+
18
+ This is heavily inspired from CanineTokenizer in transformers package.
19
+ """
20
+
21
+ import json
22
+ import os
23
+ from pathlib import Path
24
+ from typing import Optional, Sequence
25
+
26
+ from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
27
+
28
+
29
class CharTokenizer(PreTrainedTokenizer):
    def __init__(self, characters: Sequence[str], model_max_length: int, chat_template, **kwargs):
        """Character tokenizer for Hugging Face transformers.

        Args:
            characters (Sequence[str]): List of desired characters. Any
                character not in this list is mapped to the [UNK] token "U".
                The special tokens and their ids in this vocabulary are:
                    "S" (sep): 0
                    "E" (eos): 1
                    "P" (pad): 2
                    "U" (unk): 3
                Each entry of ``characters`` is then assigned an id starting
                at 4, in order.

            model_max_length (int): Model maximum sequence length.
            chat_template: chat template string stored on the tokenizer.
        """
        eos_token_str = "E"
        sep_token_str = "S"
        pad_token_str = "P"
        unk_token_str = "U"

        self.characters = characters
        self.model_max_length = model_max_length
        eos_token = AddedToken(eos_token_str, lstrip=False, rstrip=False)
        sep_token = AddedToken(sep_token_str, lstrip=False, rstrip=False)
        pad_token = AddedToken(pad_token_str, lstrip=False, rstrip=False)
        unk_token = AddedToken(unk_token_str, lstrip=False, rstrip=False)

        # Special tokens occupy ids 0-3; regular characters start at 4.
        self._vocab_str_to_int = {
            sep_token_str: 0,
            eos_token_str: 1,
            pad_token_str: 2,
            unk_token_str: 3,
            **{ch: i + 4 for i, ch in enumerate(characters)},
        }
        self._vocab_int_to_str = {v: k for k, v in self._vocab_str_to_int.items()}

        super().__init__(
            eos_token=eos_token,
            sep_token=sep_token,
            pad_token=pad_token,
            unk_token=unk_token,
            add_prefix_space=False,
            model_max_length=model_max_length,
            **kwargs,
        )

        self.chat_template = chat_template

    @property
    def vocab_size(self) -> int:
        return len(self._vocab_str_to_int)

    def get_vocab(self):
        return self._vocab_str_to_int

    def _tokenize(self, text: str) -> list[str]:
        # Char-level tokenization: every character is its own token.
        return list(text)

    def _convert_token_to_id(self, token: str) -> int:
        # Unknown characters fall back to the "U" ([UNK]) id.
        return self._vocab_str_to_int.get(token, self._vocab_str_to_int["U"])

    def _convert_id_to_token(self, index: int) -> str:
        return self._vocab_int_to_str[index]

    def convert_tokens_to_string(self, tokens):
        return "".join(tokens)

    def build_inputs_with_special_tokens(
        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
    ) -> list[int]:
        # NOTE(review): no cls_token is registered in __init__, so
        # self.cls_token_id is presumably None here — confirm this method is
        # actually exercised / intended with this vocabulary.
        sep = [self.sep_token_id]
        cls = [self.cls_token_id]
        result = cls + token_ids_0 + sep
        if token_ids_1 is not None:
            result += token_ids_1 + sep
        return result

    def get_special_tokens_mask(
        self,
        token_ids_0: list[int],
        token_ids_1: Optional[list[int]] = None,
        already_has_special_tokens: bool = False,
    ) -> list[int]:
        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0,
                token_ids_1=token_ids_1,
                already_has_special_tokens=True,
            )

        # Mirrors build_inputs_with_special_tokens: 1 marks the added
        # leading/trailing special tokens, 0 marks sequence tokens.
        result = [1] + ([0] * len(token_ids_0)) + [1]
        if token_ids_1 is not None:
            result += ([0] * len(token_ids_1)) + [1]
        return result

    def get_config(self) -> dict:
        # Characters are stored as code points so the config is pure JSON.
        return {
            "char_ords": [ord(ch) for ch in self.characters],
            "model_max_length": self.model_max_length,
            "chat_template": self.chat_template,
        }

    @classmethod
    def from_config(cls, config: dict):
        cfg = {}
        cfg["characters"] = [chr(i) for i in config["char_ords"]]
        cfg["model_max_length"] = config["model_max_length"]
        cfg["chat_template"] = config["chat_template"]
        return cls(**cfg)

    def save_pretrained(self, save_directory: str | os.PathLike, **kwargs):
        # Persist only tokenizer_config.json; this tokenizer has no vocab file.
        cfg_file = Path(save_directory) / "tokenizer_config.json"
        cfg = self.get_config()
        with open(cfg_file, "w") as f:
            json.dump(cfg, f, indent=4)

    @classmethod
    def from_pretrained(cls, save_directory: str | os.PathLike, **kwargs):
        cfg_file = Path(save_directory) / "tokenizer_config.json"
        with open(cfg_file) as f:
            cfg = json.load(f)
        return cls.from_config(cfg)
code/RL_model/verl/verl_train/tests/special_e2e/generation/run_gen_qwen05.sh ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+ # Tested with 1 & 4 GPUs
3
+ set -xeuo pipefail
4
+
5
+ MODEL_ID=${MODEL_ID:-Qwen/Qwen2.5-0.5B-Instruct}
6
+
7
+ NGPUS_PER_NODE=${NGPUS_PER_NODE:-4}
8
+ OUTPUT_PATH=${OUTPUT_PATH:-$HOME/data/gen/qwen_05_gen_test.parquet}
9
+ GEN_TP=${GEN_TP:-2} # Default tensor parallel size to 2
10
+
11
+ python3 -m verl.trainer.main_generation \
12
+ trainer.nnodes=1 \
13
+ trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
14
+ data.path="${HOME}/data/gsm8k/test.parquet" \
15
+ data.prompt_key=prompt \
16
+ data.n_samples=1 \
17
+ data.output_path="${OUTPUT_PATH}" \
18
+ model.path="${MODEL_ID}" \
19
+ +model.trust_remote_code=True \
20
+ rollout.temperature=1.0 \
21
+ rollout.top_k=50 \
22
+ rollout.top_p=0.7 \
23
+ rollout.prompt_length=2048 \
24
+ rollout.response_length=1024 \
25
+ rollout.tensor_model_parallel_size="${GEN_TP}" \
26
+ rollout.gpu_memory_utilization=0.8
code/RL_model/verl/verl_train/tests/special_e2e/generation/run_gen_qwen05_server.sh ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
# Tested with 1 & 4 GPUs
# Smoke test: server-mode generation with a small Qwen2.5 model via
# verl.trainer.main_generation_server.
set -xeuo pipefail

MODEL_ID=${MODEL_ID:-$HOME/models/Qwen/Qwen2.5-0.5B-Instruct}
NGPUS_PER_NODE=${NGPUS_PER_NODE:-8}
OUTPUT_PATH=${OUTPUT_PATH:-$HOME/data/gen/qwen_05_gen_test.parquet}
GEN_TP=${GEN_TP:-2} # Default tensor parallel size to 2

# BUGFIX: the final override previously ended with a dangling "\" at EOF,
# which silently swallows any line appended after it. Removed.
python3 -m verl.trainer.main_generation_server \
    trainer.nnodes=1 \
    trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
    actor_rollout_ref.model.path="${MODEL_ID}" \
    actor_rollout_ref.model.trust_remote_code=True \
    actor_rollout_ref.rollout.temperature=1.0 \
    actor_rollout_ref.rollout.top_k=50 \
    actor_rollout_ref.rollout.top_p=0.7 \
    actor_rollout_ref.rollout.prompt_length=2048 \
    actor_rollout_ref.rollout.response_length=1024 \
    actor_rollout_ref.rollout.tensor_model_parallel_size="${GEN_TP}" \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
    actor_rollout_ref.rollout.name=vllm \
    actor_rollout_ref.rollout.n=4 \
    data.train_files="${HOME}/data/gsm8k/test.parquet" \
    data.prompt_key=prompt \
    +data.output_path="${OUTPUT_PATH}"
code/RL_model/verl/verl_train/tests/special_e2e/ppo_trainer/expert_parallel/qwen2moe_minimal.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "num_hidden_layers": 2,
3
+ "max_window_layers": 2
4
+ }
code/RL_model/verl/verl_train/tests/special_e2e/ppo_trainer/expert_parallel/qwen3moe_minimal.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "num_hidden_layers": 2,
3
+ "max_window_layers": 2
4
+ }
code/RL_model/verl/verl_train/tests/special_e2e/ppo_trainer/run_function_reward.sh ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
# E2E smoke test: PPO training with a function-based reward, optionally using
# a user-supplied custom reward function written to disk at runtime.
set -xeuo pipefail

NUM_GPUS=${NUM_GPUS:-8}

MODEL_ID=${MODEL_ID:-Qwen/Qwen2.5-0.5B}
MODEL_PATH=${MODEL_PATH:-${HOME}/models/${MODEL_ID}}
#hf download "${MODEL_ID}" --local-dir "${MODEL_PATH}"

TRAIN_FILES=${TRAIN_FILES:-$HOME/data/gsm8k/train.parquet}
VAL_FILES=${VAL_FILES:-$HOME/data/gsm8k/test.parquet}
MAX_PROMPT_LEN=${MAX_PROMPT_LEN:-512}
MAX_RESPONSE_LEN=${MAX_RESPONSE_LEN:-512}

ENGINE=${ENGINE:-vllm}
if [ "$ENGINE" = "vllm" ]; then
    export VLLM_USE_V1=1
fi
ROLLOUT_MODE="async"

RETURN_RAW_CHAT="True"
SKIP_TOKENIZER_INIT="True"

GPU_MEMORY_UTILIZATION=${GPU_MEMORY_UTILIZATION:-0.7}
ACTOR_FSDP_PARAM_OFFLOAD=${ACTOR_FSDP_PARAM_OFFLOAD:-False}
ACTOR_FSDP_OPTIMIZER_OFFLOAD=${ACTOR_FSDP_OPTIMIZER_OFFLOAD:-False}
REF_FSDP_PARAM_OFFLOAD=${REF_FSDP_PARAM_OFFLOAD:-True}
RM_PAD=${RM_PAD:-True}
FUSED_KERNELS=${FUSED_KERNELS:-False}
FUSED_KERNEL_BACKEND=${FUSED_KERNEL_BACKEND:-torch} # or 'triton' for triton backend
ADV_ESTIMATOR=${ADV_ESTIMATOR:-gae}
LOSS_MODE=${LOSS_MODE:-vanilla}
USE_KL=${USE_KL:-False}
CUSTOM_REWARD_FN=${CUSTOM_REWARD_FN:-False}
ENABLE_CHUNKED_PREFILL=${ENABLE_CHUNKED_PREFILL:-True} # For vLLM VLM placeholder issue: https://github.com/vllm-project/vllm/issues/15185
STRATEGY=${STRATEGY:-fsdp}
# LoRA config
LORA_RANK=${LORA_RANK:-0}
LORA_ALPHA=${LORA_ALPHA:-${LORA_RANK}}
LORA_TARGET=${LORA_TARGET:-"all-linear"}
LORA_EXCLUDE=${LORA_EXCLUDE:-"DONT_EXCLUDE"}
USE_SHM=${USE_SHM:-False}
LOAD_FORMAT=${LOAD_FORMAT:-dummy}
LAYERED_SUMMON=${LAYERED_SUMMON:-False}
# Validation
VAL_BEFORE_TRAIN=${VAL_BEFORE_TRAIN:-False}
TEST_FREQ=${TEST_FREQ:--1}
# Save & Resume
RESUME_MODE=${RESUME_MODE:-disable}
SAVE_FREQ=${SAVE_FREQ:--1}
TOTAL_TRAIN_STEPS=${TOTAL_TRAIN_STEPS:-1}

# whether to save hf_model
SAVE_HF_MODEL=${SAVE_HF_MODEL:-False}
FSDP_SIZE=${FSDP_SIZE:--1}
SP_SIZE=${SP_SIZE:-1}

if [ "${SAVE_HF_MODEL}" = "True" ]; then
    CHECKPOINT_CONTENTS="['model','hf_model','optimizer','extra']"
else
    CHECKPOINT_CONTENTS="['model','optimizer','extra']"
fi

# Batch-size arithmetic: micro -> mini -> global prompt batch.
train_traj_micro_bsz_per_gpu=2 # b
n_resp_per_prompt=4 # g

train_traj_micro_bsz=$((train_traj_micro_bsz_per_gpu * NUM_GPUS)) # b * n
train_traj_mini_bsz=$((train_traj_micro_bsz * 2)) # 2 * b * n
train_prompt_mini_bsz=$((train_traj_mini_bsz * n_resp_per_prompt)) # 2 * b * n / g
train_prompt_bsz=$((train_prompt_mini_bsz * 2)) # 4 * b * n / g

reward_fn_name=null
reward_fn_file_path=null
output_file="$(pwd)/output.txt"
if [ "${CUSTOM_REWARD_FN}" = "True" ]; then
    reward_fn_name="my_reward_function"
    reward_fn_file_path="$(pwd)/my_reward_function.py"
    rm -rf "${reward_fn_file_path}"
    # NOTE: the heredoc delimiter is unquoted, so ${reward_fn_name} below is
    # expanded by the shell when the file is written.
    cat <<EOF > "$reward_fn_file_path"
def ${reward_fn_name}(data_source, solution_str, ground_truth, extra_info=None):
    print(f"Congratulations!!! You have called ${reward_fn_name} successfully!!!")
    return 0.1
EOF

    rm -rf "${output_file}"
fi

exp_name="${VERL_EXP_NAME:-$(basename "${MODEL_ID,,}")-function-reward-minimal}"

python3 -m verl.trainer.main_ppo \
    algorithm.adv_estimator="${ADV_ESTIMATOR}" \
    data.train_files="${TRAIN_FILES}" \
    data.val_files="${VAL_FILES}" \
    data.train_batch_size="${train_prompt_bsz}" \
    data.max_prompt_length="${MAX_PROMPT_LEN}" \
    data.max_response_length="${MAX_RESPONSE_LEN}" \
    data.return_raw_chat=${RETURN_RAW_CHAT} \
    actor_rollout_ref.model.path="${MODEL_PATH}" \
    actor_rollout_ref.model.use_shm=${USE_SHM} \
    actor_rollout_ref.model.lora_rank=${LORA_RANK} \
    actor_rollout_ref.model.lora_alpha=${LORA_ALPHA} \
    actor_rollout_ref.model.target_modules=${LORA_TARGET} \
    actor_rollout_ref.model.exclude_modules=${LORA_EXCLUDE} \
    actor_rollout_ref.actor.optim.lr=1e-6 \
    actor_rollout_ref.model.use_remove_padding="${RM_PAD}" \
    actor_rollout_ref.model.use_fused_kernels=${FUSED_KERNELS} \
    actor_rollout_ref.model.fused_kernel_options.impl_backend=${FUSED_KERNEL_BACKEND} \
    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=${train_traj_micro_bsz_per_gpu} \
    actor_rollout_ref.actor.strategy=${STRATEGY} \
    actor_rollout_ref.actor.fsdp_config.param_offload=${ACTOR_FSDP_PARAM_OFFLOAD} \
    actor_rollout_ref.actor.fsdp_config.optimizer_offload=${ACTOR_FSDP_OPTIMIZER_OFFLOAD} \
    actor_rollout_ref.actor.fsdp_config.fsdp_size=${FSDP_SIZE} \
    actor_rollout_ref.actor.ulysses_sequence_parallel_size="${SP_SIZE}" \
    actor_rollout_ref.actor.checkpoint.save_contents=${CHECKPOINT_CONTENTS} \
    actor_rollout_ref.actor.use_kl_loss="${USE_KL}" \
    actor_rollout_ref.actor.policy_loss.loss_mode="${LOSS_MODE}" \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=${train_traj_micro_bsz_per_gpu} \
    actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
    actor_rollout_ref.rollout.name="${ENGINE}" \
    actor_rollout_ref.rollout.mode="${ROLLOUT_MODE}" \
    actor_rollout_ref.rollout.load_format=${LOAD_FORMAT} \
    actor_rollout_ref.rollout.layered_summon=${LAYERED_SUMMON} \
    actor_rollout_ref.rollout.skip_tokenizer_init="${SKIP_TOKENIZER_INIT}" \
    actor_rollout_ref.rollout.gpu_memory_utilization="${GPU_MEMORY_UTILIZATION}" \
    actor_rollout_ref.rollout.enable_chunked_prefill="${ENABLE_CHUNKED_PREFILL}" \
    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=${train_traj_micro_bsz_per_gpu} \
    actor_rollout_ref.ref.fsdp_config.param_offload="${REF_FSDP_PARAM_OFFLOAD}" \
    critic.optim.lr=1e-5 \
    critic.model.use_remove_padding="${RM_PAD}" \
    critic.model.path="${MODEL_PATH}" \
    critic.model.enable_gradient_checkpointing=False \
    critic.ppo_micro_batch_size_per_gpu=${train_traj_micro_bsz_per_gpu} \
    critic.model.fsdp_config.param_offload=False \
    critic.model.fsdp_config.optimizer_offload=False \
    custom_reward_function.path="${reward_fn_file_path}"\
    custom_reward_function.name="${reward_fn_name}"\
    algorithm.use_kl_in_reward="${USE_KL}" \
    algorithm.kl_penalty=kl \
    algorithm.kl_ctrl.kl_coef=0.001 \
    trainer.critic_warmup=0 \
    trainer.logger=console \
    trainer.project_name='verl-test' \
    trainer.experiment_name="${exp_name}" \
    trainer.nnodes=1 \
    trainer.n_gpus_per_node="${NUM_GPUS}" \
    trainer.val_before_train="${VAL_BEFORE_TRAIN}" \
    trainer.test_freq="${TEST_FREQ}" \
    trainer.save_freq="${SAVE_FREQ}" \
    trainer.resume_mode="${RESUME_MODE}" \
    trainer.total_epochs=2 \
    trainer.device=cuda \
    trainer.total_training_steps="${TOTAL_TRAIN_STEPS}" $@ \
    | tee "${output_file}"

if [ "${CUSTOM_REWARD_FN}" = "True" ]; then
    # NOTE(review): with `set -e` active, a non-zero exit from the checker
    # terminates the script immediately, so check_exit_code can only ever be
    # observed as 0 here — the failure branch below looks unreachable; confirm.
    python3 tests/special_e2e/check_custom_rwd_fn.py --output_file="${output_file}"
    check_exit_code=$?
    rm -rf "${reward_fn_file_path}"
    rm -rf "${output_file}"
    # Return the exit code of check_custom_rwd_fn.py if it fails
    if [ $check_exit_code -ne 0 ]; then
        exit $check_exit_code
    fi
fi
code/RL_model/verl/verl_train/tests/special_e2e/ppo_trainer/run_model_reward.sh ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
# E2E smoke test: PPO training with a model-based reward (reward model served
# through the reward loop) on GSM8K.
set -xeuo pipefail

NUM_GPUS=${NUM_GPUS:-8}

MODEL_ID=${MODEL_ID:-Qwen/Qwen2.5-0.5B}
MODEL_PATH=${MODEL_PATH:-${HOME}/models/${MODEL_ID}}
#hf download "${MODEL_ID}" --local-dir "${MODEL_PATH}"

TRAIN_FILES=${TRAIN_FILES:-$HOME/data/gsm8k/train.parquet}
VAL_FILES=${VAL_FILES:-$HOME/data/gsm8k/test.parquet}

RM_PAD=${RM_PAD:-True}
FUSED_KERNELS=${FUSED_KERNELS:-False}
FUSED_KERNEL_BACKEND=${FUSED_KERNEL_BACKEND:-torch} # or 'triton' for triton backend
SP_SIZE=${SP_SIZE:-1}
SEQ_BALANCE=${SEQ_BALANCE:-False}
LIGER=${LIGER:-False}
# Validation
VAL_BEFORE_TRAIN=${VAL_BEFORE_TRAIN:-False}
TEST_FREQ=${TEST_FREQ:--1}
# Save & Resume
RESUME_MODE=${RESUME_MODE:-disable}
SAVE_FREQ=${SAVE_FREQ:--1}
TOTAL_TRAIN_STEPS=${TOTAL_TRAIN_STEPS:-1}

# Batch-size arithmetic: micro -> mini -> global prompt batch.
train_traj_micro_bsz_per_gpu=2 # b
n_resp_per_prompt=4 # g

train_traj_micro_bsz=$((train_traj_micro_bsz_per_gpu * NUM_GPUS)) # b * n
train_traj_mini_bsz=$((train_traj_micro_bsz * 2)) # 2 * b * n
train_prompt_mini_bsz=$((train_traj_mini_bsz * n_resp_per_prompt)) # 2 * b * n / g
train_prompt_bsz=$((train_prompt_mini_bsz * 2)) # 4 * b * n / g

train_max_token_num_per_gpu=32768
infer_max_token_num_per_gpu=32768

exp_name="$(basename "${MODEL_ID,,}")-model-reward-minimal"

python3 -m verl.trainer.main_ppo \
    algorithm.adv_estimator=gae \
    data.train_files="${TRAIN_FILES}" \
    data.val_files="${VAL_FILES}" \
    data.train_batch_size=${train_prompt_bsz} \
    data.max_prompt_length=512 \
    data.max_response_length=512 \
    data.return_raw_chat=True \
    actor_rollout_ref.model.path="${MODEL_PATH}" \
    actor_rollout_ref.model.use_liger="${LIGER}" \
    actor_rollout_ref.actor.optim.lr=1e-6 \
    actor_rollout_ref.model.use_remove_padding="${RM_PAD}" \
    actor_rollout_ref.model.use_fused_kernels=${FUSED_KERNELS} \
    actor_rollout_ref.model.fused_kernel_options.impl_backend=${FUSED_KERNEL_BACKEND} \
    actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.1 \
    actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
    actor_rollout_ref.actor.use_dynamic_bsz="${SEQ_BALANCE}" \
    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${train_max_token_num_per_gpu} \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=${train_traj_micro_bsz_per_gpu} \
    actor_rollout_ref.actor.ulysses_sequence_parallel_size="${SP_SIZE}" \
    actor_rollout_ref.actor.fsdp_config.param_offload=False \
    actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
    actor_rollout_ref.actor.use_kl_loss=False \
    actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_max_token_num_per_gpu} \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=${train_traj_micro_bsz_per_gpu} \
    actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
    actor_rollout_ref.rollout.name=vllm \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.8 \
    actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_max_token_num_per_gpu} \
    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=${train_traj_micro_bsz_per_gpu} \
    critic.optim.lr=1e-5 \
    critic.ulysses_sequence_parallel_size="${SP_SIZE}" \
    critic.model.use_remove_padding="${RM_PAD}" \
    critic.optim.lr_warmup_steps_ratio=0.05 \
    critic.model.path="${MODEL_PATH}" \
    critic.model.enable_gradient_checkpointing=False \
    critic.use_dynamic_bsz="${SEQ_BALANCE}" \
    critic.ppo_max_token_len_per_gpu=${train_max_token_num_per_gpu} \
    critic.ppo_micro_batch_size_per_gpu=${train_traj_micro_bsz_per_gpu} \
    critic.model.fsdp_config.param_offload=False \
    critic.model.fsdp_config.optimizer_offload=False \
    reward_model.enable=True \
    reward_model.model.path="${MODEL_PATH}" \
    reward_model.use_reward_loop=True \
    reward_model.rollout.gpu_memory_utilization=0.8 \
    reward_model.rollout.tensor_model_parallel_size=1 \
    reward_model.rollout.prompt_length=1024 \
    reward_model.rollout.response_length=512 \
    reward_model.num_workers=8 \
    algorithm.use_kl_in_reward=False \
    trainer.critic_warmup=0 \
    trainer.logger=console \
    trainer.project_name='verl-test' \
    trainer.experiment_name="${exp_name}" \
    trainer.nnodes=1 \
    trainer.n_gpus_per_node="${NUM_GPUS}" \
    trainer.val_before_train="${VAL_BEFORE_TRAIN}" \
    trainer.test_freq="${TEST_FREQ}" \
    trainer.save_freq="${SAVE_FREQ}" \
    trainer.resume_mode="${RESUME_MODE}" \
    trainer.total_epochs=2 \
    trainer.total_training_steps="${TOTAL_TRAIN_STEPS}" $@
# BUGFIX: trainer.test_freq was previously set to "${VAL_BEFORE_TRAIN}" (a
# boolean), while the TEST_FREQ variable defined above went unused. It now
# receives "${TEST_FREQ}" as intended.
code/RL_model/verl/verl_train/tests/special_e2e/ppo_trainer/run_single_gpu.sh ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ PYTHONUNBUFFERED=1 python3 -m verl.trainer.main_ppo \
2
+ data.train_files=$HOME/data/gsm8k/train.parquet \
3
+ data.val_files=$HOME/data/gsm8k/test.parquet \
4
+ data.train_batch_size=256 \
5
+ data.max_prompt_length=512 \
6
+ data.max_response_length=256 \
7
+ actor_rollout_ref.model.path=Qwen/Qwen2.5-0.5B-Instruct \
8
+ actor_rollout_ref.actor.optim.lr=1e-6 \
9
+ actor_rollout_ref.actor.ppo_mini_batch_size=64 \
10
+ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
11
+ actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8 \
12
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
13
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
14
+ actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
15
+ critic.optim.lr=1e-5 \
16
+ critic.model.path=Qwen/Qwen2.5-0.5B-Instruct \
17
+ critic.ppo_micro_batch_size_per_gpu=4 \
18
+ algorithm.kl_ctrl.kl_coef=0.001 \
19
+ trainer.logger=console \
20
+ trainer.val_before_train=False \
21
+ trainer.n_gpus_per_node=1 \
22
+ trainer.nnodes=1 \
23
+ actor_rollout_ref.rollout.name=hf \
24
+ trainer.total_training_steps=2
code/RL_model/verl/verl_train/tests/special_e2e/ppo_trainer/run_single_gpu_with_engine.sh ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ PYTHONUNBUFFERED=1 python3 -m verl.trainer.main_ppo \
2
+ data.train_files=$HOME/data/gsm8k/train.parquet \
3
+ data.val_files=$HOME/data/gsm8k/test.parquet \
4
+ data.train_batch_size=256 \
5
+ data.max_prompt_length=512 \
6
+ data.max_response_length=256 \
7
+ actor_rollout_ref.model.path=Qwen/Qwen2.5-0.5B-Instruct \
8
+ actor_rollout_ref.actor.optim.lr=1e-6 \
9
+ actor_rollout_ref.actor.ppo_mini_batch_size=64 \
10
+ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
11
+ actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8 \
12
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
13
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
14
+ actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
15
+ critic.optim.lr=1e-5 \
16
+ critic.model.path=Qwen/Qwen2.5-0.5B-Instruct \
17
+ critic.ppo_micro_batch_size_per_gpu=4 \
18
+ algorithm.kl_ctrl.kl_coef=0.001 \
19
+ trainer.logger=['console'] \
20
+ trainer.val_before_train=False \
21
+ trainer.n_gpus_per_node=1 \
22
+ trainer.nnodes=1 \
23
+ actor_rollout_ref.rollout.name=hf \
24
+ trainer.use_legacy_worker_impl=disable \
25
+ trainer.total_training_steps=2
code/RL_model/verl/verl_train/tests/special_e2e/sft/compare_sft_engine_results.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2025 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import json
16
+ import os
17
+
18
+ import torch
19
+
20
+
21
def get_result(file):
    """Load a JSONL metrics file into a list of parsed records.

    Args:
        file: Path to a ``.jsonl`` file; a leading ``~`` is expanded.

    Returns:
        A list with one decoded JSON object per line of the file.
    """
    path = os.path.expanduser(file)
    with open(path) as fp:
        return [json.loads(line) for line in fp]
29
+
30
+
31
def compare_results(golden_results, other_result):
    """Assert the first logged step's loss and grad-norm match the golden run.

    Raises an AssertionError (via ``torch.testing.assert_close``) when either
    metric deviates beyond the per-metric tolerances.
    """
    golden_metrics = golden_results[0]["data"]
    other_metrics = other_result[0]["data"]

    torch.testing.assert_close(
        golden_metrics["train/loss"], other_metrics["train/loss"], atol=1e-2, rtol=1e-2
    )
    # Grad norm is noisier across parallelism layouts, hence the looser rtol.
    torch.testing.assert_close(
        golden_metrics["train/grad_norm"], other_metrics["train/grad_norm"], atol=1e-4, rtol=3e-2
    )
40
+
41
+
42
if __name__ == "__main__":
    # The single-GPU golden run is the reference every other layout must match.
    golden_results = get_result("~/verl/test/log/golden.jsonl")

    # get all other results
    other_results = {}
    # walk through all files in ~/verl/test/log
    for file in os.listdir(os.path.expanduser("~/verl/test/log/verl_sft_test")):
        if file.endswith(".jsonl"):
            other_results[file] = get_result(os.path.join(os.path.expanduser("~/verl/test/log/verl_sft_test"), file))

    # # compare results
    for file, other_result in other_results.items():
        print(f"compare results {file}")
        # Raises on the first mismatching run, aborting the check.
        compare_results(golden_results, other_result)
        print(f"compare results {file} done")

    print("All results are close to golden results")
code/RL_model/verl/verl_train/tests/special_e2e/sft/run_sft.sh ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
# SFT smoke test: runs the FSDP SFT trainer on GSM8K for a few steps,
# saves a checkpoint, then cleans up.
set -xeuo pipefail

ENTRYPOINT=${ENTRYPOINT:-"-m verl.trainer.fsdp_sft_trainer"}

NUM_GPUS=${NUM_GPUS:-8}

MODEL_ID=${MODEL_ID:-Qwen/Qwen2.5-0.5B-Instruct}
MODEL_PATH=${MODEL_PATH:-${HOME}/models/${MODEL_ID}}
#hf download "${MODEL_ID}" --local-dir "${MODEL_PATH}"

TRAIN_FILES=${TRAIN_FILES:-$HOME/data/gsm8k/train.parquet}
VAL_FILES=${VAL_FILES:-$HOME/data/gsm8k/test.parquet}

SP_SIZE=${SP_SIZE:-1}
LIGER=${LIGER:-False}
MULTITURN=${MULTITURN:-False}
LORA_RANK=${LORA_RANK:-0}
RM_PAD=${RM_PAD:-True}

TOTAL_TRAIN_STEP=${TOTAL_TRAIN_STEP:-1}
RESUME_MODE=${RESUME_MODE:-disable}
SAVE_FREQ=${SAVE_FREQ:-1}

micro_bsz=2
# BUGFIX: a hard-coded `NUM_GPUS=8` previously sat here, silently overriding
# any NUM_GPUS value supplied by the caller despite the env-default above.

project_name="verl-test"
exp_name="$(basename "${MODEL_ID,,}")-sft-minimal"
ckpts_home=${ckpts_home:-$HOME/${project_name}/${exp_name}}

mkdir -p "${ckpts_home}"

torchrun --standalone --nnodes=1 --nproc_per_node=${NUM_GPUS} ${ENTRYPOINT} \
    data.train_files="${TRAIN_FILES}" \
    data.val_files="${VAL_FILES}" \
    data.prompt_key=extra_info \
    data.response_key=extra_info \
    data.prompt_dict_keys=['question'] \
    data.response_dict_keys=['answer'] \
    data.multiturn.enable="${MULTITURN}" \
    data.multiturn.messages_key=messages \
    optim.lr=1e-4 \
    data.micro_batch_size_per_gpu=${micro_bsz} \
    model.strategy=fsdp \
    model.partial_pretrain="${MODEL_PATH}" \
    model.lora_rank="${LORA_RANK}" \
    model.lora_alpha=16 \
    model.target_modules=all-linear \
    model.use_liger="${LIGER}" \
    ulysses_sequence_parallel_size="${SP_SIZE}" \
    use_remove_padding="${RM_PAD}" \
    trainer.default_local_dir="${ckpts_home}" \
    trainer.project_name="${project_name}" \
    trainer.experiment_name="${exp_name}" \
    trainer.total_training_steps=${TOTAL_TRAIN_STEP} \
    trainer.save_freq=${SAVE_FREQ} \
    trainer.checkpoint.save_contents=[model,optimizer,extra,hf_model] \
    trainer.max_ckpt_to_keep=1 \
    trainer.resume_mode=${RESUME_MODE} \
    trainer.logger=['console'] $@

# BUGFIX: the glob must be outside the quotes — `rm -rf "${dir}/*"` targets a
# literal '*' path and deletes nothing.
rm -rf "${ckpts_home:?}"/*
code/RL_model/verl/verl_train/tests/special_e2e/sft/run_sft_engine.sh ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
# SFT engine matrix test: runs the SFT trainer (SPMD via torchrun, or Ray)
# with the fsdp, veomni, or megatron backend, selected through env vars.
set -xeuo pipefail

NUM_GPUS=${NUM_GPUS:-1}

mode=${mode:-spmd}

if [ "$mode" = "spmd" ]; then
    ENTRYPOINT=${ENTRYPOINT:-"-m verl.trainer.sft_trainer"}
    COMMAND="torchrun --standalone --nnodes=${NNODES:-1} --nproc-per-node=${NUM_GPUS:-1} ${ENTRYPOINT}"
else
    ENTRYPOINT=${ENTRYPOINT:-"-m verl.trainer.sft_trainer_ray"}
    COMMAND="python ${ENTRYPOINT} trainer.nnodes=${NNODES:-1} trainer.n_gpus_per_node=${NUM_GPUS:-1}"
fi

DATASET_DIR=${DATASET_DIR:-~/data/gsm8k_sft}
TRAIN_FILES=${DATASET_DIR}/train.parquet
VAL_FILES=${DATASET_DIR}/test.parquet

backend=${BACKEND:-fsdp}

project_name=verl_sft_test

RESUME_MODE=disable

ckpts_home=${ckpts_home:-~/verl/test/gsm8k-sft-${backend}}

MODEL_ID=${MODEL_ID:-Qwen/Qwen2.5-0.5B}
MODEL_PATH=${MODEL_PATH:-${HOME}/models/${MODEL_ID}}
#hf download "${MODEL_ID}" --local-dir "${MODEL_PATH}"

# FSDP-specific parallelism knobs.
SP_SIZE=${SP_SIZE:-1}
FSDP_SIZE=${FSDP_SIZE:-${NUM_GPUS}}
FSDP_STRATEGY=${FSDP_STRATEGY:-"fsdp"}

# Megatron-specific parallelism knobs.
TP_SIZE=${TP_SIZE:-1}
PP_SIZE=${PP_SIZE:-1}
VPP_SIZE=${VPP_SIZE:-null}
CP_SIZE=${CP_SIZE:-1}

PAD_MODE=${PAD_MODE:-no_padding}

USE_REMOVE_PADDING=${USE_REMOVE_PADDING:-True}

FSDP_ENGINE_CONFIG="\
    engine=${backend} \
    optim=${backend} \
    optim.lr=1e-5 \
    optim.lr_warmup_steps_ratio=0.2 \
    optim.weight_decay=0.1 \
    optim.betas="[0.9,0.95]" \
    optim.clip_grad=1.0 \
    optim.min_lr_ratio=0.1 \
    optim.lr_scheduler_type=cosine \
    engine.ulysses_sequence_parallel_size=${SP_SIZE} \
    engine.strategy=${FSDP_STRATEGY} \
    engine.fsdp_size=${FSDP_SIZE}"

VEOMNI_ENGINE_CONFIG="\
    engine=${backend} \
    optim=${backend} \
    optim.lr=1e-5 \
    optim.lr_warmup_steps_ratio=0.2 \
    optim.weight_decay=0.1 \
    optim.betas="[0.9,0.95]" \
    optim.clip_grad=1.0 \
    optim.lr_min=1e-6 \
    optim.lr_scheduler_type=cosine \
    engine.ulysses_parallel_size=${SP_SIZE} \
    engine.data_parallel_mode=${FSDP_STRATEGY} \
    engine.data_parallel_size=${FSDP_SIZE}"


MEGATRON_ENGINE_CONFIG="\
    engine=${backend} \
    optim=${backend} \
    optim.lr=1e-5 \
    optim.lr_warmup_steps_ratio=0.2 \
    optim.weight_decay=0.1 \
    optim.betas="[0.9,0.95]" \
    optim.clip_grad=1.0 \
    optim.lr_warmup_init=0 \
    optim.lr_decay_style=cosine \
    optim.min_lr=1e-6 \
    engine.tensor_model_parallel_size=${TP_SIZE} \
    engine.pipeline_model_parallel_size=${PP_SIZE} \
    engine.virtual_pipeline_model_parallel_size=${VPP_SIZE} \
    engine.context_parallel_size=${CP_SIZE} \
    +engine.override_transformer_config.context_parallel_size=${CP_SIZE} \
    engine.use_mbridge=True"

if [ "$backend" = "fsdp" ]; then
    ENGINE_CONFIG="$FSDP_ENGINE_CONFIG"
    echo "Using fsdp engine"
    exp_name=gsm8k-${backend}-${FSDP_STRATEGY}-sp${SP_SIZE}-fsdp${FSDP_SIZE}-pad-${PAD_MODE}-use_remove_padding-${USE_REMOVE_PADDING}-mode-${mode}
elif [ "$backend" = "veomni" ]; then
    ENGINE_CONFIG="$VEOMNI_ENGINE_CONFIG"
    echo "Using veomni engine"
    exp_name=gsm8k-${backend}-sp${SP_SIZE}-fsdp${FSDP_SIZE}-pad-${PAD_MODE}-use_remove_padding-${USE_REMOVE_PADDING}-mode-${mode}
else
    ENGINE_CONFIG="$MEGATRON_ENGINE_CONFIG"
    echo "Using megatron engine"
    exp_name=gsm8k-${backend}-tp${TP_SIZE}-pp${PP_SIZE}-vpp${VPP_SIZE}-cp${CP_SIZE}-pad-${PAD_MODE}-use_remove_padding-${USE_REMOVE_PADDING}-mode-${mode}
fi

mkdir -p "${ckpts_home}"

# BUGFIX: the last override previously ended with a dangling "\" followed by a
# blank line, so the command's termination depended on the blank line staying
# blank; the trailing backslash is removed.
$COMMAND \
    data.train_files="${TRAIN_FILES}" \
    data.val_files="${VAL_FILES}" \
    data.train_batch_size=128 \
    data.pad_mode=${PAD_MODE} \
    data.truncation=error \
    data.use_dynamic_bsz=True \
    data.max_token_len_per_gpu=2048 \
    data.messages_key=messages \
    model.path=$MODEL_PATH \
    model.use_remove_padding=${USE_REMOVE_PADDING} \
    ${ENGINE_CONFIG} \
    trainer.test_freq=after_each_epoch \
    trainer.save_freq=-1 \
    trainer.logger=['console','file'] \
    trainer.project_name="${project_name}" \
    trainer.experiment_name="${exp_name}" \
    trainer.total_epochs=2 \
    trainer.total_training_steps=2 \
    trainer.default_local_dir="${ckpts_home}" \
    trainer.resume_mode=${RESUME_MODE}

# trainer.total_training_steps=${TOTAL_TRAIN_STEP} \
# trainer.checkpoint.save_contents=[model,optimizer,extra,hf_model] \
# trainer.max_ckpt_to_keep=1 \

# BUGFIX: glob moved outside the quotes — `rm -rf "${dir}/*"` targets a
# literal '*' path and deletes nothing.
rm -rf "${ckpts_home:?}"/*
code/RL_model/verl/verl_train/tests/special_e2e/sft/test_sft_engine_all.sh ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
# Driver: runs the SFT engine smoke test across backends/parallelism layouts,
# then compares every run's metrics against the single-GPU golden run.
set -xeuo pipefail

rm -rf ~/verl/test/log
mkdir -p ~/verl/test/log

export VERL_FILE_LOGGER_ROOT=~/verl/test/log
VPP_SIZE=${VPP_SIZE:-2}

# test with single gpu as golden
echo "run with single gpu as golden"
BACKEND=fsdp SP_SIZE=1 FSDP_SIZE=1 NUM_GPUS=1 FSDP_STRATEGY=fsdp VERL_FILE_LOGGER_PATH=~/verl/test/log/golden.jsonl bash tests/special_e2e/sft/run_sft_engine.sh

# test with fsdp 1
echo "run with sp2 fsdp_size2 num_gpus8 fsdp_strategy fsdp pad_mode no_padding"
BACKEND=fsdp SP_SIZE=2 FSDP_SIZE=2 NUM_GPUS=8 FSDP_STRATEGY=fsdp PAD_MODE=no_padding bash tests/special_e2e/sft/run_sft_engine.sh

# test with fsdp 1 use_remove_padding and pad_mode no_padding
# BUGFIX: the message below previously claimed sp4/fsdp_size4, contradicting
# the actual SP_SIZE=1 FSDP_SIZE=-1 settings on the command line.
echo "run with sp1 fsdp_size-1 num_gpus8 fsdp_strategy fsdp pad_mode no_padding use_remove_padding False"
BACKEND=fsdp SP_SIZE=1 FSDP_SIZE=-1 NUM_GPUS=8 FSDP_STRATEGY=fsdp PAD_MODE=no_padding USE_REMOVE_PADDING=False bash tests/special_e2e/sft/run_sft_engine.sh


# test with fsdp 2
echo "run with sp2 fsdp_size2 num_gpus8 fsdp_strategy fsdp2"
BACKEND=fsdp SP_SIZE=2 FSDP_SIZE=2 NUM_GPUS=8 FSDP_STRATEGY=fsdp2 bash tests/special_e2e/sft/run_sft_engine.sh

# test with veomni
echo "run with sp2 fsdp_size4 num_gpus8 fsdp_strategy fsdp2"
BACKEND=veomni SP_SIZE=2 FSDP_SIZE=4 NUM_GPUS=8 FSDP_STRATEGY=fsdp2 bash tests/special_e2e/sft/run_sft_engine.sh


# test with megatron
echo "run with tp2 pp2 vpp2 cp2 num_gpus8"
BACKEND=megatron TP_SIZE=2 PP_SIZE=2 VPP_SIZE=${VPP_SIZE} CP_SIZE=2 NUM_GPUS=8 bash tests/special_e2e/sft/run_sft_engine.sh

# test with cp in ray
echo "run with tp2 pp2 vpp2 cp2 num_gpus8 mode=ray"
BACKEND=megatron TP_SIZE=2 PP_SIZE=2 VPP_SIZE=${VPP_SIZE} CP_SIZE=2 NUM_GPUS=8 mode=ray bash tests/special_e2e/sft/run_sft_engine.sh

python3 tests/special_e2e/sft/compare_sft_engine_results.py

rm -rf ~/verl/test/log
code/RL_model/verl/verl_train/tests/special_e2e/sft/test_sp_loss_match.py ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import torch
16
+ import torch.distributed
17
+ from tensordict import TensorDict
18
+ from torch.distributed.device_mesh import init_device_mesh
19
+
20
+ from verl.trainer.fsdp_sft_trainer import FSDPSFTTrainer
21
+ from verl.utils.distributed import initialize_global_process_group
22
+
23
+
24
def test_trainer_forward_consistency(trainer: FSDPSFTTrainer, total_steps: int = 4):
    """Test consistency between original forward pass and SP+rmpad forward passes.

    For each micro batch, the loss is computed twice on the same data: once
    with padding removal disabled and the Ulysses SP size forced to 1 (the
    reference path), and once with remove-padding enabled and the configured
    SP size restored. The two losses are averaged across all ranks and must
    agree within a 1% relative tolerance.

    Args:
        trainer: The FSDPSFTTrainer instance to test
        total_steps: Number of steps to test (default: 4)

    Raises:
        AssertionError: on rank 0, if the rank-averaged losses differ by a
            relative difference of 1e-2 or more.
    """
    if trainer.device_mesh.get_rank() == 0:
        print("\nStarting debug comparison between original and SP+rmpad forward passes...")
        print(f"Sequence parallel size: {trainer.config.ulysses_sequence_parallel_size}")
        print(f"Remove padding: {trainer.use_remove_padding}\n")

    steps_remaining = total_steps

    for epoch in range(1):  # Just one epoch for testing
        trainer.train_sampler.set_epoch(epoch=epoch)
        for data in trainer.train_dataloader:
            # NOTE(review): .cuda() hard-codes a CUDA device — this test is GPU-only.
            data = TensorDict(data, batch_size=trainer.config.data.train_batch_size).cuda()
            trainer.fsdp_model.train()
            micro_batches = data.split(trainer.config.data.micro_batch_size_per_gpu)

            for idx, micro_batch in enumerate(micro_batches):
                if trainer.device_mesh.get_rank() == 0:
                    print(f"\nProcessing micro batch {idx + 1}/{len(micro_batches)}")

                # Compute losses using both methods
                # Disable SP and rmpad: temporarily force SP size to 1 for the
                # reference loss; the original value is restored below.
                trainer.use_remove_padding = False
                old_sp = trainer.config.ulysses_sequence_parallel_size
                trainer.config.ulysses_sequence_parallel_size = 1
                loss_ref = trainer._compute_loss_and_backward(micro_batch.copy(), do_backward=False)

                # Do SP and rmpad on a fresh copy of the same micro batch.
                trainer.config.ulysses_sequence_parallel_size = old_sp
                trainer.use_remove_padding = True
                loss_sp = trainer._compute_loss_and_backward(micro_batch.copy(), do_backward=False)

                # Collect losses across all ranks so the comparison is global
                # rather than per-rank (each rank holds a different data shard).
                loss_ref_all = loss_ref.clone()
                loss_sp_all = loss_sp.clone()
                torch.distributed.all_reduce(loss_ref_all, op=torch.distributed.ReduceOp.AVG)
                torch.distributed.all_reduce(loss_sp_all, op=torch.distributed.ReduceOp.AVG)

                # Calculate relative difference of averaged losses
                rel_diff = torch.abs(loss_ref_all - loss_sp_all) / (torch.abs(loss_ref_all) + 1e-8)

                if trainer.device_mesh.get_rank() == 0:
                    print("\nComparison Results (Averaged across ranks):")
                    print(f"Reference Loss: {loss_ref_all.item():.6f}")
                    print(f"SP+rmpad Loss: {loss_sp_all.item():.6f}")
                    print(f"Relative Difference: {rel_diff.item():.6f}")

                    # Asserted on rank 0 only; other ranks rely on rank 0 failing.
                    assert rel_diff.item() < 1e-2, "Significant difference detected between averaged losses!"
                    print("Loss difference is within the acceptable range.")

                steps_remaining -= 1
                if steps_remaining == 0:
                    break
            if steps_remaining == 0:
                break
        # Redundant with range(1), but kept to make the single-epoch intent explicit.
        break

    if trainer.device_mesh.get_rank() == 0:
        print("\nDebug comparison completed successfully.")
88
+
89
+
90
def create_trainer(config):
    """Build a fully initialized :class:`FSDPSFTTrainer` for the consistency test.

    Initializes the global process group, constructs the flat FSDP device mesh
    and the 2-D (dp, sp) Ulysses mesh, loads the tokenizer, builds the
    train/val datasets, and wires everything into the trainer.

    Args:
        config: Configuration object with training parameters

    Returns:
        FSDPSFTTrainer: Initialized trainer instance
    """
    _local_rank, _rank, world_size = initialize_global_process_group()

    # One flat mesh over every rank for FSDP sharding.
    fsdp_mesh = init_device_mesh(device_type="cuda", mesh_shape=(world_size,), mesh_dim_names=("fsdp",))

    # 2-D mesh for Ulysses sequence parallelism: data-parallel x sequence-parallel.
    sp_size = config.ulysses_sequence_parallel_size
    ulysses_mesh = init_device_mesh(
        device_type="cuda", mesh_shape=(world_size // sp_size, sp_size), mesh_dim_names=("dp", "sp")
    )

    # build tokenizer and datasets first
    from verl.trainer.fsdp_sft_trainer import create_sft_dataset
    from verl.utils import hf_tokenizer
    from verl.utils.fs import copy_to_local

    model_path = copy_to_local(src=config.model.partial_pretrain, verbose=True)
    tokenizer = hf_tokenizer(model_path, trust_remote_code=config.model.trust_remote_code)

    # Build both dataset splits with the same recipe; -1 means "no sample cap".
    split_specs = (
        (config.data.train_files, "train_max_samples"),
        (config.data.val_files, "val_max_samples"),
    )
    train_dataset, val_dataset = (
        create_sft_dataset(files, config.data, tokenizer, max_samples=config.data.get(cap_key, -1))
        for files, cap_key in split_specs
    )

    return FSDPSFTTrainer(
        config=config,
        device_mesh=fsdp_mesh,
        ulysses_device_mesh=ulysses_mesh,
        tokenizer=tokenizer,
        train_dataset=train_dataset,
        val_dataset=val_dataset,
    )
130
+
131
+
132
def main(config):
    """Build a trainer from *config* and run the SP/rmpad consistency check.

    Args:
        config: Configuration object with training parameters
    """
    test_trainer_forward_consistency(create_trainer(config))
140
+
141
+
142
if __name__ == "__main__":
    # Imported lazily so merely importing this module (e.g. by pytest
    # collection) does not require hydra/omegaconf.
    import hydra
    from omegaconf import DictConfig

    # Hydra composes the standard SFT trainer config (path is relative to this
    # file) and injects the result into the wrapped entry point.
    @hydra.main(config_path="../../../verl/trainer/config", config_name="sft_trainer")
    def hydra_entry(cfg: DictConfig) -> None:
        """Forward the composed hydra config to main()."""
        main(cfg)

    hydra_entry()
code/RL_model/verl/verl_train/tests/trainer/config/__init__.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
code/RL_model/verl/verl_train/tests/utils/ckpt/test_checkpoint_cleanup_on_cpu.py ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import os
16
+ import shutil
17
+ import tempfile
18
+
19
+ import pytest
20
+
21
+
22
class TestCheckpointCleanupLogic:
    """Tests for checkpoint cleanup methods in BaseCheckpointManager.

    As pinned by these tests, the cleanup contract is two-phased:
      * ``ensure_checkpoint_capacity(max_ckpt_to_keep)`` runs BEFORE a save and
        must leave room for the new checkpoint without ever deleting the most
        recent existing one;
      * ``register_checkpoint(path, max_ckpt_to_keep)`` runs AFTER a successful
        save and prunes older checkpoints down to the limit.
    ``max_ckpt_to_keep=0`` means unlimited retention.
    """

    @pytest.fixture(autouse=True)
    def setup(self):
        """Set up test fixtures."""
        # Fresh temp dir per test; removed on teardown even if the test failed.
        self.test_dir = tempfile.mkdtemp()
        yield
        shutil.rmtree(self.test_dir, ignore_errors=True)

    @pytest.fixture
    def manager(self, monkeypatch):
        """Create a minimal BaseCheckpointManager for testing."""
        import torch.distributed

        # Fake a single-rank "distributed" run so the manager can be built
        # without an initialized process group.
        monkeypatch.setattr(torch.distributed, "get_rank", lambda: 0)
        monkeypatch.setattr(torch.distributed, "get_world_size", lambda: 1)

        from verl.utils.checkpoint.checkpoint_manager import BaseCheckpointManager

        # Bare stand-ins: the cleanup paths under test never touch model or
        # optimizer state, only the bookkeeping of saved paths on disk.
        class MockModel:
            pass

        class MockOptimizer:
            pass

        return BaseCheckpointManager(
            model=MockModel(),
            optimizer=MockOptimizer(),
            lr_scheduler=None,
            processing_class=None,
            checkpoint_config=None,
        )

    def _create_checkpoint_dir(self, step: int) -> str:
        """Create a mock checkpoint directory for *step* and return its path."""
        path = os.path.join(self.test_dir, f"global_step_{step}")
        os.makedirs(path, exist_ok=True)
        # A single marker file is enough to make the directory non-empty.
        with open(os.path.join(path, "checkpoint.txt"), "w") as f:
            f.write(f"step={step}")
        return path

    def test_max_ckpt_1_preserves_existing_before_save(self, manager):
        """
        Regression test: max_ckpt_to_keep=1 must NOT delete existing checkpoint before save.
        """
        ckpt_100 = self._create_checkpoint_dir(100)
        manager.previous_saved_paths = [ckpt_100]

        manager.ensure_checkpoint_capacity(max_ckpt_to_keep=1)

        # Pre-save cleanup must keep the only existing checkpoint on disk
        # and leave the bookkeeping list untouched.
        assert os.path.exists(ckpt_100), "Bug: checkpoint deleted before save!"
        assert manager.previous_saved_paths == [ckpt_100]

    def test_max_ckpt_1_deletes_old_after_save(self, manager):
        """After save succeeds, old checkpoint should be deleted."""
        ckpt_100 = self._create_checkpoint_dir(100)
        manager.previous_saved_paths = [ckpt_100]

        ckpt_200 = self._create_checkpoint_dir(200)
        manager.register_checkpoint(ckpt_200, max_ckpt_to_keep=1)

        # Post-save registration prunes down to the newest checkpoint only.
        assert not os.path.exists(ckpt_100)
        assert os.path.exists(ckpt_200)
        assert manager.previous_saved_paths == [ckpt_200]

    def test_max_ckpt_2_keeps_one_before_save(self, manager):
        """With max_ckpt_to_keep=2, pre-save cleanup keeps 1 checkpoint."""
        ckpt_100 = self._create_checkpoint_dir(100)
        ckpt_200 = self._create_checkpoint_dir(200)
        manager.previous_saved_paths = [ckpt_100, ckpt_200]

        manager.ensure_checkpoint_capacity(max_ckpt_to_keep=2)

        # Oldest is evicted to make room for the upcoming save; newest survives.
        assert not os.path.exists(ckpt_100)
        assert os.path.exists(ckpt_200)
        assert len(manager.previous_saved_paths) == 1

    def test_max_ckpt_0_keeps_all(self, manager):
        """max_ckpt_to_keep=0 means unlimited - no deletions."""
        ckpt_100 = self._create_checkpoint_dir(100)
        ckpt_200 = self._create_checkpoint_dir(200)
        manager.previous_saved_paths = [ckpt_100, ckpt_200]

        manager.ensure_checkpoint_capacity(max_ckpt_to_keep=0)
        ckpt_300 = self._create_checkpoint_dir(300)
        manager.register_checkpoint(ckpt_300, max_ckpt_to_keep=0)

        # Neither phase deletes anything when retention is unlimited.
        assert os.path.exists(ckpt_100)
        assert os.path.exists(ckpt_200)
        assert os.path.exists(ckpt_300)
        assert len(manager.previous_saved_paths) == 3

    def test_full_save_cycle_max_ckpt_1(self, manager):
        """Simulate multiple save cycles with max_ckpt_to_keep=1."""
        # First save
        manager.ensure_checkpoint_capacity(1)
        ckpt_100 = self._create_checkpoint_dir(100)
        manager.register_checkpoint(ckpt_100, 1)
        assert manager.previous_saved_paths == [ckpt_100]

        # Second save - existing checkpoint must survive pre-save
        manager.ensure_checkpoint_capacity(1)
        assert os.path.exists(ckpt_100), "Bug: checkpoint deleted before save!"

        ckpt_200 = self._create_checkpoint_dir(200)
        manager.register_checkpoint(ckpt_200, 1)
        assert not os.path.exists(ckpt_100)
        assert manager.previous_saved_paths == [ckpt_200]

        # Third save
        manager.ensure_checkpoint_capacity(1)
        assert os.path.exists(ckpt_200), "Bug: checkpoint deleted before save!"

        ckpt_300 = self._create_checkpoint_dir(300)
        manager.register_checkpoint(ckpt_300, 1)
        assert not os.path.exists(ckpt_200)
        assert manager.previous_saved_paths == [ckpt_300]