shahidul034 commited on
Commit
ff8fd11
·
verified ·
1 Parent(s): b7edbea

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. code/RL_model/verl/verl_train/docs/_static/custom.css +217 -0
  2. code/RL_model/verl/verl_train/docs/_static/js/resizable-sidebar.js +251 -0
  3. code/RL_model/verl/verl_train/docs/_static/js/runllm-widget.js +14 -0
  4. code/RL_model/verl/verl_train/docs/_static/logo.png +0 -0
  5. code/RL_model/verl/verl_train/docs/advance/agent_loop.rst +238 -0
  6. code/RL_model/verl/verl_train/docs/advance/async-on-policy-distill.md +242 -0
  7. code/RL_model/verl/verl_train/docs/advance/attention_implementation.rst +119 -0
  8. code/RL_model/verl/verl_train/docs/advance/checkpoint.rst +159 -0
  9. code/RL_model/verl/verl_train/docs/advance/dpo_extension.rst +273 -0
  10. code/RL_model/verl/verl_train/docs/advance/fp8.md +107 -0
  11. code/RL_model/verl/verl_train/docs/advance/fsdp_extension.rst +97 -0
  12. code/RL_model/verl/verl_train/docs/advance/fully_async.md +595 -0
  13. code/RL_model/verl/verl_train/docs/advance/grafana_prometheus.md +193 -0
  14. code/RL_model/verl/verl_train/docs/advance/megatron_extension.rst +20 -0
  15. code/RL_model/verl/verl_train/docs/advance/mtp.md +105 -0
  16. code/RL_model/verl/verl_train/docs/advance/one_step_off.md +319 -0
  17. code/RL_model/verl/verl_train/docs/advance/placement.rst +13 -0
  18. code/RL_model/verl/verl_train/docs/advance/ppo_lora.rst +208 -0
  19. code/RL_model/verl/verl_train/docs/advance/reward_loop.rst +301 -0
  20. code/RL_model/verl/verl_train/docs/advance/rollout_skip.rst +61 -0
  21. code/RL_model/verl/verl_train/docs/advance/rollout_trace.rst +146 -0
  22. code/RL_model/verl/verl_train/docs/advance/rope.rst +39 -0
  23. code/RL_model/verl/verl_train/docs/algo/baseline.md +73 -0
  24. code/RL_model/verl/verl_train/docs/algo/collabllm.md +105 -0
  25. code/RL_model/verl/verl_train/docs/algo/dapo.md +187 -0
  26. code/RL_model/verl/verl_train/docs/algo/entropy.md +115 -0
  27. code/RL_model/verl/verl_train/docs/algo/gpg.md +36 -0
  28. code/RL_model/verl/verl_train/docs/algo/grpo.md +72 -0
  29. code/RL_model/verl/verl_train/docs/algo/opo.md +33 -0
  30. code/RL_model/verl/verl_train/docs/algo/otb.md +104 -0
  31. code/RL_model/verl/verl_train/docs/algo/ppo.md +105 -0
  32. code/RL_model/verl/verl_train/docs/algo/rollout_corr.md +1313 -0
  33. code/RL_model/verl/verl_train/docs/algo/rollout_corr_math.md +954 -0
  34. code/RL_model/verl/verl_train/docs/algo/spin.md +179 -0
  35. code/RL_model/verl/verl_train/docs/algo/sppo.md +52 -0
  36. code/RL_model/verl/verl_train/docs/amd_tutorial/amd_build_dockerfile_page.rst +796 -0
  37. code/RL_model/verl/verl_train/docs/amd_tutorial/amd_vllm_page.rst +41 -0
  38. code/RL_model/verl/verl_train/docs/api/data.rst +61 -0
  39. code/RL_model/verl/verl_train/docs/api/single_controller.rst +30 -0
  40. code/RL_model/verl/verl_train/docs/api/trainer.rst +31 -0
  41. code/RL_model/verl/verl_train/docs/api/utils.rst +76 -0
  42. code/RL_model/verl/verl_train/docs/ascend_tutorial/ascend_consistency.rst +50 -0
  43. code/RL_model/verl/verl_train/docs/ascend_tutorial/ascend_profiling_en.rst +403 -0
  44. code/RL_model/verl/verl_train/docs/ascend_tutorial/ascend_profiling_zh.rst +398 -0
  45. code/RL_model/verl/verl_train/docs/ascend_tutorial/ascend_quick_start.rst +289 -0
  46. code/RL_model/verl/verl_train/docs/ascend_tutorial/ascend_sglang_quick_start.rst +153 -0
  47. code/RL_model/verl/verl_train/docs/ascend_tutorial/dockerfile_build_guidance.rst +82 -0
  48. code/RL_model/verl/verl_train/docs/ascend_tutorial/examples/ascend_sglang_best_practices.rst +296 -0
  49. code/RL_model/verl/verl_train/docs/ascend_tutorial/examples/dapo_multi_model_optimization_practice.md +324 -0
  50. code/RL_model/verl/verl_train/docs/ascend_tutorial/examples/gspo_optimization_practice.md +233 -0
code/RL_model/verl/verl_train/docs/_static/custom.css ADDED
@@ -0,0 +1,217 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* Make the documentation use full screen width */
2
+ .wy-nav-content {
3
+ max-width: none !important;
4
+ width: 100% !important;
5
+ padding: 1.618em 3.236em !important;
6
+ }
7
+
8
+ /* Adjust the content wrapper - will be set by JavaScript */
9
+ .wy-nav-content-wrap {
10
+ margin-left: 300px;
11
+ transition: margin-left 0.2s ease;
12
+ width: auto !important;
13
+ position: relative !important;
14
+ background: white !important;
15
+ min-height: 100vh !important;
16
+ }
17
+
18
+ /* Make the main content area responsive */
19
+ .rst-content {
20
+ max-width: none !important;
21
+ width: 100% !important;
22
+ }
23
+
24
+ /* Optional: Adjust table widths to prevent overflow */
25
+ .rst-content table.docutils {
26
+ width: 100% !important;
27
+ table-layout: auto !important;
28
+ }
29
+
30
+ /* Optional: Better code block width handling */
31
+ .rst-content .highlight {
32
+ width: 100% !important;
33
+ }
34
+
35
+ /* Content area positioning already handled above */
36
+
37
+ /* Optional: Improve readability with some margin on very wide screens */
38
+ @media (min-width: 1400px) {
39
+ .wy-nav-content {
40
+ max-width: none !important;
41
+ margin: 0 auto !important;
42
+ }
43
+ }
44
+
45
+ /* Resizable sidebar styles */
46
+ .wy-nav-side {
47
+ position: fixed !important;
48
+ top: 0 !important;
49
+ bottom: 0 !important;
50
+ left: 0 !important;
51
+ width: 300px;
52
+ min-width: 200px;
53
+ max-width: 600px;
54
+ display: flex;
55
+ flex-direction: column;
56
+ z-index: 200 !important;
57
+ }
58
+
59
+ /* Ensure sidebar header (logo, search) adapts to width */
60
+ .wy-side-nav-search {
61
+ width: 100% !important;
62
+ box-sizing: border-box !important;
63
+ padding: 0.809em 0.809em !important;
64
+ }
65
+
66
+ .wy-side-nav-search input[type="text"] {
67
+ width: 100% !important;
68
+ box-sizing: border-box !important;
69
+ }
70
+
71
+ /* Make logo/title area responsive */
72
+ .wy-side-nav-search > div.version {
73
+ width: 100% !important;
74
+ }
75
+
76
+ .wy-side-nav-search > a {
77
+ width: 100% !important;
78
+ display: block !important;
79
+ white-space: nowrap !important;
80
+ overflow: hidden !important;
81
+ text-overflow: ellipsis !important;
82
+ }
83
+
84
+ /* Responsive adjustments for narrow sidebar */
85
+ @media (max-width: 300px) {
86
+ .wy-side-nav-search > a {
87
+ font-size: 0.9em !important;
88
+ }
89
+
90
+ .wy-side-nav-search input[type="text"] {
91
+ font-size: 0.8em !important;
92
+ }
93
+ }
94
+
95
+ /* Ensure search input doesn't overflow */
96
+ .wy-side-nav-search form {
97
+ width: 100% !important;
98
+ margin: 0 !important;
99
+ }
100
+
101
+ /* Make search icon responsive */
102
+ .wy-side-nav-search .wy-dropdown {
103
+ width: 100% !important;
104
+ }
105
+
106
+ /* Adjust search results dropdown width */
107
+ .wy-side-nav-search .wy-dropdown-menu {
108
+ width: 100% !important;
109
+ max-width: none !important;
110
+ left: 0 !important;
111
+ right: 0 !important;
112
+ }
113
+
114
+ /* Resize handle is created by JavaScript */
115
+
116
+ /* Make sure the sidebar content doesn't overflow */
117
+ .wy-side-scroll {
118
+ width: 100% !important;
119
+ flex: 1 !important;
120
+ overflow-y: auto !important;
121
+ overflow-x: hidden !important;
122
+ padding-right: 10px !important;
123
+ box-sizing: border-box !important;
124
+ scroll-behavior: auto !important; /* Prevent smooth scrolling on sidebar itself */
125
+ }
126
+
127
+ /* Ensure proper scroll behavior for main content area */
128
+ html {
129
+ scroll-behavior: smooth !important;
130
+ }
131
+
132
+ /* Ensure anchor links work properly in main content */
133
+ .wy-nav-content-wrap {
134
+ scroll-behavior: smooth !important;
135
+ }
136
+
137
+ /* Fix scroll to target for anchor links */
138
+ .rst-content {
139
+ scroll-behavior: smooth !important;
140
+ }
141
+
142
+ /* Fix anchor scroll offset to account for fixed header */
143
+ .rst-content .section {
144
+ scroll-margin-top: 60px;
145
+ }
146
+
147
+ /* Fix anchor scroll offset for headers */
148
+ .rst-content h1, .rst-content h2, .rst-content h3, .rst-content h4, .rst-content h5, .rst-content h6 {
149
+ scroll-margin-top: 60px;
150
+ }
151
+
152
+ /* Fix anchor scroll offset for specific scroll targets */
153
+ .rst-content .headerlink {
154
+ scroll-margin-top: 60px;
155
+ }
156
+
157
+ /* Fix sidebar navigation styling */
158
+ .wy-menu-vertical {
159
+ width: 100% !important;
160
+ }
161
+
162
+ .wy-menu-vertical li {
163
+ width: 100% !important;
164
+ }
165
+
166
+ .wy-menu-vertical a {
167
+ width: 100% !important;
168
+ word-wrap: break-word !important;
169
+ white-space: normal !important;
170
+ }
171
+
172
+ /* Content area margin is handled by JavaScript */
173
+
174
+ /* Custom drag handle (more visible) */
175
+ .resize-handle {
176
+ position: absolute;
177
+ top: 0;
178
+ right: 0;
179
+ width: 8px;
180
+ height: 100%;
181
+ background: #ccc;
182
+ cursor: col-resize;
183
+ z-index: 1001;
184
+ opacity: 0.3;
185
+ transition: opacity 0.2s ease;
186
+ }
187
+
188
+ .resize-handle:hover {
189
+ opacity: 0.8;
190
+ background: #999;
191
+ }
192
+
193
+ .resize-handle::before {
194
+ content: '';
195
+ position: absolute;
196
+ top: 50%;
197
+ left: 50%;
198
+ width: 2px;
199
+ height: 20px;
200
+ background: #666;
201
+ transform: translate(-50%, -50%);
202
+ border-radius: 1px;
203
+ }
204
+
205
+ .resize-handle:hover::before {
206
+ background: #333;
207
+ }
208
+
209
+ /* Ensure smooth resizing */
210
+ .wy-nav-side.resizing {
211
+ user-select: none;
212
+ pointer-events: none;
213
+ }
214
+
215
+ .wy-nav-side.resizing .wy-side-scroll {
216
+ overflow: hidden;
217
+ }
code/RL_model/verl/verl_train/docs/_static/js/resizable-sidebar.js ADDED
@@ -0,0 +1,251 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Resizable sidebar functionality
2
+ document.addEventListener('DOMContentLoaded', function() {
3
+ const sidebar = document.querySelector('.wy-nav-side');
4
+ const content = document.querySelector('.wy-nav-content-wrap');
5
+
6
+ if (!sidebar || !content) return;
7
+
8
+ // Create resize handle
9
+ const resizeHandle = document.createElement('div');
10
+ resizeHandle.className = 'resize-handle';
11
+ sidebar.appendChild(resizeHandle);
12
+
13
+ let isResizing = false;
14
+ let startX = 0;
15
+ let startWidth = 0;
16
+
17
+ // Get initial width
18
+ const getInitialWidth = () => {
19
+ return 300; // Default width
20
+ };
21
+
22
+ // Save width to localStorage
23
+ const saveWidth = (width) => {
24
+ localStorage.setItem('sidebar-width', width);
25
+ };
26
+
27
+ // Load width from localStorage
28
+ const loadWidth = () => {
29
+ const savedWidth = localStorage.getItem('sidebar-width');
30
+ if (savedWidth) {
31
+ const width = parseInt(savedWidth, 10);
32
+ if (width >= 200 && width <= 600) {
33
+ return width;
34
+ }
35
+ }
36
+ return getInitialWidth();
37
+ };
38
+
39
+ // Apply width to sidebar and content
40
+ const applyWidth = (width) => {
41
+ // Update sidebar width
42
+ sidebar.style.width = width + 'px';
43
+
44
+ // Update content margin with !important to override any CSS
45
+ content.style.setProperty('margin-left', width + 'px', 'important');
46
+
47
+ // Also update any other content wrapper that might exist
48
+ const contentInner = document.querySelector('.wy-nav-content');
49
+ if (contentInner) {
50
+ contentInner.style.setProperty('margin-left', '0px', 'important');
51
+ }
52
+
53
+ // Force reflow and repaint
54
+ sidebar.offsetHeight;
55
+ content.offsetHeight;
56
+
57
+ // Trigger window resize event to notify other components
58
+ window.dispatchEvent(new Event('resize'));
59
+ };
60
+
61
+ // Initialize with saved width
62
+ const initialWidth = loadWidth();
63
+ applyWidth(initialWidth);
64
+
65
+ // Mouse down on resize handle
66
+ resizeHandle.addEventListener('mousedown', (e) => {
67
+ isResizing = true;
68
+ startX = e.clientX;
69
+ startWidth = parseInt(window.getComputedStyle(sidebar).width, 10);
70
+
71
+ sidebar.classList.add('resizing');
72
+ document.body.style.cursor = 'col-resize';
73
+ document.body.style.userSelect = 'none';
74
+
75
+ // Add overlay to prevent iframe issues
76
+ const overlay = document.createElement('div');
77
+ overlay.style.cssText = `
78
+ position: fixed;
79
+ top: 0;
80
+ left: 0;
81
+ width: 100%;
82
+ height: 100%;
83
+ z-index: 9999;
84
+ cursor: col-resize;
85
+ `;
86
+ overlay.id = 'resize-overlay';
87
+ document.body.appendChild(overlay);
88
+
89
+ e.preventDefault();
90
+ });
91
+
92
+ // Mouse move
93
+ document.addEventListener('mousemove', (e) => {
94
+ if (!isResizing) return;
95
+
96
+ const width = startWidth + e.clientX - startX;
97
+ const clampedWidth = Math.max(200, Math.min(600, width));
98
+ applyWidth(clampedWidth);
99
+ });
100
+
101
+ // Mouse up
102
+ document.addEventListener('mouseup', () => {
103
+ if (!isResizing) return;
104
+
105
+ isResizing = false;
106
+ sidebar.classList.remove('resizing');
107
+ document.body.style.cursor = '';
108
+ document.body.style.userSelect = '';
109
+
110
+ // Remove overlay
111
+ const overlay = document.getElementById('resize-overlay');
112
+ if (overlay) {
113
+ overlay.remove();
114
+ }
115
+
116
+ // Save the current width
117
+ const currentWidth = parseInt(window.getComputedStyle(sidebar).width, 10);
118
+ saveWidth(currentWidth);
119
+ });
120
+
121
+ // Handle window resize - removed to prevent infinite loop
122
+ // The sidebar width is fixed and managed by drag functionality, no need to recalculate on window resize
123
+
124
+ // Double-click to reset to default width
125
+ resizeHandle.addEventListener('dblclick', () => {
126
+ const defaultWidth = 300;
127
+ applyWidth(defaultWidth);
128
+ saveWidth(defaultWidth);
129
+ });
130
+ });
131
+
132
+ // Fix navigation issues - Using MutationObserver for reliable initialization
133
+ document.addEventListener('DOMContentLoaded', function() {
134
+ let navigationFixed = false;
135
+
136
+ function setupNavigationFix() {
137
+ if (navigationFixed) return;
138
+
139
+ // Find all links in the sidebar
140
+ const sidebarLinks = document.querySelectorAll('.wy-menu-vertical a');
141
+
142
+ // Only proceed if we have sidebar links
143
+ if (sidebarLinks.length === 0) return;
144
+
145
+ console.log('Setting up navigation fix...');
146
+
147
+ sidebarLinks.forEach(function(link) {
148
+ const href = link.getAttribute('href');
149
+
150
+ // Clone the link to remove all existing event listeners
151
+ const newLink = link.cloneNode(true);
152
+
153
+ // Add our own click handler
154
+ newLink.addEventListener('click', function(e) {
155
+ console.log('Link clicked:', href);
156
+
157
+ // If it's an anchor link within the same page
158
+ if (href && href.startsWith('#') && href !== '#') {
159
+ e.preventDefault();
160
+ e.stopPropagation();
161
+
162
+ const targetId = href.substring(1);
163
+ const targetElement = document.getElementById(targetId);
164
+
165
+ if (targetElement) {
166
+ // Calculate offset for fixed header
167
+ const headerHeight = 60;
168
+ const elementPosition = targetElement.getBoundingClientRect().top;
169
+ const offsetPosition = elementPosition + window.pageYOffset - headerHeight;
170
+
171
+ window.scrollTo({
172
+ top: offsetPosition,
173
+ behavior: 'smooth'
174
+ });
175
+
176
+ // Update URL hash
177
+ if (history.pushState) {
178
+ history.pushState(null, null, '#' + targetId);
179
+ } else {
180
+ location.hash = '#' + targetId;
181
+ }
182
+ }
183
+ }
184
+ // For external links, navigate normally
185
+ else if (href && !href.startsWith('#') && !href.startsWith('javascript:')) {
186
+ console.log('Navigating to external link:', href);
187
+ window.location.href = href;
188
+ }
189
+ });
190
+
191
+ // Replace the old link with the new one
192
+ link.parentNode.replaceChild(newLink, link);
193
+ });
194
+
195
+ navigationFixed = true;
196
+
197
+ // Handle initial page load with hash
198
+ if (window.location.hash) {
199
+ // Use requestAnimationFrame for better timing
200
+ requestAnimationFrame(() => {
201
+ const targetId = window.location.hash.substring(1);
202
+ const targetElement = document.getElementById(targetId);
203
+ if (targetElement) {
204
+ const headerHeight = 60;
205
+ const elementPosition = targetElement.getBoundingClientRect().top;
206
+ const offsetPosition = elementPosition + window.pageYOffset - headerHeight;
207
+
208
+ window.scrollTo({
209
+ top: offsetPosition,
210
+ behavior: 'smooth'
211
+ });
212
+ }
213
+ });
214
+ }
215
+ }
216
+
217
+ // Try to set up navigation fix immediately
218
+ setupNavigationFix();
219
+
220
+ // If it didn't work, use MutationObserver to watch for when sidebar links are added
221
+ if (!navigationFixed) {
222
+ const observer = new MutationObserver(function(mutations) {
223
+ mutations.forEach(function(mutation) {
224
+ if (mutation.type === 'childList' && mutation.addedNodes.length > 0) {
225
+ // Check if sidebar links were added
226
+ const sidebarLinks = document.querySelectorAll('.wy-menu-vertical a');
227
+ if (sidebarLinks.length > 0) {
228
+ setupNavigationFix();
229
+ if (navigationFixed) {
230
+ observer.disconnect();
231
+ }
232
+ }
233
+ }
234
+ });
235
+ });
236
+
237
+ // Start observing the document for changes
238
+ observer.observe(document.body, {
239
+ childList: true,
240
+ subtree: true
241
+ });
242
+
243
+ // Fallback timeout in case MutationObserver doesn't work
244
+ setTimeout(function() {
245
+ if (!navigationFixed) {
246
+ setupNavigationFix();
247
+ }
248
+ observer.disconnect();
249
+ }, 5000);
250
+ }
251
+ });
code/RL_model/verl/verl_train/docs/_static/js/runllm-widget.js ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ document.addEventListener("DOMContentLoaded", function () {
2
+ var script = document.createElement("script");
3
+ script.type = "module";
4
+ script.id = "runllm-widget-script";
5
+ script.src = "https://widget.runllm.com";
6
+ script.setAttribute("version", "stable");
7
+ script.setAttribute("crossorigin", "true");
8
+ script.setAttribute("runllm-keyboard-shortcut", "Mod+j");
9
+ script.setAttribute("runllm-name", "verl Chatbot");
10
+ script.setAttribute("runllm-position", "TOP_RIGHT");
11
+ script.setAttribute("runllm-assistant-id", "679");
12
+ script.async = true;
13
+ document.head.appendChild(script);
14
+ });
code/RL_model/verl/verl_train/docs/_static/logo.png ADDED
code/RL_model/verl/verl_train/docs/advance/agent_loop.rst ADDED
@@ -0,0 +1,238 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Agent Loop
2
+ ==========
3
+
4
+ Last updated: 07/17/2025.
5
+
6
+ .. versionadded:: 0.4.2
7
+ [status: alpha]
8
+
9
+ .. warning::
10
+ Agent Loop is ready for use, but the API may change in future releases.
11
+
12
+ Agent Loop is designed as general interface for multi-turn rollout and agentic reinforcement learning.
13
+
14
+ **Design goal**:
15
+
16
+ - Pluggable user-defined agent loop
17
+ - Provide a standard request/generate API across different inference frameworks
18
+ - Provide request level load balance between multiple inference servers
19
+
20
+ **Non-goal**:
21
+
22
+ - How tool is defined and how to call tool
23
+
24
+ In a high-level overview, the agent loop is given a prompt and runs a user-defined loop: calling the LLM generate API, calling tools, ...
26
+ and returning the final output. A reward is then computed for the final output, which is used as a trajectory for RL training.
26
+
27
+ .. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/agent_loop_overview.svg?raw=true
28
+
29
+
30
+ API Design
31
+ ----------
32
+
33
+ The ``AgentLoopBase`` class is the abstraction of the agent loop, and its ``run`` method is the only interface that users need to implement.
34
+ The run method, given prompt messages in the format [{"role": "user", "content": "..."}], and additional sampling params,
35
+ could do whatever user wants, such as
36
+
37
+ - call LLM generate api
38
+ - call tools: web search, database query, code sandbox, ...
39
+ - environment interaction
40
+ - reflection
41
+ - ...
42
+
43
+ .. code:: python
44
+
45
+ class AgentLoopBase(ABC):
46
+ @abstractmethod
47
+ async def run(self, sampling_params: dict[str, Any], **kwargs) -> AgentLoopOutput:
48
+ """Run agent loop to interact with LLM server and environment.
49
+
50
+ Args:
51
+ sampling_params (Dict[str, Any]): LLM sampling params.
52
+ **kwargs: dataset fields from `verl.utils.dataset.RLHFDataset`.
53
+
54
+ Returns:
55
+ AgentLoopOutput: Agent loop output.
56
+ """
57
+ raise NotImplementedError
58
+
59
+ After running user defined loop, run method should return ``AgentLoopOutput``, including prompt token ids,
60
+ response token ids, and response mask.
61
+
62
+ .. code:: python
63
+
64
+ class AgentLoopOutput(BaseModel):
65
+ """Agent loop output."""
66
+
67
+ prompt_ids: list[int]
68
+ """Prompt token ids."""
69
+ response_ids: list[int]
70
+ """Response token ids including LLM generated token, tool response token."""
71
+ response_mask: list[int]
72
+ """Response mask, 1 for LLM generated token, 0 for tool response token."""
73
+
74
+ .. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/agent_loop_output.svg?raw=true
75
+
76
+ .. note:: AgentLoopOutput only output one trajectory for a given prompt, multiple trajectories output is still under discussion.
77
+
78
+ Architecture Design
79
+ -------------------
80
+
81
+ .. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/agent_loop_architecture.png?raw=true
82
+
83
+ A single PPO step contains two phases: rollout and train. In the rollout phase:
84
+
85
+ 1. PPOTrainer samples a batch from the dataset and calls ``AgentLoopManager.generate_sequences``.
86
+ 2. AgentLoopManager ``wake_up`` all async LLM server instances, which will sync weights between inference engine(vLLM/SGLang) and training engine(FSDP/Megatron-LM).
87
+ 3. AgentLoopManager splits the batch into chunks and sends each chunk to an ``AgentLoopWorker``.
88
+ 4. AgentLoopWorker receives a chunk and, for each prompt, spawns a user-defined ``AgentLoopBase`` instance, running its ``run`` coroutine to completion to obtain an ``AgentLoopOutput``.
89
+
90
+ .. tip::
91
+ AgentLoopWorker schedules multiple coroutines concurrently. If the number of AgentLoopWorkers equals the batch_size, then each worker is responsible for one prompt.
92
+
93
+ In the agent loop, when the user needs the LLM to generate a response:
94
+
95
+ 5. Call ``AsyncLLMServerManager.generate`` with prompt_ids.
96
+ 6. AsyncLLMServerManager selects the server instance with the fewest requests in the first turn and sends the request to it. (In subsequent turns, the request is sent to the same server instance.)
97
+ 7. AsyncLLMServer receives a request, issues ipc/rpc to the model_runner, and generates a response. (There are slight differences between vLLM and SGLang; see below.)
98
+
99
+ When all prompts in all AgentLoopWorker finish, AgentLoopManager gather results and return to PPOTrainer.
100
+
101
+ 8. AgentLoopManager ``sleep`` all server instances, which will free kv cache and offload weights to CPU memory.
102
+
103
+ AsyncLLMServer
104
+ ~~~~~~~~~~~~~~
105
+
106
+ AsyncLLMServer is the abstraction of LLM server with two types of generation api:
107
+
108
+ - `OpenAI chat completion <https://platform.openai.com/docs/api-reference/chat>`_: generate response for the given chat conversation.
109
+ - Token in token out: generate response ids for the given token ids.
110
+
111
+ We have officially supported vLLM and SGLang AsyncLLMServer, both of them implement the two api and are well tested.
112
+ Other inference engines should be easy to plug in by implementing the ``AsyncServerBase`` class.
113
+
114
+ .. code:: python
115
+
116
+ class AsyncServerBase(ABC):
117
+ @abstractmethod
118
+ async def chat_completion(self, raw_request: Request) -> JSONResponse:
119
+ """OpenAI chat completion API.
120
+
121
+ Args:
122
+ raw_request (Request): raw json request
123
+
124
+ Returns:
125
+ JSONResponse: json response
126
+
127
+ API reference: https://platform.openai.com/docs/api-reference/chat/create
128
+ """
129
+ raise NotImplementedError
130
+
131
+ @abstractmethod
132
+ async def generate(self, prompt_ids: list[int], sampling_params: dict[str, Any], request_id: str) -> list[int]:
133
+ """Generate response ids given prompt ids.
134
+
135
+ Args:
136
+ prompt_ids (List[int]): prompt ids
137
+ sampling_params (Dict[str, Any]): sampling params
138
+ request_id (str): request id
139
+
140
+ Returns:
141
+ List[int]: response ids
142
+ """
143
+ raise NotImplementedError
144
+
145
+
146
+ Chat completion vs Token in token out
147
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
148
+
149
+ .. warning::
150
+ The following conclusion is based on our recent experience and is still open to investigation and discussion.
151
+
152
+ Almost all agent frameworks (LangGraph, CrewAI, LlamaIndex, etc) call LLM with OpenAI chat completion api, and
153
+ keep the chat history as messages. So users may expect that we should use the chat completion API for multi-turn rollout.
154
+
155
+ But based on our recent experience on single-turn training on DAPO and multi-turn training on `retool <https://github.com/volcengine/verl-recipe/tree/main/retool>`_,
156
+ we found that the token_ids obtained by applying the chat template to the final messages may not equal the token_ids obtained by concatenating the prompt_ids and response_ids of each turn.
157
+
158
+ .. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/multi_turn.png?raw=true
159
+
160
+ **Where does this inconsistency happen?**
161
+
162
+ First, the tool parser may alter the content. For example
163
+
164
+ .. code:: json
165
+
166
+ {"role": "assistant", "content": "Let me call a <tool_call>...</tool_call> and get the result"}
167
+
168
+ After tool_calls extraction, the message looks like this:
169
+
170
+ .. code:: json
171
+
172
+ {"role": "assistant", "content": "Let me call a and get the result", "tool_calls": [{"name": "foo", "arguments": "{}"}]}
173
+
174
+ Encoding the extracted message back does not reproduce the original LLM-generated response_ids.
175
+
176
+ Second, the `decode-encode` may also lead to inconsistency: `Agent-R1 issue#30 <https://github.com/0russwest0/Agent-R1/issues/30#issuecomment-2826155367>`_.
177
+
178
+ **What is the impact of this inconsistency?**
179
+
180
+ This inconsistency is not a big problem for serving/agent systems, but it is critical for RL training.
181
+ It causes the trajectory to deviate from the policy model's distribution. We have observed that applying apply_chat_template
182
+ to the final chat history messages makes PPO training fail to converge even in the single-turn setting.
183
+
184
+ vLLM
185
+ ^^^^
186
+
187
+ .. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/async_vllm.png?raw=true
188
+
189
+ For vLLM, the Async LLM Engine runs in the same process as the server, and the ModelRunner runs in the same process as the FSDP/Megatron-LM workers.
191
+ The Async LLM Engine communicates with the ModelRunner through ZeroMQ. When the server receives a request, it directly calls the engine to generate response_ids.
191
+
192
+ SGLang
193
+ ^^^^^^
194
+
195
+ .. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/async_sglang.png?raw=true
196
+
197
+ For SGLang, the Async LLM Engine runs in the same process as FSDP/Megatron-LM worker-0, and it spawns multiple subprocesses as ModelRunners.
198
+ Also, the Async LLM Engine communicates with the ModelRunner through ZeroMQ. When the server receives a request, it makes a remote call to worker-0 and gets the response_ids.
199
+
200
+ AsyncLLMServerManager
201
+ ~~~~~~~~~~~~~~~~~~~~~
202
+
203
+ AsyncLLMServerManager serve as proxy to multiple AsyncLLMServer instances, provides:
204
+
205
+ - load balance: selects the server instance with the fewest requests in the first turn and sends the request to it.
206
+ - sticky session: binds the request_id to a server instance, so that the same request_id is sent to the same server instance in subsequent turns.
207
+
208
+ AsyncLLMServerManager is passed to ``AgentLoopBase.__init__``; whenever users want to interact with the LLM in the agent loop,
209
+ they can call ``AsyncLLMServerManager.generate`` to generate response_ids.
210
+
211
+ .. code:: python
212
+
213
+ class AsyncLLMServerManager:
214
+ async def generate(
215
+ self,
216
+ request_id,
217
+ *,
218
+ prompt_ids: list[int],
219
+ sampling_params: dict[str, Any],
220
+ ) -> list[int]:
221
+ """Generate tokens from prompt ids.
222
+
223
+ Args:
224
+ request_id (str): request id for sticky session.
225
+ prompt_ids (List[int]): List of prompt token ids.
226
+ sampling_params (Dict[str, Any]): Sampling parameters for the chat completion.
227
+
228
+ Returns:
229
+ List[int]: List of generated token ids.
230
+ """
231
+ ...
232
+
233
+ Next
234
+ ----
235
+
236
+ - :doc:`Agentic RL Training<../start/agentic_rl>`: Quick start agentic RL training with gsm8k dataset.
237
+ - `LangGraph MathExpression <https://github.com/volcengine/verl-recipe/tree/main/langgraph_agent/example>`_: Demonstrate how to use LangGraph to build agent loop.
238
+ - `Retool <https://github.com/volcengine/verl-recipe/tree/main/retool>`_: End-to-end retool paper reproduction using tool agent.
code/RL_model/verl/verl_train/docs/advance/async-on-policy-distill.md ADDED
@@ -0,0 +1,242 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Recipe: Async On-Policy Knowledge Distillation Trainer
2
+
3
+ **Authors:** Brilliant Hanabi, furunding
4
+
5
+ **Last updated:** 2025-11-08
6
+
7
+ ## 1. Background
8
+
9
+ On-policy knowledge distillation (KD) trains a student policy to imitate a stronger teacher using samples drawn from the student's current policy. For each on-policy rollout the teacher returns soft, top-k token distributions and the student is optimized with a token-wise sparse KL objective that focuses learning on the teacher's high-probability modes. Because training examples come from the student's own state distribution, KD reduces distributional mismatch relative to off-policy distillation or supervised fine-tuning (SFT), improving stability and sample efficiency. Compared with reinforcement learning, KD avoids high-variance reward-based optimization and complex reward design by providing dense, informative per-token targets, which typically yields faster convergence and simpler scaling. Recent empirical and implementation-focused writeups (e.g., [ThinkingMachines' blog on on-policy distillation](https://thinkingmachines.ai/blog/on-policy-distillation/)) also demonstrate that on-policy distillation can deliver high-quality behavior with substantially lower compute and data requirements than many alternative approaches.
10
+
11
+ Built on verl’s Ray-based single-controller components, we initially assembled a strictly on-policy KD pipeline where rollout generation, teacher knowledge acquisition, and policy optimization ran in lockstep. In practice, this synchronous design proved highly inefficient: the three stages had to wait for one another, creating pipeline bubbles and underutilized GPUs. To address this, we extend the asynchronous schedulers introduced by the One-Step-Off Policy pipeline to overlap these phases. This overlap preserves the same distillation objective while trading some strict on-policy guarantees for substantial gains in end-to-end throughput and hardware utilization.
12
+
13
+ ## 2. Distillation Overview and Objective
14
+
15
+ This recipe centers on on-policy knowledge distillation: the student policy learns from a stronger teacher on samples generated by the current policy (on-policy). For each input prompt, the student (actor) generates responses; the teacher provides top-k token distributions, and the student is trained to match them token-wise.
16
+
17
+ Core components:
18
+
19
+ 1. Teacher signal: top-k log-probabilities and token indices per valid token position.
20
+ 2. Student objective: sparse, token-level KL divergence between student logits and teacher top-k distribution.
21
+
22
+ Objective: encourage student probabilities $Q$ to cover teacher modes $P$ using token-wise $\mathrm{KL}(P\,\|\,Q)$ computed on the teacher's top-k support.
23
+
24
+ ## 3. Efficient System Design
25
+
26
+ ### 3.1 Schedulers (One-Step / Two-Step Off-Policy)
27
+
28
+ The native (serial) on-policy distillation process is shown in the figure below.
29
+
30
+ ![Zero-Step-Off Scheduler](https://raw.githubusercontent.com/eric-haibin-lin/verl-community/refs/heads/main/docs/zero-step-off-distill.png)
31
+
32
+ This recipe supports optional schedulers that overlap generation, teacher querying, and updates to improve throughput without changing the distillation objective.
33
+
34
+ #### 3.1.1 One-Step-Off-Policy
35
+
36
+ ![One-Step-Off Scheduler](https://raw.githubusercontent.com/eric-haibin-lin/verl-community/refs/heads/main/docs/one-step-off-distill.png)
37
+
38
+ - Warm-up: 2 steps.
39
+ - Overlap pattern: rollout while actor update; weight sync while teacher retrieving.
40
+ - Timing keys: `sync_rollout_weights`, `wait_prev_gen`, `wait_prev_teacher`.
41
+
42
+ #### 3.1.2 Two-Step-Off-Policy
43
+
44
+ ![Two-Step-Off Scheduler](https://raw.githubusercontent.com/eric-haibin-lin/verl-community/refs/heads/main/docs/two-step-off-distill.png)
45
+
46
+ - Warm-up: 3 steps.
47
+ - Overlap pattern: rollout, actor update while teacher retrieving; interleave weight sync.
48
+ - Timing keys: `sync_rollout_weights`, `max(wait_prev_gen, wait_prev_prev_teacher)`.
49
+
50
+ Tip: Use `two_step_off` when teacher takes much more time than sync; `one_step_off` for simpler overlapping.
51
+
52
+ Practical details:
53
+
54
+ - Inputs per batch: `teacher_topk_logps`, `teacher_topk_indices`, `attention_mask` (to select valid token positions).
55
+ - Loss injection: last pipeline stage computes KL via a logits processor; earlier stages remain unchanged.
56
+ - Optional dynamic micro-batching groups sequences by density to reduce padding overhead.
57
+
58
+ The pipeline:
59
+
60
+ 1. Actor parameters are synchronized to a rollout worker group (NCCL broadcast) with a small amount of latency.
61
+ 2. Rollout workers (vLLM-backed) generate sequences asynchronously (`async_generate_sequences`).
62
+ 3. Teacher client service (ZeroMQ based) returns top-k log-probabilities + token indices for each sequence (batched micro-requests), enabling KL-based guidance.
63
+ 4. Megatron actor performs a KL divergence computation between student logits and teacher top-k distributions (custom TP-aware kernel in `megatron_kl_loss.py`).
64
+ 5. Scheduling strategies (`one_step_off_scheduler`, `two_step_off_scheduler`) can overlap phases (optional for throughput):
65
+
66
+ ### 3.2 Weights sync between actor and rollout
67
+
68
+ We initially followed the weight synchronization path from the One-Step-Off-Policy recipe (Ray collective broadcast across all actor and rollout ranks, plus Megatron-side allgather of parameter shards). In practice this became the dominant bottleneck, so we made three changes:
69
+
70
+ 1. Batch-and-bulk load on the rollout side: instead of streaming tensors one-by-one (in one-step-off-policy recipe), we stage a bundle of parameter tensors and issue a single batched load into the rollout engine. In our setup this reduced the weight-loading time by roughly 3×.
71
+ 2. Batch-and-bulk broadcast between the actor and rollout: instead of streaming tensors one-by-one (in one-step-off-policy recipe), we stage a bundle of parameter tensors and issue a single batched broadcast between the actor and rollout workers.
72
+ 3. Replace allgather with gather-to-root in Megatron: parameter shards are gathered to actor rank 0 (rather than allgathered to everyone), and that root then serves as the single source for broadcasting to rollout ranks. On top of the previous change, changes 2 and 3 delivered an additional ~4× speedup in the synchronization phase.
73
+
74
+ ## 4. High-Level Data & Control Flow
75
+
76
+ ```
77
+ Driver (TaskRunner)
78
+ ├─ Initialize Ray, tokenizer, datasets, worker groups
79
+ ├─ Build ResourcePoolManager (actor vs rollout GPU layouts)
80
+ ├─ Trainer.fit()
81
+ ├─ init_workers(): build actor + rollout groups, broadcast weight metadata, create nccl collective group
82
+ ├─ continuous_iterator(): epochs → batches
83
+ ├─ scheduler (see Section 6)
84
+ • _async_gen_next_batch(): optional weight sync + non-blocking rollout
85
+ • _async_get_teacher_knowledge(): submit teacher requests, store future
86
+ ├─ For each step:
87
+ • Sync rollout weights
88
+ • Retrieve (batch, gen_output, teacher_output) from futures
89
+ • Merge gen + teacher outputs → DataProto
90
+ • Compute metrics (response length stats, timing, throughput)
91
+ • Update actor (forward_backward_batch + KL loss + optimizer step)
92
+ • (Optional) save checkpoint
93
+ ```
94
+
95
+ > Note: Schedulers are optional and explained later; the distillation objective is independent of how phases are overlapped.
96
+
97
+ ## 5. Key Components
98
+
99
+ ### 5.1 `OnPolicyDistillTrainer` (`ray_trainer.py`)
100
+ - Creates `GenerationBatchFuture` objects holding rollout and (later) teacher futures.
101
+ - Adds scheduling + teacher integration + modified metric emission (KL, timing, MFU).
102
+
103
+ ### 5.2 Actor Worker (Megatron)
104
+ - `OnPolicyDistillActor.update_policy()` orchestrates micro-batch forward/backward.
105
+ - KL Loss injection via `logits_processor` during forward on pipeline last stage.
106
+
107
+ ### 5.3 Rollout Worker (vLLM / SGLang)
108
+ - Pure inference mode (`init_model` builds model; no optimizer).
109
+ - `async_generate_sequences` returns a Ray future for overlapping.
110
+
111
+ ### 5.4 Teacher Service (`teacher/`)
112
+ - Proxy + worker architecture (ZMQ REQ/REP) for batched top-k retrieval.
113
+ - `TeacherClient.submit()` returns a `Future`; aggregator composes micro-batches.
114
+ - Configurable temperature, max tokens, only-response mode.
115
+
116
+ ### 5.5 KL Loss (`megatron_kl_loss.py`)
117
+ - Performs normalization & stable per-token probability construction across TP shards.
118
+ - Gradient is (student_probs - teacher_sparse_probs) scaled by upstream grad.
119
+
120
+ ## 6. Configuration Highlights (`on_policy_distill_trainer.yaml`)
121
+
122
+ | Section | Purpose | Notable Keys |
123
+ |---------|---------|-------------|
124
+ | actor_rollout_ref.teacher | Teacher server | server_ip, server_port, n_server_workers |
125
+ | trainer | Global training control | total_epochs, save_freq, scheduler (one_step_off \| two_step_off), n_gpus_per_node, nnodes |
126
+ | rollout | Resource split for rollout | n_gpus_per_node, nnodes |
127
+
128
+ **Remember to set `trainer.n_gpus_per_node`, `trainer.nnodes`, `rollout.n_gpus_per_node` and `rollout.nnodes` to allocate GPU resources.**
129
+
130
+ ### Dynamic Batch Size
131
+
132
+ Enable by:
133
+
134
+ ```
135
+ actor_rollout_ref.actor.use_dynamic_bsz=True
136
+ actor_rollout_ref.actor.max_token_len=6000 # cap post-group token length
137
+ ```
138
+
139
+ Improves utilization under variable sequence lengths.
140
+
141
+ ### Resource Guidelines
142
+
143
+ - Actor pool: `trainer.nnodes * trainer.n_gpus_per_node` GPUs.
144
+ - Rollout pool: `rollout.nnodes * rollout.n_gpus_per_node` GPUs.
145
+ - Ensure teacher server capacity ≈ `n_server_workers` to avoid stalls (monitor `wait_prev_teacher`).
146
+
147
+ ## 7. Usage Examples
148
+
149
+ ### 7.1 Launch Teacher Server
150
+
151
+ Before training process, you should have a teacher server to provide logp information.
152
+
153
+ We provide a toy teacher server example with vLLM. It needs `telnet` to check the proxy status, and the `python` command to run. So if you have not installed `telnet`, you can just delete that code in `start_server.sh`. Also, some OSes use `python3` rather than `python`, so you may need to modify it accordingly. You can also change the teacher's port if you encounter a port conflict.
154
+
155
+ There are 3 arguments that can be set for the vLLM backend — `--tp-size`, `--n-logprobs` and `--ckpt-path` — in `start_server.sh` / `worker.py`. You should set them before you start the server.
156
+
157
+ We also provide a toy multi-node teacher server. You can start the main node using `start_server.sh` and start the slave nodes using `join_server.sh`. Remember to set the args in `join_server.sh`, especially the `$PROXY_IP` and `$PROXY_BACKEND_PORT` of the main node.
158
+
159
+ When training, student will automatically use the teacher's topk (n-logprobs) to set its own topk argument at line 83 of `recipe/gkd/megatron_kl_loss.py`, so you don't need to set student's topk argument.
160
+
161
+ ```bash
162
+ cd recipe/gkd/teacher
163
+ bash start_server.sh
164
+ # Exports ports and launches proxy + worker (default vLLM backend)
165
+ ```
166
+
167
+ Verify with:
168
+
169
+ ```bash
170
+ telnet localhost 15555
171
+ ```
172
+
173
+ ### 7.2 Minimal Local (Megatron + vLLM) Run
174
+
175
+ ```bash
176
+ python3 -m recipe.gkd.main_gkd \
177
+ --config-path=recipe/gkd/config \
178
+ --config-name=on_policy_distill_trainer \
179
+ actor_rollout_ref.model.path=/path/to/MODEL \
180
+ data.train_files=/path/to/train.parquet \
181
+ trainer.total_epochs=2 \
182
+ trainer.n_gpus_per_node=4 rollout.n_gpus_per_node=2 \
183
+ actor_rollout_ref.teacher.server_ip=127.0.0.1 \
184
+ actor_rollout_ref.teacher.server_port=15555 \
185
+ trainer.scheduler=one_step_off
186
+ ```
187
+
188
+ (Requires a running teacher server).
189
+
190
+ ### 7.3 Ray Job Submission (Distilled 16B Example)
191
+
192
+ See `run_moonlight_dsv3_training.sh` for a full script including:
193
+
194
+ - Dist ckpt path setup (`dist_checkpointing_path`)
195
+ - Expert parallel sizing (EP / ETP)
196
+ - Dynamic batch sizing
197
+ - Two-step-off scheduling for deeper overlap.
198
+
199
+ Submit (after adjusting paths):
200
+
201
+ ```bash
202
+ bash recipe/gkd/run_moonlight_dsv3_training.sh
203
+ ```
204
+
205
+ ## 8. Metrics & Monitoring
206
+
207
+ Emitted metrics include (prefixes may vary):
208
+
209
+ - Timing: `timing/wait_prev_gen`, `timing/sync_rollout_weights`, `timing/get_teacher_knowledge`, `timing/update_actor`.
210
+ - Sequence stats: `response_seq_len/*` (avg, max, min, counts).
211
+ - Performance: `perf/mfu/actor`, `perf/max_memory_allocated_gb`, `perf/cpu_memory_used_gb`.
212
+ - Distillation: `actor/kl_loss`, `actor/grad_norm`, `actor/lr`.
213
+
214
+ Interpretation Tips:
215
+
216
+ - High `wait_prev_teacher` → scale `n_server_workers` and allocate more teacher GPUs or reduce per-request batch size, or just use `two_step_off`.
217
+ - High `wait_prev_gen` with uniform lengths → allocate more rollout GPUs.
218
+ - High `sync_rollout_weights` → check NCCL env / network congestion and try to modify `actor_rollout_ref.rollout.update_weights_bucket_megabytes`.
219
+
220
+ ## 9. Extensibility Notes
221
+
222
+ - Add new schedulers by following interface returning `(epoch, batch, gen_output, teacher_output, timing_dict)`.
223
+ - Integrate different distillation signals (e.g., hidden states, intermediate reasoning tokens) by extending `teacher_utils.get_teacher_knowledge` and modifying `logits_processor`.
224
+
225
+ ## 10. Functional Support Summary
226
+
227
+ | Category | Supported |
228
+ |----------|-----------|
229
+ | Train engine | Megatron |
230
+ | Rollout engine | vLLM |
231
+ | Distillation signal | Teacher top-k logprobs & indices |
232
+ | Scheduling | one_step_off, two_step_off |
233
+
234
+ ## 11. Quick Checklist Before Running
235
+
236
+ - Teacher server reachable (`telnet <ip> <port>`).
237
+ - `actor_rollout_ref.model.path` contains the correct Megatron/HF config artifacts.
238
+ - `train_files` points to a parquet dataset compatible with this recipe's dataset loader.
239
+ - NCCL environment vars set (see `config/runtime_env.yaml`).
240
+
241
+ ---
242
+ Feel free to open issues or PRs to extend scheduler variants, add new distillation objectives, or broaden engine support, and more improvement.
code/RL_model/verl/verl_train/docs/advance/attention_implementation.rst ADDED
@@ -0,0 +1,119 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .. _attention-implementation-override:
2
+
3
+ Attention Implementation Override
4
+ ==================================
5
+
6
+ Last updated: 10/31/2025.
7
+
8
+ By default, VERL's FSDP workers use ``flash_attention_2`` as the attention implementation for improved performance.
9
+ However, you can now override this setting to use different attention implementations based on your needs.
10
+
11
+ Supported Attention Implementations
12
+ -----------------------------------
13
+
14
+ The following attention implementations are supported (subject to model and hardware compatibility):
15
+
16
+ - ``flash_attention_2``: High-performance attention implementation (default)
17
+ - ``eager``: Standard PyTorch attention implementation
18
+ - ``sdpa``: Scaled Dot-Product Attention (PyTorch native)
19
+
20
+ When to Override
21
+ ----------------
22
+
23
+ You might want to override the attention implementation in the following scenarios:
24
+
25
+ - **Debugging**: Use ``eager`` for easier debugging and better error messages
26
+ - **Compatibility**: Some models or hardware configurations may not support ``flash_attention_2``
27
+ - **Memory constraints**: Different implementations have different memory characteristics
28
+ - **Performance tuning**: Testing different implementations for optimal performance
29
+
30
+ Configuration Examples
31
+ -----------------------
32
+
33
+ PPO Training with Eager Attention
34
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
35
+
36
+ To override the attention implementation for the actor, rollout, and reference models:
37
+
38
+ .. code:: bash
39
+
40
+ python3 ppo_trainer.py \
41
+ +actor_rollout_ref.model.override_config.attn_implementation=eager \
42
+ [other parameters...]
43
+
44
+ PPO Training with SDPA Attention
45
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
46
+
47
+ .. code:: bash
48
+
49
+ python3 ppo_trainer.py \
50
+ +actor_rollout_ref.model.override_config.attn_implementation=sdpa \
51
+ [other parameters...]
52
+
53
+ Critic Model Override
54
+ ~~~~~~~~~~~~~~~~~~~~~
55
+
56
+ For training configurations that include a critic model, you can also override its attention implementation:
57
+
58
+ .. code:: bash
59
+
60
+ python3 ppo_trainer.py \
61
+ +actor_rollout_ref.model.override_config.attn_implementation=eager \
62
+ +critic.model.override_config.attn_implementation=eager \
63
+ [other parameters...]
64
+
65
+ YAML Configuration
66
+ ~~~~~~~~~~~~~~~~~~
67
+
68
+ You can also specify the attention implementation in your YAML configuration file:
69
+
70
+ .. code:: yaml
71
+
72
+ actor_rollout_ref:
73
+ model:
74
+ override_config:
75
+ attn_implementation: eager
76
+ # other overrides...
77
+
78
+ critic: # if using a critic model
79
+ model:
80
+ override_config:
81
+ attn_implementation: eager
82
+ # other overrides...
83
+
84
+ Important Notes
85
+ ---------------
86
+
87
+ **Backward Compatibility**: If you don't specify ``attn_implementation`` in the override config,
88
+ VERL will continue to use ``flash_attention_2`` by default, ensuring backward compatibility with existing configurations.
89
+
90
+ **Model Support**: Not all models support all attention implementations. Ensure your model is compatible
91
+ with the chosen attention implementation before training.
92
+
93
+ **Performance Impact**: Different attention implementations have varying performance characteristics.
94
+ ``flash_attention_2`` typically offers the best performance, while ``eager`` provides better debugging capabilities.
95
+
96
+ **Hardware Dependencies**: Some attention implementations (like ``flash_attention_2``) may require
97
+ specific hardware or CUDA versions. If you encounter compatibility issues, try using ``eager`` or ``sdpa``.
98
+
99
+ Troubleshooting
100
+ ---------------
101
+
102
+ If you encounter errors when using a specific attention implementation:
103
+
104
+ 1. **Check model compatibility**: Verify that your model supports the chosen attention implementation
105
+ 2. **Try eager attention**: Use ``attn_implementation=eager`` as a fallback for debugging
106
+ 3. **Check hardware requirements**: Ensure your hardware supports the attention implementation
107
+ 4. **Review error messages**: Attention implementation errors often provide clear guidance on supported options
108
+
109
+ Example Error Resolution
110
+ ~~~~~~~~~~~~~~~~~~~~~~~~
111
+
112
+ If you see an error like "flash_attention_2 is not supported", you can resolve it by switching to eager attention:
113
+
114
+ .. code:: bash
115
+
116
+ # Instead of the default flash_attention_2
117
+ python3 ppo_trainer.py +actor_rollout_ref.model.override_config.attn_implementation=eager
118
+
119
+ This override ensures your training can proceed while you investigate the flash attention compatibility issue.
code/RL_model/verl/verl_train/docs/advance/checkpoint.rst ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ .. _checkpoint-page:
2
+
3
+ Using Checkpoints to Support Fault Tolerance Training
4
+ =====================================================
5
+
6
+ Last updated: 06/25/2025.
7
+
8
+ There could be training errors or machine failure during the whole RLHF training process,
9
+ so it is recommended to enable checkpoints to minimize your loss.
10
+
11
+ The API Interface has already been listed in :ref:`config-explain-page`,
12
+ and we will not repeat them. But there are still some technique details
13
+ we hope to clarify.
14
+
15
+ .. note::
16
+
17
+ Notice that the ``checkpoint.contents`` field has no effect to FSDP checkpoint except ``hf_model``,
18
+ the other 3 fields are binded together to save and load. We recommend to include ``model``, ``optimizer`` and ``extra`` all.
19
+
20
+ Checkpoint Saving Directory Structure
21
+ -------------------------------------
22
+
23
+ Commonly, we use the ``default_local_dir`` declared in ``ppo_trainer.yaml`` or ``ppo_megatron_trainer.yml``
24
+ to work as preffix when saving checkpoints, which is ``checkpoints/${trainer.project_name}/${trainer.experiment_name}``.
25
+
26
+ So the inner checkpoint structure of **FSDP** is like:
27
+
28
+ .. code::
29
+
30
+ checkpoints/${trainer.project_name}/${trainer.experiment_name}
31
+ ├── global_steps_${i}
32
+ │ ├── actor
33
+ │ │ ├── huggingface # default save config and tokenizer, save huggingface model if include ``hf_model`` in checkpoint.contents
34
+ │ │ └── fsdp_config.json # FSDP config file, including world_size and fsdp version
35
+ │ │ ├── model_world_size_{self.world_size}_rank_{self.rank}.pt
36
+ │ │ ├── optim_world_size_{self.world_size}_rank_{self.rank}.pt
37
+ │ │ └── extra_state_world_size_{self.world_size}_rank_{self.rank}.pt
38
+ │ ├── critic
39
+ │ │ ├── huggingface
40
+ │ │ └── fsdp_config.json
41
+ │ │ ├── model_world_size_{self.world_size}_rank_{self.rank}.pt
42
+ │ │ ├── optim_world_size_{self.world_size}_rank_{self.rank}.pt
43
+ │ │ └── extra_state_world_size_{self.world_size}_rank_{self.rank}.pt
44
+ └── latest_checkpointed_iteration.txt
45
+
46
+ All model shards, optimizers and extra states are stored together, in a sharded and distributed way.
47
+
48
+ While **Megatron** current checkpoint structure is:
49
+
50
+ .. code::
51
+
52
+ checkpoints/${trainer.project_name}/${trainer.experiment_name}
53
+ ├── global_steps_${i}
54
+ │ ├── actor
55
+ │ │ ├── huggingface # default save config and tokenizer, save huggingface model if include ``hf_mode`` in checkpoint.contents
56
+ │ │ └── dist_ckpt # save sharded model/optimizer/rng_states, naming the same as Megatron
57
+ │ └── critic
58
+ │ │ ├── huggingface
59
+ │ │ └── dist_ckpt
60
+ └── latest_checkpointed_iteration.txt
61
+
62
+ Convert FSDP and Megatron Checkpoints to HuggingFace Format Model
63
+ -----------------------------------------------------------------
64
+
65
+ We provide a tool to convert the FSDP and Megatron checkpoints to HuggingFace format model.
66
+ The tool is located in ``verl/model_merger``. For older versions of verl that don't include fsdp_config.json in checkpoints, you can use the legacy model merger located at ``verl/scripts/legacy_model_merger.py``.
67
+
68
+ The script supports two main sub-commands: `merge` (to convert and save checkpoints) and `test` (to validate merged checkpoints against a reference model).
69
+ The arguments for the `merge` sub-command are as follows:
70
+
71
+ .. code:: bash
72
+
73
+ usage: python -m verl.model_merger merge [-h] --backend {fsdp,megatron} [--local_dir LOCAL_DIR] [--tie-word-embedding] [--is-value-model] [--use_cpu_initialization] [--target_dir TARGET_DIR]
74
+ [--hf_upload_path HF_UPLOAD_PATH] [--private]
75
+
76
+ options:
77
+ -h, --help show this help message and exit
78
+ --backend {fsdp,megatron}
79
+ The backend of the model
80
+ --local_dir LOCAL_DIR
81
+ Path to the saved model checkpoints
82
+ --tie-word-embedding Whether to tie word embedding weights (currently only Megatron supported)
83
+ --is-value-model Whether the model is a value model (currently only Megatron supported)
84
+ --use_cpu_initialization
85
+ Whether to use CPU initialization for the model. This is useful for large models that cannot fit into GPU memory during initialization.
86
+ --target_dir TARGET_DIR
87
+ Directory to save the merged huggingface model
88
+ --hf_upload_path HF_UPLOAD_PATH
89
+ Hugging Face repository ID to upload the model
90
+ --private Whether to upload the model to a private Hugging Face repository
91
+
92
+ Example usage for merging Megatron checkpoints:
93
+
94
+ .. code:: bash
95
+
96
+ python -m verl.model_merger merge \
97
+ --backend megatron \
98
+ --tie-word-embedding \
99
+ --local_dir checkpoints/verl_megatron_gsm8k_examples/qwen2_5_0b5_megatron_saveload/global_step_1/actor \
100
+ --target_dir /path/to/merged_hf_model
101
+
102
+ Example usage for distributed merging Megatron checkpoints:
103
+
104
+ .. code:: bash
105
+
106
+ torchrun --nproc_per_node 1 --nnodes 8 --node_rank ${RANK} -m verl.model_merger merge \
107
+ --backend megatron \
108
+ --tie-word-embedding \
109
+ --local_dir checkpoints/verl_megatron_gsm8k_examples/qwen2_5_0b5_megatron_saveload/global_step_1/actor \
110
+ --target_dir /path/to/merged_hf_model
111
+
112
+ Example usage for merging FSDP checkpoints:
113
+
114
+ .. code:: bash
115
+
116
+ python -m verl.model_merger merge \
117
+ --backend fsdp \
118
+ --local_dir checkpoints/verl_fsdp_gsm8k_examples/qwen2_5_0b5_fsdp_saveload/global_step_1/actor \
119
+ --target_dir /path/to/merged_hf_model
120
+
121
+
122
+ Megatron Merger details
123
+ -----------------------
124
+
125
+ Current implement of decoder layers uses ``nn.ModuleList`` to store the layers,
126
+ and thus the model layers on every PP rank and VPP rank starts their index from 0.
127
+
128
+ There are 3 ways to correct this behavior:
129
+
130
+ 1. Modify the decoder layer's state_dict, add ``offset`` to each layer's index, thus rewrite ``nn.ModuleList`` implementation.
131
+ 2. Modify the layer index when saving checkpoint and recover them when loading checkpoint.
132
+ 3. The Checkpoint merger do this work, calculate the actual ``offset`` from ``state_dict`` only, a little complex.
133
+
134
+ Current implementation use solution 2.
135
+
136
+
137
+ HuggingFace to Megatron DistCheckpoint details
138
+ ----------------------------------------------
139
+
140
+ Through ``mbridge``, we can directly save the mcore model to huggingface format during training.
141
+ No need to convert the model to Megatron dist-checkpoint format.
142
+
143
+ Original Checkpoint Utils
144
+ -------------------------
145
+
146
+ Original Checkpoint Utils refer to original checkpoint implementation in ``verl/models/[model]/megatron/checkpoint_utils``.
147
+
148
+ We only need ``[model]_loader.py`` in original checkpoint utils now, since we get rid of storing ``hf_model`` every time (which is not recommended for large model training, try only saving sharded models if you can).
149
+
150
+ .. note::
151
+
152
+ Note that ``[model]_loader`` only supports environments where **storage clusters are able to connect with every calculation node**.
153
+ Because it utilizes **sharded load way to minimize the loading checkpoint overhead**.
154
+ Every rank loads its own data from ``state_dict`` which can be accessed by all of them.
155
+ While there is also no need to broadcast among DP ranks, since the saved state_dict is only produced by DP rank 0.
156
+
157
+ For users who can **only place the huggingface model on one device**, we keep the original costly implementation in ``[model]_loader_deprecated``. In this implementation, rank 0 broadcast all weights to each tp and pp rank, and then dp rank 0 broadcast to all dp ranks. There may be at risks of OOM.
158
+
159
+ To use deprecated loader, change the import package of ``load_state_dict_to_megatron_llama``.
code/RL_model/verl/verl_train/docs/advance/dpo_extension.rst ADDED
@@ -0,0 +1,273 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Extend to other RL(HF) algorithms
2
+ =================================
3
+
4
+ Last updated: 02/25/2025.
5
+
6
+ We already implemented the complete training pipeline of the PPO
7
+ algorithms. To extend to other algorithms, we analyze the high-level
8
+ principle to use verl and provide a tutorial to implement the DPO
9
+ algorithm. Users can follow the similar paradigm to extend to other RL algorithms.
10
+
11
+ .. note:: **Key ideas**: Single process drives multi-process computation and data communication.
12
+
13
+ Overall Approach
14
+ ----------------
15
+
16
+ Step 1: Consider what multi-machine multi-GPU computations are needed
17
+ for each model, such as ``generate_sequence`` , ``compute_log_prob`` and
18
+ ``update_policy`` in the actor_rollout model. Implement distributed
19
+ single-process-multiple-data (SPMD) computation and encapsulate them
20
+ into APIs
21
+
22
+ Step 2: Based on different distributed scenarios, including FSDP and 3D
23
+ parallelism in Megatron-LM, implement single-process control of data
24
+ interaction among multi-process computations.
25
+
26
+ Step 3: Utilize the encapsulated APIs to implement the control flow
27
+
28
+ Example: Online DPO
29
+ -------------------
30
+
31
+ We use verl to implement a simple online DPO algorithm. The algorithm
32
+ flow of Online DPO is as follows:
33
+
34
+ 1. There is a prompt (rollout) generator which has the same weight as
35
+ the actor model. After a batch of prompts are fed into the generator,
36
+ it generates N responses for each prompt.
37
+ 2. Send all the prompts + responses to a verifier for scoring, which can
38
+ be reward model or a rule-based function. Then sort them in pairs to
39
+ form a training batch.
40
+ 3. Use this training batch to train the actor model using DPO. During
41
+ the process, a reference policy is needed.
42
+
43
+ Step 1: What are the multi-machine multi-GPU computations
44
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
45
+
46
+ **Sample Generator**
47
+
48
+ Implementation details:
49
+
50
+ .. code:: python
51
+
52
+ from verl.single_controller.base import Worker
53
+ from verl.single_controller.ray import RayWorkerGroup, RayClassWithInitArgs, RayResourcePool
54
+ import ray
55
+
56
+ @ray.remote
57
+ class SampleGenerator(Worker):
58
+ def __init__(self, config):
59
+ super().__init__()
60
+ self.config = config
61
+
62
+ def generate_sequences(self, data):
63
+ pass
64
+
65
+ Here, ``SampleGenerator`` can be viewed as a multi-process pulled up by
66
+ ``torchrun``, with each process running the same code (SPMD).
67
+ ``SampleGenerator`` needs to implement a ``generate_sequences`` API for
68
+ the control flow to call. The implementation details inside can use any
69
+ inference engine including vllm, sglang and huggingface. Users can
70
+ largely reuse the code in
71
+ verl/verl/workers/rollout/vllm_rollout/vllm_rollout.py and we won't
72
+ go into details here.
73
+
74
+ **ReferencePolicy inference**
75
+
76
+ API: compute reference log probability
77
+
78
+ .. code:: python
79
+
80
+ from verl.single_controller.base import Worker
81
+ import ray
82
+
83
+ @ray.remote
84
+ class ReferencePolicy(Worker):
85
+ def __init__(self):
86
+ super().__init__()
87
+ self.model = Model()
88
+
89
+ def infer(self, data):
90
+ return self.model(data)
91
+
92
+ **Actor update**
93
+
94
+ API: Update actor model parameters
95
+
96
+ .. code:: python
97
+
98
+ from verl.single_controller.base import Worker
99
+ import ray
100
+
101
+ @ray.remote
102
+ class DPOActor(Worker):
103
+ def __init__(self):
104
+ super().__init__()
105
+ self.model = Model()
106
+ self.model = FSDP(self.model) # or other distributed strategy
107
+ self.optimizer = optim.Adam(self.model.parameters(), lr=1e-3)
108
+ self.loss_fn = xxx
109
+
110
+ def update(self, data):
111
+ self.optimizer.zero_grad()
112
+ logits = self.model(data)
113
+ loss = self.loss_fn(logits)
114
+ loss.backward()
115
+ self.optimizer.step()
116
+
117
+ **Notes: How to distinguish between control processes and distributed computation processes**
118
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
119
+
120
+ - Control processes are generally functions directly decorated with
121
+ ``@ray.remote``
122
+ - Computation processes are all wrapped into a ``RayWorkerGroup``.
123
+
124
+ Users can reuse most of the distributed computation logic implemented
125
+ in PPO algorithm, including FSDP and Megatron-LM backend in
126
+ verl/verl/trainer/ppo.
127
+
128
+ Step 2: Based on different distributed scenarios, implement single-process control of multi-process data interaction
129
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
130
+
131
+ **The core problem to solve here is how a single process sends data to
132
+ multiple processes, drives multi-process computation, and how the
133
+ control process obtains the results of multi-process computation.**
134
+ First, we initialize the multi-process ``WorkerGroup`` in the control
135
+ process.
136
+
137
+ .. code:: python
138
+
139
+ @ray.remote(num_cpus=1)
140
+ def main_task(config):
141
+ # construct SampleGenerator
142
+ resource_pool = RayResourcePool(process_on_nodes=[8] * 2) # 16 GPUs
143
+ ray_cls = RayClassWithInitArgs(SampleGenerator, config=config)
144
+ # put SampleGenerator onto resource pool
145
+ worker_group = RayWorkerGroup(resource_pool, ray_cls)
146
+
147
+ # construct reference policy
148
+
149
+ As we can see, in the control process, multiple processes are wrapped
150
+ into a ``RayWorkerGroup``. Inside this ``WorkerGroup``, there is a
151
+ ``self._workers`` member, where each worker is a RayActor
152
+ (https://docs.ray.io/en/latest/ray-core/actors.html) of SampleGenerator.
153
+ ray_trainer.md also provides an implementation of
154
+ ``MegatronRayWorkerGroup``.
155
+
156
+ Assuming the model is distributed using FSDP, and there is a batch of
157
+ data on the control process, for data parallelism, the underlying
158
+ calling process is:
159
+
160
+ .. code:: python
161
+
162
+ data = xxx
163
+ data_list = data.chunk(dp_size)
164
+
165
+ output = []
166
+ for d in data_list:
167
+ # worker_group._workers[i] is a SampleGenerator
168
+ output.append(worker_group._workers[i].generate_sequences.remote(d))
169
+
170
+ output = ray.get(output)
171
+ output = torch.cat(output)
172
+
173
+ Single process calling multiple processes involves the following 3
174
+ steps:
175
+
176
+ 1. Split the data into DP parts on the control process.
177
+ 2. Send the data to remote, call the remote computation through RPC, and
178
+ utilize multi-process computation.
179
+ 3. Obtain the computation results of each worker on the control process
180
+ and merge them.
181
+
182
+ Frequently calling these 3 steps on the controller process greatly hurts
183
+ code readability. **In verl, we have abstracted and encapsulated these 3
184
+ steps, so that the worker's method + dispatch + collect can be
185
+ registered into the worker_group**
186
+
187
+ .. code:: python
188
+
189
+ from verl.single_controller.base.decorator import register
190
+
191
+ def dispatch_data(worker_group, data):
192
+ return data.chunk(worker_group.world_size)
193
+
194
+ def collect_data(worker_group, data):
195
+ return torch.cat(data)
196
+
197
+ dispatch_mode = {
198
+ 'dispatch_fn': dispatch_data,
199
+ 'collect_fn': collect_data
200
+ }
201
+
202
+ @register(dispatch_mode=dispatch_mode)
203
+ def generate_sequences(self, data):
204
+ pass
205
+
206
+ In this way, we can directly call the method inside the worker through
207
+ the ``worker_group`` on the control (driver) process (which is a single
208
+ process):
209
+
210
+ .. code:: python
211
+
212
+ output = worker_group.generate_sequences(data)
213
+
214
+ This single line includes data splitting, data distribution and
215
+ computation, and data collection.
216
+
217
+ Furthermore, the model parallelism size of each model is usually fixed,
218
+ including dp, tp, pp. So for these common distributed scenarios, we have
219
+ pre-implemented specific dispatch and collect methods, in `decorator.py <https://github.com/volcengine/verl/blob/main/verl/single_controller/base/decorator.py>`_, which can be directly used to wrap the computations.
220
+
221
+ .. code:: python
222
+
223
+ from verl.single_controller.base.decorator import register, Dispatch
224
+
225
+ @register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO)
226
+ def generate_sequences(self, data: DataProto) -> DataProto:
227
+ pass
228
+
229
+ Here it requires the data interface to be ``DataProto``. Definition of
230
+ ``DataProto`` is in `protocol.py <https://github.com/volcengine/verl/blob/main/verl/protocol.py>`_.
231
+
232
+ Step 3: Main training loop
233
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~
234
+
235
+ With the above training flows, we can implement the algorithm's control
236
+ flow. It is recommended that ``main_task`` is also a ray remote process.
237
+
238
+ .. code:: python
239
+
240
+ @ray.remote(num_cpus=1)
241
+ def main_task(config):
242
+ # construct SampleGenerator
243
+ resource_pool = RayResourcePool(process_on_nodes=[8] * 2) # 16 GPUs
244
+ ray_cls = RayClassWithInitArgs(SampleGenerator, config=config)
245
+ # put SampleGenerator onto resource pool
246
+ sample_gen = RayWorkerGroup(resource_pool, ray_cls)
247
+
248
+ # construct reference policy
249
+ ray_cls = RayClassWithInitArgs(ReferencePolicy)
250
+ ref_policy = RayWorkerGroup(resource_pool, ray_cls)
251
+
252
+ # construct actor
253
+ ray_cls = RayClassWithInitArgs(DPOActor)
254
+ dpo_policy = RayWorkerGroup(resource_pool, ray_cls)
255
+
256
+ dataloader = DataLoader()
257
+
258
+ for data in dataloader:
259
+ # generate data
260
+ data = sample_gen.generate_sequences(data)
261
+ # generate scores for each data
262
+ data = generate_scores(data)
263
+ # generate pairwise data using scores
264
+ data = generate_pairwise_data(data)
265
+ # generate ref_log_prob
266
+ data.batch['ref_log_prob'] = ref_policy.infer(data)
267
+ # update using dpo
268
+ dpo_policy.update(data)
269
+ # logging
270
+
271
+ Here, different ``WorkerGroups`` can be placed in the same resource pool or
272
+ in different resource pools using ``create_colocated_worker_cls``
273
+ similar as in `ray_trainer.py <https://github.com/volcengine/verl/blob/main/verl/trainer/ppo/ray_trainer.py>`_.
code/RL_model/verl/verl_train/docs/advance/fp8.md ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # FP8 rollout for verl
2
+
3
+ Last updated: 12/4/2025
4
+
5
+ This document introduces FP8 rollout in verl.
6
+
7
+
8
+ We monkey patch several vLLM functions to enable FP8 rollout for reinforcement learning:
9
+
10
+ 1. **Quantize weights**: Quantize model weights on-the-fly from higher-precision formats to FP8.
11
+ 2. **Process weights after loading**: For vLLM, we replace the `vllm.model_executor.layers.quantization.fp8.Fp8LinearMethod.process_weights_after_loading` function to handle weight processing after quantization. For SGLang, this patch is not needed as it natively supports loading quantized weights.
12
+
13
+
14
+ ## Support Matrix
15
+ - FP8 blockwise quantization for rollout
16
+ - Used in Deepseek,
17
+ which is 1x128 quantization for activations and 128x128 quantization for model weights
18
+ - Dense models and MoE models
19
+ - Async rollout interfaces
20
+ - vLLM 0.10.x & vLLM 0.11 & SGLang 0.5.5
21
+ - FSDP and Megatron training backends
22
+
23
+ ## Experiments and Outcomes
24
+ ### Qwen3-8B-Base Dense Model
25
+
26
+ **Configuration**
27
+ - DAPO recipe. AIME24 online validation.
28
+ - vLLM(FP8 spmd rollout) + FSDP
29
+ - Note that SPMD rollout has been deprecated, so we removed the FP8 SPMD rollout.
30
+ - Prompt batch size 32, n=16.
31
+ - Rollout batch size: 32\*3*16
32
+ - Train_batch_size & ppo_mini_batch_size 32
33
+ - Max response length 20K
34
+ - Token-level TIS, C=2
35
+ - 8*H100
36
+ - vLLM 0.10.0+CUDA 12.6 vs vLLM 0.11.0+CUDA 12.9
37
+
38
+ **Accuracy**
39
+ ![Qwen3-8b-base_fp8_acc](
40
+ https://github.com/Agoniii/verl/blob/xueh/fp8_pr_images/docs/advance/images/Qwen3-8b-base_fp8_acc.png?raw=true)
41
+ *dark green: BF16, orange: FP8 rollout + token-level TIS, light green: FP8 rollout without TIS*
42
+
43
+ Results and observations:
44
+ - With TIS, FP8 rollout aligns with BF16
45
+ - Obvious accuracy drop when TIS is not enabled
46
+ - Higher mismatch kl but within acceptable range throughout the training
47
+
48
+
49
+ **Performance**
50
+
51
+ ![Qwen3-8b-base_fp8_rollout_perf](
52
+ https://github.com/Agoniii/verl/blob/xueh/fp8_pr_images/docs/advance/images/Qwen3-8b-base_fp8_rollout_perf.png?raw=true)
53
+ *green: BF16, orange: FP8 rollout + CUDA12.6 + DeepGemm, purple: FP8 rollout + CUDA 12.9 + DeepGemm*
54
+
55
+ Results and observations:
56
+ - FP8 rollout leads to around ~12% rollout speedup with CUDA 12.6 + DeepGemm
57
+ - When upgrading to CUDA 12.9, speedup can be up to ~18%
58
+
59
+ ### Qwen3-30B-A3B-Base MoE Model
60
+
61
+ **Configuration**
62
+ - DAPO recipe. AIME24 online validation.
63
+ - FP8 async rollout, vLLM+FSDP
64
+ - Prompt batch size 32
65
+ - Rollout batch size: 32\*3*16
66
+ - Train_batch_size & ppo_mini_batch_size 32
67
+ - Max response length 20K
68
+ - Token-level TIS, C=2
69
+ - 2\*8*H100
70
+ - vLLM 0.10.0+CUDA 12.6
71
+
72
+ Please refer to `recipe/dapo/run_dapo_qwen3_moe_30b_vllm_fp8_rollout.sh`
73
+
74
+ **Accuracy**
75
+ ![Qwen3-30b-a3b_fp8_acc](
76
+ https://github.com/Agoniii/verl/blob/xueh/fp8_pr_images/docs/advance/images/Qwen3-30b-a3b_fp8_acc.png?raw=true)
77
+ *grey: BF16 + token-level TIS, red: FP8 rollout + token-level TIS*
78
+
79
+ Results and observations:
80
+ - Rollout & training distribution mismatch is in general higher for MoE
81
+ - Rollout correction required even for BF16
82
+ - FP8 rollout with token-level TIS aligns with BF16
83
+
84
+
85
+ **Performance**
86
+
87
+ ![Qwen3-30b-a3b_fp8_perf](
88
+ https://github.com/Agoniii/verl/blob/xueh/fp8_pr_images/docs/advance/images/Qwen3-30b-a3b_fp8_perf.png?raw=true)
89
+ *grey: BF16 + token-level TIS, red: FP8 rollout + token-level TIS​*
90
+
91
+ Results and observations:
92
+ - FP8 rollout : over 35% rollout speedup
93
+ - Expecting more perf gain with CUDA 12.9
94
+
95
+ ## Usage
96
+
97
+ FP8 can be enabled in the config file `verl/trainer/config/ppo_megatron_trainer.yaml`:
98
+
99
+ ```
100
+ rollout:
101
+ quantization: "fp8"
102
+ ```
103
+
104
+ Or it can be enabled by command line:
105
+ - `actor_rollout_ref.rollout.quantization=fp8`
106
+
107
+ Please refer to `recipe/dapo/run_dapo_qwen3_moe_30b_vllm_fp8_rollout.sh`
code/RL_model/verl/verl_train/docs/advance/fsdp_extension.rst ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ Add models with the FSDP backend
3
+ ==================================
4
+
5
+ Last updated: 02/09/2025.
6
+
7
+ Model
8
+ --------------------------
9
+
10
+ In principle, our FSDP backend can support any HF model and we can
11
+ synchronize the actor model weights with vLLM using `hf_weight_loader.py` under `third_party/vllm`.
12
+ However, ``hf_weight_loader`` will gather the full state_dict of a
13
+ model during synchronization, which may cause OOM. We suggest using
14
+ ``dtensor_weight_loader`` which gathers the full model parameters layer by
15
+ layer to reduce the peak memory usage. We already support dtensor weight
16
+ loader for the models below in `dtensor_weight_loader.py` under `third_party/vllm`:
17
+
18
+ - ``GPT2LMHeadModel``
19
+ - ``LlamaForCausalLM``
20
+ - ``LLaMAForCausalLM``
21
+ - ``MistralForCausalLM``
22
+ - ``InternLMForCausalLM``
23
+ - ``AquilaModel``
24
+ - ``AquilaForCausalLM``
25
+ - ``Phi3ForCausalLM``
26
+ - ``GemmaForCausalLM``
27
+ - ``Gemma2ForCausalLM``
28
+ - ``GPTBigCodeForCausalLM``
29
+ - ``Starcoder2ForCausalLM``
30
+ - ``Qwen2ForCausalLM``
31
+ - ``DeepseekV2ForCausalLM``
32
+
33
+ To implement ``dtensor_weight_loader`` of a model that's supported in
34
+ vLLM, follow the guide of gemma model below:
35
+
36
+ 1. Copy the
37
+ ``load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]])`` from the vllm model class
38
+ to ``dtensor_weight_loaders.py``
39
+ 2. Modify the arguments to
40
+ ``(actor_weights: Dict, vllm_model: nn.Module)``
41
+ 3. Replace the ``self`` to ``vllm_model``
42
+ 4. Add the
43
+ ``local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)``
44
+ before each ``param = params_dict[name]`` and modify the following
45
+ weight loading using ``local_loaded_weight``.
46
+ 5. Register the implemented dtensor weight loader to ``__MODEL_DTENSOR_WEIGHT_LOADER_REGISTRY__``.
47
+
48
+ .. code-block:: diff
49
+
50
+ - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
51
+ + def gemma_dtensor_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
52
+ stacked_params_mapping = [
53
+ # (param_name, shard_name, shard_id)
54
+ ("qkv_proj", "q_proj", "q"),
55
+ ("qkv_proj", "k_proj", "k"),
56
+ ("qkv_proj", "v_proj", "v"),
57
+ ("gate_up_proj", "gate_proj", 0),
58
+ ("gate_up_proj", "up_proj", 1),
59
+ ]
60
+ - params_dict = dict(self.named_parameters())
61
+ + params_dict = dict(vllm_model.named_parameters())
62
+ loaded_params = set()
63
+ - for name, loaded_weight in weights:
64
+ + for name, loaded_weight in actor_weights.items():
65
+ for (param_name, shard_name, shard_id) in stacked_params_mapping:
66
+ if shard_name not in name:
67
+ continue
68
+ name = name.replace(shard_name, param_name)
69
+ # Skip loading extra bias for GPTQ models.
70
+ if name.endswith(".bias") and name not in params_dict:
71
+ continue
72
+ + local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
73
+ param = params_dict[name]
74
+ weight_loader = param.weight_loader
75
+ - weight_loader(param, loaded_weight, shard_id)
76
+ + weight_loader(param, local_loaded_weight.to(dtype=param.dtype), shard_id)
77
+ break
78
+ else:
79
+ # lm_head is not used in vllm as it is tied with embed_token.
80
+ # To prevent errors, skip loading lm_head.weight.
81
+ if "lm_head.weight" in name:
82
+ continue
83
+ # Skip loading extra bias for GPTQ models.
84
+ if name.endswith(".bias") and name not in params_dict:
85
+ continue
86
+ + local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
87
+ param = params_dict[name]
88
+ weight_loader = getattr(param, "weight_loader",
89
+ default_weight_loader)
90
+ - weight_loader(param, loaded_weight)
91
+ + weight_loader(param, local_loaded_weight.to(dtype=param.dtype))
92
+ loaded_params.add(name)
93
+ unloaded_params = params_dict.keys() - loaded_params
94
+ if unloaded_params:
95
+ raise RuntimeError(
96
+ "Some weights are not initialized from checkpoints: "
97
+ f"{unloaded_params}")
code/RL_model/verl/verl_train/docs/advance/fully_async.md ADDED
@@ -0,0 +1,595 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Recipe: Fully Async Policy Trainer
2
+
3
+ **Author:** `https://github.com/meituan-search`
4
+
5
+ Last updated: 12/25/2025.
6
+
7
+ This document introduces a fully asynchronous PPO training system that completely decouples the Trainer and Rollouter,
8
+ supporting asynchronous sample generation and training.
9
+ Under this system, we achieved a 2.35x-2.67x performance improvement when training the Qwen2.5-7B model with 128 GPUs,
10
+ without significantly affecting the results.
11
+
12
+ ## Introduction
13
+
14
+ ### Background
15
+
16
+ The separated rollout and train architecture, compared to the colocate architecture, can allocate resources more
17
+ flexibly and design more flexible training logic, thereby addressing issues such as low GPU utilization and training
18
+ efficiency caused by long-tail problems.
19
+ The one_step_off_policy alleviates the problem of long rollout times and achieves some gains in training efficiency by
20
+ designing a separated architecture and performing asynchronous training between rollout and train for one round.
21
+ However, it forcibly uses data from one round of asynchronous training, which is not flexible enough and cannot
22
+ completely eliminate the impact of long-tail on training efficiency.
23
+ In other frameworks such as AReaL, Magistral, StreamRL, and AsyncFlow, asynchronous training and streaming training have
24
+ been implemented based on the separated architecture and have achieved gains.
25
+ We borrow from their methods and implemented them in VERL. The fully_async_policy supports asynchronous, streaming, and
26
+ partial
27
+ rollout training.
28
+ By reasonably setting parameters such as resource allocation and parameter synchronization frequency, fully_async_policy
29
+ can significantly improve training efficiency.
30
+
31
+ > Magistral https://arxiv.org/abs/2506.10910
32
+ >
33
+ > AReaL: A Large-Scale Asynchronous Reinforcement Learning System for Language
34
+ > Reasoning https://arxiv.org/abs/2505.24298
35
+ >
36
+ > StreamRL: Scalable, Heterogeneous, and Elastic RL for LLMs with Disaggregated Stream
37
+ > Generation https://arxiv.org/abs/2504.15930
38
+ >
39
+ > AsyncFlow: An Asynchronous Streaming RL Framework for Efficient LLM Post-Training https://arxiv.org/abs/2507.01663
40
+
41
+ ### Core Contributions
42
+
43
+ - **Resource Isolation**: Unlike using hybrid_engine, Rollouter and Trainer use separate computing resources and need to
44
+ specify the resources they occupy separately.
45
+ - **Parallel Generation and Training**: While the Trainer is training, the Rollouter is generating new samples.
46
+ - **Multi-step Asynchronous**: Compared to one step off policy, it supports asynchronous settings from 0.x steps to
47
+ multiple steps, making the asynchronous solution more flexible.
48
+ - **NCCL Parameter Synchronization**: Based on the nccl communication primitive, refer to [checkpoint-engine](https://github.com/MoonshotAI/checkpoint-engine) to
49
+ achieve efficient parameter synchronization between Rollouter and Trainer.
50
+ - **Stream Inference and Training**: Rollouter generates data sample by sample, and data transmission uses a single
51
+ sample as the minimum transmission unit.
52
+ - **Asynchronous Training and Freshness Control**: By setting the parameter async_training.staleness_threshold, it
53
+ supports training with samples generated by old parameters.
54
+ - **PartialRollout**: The Rollouter's inference process supports partial rollout logic. During parameter
55
+ synchronization, by adding `sleep() and resume()` logic, it
56
+ saves samples from ongoing rollouts and continues using them in the next rollout, reducing the time spent waiting for
57
+ ongoing tasks to finish during parameter synchronization.
58
+
59
+ Currently, the supported usage mode is Megatron/FSDP+vLLM/SGLang. vLLM/SGLang must use the server mode based on AgentLoop.
60
+
61
+ ## Design
62
+
63
+ The overall architecture of fully_async_policy is shown in the figure below. fully_async_policy mainly consists of four
64
+ parts: Rollouter, MessageQueue, Trainer, and ParameterSynchronizer.
65
+
66
+ ![fully_async_policy_structure](https://github.com/ArronHZG/verl-community/blob/main/docs/fully_async_policy_structure.svg?raw=true)
67
+
68
+ 1. Rollouter generates sequences sample by sample and puts the generated samples into the MessageQueue, with the
69
+ production speed controlled by freshness.
70
+ 2. MessageQueue is used to temporarily store samples generated by Rollouter.
71
+ 3. Trainer fetches samples from MessageQueue sample by sample. After fetching `require_batches*ppo_mini_batch_size`
72
+ samples, it will perform training. After training for async_training.trigger_parameter_sync_step rounds, it triggers
73
+ a parameter synchronization with Rollouter.
74
+ 4. ParameterSynchronizer implements the NCCL synchronous parameter synchronization capability.
75
+
76
+ The source of benefits compared to the base scheme lies in the fact that in the colocate case, using more resources for
77
+ rollout cannot solve the idleness caused by long-tail samples.
78
+ After we perform resource isolation, the time for rollout and train may be longer than before (because fewer resources
79
+ are used),
80
+ but the overlap in their time consumption reduces the end-to-end time consumption.
81
+
82
+ ![fully_async_policy_revenue](https://github.com/ArronHZG/verl-community/blob/main/docs/fully_async_policy_revenue.svg?raw=true)
83
+
84
+ ## Usage
85
+
86
+ ### Parameter Description
87
+
88
+ | super params | implication |
89
+ | ---------------------------------------------------------------- | ---------------------------------------------------------------------------------------------- |
90
+ | `trainer.nnodes` | Number of nodes for Trainer |
91
+ | `trainer.n_gpus_per_node` | Number of GPUs per node for Trainer |
92
+ | `rollout.nnodes` | Number of nodes for Rollouter |
93
+ | `rollout.n_gpus_per_node` | Number of GPUs per node for Rollouter |
94
+ | `data.train_batch_size` | In the fully async strategy, this value is not effective (default is 0) |
95
+ | `data.gen_batch_size` | In the fully async strategy, uses streaming sample production logic (default is 1) |
96
+ | `rollout.total_rollout_steps` | Total number of rollout samples |
97
+ | `rollout.test_freq` | How many times Rollouter updates parameters before performing a validation |
98
+ | `actor_rollout_ref.actor.ppo_mini_batch_size` | The ppo_mini_batch_size is a global num across all workers/gpus |
99
+ | `async_training.require_batches` | Number of ppo_mini_batch_size that FullyAsyncTrainer fetches at once |
100
+ | `async_training.trigger_parameter_sync_step` | Indicates how many local updates FullyAsyncTrainer performs before a parameter synchronization |
101
+ | `async_training.staleness_threshold` | Freshness control |
102
+ | `async_training.partial_rollout` | Whether to perform partial_rollout |
103
+ | `async_training.use_rollout_log_probs` | Use log_probs generated by rollout |
104
+ | `async_training.compute_prox_log_prob` | Whether to compute log_prob using the training model's parameters during the training phase |
105
+ | `async_training.checkpoint_engine.enable` | Whether to use checkpoint_engine for accelerating, default `True` |
106
+ | `async_training.checkpoint_engine.overlap_broadcast_and_consume` | When use checkpoint_engine, whether to overlap broadcast and load_weights, default `False` |
107
+ | `async_training.checkpoint_engine.device_buffer_size_M` | When use checkpoint_engine, the user-specific bucket size (MB), default `4096` |
108
+ | `async_training.use_trainer_do_validate` | Whether to use the trainer node to run the validation process, default `False` |
109
+
110
+ **Further Explanation:**
111
+
112
+ - `rollout.total_rollout_steps`
113
+
114
+ Compared to colocate, the quantity can be aligned by multiplying train_batch_size and step:
115
+ `rollout.total_rollout_steps = data.train_batch_size * step`.
116
+
117
+ - `async_training.trigger_parameter_sync_step`
118
+
119
+ In the fully async strategy, it indicates how many local updates the Trainer performs (i.e., how many times it fetches
120
+ `require_batches * ppo_mini_batch_size` samples) before a parameter synchronization with Rollouter.
121
+ Between every two parameter synchronizations between Rollouter and Trainer, the Trainer will process
122
+ `trigger_parameter_sync_step* require_batches*ppo_mini_batch_size` samples.
123
+ To fairly compare speed with colocate, trigger_parameter_sync_step should be set to
124
+ `data.train_batch_size / (require_batches * ppo_mini_batch_size)`.
125
+
126
+ - `async_training.staleness_threshold`
127
+
128
+ In the fully async strategy, it indicates the maximum proportion of stale samples allowed to be used.
129
+
130
+ - staleness_threshold=0, indicates synchronous training.
131
+ Rollouter will generate a fixed number of samples between two parameter updates, the sample count is:
132
+ $$rollout\_num = (trigger\_parameter\_sync\_step*require\_batches*ppo\_mini\_batch\_size)$$
133
+ - staleness_threshold>0, indicates asynchronous training, can be set to a decimal for more flexible asynchronous
134
+ calls.
135
+ Rollouter will generate at most the following number of samples between two parameter updates:
136
+ $$rollout\_num = (1+staleness\_threshold)*(trigger\_parameter\_sync\_step*require\_batches*ppo\_mini\_batch\_size) - num\_staleness\_sample $$
137
+
138
+ num_staleness_sample represents the number of stale samples generated in excess during the last rollout.
139
+
140
+ Since it's a streaming system, rollout continues to generate and trainer continues to consume. If rollouter is slower,
141
+ trainer will trigger parameter synchronization earlier, and rollouter will not actually produce rollout_num samples.
142
+ When rollout is fast enough, setting staleness_threshold to 1 is basically equivalent to one_step_off policy.
143
+ To avoid too many expired samples affecting training accuracy, it is recommended to set this value to less than 1.
144
+
145
+ - `async_training.partial_rollout`
146
+
147
+ partial_rollout only actually takes effect when staleness_threshold>0.
148
+
149
+ - `async_training.use_rollout_log_probs`
150
+
151
+ In reinforcement learning algorithms, log_probs have implicit correlations with parameter versions and tokens. Due to
152
+ the settings of algorithms like PPO/GRPO/DAPO, when calculating importance sampling,
153
+ old_log_prob must use the log_probs corresponding to the rollout parameters and tokens to ensure algorithm
154
+ correctness. In the fully
155
+ async strategy, we default to old_log_prob being calculated by rollout rather than by trainer.
156
+
157
+ - `async_training.require_batches`
158
+
159
+ In streaming training, require_batches should be set to 1, indicating that training is performed after producing
160
+ enough ppo_mini_batch_size samples.
161
+ In actual testing, we found that if fewer samples are issued at once, due to the order of data distribution, it can
162
+ cause training instability and longer response lengths.
163
+ Here, we additionally provide require_batches for streaming distribution and control the number of samples
164
+ participating in training at once.
165
+
166
+ - `async_training.compute_prox_log_prob` (experimental)
167
+
168
+ During the training process, we observed that metrics and response lengths may become unstable in the later
169
+ stages of training. To mitigate this issue, we can use
170
+ the [Rollout Importance Sampling](https://verl.readthedocs.io/en/latest/advance/rollout_is.html)
171
+ technique for importance sampling. To utilize Rollout Importance Sampling, we need to compute log_prob using
172
+ the training engine, which requires enabling this switch.
173
+ Additionally, when compute_prox_log_prob and Rollout Importance Sampling are enabled under mode d
174
+ (async stream pipeline with partial rollout), our implementation approximates `Areal's Decoupled PPO`.
175
+
176
+ - `async_training.checkpoint_engine.enable`
177
+
178
+ Enabling the checkpoint engine generally reduces synchronization time overhead by more than 60% compared to
179
+ the original per-tensor parameter synchronization method. However, assembling buckets incurs additional
180
+ temporary GPU memory overhead.
181
+
182
+ - `async_training.checkpoint_engine.overlap_broadcast_and_consume`
183
+
184
+ Enabling pipeline between the broadcast and load_weights parameters will allocate additional GPU memory.
185
+ Since the main time consumption for parameter synchronization is not in the broadcast and load_weights phases,
186
+ but in the parameter generation phase (by megatron or FSDP), this option is off by default.
187
+
188
+ - `async_training.checkpoint_engine.device_buffer_size_M`
189
+
190
+ It controls the size of the memory buffer used for synchronization when the checkpoint-engine is enabled.
191
+ The actual `bucket_size` = `max(device_buffer_size_M, maximum parameter tensor size)`.
192
+
193
+ - When enable `overlap_broadcast_and_consume`, the additional device memory overhead of
194
+ trainer rank is `3 * bucket_size`and rollout rank is `2 * bucket_size`。
195
+ - When disable `overlap_broadcast_and_consume`, the additional device memory overhead of
196
+ trainer rank is `2 * bucket_size`and rollout rank is `1 * bucket_size`。
197
+
198
+ * `async_training.use_trainer_do_validate`
199
+
200
+ It controls whether to use the trainer's `do_validate` method for validation.
201
+ If set to True, the trainer will perform validation after each parameter update. It can reduce the validation time
202
+ overhead and trainer node idle time.
203
+ If set to False, the trainer will not perform validation.
204
+
205
+ ### Supported Modes
206
+
207
+ 1. on policy pipeline:
208
+
209
+ 1. **trigger_parameter_sync_step=1, staleness_threshold=0**
210
+ 2. Rollouter produces `require_batches*ppo_mini_batch_size` samples at once, Trainer fetches these samples for
211
+ training, and after training completes, Trainer and Rollouter perform a parameter synchronization;
212
+ 3. During the rollout phase, if there are long-tail samples but few rollout samples, shorter samples cannot fill
213
+ idle resources, causing some resource waste.
214
+ 4. As shown in figure a;
215
+
216
+ 2. stream off policy pipeline:
217
+
218
+ 1. **trigger_parameter_sync_step>1, staleness_threshold=0**
219
+ 2. Synchronous streaming training will be performed. Rollouter produces
220
+ `require_batches*ppo_mini_batch_size*trigger_parameter_sync_step` samples at once, Trainer performs a local
221
+ training every time it fetches `require_batches*ppo_mini_batch_size` samples, and after training
222
+ trigger_parameter_sync_step times, Trainer and Rollouter perform a parameter synchronization;
223
+ 3. Compared to a, since more samples are generated at once, resource idleness will be lower.
224
+ 4. In one step training, there will be two periods of resource idleness: when fetching the first batch of samples,
225
+ train waits for `require_batches*ppo_mini_batch_size` samples to be produced, and during the last parameter
226
+ update, rollout waits for training to complete.
227
+ 5. As shown in figure b;
228
+
229
+ 3. async stream pipeline with stale samples:
230
+
231
+ 1. **trigger_parameter_sync_step>=1, staleness_threshold>0, partial_rollout=False**
232
+ 2. After each parameter update, Rollouter will plan to produce at most rollout_num samples (in practice, the number
233
+ of samples generated may be less than this value depending on rollout speed).
234
+ 3. If the rollout process is relatively fast, Rollouter will generate some additional samples num_stale_samples
235
+ before parameter synchronization for immediate use by Trainer after synchronization.
236
+ When triggering parameter synchronization, if Rollouter has ongoing tasks, it will wait for the tasks to complete
237
+ and not add new tasks;
238
+ 4. Compared to b, except for the first step training, subsequent training will not have the time to wait for the
239
+ first batch rollout to finish, but will have the time to wait for active tasks to finish.
240
+ 5. As shown in figure c;
241
+
242
+ 4. async stream pipeline with partial rollout:
243
+ 1. **trigger_parameter_sync_step>=1, staleness_threshold>0, partial_rollout=True**
244
+ 2. Compared to c, when triggering parameter synchronization, if Rollouter has samples being produced, it will
245
+ interrupt the rollout process and perform parameter synchronization. The interrupted samples will continue to be
246
+ generated after synchronization. This reduces the time to wait for active tasks to finish.
247
+ 3. As shown in figure d;
248
+
249
+ ![fully_async_policy_mode](https://github.com/ArronHZG/verl-community/blob/main/docs/fully_async_policy_mode.svg?raw=true)
250
+
251
+ ### Key Metrics
252
+
253
+ | metrics | implication |
254
+ | ---------------------------------------------- | ------------------------------------------------------------------------------------------------------ |
255
+ | `trainer/idle_ratio` | Trainer idle rate |
256
+ | `rollouter/idle_ratio` | Rollouter idle rate |
257
+ | `fully_async/count/stale_samples_processed` | Total number of old samples used in training |
258
+ | `fully_async/count/stale_trajectory_processed` | Total number of old trajectories used in training (one sample produces rollout.n trajectories) |
259
+ | `fully_async/partial/total_partial_num` | Number of partial samples processed by Trainer between two trigger_parameter_sync_step |
260
+ | `fully_async/partial/partial_ratio` | Ratio of partial samples processed by Trainer between two trigger_parameter_sync_step |
261
+ | `fully_async/partial/max_partial_span` | Maximum parameter span of partial samples processed by Trainer between two trigger_parameter_sync_step |
262
+
263
+ ### Parameter Tuning Recommendations
264
+
265
+ - Resource Allocation and Adjustment:
266
+
267
+ - Reasonable resource allocation is the prerequisite for achieving good training efficiency. The ideal resource
268
+ allocation should make the rollout time and train time close, thereby minimizing pipeline bubbles in the entire
269
+ training process,
270
+ avoiding resource idleness, and ensuring Trainer does not use old samples. In real training scenarios, resource
271
+ allocation can be adjusted based on the idle time of rollout and train during actual training,
272
+ which can be obtained from rollouter/idle_ratio and trainer/idle_ratio. If rollouter/idle_ratio is high and
273
+ trainer/idle_ratio is low,
274
+ Trainer resources should be increased and Rollouter resources should be reduced, and vice versa.
275
+
276
+ - Key Parameters:
277
+
278
+ - staleness_threshold: Setting it too high will cause more old samples to be used, affecting model performance. It
279
+ is recommended to set it to less than 1.
280
+ - require_batches: The closer to 1, the closer to a pure streaming process, the smaller the training bubbles, and
281
+ the faster the acceleration effect that can be achieved in terms of speed, but it will affect the order of sample
282
+ processing;
283
+ - trigger_parameter_sync_step: The smaller the setting, the closer to on policy, but it will cause frequent
284
+ parameter synchronization. Long-tail samples waste resources that cannot be filled by short samples, resulting in
285
+ low resource utilization.
286
+ The larger the setting, the higher the computational efficiency, but the accuracy will be affected by off policy.
287
+ - rollout.test_freq: It will occupy Rollouter resources and is not recommended to be set too small.
288
+
289
+ - Mode Selection: By adjusting different parameters, the Fully Async architecture supports optimization acceleration at
290
+ different levels, suitable for tasks in different scenarios.
291
+ - For small-scale tasks that need to ensure training stability and on-policy nature, and have low speed
292
+ requirements, the on policy pipeline mode (Mode 1) can be tried.
293
+ - For scenarios that need to improve training throughput but are sensitive to staleness, the stream off policy
294
+ pipeline mode can be tried. That is, by
295
+ setting trigger_parameter_sync_step>1 to improve training efficiency, but still maintaining the synchronization
296
+ mechanism (staleness_threshold=0) (Mode 2).
297
+ - For large-scale tasks with high training speed requirements and can tolerate a certain degree of off-policy and
298
+ staleness, setting staleness_threshold>
299
+ 0 and partial_rollout=True can improve training efficiency, using the async stream pipeline mode (Mode 3 or 4).
300
+
301
+ ### Quick Start
302
+
303
+ ```shell
304
+ rollout_mode="async"
305
+ rollout_name="vllm" # sglang or vllm
306
+ if [ "$rollout_mode" = "async" ]; then
307
+ export VLLM_USE_V1=1
308
+ return_raw_chat="True"
309
+ fi
310
+
311
+ train_prompt_bsz=0
312
+ gen_prompt_bsz=1
313
+ n_resp_per_prompt=16
314
+ train_prompt_mini_bsz=32
315
+ total_rollout_steps=$(((512*400)))
316
+ test_freq=10
317
+ staleness_threshold=0
318
+ trigger_parameter_sync_step=16
319
+ partial_rollout=False
320
+
321
+
322
+ python -m verl.experimental.fully_async_policy.fully_async_main \
323
+ train_batch_size=${train_prompt_bsz} \
324
+ data.gen_batch_size=${gen_prompt_bsz} \
325
+ data.return_raw_chat=${return_raw_chat} \
326
+ actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
327
+ actor_rollout_ref.actor.strategy=fsdp2 \
328
+ critic.strategy=fsdp2 \
329
+ actor_rollout_ref.hybrid_engine=False \
330
+ actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
331
+ actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
332
+ actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
333
+ actor_rollout_ref.rollout.name=${rollout_name} \
334
+ actor_rollout_ref.rollout.mode=${rollout_mode} \
335
+ actor_rollout_ref.rollout.calculate_log_probs=True \
336
+ trainer.nnodes="${NNODES_TRAIN}" \
337
+ trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
338
+ rollout.nnodes="${NNODES_ROLLOUT}" \
339
+ rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \
340
+ rollout.total_rollout_steps="${total_rollout_steps}" \
341
+ rollout.test_freq="${test_freq}" \
342
+ async_training.staleness_threshold="${staleness_threshold}" \
343
+ async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \
344
+ async_training.partial_rollout="${partial_rollout}"
345
+ ```
346
+
347
+ ## Experiments
348
+
349
+ ### Asynchronous Training on 7B Model
350
+
351
+ We used Qwen2.5-Math-7B to verify the benefits of the fully async strategy with long response lengths and different resource scales.
352
+ Using the `async stream pipeline with stale samples` strategy, we achieved about 2x performance improvement on 32 cards,
353
+ 64 cards, and 128 cards without significantly affecting experimental results.
354
+
355
+ - Machine: H20
356
+ - Model: Qwen2.5-Math-7B
357
+ - Rollout length: max_response_length FSDP2: 28K tokens;
358
+ - Algorithm: DAPO
359
+ - Dataset: TRAIN_FILE: dapo-math-17k.parquet TEST_FILE: aime-2024.parquet
360
+ - Engine: vLLM + FSDP2
361
+ - rollout.n: 16
362
+ - ppo_mini_batch_size: 32
363
+ - test_freq: 20
364
+
365
+ - colocate sync:
366
+
367
+ - step: 400
368
+ - train_batch_size: 512
369
+
370
+ - fully_async_policy
371
+ - total_rollout_steps: 512\*400
372
+ - require_batches: 4
373
+ - trigger_parameter_sync_step: 4
374
+ - staleness_threshold: 0.5
375
+ - partial_rollout: True
376
+
377
+ | training mode | resource allocation | step | gen | old_log_prob | update_actor | total time<br>100 step | total time<br>200 step | total time<br>300 step | total time<br>400 step | acc/mean@1 |
378
+ | :----------------: | :-----------------: | :----: | :----: | :----------: | :----------: | :--------------------: | :--------------------: | :--------------------: | :--------------------: | :-------------------------: |
379
+ | colocate sync | 32 | 790.10 | 357.41 | 107.71 | 269.80 | 13h 44m | 1d 3h 43m | 2d 9h 22m | 3d 17h 5m | max: 0.3313<br>last: 0.2448 |
380
+ | fully_async_policy | 16:16 | 294.77 | 21.26 | \ | 313.81 | 7h 58m<br>(1.72x) | 16h 21m<br>(1.70x) | 1d 0h 53m<br>(2.31x) | 1d 9h 26m<br>(2.66x) | max: 0.3302<br>last: 0.2333 |
381
+ | colocate sync | 64 | 365.28 | 150.72 | 70.26 | 133.41 | 10h 22m | 20h 45m | 1d 7h 6m | 1d 17h 32m | max: 0.3365<br>last: 0.2333 |
382
+ | fully_async_policy | 32:32 | 189.26 | 28.46 | \ | 156.98 | 4h 57m<br>(2.09x) | 10h 14m<br>(2.03x) | 16h 58m<br>(1.83x) | 21h 40m<br>(1.92x) | max: 0.3677<br>last: 0.3406 |
383
+ | colocate sync | 128 | 356.30 | 177.85 | 53.92 | 113.81 | 8h 36m | 17h 56m | 1d 5h 6m | 1d 16h 48m | max: 0.3573<br>last: 0.2958 |
384
+ | fully_async_policy | 64:64 | 150.63 | 33.14 | \ | 113.16 | 3h 13m<br>(2.67x) | 6h 46m<br>(2.65x) | 10h 53m<br>(2.67x) | 17h 22m<br>(2.35x) | max: 0.3521<br>last: 0.3094 |
385
+
386
+ > source data: https://wandb.ai/hou-zg-meituan/fully-async-policy-colocate_async?nw=nwuserhouzg
387
+
388
+ ### 128-card 7B Asynchronous Mode Experiment
389
+
390
+ We used Qwen2.5-Math-7B to verify the effects of various modes supported by fully async.
391
+ We can see that the benefit brought by streaming is approximately 1.6x, and after combining staleness and
392
+ partial_rollout, the benefit reaches 2.35x.
393
+
394
+ | mode | step | gen | old_log_prob | update_actor | total time<br>100 step | total time<br>200 step | total time<br>300 step | total time<br>400 step | acc/mean@1 |
395
+ | :---------------------------------------------------------------------------------------------------: | :----: | :----: | :----------: | :----------: | :--------------------: | :--------------------: | :--------------------: | :--------------------: | :-------------------------: |
396
+ | colocate sync | 356.30 | 177.85 | 53.92 | 113.81 | 8h 36m | 17h 56m | 1d 5h 6m | 1d 16h 48m | max: 0.3573<br>last: 0.2958 |
397
+ | `stream off policy pipeline`<br>(+fully async: trigger_parameter_sync_step= 4,<br>require_batches= 4) | 231.34 | 128.47 | \ | 98.77 | 4h 25m | 9h 41m | 15h 2m | 1d 1h 53m | max: 0.2844<br>last: 0.2604 |
398
+ | `async stream pipeline with stale samples`<br>(+staleness_threshold=0.5) | | | | | | | | | |
399
+ | `async stream pipeline with partial rollout`<br>(+partial_rollout=True) | 150.63 | 33.14 | \ | 113.16 | 3h 13m | 6h 46m | 10h 53m | 17h 22m | max: 0.3521<br>last: 0.3094 |
400
+
401
+ > source data: https://wandb.ai/hou-zg-meituan/fully-async-policy-stream_stale_partial?nw=nwuserhouzg
402
+
403
+ ### 128-card Stale Ablation Experiment
404
+
405
+ Under the `async stream pipeline with partial rollout` mode, we verified the impact of staleness settings on training
406
+ efficiency.
407
+ We found that the larger the staleness, the more obvious the final gains.
408
+ We also noticed that the times for staleness values of 0.3 and 0.5 are quite close, because as the training steps
409
+ increase, the response length changes significantly, causing training instability.
410
+ Further analysis and optimization are needed for this issue.
411
+
412
+ | staleness_threshold | step | gen | old_log_prob | update_actor | total time<br>100 step | total time<br>200 step | total time<br>300 step | total time<br>400 step | acc/mean@1 |
413
+ | :-----------------: | :----: | :----: | :----------: | :----------: | :--------------------: | :--------------------: | :--------------------: | :--------------------: | :-------------------------: |
414
+ | 0 | 231.34 | 128.47 | \ | 98.77 | 4h 25m | 9h 41m | 15h 2m | 1d 1h 53m | max: 0.2844<br>last: 0.2604 |
415
+ | 0.1 | 171.30 | 58.17 | \ | 109.12 | 3h 53m | 8h 37m | 14h 25m | 19h 59m | max: 0.3542<br>last: 0.2979 |
416
+ | 0.3 | 146.11 | 38.88 | \ | 103.22 | 3h 18m | 6h 49m | 11h 40m | 17h 20m | max: 0.3469<br>last: 0.2865 |
417
+ | 0.5 | 150.63 | 33.14 | \ | 113.16 | 3h 13m | 6h 46m | 10h 53m | 17h 22m | max: 0.3521<br>last: 0.3094 |
418
+
419
+ > source data: https://wandb.ai/hou-zg-meituan/fully-async-policy-stream_stale_partial?nw=nwuserhouzg
420
+
421
+ ### 128-card 7B require_batches Ablation Experiment
422
+
423
+ In multiple tests, we found that the number of samples issued each time in streaming affects the response length during
424
+ training, which in turn affects training time. We verified the impact on results by modifying
425
+ `async_training.require_batches`.
426
+
427
+ | require_batches | step | gen | old_log_prob | update_actor | total time<br>100 step | total time<br>200 step | total time<br>300 step | acc/mean@1 |
428
+ | :-------------: | :----: | :---: | :----------: | :----------: | :--------------------: | :--------------------: | :--------------------: | :-------------------------: |
429
+ | 1 | 203.47 | 30.88 | \ | 181.08 | 3h 31m | 8h 29m | 17h 36m | max: 0.349<br>last: 0.326 |
430
+ | 2 | 158.72 | 26.32 | \ | 128.08 | 3h 35m | 7h 38m | 13h 57m | max: 0.351<br>last: 0.3406 |
431
+ | 4 | 124.64 | 25.62 | \ | 95.06 | 3h 13m | 6h 46m | 10h 53m | max: 0.3521<br>last: 0.3521 |
432
+
433
+ > source data: https://wandb.ai/hou-zg-meituan/fully-async-policy-ablation_require_batches?nw=nwuserhouzg
434
+
435
+ ### 30B Model Mode Experiment
436
+
437
+ We achieved a 1.7x performance improvement with the `async stream pipeline with stale samples` strategy on the
438
+ Qwen3-30B-A3B-Base model compared to the colocate setup. It is worth noting that this is far from the upper limit of
439
+ performance gains achievable through asynchrony. Firstly, the comparative experiments used a maximum response length of
440
+ only 8k, which is much shorter than the 20k sequence length in previous experiments, resulting in a less pronounced
441
+ rollout tail effect. Secondly, we adopted a highly skewed resource allocation, with rollout using 96 GPUs and trainer
442
+ using 32 GPUs, which is not an optimal configuration. During the experiments, we observed that the current verl
443
+ implementation imposes certain constraints, such as requiring data to be evenly divisible by the number of GPUs, making
444
+ resource adjustment less flexible. Additionally, as asynchronous training and deployment accelerate, the performance gap
445
+ is gradually narrowing. Therefore, enabling more flexible resource allocation and dynamic resource adjustment in the
446
+ future will be our next focus.
447
+
448
+ - Machine: H20
449
+ - Model: Qwen3-30B-A3B-Base
450
+ - Rollout length: max_response_length : 8K tokens;
451
+ - Algorithm: GRPO
452
+ - Dataset: TRAIN_FILE: dapo-math-17k.parquet TEST_FILE: aime-2024.parquet
453
+ - Engine: vLLM + Megatron
454
+ - rollout.n: 16
455
+ - ppo_mini_batch_size: 128
456
+ - test_freq: 20
457
+
458
+ - colocate sync:
459
+
460
+ - step:400
461
+ - train_batch_size: 512
462
+
463
+ - fully_async_policy
464
+ - total_rollout_steps: 512\*400
465
+ - trigger_parameter_sync_step: 512/128 = 4
466
+ - staleness_threshold: 0.5
467
+ - partial_rollout: True
468
+
469
+ | Training Mode | Resource Allocation | Step | Gen | Old Log Prob | Ref | Update Actor | Total Time 100 Step | Total Time 200 Step | Total Time 300 Step | Total Time 400 Step | Acc/Mean@1 |
470
+ | ------------------ | ------------------- | ------ | ------ | ------------ | ----- | ------------ | ------------------- | ------------------- | ------------------- | ------------------- | --------------------------- |
471
+ | Colocate Sync | 128 | 497.89 | 348.05 | 28.73 | 20.86 | 86.27 | 13h 36m | 1d 3h 48m | 1d 19h 4m | 2d 11h 39m | max: 0.3500<br>last: 0.3208 |
472
+ | Fully Async Policy | 96:32 | 282.75 | 22.06 | \ | 50.05 | 206.63 | 6h 45m (2.01x) | 14h 48m (1.88x) | 1d 0h 9m (1.78x) | 1d 10h 41m (1.72x) | max: 0.3813<br>last: 0.3448 |
473
+
474
+ > source data: https://wandb.ai/hou-zg-meituan/fully-async-policy-30B?nw=nwuserhouzg
475
+
476
+ ### checkpoint-engine Ablation Experiment
477
+
478
+ We tested the single-step parameter synchronization time of the checkpoint-engine on three models: Qwen2.5-Math-7B, Qwen3-30B-A3B, and Qwen3-235B-A22B, using default checkpoint-engine configurations. All experiments were performed on H20 machines, and the Megatron engine was used for training.
479
+ | model | trainer rank | rollout rank | checkpoint-engine | total sync time |
480
+ |:-----------------:|:--------:|:-------:|:--------------:|:--------------:|
481
+ | Qwen2.5-Math-7B | 4 | 4 | False | 0.12s |
482
+ | Qwen2.5-Math-7B | 4 | 4 | True | 0.02s |
483
+ | Qwen3-30B-A3B | 16 | 16 | False | 15.76s |
484
+ | Qwen3-30B-A3B | 16 | 16 | True | 4.38s |
485
+ | Qwen3-235B-A22B | 64 | 64 | False | 58.57s |
486
+ | Qwen3-235B-A22B | 64 | 64 | True | 23.70s |
487
+
488
+ ### use_trainer_do_validate Experiment
489
+
490
+ We tested the effect of setting `use_trainer_do_validate=True` on the training process. The results show that setting
491
+ this parameter to True can reduce the validation time overhead and trainer node idle time.
492
+ We used Qwen2.5-Math-7B to verify the benefits of `use_trainer_do_validate=True` on the training process; we achieved about a 2x improvement in validation time, and trainer node idle time was reduced by about 40%.
493
+
494
+ * Machine: H20
495
+ * Model: Qwen2.5-Math-7B
496
+ * Rollout length: max_response_length FSDP2: 10K tokens;
497
+ * Algorithm: DAPO
498
+ * Dataset: TRAIN_FILE: dapo-math-17k.parquet TEST_FILE: aime-2024.parquet
499
+ * Engine: vllm+FSDP2
500
+ * rollout.n: 16
501
+ * ppo_mini_batch_size: 32
502
+ * test_freq: 10
503
+
504
+ * fully_async_policy
505
+ * total_rollout_steps: 512*400
506
+ * require_batches: 4
507
+ * trigger_parameter_sync_step: 4
508
+ * staleness_threshold: 0.5
509
+ * partial_rollout: True
510
+
511
+ | training mode | resource allocation | step | gen | old_log_prob | update_actor | validate time | total time<br>50 step | acc/mean@2 |
512
+ |:---------------:|:---------------:|:---------------:|:---------------:|:---------------:|:---------------:|:---------------:|:---------------:|:---------------:|
513
+ | colocate sync | 16 | 484.623 | 52.939 | 0 | 430.263 | 205.080 | 7h9m | 22.6 |
514
+ | fully_async_policy | 8:8 | 489.953 | 52.622 | 0 | 435.874 | 95.699 | 7h2m | 21.0 |
515
+
516
+
517
+ ## Multi-Turn Tool Calling
518
+
519
+ Referencing **recipe/retool** and **ToolAgentLoop**, we implemented **AsyncPartialToolAgentLoop**, a multi-turn
520
+ tool-calling loop that supports partial_rollout for **fully_async_policy**.
521
+
522
+ ### Core Design
523
+
524
+ `AsyncPartialToolAgentLoop` inherits from `ToolAgentLoop` and is adapted for the asynchronous training mode of
525
+ `fully_async_policy`. When `partial_rollout=True`, the Rollouter interrupts ongoing generation tasks before
526
+ synchronizing parameters with the Trainer. `AsyncPartialToolAgentLoop` is capable of:
527
+
528
+ 1. **Interrupting Tasks**: Responding to an interrupt signal to save the current state. Currently, interruptions occur
529
+ during the `GENERATING` process or after other states have completed.
530
+ 2. **Resuming Tasks**: Resuming execution from the saved state after parameter synchronization is complete, rather than
531
+ starting over.
532
+
533
+ ### How to Use
534
+
535
+ RL training with multi-turn tool calling in `fully_async_policy` is similar to `recipe/retool`. It is enabled by
536
+ specifying `multi_turn` configurations in the config file.
537
+
538
+ 1. **SFT Stage**: First, the model should undergo SFT to learn how to follow tool-calling format instructions.
539
+ 2. **Multi-turn Configuration**: In the `fully_async_policy` training configuration, set the following parameters:
540
+ ```yaml
541
+ actor_rollout_ref:
542
+ rollout:
543
+ multi_turn:
544
+ enable: True # AsyncPartialToolAgentLoop will be used by default in fully_async_policy mode
545
+ # Other multi_turn related configurations
546
+ ```
547
+ 3. **Async Parameters**: To improve efficiency, enable `partial_rollout` and `staleness_threshold` when using multi-turn
548
+ tool calling:
549
+ ```yaml
550
+ async_training:
551
+ partial_rollout: True
552
+ staleness_threshold: 0.5
553
+ # Other async parameters
554
+ ```
555
+ 4. **Example**: See `recipe/fully_async_policy/shell/dapo_7b_async_retool.sh`.
556
+
557
+ ### Experimental Results
558
+
559
+ To validate the performance of `fully_async_policy` on multi-turn tool-calling tasks, we compared it with the standard
560
+ `colocate` synchronous mode. Key parameter settings are as follows.
561
+
562
+ - **SFT Model**: Based on `Qwen2.5-7B-Instruct`, trained for 6 epochs on the `ReTool-SFT` dataset
563
+ - **RL Algorithm**: DAPO
564
+ - **Dataset**:
565
+ - Train: `DAPO-Math-17k`
566
+ - Test: `aime_2025`
567
+ - **Resource and Mode Comparison**:
568
+ - `colocate sync`: 32 H20 gpus
569
+ - `fully_async_policy`: 16 gpus for Trainer + 16 gpus for Rollouter
570
+ - **Key Configurations**:
571
+ 1. **Tool Calling Configuration**:
572
+ - `multi_turn.enable: True`
573
+ - `multi_turn.max_user_turns: 16`
574
+ - `multi_turn.max_assistant_turns: 16`
575
+ - `multi_turn.tool_config_path: recipe/retool/sandbox_fusion_tool_config.yaml`
576
+ 2. **`colocate sync` Configuration**:
577
+ - `ppo_mini_batch_size: 16`
578
+ - `train_batch_size: 64`
579
+ 3. **`fully_async_policy` Configuration**:
580
+ - `ppo_mini_batch_size: 16`
581
+ - `trigger_parameter_sync_step: 4`
582
+ - `require_batches: 1`
583
+ - `staleness_threshold: 1`
584
+ - `partial_rollout: True`
585
+
586
+ | training mode | Resource allocation | step | gen | old_log_prob | update_actor | total time<br>100 step | total time<br>200 step | aime_2025<br>acc/mean@30 |
587
+ | :----------------: | :-----------------: | :----: | :----: | :----------: | :----------: | :--------------------: | :--------------------: | :-------------------------: |
588
+ | colocate | 32 | 375.47 | 228.03 | 35.19 | 111.84 | 9h 46m | 22h 28m | start:0.1078<br>last:0.2056 |
589
+ | fully_async_policy | 16: 16 | 221.36 | 40.59 | \ | 179.58 | 6h 19m<br>(1.55x) | 14h 4m<br>(1.60x) | start:0.11<br>last:0.2044 |
590
+
591
+ > source data: https://wandb.ai/hou-zg-meituan/fully-async-policy-multiturn-tool?nw=nwuserhouzg
592
+
593
+ ## Future Plans
594
+ - Transfer queue integration
595
+ - Asynchronous parameter synchronization
code/RL_model/verl/verl_train/docs/advance/grafana_prometheus.md ADDED
@@ -0,0 +1,193 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Use Prometheus and Grafana to Monitor Rollout
2
+
3
+ **Author:** `https://github.com/meituan-search`
4
+
5
+ Last updated: 12/05/2025.
6
+
7
+ Monitor the rollout computation process using Prometheus and Grafana when using verl to enhance system observability and facilitate further performance optimization.
8
+
9
+ We provide an additional training monitoring capability, leveraging Prometheus and Grafana to display rollout information during training and enhance system observability to facilitate further performance optimization.
10
+
11
+ The system automatically configures Prometheus to scrape metrics from rollout servers, eliminating manual configuration steps.
12
+
13
+ ## Overview
14
+
15
+ The figures below show the performance of Qwen235B on the AIME2024 dataset with a response length of 20k, where the emergence of a long-tail problem is clearly observable.
16
+
17
+ ![fully_async_policy_structure](https://github.com/ArronHZG/verl-community/blob/main/docs/grafana_validate.png?raw=true)
18
+
19
+ The following figure presents the fully asynchronous training of the Qwen235B model. Here, resource idleness is distinctly noticeable, indicating that rollout resources can be reduced.
20
+
21
+ ![fully_async_policy_structure](https://github.com/ArronHZG/verl-community/blob/main/docs/grafana_fully_async_train.png?raw=true)
22
+
23
+ Through the above two examples, we also illustrate the necessity of system observability.
24
+
25
+ ## Architecture Overview
26
+
27
+ The overall workflow consists of the following steps:
28
+
29
+ 1. **Multi-node Ray Cluster Setup**: Start Ray cluster across multiple nodes with Grafana and Prometheus information configured in environment variables on the master node
30
+ 2. **Start Grafana Service**: Launch Grafana on the master node for visualization of monitoring dashboards
31
+ 3. **Start Prometheus Service**: Launch Prometheus on the master node for metrics collection and storage
32
+ 4. **verl Async Rollout Mode**: verl uses async rollout mode to obtain rollout server ports and IP addresses
33
+ 5. **Automatic Prometheus Configuration**: verl automatically rewrites the Prometheus configuration to add monitoring for rollout servers and notifies Prometheus to reload the configuration
34
+ 6. **Metrics Collection**: After program execution, metrics can be viewed in Prometheus
35
+ 7. **Dashboard Visualization**: Upload and view monitoring metrics in Grafana dashboards
36
+
37
+ ## Detailed Setup Steps
38
+
39
+ ### Step 1: Environment Variables and Start Ray Cluster
40
+
41
+ First, set the necessary environment variables and start the Ray service.
42
+
43
+ > Reference: [configure-manage-dashboard](https://docs.ray.io/en/latest/cluster/configure-manage-dashboard.html)
44
+
45
+ ```bash
46
+ # Master node environment variables
47
+ export GF_SERVER_HTTP_PORT=3000 # Grafana service default port (customizable)
48
+ export PROMETHEUS_PORT=9090 # Prometheus service default port (customizable)
49
+ export RAY_HEAD_PORT=6379 # Ray master node port (customizable)
50
+ export RAY_DASHBOARD_PORT=8265 # Ray dashboard default port (customizable)
51
+ export GRAFANA_PATHS_DATA=/tmp/grafana # Grafana data storage directory (customizable)
52
+ export RAY_GRAFANA_HOST="http://${master_ip}:${GF_SERVER_HTTP_PORT}" # Ray-associated Grafana address
53
+ export RAY_PROMETHEUS_HOST="http://${master_ip}:${PROMETHEUS_PORT}" # Ray-associated Prometheus address
54
+
55
+ # Start Ray on master node
56
+ ray start --head --port=${RAY_HEAD_PORT} --dashboard-port=${RAY_DASHBOARD_PORT}
57
+
58
+ # Start Ray on worker nodes
59
+ ray start --address={master_addr}:${RAY_HEAD_PORT}
60
+ ```
61
+
62
+ **Verification:** Visit `http://master_ip:8265` to confirm Ray has started successfully.
63
+
64
+ ### Step 2: Start Grafana (Visualization Dashboard)
65
+
66
+ Grafana is used to display metrics collected by Prometheus (such as cache hit rate, throughput, etc.):
67
+
68
+ ```bash
69
+ # Master node
70
+ nohup grafana-server \
71
+ --config /tmp/ray/session_latest/metrics/grafana/grafana.ini \
72
+ --homepath /usr/share/grafana \
73
+ web > grafana.log 2>&1 &
74
+ ```
75
+
76
+ **Verification:** Visit `http://master_ip:3000` to confirm Grafana has started successfully (default credentials: `admin/admin`).
77
+
78
+ If you need to change the port, modify the `GF_SERVER_HTTP_PORT` environment variable, and grafana-server will automatically recognize it.
79
+
80
+ ### Step 3: Start Prometheus (Metrics Collection)
81
+
82
+ Prometheus is responsible for scraping metrics from vLLM services and storing them as time-series data:
83
+
84
+ ```bash
85
+ # Master node
86
+ nohup prometheus \
87
+ --config.file /tmp/ray/session_latest/metrics/prometheus/prometheus.yml \
88
+ --web.enable-lifecycle \
89
+ --web.listen-address=:${PROMETHEUS_PORT} \
90
+ > prometheus.log 2>&1 &
91
+ ```
92
+
93
+ **Verification:** Visit `http://master_ip:9090` to confirm Prometheus service has started successfully.
94
+
95
+ ### Step 4 & 5: Start verl Training
96
+
97
+ Start verl training with the following parameters configured:
98
+
99
+ **Required Configuration:**
100
+
101
+ - `actor_rollout_ref.rollout.mode="async"`
102
+ - `actor_rollout_ref.rollout.disable_log_stats=False`
103
+ - `actor_rollout_ref.rollout.prometheus.enable=True`
104
+
105
+ If use default port, this parameter can be omitted.
106
+
107
+ - `actor_rollout_ref.rollout.prometheus.port=9090`
108
+
109
+ If use default path, this parameter can be omitted.
110
+
111
+ - `actor_rollout_ref.rollout.prometheus.file="/tmp/ray/session_latest/metrics/prometheus/prometheus.yml"`
112
+
113
+ served_model_name uses `model_path.split("/")[-1]` for data statistics by default.
114
+ Users can also customize other aliases:
115
+
116
+ - `actor_rollout_ref.rollout.prometheus.served_model_name="Qwen3-235B"`
117
+
118
+ **Shell Script Example:**
119
+
120
+ ```bash
121
+ WORKING_DIR=${WORKING_DIR:-"${PWD}"}
122
+ RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
123
+
124
+ rollout_mode="async"
125
+ rollout_name="vllm" # Options: sglang or vllm
126
+ if [ "$rollout_mode" = "async" ]; then
127
+ export VLLM_USE_V1=1
128
+ return_raw_chat="True"
129
+ fi
130
+
131
+ # Synchronous training
132
+ ray job submit --no-wait --runtime-env="${RUNTIME_ENV}" \
133
+ --working-dir "${WORKING_DIR}" \
134
+ -- python3 -m verl.trainer.main_ppo \
135
+ data.return_raw_chat=${return_raw_chat} \
136
+ actor_rollout_ref.rollout.name=${rollout_name} \
137
+ actor_rollout_ref.rollout.mode=${rollout_mode} \
138
+ actor_rollout_ref.rollout.disable_log_stats=False \
139
+ actor_rollout_ref.rollout.prometheus.enable=True
140
+ ...
141
+
142
+ # Asynchronous training
143
+ ray job submit --no-wait --runtime-env="${RUNTIME_ENV}" \
144
+ --working-dir "${WORKING_DIR}" \
145
+ -- python3 -m verl.experimental.fully_async_policy.fully_async_main \
146
+ data.return_raw_chat=${return_raw_chat} \
147
+ actor_rollout_ref.rollout.name=${rollout_name} \
148
+ actor_rollout_ref.rollout.mode=${rollout_mode} \
149
+ actor_rollout_ref.rollout.disable_log_stats=False \
150
+ actor_rollout_ref.rollout.prometheus.enable=True
151
+ ...
152
+ ```
153
+
154
+ ### Step 6: View Metrics in Prometheus
155
+
156
+ After task execution, verify that Prometheus is correctly collecting metrics.
157
+
158
+ **Verification:** Visit the Prometheus interface at `http://master_ip:9090` and search for `vllm:` or `sglang:` to
159
+ confirm metrics are being reported correctly.
160
+
161
+ **Troubleshooting:**
162
+
163
+ If no metrics appear:
164
+
165
+ 1. Check logs for `AgentLoopManager` to find the server port
166
+ 2. Visit `http://master_ip:server_port/metrics` to verify server metrics are available
167
+ 3. Confirm that `actor_rollout_ref.rollout.disable_log_stats=False` is set
168
+
169
+ ### Step 7: View Metrics in Grafana
170
+
171
+ After task execution, log in to Grafana to view and customize monitoring dashboards.
172
+
173
+ **Login:** Visit `http://master_ip:3000` (default credentials: `admin/admin`)
174
+
175
+ **Import Dashboard:**
176
+
177
+ 1. Select `Dashboards` → `New` → `Import` → `Upload dashboard JSON file`
178
+ 2. Upload a pre-built dashboard JSON file
179
+
180
+ **Available Dashboards:**
181
+
182
+ - [vLLM Grafana Dashboard style 1](https://github.com/ArronHZG/verl-community/blob/main/docs/grafana/vllm_grafana.json)
183
+ - [vLLM Grafana Dashboard style 2](https://github.com/vllm-project/vllm/blob/main/examples/online_serving/dashboards/grafana/performance_statistics.json)
184
+ - [vLLM Grafana Dashboard style 3](https://github.com/vllm-project/vllm/blob/main/examples/online_serving/dashboards/grafana/query_statistics.json)
185
+ - [SGLang Grafana Dashboard](https://github.com/sgl-project/sglang/blob/main/examples/monitoring/grafana/dashboards/json/sglang-dashboard.json)
186
+
187
+ ## Additional Resources
188
+
189
+ - [Ray Monitoring Documentation](https://docs.ray.io/en/latest/cluster/configure-manage-dashboard.html)
190
+ - [Prometheus Documentation](https://prometheus.io/docs/)
191
+ - [Grafana Documentation](https://grafana.com/docs/)
192
+ - [vLLM GitHub Repository](https://github.com/vllm-project/vllm)
193
+ - [SGLang GitHub Repository](https://github.com/sgl-project/sglang)
code/RL_model/verl/verl_train/docs/advance/megatron_extension.rst ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Add models with the Megatron-LM backend
2
+ =========================================
3
+
4
+ Last updated: 04/25/2025.
5
+
6
+ Model
7
+ -----------
8
+
9
+
10
+ If you use the latest verl, there is direct support of ``GPTModel`` for the Megatron backend.
11
+ You can use the similar way of using Megatron to pretrain custom models.
12
+ We list the steps here:
13
+
14
+ 1. Find `model_initializer.py <https://github.com/volcengine/verl/blob/main/verl/models/mcore/model_initializer.py>`_
15
+ 2. If your model is configurable by ``TransformerLayerSpec`` , you can
16
+ directly use ``GPTModel``. Otherwise, please implement a new
17
+ ``ModelLayerSpec`` and ``ModelLayer`` here.
18
+ 3. Use the right ``LayerSpec`` , ``TransformerConfig`` and ``HuggingfaceConfig``
19
+ as arguments to initialize the GPTModel.
20
+ 4. Return the model at last.
code/RL_model/verl/verl_train/docs/advance/mtp.md ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Guide to Using MTP in SFT/RL Training and Inference
2
+
3
+ **Author**: `https://github.com/meituan-search`
4
+
5
+ Last updated: 01/30/2026
6
+
7
+ # 1. Scope of Support
8
+
9
+ Currently, RL training can be performed on mimo-7B-RL, Qwen-next, and Deepseek series models based on the MTP architecture. The support rules for training and inference engines are as follows:
10
+
11
+ - **Training Engine**: Only supports the `mbridge + megatron` combination; other training engines are not compatible at this time;
12
+
13
+ - **Inference Engine**: Compatible with all engines, but the model must be in the corresponding engine's compatibility list;
14
+
15
+ - **Dependency Versions**:
16
+
17
+ - mbridge: Use the specified branch: [https://github.com/ArronHZG/mbridge/tree/feature/verl_mtp](https://github.com/ArronHZG/mbridge/tree/feature/verl_mtp) (will be merged into the main branch in the future);
18
+
19
+ - megatron: Use the latest dev version (commit: [23e092f41ec8bc659020e401ddac9576c1cfed7e](https://github.com/NVIDIA/Megatron-LM/tree/23e092f41ec8bc659020e401ddac9576c1cfed7e)), which supports MTP + CP training methods.
20
+
21
+ - sglang: Use the specified branch: [https://github.com/ArronHZG/sglang/tree/fix_mtp_update_weights_from_tensor](https://github.com/ArronHZG/sglang/tree/fix_mtp_update_weights_from_tensor), [PR](https://github.com/sgl-project/sglang/pull/17870), which fixes the OOM issue in MTP's update-weights-from-tensor path.
22
+
23
+ # 2. MTP Training Configuration (Core Parameters)
24
+
25
+ The MTP training process can be flexibly controlled through the following configurations. All configurations are based on the `actor_rollout_ref.model.mtp` prefix:
26
+
27
+ | Configuration Scenario | Core Parameters | Description |
28
+ |------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------|
29
+ | Load MTP Parameters Only | `enable=True` | VRAM usage will increase, but the exported parameters include the MTP module and can be directly used for online deployment |
30
+ | Full-Parameter MTP Training | `enable=True`<br>`enable_train=True`<br>`mtp_loss_scaling_factor=0.1` | MTP Loss will apply to all model parameters |
31
+ | MTP Parameter-Only Training | `enable=True`<br>`enable_train=True`<br>`detach_encoder=True` | Freeze the Encoder layer, update only MTP module parameters, MTP Loss applies only to MTP parameters |
32
+ | MTP Accelerated Rollout | 1. vLLM configuration:<br>`enable=True`<br>`enable_rollout=True`<br>`method="mtp"`<br>`num_speculative_tokens=1`<br>2. SGLang configuration:<br>`enable=True`<br>`enable_rollout=True`<br>`speculative_algorithm="EAGLE"`<br>`speculative_num_steps=2`<br>`speculative_eagle_topk=2`<br>`speculative_num_draft_tokens=4` | Achieve inference acceleration during the Rollout phase based on MTP |
33
+
34
+ # 3. Experimental Results
35
+
36
+ The experiment was conducted as follows:
37
+
38
+ * model = mimo-7B-math
39
+ * max_response_length = 8k
40
+
41
+ Experiment chart:
42
+
43
+ ![fully_async_policy_revenue](
44
+ https://github.com/ArronHZG/verl-community/blob/main/docs/mimo-7b-mtp.png?raw=true)
45
+
46
+ The wandb link for the graph: [wandb](https://wandb.ai/hou-zg-meituan/mimo-7b-sft-mtp?nw=nwuserhouzg)
47
+
48
+ **Scenarios with No Significant Effect**
49
+
50
+ The following configurations will not have a noticeable impact on training results:
51
+
52
+ 1. The base model does not carry MTP parameters;
53
+
54
+ 2. The base model carries MTP parameters, but the MTP module is not trained;
55
+
56
+ 3. The base model carries MTP parameters and trains MTP, with `mtp_loss_scaling_factor=0`;
57
+
58
+ 4. The base model carries MTP parameters, trains MTP and detaches the encoder, with `mtp_loss_scaling_factor=0.1`.
59
+
60
+ **Scenarios with Significant Effect**
61
+
62
+ Only the following configuration will have a noticeable impact on training results:
63
+
64
+ - The base model carries MTP parameters, MTP Loss applies to all model parameters, and `mtp_loss_scaling_factor=0.1`.
65
+
66
+ **Recommended Training Method**
67
+
68
+ It is recommended to adopt the `detach_encoder=True` approach for MTP training.
69
+
70
+ # 4. Performance Notes for MTP in Rollout Inference
71
+
72
+ The effectiveness of MTP-accelerated Rollout is significantly affected by **model size** and **inference hardware**. Key reference information is as follows:
73
+
74
+ **Hardware Tensor Core Performance**
75
+
76
+ | Hardware Model | FP16 Performance (TFLOPS) |
77
+ |----------------|---------------------------|
78
+ | H20 | 148 |
79
+ | H800 | 1,671 |
80
+ | H200 | 1,979 |
81
+
82
+ **Measured Performance and Recommendations**
83
+
84
+ Taking the mimo-7B model deployed separately on H20 hardware using SGLang as an example: After enabling MTP speculative decoding, the Rollout throughput decreases by approximately 50%.
85
+
86
+ - Current priority recommendation: Do not enable MTP acceleration during the inference phase for now;
87
+
88
+ - Future planning: Further optimization of the speculative logic in the Rollout phase will be conducted to improve throughput performance.
89
+
90
+ # 5. SFT training
91
+
92
+ The SFT training with MTP is supported, using the same MTP training configuration as RL training.
93
+
94
+ An example configuration for running SFT can be found in `examples/sft/gsm8k/run_mimo_megatron_mtp.sh`
95
+
96
+ **SFT result**
97
+
98
+ The experiment was conducted using following data:
99
+ - model = mimo-7B-math
100
+ - dataset = gsm8k
101
+
102
+ The result: [wandb link](https://wandb.ai/hou-zg-meituan/mimo-7b-sft-mtp?nw=nwuserhouzg)
103
+
104
+ The presence of mtp layer has limited effect on main loss. However, when MTP layer is detached, the mtp_loss converges to a higher value.
105
+
code/RL_model/verl/verl_train/docs/advance/one_step_off.md ADDED
@@ -0,0 +1,319 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Recipe: One Step Off Policy Async Trainer
2
+
3
+ **Author:** `https://github.com/meituan-search`
4
+
5
+ Last updated: 07/17/2025.
6
+
7
+ ## Introduction
8
+
9
+ ### Background
10
+
11
+ The current reinforcement learning training process implemented by verl is synchronous, adhering to the algorithmic
12
+ workflows of established methods like PPO, GRPO, and DAPO. In each step, training samples are generated by the latest
13
+ model, and the model is updated after training completes. While this approach aligns with on-policy reinforcement
14
+ learning and stabilizes RL training, it suffers from severe efficiency issues.
15
+ Model updates must wait for the longest output in the generation phase to complete.
16
+ During the generation of long-tail samples, GPUs remain idle, resulting in significant underutilization.
17
+ The more severe the long-tail problem in sample generation, the lower the overall training efficiency.
18
+ For example, in DAPO 32B training, the Rollout phase accounts for approximately 70% of the total time,
19
+ and increasing resources does not reduce the Rollout duration.
20
+
21
+ ![DAPO 32B Math Performance](https://raw.githubusercontent.com/eric-haibin-lin/verl-community/refs/heads/main/docs/dapo_32b_math.png)
22
+
23
+ > source data: https://wandb.ai/verl-org/DAPO%20Reproduction%20on%20verl/workspace?nw=nwusertongyuxuan361
24
+
25
+ ### Solution
26
+
27
+ We have implemented the **One Step Off Async Trainer** to help alleviate this issue. This approach parallelizes the
28
+ generation and training processes, utilizing samples generated in the previous step for current training.
29
+ It also involves appropriately partitioning resources, allocating dedicated resources for generation while automatically
30
+ assigning the remainder to training. By reducing resources allocated to the generation phase, we mitigate GPU idle time
31
+ during long-tail sample generation. Throughout this process, generation and training parameters maintain a one-step off
32
+ policy.
33
+
34
+ ![One Step Off Policy Diagram](https://raw.githubusercontent.com/eric-haibin-lin/verl-community/refs/heads/main/docs/one_step_off_policy.png)
35
+
36
+ > reference: [AReaL: A Large-Scale Asynchronous Reinforcement Learning System for Language Reasoning](https://arxiv.org/abs/2505.24298)
37
+
38
+ Our core contributions include:
39
+
40
+ 1. **Parallel Generation and Training**:
41
+ Samples for the next batch are asynchronously generated while the current batch is being trained.
42
+
43
+ 2. **Resource Isolation**:
44
+ Unlike `hybrid_engine`, this method requires explicit resource allocation for rollout, with remaining resources
45
+ automatically assigned to training.
46
+
47
+ 3. **NCCL Parameter Synchronization**:
48
+ Employs NCCL communication primitives for seamless parameter transfer between generation and training modules.
49
+
50
+ ### Experimental Results
51
+
52
+ - **Machine Configuration**: 2 nodes with 16 H20 GPUs each
53
+ - Generation: 4 GPUs
54
+ - Training: 12 GPUs
55
+ - **Model**: Qwen2.5-Math-7B
56
+ - **Rollout Configuration**:
57
+ - **Max Response Length**: FSDP2: 20,480 tokens; Megatron: 8,192 tokens
58
+ - **Algorithm**: DAPO
59
+ - **Rollout Engine**: vLLM
60
+
61
+ | training mode | engine | step | gen | wait_prev_gen | generate_sequences | old_log_prob | update_actor | total time | acc/best@32/mean | acc/maj@32/mean |
62
+ | ---------------------- | ------------- | ---- | --- | ------------- | ------------------ | ------------ | ------------ | -------------- | ---------------- | --------------- |
63
+ | colocate sync | VLLM+FSDP2 | 749 | 321 | - | 247 | 88 | 286 | 19h18m | 0.5948 | 0.417 |
64
+ | one-step-overlap async | VLLM+FSDP2 | 520 | - | 45 | 458 | 108 | 337 | 15h34m(+23%) | 0.6165 | 0.494 |
65
+ | colocate sync | VLLM+Megatron | 699 | 207 | - | 162 | 119 | 344 | 18h21m | 0.605 | 0.4217 |
66
+ | one-step-overlap async | VLLM+Megatron | 566 | - | 59 | 501 | 120 | 347 | 13h06m (+40%) | 0.6569 | 0.4038 |
67
+
68
+ - colocate sync: step ≈ gen + old_log_prob + update_actor
69
+ - one-step-overlap async: step ≈ wait_prev_gen + old_log_prob + update_actor
70
+
71
+ ![One Step Off Megatron Performance](https://raw.githubusercontent.com/eric-haibin-lin/verl-community/refs/heads/main/docs/one_step_off_megatron.png)
72
+
73
+ > source data: https://wandb.ai/hou-zg-meituan/one-step-off-policy?nw=nwuserhouzg
74
+
75
+ ## Implementation
76
+
77
+ ### One Step Off Policy Async Pipeline
78
+
79
+ Our implemented **One Step Off Policy Async Pipeline** integrates seamlessly into existing training logic at minimal
80
+ cost,
81
+ eliminating the need for additional sample storage management. The core mechanism uses `async_gen_next_batch`
82
+ for asynchronous rollout generation while maintaining continuous operation during epoch transitions
83
+ via `create_continuous_iterator`.
84
+
85
+ ```python
86
+ # iterator generator, simplify one-step integration of the training process
87
+ def _create_continuous_iterator(self):
88
+ for epoch in range(self.config.trainer.total_epochs):
89
+ iterator = iter(self.train_dataloader)
90
+ for batch_dict in iterator:
91
+ yield epoch, batch_dict
92
+
93
+
94
+ # read next batch samples, parameters sync and launch asyn gen_seq
95
+ def _async_gen_next_batch(self, continuous_iterator):
96
+ # read train_data
97
+ try:
98
+ epoch, batch_dict = next(continuous_iterator)
99
+ except StopIteration:
100
+ return None
101
+ batch = DataProto.from_single_dict(batch_dict)
102
+ gen_batch = batch_pocess(batch)
103
+ # sync weights from actor to rollout
104
+ self.sync_rollout_weights()
105
+ # async generation
106
+ gen_batch_output = self.rollout_wg.async_generate_sequences(gen_batch)
107
+ # future encapsulated
108
+ return GenerationBatchFuture(epoch, batch, gen_batch_output)
109
+
110
+
111
+ continuous_iterator = self._create_continuous_iterator()
112
+ # run rollout first to achieve one-step-off
113
+ batch_data_future = self._async_gen_next_batch(continuous_iterator)
114
+
115
+ while batch_data_future is not None:
116
+ # wait for the gen_seq result from the previous step
117
+ batch = batch_data_future.get()
118
+ # launch the next async call to generate sequences
119
+ batch_data_future = self._async_gen_next_batch(continuous_iterator)
120
+
121
+ # compute advantages
122
+ batch = critic.compute_values(batch)
123
+ batch = reference.compute_log_prob(batch)
124
+ batch = reward.compute_reward(batch)
125
+ batch = compute_advantages(batch)
126
+
127
+ # model update
128
+ critic_metrics = critic.update_critic(batch)
129
+ actor_metrics = actor.update_actor(batch)
130
+ ```
131
+
132
+ ### Parameter Synchronization
133
+
134
+ A key highlight is that our NCCL-based weight updating for the rollout model performs very well.
135
+ Most of the time, the latency is under 300 ms, which is negligible for RLHF.
136
+
137
+ > **sync_rollout_weights**: The time for synchronizing parameters from actor to rollout is extremely fast and can almost
138
+ > be ignored because it is implemented with NCCL.
139
+
140
+ ```python
141
+ class ActorRolloutRefWorker:
142
+ # actor acquires the meta-info of model parameters for parameter sync
143
+ @register(dispatch_mode=Dispatch.ONE_TO_ALL)
144
+ def get_actor_weights_info(self):
145
+ params = self._get_actor_params()
146
+ ret = []
147
+ for key, tensor in params.items():
148
+ ret.append((key, tensor.size(), tensor.dtype))
149
+ self._weights_info = ret
150
+ return ret
151
+
152
+ # rollout sets the meta-info of model parameters for parameter sync
153
+ @register(dispatch_mode=Dispatch.ONE_TO_ALL)
154
+ def set_actor_weights_info(self, weights_info):
155
+ self._weights_info = weights_info
156
+
157
+
158
+ class AsyncRayPPOTrainer(RayPPOTrainer):
159
+ def init_workers(self):
160
+ ...
161
+ # rollout obtains the meta-info of model parameters from the actor for parameter sync
162
+ weights_info = self.actor_wg.get_actor_weights_info()[0]
163
+ self.rollout_wg.set_actor_weights_info(weights_info)
164
+
165
+ # Create an actor-rollout communication group for parameter sync
166
+ self.create_weight_sync_group
167
+ ```
168
+
169
+ ```python
170
+ # The driving process invokes the actor and rollout respectively to create a weight synchronization group based on nccl/hccl.
171
+ def create_weight_sync_group(self):
172
+ master_address = ray.get(self.actor_wg.workers[0]._get_node_ip.remote())
173
+ master_port = ray.get(self.actor_wg.workers[0]._get_free_port.remote())
174
+ world_size = len(self.actor_wg.workers + self.rollout_wg.workers)
175
+ self.actor_wg.create_weight_sync_group(
176
+ master_address,
177
+ master_port,
178
+ 0,
179
+ world_size,
180
+ )
181
+ ray.get(
182
+ self.rollout_wg.create_weight_sync_group(
183
+ master_address,
184
+ master_port,
185
+ len(self.actor_wg.workers),
186
+ world_size,
187
+ )
188
+ )
189
+
190
+ # drive process call the actor and rollout respectively to sync parameters by nccl
191
+ def sync_rollout_weights(self):
192
+ self.actor_wg.sync_rollout_weights()
193
+ ray.get(self.rollout_wg.sync_rollout_weights())
194
+
195
+
196
+ # fsdp model parameter sync
197
+ @register(dispatch_mode=Dispatch.ONE_TO_ALL, blocking=False)
198
+ def sync_rollout_weights(self):
199
+ params = self._get_actor_params() if self._is_actor else None
200
+ if self._is_rollout:
201
+ inference_model = (
202
+ self.rollout.inference_engine.llm_engine.model_executor.driver_worker.worker.model_runner.model
203
+ )
204
+ from verl.utils.vllm.patch import patch_vllm_moe_model_weight_loader
205
+ patch_vllm_moe_model_weight_loader(inference_model)
206
+ # Model parameters are broadcast tensor-by-tensor from actor to rollout
207
+ for key, shape, dtype in self._weights_info:
208
+ tensor = torch.empty(shape, dtype=dtype, device=get_torch_device().current_device())
209
+ if self._is_actor:
210
+ assert key in params
211
+ origin_data = params[key]
212
+ if hasattr(origin_data, "full_tensor"):
213
+ origin_data = origin_data.full_tensor()
214
+ if torch.distributed.get_rank() == 0:
215
+ tensor.copy_(origin_data)
216
+ from ray.util.collective import collective
217
+
218
+ collective.broadcast(tensor, src_rank=0, group_name="actor_rollout")
219
+ if self._is_rollout:
220
+ inference_model.load_weights([(key, tensor)])
221
+ ```
222
+
223
+ ### PPO Correctness
224
+
225
+ To ensure the correctness of the PPO algorithm, we use rollout log_probs for PPO importance sampling.
226
+ For the related algorithm details, please refer to: https://verl.readthedocs.io/en/latest/algo/rollout_corr_math.html
227
+ The default mode is `bypass_ppo_clip`, but other modification strategies can also be explored.
228
+
229
+ ### AgentLoop
230
+
231
+ In the current implementation, we no longer provide SPMD model rollout mode.
232
+ Instead, we have switched to AgentLoop mode, which also supports multi-turn tool calling.
233
+
234
+ ## Usage
235
+
236
+ ### FSDP2 Configuration Example
237
+
238
+ ```shell
239
+ python3 -m verl.experimental.one_step_off_policy.async_main_ppo \
240
+ --config-path=config \
241
+ --config-name='one_step_off_ppo_trainer.yaml' \
242
+ actor_rollout_ref.actor.strategy=fsdp2 \
243
+ # actor and rollout are placed separately
244
+ actor_rollout_ref.hybrid_engine=False \
245
+ # actor and rollout resource
246
+ trainer.nnodes=1 \
247
+ trainer.n_gpus_per_node=6 \
248
+ rollout.nnodes=1 \
249
+ rollout.n_gpus_per_node=2
250
+ ```
251
+
252
+ ### Megatron Configuration Example
253
+
254
+ ```shell
255
+ python3 -m verl.experimental.one_step_off_policy.async_main_ppo \
256
+ --config-path=config \
257
+ --config-name='one_step_off_ppo_megatron_trainer.yaml' \
258
+ actor_rollout_ref.actor.strategy=megatron \
259
+ # actor and rollout are placed separately
260
+ actor_rollout_ref.hybrid_engine=False \
261
+ # actor and rollout resource
262
+ trainer.nnodes=1 \
263
+ trainer.n_gpus_per_node=6 \
264
+ rollout.nnodes=1 \
265
+ rollout.n_gpus_per_node=2
266
+ ```
267
+
268
+ ### Configuration Guidelines
269
+
270
+ 1. **Card Number Relationships**
271
+ Maintain either of these relationships for optimal batch distribution:
272
+
273
+ - `actor_rollout_ref.rollout.n` should be an integer divisor of:
274
+ `trainer.n_gpus_per_node * trainer.nnodes`
275
+ - `actor_rollout_ref.rollout.n * data.train_batch_size` should be evenly divisible by:
276
+ `trainer.n_gpus_per_node * trainer.nnodes`
277
+
278
+ > Rationale: Ensures training samples can be evenly distributed across training GPUs when using partial resources for
279
+ > generation.
280
+
281
+ 2. **Dynamic Resource Tuning**
282
+ Adjust `trainer.nnodes` `trainer.n_gpus_per_node` `rollout.nnodes` `rollout.n_gpus_per_node` based on phase
283
+ durations:
284
+ - **Ideal state**: Rollout and training phases have comparable durations
285
+ - **Diagnostic metrics**:
286
+ - Monitor `wait_prev_gen` duration
287
+ - Analyze `sequence_length` distribution
288
+ - **Adjustment strategy**:
289
+ - High `wait_prev_gen` + uniform sequence lengths → Increase rollout resources
290
+ - High `wait_prev_gen` + long-tail sequences → Optimize stopping criteria (resource increase won't help)
291
+ > **wait_prev_gen**:The time consumed waiting for the previous rollout to end (the part that is not fully
292
+ > overlapped).
293
+ > **Resource Configuration Strategies:**
294
+ - **Resource-constrained scenario**: Optimize resource utilization by adjusting GPU allocation ratios,
295
+ keeping the number of nodes equal to allow training and rollout to share nodes;
296
+ - Configure `trainer.nnodes = rollout.nnodes` with
297
+ `trainer.n_gpus_per_node + rollout.n_gpus_per_node = physical_gpus_per_node`. Control rollout resource
298
+ allocation by adjusting `n_gpus_per_node`.
299
+ - **Resource-abundant scenario**: Optimize performance by adjusting the number of nodes,
300
+ keeping the number of GPUs per node equal to enable independent scaling of training and rollout
301
+ parallelism.
302
+ - Configure `trainer.n_gpus_per_node = rollout.n_gpus_per_node` and control rollout resource allocation by
303
+ adjusting `trainer.nnodes` and `rollout.nnodes`to achieve optimal performance.
304
+ > **Note**: The total number of nodes required by the system is not simply `trainer.nnodes + rollout.nnodes`. The
305
+ > actual calculation depends on GPU capacity:
306
+ >
307
+ > - When `trainer.n_gpus_per_node + rollout.n_gpus_per_node <= physical_gpus_per_node`,
308
+ > the required node count is `max(trainer.nnodes, rollout.nnodes)`
309
+ > - When `trainer.n_gpus_per_node + rollout.n_gpus_per_node > physical_gpus_per_node`,
310
+ > the required node count is `trainer.nnodes + rollout.nnodes`
311
+
312
+ ## Functional Support
313
+
314
+ | Category | Support Situation |
315
+ | ------------------ | --------------------------------------------------------------------------------------------------------------- |
316
+ | train engine | FSDP2 <br/> Megatron |
317
+ | rollout engine | vLLM |
318
+ | AdvantageEstimator | GRPO <br/> GRPO_PASSK <br/> REINFORCE_PLUS_PLUS <br/> RLOO <br/> OPO <br/> REINFORCE_PLUS_PLUS_BASELINE<br/>GPG |
319
+ | Reward | all |
code/RL_model/verl/verl_train/docs/advance/placement.rst ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Ray API Design Tutorial
2
+ =======================================
3
+
4
+ Last updated: 10/30/2024.
5
+
6
+ We provide a tutorial for our Ray API design, including:
7
+
8
+ - Ray basic concepts
9
+ - Resource Pool and RayWorkerGroup
10
+ - Data Dispatch, Execution and Collection
11
+ - Initialize the RayWorkerGroup and execute the distributed computation in the given Resource Pool
12
+
13
+ See details in `tutorial.ipynb <https://github.com/volcengine/verl/blob/main/examples/ray/tutorial.ipynb>`_.
code/RL_model/verl/verl_train/docs/advance/ppo_lora.rst ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ RL(HF) algorithms with LoRA Support
2
+ ===========================================
3
+
4
+ Last updated: 12/17/2025.
5
+
6
+ We support LoRA (Low-Rank Adaptation) for reinforcement learning algorithms such as PPO, GRPO, and others.
7
+
8
+ LoRA is a parameter-efficient fine-tuning technique that injects trainable low-rank matrices into pre-trained weights (typically linear layers). This reduces memory footprint and compute cost, making it possible to fine-tune large models with limited hardware.
9
+
10
+ The benefits this brings include:
11
+
12
+ - reinforcement learning with very large models (e.g. 70B+) with modest hardware (e.g. 8x80G GPUs),
13
+ - enable larger batch sizes due to reduced memory usage,
14
+ - simplify model transfer and deployment, as only LoRA adapters need to be saved,
15
+ - Combine with techniques like `SLoRA <https://arxiv.org/abs/2311.03285>`_ or `CCoE <https://arxiv.org/abs/2407.11686>`_ to serve multiple LoRA adapters efficiently
16
+
17
+ This guide explains how to enable LoRA in RL training and configure related parameters.
18
+
19
+ FSDP Backend Usage Guide
20
+ ------------------------
21
+
22
+ .. note::
23
+
24
+ This section applies to **FSDP/FSDP2 backend only**. For Megatron backend, see the :ref:`megatron-lora` section below.
25
+
26
+ 1. Lora is available in the `verl.trainer.ppo.ray_trainer.RayPPOTrainer`. Examples are provided via the `verl.trainer.main_ppo` entry point.
27
+
28
+ 2. Currently, LoRA is supported via huggingface peft, only with fsdp/fsdp2 and vllm backend (sglang support coming soon).
29
+
30
+ - `strategy=fsdp` or `strategy=fsdp2`
31
+ - `rollout.name=vllm`
32
+
33
+ 3. Required configurations for LoRA:
34
+
35
+ - `actor_rollout_ref.model.lora_rank`: int, set to a reasonable value greater than 0 (e.g., 8, 16, 32, 64)
36
+ - `actor_rollout_ref.model.lora_alpha`: float, the alpha term in LoRA
37
+ - `actor_rollout_ref.rollout.load_format="safetensors"`: required. This enables vLLM to load the base model.
38
+ - `actor_rollout_ref.model.target_modules`: the target modules for LoRA. Typically set to "all-linear".
39
+
40
+ 4. Optional configurations for LoRA:
41
+
42
+ - `actor_rollout_ref.model.lora_adapter_path`: string, path to a pretrained LoRA adapter directory.
43
+ If provided, loads the existing adapter instead of creating a new one. Enables multi-stage training from previously saved adapters.
44
+ The directory must contain `adapter_model.safetensors` and `adapter_config.json`.
45
+
46
+ 5. Recommend options:
47
+
48
+ - `actor_rollout_ref.model.use_shm=True`: preload the model into `/dev/shm` to improve model loading speed.
49
+ - `actor_rollout_ref.rollout.layered_summon=True`: this enables the actor-model to gather the FSDP shards per layers when synchronizing the LoRA Adapter to vLLM, thereby reducing GPU peak memory. Recommended if the model is very large (70B+) or the GPU memory is limited (< 48GB)
50
+
51
+ .. _megatron-lora:
52
+
53
+ Megatron Backend Usage Guide
54
+ ----------------------------
55
+
56
+ .. warning::
57
+
58
+ The FSDP-specific config options are **NOT applicable** to Megatron backend, and they will be ignored if set. Only options listed under ``lora`` key are applicable:
59
+
60
+ - ``actor_rollout_ref.model.lora.*``
61
+ - ``critic.model.lora.*``
62
+
63
+ You need to install and enable Megatron-Bridge for Megatron LoRA support.
64
+
65
+ Make sure you use a Megatron-Bridge version later than 0.2.0; we recommend using `this commit <https://github.com/NVIDIA-NeMo/Megatron-Bridge/commit/83a7c1134c562d8c6decd10a1f0a6e6a7a8a3a44>`_ or later for proper support, and use the following settings to enable Megatron-Bridge:
66
+
67
+ - ``actor_rollout_ref.actor.megatron.use_mbridge=True``
68
+ - ``actor_rollout_ref.actor.megatron.vanilla_mbridge=False``
69
+
70
+ **Key Differences from FSDP LoRA:**
71
+
72
+ 1. **LoRA Implementation**: Verl Megatron backend uses Megatron-Bridge's native LoRA implementation, which differs from HuggingFace PEFT.
73
+
74
+ 2. **Weight Sync / Refit Mechanism**: Currently, Megatron-Bridge can support syncing weights by either merging LoRA adapters into the base model weights before transferring to vLLM (for better inference speed but more refit time and potential precision loss), as well as loading separate adapters.
75
+
76
+ **Configuration for Megatron LoRA:**
77
+
78
+ .. code-block:: yaml
79
+
80
+ actor_rollout_ref:
81
+ model:
82
+ lora:
83
+ # LoRA type: "lora", "vlm_lora", "canonical_lora", or "dora"
84
+ type: lora
85
+
86
+ # whether to sync weights / refit by either merging LoRA adapters into the base model weights before transferring to vLLM (for better inference speed but more refit time and potential precision loss). If this is False, it will load separate adapters.
87
+ merge: False
88
+
89
+ # LoRA rank (Dimension of the low-rank projection space.). Set to 0 to disable LoRA
90
+ rank: 0
91
+
92
+ # Weighting factor for the low-rank projection. Defaults to 32
93
+ alpha: 32
94
+
95
+ # Dropout rate for the low-rank projection. Defaults to 0.0
96
+ dropout: 0.0
97
+
98
+ # A list of module names to apply LoRA to.
99
+ # For fused LoRA, Defaults to all linear layers ['linear_qkv', 'linear_proj', 'linear_fc1', 'linear_fc2'].
100
+ # For canonical LoRA: ["linear_q", "linear_k", "linear_v", "linear_proj", "linear_fc1_up", "linear_fc1_gate", "linear_fc2"]
101
+ # - 'linear_qkv': Apply LoRA to the fused linear layer used for query, key, and value projections in self-attention
102
+ # - 'linear_proj': Apply LoRA to the linear layer used for projecting the output of self-attention
103
+ # - 'linear_fc1': Apply LoRA to the first fully-connected layer in MLP
104
+ # - 'linear_fc2': Apply LoRA to the second fully-connected layer in MLP
105
+ # Target modules can also contain wildcards. For example, you can specify
106
+ # target_modules=['*.layers.0.*.linear_qkv', '*.layers.1.*.linear_qkv'] to add LoRA to only linear_qkv on the first two layers
107
+ #
108
+ # Note:
109
+ # For MLA (e.g., DeepSeek), you should use ["linear_kv_down_proj","linear_kv_up_proj","linear_q_down_proj","linear_q_up_proj","linear_q_proj"]
110
+ # Instead of "linear_qkv" or ["linear_q","linear_k","linear_v"]
111
+ # By default, MoE routers are excluded from LoRA adaptation, and you will need to specify "router" in target_modules to include them.
112
+ target_modules:
113
+ - linear_qkv
114
+ - linear_proj
115
+ - linear_fc1
116
+ - linear_fc2
117
+
118
+ # A list of module names not to apply LoRA to. It will match all nn.Linear & nn.Linear-adjacent modules whose name
119
+ # does not match any string in exclude_modules. If used, will require target_modules to be empty list or None
120
+ exclude_modules: []
121
+
122
+ # Position for applying dropout, can be 'pre' (before the low-rank projection) or 'post' (after). Defaults to 'pre'
123
+ dropout_position: pre
124
+
125
+ # Initialization method for the low-rank matrix A. Defaults to "xavier".
126
+ lora_A_init_method: xavier
127
+
128
+ # Initialization method for the low-rank matrix B. Defaults to "zero".
129
+ lora_B_init_method: zero
130
+
131
+ # Enables the experimental All-to-All (A2A) communication strategy. Defaults to False
132
+ a2a_experimental: False
133
+
134
+ # Parameter data type for LoRA weights. Default to null, which will use model's dtype.
135
+ dtype: null
136
+
137
+ # Path to pre-trained LoRA adapter weights (null to train from scratch)
138
+ adapter_path: null
139
+
140
+ # VLMLoRA additionally allows the user to specify whether the language or vision models should be frozen.
141
+ # For example, a common finetuning workload for multimodal models is to apply adapters to language model and fully
142
+ # finetune the vision model.
143
+ freeze_vision_model: True
144
+ freeze_vision_projection: True
145
+ freeze_language_model: True
146
+
147
+ LoRA training experiment with Qwen3-8B on 8 * H200 single node comparing FSDP and Megatron backend (script adapted from examples/grpo_trainer/run_qwen2-7b_math_megatron_lora.sh):
148
+
149
+ .. image:: https://github.com/user-attachments/assets/0482f423-01a3-4e52-a7ee-8b9cd79b7b1a
150
+ .. image:: https://github.com/user-attachments/assets/6ce10400-8164-47d8-90a6-c1bf002fb9e8
151
+ .. image:: https://github.com/user-attachments/assets/092d3a43-4eba-425e-a584-8d83c1f02de4
152
+
153
+
154
+ Best Practices and Notes
155
+ -------------------------
156
+
157
+ 1. **Learning rate**: it is recommended to increase the value of learning rate by an order of magnitude.
158
+
159
+ 2. **LoRA Rank**:
160
+
161
+ - Too small a rank can hurt convergence.
162
+ - LoRA rank recommendation from @thelongestusernameofall:
163
+
164
+ - A very small lora_rank can lead to slower convergence or worse training performance. It is recommended to set lora_rank to be >= 32. Tests have shown that for a 0.5B model, with lora_rank=32, the training convergence speed and final performance are almost identical to non-LoRA training
165
+ - For a 32B model, with lora_rank=128, the training convergence speed and final performance are also almost identical to non-LoRA training.
166
+ - More comprehensive reference results are coming soon.
167
+
168
+ .. image:: https://github.com/eric-haibin-lin/verl-community/blob/f2b80b8b26829124dd393b7a795a0640eff11644/docs/lora.jpg?raw=true
169
+
170
+ 3. **FSDP-Specific:** Reference configuration for RL training with the Qwen2.5-72B model using 8 x 80GB GPUs (increase lora_rank if needed):
171
+
172
+ .. code-block::
173
+
174
+ data.train_batch_size=64 \
175
+ actor_rollout_ref.model.use_shm=True \
176
+ actor_rollout_ref.model.lora_rank=32 \
177
+ actor_rollout_ref.model.lora_alpha=32 \
178
+ actor_rollout_ref.model.target_modules=all-linear \
179
+ actor_rollout_ref.actor.optim.lr=3e-5 \
180
+ actor_rollout_ref.actor.fsdp_config.fsdp_size=8 \
181
+ actor_rollout_ref.actor.fsdp_config.param_offload=True \
182
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
183
+ actor_rollout_ref.rollout.tensor_model_parallel_size=8 \
184
+ actor_rollout_ref.rollout.name=vllm \
185
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
186
+ actor_rollout_ref.rollout.n=5 \
187
+ actor_rollout_ref.rollout.max_num_seqs=64 \
188
+ actor_rollout_ref.rollout.max_model_len=1536 \
189
+ actor_rollout_ref.rollout.max_num_batched_tokens=1536 \
190
+ actor_rollout_ref.rollout.load_format=safetensors \
191
+ actor_rollout_ref.rollout.layered_summon=True \
192
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
193
+ actor_rollout_ref.actor.ulysses_sequence_parallel_size=1 \
194
+
195
+ Example Scripts
196
+ -------------------
197
+
198
+ For end-to-end examples, refer to the scripts below:
199
+
200
+ **FSDP Examples:**
201
+
202
+ - LoRA training from scratch: examples/grpo_trainer/run_qwen2_5-3b_gsm8k_grpo_lora.sh
203
+ - LoRA training from adapter path: examples/grpo_trainer/run_qwen2_5-3b_gsm8k_grpo_lora_from_adapter.sh
204
+
205
+ **Megatron Examples:**
206
+
207
+ - LoRA training with Dense: examples/grpo_trainer/run_qwen2-7b_math_megatron_lora.sh
208
+ - LoRA training with MoE: examples/grpo_trainer/run_qwen3moe-30b_megatron_lora.sh
code/RL_model/verl/verl_train/docs/advance/reward_loop.rst ADDED
@@ -0,0 +1,301 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Reward Loop
2
+ ===========
3
+
4
+ .. _yyding: https://yyding1.github.io
5
+
6
+ Author: `Yuyang Ding <https://yyding1.github.io>`_
7
+
8
+ Last updated: 12/20/2025.
9
+
10
+ .. warning::
11
+ Reward Loop is ready for use, but the API may change in future releases.
12
+ Users can set ``reward_model.use_reward_loop=True`` or ``False`` to control whether to enable the reward loop.
13
+
14
+ Reward Loop is designed to support flexible and user-friendly reward computation, with most of the implementation in ``verl/experimental/reward_loop``.
15
+
16
+ Compared with the previous reward mechanism, the Reward Loop offers the following key features:
17
+
18
+ 1. provides a more flexible and user-friendly design for reward-model settings, enabling hybrid reward scenarios where multiple reward sources can be seamlessly integrated.
19
+ 2. implements asynchronous reward computation instead of the previous batch-based computation, improving efficiency for both rule-based rewards and reward-model-based scenarios.
20
+
21
+ Hybrid Reward Scenarios
22
+ -----------------------
23
+
24
+ Reward Loop covers all typical reward-computation scenarios.
25
+
26
+ - **Rule-based Reward**: The reward is determined by predefined rules, e.g., checking whether the predicted answer matches the ground truth via simple string matching.
27
+ - **Discriminative Reward Model (DisRM)**: The reward is produced by a specified discriminative reward model, such as ``Skywork/Skywork-Reward-Llama-3.1-8B-v0.2``.
28
+ - **Generative Reward Model (GenRM)**: The reward is obtained using a generative reward model, for example ``dyyyyyyyy/FAPO-GenRM-4B``.
29
+ - **Hybrid Reward Scenarios**: Reward Loop provides interfaces for plugging in reward models, allowing users to define custom reward logic based on their needs (e.g., combining rule-based methods with GenRM).
30
+
31
+ Rule-based Reward
32
+ ~~~~~~~~~~~~~~~~~
33
+
34
+ If ``custom_reward_function`` is not provided, the reward loop will fall back to the default rule-based reward function.
35
+ Otherwise, only the user-defined reward function will be used. The files under ``verl/utils/reward_score/`` provide some examples.
36
+
37
+ Reward Loop supports both synchronous and asynchronous user-defined reward functions. It automatically detects the function type and executes it accordingly, ensuring that reward computation remains non-blocking and efficient.
38
+
39
+ Discriminative Reward Model (DisRM)
40
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
41
+
42
+ For scenarios involving a discriminative reward model, users should provide ``reward_model.model.path`` to specify the reward model.
43
+
44
+ The Reward Loop will pass the question and the model rollout as inputs to the reward model and obtain a reward score from its output.
45
+
46
+ Generative Reward Model (GenRM)
47
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
48
+
49
+ For generative reward model scenarios, users need to specify both ``reward_model.model.path`` and ``custom_reward_function``.
50
+
51
+ The custom reward function should implement the following components:
52
+
53
+ - Convert the question and the model rollout into a GenRM input prompt using a custom prompt template.
54
+ - Invoke the GenRM to perform generation with custom sampling parameters. For this purpose, the Reward Loop provides an HTTP interface (i.e., ``reward_router_address``) for interacting with GenRM.
55
+ - Parse the GenRM output using a custom parser and extract the reward score.
56
+
57
+ As these steps are highly customizable and task-dependent, we offer this flexibility entirely to the user-defined reward function.
58
+
59
+ Below we provide an example of a custom reward function using GenRM.
60
+
61
+ .. code:: python
62
+
63
+ async def compute_score_gsm8k(
64
+ data_source: str,
65
+ solution_str: str,
66
+ ground_truth: str,
67
+ extra_info: dict,
68
+ reward_router_address: str, # an HTTP router endpoint provided by Reward Loop
69
+ reward_model_tokenizer: PreTrainedTokenizer,
70
+ ):
71
+ """Compute the reward score."""
72
+
73
+ # Step 1: Prepare prompt and request payload
74
+ grm_prompt = GRM_PROMPT_TEMPLATE.format(problem=extra_info["question"], solution=solution_str)
75
+ messages = [{"role": "user", "content": grm_prompt}]
76
+ sampling_params = {"temperature": 0.7, "top_p": 0.8, "max_tokens": 4096}
77
+ chat_complete_request = {"messages": messages, **sampling_params}
78
+
79
+ # Step 2: Send async request to the reward model
80
+ # here, chat_complete sends async http request to the router address
81
+ result = await chat_complete(
82
+ router_address=reward_router_address,
83
+ chat_complete_request=chat_complete_request,
84
+ )
85
+
86
+ # Step 3: Parse model response and extract score
87
+ grm_response = result.choices[0].message.content.strip()
88
+ try:
89
+ score_str = grm_response.split("\n\n")[-1].strip()
90
+ score = int(score_str)
91
+ except Exception:
92
+ score = 0
93
+
94
+ return {"score": score}
95
+
96
+ Hybrid Reward Scenarios
97
+ ~~~~~~~~~~~~~~~~~~~~~~~
98
+
99
+ For more complex application settings, such as combining rule-based rewards with GenRM, or mixing rule-based rewards with DisRM, users can also achieve this by specifying the ``reward_model.model.path`` together with the ``custom_reward_function``.
100
+ The implementation of the customized reward function follows the same pattern as illustrated above.
101
+
102
+ A runnable and reproducible example that demonstrates how to use a rule-based reward function together with a GenRM is provided in the ``recipe/fapo`` directory for reference. You are welcome to use and cite it.
103
+
104
+ Architecture Design
105
+ -------------------
106
+
107
+ Reward Loop supports multiple execution modes for reward training:
108
+
109
+ - **Colocate Mode**: The reward model shares the same resource pool as the actor/rollout/reference models. In this setup, all rollouts must complete first, after which the reward model is awakened to perform inference.
110
+ - **Standalone Mode**: The reward model runs on a separate resource pool, independent from the actor/rollout/reference models. In this setup, each sample is evaluated by the reward model immediately after its rollout finishes.
111
+
112
+ .. image:: https://github.com/yyDing1/verl-materials/blob/main/reward_loop.svg?raw=true
113
+
114
+ RewardLoopWorker
115
+ ~~~~~~~~~~~~~~~~~
116
+
117
+ The ``RewardLoopWorker`` is responsible for handling batch-level reward computation, operating in an asynchronous manner.
118
+
119
+ .. image:: https://github.com/yyDing1/verl-materials/blob/main/reward_loop_worker.svg?raw=true
120
+
121
+ For each sample, the reward is computed according to the following logic:
122
+
123
+ - if ``custom_reward_function`` is provided, we directly use user-customized reward function
124
+ - if ``custom_reward_function`` is not provided:
125
+ - **reward model is not enabled**: use default rule-based reward function
126
+ - **reward model is discriminative**: compute reward score using disrm
127
+ - **reward model is generative**: this is not permitted (user-customized reward func **must be** provided)
128
+
129
+ In most cases, we encourage users to define and use their own customized reward functions.
130
+
131
+ ``RewardLoopWorker`` will initialize a ``RewardManager`` via ``_init_reward_fn()``.
132
+ Then the batch reward computation request of ``compute_score_batch`` will be processed asynchronously.
133
+
134
+ .. code:: python
135
+
136
+ @ray.remote
137
+ class RewardLoopWorker:
138
+ def __init__(self, config: DictConfig, reward_router_address: str = None):
139
+ self.config = config
140
+ self.reward_router_address = reward_router_address
141
+ self._init_reward_fn()
142
+
143
+ def _init_reward_fn(self):
144
+ input_tokenizer_local_path = copy_to_local(self.config.actor_rollout_ref.model.path)
145
+ self.input_tokenizer = hf_tokenizer(input_tokenizer_local_path, trust_remote_code=True)
146
+ self.reward_model_tokenizer = None
147
+ if self.config.reward_model.enable:
148
+ reward_model_tokenizer_local_path = copy_to_local(self.config.reward_model.model.path)
149
+ self.reward_model_tokenizer = hf_tokenizer(reward_model_tokenizer_local_path, trust_remote_code=True)
150
+ self.reward_fn = get_custom_reward_fn(self.config)
151
+ reward_manager_cls = get_reward_manager_cls(self.config.reward_model.reward_manager)
152
+ self.reward_loop = reward_manager_cls(
153
+ self.config, self.input_tokenizer, self.reward_fn, self.reward_router_address, self.reward_model_tokenizer
154
+ )
155
+
156
+ async def compute_score_batch(self, data: DataProto) -> list[dict]:
157
+ tasks = []
158
+ for i in range(len(data)):
159
+ tasks.append(asyncio.create_task(self.compute_score(data[i : i + 1])))
160
+ outputs = await asyncio.gather(*tasks)
161
+ return outputs
162
+
163
+ async def compute_score(self, data: DataProto) -> dict:
164
+ assert len(data) == 1, "RewardLoopWorker only support single data item"
165
+ if self.config.custom_reward_function.path is not None:
166
+ # directly use user-customized reward function
167
+ return await self.reward_loop.run_single(data)
168
+ else:
169
+ if self.config.reward_model.enable:
170
+ # we assume the rm is disrm
171
+ # genrm must set custom_reward_function
172
+ return await self.compute_score_disrm(data)
173
+ else:
174
+ return await self.reward_loop.run_single(data)
175
+
176
+ RewardManager
177
+ ~~~~~~~~~~~~~
178
+
179
+ Reward Loop refactors the previous reward manager, which processed rewards sequentially on batched inputs.
180
+ Instead, the Reward Loop performs reward computation asynchronously and in parallel at the per-sample level.
181
+
182
+ In the ``RewardManager`` of Reward Loop, we implement a ``run_single`` function to compute the score for a single sample. All the reward functions are executed by ``compute_score_fn``. The input should be a ``DataProto`` containing only one item.
183
+
184
+ .. code:: python
185
+
186
+ @register("naive")
187
+ class NaiveRewardManager(RewardManagerBase):
188
+ async def run_single(self, data: DataProto) -> dict:
189
+ assert len(data) == 1, "Only support single data item"
190
+ ...
191
+
192
+ Commonly used reward managers, such as ``DAPORewardManager``, have been implemented in the reward loop.
193
+ In addition, ``RateLimitRewardManager`` is also available for external API-based reward computation scenarios like ChatGPT.
194
+
195
+ Users can also customize their own ``RewardManager``, by adding the ``@register`` decorator, inheriting from ``RewardManagerBase``, and implementing the ``run_single`` function.
196
+ See ``verl/experimental/reward_manager/*`` for reference.
197
+
198
+ .. code:: python
199
+
200
+ @register("user_customized")
201
+ class UserCustomizedRewardManager(RewardManagerBase):
202
+ async def run_single(self, data: DataProto) -> dict:
203
+ assert len(data) == 1, "Only support single data item"
204
+ # your own reward manager
205
+ ...
206
+
207
+ After defining it, users can specify their custom reward manager by setting ``reward_model.reward_manager=user_customized``.
208
+
209
+ RewardLoopManager
210
+ ~~~~~~~~~~~~~~~~~
211
+
212
+ To enable parallel reward computation, the Reward Loop launches multiple reward workers that handle reward computation requests concurrently.
213
+
214
+ In **standalone mode**, we directly launch one ``RewardLoopWorker`` for each ``AgentLoopWorker`` to handle reward computation independently.
215
+
216
+ In **colocate mode**, we launch a ``RewardLoopManager`` to
217
+
218
+ 1. launch reward model if enabled
219
+ 2. manage multiple ``RewardLoopWorker`` instances to parallelize reward computation.
220
+
221
+ Users can specify the number of workers by setting ``reward_model.num_workers`` in colocate mode.
222
+
223
+ .. code:: python
224
+
225
+ class RewardLoopManager:
226
+ """
227
+ RewardLoopManager run in single controller.
228
+ This class will create reward loop workers and manage them.
229
+ RewardLoopManager will deprecate fsdp/megatron RewardModelWorker in the future.
230
+ """
231
+ def __init__(self, config: DictConfig, rm_resource_pool: RayResourcePool = None):
232
+ self.config = config
233
+ if self.config.reward_model.enable:
234
+ self.reward_model_manager = RewardModelManager(config.reward_model, rm_resource_pool)
235
+ self.reward_router_address = self.reward_model_manager.get_router_address()
236
+ else:
237
+ self.reward_model_manager = None
238
+ self.reward_router_address = None
239
+
240
+ self._init_reward_loop_workers()
241
+
242
+ def _init_reward_loop_workers(self):
243
+ self.reward_loop_workers = []
244
+ num_workers = self.config.reward_model.get("num_workers", 1)
245
+ node_ids = [node["NodeID"] for node in ray.nodes() if node["Alive"] and node["Resources"].get("CPU", 0) > 0]
246
+
247
+ for i in range(num_workers):
248
+ # Round-robin scheduling over the all nodes
249
+ node_id = node_ids[i % len(node_ids)]
250
+ self.reward_loop_workers.append(
251
+ RewardLoopWorker.options(
252
+ name=f"reward_loop_worker_{i}",
253
+ scheduling_strategy=ray.util.scheduling_strategies.NodeAffinitySchedulingStrategy(
254
+ node_id=node_id,
255
+ soft=True,
256
+ ),
257
+ ).remote(self.config, self.reward_router_address)
258
+ )
259
+
260
+ def compute_rm_score(self, data: DataProto) -> DataProto:
261
+ """
262
+ Compute reward score for the given data.
263
+ """
264
+ ...
265
+
266
+
267
+ RewardModelManager
268
+ ~~~~~~~~~~~~~~~~~~
269
+
270
+ To support flexible and scalable reward model computation, Reward Loop implements a reward router that coordinates requests among multiple reward model servers.
271
+
272
+ Each reward model runs as an independent server and is registered with the router.
273
+ This router will forward the requests to the registered reward servers with load balancing and return the results.
274
+ This design allows us to expose a single unified router address to user-defined reward functions, enabling them to access various reward models seamlessly through the same interface.
275
+
276
+ .. image:: https://github.com/yyDing1/verl-materials/blob/main/reward_loop_full.svg?raw=true
277
+
278
+ .. code:: python
279
+
280
+ class RewardModelManager:
281
+ """Reward model manager."""
282
+
283
+ def __init__(
284
+ self,
285
+ config: RewardModelConfig,
286
+ resource_pool: RayResourcePool = None,
287
+ ):
288
+ """
289
+ Initialize the reward model manager.
290
+
291
+ Args:
292
+ config (RewardModelConfig): Reward model configuration.
293
+ resource_pool (RayResourcePool, optional): Resource pool. Defaults to None.
294
+ """
295
+ self.config = config
296
+ self.resource_pool = resource_pool
297
+ self._initialize_llm_servers()
298
+ self._initialize_router()
299
+ assert self.config.rollout.skip_tokenizer_init is False, "Reward model should not skip tokenizer init."
300
+ if self.config.rollout.free_cache_engine:
301
+ self.sleep()
code/RL_model/verl/verl_train/docs/advance/rollout_skip.rst ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ RolloutSkip Function Usage Documentation
2
+ ========================================
3
+
4
+ Last updated: 08/01/2025.
5
+
6
+ Applicable Scenarios
7
+ --------------------
8
+
9
+ The RolloutSkip functionality is designed to accelerate the rollout process in reinforcement learning training by caching and reusing previously generated sequences. This feature is particularly useful when:
10
+
11
+ 1. You need to repeatedly run experiments with the same configuration
12
+
13
+ 2. You want to save time by avoiding redundant sequence generation to come close to the optimal policy
14
+
15
+
16
+ API and Usage Example
17
+ ----------------------
18
+
19
+ 2.1 Trainer Adaptation
20
+ ~~~~~~~~~~~~~~~~~~~~~~
21
+
22
+ Both `RayDAPOTrainer()` (in `verl/recipe/dapo/dapo_ray_trainer.py`) and `RayPPOTrainer()` (in `verl/trainer/ppo/ray_trainer.py`) have already been adapted.
23
+
24
+ This is an example of how to patch rollout_skip in RayPPOTrainer.
25
+
26
+ .. code-block:: python
27
+
28
+ #* Import the RolloutSkip class
29
+ from verl.utils.rollout_skip import RolloutSkip
30
+
31
+ ...
32
+ class RayPPOTrainer:
33
+ ...
34
+ def fit(self):
35
+ ...
36
+
37
+ #* Add code as follow:
38
+ rollout_skip = RolloutSkip(self.config, self.actor_rollout_wg)
39
+ rollout_skip.wrap_generate_sequences()
40
+
41
+ ...
42
+
43
+ for epoch in range(self.config.trainer.total_epochs):
44
+ for batch_dict in self.train_dataloader:
45
+ ...
46
+
47
+ 2.2 Basic Configuration
48
+ ~~~~~~~~~~~~~~~~~~~~~~~
49
+
50
+ Then, you should add the following parameters to your config to enable the RolloutSkip feature:
51
+
52
+ .. code-block:: bash
53
+
54
+ actor_rollout_ref.rollout.skip_rollout=True \
55
+ actor_rollout_ref.rollout.skip_dump_dir="/tmp/rollout_dump" \
56
+
57
+
58
+ Note:
59
+
60
+ 1. The `skip_dump_dir` is the directory where the cached sequences will be stored. Ensure that this directory is writable and accessible by your training process. Also make sure that `skip_dump_dir` is not a relative path, because Ray will store the data in `/tmp/ray/session_<session_id>/` and a relative path will not be found in the worker.
61
+ 2. The dumped data path follows this naming pattern `{experiment_name}_{project_name}_TrainGBS{train_gbs}__InferGBS{gen_gbs}__N{n}`, once you change the `experiment_name`, `project_name`, `train_gbs`, `gen_gbs`, or `n`, the cached data will be stored in a new directory.
code/RL_model/verl/verl_train/docs/advance/rollout_trace.rst ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Trace Function Usage Instructions
2
+ ========================================
3
+
4
+ Last updated: 07/10/2025.
5
+
6
+ Applicable Scenarios
7
+ --------------------
8
+
9
+ Agentic RL involves multiple turns of conversation, tool invocations, and user interactions during the rollout process. During model training, it is necessary to track function calls, inputs, and outputs to understand how data flows through the application. By recording the inputs, outputs, and corresponding timestamps of function calls, the Trace feature lets you see, in complex multi-turn conversations, how data is transformed at each interaction and the entire process leading to the final output. This helps in understanding the details of how the model processes data and in optimizing the training results.
10
+
11
+ The Trace feature integrates commonly used Agent trace tools, including wandb weave and mlflow, which are already supported. Users can choose the appropriate trace tool according to their own needs and preferences. Here, we introduce the usage of each tool.
12
+
13
+
14
+ Trace Parameter Configuration
15
+ -----------------------------
16
+
17
+ - ``actor_rollout_ref.rollout.trace.backend=mlflow|weave`` # the trace backend type
18
+ - ``actor_rollout_ref.rollout.trace.token2text=True`` # To show decoded text in trace view
19
+ - ``actor_rollout_ref.rollout.trace.max_samples_per_step_per_worker=N`` # Limit traces per worker (optional)
20
+
21
+ Limiting Trace Volume
22
+ ~~~~~~~~~~~~~~~~~~~~~~
23
+
24
+ By default, all samples are traced, which can generate large amounts of data and incur significant costs with trace backends like Weave or MLflow. To limit trace volume while maintaining representative coverage, use ``max_samples_per_step_per_worker``.
25
+
26
+ Example configuration:
27
+
28
+ .. code-block:: yaml
29
+
30
+ actor_rollout_ref:
31
+ rollout:
32
+ trace:
33
+ backend: weave
34
+ token2text: False
35
+ max_samples_per_step_per_worker: 5 # Each worker traces 5 random samples
36
+
37
+ Each agent loop worker independently selects up to N unique samples to trace per training step. For GRPO (``n > 1``), all rollouts for selected samples are traced. Total traces per step = max_samples_per_step_per_worker * num_workers * n.
38
+
39
+ Example: With 4 workers, max_samples_per_step_per_worker=5, and GRPO n=4, you get 4 * 5 * 4 = 80 traces per step instead of tracing all samples. Set to null (default) to trace all samples.
40
+
41
+
42
+ Glossary
43
+ --------
44
+
45
+ +----------------+------------------------------------------------------------------------------------------------------+
46
+ | Object         | Explanation                                                                                          |
47
+ +================+======================================================================================================+
48
+ | trajectory     | A complete multi-turn conversation includes:                                                         |
49
+ |                | 1. LLM output at least once                                                                          |
50
+ |                | 2. Tool Call                                                                                         |
51
+ +----------------+------------------------------------------------------------------------------------------------------+
52
+ | step           | The training step corresponds to the global_steps variable in the trainer                            |
53
+ +----------------+------------------------------------------------------------------------------------------------------+
54
+ | sample_index   | The identifier of the sample, defined in the extra_info.index of the dataset. It is usually a number,|
55
+ |                | but may also be a uuid in some cases.                                                                |
56
+ +----------------+------------------------------------------------------------------------------------------------------+
57
+ | rollout_n      | In the GRPO algorithm, each sample is rolled out n times. rollout_n represents the serial number of  |
58
+ |                | the rollout.                                                                                         |
59
+ +----------------+------------------------------------------------------------------------------------------------------+
60
+ | validate       | Whether the test dataset is used for evaluation.                                                     |
61
+ +----------------+------------------------------------------------------------------------------------------------------+
62
+
63
+ Rollout trace functions
64
+ -----------------------
65
+
66
+ There are 2 functions used for tracing:
67
+
68
+ 1. ``rollout_trace_op``: This is a decorator function used to mark the functions to trace. By default, only a few methods have it; you can add it to more functions to trace more information.
69
+ 2. ``rollout_trace_attr``: This function is used to mark the entry of a trajectory and to attach some info to the trace. If you add a new type of agent, you may need to add it to enable tracing.
70
+
71
+
72
+ Usage of wandb weave
73
+ --------------------
74
+
75
+ 1.1 Basic Configuration
76
+ ~~~~~~~~~~~~~~~~~~~~~~~
77
+
78
+ 1. Set the ``WANDB_API_KEY`` environment variable
79
+ 2. Configuration Parameters
80
+
81
+ 1. ``actor_rollout_ref.rollout.trace.backend=weave``
82
+ 2. ``trainer.logger=['console', 'wandb']``: This item is optional. Trace and logger are independent functions. When using Weave, it is recommended to also enable the wandb logger to implement both functions in one system.
83
+ 3. ``trainer.project_name=$project_name``
84
+ 4. ``trainer.experiment_name=$experiment_name``
85
+ 5. ``actor_rollout_ref.rollout.mode=async``: Since trace is mainly used for agentic RL, you need to enable the agent loop using async mode for either vllm or sglang.
86
+
87
+ Note:
88
+ The Weave Free Plan comes with a default monthly network traffic allowance of 1GB. During the training process, the amount of trace data generated is substantial, reaching dozens of gigabytes per day, so it is necessary to select an appropriate wandb plan.
89
+
90
+
91
+ 1.2 View Trace Logs
92
+ ~~~~~~~~~~~~~~~~~~~
93
+
94
+ After executing the training, on the project page, you can see the WEAVE sidebar. Click Traces to view it.
95
+
96
+ Each Trace project corresponds to a trajectory. You can filter and select the trajectories you need to view by step, sample_index, rollout_n, and experiment_name.
97
+
98
+ After enabling token2text, prompt_text and response_text will be automatically added to the output of ToolAgentLoop.run, making it convenient to view the input and output content.
99
+
100
+ .. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/weave_trace_list.png?raw=true
101
+
102
+ 1.3 Compare Trace Logs
103
+ ~~~~~~~~~~~~~~~~~~~~~~
104
+
105
+ Weave can select multiple trace items and then compare the differences among them.
106
+
107
+ .. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/weave_trace_compare.png?raw=true
108
+
109
+ Usage of mlflow
110
+ ---------------
111
+
112
+ 1. Basic Configuration
113
+ ~~~~~~~~~~~~~~~~~~~~~~
114
+
115
+ 1. Set the ``MLFLOW_TRACKING_URI`` environment variable, which can be:
116
+
117
+ 1. Http and https URLs corresponding to online services
118
+ 2. Local files or directories, such as ``sqlite:////tmp/mlruns.db``, indicate that data is stored in ``/tmp/mlruns.db``. When using local files, it is necessary to initialize the file first (e.g., start the UI: ``mlflow ui --backend-store-uri sqlite:////tmp/mlruns.db``) to avoid conflicts when multiple workers create files simultaneously.
119
+
120
+ 2. Configuration Parameters
121
+
122
+ 1. ``actor_rollout_ref.rollout.trace.backend=mlflow``
123
+ 2. ``trainer.logger=['console', 'mlflow']``. This item is optional. Trace and logger are independent functions. When using mlflow, it is recommended to also enable the mlflow logger to implement both functions in one system.
124
+ 3. ``trainer.project_name=$project_name``
125
+ 4. ``trainer.experiment_name=$experiment_name``
126
+
127
+
128
+ 2. View Log
129
+ ~~~~~~~~~~~
130
+
131
+ Since ``trainer.project_name`` corresponds to Experiments in mlflow, in the mlflow view, you need to select the corresponding project name, then click the "Traces" tab to view traces. Among them, ``trainer.experiment_name`` corresponds to the experiment_name of tags, and tags corresponding to step, sample_index, rollout_n, etc., are used for filtering and viewing.
132
+
133
+ For example, searching for ``"tags.step = '1'"`` can display all trajectories of step 1.
134
+
135
+ .. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/mlflow_trace_list.png?raw=true
136
+
137
+ Opening one of the trajectories allows you to view each function call process within it.
138
+
139
+ After enabling token2text, prompt_text and response_text will be automatically added to the output of ToolAgentLoop.run, making it convenient to view the content.
140
+
141
+ .. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/mlflow_trace_view.png?raw=true
142
+
143
+ Note:
144
+
145
+ 1. mlflow does not support comparing multiple traces
146
+ 2. rollout_trace can not associate the mlflow trace with the run, so the trace content cannot be seen in the mlflow run logs.
code/RL_model/verl/verl_train/docs/advance/rope.rst ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ RoPE Scaling override
2
+ =======================================
3
+
4
+ Last updated: 05/14/2025.
5
+
6
+ Some models such as `Qwen/Qwen2.5-7B-Instruct <https://huggingface.co/Qwen/Qwen2.5-7B-Instruct#processing-long-texts>`_ support RoPE Scaling but don't have it defined in their config.json file.
7
+ For example, this model supports this configuration:
8
+
9
+ .. code:: python
10
+
11
+ {
12
+ ...,
13
+ "rope_scaling": {
14
+ "factor": 4.0,
15
+ "original_max_position_embeddings": 32768,
16
+ "type": "yarn"
17
+ }
18
+ }
19
+
20
+
21
+
22
+ In order to support a longer context for such models, you must override the model configs when starting the trainer.
23
+
24
+ PPO example:
25
+
26
+ .. code:: bash
27
+
28
+ +actor_rollout_ref.model.override_config.rope_scaling.type=yarn \
29
+ +actor_rollout_ref.model.override_config.rope_scaling.factor=4.0 \
30
+ +actor_rollout_ref.model.override_config.rope_scaling.original_max_position_embeddings=32768 \
31
+
32
+
33
+ And for the critic model
34
+
35
+ .. code:: bash
36
+
37
+ +critic.model.override_config.rope_scaling.type=yarn \
38
+ +critic.model.override_config.rope_scaling.factor=4.0 \
39
+ +critic.model.override_config.rope_scaling.original_max_position_embeddings=32768 \
code/RL_model/verl/verl_train/docs/algo/baseline.md ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Algorithm Baselines
2
+
3
+ Last updated: 06/18/2025.
4
+
5
+ ## Math related datasets
6
+
7
+ ### GSM8k
8
+
9
+ Assuming GSM8k/math dataset is preprocessed via:
10
+
11
+ ```bash
12
+ python3 examples/data_preprocess/*.py
13
+ ```
14
+
15
+ Refer to the table below to reproduce RL training from different pre-trained checkpoints. Below is the performance on the GSM8k dataset if not specified otherwise. More comprehensive benchmark results are available in the recipe folder.
16
+
17
+ | Hardware | Model | Method | Test score | Details |
18
+ | ---------- | -------------------------------- | --------------- | ------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
19
+ | NVIDIA GPU | google/gemma-2-2b-it | hf checkpoint | 23.9 | [Huggingface](https://huggingface.co/google/gemma-2-2b-it#benchmark-results) |
20
+ | NVIDIA GPU | google/gemma-2-2b-it | SFT | 52.06 | [command and logs](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/gemma-2-2b-it-sft-0.411.log) |
21
+ | NVIDIA GPU | google/gemma-2-2b-it | SFT + PPO | 64.02 | [command and logs](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/gemma-2-2b-it-ppo-bsz512_4-prompt1024-resp-512-0.640.log), [wandb](https://api.wandb.ai/links/verl-team/h7ux8602) |
22
+ | NVIDIA GPU | Qwen/Qwen2.5-0.5B-Instruct | hf checkpoint | 49.6 | [Qwen blog](https://qwen.ai/blog?id=qwen2.5-llm) |
23
+ | NVIDIA GPU | Qwen/Qwen2.5-0.5B-Instruct | PPO | 56.7 | [command and log](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/Qwen2.5-0.5B-bsz256_2-prompt1024-resp512-0.567.log) |
24
+ | NVIDIA GPU | Qwen/Qwen2.5-0.5B-Instruct | PRIME | 58.7 | [script](https://github.com/verl-project/verl-recipe/blob/main//prime/run_prime_qwen.sh), [wandb](https://api.wandb.ai/links/zefan-wang-thu-tsinghua-university/rxd1btvb) |
25
+ | NVIDIA GPU | Qwen/Qwen2.5-0.5B-Instruct | GRPO-LoRA | 54.3 | [command and logs](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/Qwen2.5-0.5B-bsz64_2-prompt512-resp1024-lorarank32-score0.543.log) |
26
+ | NVIDIA GPU | Qwen/Qwen2.5-1.5B-Instruct | GRPO-LoRA | 77.9 | [command and logs](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/Qwen2.5-1.5B-bsz64_2-prompt512-resp1024-lorarank32-score0.779.log) |
27
+ | NVIDIA GPU | Qwen/Qwen2.5-3B-Instruct | GRPO-LoRA | 86.1 | [command and logs](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/Qwen2.5-3B-bsz64_2-prompt512-resp1024-lorarank32-score0.861.log) |
28
+ | NVIDIA GPU | deepseek-ai/deepseek-llm-7b-chat | PPO (Megatron) | 69.5 [1] | [log](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/deepseek-llm-7b-chat-megatron-bsz256_4-prompt512-resp512-0.695.log), [wandb](https://wandb.ai/verl-team/verl_megatron_gsm8k_examples/runs/10fetyr3) |
29
+ | NVIDIA GPU | Qwen/Qwen2-7B-Instruct | GRPO | 89 | [script](https://github.com/volcengine/verl/blob/a65c9157bc0b85b64cd753de19f94e80a11bd871/examples/grpo_trainer/run_qwen2-7b_seq_balance.sh) |
30
+ | NVIDIA GPU | Qwen/Qwen2-7B-Instruct | GRPO (FSDP2) | 89.8 | [log](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/qwen2-7b-fsdp2.log) |
31
+ | NVIDIA GPU | Qwen/Qwen2-7B-Instruct | GRPO (Megatron) | 89.6 | [log](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/qwen2-7b_math_megatron.log) |
32
+ | NVIDIA GPU | Qwen/Qwen2.5-7B-Instruct | ReMax | 97 | [script](https://github.com/eric-haibin-lin/verl/blob/main/examples/remax_trainer/run_qwen2.5-3b_seq_balance.sh), [wandb](https://wandb.ai/liziniu1997/verl_remax_example_gsm8k/runs/vxl10pln) |
33
+ | NVIDIA GPU | Qwen/Qwen2.5-7B-Instruct | SPPO | 65.6 (MATH) | [SPPO script](https://github.com/volcengine/verl-recipe/tree/main/sppo/README.md) |
34
+ | NVIDIA GPU | Qwen/Qwen2.5-7B-Instruct | GRPO-LoRA | 93.4 | [command and logs](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/Qwen2.5-7B-bsz64_8-prompt512-resp1024-lorarank32-score0.934.log) |
35
+ | NVIDIA GPU | Mixtral-8x22B-Instruct-v0.1 | Instruct model | 83.7 | [Qwen Blog](https://qwen.ai/blog?id=qwen2.5-llm) |
36
+ | NVIDIA GPU | Mixtral-8x22B-Instruct-v0.1 | RLOO (Megatron) | 92.3 | [wandb](https://api.wandb.ai/links/ppo_dev/sbuiuf2d) |
37
+ | NVIDIA GPU | Qwen/Qwen2.5-7B-Instruct | SPIN | 92 | [script](https://github.com/volcengine/verl-recipe/tree/main/spin/README.md) |
38
+ | NVIDIA GPU | Qwen/Qwen2-7B-Instruct | GPG | 88 | [log](https://github.com/diqiuzhuanzhuan/verldata/blob/main/run_logs/qwen2-7b_math.log), [wandb](https://wandb.ai/diqiuzhuanzhuan/verl_gpg_example_gsm8k_math/runs/ab86c4va) |
39
+ | NVIDIA GPU | Qwen/Qwen2-7B-Instruct | GPG (Megatron) | 88 | [log](https://github.com/diqiuzhuanzhuan/verldata/blob/main/run_logs/qwen2-7b_math_megatron.log), [wandb](https://wandb.ai/diqiuzhuanzhuan/verl_gpg_example_gsm8k_math/runs/yy8bheu8) |
40
+ | NVIDIA GPU | Qwen/Qwen2.5-VL-7B-Instruct | GRPO (Megatron) | 65.4 (GEO3k) | [script](https://github.com/volcengine/verl/blob/main/examples/grpo_trainer/run_qwen2_5_vl-7b-megatron.sh), [wandb](https://api.wandb.ai/links/megatron-core-moe-dev/1yngvkek) |
41
+ | AMD MI300 | deepseek-ai/deepseek-llm-7b-chat | PPO | 70.5 [1] | [log](https://github.com/yushengsu-thu/verl_training_log/blob/main/gsm8k/ppo_run_deepseek7b_llm.log) |
42
+ | AMD MI300 | deepseek-ai/deepseek-llm-7b-chat | GRPO | 71.4 [1] | [log](https://github.com/yushengsu-thu/verl_training_log/blob/main/gsm8k/grpo_run_deepseek7b_llm.log) |
43
+ | NVIDIA GPU | Qwen/Qwen2.5-14B-Instruct | GRPO-LoRA | 94.6 | [command and logs](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/Qwen2.5-14B-bsz64_8-prompt512-resp1024-lorarank32-score0.946.log) |
44
+ | NVIDIA GPU | Qwen/Qwen2.5-32B-Instruct | GRPO-LoRA | 95.8 | [command and logs](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/Qwen2.5-32B-bsz64_8-prompt512-resp1024-lorarank32-score0.958.log) |
45
+ | NVIDIA GPU | Qwen/Qwen2.5-72B-Instruct | GRPO-LoRA | 96.0 | [command and logs](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/Qwen2.5-72B-bs64_8-prompt512-resp1024-lorarank32-score0.960.log) |
46
+
47
+ ### DAPO math-17k
48
+
49
+ - Training DAPO math-17k dataset: https://huggingface.co/datasets/BytedTsinghua-SIA/DAPO-Math-17k
50
+ - Testing: AIME'24: https://huggingface.co/datasets/BytedTsinghua-SIA/AIME-2024
51
+
52
+ Note:
53
+
54
+ - For Qwen/Qwen2.5-Math-7B, we directly modify the max_position_embeddings to 32768, which enables training with longer response lengths; we did not observe any performance degradation from this change.
55
+
56
+ | Hardware | Model | Method | Test score | Details |
57
+ | ---------- | -------------------------- | ----------------------- | ---------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
58
+ | NVIDIA GPU | Qwen/Qwen2.5-Math-7B (32k) | DAPO | 36.3 | [command](https://github.com/verl-project/verl-recipe/blob/main//dapo/test_dapo_7b_math.sh), [logs](https://wandb.ai/verl-org/DAPO%20Reproduction%20on%20verl/runs/ow47vvon?nw=nwusertongyuxuan361) |
59
+ | NVIDIA GPU | Qwen/Qwen2.5-7B-Instruct | DAPO + Code Interpreter | 40.0 | [command](https://github.com/verl-project/verl-recipe/blob/main//retool/run_qwen2_7b_dapo.sh) |
60
+
61
+ ## Coding related datasets
62
+
63
+ Below are the results on LeetCode unless specified otherwise.
64
+
65
+ | Hardware | Model | Method | Test score | Details |
66
+ | ---------- | ----------------------- | ------ | ---------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
67
+ | NVIDIA GPU | PRIME-RL/Eurus-2-7B-SFT | PRIME | 36.1 | [script](https://github.com/verl-project/verl-recipe/blob/main//prime/run_prime_qwen_code.sh), [swanlab](https://swanlab.cn/@wangzefan/prime_example/runs/7f541qhspgmy8nmhdlx35/chart) |
68
+
69
+ ### Notes
70
+
71
+ [1] During evaluation, we have only extracted answers following the format `"####"`. A more flexible answer extraction, longer response length, and better prompt engineering may lead to a higher score.
72
+
73
+ [2] The default value of `actor_rollout_ref.actor.entropy_coeff` is set to `0.0` since verl 0.3.x on 2025-05-30, which is different from previous versions.
code/RL_model/verl/verl_train/docs/algo/collabllm.md ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Recipe: CollabLLM
2
+
3
+ Last updated: 09/22/2025.
4
+
5
+ > Open-Source Algorithm Implementation & Experiment Running: [Haiquan Chen](https://github.com/chenhaiq), [Shirley Wu](https://github.com/Wuyxin)
6
+
7
+ 🏠 [Homepage](https://aka.ms/CollabLLM) | 📝 [Paper](https://arxiv.org/pdf/2502.00640) | 🤗 [Datasets & Models](https://huggingface.co/collabllm) | ⭐️ [Original Implementation](https://github.com/Wuyxin/collabllm)
8
+
9
+ `verl` provides a recipe for the Outstanding Paper at ICML 2025, **"CollabLLM: From Passive Responders to Active Collaborators"**. [CollabLLM](https://aka.ms/CollabLLM) is a unified fine-tuning framework that optimizes LLMs for effective and efficient multiturn collaboration with users.
10
+
11
+ **Core Idea:** Models are rewarded based on how well their responses enable effective *future* collaboration with users.
12
+
13
+ Paper Authors: [Shirley Wu](https://cs.stanford.edu/~shirwu/), [Michel Galley](https://www.microsoft.com/en-us/research/people/mgalley/), Baolin Peng, Hao Cheng, Gavin Li, Yao Dou, Weixin Cai, [James Zou](https://www.james-zou.com/), [Jure Leskovec](https://cs.stanford.edu/people/jure/), [Jianfeng Gao](https://www.microsoft.com/en-us/research/people/jfgao/)
14
+
15
+
16
+ ---
17
+ ## Quick Start
18
+
19
+ ### 0. Environment
20
+ Make sure the required packages for `verl` are installed. Additionally, install `litellm` and export the required API keys. The API model will be used for user simulators and, optionally, LLM Judges (see the Configuration section below).
21
+
22
+ ### 1. Prepare Your Dataset
23
+
24
+ First, process your dataset using the provided script (see example commands and usage in `process_dataset.py`):
25
+
26
+ ```bash
27
+ python process_dataset.py --dataset <> ... --dataset_type <sft or rl>
28
+ ```
29
+
30
+
31
+ **Requirements:**
32
+ - Input: A Hugging Face multiturn dataset. Existing datasets: `collabllm/collabllm-multiturn-$DATASET`, with `DATASET` in one of [`math-hard(-large)`, `medium(-large)`, `bigcodebench(-large)`] (*-large are the datasets used in the CollabLLM paper)
33
+ - Example format: See [collabllm-multiturn-math-hard](https://huggingface.co/datasets/collabllm/collabllm-multiturn-math-hard)
34
+ - To generate your own dataset: Use [build_dataset.py](https://github.com/Wuyxin/collabllm/blob/main/scripts/engine/build_dataset.py) from the original CollabLLM repository
35
+
36
+
37
+ ### 2. Train Your Model
38
+
39
+ **(Optional) For Supervised Fine-Tuning (SFT):**
40
+ ```bash
41
+ bash train_sft_collabllm.sh
42
+ ```
43
+
44
+ **For Reinforcement Learning (RL):**
45
+
46
+ ```bash
47
+ bash train_rl_collabllm.sh
48
+ ```
49
+
50
+ The RL script shows an example to train CollabLLM on `math-hard-large`.
51
+
52
+ - The config to sample future conversations are in `recipe/collabllm/config/collabllm_interaction_config.yaml`.
53
+ - The Multiturn-aware Reward is aggregated from these three conversational-level rewards:
54
+
55
+ ```
56
+ +reward_model.reward_kwargs.metric_weights.accuracy=1 \
57
+ +reward_model.reward_kwargs.metric_weights.interactivity=1 \
58
+ +reward_model.reward_kwargs.metric_weights.token_amount=-0.0001 \
59
+ ```
60
+
61
+ You can remove, add, or modify the weights depending on your task. A list of implemented metrics you can already add are under `recipe/collabllm/metrics`. For example, on `medium-large`, you can replace `accuracy` with `bleu_score` via
62
+ ```
63
+ +reward_model.reward_kwargs.metric_weights.bleu_score=1
64
+ ```
65
+ which will instead apply bleu score on the sampled future conversations.
66
+
67
+ ## Algorithm
68
+
69
+ | Step | Name | Description |
70
+ |------|-------------------------------|-----------------------------------------------------------------------------|
71
+ | 1 | Model response generation | The model generates multiple responses for each prompt in a batch. |
72
+ | 2 | Collaborative simulation | A user simulator (e.g., GPT or Claude) samples `num_repeat_rollouts` conversations for up to `max_user_turns` additional turns. |
73
+ | 3 | Compute Multiturn-aware Reward | Customized conversational reward functions are applied to the sampled conversations. Rewards are aggregated, then averaged across rollouts. |
74
+ | 4 | Update model | The model weights are updated using the computed multiturn-aware rewards. |
75
+
76
+ ---
77
+
78
+ ## Configuration
79
+
80
+ The primary configuration is managed through the launch script `train_rl_collabllm.sh` and the YAML file `recipe/collabllm/config/collabllm_interaction_config.yaml`. Key configuration sections:
81
+
82
+ | Section | Key Parameters / Notes |
83
+ |----------------------|-----------------------------------------------------------------------------------------|
84
+ | `data` | Paths to training/validation files, batch sizes, sequence lengths. |
85
+ | `actor_rollout_ref` (common) | Base model path (used for actor + initial reference), FSDP settings, optimization (LR, scheduler). |
86
+ | `actor_rollout_ref` (CollabLLM-specific) | Hyperparameters under `actor_rollout_ref.rollout.multi_turn`: `max_user_turns`, `max_assistant_turns`, `num_repeat_rollouts`. |
87
+ | `interaction` | Defined in `collabllm_interaction_config.yaml`. Specifies user simulator and hyperparameters. Requires exported API keys. |
88
+ | `reward_model` | Manager set to `collabllm` by default. Modify `reward_model.reward_kwargs.metric_weights` for conversational rewards and weights. LLM Judge hyperparameters (e.g., `model`, `temperature`) go under `reward_model.reward_kwargs.llm_judge_kwargs`. |
89
+ | `algorithm` | GRPO-specific hyperparameters such as `actor_rollout_ref.rollout.n`. |
90
+ | `trainer` | Distributed training (nodes, GPUs per node), logging (WandB), checkpointing frequency. |
91
+
92
+ ---
93
+
94
+ ## Key Files
95
+
96
+ | File Path | Purpose |
97
+ |-----------|---------|
98
+ | `recipe/collabllm/collabllm_agent_loop.py` | Main logic to sample future conversations, using `CollabLLMInteraction` from `verl/interactions/collabllm_interaction.py`. |
99
+ | `verl/workers/reward_manager/collabllm.py` | Computes rewards for future conversations, leveraging `recipe/collabllm/reward_function.py` to apply each metric. |
100
+
101
+ ---
102
+
103
+ ## Acknowledgement
104
+
105
+ We sincerely thank the `verl` community and advisors for their contributions and guidance!
code/RL_model/verl/verl_train/docs/algo/dapo.md ADDED
@@ -0,0 +1,187 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Recipe: Decoupled Clip and Dynamic Sampling Policy Optimization (DAPO)
2
+
3
+ Last updated: 06/19/2025.
4
+
5
+ > Open-Source Algorithm Implementation & Experiment Running: [Yuxuan Tong](https://tongyx361.github.io/), [Guangming Sheng](https://hk.linkedin.com/in/guangming-sheng-b50640211)
6
+
7
+ 🏠 [Homepage](https://dapo-sia.github.io/) | 📝 [Paper@arXiv](https://arxiv.org/abs/2503.14476) | 🤗 [Datasets&Models@HF](https://huggingface.co/collections/BytedTsinghua-SIA/dapo-67d7f1517ee33c8aed059da0) | 🐱 [Code@GitHub](https://github.com/verl-project/verl-recipe/tree/main/dapo/recipe/dapo) | 🐱 [Repo@GitHub](https://github.com/BytedTsinghua-SIA/DAPO)
8
+
9
+ > We propose the **D**ecoupled Clip and Dynamic s**A**mpling **P**olicy **O**ptimization (DAPO) algorithm. By making our work publicly available, we provide the broader research community and society with practical access to scalable reinforcement learning, enabling all to benefit from these advancements. Our system is based on the awesome [verl](https://github.com/volcengine/verl) framework. Thanks for their great work! Applying DAPO training to Qwen2.5-32B base model proves to outperform the previous state-of-the-art DeepSeek-R1-Zero-Qwen-32B on AIME 2024, achieving **50%** accuracy with **50%** less training steps.
10
+ >
11
+ > ![dapo-main-result](https://dapo-sia.github.io/static/images/score.png)
12
+
13
+ ## Quickstart
14
+
15
+ 1. Prepare the datasets **on the Ray cluster**:
16
+
17
+ ```bash
18
+ bash prepare_dapo_data.sh # This downloads the datasets to ${HOME}/verl/data by default
19
+ ```
20
+
21
+ 2. Submit the job to the Ray cluster **from any machine**:
22
+
23
+ ```bash
24
+ cd verl # Repo root
25
+ export RAY_ADDRESS="http://${RAY_IP:-localhost}:8265" # The Ray cluster address to connect to
26
+ export WORKING_DIR="${PWD}" # The local directory to package to the Ray cluster
27
+ # Set the runtime environment like env vars and pip packages for the Ray cluster in yaml
28
+ export RUNTIME_ENV="./recipe/dapo/runtime_env.yaml" # This sets environment variables for the Ray cluster
29
+ bash recipe/dapo/run_dapo_qwen2.5_32b.sh # or other scripts
30
+ ```
31
+
32
+ ## Reproduction Runs
33
+
34
+ | Setup | AIME 2024 Acc. | Hardware | Image | Commit | Environment Variables | Training Script | Training Record |
35
+ | -------------------------------------------- | -------------- | --------- | -------------------------------------------------------------------- | -------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------- |
36
+ | DAPO | 52% | 16x8xH800 | `hiyouga/verl:ngc-th2.6.0-cu126-vllm0.8.3-flashinfer0.2.2-cxx11abi0` | [`4f80e4`](https://github.com/volcengine/verl/tree/4f80e465c2ec79ab9c3c30ec74b9745de61d0490) | [runtime_env.yaml](https://github.com/volcengine/verl/blob/4f80e465c2ec79ab9c3c30ec74b9745de61d0490/recipe/dapo/runtime_env.yaml) | [run_dapo_qwen2.5_32b.sh](https://github.com/volcengine/verl/blob/4f80e465c2ec79ab9c3c30ec74b9745de61d0490/recipe/dapo/run_dapo_qwen2.5_32b.sh) | [W&B](https://wandb.ai/verl-org/DAPO%20Reproduction%20on%20verl/workspace?nw=wmb4qxfht0n) |
37
+ | DAPO w/o Dynamic Sampling | 50% | 16x8xH800 | `hiyouga/verl:ngc-th2.6.0-cu126-vllm0.8.3-flashinfer0.2.2-cxx11abi0` | [`4f80e4`](https://github.com/volcengine/verl/tree/4f80e465c2ec79ab9c3c30ec74b9745de61d0490) | [runtime_env.yaml](https://github.com/volcengine/verl/blob/4f80e465c2ec79ab9c3c30ec74b9745de61d0490/recipe/dapo/runtime_env.yaml) | [run_dapo_wo_ds_qwen2.5_32b.sh](https://github.com/volcengine/verl/blob/4f80e465c2ec79ab9c3c30ec74b9745de61d0490/recipe/dapo/run_dapo_wo_ds_qwen2.5_32b.sh) | [W&B](https://wandb.ai/verl-org/DAPO%20Reproduction%20on%20verl/workspace?nw=wmb4qxfht0n) |
38
+ | DAPO w/o Token-level Loss & Dynamic Sampling | 44% | 16x8xH20 | `hiyouga/verl:ngc-th2.5.1-cu120-vllm0.7.4-hotfix` | [`4f80e4`](https://github.com/volcengine/verl/tree/4f80e465c2ec79ab9c3c30ec74b9745de61d0490) | [runtime_env.yaml](https://github.com/volcengine/verl/blob/4f80e465c2ec79ab9c3c30ec74b9745de61d0490/recipe/dapo/runtime_env.yaml) | [run_dapo_early_qwen2.5_32b.sh](https://github.com/volcengine/verl/blob/4f80e465c2ec79ab9c3c30ec74b9745de61d0490/recipe/dapo/run_dapo_early_qwen2.5_32b.sh) | [W&B](https://wandb.ai/verl-org/DAPO%20Reproduction%20on%20verl/workspace?nw=wmb4qxfht0n) |
39
+
40
+ > [!IMPORTANT]
41
+ >
42
+ > **📢 Call for Contribution!**
43
+ >
44
+ > Welcome to submit your reproduction runs and setups!
45
+
46
+ ## Configuration
47
+
48
+ ### Separated Clip Epsilons (-> Clip-Higher)
49
+
50
+ An example configuration:
51
+
52
+ ```yaml
53
+ actor_rollout_ref:
54
+ actor:
55
+ clip_ratio_low: 0.2
56
+ clip_ratio_high: 0.28
57
+ ```
58
+
59
+ `clip_ratio_low` and `clip_ratio_high` specify the $\varepsilon_{\text {low }}$ and $\varepsilon_{\text {high }}$ in the DAPO objective.
60
+
61
+ Core relevant code:
62
+
63
+ ```python
64
+ pg_losses1 = -advantages * ratio
65
+ pg_losses2 = -advantages * torch.clamp(ratio, 1 - cliprange_low, 1 + cliprange_high)
66
+ pg_losses = torch.maximum(pg_losses1, pg_losses2)
67
+ ```
68
+
69
+ ### Dynamic Sampling (with Group Filtering)
70
+
71
+ An example configuration:
72
+
73
+ ```yaml
74
+ data:
75
+ gen_batch_size: 1536
76
+ train_batch_size: 512
77
+ algorithm:
78
+ filter_groups:
79
+ enable: True
80
+ metric: acc # score / seq_reward / seq_final_reward / ...
81
+ max_num_gen_batches: 10 # Non-positive values mean no upper limit
82
+ ```
83
+
84
+ Setting `filter_groups.enable` to `True` will filter out groups whose outputs' `metric` are all the same, e.g., for `acc`, groups whose outputs' accuracies are all 1 or 0.
85
+
86
+ The trainer will repeat sampling with `gen_batch_size` until there are enough qualified groups for `train_batch_size` or reaching the upper limit specified by `max_num_gen_batches`.
87
+
88
+ Core relevant code:
89
+
90
+ ```python
91
+ prompt_bsz = self.config.data.train_batch_size
92
+ if num_prompt_in_batch < prompt_bsz:
93
+ print(f'{num_prompt_in_batch=} < {prompt_bsz=}')
94
+ num_gen_batches += 1
95
+ max_num_gen_batches = self.config.algorithm.filter_groups.max_num_gen_batches
96
+ if max_num_gen_batches <= 0 or num_gen_batches < max_num_gen_batches:
97
+ print(f'{num_gen_batches=} < {max_num_gen_batches=}. Keep generating...')
98
+ continue
99
+ else:
100
+ raise ValueError(
101
+ f'{num_gen_batches=} >= {max_num_gen_batches=}. Generated too many. Please check your data.'
102
+ )
103
+ else:
104
+ # Align the batch
105
+ traj_bsz = self.config.data.train_batch_size * self.config.actor_rollout_ref.rollout.n
106
+ batch = batch[:traj_bsz]
107
+ ```
108
+
109
+ ### Flexible Loss Aggregation Mode (-> Token-level Loss)
110
+
111
+ An example configuration:
112
+
113
+ ```yaml
114
+ actor_rollout_ref:
115
+ actor:
116
+ loss_agg_mode: "token-mean" # / "seq-mean-token-sum" / "seq-mean-token-mean"
117
+ # NOTE: "token-mean" is the default behavior
118
+ ```
119
+
120
+ Setting `loss_agg_mode` to `token-mean` will mean the (policy gradient) loss across all the tokens in all the sequences in a mini-batch.
121
+
122
+ Core relevant code:
123
+
124
+ ```python
125
+ if loss_agg_mode == "token-mean":
126
+ loss = verl_F.masked_mean(loss_mat, loss_mask)
127
+ elif loss_agg_mode == "seq-mean-token-sum":
128
+ seq_losses = torch.sum(loss_mat * loss_mask, dim=-1) # token-sum
129
+ loss = torch.mean(seq_losses) # seq-mean
130
+ elif loss_agg_mode == "seq-mean-token-mean":
131
+ seq_losses = torch.sum(loss_mat * loss_mask, dim=-1) / torch.sum(loss_mask, dim=-1) # token-mean
132
+ loss = torch.mean(seq_losses) # seq-mean
133
+ else:
134
+ raise ValueError(f"Invalid loss_agg_mode: {loss_agg_mode}")
135
+ ```
136
+
137
+ ### Overlong Reward Shaping
138
+
139
+ An example configuration:
140
+
141
+ ```yaml
142
+ data:
143
+ max_response_length: 20480 # 16384 + 4096
144
+ reward_model:
145
+ overlong_buffer:
146
+ enable: True
147
+ len: 4096
148
+ penalty_factor: 1.0
149
+ ```
150
+
151
+ Setting `overlong_buffer.enable` to `True` will penalize the outputs whose lengths are overlong but still within the hard context limit.
152
+
153
+ Specifically, the penalty increases linearly from `0` to `overlong_buffer.penalty_factor` when the length of the output exceeds the `max_response_length - overlong_buffer.len` by `0` to `overlong_buffer.len` tokens.
154
+
155
+ Core relevant code:
156
+
157
+ ```python
158
+ if self.overlong_buffer_cfg.enable:
159
+ overlong_buffer_len = self.overlong_buffer_cfg.len
160
+ expected_len = self.max_resp_len - overlong_buffer_len
161
+ exceed_len = valid_response_length - expected_len
162
+ overlong_penalty_factor = self.overlong_buffer_cfg.penalty_factor
163
+ overlong_reward = min(-exceed_len / overlong_buffer_len * overlong_penalty_factor, 0)
164
+ reward += overlong_reward
165
+ ```
166
+
167
+ ## FAQ
168
+
169
+ ### Where is the "Overlong Filtering" in the paper?
170
+
171
+ Most experiments in the paper, including the best-performing one, are run without Overlong Filtering because it overlaps somewhat with Overlong Reward Shaping in terms of properly learning from the longest outputs. So we don't implement it here.
172
+
173
+ ### What's the difference between [the `recipe/dapo` directory in the `main` branch](https://github.com/volcengine/verl-recipe/tree/main/dapo) and the [`recipe/dapo` branch](https://github.com/verl-project/verl-recipe/tree/main/dapo/recipe/dapo)?
174
+
175
+ [The `recipe/dapo` branch](https://github.com/verl-project/verl-recipe/tree/main/dapo/recipe/dapo) is for **as-is reproduction** and thus won't be updated with new features.
176
+
177
+ [The `recipe/dapo` directory in the `main` branch](https://github.com/volcengine/verl-recipe/tree/main/dapo) works as an example of how to extend the latest `verl` to implement an algorithm recipe, which will be maintained with new features.
178
+
179
+ ### Why can't I produce similar results after modifications?
180
+
181
+ Today's RL infrastructure still has inherent robustness issues, which we are working hard to improve.
182
+
183
+ We strongly recommend modifying only one thing at a time.
184
+
185
+ We also list some known problems here:
186
+
187
+ 1. Enabling CUDA graph (`enforce_eager=False`) might cause model performance degradation, whose cause is still under investigation.
code/RL_model/verl/verl_train/docs/algo/entropy.md ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Recipe: Entropy Mechanism
2
+
3
+ Last updated: 06/27/2025.
4
+
5
+
6
+ <div align="center">
7
+
8
+ The Entropy Mechanism of Reinforcement Learning for Large Language Model Reasoning.
9
+
10
+ [![Paper](https://img.shields.io/badge/paper-A42C25?style=for-the-badge&logo=arxiv&logoColor=white)](https://arxiv.org/pdf/2505.22617) [![Github](https://img.shields.io/badge/PRIME-000000?style=for-the-badge&logo=github&logoColor=000&logoColor=white)](https://github.com/PRIME-RL/Entropy-Mechanism-of-RL) [![alphaXiv](https://img.shields.io/badge/discussion-A42C25?style=for-the-badge&logo=arxiv&logoColor=white&color=blue
11
+ )](https://www.alphaxiv.org/abs/2505.22617) [![Twitter](https://img.shields.io/badge/Twitter-%23000000.svg?style=for-the-badge&logo=twitter&logoColor=white)](https://x.com/stingning/status/1928088554166505667) [![Twitter](https://img.shields.io/badge/Twitter-%23000000.svg?style=for-the-badge&logo=twitter&logoColor=white)](https://x.com/charlesfornlp/status/1928089451080585283) [![Twitter-ak](https://img.shields.io/badge/Twitter-%23000000.svg?style=for-the-badge&logo=twitter&logoColor=white)](https://x.com/_akhaliq/status/1928077929105268861)
12
+
13
+
14
+ <div align="center" style="font-family: Arial, sans-serif;">
15
+ <p>
16
+ <a href="#🎉news" style="text-decoration: none; font-weight: bold;">🎉 News</a> •
17
+ <a href="#✨getting-started" style="text-decoration: none; font-weight: bold;">✨ Getting Started</a> •
18
+ <a href="#📖introduction" style="text-decoration: none; font-weight: bold;">📖 Introduction</a>
19
+ </p>
20
+ <p>
21
+ <a href="#🎈citation" style="text-decoration: none; font-weight: bold;">🎈 Citation</a> •
22
+ <a href="#🌻acknowledgement" style="text-decoration: none; font-weight: bold;">🌻 Acknowledgement</a> •
23
+ <a href="#📬Contact" style="text-decoration: none; font-weight: bold;">📬 Contact</a> •
24
+ <a href="#📈star-history" style="text-decoration: none; font-weight: bold;">📈 Star History</a>
25
+ </p>
26
+ </div>
27
+
28
+ </div>
29
+
30
+
31
+ ## 🎉News
32
+
33
+ - **[2025/05/29]** 🎉 Ranked **#1** of the day on [Huggingface Daily Papers](https://huggingface.co/papers?date=2025-05-29).
34
+ - **[2025/05/29]** Released our Paper on arXiv. See [here](https://arxiv.org/pdf/2505.22617). We provide insights into the entropy mechanism of RL for LLMs and propose two simple yet effective strategies to alleviate the entropy collapse.
35
+
36
+
37
+
38
+ ## ✨Getting started
39
+
40
+ After preparing the training data, for training Qwen2.5-7B on a single node, taking the KL-Cov approach as an example, you can simply run:
41
+
42
+ ```
43
+ cd verl
44
+ conda activate your_env
45
+ bash recipe/dapo/7b_kl_cov.sh
46
+ ```
47
+
48
+ While for training Qwen2.5-32B on multi nodes, you can run the following commands:
49
+
50
+ ```
51
+ cd verl
52
+ conda activate your_env
53
+ bash recipe/dapo/32b_kl_cov.sh
54
+ ```
55
+
56
+ ## 📖Introduction
57
+
58
+ <div align="left">
59
+ <img src="https://github.com/PRIME-RL/Entropy-Mechanism-of-RL/blob/main/figures/e2a.jpg?raw=true" alt="issue" style="width: 96%; height: auto;">
60
+ </div>
61
+
62
+ This paper addresses the entropy collapse issue in scaling reinforcement learning (RL) for large language models (LLMs), where policy entropy drops sharply during training, leading to overconfidence and performance saturation. We empirically establish a relationship between entropy ($H$) and performance ($R$): $R=−aexp(H)+b$, showing performance is bottlenecked by entropy exhaustion.
63
+
64
+ <div align="left">
65
+ <img src="https://github.com/PRIME-RL/Entropy-Mechanism-of-RL/blob/main/figures/cov.jpg?raw=true" alt="issue" style="width: 96%; height: auto;">
66
+ </div>
67
+
68
+ Theoretically, we find entropy changes are driven by the covariance between action probability and logit updates, which correlates with advantage in Policy Gradient methods. High-probability, high-advantage actions reduce entropy, while rare, high-advantage actions increase it. Empirically, the covariance term remains positive, explaining entropy’s monotonic decline. To mitigate this, we propose ​​Clip-Cov​​ and ​​KL-Cov​​, which restrict updates for high-covariance tokens. These methods effectively prevent entropy collapse, and improve performance.
69
+
70
+ ## 📃Evaluation
71
+
72
+ <div align="left">
73
+ <img src="https://github.com/PRIME-RL/Entropy-Mechanism-of-RL/blob/main/figures/performance_fig.jpg?raw=true" alt="issue" style="width: 96%; height: auto;">
74
+ </div>
75
+
76
+
77
+ Our method is able to maintain a considerably higher level of entropy throughout training. For example, when the baseline's entropy reaches a plateau and can no longer be consumed, the KL-Cov method still sustains an entropy level over 10 times higher. Meanwhile, the response length of the policy model steadily increases, and its performance on the test set consistently surpasses that of the baseline. This indicates that our model is able to explore more freely during training, learning a better policy through RL.
78
+ | **Method** | **AIME24** | **AIME25** | **AMC** | **MATH-500** | **OMNI-MATH** | **OlympiadBench** | **Minerva** | **Avg.** |
79
+ | ----------------- | ---------: | ---------: | -------: | -----------: | ------------: | ----------------: | ----------: | -------: |
80
+ | *Qwen2.5-7B* | | | | | | | | |
81
+ | GRPO | 21.2 | 9.6 | 58.7 | 78.8 | 27.9 | 40.7 | 36.7 | 38.6 |
82
+ | w. Clip-higher | 18.1 | 11.5 | 56.6 | 79.2 | 29.8 | 43.3 | 40.4 | 38.8 |
83
+ | w. **`CLIP-Cov`** | 22.1 | **15.8** | 58.2 | 80.4 | **30.5** | **44.1** | **41.1** | 40.4 |
84
+ | w. **`KL-Cov`** | **22.6** | 12.9 | **61.4** | **80.8** | 29.1 | 42.6 | 38.2 | **40.6** |
85
+ | *Qwen2.5-32B* | | | | | | | | |
86
+ | GRPO | 21.8 | 16.2 | 69.7 | 84.2 | 35.2 | 43.6 | 45.5 | 45.8 |
87
+ | w. Clip-higher | 35.6 | 22.3 | 69.5 | 77.2 | 35.1 | 42.5 | 43.0 | 47.2 |
88
+ | w. **`CLIP-Cov`** | 32.3 | 22.7 | 67.2 | **87.0** | **42.0** | **57.2** | 46.0 | 50.3 |
89
+ | w. **`KL-Cov`** | **36.8** | **30.8** | **74.5** | 84.6 | 39.1 | 49.0 | **46.3** | **52.2** |
90
+
91
+ Our two approaches both achieve non-trivial improvements across all benchmarks. Compared to GRPO, our method outperforms it by 2.0% on average for the 7B model and by 6.4% for the 32B model. Moreover, we observe that our method yields more substantial gains on the larger Qwen2.5-32B. Specifically, our method achieves improvements of 15.0% and 14.6% compared to GRPO on the most challenging benchmarks, AIME24 and AIME25, respectively.
92
+
93
+
94
+ ## 🎈Citation
95
+ If you find this paper or repo helpful, please cite us.
96
+
97
+ ```bibtex
98
+ @article{cui2025entropy,
99
+ title={The Entropy Mechanism of Reinforcement Learning for Reasoning Language Models},
100
+ author={Cui, Ganqu and Zhang, Yuchen and Chen, Jiacheng and Yuan, Lifan and Wang, Zhi and Zuo, Yuxin and Li, Haozhan and Fan, Yuchen and Chen, Huayu and Chen, Weize and others},
101
+ journal={arXiv preprint arXiv:2505.22617},
102
+ year={2025}
103
+ }
104
+ ```
105
+ ## 🌻Acknowledgement
106
+ We implement our reinforcement learning algorithm extending from [verl](https://github.com/volcengine/verl). We utilize [vLLM](https://github.com/vllm-project/vllm) for inference. Our models are trained primarily on [Qwen2.5 family](https://github.com/QwenLM/Qwen2.5). Our training data is built from [DAPO-MATH](https://huggingface.co/datasets/BytedTsinghua-SIA/DAPO-Math-17k). Thanks for their great contributions!
107
+
108
+ ## 📬 Contact
109
+
110
+ For questions, discussion, or collaboration opportunities, feel free to contact:
111
+ - Ganqu Cui: cuiganqu@pjlab.org.cn
112
+ - Yuchen Zhang: yuchen.zhang2003@gmail.com
113
+ - Jiacheng Chen: jackchan9345@gmail.com
114
+ - Ning Ding: ningding.cs@gmail.com
115
+
code/RL_model/verl/verl_train/docs/algo/gpg.md ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # GPG: Group Policy Gradient
2
+
3
+ Last updated: 07/03/2025.
4
+
5
+ Group Policy Gradient (GPG) is a minimalist reinforcement learning (RL) method that enhances the reasoning ability of large language models without relying on supervised fine-tuning or complex tricks. GPG revisits traditional policy gradients and directly optimizes the RL objective—no surrogate losses, no KL penalties, no critic, and no reference model. Compared to GRPO, GPG is simpler, more efficient, and achieves better results on many tasks. For more details, please refer to the original paper [GPG: A Simple and Strong Reinforcement Learning Baseline for Model Reasoning
6
+ ](https://arxiv.org/abs/2504.02546).
7
+
8
+ ## Key Components
9
+ - Use a corrected advantage function to improve policy gradient accuracy and training efficiency.
10
+ - By eliminating the critic and reference models and avoiding KL divergence constraints, GPG significantly simplifies the training process compared to Group Relative Policy Optimization (GRPO)
11
+
12
+ ## Configuration
13
+ To configure GPG within the framework, use the following YAML settings.
14
+
15
+ ```yaml
16
+ algorithm:
17
+ adv_estimator: gpg
18
+ actor_rollout_ref:
19
+ actor:
20
+ policy_loss:
21
+ loss_mode: "gpg"
22
+ ```
23
+
24
+ ## Advanced Extensions
25
+ GPG is a simple and strong baseline for model reasoning. Although it avoids using KL loss in its original form, you can still use KL loss to further improve the performance.
26
+
27
+ ```yaml
28
+ algorithm:
29
+ adv_estimator: gpg
30
+ actor_rollout_ref:
31
+ actor:
32
+ use_kl_loss: True # enable kl regularization
33
+ kl_loss_coef: 0.01
34
+ policy_loss:
35
+ loss_mode: "gpg"
36
+ ```
code/RL_model/verl/verl_train/docs/algo/grpo.md ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Group Relative Policy Optimization (GRPO)
2
+
3
+ Last updated: 05/31/2025.
4
+
5
+ In reinforcement learning, classic algorithms like PPO rely on a "critic" model to estimate the value of actions, guiding the learning process. However, training this critic model can be resource-intensive.
6
+
7
+ GRPO simplifies this process by eliminating the need for a separate critic model. Instead, it operates as follows:
8
+ - Group Sampling: For a given problem, the model generates multiple possible solutions, forming a "group" of outputs.
9
+ - Reward Assignment: Each solution is evaluated and assigned a reward based on its correctness or quality.
10
+ - Baseline Calculation: The average reward of the group serves as a baseline.
11
+ - Policy Update: The model updates its parameters by comparing each solution's reward to the group baseline, reinforcing better-than-average solutions and discouraging worse-than-average ones.
12
+
13
+ This approach reduces computational overhead by avoiding the training of a separate value estimation model, making the learning process more efficient. For more details, refer to the original paper [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://arxiv.org/pdf/2402.03300)
14
+
15
+ ## Key Components
16
+
17
+ - No Value Function (Critic-less): unlike PPO, GRPO does not train a separate value network (critic)
18
+ - Group Sampling (Grouped Rollouts): instead of evaluating one rollout per input, GRPO generates multiple completions (responses) from the current policy for each prompt. This set of completions is referred to as a group.
19
+ - Relative Rewards: within each group, completions are scored (e.g., based on correctness), and rewards are normalized relative to the group.
20
+
21
+ ## Configuration
22
+
23
+ Note that all configs containing `micro_batch_size` are used to configure the maximum sample or token count per forward or backward pass to avoid GPU OOMs, whose value should not change algorithmic/convergence behavior.
24
+
25
+ Despite that many configurations start with the `ppo_` prefix, they work across different RL algorithms in verl, as the GRPO training loop is similar to that of PPO (without critic).
26
+
27
+ ![image](https://github.com/user-attachments/assets/16aebad1-0da6-4eb3-806d-54a74e712c2d)
28
+
29
+ - `actor_rollout_ref.rollout.n`: For each prompt, sample n times. Default to 1. For GRPO, please set it to a value larger than 1 for group sampling.
30
+
31
+ - `data.train_batch_size`: The global batch size of prompts used to generate a set of sampled trajectories/rollouts. The number of responses/trajectories is `data.train_batch_size * actor_rollout_ref.rollout.n`
32
+
33
+ - `actor_rollout_ref.actor.ppo_mini_batch_size`: The set of sampled trajectories is split into multiple mini-batches with batch_size=ppo_mini_batch_size for PPO actor updates. The ppo_mini_batch_size is a global size across all workers.
34
+
35
+ - `actor_rollout_ref.actor.ppo_epochs`: Number of epochs for GRPO updates on one set of sampled trajectories for actor
36
+
37
+ - `actor_rollout_ref.actor.clip_ratio`: The GRPO clip range. Default to 0.2
38
+
39
+ - `algorithm.adv_estimator`: Default is gae. Please set it to grpo instead
40
+
41
+ - `actor_rollout_ref.actor.loss_agg_mode`: Default is "token-mean". Options include "token-mean", "seq-mean-token-sum", "seq-mean-token-mean". The original GRPO paper takes the sample-level loss (seq-mean-token-mean), which may be unstable in long-CoT scenarios. All GRPO example scripts provided in verl use the default configuration "token-mean" for loss aggregation instead.
42
+
43
+ Instead of adding KL penalty in the reward, GRPO regularizes by directly adding the KL divergence between the trained policy and the reference policy to the loss:
44
+
45
+ - `actor_rollout_ref.actor.use_kl_loss`: To use kl loss in the actor. When used, we are not applying KL in the reward function. Default is False. Please set it to True for GRPO.
46
+
47
+ - `actor_rollout_ref.actor.kl_loss_coef`: The coefficient of kl loss. Default is 0.001.
48
+
49
+ - `actor_rollout_ref.actor.kl_loss_type`: Support kl(k1), abs, mse(k2), low_var_kl(k3) and full. Appending "+" in the end (e.g., 'k1+' and 'k3+') would apply straight through to employ k2 for unbiased gradient estimation, regardless of the kl value estimation (see https://github.com/volcengine/verl/pull/2953#issuecomment-3162113848 for more details). How to calculate the kl divergence between actor and reference policy. See this blog post for detailed analysis: http://joschu.net/blog/kl-approx.html
50
+
51
+ ## Advanced Extensions
52
+
53
+ ### DrGRPO
54
+
55
+ [Understanding R1-Zero-Like Training: A Critical Perspective](https://arxiv.org/pdf/2503.20783) claims there's optimization bias in GRPO, which leads to artificially longer responses, especially for incorrect outputs. This inefficiency stems from the way GRPO calculates advantages using group-based reward normalization. Instead, DrGRPO aggregates token-level losses by normalizing with a global constant to eliminate length bias.
56
+
57
+ Configure the following to enable DrGRPO, with all other parameters the same as GRPO's:
58
+
59
+ - `actor_rollout_ref.actor.loss_agg_mode`: "seq-mean-token-sum-norm", which turns off seq-dim averaging
60
+ - `actor_rollout_ref.actor.loss_scale_factor`: (Optional) Set to a constant integer (e.g., max response length) to ensure consistent normalization throughout training. If not set, uses the current batch's response length.
61
+ - `actor_rollout_ref.actor.use_kl_loss`: Please set it to False for DrGRPO
62
+ - `algorithm.norm_adv_by_std_in_grpo`: False, which turns off standard deviation norm
63
+
64
+ ## Reference Example
65
+
66
+ Qwen2.5 GRPO training log and commands: [link](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/qwen2-7b-fsdp2.log)
67
+
68
+ ```bash
69
+ bash examples/grpo_trainer/run_qwen3-8b.sh
70
+ ```
71
+
72
+ For more reference performance, please see https://verl.readthedocs.io/en/latest/algo/baseline.html
code/RL_model/verl/verl_train/docs/algo/opo.md ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # On-Policy RL with Optimal Reward Baseline (OPO)
2
+
3
+ Last updated: 06/02/2025.
4
+
5
+ Loose on-policy constraints and suboptimal baselines in reinforcement learning often lead to training instability such as large policy shifts and entropy collapse. OPO addresses these challenges by using exact on-policy training with the theoretically optimal reward baseline for advantage estimation. It achieves lower policy shifts and higher output entropy, encouraging more diverse and less repetitive responses.
6
+
7
+ OPO uses group sampling to generate multiple outputs for each input like GRPO. Unlike group-based algorithms which typically use the mean reward of a group as its baseline, OPO employs a theoretically optimal baseline: the length-weighted reward of the group. It also omits the standard deviation normalization. By adopting these two key components, OPO enables the training of a single policy model with the objective of maximizing only the expected reward. For more details, refer to the original paper [On-Policy RL with Optimal Reward Baseline](https://arxiv.org/pdf/2505.23585).
8
+
9
+ ## Key Components
10
+
11
+ - Exact On-Policy Training: always generates responses from the current policy, without using any pre-generated data or off-policy data.
12
+ - Optimal Reward Baseline: uses a length-weighted reward of the group as the baseline for normalizing the rewards.
13
+
14
+ ## Configuration
15
+
16
+ To configure OPO within the framework, use the following YAML settings. These parameters are crucial for enabling exact on-policy training and activating the optimal reward baseline.
17
+
18
+ ```yaml
19
+ algorithm:
20
+ adv_estimator: opo # Use OPO for optimal reward baseline
21
+ data:
22
+ train_batch_size: 1024
23
+ actor_rollout_ref:
24
+ actor:
25
+ ppo_mini_batch_size: 1024 # ppo_mini_batch_size should equal to train_batch_size to enable exact on-policy training
26
+ entropy_coeff: 0 # disable entropy regularization
27
+ use_kl_loss: False # disable kl regularization
28
+ kl_loss_coef: 0
29
+ ```
30
+
31
+ ## Advanced Extensions
32
+
33
+ OPO can also be extended to other algorithms like RLOO and Reinforce++. This only requires adjusting their configurations to enable exact on-policy training and incorporating the optimal length-weighted reward baseline, with minimal modifications to their advantage estimation functions.
code/RL_model/verl/verl_train/docs/algo/otb.md ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Optimal Token Baseline (OTB)
2
+
3
+ Last updated: 12/25/2025.
4
+
5
+ Optimal Token Baseline (OTB) is a dynamic token-level baseline for variance reduction. It weights updates based on "Realized Energy"—essentially, how much uncertainty has accumulated up to that specific token. It downweights the noisy parts and trusts the clear signals. Read [Optimal Token Baseline blog](https://richardli.xyz/optimal-token-baseline) for more details.
6
+
7
+ ## The method: OTB
8
+
9
+ - OTB builds a _dynamic_ baseline that adapts to each token by tracking the “Realized Energy”—the uncertainty that has accumulated up to that token. It downweights the noisy parts and trusts the clear signals.
10
+ - Unlike standard group means (which average over the padding `EOS` token ineffectively), OTB handles this naturally by computing baselines only over valid tokens.
11
+
12
+ ## Logit-Gradient Proxy
13
+
14
+ - Computing true uncertainty requires expensive backward passes (calculating gradient norms per token). Instead, OTB introduces the **Logit-Gradient Proxy**: the realized energy can be estimated entirely from forward probabilities.
15
+ - This means zero extra backward calls and effectively no additional runtime overhead.
16
+
17
+ ## Mechanics at a glance
18
+
19
+ For each prompt group of size `N`, OTB computes rewards-to-go `G_t` and cumulative variance weights `W_t`. The optimal baseline per token is
20
+
21
+ ```
22
+ B*_t = (Σ_i G_t^{(i)} · W_t^{(i)}) / (Σ_i W_t^{(i)} + ε),
23
+ W_t = Σ_{j=1}^t (1 - 2π_j + Σπ_j²),
24
+ Σπ_j² = exp(logsumexp(2·logits_j) - 2·logsumexp(logits_j)).
25
+ ```
26
+
27
+ The final advantage is `(G_t - B*_t) · mask_t`, so padding tokens stay at zero.
28
+
29
+ ## Integration in VERL
30
+
31
+ - `AdvantageEstimator.OPTIMAL_TOKEN_BASELINE` registers `compute_optimal_token_baseline_advantage`, invoked whenever `algorithm.adv_estimator` is set to `optimal_token_baseline`.
32
+ - `ActorRolloutRefWorker.compute_log_prob` emits an additional tensor `sum_pi_squared` (Σπ² per token) when `actor.calculate_sum_pi_squared=True`. This requires disabling fused log-prob kernels, because they do not surface logits.
33
+ - Trainers assert `sum_pi_squared` exists, regroup trajectories by `non_tensor_batch["uid"]`, and run the OTB calculation. If rollout IS is active, they rescale the weights by `rollout_is_weights**2` before aggregating.
34
+ - In Ulysses sequence-parallel setups, the actor gathers, unpads, and returns Σπ² in the same way it handles log-probabilities, so OTB supports sharded sequence-parallel models out of the box.
35
+ - `sum_pi_squared_checkpointing` is available to trade compute for memory when Σπ² tensors become large (e.g., lengthy chain-of-thought reasoning).
36
+
37
+ ## Configuration checklist
38
+
39
+ - `actor_rollout_ref.actor.calculate_sum_pi_squared: true` (mandatory).
40
+ - `actor_rollout_ref.model.use_fused_kernels: false` (required until fused kernels emit logits).
41
+ - `algorithm.adv_estimator: optimal_token_baseline`.
42
+ - Group sampling (`actor_rollout_ref.rollout.n > 1`) to unlock OTB’s variance reduction; with `n=1` the baseline collapses to returns.
43
+
44
+ Example OmegaConf overlay:
45
+
46
+ ```yaml
47
+ algorithm:
48
+ adv_estimator: optimal_token_baseline
49
+
50
+ actor_rollout_ref:
51
+ actor:
52
+ calculate_sum_pi_squared: true
53
+ sum_pi_squared_checkpointing: false # optional memory saver
54
+ rollout:
55
+ n: 8
56
+ ```
57
+
58
+ ## Example script
59
+
60
+ - `examples/otb_trainer/run_qwen2_5-7b.sh`.
61
+
62
+ ## Gradient Variance Proxy Metrics
63
+
64
+ All gradient-variance analysis in the Optimal Token Baseline work starts from the variance identity
65
+
66
+ ```
67
+ Var(ĝ) = E[||ĝ||²] - ||E[ĝ]||²,
68
+ ```
69
+
70
+ which states that the variance of any stochastic gradient equals the mean squared magnitude minus the squared norm of its expectation.
71
+
72
+ For a trajectory `τ`, the policy-gradient estimator is
73
+
74
+ ```
75
+ ĝ(τ) = ∇ log π_θ(τ) · A(τ), A(τ) = R(τ) - B.
76
+ ```
77
+
78
+ The logit-gradient proxy approximates the squared gradient norm without an extra backward pass:
79
+
80
+ ```
81
+ ||ĝ(τ)||² ≈ Ŵ(τ) · A(τ)²,
82
+ ```
83
+
84
+ where `Ŵ(τ)` is the realized energy defined above. Given a mini-batch `{τ_i}` of size `N`, we decompose its statistics into three diagnostics:
85
+
86
+ - **Signal strength (squared norm of the mean gradient)**
87
+ ```
88
+ S = || (1/N) · Σ ĝ(τ_i) ||²
89
+ ```
90
+ - **Total power (signal + noise)**
91
+ ```
92
+ P_total = (1/N) · Σ Ŵ(τ_i) · A(τ_i)²
93
+ ```
94
+ - **Pure noise (estimated variance of the batch mean)**
95
+ ```
96
+ Var_proxy = (1/(N-1)) · (P_total - S)
97
+ ```
98
+
99
+ `verl/trainer/ppo/metric_utils.py#L306` implements these diagnostics via `compute_variance_proxy_metrics`, emitting
100
+ `variance_proxy/proxy1_signal_strength`,
101
+ `variance_proxy/proxy2_total_power`, and
102
+ `variance_proxy/proxy3_pure_noise`.
103
+
104
+ Tracking these metrics provides a forward-only, low-overhead view of gradient health for any advantage estimator that supplies `sum_pi_squared`.
code/RL_model/verl/verl_train/docs/algo/ppo.md ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Proximal Policy Optimization (PPO)
2
+
3
+ Last updated: 06/19/2025.
4
+
5
+ Proximal Policy Optimization (PPO) is a family of policy gradient methods for reinforcement learning, proposed by OpenAI in 2017. PPO strikes a balance between simplicity, stability, and performance, making it one of the most widely used algorithms in modern RL applications, including large-scale language model fine-tuning.
6
+
7
+ Traditional policy gradient methods like REINFORCE or Vanilla Policy Gradient suffer from:
8
+
9
+ - High variance and sample inefficiency.
10
+ - Instability due to large policy updates.
11
+
12
+ PPO addresses this problem using a clipped surrogate objective that avoids overly large updates without requiring second-order derivatives.
13
+
14
+ For more technical details regarding PPO, we suggest reading the introduction in the [OpenAI spinning up tutorial](https://spinningup.openai.com/en/latest/algorithms/ppo.html), and the paper [Proximal Policy Optimization Algorithms](https://arxiv.org/abs/1707.06347).
15
+
16
+ ## Key Components
17
+
18
+ - Actor-Critic Architecture: PPO requires both an actor model (policy) and a critic model (value function). This differs from other algorithms like GRPO and RLOO that don't require a critic model.
19
+
20
+ - Generalized Advantage Estimation (GAE): PPO uses GAE for computing advantage values, which helps reduce variance in policy gradient estimates while maintaining low bias.
21
+
22
+ - Clipped Surrogate Objective: The core of PPO is implemented through the clipped surrogate objective function that limits policy updates.
23
+
24
+ ## Configuration
25
+
26
+ Note that all configs containing `micro_batch_size` are used to configure the maximum sample or token count per forward or backward pass to avoid GPU OOMs, whose value should not change algorithmic/convergence behavior.
27
+
28
+ Most critic configs are similar to those of actors. Note that the critic model is omitted from the figure below.
29
+
30
+ ![image](https://github.com/user-attachments/assets/16aebad1-0da6-4eb3-806d-54a74e712c2d)
31
+
32
+ - `data.train_batch_size`: The global batch size of prompts used to generate a set of sampled trajectories/rollouts. The number of responses/trajectories is `data.train_batch_size * actor_rollout_ref.rollout.n`
33
+
34
+ - `actor_rollout_ref.actor.ppo_mini_batch_size`: The set of sampled trajectories is split into multiple mini-batches with batch_size=ppo_mini_batch_size for PPO actor updates. The ppo_mini_batch_size is a global size across all workers
35
+
36
+ - `critic.ppo_mini_batch_size`: The set of sampled trajectories is split into multiple mini-batches with batch_size=ppo_mini_batch_size for PPO critic updates. The ppo_mini_batch_size is a global size across all workers
37
+
38
+ - `actor_rollout_ref.actor.clip_ratio`: The PPO clip range. Default to 0.2
39
+
40
+ - `actor_rollout_ref.actor.ppo_epochs`: Number of epochs for PPO updates on one set of sampled trajectories for actor
41
+
42
+ - `critic.ppo_epochs`: Number of epochs for PPO updates on one set of sampled trajectories for critic. Defaults to `actor_rollout_ref.actor.ppo_epochs`
43
+
44
+ - `algorithm.gamma`: discount factor
45
+
46
+ - `algorithm.lam`: The lambda term that trades off between bias and variance in the GAE estimator
47
+
48
+ - `algorithm.adv_estimator`: Support gae, grpo, reinforce_plus_plus, reinforce_plus_plus_baseline, rloo
49
+
50
+ ## Advanced Extensions
51
+
52
+ ### KL Divergence Control
53
+
54
+ Options to prevent the policy from diverging too far from a reference policy. Two mechanisms are available: KL reward penalty and KL loss. For more technical details, see [Training language models to follow instructions with human feedback](https://arxiv.org/abs/2203.02155)
55
+
56
+ Options to use KL loss for KL divergence control:
57
+
58
+ - `actor_rollout_ref.actor.use_kl_loss`: to use kl loss in the actor. When used, we are not applying KL in the reward function. Default is False
59
+
60
+ - `actor_rollout_ref.actor.kl_loss_coef`: The coefficient of kl loss. Default is 0.001.
61
+
62
+ - `actor_rollout_ref.actor.kl_loss_type`: Support kl(k1), abs, mse(k2), low_var_kl(k3) and full. Appending "+" in the end (e.g., 'k1+' and 'k3+') would apply straight through to employ k2 for unbiased gradient estimation, regardless of the kl value estimation (see https://github.com/volcengine/verl/pull/2953#issuecomment-3162113848 for more details). How to calculate the kl divergence between actor and reference policy. See this blog post for detailed analysis: http://joschu.net/blog/kl-approx.html
63
+
64
+ Options to use KL penalty in the reward:
65
+
66
+ - `algorithm.use_kl_in_reward`: Whether to enable in-reward kl penalty. Default is False.
67
+
68
+ - `algorithm.kl_penalty`: Support kl(k1), abs, mse(k2), low_var_kl(k3) and full. This defines the way to calculate the kl divergence between actor and reference policy. For specific options, refer to `kl_penalty` in core_algos.py. See this blog post for detailed analysis: http://joschu.net/blog/kl-approx.html
69
+
70
+ - `algorithm.kl_ctrl.kl_coef`: The (initial) coefficient of in-reward kl_penalty. Default is 0.001.
71
+ - `algorithm.kl_ctrl.type`: 'fixed' for FixedKLController and 'adaptive' for AdaptiveKLController.
72
+ - `algorithm.kl_ctrl.horizon`: See source code of AdaptiveKLController for details.
73
+ - `algorithm.kl_ctrl.target_kl`: See source code of AdaptiveKLController for details.
74
+
75
+ ### Dual-clip PPO
76
+
77
+ Dual-Clip PPO introduces an approach that applies a lower bound to the policy loss when the advantage is less than zero, so that the loss, even when multiplied by a large ratio, does not exceed a specified bound.
78
+
79
+ ![image](https://github.com/user-attachments/assets/fc232181-d8b0-4307-8dd2-4dc0a4c1c139)
80
+
81
+ - `actor_rollout_ref.actor.clip_ratio_c`: lower bound of the value for Dual-clip PPO, defaults to 3.0
82
+
83
+ ## Reference Example
84
+
85
+ Qwen2.5 training log and commands: [link](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/Qwen2.5-0.5B-bsz256_2-prompt1024-resp512-0.567.log)
86
+
87
+ ```bash
88
+ bash run_gemma.sh \
89
+ trainer.n_gpus_per_node=1 \
90
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
91
+ trainer.logger=console \
92
+ critic.model.path=Qwen/Qwen2.5-0.5B-Instruct \
93
+ actor_rollout_ref.model.path=Qwen/Qwen2.5-0.5B-Instruct \
94
+ data.train_batch_size=256 \
95
+ actor_rollout_ref.actor.ppo_mini_batch_size=64 \
96
+ actor_rollout_ref.actor.ppo_micro_batch_size=2 \
97
+ critic.ppo_micro_batch_size=2
98
+ ```
99
+
100
+ Reference performance with verl v0.2:
101
+
102
+ | Model | Method | Score | Link |
103
+ |-------------------------------|------------------|-------|------------------------------------------------------------------------------------------------|
104
+ | Qwen/Qwen2.5-0.5B-Instruct | pretrained model | 36.4 | [Qwen Blog](https://qwenlm.github.io/blog/qwen2.5-llm/) |
105
+ | Qwen/Qwen2.5-0.5B-Instruct | PPO | 56.7 | [PPO Command and Logs](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/Qwen2.5-0.5B-bsz256_2-prompt1024-resp512-0.567.log) |
code/RL_model/verl/verl_train/docs/algo/rollout_corr.md ADDED
@@ -0,0 +1,1313 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Rollout Correction
2
+
3
+ **Author:** [Yingru Li](https://richardli.xyz/)
4
+
5
+ Last updated: 10/30/2025.
6
+
7
+ ---
8
+
9
+ > **📖 Documentation Structure**
10
+ >
11
+ > - **This document** - Practical usage guide: configurations, presets, troubleshooting
12
+ > - **[Mathematical Formulations](rollout_corr_math.md)** - Theoretical foundations, derivations, and algorithmic details
13
+ >
14
+ > Start here for implementation, refer to the math doc for theory and design rationale.
15
+
16
+ ---
17
+
18
+ This document provides a comprehensive overview of the Rollout Correction implementation in verl.
19
+
20
+ **Note on Naming**: This feature is called "Rollout Correction" to reflect the complete functionality: importance sampling (IS) weights and rejection sampling (RS). The internal variable `rollout_is_weights` retains its name as it specifically refers to the IS weights component.
21
+
22
+ ### BibTeX Citation
23
+
24
+ ```bibtex
25
+ @online{liu-li-2025-rl-collapse,
26
+ title = {When Speed Kills Stability: Demystifying {RL} Collapse from the Training-Inference Mismatch},
27
+ author = {Liu, Jiacai and Li, Yingru and Fu, Yuqian and Wang, Jiawei and Liu, Qian and Shen, Yu},
28
+ year = {2025},
29
+ month = sep,
30
+ url = {https://richardli.xyz/rl-collapse}
31
+ }
32
+ ```
33
+
34
+ ### Blog Series
35
+
36
+ - Main blog post: https://richardli.xyz/rl-collapse
37
+ - [Part 1: Why Mismatch Breaks LLM-RL](https://richardli.xyz/rl-collapse-1) (analytical framework using TV distance for bias and χ²-divergence for variance)
38
+ - [Part 2: The Gradient Estimator Trials](https://richardli.xyz/rl-collapse-2) (token-level vs sequence-level correction bias-variance tradeoff)
39
+ - [Part 3: When Math Meets Reality—Toxic Tails and Length Traps](https://richardli.xyz/rl-collapse-3) (why rejection over clipping, and geometric-level RS)
40
+
41
+ ## Overview
42
+
43
+ Rollout Correction provides a unified framework to handle **general off-policy problems** in RL training. Any scenario where the data collection distribution differs from the training distribution can benefit from these methods.
44
+
45
+ **Common off-policy scenarios:**
46
+
47
+ 1. **Policy Mismatch** (Implementation Differences)
48
+
49
+ - Different precision: FP8 vs FP16 vs BF16 vs FP32
50
+ - Different backends: vLLM vs SGLang vs FSDP vs Megatron
51
+ - Different implementations even with identical weights
52
+
53
+ 2. **Temporal Lag** (Model Staleness)
54
+
55
+ - Rollout uses older checkpoint while training has progressed
56
+ - Asynchronous rollout workers with stale parameters
57
+ - Common in distributed/async RL systems
58
+
59
+ 3. **Replay Buffers**
60
+
61
+ - Training on historical trajectories from earlier iterations
62
+ - Experience replay from different policy versions
63
+ - Data augmentation or resampling strategies
64
+
65
+ 4. **Off-Policy Algorithms**
66
+
67
+ - Behavioral cloning from expert demonstrations
68
+ - DAPO (data from auxiliary policies)
69
+ - Any algorithm using trajectories from a different policy
70
+
71
+ 5. **Data Quality Filtering**
72
+ - Reweighting or filtering collected data
73
+ - Preference learning with modified distributions
74
+ - Curriculum learning with distribution shifts
75
+
76
+ These off-policy gaps can cause training instability and policy collapse. Rollout Correction uses importance sampling (IS) weights and rejection sampling (RS) to correct for any distribution shift between data collection and training.
77
+
78
+ **Important Note on Common Implementation Mistakes:**
79
+
80
+ Many LLM-RL implementations incorrectly apply PPO by **ignoring the actual rollout policy** π_rollout and assuming the training reference policy π_old is the behavior policy. This is mathematically incorrect when π_rollout ≠ π_old (which is typical in LLM-RL due to precision/backend differences between rollout and training).
81
+
82
+ **This is not PPO's fault** - PPO itself is mathematically correct. The issue is the incorrect assumption that π_old = π_rollout in naive implementations.
83
+
84
+ This critical implementation mistake that leads to RL training collapse was identified in the blog post ["When Speed Kills Stability: Demystifying RL Collapse from the Training-Inference Mismatch"](https://richardli.xyz/rl-collapse) and motivated the development of this rollout correction framework.
85
+
86
+ **Mathematically correct approaches:**
87
+
88
+ - **Decoupled mode**: Three policies (π_rollout, π_old, π_θ) with IS correction from π_rollout to π_old
89
+ - **Bypass mode**: Two policies (π_rollout = π_old, π_θ) using actual rollout policy as PPO anchor
90
+ - **Bypass + Policy Gradient mode**: Two policies (π_rollout, π_θ) with IS/RS correction and no PPO clipping
91
+
92
+ See [Mathematical Formulations](rollout_corr_math.md#38-common-implementation-mistake) for detailed explanation.
93
+
94
+ ### Key Design Principle: Separation of IS Weights and Rejection Sampling
95
+
96
+ The implementation cleanly separates two orthogonal mechanisms:
97
+
98
+ 1. **IS Weights** (`rollout_is_weights`): Continuous reweighting for gradient correction
99
+
100
+ - Policy ratio: π_old/π_rollout (decoupled) or π_θ/π_rollout (bypass)
101
+ - **Safety-bounded**: Clamped to [exp(-20), exp(20)] ≈ [2e-9, 5e8] to prevent overflow
102
+ - Token level: Bounds per-token ratios
103
+ - Sequence level: Bounds product of ratios (broadcast to all tokens)
104
+ - **Truncated**: Upper clamped via `.clamp(max=rollout_is_threshold)` (TIS: Truncated Importance Sampling)
105
+ - **Zeroed at padding**: Multiplied by response_mask to zero out padding positions
106
+ - Used to weight policy gradients (variance reduction)
107
+
108
+ 2. **Rejection Sampling** (`modified_response_mask`): Binary filtering for outlier exclusion
109
+ - Creates binary mask: 1 = keep, 0 = reject
110
+ - Rejects tokens/sequences with IS ratios outside [lower_threshold, upper_threshold]
111
+ - Modifies response_mask to exclude rejected samples from training
112
+ - Used for loss aggregation (rejected samples don't contribute to gradients)
113
+
114
+ This separation ensures:
115
+
116
+ - ✅ IS weights provide continuous reweighting (reduce variance)
117
+ - ✅ Rejection sampling provides hard filtering (remove extreme outliers)
118
+ - ✅ Both mechanisms can be enabled independently or together
119
+ - ✅ Safety bounds prevent numerical overflow in all cases
120
+
121
+ ## Quick Start: Using Verified Presets
122
+
123
+ **NEW**: We now provide typed configuration with verified presets for common scenarios. These presets have been validated with tens of thousands of GPU hours across various models and training scenarios.
124
+
125
+ ### Python API
126
+
127
+ ```python
128
+ from verl.trainer.config.algorithm import RolloutCorrectionConfig
129
+
130
+ # === Decoupled PPO mode (3 policies: π_rollout, π_old, π_θ) ===
131
+ # IS weights correct for gap between π_old and π_rollout
132
+ config = RolloutCorrectionConfig.decoupled_token_is() # Token-TIS
133
+ config = RolloutCorrectionConfig.decoupled_seq_is() # Seq-TIS
134
+ config = RolloutCorrectionConfig.decoupled_seq_is_rs() # Seq-MIS
135
+ config = RolloutCorrectionConfig.decoupled_geo_rs() # Geo-RS (ratio mode)
136
+ config = RolloutCorrectionConfig.decoupled_geo_rs_token_tis() # Geo-RS + Token-TIS
137
+
138
+ # === K3 KL Estimator presets (more stable for small KL) ===
139
+ config = RolloutCorrectionConfig.decoupled_k3_rs() # K3-RS only
140
+ config = RolloutCorrectionConfig.decoupled_k3_rs_token_tis() # K3-RS + Token-TIS
141
+
142
+ # === Bypass PPO mode (2 policies: π_rollout = π_old, π_θ) - fast ===
143
+ # PPO ratio handles IS, so no explicit IS weights needed
144
+ config = RolloutCorrectionConfig.bypass_ppo_clip() # PPO-clip only
145
+ config = RolloutCorrectionConfig.bypass_ppo_clip_geo_rs() # PPO-clip + Geo-RS (ratio)
146
+ config = RolloutCorrectionConfig.bypass_ppo_clip_k3_rs() # PPO-clip + K3-RS
147
+
148
+ # === Bypass PG mode (2 policies, no PPO clipping) - fast ===
149
+ # IS weights computed on-the-fly as π_θ / π_rollout
150
+ config = RolloutCorrectionConfig.bypass_pg_is() # Seq-TIS + PG
151
+ config = RolloutCorrectionConfig.bypass_pg_geo_rs() # Geo-RS + PG (ratio)
152
+ config = RolloutCorrectionConfig.bypass_pg_geo_rs_token_tis() # Geo-RS + Token-TIS + PG
153
+
154
+ # === Other ===
155
+ config = RolloutCorrectionConfig.disabled() # Metrics only (no correction)
156
+ ```
157
+
158
+ ### YAML Configuration (Advanced)
159
+
160
+ For advanced customization or YAML-based configs:
161
+
162
+ ```yaml
163
+ algorithm:
164
+ rollout_correction:
165
+ rollout_is: token # IS weights: "token", "sequence", or null
166
+ rollout_is_threshold: 2.0 # Upper threshold for IS weights
167
+ rollout_is_batch_normalize: false # Batch normalize IS weights to mean=1.0
168
+ rollout_rs: null # Rejection sampling: comma-separated canonical options (e.g. "token_k1,seq_max_k2")
169
+ rollout_rs_threshold: null # Threshold spec: float(s) or "lower_upper" string(s)
170
+ bypass_mode: false # Skip old_log_prob computation (sets π_old = π_rollout)
171
+ loss_type: ppo_clip # Loss type in bypass mode: "ppo_clip" (default) or "reinforce"
172
+
173
+ # REQUIRED: Enable log prob calculation
174
+ actor_rollout_ref:
175
+ rollout:
176
+ calculate_log_probs: true
177
+ ```
178
+
179
+ ## Files
180
+
181
+ ### **Core Implementation**
182
+
183
+ - `verl/trainer/ppo/rollout_corr_helper.py` - Contains `compute_rollout_correction_and_rejection_mask()` and `compute_offpolicy_metrics()`
184
+ - `verl/trainer/ppo/core_algos.py` - Rollout Correction integration with PPO and REINFORCE modes (`compute_policy_loss_bypass_mode()`, `compute_policy_loss_reinforce()`)
185
+ - `verl/trainer/ppo/ray_trainer.py` - Bypass mode implementation (skips `old_log_prob` computation)
186
+ - `verl/workers/actor/dp_actor.py` - Mode selection logic and metrics collection
187
+
188
+ ### **Configuration Files**
189
+
190
+ - `verl/trainer/config/algorithm.py` - Rollout Correction parameters in `AlgoConfig`
191
+ - `verl/workers/config/actor.py` - Rollout Correction parameters in `ActorConfig`
192
+ - `verl/trainer/config/actor/actor.yaml` - Rollout Correction configuration section
193
+ - `verl/trainer/config/ppo_trainer.yaml` - Algorithm config with Rollout Correction
194
+
195
+ ### **Documentation**
196
+
197
+ - `docs/examples/config.rst` - Configuration parameter descriptions
198
+
199
+ ### **Example Scripts**
200
+
201
+ - `recipe/dapo/run_dapo_qwen2.5_32b_rollout_corr.sh` - DAPO example with Rollout Correction
202
+ - `examples/rollout_correction/run_with_rollout_corr.sh` - Basic example
203
+ - `examples/rollout_correction/run_with_rollout_corr_multi_rs.sh` - Multi-RS example
204
+
205
+ ### **Tests**
206
+
207
+ - `tests/trainer/ppo/test_rollout_corr.py` - Unit tests for IS/RS mechanisms
208
+ - `tests/trainer/ppo/test_rollout_corr_integration.py` - Integration tests
209
+
210
+ ## Configuration Parameters
211
+
212
+ All parameters are under `algorithm.rollout_correction`:
213
+
214
+ ### `rollout_is` (str or null)
215
+
216
+ Importance sampling weights aggregation level:
217
+
218
+ - `null` = No IS weights computed (metrics-only mode)
219
+ - `"token"`: Per-token IS weights
220
+ - **Decoupled mode**: ρ_t = π_old(t)/π_rollout(t)
221
+ - **Bypass/Pure IS mode**: ρ_t = π_θ(t)/π_rollout(t)
222
+ - Independent truncation per token
223
+ - Typical threshold: 1.5 - 5.0
224
+ - `"sequence"`: Per-sequence weight ρ_seq = ∏_t ρ_t
225
+ - Multiplicative aggregation across sequence
226
+ - Typical threshold: 2.0 - 10.0
227
+
228
+ All IS weights are safety-bounded to [exp(-20), exp(20)] ≈ [2e-9, 5e8]
229
+
230
+ ### `rollout_is_threshold` (float)
231
+
232
+ Upper threshold for IS weight truncation. Default: `2.0`
233
+
234
+ - Truncates IS weights via `.clamp(max=rollout_is_threshold)` (TIS: Truncated Importance Sampling)
235
+ - Applied to IS weights for variance reduction
236
+ - Separate from rejection sampling (controlled by `rollout_rs` parameters)
237
+
238
+ ### `rollout_is_batch_normalize` (bool)
239
+
240
+ Apply batch normalization to IS weights. Default: `False`
241
+
242
+ - `True`: Normalize IS weights to have mean=1.0 within each batch
243
+ - **Token-level IS**: Normalizes over all token weights
244
+ - **Sequence-level IS**: Normalizes over sequence means (one weight per sequence)
245
+ - `False`: Use raw (truncated) IS weights
246
+ - Reduces variance by ensuring average weight is 1.0 per batch
247
+ - Applied AFTER truncation to preserve truncation semantics
248
+ - Only affects IS weight values, not rejection sampling
249
+
250
+ ### `rollout_rs` (str or null)
251
+
252
+ Rejection sampling aggregation modes. Supply a comma-separated string (spaces optional) using the canonical options implemented in `rollout_corr_helper`:
253
+
254
+ - `token_k1`: Token-level rejection with `-log r` bounds (ratio thresholds supplied as `lower_upper`). Example: `"0.6_1.4"`
255
+ - `token_k2`: Token-level rejection with `0.5 * (log r)^2` (upper bound only)
256
+ - `token_k3`: Token-level rejection with `exp(log r) - 1 - log r` (upper bound only)
257
+ - `seq_sum_k1`: Sequence-level rejection with sum of `-log r` (ratio bounds)
258
+ - `seq_sum_k2`: Sequence-level rejection with sum of `0.5 * (log r)^2` (upper bound only)
259
+ - `seq_sum_k3`: Sequence-level rejection with sum of `exp(log r) - 1 - log r` (upper bound only)
260
+ - `seq_mean_k1`: Sequence-level rejection with mean of `-log r` (ratio bounds)
261
+ - `seq_mean_k2`: Sequence-level rejection with mean of `0.5 * (log r)^2` (upper bound only)
262
+ - `seq_mean_k3`: Sequence-level rejection with mean of `exp(log r) - 1 - log r` (upper bound only)
263
+ - `seq_max_k2`: Sequence-level rejection with max of `0.5 * (log r)^2` (upper bound only)
264
+ - `seq_max_k3`: Sequence-level rejection with max of `exp(log r) - 1 - log r` (upper bound only)
265
+
266
+ ### `rollout_rs_threshold` (str, float, or null)
267
+
268
+ Threshold specification for rejection sampling.
269
+
270
+ - Provide **one entry per option**, separated by commas. A single entry is broadcast to every option.
271
+ - **Ratio modes (`*k1`)**: Use `"lower_upper"` strings (e.g. `"0.7_1.3"`). Supplying a float implies only the upper bound; the lower bound defaults to its reciprocal.
272
+ - **Divergence modes (`*k2`/`*k3`)**: Supply positive upper bounds (float or numeric string).
273
+ - Set to `null` to disable thresholds entirely (only valid when `rollout_rs` is null).
274
+
275
+ ## Understanding the Framework: Components and Combinations
276
+
277
+ The rollout correction framework is built from **orthogonal components** that can be combined flexibly. Understanding these components helps you choose the right configuration for your scenario.
278
+
279
+ ### Key Components
280
+
281
+ 1. **Operating Mode** (Section: [Operation Modes](#operation-modes))
282
+
283
+ - **Decoupled**: Three policies (π_rollout, π_old, π_θ) with separate π_old computation
284
+ - **Bypass**: Two policies (π_rollout = π_old, π_θ), skips π_old computation
285
+
286
+ 2. **Loss Function** (in bypass mode, controlled by `loss_type`)
287
+
288
+ - **PPO-clip** (`loss_type="ppo_clip"`, default): PPO clipped objective (IS handled by ratio)
289
+ - **REINFORCE** (`loss_type="reinforce"`): Policy gradient with explicit IS weights (no clipping)
290
+
291
+ 3. **IS/RS Aggregation Level**
292
+ - **Token**: Per-token IS weights/rejection
293
+ - **Sequence**: Sequence-level IS weights/rejection
294
+
295
+ See [Mathematical Formulations](rollout_corr_math.md#3-algorithmic-components-and-combinations) for detailed theory.
296
+
297
+ ---
298
+
299
+ ## Preset Configuration Guide
300
+
301
+ This section provides detailed guidance on choosing and using the verified presets. Each preset is a specific combination of components optimized for common scenarios.
302
+
303
+ ### Understanding the Presets
304
+
305
+ #### Available Preset Methods
306
+
307
+ | Preset Method | Estimator | Mode | IS Level | RS Level | Properties |
308
+ | ------------------------------------------------------------------------------ | ---------------- | ------------------ | -------- | -------- | --------------------------------------- |
309
+ | **Decoupled PPO Mode** (3 policies: π_rollout, π_old, π_θ) |
310
+ | `decoupled_token_is()` | Token-TIS | Decoupled | token | - | Per-token IS weights |
311
+ | `decoupled_seq_is()` | Seq-TIS | Decoupled | sequence | - | Sequence-level IS weights |
312
+ | `decoupled_seq_is_rs()` | Seq-MIS | Decoupled | sequence | sequence | Sequence IS + sequence RS |
313
+ | `decoupled_geo_rs()` | Geo-RS | Decoupled | - | sequence | Geometric RS (ratio mode) |
314
+ | `decoupled_geo_rs_token_tis()` | Geo-RS-Token-TIS | Decoupled | token | sequence | Geometric filter + token clipped weight |
315
+ | **K3 KL Estimator** (more stable for small KL values) |
316
+ | `decoupled_k3_rs()` | K3-RS | Decoupled | - | k3 | K3 rejection, no IS weights |
317
+ | `decoupled_k3_rs_token_tis()` | K3-RS-Token-TIS | Decoupled | token | k3 | K3 filter + token clipped weight |
318
+ | **Bypass Mode (PPO-clip)** (2 policies; ratio handles IS, RS masks outliers) |
319
+ | `bypass_ppo_clip()` | - | Bypass (PPO-clip) | - | - | PPO-clip only |
320
+ | `bypass_ppo_clip_geo_rs()` | Geo-RS | Bypass (PPO-clip) | - | sequence | PPO-clip + Geo-RS (ratio) |
321
+ | `bypass_ppo_clip_k3_rs()` | K3-RS | Bypass (PPO-clip) | - | k3 | PPO-clip + K3-RS |
322
+ | **Bypass Mode (REINFORCE)** (2 policies; explicit IS weights, no PPO clipping) |
323
+ | `bypass_pg_is()` | Seq-TIS | Bypass (REINFORCE) | sequence | - | REINFORCE with explicit IS |
324
+ | `bypass_pg_geo_rs()` | Geo-RS | Bypass (REINFORCE) | - | sequence | REINFORCE with Geo-RS (ratio) |
325
+ | `bypass_pg_geo_rs_token_tis()` | Geo-RS-Token-TIS | Bypass (REINFORCE) | token | sequence | REINFORCE + Geo filter + token IS |
326
+ | **Other** |
327
+ | `disabled()` | - | - | - | - | Metrics only, no correction |
328
+
329
+ **Note:**
330
+
331
+ - **Bypass mode** sets π_old = π_rollout and uses `loss_type` to select the loss function:
332
+ - `"ppo_clip"` (default): PPO clipped objective where ratio = π_θ/π_rollout already handles IS
333
+ - `"reinforce"`: REINFORCE with explicit IS weights as π_θ / π_rollout
334
+ - Both loss types benefit from rejection sampling (RS) which masks out-of-distribution samples.
335
+ - Estimators (Token-TIS, Seq-TIS, Seq-MIS, Geo-RS) are compatible with Decoupled and Bypass modes.
336
+
337
+ #### Other Supported Combinations (Manual Configuration Required)
338
+
339
+ **Other supported combinations without preset methods:**
340
+
341
+ - Token IS + Token RS: Token-level IS weights + token-level RS mask
342
+ - Pure token RS: Token-level RS only, no IS weights
343
+ - Pure sequence RS: Sequence-level RS only, no IS weights
344
+
345
+ See [detailed configuration examples below](#additional-useful-configurations-not-exposed-as-presets) for manual configurations.
346
+
347
+ **Key properties:**
348
+
349
+ - Any aggregation level (token/sequence/geometric) works in either decoupled or bypass mode
350
+ - All combinations are fully supported by the implementation
351
+ - Rejection sampling is independent of IS weighting
352
+ - Pure RS (`bypass_pg_geo_rs`) uses bypass + geometric RS with `loss_type="reinforce"` (no IS weights)
353
+
354
+ ---
355
+
356
+ ### 1. Decoupled Mode with Token-level Importance Sampling (`decoupled_token_is`)
357
+
358
+ **Configuration:**
359
+
360
+ ```python
361
+ config = RolloutCorrectionConfig.decoupled_token_is(threshold=2.0)
362
+ ```
363
+
364
+ **Components:**
365
+
366
+ - **Operating Mode**: Decoupled (3 policies)
367
+ - **Loss**: PPO with clipping (only for the second drift correction)
368
+ - **IS Aggregation**: Token-level
369
+ - **RS**: None (can be added separately)
370
+
371
+ **Equivalent YAML:**
372
+
373
+ ```yaml
374
+ algorithm:
375
+ rollout_correction:
376
+ rollout_is: token
377
+ rollout_is_threshold: 2.0
378
+ rollout_rs: null
379
+ bypass_mode: false # Decoupled mode
380
+ ```
381
+
382
+ **Properties:**
383
+
384
+ - Independent truncation per token
385
+ - Lower variance than sequence-level (product of ratios bounded individually)
386
+ - Typical threshold: 1.5 - 5.0
387
+
388
+ **Theory:** See [rollout_corr_math.md §3.3.1](rollout_corr_math.md#331-token-level-aggregation)
389
+
390
+ ---
391
+
392
+ ### 2. Decoupled Mode with Sequence-level Importance Sampling (`decoupled_seq_is`)
393
+
394
+ **Also known as: Seq-TIS (Sequence-Level Truncated IS)**
395
+
396
+ **Configuration:**
397
+
398
+ ```python
399
+ config = RolloutCorrectionConfig.decoupled_seq_is(threshold=2.0)
400
+ ```
401
+
402
+ **Components:**
403
+
404
+ - **Operating Mode**: Decoupled (3 policies)
405
+ - **Loss**: PPO with clipping (only for the second drift correction)
406
+ - **IS Aggregation**: Sequence-level (Seq-TIS)
407
+ - **RS**: None (can be added separately)
408
+
409
+ **Equivalent YAML:**
410
+
411
+ ```yaml
412
+ algorithm:
413
+ rollout_correction:
414
+ rollout_is: sequence
415
+ rollout_is_threshold: 2.0
416
+ rollout_rs: null
417
+ bypass_mode: false # Decoupled mode
418
+ ```
419
+
420
+ **Properties:**
421
+
422
+ - Multiplicative aggregation across sequence
423
+ - More sensitive to outliers than token-level
424
+ - Typical threshold: 2.0 - 10.0 (higher than token-level)
425
+
426
+ **Theory:** See [rollout_corr_math.md §3.3.2](rollout_corr_math.md#332-sequence-level-aggregation)
427
+
428
+ ---
429
+
430
+ ### 3. Decoupled Mode with Sequence-level IS + Rejection Sampling (`decoupled_seq_is_rs`)
431
+
432
+ **Also known as: Seq-MIS (Sequence-Level Masked IS)**
433
+
434
+ **Configuration:**
435
+
436
+ ```python
437
+ config = RolloutCorrectionConfig.decoupled_seq_is_rs(is_threshold=2.0, rs_threshold="0.5_2.0")
438
+ ```
439
+
440
+ **Components:**
441
+
442
+ - **Operating Mode**: Decoupled (3 policies)
443
+ - **Loss**: PPO with clipping (only for the second drift correction)
444
+ - **IS Aggregation**: Sequence-level (Seq-TIS)
445
+ - **RS**: Sequence-level rejection (Seq-MIS)
446
+
447
+ **Equivalent YAML:**
448
+
449
+ ```yaml
450
+ algorithm:
451
+ rollout_correction:
452
+ rollout_is: sequence
453
+ rollout_is_threshold: 2.0
454
+ rollout_rs: seq_sum_k1
455
+ rollout_rs_threshold: 0.5_2.0
456
+ bypass_mode: false # Decoupled mode
457
+ ```
458
+
459
+ **Properties:**
460
+
461
+ - Double mechanism: IS reweighting (Seq-TIS) + rejection filtering (Seq-MIS)
462
+ - Lower effective sample size (rejects outliers)
463
+ - For severe off-policy gaps or when the distribution tail is "toxic" (garbage/adversarial samples)
464
+
465
+ **When to use Seq-MIS over Seq-TIS:**
466
+
467
+ - **Seq-TIS (clipping only)**: Maximizes information efficiency; extracts signal from all samples. Use when data is clean and mismatch is moderate.
468
+ - **Seq-MIS (rejection)**: Maximizes safety; acts as a hard trust region filter. Use when mismatch is severe or when high-weight samples are likely garbage rather than signal.
469
+
470
+ **Theory:** See [rollout_corr_math.md §3.4](rollout_corr_math.md#34-rejection-sampling-rs)
471
+
472
+ ---
473
+
474
+ ### 6. Bypass Mode with PPO-clip (`bypass_ppo_clip`)
475
+
476
+ **Configuration:**
477
+
478
+ ```python
479
+ config = RolloutCorrectionConfig.bypass_ppo_clip()
480
+ ```
481
+
482
+ **Components:**
483
+
484
+ - **Operating Mode**: Bypass (2 policies: π_rollout = π_old, π_θ)
485
+ - **Loss**: PPO-clip (IS handled by ratio, no explicit IS weights)
486
+ - **IS Aggregation**: None (PPO ratio handles it)
487
+ - **RS**: None
488
+
489
+ **Equivalent YAML:**
490
+
491
+ ```yaml
492
+ algorithm:
493
+ rollout_correction:
494
+ rollout_is: null
495
+ rollout_rs: null
496
+ bypass_mode: true
497
+ loss_type: ppo_clip
498
+ ```
499
+
500
+ **Properties:**
501
+
502
+ - PPO clipped objective in bypass mode
503
+ - The PPO ratio = π_θ/π_rollout already handles IS (no explicit IS weights needed)
504
+ - Skips `actor.compute_log_prob()` forward pass (2 policies instead of 3)
505
+ - No rejection sampling - use `bypass_ppo_clip_geo_rs()` for RS
506
+
507
+ **Configuration requirement:**
508
+
509
+ - Set `actor_rollout_ref.rollout.calculate_log_probs: true`
510
+
511
+ **Theory:** See [rollout_corr_math.md §3.1.2](rollout_corr_math.md#312-bypass-mode-two-policies)
512
+
513
+ ---
514
+
515
+ ### 7. REINFORCE with IS (`bypass_pg_is`)
516
+
517
+ **Configuration:**
518
+
519
+ ```python
520
+ config = RolloutCorrectionConfig.bypass_pg_is(threshold=2.0)
521
+ ```
522
+
523
+ **Components:**
524
+
525
+ - **Operating Mode**: Bypass (2 policies: π_rollout, π_θ)
526
+ - **Loss**: REINFORCE (policy gradient with explicit IS weights, no PPO clipping)
527
+ - **IS Aggregation**: Sequence-level
528
+ - **RS**: None
529
+
530
+ **Equivalent YAML:**
531
+
532
+ ```yaml
533
+ algorithm:
534
+ rollout_correction:
535
+ rollout_is: sequence
536
+ rollout_is_threshold: 2.0
537
+ rollout_rs: null
538
+ bypass_mode: true
539
+ loss_type: reinforce # REINFORCE with explicit IS weights
540
+ ```
541
+
542
+ **Properties:**
543
+
544
+ - REINFORCE loss with explicit IS weights (no PPO clipping)
545
+ - Single forward pass (skips old_log_prob computation)
546
+ - IS weights computed on-the-fly in loss function
547
+
548
+ **Theory:** See [rollout_corr_math.md §3.2.2](rollout_corr_math.md#322-policy-gradient-loss-with-isrs-correction)
549
+
550
+ ---
551
+
552
+ ## Additional Useful Configurations (Not Exposed as Presets)
553
+
554
+ These configurations are **fully supported** but don't have convenience preset methods yet.
555
+
556
+ ### 1. Token IS + Token RS (`token_is_rs`)
557
+
558
+ Token-level IS weights with token-level RS mask.
559
+
560
+ **Python:**
561
+
562
+ ```python
563
+ config = RolloutCorrectionConfig(
564
+ rollout_is="token",
565
+ rollout_is_threshold=2.0,
566
+ rollout_rs="token_k1",
567
+ rollout_rs_threshold=2.0,
568
+ )
569
+ ```
570
+
571
+ **Properties:** Per-token IS weights + per-token RS mask.
572
+
573
+ ### 2. Pure Token RS (`token_rs`)
574
+
575
+ Token-level RS only, no IS weights.
576
+
577
+ **Python:**
578
+
579
+ ```python
580
+ config = RolloutCorrectionConfig(
581
+ rollout_is=None,
582
+ rollout_rs="token_k1",
583
+ rollout_rs_threshold=2.0,
584
+ )
585
+ ```
586
+
587
+ **Properties:** Token-level RS mask, no IS reweighting.
588
+
589
+ ### 3. Pure Sequence RS (`seq_rs`)
590
+
591
+ Sequence-level RS only, no IS weights.
592
+
593
+ **Python:**
594
+
595
+ ```python
596
+ config = RolloutCorrectionConfig(
597
+ rollout_is=None,
598
+ rollout_rs="seq_sum_k1",
599
+ rollout_rs_threshold="0.5_2.0",
600
+ )
601
+ ```
602
+
603
+ **Properties:** Sequence-level RS mask, no IS reweighting.
604
+
605
+ ---
606
+
607
+ ### Summary: How IS Weights are Processed
608
+
609
+ IS weights (`rollout_is_weights`) go through a fixed processing pipeline:
610
+
611
+ **Stage 1: Safety Bound (Prevent Overflow)**
612
+
613
+ - Token level: `exp(clamp(log_ratio, -20, 20))` per token → bounds each token to [2e-9, 5e8]
614
+ - Sequence level: `exp(clamp(sum(log_ratio), -20, 20))` → bounds product to [2e-9, 5e8], broadcast to all tokens
615
+
616
+ **Stage 2: Truncation (Reduce Variance)**
617
+
618
+ - `.clamp(max=rollout_is_threshold)` → caps weights at upper threshold (TIS: Truncated Importance Sampling)
619
+ - No lower truncation (preserves unbiasedness for small weights)
620
+
621
+ **Stage 3: Padding Zeroing (Correct Aggregation)**
622
+
623
+ - `weights * response_mask` → zeros out padding positions
624
+
625
+ **Stage 4: Optional Batch Normalization**
626
+
627
+ - If `rollout_is_batch_normalize=True`: Normalize weights to mean=1.0 within batch
628
+ - Applied after truncation to preserve truncation semantics
629
+
630
+ **Rejection Sampling (Separate Mechanism)**
631
+
632
+ Rejection sampling modifies `response_mask` (NOT weights) through `compute_rollout_rejection_mask()`:
633
+
634
+ - Computes safety-bounded ratios independently
635
+ - Creates binary mask: tokens/sequences outside [lower_threshold, upper_threshold] → 0 (rejected)
636
+ - Modified mask used for loss aggregation (rejected samples excluded from training)
637
+
638
+ ## Operation Modes
639
+
640
+ The framework provides **two operating modes** for computing π_old, which can be combined with different loss functions.
641
+
642
+ ### Operating Modes and Configuration
643
+
644
+ | Configuration | `bypass_mode` | `loss_type` | Operating Mode | Loss Function | Description |
645
+ | ---------------------- | ------------- | ---------------------- | -------------- | ------------- | ----------------------------------------------------------------- |
646
+ | **Decoupled** | `false` | N/A | Decoupled | PPO | Computes `old_log_prob` separately via `actor.compute_log_prob()` |
647
+ | **Bypass + PPO-clip** | `true` | `"ppo_clip"` (default) | Bypass | PPO-clip | PPO clipped objective (IS handled by ratio) |
648
+ | **Bypass + REINFORCE** | `true` | `"reinforce"` | Bypass | REINFORCE | Policy gradient with explicit IS weights (no PPO clipping) |
649
+
650
+ ### Operating Mode Details
651
+
652
+ #### Decoupled Mode (Three Policies)
653
+
654
+ **Policy setup:**
655
+
656
+ - π_rollout: Behavior policy (data collection)
657
+ - π_old: Proximal policy (computed via `actor.compute_log_prob()` at start of training epoch)
658
+ - π_θ: Current policy (being updated)
659
+
660
+ **Configuration:** `bypass_mode = false`
661
+
662
+ **Properties:**
663
+
664
+ - ✅ Achieves batch size invariance
665
+ - ✅ Separately corrects Drift 1 (rollout→old) and Drift 2 (old→current)
666
+ - ✅ Efficient stale data utilization
667
+ - ❌ Extra forward pass needed (`actor.compute_log_prob()`)
668
+
669
+ **Theory:** See [rollout_corr_math.md §3.1.1](rollout_corr_math.md#311-decoupled-mode-three-policies)
670
+
671
+ #### Bypass Mode (Two Policies)
672
+
673
+ **Policy setup:**
674
+
675
+ - π_rollout: Behavior policy (data collection)
676
+ - π_old = π_rollout: Proximal policy equals behavior policy
677
+ - π_θ: Current policy (being updated)
678
+
679
+ **Configuration:** `bypass_mode = true`
680
+
681
+ **Properties:**
682
+
683
+ - ✅ Skips `actor.compute_log_prob()` call (faster)
684
+ - ✅ Handles off-policy correction via IS/RS (when using policy gradient with IS/RS)
685
+ - ✅ Uses two policies instead of three (π_rollout = π_old)
686
+ - ⚠️ Does not separate proximal policy from behavior policy (unlike decoupled mode)
687
+
688
+ **Theory:** See [rollout_corr_math.md §3.1.2](rollout_corr_math.md#312-bypass-mode-two-policies)
689
+
690
+ ---
691
+
692
+ ### IS/RS Aggregation Levels (Orthogonal to Operating Mode)
693
+
694
+ The aggregation level can be chosen **independently** of the operating mode. Any aggregation level works in either decoupled or bypass mode.
695
+
696
+ | `rollout_is` | `rollout_rs` | Behavior |
697
+ | ------------------------- | ------------------------------------------------------------------ | --------------------------------------------------------------------------------- |
698
+ | `null` | `null` | **Disabled**: No computation, no metrics, no rejection |
699
+ | `null` | `"token_k1"`, `"seq_sum_k1"`, `"seq_mean_k1"`, `"seq_max_k2"`, etc | **Rejection only**: Compute metrics, NO weight correction, YES rejection sampling |
700
+ | `"token"` or `"sequence"` | `null` | **IS weights only**: Weight correction enabled, NO rejection sampling |
701
+ | `"token"` or `"sequence"` | `"token_k1"`, `"seq_sum_k1"`, `"seq_mean_k1"`, `"seq_max_k2"`, etc | **Full correction**: Both weight correction and rejection sampling enabled |
702
+
703
+ ### Key Insights
704
+
705
+ - ✅ Any IS/RS aggregation level (token/sequence/geometric) can be used in **either** decoupled or bypass mode
706
+ - ✅ You can use **rejection sampling alone** without IS weight correction (`rollout_is=null, rollout_rs="token_k1"`)
707
+ - ✅ You can use **IS weights alone** without outlier rejection (`rollout_is="token", rollout_rs=null`)
708
+ - ✅ You can use **both together** (`rollout_is="token", rollout_rs="token_k1"`)
709
+ - ✅ You can **monitor metrics only** without any correction by setting both to `null` but still providing rollout_log_probs
710
+
711
+ **Theory:** See [rollout_corr_math.md §3.3](rollout_corr_math.md#33-isrs-aggregation-levels) for details on aggregation levels.
712
+
713
+ ### Example Workflow
714
+
715
+ **Recommended: Bypass Mode**
716
+
717
+ This workflow uses bypass mode for efficiency.
718
+
719
+ 1. **Start with metrics only** to understand the off-policy gap:
720
+
721
+ ```yaml
722
+ algorithm:
723
+ rollout_correction:
724
+ rollout_is: null
725
+ rollout_rs: null
726
+ bypass_mode: true # Bypass mode (recommended)
727
+ loss_type: ppo_clip # Default: PPO clipped objective
728
+ ```
729
+
730
+ Monitor `rollout_corr/kl`, `rollout_corr/log_ppl_abs_diff`, `rollout_corr/chi2_token` to assess off-policy gap.
731
+
732
+ 2. **Enable rejection sampling** if you see high outlier fractions:
733
+
734
+ ```yaml
735
+ algorithm:
736
+ rollout_correction:
737
+ rollout_is: null
738
+ rollout_rs: sequence # or "geometric" for higher sensitivity
739
+ rollout_rs_threshold: 2.0
740
+ bypass_mode: true # Bypass mode
741
+ loss_type: ppo_clip # or "reinforce" for explicit IS weights
742
+ ```
743
+
744
+ This excludes outliers from training without modifying gradients.
745
+
746
+ 3. **Enable full IS correction** (with REINFORCE loss) once comfortable with metrics:
747
+ ```yaml
748
+ algorithm:
749
+ rollout_correction:
750
+ rollout_is: sequence # Recommended: unbiased, suitable for most cases
751
+ rollout_is_threshold: 2.0
752
+ rollout_rs: sequence # or "geometric" for more aggressive filtering
753
+ rollout_rs_threshold: 2.0
754
+ bypass_mode: true # Bypass mode
755
+ loss_type: reinforce # REINFORCE with explicit IS weights
756
+ ```
757
+
758
+ **Benefits of bypass mode:**
759
+
760
+ - ✅ Skips expensive `actor.compute_log_prob()` forward pass (faster)
761
+ - ✅ `loss_type` controls the loss function: "ppo_clip" (default) or "reinforce"
762
+ - ✅ PPO-clip: IS handled by ratio (no explicit weights), RS mask applied
763
+ - ✅ REINFORCE: Explicit IS weights computed on-the-fly (π_θ / π_rollout)
764
+ - ✅ Both loss types work with all IS/RS combinations
765
+
766
+ ## Usage
767
+
768
+ ### Basic Setup
769
+
770
+ ```yaml
771
+ algorithm:
772
+ rollout_correction:
773
+ rollout_is: token # Enable IS weights at token level
774
+ rollout_is_threshold: 2.0 # Threshold for IS weights
775
+ rollout_rs: null # No rejection sampling
776
+
777
+ actor_rollout_ref:
778
+ rollout:
779
+ calculate_log_probs: true # Required!
780
+ ```
781
+
782
+ ### Metrics
783
+
784
+ All metrics are prefixed with `rollout_corr/` in logs. For example, `rollout_is_mean` appears as `rollout_corr/rollout_is_mean`.
785
+
786
+ These metrics cover both:
787
+
788
+ - **Diagnostic metrics**: KL divergence, perplexity differences (measuring off-policy gap)
789
+ - **Correction statistics**: IS weights, rejection rates (measuring correction applied)
790
+
791
+ #### **Core IS Weight Metrics**
792
+
793
+ - **`rollout_is_mean`**: Mean importance sampling weight across all valid tokens
794
+
795
+ - Value close to 1.0 indicates minimal off-policy gap
796
+
797
+ - **`rollout_is_std`**: Standard deviation of IS weights
798
+
799
+ - Higher values indicate greater variance in IS weights
800
+
801
+ - **`rollout_is_min`**: Minimum IS weight observed
802
+
803
+ - Shows the most underweighted token/sequence
804
+ - For sequence/geometric: computed from unclamped log-space ratios (true minimum)
805
+ - For token: computed from safety-bounded weights
806
+
807
+ - **`rollout_is_max`**: Maximum IS weight observed
808
+ - Shows the most overweighted token/sequence
809
+ - For sequence/geometric: computed from unclamped log-space ratios (true maximum before safety bound)
810
+ - For token: computed from safety-bounded weights (before threshold clamping)
811
+ - Compare with `rollout_is_threshold` to see truncation impact
812
+
813
+ #### **Effective Sample Size**
814
+
815
+ - **`rollout_is_eff_sample_size`**: Effective sample size after IS weighting
816
+ - **Formula**: `1 / mean(weights²)` where weights are normalized
817
+ - **Range**: 0.0 to 1.0 (as fraction of original batch)
818
+ - Lower values indicate weight concentration on fewer samples
819
+
820
+ #### **Threshold Exceedance Metrics**
821
+
822
+ - **`rollout_is_ratio_fraction_high`**: Fraction of weights exceeding upper threshold
823
+
824
+ - Shows how often truncation/masking occurs on high end
825
+ - For sequence/geometric: computed from unclamped log-space ratios (true exceedance)
826
+ - For token: computed from safety-bounded weights (before threshold clamping)
827
+
828
+ - **`rollout_is_ratio_fraction_low`**: Fraction of weights below lower threshold (1/upper_threshold)
829
+ - Diagnostic metric showing how many weights are below the reciprocal threshold
830
+ - For sequence/geometric: computed from unclamped log-space ratios (true exceedance)
831
+ - For token: computed from safety-bounded weights (before truncation)
832
+
833
+ #### **Sequence-Level Metrics** (for sequence aggregation)
834
+
835
+ - **`rollout_is_seq_mean`**: Mean IS weight at sequence level
836
+
837
+ - Should match `rollout_is_mean` for sequence-level aggregation
838
+
839
+ - **`rollout_is_seq_std`**: Standard deviation of sequence-level IS weights
840
+
841
+ - **`rollout_is_seq_min`**: Minimum sequence-level IS weight
842
+
843
+ - **`rollout_is_seq_max`**: Maximum sequence-level IS weight
844
+
845
+ - **`rollout_is_seq_max_deviation`**: Maximum absolute deviation from 1.0 at sequence level
846
+
847
+ - Shows worst-case sequence off-policy gap
848
+
849
+ - **`rollout_is_seq_fraction_high`**: Fraction of sequences exceeding upper threshold
850
+
851
+ - **`rollout_is_seq_fraction_low`**: Fraction of sequences below lower threshold
852
+
853
+ #### **Rejection Sampling Metrics** (when `rollout_rs` is enabled)
854
+
855
+ - **`rollout_rs_masked_fraction`**: Fraction of tokens rejected via rejection sampling
856
+
857
+ - **Important**: Rejection sampling modifies `response_mask` (sets rejected tokens to 0)
858
+ - **Separate from IS weights**: IS weights are still truncated; rejection is an independent filtering step
859
+ - Only present when `rollout_rs` is enabled (token/sequence/geometric)
860
+
861
+ - **`rollout_rs_seq_masked_fraction`**: Fraction of sequences with at least one rejected token
862
+ - Shows sequence-level impact of rejection sampling
863
+ - Token-level RS: sequence rejected if ANY token is outside [lower, upper]
864
+ - Sequence-level RS: entire sequence rejected or accepted based on sequence-level ratio
865
+ - Geometric RS: entire sequence rejected or accepted based on geometric mean
866
+
867
+ #### **Off-Policy Diagnostic Metrics** (Training vs Rollout Policy)
868
+
869
+ **Note on terminology:** These metrics use "training" to refer to the training reference policy and "rollout" to refer to π_rollout (the behavior policy used for data collection).
870
+
871
+ - **Decoupled mode**: "training" = π_old (computed at start of training epoch)
872
+ - **Bypass/Pure IS mode**: "training" = π_θ (current policy being trained)
873
+
874
+ In bypass/pure IS mode, metrics measure the drift between π_θ and π_rollout directly.
875
+
876
+ - **`training_ppl`**: Perplexity of training reference policy (π_old in decoupled mode, π_θ in bypass/pure IS mode)
877
+
878
+ - **Formula**: `exp(-mean(log_probs))`
879
+ - Lower values indicate higher model confidence
880
+
881
+ - **`rollout_ppl`**: Perplexity of rollout policy π_rollout (e.g., vLLM BF16)
882
+
883
+ - **`ppl_ratio`**: Ratio of training PPL to rollout PPL
884
+
885
+ - **Formula**: `exp(mean(log(training_ppl / rollout_ppl)))`
886
+ - **Meaning**: > 1.0 means training is less confident than rollout
887
+
888
+ - **`training_log_ppl`**: Log perplexity of training policy
889
+
890
+ - Useful for identifying trends (linear scale)
891
+
892
+ - **`rollout_log_ppl`**: Log perplexity of rollout policy
893
+
894
+ - **`log_ppl_diff`**: Mean difference in log perplexities
895
+
896
+ - **Formula**: `mean(log_ppl_rollout - log_ppl_training)`
897
+ - Sign indicates which policy is more confident
898
+
899
+ - **`log_ppl_abs_diff`**: Mean absolute log perplexity difference
900
+
901
+ - Magnitude of off-policy gap regardless of direction
902
+
903
+ - **`log_ppl_diff_max`**: Maximum log perplexity difference across sequences
904
+
905
+ - Identifies worst-case sequence
906
+
907
+ - **`log_ppl_diff_min`**: Minimum log perplexity difference across sequences
908
+
909
+ - **`kl`**: KL divergence KL(π_rollout || π_training)
910
+
911
+ - **Formula**: `mean(log_prob_rollout - log_prob_training)`
912
+ - **Note**: Can be negative (rollout is less confident)
913
+
914
+ - **`k3_kl`**: K3 divergence (equals KL(π_rollout || π_training) in expectation)
915
+
916
+ - **Formula**: `mean(exp(log_ratio) - log_ratio - 1)`
917
+ - More stable than direct KL (non-negative per token)
918
+ - Always >= 0
919
+
920
+ - **`chi2_token`**: Chi-squared divergence at token level
921
+
922
+ - **Formula**: `mean(ratio²) - 1` where ratio = π_training/π_rollout
923
+ - Measures second moment of IS weight distribution
924
+ - Always non-negative
925
+
926
+ - **`chi2_seq`**: Chi-squared divergence at sequence level
927
+ - **Formula**: `mean((∏_t ratio_t)²) - 1`
928
+ - Sequence-level second moment of IS weights
929
+ - More sensitive than token-level chi-squared
930
+
931
+ #### **Example: Accessing Metrics in Code**
932
+
933
+ ```python
934
+ # Metrics are returned from compute_rollout_correction_and_rejection_mask
935
+ from verl.trainer.ppo.rollout_corr_helper import compute_rollout_correction_and_rejection_mask
936
+
937
+ # Returns 3 values (weights, modified_response_mask, metrics)
938
+ weights_proto, modified_response_mask, metrics = compute_rollout_correction_and_rejection_mask(
939
+ old_log_prob=training_log_probs, # from training policy
940
+ rollout_log_prob=rollout_log_probs, # from rollout policy
941
+ response_mask=response_mask,
942
+ rollout_is="token", # Enable IS weights at token level
943
+ rollout_is_threshold=2.0,
944
+ rollout_rs="token_k1",
945
+ rollout_rs_threshold="0.5_2.0",
946
+ )
947
+
948
+ # Extract IS weights (processed, zeroed at padding)
949
+ is_weights = weights_proto.batch["rollout_is_weights"]
950
+
951
+ # IS weights processing (with IS enabled at token level):
952
+ # 1. Safety-bounded: exp(clamp(log_ratio, -20, 20)) per token
953
+ # 2. Truncated: .clamp(max=2.0) to cap extreme weights
954
+ # 3. Zeroed at padding positions
955
+ # Note: Truncation is ALWAYS applied to IS weights (TIS: Truncated Importance Sampling)
956
+
957
+ # modified_response_mask has rejection applied (since rollout_rs="token_k1"):
958
+ # 1. RS rejection: tokens outside [0.5, 2.0] masked to 0 via response_mask
959
+ # Note: RS and IS are separate mechanisms - both can be enabled independently
960
+
961
+ # All metrics have 'rollout_corr/' prefix
962
+ print(f"Mean IS weight: {metrics['rollout_corr/rollout_is_mean']:.3f}")
963
+ print(f"Effective sample size: {metrics['rollout_corr/rollout_is_eff_sample_size']:.3f}")
964
+ print(f"RS masked fraction: {metrics['rollout_corr/rollout_rs_masked_fraction']:.3f}")
965
+ print(f"KL divergence: {metrics['rollout_corr/kl']:.3f}")
966
+
967
+ # Check IS weights for valid tokens (non-padding)
968
+ valid_weights = is_weights[response_mask.bool()]
969
+ print(f"\n✓ IS weights min (valid tokens): {valid_weights.min():.4f}")
970
+ print(f"✓ IS weights max (valid tokens): {valid_weights.max():.4f}")
971
+ print(f"✓ All valid IS weights > 0: {(valid_weights > 0).all()}")
972
+ print(f"✓ IS weights are capped at threshold: {(valid_weights <= 2.0).all()}")
973
+
974
+ # Check rejection via response_mask
975
+ rejected_tokens = (response_mask == 1) & (modified_response_mask == 0)
976
+ print(f"\n✓ Rejected {rejected_tokens.sum()} tokens via response_mask")
977
+ print(f"✓ Rejection sampling modifies response_mask (separate from IS weight truncation)")
978
+ print(f"✓ IS weights are always truncated to [0, threshold] after safety bounding")
979
+
980
+ # Check for warning conditions
981
+ if metrics['rollout_corr/rollout_is_mean'] < 0.5 or metrics['rollout_corr/rollout_is_mean'] > 2.0:
982
+ print("⚠️ Warning: Mean IS weight far from 1.0, significant off-policy gap detected")
983
+
984
+ if metrics['rollout_corr/rollout_is_eff_sample_size'] < 0.3:
985
+ print("⚠️ Warning: Low effective sample size, high weight concentration")
986
+ ```
987
+
988
+ #### **Example: Monitoring Metrics During Training**
989
+
990
+ ```python
991
+ # In your training loop
992
+ for epoch in range(num_epochs):
993
+ for batch_idx, batch in enumerate(dataloader):
994
+ # ... rollout phase ...
995
+
996
+ # Compute IS weights and get metrics
997
+ rollout_corr_config = config.algorithm.get("rollout_correction", None)
998
+ if rollout_corr_config is not None:
999
+ weights_proto, modified_response_mask, metrics = compute_rollout_correction_and_rejection_mask(
1000
+ old_log_prob=batch.old_log_prob,
1001
+ rollout_log_prob=batch.rollout_log_prob,
1002
+ response_mask=batch.response_mask,
1003
+ rollout_is=rollout_corr_config.get("rollout_is", None),
1004
+ rollout_is_threshold=rollout_corr_config.get("rollout_is_threshold", 2.0),
1005
+ rollout_rs=rollout_corr_config.get("rollout_rs", None),
1006
+ rollout_rs_threshold=rollout_corr_config.get("rollout_rs_threshold", None),
1007
+ )
1008
+
1009
+ # Log to tensorboard/wandb
1010
+ for metric_name, metric_value in metrics.items():
1011
+ logger.log_scalar(metric_name, metric_value, step=global_step)
1012
+
1013
+ # IMPORTANT: Update batch response_mask with rejection applied
1014
+ batch.response_mask = modified_response_mask
1015
+
1016
+ # Use IS weights in training (always safety-bounded, zeroed at padding)
1017
+ is_weights = weights_proto.batch["rollout_is_weights"]
1018
+ # ... apply weights to policy gradient ...
1019
+ ```
1020
+
1021
+ #### **Example: Conditional Alerting Based on Metrics**
1022
+
1023
+ ```python
1024
+ def check_rollout_correction_health(metrics, config):
1025
+ """Check if Rollout Correction metrics indicate healthy training."""
1026
+ warnings = []
1027
+
1028
+ # Check mean IS weight
1029
+ mean_weight = metrics['rollout_corr/rollout_is_mean']
1030
+ if mean_weight < 0.5 or mean_weight > 2.0:
1031
+ warnings.append(f"Mean IS weight {mean_weight:.3f} is far from 1.0")
1032
+
1033
+ # Check effective sample size
1034
+ ess = metrics['rollout_corr/rollout_is_eff_sample_size']
1035
+ if ess < 0.3:
1036
+ warnings.append(f"Effective sample size {ess:.3f} is too low")
1037
+
1038
+ # Check standard deviation
1039
+ std = metrics['rollout_corr/rollout_is_std']
1040
+ if std > 1.0:
1041
+ warnings.append(f"IS weight std {std:.3f} is too high")
1042
+
1043
+ # Check KL divergence
1044
+ kl = metrics['rollout_corr/kl']
1045
+ if abs(kl) > 0.1:
1046
+ warnings.append(f"KL divergence {kl:.3f} indicates significant off-policy gap")
1047
+
1048
+ # Check chi-squared divergence
1049
+ if 'rollout_corr/chi2_token' in metrics:
1050
+ chi2_token = metrics['rollout_corr/chi2_token']
1051
+ if chi2_token > 1.0:
1052
+ warnings.append(f"Chi-squared divergence (token) {chi2_token:.3f} indicates severe distribution shift")
1053
+
1054
+ if warnings:
1055
+ print("⚠️ Rollout Correction Health Warnings:")
1056
+ for warning in warnings:
1057
+ print(f" - {warning}")
1058
+ return False
1059
+ else:
1060
+ print("✅ Rollout Correction metrics look healthy")
1061
+ return True
1062
+
1063
+ # Use in training
1064
+ _, _, metrics = compute_rollout_correction_and_rejection_mask(...)
1065
+ is_healthy = check_rollout_correction_health(metrics, config)
1066
+
1067
+ if not is_healthy:
1068
+ # Consider adjusting config or investigating issues
1069
+ print("Consider:")
1070
+ print(" - Tightening rollout_is_threshold")
1071
+ print(" - Switching to geometric aggregation level")
1072
+ print(" - Checking if rollout and training policies are too different")
1073
+ ```
1074
+
1075
+ ### Running Examples
1076
+
1077
+ Start with the basic token-level truncate configuration:
1078
+
1079
+ ```bash
1080
+ bash examples/rollout_correction/run_with_rollout_corr.sh
1081
+ ```
1082
+
1083
+ Monitor metrics for 1-2 epochs before adjusting parameters.
1084
+
1085
+ ## Configuration Examples
1086
+
1087
+ ### Example 1: IS Weights Only (Token Level)
1088
+
1089
+ ```yaml
1090
+ algorithm:
1091
+ rollout_correction:
1092
+ rollout_is: token
1093
+ rollout_is_threshold: 2.0
1094
+ rollout_rs: null # No rejection sampling
1095
+ ```
1096
+
1097
+ ### Example 2: Rejection Sampling Only (No IS Weights)
1098
+
1099
+ ```yaml
1100
+ algorithm:
1101
+ rollout_correction:
1102
+ rollout_is: null # No IS weights
1103
+ rollout_rs: token_k1
1104
+ rollout_rs_threshold: "0.5_2.0"
1105
+ ```
1106
+
1107
+ ### Example 3: Both IS and RS (Token RS)
1108
+
1109
+ ```yaml
1110
+ algorithm:
1111
+ rollout_correction:
1112
+ rollout_is: token
1113
+ rollout_is_threshold: 2.0
1114
+ rollout_rs: token_k1
1115
+ rollout_rs_threshold: "0.5_2.0"
1116
+ ```
1117
+
1118
+ ### Example 5: Bypass Mode with PPO-clip (Default)
1119
+
1120
+ ```yaml
1121
+ algorithm:
1122
+ rollout_correction:
1123
+ rollout_is: token
1124
+ rollout_is_threshold: 2.0
1125
+ rollout_rs: token_k1
1126
+ rollout_rs_threshold: "0.5_2.0"
1127
+ bypass_mode: true # Skip old_log_prob computation
1128
+ loss_type: ppo_clip # PPO clipped objective (default)
1129
+ ```
1130
+
1131
+ **Skips expensive `actor.compute_log_prob()` forward pass. PPO ratio = π_θ/π_rollout handles IS.**
1132
+
1133
+ ### Example 6: Bypass Mode with REINFORCE
1134
+
1135
+ ```yaml
1136
+ algorithm:
1137
+ rollout_correction:
1138
+ rollout_is: sequence # Explicit IS correction in loss
1139
+ rollout_is_threshold: 2.0
1140
+ rollout_rs: null # Optional: can add rejection sampling
1141
+ bypass_mode: true
1142
+ loss_type: reinforce # REINFORCE with explicit IS weights
1143
+ ```
1144
+
1145
+ **No PPO clipping, pure policy gradient with IS correction**
1146
+
1147
+ ### Example 7: Bypass Mode with PPO-clip + Rejection Sampling
1148
+
1149
+ ```yaml
1150
+ algorithm:
1151
+ rollout_correction:
1152
+ rollout_is: sequence # Computed for metrics
1153
+ rollout_is_threshold: 2.0
1154
+ rollout_rs: seq_max_k2 # Sequence max χ²/2 guard
1155
+ rollout_rs_threshold: 2.5
1156
+ bypass_mode: true
1157
+ loss_type: ppo_clip # PPO clipped objective (IS handled by ratio)
1158
+ ```
1159
+
1160
+ **PPO clipping with rejection sampling. IS handled by PPO ratio (no explicit IS weights).**
1161
+
1162
+ ## Troubleshooting
1163
+
1164
+ ### Issue: High spread in IS weights
1165
+
1166
+ **Symptoms:** `rollout_is_std` > 1.0, `rollout_is_eff_sample_size` < 0.3
1167
+
1168
+ **Solutions:**
1169
+
1170
+ 1. Switch from `sequence` to `geometric` level
1171
+ 2. Tighten thresholds
1172
+ 3. Verify rollout and training aren't too different
1173
+
1174
+ ### Issue: Mean IS weight far from 1.0
1175
+
1176
+ **Symptoms:** `rollout_is_mean` < 0.5 or > 2.0
1177
+
1178
+ **Solutions:**
1179
+
1180
+ 1. Verify `calculate_log_probs=True` is set
1181
+ 2. Check rollout_log_probs are correctly passed
1182
+ 3. Check for systematic distribution shift
1183
+
1184
+ ### Debugging: Visualizing Metrics
1185
+
1186
+ **Example: Plot IS weight distribution**
1187
+
1188
+ ```python
1189
+ import matplotlib.pyplot as plt
1190
+ import numpy as np
1191
+
1192
+ def plot_is_metrics(metrics_history):
1193
+ """Plot rollout IS metrics over training steps."""
1194
+ fig, axes = plt.subplots(2, 3, figsize=(15, 10))
1195
+
1196
+ # Plot 1: Mean IS weight over time
1197
+ axes[0, 0].plot(metrics_history['rollout_corr/rollout_is_mean'])
1198
+ axes[0, 0].axhline(y=1.0, color='r', linestyle='--', label='Ideal')
1199
+ axes[0, 0].set_title('Mean IS Weight')
1200
+ axes[0, 0].set_xlabel('Step')
1201
+ axes[0, 0].legend()
1202
+
1203
+ # Plot 2: Effective sample size
1204
+ axes[0, 1].plot(metrics_history['rollout_corr/rollout_is_eff_sample_size'])
1205
+ axes[0, 1].axhline(y=0.5, color='g', linestyle='--', label='Good')
1206
+ axes[0, 1].axhline(y=0.3, color='r', linestyle='--', label='Warning')
1207
+ axes[0, 1].set_title('Effective Sample Size')
1208
+ axes[0, 1].set_xlabel('Step')
1209
+ axes[0, 1].legend()
1210
+
1211
+ # Plot 3: KL divergence over time
1212
+ axes[1, 0].plot(metrics_history['rollout_corr/kl'], label='KL')
1213
+ axes[1, 0].plot(metrics_history['rollout_corr/k3_kl'], label='K3 KL')
1214
+ axes[1, 0].axhline(y=0, color='g', linestyle='--', alpha=0.3)
1215
+ axes[1, 0].set_title('KL Divergence')
1216
+ axes[1, 0].set_xlabel('Step')
1217
+ axes[1, 0].legend()
1218
+
1219
+ # Plot 4: PPL ratio over time
1220
+ axes[1, 1].plot(metrics_history['rollout_corr/ppl_ratio'])
1221
+ axes[1, 1].axhline(y=1.0, color='r', linestyle='--', label='Ideal')
1222
+ axes[1, 1].set_title('PPL Ratio (Training/Rollout)')
1223
+ axes[1, 1].set_xlabel('Step')
1224
+ axes[1, 1].legend()
1225
+
1226
+ # Plot 5: Chi-squared divergence
1227
+ if 'rollout_corr/chi2_token' in metrics_history:
1228
+ axes[1, 2].plot(metrics_history['rollout_corr/chi2_token'], label='Token-level')
1229
+ if 'rollout_corr/chi2_seq' in metrics_history:
1230
+ axes[1, 2].plot(metrics_history['rollout_corr/chi2_seq'], label='Seq-level')
1231
+ axes[1, 2].axhline(y=1.0, color='r', linestyle='--', label='Warning')
1232
+ axes[1, 2].set_title('Chi-squared Divergence')
1233
+ axes[1, 2].set_xlabel('Step')
1234
+ axes[1, 2].legend()
1235
+ else:
1236
+ axes[1, 2].axis('off')
1237
+
1238
+ plt.tight_layout()
1239
+ plt.savefig('rollout_is_metrics.png', dpi=150)
1240
+ print("Saved plot to rollout_is_metrics.png")
1241
+ ```
1242
+
1243
+ **Example: Metric collection during training**
1244
+
1245
+ ```python
1246
+ # Collect metrics over time
1247
+ metrics_history = {
1248
+ 'rollout_corr/rollout_is_mean': [],
1249
+ 'rollout_corr/rollout_is_eff_sample_size': [],
1250
+ 'rollout_corr/kl': [],
1251
+ 'rollout_corr/k3_kl': [],
1252
+ 'rollout_corr/ppl_ratio': [],
1253
+ 'rollout_corr/chi2_token': [],
1254
+ 'rollout_corr/chi2_seq': [],
1255
+ }
1256
+
1257
+ # In training loop
1258
+ for step in range(num_steps):
1259
+ # ... compute IS weights and rejection mask ...
1260
+ _, _, metrics = compute_rollout_correction_and_rejection_mask(...)
1261
+
1262
+ # Store metrics
1263
+ for key in metrics_history.keys():
1264
+ if key in metrics:
1265
+ metrics_history[key].append(metrics[key])
1266
+
1267
+ # Plot every 100 steps
1268
+ if step % 100 == 0:
1269
+ plot_is_metrics(metrics_history)
1270
+ ```
1271
+
1272
+ ## Performance Impact
1273
+
1274
+ - **Memory overhead**: ~1% of model memory
1275
+ - **Computational overhead**: 1-3% depending on level
1276
+ - **Training stability**: Significantly improved when off-policy gap exists
1277
+
1278
+ ## Testing
1279
+
1280
+ Run the test suite to verify everything works:
1281
+
1282
+ ```bash
1283
+ # Basic unit tests
1284
+ python test_rollout_corr.py
1285
+
1286
+ # Integration tests (if pytest is available)
1287
+ pytest tests/trainer/ppo/test_rollout_corr_integration.py -v
1288
+ ```
1289
+
1290
+ Expected output: All tests pass ✓
1291
+
1292
+ ## Additional Resources
1293
+
1294
+ - **Implementation**: `verl/trainer/ppo/rollout_corr_helper.py`
1295
+ - **Examples**: `examples/rollout_correction/`
1296
+ - **DAPO Example**: `recipe/dapo/run_dapo_qwen2.5_32b_rollout_corr.sh`
1297
+
1298
+ ## Summary
1299
+
1300
+ Rollout Correction provides a unified framework for handling general off-policy problems in RL:
1301
+
1302
+ - ✅ Corrects ANY distribution shift between data collection and training
1303
+ - ✅ Supports diverse scenarios: policy mismatch, staleness, replay buffers, off-policy algorithms
1304
+ - ✅ Numerical stability with safety bounds and rejection mechanisms
1305
+ - ✅ Comprehensive diagnostics: KL, perplexity, χ² divergence
1306
+ - ✅ Flexible methods from token-level to sequence-level aggregation
1307
+ - ✅ Memory-efficient implementation
1308
+
1309
+ ## References
1310
+
1311
+ - **[Mathematical Formulations](rollout_corr_math.md)** - Detailed mathematical theory and derivations for all rollout correction methods
1312
+ - [When Speed Kills Stability: Demystifying RL Collapse from the Training-Inference Mismatch](https://richardli.xyz/rl-collapse) (see Blog Series above for parts 1-3)
1313
+ - [Your Efficient RL Framework Secretly Brings You Off-Policy RL Training](https://fengyao.notion.site/off-policy-rl)
code/RL_model/verl/verl_train/docs/algo/rollout_corr_math.md ADDED
@@ -0,0 +1,954 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Mathematical Formulations of Rollout Correction Methods in `verl`
2
+
3
+ **Author:** [Yingru Li](https://richardli.xyz)
4
+ **Last updated:** 2025-11-04
5
+
6
+ ---
7
+
8
+ > **📖 Documentation Structure**
9
+ > - **This document** - Mathematical theory: formulations, derivations, and algorithmic foundations
10
+ > - **[Rollout Correction Usage Guide](rollout_corr.md)** - Practical implementation: configurations, presets, troubleshooting
11
+ >
12
+ > Start here for theory and design rationale, refer to the usage guide for implementation.
13
+
14
+ ---
15
+
16
+ ### BibTeX Citation
17
+
18
+ ```bibtex
19
+ @online{liu-li-2025-rl-collapse,
20
+ title = {When Speed Kills Stability: Demystifying {RL} Collapse from the Training-Inference Mismatch},
21
+ author = {Liu, Jiacai and Li, Yingru and Fu, Yuqian and Wang, Jiawei and Liu, Qian and Shen, Yu},
22
+ year = {2025},
23
+ month = sep,
24
+ url = {https://richardli.xyz/rl-collapse}
25
+ }
26
+ ```
27
+
28
+ ### Blog Series
29
+
30
+ - Main blog post: https://richardli.xyz/rl-collapse
31
+ - [Part 1: Why Mismatch Breaks LLM-RL](https://richardli.xyz/rl-collapse-1) (analytical framework using TV distance for bias and χ²-divergence for variance)
32
+ - [Part 2: The Gradient Estimator Trials](https://richardli.xyz/rl-collapse-2) (token-level vs sequence-level correction bias-variance tradeoff)
33
+ - [Part 3: When Math Meets Reality—Toxic Tails and Length Traps](https://richardli.xyz/rl-collapse-3) (why rejection over clipping, and geometric-level RS)
34
+
35
+ ## Abstract
36
+
37
+ This document provides the definitive mathematical formulations for rollout correction methods in `verl`, following the natural progression from **REINFORCE** to **PPO** to **Decoupled PPO**.
38
+
39
+ Rollout correction provides a unified framework to handle **general off-policy problems** in RL training - any scenario where the data collection distribution differs from the training distribution.
40
+
41
+ **Applicable scenarios include:**
42
+ - **Policy mismatch**: Different precision (FP8 vs FP16 vs BF16 vs FP32), different backends (vLLM vs SGLang vs FSDP vs Megatron)
43
+ - **Temporal lag**: Model staleness, asynchronous rollout workers
44
+ - **Replay buffers**: Training on historical trajectories from earlier policy versions
45
+ - **Off-policy algorithms**: Behavioral cloning, DAPO, expert demonstrations
46
+ - **Data filtering**: Reweighting, preference learning, curriculum learning
47
+
48
+ ---
49
+
50
+ ## Table of Contents
51
+
52
+ 1. [Theoretical Foundation: From REINFORCE to Decoupled PPO](#1-theoretical-foundation-from-reinforce-to-decoupled-ppo)
53
+ 2. [Implementation in verl: The Three-Policy Framework](#2-implementation-in-verl-the-three-policy-framework)
54
+ 3. [Algorithmic Components and Combinations](#3-algorithmic-components-and-combinations)
55
+ 4. [Off-Policy Diagnostic Metrics](#4-off-policy-diagnostic-metrics)
56
+ 5. [Summary and Decision Guide](#5-summary-and-decision-guide)
57
+ 6. [Implementation References](#6-implementation-references)
58
+
59
+ ---
60
+
61
+ ## 1. Theoretical Foundation: From REINFORCE to Decoupled PPO
62
+
63
+ This section establishes the theoretical progression that `verl` implements.
64
+
65
+ ### 1.1 REINFORCE: Policy Gradient Baseline
66
+
67
+ The REINFORCE algorithm ([Williams, 1992](https://doi.org/10.1007/BF00992696)) is the foundation of policy gradient methods.
68
+
69
+ **Vanilla REINFORCE (On-Policy)**
70
+
71
+ For trajectories $\tau = (s_0, a_0, s_1, a_1, \ldots, s_T, a_T)$ sampled from the current policy $\pi_\theta$, the policy gradient is:
72
+
73
+ $$
74
+ \nabla_\theta J(\theta) = \mathbb{E}_{\tau \sim \pi_\theta} \left[ \sum_{t=0}^T \nabla_\theta \log \pi_\theta(a_t|s_t) \cdot A_t \right]
75
+ $$
76
+
77
+ where $A_t$ is the advantage function at timestep $t$.
78
+
79
+ **Off-Policy REINFORCE**
80
+
81
+ When trajectories are sampled from a different behavior policy $\mu$, we apply importance sampling over the **joint trajectory distribution**:
82
+
83
+ $$
84
+ \nabla_\theta J(\theta) = \mathbb{E}_{\tau \sim \mu} \left[ \frac{P_{\pi_\theta}(\tau)}{P_\mu(\tau)} \sum_{t=0}^T \nabla_\theta \log \pi_\theta(a_t|s_t) \cdot A_t \right]
85
+ $$
86
+
87
+ where the trajectory-level importance weight is:
88
+
89
+ $$
90
+ \frac{P_{\pi_\theta}(\tau)}{P_\mu(\tau)} = \frac{p(s_0) \prod_{t=0}^T \pi_\theta(a_t|s_t) p(s_{t+1}|s_t, a_t)}{p(s_0) \prod_{t=0}^T \mu(a_t|s_t) p(s_{t+1}|s_t, a_t)} = \prod_{t=0}^T \frac{\pi_\theta(a_t|s_t)}{\mu(a_t|s_t)}
91
+ $$
92
+
93
+ The transition dynamics $p(s_{t+1}|s_t, a_t)$ and initial state $p(s_0)$ cancel out, leaving only the product of per-step action probability ratios.
94
+
95
+ **Key properties:**
96
+ - **Off-policy capable**: Can learn from any behavior policy via importance sampling
97
+ - **No trust region**: Policy updates not constrained
98
+
99
+ **Implementation in verl:** The `bypass_pg_is` preset implements off-policy REINFORCE with truncated importance sampling.
100
+
101
+ ### 1.2 PPO: Adding Trust Region Control
102
+
103
+ Proximal Policy Optimization ([Schulman et al., 2017](https://arxiv.org/abs/1707.06347)) adds a clipped surrogate objective:
104
+
105
+ $$
106
+ L_{\text{PPO}}(\theta) = -\mathbb{E}_{(s,a) \sim \mu} \left[ \min\left( r_t(\theta) A_t, \text{clip}(r_t(\theta), 1-\epsilon, 1+\epsilon) A_t \right) \right]
107
+ $$
108
+
109
+ where $r_t(\theta) = \frac{\pi_\theta(a_t|s_t)}{\mu(a_t|s_t)}$ and $\epsilon$ is the clip range (typically 0.2).
110
+
111
+ **Key properties:**
112
+ - **Two policies**: $\mu$ (reference for clipping) and $\pi_\theta$ (being updated)
113
+ - **Trust region via clipping**: Limits policy update magnitude via ratio $r_t(\theta) = \frac{\pi_\theta}{\mu}$
114
+
115
+ ### 1.3 Decoupled PPO: Achieving Batch Size Invariance
116
+
117
+ Decoupled PPO ([Hilton et al., 2021](https://arxiv.org/abs/2110.00641)) solves PPO's batch size sensitivity by **decoupling two roles**:
118
+ 1. **Proximal policy** $\pi_{\text{prox}}$: The anchor policy for PPO clipping (controls policy update size)
119
+ 2. **Behavior policy** $\mu$: The policy that collected the data (for off-policy correction via importance sampling)
120
+
121
+ **The problem**: Standard PPO controls policy update size via the ratio $\frac{\pi_\theta}{\pi_{\text{old}}}$, where $\pi_{\text{old}}$ is assumed to be both the proximal policy *and* the behavior policy. This coupling makes the algorithm sensitive to batch size because aggregating data from multiple workers or using replay buffers changes the effective behavior policy.
122
+
123
+ **The solution**: Decouple these two roles, leading to a **three-policy formulation**:
124
+
125
+ $$
126
+ L_{\text{DecoupledPPO}}(\theta) = -\mathbb{E}_{(s,a) \sim \mu} \left[ w_t \cdot \min\left( r_t(\theta) A_t, \text{clip}(r_t(\theta), 1-\epsilon, 1+\epsilon) A_t \right) \right]
127
+ $$
128
+
129
+ where:
130
+ - $w_t = \frac{\pi_{\text{prox}}(a_t|s_t)}{\mu(a_t|s_t)}$: Importance sampling weight (corrects for behavior policy $\mu$). Here $\pi_{\text{prox}}$ is frozen during training, so $w_t$ is constant (no stopgrad operator needed).
131
+ - $r_t(\theta) = \frac{\pi_\theta(a_t|s_t)}{\pi_{\text{prox}}(a_t|s_t)}$: PPO ratio (controls policy update size against proximal policy $\pi_{\text{prox}}$)
132
+
133
+ **Key properties**: By decoupling:
134
+ - **Batch size invariance**: Policy update control (via $\pi_{\text{prox}}$) is independent of data aggregation
135
+ - **Flexible behavior policy**: Any $\mu$ can be used (different workers, replay buffers, or stale checkpoints)
136
+ - **Stale data utilization**: Older trajectories can be corrected via importance sampling
137
+ - **Clipping preserved**: Clipping against $\pi_{\text{prox}}$ limits update magnitude
138
+
139
+ **This is the algorithm that `verl` implements via its three-policy framework.**
140
+
141
+ ---
142
+
143
+ ## 2. Implementation in verl: The Three-Policy Framework
144
+
145
+ The `verl` library implements decoupled PPO using three distinct policies, each serving a specific role.
146
+
147
+ ### 2.1 Policy Roles and Notation
148
+
149
+ **$\pi_{\text{rollout}}$ (Behavior Policy $\mu$)**
150
+ The policy used for data collection. This is the behavior distribution $\mu$ from theory.
151
+
152
+ - **When created**: During rollout/data collection phase
153
+ - **Purpose**: Generate trajectories for training
154
+ - **Common sources**:
155
+ - Policy mismatch: Same weights, different implementation (precision, backend)
156
+ - Temporal lag: Stale checkpoint from async workers
157
+ - Replay buffer: Historical data from earlier iterations
158
+ - Off-policy algorithms: Expert demonstrations, auxiliary policies (DAPO)
159
+ - Data filtering: Reweighted or filtered data
160
+ - **Fixed**: Frozen during training on a batch
161
+
162
+ **$\pi_{\text{old}}$ (Proximal Policy $\pi_{\text{prox}}$)**
163
+ The reference policy for PPO clipping. This is the "proximal policy" from decoupled PPO theory.
164
+
165
+ - **When created**:
166
+ - **Decoupled mode**: Computed at start of training epoch via `actor.compute_log_prob()`
167
+ - **Bypass mode**: Set equal to $\pi_{\text{rollout}}$ (skips separate computation)
168
+ - **Purpose**:
169
+ - Anchor point for PPO clipping (controls policy update size)
170
+ - When separate from $\pi_{\text{rollout}}$: Enables batch size invariance and efficient use of stale data
171
+ - **Fixed**: Frozen during all PPO update epochs on the same batch
172
+
173
+ **$\pi_{\theta}$ (Current Policy)**
174
+ The policy being actively optimized during training.
175
+
176
+ - **Updated**: Every gradient step
177
+ - **Purpose**: The policy we're improving
178
+
179
+ ### 2.2 Operating Modes
180
+
181
+ The three-policy framework can operate in two modes:
182
+
183
+ **Decoupled Mode (Three Policies)**
184
+ - Computes $\pi_{\text{old}}$ separately at the start of each training epoch
185
+ - **Algorithm**: Full decoupled PPO with three policies (mathematically correct)
186
+ - **Properties**: Achieves batch size invariance; separately corrects Drift 1 (rollout→old) and Drift 2 (old→current)
187
+
188
+ **Bypass Mode (Two Policies)**
189
+ - Sets $\pi_{\text{old}} = \pi_{\text{rollout}}$ (skips separate computation)
190
+ - **Algorithm**: Uses $\pi_{\text{rollout}}$ as both behavior policy and proximal policy (mathematically correct)
191
+ - **Key difference**: Proximal policy equals behavior policy, so no IS correction needed between them
192
+ - **Properties**: Faster (skips `actor.compute_log_prob()` call); does not achieve batch size invariance
193
+
194
+ ### 2.3 Two Distribution Shifts
195
+
196
+ The three-policy framework handles two types of distribution drift:
197
+
198
+ **Drift 1: $\pi_{\text{rollout}} \to \pi_{\text{old}}$ (Off-Policy Gap)**
199
+
200
+ This is the distribution shift between the data collection policy and the training reference policy.
201
+
202
+ - **Nature**: Ranges from negligible (same checkpoint, minor differences) to severe (replay buffers, expert data)
203
+ - **Correction**: Importance sampling weight $w_t = \frac{\pi_{\text{old}}(a_t|s_t)}{\pi_{\text{rollout}}(a_t|s_t)}$
204
+ - **Optional**: Can be ignored (bypass mode) when negligible
205
+
206
+ **Drift 2: $\pi_{\text{old}} \to \pi_{\theta}$ (Policy Update Drift)**
207
+
208
+ This is the drift from policy parameter updates during training.
209
+
210
+ - **Nature**: Occurs as $\pi_\theta$ is updated via gradient descent
211
+ - **Correction**: PPO clipping on ratio $r_t(\theta) = \frac{\pi_\theta(a_t|s_t)}{\pi_{\text{old}}(a_t|s_t)}$
212
+ - **Universal**: Applies to both on-policy and off-policy training
213
+
214
+ ### 2.4 Notation Summary
215
+
216
+ - $\pi_{\text{rollout}}$: Behavior policy (data collection)
217
+ - $\pi_{\text{old}}$: Proximal policy (PPO anchor)
218
+ - $\pi_{\theta}$: Current policy (being updated)
219
+ - $\rho_t = \frac{\pi_{\text{old}}(a_t|s_t)}{\pi_{\text{rollout}}(a_t|s_t)}$: Per-token IS ratio (corrects Drift 1)
220
+ - $r_t(\theta) = \frac{\pi_{\theta}(a_t|s_t)}{\pi_{\text{old}}(a_t|s_t)}$: PPO ratio (corrects Drift 2)
221
+ - $A_t$: Advantage at token $t$
222
+ - $T$: Set of valid tokens in a sequence
223
+ - $C_{\text{IS}}$: Upper threshold for IS weights (e.g., 2.0)
224
+ - $C_{\text{RS-upper}}$: Upper threshold for RS mask (e.g., 2.0)
225
+ - $C_{\text{RS-lower}}$: Lower threshold for RS mask (typically $1/C_{\text{RS-upper}}$)
226
+ - $\epsilon$: PPO clip range (typically 0.2)
227
+
228
+ ---
229
+
230
+ ## 3. Algorithmic Components and Combinations
231
+
232
+ The rollout correction framework in `verl` is built from **orthogonal components** that can be combined flexibly:
233
+
234
+ 1. **Operating Mode**: How $\pi_{\text{old}}$ is computed (Decoupled vs Bypass)
235
+ 2. **Loss Function**: PPO (with clipping) vs Pure IS (policy gradient only)
236
+ 3. **IS/RS Aggregation Level**: Token, Sequence, or Geometric
237
+
238
+ This section explains each component and their valid combinations.
239
+
240
+ ### 3.1 Operating Modes: Decoupled vs Bypass
241
+
242
+ The operating mode determines how the proximal policy $\pi_{\text{old}}$ is computed.
243
+
244
+ #### 3.1.1 Decoupled Mode (Three Policies)
245
+
246
+ **Configuration:** `bypass_mode = false`
247
+
248
+ **Policy setup:**
249
+ - $\pi_{\text{rollout}}$: Behavior policy (data collection)
250
+ - $\pi_{\text{old}}$: Proximal policy (computed via `actor.compute_log_prob()` at start of training epoch)
251
+ - $\pi_{\theta}$: Current policy (being updated)
252
+
253
+ **IS ratio:** $\rho_t = \frac{\pi_{\text{old}}(a_t|s_t)}{\pi_{\text{rollout}}(a_t|s_t)}$ (corrects Drift 1: rollout→old)
254
+
255
+ **PPO ratio:** $r_t(\theta) = \frac{\pi_{\theta}(a_t|s_t)}{\pi_{\text{old}}(a_t|s_t)}$ (corrects Drift 2: old→current)
256
+
257
+ **Properties:**
258
+ - ✅ Achieves batch size invariance
259
+ - ✅ Separately corrects two distribution drifts
260
+ - ✅ Efficient stale data utilization
261
+ - ❌ Extra forward pass needed (`actor.compute_log_prob()`)
262
+
263
+ #### 3.1.2 Bypass Mode (Two Policies)
264
+
265
+ **Configuration:** `bypass_mode = true`
266
+
267
+ **Policy setup:**
268
+ - $\pi_{\text{rollout}}$: Behavior policy (data collection)
269
+ - $\pi_{\text{old}} = \pi_{\text{rollout}}$: Proximal policy equals behavior policy
270
+ - $\pi_{\theta}$: Current policy (being updated)
271
+
272
+ **Ratios:**
273
+ - **With PPO-clip loss** (`loss_type = "ppo_clip"`, default): PPO ratio $r_t(\theta) = \frac{\pi_{\theta}(a_t|s_t)}{\pi_{\text{rollout}}(a_t|s_t)}$ clips against rollout policy (IS handled by ratio)
274
+ - **With REINFORCE loss** (`loss_type = "reinforce"`): IS ratio $\rho_t = \frac{\pi_{\theta}(a_t|s_t)}{\pi_{\text{rollout}}(a_t|s_t)}$ computed on-the-fly in loss function
275
+
276
+ **Properties:**
277
+ - ✅ Skips `actor.compute_log_prob()` call (faster)
278
+ - ✅ Handles off-policy correction via IS/RS (when using policy gradient with IS/RS)
279
+ - ✅ Uses two policies instead of three (π_rollout = π_old)
280
+ - ⚠️ Does not separate proximal policy from behavior policy (unlike decoupled mode)
281
+
282
+ ---
283
+
284
+ ### 3.2 Loss Functions: PPO vs Policy Gradient
285
+
286
+ #### 3.2.1 PPO Loss (with Clipping)
287
+
288
+ **Configuration:** `loss_type = "ppo_clip"` (default in bypass mode)
289
+
290
+ **Loss function:**
291
+
292
+ $$
293
+ L_{\text{PPO}}(\theta) = -\mathbb{E}_t \left[ w_t \cdot \min\left( r_t(\theta) A_t, \text{clip}(r_t(\theta), 1-\epsilon, 1+\epsilon) A_t \right) \right]
294
+ $$
295
+
296
+ where:
297
+ - $w_t$: IS weight (depends on aggregation level, see Section 3.3). In decoupled mode, $w_t = \frac{\pi_{\text{old}}}{\pi_{\text{rollout}}}$ where $\pi_{\text{old}}$ is frozen, so $w_t$ is constant (no stopgrad needed). In bypass mode with PPO loss, no separate IS weights are typically computed.
298
+ - $r_t(\theta) = \frac{\pi_{\theta}(a_t|s_t)}{\pi_{\text{old}}(a_t|s_t)}$: PPO ratio
299
+ - $\epsilon$: Clip range (typically 0.2)
300
+
301
+ **Properties:**
302
+ - Trust region control via clipping
303
+ - Limits policy update magnitude
304
+ - Standard in RL training
305
+
306
+ #### 3.2.2 Policy Gradient Loss (with IS/RS Correction)
307
+
308
+ **Configuration:** `loss_type = "reinforce"` (requires `bypass_mode = true`)
309
+
310
+ **Loss function** (example with sequence-level IS):
311
+
312
+ $$
313
+ L_{\text{PG}}(\theta) = -\mathbb{E}_{(s,a) \sim \pi_{\text{rollout}}} \left[ \text{stopgrad}(w_{\text{seq}}(\theta)) \cdot \sum_{t \in T} \log \pi_{\theta}(a_t|s_t) \cdot A_t \right]
314
+ $$
315
+
316
+ where:
317
+ - $w_{\text{seq}}(\theta)$: Sample weight (IS or RS, see §3.3-3.4 for details)
318
+ - For IS: $w_{\text{seq}}(\theta) = \min\left( \prod_{t \in T} \frac{\pi_{\theta}(a_t|s_t)}{\pi_{\text{rollout}}(a_t|s_t)}, C_{\text{IS}} \right)$
319
+ - For RS: $w_{\text{seq}}(\theta) \in \{0, 1\}$ (binary rejection mask)
320
+ - **stopgrad operator**: The weight $w_{\text{seq}}(\theta)$ is computed using $\pi_\theta$ but treated as a **constant coefficient** when computing $\nabla_\theta L$. This is essential for importance sampling correctness (see theoretical justification below).
321
+
322
+ **Effective gradient:**
323
+
324
+ $$
325
+ \nabla_\theta L_{\text{PG}} = -\mathbb{E}_{(s,a) \sim \pi_{\text{rollout}}} \left[ \text{stopgrad}(w_{\text{seq}}(\theta)) \cdot \sum_{t \in T} \nabla_\theta \log \pi_{\theta}(a_t|s_t) \cdot A_t \right]
326
+ $$
327
+
328
+ **Theoretical Justification for stopgrad:**
329
+
330
+ The stopgrad operator is **mathematically required** by importance sampling theory, not an implementation detail. Here's why:
331
+
332
+ **The fundamental principle**: Importance sampling is a technique to **change the measure** (reweight samples from one distribution to estimate expectations under another), not to optimize the reweighting function itself.
333
+
334
+ **Formal derivation**:
335
+
336
+ 1. **Original objective**: We want to optimize $J(\theta) = \mathbb{E}_{\tau \sim \pi_\theta}[\sum_t A_t]$.
337
+
338
+ 2. **Off-policy setting**: We only have samples from $\pi_{\text{rollout}}$, so we use importance sampling:
339
+ $$
340
+ J(\theta) = \mathbb{E}_{\tau \sim \pi_{\text{rollout}}} \left[ \underbrace{\frac{P_{\pi_\theta}(\tau)}{P_{\pi_{\text{rollout}}}(\tau)}}_{w(\tau;\theta)} \sum_t A_t \right]
341
+ $$
342
+
343
+ 3. **Computing the policy gradient**: The correct gradient uses the **policy gradient theorem BEFORE importance sampling**:
344
+ $$
345
+ \begin{aligned}
346
+ \nabla_\theta J(\theta) &= \nabla_\theta \mathbb{E}_{\tau \sim \pi_\theta}\left[\sum_t A_t\right] \\
347
+ &= \mathbb{E}_{\tau \sim \pi_\theta} \left[\sum_t A_t \nabla_\theta \log \pi_\theta(a_t|s_t) \right] \quad \text{(policy gradient theorem)} \\
348
+ &= \mathbb{E}_{\tau \sim \pi_{\text{rollout}}} \left[ w(\tau;\theta) \sum_t A_t \nabla_\theta \log \pi_\theta(a_t|s_t) \right] \quad \text{(change of measure)}
349
+ \end{aligned}
350
+ $$
351
+
352
+ In the final line, $w(\tau;\theta)$ appears as a **multiplicative coefficient** from the change of measure, not as something we differentiate.
353
+
354
+ 4. **What goes wrong without stopgrad**: If we naively compute $\nabla_\theta \left[w(\theta) \log \pi_\theta \right]$ in the loss, we get:
355
+ $$
356
+ \nabla_\theta \left[w(\theta) \log \pi_\theta \right] = \underbrace{\log \pi_\theta \cdot \nabla_\theta w(\theta)}_{\text{WRONG: bias term}} + \underbrace{w(\theta) \cdot \nabla_\theta \log \pi_\theta}_{\text{CORRECT: IS-weighted gradient}}
357
+ $$
358
+
359
+ The first term $\log \pi_\theta \cdot \nabla_\theta w(\theta)$ is an artifact of the computational trick (using loss times log-prob), not part of the true policy gradient. It biases the gradient estimator and optimizes a different objective than $J(\theta)$.
360
+
361
+ 5. **Implementation requirement**: In PyTorch, to compute only the second term, we must use:
362
+ ```python
363
+ loss = -advantages * log_prob * rollout_is_weights.detach() # stopgrad on weights
364
+ ```
365
+ Without `.detach()`, autograd computes both terms, giving an incorrect gradient.
366
+
367
+ **Intuition**: The IS weight $w(\theta)$ tells us "how much to trust this sample" for estimating the gradient under $\pi_\theta$. We update $\theta$ to maximize the reweighted objective, but we don't update $\theta$ to maximize the weight itself—that would be circular reasoning (optimizing the correction factor instead of the actual objective).
368
+
369
+ **Properties:**
370
+ - **Algorithm**: Off-policy policy gradient with IS/RS correction
371
+ - **Loss types** (`loss_type` config option in bypass mode):
372
+ - `"ppo_clip"` (default): PPO clipped objective
373
+ - $L = -\mathbb{E}[\min(r \cdot A, \text{clip}(r) \cdot A)]$ where $r = \pi_\theta / \pi_{\text{rollout}}$
374
+ - Note: IS weights NOT applied (PPO ratio already handles it; would be double-counting)
375
+ - `"reinforce"`: Pure policy gradient with explicit IS weights, no PPO clipping
376
+ - $L = -\mathbb{E}[w \cdot \log \pi_\theta(a|s) \cdot A]$ where $w = \pi_\theta / \pi_{\text{rollout}}$
377
+ - **Always uses bypass mode**: Direct $\pi_\theta$ to $\pi_{\text{rollout}}$ comparison
378
+ - **Fast**: Single forward pass
379
+
380
+ **Implementation:** `compute_policy_loss_bypass_mode()` and `compute_policy_loss_reinforce()` in [core_algos.py](../../verl/trainer/ppo/core_algos.py)
381
+
382
+ ---
383
+
384
+ ### 3.3 IS/RS Aggregation Levels
385
+
386
+ The aggregation level determines how per-token probability ratios are combined into IS weights and/or rejection masks. This choice is **orthogonal to the operating mode** - you can use any aggregation level in either decoupled or bypass mode.
387
+
388
+ #### 3.3.1 Token-Level Aggregation
389
+
390
+ **IS weights:** $w_t = \min(\rho_t, C_{\text{IS}})$ where $\rho_t = \frac{\pi_{\text{old}}(a_t|s_t)}{\pi_{\text{rollout}}(a_t|s_t)}$ (decoupled) or $\rho_t = \frac{\pi_{\theta}(a_t|s_t)}{\pi_{\text{rollout}}(a_t|s_t)}$ (bypass/pure IS)
391
+
392
+ **Configuration:**
393
+ ```python
394
+ rollout_is = "token" # IS weights
395
+ rollout_rs = "token_k1" # Optional: rejection sampling (ratio bounds)
396
+ ```
397
+
398
+ **Properties:**
399
+ - Independent truncation per token
400
+ - Lower variance than sequence-level (product of ratios bounded individually)
401
+ - **Bias-variance tradeoff**: Token-level correction has $O(T^2 \Delta_{\max})$ bias where $T$ is sequence length and $\Delta_{\max}$ is maximum per-token policy divergence. This bias becomes significant when the rollout policy deviates substantially from the training policy. Sequence-level correction is unbiased but has higher variance.
402
+ - Typical threshold: 1.5 - 5.0
403
+ - Optional batch normalization (§3.6): Normalizes over all token weights to ensure $\mathbb{E}[\tilde{w}_t] = 1$ (reduces variance)
404
+ - **When to use**: Token-level works well when rollout policy stays within the trust region of training policy. When mismatch is significant, the bias becomes intolerable and sequence-level correction is preferred.
405
+
406
+ **Loss function (REINFORCE + Token IS):**
407
+
408
+ $$
409
+ L_{\text{REINFORCE+TIS}}(\theta) = -\mathbb{E}_t \left[ \text{stopgrad}(w_t) \cdot \log \pi_\theta(a_t|s_t) \cdot A_t \right]
410
+ $$
411
+
412
+ where $w_t = \min(\rho_t, C_{\text{IS}})$ are the truncated token-level IS weights. The stopgrad operator ensures that when computing $\nabla_\theta L$, the weights are treated as constants (see §3.2.2 for theoretical justification). This formulation can also be combined with PPO clipping by replacing the REINFORCE gradient with the clipped surrogate objective.
413
+
414
+ **Implementation:**
415
+ - IS weights: `compute_rollout_correction_weights()` in [rollout_corr_helper.py](../../verl/trainer/ppo/rollout_corr_helper.py#L325-L402)
416
+ - Loss: `compute_policy_loss()` in [core_algos.py](../../verl/trainer/ppo/core_algos.py#L812-L884)
417
+
418
+ #### 3.3.2 Sequence-Level Aggregation
419
+
420
+ **IS weights:** $w_{\text{seq}} = \min\left( \prod_{t \in T} \rho_t, C_{\text{IS}} \right) = \min\left( \exp\left(\sum_{t \in T} \log \rho_t\right), C_{\text{IS}} \right)$ (broadcast to all tokens)
421
+
422
+ **Configuration:**
423
+ ```python
424
+ rollout_is = "sequence" # IS weights
425
+ rollout_rs = "seq_sum_k1" # Optional: rejection sampling
426
+ ```
427
+
428
+ **Properties:**
429
+ - Multiplicative aggregation across sequence
430
+ - More sensitive to outliers than token-level
431
+ - Typical threshold: 2.0 - 10.0
432
+ - Optional batch normalization (§3.6): Normalizes over sequence means (one weight per sequence)
433
+
434
+ **Terminology Note:**
435
+ - **Seq-TIS (Sequence-Level Truncated IS)**: Clips the sequence ratio $\rho(\tau) \to \min(\rho(\tau), C)$. Maximizes information efficiency by extracting signal from all samples. Best for clean data with moderate mismatch.
436
+ - **Seq-MIS (Sequence-Level Masked IS)**: Rejects (masks) sequences with $\rho(\tau) > C$ instead of clipping. Acts as a hard trust region filter. Best for severe mismatch or when the distribution tail is "toxic" (contains garbage/adversarial samples rather than signal).
437
+
438
+ **Loss function (REINFORCE + Sequence IS):**
439
+
440
+ $$
441
+ L_{\text{REINFORCE+SeqIS}}(\theta) = -\mathbb{E}_t \left[ \text{stopgrad}(w_{\text{seq}}) \cdot \log \pi_\theta(a_t|s_t) \cdot A_t \right]
442
+ $$
443
+
444
+ where $w_{\text{seq}}$ is broadcast to all tokens in the sequence. The stopgrad operator ensures correct IS gradient computation (see §3.2.2). This formulation can also be combined with PPO clipping.
445
+
446
+ #### 3.3.3 Geometric Mean Aggregation (Geo-RS)
447
+
448
+ **Geometric mean ratio:** $\rho_{\text{geo}} = \exp\left( \frac{1}{|T|} \sum_{t \in T} \log \rho_t \right) = \left(\prod_{t \in T} \rho_t\right)^{1/|T|}$ (broadcast to all tokens)
449
+
450
+ **Configuration:**
451
+ ```python
452
+ rollout_is = null # No IS weights, pure rejection
453
+ rollout_rs = "seq_mean_k1" # Geometric mean rejection sampling (ratio bounds)
454
+ ```
455
+
456
+ **Properties:**
457
+ - Length-invariant (normalizes by sequence length)
458
+ - Ideal ratio = 1.0 (policies match)
459
+ - Typical bounds: `"0.999_1.001"` (~±0.1%)
460
+ - **Used for rejection sampling only, not IS weighting**
461
+
462
+ **The Length Trap Problem:**
463
+
464
+ Standard IS estimators have a systematic **length bias** that penalizes long sequences. The importance ratio $\rho(y)$ is multiplicative:
465
+
466
+ $$
467
+ \rho(y) = \prod_{t=1}^T \frac{\pi(y_t|y_{<t})}{\mu(y_t|y_{<t})}
468
+ $$
469
+
470
+ Assume the new policy $\pi$ differs slightly from $\mu$, with average per-token ratio $\approx 1.1$:
471
+ - **Short sequence (10 tokens):** $\rho \approx 1.1^{10} \approx 2.6$ → fits within threshold, **kept**
472
+ - **Long sequence (100 tokens):** $\rho \approx 1.1^{100} \approx 13,780$ → explodes past threshold, **rejected**
473
+
474
+ This creates **Context Collapse**: the model preferentially learns from short, shallow answers and rejects long chains of thought—even if per-step quality is identical. For reasoning models (CoT) and agents, this effectively penalizes "thinking too long."
475
+
476
+ **Geo-RS Solution:**
477
+
478
+ Geometric-level rejection normalizes by sequence length, converting the extensive property (total probability product) to an intensive property (average per-token drift):
479
+
480
+ $$
481
+ \rho_{\text{geo}}(y) = \rho(y)^{1/T}
482
+ $$
483
+
484
+ Now both sequences have the same "trust score":
485
+ - **Short (10 tokens):** $(1.1^{10})^{1/10} = 1.1$
486
+ - **Long (100 tokens):** $(1.1^{100})^{1/100} = 1.1$
487
+
488
+ **Why tight thresholds?**
489
+ For 100 tokens with per-token log-ratio = 0.01 each:
490
+ - Arithmetic product ratio: $e^{100 \times 0.01} \approx 2.7$
491
+ - Geometric ratio: $e^{0.01} \approx 1.010$
492
+
493
+ A ratio bound of `"0.999_1.001"` rejects sequences whose average per-token log-deviation exceeds ≈0.1%.
494
+
495
+ **Loss function (REINFORCE + Geometric RS):**
496
+
497
+ $$
498
+ L_{\text{GeoRS}}(\theta) = -\mathbb{E}_{(s,a) \mid \text{seq} \in \mathcal{A}_{\text{geo}}} \left[ \sum_{t \in T} \log \pi_\theta(a_t|s_t) \cdot A_t \right]
499
+ $$
500
+
501
+ where $\mathcal{A}_{\text{geo}} = \{ \text{seq} : C_{\text{RS-lower}} \leq \rho_{\text{geo}} \leq C_{\text{RS-upper}} \}$ is the acceptance set (rejection mask). No IS weights are used, so no stopgrad needed. This formulation can also be combined with PPO clipping.
502
+
503
+ **Combined Estimator (Geo-RS-Token-TIS):**
504
+
505
+ For best results, combine the **Geometric Filter** (length-invariant validity check) with **Token-level IS weights** (lower variance):
506
+
507
+ $$
508
+ \hat{g}_{\text{geo-rs-token-tis}}(y) = \underbrace{\mathbb{I}\left( C_{\text{low}} \le \rho(y)^{1/T} \le C_{\text{high}} \right)}_{\text{Geometric Filter}} \cdot \prod_t \min(\rho_t, C) \cdot f(y)
509
+ $$
510
+
511
+ This is implemented by combining `rollout_rs="seq_mean_k1"` with `rollout_is="token"`.
512
+
513
+ #### 3.3.4 K2 Divergence Aggregation
514
+
515
+ **Per-token statistic:**
516
+
517
+ $$
518
+ K2_t = \frac{1}{2} \left(\log \rho_t\right)^2
519
+ $$
520
+
521
+ where $\rho_t = \frac{\pi_{\text{old}}(a_t|s_t)}{\pi_{\text{rollout}}(a_t|s_t)}$ and the implementation clips $\log \rho_t$ to $[-20, 20]$ for numerical safety.
522
+
523
+ **Sequence aggregations (share the same per-token $K2_t$):**
524
+ - `seq_sum_k2`: $K2_{\text{sum}} = \sum_{t \in T} K2_t$
525
+ - `seq_mean_k2`: $K2_{\text{mean}} = \frac{1}{|T|} \sum_{t \in T} K2_t$
526
+ - `seq_max_k2`: $K2_{\text{max}} = \max_{t \in T} K2_t$
527
+
528
+ **Configuration:**
529
+ ```python
530
+ rollout_is = null # Optional: pair with token IS weights for lower variance
531
+ rollout_rs = "token_k2" # or "seq_sum_k2", "seq_mean_k2", "seq_max_k2"
532
+ rollout_rs_threshold = 2.0 # Positive upper bound only
533
+ ```
534
+
535
+ **Properties:**
536
+ - Symmetric quadratic penalty in $\log \rho_t$; equals zero when policies match.
537
+ - Approximates $\tfrac{1}{2}\operatorname{Var}[\log \rho]$ for small policy drift, making it a smooth detector of mismatch.
538
+ - Upper-threshold only: typical ranges are 1.5-3.0 for `token_k2`, 2.0-2.5 for `seq_mean_k2`, and 2.5-4.0 for `seq_sum_k2`.
539
+ - `seq_max_k2` isolates single-token spikes even when the rest of the sequence is clean.
540
+ - Can co-exist with token-level IS weights (`rollout_is="token"`) to keep useful samples while clipping variance.
541
+
542
+ **Combined Estimator (K2-RS-Token-TIS):**
543
+
544
+ For combined filtering and weighting, let $K2_{\text{agg}}$ denote the selected aggregation (token, sum, mean, or max):
545
+
546
+ $$
547
+ \hat{g}_{\text{k2-rs-token-tis}}(y) = \underbrace{\mathbb{I}\left( K2_{\text{agg}}(y) \le C_{\text{k2}} \right)}_{\text{K2 Filter}} \cdot \prod_t \min(\rho_t, C) \cdot f(y)
548
+ $$
549
+
550
+ This is implemented via `rollout_rs="seq_mean_k2"` (or another `k2` mode) together with `rollout_is="token"`.
551
+
552
+ #### 3.3.5 K3 Divergence Aggregation
553
+
554
+ **K3 divergence at sequence level:**
555
+
556
+ $$
557
+ K3_{\text{seq}} = \frac{1}{|T|} \sum_{t \in T} \left( \rho_t - \log \rho_t - 1 \right)
558
+ $$
559
+
560
+ where $\rho_t = \frac{\pi_{\text{old}}(a_t|s_t)}{\pi_{\text{rollout}}(a_t|s_t)}$ is the per-token ratio.
561
+
562
+ **K3 equals the reverse KL:** In expectation, $K3 = \text{KL}(\pi_{\text{rollout}} \| \pi_{\text{old}})$. This follows from:
563
+ - $\mathbb{E}_{\pi_\text{rollout}}[\rho] = 1$
564
+ - $\mathbb{E}_{\pi_\text{rollout}}[\log \rho] = -\text{KL}(\pi_{\text{rollout}} \| \pi_{\text{old}})$
565
+ - Therefore: $K3 = 1 - (-\text{KL}) - 1 = \text{KL}(\pi_{\text{rollout}} \| \pi_{\text{old}})$
566
+
567
+ **Configuration:**
568
+ ```python
569
+ rollout_is = null # No IS weights, pure rejection
570
+ rollout_rs = "seq_mean_k3" # K3 rejection sampling
571
+ ```
572
+
573
+ **Properties:**
574
+ - K3 divergence is always >= 0 per token (equals 0 when ρ = 1)
575
+ - More stable than geometric ratio checks because each token term is non-negative
576
+ - Only upper threshold applies (no lower threshold since K3 >= 0)
577
+ - Typical threshold: 0.001 - 0.01
578
+
579
+ **Why K3 over geometric ratio?**
580
+ - Geometric ratio uses average log-ratio; small numerical bias can flip sign
581
+ - K3 = E[ρ - log ρ - 1] is non-negative per token, offering a smoother detector
582
+ - Both estimate the same quantity: KL(π_rollout || π_old)
583
+ - For small divergences, K3 ≈ 0.5 × Var(log_ratio)
584
+
585
+ **Combined Estimator (K3-RS-Token-TIS):**
586
+
587
+ For best results, combine K3 filter with token-level IS weights:
588
+
589
+ $$
590
+ \hat{g}_{\text{k3-rs-token-tis}}(y) = \underbrace{\mathbb{I}\left( K3_{\text{seq}} \le C_{\text{k3}} \right)}_{\text{K3 Filter}} \cdot \prod_t \min(\rho_t, C) \cdot f(y)
591
+ $$
592
+
593
+ This is implemented by combining `rollout_rs="seq_mean_k3"` with `rollout_is="token"`.
594
+
595
+
596
+ ---
597
+
598
+ ### 3.4 Batch Normalization
599
+
600
+ An optional variance reduction technique that normalizes IS weights to have mean 1.0 within each batch.
601
+
602
+ **Configuration:**
603
+ ```python
604
+ rollout_is_batch_normalize = True # Default: False
605
+ ```
606
+
607
+ **Normalization formula (aggregation-aware):**
608
+
609
+ For **token-level IS** (§3.3.1):
610
+
611
+ $$
612
+ \tilde{w}_t = \frac{w_t}{\frac{1}{\sum_{i,t} m_{i,t}} \sum_{i,t} w_{i,t} \cdot m_{i,t}}
613
+ $$
614
+
615
+ where $w_{i,t}$ are truncated token IS weights, $m_{i,t}$ is the response mask, and normalization is over **all tokens**.
616
+
617
+ For **sequence-level IS** (§3.3.2):
618
+
619
+ $$
620
+ \tilde{w}_i = \frac{w_i}{\frac{1}{B}\sum_{j=1}^B \bar{w}_j}
621
+ $$
622
+
623
+ where $\bar{w}_j = \frac{1}{T_j}\sum_{t=1}^{T_j} w_{j,t} \cdot m_{j,t}$ is the per-sequence mean (all tokens in a sequence have the same weight), and normalization is over **sequences**.
624
+
625
+ **Properties:**
626
+ - Applied **after** truncation to preserve truncation semantics
627
+ - Ensures $\mathbb{E}[\tilde{w}] = 1$ within each batch
628
+ - **Aggregation-aware**: Token-level normalizes over tokens; sequence-level normalizes over sequences
629
+ - Uses `masked_mean` to respect padding tokens
630
+ - Reduces gradient magnitude variance by removing random batch-level scale fluctuations
631
+
632
+ **Metrics:**
633
+ - `rollout_is_batch_norm_factor`: The normalization factor applied (batch mean before normalization)
634
+
635
+ **Implementation:** [rollout_corr_helper.py](../../verl/trainer/ppo/rollout_corr_helper.py#L401-L421)
636
+
637
+ ---
638
+
639
+ ### 3.5 Rejection Sampling (RS)
640
+
641
+ Rejection sampling can be added to **any combination** of operating mode and aggregation level. It modifies the `response_mask` to exclude outlier tokens/sequences.
642
+
643
+ **Configuration examples:**
644
+ ```python
645
+ rollout_rs = "token_k1" # Token-level ratio bounds
646
+ rollout_rs_threshold = "0.6_1.6"
647
+
648
+ rollout_rs = "seq_sum_k1" # Sequence sum of log ratios
649
+ rollout_rs_threshold = "0.5_2.0"
650
+
651
+ rollout_rs = "seq_mean_k3" # Sequence mean of K3 divergence
652
+ rollout_rs_threshold = 0.01
653
+ ```
654
+
655
+ **Acceptance set:**
656
+ - **Token-level**: $\mathcal{A}_{\text{token}} = \{ t : C_{\text{RS-lower}} \leq \rho_t \leq C_{\text{RS-upper}} \}$
657
+ - **Sequence-level**: $\mathcal{A}_{\text{seq}} = \{ \text{seq} : C_{\text{RS-lower}} \leq \prod_{t \in T} \rho_t \leq C_{\text{RS-upper}} \}$
658
+ - **Geometric**: $\mathcal{A}_{\text{geo}} = \{ \text{seq} : C_{\text{RS-lower}} \leq \rho_{\text{geo}} \leq C_{\text{RS-upper}} \}$
659
+
660
+ **Properties:**
661
+ - Separate from IS weighting (can use RS without IS)
662
+ - Reduces effective sample size
663
+ - Filters extreme outliers
664
+
665
+ **Implementation:** `compute_rollout_rejection_mask()` in [rollout_corr_helper.py](../../verl/trainer/ppo/rollout_corr_helper.py#L80-L188)
666
+
667
+ ---
668
+
669
+ ### 3.6 Combination Matrix
670
+
671
+ **Key insight:** Estimators (how IS/RS is computed) and operating modes (decoupled PPO vs bypass PG) are **orthogonal**. Any estimator can be combined with any operating mode.
672
+
673
+ #### Estimator × Operating Mode
674
+
675
+ | Estimator | Configuration | Compatible Modes |
676
+ |-----------|---------------|------------------|
677
+ | **Token-TIS** | `rollout_is="token"` | Decoupled PPO, Bypass PG |
678
+ | **Seq-TIS** | `rollout_is="sequence"` | Decoupled PPO, Bypass PG |
679
+ | **Seq-MIS** | `rollout_is="sequence"` + `rollout_rs="seq_sum_k1"` | Decoupled PPO, Bypass PG |
680
+ | **Geo-RS** | `rollout_rs="seq_mean_k1"` (geometric mean) | Decoupled PPO, Bypass PG |
681
+ | **Geo-RS-Token-TIS** | `rollout_is="token"` + `rollout_rs="seq_mean_k1"` | Decoupled PPO, Bypass PG |
682
+ | **K3-RS** | `rollout_rs="seq_mean_k3"` | Decoupled PPO, Bypass PG |
683
+ | **K3-RS-Token-TIS** | `rollout_is="token"` + `rollout_rs="seq_mean_k3"` | Decoupled PPO, Bypass PG |
684
+
685
+ **Note:** In bypass mode, `loss_type` controls the loss function. Use "ppo_clip" (default) or "reinforce".
686
+
687
+ #### Available Preset Methods
688
+
689
+ | Preset Method | Estimator | Mode | Properties |
690
+ |---------------|-----------|------|------------|
691
+ | **Decoupled PPO Mode** (3 policies: π_rollout, π_old, π_θ) |
692
+ | `decoupled_token_is()` | Token-TIS | Decoupled PPO | Per-token IS weights |
693
+ | `decoupled_seq_is()` | Seq-TIS | Decoupled PPO | Sequence-level IS weights |
694
+ | `decoupled_seq_is_rs()` | Seq-MIS | Decoupled PPO | Sequence IS + sequence RS |
695
+ | `decoupled_geo_rs()` | Geo-RS | Decoupled PPO | Geometric RS + seq\_max\_k2 guard |
696
+ | `decoupled_geo_rs_token_tis()` | Geo-RS-Token-TIS | Decoupled PPO | Geometric filter + token IS |
697
+ | **K3 KL Estimator** (more stable for small KL values) |
698
+ | `decoupled_k3_rs()` | K3-RS | Decoupled PPO | K3 rejection, no IS weights |
699
+ | `decoupled_k3_rs_token_tis()` | K3-RS-Token-TIS | Decoupled PPO | K3 filter + token clipped weight |
700
+ | **Bypass Mode (PPO-clip)** (ratio handles IS, RS masks outliers) |
701
+ | `bypass_ppo_clip()` | - | Bypass (PPO-clip) | PPO-clip only |
702
+ | `bypass_ppo_clip_geo_rs()` | Geo-RS | Bypass (PPO-clip) | PPO-clip + Geo-RS (ratio) |
703
+ | `bypass_ppo_clip_k3_rs()` | K3-RS | Bypass (PPO-clip) | PPO-clip + K3-RS |
704
+ | **Bypass Mode (REINFORCE)** (explicit IS weights, no PPO clipping) |
705
+ | `bypass_pg_is()` | Seq-TIS | Bypass (REINFORCE) | REINFORCE + Seq IS |
706
+ | `bypass_pg_geo_rs()` | Geo-RS | Bypass (REINFORCE) | REINFORCE + Geo-RS (ratio) |
707
+ | `bypass_pg_geo_rs_token_tis()` | Geo-RS-Token-TIS | Bypass (REINFORCE) | REINFORCE + Geo filter + token IS |
708
+ | **Other** |
709
+ | `disabled()` | - | - | Metrics only |
710
+
711
+ **Note:** Bypass mode sets π_old = π_rollout and uses `loss_type` to select the loss function.
712
+
713
+ #### Additional Supported Combinations (Manual Configuration)
714
+
715
+ These combinations are **fully supported** but require manual configuration:
716
+
717
+ **1. Token IS + Token RS**
718
+ ```python
719
+ config = RolloutCorrectionConfig(
720
+ rollout_is="token",
721
+ rollout_is_threshold=2.0,
722
+ rollout_rs="token_k1",
723
+ rollout_rs_threshold="0.5_2.0",
724
+ )
725
+ ```
726
+ **Properties:** Token-level IS weights + token-level RS mask.
727
+
728
+ **2. Pure Token RS**
729
+ ```python
730
+ config = RolloutCorrectionConfig(
731
+ rollout_is=None,
732
+ rollout_rs="token_k1",
733
+ rollout_rs_threshold="0.5_2.0",
734
+ )
735
+ ```
736
+ **Properties:** Token-level RS mask only, no IS weights.
737
+
738
+ **3. Pure Sequence RS**
739
+ ```python
740
+ config = RolloutCorrectionConfig(
741
+ rollout_is=None,
742
+ rollout_rs="seq_sum_k1",
743
+ rollout_rs_threshold="0.5_2.0",
744
+ )
745
+ ```
746
+ **Properties:** Sequence-level RS mask only, no IS weights.
747
+
748
+ **Key properties:**
749
+ - Any IS aggregation level (token/sequence) can be used in either decoupled or bypass mode
750
+ - Rejection sampling can be added to any combination
751
+ - Geometric aggregation is typically used for RS only (not IS weighting)
752
+ - Pure RS (the `bypass_pg_geo_rs()` preset) uses bypass + geometric RS with `loss_type="reinforce"` for REINFORCE (no IS weights)
753
+ - All combinations in the table above are valid and supported by the implementation
754
+
755
+ ---
756
+
757
+ ### 3.7 Common Implementation Mistake
758
+
759
+ #### Incorrect LLM-RL Implementation (PPO Without Rollout Correction)
760
+
761
+ **Theory:** Naive LLM-RL implementation that incorrectly applies PPO by **ignoring the actual rollout policy** and assuming $\pi_{\text{old}} = \pi_{\text{rollout}}$.
762
+
763
+ **Note:** This incorrect implementation pattern was identified in [Liu, Li, et al. (2025)](https://richardli.xyz/rl-collapse) as a key cause of training instability in LLM-RL systems, motivating the development of this rollout correction framework.
764
+
765
+ **Loss Function:**
766
+
767
+ $$
768
+ L_{\text{PPO}}(\theta) = -\mathbb{E}_t \left[ \min\left( r_t(\theta) A_t, \text{clip}(r_t(\theta), 1-\epsilon, 1+\epsilon) A_t \right) \right]
769
+ $$
770
+
771
+ where $r_t(\theta) = \frac{\pi_{\theta}(a_t|s_t)}{\pi_{\text{old}}(a_t|s_t)}$ (ignores $\pi_{\text{rollout}}$).
772
+
773
+ **Why it's wrong:**
774
+ - **Ignores $\pi_{\text{rollout}}$**: Uses $\pi_{\text{old}}$ as behavior policy instead of actual $\pi_{\text{rollout}}$
775
+ - **Policy mismatch**: In LLM-RL, rollout typically uses different precision/backend/checkpoint than training, causing $\pi_{\text{rollout}} \neq \pi_{\text{old}}$ even with same model weights
776
+ - **Not PPO's fault**: PPO itself is correct; the issue is the incorrect assumption
777
+
778
+ **Correct alternatives:**
779
+ 1. **Decoupled mode**: Three policies with IS correction from $\pi_{\text{rollout}}$ to $\pi_{\text{old}}$
780
+ 2. **Bypass mode**: Two policies using $\pi_{\text{rollout}}$ as both behavior policy and proximal policy
781
+ 3. **Bypass + Policy Gradient mode**: Two policies with IS/RS correction and no PPO clipping
782
+
783
+ **Implementation:** `compute_policy_loss()` in [core_algos.py](../../verl/trainer/ppo/core_algos.py#L812-L884)
784
+
785
+ ---
786
+
787
+ ## 4. Off-Policy Diagnostic Metrics
788
+
789
+ These metrics quantify the severity of off-policy drift.
790
+
791
+ **Note on notation:** Metrics use $\rho_t = \frac{\pi_{\text{old}}(a_t|s_t)}{\pi_{\text{rollout}}(a_t|s_t)}$. In bypass mode, $\pi_{\text{old}} = \pi_{\text{rollout}}$, so metrics measure rollout→current drift using $\rho_t = \frac{\pi_{\theta}}{\pi_{\text{rollout}}}$ instead.
792
+
793
+ ### 4.1 KL Divergence
794
+
795
+ **Direct KL estimator:**
796
+
797
+ $$
798
+ \text{KL}(\pi_{\text{rollout}} \| \pi_{\text{old}}) = \mathbb{E}_{t \sim \pi_{\text{rollout}}} \left[ \log \pi_{\text{rollout}}(a_t|s_t) - \log \pi_{\text{old}}(a_t|s_t) \right]
799
+ $$
800
+
801
+ **K3 KL estimator** (alternative formulation):
802
+
803
+ $$
804
+ \text{KL}_{\text{K3}} = \mathbb{E}_{t \sim \pi_{\text{rollout}}} \left[ \rho_t - \log \rho_t - 1 \right]
805
+ $$
806
+
807
+ where $\rho_t = \frac{\pi_{\text{old}}(a_t|s_t)}{\pi_{\text{rollout}}(a_t|s_t)}$.
808
+
809
+ ### 4.2 Perplexity
810
+
811
+ **Old policy perplexity:**
812
+
813
+ $$
814
+ \text{PPL}_{\text{old}} = \exp\left( -\frac{1}{|T|} \sum_{t \in T} \log \pi_{\text{old}}(a_t|s_t) \right)
815
+ $$
816
+
817
+ **Rollout policy perplexity:**
818
+
819
+ $$
820
+ \text{PPL}_{\text{rollout}} = \exp\left( -\frac{1}{|T|} \sum_{t \in T} \log \pi_{\text{rollout}}(a_t|s_t) \right)
821
+ $$
822
+
823
+ **PPL ratio** (inverse of geometric mean IS weight):
824
+
825
+ $$
826
+ \text{PPL}_{\text{ratio}} = \frac{\text{PPL}_{\text{old}}}{\text{PPL}_{\text{rollout}}} = \exp\left( -\frac{1}{|T|} \sum_{t \in T} \log \rho_t \right) = \left(\prod_{t \in T} \rho_t\right)^{-1/|T|}
827
+ $$
828
+
829
+ **Interpretation:** Values > 1 mean $\pi_{\text{old}}$ assigns lower probability than $\pi_{\text{rollout}}$ to the observed actions (distribution shift).
830
+
831
+ ### 4.3 Chi-squared Divergence
832
+
833
+ Measures the second moment of the IS weight distribution.
834
+
835
+ **Token-level:**
836
+
837
+ $$
838
+ \chi^2_{\text{token}} = \mathbb{E}_{t \sim \pi_{\text{rollout}}} \left[ \rho_t^2 \right] - 1
839
+ $$
840
+
841
+ **Sequence-level:**
842
+
843
+ $$
844
+ \chi^2_{\text{seq}} = \mathbb{E}_{\text{seq} \sim \pi_{\text{rollout}}} \left[ \left(\prod_{t \in T} \rho_t\right)^2 \right] - 1
845
+ $$
846
+
847
+ **Interpretation:**
848
+ - $\chi^2 = 0$: Policies are identical
849
+ - $\chi^2 > 0$: Higher values indicate more severe off-policy distribution shift
850
+
851
+ **Implementation:** `compute_offpolicy_metrics()` in [rollout_corr_helper.py](../../verl/trainer/ppo/rollout_corr_helper.py#L670-L776)
852
+
853
+ ---
854
+
855
+ ## 5. Summary and Decision Guide
856
+
857
+ ### 5.1 Method Summary Table
858
+
859
+ | Method | Theory | Policies | PPO Clip | IS Correction | Correctness | Speed |
860
+ |--------|--------|----------|----------|---------------|-------------|-------|
861
+ | **Bypass Mode** (π_old = π_rollout, `loss_type` selects algorithm) |
862
+ | `loss_type="ppo_clip"` (default) | PPO (ratio = π_θ/π_rollout) | 2 (rollout, θ) | ✅ | RS mask only (ratio handles IS) | ✅ Correct | **Fast** |
863
+ | `loss_type="reinforce"` | Off-policy REINFORCE | 2 (rollout, θ) | ❌ | ✅ (explicit IS weights) | ✅ Correct | **Fast** |
864
+ | **Bypass Mode Presets (PPO-clip)** |
865
+ | `bypass_ppo_clip` | PPO only | 2 (rollout, θ) | ✅ | - | ✅ Correct | **Fast** |
866
+ | `bypass_ppo_clip_geo_rs` | PPO + Geo-RS | 2 (rollout, θ) | ✅ | Geo-RS mask (ratio) | ✅ Correct | **Fast** |
867
+ | **Bypass Mode Presets (REINFORCE)** |
868
+ | `bypass_pg_is` | REINFORCE + Seq-TIS | 2 (rollout, θ) | ❌ | ✅ Seq-TIS | ✅ Correct | **Fast** |
869
+ | `bypass_pg_geo_rs` | REINFORCE + Geo-RS | 2 (rollout, θ) | ❌ | Geo-RS only (ratio) | ✅ Correct | **Fast** |
870
+ | `bypass_pg_geo_rs_token_tis` | REINFORCE + Geo RS + Token IS | 2 (rollout, θ) | ❌ | ✅ Geo-RS-Token-TIS | ✅ Correct | **Fast** |
871
+ | **Decoupled PPO Mode** (IS weights = π_old / π_rollout) |
872
+ | `decoupled_token_is` | Decoupled PPO | 3 (rollout, old, θ) | ✅ | ✅ Token-TIS | ✅ Correct | Standard |
873
+ | `decoupled_seq_is` | Decoupled PPO | 3 (rollout, old, θ) | ✅ | ✅ Seq-TIS | ✅ Correct | Standard |
874
+ | `decoupled_seq_is_rs` | Decoupled PPO + RS | 3 (rollout, old, θ) | ✅ | ✅ Seq-MIS | ✅ Correct | Standard |
875
+ | `decoupled_geo_rs` | Decoupled PPO + Geo-RS | 3 (rollout, old, θ) | ✅ | Geo-RS only (ratio) | ✅ Correct | Standard |
876
+ | `decoupled_geo_rs_token_tis` | Decoupled PPO + Geo RS + Token IS | 3 (rollout, old, θ) | ✅ | ✅ Geo-RS-Token-TIS | ✅ Correct | Standard |
877
+ | **Incorrect (for reference)** |
878
+ | Naive LLM-RL | Incorrect PPO usage | 2 (old, θ) | ✅ | ❌ | ⚠️ Incorrect | Standard |
879
+
880
+ **Notes:**
881
+ - **Bypass mode** sets π_old = π_rollout and uses `loss_type` to select the loss function:
882
+ - `"ppo_clip"` (default): PPO clipped ratio (IS handled by ratio = π_θ/π_rollout, no explicit IS weights to avoid double-counting)
883
+ - `"reinforce"`: Explicit IS weights applied as $w \cdot \log \pi \cdot A$
884
+ - Both loss types benefit from rejection sampling (RS) which masks out-of-distribution samples
885
+
886
+ ### 5.2 Estimator Hierarchy
887
+
888
+ These estimators define **how IS weights and rejection masks are computed**. They are orthogonal to the operating mode (decoupled PPO vs bypass policy gradient) and can be combined with either.
889
+
890
+ | Estimator | Configuration | Mechanism | Best For |
891
+ |-----------|---------------|-----------|----------|
892
+ | **Token-TIS** | `rollout_is="token"` | Clips per-token ratios | Lower variance IS with acceptable bias |
893
+ | **Seq-TIS** | `rollout_is="sequence"` | Clips sequence ratio $\rho(\tau) \to \min(\rho(\tau), C)$ | Clean data with moderate mismatch; unbiased |
894
+ | **Seq-MIS** | `rollout_is="sequence"` + `rollout_rs="seq_sum_k1"` | Rejects sequences with $\rho(\tau) > C$ | Severe mismatch; filters "toxic tail" (garbage data) |
895
+ | **Geo-RS** | `rollout_rs="seq_mean_k1"` | Rejects on geometric mean ratio exp(E[log(r)]) | Length-invariant trust region |
896
+ | **Geo-RS-Token-TIS** | `rollout_is="token"` + `rollout_rs="seq_mean_k1"` | Geometric filter + token IS weights | Ratio-based length normalization + lower variance IS |
897
+ | **K3-RS** | `rollout_rs="seq_mean_k3"` | Rejects on K3 KL divergence | Small KL values; smooth detector |
898
+ | **K3-RS-Token-TIS** | `rollout_is="token"` + `rollout_rs="seq_mean_k3"` | K3 filter + token IS weights | Small KL + lower variance IS |
899
+
900
+ **Note:** Each estimator can be used with either:
901
+ - **Decoupled PPO** (`bypass_mode=false`): Three policies with PPO clipping
902
+ - **Bypass Mode** (`bypass_mode=true`): Two policies with configurable loss type
903
+ - `loss_type="ppo_clip"` (default): PPO clipped objective (IS via ratio, RS mask applied)
904
+ - `loss_type="reinforce"`: REINFORCE with explicit IS weights
905
+
906
+ ### 5.3 Method Characteristics by Scenario
907
+
908
+ **Choosing estimator by off-policy severity:**
909
+ - **Negligible** (same checkpoint, minor differences): No IS correction needed; use bypass mode for efficiency
910
+ - **Moderate** (async workers, slight staleness): Token-TIS provides per-token IS correction with lower variance
911
+ - **Severe** (replay buffers, old data): Seq-TIS or Seq-MIS provides sequence-level IS correction; use Seq-MIS when high-weight samples are likely garbage
912
+
913
+ **Choosing estimator by sequence length:**
914
+ - **Short sequences** (standard chat): Seq-TIS is optimal
915
+ - **Long sequences** (CoT, agents): Geo-RS (`seq_mean_k1`) or Geo-RS-Token-TIS to avoid the Length Trap
916
+
917
+ **Choosing operating mode:**
918
+ - **Batch size invariance needed**: Use decoupled mode (`bypass_mode=false`)
919
+ - **Computational efficiency needed**: Use bypass mode (`bypass_mode=true`) to skip `old_log_prob` computation
920
+ - **No PPO clipping**: Use bypass mode with `loss_type="reinforce"`
921
+
922
+ ### 5.4 Decoupled Mode vs Bypass Mode
923
+
924
+ **Decoupled mode** (computes `old_log_prob` separately):
925
+ - Implements full decoupled PPO with three policies (mathematically correct)
926
+ - Separately measures and corrects Drift 1 (rollout→old) and Drift 2 (old→current)
927
+ - Achieves batch size invariance and efficient stale data utilization
928
+ - Enables accurate off-policy metrics monitoring
929
+
930
+ **Bypass mode** (sets $\pi_{\text{old}} = \pi_{\text{rollout}}$):
931
+ - Uses $\pi_{\text{rollout}}$ as both behavior policy and proximal policy (mathematically correct)
932
+ - Computational efficiency: Skips separate `old_log_prob` computation
933
+ - Does not achieve batch size invariance (proximal policy depends on data collection)
934
+
935
+ ---
936
+
937
+ ## 6. Implementation References
938
+
939
+ - **[Rollout Correction Usage Guide](rollout_corr.md)** - Practical configuration and troubleshooting
940
+ - **Config:** [verl/trainer/config/algorithm.py](../../verl/trainer/config/algorithm.py)
941
+ - **IS/RS Helper:** [verl/trainer/ppo/rollout_corr_helper.py](../../verl/trainer/ppo/rollout_corr_helper.py)
942
+ - **PPO Loss:** [verl/trainer/ppo/core_algos.py](../../verl/trainer/ppo/core_algos.py)
943
+ - **Tests:** [tests/trainer/ppo/test_rollout_corr.py](../../tests/trainer/ppo/test_rollout_corr.py)
944
+
945
+ ---
946
+
947
+ ## References
948
+
949
+ - **Williams, R. J. (1992).** "Simple statistical gradient-following algorithms for connectionist reinforcement learning." *Machine Learning*, 8(3-4), 229-256. https://doi.org/10.1007/BF00992696
950
+ - **Schulman, J., Wolski, F., Dhariwal, P., Radford, A., & Klimov, O. (2017).** "Proximal policy optimization algorithms." *arXiv preprint arXiv:1707.06347.* https://arxiv.org/abs/1707.06347
951
+ - **Hilton, J., Cobbe, K., & Schulman, J. (2021).** "Batch size-invariance for policy optimization." *arXiv preprint arXiv:2110.00641.* https://arxiv.org/abs/2110.00641
952
+ - Introduced decoupled PPO: separating proximal policy (for controlling policy update size) from behavior policy (for off-policy correction) to achieve batch size invariance
953
+ - **Liu, J., Li, Y., et al. (2025).** "When Speed Kills Stability: Demystifying RL Collapse from the Training-Inference Mismatch"
954
+ - Blog post: https://richardli.xyz/rl-collapse (see Blog Series above for parts 1-3)
code/RL_model/verl/verl_train/docs/algo/spin.md ADDED
@@ -0,0 +1,179 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Recipe: Self-Play Fine-Tuning (SPIN)
2
+
3
+ Last updated: 05/31/2025.
4
+
5
+ `verl` provides a recipe inspired by the paper **"Self-Play Fine-Tuning Converts Weak Language Models to Strong Language Models"** (SPIN). SPIN is a language model finetuning algorithm that enables iterative self-improvement through a self-play mechanism inspired by game theory.
6
+
7
+ **Core Idea:** Models learn by playing against themselves, reducing reliance on external preference datasets or stronger teacher models:
8
+
9
+ 1. **Synthetic Data Generation:** The current model generates responses, creating its own training data from previous iterations.
10
+ 2. **Two-Player Game Setup:** A game involving two players acted by a single LLM.
11
+ 3. **Iterative Training:** The model progressively improves by refining its policy, with each iteration's model becoming the opponent for the next iteration.
12
+
13
+ Paper Authors: [Zixiang Chen](https://github.com/uclaml/SPIN)\*, [Yihe Deng](https://github.com/uclaml/SPIN)\*, [Huizhuo Yuan](https://scholar.google.com/citations?user=8foZzX4AAAAJ)\*, [Kaixuan Ji](https://scholar.google.com/citations?user=FOoKDukAAAAJ), [Quanquan Gu](https://web.cs.ucla.edu/~qgu/)
14
+
15
+ [[Webpage](https://uclaml.github.io/SPIN/)] [[Huggingface](https://huggingface.co/papers/2401.01335)] [[Paper](https://arxiv.org/abs/2401.01335)] [[Original Implementation](https://github.com/uclaml/SPIN)]
16
+
17
+ verl Implementation Authors: [Chendong Wang](https://cdwang96.github.io/), [Chenyang Zhao](https://github.com/zhaochenyang20)
18
+
19
+ ---
20
+
21
+ ## Key Function (compute_online_dpo_loss) and Related works
22
+ SPIN (Chen et al., 2024) proposes an iterative self-play mechanism to fine-tune language models. In each iteration, SPIN's training objective, when using a logistic loss function, is equivalent to Direct Preference Optimization (DPO) loss (Rafailov et al., 2023).
23
+
24
+ This `verl` recipe realizes SPIN's core concept by using DPO loss iteratively (Xu et al., 2023; Xiong et al., 2023; Snorkel AI, 2024). This means that in each iteration, we fine-tune the LLM using DPO loss for preference optimization. Notably, Xu et al. (2023) explored iterative preference optimization with pairwise cringe loss, while Xiong et al. (2023) discussed how to bridge theory and practice for RLHF under KL constraints using iterative training. The concept of iterative preference learning was also explored in online DPO (Guo et al., 2024), which focuses on direct alignment from online AI feedback. In online DPO, preference data is dynamically updated during training, allowing the model to learn from its own generated data.
25
+
26
+ Specifically, we developed the **`compute_online_dpo_loss`** function and built this SPIN recipe on top of it. By incorporating online preference generation, this approach enables continuously refining language models without relying on fixed external preference datasets.
27
+
28
+ **Reference Papers:**
29
+ * [Self-Play Fine-Tuning Converts Weak Language Models to Strong Language Models](https://arxiv.org/abs/2401.01335) (Chen et al., 2024)
30
+ * [Direct Preference Optimization: Your Language Model is Secretly a Reward Model](https://arxiv.org/abs/2305.18290) (Rafailov et al., 2023)
31
+ * [Some things are more cringe than others: Preference optimization with the pairwise cringe loss](https://arxiv.org/abs/2312.16682) (Xu et al., 2023)
32
+ * [Iterative preference learning from human feedback: Bridging theory and practice for rlhf under kl-constraint](https://arxiv.org/abs/2312.11456) (Xiong et al., 2023)
33
+ * [Snorkel-Mistral-PairRM-DPO](https://huggingface.co/snorkelai/Snorkel-Mistral-PairRM-DPO) (Snorkel AI, 2024)
34
+ * [Direct language model alignment from online ai feedback](https://arxiv.org/abs/2402.04792) (Guo et al., 2024)
35
+
36
+
37
+ ## Our Online DPO Implementation
38
+
39
+ Our `compute_online_dpo_loss` function adapts `verl`'s existing PPO infrastructure (based on `verl` v0.3.0.post1) for this iterative online DPO. Key aspects of our implementation include:
40
+
41
+ * **No Critic:** Unlike PPO, we omit the value function critic.
42
+ * **Dynamic Reference Model:** An explicit reference policy (`ref_policy_wg`) is used for DPO loss. This reference model's weights can be periodically updated from the actor (`ref_update_freq`), providing a dynamic baseline.
43
+ * **Online Preference Generation:** The `compute_onlineDPO_pref` function (in `core_algos.py`) dynamically creates chosen/rejected pairs based on a reward source (e.g., rule-based ranking for math problems).
44
+ * **DPO Loss Integration:** We replace PPO's policy loss with our `compute_online_dpo_loss` (in `core_algos.py`) within the actor update (`dp_actor.py`), directly optimizing the policy using the generated preferences.
45
+ * **Iterative Training Orchestration:** The `SpinTrainer` (in `spin_trainer.py`) manages the entire self-play loop: generation, preference labeling, optional reference model updates, and policy updates, enabling continuous self-improvement aligned with SPIN's principles.
46
+
47
+ ---
48
+ ## Algorithm
49
+
50
+ This recipe implements an Online algorithm adapted to the `verl` Reinforcement Learning framework, which provides an alternative to PPO for fine-tuning language models.
51
+
52
+ **Online Loop:** Instead of maximizing a scalar reward signal in PPO, this approach directly optimizes the policy model to align with preference data generated *online* during training:
53
+
54
+ 1. **Generation:** The current model generates multiple responses for each prompt in a batch.
55
+ 2. **Preference Labeling:** A function evaluates these generated responses to determine which one is preferred (chosen) and which is dispreferred (rejected). This can be done using a reward function or implicit ranking based on specific rules. (In this recipe, we use rule-based ranking on the math problem).
56
+ 3. **Update:** This preference tuple (`prompt`, `chosen_response`, `rejected_response`) is used to update the actor model using `compute_online_dpo_loss`, comparing against a reference model.
57
+
58
+ **Connection with SPIN:**
59
+ Instead of only using a fixed target data distribution, the online generation loop in step 2 will dynamically change the target data distribution by using a certain Preference Labeling method (rule-based ranking on the math problem by selecting the better one in this recipe). This explores the direction mentioned in SPIN's paper Section 7 about "dynamically changing target data distribution" to potentially elevate LLM performance beyond the fixed human-annotated data ceiling.
60
+
61
+ ---
62
+
63
+ ## Reproduce the Experiment (Example Setup)
64
+
65
+ The following steps outline how to set up the environment and run the SPIN recipe, based on the provided test log using GSM8K and Qwen2.5-3B-Instruct.
66
+
67
+ 1. **Setup Environment (Example using Docker):**
68
+ ```bash
69
+ # Start a container with GPU access and shared memory
70
+ docker run -it --name spin_test --gpus all \
71
+ --shm-size=32g \
72
+ --ipc=host \
73
+ -v /path/to/host/.cache:/root/.cache \
74
+ -e HF_TOKEN=<YOUR_HUGGINGFACE_TOKEN> \
75
+ lmsysorg/sglang:latest \
76
+ /bin/bash
77
+
78
+ # Inside the container or on your host machine:
79
+ # Ensure /tmp is writable
80
+ mkdir -p /tmp
81
+ chmod 1777 /tmp
82
+
83
+ # Install Python 3.10 (if not present) and venv
84
+ sudo apt update
85
+ sudo apt install -y python3.10 python3.10-venv tmux
86
+ python3 -m ensurepip --upgrade
87
+
88
+ # Create and activate a virtual environment
89
+ python3 -m venv ~/.python/spin_env
90
+ source ~/.python/spin_env/bin/activate
91
+
92
+ # Install uv (fast package installer)
93
+ python3 -m pip install uv
94
+ ```
95
+
96
+ 2. **Install verl and Dependencies:**
97
+ ```bash
98
+ # Clone the verl repository and checkout the spin branch
99
+ cd ~
100
+ git clone git@github.com:volcengine/verl.git && cd verl
101
+
102
+ # Install flash-attn (handle potential build issues)
103
+ python3 -m uv pip install wheel packaging
104
+ python3 -m uv pip install flash-attn --no-build-isolation --no-deps
105
+
106
+ # Install verl with sglang extras
107
+ python3 -m uv pip install -e ".[sglang]"
108
+ ```
109
+ *Note: If `flash-attn` installation fails, try the manual steps again or consult its documentation.*
110
+
111
+ 3. **Login & Download Data/Model:**
112
+ ```bash
113
+ # Login to Weights & Biases (optional, for logging)
114
+ export WANDB_API_KEY=<YOUR_WANDB_API_KEY>
115
+ # wandb login
116
+
117
+ # Download the GSM8K dataset
118
+ python3 examples/data_preprocess/gsm8k.py --local_save_dir ~/data/gsm8k # Adjusted path
119
+
120
+ # Download the base model (Example: Qwen2.5-3B-Instruct)
121
+ hf download Qwen/Qwen2.5-3B-Instruct --local-dir $HOME/models/Qwen2.5-3B-Instruct
122
+ ```
123
+
124
+ 4. **Configure:**
125
+ * Modify the configuration file (e.g., `config/spin_trainer.yaml` or the one specified in the run script) with correct paths to your downloaded model, data, desired hyperparameters (`dpo_beta`, learning rate, etc.), and distributed training settings (nodes, GPUs per node).
126
+ * Pay attention to `actor_rollout_ref.model`, `data` paths, `reward_model` config (if using one), and `trainer.ref_update_freq`.
127
+
128
+ 5. **Run Training:**
129
+ ```bash
130
+ # Set CUDA visible devices (adjust based on your hardware and config)
131
+ export CUDA_VISIBLE_DEVICES=0,1,2,3
132
+
133
+ # Launch the training script (e.g., test.sh or a custom script)
134
+ # Ensure test.sh points to the correct config and main script
135
+ bash recipe/spin/run_spin.sh
136
+ ```
137
+
138
+ ---
139
+
140
+ ## Configuration
141
+
142
+ * The primary configuration is typically managed through a YAML file specified in the launch script (e.g., `config/spin_trainer.yaml`).
143
+ * Key configuration sections:
144
+ * `data`: Paths to training/validation prompt files, batch sizes, sequence lengths.
145
+ * `actor_rollout_ref`: Paths to the base model (used for actor and initial reference), FSDP settings, optimization parameters (learning rate, scheduler).
146
+ * `reward_model`: Configuration for the reward model used for online preference labeling (path, batch size, etc.). Can be omitted if using a simpler reward function.
147
+ * `algorithm`: DPO-specific hyperparameters like `dpo_beta`, `dpo_loss_type`.
148
+ * `trainer`: Distributed training settings (nodes, GPUs per node), logging (WandB), checkpointing frequency, and `ref_update_freq` (set > 0 to enable periodic reference model updates from the actor).
149
+
150
+ ---
151
+
152
+ ## Key Files
153
+
154
+ * `main_spin.py`: Main entry point using Hydra to load the config and launch the `SpinTrainer`.
155
+ * `spin_trainer.py`: Defines the `SpinTrainer` class, orchestrating the Online DPO training loop.
156
+ * `fsdp_workers.py`: Implements Ray workers (Actor, Reference) potentially using FSDP.
157
+ * `dp_actor.py`: Contains the actor class, including the DPO policy update logic.
158
+ * `core_algos.py`: Includes helper functions for `compute_online_dpo_loss` and `compute_onlineDPO_pref`.
159
+ * `config/spin_trainer.yaml` (or similar): Main Hydra configuration file for the recipe.
160
+ * `run_spin.sh` (or similar): Example bash script for launching a training run.
161
+ * `README.md`: This file.
162
+
163
+ ---
164
+
165
+ ## Acknowledgement
166
+
167
+ We sincerely thank the contribution and guidance from the `verl` community and advisors, including (adapted from SPPO):
168
+
169
+ * [Zixiang Chen](https://sites.google.com/view/zxchen)
170
+ * [Yuhao Yang](https://github.com/yhyang201)
171
+ * [Yifan Zhang](https://github.com/yifanzhang-pro)
172
+ * [Yongan Xiang](https://github.com/BearBiscuit05)
173
+ * [Junrong Lin](https://github.com/ocss884)
174
+ * [Yuxuan Tong](https://github.com/tongyx361)
175
+ * [Guangming Shen](https://github.com/PeterSH6)
176
+ * [Biao He](https://www.linkedin.com/in/biao-he/)
177
+ * [Qingquan Song](https://qingquansong.github.io/)
178
+ * [Chenyang Zhao](https://zhaochenyang20.github.io/Chayenne/)
179
+ * [Quanquan Gu](https://web.cs.ucla.edu/~qgu/)
code/RL_model/verl/verl_train/docs/algo/sppo.md ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Recipe: Self-Play Preference Optimization (SPPO)
2
+
3
+ Last updated: 05/28/2025.
4
+
5
+ verl provides a community recipe implementation for the paper [Self-Play Preference Optimization for Language Model Alignment](https://arxiv.org/abs/2405.00675). SPPO can significantly enhance the performance of an LLM without strong external signals such as responses or preferences from GPT-4. It can outperform the model trained with iterative direct preference optimization (DPO), among other methods. SPPO is theoretically grounded, ensuring that the LLM can converge to the von Neumann winner (i.e., Nash equilibrium) under general, potentially intransitive preference, and empirically validated through extensive evaluations on multiple datasets.
6
+
7
+ Paper Authors: [Yue Wu](https://yuewu.us/)\*, [Zhiqing Sun](https://www.cs.cmu.edu/~zhiqings/)\*, [Huizhuo Yuan](https://scholar.google.com/citations?user=8foZzX4AAAAJ)\*, [Kaixuan Ji](https://scholar.google.com/citations?user=FOoKDukAAAAJ), [Yiming Yang](https://www.cs.cmu.edu/~yiming/), [Quanquan Gu](https://web.cs.ucla.edu/~qgu/)
8
+
9
+ verl Implementation Authors: [Yuhao Yang](https://github.com/yhyang201), [Chenyang Zhao](https://github.com/zhaochenyang20)
10
+
11
+ [[Webpage](https://uclaml.github.io/SPPO/)] [[Huggingface](https://huggingface.co/papers/2405.00675)] [[Paper](https://arxiv.org/abs/2405.00675)][[Original Implementation](https://github.com/uclaml/SPPO)]
12
+
13
+ ## Reproduce the Experiment
14
+
15
+ We evaluate the performance of SPPO on the MATH dataset. Starting from an initial score of 46.6 with Qwen2.5-7B-Instruct, we achieve a score of 65.6 after 20 epochs of training, placing our model approximately in the top 20 on the [MATH leaderboard](https://paperswithcode.com/sota/math-word-problem-solving-on-math). It's important to note that verl's internal evaluation metrics may not perfectly align with the official evaluation methodology for Qwen2.5-7B-Instruct. Therefore, for consistency and fair comparison, we report only the results based on verl's evaluation framework.
16
+
17
+ ```
18
+ git clone git@github.com:volcengine/verl.git
19
+ cd verl
20
+ python3 -m uv pip install -e ".[sglang]"
21
+
22
+ export WANDB_API_KEY=<YOUR_WANDB_API_KEY>
23
+
24
+ python3 examples/data_preprocess/math_dataset.py --local_dir ~/data/math
25
+ hf download Qwen/Qwen2.5-7B-Instruct --local-dir $HOME/models/Qwen2.5-7B-Instruct
26
+
27
+ export CUDA_VISIBLE_DEVICES=0,1,2,3
28
+ bash recipe/sppo/run_qwen2.5-7b_rm.sh
29
+ ```
30
+
31
+ Note that the installation would occasionally fail to install flash-attn. If this happens, you can install it manually by running:
32
+
33
+ ```bash
34
+ python3 -m uv pip install wheel
35
+ python3 -m uv pip install packaging
36
+ python3 -m uv pip install flash-attn --no-build-isolation --no-deps
37
+ ```
38
+
39
+ ## Acknowledgement
40
+
41
+ We sincerely thank the contribution and guidance from:
42
+
43
+ - [Yue Wu](https://yuewu.us/)
44
+ - [Chendong Wang](https://cdwang96.github.io/)
45
+ - [Yifan Zhang](https://github.com/yifanzhang-pro)
46
+ - [Yongan Xiang](https://github.com/BearBiscuit05)
47
+ - [Junrong Lin](https://github.com/ocss884)
48
+ - [Yuxuan Tong](https://github.com/tongyx361)
49
+ - [Guangming Shen](https://github.com/PeterSH6)
50
+ - [Biao He](https://www.linkedin.com/in/biao-he/)
51
+ - [Qingquan Song](https://qingquansong.github.io/)
52
+ - [Quanquan Gu](https://web.cs.ucla.edu/~qgu/)
code/RL_model/verl/verl_train/docs/amd_tutorial/amd_build_dockerfile_page.rst ADDED
@@ -0,0 +1,796 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Getting started with AMD (ROCM Kernel)
2
+ =====================================================
3
+
4
+ Last updated: 07/06/2025.
5
+
6
+ Author: `Yusheng Su <https://yushengsu-thu.github.io/>`_
7
+
8
+ Setup
9
+ -----
10
+
11
+ If you run on AMD GPUs (MI300) with ROCM platform, you cannot use the previous quickstart to run verl. You should follow the following steps to build a docker and set ``RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES`` or ``RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES`` when starting ray in verl's RLHF training.
12
+
13
+
14
+ docker/Dockerfile.rocm
15
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
16
+
17
+ .. code-block:: bash
18
+
19
+ FROM "rlsys/rocm-6.3.4-patch:rocm6.3.4-numa-patch_ubuntu-22.04"
20
+
21
+ SHELL ["/bin/bash", "-ceuxo", "pipefail"]
22
+
23
+ ENV MAX_JOBS=512
24
+
25
+ ENV PATH="/usr/local/python3.12/bin:$PATH"
26
+ RUN ln -sf /usr/bin/python3.12 /usr/bin/python && \
27
+ ln -sf /usr/bin/pip3.12 /usr/bin/pip
28
+
29
+ ############################################
30
+ RUN apt-get update
31
+ RUN apt-get install -y pkg-config liblzma-dev
32
+ ############################################
33
+
34
+ ###########################################
35
+ ##########Install TransformerEngine########
36
+ ###########################################
37
+ WORKDIR /workspace/
38
+ # transformer-engine install
39
+ # https://github.com/ROCm/TransformerEngine
40
+ RUN rm -rf TransformerEngine
41
+ RUN git clone --recursive https://github.com/ROCm/TransformerEngine.git
42
+ WORKDIR /workspace/TransformerEngine
43
+ RUN git checkout 236178e5
44
+ # git checkout bb061ade
45
+ # git checkout 864405c
46
+ ENV NVTE_FRAMEWORK=pytorch
47
+ ENV NVTE_ROCM_ARCH=gfx942
48
+ ENV NVTE_USE_HIPBLASLT=1
49
+ ENV NVTE_USE_ROCM=1
50
+ # export CMAKE_PREFIX_PATH="/opt/rocm:/opt/rocm/hip:/usr/local:/usr:${CMAKE_PREFIX_PATH:-}"
51
+ ENV CMAKE_PREFIX_PATH="/opt/rocm:/opt/rocm/hip:/usr/local:/usr"
52
+ RUN MAX_JOBS=${MAX_JOBS} pip install . -vvv
53
+ WORKDIR /workspace/
54
+ ###########################################
55
+ ###########################################
56
+ ###########################################
57
+
58
+
59
+
60
+
61
+
62
+ ####################################################################################
63
+ ################Install vllm - sglang require vllm 0.6.7 dependency#################
64
+ ####################################################################################
65
+ #### Require vllm 0.6.7 - checkout 113274a0
66
+ WORKDIR /workspace/
67
+ RUN rm -rf vllm
68
+ RUN pip uninstall -y vllm
69
+ # Refer to here (down-grade vllm to 0.6.3): https://docs.vllm.ai/en/v0.6.3/getting_started/amd-installation.html
70
+ RUN git clone https://github.com/ROCm/vllm.git
71
+ # git clone https://github.com/vllm-project/vllm.git
72
+ WORKDIR /workspace/vllm
73
+ RUN git checkout 113274a0
74
+ ENV PYTORCH_ROCM_ARCH="gfx90a;gfx942"
75
+ #ENV MAX_JOBS=512
76
+ ENV MAX_JOBS=${MAX_JOBS}
77
+ RUN pip install "boto3>=1.26.0"
78
+ RUN pip install setuptools_scm
79
+ # will add src into py. You can delete the repo
80
+ RUN python3 setup.py install
81
+ WORKDIR /workspace/
82
+ ####################################################################################
83
+ ####################################################################################
84
+ ####################################################################################
85
+
86
+
87
+
88
+ ###########################################
89
+ ############For hack docker################
90
+ ###########################################
91
+ RUN pip install setuptools==75.8.0
92
+ ###########################################
93
+ ###########################################
94
+ ###########################################
95
+
96
+
97
+
98
+ ###########################################
99
+ ############build sgalng###################
100
+ ###########################################
101
+ # Set environment variables
102
+ ENV BASE_DIR=/sgl-workspace
103
+ ENV BUILD_TYPE=all
104
+ ENV SGL_REPO=https://github.com/sgl-project/sglang
105
+ ENV SGL_BRANCH=v0.4.6.post5
106
+ ENV TRITON_REPO=https://github.com/ROCm/triton.git
107
+ ENV TRITON_COMMIT=improve_fa_decode_3.0.0
108
+ ENV AITER_REPO=https://github.com/ROCm/aiter.git
109
+ ENV AITER_COMMIT=v0.1.2
110
+ # v0.1.2 version - commit id: 9d11f47
111
+ # ENV AITER_COMMIT=9d11f47
112
+ ENV HIP_FORCE_DEV_KERNARG=1
113
+ ENV HSA_NO_SCRATCH_RECLAIM=1
114
+ ENV SGLANG_SET_CPU_AFFINITY=1
115
+ ENV SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1
116
+ ENV NCCL_MIN_NCHANNELS=112
117
+ ENV MOE_PADDING=1
118
+ ENV VLLM_FP8_PADDING=1
119
+ ENV VLLM_FP8_ACT_PADDING=1
120
+ ENV VLLM_FP8_WEIGHT_PADDING=1
121
+ ENV VLLM_FP8_REDUCE_CONV=1
122
+ ENV TORCHINDUCTOR_MAX_AUTOTUNE=1
123
+ ENV TORCHINDUCTOR_MAX_AUTOTUNE_POINTWISE=1
124
+ ENV HIPCC_COMPILE_FLAGS_APPEND="--offload-arch=gfx942"
125
+ ENV AMDGPU_TARGETS=gfx942
126
+ ENV ROCM_ARCH=gfx942
127
+ ENV PYTORCH_ROCM_ARCH="gfx90a;gfx942"
128
+ # Switch to working directory
129
+ WORKDIR /sgl-workspace
130
+ # Clean and create directory
131
+ RUN rm -rf /sgl-workspace && mkdir -p /sgl-workspace
132
+
133
+ # Clone and build sglang
134
+ RUN git clone ${SGL_REPO} \
135
+ && cd sglang \
136
+ && git checkout ${SGL_BRANCH} || echo "Using default branch" \
137
+ && cd sgl-kernel \
138
+ && rm -f pyproject.toml \
139
+ && mv pyproject_rocm.toml pyproject.toml \
140
+ && python setup_rocm.py install \
141
+ && cd .. \
142
+ && if [ "$BUILD_TYPE" = "srt" ]; then \
143
+ python -m pip --no-cache-dir install -e "python[srt_hip]"; \
144
+ else \
145
+ python -m pip --no-cache-dir install -e "python[all_hip]"; \
146
+ fi \
147
+ && cd /sgl-workspace \
148
+ && cp -r /sgl-workspace/sglang /sglang \
149
+ && python -m pip cache purge
150
+
151
+ # Install common Python packages
152
+ RUN pip install IPython orjson python-multipart torchao pybind11
153
+ # Rebuild Triton
154
+ RUN pip uninstall -y triton || true \
155
+ && git clone ${TRITON_REPO} \
156
+ && cd triton \
157
+ && git checkout ${TRITON_COMMIT} \
158
+ && cd python \
159
+ && python3 setup.py install \
160
+ && cd /sgl-workspace
161
+ # ENV HIPCC_COMPILE_FLAGS_APPEND="--offload-arch=gfx942 --amdgpu-lower-module-lds-strategy=1"
162
+ # ENV HIPCC_COMPILE_FLAGS_APPEND="--offload-arch=gfx942"
163
+
164
+ # Build aiter
165
+ #version: Commit 9d11f47
166
+ # && git checkout ${AITER_COMMIT} \
167
+ RUN pip uninstall -y aiter || true
168
+ RUN git clone ${AITER_REPO} \
169
+ && cd aiter \
170
+ && git checkout ${AITER_COMMIT} \
171
+ && git submodule sync \
172
+ && git submodule update --init --recursive \
173
+ && PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py install \
174
+ && cd /sgl-workspace
175
+
176
+ # Copy MI300X config
177
+ RUN find /sgl-workspace/sglang/python/sglang/srt/layers/quantization/configs/ \
178
+ /sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/ \
179
+ -type f -name '*MI300X*' | \
180
+ xargs -I {} sh -c 'vf_config=$(echo "$1" | sed "s/MI300X/MI300X_VF/"); cp "$1" "$vf_config"' -- {}
181
+
182
+ # Environment setup complete.
183
+ RUN echo "Environment setup complete."
184
+
185
+ WORKDIR /workspace/
186
+ ###########################################
187
+ ###########################################
188
+ ###########################################
189
+
190
+
191
+
192
+
193
+
194
+
195
+ ###########################################
196
+ ###############vllm v0.8.5#################
197
+ ###########################################
198
+ WORKDIR /workspace/
199
+
200
+ ENV VLLM_TARGET_DEVICE=rocm
201
+ ENV ROCM_PATH=/opt/rocm
202
+ ENV SETUPTOOLS_SCM_PRETEND_VERSION=0.8.5.dev
203
+ # Find the repo path in: DockerFile/Dockerfile.rocm_yang
204
+ # RUN git clone https://github.com/RLFoundation/vllm-patch.git
205
+ RUN pip uninstall -y vllm || true
206
+ RUN rm -rf vllm-patch
207
+ RUN git clone https://github.com/RLFoundation/vllm-patch.git \
208
+ && cd vllm-patch \
209
+ && git checkout v0.8.5-sleep-numa \
210
+ && rm -rf build/ dist/ *.egg-info \
211
+ && ln -sf /opt/rocm/lib/libamdhip64.so /usr/lib/libamdhip64.so \
212
+ && SETUPTOOLS_SCM_PRETEND_VERSION=0.8.5.dev PYTORCH_ROCM_ARCH="gfx90a;gfx942" MAX_JOBS=${MAX_JOBS} python3 setup.py install
213
+ # RUN SETUPTOOLS_SCM_PRETEND_VERSION=0.8.5.dev PYTORCH_ROCM_ARCH="gfx90a;gfx942" MAX_JOBS=${MAX_JOBS} python3 setup.py develop
214
+ WORKDIR /workspace/
215
+ ###########################################
216
+ ###########################################
217
+ ###########################################
218
+
219
+
220
+
221
+
222
+ #########################################
223
+ #### Install megatron-core###############
224
+ #########################################
225
+ RUN pip uninstall -y megatron-core && \
226
+ git clone https://github.com/yushengsu-thu/Megatron-LM-amd_version.git && \
227
+ cd Megatron-LM-amd_version && \
228
+ pip install -vvv -e . && \
229
+ cd /workspace/
230
+ #########################################
231
+ #########################################
232
+ #########################################
233
+
234
+
235
+
236
+
237
+ #######################################
238
+ ################apex###################
239
+ #######################################
240
+ WORKDIR /workspace/
241
+ RUN pip uninstall -y apex && \
242
+ git clone https://github.com/ROCm/apex.git && \
243
+ cd apex && \
244
+ python setup.py install && \
245
+ cd /workspace/
246
+ #######################################
247
+ #######################################
248
+ #######################################
249
+
250
+
251
+ ################################################################################
252
+ ###########################Add torch_memory_saver###############################
253
+ ################################################################################
254
+ # Set environment variables
255
+ ENV HIPCC_COMPILE_FLAGS_APPEND="--amdgpu-target=gfx90a;gfx942 -D__HIP_PLATFORM_AMD__"
256
+ ENV CFLAGS="-D__HIP_PLATFORM_AMD__"
257
+ ENV CXXFLAGS="-D__HIP_PLATFORM_AMD__"
258
+ RUN pip install "git+https://github.com/YangWang92/torch_memory_saver_numa.git@numa"
259
+ ################################################################################
260
+ ################################################################################
261
+ ################################################################################
262
+
263
+
264
+
265
+ ########################################
266
+ ######Install ray#######################
267
+ ########################################
268
+ # need to add this patch: https://github.com/ray-project/ray/pull/53531/files
269
+ RUN pip uninstall ray -y
270
+ RUN pip install "ray[data,train,tune,serve]>=2.47.0"
271
+ ########################################
272
+ ########################################
273
+ ########################################
274
+
275
+
276
+ ##########################################
277
+ #######Install other dependencies#########
278
+ ##########################################
279
+ RUN pip install "tensordict==0.6.2" --no-deps && \
280
+ pip install accelerate \
281
+ codetiming \
282
+ datasets \
283
+ dill \
284
+ hydra-core \
285
+ liger-kernel \
286
+ numpy \
287
+ pandas \
288
+ peft \
289
+ "pyarrow>=15.0.0" \
290
+ pylatexenc \
291
+ torchdata \
292
+ wandb \
293
+ orjson \
294
+ pybind11
295
+
296
+ WORKDIR /workspace/
297
+ RUN git clone https://github.com/volcengine/verl.git && \
298
+ cd verl && \
299
+ pip install -e .
300
+ ##########################################
301
+ ##########################################
302
+ ##########################################
303
+
304
+ WORKDIR /workspace/
305
+ CMD ["/usr/bin/bash"]
306
+
307
+
308
+ Build the image:
309
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
310
+
311
+ .. code-block:: bash
312
+
313
+ docker build -f docker/Dockerfile.rocm -t verl-rocm .
314
+
315
+ Run the container
316
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
317
+
318
+ Note: You can pull the pre-built docker image from this DockerHub organization: `RLSys Foundation <https://hub.docker.com/u/yushengsuthu>`_.
319
+ Pull the image:
320
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
321
+
322
+ .. code-block:: bash
323
+
324
+ docker pull rlsys/verl:verl-0.4.1_ubuntu-22.04_rocm6.3.4-numa-patch_vllm0.8.5_sglang0.4.6.post4
325
+
326
+ docker tag rlsys/verl:verl-0.4.1_ubuntu-22.04_rocm6.3.4-numa-patch_vllm0.8.5_sglang0.4.6.post4 verl-rocm:latest
327
+
328
+ Run the container
329
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
330
+
331
+
332
+ Optional: Running without root and with user permissions
333
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
334
+
335
+ .. code-block:: bash
336
+
337
+ docker run --rm -it \
338
+ --device /dev/dri \
339
+ --device /dev/kfd \
340
+ -p 8265:8265 \
341
+ --group-add video \
342
+ --cap-add SYS_PTRACE \
343
+ --security-opt seccomp=unconfined \
344
+ --privileged \
345
+ -v $HOME/.ssh:/root/.ssh \
346
+ -v $HOME:$HOME \
347
+ --shm-size 128G \
348
+ -w $PWD \
349
+ verl-rocm \
350
+ /bin/bash
351
+
352
+ (Optional): If you do not want to run in root mode and would like to assign yourself as the user,
353
+ Please add ``-e HOST_UID=$(id -u)`` and ``-e HOST_GID=$(id -g)`` into the above docker launch script.
354
+
355
+ Example
356
+ -------
357
+
358
+ Due to the special settings in AMD (ROCm) torch:
359
+ 1. If your ``ray>=2.45.0`` (default), you need to set ``RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES`` when starting Ray in verl's RLHF training and add this `patch <https://github.com/ray-project/ray/pull/53531/files>`_.
360
+ 2. If your ``ray<2.45.0``, you need to set ``RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES`` when starting ray in verl's RLHF training.
361
+ Inference ``$ENGINE`` can be ``vllm`` or ``sglang``. We choose ``vllm`` as default in the following examples.
362
+
363
+
364
+
365
+ PPO
366
+ ~~~
367
+
368
+ .. code-block:: bash
369
+
370
+ YOUR_PROJECT_NAME=r1-verl-ppo-upstream
371
+ YOUR_RUN_NAME=r1-training_ppo-upstream
372
+ # export HYDRA_FULL_ERROR=1
373
+
374
+ export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
375
+
376
+ # [ray] < 2.45.0
377
+ #export RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
378
+
379
+ # [ray] >= 2.45.0
380
+ export RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES=1 # Patch with https://github.com/ray-project/ray/pull/52794
381
+
382
+ GPUS_PER_NODE=8
383
+ MODEL_PATH=Qwen/Qwen2.5-0.5B-Instruct
384
+ python3 examples/data_preprocess/gsm8k.py --local_save_dir data/gsm8k
385
+ python3 -c "import transformers; transformers.pipeline('text-generation', model='$MODEL_PATH')"
386
+ ENGINE=vllm #sglang
387
+
388
+ PYTHONUNBUFFERED=1 python3 -m verl.trainer.main_ppo \
389
+ data.train_files=data/gsm8k/train.parquet \
390
+ data.val_files=data/gsm8k/test.parquet \
391
+ data.train_batch_size=256 \
392
+ data.val_batch_size=1312 \
393
+ data.max_prompt_length=512 \
394
+ data.max_response_length=256 \
395
+ actor_rollout_ref.model.path=$MODEL_PATH \
396
+ actor_rollout_ref.actor.optim.lr=1e-6 \
397
+ actor_rollout_ref.actor.ppo_mini_batch_size=64 \
398
+ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
399
+ actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8 \
400
+ actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
401
+ actor_rollout_ref.rollout.name=$ENGINE \
402
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.8 \
403
+ actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
404
+ critic.optim.lr=1e-5 \
405
+ critic.model.path=$MODEL_PATH \
406
+ critic.ppo_micro_batch_size_per_gpu=4 \
407
+ algorithm.kl_ctrl.kl_coef=0.001 \
408
+ trainer.logger=console \
409
+ trainer.project_name=$YOUR_PROJECT_NAME \
410
+ trainer.experiment_name=$YOUR_RUN_NAME \
411
+ trainer.val_before_train=False \
412
+ trainer.n_gpus_per_node=$GPUS_PER_NODE \
413
+ trainer.nnodes=1 \
414
+ trainer.save_freq=10 \
415
+ trainer.test_freq=10 \
416
+ trainer.total_epochs=15 #2>&1 | tee verl_demo.log
417
+
418
+ GRPO
419
+ ~~~~
420
+
421
+ .. code-block:: bash
422
+
423
+ YOUR_PROJECT_NAME=r1-verl-grpo-upstream
424
+ YOUR_RUN_NAME=r1-training_grpo-upstream
425
+ # export HYDRA_FULL_ERROR=1
426
+ # export FSDP_VERBOSE=1
427
+
428
+ #export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
429
+
430
+ # [ray] < 2.45.0
431
+ #export RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
432
+
433
+ # [ray] >= 2.45.0
434
+ export RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES=1 # Patch with https://github.com/ray-project/ray/pull/52794
435
+
436
+ GPUS_PER_NODE=8
437
+ MODEL_PATH=Qwen/Qwen2.5-0.5B-Instruct
438
+ # MODEL_PATH=Qwen/Qwen2-7B-Instruct
439
+ python3 examples/data_preprocess/gsm8k.py --local_save_dir data/gsm8k
440
+ python3 -c "import transformers; transformers.pipeline('text-generation', model='$MODEL_PATH')"
441
+ ENGINE=vllm #sglang
442
+
443
+ python3 -m verl.trainer.main_ppo \
444
+ algorithm.adv_estimator=grpo \
445
+ data.train_files=data/gsm8k/train.parquet \
446
+ data.val_files=data/gsm8k/test.parquet \
447
+ data.train_batch_size=1024 \
448
+ data.val_batch_size=1312 \
449
+ data.max_prompt_length=512 \
450
+ data.max_response_length=1024 \
451
+ actor_rollout_ref.model.path=$MODEL_PATH \
452
+ actor_rollout_ref.actor.optim.lr=1e-6 \
453
+ actor_rollout_ref.model.use_remove_padding=True \
454
+ actor_rollout_ref.actor.ppo_mini_batch_size=256 \
455
+ actor_rollout_ref.actor.use_dynamic_bsz=True \
456
+ actor_rollout_ref.actor.ppo_max_token_len_per_gpu=24000 \
457
+ actor_rollout_ref.actor.use_kl_loss=True \
458
+ actor_rollout_ref.actor.kl_loss_coef=0.001 \
459
+ actor_rollout_ref.actor.kl_loss_type=low_var_kl \
460
+ actor_rollout_ref.model.enable_gradient_checkpointing=False \
461
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
462
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
463
+ actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
464
+ actor_rollout_ref.rollout.name=$ENGINE \
465
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.8 \
466
+ actor_rollout_ref.rollout.n=5 \
467
+ actor_rollout_ref.ref.fsdp_config.param_offload=False \
468
+ algorithm.kl_ctrl.kl_coef=0.001 \
469
+ trainer.critic_warmup=0 \
470
+ trainer.logger=console \
471
+ trainer.project_name=$YOUR_PROJECT_NAME \
472
+ trainer.experiment_name=$YOUR_RUN_NAME \
473
+ trainer.n_gpus_per_node=$GPUS_PER_NODE \
474
+ trainer.val_before_train=False \
475
+ trainer.nnodes=1 \
476
+ trainer.save_freq=-1 \
477
+ trainer.test_freq=10 \
478
+ trainer.total_epochs=15
479
+
480
+
481
+
482
+ Multi-node training: slurm with Docker/Podman container
483
+ ---------------------------------------------------------------------------------------
484
+
485
+ If you want to run multi-node training with slurm, you can use the following script.
486
+
487
+ .. note::
488
+ 1. You need to use ``podman`` or ``docker`` in the following script. We will release the apptainer script later.
489
+ 2. If you want to use ``podman``, you just replace ``docker`` with ``podman`` in the following script.
490
+
491
+ The script includes the following steps:
492
+
493
+ 1. SLURM Configuration
494
+ 2. Environment Setup
495
+ 3. Docker/Podman Container Setup
496
+ 4. Ray Cluster Initialization
497
+ 5. Data Preprocessing
498
+ 6. Model Setup
499
+ 7. Training Launch
500
+
501
+
502
+ slurm_script.sh
503
+ ~~~~~~~~~~~~~~~~~~~~
504
+
505
+ .. code-block:: bash
506
+
507
+ #!/bin/bash
508
+
509
+ #SBATCH --job-name=verl-ray-on-slurm
510
+ #SBATCH --nodes=2
511
+ #SBATCH --ntasks-per-node=2
512
+ #SBATCH --mem=200G
513
+ #SBATCH --time=30-00:00:00
514
+ #SBATCH --gpus-per-node=8
515
+ #SBATCH --cpus-per-task=28
516
+ #SBATCH --output=../verl_log/slurm-%j.out
517
+ #SBATCH --error=../verl_log/slurm-%j.err
518
+ #SBATCH --nodelist=gpu-[0,1]
519
+
520
+
521
+ # load necessary modules
522
+ ### Run this setup
523
+ # [Cluster]: Use docker
524
+ # docker pull docker.io/rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4
525
+
526
+
527
+ ##########################################################################
528
+ ###The following setting should be set in different project and cluster###
529
+ ##########################################################################
530
+
531
+ ### Project
532
+ CONTAINER_NAME="multinode_verl_training"
533
+ IMG="verl.rocm"
534
+ DOCKERFILE="docker/Dockerfile.rocm"
535
+ # echo $PWD
536
+ verl_workdir="${HOME}/projects/verl_upstream"
537
+ export TRANSFORMERS_CACHE="${HOME}/.cache/huggingface"
538
+ export HF_HOME=$TRANSFORMERS_CACHE
539
+
540
+ ### Cluster Network Setting
541
+ export NCCL_DEBUG=TRACE
542
+ export GPU_MAX_HW_QUEUES=2
543
+ export TORCH_NCCL_HIGH_PRIORITY=1
544
+ export NCCL_CHECKS_DISABLE=1
545
+ # export NCCL_IB_HCA=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7
546
+ export NCCL_IB_HCA=mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_8,mlx5_9
547
+ export NCCL_IB_GID_INDEX=3
548
+ export NCCL_CROSS_NIC=0
549
+ export CUDA_DEVICE_MAX_CONNECTIONS=1
550
+ export NCCL_PROTO=Simple
551
+ export RCCL_MSCCL_ENABLE=0
552
+ export TOKENIZERS_PARALLELISM=false
553
+ export HSA_NO_SCRATCH_RECLAIM=1
554
+ ##########################################################################
555
+
556
+ ## Assign using GPUs
557
+ export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
558
+
559
+ ### For rocm and training script
560
+ # [ray] < 2.45.0
561
+ #export RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
562
+
563
+ # [ray] >= 2.45.0
564
+ export RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES=1 # Patch with https://github.com/ray-project/ray/pull/52794
565
+
566
+
567
+ # Build and launch the Docker container
568
+ srun bash -c "
569
+ # Exit on any error
570
+ set -e
571
+
572
+ # Clean up dangling images (images with <none> tag)
573
+ docker image prune -f
574
+
575
+ # Need to pull the docker first
576
+ docker pull rlsys/verl:verl-0.4.1_ubuntu-22.04_rocm6.3.4-numa-patch_vllm0.8.5_sglang0.4.6.post4
577
+
578
+ if ! docker images --format "{{.Repository}}:{{.Tag}}" | grep -q "${IMG}"; then
579
+ echo \"Building ${IMG} image...\"
580
+ docker build -f \"${DOCKERFILE}\" -t \"${IMG}\" .
581
+ else
582
+ echo \"${IMG} image already exists, skipping build\"
583
+ fi
584
+
585
+ # Removing old container if exists
586
+ docker rm \"${CONTAINER_NAME}\" 2>/dev/null || true
587
+
588
+ # Checking network devices
589
+ ibdev2netdev
590
+
591
+ # Launch the docker
592
+ docker run --rm -d \
593
+ -e HYDRA_FULL_ERROR=1 \
594
+ -e RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1 \
595
+ -e RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES=1 \
596
+ -e NCCL_DEBUG=${NCCL_DEBUG} \
597
+ -e GPU_MAX_HW_QUEUES=${GPU_MAX_HW_QUEUES} \
598
+ -e TORCH_NCCL_HIGH_PRIORITY=${TORCH_NCCL_HIGH_PRIORITY} \
599
+ -e NCCL_CHECKS_DISABLE=${NCCL_CHECKS_DISABLE} \
600
+ -e NCCL_IB_HCA=${NCCL_IB_HCA} \
601
+ -e NCCL_IB_GID_INDEX=${NCCL_IB_GID_INDEX} \
602
+ -e NCCL_CROSS_NIC=${NCCL_CROSS_NIC} \
603
+ -e CUDA_DEVICE_MAX_CONNECTIONS=${CUDA_DEVICE_MAX_CONNECTIONS} \
604
+ -e NCCL_PROTO=${NCCL_PROTO} \
605
+ -e RCCL_MSCCL_ENABLE=${RCCL_MSCCL_ENABLE} \
606
+ -e TOKENIZERS_PARALLELISM=${TOKENIZERS_PARALLELISM} \
607
+ -e HSA_NO_SCRATCH_RECLAIM=${HSA_NO_SCRATCH_RECLAIM} \
608
+ -e TRANSFORMERS_CACHE=${TRANSFORMERS_CACHE} \
609
+ -e HF_HOME=${HF_HOME} \
610
+ --network host \
611
+ --device /dev/dri \
612
+ --device /dev/kfd \
613
+ --device /dev/infiniband \
614
+ --group-add video \
615
+ --cap-add SYS_PTRACE \
616
+ --security-opt seccomp=unconfined \
617
+ --privileged \
618
+ -v \${HOME}:\${HOME} \
619
+ -v \${HOME}/.ssh:/root/.ssh \
620
+ -w "${verl_workdir}" \
621
+ --shm-size 128G \
622
+ --name \"${CONTAINER_NAME}\" \
623
+ \"${IMG}\" \
624
+ tail -f /dev/null
625
+
626
+ echo \"Container setup completed\"
627
+ "
628
+ # (Optional): If you do not want to run in root mode and would like to assign yourself as the user,
629
+ # Please add `-e HOST_UID=$(id -u)` and `-e HOST_GID=$(id -g)` into the above docker launch script.
630
+
631
+
632
+
633
+
634
+
635
+ ### Ray launch the nodes before training
636
+
637
+ # Getting the node names
638
+ nodes_array=($(scontrol show hostnames "$SLURM_JOB_NODELIST" | tr '\n' ' '))
639
+
640
+ head_node=${nodes_array[0]}
641
+ head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address)
642
+
643
+ # if we detect a space character in the head node IP, we'll
644
+ # convert it to an ipv4 address. This step is optional.
645
+ if [[ "$head_node_ip" == *" "* ]]; then
646
+ IFS=' ' read -ra ADDR <<<"$head_node_ip"
647
+ if [[ ${#ADDR[0]} -gt 16 ]]; then
648
+ head_node_ip=${ADDR[1]}
649
+ else
650
+ head_node_ip=${ADDR[0]}
651
+ fi
652
+ echo "IPV6 address detected. We split the IPV4 address as $head_node_ip"
653
+ fi
654
+
655
+ port=6379
656
+ ip_head=$head_node_ip:$port
657
+ export ip_head
658
+ echo "IP Head: $ip_head"
659
+
660
+ # make sure we set environment variables before Ray initialization
661
+
662
+ # Print out all env variables
663
+ printenv
664
+
665
+ echo "Starting HEAD at $head_node"
666
+ srun --nodes=1 --ntasks=1 -w "$head_node" \
667
+ docker exec "${CONTAINER_NAME}" \
668
+ ray start --head --node-ip-address="$head_node_ip" --port=$port \
669
+ --dashboard-port=8266 \
670
+ --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus "${SLURM_GPUS_PER_NODE}" --block &
671
+ # optional, though may be useful in certain versions of Ray < 1.0.
672
+ sleep 10
673
+
674
+ # number of nodes other than the head node
675
+ worker_num=$((SLURM_JOB_NUM_NODES - 1))
676
+
677
+ for ((i = 1; i <= worker_num; i++)); do
678
+ node_i=${nodes_array[$i]}
679
+ echo "Debug: Starting worker on node_i = ${node_i}"
680
+ if [ -z "$node_i" ]; then
681
+ echo "Error: Empty node name for worker $i"
682
+ continue
683
+ fi
684
+ echo "Starting WORKER $i at $node_i"
685
+ srun --nodes=1 --ntasks=1 -w "$node_i" \
686
+ docker exec "${CONTAINER_NAME}" \
687
+ ray start --address "$ip_head" --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus "${SLURM_GPUS_PER_NODE}" --block &
688
+ sleep 5
689
+ done
690
+
691
+
692
+
693
+
694
+ # Ray initialization test (see whether any error occurred in the above execution)
695
+ echo "Testing Ray initialization in the slurm nodes..."
696
+ docker exec "${CONTAINER_NAME}" python3 -c '
697
+ import ray
698
+ try:
699
+ ray.init(address="auto")
700
+ print("\n=== Ray Cluster Status ===")
701
+ print(f"Number of nodes: {len(ray.nodes())}")
702
+ for node in ray.nodes():
703
+ print("Node: {}, Status: {}".format(node["NodeManagerHostname"], node["Alive"]))
704
+ # print(f"Node: {node}")
705
+ ray.shutdown()
706
+ print("Ray initialization successful!")
707
+ except Exception as e:
708
+ print(f"Ray initialization failed: {str(e)}")
709
+ '
710
+ echo "=== Ray test completed ==="
711
+ ######
712
+
713
+
714
+
715
+ # Run data preprocessing
716
+
717
+ echo "Starting data preprocessing..."
718
+ docker exec "${CONTAINER_NAME}" \
719
+ python3 "examples/data_preprocess/gsm8k.py" "--local_save_dir" "../data/gsm8k"
720
+
721
+ echo "Starting data preprocessing..."
722
+ docker exec "${CONTAINER_NAME}" \
723
+ python3 "examples/data_preprocess/math_dataset.py" "--local_dir" "../data/math"
724
+
725
+ train_files="../data/gsm8k/train.parquet"
726
+ val_files="../data/gsm8k/test.parquet"
727
+
728
+ # Download and test model
729
+ echo "Loading model..."
730
+ docker exec "${CONTAINER_NAME}" \
731
+ python3 -c "import transformers; transformers.pipeline('text-generation', model='Qwen/Qwen2-7B-Instruct')"
732
+ MODEL_PATH="Qwen/Qwen2-7B-Instruct"
733
+
734
+ # Set model path after pipeline test
735
+ MODEL_PATH="Qwen/Qwen2.5-0.5B-Instruct"
736
+
737
+ echo "== Data and model loading Done =="
738
+
739
+ echo "Start to train..."
740
+
741
+ docker exec "${CONTAINER_NAME}" \
742
+ python3 -c "import transformers; transformers.pipeline('text-generation', model='Qwen/Qwen2-7B-Instruct')"
743
+ MODEL_PATH="Qwen/Qwen2-7B-Instruct"
744
+
745
+
746
+ PYTHONUNBUFFERED=1 srun --overlap --nodes=${SLURM_NNODES} --ntasks=1 -w "$head_node" \
747
+ docker exec "${CONTAINER_NAME}" \
748
+ python3 -m verl.trainer.main_ppo \
749
+ data.train_files=$train_files \
750
+ data.val_files=$val_files \
751
+ data.train_batch_size=1024 \
752
+ data.max_prompt_length=1024 \
753
+ data.max_response_length=1024 \
754
+ actor_rollout_ref.model.path=$MODEL_PATH \
755
+ actor_rollout_ref.model.enable_gradient_checkpointing=False \
756
+ actor_rollout_ref.actor.optim.lr=1e-6 \
757
+ actor_rollout_ref.model.use_remove_padding=True \
758
+ actor_rollout_ref.actor.ppo_mini_batch_size=256 \
759
+ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=8 \
760
+ actor_rollout_ref.model.enable_gradient_checkpointing=True \
761
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
762
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
763
+ actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \
764
+ actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
765
+ actor_rollout_ref.rollout.name=vllm \
766
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \
767
+ actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \
768
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
769
+ critic.optim.lr=1e-5 \
770
+ critic.model.use_remove_padding=True \
771
+ critic.model.path=$MODEL_PATH \
772
+ critic.model.enable_gradient_checkpointing=False \
773
+ critic.ppo_micro_batch_size_per_gpu=8 \
774
+ critic.model.fsdp_config.param_offload=False \
775
+ critic.model.fsdp_config.optimizer_offload=False \
776
+ algorithm.kl_ctrl.kl_coef=0.0001 \
777
+ trainer.critic_warmup=0 \
778
+ trainer.logger='["console","wandb"]' \
779
+ trainer.project_name='verl_example' \
780
+ trainer.experiment_name='Qwen2.5-32B-Instruct_function_rm' \
781
+ trainer.n_gpus_per_node=${SLURM_GPUS_PER_NODE} \
782
+ trainer.val_before_train=False \
783
+ trainer.nnodes=${SLURM_NNODES} \
784
+ trainer.save_freq=-1 \
785
+ trainer.test_freq=10 \
786
+ trainer.total_epochs=15
787
+
788
+
789
+ Run slurm_script.sh
790
+ ~~~~~~~~~~~~~~~~~~~~
791
+ Just sbatch your slurm_script.sh
792
+
793
+ .. code-block:: bash
794
+
795
+ sbatch slurm_script.sh
796
+
code/RL_model/verl/verl_train/docs/amd_tutorial/amd_vllm_page.rst ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ verl performance tuning for AMD (ROCm Kernel)
2
+ =====================================================
3
+
4
+ Last updated: 11/13/2025.
5
+
6
+ Author: `Yang Wang <https://github.com/YangWang92/>`_, `Songlin Jiang <https://github.com/HollowMan6/>`_
7
+
8
+ Use vLLM Sleep Mode for AMD MI3xx series GPUs
9
+ --------------------------------------------------------------
10
+
11
+ By default, verl requires vLLM to enable sleep mode, which allows vLLM to offload GPU memory to CPU memory after rollout. This feature has been merged into the main branch of vLLM for version later than 0.11.0.
12
+
13
+ For now, you can use the vLLM main branch and build it from the source code, or you can directly install vLLM from the pre-built ROCm wheels for vLLM version later than 0.11.0 when it's available.
14
+
15
+ 1. Clone the vLLM repository and build it with the following commands:
16
+
17
+ .. code-block:: bash
18
+
19
+ git clone https://github.com/vllm-project/vllm.git
20
+ cd vllm
21
+ git reset --hard 4ca5cd5740c0cd7788cdfa8b7ec6a27335607a48 # You can also use a later commit as you wish
22
+ python -m pip install -r requirements/rocm.txt
23
+ VLLM_TARGET_DEVICE=rocm ROCM_PATH=/opt/rocm/ python3 setup.py develop
24
+
25
+ 2. Additionally, we recommend you to use the ROCm version later than or equal to ROCm 7.0.
26
+
27
+ After the upgrade, you can verify whether sleep mode is working by trying out `these scripts <https://github.com/EmbeddedLLM/inference-experiment/tree/main/sleep_mode>`_.
28
+
29
+ If sleep mode is working, you should see the memory usage reduce after sleep.
30
+
31
+ After applying the vLLM patch and completing the installation, you can enable sleep mode in verl to reduce memory overhead. This allows verl to offload unused GPU memory during rollout, significantly lowering the memory footprint during long-context training or multi-node reinforcement learning.
32
+
33
+
34
+ Enable CUDA Graph and Bypass ROCm-related issues
35
+ --------------------------------------------------------------
36
+
37
+ Due to potential issues with CUDA graph capture in ROCm, we've found that vLLM's CUDA graph feature cannot be enabled on multiple nodes in verl on AMD platforms with vLLM V1 mode. This leads to significantly slower rollout performance.
38
+
39
+ Our investigation shows that ROCm may trigger an unexpected crash when attempting to capture large batches with CUDA graph. One workaround is to set ``actor_rollout_ref.rollout.cudagraph_capture_sizes`` to values such as ``[1, 2, 4, 8, 16, 32, 64]`` (change depending on your GPU memory size).
40
+
41
+ Then, you can choose to enable CUDA graph by setting ``actor_rollout_ref.rollout.enforce_eager`` to ``False`` in your verl configuration file.
code/RL_model/verl/verl_train/docs/api/data.rst ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Data interface
2
+ =========================
3
+
4
+ Last updated: 05/19/2025 (API docstrings are auto-generated).
5
+
6
+ DataProto is the interface for data exchange.
7
+
8
+ The :class:`verl.DataProto` class contains two key members:
9
+
10
+ - batch: a :class:`tensordict.TensorDict` object for the actual data
11
+ - meta_info: a :class:`Dict` with additional meta information
12
+
13
+ TensorDict
14
+ ~~~~~~~~~~~~
15
+
16
+ :attr:`DataProto.batch` is built on top of :class:`tensordict`, a project in the PyTorch ecosystem.
17
+ A TensorDict is a dict-like container for tensors. To instantiate a TensorDict, you must specify key-value pairs as well as the batch size.
18
+
19
+ .. code-block:: python
20
+
21
+ >>> import torch
22
+ >>> from tensordict import TensorDict
23
+ >>> tensordict = TensorDict({"zeros": torch.zeros(2, 3, 4), "ones": torch.ones(2, 3, 5)}, batch_size=[2,])
24
+ >>> tensordict["twos"] = 2 * torch.ones(2, 5, 6)
25
+ >>> zeros = tensordict["zeros"]
26
+ >>> tensordict
27
+ TensorDict(
28
+ fields={
29
+ ones: Tensor(shape=torch.Size([2, 3, 5]), device=cpu, dtype=torch.float32, is_shared=False),
30
+ twos: Tensor(shape=torch.Size([2, 5, 6]), device=cpu, dtype=torch.float32, is_shared=False),
31
+ zeros: Tensor(shape=torch.Size([2, 3, 4]), device=cpu, dtype=torch.float32, is_shared=False)},
32
+ batch_size=torch.Size([2]),
33
+ device=None,
34
+ is_shared=False)
35
+
36
+ One can also index a tensordict along its batch_size. The contents of the TensorDict can be manipulated collectively as well.
37
+
38
+ .. code-block:: python
39
+
40
+ >>> tensordict[..., :1]
41
+ TensorDict(
42
+ fields={
43
+ ones: Tensor(shape=torch.Size([1, 3, 5]), device=cpu, dtype=torch.float32, is_shared=False),
44
+ twos: Tensor(shape=torch.Size([1, 5, 6]), device=cpu, dtype=torch.float32, is_shared=False),
45
+ zeros: Tensor(shape=torch.Size([1, 3, 4]), device=cpu, dtype=torch.float32, is_shared=False)},
46
+ batch_size=torch.Size([1]),
47
+ device=None,
48
+ is_shared=False)
49
+ >>> tensordict = tensordict.to("cuda:0")
50
+ >>> tensordict = tensordict.reshape(6)
51
+
52
+ For more about :class:`tensordict.TensorDict` usage, see the official tensordict_ documentation.
53
+
54
+ .. _tensordict: https://pytorch.org/tensordict/stable/overview.html
55
+
56
+
57
+ Core APIs
58
+ ~~~~~~~~~~~~~~~~~
59
+
60
+ .. autoclass:: verl.DataProto
61
+ :members: to, select, union, make_iterator, concat
code/RL_model/verl/verl_train/docs/api/single_controller.rst ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Single Controller interface
2
+ ============================
3
+
4
+ Last updated: 05/27/2025 (API docstrings are auto-generated).
5
+
6
+ The Single Controller provides a unified interface for managing distributed workers
7
+ using Ray or other backends and executing functions across them.
8
+ It simplifies the process of dispatching tasks and collecting results, particularly
9
+ when dealing with data parallelism or model parallelism.
10
+
11
+
12
+ Core APIs
13
+ ~~~~~~~~~~~~~~~~~
14
+
15
+ .. autoclass:: verl.single_controller.Worker
16
+ :members: __init__, __new__, get_master_addr_port, get_cuda_visible_devices, world_size, rank
17
+
18
+ .. autoclass:: verl.single_controller.WorkerGroup
19
+ :members: __init__, world_size
20
+
21
+ .. autoclass:: verl.single_controller.ClassWithInitArgs
22
+ :members: __init__, __call__
23
+
24
+ .. autoclass:: verl.single_controller.ResourcePool
25
+ :members: __init__, world_size, local_world_size_list, local_rank_list
26
+
27
+ .. autoclass:: verl.single_controller.ray.RayWorkerGroup
28
+ :members: __init__
29
+
30
+ .. autofunction:: verl.single_controller.ray.create_colocated_worker_cls
code/RL_model/verl/verl_train/docs/api/trainer.rst ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Trainer Interface
2
+ ================================
3
+
4
+ Last updated: 06/08/2025 (API docstrings are auto-generated).
5
+
6
+ Trainers drive the training loop. Introducing new trainer classes for new training paradigms is encouraged.
7
+
8
+ .. autosummary::
9
+ :nosignatures:
10
+
11
+ verl.trainer.ppo.ray_trainer.RayPPOTrainer
12
+
13
+
14
+ Core APIs
15
+ ~~~~~~~~~~~~~~~~~
16
+
17
+ .. autoclass:: verl.trainer.ppo.ray_trainer.RayPPOTrainer
18
+ :members: __init__, init_workers, fit
19
+
20
+ .. automodule:: verl.utils.tokenizer
21
+ :members: hf_tokenizer
22
+
23
+ .. automodule:: verl.trainer.ppo.core_algos
24
+ :members: agg_loss, kl_penalty, compute_policy_loss
25
+
26
+ .. automodule:: verl.trainer.ppo.reward
27
+ :members: load_reward_manager, compute_reward, compute_reward_async
28
+
29
+ .. autoclass:: verl.workers.reward_manager.NaiveRewardManager
30
+
31
+ .. autoclass:: verl.workers.reward_manager.DAPORewardManager
code/RL_model/verl/verl_train/docs/api/utils.rst ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Utilities
2
+ ============
3
+
4
+ Last updated: 05/19/2025 (API docstrings are auto-generated).
5
+
6
+ This section documents the utility functions and classes in the VERL library.
7
+
8
+ Python Functional Utilities
9
+ ------------------------------
10
+
11
+ .. automodule:: verl.utils.py_functional
12
+ :members: append_to_dict
13
+
14
+ File System Utilities
15
+ ------------------------
16
+
17
+ .. automodule:: verl.utils.fs
18
+ :members: copy_to_local
19
+
20
+ Tracking Utilities
21
+ ---------------------
22
+
23
+ .. automodule:: verl.utils.tracking
24
+ :members: Tracking
25
+
26
+ Metrics Utilities
27
+ ---------------------
28
+
29
+ .. automodule:: verl.utils.metric
30
+ :members: reduce_metrics
31
+
32
+ Checkpoint Management
33
+ ------------------------
34
+
35
+ .. automodule:: verl.utils.checkpoint.checkpoint_manager
36
+ :members: find_latest_ckpt_path
37
+
38
+ .. automodule:: verl.utils.checkpoint.fsdp_checkpoint_manager
39
+ :members: FSDPCheckpointManager
40
+
41
+ Dataset Utilities
42
+ ---------------------
43
+
44
+ .. automodule:: verl.utils.dataset.rl_dataset
45
+ :members: RLHFDataset, collate_fn
46
+
47
+ Torch Functional Utilities
48
+ -----------------------------
49
+
50
+ .. automodule:: verl.utils.torch_functional
51
+ :members: get_constant_schedule_with_warmup, masked_whiten, masked_mean, logprobs_from_logits
52
+
53
+ Sequence Length Balancing
54
+ ----------------------------
55
+
56
+ .. automodule:: verl.utils.seqlen_balancing
57
+ :members: get_reverse_idx, rearrange_micro_batches
58
+
59
+ Ulysses Utilities
60
+ --------------------
61
+
62
+ .. automodule:: verl.utils.ulysses
63
+ :members: gather_outputs_and_unpad, ulysses_pad_and_slice_inputs
64
+
65
+ FSDP Utilities
66
+ ------------------
67
+
68
+ .. automodule:: verl.utils.fsdp_utils
69
+ :members: get_fsdp_wrap_policy, get_init_weight_context_manager, init_fn, load_fsdp_model_to_gpu, load_fsdp_optimizer, offload_fsdp_model_to_cpu, offload_fsdp_optimizer
70
+
71
+ Debug Utilities
72
+ -------------------
73
+
74
+ .. automodule:: verl.utils.profiler
75
+ :members: log_gpu_memory_usage, GPUMemoryLogger
76
+
code/RL_model/verl/verl_train/docs/ascend_tutorial/ascend_consistency.rst ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Align the Inference results of the verl and vLLM frameworks on Ascend devices(zh)
2
+ =====================================================================================
3
+
4
+ 在昇腾设备上对齐verl和vLLM两个框架下的推理结果。
5
+
6
+ Last updated: 11/17/2025.
7
+
8
+ 这是一份在昇腾设备上对齐verl和vLLM两个框架下推理结果的教程。
9
+
10
+ 环境变量配置
11
+ ~~~~~~~~~~~~
12
+
13
+ 在多卡通信情况下:
14
+
15
+ - HCCL通信下(默认场景):
16
+
17
+ - export CLOSE_MATMUL_K_SHIFT=1
18
+ - export ATB_MATMUL_SHUFFLE_K_ENABLE=0
19
+ - export HCCL_DETERMINISTIC="true"
20
+ - export VLLM_ENABLE_V1_MULTIPROCESSING=0
21
+
22
+ - LCCL通信下(通过export HCCL_OP_EXPANSION_MODE="AIV"使能):
23
+
24
+ - export CLOSE_MATMUL_K_SHIFT=1
25
+ - export ATB_MATMUL_SHUFFLE_K_ENABLE=0
26
+ - export LCCL_DETERMINISTIC=1
27
+ - export ATB_LLM_LCOC_ENABLE=0
28
+ - export VLLM_ENABLE_V1_MULTIPROCESSING=0
29
+
30
+ 在单卡无通信情况下:
31
+
32
+ - HCCL和LCCL通信下:
33
+
34
+ - export CLOSE_MATMUL_K_SHIFT=1
35
+ - export ATB_MATMUL_SHUFFLE_K_ENABLE=0
36
+ - export VLLM_ENABLE_V1_MULTIPROCESSING=0
37
+
38
+ vLLM初始化参数
39
+ ~~~~~~~~~~~~
40
+
41
+ 需要对 SamplingParams 参数里单独设置seed, 保持vLLM和verl推理结果一致, 举例修改如下:
42
+
43
+ .. code:: python
44
+
45
+ sampling_params = SamplingParams(n=1,
46
+ logprobs=0, # can be set to 0 and let actor to recompute
47
+ max_tokens=config.response_length,
48
+ repetition_penalty=config.get("repetition_penalty", 1.0),
49
+ seed=1234)
50
+
code/RL_model/verl/verl_train/docs/ascend_tutorial/ascend_profiling_en.rst ADDED
@@ -0,0 +1,403 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Performance data collection based on FSDP or MindSpeed(Megatron) on Ascend devices(en)
2
+ ==========================================================================================
3
+
4
+ Last updated: 12/20/2025.
5
+
6
+ This is a tutorial for data collection using the GRPO or DAPO algorithm
7
+ based on FSDP or MindSpeed(Megatron) on Ascend devices.
8
+
9
+ Configuration
10
+ -------------
11
+
12
+ Leverage two levels of configuration to control data collection:
13
+
14
+ - **Global profiler control**: Use parameters in ``verl/trainer/config/ppo_trainer.yaml`` (FSDP) or ``verl/trainer/config/ppo_megatron_trainer.yaml`` (MindSpeed) to control the collection mode and steps.
15
+ - **Role profiler control**: Use parameters in each role's ``profiler`` field to control collection behavior for that role.
16
+
17
+ Global collection control
18
+ ~~~~~~~~~~~~~~~~~~~~~~~~~
19
+
20
+ Use parameters in ppo_trainer.yaml to control the collection mode
21
+ and steps.
22
+
23
+ - global_profiler: Control the ranks and mode of profiling
24
+
25
+ - tool: The profiling tool to use, options are nsys, npu, torch,
26
+ torch_memory.
27
+ - steps: This parameter can be set as a list that has
28
+ collection steps, such as [2, 4], which means it will collect steps 2
29
+ and 4. If set to null, no collection occurs.
30
+ - save_path: The path to save the collected data. Default is
31
+ "outputs/profile".
32
+
33
+
34
+ Role collection control
35
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
36
+
37
+ In each role's ``profiler`` field, you can control the collection mode for that role.
38
+
39
+ - enable: Whether to enable profiling for this role.
40
+ - all_ranks: Whether to collect data from all ranks.
41
+ - ranks: A list of ranks to collect data from. If empty, no data is collected.
42
+ - tool_config: Configuration for the profiling tool used by this role.
43
+
44
+ Use parameters in each role's ``profiler.tool_config.npu`` to control npu profiler behavior:
45
+
46
+ - level: Collection level—options are level_none, level0, level1, and
47
+ level2
48
+
49
+ - level_none: Disables all level-based data collection (turns off profiler_level).
50
+ - level0: Collect high-level application data, underlying NPU data, and operator execution details on NPU. After balancing data volume and analytical capability, Level 0 is recommended as the default configuration.
51
+ - level1: Extends level0 by adding CANN-layer AscendCL data and AI Core performance metrics on NPU.
52
+ - level2: Extends level1 by adding CANN-layer Runtime data and AI CPU metrics.
53
+
54
+ - contents: A list of options to control the collection content, such as
55
+ npu, cpu, memory, shapes, module, stack.
56
+
57
+ - npu: Whether to collect device-side performance data.
58
+ - cpu: Whether to collect host-side performance data.
59
+ - memory: Whether to enable memory analysis.
60
+ - shapes: Whether to record tensor shapes.
61
+ - module: Whether to record framework-layer Python call stack information. It is recommended to use 'module' instead of 'stack' for recording call stack information, as it costs less performance overhead.
62
+ - stack: Whether to record operator call stack information.
63
+
64
+ - analysis: Enables automatic data parsing.
65
+ - discrete: Whether to enable discrete mode.
66
+
67
+
68
+ Examples
69
+ --------
70
+
71
+ Disabling collection
72
+ ~~~~~~~~~~~~~~~~~~~~
73
+
74
+ .. code:: yaml
75
+
76
+ global_profiler:
77
+ steps: null # disable profile
78
+
79
+ End-to-End collection
80
+ ~~~~~~~~~~~~~~~~~~~~~
81
+
82
+ .. code:: yaml
83
+
84
+ global_profiler:
85
+ steps: [1, 2, 5]
86
+ save_path: ./outputs/profile
87
+ actor_rollout_ref:
88
+ actor: # Set actor role profiler collection configuration parameters
89
+ profiler:
90
+ enable: True
91
+ all_ranks: True
92
+ tool_config:
93
+ npu:
94
+ discrete: False
95
+ contents: [npu, cpu] # Control collection list, default cpu, npu, can configure memory, shapes, module, etc.
96
+ # rollout & ref follow actor settings
97
+
98
+
99
+ Discrete Mode Collection
100
+ ~~~~~~~~~~~~~~~~~~~~~~~~
101
+
102
+ .. code:: yaml
103
+
104
+ global_profiler:
105
+ steps: [1, 2, 5]
106
+ save_path: ./outputs/profile
107
+ actor_rollout_ref:
108
+ actor:
109
+ profiler:
110
+ enable: True # Set to True to profile training
111
+ all_ranks: False
112
+ ranks: [0] # Global Rank 0
113
+ tool_config:
114
+ npu:
115
+ discrete: True
116
+ contents: [npu, cpu]
117
+ rollout:
118
+ profiler:
119
+ enable: True # Set to True to profile inference
120
+ all_ranks: False
121
+ ranks: [0] # In Agent Loop mode, this is the Replica Rank (e.g., 0-th instance)
122
+ tool_config:
123
+ npu:
124
+ discrete: True # Must be enabled in Agent Loop mode
125
+ # ref follow actor settings
126
+
127
+ **Agent Loop Scenario Description**:
128
+
129
+ When Rollout runs in `Agent Loop <../advance/agent_loop.rst>`_ mode, performance data for the Rollout phase **must be collected using discrete mode**. At this time, the Profiler is triggered by the inference engine backend.
130
+
131
+ 1. **Rank Meaning**: ``ranks`` in the Rollout config refers to the **Replica Rank** (instance index), not the global rank.
132
+ 2. **Inference Engine Setup**:
133
+
134
+ - **vLLM Engine**
135
+ - **Must be configured via environment variables**:
136
+ - ``VLLM_TORCH_PROFILER_DIR``: Directory to save traces (**Required**).
137
+ - ``VLLM_TORCH_PROFILER_WITH_STACK``: Control stack tracing (1: on, 0: off, default: on).
138
+ - ``VLLM_TORCH_PROFILER_RECORD_SHAPES``: Set to 1 to record shapes of operator inputs.
139
+ - ``VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY``: Set to 1 to track tensor memory allocation/free.
140
+ - ``VLLM_TORCH_PROFILER_WITH_FLOPS``: Set to 1 to estimate FLOPS.
141
+ - *Note: vLLM ignores the save_path and contents in yaml.*
142
+
143
+ - **SGLang Engine**
144
+ - **Zero Configuration**. Automatically reads configuration from ``ppo_trainer.yaml``.
145
+
146
+
147
+ Visualization
148
+ -------------
149
+
150
+ Collected data is stored in the user-defined save_path and can be
151
+ visualized by using the `MindStudio Insight <https://www.hiascend.com/document/detail/zh/mindstudio/80RC1/GUI_baseddevelopmenttool/msascendinsightug/Insight_userguide_0002.html>`_ tool.
152
+
153
+ Additionally, in a Linux environment, the MindStudio Insight tool is provided in the form of a `JupyterLab Plugin <https://www.hiascend.com/document/detail/zh/mindstudio/82RC1/GUI_baseddevelopmenttool/msascendinsightug/Insight_userguide_0130.html>`_, offering a more intuitive and highly interactive user interface. The advantages of the JupyterLab plugin are as follows:
154
+
155
+ - Seamless integration: Supports running the MindStudio Insight tool directly within the Jupyter environment, eliminating the need to switch platforms or copy data from the server, enabling data to be collected and used immediately.
156
+ - Fast startup: Allows MindStudio Insight to be launched quickly via the JupyterLab command line or graphical interface.
157
+ - Smooth operation: In a Linux environment, launching MindStudio Insight through JupyterLab effectively alleviates performance lag compared to the full-package communication mode, significantly improving the user experience.
158
+ - Remote access: Supports remotely launching MindStudio Insight. Users can connect to the service via a local browser for direct visual analysis, reducing the difficulty of uploading and downloading data during large-model training or inference.
159
+
160
+ If the analysis parameter is set to False, offline parsing is required after data collection:
161
+
162
+ .. code:: python
163
+
164
+ import torch_npu
165
+ # Set profiler_path to the parent directory of the "localhost.localdomain_<PID>_<timestamp>_ascend_pt" folder
166
+ torch_npu.profiler.profiler.analyse(profiler_path=profiler_path)
167
+
168
+
169
+ Advanced Guide: Fine-grained Collection
170
+ ---------------------------------------
171
+
172
+ Background and Challenges
173
+ ~~~~~~~~~~~~~~~~~~~~~~~~~
174
+
175
+ Although the configuration-based collection method mentioned above is convenient, it faces challenges in training scenarios with **long sequences (Long Context)** or **large global batch sizes (Large Global Batch Size)**. Within a complete training step (Step), model computation exhibits high-frequency and repetitive characteristics:
176
+
177
+ 1. **Rollout phase**: Sequence generation (Generate Sequence) is an autoregressive process involving thousands of forward computations of the Decoder model.
178
+ 2. **Training phase**: To control peak memory usage, verl typically adopts a Micro-Batch strategy, dividing large data streams into multiple micro-batches for computation.
179
+
180
+ - **compute_log_prob (Actor/Ref)**: Involves multiple rounds of pure forward propagation.
181
+ - **update_policy (Actor/Critic)**: Involves multiple rounds of forward and backward propagation.
182
+
183
+ This characteristic leads to massive and repetitive operator records from full profiling. As shown in the image below:
184
+
185
+ .. image:: https://raw.githubusercontent.com/mengchengTang/verl-data/master/verl_ascend_profiler.png
186
+
187
+ Even with ``discrete`` mode enabled, performance data files for a single stage can still reach several TB, leading to **parsing failures** or **visualization tool lag**.
188
+
189
+ Solution: Critical Path Sampling
190
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
191
+
192
+ To solve the above problems, we can adopt a **critical path sampling** strategy: Based on the API interface provided by `torch_npu.profiler <https://www.hiascend.com/document/detail/zh/canncommercial/80RC2/devaids/auxiliarydevtool/atlasprofiling_16_0038.html>`_, directly modify Python source code to collect only representative data segments (such as specific Decode Steps or the first Micro-Batch).
193
+
194
+ **Important Notes**
195
+
196
+ 1. This chapter involves direct source code modification. It is recommended to back up files before modification and restore them after debugging.
197
+ 2. When using code instrumentation for collection, be sure to **disable global collection** (``global_profiler: steps: null``) in ``ppo_trainer.yaml`` or ``ppo_megatron_trainer.yaml`` to avoid Profiler conflicts.
198
+
199
+ 1. Fine-grained Collection in Rollout Phase
200
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
201
+
202
+ For vLLM or SGLang inference engines, we can control the ``schedule`` parameter to collect model forward propagation performance data for specific tokens.
203
+
204
+ **vLLM Engine**
205
+
206
+ - **Reference Version**: vLLM v0.11.0, vLLM-Ascend v0.11.0rc1
207
+ - **Modified File**: ``vllm-ascend/vllm_ascend/worker/worker_v1.py``
208
+
209
+ .. code-block:: diff
210
+
211
+ class NPUWorker(WorkerBase):
212
+
213
+ def __init__(self, *args, **kwargs):
214
+ # ... existing code ...
215
+
216
+ + # Initialize profiler
217
+ + import torch_npu
218
+ + experimental_config = torch_npu.profiler._ExperimentalConfig(
219
+ + profiler_level=torch_npu.profiler.ProfilerLevel.Level1,
220
+ + export_type=torch_npu.profiler.ExportType.Db, # You can choose torch_npu.profiler.ExportType.Text format
221
+ + )
222
+ + self.profiler_npu = torch_npu.profiler.profile(
223
+ + activities=[torch_npu.profiler.ProfilerActivity.CPU, torch_npu.profiler.ProfilerActivity.NPU],
224
+ + with_modules=False, # Collect call stack
225
+ + profile_memory=False, # Collect memory
226
+ + experimental_config=experimental_config,
227
+ + # Skip first step, warmup one step, collect 3 steps, repeat 1 time. If you want to collect decode steps 30~70, set schedule=torch_npu.profiler.schedule(wait=29, warmup=1, active=30, repeat=1)
228
+ + schedule=torch_npu.profiler.schedule(wait=1, warmup=1, active=3, repeat=1),
229
+ + on_trace_ready=torch_npu.profiler.tensorboard_trace_handler("./outputs/vllm_profile", analyse_flag=True) # Data save path and whether to parse online
230
+ + )
231
+ + self.profiler_npu.start()
232
+
233
+ # ... existing code ...
234
+
235
+ def execute_model(self, scheduler_output=None, intermediate_tensors=None, **kwargs):
236
+ # ... existing code ...
237
+ output = self.model_runner.execute_model(scheduler_output,
238
+ intermediate_tensors)
239
+
240
+ + self.profiler_npu.step() # Drive schedule to collect partial decode steps
241
+
242
+ # ... existing code ...
243
+
244
+ **SGLang Engine**
245
+
246
+ - **Reference Version**: SGLang master branch
247
+ - **Modified File**: ``sglang/python/sglang/srt/model_executor/model_runner.py``
248
+
249
+ .. code-block:: diff
250
+
251
+ # ... existing imports ...
252
+ + import torch_npu
253
+
254
+ class ModelRunner:
255
+
256
+ def __init__(self, *args, **kwargs):
257
+ # ... existing init code ...
258
+
259
+ + # Initialize profiler (same configuration as above, omitted)
260
+ + experimental_config = torch_npu.profiler._ExperimentalConfig(...)
261
+ + self.profiler_npu = torch_npu.profiler.profile(
262
+ + # ...
263
+ + # Skip first step, warmup one step, collect 3 steps, repeat 1 time.
264
+ + schedule=torch_npu.profiler.schedule(wait=1, warmup=1, active=3, repeat=1),
265
+ + on_trace_ready=torch_npu.profiler.tensorboard_trace_handler("./outputs/sglang_profile", analyse_flag=True)
266
+ + )
267
+ + self.profiler_npu.start()
268
+
269
+ def forward(self, forward_batch, **kwargs):
270
+ # ... existing code ...
271
+
272
+ + self.profiler_npu.step() # Drive schedule to collect partial decode steps
273
+ return output
274
+
275
+ 2. Fine-grained Collection in compute_log_prob (Actor & Ref) Phase
276
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
277
+
278
+ This phase computes probability distributions for new and old policies.
279
+
280
+ **FSDP Backend**
281
+
282
+ The FSDP backend allows fine-grained control at the Micro-Batch level.
283
+
284
+ - **Modified File**: ``verl/workers/actor/dp_actor.py``
285
+
286
+ .. code-block:: diff
287
+
288
+ # ... import dependencies ...
289
+ + import torch_npu
290
+
291
+ class DataParallelPPOActor(BasePPOActor):
292
+
293
+ def compute_log_prob(self, data: DataProto, calculate_entropy=False) -> torch.Tensor:
294
+
295
+ + role = "Ref" if self.actor_optimizer is None else "Actor"
296
+ + # Prepare profiler (same configuration as above, omitted)
297
+ + experimental_config = torch_npu.profiler._ExperimentalConfig(...)
298
+ + self.prof_npu = torch_npu.profiler.profile(
299
+ + # ...
300
+ + # wait=0, warmup=0, active=1: directly collect first micro-batch
301
+ + schedule=torch_npu.profiler.schedule(wait=0, warmup=0, active=1, repeat=1),
302
+ + on_trace_ready=torch_npu.profiler.tensorboard_trace_handler(f"./outputs/{role}_compute_log_prob", analyse_flag=True)
303
+ + )
304
+
305
+
306
+ + # This function is shared by ref and actor, set role flag to distinguish. If you want to collect actor_compute_log_prob, set if role=="Actor":
307
+ + if role=="Ref":
308
+ + self.prof_npu.start()
309
+
310
+ for micro_batch in micro_batches:
311
+
312
+ # ... original computation logic ...
313
+ with torch.no_grad():
314
+ entropy, log_probs = self._forward_micro_batch(...)
315
+
316
+ + # Drive schedule to collect micro batch
317
+ + if role=="Ref":
318
+ + self.prof_npu.step()
319
+
320
+ # ...
321
+
322
+
323
+ **Megatron Backend**
324
+
325
+ The Micro-Batch scheduling in the Megatron backend is managed internally by the framework and does not currently support fine-grained collection at the Micro-Batch level through simple code instrumentation. It is recommended to use global configuration for collection.
326
+
327
+ 3. Fine-grained Collection in update_policy (Actor & Critic) Phase
328
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
329
+
330
+ The Update phase includes forward and backward propagation.
331
+
332
+ **FSDP Backend**
333
+
334
+ The FSDP backend supports collection at both Mini-Batch and Micro-Batch granularities.
335
+
336
+ - **Modified File**: ``verl/workers/actor/dp_actor.py``
337
+
338
+ .. code-block:: diff
339
+
340
+ # ... import dependencies ...
341
+ + import torch_npu
342
+
343
+ class DataParallelPPOActor(BasePPOActor):
344
+
345
+ def update_policy(self, data: DataProto):
346
+
347
+ + # Prepare profiler (same configuration as above, omitted)
348
+ + experimental_config = torch_npu.profiler._ExperimentalConfig(...)
349
+ + self.prof_npu = torch_npu.profiler.profile(
350
+ + # ...
351
+ + # Only collect first Mini Batch (including all Micro-Batch computations and one optimizer update)
352
+ + schedule=torch_npu.profiler.schedule(wait=0, warmup=0, active=1, repeat=1),
353
+ + on_trace_ready=torch_npu.profiler.tensorboard_trace_handler("./outputs/fsdp_actor_update_profile", analyse_flag=True)
354
+ + )
355
+ + self.prof_npu.start()
356
+
357
+ # ... PPO Epochs loop ...
358
+ for _ in range(self.config.ppo_epochs):
359
+ # ... Mini Batch loop ...
360
+ for batch_idx, mini_batch in enumerate(mini_batches):
361
+ # ... mini_batches split ...
362
+
363
+ for i, micro_batch in enumerate(micro_batches):
364
+ # ... Original Forward & Backward logic ...
365
+ # ... loss.backward() ...
366
+ pass
367
+
368
+ grad_norm = self._optimizer_step()
369
+
370
+ + # Drive schedule to collect mini batch, if you want micro batch collection, move self.prof_npu.step() inside the micro_batch loop
371
+ + self.prof_npu.step()
372
+
373
+
374
+ **Megatron Backend**
375
+
376
+ The Megatron backend supports collection at the Mini-Batch granularity.
377
+
378
+ - **Modified File**: ``verl/workers/actor/megatron_actor.py``
379
+
380
+ .. code-block:: diff
381
+
382
+ class MegatronPPOActor(BasePPOActor):
383
+
384
+ def update_policy(self, dataloader: Iterable[DataProto]) -> dict:
385
+ # ...
386
+ + # Prepare profiler (same configuration as above, omitted)
387
+ + experimental_config = torch_npu.profiler._ExperimentalConfig(...)
388
+ + self.prof_npu = torch_npu.profiler.profile(
389
+ + # ...
390
+ + # Only collect computation of first Mini Batch (including all Micro-Batches) and one optimizer update
391
+ + schedule=torch_npu.profiler.schedule(wait=0, warmup=0, active=1, repeat=1),
392
+ + on_trace_ready=torch_npu.profiler.tensorboard_trace_handler("./outputs/megatron_actor_update_profile", analyse_flag=True)
393
+ + )
394
+ + self.prof_npu.start()
395
+
396
+ for data in dataloader:
397
+ # ... internally calls self.forward_backward_batch for computation ...
398
+ # ... metric_micro_batch = self.forward_backward_batch(...)
399
+
400
+ # ... self.actor_optimizer.step() ...
401
+
402
+ + # Drive schedule to collect mini batch
403
+ + self.prof_npu.step()
code/RL_model/verl/verl_train/docs/ascend_tutorial/ascend_profiling_zh.rst ADDED
@@ -0,0 +1,398 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Performance data collection based on FSDP or MindSpeed(Megatron) on Ascend devices(zh)
2
+ ==========================================================================================
3
+
4
+ 在昇腾设备上基于 FSDP 或 MindSpeed (Megatron) 后端进行性能数据采集
5
+ ----------------------------------------------------------------
6
+
7
+ Last updated: 12/20/2025.
8
+
9
+ 这是一份在昇腾设备上基于FSDP或MindSpeed(Megatron)后端,使用GRPO或DAPO算法进行数据采集的教程。
10
+
11
+ 配置
12
+ ----
13
+
14
+ 使用两级profile设置来控制数据采集
15
+
16
+ - 全局采集控制:使用verl/trainer/config/ppo_trainer.yaml(FSDP),或verl/trainer/config/ppo_megatron_trainer.yaml(MindSpeed)中的配置项控制采集的模式和步数。
17
+ - 角色profiler控制:通过每个角色的 ``profiler`` 配置项控制该角色的各项采集参数。
18
+
19
+ 全局采集控制
20
+ ~~~~~~~~~~~~
21
+
22
+ 通过 ppo_trainer.yaml 中的参数控制采集步数和模式:
23
+
24
+ - global_profiler: 控制采集的rank和模式
25
+
26
+ - tool: 使用的采集工具,选项有 nsys、npu、torch、torch_memory。
27
+ - steps: 此参数可以设置为包含采集步数的列表,例如 [2, 4],表示将采集第2步和第4步。如果设置为 null,则不进行采集。
28
+ - save_path: 保存采集数据的路径。默认值为 "outputs/profile"。
29
+
30
+ 角色profiler控制
31
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
32
+
33
+ 在每个角色的 ``profiler`` 字段中,您可以控制该角色的采集模式。
34
+
35
+ - enable: 是否为此角色启用性能分析。
36
+ - all_ranks: 是否从所有rank收集数据。
37
+ - ranks: 要收集数据的rank列表。如果为空,则不收集数据。
38
+ - tool_config: 此角色使用的性能分析工具的配置。
39
+
40
+ 通过每个角色的 ``profiler.tool_config.npu`` 中的参数控制具体采集行为:
41
+
42
+ - level: 采集级别—选项有 level_none、level0、level1 和 level2
43
+
44
+ - level_none: 禁用所有基于级别的数据采集(关闭 profiler_level)。
45
+ - level0: 采集高级应用数据、底层NPU数据和NPU上的算子执行详情。在权衡数据量和分析能力后,level0是推荐的默认配置。
46
+ - level1: 在level0基础上增加CANN层AscendCL数据和NPU上的AI Core性能指标。
47
+ - level2: 在level1基础上增加CANN层Runtime数据和AI CPU指标。
48
+
49
+ - contents: 控制采集内容的选项列表,例如
50
+ npu、cpu、memory、shapes、module、stack。
51
+
52
+ - npu: 是否采集设备端性能数据。
53
+ - cpu: 是否采集主机端性能数据。
54
+ - memory: 是否启用内存分析。
55
+ - shapes: 是否记录张量形状。
56
+ - module: 是否记录框架层Python调用栈信息。相较于stack,更推荐使用module记录调用栈信息,因其产生的性能膨胀更低。
57
+ - stack: 是否记录算子调用栈信息。
58
+
59
+ - analysis: 启用自动数据解析。
60
+ - discrete: 使用离散模式。
61
+
62
+ 示例
63
+ ----
64
+
65
+ 禁用采集
66
+ ~~~~~~~~~~~~~~~~~~~~
67
+
68
+ .. code:: yaml
69
+
70
+ global_profiler:
71
+ steps: null # disable profile
72
+
73
+ 端到端采集
74
+ ~~~~~~~~~~~~~~~~~~~~~
75
+
76
+ .. code:: yaml
77
+
78
+ global_profiler:
79
+ steps: [1, 2, 5]
80
+ save_path: ./outputs/profile
81
+ actor_rollout_ref:
82
+ actor: # 设置 actor role 的 profiler 采集配置参数
83
+ profiler:
84
+ enable: True
85
+ all_ranks: True
86
+ tool_config:
87
+ npu:
88
+ discrete: False
89
+ contents: [npu, cpu] # 控制采集列表,默认cpu、npu,可配置memory、shapes、module等
90
+
91
+ # rollout & ref follow actor settings
92
+
93
+
94
+ 离散模式采集
95
+ ~~~~~~~~~~~~~~~~~~~~~~~~
96
+
97
+ .. code:: yaml
98
+
99
+ global_profiler:
100
+ steps: [1, 2, 5]
101
+ save_path: ./outputs/profile
102
+ actor_rollout_ref:
103
+ actor:
104
+ profiler:
105
+ enable: True # 设置为 True 以采集训练阶段
106
+ all_ranks: False
107
+ ranks: [0] # 全局 Rank 0
108
+ tool_config:
109
+ npu:
110
+ discrete: True
111
+ contents: [npu, cpu]
112
+ rollout:
113
+ profiler:
114
+ enable: True # 设置为 True 以采集推理阶段
115
+ all_ranks: False
116
+ ranks: [0] # 在 Agent Loop 模式下,此处指推理实例的 Replica Rank (例如第 0 个实例)
117
+ tool_config:
118
+ npu:
119
+ discrete: True # Agent Loop 模式下必须开启离散模式
120
+ # ref follow actor settings
121
+
122
+ **Agent Loop 场景说明**:
123
+
124
+ 当 Rollout 运行在 `Agent Loop <../advance/agent_loop.rst>`_ 模式时,Rollout 阶段的性能数据 **必须使用离散模式** 采集。此时 Profiler 由推理引擎后端触发,配置要求如下:
125
+
126
+ 1. **Rank 含义**:Rollout 配置中的 ``ranks`` 指代 **Replica Rank**(实例索引),而非全局 Rank。
127
+ 2. **推理引擎配置**:
128
+
129
+ - **vLLM 引擎**
130
+ - **必须通过环境变量配置**:
131
+ - ``VLLM_TORCH_PROFILER_DIR``: 设置数据保存路径(**必选**)。
132
+ - ``VLLM_TORCH_PROFILER_WITH_STACK``: 是否记录调用栈 (1开启, 0关闭,默认开启)。
133
+ - ``VLLM_TORCH_PROFILER_RECORD_SHAPES``: 设置为 1 以记录形状。
134
+ - ``VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY``: 设置为 1 以记录内存。
135
+ - ``VLLM_TORCH_PROFILER_WITH_FLOPS``: 设置为 1 以估算 FLOPS。
136
+ - *注意:vLLM 会忽略 yaml 中的 save_path 和 contents。*
137
+
138
+ - **SGLang 引擎**
139
+ - **零配置**。自动读取 ``ppo_trainer.yaml`` 中的配置。
140
+
141
+
142
+ 可视化
143
+ ------
144
+
145
+ 采集后的数据存放在用户设置的save_path下,可通过 `MindStudio Insight <https://www.hiascend.com/document/detail/zh/mindstudio/80RC1/GUI_baseddevelopmenttool/msascendinsightug/Insight_userguide_0002.html>`_ 工具进行可视化。
146
+
147
+ 另外在Linux环境下,MindStudio Insight工具提供了 `JupyterLab插件 <https://www.hiascend.com/document/detail/zh/mindstudio/82RC1/GUI_baseddevelopmenttool/msascendinsightug/Insight_userguide_0130.html>`_ 形态,提供更直观和交互式强的操作界面。JupyterLab插件优势如下:
148
+
149
+ - 无缝集成:支持在Jupyter环境中直接运行MindStudio Insight工具,无需切换平台,无需拷贝服务器上的数据,实现数据即采即用。
150
+ - 快速启动:通过JupyterLab的命令行或图形界面,可快速启动MindStudio Insight工具。
151
+ - 运行流畅:在Linux环境下,通过JupyterLab环境启动MindStudio Insight,相较于整包通信,有效解决了运行卡顿问题,操作体验显著提升。
152
+ - 远程访问:支持远程启动MindStudio Insight,可通过本地浏览器远程连接服务直接进行可视化分析,缓解了大模型训练或推理数据上传和下载的困难。
153
+
154
+ 如果analysis参数设置为False,采集之后需要进行离线解析:
155
+
156
+ .. code:: python
157
+
158
+ import torch_npu
159
+ # profiler_path请设置为"localhost.localdomain_<PID>_<timestamp>_ascend_pt"目录的上一级目录
160
+ torch_npu.profiler.profiler.analyse(profiler_path=profiler_path)
161
+
162
+
163
+ 进阶指南:精细化采集
164
+ --------------------
165
+
166
+ 背景与挑战
167
+ ~~~~~~~~~~
168
+
169
+ 上述基于配置文件的采集方式虽然便捷,但在 **长序列 (Long Context)** 或 **大全局批量 (Large Global Batch Size)** 的训练场景中面临挑战。
170
+ 在一个完整的训练步 (Step) 内,模型计算呈现出高频次、重复性的特征:
171
+
172
+ 1. Rollout 阶段:序列生成 (Generate Sequence) 是一个自回归过程,涉及成千上万次 Decoder 模型的前向计算。
173
+ 2. Training 阶段:为了控制显存峰值,verl 通常采用 Micro-Batch 策略,将庞大的数据流切分为多个微批次进行计算。
174
+
175
+ - compute_log_prob (Actor/Ref):涉及多轮纯前向传播。
176
+ - update_policy (Actor/Critic):涉及多轮前向与反向传播。
177
+
178
+ 这种特性会导致全量 Profiling 产生海量且重复的算子记录。如下图所示:
179
+
180
+ .. image:: https://raw.githubusercontent.com/mengchengTang/verl-data/master/verl_ascend_profiler.png
181
+
182
+ 即使使用了 ``discrete`` 模式,单个阶段的性能数据文件仍可能达到数 TB,导致 **解析失败** 或 **可视化工具卡顿** 。
183
+
184
+ 解决方案:关键路径采样
185
+ ~~~~~~~~~~~~~~~~~~~~~~
186
+
187
+ 为了解决上述问题,我们可以采用 **关键路径采样** 策略:基于 `torch_npu.profiler <https://www.hiascend.com/document/detail/zh/canncommercial/80RC2/devaids/auxiliarydevtool/atlasprofiling_16_0038.html>`_ 提供的API接口,直接修改 Python 源码,仅采集具有代表性的数据片段(如特定 Decode Step 或首个 Micro-Batch)。
188
+
189
+ **重要提示**
190
+
191
+ 1. 本章节涉及直接修改源码。建议修改前备份文件,调试完成后恢复。
192
+ 2. 使用代码插桩采集时,请务必在 ``ppo_trainer.yaml`` 或 ``ppo_megatron_trainer.yaml`` 中**禁用全局采集** (``global_profiler: steps: null``),以避免 Profiler 冲突。
193
+
194
+ 1. Rollout 阶段精细化采集
195
+ ~~~~~~~~~~~~~~~~~~~~~~~~~
196
+
197
+ 对于 vLLM 或 SGLang 推理引擎,我们可以通过控制 ``schedule`` 参数来控制采集模型在特定token的前向传播性能数据。
198
+
199
+ **vLLM 引擎**
200
+
201
+ - **参考版本**:vLLM v0.11.0, vLLM-Ascend v0.11.0rc1
202
+ - **修改文件**:``vllm-ascend/vllm_ascend/worker/worker_v1.py``
203
+
204
+ .. code-block:: diff
205
+
206
+ class NPUWorker(WorkerBase):
207
+
208
+ def __init__(self, *args, **kwargs):
209
+ # ... existing code ...
210
+
211
+ + # Initialize profiler
212
+ + import torch_npu
213
+ + experimental_config = torch_npu.profiler._ExperimentalConfig(
214
+ + profiler_level=torch_npu.profiler.ProfilerLevel.Level1,
215
+ + export_type=torch_npu.profiler.ExportType.Db, # 可选择torch_npu.profiler.ExportType.Text格式
216
+ + )
217
+ + self.profiler_npu = torch_npu.profiler.profile(
218
+ + activities=[torch_npu.profiler.ProfilerActivity.CPU, torch_npu.profiler.ProfilerActivity.NPU],
219
+ + with_modules=False, # 采集调用栈
220
+ + profile_memory=False, # 采集内存
221
+ + experimental_config=experimental_config,
222
+ + # 跳过第一步,warmup一步,采集3步,重复1次。如果想采集第30~70个decode step,可以设置为schedule=torch_npu.profiler.schedule(wait=29, warmup=1, active=30, repeat=1)
223
+ + schedule=torch_npu.profiler.schedule(wait=1, warmup=1, active=3, repeat=1),
224
+ + on_trace_ready=torch_npu.profiler.tensorboard_trace_handler("./outputs/vllm_profile", analyse_flag=True) # 采集数据保存路径,是否在线解析
225
+ + )
226
+ + self.profiler_npu.start()
227
+
228
+ # ... existing code ...
229
+
230
+ def execute_model(self, scheduler_output=None, intermediate_tensors=None, **kwargs):
231
+ # ... existing code ...
232
+ output = self.model_runner.execute_model(scheduler_output,
233
+ intermediate_tensors)
234
+
235
+ + self.profiler_npu.step() # 驱动 schedule,对部分decode step进行采集
236
+
237
+ # ... existing code ...
238
+
239
+ **SGLang 引擎**
240
+
241
+ - **参考版本**:SGLang master 分支
242
+ - **修改文件**:``sglang/python/sglang/srt/model_executor/model_runner.py``
243
+
244
+ .. code-block:: diff
245
+
246
+ # ... existing imports ...
247
+ + import torch_npu
248
+
249
+ class ModelRunner:
250
+
251
+ def __init__(self, *args, **kwargs):
252
+ # ... existing init code ...
253
+
254
+ + # Initialize profiler (配置同上,略)
255
+ + experimental_config = torch_npu.profiler._ExperimentalConfig(...)
256
+ + self.profiler_npu = torch_npu.profiler.profile(
257
+ + # ...
258
+ + # 跳过第一步,warmup一步,采集3步,重复1次。
259
+ + schedule=torch_npu.profiler.schedule(wait=1, warmup=1, active=3, repeat=1),
260
+ + on_trace_ready=torch_npu.profiler.tensorboard_trace_handler("./outputs/sglang_profile", analyse_flag=True)
261
+ + )
262
+ + self.profiler_npu.start()
263
+
264
+ def forward(self, forward_batch, **kwargs):
265
+ # ... existing code ...
266
+
267
+ + self.profiler_npu.step() # 驱动 schedule,对部分decode step进行采集
268
+ return output
269
+
270
+ 2. compute_log_prob (Actor & Ref) 阶段精细化采集
271
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
272
+
273
+ 该阶段计算新旧策略的概率分布。
274
+
275
+ **FSDP 后端**
276
+
277
+ FSDP 后端允许在 Micro-Batch 级别进行精细控制。
278
+
279
+ - **修改文件**:``verl/workers/actor/dp_actor.py``
280
+
281
+ .. code-block:: diff
282
+
283
+ # ... 引入依赖 ...
284
+ + import torch_npu
285
+
286
+ class DataParallelPPOActor(BasePPOActor):
287
+
288
+ def compute_log_prob(self, data: DataProto, calculate_entropy=False) -> torch.Tensor:
289
+
290
+ + role = "Ref" if self.actor_optimizer is None else "Actor"
291
+ + # 准备 profiler (配置同上,略)
292
+ + experimental_config = torch_npu.profiler._ExperimentalConfig(...)
293
+ + self.prof_npu = torch_npu.profiler.profile(
294
+ + # ...
295
+ + # wait=0, warmup=0, active=1: 直接采集第一个 micro-batch
296
+ + schedule=torch_npu.profiler.schedule(wait=0, warmup=0, active=1, repeat=1),
297
+ + on_trace_ready=torch_npu.profiler.tensorboard_trace_handler(f"./outputs/{role}_compute_log_prob", analyse_flag=True)
298
+ + )
299
+
300
+
301
+ + # 此函数ref和actor共用,设置role标志位来区分。如果想采集actor_compute_log_prob,可设置if role=="Actor":
302
+ + if role=="Ref":
303
+ + self.prof_npu.start()
304
+
305
+ for micro_batch in micro_batches:
306
+
307
+ # ... 原始计算逻辑 ...
308
+ with torch.no_grad():
309
+ entropy, log_probs = self._forward_micro_batch(...)
310
+
311
+ + # 驱动 schedule,对micro batch进行采集
312
+ + if role=="Ref":
313
+ + self.prof_npu.step()
314
+
315
+ # ...
316
+
317
+
318
+ **Megatron 后端**
319
+
320
+ Megatron 后端的 Micro-Batch 调度由框架内部管理,暂不支持通过简单的代码插桩进行 Micro-Batch 级别的精细化采集。建议使用全局配置进行采集。
321
+
322
+ 3. update_policy (Actor & Critic) 阶段精细化采集
323
+ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
324
+
325
+ Update 阶段包含前向和反向传播。
326
+
327
+ **FSDP 后端**
328
+
329
+ FSDP 后端支持设置对 Mini-Batch 和 Micro-Batch 的粒度进行采集。
330
+
331
+ - **修改文件**:``verl/workers/actor/dp_actor.py``
332
+
333
+ .. code-block:: diff
334
+
335
+ # ... 引入依赖 ...
336
+ + import torch_npu
337
+
338
+ class DataParallelPPOActor(BasePPOActor):
339
+
340
+ def update_policy(self, data: DataProto):
341
+
342
+ + # 准备 profiler (配置同上,略)
343
+ + experimental_config = torch_npu.profiler._ExperimentalConfig(...)
344
+ + self.prof_npu = torch_npu.profiler.profile(
345
+ + # ...
346
+ + # 仅采集第一个 Mini Batch(包含所有 Micro-Batch 的计算和一次优化器更新)
347
+ + schedule=torch_npu.profiler.schedule(wait=0, warmup=0, active=1, repeat=1),
348
+ + on_trace_ready=torch_npu.profiler.tensorboard_trace_handler("./outputs/fsdp_actor_update_profile", analyse_flag=True)
349
+ + )
350
+ + self.prof_npu.start()
351
+
352
+ # ... PPO Epochs 循环 ...
353
+ for _ in range(self.config.ppo_epochs):
354
+ # ... Mini Batch 循环 ...
355
+ for batch_idx, mini_batch in enumerate(mini_batches):
356
+ # ... mini_batches 切分 ...
357
+
358
+ for i, micro_batch in enumerate(micro_batches):
359
+ # ... 原始 Forward & Backward 逻辑 ...
360
+ # ... loss.backward() ...
361
+ pass
362
+
363
+ grad_norm = self._optimizer_step()
364
+
365
+ + # 驱动 schedule,对mini batch进行采集,如果想对micro batch进行,则将self.prof_npu.step()移动到micro_batch的循环内
366
+ + self.prof_npu.step()
367
+
368
+
369
+ **Megatron 后端**
370
+
371
+ Megatron 后端支持以 Mini-Batch 的粒度进行采集。
372
+
373
+ - **修改文件**:``verl/workers/actor/megatron_actor.py``
374
+
375
+ .. code-block:: diff
376
+
377
+ class MegatronPPOActor(BasePPOActor):
378
+
379
+ def update_policy(self, dataloader: Iterable[DataProto]) -> dict:
380
+ # ...
381
+ + # 准备 profiler (配置同上,略)
382
+ + experimental_config = torch_npu.profiler._ExperimentalConfig(...)
383
+ + self.prof_npu = torch_npu.profiler.profile(
384
+ + # ...
385
+ + # 仅采集第一个 Mini Batch 的计算(含所有 Micro-Batch)和一次优化器更新
386
+ + schedule=torch_npu.profiler.schedule(wait=0, warmup=0, active=1, repeat=1),
387
+ + on_trace_ready=torch_npu.profiler.tensorboard_trace_handler("./outputs/megatron_actor_update_profile", analyse_flag=True)
388
+ + )
389
+ + self.prof_npu.start()
390
+
391
+ for data in dataloader:
392
+ # ... 内部会调用 self.forward_backward_batch 进行计算 ...
393
+ # ... metric_micro_batch = self.forward_backward_batch(...)
394
+
395
+ # ... self.actor_optimizer.step() ...
396
+
397
+ + # 驱动 schedule,对mini batch进行采集
398
+ + self.prof_npu.step()
code/RL_model/verl/verl_train/docs/ascend_tutorial/ascend_quick_start.rst ADDED
@@ -0,0 +1,289 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Ascend Quickstart
2
+ ===================================
3
+
4
+ Last updated: 12/11/2025.
5
+
6
+ 我们在 verl 上增加对华为昇腾设备的支持。
7
+
8
+
9
+ 关键更新
10
+ ----------------------------------
11
+
12
+ 2025/12/11:verl 存量场景目前支持自动识别 NPU 设备类型, GPU 脚本在昇腾上运行,原则上不再需要显式设置 trainer.device=npu 参数,新增特性通过设置 trainer.device 仍可优先使用,逐步适配自动识别能力。
13
+
14
+ [说明] 自动识别 NPU 设备类型的前提,是运行程序所在环境包含 torch_npu 软件包。如不包含该软件包,仍需显式指定 trainer.device=npu 参数。
15
+
16
+ 硬件支持
17
+ -----------------------------------
18
+
19
+ Atlas 200T A2 Box16
20
+
21
+ Atlas 900 A2 PODc
22
+
23
+ Atlas 800T A3
24
+
25
+
26
+ 安装流程
27
+ -----------------------------------
28
+
29
+
30
+ DockerFile镜像构建 & 使用
31
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
32
+
33
+ 如需要通过 DockerFile 构建镜像,或希望使用基于 verl 构建的镜像,请参考 `文档 <https://github.com/volcengine/verl/tree/main/docs/ascend_tutorial/dockerfile_build_guidance.rst>`_ 。
34
+
35
+
36
+ 安装基础环境
37
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
38
+
39
+ 1. 基础环境涉及以下软件包,请参考 `文档 <https://gitcode.com/Ascend/pytorch>`_ 安装。
40
+
41
+ +---------------+----------------------+
42
+ | software | version |
43
+ +---------------+----------------------+
44
+ | Python | >= 3.10, <3.12 |
45
+ +---------------+----------------------+
46
+ | CANN | == 8.3.RC1 |
47
+ +---------------+----------------------+
48
+ | torch | == 2.7.1 |
49
+ +---------------+----------------------+
50
+ | torch_npu | == 2.7.1 |
51
+ +---------------+----------------------+
52
+
53
+ 2. (可选)在 x86 平台安装时,pip 需要配置额外的源,指令如下:
54
+
55
+ .. code-block:: bash
56
+
57
+ pip config set global.extra-index-url "https://download.pytorch.org/whl/cpu/"
58
+
59
+
60
+ 安装其他软件包
61
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
62
+
63
+ 基础环境准备完毕后,需要通过指令安装以下软件包:
64
+
65
+ +---------------+----------------------+
66
+ | torchvision | == 0.22.1 |
67
+ +---------------+----------------------+
68
+ | triton-ascend | == 3.2.0rc4 |
69
+ +---------------+----------------------+
70
+ | transformers | latest release |
71
+ +---------------+----------------------+
72
+
73
+ 安装指令:
74
+
75
+ .. code-block:: bash
76
+
77
+ # 安装torchvision,版本需要和torch匹配
78
+ pip install torchvision==0.22.1
79
+
80
+ # 清理环境上可能存在的历史triton/triton-ascend软件包残留
81
+ pip uninstall -y triton triton-ascend
82
+
83
+ # 安装triton-ascend,不需要单独安装triton
84
+ pip install triton-ascend==3.2.0rc4
85
+
86
+
87
+ 安装 vllm & vllm-ascend
88
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
89
+
90
+ 1. 需确保CANN ascend-toolkit 和 nnal 环境变量被激活,对于CANN默认安装路径 /usr/local/Ascend 而言,激活指令如下:
91
+
92
+ .. code-block::
93
+
94
+ source /usr/local/Ascend/ascend-toolkit/set_env.sh
95
+ source /usr/local/Ascend/nnal/atb/set_env.sh
96
+
97
+ 2. vllm 源码安装指令:
98
+
99
+ .. code-block:: bash
100
+
101
+ git clone --depth 1 --branch v0.11.0 https://github.com/vllm-project/vllm.git
102
+ cd vllm && VLLM_TARGET_DEVICE=empty pip install -v -e . && cd ..
103
+
104
+ 3. vllm-ascend 源码安装指令:
105
+
106
+ .. code-block:: bash
107
+
108
+ git clone --depth 1 --branch v0.11.0rc1 https://github.com/vllm-project/vllm-ascend.git
109
+ cd vllm-ascend && pip install -v -e . && cd ..
110
+
111
+
112
+ 安装 MindSpeed
113
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
114
+
115
+ MindSpeed 源码安装指令:
116
+
117
+ .. code-block:: bash
118
+
119
+ # 下载 MindSpeed,切换到指定commit-id,并下载 Megatron-LM
120
+ git clone https://gitcode.com/Ascend/MindSpeed.git
121
+ cd MindSpeed && git checkout f2b0977e && cd ..
122
+ git clone --depth 1 --branch core_v0.12.1 https://github.com/NVIDIA/Megatron-LM.git
123
+
124
+ # 安装 MindSpeed & Megatron
125
+ pip install -e MindSpeed
126
+
127
+ # 将 Megatron-LM 源码路径配置到 PYTHONPATH 环境变量中
128
+ export PYTHONPATH=$PYTHONPATH:"$(pwd)/Megatron-LM"
129
+
130
+ # (可选)如希望 shell 关闭,或系统重启后,PYTHONPATH 环境变量仍然生效,建议将它添加到 .bashrc 配置文件中
131
+ echo "export PYTHONPATH=$PYTHONPATH:\"$(pwd)/Megatron-LM\"" >> ~/.bashrc
132
+
133
+ # 安装 mbridge
134
+ pip install mbridge
135
+
136
+ MindSpeed 对应 Megatron-LM 后端使用场景,使用方式如下:
137
+
138
+ 1. 使能 verl worker 模型 ``strategy`` 配置为 ``megatron`` ,例如 ``actor_rollout_ref.actor.strategy=megatron``。
139
+
140
+ 2. MindSpeed 自定义入参可通过 ``override_transformer_config`` 参数传入,例如对 actor 模型开启 FA 特性可使用 ``+actor_rollout_ref.actor.megatron.override_transformer_config.use_flash_attn=True``。
141
+
142
+ 3. 更多特性信息可参考 `MindSpeed & verl 文档 <https://gitcode.com/Ascend/MindSpeed/blob/master/docs/user-guide/verl.md>`_ 。
143
+
144
+
145
+ 安装verl
146
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
147
+
148
+ .. code-block:: bash
149
+
150
+ git clone --depth 1 https://github.com/volcengine/verl.git
151
+ cd verl && pip install -r requirements-npu.txt && pip install -v -e . && cd ..
152
+
153
+
154
+ 昇腾暂不支持生态库说明
155
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
156
+
157
+ verl 中昇腾暂不支持生态库如下:
158
+
159
+ +---------------+----------------+
160
+ | software | description |
161
+ +---------------+----------------+
162
+ | flash_attn | not supported |
163
+ +---------------+----------------+
164
+ | liger-kernel | not supported |
165
+ +---------------+----------------+
166
+
167
+ 1. 不支持通过 flash_attn 使能 flash attention 加速,支持通过 transformers 使用。
168
+ 2. 不支持 liger-kernel 使能。
169
+
170
+
171
+ 快速开始
172
+ -----------------------------------
173
+ 正式使用前,建议您通过对Qwen2.5-0.5B GRPO的训练尝试以检验环境准备和安装的正确性。
174
+
175
+ 1.下载数据集并将数据集预处理为parquet格式,以便包含计算RL奖励所需的必要字段
176
+
177
+ .. code-block:: bash
178
+
179
+ python3 examples/data_preprocess/gsm8k.py --local_save_dir ~/data/gsm8k
180
+
181
+ 2.执行训练
182
+
183
+ .. code-block:: bash
184
+
185
+ set -x
186
+
187
+ export VLLM_ATTENTION_BACKEND=XFORMERS
188
+
189
+ python3 -m verl.trainer.main_ppo \
190
+ algorithm.adv_estimator=grpo \
191
+ data.train_files=$HOME/data/gsm8k/train.parquet \
192
+ data.val_files=$HOME/data/gsm8k/test.parquet \
193
+ data.train_batch_size=128 \
194
+ data.max_prompt_length=512 \
195
+ data.max_response_length=128 \
196
+ data.filter_overlong_prompts=True \
197
+ data.truncation='error' \
198
+ actor_rollout_ref.model.path=Qwen/Qwen2.5-0.5B-Instruct \
199
+ actor_rollout_ref.actor.optim.lr=5e-7 \
200
+ actor_rollout_ref.model.use_remove_padding=False \
201
+ actor_rollout_ref.actor.entropy_coeff=0.001 \
202
+ actor_rollout_ref.actor.ppo_mini_batch_size=64 \
203
+ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=20 \
204
+ actor_rollout_ref.actor.use_kl_loss=True \
205
+ actor_rollout_ref.actor.kl_loss_coef=0.001 \
206
+ actor_rollout_ref.actor.kl_loss_type=low_var_kl \
207
+ actor_rollout_ref.model.enable_gradient_checkpointing=True \
208
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
209
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
210
+ actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=40 \
211
+ actor_rollout_ref.rollout.enable_chunked_prefill=False \
212
+ actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
213
+ actor_rollout_ref.rollout.name=vllm \
214
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
215
+ actor_rollout_ref.rollout.n=5 \
216
+ actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=40 \
217
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
218
+ algorithm.kl_ctrl.kl_coef=0.001 \
219
+ trainer.critic_warmup=0 \
220
+ trainer.logger=console \
221
+ trainer.project_name='verl_grpo_example_gsm8k' \
222
+ trainer.experiment_name='qwen2_7b_function_rm' \
223
+ trainer.n_gpus_per_node=8 \
224
+ trainer.nnodes=1 \
225
+ trainer.save_freq=-1 \
226
+ trainer.test_freq=5 \
227
+ trainer.total_epochs=1 $@
228
+
229
+
230
+
231
+ 算法支持现状
232
+ -----------------------------------
233
+
234
+ **表1** RL类算法
235
+
236
+ +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+
237
+ | algorithm | model | download link | actor.strategy | rollout.name | shell location | hardware |
238
+ +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+
239
+ | GRPO | Qwen2.5-7B-instruct |`7B <https://huggingface.co/Qwen/Qwen2.5-7B-Instruct>`_ | FSDP | vllm-ascend |`qwen2_5_7b_grpo_npu <https://github.com/volcengine/verl/blob/main/examples/grpo_trainer/run_qwen2_5_7b_grpo_npu.sh>`_ | Atlas 200T A2 Box16 |
240
+ +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+
241
+ | GRPO | Qwen2.5-32B-instruct |`32B <https://huggingface.co/Qwen/Qwen2.5-32B-Instruct>`_ | FSDP | vllm-ascend |`qwen2_5_32b_grpo_npu <https://github.com/volcengine/verl/blob/main/examples/grpo_trainer/run_qwen2_5_32b_grpo_npu.sh>`_ | Atlas 200T A2 Box16 |
242
+ +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+
243
+ | GRPO | Qwen2.5-VL-3B-instruct |`3B <https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct>`_ | FSDP | vllm-ascend |`qwen2_5_vl_3b_npu <https://github.com/volcengine/verl/blob/main/examples/grpo_trainer/run_qwen2_5_vl_3b_npu.sh>`_ | Atlas 200T A2 Box16 |
244
+ +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+
245
+ | GRPO | Qwen2.5-VL-7B-instruct |`7B <https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct>`_ | FSDP | vllm-ascend |`qwen2_5_vl_7b_npu <https://github.com/volcengine/verl/blob/main/examples/grpo_trainer/run_qwen2_5_vl_7b_npu.sh>`_ | Atlas 200T A2 Box16 |
246
+ +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+
247
+ | GRPO | Qwen2.5-VL-32B-instruct |`32B <https://huggingface.co/Qwen/Qwen2.5-VL-32B-Instruct>`_ | FSDP | vllm-ascend |`qwen2_5_vl_32b_npu <https://github.com/volcengine/verl/blob/main/examples/grpo_trainer/run_qwen2_5_vl_32b_npu.sh>`_ | Atlas 200T A2 Box16 |
248
+ +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+
249
+ | GRPO | Qwen3-4B |`4B <https://huggingface.co/Qwen/Qwen3-4B>`_ | FSDP | vllm-ascend |`qwen3-4B_npu <https://github.com/volcengine/verl/blob/main/examples/grpo_trainer/run_qwen3_4b_grpo_vllm_1k_npu.sh>`_ | Atlas 800T A3 |
250
+ +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+
251
+ | GRPO | Qwen3-8B |`8B <https://huggingface.co/Qwen/Qwen3-8B>`_ | FSDP | vllm-ascend |`qwen3_8b_vllm_npu <https://github.com/volcengine/verl/blob/main/examples/grpo_trainer/run_qwen3-8b_npu.sh>`_ | Atlas 200T A2 Box16 |
252
+ +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+
253
+ | GRPO | Qwen3-8B |`8B <https://huggingface.co/Qwen/Qwen3-8B>`_ | FSDP | sglang |`qwen3_8b_sglang_npu <https://github.com/volcengine/verl/blob/main/examples/grpo_trainer/run_qwen3_8b_grpo_sglang_32k_spmd_npu.sh>`_ | Atlas 200T A2 Box16 |
254
+ +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+
255
+ | GRPO | Qwen3-32B |`32B <https://huggingface.co/Qwen/Qwen3-32B>`_ | FSDP | vllm-ascend |`qwen3-32B_npu <https://github.com/volcengine/verl/blob/main/examples/grpo_trainer/run_qwen3-32b_npu.sh>`_ | Atlas 200T A2 Box16 |
256
+ +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+
257
+ | GRPO | DeepSeekv3-671B |`671B <https://huggingface.co/deepseek-ai/DeepSeek-V3>`_ | Megatron | vllm-ascend |`deepseek_v3_megatron_npu <https://github.com/verl-project/verl-recipe/blob/main//r1_ascend/run_deepseekv3_671b_grpo_megatron_npu.sh>`_ | Atlas 200T A2 Box16 |
258
+ +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+
259
+ | DAPO | Qwen2.5-7B-instruct |`7B <https://huggingface.co/Qwen/Qwen2.5-7B-Instruct>`_ | FSDP | vllm-ascend |`qwen2.5_7b_npu <https://github.com/verl-project/verl-recipe/blob/main//dapo/run_dapo_qwen2.5_7b_npu.sh>`_ | Atlas 200T A2 Box16 |
260
+ +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+
261
+ | DAPO | Qwen2.5-32B |`32B <https://huggingface.co/Qwen/Qwen2.5-32B>`_ | FSDP | vllm-ascend |`qwen2.5_32b_npu <https://github.com/verl-project/verl-recipe/blob/main//dapo/run_dapo_qwen2.5_32b_npu.sh>`_ | Atlas 200T A2 Box16 |
262
+ +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+
263
+ | DAPO | Qwen3-8B-base |`8B <https://huggingface.co/Qwen/Qwen3-8B>`_ | FSDP | vllm-ascend |`qwen3_8b_npu <https://github.com/verl-project/verl-recipe/blob/main//dapo/run_dapo_qwen3_8b_base_npu.sh>`_ | Atlas 200T A2 Box16 |
264
+ +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+
265
+ | DAPO | Qwen3-14B-base |`14B <https://huggingface.co/Qwen/Qwen3-14B>`_ | FSDP | vllm-ascend |`qwen3_14b_npu <https://github.com/verl-project/verl-recipe/blob/main//dapo/run_dapo_qwen3_14b_base_npu.sh>`_ | Atlas 200T A2 Box16 |
266
+ +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+
267
+ | DAPO | Qwen3-30B-A3B-base |`30B <https://huggingface.co/Qwen/Qwen3-30B-A3B>`_ | FSDP | vllm-ascend |`qwen3_30b_fsdp_npu <https://github.com/verl-project/verl-recipe/blob/main//dapo/run_dapo_qwen3_moe_30b_base_fsdp_npu.sh>`_ | Atlas 200T A2 Box16 |
268
+ +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+
269
+ | DAPO | Qwen3-30B-A3B-base |`30B <https://huggingface.co/Qwen/Qwen3-30B-A3B>`_ | Megatron | vllm-ascend |`qwen3_30b_megatron_npu <https://github.com/verl-project/verl-recipe/blob/main//dapo/run_dapo_qwen3_moe_30b_megatron_npu.sh>`_ | Atlas 200T A2 Box16 |
270
+ +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+
271
+ | PPO | Qwen3-8B |`8B <https://huggingface.co/Qwen/Qwen3-8B>`_ | FSDP | vllm-ascend |`qwen3_8b_ppo_npu <https://github.com/volcengine/verl/blob/main/examples/ppo_trainer/run_qwen3-8b_npu.sh>`_ | Atlas 900 A2 PODc |
272
+ +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+
273
+ | One_Step_Off_Policy | Qwen3-8B |`8B <https://huggingface.co/Qwen/Qwen3-8B>`_ | FSDP2 | vllm-ascend |`qwen3_8b_fsdp2_npu <https://github.com/verl-project/verl-recipe/blob/main//one_step_off_policy/shell/grpo_qwen3_8b_gsm8k_fsdp2_8_8_npu.sh>`_ | Atlas 800T A3 |
274
+ +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+
275
+
276
+ **表2** SFT类算法
277
+
278
+ +-----------+-------------------------+------------------------------------------------------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+----------------------+
279
+ | algorithm | model | download link | actor.strategy | shell location | hardware |
280
+ +-----------+-------------------------+------------------------------------------------------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+----------------------+
281
+ | SFT-PEFT | Qwen3-8B |`8B <https://huggingface.co/Qwen/Qwen3-8B>`_ | FSDP |`sft_peft_sp2_npu <https://github.com/volcengine/verl/blob/main/examples/sft/gsm8k/run_qwen3_8b_sft_peft_sp2_npu.sh>`_ | Atlas 900 A2 PODc |
282
+ +-----------+-------------------------+------------------------------------------------------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+----------------------+
283
+ | ReTool-SFT| Qwen2-7B-instruct |`7B <https://huggingface.co/Qwen/Qwen2-7B-Instruct>`_ | FSDP |`qwen2_7b_sft_npu <https://github.com/verl-project/verl-recipe/blob/main/retool/run_qwen2_7b_sft_npu.sh>`_ | Atlas 900 A2 PODc |
284
+ +-----------+-------------------------+------------------------------------------------------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+----------------------+
285
+
286
+
287
+ 声明
288
+ -----------------------------------
289
+ verl中提供的ascend支持代码、Dockerfile、镜像皆为参考样例,如在生产环境中使用请通过官方正式途径沟通,谢谢。
code/RL_model/verl/verl_train/docs/ascend_tutorial/ascend_sglang_quick_start.rst ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Ascend Quickstart with SGLang Backend
2
+ =====================================
3
+
4
+ Last updated: 01/27/2026.
5
+
6
+ 我们在 verl 上增加对华为昇腾设备的支持。
7
+
8
+ 硬件支持
9
+ -----------------------------------
10
+
11
+ Atlas 200T A2 Box16
12
+
13
+ Atlas 900 A2 PODc
14
+
15
+ Atlas 800T A3
16
+
17
+
18
+ 安装
19
+ -----------------------------------
20
+ 关键支持版本
21
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
22
+
23
+ +-----------+-----------------+
24
+ | software | version |
25
+ +===========+=================+
26
+ | Python | == 3.11 |
27
+ +-----------+-----------------+
28
+ | HDK | >= 25.3.RC1 |
29
+ +-----------+-----------------+
30
+ | CANN | >= 8.3.RC1 |
31
+ +-----------+-----------------+
32
+ | torch | >= 2.7.1 |
33
+ +-----------+-----------------+
34
+ | torch_npu | >= 2.7.1.post2 |
35
+ +-----------+-----------------+
36
+ | sglang | v0.5.8 |
37
+ +-----------+-----------------+
38
+
39
+ 从 Docker 镜像进行安装
40
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
41
+ 我们提供了DockerFile进行构建,详见 `dockerfile_build_guidance <https://github.com/verl-project/verl/blob/main/docs/ascend_tutorial/dockerfile_build_guidance.rst>`_ ,请根据设备自行选择对应构建文件
42
+
43
+ 从自定义环境安装
44
+ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
45
+
46
+ **1. 安装HDK&CANN依赖并激活**
47
+
48
+ 异构计算架构CANN(Compute Architecture for Neural Networks)是昇腾针对AI场景推出的异构计算架构, 为了使训练和推理引擎能够利用更好、更快的硬件支持, 我们需要安装以下 `先决条件 <https://www.hiascend.com/document/detail/zh/canncommercial/83RC1/softwareinst/instg/instg_quick.html?Mode=PmIns&InstallType=netconda&OS=openEuler&Software=cannToolKit>`_
49
+
50
+ +-----------+-------------+
51
+ | HDK | >= 25.3.RC1 |
52
+ +-----------+-------------+
53
+ | CANN | >= 8.3.RC1 |
54
+ +-----------+-------------+
55
+ 安装完成后请激活环境
56
+
57
+ .. code-block:: bash
58
+
59
+ source /usr/local/Ascend/ascend-toolkit/set_env.sh
60
+ source /usr/local/Ascend/nnal/atb/set_env.sh
61
+
62
+ **2. 创建conda环境**
63
+
64
+ .. code-block:: bash
65
+
66
+ # create conda env
67
+ conda create -n verl-sglang python==3.11
68
+ conda activate verl-sglang
69
+
70
+ **3. 然后,执行我们在 verl 中提供的脚本** `install_sglang_mcore_npu.sh <https://github.com/verl-project/verl/blob/main/scripts/install_sglang_mcore_npu.sh>`_
71
+
72
+ 如果在此步骤中遇到错误,请检查脚本并手动按照脚本中的步骤操作。
73
+
74
+ .. code-block:: bash
75
+
76
+ git clone https://github.com/volcengine/verl.git
77
+ # Make sure you have activated verl conda env
78
+ # NPU_DEVICE=A3 or A2 depends on your device
79
+ NPU_DEVICE=A3 bash verl/scripts/install_sglang_mcore_npu.sh
80
+
81
+ **4. 安装verl**
82
+
83
+ .. code-block:: bash
84
+
85
+ cd verl
86
+ pip install --no-deps -e .
87
+ pip install -r requirements-npu.txt
88
+
89
+
90
+ 快速开始
91
+ -----------------------------------
92
+
93
+ **1.当前NPU sglang脚本一览**
94
+
95
+ .. _Qwen3-30B: https://github.com/verl-project/verl/blob/main/examples/grpo_trainer/run_qwen3moe-30b_sglang_megatron_npu.sh
96
+ .. _Qwen2.5-32B: https://github.com/verl-project/verl/blob/main/examples/grpo_trainer/run_qwen2-32b_sglang_fsdp_npu.sh
97
+ .. _Qwen3-8B-1k: https://github.com/verl-project/verl/blob/main/examples/grpo_trainer/run_qwen3_8b_grpo_sglang_1k_spmd_npu.sh
98
+ .. _Qwen3-8B-32k: https://github.com/verl-project/verl/blob/main/examples/grpo_trainer/run_qwen3_8b_grpo_sglang_32k_spmd_npu.sh
99
+
100
+ +-----------------+----------------+----------+-------------------+
101
+ | 模型 | 推荐NPU型号 | 节点数量 | 训推后端 |
102
+ +=================+================+==========+===================+
103
+ | `Qwen3-30B`_ | Atlas 800T A3 | 1 | SGLang + Megatron |
104
+ +-----------------+----------------+----------+-------------------+
105
+ | `Qwen2.5-32B`_ | Atlas 900 A2 | 2 | SGLang + FSDP |
106
+ +-----------------+----------------+----------+-------------------+
107
+ | `Qwen3-8B-1k`_ | Atlas A3/A2 | 1 | SGLang + FSDP |
108
+ +-----------------+----------------+----------+-------------------+
109
+ | `Qwen3-8B-32k`_ | Atlas A3/A2 | 1 | SGLang + FSDP |
110
+ +-----------------+----------------+----------+-------------------+
111
+
112
+ **2.最佳实践**
113
+
114
+ 我们提供基于verl+sglang `Qwen3-30B`_ 以及 `Qwen2.5-32B`_ 的 `最佳实践 <https://github.com/verl-project/verl/blob/main/docs/ascend_tutorial/examples/ascend_sglang_best_practices.rst>`_ 作为参考
115
+
116
+ **3.环境变量与参数**
117
+
118
+ 当前NPU上支持sglang后端必须添加以下环境变量
119
+
120
+ .. code-block:: bash
121
+
122
+ #支持NPU单卡多进程 https://www.hiascend.com/document/detail/zh/canncommercial/850/commlib/hcclug/hcclug_000091.html
123
+ export HCCL_HOST_SOCKET_PORT_RANGE=60000-60050
124
+ export HCCL_NPU_SOCKET_PORT_RANGE=61000-61050
125
+ #规避ray在device侧调用无法根据is_npu_available接口识别设备可用性
126
+ export RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES=1
127
+ #根据当前设备和需要卡数定义
128
+ export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
129
+ #使能推理EP时需要
130
+ export SGLANG_DEEPEP_BF16_DISPATCH=1
131
+
132
+
133
+
134
+ 当前verl已解析推理常见参数, 详见 `async_sglang_server.py <https://github.com/verl-project/verl/blob/main/verl/workers/rollout/sglang_rollout/async_sglang_server.py>`_ 中 ServerArgs初始化传参,其他 `sglang参数 <https://github.com/sgl-project/sglang/blob/main/docs/advanced_features/server_arguments.md>`_ 均可通过engine_kwargs 进行参数传递
135
+
136
+ vllm后端推理脚本转换为sglang, 需要添加修改以下参数
137
+
138
+ .. code-block:: bash
139
+
140
+ #必须
141
+ actor_rollout_ref.rollout.name=sglang
142
+ +actor_rollout_ref.rollout.engine_kwargs.sglang.attention_backend="ascend"
143
+ #可选
144
+ #使能推理EP,详细使用方法见 https://github.com/sgl-project/sgl-kernel-npu/blob/main/python/deep_ep/README_CN.md
145
+ ++actor_rollout_ref.rollout.engine_kwargs.sglang.deepep_mode="auto"
146
+ ++actor_rollout_ref.rollout.engine_kwargs.sglang.moe_a2a_backend="deepep"
147
+ #Moe模型多DP时必须设置为True
148
+ +actor_rollout_ref.rollout.engine_kwargs.sglang.enable_dp_attention=False
149
+ #chunked_prefill默认关闭
150
+ +actor_rollout_ref.rollout.engine_kwargs.sglang.chunked_prefill_size=-1
151
+
152
+
153
+
code/RL_model/verl/verl_train/docs/ascend_tutorial/dockerfile_build_guidance.rst ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Ascend Dockerfile Build Guidance
2
+ ===================================
3
+
4
+ Last updated: 12/4/2025.
5
+
6
+ 我们在verl上增加对华为昇腾镜像构建的支持。
7
+
8
+
9
+ 镜像硬件支持
10
+ -----------------------------------
11
+
12
+ Atlas 200T A2 Box16
13
+
14
+ Atlas 900 A2 PODc
15
+
16
+ Atlas 800T A3
17
+
18
+
19
+ 镜像内各组件版本信息清单
20
+ ----------------
21
+
22
+ ================= ============
23
+ 组件 版本
24
+ ================= ============
25
+ 基础镜像 Ubuntu 22.04
26
+ Python 3.11
27
+ CANN 8.3.RC1
28
+ torch 2.7.1
29
+ torch_npu 2.7.1
30
+ torchvision 0.22.1
31
+ vLLM 0.11.0
32
+ vLLM-ascend 0.11.0rc1
33
+ Megatron-LM v0.12.1
34
+ MindSpeed (f2b0977e)
35
+ triton-ascend 3.2.0rc4
36
+ mbridge latest version
37
+ SGLang v0.5.8
38
+ sgl-kernel-npu (46b73de)
39
+ ================= ============
40
+
41
+
42
+ Dockerfile构建镜像脚本清单
43
+ ---------------------------
44
+
45
+ ============== ============== ============== ==============================================================
46
+ 设备类型 基础镜像版本 推理后端 参考文件
47
+ ============== ============== ============== ==============================================================
48
+ A2 8.2.RC1 vLLM `Dockerfile.ascend_8.2.rc1_a2 <https://github.com/volcengine/verl/blob/main/docker/ascend/Dockerfile.ascend_8.2.rc1_a2>`_
49
+ A2 8.3.RC1 vLLM `Dockerfile.ascend_8.3.rc1_a2 <https://github.com/volcengine/verl/blob/main/docker/ascend/Dockerfile.ascend_8.3.rc1_a2>`_
50
+ A2 8.3.RC1 SGLang `Dockerfile.ascend.sglang_8.3.rc1_a2 <https://github.com/volcengine/verl/blob/main/docker/ascend/Dockerfile.ascend.sglang_8.3.rc1_a2>`_
51
+ A3 8.2.RC1 vLLM `Dockerfile.ascend_8.2.rc1_a3 <https://github.com/volcengine/verl/blob/main/docker/ascend/Dockerfile.ascend_8.2.rc1_a3>`_
52
+ A3 8.3.RC1 vLLM `Dockerfile.ascend_8.3.rc1_a3 <https://github.com/volcengine/verl/blob/main/docker/ascend/Dockerfile.ascend_8.3.rc1_a3>`_
53
+ A3 8.3.RC1 SGLang `Dockerfile.ascend.sglang_8.3.rc1_a3 <https://github.com/volcengine/verl/blob/main/docker/ascend/Dockerfile.ascend.sglang_8.3.rc1_a3>`_
54
+ ============== ============== ============== ==============================================================
55
+
56
+
57
+ 镜像构建命令示例
58
+ --------------------
59
+
60
+ .. code:: bash
61
+
62
+ # Navigate to the directory containing the Dockerfile
63
+ cd {verl-root-path}/docker/ascend
64
+
65
+ # Build the image
66
+ # vLLM
67
+ docker build -f Dockerfile.ascend_8.3.rc1_a2 -t verl-ascend:8.3.rc1-a2 .
68
+ # SGLang
69
+ docker build -f Dockerfile.ascend.sglang_8.3.rc1_a2 -t verl-ascend-sglang:8.3.rc1-a2 .
70
+
71
+ 公开镜像地址
72
+ --------------------
73
+
74
+ 昇腾在 `quay.io/ascend/verl <https://quay.io/repository/ascend/verl?tab=tags&tag=latest>`_ 中托管每日构建的 A2/A3 镜像,基于上述 Dockerfile 构建。
75
+
76
+ 每日构建镜像名格式:verl-{CANN版本}-{NPU设备类型}-{操作系统版本}-{python版本}-latest
77
+
78
+ verl release版本镜像名格式:verl-{CANN版本}-{NPU设备类型}-{操作系统版本}-{python版本}-{verl release版本号}
79
+
80
+ 声明
81
+ --------------------
82
+ verl中提供的ascend相关Dockerfile、镜像皆为参考样例,可用于尝鲜体验,如在生产环境中使用请通过官方正式途径沟通,谢谢。
code/RL_model/verl/verl_train/docs/ascend_tutorial/examples/ascend_sglang_best_practices.rst ADDED
@@ -0,0 +1,296 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Ascend SGLang Best Practice
2
+ ===================================
3
+
4
+ Last updated: 01/27/2026.
5
+
6
+ .. _Qwen3-30B: https://github.com/verl-project/verl/blob/main/examples/grpo_trainer/run_qwen3moe-30b_sglang_megatron_npu.sh
7
+ .. _Qwen2.5-32B: https://github.com/verl-project/verl/blob/main/examples/grpo_trainer/run_qwen2-32b_sglang_fsdp_npu.sh
8
+ 引言
9
+ ----------------------------------
10
+
11
+ SGLang 是当前主流的高性能开源推理引擎, 昇腾已经全面原生支持该推理引擎在verl中使用,
12
+ 仅需简单的构建流程,开发者即可完成环境构建,本文将提供两个经典用例来帮助开发者了解以下内容:
13
+
14
+ 1. 环境构建
15
+ 2. 模型训练与评估
16
+ 3. 性能采集
17
+
18
+ 两个用例模型脚本以及其需要的硬件条件各自如下:
19
+
20
+ +----------------------+---------------------+----------+------------------------+
21
+ | 模型 | NPU型号 | 节点数量 | 训推后端 |
22
+ +======================+=====================+==========+========================+
23
+ | `Qwen3-30B`_ | Atlas 800T A3 | 1 | SGLang + Megatron |
24
+ +----------------------+---------------------+----------+------------------------+
25
+ | `Qwen2.5-32B`_ | Atlas 900 A2 | 2 | SGLang + FSDP |
26
+ +----------------------+---------------------+----------+------------------------+
27
+
28
+ 环境构建
29
+ -----------------------------------
30
+ 我们在quickstart中提供了两种构建环境的方法, 1.从镜像文件DockerFile进行构建 2.从自定义Conda环境进行构建
31
+
32
+ 在本实践中, 我们额外指定verl 的commit id 以避免引入其他问题
33
+
34
+ .. code-block:: bash
35
+
36
+ cd verl
37
+ git checkout 772c224
38
+ 模型训练与评估
39
+ -----------------------------------
40
+ 1.模型数据准备
41
+ ^^^^^^^^^^^
42
+ `Qwen3-30B`_
43
+ ^^^^^^^^^^^
44
+ **下载模型权重**
45
+
46
+ --local-dir: 模型保存路径
47
+
48
+ .. code-block:: bash
49
+
50
+ export HF_ENDPOINT=https://hf-mirror.com
51
+ hf download --resume-download Qwen/Qwen3-30B-A3B --local-dir /path/to/local_dir
52
+
53
+ **下载数据集**
54
+
55
+ .. code-block:: bash
56
+
57
+ git clone https://www.modelscope.cn/datasets/AI-ModelScope/DAPO-Math-17k.git
58
+
59
+ **HuggingFace To Megatron权重转换(可选)**
60
+
61
+ .. code-block:: bash
62
+
63
+ python scripts/converter_hf_to_mcore.py \
64
+ --hf_model_path Qwen/Qwen3-30B-A3B \
65
+ --output_path Qwen/Qwen3-30B-A3B-mcore \
66
+ --use_cpu_initialization # Only work for MoE models
67
+ *注:verl当前已支持mbridge进行灵活的hf和mcore之间的权重转换,可以修改以下相关参数直接加载hf权重*
68
+
69
+ .. code-block:: bash
70
+
71
+ actor_rollout_ref.actor.megatron.use_dist_checkpointing=False
72
+ actor_rollout_ref.actor.megatron.use_mbridge=True
73
+
74
+ `Qwen2.5-32B`_
75
+ ^^^^^^^^^^^
76
+ **下载模型权重**
77
+
78
+ --local-dir: 模型保存路径
79
+
80
+ .. code-block:: bash
81
+
82
+ export HF_ENDPOINT=https://hf-mirror.com
83
+ hf download --resume-download Qwen/Qwen2.5-32B --local-dir /path/to/local_dir
84
+
85
+ **下载及处理数据集**
86
+
87
+ .. code-block:: bash
88
+
89
+ wget https://huggingface.co/datasets/agentica-org/DeepScaleR-Preview-Dataset/resolve/main/deepscaler.json
90
+ python recipe/r1_ascend/json_to_parquet.py --output_dir ./data/deepscaler --json_path path/to/deepscaler.json --train_data_ratio 0.9
91
+
92
+ 2.训练
93
+ ^^^^^^^^^^^
94
+ 根据开发者实际路径配置情况修改模型训练脚本中的以下参数
95
+
96
+ .. code-block:: bash
97
+
98
+ # Model Weights Paths
99
+ MODEL_PATH=Qwen/Qwen3-30B-A3B
100
+ MCORE_MODEL_PATH=Qwen/Qwen3-30B-A3B-mcore
101
+ RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
102
+ CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}
103
+
104
+ # File System Paths
105
+ TRAIN_FILE=$RAY_DATA_HOME/dataset/dapo-math-17k.parquet
106
+ TEST_FILE=$RAY_DATA_HOME/dataset/aime-2024.parquet
107
+
108
+ #保存频率,-1默认不保存,如需评测请修改此参数
109
+ trainer.save_freq=-1
110
+
111
+ 对于单机任务 `Qwen3-30B`_ , 可以直接bash执行verl仓上示例脚本
112
+
113
+ .. code-block:: bash
114
+
115
+ bash examples/grpo_trainer/run_qwen3moe-30b_sglang_megatron_npu.sh
116
+ 对于多节点任务 `Qwen2.5-32B`_ ,我们推荐使用以下脚本进行大规模多节点训练拉起
117
+
118
+ .. code-block:: bash
119
+
120
+ pkill -9 python
121
+ ray stop --force
122
+ rm -rf /tmp/ray
123
+ export RAY_DEDUP_LOGS=0
124
+ export HYDRA_FULL_ERROR=1
125
+ # TASK_QUEUE_ENABLE,下发优化,图模式设置为1,非图模式设置为2
126
+ export TASK_QUEUE_ENABLE=1
127
+ export HCCL_ASYNC_ERROR_HANDLING=0
128
+ export HCCL_EXEC_TIMEOUT=3600
129
+ export HCCL_CONNECT_TIMEOUT=3600
130
+
131
+ export HCCL_HOST_SOCKET_PORT_RANGE=60000-60050
132
+ export HCCL_NPU_SOCKET_PORT_RANGE=61000-61050
133
+ export RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES=1
134
+ export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
135
+ # 修改为当前需要跑的用例路径
136
+ DEFAULT_SH="./run_*.sh"
137
+ echo "Use $DEFAULT_SH"
138
+
139
+ ulimit -n 32768
140
+ mkdir logs
141
+
142
+ NNODES=2
143
+ NPUS_PER_NODE=8
144
+ # 修改为对应主节点IP
145
+ MASTER_ADDR="IP FOR MASTER NODE"
146
+ # 修改为当前节点的通信网卡
147
+ SOCKET_IFNAME="Your SOCKET IFNAME"
148
+ export HCCL_SOCKET_IFNAME="SOCKET IFNAME FOR CURRENT NODE"
149
+ export GLOO_SOCKET_IFNAME="SOCKET IFNAME FOR CURRENT NODE"
150
+ # 获取当前IP
151
+ CURRENT_IP=$(ifconfig $SOCKET_IFNAME | grep -Eo 'inet (addr:)?([0-9]{1,3}\.){3}[0-9]{1,3}' | awk '{print $NF}')
152
+ if [ "$MASTER_ADDR" = "$CURRENT_IP" ]; then
153
+ # 主节点启动
154
+ ray start --head --port 6766 --dashboard-host=$MASTER_ADDR --node-ip-address=$CURRENT_IP --dashboard-port=8260 --resources='{"NPU": '$NPUS_PER_NODE'}'
155
+
156
+ while true; do
157
+ ray_status_output=$(ray status)
158
+ npu_count=$(echo "$ray_status_output" | grep -oP '(?<=/)\d+\.\d+(?=\s*NPU)' | head -n 1)
159
+ npu_count_int=$(echo "$npu_count" | awk '{print int($1)}')
160
+ device_count=$((npu_count_int / $NPUS_PER_NODE))
161
+
162
+ # 判断device_count 是否与 NNODES 相等
163
+ if [ "$device_count" -eq "$NNODES" ]; then
164
+ echo "Ray cluster is ready with $device_count devices (from $npu_count NPU resources), starting Python script."
165
+ ray status
166
+ bash $DEFAULT_SH
167
+ break
168
+ else
169
+ echo "Waiting for Ray to allocate $NNODES devices. Current device count: $device_count"
170
+ sleep 5
171
+ fi
172
+ done
173
+ else
174
+ # 子节点尝试往主节点注册 ray 直到成功
175
+ while true; do
176
+ # 尝试连接 ray 集群
177
+ ray start --address="$MASTER_ADDR:6766" --resources='{"NPU": '$NPUS_PER_NODE'}' --node-ip-address=$CURRENT_IP
178
+
179
+ # 检查连接是否成功
180
+ ray status
181
+ if [ $? -eq 0 ]; then
182
+ echo "Successfully connected to the Ray cluster!"
183
+ break
184
+ else
185
+ echo "Failed to connect to the Ray cluster. Retrying in 5 seconds..."
186
+ sleep 5
187
+ fi
188
+ done
189
+ fi
190
+
191
+ sleep 600
192
+
193
+ DEFAULT_SH:修改为训练所用配置 sh 文件路径。在此案例中修改为 `Qwen2.5-32B`_ 路径。
194
+
195
+ NNODES 和 NPUS_PER_NODE:修改为使用节点数量和每个节点 NPU 数量。在此案例中分别为2和8。
196
+
197
+ MASTER_ADDR:修改为对应主节点 IP。即所有节点的 MASTER_ADDR 应该相同。
198
+
199
+ SOCKET_IFNAME, HCCL_SOCKET_IFNAME, GLOO_SOCKET_IFNAME: 修改为对应通信网卡,通信网卡可以通过以下命令获取:
200
+
201
+ .. code-block:: bash
202
+
203
+ ifconfig |grep "$(hostname -I |awk '{print $1}'|awk -F '.' '{print $0}')" -B 1|awk -F ':' '{print$1}' | head -1 | tail -1
204
+
205
+ 3.模型评估
206
+ ^^^^^^^^^^^
207
+
208
+ 不同模型步骤一致,仅以Qwen3-30b为例列举
209
+
210
+ 我们通过 AISBenchmark 评估模型,该工具支持vllm/sglang多种推理后端的评估
211
+
212
+ **安装方法**
213
+
214
+ .. code-block:: bash
215
+
216
+ git clone https://gitee.com/aisbench/benchmark.git
217
+ cd benchmark
218
+ pip install -e .
219
+
220
+ **下载评估数据集**
221
+
222
+ .. code-block:: bash
223
+
224
+ cd path/to/benchmark/ais_bench/datasets
225
+ wget http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/math.zip
226
+ unzip math.zip
227
+ rm math.zip
228
+
229
+ **修改AISBench配置代码使能sglang推理评测**
230
+
231
+ 打开 benchmark/ais_bench/benchmark/configs/models/vllm_api/vllm_api_stream_chat.py 文件,这是推理配置文件
232
+
233
+ .. code-block:: bash
234
+
235
+ from ais_bench.benchmark.models import VLLMCustomAPIChatStream
236
+ from ais_bench.benchmark.utils.model_postprocessors import extract_non_reasoning_content
237
+ from ais_bench.benchmark.clients import OpenAIChatStreamClient, OpenAIChatStreamSglangClient
238
+
239
+ models = [
240
+ dict(
241
+ attr="service",
242
+ type=VLLMCustomAPIChatStream,
243
+ abbr='sgl-api-stream-chat',
244
+ path="/path/to/Qwen3-30B", # 修改为 Qwen3-30B 模型路径
245
+ model="qwen3-30b",
246
+ request_rate = 0,
247
+ max_seq_len=2048,
248
+ retry = 2,
249
+ host_ip = "localhost", # 推理服务的IP
250
+ host_port = 8005, # 推理服务的端口
251
+ max_out_len = 8192, # 最大输出tokens长度
252
+ batch_size=48, # 推理的最大并发数
253
+ trust_remote_code=False,
254
+ custom_client=dict(type=OpenAIChatStreamSglangClient), #使用sglang客户端
255
+ generation_kwargs = dict(
256
+ temperature = 0,
257
+ seed = 1234,
258
+ ),
259
+ pred_postprocessor=dict(type=extract_non_reasoning_content)
260
+ )
261
+ ]
262
+
263
+
264
+ **启动sglang_server服务**
265
+
266
+ .. code-block:: bash
267
+
268
+ python -m sglang.launch_server --model-path "/path/to/Qwen3-30B" --tp-size 4 --dp-size 1 --port 8005
269
+
270
+ **启动sglang_client评测**
271
+
272
+ .. code-block:: bash
273
+
274
+ ais_bench --models vllm_api_stream_chat --datasets math500_gen_0_shot_cot_chat_prompt
275
+
276
+ **评测结果**
277
+
278
+ 经过训练,模型在Math-500上的评分显著上升
279
+
280
+ +------+----------------------+---------+----------+------+----------------------+
281
+ | iter | dataset | version | metric | mode | sgl-api-stream-chat |
282
+ +======+======================+=========+==========+======+======================+
283
+ | 0 | math_prm800k_500 | c4b6f0 | accuracy | gen | 84.4 |
284
+ +------+----------------------+---------+----------+------+----------------------+
285
+ | 150 | math_prm800k_500 | c4b6f0 | accuracy | gen | 91.7 |
286
+ +------+----------------------+---------+----------+------+----------------------+
287
+
288
+ 性能采集
289
+ -----------------------------------
290
+ 关于NPU profiling的详细文档请参考 `ascend_profiling_zh <https://github.com/volcengine/verl/blob/main/docs/ascend_tutorial/ascend_profiling_zh.rst>`_
291
+
292
+ 在 `Qwen3-30B`_ 的脚本中提供了基本的采集性能选项PROF_CONFIG,默认设置 global_profiler.steps=null 关闭采集, 开发者可根据实际需要进行参数修改
293
+
294
+ 采集完成后,开发者可以使用 `MindStudio Insight <https://www.hiascend.com/document/detail/zh/mindstudio/830/GUI_baseddevelopmenttool/msascendinsightug/Insight_userguide_0002.html>`_ 进行数据解析
295
+
296
+ 注: verl框架侧进行采集全量 Profiling 产生海量且重复的算子记录,可以根据文档修改代码仅采集关键阶段
code/RL_model/verl/verl_train/docs/ascend_tutorial/examples/dapo_multi_model_optimization_practice.md ADDED
@@ -0,0 +1,324 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # DAPO 介绍
2
+
3
+ Last updated: 01/27/2026.
4
+
5
+ DAPO的论文可以参考:[DAPO](https://arxiv.org/pdf/2503.14476),其中包含以下几个关键技术。
6
+
7
+ * ​**Clip-Higher**​: 通过对重要性采样比的上限剪裁促进了系统的多样性并避免了熵坍缩(Entropy Collapse)。
8
+ * ​**Dynamic Sampling**​: 提高了训练效率和稳定性。DAPO出了一种执行动态采样的策略,并过滤掉准确率等于1和0的提示组,从而保持批次间具有有效梯度的提示数量一致。
9
+ * ​**Token-level Policy Gradient Loss**​: 在长链思维强化学习 (long-CoT RL) 场景中至关重要。
10
+ * ​**Overlong Reward Shaping**​: 减少奖励噪声并稳定了训练。
11
+
12
+ 在verl中,可以进行如下设置,从而进行DAPO算法的运行。
13
+
14
+ - **奖励模型的管理策略为 DAPO**
15
+ 在dapo算法中,必须配置成dapo。
16
+
17
+ ```
18
+ reward_model.reward_manager=dapo
19
+ ```
20
+
21
+ - **Clip-Higher 更高裁剪**
22
+ `clip_ratio_low` 和 `clip_ratio_high` 用于指定 DAPO 目标函数中的 $\varepsilon_{\text {low }}$ 和 $\varepsilon_{\text {high }}$。
23
+
24
+ ```
25
+ clip_ratio_low=0.2 # 裁剪比例下限,默认值为0.2
26
+ clip_ratio_high=0.28 # 裁剪比例上限,默认值为0.28
27
+ ```
28
+
29
+ - **动态采样的相关配置**
30
+ 将 `filter_groups.enable` 设置为 `True` 会过滤掉输出 `metric` 完全相同的组,例如对于 `acc` 指标,过滤掉输出准确率全部为 1 或 0 的组。
31
+ 训练器会使用 `gen_batch_size` 进行重复采样,直到生成足够数量的符合条件的组,或者达到 `max_num_gen_batches` 所指定的上限为止。
32
+
33
+ ```
34
+ data.gen_batch_size=${gen_prompt_bsz}
35
+ algorithm.filter_groups.enable=${enable_filter_groups} # 动态采样开关
36
+ algorithm.filter_groups.metric=${filter_groups_metric} # 使用准确率作为过滤标准
37
+ algorithm.filter_groups.max_num_gen_batches=${max_num_gen_batches} # 最大生成批次数量,最多重复生成数据的次数
38
+ ```
39
+
40
+ - **Token-level Loss**
41
+ 将 `loss_agg_mode` 设置为 `token-mean` 意味着计算一个批次中所有序列内所有 token 的(策略梯度)损失的平均值。
42
+
43
+ ```
44
+ actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode}
45
+ #注意:“token-mean”是默认行为。
46
+ ```
47
+
48
+ - **奖励模型对超长回答的惩罚配置**
49
+ 将 `overlong_buffer.enable` 设置为 `True` 将对输出长度过长但仍未超过硬上下文限制的输出进行惩罚。具体来说,当输出的长度超过 `max_response_length - overlong_buffer.len` 且超出 `0` 到 `overlong_buffer.len` 个 token 时,惩罚值会从 `0` 线性增加到 `overlong_buffer.penalty_factor`。
50
+
51
+ ```
52
+ reward_model.overlong_buffer.enable=${enable_overlong_buffer} # 启用超长缓冲区惩罚,开启对超长输出的惩罚机制
53
+ reward_model.overlong_buffer.len=${overlong_buffer_len} # 缓冲区长度,定义缓冲区的token数量
54
+ reward_model.overlong_buffer.penalty_factor=${overlong_penalty_factor} #惩罚因子,最大惩罚强度
55
+ ```
56
+
57
+ 相关参数涉及的代码可以参考:[Recipe: Decoupled Clip and Dynamic Sampling Policy Optimization (DAPO)](https://github.com/verl-project/verl-recipe/blob/main/dapo/README.md)
58
+
59
+ # 硬件要求
60
+
61
+ 当前支持Atlas 800T A3 与 Atlas 900 A3 SuperPoD。完成跑完本次最佳实践需要 2台Atlas 800T A3。关键软件版本可以参考:[Ascend Quickstart](https://github.com/volcengine/verl/blob/main/docs/ascend_tutorial/ascend_quick_start.rst)
62
+
63
+ # 模型训练
64
+
65
+ ## 数据集准备
66
+
67
+ Geometry3k 数据集是由加利福尼亚大学洛杉矶分校与浙江大学联合研发的几何领域专用数据集,核心面向视觉问答(VQA)任务展开研究与模型训练。该数据集总计包含 3002 个样本,采用图像和文本两种模态数据形式构建,其中文本模态涵盖各类几何问题描述,图像则以可视化图表呈现问题中的几何图形信息,包括三角形、圆形、四边形等基础几何形状,以及不同图形间的位置、嵌套、相交等关联关系。可以从Hugging Face库下载对应的原始数据集:[Geometry3k ](https://huggingface.co/datasets/hiyouga/geometry3k)
68
+
69
+ ```python
70
+ # 下载原始数据并预处理
71
+ python ./examples/data_preprocess/geo3k.py --local_dir=./data/geo3k
72
+ ```
73
+
74
+ ## 权重下载
75
+
76
+ 从Hugging Face库下载对应的模型权重:[Qwen3-VL-30B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Instruct/tree/main
77
+ )
78
+
79
+ ## 全局变量导入
80
+
81
+ - 为了确保 Ray 进程能够正常回收内存,需要安装并使能 jemalloc 库进行内存管理,用于更好管理内存,避免长跑过程中内存 OOM。
82
+
83
+ ```
84
+ # 根据实际安装路径设置 jemalloc 环境变量
85
+ export LD_PRELOAD=/usr/local/lib/libjemalloc.so.2
86
+ ```
87
+
88
+ - 某些模型是通过 vllm ascend 进行优化的。但在某些情况下,优化后的模型可能并不适用。此时,将此值设置为 0 即可禁用优化后的模型。
89
+
90
+ ```
91
+ export USE_OPTIMIZED_MODEL=0
92
+ ```
93
+
94
+ - 启用vLLM V1
95
+
96
+ ```
97
+ export VLLM_USE_V1=1
98
+ ```
99
+
100
+ - 昇腾多卡通信的兜底配置,延长连接超时时间,避免集群环境下训练启动因连接慢而失败
101
+
102
+ ```
103
+ export HCCL_CONNECT_TIMEOUT=5400
104
+ ```
105
+
106
+ - 控制 vLLM 在昇腾芯片上是否启用NZ优化
107
+
108
+ ```
109
+ export VLLM_ASCEND_ENABLE_NZ=0
110
+ ```
111
+
112
+ - 根据使用机器的情况,修改相关配置,例如双机 A2 可设置`trainer.nnodes`为 2、`trainer.n_gpus_per_node`为 8
113
+
114
+ ## 训练脚本
115
+
116
+ 基于以上修改,提供了示例配置文件,创建 run_dapo_qwen3_vl_30b.sh 文件。
117
+
118
+ ```bash
119
+ set -xeuo pipefail
120
+
121
+ export VLLM_USE_V1=1
122
+ export HCCL_CONNECT_TIMEOUT=5400
123
+ export VLLM_ASCEND_ENABLE_NZ=0
124
+ export LD_PRELOAD=/usr/local/lib/libjemalloc.so.2
125
+ # Some models are optimized by vllm ascend. While in some case, e.g. rlhf training,
126
+ # the optimized model may not be suitable. In this case, set this value to 0 to disable the optimized model.
127
+ export USE_OPTIMIZED_MODEL=0
128
+
129
+ project_name='DAPO'
130
+ exp_name='DAPO-Qwen3-vl-30B'
131
+
132
+ adv_estimator=grpo
133
+
134
+ use_kl_in_reward=False
135
+ kl_coef=0.0
136
+ use_kl_loss=False
137
+ kl_loss_coef=0.0
138
+
139
+ clip_ratio_low=0.2
140
+ clip_ratio_high=0.28
141
+
142
+ max_prompt_length=1024
143
+ max_response_length=2048
144
+ enable_overlong_buffer=False
145
+ overlong_buffer_len=$((1024 * 2))
146
+ overlong_penalty_factor=1.0
147
+
148
+ loss_agg_mode="token-mean"
149
+
150
+ enable_filter_groups=True
151
+ filter_groups_metric=acc
152
+ max_num_gen_batches=4
153
+ train_prompt_bsz=64
154
+ gen_prompt_bsz=$((train_prompt_bsz * 3))
155
+ n_resp_per_prompt=8
156
+ train_prompt_mini_bsz=16
157
+
158
+ # Ray
159
+ PWD=./
160
+ RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
161
+ WORKING_DIR=${WORKING_DIR:-"${PWD}"}
162
+ RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
163
+
164
+ # Paths
165
+ RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
166
+ MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen3-VL-30B-A3B-Instruct"}
167
+ CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}
168
+ TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/geo3k/train.parquet"}
169
+ TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/geo3k/test.parquet"}
170
+
171
+ # Algorithm
172
+ temperature=1.0
173
+ top_p=1.0
174
+ top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
175
+ val_top_p=0.7
176
+
177
+ # Performance Related Parameter
178
+ sp_size=8
179
+ use_dynamic_bsz=True
180
+ actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) / sp_size))
181
+ infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) / sp_size))
182
+ gen_tp=8
183
+ fsdp_size=16
184
+
185
+ ray job submit --no-wait --runtime-env="${RUNTIME_ENV}" \
186
+ --working-dir "${WORKING_DIR}" \
187
+ --address "${RAY_ADDRESS}" \
188
+ -- python3 -m recipe.dapo.main_dapo \
189
+ data.train_files="${TRAIN_FILE}" \
190
+ data.val_files="${TEST_FILE}" \
191
+ data.prompt_key=prompt \
192
+ data.truncation='left' \
193
+ data.max_prompt_length=${max_prompt_length} \
194
+ data.max_response_length=${max_response_length} \
195
+ data.gen_batch_size=${gen_prompt_bsz} \
196
+ data.train_batch_size=${train_prompt_bsz} \
197
+ actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
198
+ algorithm.adv_estimator=${adv_estimator} \
199
+ algorithm.use_kl_in_reward=${use_kl_in_reward} \
200
+ algorithm.kl_ctrl.kl_coef=${kl_coef} \
201
+ actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \
202
+ actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \
203
+ actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
204
+ actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
205
+ actor_rollout_ref.actor.clip_ratio_c=10.0 \
206
+ algorithm.filter_groups.enable=${enable_filter_groups} \
207
+ algorithm.filter_groups.max_num_gen_batches=${max_num_gen_batches} \
208
+ algorithm.filter_groups.metric=${filter_groups_metric} \
209
+ actor_rollout_ref.model.use_remove_padding=True \
210
+ actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
211
+ actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
212
+ actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
213
+ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \
214
+ actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2 \
215
+ actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=2 \
216
+ actor_rollout_ref.model.path="${MODEL_PATH}" \
217
+ actor_rollout_ref.model.enable_gradient_checkpointing=True \
218
+ actor_rollout_ref.actor.optim.lr=1e-6 \
219
+ actor_rollout_ref.actor.optim.lr_warmup_steps=10 \
220
+ actor_rollout_ref.actor.optim.weight_decay=0.1 \
221
+ actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
222
+ actor_rollout_ref.actor.fsdp_config.param_offload=False \
223
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
224
+ actor_rollout_ref.actor.use_torch_compile=False \
225
+ actor_rollout_ref.actor.entropy_coeff=0 \
226
+ actor_rollout_ref.actor.grad_clip=1.0 \
227
+ actor_rollout_ref.rollout.enforce_eager=True \
228
+ actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
229
+ actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \
230
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.70 \
231
+ actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
232
+ actor_rollout_ref.rollout.enable_chunked_prefill=True \
233
+ actor_rollout_ref.rollout.temperature=${temperature} \
234
+ actor_rollout_ref.rollout.top_p=${top_p} \
235
+ actor_rollout_ref.rollout.top_k="${top_k}" \
236
+ actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \
237
+ actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \
238
+ actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \
239
+ actor_rollout_ref.rollout.val_kwargs.do_sample=True \
240
+ actor_rollout_ref.rollout.val_kwargs.n=1 \
241
+ actor_rollout_ref.rollout.expert_parallel_size=8 \
242
+ actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \
243
+ actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
244
+ actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
245
+ actor_rollout_ref.rollout.name=vllm \
246
+ +actor_rollout_ref.rollout.engine_kwargs.vllm.disable_mm_preprocessor_cache=True \
247
+ actor_rollout_ref.actor.strategy=fsdp2 \
248
+ actor_rollout_ref.ref.strategy=fsdp2 \
249
+ critic.strategy=fsdp2 \
250
+ actor_rollout_ref.ref.fsdp_config.param_offload=True \
251
+ actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \
252
+ actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \
253
+ reward_model.reward_manager=dapo \
254
+ reward_model.overlong_buffer.enable=${enable_overlong_buffer} \
255
+ reward_model.overlong_buffer.len=${overlong_buffer_len} \
256
+ reward_model.overlong_buffer.penalty_factor=${overlong_penalty_factor} \
257
+ trainer.logger=console \
258
+ trainer.project_name="${project_name}" \
259
+ trainer.experiment_name="${exp_name}" \
260
+ trainer.n_gpus_per_node=8 \
261
+ trainer.nnodes=2 \
262
+ trainer.val_before_train=True \
263
+ trainer.test_freq=1 \
264
+ trainer.save_freq=20 \
265
+ trainer.resume_mode=auto \
266
+ trainer.device=npu \
267
+ trainer.total_epochs=30 \
268
+ trainer.total_training_steps=100 \
269
+ trainer.default_local_dir="${CKPTS_DIR}"
270
+ ```
271
+
272
+ # 优化参考
273
+
274
+ - **启动动态批次大小**
275
+ 根据单 GPU 的最大 Token 总数(ppo_max_token_len_per_gpu)动态调整批次大小
276
+
277
+ ```
278
+ actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz}
279
+ actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz}
280
+ actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz}
281
+ ```
282
+
283
+ - **单个 GPU 能处理的最大 Token 总数**
284
+ 当`use_dynamic_bsz=True`时,单 GPU 在一个微批次中能处理的最大 Token 数量
285
+
286
+ ```
287
+ actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len}
288
+ actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len}
289
+ actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len}
290
+ ```
291
+
292
+ - **单个 GPU 微批次大小**
293
+ 当`use_dynamic_bsz=True`时,框架会以该值为​初始批次大小​,再根据`ppo_max_token_len_per_gpu`向上 / 向下调整
294
+
295
+ ```
296
+ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2
297
+ actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2
298
+ actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=2
299
+ ```
300
+
301
+ - **启用 FSDP2 框架**
302
+ “将模型参数、梯度、优化器状态分片存储在不同 GPU 上”,避免单卡加载全量模型导致显存溢出。
303
+
304
+ ```
305
+ # 启用 FSDP2 框架
306
+ actor_rollout_ref.actor.strategy=fsdp2
307
+ actor_rollout_ref.ref.strategy=fsdp2
308
+ critic.strategy=fsdp2
309
+
310
+ # 仅用于 FSDP2:前向传播后重新分片以减少内存占用。
311
+ actor_rollout_ref.actor.fsdp_config.reshard_after_forward=True
312
+ # 仅用于 FSDP2:是否在模型前向传播后重新分片以节省内存。
313
+ actor_rollout_ref.ref.fsdp_config.reshard_after_forward=True
314
+ ```
315
+
316
+ - **启用专家并行配置**
317
+ 指定有多少个 GPU用于并行计算不同的专家网络
318
+
319
+ ```
320
+ # MoE 架构 Actor 模型的专家并行配置
321
+ actor_rollout_ref.rollout.expert_parallel_size=8
322
+ ```
323
+
324
+
code/RL_model/verl/verl_train/docs/ascend_tutorial/examples/gspo_optimization_practice.md ADDED
@@ -0,0 +1,233 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## NPU Qwen3-32B GSPO Optimization Practice
2
+
3
+ Last updated: 01/27/2026.
4
+
5
+ 本文章对应脚本地址:[qwen3_32b_gspo_npu](https://github.com/volcengine/verl/blob/main/examples/gspo_trainer/run_qwen3_32b_gspo_npu.sh)
6
+
7
+ ### 算法适配
8
+
9
+ GSPO通过将优化颗粒度从**token级**提升到**sequence级**,规避了GRPO会遇到的**方差急剧增大**导致训练不稳定的情况,增加了训练的稳定性,同时该算法也在一定程度上提升了算法的收敛速度。
10
+
11
+ 想要成功在verl仓库中成功调用到GSPO算法,需要进行如下的必要配置
12
+
13
+ ~~~python
14
+ # 核心算法配置
15
+ algorithm.adv_estimator=grpo \ # 使用GRPO优势估计器
16
+ algorithm.use_kl_in_reward=False \ # 不在奖励中添加KL惩罚
17
+ # GSPO策略损失模式
18
+ actor_rollout_ref.actor.policy_loss.loss_mode=gspo \ # 启用GSPO策略损失
19
+ # 极小裁剪范围(GSPO特色)
20
+ actor_rollout_ref.actor.clip_ratio_low=0.0003 \ # 裁剪下界,论文推荐值
21
+ actor_rollout_ref.actor.clip_ratio_high=0.0004 \ # 裁剪上界,论文推荐值
22
+ # KL配置(GSPO不使用KL loss)
23
+ actor_rollout_ref.actor.use_kl_loss=False \ # 禁用KL损失
24
+ actor_rollout_ref.actor.kl_loss_coef=0.0 \ # KL损失系数设为0
25
+ # 序列级损失聚合模式(GSPO核心)
26
+ actor_rollout_ref.actor.loss_agg_mode=seq-mean-token-mean \ # 序列级平均,GSPO论文推荐
27
+ # 批次配置
28
+ actor_rollout_ref.rollout.n=16 \ # 每个prompt生成16个响应(组采样)
29
+ ~~~
30
+
31
+ 一般选择入口函数为`verl.trainer.main_ppo`
32
+
33
+ ### 性能调优
34
+
35
+ 优化从训练、推理、调度和其他四个方面入手。
36
+
37
+ #### 训练
38
+
39
+ ##### 动态bsz
40
+
41
+ ~~~bash
42
+ actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) / sp_size))
43
+ infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) / sp_size))
44
+ ~~~
45
+
46
+ **这个优化点主要调整上面这两个参数,不过需要注意这两个参数调整的太大会导致OOM**
47
+
48
+ **主要调整**`actor_ppo_max_token_len`,调大了会降低训练的耗时,调整`infer_ppo_max_token_len`没有明显的收益,可以不动
49
+
50
+ **这两个参数的作用介绍如下:**
51
+
52
+ **这两个参数用于控制动态批处理(dynamic batch size)模式下每个GPU处理的最大token数量**
53
+
54
+ - **`actor_ppo_max_token_len`**: Actor模型在PPO更新(前向+反向传播)时每个GPU能处理的最大token数
55
+ - **`infer_ppo_max_token_len`**: 推理阶段(Reference policy和Rollout)计算log概率时每个GPU能处理的最大token数
56
+
57
+ #### 推理
58
+
59
+ ##### ACLgraph+FULL_DECODE_ONLY
60
+
61
+ 推理算子下发方面的优化,平均能有`15%~20%`左右的性能收益。
62
+
63
+ 先看单开**ACLgraph**,如下:
64
+
65
+ ~~~bash
66
+ # 开启ACLgraph+FULL_DECODE_ONLY(注意:当设置此参数为False时,TASK_QUEUE_ENABLE必须设置为1,不然会报错)
67
+ actor_rollout_ref.rollout.enforce_eager=False
68
+ actor_rollout_ref.rollout.engine_kwargs.vllm.compilation_config.cudagraph_capture_sizes='[8,16,32,64,128]' \
69
+ actor_rollout_ref.rollout.engine_kwargs.vllm.compilation_config.cudagraph_mode='FULL_DECODE_ONLY' \
70
+ ~~~
71
+
72
+ `FULL_DECODE_ONLY`开启成功后有如下输出:
73
+
74
+ ![FULL_DECODE_ONLY result](https://github.com/wucong25/verl-data/blob/main/ascend_acl_graph.png)
75
+
76
+ **`cudagraph_capture_sizes`参数设置指南**
77
+
78
+ cudagraph_capture_sizes设置的值对应的是批大小,这里的批大小不是配置里的DP域对应的那个批次大小,这里是相较于vllm来说的批大小,单位为**token**
79
+
80
+ 默认生成的算法如下,可做参考
81
+
82
+ ![cudagraph_capture_sizes](https://github.com/wucong25/verl-data/blob/main/ascend_set_cudagraph_sizes.png)
83
+
84
+ ##### 推理后端切换
85
+
86
+ 使用方式:`export VLLM_ATTENTION_BACKEND=XFORMERS`
87
+
88
+ ![VLLM_ATTENTION_BACKEND](https://github.com/wucong25/verl-data/blob/main/ascend_vllm_attn_backend.png)
89
+
90
+ 注:需要注意某些后端在一些比较老的vllm-ascend版本内并不支持
91
+
92
+ ##### 使能vllm v1版本
93
+
94
+ 使用方式:`export VLLM_USE_V1=1`
95
+
96
+ 可以常开,一般都是正收益。
97
+
98
+ #### 调度
99
+
100
+ ##### AIV
101
+
102
+ 打开方式:设置`export HCCL_OP_EXPANSION_MODE="AIV"`
103
+
104
+ HCCL_OP_EXPANSION_MODE环境变量用于配置通信算法的编排展开位置,支持如下取值:
105
+
106
+ - AI_CPU:代表通信算法的编排展开位置在Device侧的AI CPU计算单元。
107
+ - AIV:代表通信算法的编排展开位置在Device侧的Vector Core计算单元。
108
+ - HOST:代表通信算法的编排展开位置为Host侧CPU,Device侧根据硬件型号自动选择相应的调度器。
109
+ - HOST_TS:代表通信算法的编排展开位置为Host侧CPU,Host向Device的Task Scheduler下发任务,Device的Task Scheduler进行任务调度执行。
110
+
111
+ 下面介绍两种展开机制
112
+
113
+ ###### HOST展开
114
+
115
+ <img src="https://github.com/wucong25/verl-data/blob/main/ascend_task_queue1.png" alt="image-20260113194257095" style="zoom:50%;" />
116
+
117
+ - 软件栈工作在hostcpu,通信算法展开一个个task
118
+ - 每个task调用runtime接口,下发到device的rtsqueue
119
+ - STARS从rtsqueue上顺序拿取task
120
+ - 根据task类型分别调用掉SDMA和RDMA引擎。
121
+ **单算子瓶颈**:hostbound 每个task提交是2~5us,一个通信算子有几百个task,单算子场景不会在device上缓存,下发一个执行一个
122
+
123
+ ###### AICpu机制展开
124
+
125
+ <img src="https://github.com/wucong25/verl-data/blob/main/ascend_task_queue3.png" alt="image-20260113194333218" style="zoom:50%;" />
126
+
127
+ - host侧不下发一个个task,把通信算子作为一个个kernel,放在通信算子kernel的队列上去。
128
+ - STARS调度kernel队列流上的kernel,把kernel放到AiCPU上去执行。
129
+ - AICPU调用函数(kernel),用一个线程执行kernel 函数,在函数内把通信task展开,把task放到rtsqueue上,STARS调用。
130
+ - 降低host和aicpu交互,由几百次降低为一次。
131
+ - task的提交在AICPU上提交,做了提交的部分合并。
132
+
133
+ ##### TASK_QUEUE_ENABLE
134
+
135
+ **使用方式:**`export TASK_QUEUE_ENABLE=2`
136
+
137
+ TASK_QUEUE_ENABLE,下发优化,图模式设置为1(即开启图模式的时候这个要设置为1),非图模式设置为2
138
+
139
+ 示意图:
140
+
141
+ ![ascend task queue](https://github.com/wucong25/verl-data/blob/main/ascend_task_queue2.png)
142
+
143
+ ##### 绑核优化
144
+
145
+ **使用方式:**`export CPU_AFFINITY_CONF=1`
146
+
147
+ 详细设置原理可看:https://www.hiascend.com/document/detail/zh/Pytorch/600/ptmoddevg/trainingmigrguide/performance_tuning_0059.html
148
+
149
+ #### 其他
150
+
151
+ 以下内容汇总了若干全局环境变量的调优配置。由于这些参数在训练阶段与推理阶段往往都能带来正向收益,且目前尚缺乏足够精细的消融实验来严格区分它们各自对训练或推理的贡献占比,故统一归拢在此,供后续持续监控与进一步拆解分析。
152
+
153
+ ##### 使能jemalloc
154
+
155
+ 使用方式(注意需要先安装jemalloc库):`export LD_PRELOAD=/usr/local/lib/libjemalloc.so.2`
156
+
157
+ **安装使用教程:**[MindSpeed-RL/docs/install_guide.md · Ascend/MindSpeed-RL - AtomGit | GitCode](https://gitcode.com/Ascend/MindSpeed-RL/blob/master/docs/install_guide.md#高性能内存库-jemalloc-安装)
158
+
159
+ ##### 多流复用
160
+
161
+ 内存方面有优化
162
+
163
+ 使能方式:`export MULTI_STREAM_MEMORY_REUSE=1`
164
+
165
+ 原理介绍:https://www.hiascend.com/document/detail/zh/Pytorch/600/ptmoddevg/trainingmigrguide/performance_tuning_0040.html
166
+
167
+ ##### VLLM_ASCEND_ENABLE_FLASHCOMM
168
+
169
+ 使用方式:`export VLLM_ASCEND_ENABLE_FLASHCOMM=1`
170
+
171
+ 启用昇腾 NPU 特有的FLASHCOMM高速通信优化技术
172
+
173
+ 地址:https://vllm-ascend.readthedocs.io/zh-cn/latest/user_guide/release_notes.html
174
+
175
+ ##### VLLM_ASCEND_ENABLE_DENSE_OPTIMIZE
176
+
177
+ 使用方式:`export VLLM_ASCEND_ENABLE_DENSE_OPTIMIZE=1`
178
+
179
+ 启用昇腾 NPU针对大模型推理的稠密计算优化
180
+
181
+ 地址:https://vllm-ascend.readthedocs.io/zh-cn/latest/user_guide/release_notes.html
182
+
183
+ ##### VLLM_ASCEND_ENABLE_PREFETCH_MLP
184
+
185
+ 使用方式:`export VLLM_ASCEND_ENABLE_PREFETCH_MLP=1`
186
+
187
+ 启用 MLP 层的权重预取机制
188
+
189
+ <img src="https://github.com/wucong25/verl-data/blob/main/ascend_prefetch.png" alt="image-20251124173132677" style="zoom:50%;" />
190
+
191
+ ##### verl框架参数设置
192
+
193
+ 主要是内存方面的一些设置开关(注意,这个里面的优化都或多或少会导致吞吐量有一定程度的劣化)
194
+
195
+ ~~~bash
196
+ # 梯度检查点 (Gradient Checkpointing)
197
+ # 作用: 通过重新计算激活值来节省显存,以计算换内存。在前向传播时不保存中间激活值,反向传播时重新计算,可以显著降低显存占用,允许使用更大的batch size。
198
+ actor_rollout_ref.model.enable_gradient_checkpointing=True
199
+
200
+ # 参数卸载 (Parameter Offload)
201
+ # 作用: 将模型参数卸载到CPU内存,训练时再加载回GPU。
202
+ actor_rollout_ref.actor.fsdp_config.param_offload=${offload} # True
203
+ actor_rollout_ref.ref.fsdp_config.param_offload=${offload} # True
204
+
205
+ # 优化器状态卸载 (Optimizer Offload)
206
+ # 作用: 将优化器状态(如Adam的动量)卸载到CPU。优化器状态通常占用大量显存(对于Adam,每个参数需要额外8字节),卸载可以节省显存。
207
+ actor_rollout_ref.actor.fsdp_config.optimizer_offload=${offload} # True
208
+
209
+ # 释放推理引擎缓存 (Free Cache Engine)
210
+ # 作用: 在训练阶段释放推理引擎的KV cache和权重。这是3D-HybridEngine的核心优化,允许在同一GPU上交替进行推理和训练,显著降低显存需求。
211
+ actor_rollout_ref.rollout.free_cache_engine=True
212
+
213
+ # 熵计算优化
214
+ # entropy_checkpointing: 在训练时对熵计算启用重计算,降低显存峰值
215
+ # entropy_from_logits_with_chunking: 分块处理logits张量(如2048 tokens一组),避免一次性加载整个[bsz*seq_len, vocab]张量
216
+ actor_rollout_ref.actor.entropy_checkpointing=True
217
+ actor_rollout_ref.ref.entropy_checkpointing=True
218
+ actor_rollout_ref.actor.entropy_from_logits_with_chunking=True
219
+ actor_rollout_ref.ref.entropy_from_logits_with_chunking=True
220
+
221
+ # 推理引擎显存配置
222
+ # gpu_memory_utilization: 控制vLLM使用的GPU显存比例(0.90 = 90%)
223
+ # enforce_eager=False: 启用CUDA graphs加速推理,但会占用额外显存
224
+ actor_rollout_ref.rollout.gpu_memory_utilization=0.90
225
+ actor_rollout_ref.rollout.enforce_eager=False
226
+ ~~~
227
+
228
+ ### NPU调优参考文章
229
+
230
+ 环境变量相关:[环境变量列表-Ascend Extension for PyTorch6.0.0-昇腾社区](https://www.hiascend.com/document/detail/zh/Pytorch/600/apiref/Envvariables/Envir_001.html)
231
+
232
+ 社区性能调优教程:[性能调优流程-Ascend Extension for PyTorch6.0.0-昇腾社区](https://www.hiascend.com/document/detail/zh/Pytorch/600/ptmoddevg/trainingmigrguide/performance_tuning_0001.html)
233
+