diff --git a/code/RL_model/verl/verl_train/docs/_static/custom.css b/code/RL_model/verl/verl_train/docs/_static/custom.css new file mode 100644 index 0000000000000000000000000000000000000000..32f08475754bc280bca407d1643ec3aa68eeacf3 --- /dev/null +++ b/code/RL_model/verl/verl_train/docs/_static/custom.css @@ -0,0 +1,217 @@ +/* Make the documentation use full screen width */ +.wy-nav-content { + max-width: none !important; + width: 100% !important; + padding: 1.618em 3.236em !important; +} + +/* Adjust the content wrapper - will be set by JavaScript */ +.wy-nav-content-wrap { + margin-left: 300px; + transition: margin-left 0.2s ease; + width: auto !important; + position: relative !important; + background: white !important; + min-height: 100vh !important; +} + +/* Make the main content area responsive */ +.rst-content { + max-width: none !important; + width: 100% !important; +} + +/* Optional: Adjust table widths to prevent overflow */ +.rst-content table.docutils { + width: 100% !important; + table-layout: auto !important; +} + +/* Optional: Better code block width handling */ +.rst-content .highlight { + width: 100% !important; +} + +/* Content area positioning already handled above */ + +/* Optional: Improve readability with some margin on very wide screens */ +@media (min-width: 1400px) { + .wy-nav-content { + max-width: none !important; + margin: 0 auto !important; + } +} + +/* Resizable sidebar styles */ +.wy-nav-side { + position: fixed !important; + top: 0 !important; + bottom: 0 !important; + left: 0 !important; + width: 300px; + min-width: 200px; + max-width: 600px; + display: flex; + flex-direction: column; + z-index: 200 !important; +} + +/* Ensure sidebar header (logo, search) adapts to width */ +.wy-side-nav-search { + width: 100% !important; + box-sizing: border-box !important; + padding: 0.809em 0.809em !important; +} + +.wy-side-nav-search input[type="text"] { + width: 100% !important; + box-sizing: border-box !important; +} + +/* Make logo/title 
area responsive */ +.wy-side-nav-search > div.version { + width: 100% !important; +} + +.wy-side-nav-search > a { + width: 100% !important; + display: block !important; + white-space: nowrap !important; + overflow: hidden !important; + text-overflow: ellipsis !important; +} + +/* Responsive adjustments for narrow sidebar */ +@media (max-width: 300px) { + .wy-side-nav-search > a { + font-size: 0.9em !important; + } + + .wy-side-nav-search input[type="text"] { + font-size: 0.8em !important; + } +} + +/* Ensure search input doesn't overflow */ +.wy-side-nav-search form { + width: 100% !important; + margin: 0 !important; +} + +/* Make search icon responsive */ +.wy-side-nav-search .wy-dropdown { + width: 100% !important; +} + +/* Adjust search results dropdown width */ +.wy-side-nav-search .wy-dropdown-menu { + width: 100% !important; + max-width: none !important; + left: 0 !important; + right: 0 !important; +} + +/* Resize handle is created by JavaScript */ + +/* Make sure the sidebar content doesn't overflow */ +.wy-side-scroll { + width: 100% !important; + flex: 1 !important; + overflow-y: auto !important; + overflow-x: hidden !important; + padding-right: 10px !important; + box-sizing: border-box !important; + scroll-behavior: auto !important; /* Prevent smooth scrolling on sidebar itself */ +} + +/* Ensure proper scroll behavior for main content area */ +html { + scroll-behavior: smooth !important; +} + +/* Ensure anchor links work properly in main content */ +.wy-nav-content-wrap { + scroll-behavior: smooth !important; +} + +/* Fix scroll to target for anchor links */ +.rst-content { + scroll-behavior: smooth !important; +} + +/* Fix anchor scroll offset to account for fixed header */ +.rst-content .section { + scroll-margin-top: 60px; +} + +/* Fix anchor scroll offset for headers */ +.rst-content h1, .rst-content h2, .rst-content h3, .rst-content h4, .rst-content h5, .rst-content h6 { + scroll-margin-top: 60px; +} + +/* Fix anchor scroll offset for specific 
scroll targets */ +.rst-content .headerlink { + scroll-margin-top: 60px; +} + +/* Fix sidebar navigation styling */ +.wy-menu-vertical { + width: 100% !important; +} + +.wy-menu-vertical li { + width: 100% !important; +} + +.wy-menu-vertical a { + width: 100% !important; + word-wrap: break-word !important; + white-space: normal !important; +} + +/* Content area margin is handled by JavaScript */ + +/* Custom drag handle (more visible) */ +.resize-handle { + position: absolute; + top: 0; + right: 0; + width: 8px; + height: 100%; + background: #ccc; + cursor: col-resize; + z-index: 1001; + opacity: 0.3; + transition: opacity 0.2s ease; +} + +.resize-handle:hover { + opacity: 0.8; + background: #999; +} + +.resize-handle::before { + content: ''; + position: absolute; + top: 50%; + left: 50%; + width: 2px; + height: 20px; + background: #666; + transform: translate(-50%, -50%); + border-radius: 1px; +} + +.resize-handle:hover::before { + background: #333; +} + +/* Ensure smooth resizing */ +.wy-nav-side.resizing { + user-select: none; + pointer-events: none; +} + +.wy-nav-side.resizing .wy-side-scroll { + overflow: hidden; +} \ No newline at end of file diff --git a/code/RL_model/verl/verl_train/docs/_static/js/resizable-sidebar.js b/code/RL_model/verl/verl_train/docs/_static/js/resizable-sidebar.js new file mode 100644 index 0000000000000000000000000000000000000000..2a51fa90043bb0ecf78149b092fd3447740fdaee --- /dev/null +++ b/code/RL_model/verl/verl_train/docs/_static/js/resizable-sidebar.js @@ -0,0 +1,251 @@ +// Resizable sidebar functionality +document.addEventListener('DOMContentLoaded', function() { + const sidebar = document.querySelector('.wy-nav-side'); + const content = document.querySelector('.wy-nav-content-wrap'); + + if (!sidebar || !content) return; + + // Create resize handle + const resizeHandle = document.createElement('div'); + resizeHandle.className = 'resize-handle'; + sidebar.appendChild(resizeHandle); + + let isResizing = false; + let startX = 0; + 
let startWidth = 0; + + // Get initial width + const getInitialWidth = () => { + return 300; // Default width + }; + + // Save width to localStorage + const saveWidth = (width) => { + localStorage.setItem('sidebar-width', width); + }; + + // Load width from localStorage + const loadWidth = () => { + const savedWidth = localStorage.getItem('sidebar-width'); + if (savedWidth) { + const width = parseInt(savedWidth, 10); + if (width >= 200 && width <= 600) { + return width; + } + } + return getInitialWidth(); + }; + + // Apply width to sidebar and content + const applyWidth = (width) => { + // Update sidebar width + sidebar.style.width = width + 'px'; + + // Update content margin with !important to override any CSS + content.style.setProperty('margin-left', width + 'px', 'important'); + + // Also update any other content wrapper that might exist + const contentInner = document.querySelector('.wy-nav-content'); + if (contentInner) { + contentInner.style.setProperty('margin-left', '0px', 'important'); + } + + // Force reflow and repaint + sidebar.offsetHeight; + content.offsetHeight; + + // Trigger window resize event to notify other components + window.dispatchEvent(new Event('resize')); + }; + + // Initialize with saved width + const initialWidth = loadWidth(); + applyWidth(initialWidth); + + // Mouse down on resize handle + resizeHandle.addEventListener('mousedown', (e) => { + isResizing = true; + startX = e.clientX; + startWidth = parseInt(window.getComputedStyle(sidebar).width, 10); + + sidebar.classList.add('resizing'); + document.body.style.cursor = 'col-resize'; + document.body.style.userSelect = 'none'; + + // Add overlay to prevent iframe issues + const overlay = document.createElement('div'); + overlay.style.cssText = ` + position: fixed; + top: 0; + left: 0; + width: 100%; + height: 100%; + z-index: 9999; + cursor: col-resize; + `; + overlay.id = 'resize-overlay'; + document.body.appendChild(overlay); + + e.preventDefault(); + }); + + // Mouse move + 
document.addEventListener('mousemove', (e) => { + if (!isResizing) return; + + const width = startWidth + e.clientX - startX; + const clampedWidth = Math.max(200, Math.min(600, width)); + applyWidth(clampedWidth); + }); + + // Mouse up + document.addEventListener('mouseup', () => { + if (!isResizing) return; + + isResizing = false; + sidebar.classList.remove('resizing'); + document.body.style.cursor = ''; + document.body.style.userSelect = ''; + + // Remove overlay + const overlay = document.getElementById('resize-overlay'); + if (overlay) { + overlay.remove(); + } + + // Save the current width + const currentWidth = parseInt(window.getComputedStyle(sidebar).width, 10); + saveWidth(currentWidth); + }); + + // Handle window resize - removed to prevent infinite loop + // The sidebar width is fixed and managed by drag functionality, no need to recalculate on window resize + + // Double-click to reset to default width + resizeHandle.addEventListener('dblclick', () => { + const defaultWidth = 300; + applyWidth(defaultWidth); + saveWidth(defaultWidth); + }); +}); + +// Fix navigation issues - Using MutationObserver for reliable initialization +document.addEventListener('DOMContentLoaded', function() { + let navigationFixed = false; + + function setupNavigationFix() { + if (navigationFixed) return; + + // Find all links in the sidebar + const sidebarLinks = document.querySelectorAll('.wy-menu-vertical a'); + + // Only proceed if we have sidebar links + if (sidebarLinks.length === 0) return; + + console.log('Setting up navigation fix...'); + + sidebarLinks.forEach(function(link) { + const href = link.getAttribute('href'); + + // Clone the link to remove all existing event listeners + const newLink = link.cloneNode(true); + + // Add our own click handler + newLink.addEventListener('click', function(e) { + console.log('Link clicked:', href); + + // If it's an anchor link within the same page + if (href && href.startsWith('#') && href !== '#') { + e.preventDefault(); + 
e.stopPropagation(); + + const targetId = href.substring(1); + const targetElement = document.getElementById(targetId); + + if (targetElement) { + // Calculate offset for fixed header + const headerHeight = 60; + const elementPosition = targetElement.getBoundingClientRect().top; + const offsetPosition = elementPosition + window.pageYOffset - headerHeight; + + window.scrollTo({ + top: offsetPosition, + behavior: 'smooth' + }); + + // Update URL hash + if (history.pushState) { + history.pushState(null, null, '#' + targetId); + } else { + location.hash = '#' + targetId; + } + } + } + // For external links, navigate normally + else if (href && !href.startsWith('#') && !href.startsWith('javascript:')) { + console.log('Navigating to external link:', href); + window.location.href = href; + } + }); + + // Replace the old link with the new one + link.parentNode.replaceChild(newLink, link); + }); + + navigationFixed = true; + + // Handle initial page load with hash + if (window.location.hash) { + // Use requestAnimationFrame for better timing + requestAnimationFrame(() => { + const targetId = window.location.hash.substring(1); + const targetElement = document.getElementById(targetId); + if (targetElement) { + const headerHeight = 60; + const elementPosition = targetElement.getBoundingClientRect().top; + const offsetPosition = elementPosition + window.pageYOffset - headerHeight; + + window.scrollTo({ + top: offsetPosition, + behavior: 'smooth' + }); + } + }); + } + } + + // Try to set up navigation fix immediately + setupNavigationFix(); + + // If it didn't work, use MutationObserver to watch for when sidebar links are added + if (!navigationFixed) { + const observer = new MutationObserver(function(mutations) { + mutations.forEach(function(mutation) { + if (mutation.type === 'childList' && mutation.addedNodes.length > 0) { + // Check if sidebar links were added + const sidebarLinks = document.querySelectorAll('.wy-menu-vertical a'); + if (sidebarLinks.length > 0) { + 
setupNavigationFix(); + if (navigationFixed) { + observer.disconnect(); + } + } + } + }); + }); + + // Start observing the document for changes + observer.observe(document.body, { + childList: true, + subtree: true + }); + + // Fallback timeout in case MutationObserver doesn't work + setTimeout(function() { + if (!navigationFixed) { + setupNavigationFix(); + } + observer.disconnect(); + }, 5000); + } +}); \ No newline at end of file diff --git a/code/RL_model/verl/verl_train/docs/_static/js/runllm-widget.js b/code/RL_model/verl/verl_train/docs/_static/js/runllm-widget.js new file mode 100644 index 0000000000000000000000000000000000000000..bec345cacc5b943693e1bf1973a7a6d863b0d85e --- /dev/null +++ b/code/RL_model/verl/verl_train/docs/_static/js/runllm-widget.js @@ -0,0 +1,14 @@ +document.addEventListener("DOMContentLoaded", function () { + var script = document.createElement("script"); + script.type = "module"; + script.id = "runllm-widget-script"; + script.src = "https://widget.runllm.com"; + script.setAttribute("version", "stable"); + script.setAttribute("crossorigin", "true"); + script.setAttribute("runllm-keyboard-shortcut", "Mod+j"); + script.setAttribute("runllm-name", "verl Chatbot"); + script.setAttribute("runllm-position", "TOP_RIGHT"); + script.setAttribute("runllm-assistant-id", "679"); + script.async = true; + document.head.appendChild(script); + }); \ No newline at end of file diff --git a/code/RL_model/verl/verl_train/docs/_static/logo.png b/code/RL_model/verl/verl_train/docs/_static/logo.png new file mode 100644 index 0000000000000000000000000000000000000000..6a3b61308d73eb72071eaec8c95544ab36cd3970 Binary files /dev/null and b/code/RL_model/verl/verl_train/docs/_static/logo.png differ diff --git a/code/RL_model/verl/verl_train/docs/advance/agent_loop.rst b/code/RL_model/verl/verl_train/docs/advance/agent_loop.rst new file mode 100644 index 0000000000000000000000000000000000000000..013ec9ed887924138c92d3bf12d94dd035ad5301 --- /dev/null +++ 
b/code/RL_model/verl/verl_train/docs/advance/agent_loop.rst @@ -0,0 +1,238 @@ +Agent Loop +========== + +Last updated: 07/17/2025. + +.. versionadded:: 0.4.2 + [status: alpha] + +.. warning:: + Agent Loop is ready for use, but the API may change in future releaes. + +Agent Loop is designed as general interface for multi-turn rollout and agentic reinforcement learning. + +**Design goal**: + +- Plugable user defined agent loop +- Provide standard request generate api with different inference frameworks +- Provide request level load balance between multiple inference servers + +**Non-goal**: + +- How tool is defined and how to call tool + +In high level overview, agent loop is given a prompt, run user defined loop: call LLM generate api, call tools, ... +and return the final output. The final output is then calculated reward and used as trajectory for RL training. + +.. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/agent_loop_overview.svg?raw=true + + +API Design +---------- + +``AgentLoopBase`` class is the abstraction of agent loop, and ``run`` method is the only interface that user need to implement. +The run method, given prompt messages in format: [{"role": "user"}, {"content": "..."}], and additional sampling params, +could do whatever user wants, such as + +- call LLM generate api +- call tools: web search, database query, code sandbox, ... +- environment interaction +- reflection +- ... + +.. code:: python + + class AgentLoopBase(ABC): + @abstractmethod + async def run(self, sampling_params: dict[str, Any], **kwargs) -> AgentLoopOutput: + """Run agent loop to interact with LLM server and environment. + + Args: + sampling_params (Dict[str, Any]): LLM sampling params. + **kwargs: dataset fields from `verl.utils.dataset.RLHFDataset`. + + Returns: + AgentLoopOutput: Agent loop output. 
+ """ + raise NotImplementedError + +After running user defined loop, run method should return ``AgentLoopOutput``, including prompt token ids, +response token ids, and response mask. + +.. code:: python + + class AgentLoopOutput(BaseModel): + """Agent loop output.""" + + prompt_ids: list[int] + """Prompt token ids.""" + response_ids: list[int] + """Response token ids including LLM generated token, tool response token.""" + response_mask: list[int] + """Response mask, 1 for LLM generated token, 0 for tool response token.""" + +.. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/agent_loop_output.svg?raw=true + +.. note:: AgentLoopOutput only output one trajectory for a given prompt, multiple trajectories output is still under discussion. + +Architecture Design +------------------- + +.. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/agent_loop_architecture.png?raw=true + +A single PPO step contain two phase: rollout and train. In rollout phase: + +1. PPOTrainer sample a batch from dataset and call ``AgentLoopManager.generate_sequences``. +2. AgentLoopManager ``wake_up`` all async LLM server instances, which will sync weights between inference engine(vLLM/SGLang) and training engine(FSDP/Megatron-LM). +3. AgentLoopManager split batch into chunks and send each chunk to ``AgentLoopWorker``. +4. AgentLoopWorker receive chunk and for each prompt, spawn a user defined ``AgentLoopBase`` instance, run ``run`` coroutine until end and get ``AgentLoopOutput``. + +.. tip:: + AgentLoopWorker schedules multiple coroutines concurrently. If number of AgentLoopWorker equals batch_size, then each worker is response for one prompt. + +In agent loop, when user need LLM generate response: + +5. Call ``AsyncLLMServerManager.generate`` with prompt_ids. +6. AsyncLLMServerManager select a server instance with least request in first turn and send request to it. (In following turns, the request will be sent to the same server instance). +7. 
AsyncLLMServer receive a request, issue ipc/rpc with model_runner, and generate response. (There's slight differences between vLLM and SGLang, see below). + +When all prompts in all AgentLoopWorker finish, AgentLoopManager gather results and return to PPOTrainer. + +8. AgentLoopManager ``sleep`` all server instances, which will free kv cache and offload weights to CPU memory. + +AsyncLLMServer +~~~~~~~~~~~~~~ + +AsyncLLMServer is the abstraction of LLM server with two types of generation api: + +- `OpenAI chat completion `_: generate response for the given chat conversation. +- Token in token out: generate response ids for the given token ids. + +We have officially supported vLLM and SGLang AsyncLLMServer, both of them implement the two api and are well tested. +Other inference engine should be easy to plug-in by implement the ``AsyncServerBase`` class. + +.. code:: python + + class AsyncServerBase(ABC): + @abstractmethod + async def chat_completion(self, raw_request: Request) -> JSONResponse: + """OpenAI chat completion API. + + Args: + raw_request (Request): raw json request + + Returns: + JSONResponse: json response + + API reference: https://platform.openai.com/docs/api-reference/chat/create + """ + raise NotImplementedError + + @abstractmethod + async def generate(self, prompt_ids: list[int], sampling_params: dict[str, Any], request_id: str) -> list[int]: + """Generate response ids given prompt ids. + + Args: + prompt_ids (List[int]): prompt ids + sampling_params (Dict[str, Any]): sampling params + request_id (str): request id + + Returns: + List[int]: response ids + """ + raise NotImplementedError + + +Chat completion vs Token in token out +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. warning:: + The following conclusion is based on our recent experience and is still open to investigation and discussion. + +Almost all agent frameworks (LangGraph, CrewAI, LlamaIndex, etc) call LLM with OpenAI chat completion api, and +keep chat history as messages. 
So user may expect that we should use the chat completion api in multi-turn rollout. + +But based on our recent experience on single-turn training on DAPO and multi-turn training on `retool `_, +we found the token_ids from apply the final messages may not equal to the token_ids by concat prompt_ids and response_ids in each turn. + +.. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/multi_turn.png?raw=true + +**Where does this inconsistency happened?** + +First, the tool parser may alter the content. For example + +.. code:: json + + {"role": "assistant", "content": "Let me call a ... and get the result"} + +After tool_calls extraction, the messages is like this: + +.. code:: json + + {"role": "assistant", "content": "Let me call a and get the result", "tool_calls": [{"name": "foo", "arguments": "{}"}]} + +Encode the extracted message back is not equal to the original LLM generated response_ids. + +Second, the `decode-encode` may also lead to inconsistency: `Agent-R1 issue#30 `_. + +**What is the impact of this inconsistency?** + +This inconsistency is not a big problem for serving/agent system, but is critical to RL training. +It causes the trajectory deviate from the policy model distribution. We have observed that apply_chat_template +to the final chat history messages make PPO training not even converged in single-turn. + +vLLM +^^^^ + +.. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/async_vllm.png?raw=true + +For vLLM, the Async LLM Engine is running in same process as the server, and ModelRunner is running in same process as FSDP/Megatron-LM workers. +Async LLM Engine communicate with ModelRunner through ZeroMQ. When server receive a request, it directly call engine to generate response_ids. + +SGLang +^^^^^^ + +.. 
image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/async_sglang.png?raw=true + +For SGLang, the Async LLM Engine is running in same process as FSDP/Megatron-LM worker-0, and it spawn multiple subprocesses as ModelRunner. +Also, Async LLM Engine communicate with ModelRunner through ZeroMQ. When server receive a request, it remote call the worker-0 and get response_ids. + +AsyncLLMServerManager +~~~~~~~~~~~~~~~~~~~~~ + +AsyncLLMServerManager serve as proxy to multiple AsyncLLMServer instances, provides: + +- load balance: select a server instance with least request in first turn and send request to it. +- sticky session: bind request_id to server instance, so that the same request_id will be sent to the same server instance in following turns. + +AsyncLLMServerManager is passed to ``AgentLoopBase.__init__``, whenever user want to interact with LLM in agent loop, +they can call ``AsyncLLMServerManager.generate`` to generate response_ids. + +.. code:: python + + class AsyncLLMServerManager: + async def generate( + self, + request_id, + *, + prompt_ids: list[int], + sampling_params: dict[str, Any], + ) -> list[int]: + """Generate tokens from prompt ids. + + Args: + request_id (str): request id for sticky session. + prompt_ids (List[int]): List of prompt token ids. + sampling_params (Dict[str, Any]): Sampling parameters for the chat completion. + + Returns: + List[int]: List of generated token ids. + """ + ... + +Next +---- + +- :doc:`Agentic RL Training<../start/agentic_rl>`: Quick start agentic RL training with gsm8k dataset. +- `LangGraph MathExpression `_: Demonstrate how to use LangGraph to build agent loop. +- `Retool `_: End-to-end retool paper reproduction using tool agent. 
diff --git a/code/RL_model/verl/verl_train/docs/advance/async-on-policy-distill.md b/code/RL_model/verl/verl_train/docs/advance/async-on-policy-distill.md new file mode 100644 index 0000000000000000000000000000000000000000..55b8d392206c94968d6ade5a29ce82eb8d267c8f --- /dev/null +++ b/code/RL_model/verl/verl_train/docs/advance/async-on-policy-distill.md @@ -0,0 +1,242 @@ +# Recipe: Async On-Policy Knowledge Distillation Trainer + +**Authors:** Brilliant Hanabi, furunding + +**Last updated:** 2025-11-08 + +## 1. Background + +On-policy knowledge distillation (KD) trains a student policy to imitate a stronger teacher using samples drawn from the student's current policy. For each on-policy rollout the teacher returns soft, top-k token distributions and the student is optimized with a token-wise sparse KL objective that focuses learning on the teacher's high-probability modes. Because training examples come from the student's own state distribution, KD reduces distributional mismatch relative to off-policy distillation or supervised fine-tuning (SFT), improving stability and sample efficiency. Compared with reinforcement learning, KD avoids high-variance reward-based optimization and complex reward design by providing dense, informative per-token targets, which typically yields faster convergence and simpler scaling. Recent empirical and implementation-focused writeups (e.g., [ThinkingMachines' blog on on-policy distillation](https://thinkingmachines.ai/blog/on-policy-distillation/)) also demonstrate that on-policy distillation can deliver high-quality behavior with substantially lower compute and data requirements than many alternative approaches. + +Built on verl’s Ray-based single-controller components, we initially assembled a strictly on-policy KD pipeline where rollout generation, teacher knowledge acquisition, and policy optimization ran in lockstep. 
In practice, this synchronous design proved highly inefficient: the three stages had to wait for one another, creating pipeline bubbles and underutilized GPUs. To address this, we extend the asynchronous schedulers introduced by the One-Step-Off Policy pipeline to overlap these phases. This overlap preserves the same distillation objective while trading some strict on-policy guarantees for substantial gains in end-to-end throughput and hardware utilization. + +## 2. Distillation Overview and Objective + +This recipe centers on on-policy knowledge distillation: the student policy learns from a stronger teacher on samples generated by the current policy (on-policy). For each input prompt, the student (actor) generates responses; the teacher provides top-k token distributions, and the student is trained to match them token-wise. + +Core components: + +1. Teacher signal: top-k log-probabilities and token indices per valid token position. +2. Student objective: sparse, token-level KL divergence between student logits and teacher top-k distribution. + +Objective: encourage student probabilities $Q$ to cover teacher modes $P$ using token-wise $\mathrm{KL}(P\,\|\,Q)$ computed on the teacher's top-k support. + +## 3. Efficient System Design + +### 3.1 Schedulers (One-Step / Two-Step Off-Policy) + +The native (serial) on-policy distillation process is shown in the figure below. + +![Zero-Step-Off Scheduler](https://raw.githubusercontent.com/eric-haibin-lin/verl-community/refs/heads/main/docs/zero-step-off-distill.png) + +This recipe supports optional schedulers that overlap generation, teacher querying, and updates to improve throughput without changing the distillation objective. + +#### 3.1.1 One-Step-Off-Policy + +![One-Step-Off Scheduler](https://raw.githubusercontent.com/eric-haibin-lin/verl-community/refs/heads/main/docs/one-step-off-distill.png) + +- Warm-up: 2 steps. +- Overlap pattern: rollout while actor update; weight sync while teacher retrieving. 
+- Timing keys: `sync_rollout_weights`, `wait_prev_gen`, `wait_prev_teacher`. + +#### 3.1.2 Two-Step-Off-Policy + +![Two-Step-Off Scheduler](https://raw.githubusercontent.com/eric-haibin-lin/verl-community/refs/heads/main/docs/two-step-off-distill.png) + +- Warm-up: 3 steps. +- Overlap pattern: rollout, actor update while teacher retrieving; interleave weight sync. +- Timing keys: `sync_rollout_weights`, `max(wait_prev_gen, wait_prev_prev_teacher)`. + +Tip: Use `two_step_off` when teacher takes much more time than sync; `one_step_off` for simpler overlapping. + +Practical details: + +- Inputs per batch: `teacher_topk_logps`, `teacher_topk_indices`, `attention_mask` (to select valid token positions). +- Loss injection: last pipeline stage computes KL via a logits processor; earlier stages remain unchanged. +- Optional dynamic micro-batching groups sequences by density to reduce padding overhead. + +The pipeline: + +1. Actor parameters are synchronized to a rollout worker group (nccl broadcast) with a little bit latency. +2. Rollout workers (vLLM-backed) generate sequences asynchronously (`async_generate_sequences`). +3. Teacher client service (ZeroMQ based) returns top-k log-probabilities + token indices for each sequence (batched micro-requests), enabling KL-based guidance. +4. Megatron actor performs a KL divergence computation between student logits and teacher top-k distributions (custom TP-aware kernel in `megatron_kl_loss.py`). +5. Scheduling strategies (`one_step_off_scheduler`, `two_step_off_scheduler`) can overlap phases (optional for throughput): + +### 3.2 Weights sync between actor and rollout + +We initially followed the weight synchronization path from the One-Step-Off-Policy recipe (Ray collective broadcast across all actor and rollout ranks, plus Megatron-side allgather of parameter shards). In practice this became the dominant bottleneck, so we made three changes: + +1. 
Batch-and-bulk load on the rollout side: instead of streaming tensors one-by-one (in one-step-off-policy recipe), we stage a bundle of parameter tensors and issue a single batched load into the rollout engine. In our setup this reduced the weight-loading time by roughly 3×. +2. Batch-and-bulk broadcast between the actor and rollout: instead of streaming tensors one-by-one (in one-step-off-policy recipe), we stage a bundle of parameter tensors and issue a single batched broadcast between the actor and rollout workers. +3. Replace allgather with gather-to-root in Megatron: parameter shards are gathered to actor rank 0 (rather than allgathered to everyone), and that root then serves as the single source for broadcasting to rollout ranks. On top of the previous change, 2 and 3 changes delivered an additional ~4× speedup in the synchronization phase. + +## 4. High-Level Data & Control Flow + +``` +Driver (TaskRunner) + ├─ Initialize Ray, tokenizer, datasets, worker groups + ├─ Build ResourcePoolManager (actor vs rollout GPU layouts) + ├─ Trainer.fit() + ├─ init_workers(): build actor + rollout groups, broadcast weight metadata, create nccl collective group + ├─ continuous_iterator(): epochs → batches + ├─ scheduler (see Section 6) + • _async_gen_next_batch(): optional weight sync + non-blocking rollout + • _async_get_teacher_knowledge(): submit teacher requests, store future + ├─ For each step: + • Sync rollout weights + • Retrieve (batch, gen_output, teacher_output) from futures + • Merge gen + teacher outputs → DataProto + • Compute metrics (response length stats, timing, throughput) + • Update actor (forward_backward_batch + KL loss + optimizer step) + • (Optional) save checkpoint +``` + +> Note: Schedulers are optional and explained later; the distillation objective is independent of how phases are overlapped. + +## 5. 
Key Components + +### 5.1 `OnPolicyDistillTrainer` (`ray_trainer.py`) +- Creates `GenerationBatchFuture` objects holding rollout and (later) teacher futures. +- Adds scheduling + teacher integration + modified metric emission (KL, timing, MFU). + +### 5.2 Actor Worker (Megatron) +- `OnPolicyDistillActor.update_policy()` orchestrates micro-batch forward/backward. +- KL Loss injection via `logits_processor` during forward on pipeline last stage. + +### 5.3 Rollout Worker (vLLM / SGLang) +- Pure inference mode (`init_model` builds model; no optimizer). +- `async_generate_sequences` returns a Ray future for overlapping. + +### 5.4 Teacher Service (`teacher/`) +- Proxy + worker architecture (ZMQ REQ/REP) for batched top-k retrieval. +- `TeacherClient.submit()` returns a `Future`; aggregator composes micro-batches. +- Configurable temperature, max tokens, only-response mode. + +### 5.5 KL Loss (`megatron_kl_loss.py`) +- Performs normalization & stable per-token probability construction across TP shards. +- Gradient is (student_probs - teacher_sparse_probs) scaled by upstream grad. + +## 6. Configuration Highlights (`on_policy_distill_trainer.yaml`) + +| Section | Purpose | Notable Keys | +|---------|---------|-------------| +| actor_rollout_ref.teacher | Teacher server | server_ip, server_port, n_server_workers | +| trainer | Global training control | total_epochs, save_freq, scheduler (one_step_off | two_step_off), n_gpus_per_node, nnodes | +| rollout | Resource split for rollout | n_gpus_per_node, nnodes | + +**Remember to set `trainer.n_gpus_per_node`, `trainer.nnodes`, `rollout.n_gpus_per_node` and `rollout.nnodes` to allocate GPU resources.** + +### Dynamic Batch Size + +Enable by: + +``` +actor_rollout_ref.actor.use_dynamic_bsz=True +actor_rollout_ref.actor.max_token_len=6000 # cap post-group token length +``` + +Improves utilization under variable sequence lengths. + +### Resource Guidelines + +- Actor pool: `trainer.nnodes * trainer.n_gpus_per_node` GPUs. 
+- Rollout pool: `rollout.nnodes * rollout.n_gpus_per_node` GPUs. +- Ensure teacher server capacity ≈ `n_server_workers` to avoid stalls (monitor `wait_prev_teacher`). + +## 7. Usage Examples + +### 7.1 Launch Teacher Server + +Before starting the training process, you should have a teacher server to provide logp information. + +We provide a toy teacher server example with vLLM. It needs `telnet` to check the proxy status, and the `python` command to run. So if you have not installed `telnet`, you can just delete this code in `start_server.sh`. Some OSes use `python3` rather than `python`, so you may also need to modify it. You can also change the teacher's port if you encounter a port conflict. + +There are 3 arguments that can be set for the vllm backend — `--tp-size`, `--n-logprobs` and `--ckpt-path` — in `start_server.sh` / `worker.py`. You should set them before you start the server. + +We also provide a toy multi-node teacher server. You can start the main node using `start_server.sh` and start the slave nodes using `join_server.sh`. Still remember to set the args in `join_server.sh`, especially the `$PROXY_IP` and `$PROXY_BACKEND_PORT` of the main node. + +When training, the student will automatically use the teacher's topk (n-logprobs) to set its own topk argument at line 83 of `recipe/gkd/megatron_kl_loss.py`, so you don't need to set the student's topk argument. 
+ +```bash +cd recipe/gkd/teacher +bash start_server.sh +# Exports ports and launches proxy + worker (default vLLM backend) +``` + +Verify with: + +```bash +telnet localhost 15555 +``` + +### 7.2 Minimal Local (Megatron + vLLM) Run + +```bash +python3 -m recipe.gkd.main_gkd \ + --config-path=recipe/gkd/config \ + --config-name=on_policy_distill_trainer \ + actor_rollout_ref.model.path=/path/to/MODEL \ + data.train_files=/path/to/train.parquet \ + trainer.total_epochs=2 \ + trainer.n_gpus_per_node=4 rollout.n_gpus_per_node=2 \ + actor_rollout_ref.teacher.server_ip=127.0.0.1 \ + actor_rollout_ref.teacher.server_port=15555 \ + trainer.scheduler=one_step_off +``` + +(Requires a running teacher server). + +### 7.3 Ray Job Submission (Distilled 16B Example) + +See `run_moonlight_dsv3_training.sh` for a full script including: + +- Dist ckpt path setup (`dist_checkpointing_path`) +- Expert parallel sizing (EP / ETP) +- Dynamic batch sizing +- Two-step-off scheduling for deeper overlap. + +Submit (after adjusting paths): + +```bash +bash recipe/gkd/run_moonlight_dsv3_training.sh +``` + +## 8. Metrics & Monitoring + +Emitted metrics include (prefixes may vary): + +- Timing: `timing/wait_prev_gen`, `timing/sync_rollout_weights`, `timing/get_teacher_knowledge`, `timing/update_actor`. +- Sequence stats: `response_seq_len/*` (avg, max, min, counts). +- Performance: `perf/mfu/actor`, `perf/max_memory_allocated_gb`, `perf/cpu_memory_used_gb`. +- Distillation: `actor/kl_loss`, `actor/grad_norm`, `actor/lr`. + +Interpretation Tips: + +- High `wait_prev_teacher` → scale `n_server_workers` and allocate more teacher GPUs or reduce per-request batch size, or just use `two_step_off`. +- High `wait_prev_gen` with uniform lengths → allocate more rollout GPUs. +- High `sync_rollout_weights` → check NCCL env / network congestion and try to modify `actor_rollout_ref.rollout.update_weights_bucket_megabytes`. + +## 9. 
Extensibility Notes + +- Add new schedulers by following the interface returning `(epoch, batch, gen_output, teacher_output, timing_dict)`. +- Integrate different distillation signals (e.g., hidden states, intermediate reasoning tokens) by extending `teacher_utils.get_teacher_knowledge` and modifying `logits_processor`. + +## 10. Functional Support Summary + +| Category | Supported | +|----------|-----------| +| Train engine | Megatron | +| Rollout engine | vLLM | +| Distillation signal | Teacher top-k logprobs & indices | +| Scheduling | one_step_off, two_step_off | + +## 11. Quick Checklist Before Running + +- Teacher server reachable (`telnet <server_ip> <server_port>`). +- `actor_rollout_ref.model.path` contains the correct Megatron/HF config artifacts. +- `train_files` points to a parquet dataset compatible with this recipe's dataset loader. +- NCCL environment vars set (see `config/runtime_env.yaml`). + +--- +Feel free to open issues or PRs to extend scheduler variants, add new distillation objectives, broaden engine support, or make other improvements. diff --git a/code/RL_model/verl/verl_train/docs/advance/attention_implementation.rst b/code/RL_model/verl/verl_train/docs/advance/attention_implementation.rst new file mode 100644 index 0000000000000000000000000000000000000000..c068bd92115d38a86b4ba9414ae4c5e5a18a2218 --- /dev/null +++ b/code/RL_model/verl/verl_train/docs/advance/attention_implementation.rst @@ -0,0 +1,119 @@ +.. _attention-implementation-override: + +Attention Implementation Override +================================== + +Last updated: 10/31/2025. + +By default, VERL's FSDP workers use ``flash_attention_2`` as the attention implementation for improved performance. +However, you can now override this setting to use different attention implementations based on your needs. 
+ +Supported Attention Implementations +----------------------------------- + +The following attention implementations are supported (subject to model and hardware compatibility): + +- ``flash_attention_2``: High-performance attention implementation (default) +- ``eager``: Standard PyTorch attention implementation +- ``sdpa``: Scaled Dot-Product Attention (PyTorch native) + +When to Override +---------------- + +You might want to override the attention implementation in the following scenarios: + +- **Debugging**: Use ``eager`` for easier debugging and better error messages +- **Compatibility**: Some models or hardware configurations may not support ``flash_attention_2`` +- **Memory constraints**: Different implementations have different memory characteristics +- **Performance tuning**: Testing different implementations for optimal performance + +Configuration Examples +----------------------- + +PPO Training with Eager Attention +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +To override the attention implementation for the actor, rollout, and reference models: + +.. code:: bash + + python3 ppo_trainer.py \ + +actor_rollout_ref.model.override_config.attn_implementation=eager \ + [other parameters...] + +PPO Training with SDPA Attention +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code:: bash + + python3 ppo_trainer.py \ + +actor_rollout_ref.model.override_config.attn_implementation=sdpa \ + [other parameters...] + +Critic Model Override +~~~~~~~~~~~~~~~~~~~~~ + +For training configurations that include a critic model, you can also override its attention implementation: + +.. code:: bash + + python3 ppo_trainer.py \ + +actor_rollout_ref.model.override_config.attn_implementation=eager \ + +critic.model.override_config.attn_implementation=eager \ + [other parameters...] + +YAML Configuration +~~~~~~~~~~~~~~~~~~ + +You can also specify the attention implementation in your YAML configuration file: + +.. 
code:: yaml + + actor_rollout_ref: + model: + override_config: + attn_implementation: eager + # other overrides... + + critic: # if using a critic model + model: + override_config: + attn_implementation: eager + # other overrides... + +Important Notes +--------------- + +**Backward Compatibility**: If you don't specify ``attn_implementation`` in the override config, +VERL will continue to use ``flash_attention_2`` by default, ensuring backward compatibility with existing configurations. + +**Model Support**: Not all models support all attention implementations. Ensure your model is compatible +with the chosen attention implementation before training. + +**Performance Impact**: Different attention implementations have varying performance characteristics. +``flash_attention_2`` typically offers the best performance, while ``eager`` provides better debugging capabilities. + +**Hardware Dependencies**: Some attention implementations (like ``flash_attention_2``) may require +specific hardware or CUDA versions. If you encounter compatibility issues, try using ``eager`` or ``sdpa``. + +Troubleshooting +--------------- + +If you encounter errors when using a specific attention implementation: + +1. **Check model compatibility**: Verify that your model supports the chosen attention implementation +2. **Try eager attention**: Use ``attn_implementation=eager`` as a fallback for debugging +3. **Check hardware requirements**: Ensure your hardware supports the attention implementation +4. **Review error messages**: Attention implementation errors often provide clear guidance on supported options + +Example Error Resolution +~~~~~~~~~~~~~~~~~~~~~~~~ + +If you see an error like "flash_attention_2 is not supported", you can resolve it by switching to eager attention: + +.. 
code:: bash + + # Instead of the default flash_attention_2 + python3 ppo_trainer.py +actor_rollout_ref.model.override_config.attn_implementation=eager + +This override ensures your training can proceed while you investigate the flash attention compatibility issue. diff --git a/code/RL_model/verl/verl_train/docs/advance/checkpoint.rst b/code/RL_model/verl/verl_train/docs/advance/checkpoint.rst new file mode 100644 index 0000000000000000000000000000000000000000..9782af951d9cf626cae6b603666d3adc3114dfdc --- /dev/null +++ b/code/RL_model/verl/verl_train/docs/advance/checkpoint.rst @@ -0,0 +1,159 @@ +.. _checkpoint-page: + +Using Checkpoints to Support Fault Tolerance Training +===================================================== + +Last updated: 06/25/2025. + +There could be training errors or machine failures during the whole RLHF training process, +so it is recommended to enable checkpoints to minimize your loss. + +The API interface has already been listed in :ref:`config-explain-page`, +and we will not repeat it here. But there are still some technical details +we hope to clarify. + +.. note:: + + Notice that the ``checkpoint.contents`` field has no effect on the FSDP checkpoint except for ``hf_model``; + the other 3 fields are bound together for saving and loading. We recommend including all of ``model``, ``optimizer`` and ``extra``. + +Checkpoint Saving Directory Structure +------------------------------------- + +Commonly, we use the ``default_local_dir`` declared in ``ppo_trainer.yaml`` or ``ppo_megatron_trainer.yml`` +to work as the prefix when saving checkpoints, which is ``checkpoints/${trainer.project_name}/${trainer.experiment_name}``. + +So the inner checkpoint structure of **FSDP** is like: + +.. 
code:: + + checkpoints/${trainer.project_name}/${trainer.experiment_name} + ├── global_steps_${i} + │ ├── actor + │ │ ├── huggingface # default save config and tokenizer, save huggingface model if include ``hf_model`` in checkpoint.contents + │ │ └── fsdp_config.json # FSDP config file, including world_size and fsdp version + │ │ ├── model_world_size_{self.world_size}_rank_{self.rank}.pt + │ │ ├── optim_world_size_{self.world_size}_rank_{self.rank}.pt + │ │ └── extra_state_world_size_{self.world_size}_rank_{self.rank}.pt + │ ├── critic + │ │ ├── huggingface + │ │ └── fsdp_config.json + │ │ ├── model_world_size_{self.world_size}_rank_{self.rank}.pt + │ │ ├── optim_world_size_{self.world_size}_rank_{self.rank}.pt + │ │ └── extra_state_world_size_{self.world_size}_rank_{self.rank}.pt + └── latest_checkpointed_iteration.txt + +All model shards, optimizers and extra states are stored together, in a sharded and distributed way. + +While the current **Megatron** checkpoint structure is: + +.. code:: + + checkpoints/${trainer.project_name}/${trainer.experiment_name} + ├── global_steps_${i} + │ ├── actor + │ │ ├── huggingface # default save config and tokenizer, save huggingface model if include ``hf_model`` in checkpoint.contents + │ │ └── dist_ckpt # save sharded model/optimizer/rng_states, naming the same as Megatron + │ └── critic + │ │ ├── huggingface + │ │ └── dist_ckpt + └── latest_checkpointed_iteration.txt + +Convert FSDP and Megatron Checkpoints to HuggingFace Format Model +----------------------------------------------------------------- + +We provide a tool to convert the FSDP and Megatron checkpoints to HuggingFace format model. +The tool is located in ``verl/model_merger``. For older versions of verl that don't include fsdp_config.json in checkpoints, you can use the legacy model merger located at ``verl/scripts/legacy_model_merger.py``. 
+ +The script supports two main sub-commands: `merge` (to convert and save checkpoints) and `test` (to validate merged checkpoints against a reference model). +The arguments for the `merge` sub-command are as follows: + +.. code:: bash + + usage: python -m verl.model_merger merge [-h] --backend {fsdp,megatron} [--local_dir LOCAL_DIR] [--tie-word-embedding] [--is-value-model] [--use_cpu_initialization] [--target_dir TARGET_DIR] + [--hf_upload_path HF_UPLOAD_PATH] [--private] + + options: + -h, --help show this help message and exit + --backend {fsdp,megatron} + The backend of the model + --local_dir LOCAL_DIR + Path to the saved model checkpoints + --tie-word-embedding Whether to tie word embedding weights (currently only Megatron supported) + --is-value-model Whether the model is a value model (currently only Megatron supported) + --use_cpu_initialization + Whether to use CPU initialization for the model. This is useful for large models that cannot fit into GPU memory during initialization. + --target_dir TARGET_DIR + Directory to save the merged huggingface model + --hf_upload_path HF_UPLOAD_PATH + Hugging Face repository ID to upload the model + --private Whether to upload the model to a private Hugging Face repository + +Example usage for merging Megatron checkpoints: + +.. code:: bash + + python -m verl.model_merger merge \ + --backend megatron \ + --tie-word-embedding \ + --local_dir checkpoints/verl_megatron_gsm8k_examples/qwen2_5_0b5_megatron_saveload/global_step_1/actor \ + --target_dir /path/to/merged_hf_model + +Example usage for distributed merging Megatron checkpoints: + +.. code:: bash + + torchrun --nproc_per_node 1 --nnodes 8 --node_rank ${RANK} -m verl.model_merger merge \ + --backend megatron \ + --tie-word-embedding \ + --local_dir checkpoints/verl_megatron_gsm8k_examples/qwen2_5_0b5_megatron_saveload/global_step_1/actor \ + --target_dir /path/to/merged_hf_model + +Example usage for merging FSDP checkpoints: + +.. 
code:: bash + + python -m verl.model_merger merge \ + --backend fsdp \ + --local_dir checkpoints/verl_fsdp_gsm8k_examples/qwen2_5_0b5_fsdp_saveload/global_step_1/actor \ + --target_dir /path/to/merged_hf_model + + +Megatron Merger details +----------------------- + +The current implementation of decoder layers uses ``nn.ModuleList`` to store the layers, +and thus the model layers on every PP rank and VPP rank start their indices from 0. + +There are 3 ways to correct this behavior: + +1. Modify the decoder layer's state_dict, adding ``offset`` to each layer's index, thus rewriting the ``nn.ModuleList`` implementation. +2. Modify the layer indices when saving a checkpoint and recover them when loading a checkpoint. +3. Let the checkpoint merger do this work, calculating the actual ``offset`` from the ``state_dict`` only; this is a little complex. + +The current implementation uses solution 2. + + +HuggingFace to Megatron DistCheckpoint details +---------------------------------------------- + +Through ``mbridge``, we can directly save the mcore model to huggingface format during training. +No need to convert the model to Megatron dist-checkpoint format. + +Original Checkpoint Utils +------------------------- + +Original Checkpoint Utils refer to the original checkpoint implementation in ``verl/models/[model]/megatron/checkpoint_utils``. + +We only need ``[model]_loader.py`` in original checkpoint utils now, since we get rid of storing ``hf_model`` every time (which is not recommended for large model training; try only saving sharded models if you can). + +.. note:: + + Note that ``[model]_loader`` only supports environments where **storage clusters are able to connect with every computation node**. + Because it utilizes a **sharded load to minimize the checkpoint loading overhead**. + Every rank loads its own data from the ``state_dict`` which can be accessed by all of them. + There is also no need to broadcast among DP ranks, since the saved state_dict is only produced by DP rank 0. 
+ + For users who can **only place the huggingface model on one device**, we keep the original costly implementation in ``[model]_loader_deprecated``. In this implementation, rank 0 broadcasts all weights to each tp and pp rank, and then dp rank 0 broadcasts to all dp ranks. There may be a risk of OOM. + + To use the deprecated loader, change the import package of ``load_state_dict_to_megatron_llama``. diff --git a/code/RL_model/verl/verl_train/docs/advance/dpo_extension.rst b/code/RL_model/verl/verl_train/docs/advance/dpo_extension.rst new file mode 100644 index 0000000000000000000000000000000000000000..ee9ac619dde1ebfe3390d0b409b92252cb4e4104 --- /dev/null +++ b/code/RL_model/verl/verl_train/docs/advance/dpo_extension.rst @@ -0,0 +1,273 @@ +Extend to other RL(HF) algorithms +================================= + +Last updated: 02/25/2025. + +We already implemented the complete training pipeline of the PPO +algorithm. To extend to other algorithms, we analyze the high-level +principle to use verl and provide a tutorial to implement the DPO +algorithm. Users can follow a similar paradigm to extend to other RL algorithms. + +.. note:: **Key ideas**: Single process drives multi-process computation and data communication. + +Overall Approach +---------------- + +Step 1: Consider what multi-machine multi-GPU computations are needed +for each model, such as ``generate_sequence`` , ``compute_log_prob`` and +``update_policy`` in the actor_rollout model. Implement distributed +single-process-multiple-data (SPMD) computation and encapsulate them +into APIs. + +Step 2: Based on different distributed scenarios, including FSDP and 3D +parallelism in Megatron-LM, implement single-process control of data +interaction among multi-process computations. + +Step 3: Utilize the encapsulated APIs to implement the control flow + +Example: Online DPO +------------------- + +We use verl to implement a simple online DPO algorithm. The algorithm +flow of Online DPO is as follows: + +1. 
There is a prompt (rollout) generator which has the same weight as + the actor model. After a batch of prompts are fed into the generator, + it generates N responses for each prompt. +2. Send all the prompts + responses to a verifier for scoring, which can + be reward model or a rule-based function. Then sort them in pairs to + form a training batch. +3. Use this training batch to train the actor model using DPO. During + the process, a reference policy is needed. + +Step 1: What are the multi-machine multi-GPU computations +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +**Sample Generator** + +Implementation details: + +.. code:: python + + from verl.single_controller.base import Worker + from verl.single_controller.ray import RayWorkerGroup, RayClassWithInitArgs, RayResourcePool + import ray + + @ray.remote + class SampleGenerator(Worker): + def __init__(self, config): + super().__init__() + self.config = config + + def generate_sequences(self, data): + pass + +Here, ``SampleGenerator`` can be viewed as a multi-process pulled up by +``torchrun``, with each process running the same code (SPMD). +``SampleGenerator`` needs to implement a ``generate_sequences`` API for +the control flow to call. The implementation details inside can use any +inference engine including vllm, sglang and huggingface. Users can +largely reuse the code in +verl/verl/workers/rollout/vllm_rollout/vllm_rollout.py and we won't +go into details here. + +**ReferencePolicy inference** + +API: compute reference log probability + +.. code:: python + + from verl.single_controller.base import Worker + import ray + + @ray.remote + class ReferencePolicy(Worker): + def __init__(self): + super().__init__() + self.model = Model() + + def infer(self, data): + return self.model(data) + +**Actor update** + +API: Update actor model parameters + +.. 
code:: python + + from verl.single_controller.base import Worker + import ray + + @ray.remote + class DPOActor(Worker): + def __init__(self): + super().__init__() + self.model = Model() + self.model = FSDP(self.model) # or other distributed strategy + self.optimizer = optim.Adam(self.model.parameters(), lr=1e-3) + self.loss_fn = xxx + + def update(self, data): + self.optimizer.zero_grad() + logits = self.model(data) + loss = self.loss_fn(logits) + loss.backward() + self.optimizer.step() + +**Notes: How to distinguish between control processes and distributed computation processes** +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +- Control processes are generally functions directly decorated with + ``@ray.remote`` +- Computation processes are all wrapped into a ``RayWorkerGroup``. + +Users can reuse most of the distribtued computation logics implemented +in PPO algorithm, including FSDP and Megatron-LM backend in +verl/verl/trainer/ppo. + +Step 2: Based on different distributed scenarios, implement single-process control of multi-process data interaction +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +**The core problem to solve here is how a single process sends data to +multiple processes, drives multi-process computation, and how the +control process obtains the results of multi-process computation.** +First, we initialize the multi-process ``WorkerGroup`` in the control +process. + +.. code:: python + + @ray.remote(num_cpus=1) + def main_task(config): + # construct SampleGenerator + resource_pool = RayResourcePool(process_on_nodes=[8] * 2) # 16 GPUs + ray_cls = RayClassWithInitArgs(SampleGenerator, config=config) + # put SampleGenerator onto resource pool + worker_group = RayWorkerGroup(resource_pool, ray_cls) + + # construct reference policy + +As we can see, in the control process, multiple processes are wrapped +into a ``RayWorkerGroup``. 
Inside this ``WorkerGroup``, there is a +``self._workers`` member, where each worker is a RayActor +(https://docs.ray.io/en/latest/ray-core/actors.html) of SampleGenerator. +ray_trainer.md also provide an implementation of +``MegatronRayWorkerGroup``. + +Assuming the model is distributed using FSDP, and there is a batch of +data on the control process, for data parallelism, the underlying +calling process is: + +.. code:: python + + data = xxx + data_list = data.chunk(dp_size) + + output = [] + for d in data_list: + # worker_group._workers[i] is a SampleGenerator + output.append(worker_group._workers[i].generate_sequences.remote(d)) + + output = ray.get(output) + output = torch.cat(output) + +Single process calling multiple processes involves the following 3 +steps: + +1. Split the data into DP parts on the control process. +2. Send the data to remote, call the remote computation through RPC, and + utilize multi-process computation. +3. Obtain the computation results of each worker on the control process + and merge them. + +Frequently calling these 3 steps on the controller process greatly hurts +code readability. **In verl, we have abstracted and encapsulated these 3 +steps, so that the worker's method + dispatch + collect can be +registered into the worker_group** + +.. code:: python + + from verl.single_controller.base.decorator import register + + def dispatch_data(worker_group, data): + return data.chunk(worker_group.world_size) + + def collect_data(worker_group, data): + return torch.cat(data) + + dispatch_mode = { + 'dispatch_fn': dispatch_data, + 'collect_fn': collect_data + } + + @register(dispatch_mode=dispatch_mode) + def generate_sequences(self, data): + pass + +In this way, we can directly call the method inside the worker through +the ``worker_group`` on the control (driver) process (which is a single +process): + +.. 
code:: python + + output = worker_group.generate_sequences(data) + +This single line includes data splitting, data distribution and +computation, and data collection. + +Furthermore, the model parallelism size of each model is usually fixed, +including dp, tp, pp. So for these common distributed scenarios, we have +pre-implemented specific dispatch and collect methods,in `decorator.py `_, which can be directly used to wrap the computations. + +.. code:: python + + from verl.single_controller.base.decorator import register, Dispatch + + @register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO) + def generate_sequences(self, data: DataProto) -> DataProto: + pass + +Here it requires the data interface to be ``DataProto``. Definition of +``DataProto`` is in `protocol.py `_. + +Step 3: Main training loop +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +With the above training flows, we can implement the algorithm's control +flow. It is recommended that ``main_task`` is also a ray remote process. + +.. code:: python + + @ray.remote(num_cpus=1) + def main_task(config): + # construct SampleGenerator + resource_pool = RayResourcePool(process_on_nodes=[8] * 2) # 16 GPUs + ray_cls = RayClassWithInitArgs(SampleGenerator, config=config) + # put SampleGenerator onto resource pool + sample_gen = RayWorkerGroup(resource_pool, ray_cls) + + # construct reference policy + ray_cls = RayClassWithInitArgs(ReferencePolicy) + ref_policy = RayWorkerGroup(resource_pool, ray_cls) + + # construct actor + ray_cls = RayClassWithInitArgs(DPOActor) + dpo_policy = RayWorkerGroup(resource_pool, ray_cls) + + dataloader = DataLoader() + + for data in dataloader: + # generate data + data = sample_gen.generate_sequences(data) + # generate scores for each data + data = generate_scores(data) + # generate pairwise data using scores + data = generate_pairwise_data(data) + # generate ref_log_prob + data.batch['ref_log_prob'] = ref_policy.infer(data) + # update using dpo + dpo_policy.update(data) + # logging + +Here, different 
``WorkerGroups`` can be placed in the same resource pool or +in different resource pools using ``create_colocated_worker_cls`` +similar as in `ray_trainer.py `_. diff --git a/code/RL_model/verl/verl_train/docs/advance/fp8.md b/code/RL_model/verl/verl_train/docs/advance/fp8.md new file mode 100644 index 0000000000000000000000000000000000000000..0006392d7cd8ae3303527868900fb3254a9f1740 --- /dev/null +++ b/code/RL_model/verl/verl_train/docs/advance/fp8.md @@ -0,0 +1,107 @@ +# FP8 rollout for verl + +Last updated: 12/4/2025 + +This document introduces FP8 rollout in verl. + + +We monkey patch several vLLM functions to enable FP8 rollout for reinforcement learning: + +1. **Quantize weights**: Quantize model weights on-the-fly from higher-precision formats to FP8. +2. **Process weights after loading**: For vLLM, we replace the `vllm.model_executor.layers.quantization.fp8.Fp8LinearMethod.process_weights_after_loading` function to handle weight processing after quantization. For SGLang, this patch is not needed as it natively supports loading quantized weights. + + +## Support Matrix +- FP8 blockwise quantization for rollout + - Used in Deepseek, +which is 1x128 quantization for activations and 128x128 quantization for model weights +- Dense models and MoE models +- Async rollout interfaces +- vLLM 0.10.x & vLLM 0.11 & SGlang 0.5.5 +- FSDP and Megatron training backends + +## Experiments and Outcomes +### Qwen3-8B-Base Dense Model + +**Configuration** +- DAPO recipe. AIME24 online validation. +- vLLM(FP8 spmd rollout) + FSDP + - Note that SPMD rollout has been deprecated, so we removed the FP8 SPMD rollout. +- Prompt batch size 32, n=16. 
+- Rollout batch size: 32\*3*16 +- Train_batch_size & ppo_mini_batch_size 32 +- Max response length 20K +- Token-level TIS, C=2 +- 8*H100 +- vLLM 0.10.0+CUDA 12.6 vs vLLM 0.11.0+CUDA 12.9 + +**Accuracy** +![Qwen3-8b-base_fp8_acc]( +https://github.com/Agoniii/verl/blob/xueh/fp8_pr_images/docs/advance/images/Qwen3-8b-base_fp8_acc.png?raw=true) +*dark green: BF16, orange: FP8 rollout + token-level TIS, light green: FP8 rollout without TIS* + +Results and observations: +- With TIS, FP8 rollout aligns with BF16 +- Obvious accuracy drop when TIS is not enabled +- Higher mismatch kl but within acceptable range throughout the training + + +**Performance** + +![Qwen3-8b-base_fp8_rollout_perf]( +https://github.com/Agoniii/verl/blob/xueh/fp8_pr_images/docs/advance/images/Qwen3-8b-base_fp8_rollout_perf.png?raw=true) +*green: BF16, orange: FP8 rollout + CUDA12.6 + DeepGemm, purple: FP8 rollout + CUDA 12.9 + DeepGemm* + +Results and observations: +- FP8 rollout leads to around ~12% rollout speedup with CUDA 12.6 + DeepGemm +- When upgrading to CUDA 12.9, speedup can be up to ~18% + +### Qwen3-30B-A3B-Base MoE Model + +**Configuration** +- DAPO recipe. AIME24 online validation. 
+- FP8 async rollout, vLLM+FSDP +- Prompt batch size 32 +- Rollout batch size: 32\*3*16 +- Train_batch_size & ppo_mini_batch_size 32 +- Max response length 20K +- Token-level TIS, C=2 +- 2\*8*H100 +- vLLM 0.10.0+CUDA 12.6 + +Please refer to `recipe/dapo/run_dapo_qwen3_moe_30b_vllm_fp8_rollout.sh` + +**Accuracy** +![Qwen3-30b-a3b_fp8_acc]( +https://github.com/Agoniii/verl/blob/xueh/fp8_pr_images/docs/advance/images/Qwen3-30b-a3b_fp8_acc.png?raw=true) +*grey: BF16 + token-level TIS, red: FP8 rollout + token-level TIS* + +Results and observations: +- Rollout & training distribution mismatch is in general higher for MoE +- Rollout correction required even for BF16 +- FP8 rollout with token-level TIS aligns with BF16 + + +**Performance** + +![Qwen3-30b-a3b_fp8_perf]( +https://github.com/Agoniii/verl/blob/xueh/fp8_pr_images/docs/advance/images/Qwen3-30b-a3b_fp8_perf.png?raw=true) +*grey: BF16 + token-level TIS, red: FP8 rollout + token-level TIS​* + +Results and observations: +- FP8 rollout : over 35% rollout speedup +- Expecting more perf gain with CUDA 12.9 + +## Usage + +FP8 can be enabled in the config file `verl/trainer/config/ppo_megatron_trainer.yaml`: + +``` + rollout: + quantization: "fp8" +``` + +Or it can be enabled by command line: +- `actor_rollout_ref.rollout.quantization=fp8` + +Please refer to `recipe/dapo/run_dapo_qwen3_moe_30b_vllm_fp8_rollout.sh` diff --git a/code/RL_model/verl/verl_train/docs/advance/fsdp_extension.rst b/code/RL_model/verl/verl_train/docs/advance/fsdp_extension.rst new file mode 100644 index 0000000000000000000000000000000000000000..181e109082262f26334034337c5915d522049759 --- /dev/null +++ b/code/RL_model/verl/verl_train/docs/advance/fsdp_extension.rst @@ -0,0 +1,97 @@ + +Add models with the FSDP backend +================================== + +Last updated: 02/09/2025. 
+ +Model +-------------------------- + +In principle, our FSDP backend can support any HF model and we can +synchronize the actor model weights with vLLM using `hf_weight_loader.py` under `third_party/vllm`. +However, ``hf_weight_loader`` will gather the full state_dict of a +model during synchronization, which may cause OOM. We suggest using +``dtensor_weight_loader``, which gathers the full model parameters layer by +layer to reduce the peak memory usage. We already support dtensor weight +loader for the models below in `dtensor_weight_loader.py` under `third_party/vllm`: + +- ``GPT2LMHeadModel`` +- ``LlamaForCausalLM`` +- ``LLaMAForCausalLM`` +- ``MistralForCausalLM`` +- ``InternLMForCausalLM`` +- ``AquilaModel`` +- ``AquilaForCausalLM`` +- ``Phi3ForCausalLM`` +- ``GemmaForCausalLM`` +- ``Gemma2ForCausalLM`` +- ``GPTBigCodeForCausalLM`` +- ``Starcoder2ForCausalLM`` +- ``Qwen2ForCausalLM`` +- ``DeepseekV2ForCausalLM`` + +To implement ``dtensor_weight_loader`` of a model that's supported in +vLLM, follow the guide for the gemma model below: + +1. Copy the + ``load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]])`` from the vllm model class + to ``dtensor_weight_loaders.py`` +2. Modify the arguments to + ``(actor_weights: Dict, vllm_model: nn.Module)`` +3. Replace ``self`` with ``vllm_model`` +4. Add the + ``local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)`` + before each ``param = params_dict[name]`` and modify the following + weight loading using ``local_loaded_weight``. +5. Register the implemented dtensor weight loader to ``__MODEL_DTENSOR_WEIGHT_LOADER_REGISTRY__``. + +.. 
code-block:: diff + + - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + + def gemma_dtensor_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module: + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ] + - params_dict = dict(self.named_parameters()) + + params_dict = dict(vllm_model.named_parameters()) + loaded_params = set() + - for name, loaded_weight in weights: + + for name, loaded_weight in actor_weights.items(): + for (param_name, shard_name, shard_id) in stacked_params_mapping: + if shard_name not in name: + continue + name = name.replace(shard_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + + local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight) + param = params_dict[name] + weight_loader = param.weight_loader + - weight_loader(param, loaded_weight, shard_id) + + weight_loader(param, local_loaded_weight.to(dtype=param.dtype), shard_id) + break + else: + # lm_head is not used in vllm as it is tied with embed_token. + # To prevent errors, skip loading lm_head.weight. + if "lm_head.weight" in name: + continue + # Skip loading extra bias for GPTQ models. 
+ if name.endswith(".bias") and name not in params_dict: + continue + + local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight) + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", + default_weight_loader) + - weight_loader(param, loaded_weight) + + weight_loader(param, local_loaded_weight.to(dtype=param.dtype)) + loaded_params.add(name) + unloaded_params = params_dict.keys() - loaded_params + if unloaded_params: + raise RuntimeError( + "Some weights are not initialized from checkpoints: " + f"{unloaded_params}") \ No newline at end of file diff --git a/code/RL_model/verl/verl_train/docs/advance/fully_async.md b/code/RL_model/verl/verl_train/docs/advance/fully_async.md new file mode 100644 index 0000000000000000000000000000000000000000..0c03bac6e86eac1f98337ed798b22311dc16c2d8 --- /dev/null +++ b/code/RL_model/verl/verl_train/docs/advance/fully_async.md @@ -0,0 +1,595 @@ +# Recipe: Fully Async Policy Trainer + +**Author:** `https://github.com/meituan-search` + +Last updated: 12/25/2025. + +This document introduces a fully asynchronous PPO training system that completely decouples the Trainer and Rollouter, +supporting asynchronous sample generation and training. +Under this system, we achieved a 2.35x-2.67x performance improvement when training the Qwen2.5-7B model with 128 GPUs, +without significantly affecting the results. + +## Introduction + +### Background + +The separated rollout and train architecture, compared to the colocate architecture, can allocate resources more +flexibly and design more flexible training logic, thereby addressing issues such as low GPU utilization and training +efficiency caused by long-tail problems. +The one_step_off_policy alleviates the problem of long rollout times and achieves some gains in training efficiency by +designing a separated architecture and performing asynchronous training between rollout and train for one round. 
+However, it forcibly uses data from one round of asynchronous training, which is not flexible enough and cannot +completely eliminate the impact of long-tail on training efficiency. +In other frameworks such as AReaL, Magistral, StreamRL, and AsyncFlow, asynchronous training and streaming training have +been implemented based on the separated architecture and have achieved gains. +We borrow from their methods and implemented them in VERL. The fully_async_policy supports asynchronous, streaming, and +partial +rollout training. +By reasonably setting parameters such as resource allocation and parameter synchronization frequency, fully_async_policy +can significantly improve training efficiency. + +> Magistral https://arxiv.org/abs/2506.10910 +> +> AReaL: A Large-Scale Asynchronous Reinforcement Learning System for Language +> Reasoning https://arxiv.org/abs/2505.24298 +> +> StreamRL: Scalable, Heterogeneous, and Elastic RL for LLMs with Disaggregated Stream +> Generation https://arxiv.org/abs/2504.15930 +> +> AsyncFlow: An Asynchronous Streaming RL Framework for Efficient LLM Post-Training https://arxiv.org/abs/2507.01663 + +### Core Contributions + +- **Resource Isolation**: Unlike using hybrid_engine, Rollouter and Trainer use separate computing resources and need to + specify the resources they occupy separately. +- **Parallel Generation and Training**: While the Trainer is training, the Rollouter is generating new samples. +- **Multi-step Asynchronous**: Compared to one step off policy, it supports asynchronous settings from 0.x steps to + multiple steps, making the asynchronous solution more flexible. +- **NCCL Parameter Synchronization**: Based on the nccl communication primitive, refer to [checkpoint-engine](https://github.com/MoonshotAI/checkpoint-engine) to + achieve efficient parameter synchronization between Rollouter and Trainer. 
+- **Stream Inference and Training**: Rollouter generates data sample by sample, and data transmission uses a single + sample as the minimum transmission unit. +- **Asynchronous Training and Freshness Control**: By setting the parameter async_training.staleness_threshold, it + supports training with samples generated by old parameters. +- **PartialRollout**: The Rollouter's inference process supports partial rollout logic. During parameter + synchronization, by adding `sleep() and resume()` logic, it + saves samples from ongoing rollouts and continues using them in the next rollout, reducing the time spent waiting for + ongoing tasks to finish during parameter synchronization. + +Currently, the supported usage mode is Megatron/FSDP+vLLM/SGLang. vLLM/SGLang must use the server mode based on AgentLoop. + +## Design + +The overall architecture of fully_async_policy is shown in the figure below. fully_async_policy mainly consists of four +parts: Rollouter, MessageQueue, Trainer, and ParameterSynchronizer. + +![fully_async_policy_structure](https://github.com/ArronHZG/verl-community/blob/main/docs/fully_async_policy_structure.svg?raw=true) + +1. Rollouter generates sequences sample by sample and puts the generated samples into the MessageQueue, with the + production speed controlled by freshness. +2. MessageQueue is used to temporarily store samples generated by Rollouter. +3. Trainer fetches samples from MessageQueue sample by sample. After fetching `require_batches*ppo_mini_batch_size` + samples, it will perform training. After training for async_training.trigger_parameter_sync_step rounds, it triggers + a parameter synchronization with Rollouter. +4. ParameterSynchronizer implements the NCCL synchronous parameter synchronization capability. + +The source of benefits compared to the base scheme lies in the fact that in the colocate case, using more resources for +rollout cannot solve the idleness caused by long-tail samples. 
+After we perform resource isolation, the time for rollout and train may be longer than before (because fewer resources +are used), +but the overlap in their time consumption reduces the end-to-end time consumption. + +![fully_async_policy_revenue](https://github.com/ArronHZG/verl-community/blob/main/docs/fully_async_policy_revenue.svg?raw=true) + +## Usage + +### Parameter Description + +| super params | implication | +| ---------------------------------------------------------------- | ---------------------------------------------------------------------------------------------- | +| `trainer.nnodes` | Number of nodes for Trainer | +| `trainer.n_gpus_per_node` | Number of GPUs per node for Trainer | +| `rollout.nnodes` | Number of nodes for Rollouter | +| `rollout.n_gpus_per_node` | Number of GPUs per node for Rollouter | +| `data.train_batch_size` | In the fully async strategy, this value is not effective (default is 0) | +| `data.gen_batch_size` | In the fully async strategy, uses streaming sample production logic (default is 1) | +| `rollout.total_rollout_steps` | Total number of rollout samples | +| `rollout.test_freq` | How many times Rollouter updates parameters before performing a validation | +| `actor_rollout_ref.actor.ppo_mini_batch_size` | The ppo_mini_batch_size is a global num across all workers/gpus | +| `async_training.require_batches` | Number of ppo_mini_batch_size that FullyAsyncTrainer fetches at once | +| `async_training.trigger_parameter_sync_step` | Indicates how many local updates FullyAsyncTrainer performs before a parameter synchronization | +| `async_training.staleness_threshold` | Freshness control | +| `async_training.partial_rollout` | Whether to perform partial_rollout | +| `async_training.use_rollout_log_probs` | Use log_probs generated by rollout | +| `async_training.compute_prox_log_prob` | Whether to compute log_prob using the training model's parameters during the training phase | +| `async_training.checkpoint_engine.enable` | 
Whether to use checkpoint_engine for accelerating, default `True` | +| `async_training.checkpoint_engine.overlap_broadcast_and_consume` | When use checkpoint_engine, whether to overlap broadcast and load_weights, default `False` | +| `async_training.checkpoint_engine.device_buffer_size_M` | When use checkpoint_engine, the user-specific bucket size (MB), default `4096` | +| `async_training.use_trainer_do_validate` | Whether use trainer node to do validate process, default `False`| + +**Further Explanation:** + +- `rollout.total_rollout_steps` + + Compared to colocate, the quantity can be aligned by multiplying train_batch_size and step: + `rollout.total_rollout_steps = data.train_batch_size * step`. + +- `async_training.trigger_parameter_sync_step` + + In the fully async strategy, it indicates how many local updates the Trainer performs (i.e., how many times it fetches + `require_batches * ppo_mini_batch_size` samples) before a parameter synchronization with Rollouter. + Between every two parameter synchronizations between Rollouter and Trainer, the Trainer will process + `trigger_parameter_sync_step* require_batches*ppo_mini_batch_size` samples. + To fairly compare speed with colocate, trigger_parameter_sync_step should be set to + `data.train_batch_size / (require_batches * ppo_mini_batch_size)`. + +- `async_training.staleness_threshold` + + In the fully async strategy, it indicates the maximum proportion of stale samples allowed to be used. + + - staleness_threshold=0, indicates synchronous training. + Rollouter will generate a fixed number of samples between two parameter updates, the sample count is: + $$rollout\_num = (trigger\_parameter\_sync\_step*require\_batches*ppo\_mini\_batch\_size)$$ + - staleness_threshold>0, indicates asynchronous training, can be set to a decimal for more flexible asynchronous + calls. 
+ Rollouter will generate at most the following number of samples between two parameter updates: + $$rollout\_num = (1+staleness\_threshold)*(trigger\_parameter\_sync\_step*require\_batches*ppo\_mini\_batch\_size) - num\_staleness\_sample $$ + + num_staleness_sample represents the number of stale samples generated in excess during the last rollout. + + Since it's a streaming system, rollout continues to generate and trainer continues to consume. If rollouter is slower, + trainer will trigger parameter synchronization earlier, and rollouter will not actually produce rollout_num samples. + When rollout is fast enough, setting staleness_threshold to 1 is basically equivalent to one_step_off policy. + To avoid too many expired samples affecting training accuracy, it is recommended to set this value to less than 1. + +- `async_training.partial_rollout` + + partial_rollout only actually takes effect when staleness_threshold>0. + +- `async_training.use_rollout_log_probs` + + In reinforcement learning algorithms, log_probs have implicit correlations with parameter versions and tokens. Due to + the settings of algorithms like PPO/GRPO/DAPO, when calculating importance sampling, + old_log_prob must use the log_probs corresponding to the rollout parameters and tokens to ensure algorithm + correctness. In the fully + async strategy, we default to old_log_prob being calculated by rollout rather than by trainer. + +- `async_training.require_batches` + + In streaming training, require_batches should be set to 1, indicating that training is performed after producing + enough ppo_mini_batch_size samples. + In actual testing, we found that if fewer samples are issued at once, due to the order of data distribution, it can + cause training instability and longer response lengths. + Here, we additionally provide require_batches for streaming distribution and control the number of samples + participating in training at once. 
+ +- `async_training.compute_prox_log_prob` (experimental) + + During the training process, we observed that metrics and response lengths may become unstable in the later + stages of training. To mitigate this issue, we can use + the [Rollout Importance Sampling](https://verl.readthedocs.io/en/latest/advance/rollout_is.html) + technique for importance sampling. To utilize Rollout Importance Sampling, we need to compute log_prob using + the training engine, which requires enabling this switch. + Additionally, when compute_prox_log_prob and Rollout Importance Sampling are enabled under mode d + (async stream pipeline with partial rollout), our implementation approximates `Areal's Decoupled PPO`. + +- `async_training.checkpoint_engine.enable` + + Enabling the checkpoint engine generally reduces synchronization time overhead by more than 60% compared to + the original per-tensor parameter synchronization method. However, assembling buckets incurs additional + temporary GPU memory overhead. + +- `async_training.checkpoint_engine.overlap_broadcast_and_consume` + + Enabling pipeline between the broadcast and load_weights parameters will allocate additional GPU memory. + Since the main time consumption for parameter synchronization is not in the broadcast and load_weights phases, + but in the parameter generation phase (by megatron or FSDP), this option is off by default. + +- `async_training.checkpoint_engine.device_buffer_size_M` + + It controls the size of the memory buffer used for synchronization when the checkpoint-engine is enabled. + The actual `bucket_size` = `max(device_buffer_size_M, maximum parameter tensor size)`. 
+
+  - When `overlap_broadcast_and_consume` is enabled, the additional device memory overhead of the
+    trainer rank is `3 * bucket_size` and the rollout rank is `2 * bucket_size`.
+  - When `overlap_broadcast_and_consume` is disabled, the additional device memory overhead of the
+    trainer rank is `2 * bucket_size` and the rollout rank is `1 * bucket_size`.
+
+* `async_training.use_trainer_do_validate`
+
+  It controls whether to use the trainer's `do_validate` method for validation.
+  If set to True, the trainer will perform validation after each parameter update. It can reduce the validation time
+  overhead and trainer node idle time.
+  If set to False, the trainer will not perform validation.
+
+### Supported Modes
+
+1. on policy pipeline:
+
+   1. **trigger_parameter_sync_step=1, staleness_threshold=0**
+   2. Rollouter produces `require_batches*ppo_mini_batch_size` samples at once, Trainer fetches these samples for
+      training, and after training completes, Trainer and Rollouter perform a parameter synchronization;
+   3. During the rollout phase, if there are long-tail samples but few rollout samples, shorter samples cannot fill
+      idle resources, causing some resource waste.
+   4. As shown in figure a;
+
+2. stream off policy pipeline:
+
+   1. **trigger_parameter_sync_step>1, staleness_threshold=0**
+   2. Synchronous streaming training will be performed. Rollouter produces
+      `require_batches*ppo_mini_batch_size*trigger_parameter_sync_step` samples at once, Trainer performs a local
+      training every time it fetches `require_batches*ppo_mini_batch_size` samples, and after training
+      trigger_parameter_sync_step times, Trainer and Rollouter perform a parameter synchronization;
+   3. Compared to a, since more samples are generated at once, resource idleness will be lower.
+   4.
In one step training, there will be two periods of resource idleness: when fetching the first batch of samples, + train waits for `require_batches*ppo_mini_batch_size` samples to be produced, and during the last parameter + update, rollout waits for training to complete. + 5. As shown in figure b; + +3. async stream pipeline with stale samples: + + 1. **trigger_parameter_sync_step>=1, staleness_threshold>0, partial_rollout=False** + 2. After each parameter update, Rollouter will plan to produce at most rollout_num samples (in practice, the number + of samples generated may be less than this value depending on rollout speed). + 3. If the rollout process is relatively fast, Rollouter will generate some additional samples num_stale_samples + before parameter synchronization for immediate use by Trainer after synchronization. + When triggering parameter synchronization, if Rollouter has ongoing tasks, it will wait for the tasks to complete + and not add new tasks; + 4. Compared to b, except for the first step training, subsequent training will not have the time to wait for the + first batch rollout to finish, but will have the time to wait for active tasks to finish. + 5. As shown in figure c; + +4. async stream pipeline with partial rollout: + 1. **trigger_parameter_sync_step>=1, staleness_threshold>0, partial_rollout=True** + 2. Compared to c, when triggering parameter synchronization, if Rollouter has samples being produced, it will + interrupt the rollout process and perform parameter synchronization. The interrupted samples will continue to be + generated after synchronization. This reduces the time to wait for active tasks to finish. + 3. 
As shown in figure d; + +![fully_async_policy_mode](https://github.com/ArronHZG/verl-community/blob/main/docs/fully_async_policy_mode.svg?raw=true) + +### Key Metrics + +| metrics | implication | +| ---------------------------------------------- | ------------------------------------------------------------------------------------------------------ | +| `trainer/idle_ratio` | Trainer idle rate | +| `rollouter/idle_ratio` | Rollouter idle rate | +| `fully_async/count/stale_samples_processed` | Total number of old samples used in training | +| `fully_async/count/stale_trajectory_processed` | Total number of old trajectories used in training (one sample produces rollout.n trajectories) | +| `fully_async/partial/total_partial_num` | Number of partial samples processed by Trainer between two trigger_parameter_sync_step | +| `fully_async/partial/partial_ratio` | Ratio of partial samples processed by Trainer between two trigger_parameter_sync_step | +| `fully_async/partial/max_partial_span` | Maximum parameter span of partial samples processed by Trainer between two trigger_parameter_sync_step | + +### Parameter Tuning Recommendations + +- Resource Allocation and Adjustment: + + - Reasonable resource allocation is the prerequisite for achieving good training efficiency. The ideal resource + allocation should make the rollout time and train time close, thereby minimizing pipeline bubbles in the entire + training process, + avoiding resource idleness, and ensuring Trainer does not use old samples. In real training scenarios, resource + allocation can be adjusted based on the idle time of rollout and train during actual training, + which can be obtained from rollouter/idle_ratio and trainer/idle_ratio. If rollouter/idle_ratio is high and + trainer/idle_ratio is low, + Trainer resources should be increased and Rollouter resources should be reduced, and vice versa. 
+ +- Key Parameters: + + - staleness_threshold: Setting it too high will cause more old samples to be used, affecting model performance. It + is recommended to set it to less than 1. + - require_batches: The closer to 1, the closer to a pure streaming process, the smaller the training bubbles, and + the faster the acceleration effect that can be achieved in terms of speed, but it will affect the order of sample + processing; + - trigger_parameter_sync_step: The smaller the setting, the closer to on policy, but it will cause frequent + parameter synchronization. Long-tail samples waste resources that cannot be filled by short samples, resulting in + low resource utilization. + The larger the setting, the higher the computational efficiency, but the accuracy will be affected by off policy. + - rollout.test_freq: It will occupy Rollouter resources and is not recommended to be set too small. + +- Mode Selection: By adjusting different parameters, the Fully Async architecture supports optimization acceleration at + different levels, suitable for tasks in different scenarios. + - For small-scale tasks that need to ensure training stability and on-policy nature, and have low speed + requirements, the on policy pipeline mode (Mode 1) can be tried. + - For scenarios that need to improve training throughput but are sensitive to staleness, the stream off policy + pipeline mode can be tried. That is, by + setting trigger_parameter_sync_step>1 to improve training efficiency, but still maintaining the synchronization + mechanism (staleness_threshold=0) (Mode 2). + - For large-scale tasks with high training speed requirements and can tolerate a certain degree of off-policy and + staleness, setting staleness_threshold> + 0 and partial_rollout=True can improve training efficiency, using the async stream pipeline mode (Mode 3 or 4). 
+
+### Quick Start
+
+```shell
+rollout_mode="async"
+rollout_name="vllm" # sglang or vllm
+if [ "$rollout_mode" = "async" ]; then
+  export VLLM_USE_V1=1
+  return_raw_chat="True"
+fi
+
+train_prompt_bsz=0
+gen_prompt_bsz=1
+n_resp_per_prompt=16
+train_prompt_mini_bsz=32
+total_rollout_steps=$(((512*400)))
+test_freq=10
+staleness_threshold=0
+trigger_parameter_sync_step=16
+partial_rollout=False
+
+
+python -m verl.experimental.fully_async_policy.fully_async_main \
+  data.train_batch_size=${train_prompt_bsz} \
+  data.gen_batch_size=${gen_prompt_bsz} \
+  data.return_raw_chat=${return_raw_chat} \
+  actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
+  actor_rollout_ref.actor.strategy=fsdp2 \
+  critic.strategy=fsdp2 \
+  actor_rollout_ref.hybrid_engine=False \
+  actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \
+  actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+  actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \
+  actor_rollout_ref.rollout.name=${rollout_name} \
+  actor_rollout_ref.rollout.mode=${rollout_mode} \
+  actor_rollout_ref.rollout.calculate_log_probs=True \
+  trainer.nnodes="${NNODES_TRAIN}" \
+  trainer.n_gpus_per_node="${NGPUS_PER_NODE}" \
+  rollout.nnodes="${NNODES_ROLLOUT}" \
+  rollout.n_gpus_per_node="${NGPUS_PER_NODE}" \
+  rollout.total_rollout_steps="${total_rollout_steps}" \
+  rollout.test_freq="${test_freq}" \
+  async_training.staleness_threshold="${staleness_threshold}" \
+  async_training.trigger_parameter_sync_step="${trigger_parameter_sync_step}" \
+  async_training.partial_rollout="${partial_rollout}"
+```
+
+## Experiments
+
+### Asynchronous Training on 7B Model
+
+We used Qwen2.5-Math-7B to verify the benefits of the fully async strategy under long response lengths and multiple
+resource scales.
+Using the `async stream pipeline with stale samples` strategy, we achieved about 2x performance improvement on 32 cards,
+64 cards, and 128 cards without significantly affecting experimental results.
+ +- Machine: H20 +- Model: Qwen2.5-Math-7B +- Rollout length: max_response_length FSDP2: 28K tokens; +- Algorithm: DAPO +- Dataset: TRAIN_FILE: dapo-math-17k.parquet TEST_FILE: aime-2024.parquet +- Engine: vLLM + FSDP2 +- rollout.n: 16 +- ppo_mini_batch_size: 32 +- test_freq: 20 + +- colocate sync: + + - step: 400 + - train_batch_size: 512 + +- fully_async_policy + - total_rollout_steps: 512\*400 + - require_batches: 4 + - trigger_parameter_sync_step: 4 + - staleness_threshold: 0.5 + - partial_rollout: True + +| training mode | resource allocation | step | gen | old_log_prob | update_actor | total time
100 step | total time
200 step | total time
300 step | total time
400 step | acc/mean@1 | +| :----------------: | :-----------------: | :----: | :----: | :----------: | :----------: | :--------------------: | :--------------------: | :--------------------: | :--------------------: | :-------------------------: | +| colocate sync | 32 | 790.10 | 357.41 | 107.71 | 269.80 | 13h 44m | 1d 3h 43m | 2d 9h 22m | 3d 17h 5m | max: 0.3313
last: 0.2448 | +| fully_async_policy | 16:16 | 294.77 | 21.26 | \ | 313.81 | 7h 58m
(1.72x) | 16h 21m
(1.70x) | 1d 0h 53m
(2.31x) | 1d 9h 26m
(2.66x) | max: 0.3302
last: 0.2333 | +| colocate sync | 64 | 365.28 | 150.72 | 70.26 | 133.41 | 10h 22m | 20h 45m | 1d 7h 6m | 1d 17h 32m | max: 0.3365
last: 0.2333 | +| fully_async_policy | 32:32 | 189.26 | 28.46 | \ | 156.98 | 4h 57m
(2.09x) | 10h 14m
(2.03x) | 16h 58m
(1.83x) | 21h 40m
(1.92x) | max: 0.3677
last: 0.3406 | +| colocate sync | 128 | 356.30 | 177.85 | 53.92 | 113.81 | 8h 36m | 17h 56m | 1d 5h 6m | 1d 16h 48m | max: 0.3573
last: 0.2958 | +| fully_async_policy | 64:64 | 150.63 | 33.14 | \ | 113.16 | 3h 13m
(2.67x) | 6h 46m
(2.65x) | 10h 53m
(2.67x) | 17h 22m
(2.35x) | max: 0.3521
last: 0.3094 | + +> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy-colocate_async?nw=nwuserhouzg + +### 128-card 7B Asynchronous Mode Experiment + +We used Qwen2.5-Math-7B to verify the effects of various modes supported by fully async. +We can see that the benefit brought by streaming is approximately 1.6x, and after combining staleness and +partial_rollout, the benefit reaches 2.35x. + +| mode | step | gen | old_log_prob | update_actor | total time
100 step | total time
200 step | total time
300 step | total time
400 step | acc/mean@1 | +| :---------------------------------------------------------------------------------------------------: | :----: | :----: | :----------: | :----------: | :--------------------: | :--------------------: | :--------------------: | :--------------------: | :-------------------------: | +| colocate sync | 356.30 | 177.85 | 53.92 | 113.81 | 8h 36m | 17h 56m | 1d 5h 6m | 1d 16h 48m | max: 0.3573
last: 0.2958 | +| `stream off policy pipeline`
(+fully async: trigger_parameter_sync_step= 4,
require_batches= 4) | 231.34 | 128.47 | \ | 98.77 | 4h 25m | 9h 41m | 15h 2m | 1d 1h 53m | max: 0.2844
last: 0.2604 | +| `async stream pipeline with stale samples`
(+staleness_threshold=0.5) | | | | | | | | | | +| `async stream pipeline with partial rollout`
(+partial_rollout=True) | 150.63 | 33.14 | \ | 113.16 | 3h 13m | 6h 46m | 10h 53m | 17h 22m | max: 0.3521
last: 0.3094 | + +> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy-stream_stale_partial?nw=nwuserhouzg + +### 128-card Stale Ablation Experiment + +Under the `async stream pipeline with partial rollout` mode, we verified the impact of staleness settings on training +efficiency. +We found that the larger the staleness, the more obvious the final gains. +We also noticed that the times for staleness values of 0.3 and 0.5 are quite close, because as the training steps +increase, the response length changes significantly, causing training instability. +Further analysis and optimization are needed for this issue. + +| staleness_threshold | step | gen | old_log_prob | update_actor | total time
100 step | total time
200 step | total time
300 step | total time
400 step | acc/mean@1 | +| :-----------------: | :----: | :----: | :----------: | :----------: | :--------------------: | :--------------------: | :--------------------: | :--------------------: | :-------------------------: | +| 0 | 231.34 | 128.47 | \ | 98.77 | 4h 25m | 9h 41m | 15h 2m | 1d 1h 53m | max: 0.2844
last: 0.2604 | +| 0.1 | 171.30 | 58.17 | \ | 109.12 | 3h 53m | 8h 37m | 14h 25m | 19h 59m | max: 0.3542
last: 0.2979 | +| 0.3 | 146.11 | 38.88 | \ | 103.22 | 3h 18m | 6h 49m | 11h 40m | 17h 20m | max: 0.3469
last: 0.2865 | +| 0.5 | 150.63 | 33.14 | \ | 113.16 | 3h 13m | 6h 46m | 10h 53m | 17h 22m | max: 0.3521
last: 0.3094 | + +> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy-stream_stale_partial?nw=nwuserhouzg + +### 128-card 7B require_batches Ablation Experiment + +In multiple tests, we found that the number of samples issued each time in streaming affects the response length during +training, which in turn affects training time. We verified the impact on results by modifying +`async_training.require_batches`. + +| require_batches | step | gen | old_log_prob | update_actor | total time
100 step | total time
200 step | total time
300 step | acc/mean@1 | +| :-------------: | :----: | :---: | :----------: | :----------: | :--------------------: | :--------------------: | :--------------------: | :-------------------------: | +| 1 | 203.47 | 30.88 | \ | 181.08 | 3h 31m | 8h 29m | 17h 36m | max: 0.349
last: 0.326 | +| 2 | 158.72 | 26.32 | \ | 128.08 | 3h 35m | 7h 38m | 13h 57m | max: 0.351
last: 0.3406 | +| 4 | 124.64 | 25.62 | \ | 95.06 | 3h 13m | 6h 46m | 10h 53m | max: 0.3521
last: 0.3521 | + +> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy-ablation_require_batches?nw=nwuserhouzg + +### 30B Model Mode Experiment + +We achieved a 1.7x performance improvement with `async stream pipeline with staleness samples` strategy on the +Qwen3-30B-A3B-Base model compared to the colocate setup. It is worth noting that this is far from the upper limit of +performance gains achievable through asynchrony. Firstly, the comparative experiments used a maximum response length of +only 8k, which is much shorter than the 20k sequence length in previous experiments, resulting in a less pronounced +rollout tail effect. Secondly, we adopted a highly skewed resource allocation, with rollout using 96 GPUs and trainer +using 32 GPUs, which is not an optimal configuration. During the experiments, we observed that the current verl +implementation imposes certain constraints, such as requiring data to be evenly divisible by the number of GPUs, making +resource adjustment less flexible. Additionally, as asynchronous training and deployment accelerate, the performance gap +is gradually narrowing. Therefore, enabling more flexible resource allocation and dynamic resource adjustment in the +future will be our next focus. 
+ +- Machine: H20 +- Model: Qwen3-30B-A3B-Base +- Rollout length: max_response_length : 8K tokens; +- Algorithm: GRPO +- Dataset: TRAIN_FILE: dapo-math-17k.parquet TEST_FILE: aime-2024.parquet +- Engine: vLLM + Megatron +- rollout.n: 16 +- ppo_mini_batch_size: 128 +- test_freq: 20 + +- colocate sync: + + - step:400 + - train_batch_size: 512 + +- fully_async_policy + - total_rollout_steps: 512\*400 + - trigger_parameter_sync_step: 512/128 = 4 + - staleness_threshold: 0.5 + - partial_rollout: True + +| Training Mode | Resource Allocation | Step | Gen | Old Log Prob | Ref | Update Actor | Total Time 100 Step | Total Time 200 Step | Total Time 300 Step | Total Time 400 Step | Acc/Mean@1 | +| ------------------ | ------------------- | ------ | ------ | ------------ | ----- | ------------ | ------------------- | ------------------- | ------------------- | ------------------- | --------------------------- | +| Colocate Sync | 128 | 497.89 | 348.05 | 28.73 | 20.86 | 86.27 | 13h 36m | 1d 3h 48m | 1d 19h 4m | 2d 11h 39m | max: 0.3500
last: 0.3208 | +| Fully Async Policy | 96:32 | 282.75 | 22.06 | \ | 50.05 | 206.63 | 6h 45m (2.01x) | 14h 48m (1.88x) | 1d 0h 9m (1.78x) | 1d 10h 41m (1.72x) | max: 0.3813
last: 0.3448 | + +> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy-30B?nw=nwuserhouzg | | | + +### checkpoint-engine Ablation Experiment + +We tested the single-step parameter synchronization time of the checkpoint-engine on three models: Qwen2.5-Math-7B, Qwen3-30B-A3B, and Qwen3-235B-A22B, using default checkpoint-engine configurations. All experiments were performed on H20 machines, and the Megatron engine was used for training. +| model | trainer rank | rollout rank | checkpoint-engine | total sync time | +|:-----------------:|:--------:|:-------:|:--------------:|:--------------:| +| Qwen2.5-Math-7B | 4 | 4 | False | 0.12s | +| Qwen2.5-Math-7B | 4 | 4 | True | 0.02s | +| Qwen3-30B-A3B | 16 | 16 | False | 15.76s | +| Qwen3-30B-A3B | 16 | 16 | True | 4.38s | +| Qwen3-235B-A22B | 64 | 64 | False | 58.57s | +| Qwen3-235B-A22B | 64 | 64 | True | 23.70s | + +### use_trainer_do_validate Experiment + +We tested the effect of setting `use_trainer_do_validate=True` on the training process. The results show that setting +this parameter to True can reduce the validation time overhead and trainer node idle time. +We used Qwen2.5-Math-7B to verify the benefits of `use_trainer_do_validate=True` on the training process, we achieved about 2x performance improvement on validation time, and the trainer node idle time is reduced by about 40%. + +* Machine: H20 +* Model: Qwen2.5-Math-7B +* Rollout length: max_response_length FSDP2: 10K tokens; +* Algorithm: DAPO +* Dataset: TRAIN_FILE: dapo-math-17k.parquet TEST_FILE: aime-2024.parquet +* Engine: vllm+FSDP2 +* rollout.n: 16 +* ppo_mini_batch_size: 32 +* test_freq: 10 + +* fully_async_policy + * total_rollout_steps: 512*400 + * require_batches: 4 + * trigger_parameter_sync_step: 4 + * staleness_threshold: 0.5 + * partial_rollout: True + +| training mode | resource allocation | step | gen | old_log_prob | update_actor | validate time | total time
50 step | acc/mean@2 | +|:---------------:|:---------------:|:---------------:|:---------------:|:---------------:|:---------------:|:---------------:|:---------------:|:---------------:| +| colocate sync | 16 | 484.623 | 52.939 | 0 | 430.263 | 205.080 | 7h9m | 22.6 | +| fully_async_policy | 8:8 | 489.953 | 52.622 | 0 | 435.874 | 95.699 | 7h2m | 21.0 | + + +## Multi-Turn Tool Calling + +Referencing **recipe/retool** and **ToolAgentLoop**, we implemented **AsyncPartialToolAgentLoop**, a multi-turn +tool-calling loop that supports partial_rollout for **fully_async_policy**. + +### Core Design + +`AsyncPartialToolAgentLoop` inherits from `ToolAgentLoop` and is adapted for the asynchronous training mode of +`fully_async_policy`. When `partial_rollout=True`, the Rollouter interrupts ongoing generation tasks before +synchronizing parameters with the Trainer. `AsyncPartialToolAgentLoop` is capable of: + +1. **Interrupting Tasks**: Responding to an interrupt signal to save the current state. Currently, interruptions occur + during the `GENERATING` process or after other states have completed. +2. **Resuming Tasks**: Resuming execution from the saved state after parameter synchronization is complete, rather than + starting over. + +### How to Use + +RL training with multi-turn tool calling in `fully_async_policy` is similar to `recipe/retool`. It is enabled by +specifying `multi_turn` configurations in the config file. + +1. **SFT Stage**: First, the model should undergo SFT to learn how to follow tool-calling format instructions. +2. **Multi-turn Configuration**: In the `fully_async_policy` training configuration, set the following parameters: + ```yaml + actor_rollout_ref: + rollout: + multi_turn: + enable: True # AsyncPartialToolAgentLoop will be used by default in fully_async_policy mode + # Other multi_turn related configurations + ``` +3. 
**Async Parameters**: To improve efficiency, enable `partial_rollout` and `staleness_threshold` when using multi-turn + tool calling: + ```yaml + async_training: + partial_rollout: True + staleness_threshold: 0.5 + # Other async parameters + ``` +4. **Example**: See `recipe/fully_async_policy/shell/dapo_7b_async_retool.sh`. + +### Experimental Results + +To validate the performance of `fully_async_policy` on multi-turn tool-calling tasks, we compared it with the standard +`colocate` synchronous mode. Key parameter settings are as follows. + +- **SFT Model**: Based on `Qwen2.5-7B-Instruct`, trained for 6 epochs on the `ReTool-SFT` dataset +- **RL Algorithm**: DAPO +- **Dataset**: + - Train: `DAPO-Math-17k` + - Test: `aime_2025` +- **Resource and Mode Comparison**: + - `colocate sync`: 32 H20 gpus + - `fully_async_policy`: 16 gpus for Trainer + 16 gpus for Rollouter +- **Key Configurations**: + 1. **Tool Calling Configuration**: + - `multi_turn.enable: True` + - `multi_turn.max_user_turns: 16` + - `multi_turn.max_assistant_turns: 16` + - `multi_turn.tool_config_path: recipe/retool/sandbox_fusion_tool_config.yaml` + 2. **`colocate sync` Configuration**: + - `ppo_mini_batch_size: 16` + - `train_batch_size: 64` + 3. **`fully_async_policy` Configuration**: + - `ppo_mini_batch_size: 16` + - `trigger_parameter_sync_step: 4` + - `require_batches: 1` + - `staleness_threshold: 1` + - `partial_rollout: True` + +| training mode | Resource allocation | step | gen | old_log_prob | update_actor | total time
100 step | total time
200 step | aime_2025
acc/mean@30 | +| :----------------: | :-----------------: | :----: | :----: | :----------: | :----------: | :--------------------: | :--------------------: | :-------------------------: | +| colocate | 32 | 375.47 | 228.03 | 35.19 | 111.84 | 9h 46m | 22h 28m | start:0.1078
last:0.2056 |
| fully_async_policy | 16:16 | 221.36 | 40.59 | \ | 179.58 | 6h 19m<br/>
(1.55x) | 14h 4m
(1.60x) | start:0.11
last:0.2044 | + +> source data: https://wandb.ai/hou-zg-meituan/fully-async-policy-multiturn-tool?nw=nwuserhouzg + +## Future Plans +- Transfer queue integration +- Asynchronous parameter synchronization diff --git a/code/RL_model/verl/verl_train/docs/advance/grafana_prometheus.md b/code/RL_model/verl/verl_train/docs/advance/grafana_prometheus.md new file mode 100644 index 0000000000000000000000000000000000000000..3b59f936728e2142df8765b6f886804069566cd9 --- /dev/null +++ b/code/RL_model/verl/verl_train/docs/advance/grafana_prometheus.md @@ -0,0 +1,193 @@ +# Use Prometheus and Grafana to Monitor Rollout + +**Author:** `https://github.com/meituan-search` + +Last updated: 12/05/2025. + +Monitor the rollout computation process using Prometheus and Grafana when using verl to enhance system observability and facilitate further performance optimization. + +We provide an additional training monitoring capability, leveraging Prometheus and Grafana to display rollout information during training and enhance system observability to facilitate further performance optimization. + +The system automatically configures Prometheus to scrape metrics from rollout servers, eliminating manual configuration steps. + +## Overview + +The figures below show the performance of Qwen235B on the AIME2024 dataset with a response length of 20k, where the emergence of a long-tail problem is clearly observable. + +![fully_async_policy_structure](https://github.com/ArronHZG/verl-community/blob/main/docs/grafana_validate.png?raw=true) + +The following figure presents the fully asynchronous training of the Qwen235B model. Here, resource idleness is distinctly noticeable, indicating that rollout resources can be reduced. + +![fully_async_policy_structure](https://github.com/ArronHZG/verl-community/blob/main/docs/grafana_fully_async_train.png?raw=true) + +Through the above two examples, we also illustrate the necessity of system observability. 
+ +## Architecture Overview + +The overall workflow consists of the following steps: + +1. **Multi-node Ray Cluster Setup**: Start Ray cluster across multiple nodes with Grafana and Prometheus information configured in environment variables on the master node +2. **Start Grafana Service**: Launch Grafana on the master node for visualization of monitoring dashboards +3. **Start Prometheus Service**: Launch Prometheus on the master node for metrics collection and storage +4. **verl Async Rollout Mode**: verl uses async rollout mode to obtain rollout server ports and IP addresses +5. **Automatic Prometheus Configuration**: verl automatically rewrites the Prometheus configuration to add monitoring for rollout servers and notifies Prometheus to reload the configuration +6. **Metrics Collection**: After program execution, metrics can be viewed in Prometheus +7. **Dashboard Visualization**: Upload and view monitoring metrics in Grafana dashboards + +## Detailed Setup Steps + +### Step 1: Environment Variables and Start Ray Cluster + +First, set the necessary environment variables and start the Ray service. 
+ +> Reference: [configure-manage-dashboard](https://docs.ray.io/en/latest/cluster/configure-manage-dashboard.html) + +```bash +# Master node environment variables +export GF_SERVER_HTTP_PORT=3000 # Grafana service default port (customizable) +export PROMETHEUS_PORT=9090 # Prometheus service default port (customizable) +export RAY_HEAD_PORT=6379 # Ray master node port (customizable) +export RAY_DASHBOARD_PORT=8265 # Ray dashboard default port (customizable) +export GRAFANA_PATHS_DATA=/tmp/grafana # Grafana data storage directory (customizable) +export RAY_GRAFANA_HOST="http://${master_ip}:${GF_SERVER_HTTP_PORT}" # Ray-associated Grafana address +export RAY_PROMETHEUS_HOST="http://${master_ip}:${PROMETHEUS_PORT}" # Ray-associated Prometheus address + +# Start Ray on master node +ray start --head --port=${RAY_HEAD_PORT} --dashboard-port=${RAY_DASHBOARD_PORT} + +# Start Ray on worker nodes +ray start --address={master_addr}:${RAY_HEAD_PORT} +``` + +**Verification:** Visit `http://master_ip:8265` to confirm Ray has started successfully. + +### Step 2: Start Grafana (Visualization Dashboard) + +Grafana is used to display metrics collected by Prometheus (such as cache hit rate, throughput, etc.): + +```bash +# Master node +nohup grafana-server \ + --config /tmp/ray/session_latest/metrics/grafana/grafana.ini \ + --homepath /usr/share/grafana \ + web > grafana.log 2>&1 & +``` + +**Verification:** Visit `http://master_ip:3000` to confirm Grafana has started successfully (default credentials: `admin/admin`). + +If you need to change the port, modify the `GF_SERVER_HTTP_PORT` environment variable, and grafana-server will automatically recognize it. 
+ +### Step 3: Start Prometheus (Metrics Collection) + +Prometheus is responsible for scraping metrics from vLLM services and storing them as time-series data: + +```bash +# Master node +nohup prometheus \ + --config.file /tmp/ray/session_latest/metrics/prometheus/prometheus.yml \ + --web.enable-lifecycle \ + --web.listen-address=:${PROMETHEUS_PORT} \ + > prometheus.log 2>&1 & +``` + +**Verification:** Visit `http://master_ip:9090` to confirm Prometheus service has started successfully. + +### Step 4 & 5: Start verl Training + +Start verl training with the following parameters configured: + +**Required Configuration:** + +- `actor_rollout_ref.rollout.mode="async"` +- `actor_rollout_ref.rollout.disable_log_stats=False` +- `actor_rollout_ref.rollout.prometheus.enable=True` + +If use default port, this parameter can be omitted. + +- `actor_rollout_ref.rollout.prometheus.port=9090` + +If use default path, this parameter can be omitted. + +- `actor_rollout_ref.rollout.prometheus.file="/tmp/ray/session_latest/metrics/prometheus/prometheus.yml"` + +served_model_name uses `model_path.split("/")[-1]` for data statistics by default. +Users can also customize other aliases: + +- `actor_rollout_ref.rollout.prometheus.served_model_name="Qwen3-235B"` + +**Shell Script Example:** + +```bash +WORKING_DIR=${WORKING_DIR:-"${PWD}"} +RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"} + +rollout_mode="async" +rollout_name="vllm" # Options: sglang or vllm +if [ "$rollout_mode" = "async" ]; then + export VLLM_USE_V1=1 + return_raw_chat="True" +fi + +# Synchronous training +ray job submit --no-wait --runtime-env="${RUNTIME_ENV}" \ + --working-dir "${WORKING_DIR}" \ + -- python3 -m verl.trainer.main_ppo \ + data.return_raw_chat=${return_raw_chat} \ + actor_rollout_ref.rollout.name=${rollout_name} \ + actor_rollout_ref.rollout.mode=${rollout_mode} \ + actor_rollout_ref.rollout.disable_log_stats=False \ + actor_rollout_ref.rollout.prometheus.enable=True + ... 
+ +# Asynchronous training +ray job submit --no-wait --runtime-env="${RUNTIME_ENV}" \ + --working-dir "${WORKING_DIR}" \ + -- python3 verl.experimental.fully_async_policy.fully_async_main \ + data.return_raw_chat=${return_raw_chat} \ + actor_rollout_ref.rollout.name=${rollout_name} \ + actor_rollout_ref.rollout.mode=${rollout_mode} \ + actor_rollout_ref.rollout.disable_log_stats=False \ + actor_rollout_ref.rollout.prometheus.enable=True + ... +``` + +### Step 6: View Metrics in Prometheus + +After task execution, verify that Prometheus is correctly collecting metrics. + +**Verification:** Visit the Prometheus interface at `http://master_ip:9090` and search for `vllm:` or `sglang:` to +confirm metrics are being reported correctly. + +**Troubleshooting:** + +If no metrics appear: + +1. Check logs for `AgentLoopManager` to find the server port +2. Visit `http://master_ip:server_port/metrics` to verify server metrics are available +3. Confirm that `actor_rollout_ref.rollout.disable_log_stats=False` is set + +### Step 7: View Metrics in Grafana + +After task execution, log in to Grafana to view and customize monitoring dashboards. + +**Login:** Visit `http://master_ip:3000` (default credentials: `admin/admin`) + +**Import Dashboard:** + +1. Select `Dashboards` → `New` → `Import` → `Upload dashboard JSON file` +2. 
Upload a pre-built dashboard JSON file + +**Available Dashboards:** + +- [vLLM Grafana Dashboard style 1](https://github.com/ArronHZG/verl-community/blob/main/docs/grafana/vllm_grafana.json) +- [vLLM Grafana Dashboard style 2](https://github.com/vllm-project/vllm/blob/main/examples/online_serving/dashboards/grafana/performance_statistics.json) +- [vLLM Grafana Dashboard style 2](https://github.com/vllm-project/vllm/blob/main/examples/online_serving/dashboards/grafana/query_statistics.json) +- [SGLang Grafana Dashboard](https://github.com/sgl-project/sglang/blob/main/examples/monitoring/grafana/dashboards/json/sglang-dashboard.json) + +## Additional Resources + +- [Ray Monitoring Documentation](https://docs.ray.io/en/latest/cluster/configure-manage-dashboard.html) +- [Prometheus Documentation](https://prometheus.io/docs/) +- [Grafana Documentation](https://grafana.com/docs/) +- [vLLM GitHub Repository](https://github.com/vllm-project/vllm) +- [SGLang GitHub Repository](https://github.com/sgl-project/sglang) diff --git a/code/RL_model/verl/verl_train/docs/advance/megatron_extension.rst b/code/RL_model/verl/verl_train/docs/advance/megatron_extension.rst new file mode 100644 index 0000000000000000000000000000000000000000..9a52e6017b7adc77b404398501587aff0e045129 --- /dev/null +++ b/code/RL_model/verl/verl_train/docs/advance/megatron_extension.rst @@ -0,0 +1,20 @@ +Add models with the Megatron-LM backend +========================================= + +Last updated: 04/25/2025. + +Model +----------- + + +If use latest verl, we have direct support of ``GPTModel`` for Megatron backend. +You can use the similar way of using Megatron to pretrain custom models. +We list the steps here: + +1. Find `model_initializer.py `_ +2. If your model is configurable by ``TransformerLayerSpec`` , you can + directly use ``GPTModel``. Otherwise, Please implement a new + ``ModelLayerSpec`` and ``ModelLayer`` here. +3. 
Use the right ``LayerSpec`` , ``TransformerConfig`` and ``HuggingfaceConfig`` + as arguments to initialize the GPTModel. +4. Return the model at last. diff --git a/code/RL_model/verl/verl_train/docs/advance/mtp.md b/code/RL_model/verl/verl_train/docs/advance/mtp.md new file mode 100644 index 0000000000000000000000000000000000000000..b4c5a25c631220d5307d11beb1de122f43312699 --- /dev/null +++ b/code/RL_model/verl/verl_train/docs/advance/mtp.md @@ -0,0 +1,105 @@ +# Guide to Using MTP in SFT/RL Training and Inference + +**Author**: `https://github.com/meituan-search` + +Last updated: 01/30/2026 + +# 1. Scope of Support + +Currently, RL training can be performed on mimo-7B-RL, Qwen-next, and Deepseek series models based on the MTP architecture. The support rules for training and inference engines are as follows: + +- **Training Engine**: Only supports the `mbridge + megatron` combination; other training engines are not compatible at this time; + +- **Inference Engine**: Compatible with all engines, but the model must be in the corresponding engine's compatibility list; + +- **Dependency Versions**: + + - mbridge: Use the specified branch: [https://github.com/ArronHZG/mbridge/tree/feature/verl_mtp](https://github.com/ArronHZG/mbridge/tree/feature/verl_mtp) (will be merged into the main branch in the future); + + - megatron: Use the latest dev version (commit: [23e092f41ec8bc659020e401ddac9576c1cfed7e](https://github.com/NVIDIA/Megatron-LM/tree/23e092f41ec8bc659020e401ddac9576c1cfed7e)), which supports MTP + CP training methods. + + - sglang: Use the specified branch: [https://github.com/ArronHZG/sglang/tree/fix_mtp_update_weights_from_tensor](https://github.com/ArronHZG/sglang/tree/fix_mtp_update_weights_from_tensor), [PR](https://github.com/sgl-project/sglang/pull/17870) , which fix the MTP update weights from tensor OOM issue. + +# 2. 
MTP Training Configuration (Core Parameters) + +The MTP training process can be flexibly controlled through the following configurations. All configurations are based on the `actor_rollout_ref.model.mtp` prefix: + +| Configuration Scenario | Core Parameters | Description | +|------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------| +| Load MTP Parameters Only | `enable=True` | VRAM usage will increase, but the exported parameters include the MTP module and can be directly used for online deployment | +| Full-Parameter MTP Training | `enable=True`
`enable_train=True`
`mtp_loss_scaling_factor=0.1` | MTP Loss will apply to all model parameters | +| MTP Parameter-Only Training | `enable=True`
`enable_train=True`
`detach_encoder=True` | Freeze the Encoder layer, update only MTP module parameters, MTP Loss applies only to MTP parameters | +| MTP Accelerated Rollout | 1. vLLM configuration:
`enable=True`
`enable_rollout=True`
`method="mtp"`
`num_speculative_tokens=1`
2. SGLang configuration:
`enable=True`
`enable_rollout=True`
`speculative_algorithm="EAGLE"`
`speculative_num_steps=2`
`speculative_eagle_topk=2`
`speculative_num_draft_tokens=4` | Achieve inference acceleration during the Rollout phase based on MTP | + +# 3. Experimental Results + +The experiment was conducted as follows: + +* model = mimo-7B-math +* max_response_length = 8k + +Experiment chart: + +![fully_async_policy_revenue]( +https://github.com/ArronHZG/verl-community/blob/main/docs/mimo-7b-mtp.png?raw=true) + +The wandb link for the graph: [wandb](https://wandb.ai/hou-zg-meituan/mimo-7b-sft-mtp?nw=nwuserhouzg) + +**Scenarios with No Significant Effect** + +The following configurations will not have a noticeable impact on training results: + +1. The base model does not carry MTP parameters; + +2. The base model carries MTP parameters, but the MTP module is not trained; + +3. The base model carries MTP parameters and trains MTP, with `mtp_loss_scaling_factor=0`; + +4. The base model carries MTP parameters, trains MTP and detaches the encoder, with `mtp_loss_scaling_factor=0.1`. + +**Scenarios with Significant Effect** + +Only the following configuration will have a noticeable impact on training results: + +- The base model carries MTP parameters, MTP Loss applies to all model parameters, and `mtp_loss_scaling_factor=0.1`. + +**Recommended Training Method** + +It is recommended to adopt the `detach_encoder=True` approach for MTP training. + +# 4. Performance Notes for MTP in Rollout Inference + +The effectiveness of MTP-accelerated Rollout is significantly affected by **model size** and **inference hardware**. Key reference information is as follows: + +**Hardware Tensor Core Performance** + +| Hardware Model | FP16 Performance (TFLOPS) | +|----------------|---------------------------| +| H20 | 148 | +| H800 | 1,671 | +| H200 | 1,979 | + +**Measured Performance and Recommendations** + +Taking the mimo-7B model deployed separately on H20 hardware using SGLang as an example: After enabling MTP speculative decoding, the Rollout throughput decreases by approximately 50%. 
+ +- Current priority recommendation: Do not enable MTP acceleration during the inference phase for now; + +- Future planning: Further optimization of the speculative logic in the Rollout phase will be conducted to improve throughput performance. + +# 5. SFT training + +The SFT training with MTP is supported, using the same MTP training configuration as RL training. + +An example configuration for running SFT can be found in `examples/sft/gsm8k/run_mimo_megatron_mtp.sh` + +**SFT result** + +The experiment was conducted using following data: +- model = mimo-7B-math +- dataset = gsm8k + +The result: [wandb link](https://wandb.ai/hou-zg-meituan/mimo-7b-sft-mtp?nw=nwuserhouzg) + +The presence of mtp layer has limited effect on main loss. However, when MTP layer is detached, the mtp_loss converges to a higher value. + diff --git a/code/RL_model/verl/verl_train/docs/advance/one_step_off.md b/code/RL_model/verl/verl_train/docs/advance/one_step_off.md new file mode 100644 index 0000000000000000000000000000000000000000..99170d75edc3112b5eba00ab562d8c2316acb9c0 --- /dev/null +++ b/code/RL_model/verl/verl_train/docs/advance/one_step_off.md @@ -0,0 +1,319 @@ +# Recipe: One Step Off Policy Async Trainer + +**Author:** `https://github.com/meituan-search` + +Last updated: 07/17/2025. + +## Introduction + +### Background + +The current reinforcement learning training process implemented by verl is synchronous, adhering to the algorithmic +workflows of established methods like PPO, GRPO, and DAPO. In each step, training samples are generated by the latest +model, and the model is updated after training completes. While this approach aligns with off-policy reinforcement +learning and stabilizes RL training, but it suffers from severe efficiency issues. +Model updates must wait for the longest output in the generation phase to complete. +During the generation of long-tail samples, GPUs remain idle, resulting in significant underutilization. 
+The more severe the long-tail problem in sample generation, the lower the overall training efficiency. +For example, in DAPO 32B training, the Rollout phase accounts for approximately 70% of the total time, +and increasing resources does not reduce the Rollout duration. + +![DAPO 32B Math Performance](https://raw.githubusercontent.com/eric-haibin-lin/verl-community/refs/heads/main/docs/dapo_32b_math.png) + +> source data: https://wandb.ai/verl-org/DAPO%20Reproduction%20on%20verl/workspace?nw=nwusertongyuxuan361 + +### Solution + +We have implemented the **One Step Off Async Trainer** to help alleviate this issue. This approach parallelizes the +generation and training processes, utilizing samples generated in the previous step for current training. +It also involves appropriately partitioning resources, allocating dedicated resources for generation while automatically +assigning the remainder to training. By reducing resources allocated to the generation phase, we mitigate GPU idle time +during long-tail sample generation. Throughout this process, generation and training parameters maintain a one-step off +policy. + +![One Step Off Policy Diagram](https://raw.githubusercontent.com/eric-haibin-lin/verl-community/refs/heads/main/docs/one_step_off_policy.png) + +> reference: [AReaL: A Large-Scale Asynchronous Reinforcement Learning System for Language Reasoning](https://arxiv.org/abs/2505.24298) + +Our core contributions include: + +1. **Parallel Generation and Training**: + Samples for the next batch are asynchronously generated while the current batch is being trained. + +2. **Resource Isolation**: + Unlike `hybrid_engine`, this method requires explicit resource allocation for rollout, with remaining resources + automatically assigned to training. + +3. **NCCL Parameter Synchronization**: + Employs NCCL communication primitives for seamless parameter transfer between generation and training modules. 
+ +### Experimental Results + +- **Machine Configuration**: 2 nodes with 16 H20 GPUs each + - Generation: 4 GPUs + - Training: 12 GPUs +- **Model**: Qwen2.5-Math-7B +- **Rollout Configuration**: +- **Max Response Length**: FSDP2: 20,480 tokens; Megatron: 8,192 tokens +- **Algorithm**: DAPO +- **Rollout Engine**: vLLM + +| training mode | engine | step | gen | wait_prev_gen | generate_sequences | old_log_prob | update_actor | total time | acc/best@32/mean | acc/maj@32/mean | +| ---------------------- | ------------- | ---- | --- | ------------- | ------------------ | ------------ | ------------ | -------------- | ---------------- | --------------- | +| colocate sync | VLLM+FSDP2 | 749 | 321 | - | 247 | 88 | 286 | 19h18m | 0.5948 | 0.417 | +| one-step-overlap async | VLLM+FSDP2 | 520 | - | 45 | 458 | 108 | 337 | 15h34m(+23%) | 0.6165 | 0.494 | +| colocate sync | VLLM+Megatron | 699 | 207 | - | 162 | 119 | 344 | 18h21m | 0.605 | 0.4217 | +| one-step-overlap async | VLLM+Megatron | 566 | - | 59 | 501 | 120 | 347 | 13h06m (+40%) | 0.6569 | 0.4038 | + +- colocate sync: step ≈ gen + old_log_prob + update_actor +- one-step-overlap async: step ≈ wait_prev_gen + old_log_prob + update_actor + +![One Step Off Megatron Performance](https://raw.githubusercontent.com/eric-haibin-lin/verl-community/refs/heads/main/docs/one_step_off_megatron.png) + +> source data: https://wandb.ai/hou-zg-meituan/one-step-off-policy?nw=nwuserhouzg + +## Implementation + +### One Step Off Policy Async Pipeline + +Our implemented **One Step Off Policy Async Pipeline** integrates seamlessly into existing training logic at minimal +cost, +eliminating the need for additional sample storage management. The core mechanism uses `async_gen_next_batch` +for asynchronous rollout generation while maintaining continuous operation during epoch transitions +via `create_continuous_iterator`. 
+ +```python +# iterator generator, simplify one-step integration of the training process +def _create_continuous_iterator(self): + for epoch in range(self.config.trainer.total_epochs): + iterator = iter(self.train_dataloader) + for batch_dict in iterator: + yield epoch, batch_dict + + +# read next batch samples, parameters sync and launch asyn gen_seq +def _async_gen_next_batch(self, continuous_iterator): + # read train_data + try: + epoch, batch_dict = next(continuous_iterator) + except StopIteration: + return None + batch = DataProto.from_single_dict(batch_dict) + gen_batch = batch_pocess(batch) + # sync weights from actor to rollout + self.sync_rollout_weights() + # async generation + gen_batch_output = self.rollout_wg.async_generate_sequences(gen_batch) + # future encapsulated + return GenerationBatchFuture(epoch, batch, gen_batch_output) + + +continuous_iterator = self._create_continuous_iterator() +# run rollout first to achieve one-step-off +batch_data_future = self._async_gen_next_batch(continuous_iterator) + +while batch_data_future is not None: + # wait for the gen_seq result from the previous step + batch = batch_data_future.get() + # launch the next async call to generate sequences + batch_data_future = self._async_gen_next_batch(continuous_iterator) + + # compute advantages + batch = critic.compute_values(batch) + batch = reference.compute_log_prob(batch) + batch = reward.compute_reward(batch) + batch = compute_advantages(batch) + + # model update + critic_metrics = critic.update_critic(batch) + actor_metrics = actor.update_actor(batch) +``` + +### Parameter Synchronization + +The exciting point is that our nccl based weights updating for rollout model has great performance. +At most of time, the latency is under 300ms, which is negligible for RLHF. + +> **sync_rollout_weights**:The time for synchronizing parameters from actor to rollout is extremely fast and can almost +> be ignored because it is implemented with nccl. 
+ +```python +class ActorRolloutRefWorker: + # actor acquires the meta-info of model parameters for parameter sync + @register(dispatch_mode=Dispatch.ONE_TO_ALL) + def get_actor_weights_info(self): + params = self._get_actor_params() + ret = [] + for key, tensor in params.items(): + ret.append((key, tensor.size(), tensor.dtype)) + self._weights_info = ret + return ret + + # rollout sets the meta-info of model parameters for parameter sync + @register(dispatch_mode=Dispatch.ONE_TO_ALL) + def set_actor_weights_info(self, weights_info): + self._weights_info = weights_info + + +class AsyncRayPPOTrainer(RayPPOTrainer): + def init_workers(self): + ... + # rollout obtains the meta-info of model parameters from the actor for parameter sync + weights_info = self.actor_wg.get_actor_weights_info()[0] + self.rollout_wg.set_actor_weights_info(weights_info) + + # Create an actor-rollout communication group for parameter sync + self.create_weight_sync_group +``` + +```python +# The driving process invokes the actor and rollout respectively to create a weight synchronization group based on nccl/hccl. 
+def create_weight_sync_group(self): + master_address = ray.get(self.actor_wg.workers[0]._get_node_ip.remote()) + master_port = ray.get(self.actor_wg.workers[0]._get_free_port.remote()) + world_size = len(self.actor_wg.workers + self.rollout_wg.workers) + self.actor_wg.create_weight_sync_group( + master_address, + master_port, + 0, + world_size, + ) + ray.get( + self.rollout_wg.create_weight_sync_group( + master_address, + master_port, + len(self.actor_wg.workers), + world_size, + ) + ) + +# drive process call the actor and rollout respectively to sync parameters by nccl +def sync_rollout_weights(self): + self.actor_wg.sync_rollout_weights() + ray.get(self.rollout_wg.sync_rollout_weights()) + + +# fsdp model parameter sync +@register(dispatch_mode=Dispatch.ONE_TO_ALL, blocking=False) +def sync_rollout_weights(self): + params = self._get_actor_params() if self._is_actor else None + if self._is_rollout: + inference_model = ( + self.rollout.inference_engine.llm_engine.model_executor.driver_worker.worker.model_runner.model + ) + from verl.utils.vllm.patch import patch_vllm_moe_model_weight_loader + patch_vllm_moe_model_weight_loader(inference_model) + # Model parameters are broadcast tensor-by-tensor from actor to rollout + for key, shape, dtype in self._weights_info: + tensor = torch.empty(shape, dtype=dtype, device=get_torch_device().current_device()) + if self._is_actor: + assert key in params + origin_data = params[key] + if hasattr(origin_data, "full_tensor"): + origin_data = origin_data.full_tensor() + if torch.distributed.get_rank() == 0: + tensor.copy_(origin_data) + from ray.util.collective import collective + + collective.broadcast(tensor, src_rank=0, group_name="actor_rollout") + if self._is_rollout: + inference_model.load_weights([(key, tensor)]) +``` + +### PPO Correctness + +To ensure the correctness of the PPO algorithm, we use rollout log_probs for PPO importance sampling. 
+For the related algorithm details, please refer to: https://verl.readthedocs.io/en/latest/algo/rollout_corr_math.html +The default mode is `bypass_ppo_clip`, but other modification strategies can also be explored. + +### AgentLoop + +In the current implementation, we no longer provide SPMD model rollout mode. +Instead, we have switched to AgentLoop mode, which also supports multi-turn tool calling. + +## Usage + +### FSDP2 Configuration Example + +```shell +python3 -m verl.experimental.one_step_off_policy.async_main_ppo \ + --config-path=config \ + --config-name='one_step_off_ppo_trainer.yaml' \ + actor_rollout_ref.actor.strategy=fsdp2 \ + # actor and rollout are placed separately + actor_rollout_ref.hybrid_engine=False \ + # actor and rollout resource + trainer.nnodes=1 \ + trainer.n_gpus_per_node=6 \ + rollout.nnodes=1 \ + rollout.n_gpus_per_node=2 +``` + +### Megatron Configuration Example + +```shell +python3 -m verl.experimental.one_step_off_policy.async_main_ppo \ + --config-path=config \ + --config-name='one_step_off_ppo_megatron_trainer.yaml' \ + actor_rollout_ref.actor.strategy=megatron \ + # actor and rollout are placed separately + actor_rollout_ref.hybrid_engine=False \ + # actor and rollout resource + trainer.nnodes=1 \ + trainer.n_gpus_per_node=6 \ + rollout.nnodes=1 \ + rollout.n_gpus_per_node=2 +``` + +### Configuration Guidelines + +1. **Card Number Relationships** + Maintain either of these relationships for optimal batch distribution: + + - `actor_rollout_ref.rollout.n` should be an integer divisor of: + `trainer.n_gpus_per_node * trainer.nnodes` + - `actor_rollout_ref.rollout.n * data.train_batch_size` should be evenly divisible by: + `trainer.n_gpus_per_node * trainer.nnodes` + + > Rationale: Ensures training samples can be evenly distributed across training GPUs when using partial resources for + > generation. + +2. 
**Dynamic Resource Tuning** + Adjust `trainer.nnodes` `trainer.n_gpus_per_node` `rollout.nnodes` `rollout.n_gpus_per_node` based on phase + durations: + - **Ideal state**: Rollout and training phases have comparable durations + - **Diagnostic metrics**: + - Monitor `wait_prev_gen` duration + - Analyze `sequence_length` distribution + - **Adjustment strategy**: + - High `wait_prev_gen` + uniform sequence lengths → Increase rollout resources + - High `wait_prev_gen` + long-tail sequences → Optimize stopping criteria (resource increase won't help) + > **wait_prev_gen**:The time consumed waiting for the previous rollout to end (the part that is not fully + > overlapped). + > **Resource Configuration Strategies:** + - **Resource-constrained scenario**: Optimize resource utilization by adjusting GPU allocation ratios, + keeping the number of nodes equal to allow training and rollout to share nodes; + - Configure `trainer.nnodes = rollout.nnodes` with + `trainer.n_gpus_per_node + rollout.n_gpus_per_node = physical_gpus_per_node`. Control rollout resource + allocation by adjusting `n_gpus_per_node`. + - **Resource-abundant scenario**: Optimize performance by adjusting the number of nodes, + keeping the number of GPUs per node equal to enable independent scaling of training and rollout + parallelism. + - Configure `trainer.n_gpus_per_node = rollout.n_gpus_per_node` and control rollout resource allocation by + adjusting `trainer.nnodes` and `rollout.nnodes`to achieve optimal performance. + > **Note**: The total number of nodes required by the system is not simply `trainer.nnodes + rollout.nnodes`. 
The + > actual calculation depends on GPU capacity: + > + > - When `trainer.n_gpus_per_node + rollout.n_gpus_per_node <= physical_gpus_per_node`, + > the required node count is `max(trainer.nnodes, rollout.nnodes)` + > - When `trainer.n_gpus_per_node + rollout.n_gpus_per_node > physical_gpus_per_node`, + > the required node count is `trainer.nnodes + rollout.nnodes` + +## Functional Support + +| Category | Support Situation | +| ------------------ | --------------------------------------------------------------------------------------------------------------- | +| train engine | FSDP2
Megatron | +| rollout engine | vLLM | +| AdvantageEstimator | GRPO
GRPO_PASSK
REINFORCE_PLUS_PLUS
RLOO
OPO
REINFORCE_PLUS_PLUS_BASELINE
GPG | +| Reward | all | diff --git a/code/RL_model/verl/verl_train/docs/advance/placement.rst b/code/RL_model/verl/verl_train/docs/advance/placement.rst new file mode 100644 index 0000000000000000000000000000000000000000..43ba761f76d86591d31b447c0ac5140149dd1082 --- /dev/null +++ b/code/RL_model/verl/verl_train/docs/advance/placement.rst @@ -0,0 +1,13 @@ +Ray API Design Tutorial +======================================= + +Last updated: 10/30/2024. + +We provide a tutorial for our Ray API design, including: + +- Ray basic concepts +- Resource Pool and RayWorkerGroup +- Data Dispatch, Execution and Collection +- Initialize the RayWorkerGroup and execute the distributed computation in the given Resource Pool + +See details in `tutorial.ipynb `_. \ No newline at end of file diff --git a/code/RL_model/verl/verl_train/docs/advance/ppo_lora.rst b/code/RL_model/verl/verl_train/docs/advance/ppo_lora.rst new file mode 100644 index 0000000000000000000000000000000000000000..5317f9fb15b1664b5e57d1a0daafee5b93365193 --- /dev/null +++ b/code/RL_model/verl/verl_train/docs/advance/ppo_lora.rst @@ -0,0 +1,208 @@ +RL(HF) algorithms with LoRA Support +=========================================== + +Last updated: 12/17/2025. + +We support LoRA (Low-Rank Adaptation) for reinforcement learning algorithms such as PPO, GRPO, and others. + +LoRA is a parameter-efficient fine-tuning technique that injects trainable low-rank matrices into pre-trained weights (typically linear layers). This reduces memory footprint and compute cost, making it possible to fine-tune large models with limited hardware. + +The benefits this brings include: + +- reinforcement learning with very large models (e.g. 70B+) with modest hardware (e.g. 
8x80G GPUs), +- enable larger batch sizes due to reduced memory usage, +- simplify model transfer and deployment, as only LoRA adapters need to be saved, +- Combine with techniques like `SLoRA `_ or `CCoE `_ to serve multiple LoRA adapters efficiently + +This guide explains how to enable LoRA in RL training and configure related parameters. + +FSDP Backend Usage Guide +------------------------ + +.. note:: + + This section applies to **FSDP/FSDP2 backend only**. For Megatron backend, see the :ref:`megatron-lora` section below. + +1. Lora is available in the `verl.trainer.ppo.ray_trainer.RayPPOTrainer`. Examples are provided via the `verl.trainer.main_ppo` entry point. + +2. Currently, LoRA is supported via huggingface peft, only with fsdp/fsdp2 and vllm backend (sglang support coming soon). + +- `strategy=fsdp` or `strategy=fsdp2` +- `rollout.name=vllm` + +3. Required configurations for LoRA: + +- `actor_rollout_ref.model.lora_rank`: int, set to a reasonable value greater than 0 (e.g., 8, 16, 32, 64) +- `actor_rollout_ref.model.lora_alpha`: float, the alpha term in LoRA +- `actor_rollout_ref.rollout.load_format="safetensors"`: required. This enables vLLM to load the base model. +- `actor_rollout_ref.model.target_modules`: the target modules for LoRA. Typically set to "all-linear". + +4. Optional configurations for LoRA: + +- `actor_rollout_ref.model.lora_adapter_path`: string, path to a pretrained LoRA adapter directory. + If provided, loads existing adapter instead of creating new one. Enables multi-stage training from previously saved adapters. + Directory need contain `adapter_model.safetensors` and `adapter_config.json`. + +5. Recommend options: + +- `actor_rollout_ref.model.use_shm=True`: preload the model into `/dev/shm` to improve model loading speed. +- `actor_rollout_ref.rollout.layered_summon=True`: this enables the actor-model to gather the FSDP shards per layers when synchronizing the LoRA Adapter to vLLM, thereby reducing GPU peak memory. 
Recommended if the model is very large (70B+) or the GPU memory is limited (< 48GB) + +.. _megatron-lora: + +Megatron Backend Usage Guide +---------------------------- + +.. warning:: + + The FSDP-specific config options are **NOT applicable** to Megatron backend, and they will be ignored if set. Only options listed under ``lora`` key are applicable: + + - ``actor_rollout_ref.model.lora.*`` + - ``critic.model.lora.*`` + +You need to install and enable Megatron-Bridge for Megatron LoRA support. + +Make sure you use Megatron-Bridge later than 0.2.0, and we recommended using `this commit `_ or later for proper support, and use the following settings to enable Megatron-Bridge: + +- ``actor_rollout_ref.actor.megatron.use_mbridge=True`` +- ``actor_rollout_ref.actor.megatron.vanilla_mbridge=False`` + +**Key Differences from FSDP LoRA:** + +1. **LoRA Implementation**: Verl Megatron backend uses Megatron-Bridge's native LoRA implementation, which differs from HuggingFace PEFT. + +2. **Weight Sync / Refit Mechanism**: Currently, Megatron-Bridge can support syncing weights by either merging LoRA adapters into the base model weights before transferring to vLLM (for better inference speed but more refit time and potential precision loss), as well as loading separate adapters. + +**Configuration for Megatron LoRA:** + +.. code-block:: yaml + + actor_rollout_ref: + model: + lora: + # LoRA type: "lora", "vlm_lora", "canonical_lora", or "dora" + type: lora + + # whether to sync weights / refit by either merging LoRA adapters into the base model weights before transferring to vLLM (for better inference speed but more refit time and potential precision loss). If this is False, it will load separate adapters. + merge: False + + # LoRA rank (Dimension of the low-rank projection space.). Set to 0 to disable LoRA + rank: 0 + + # Weighting factor for the low-rank projection. Defaults to 32 + alpha: 32 + + # Dropout rate for the low-rank projection. 
Defaults to 0.0 + dropout: 0.0 + + # A list of module names to apply LoRA to. + # For fused LoRA, Defaults to all linear layers ['linear_qkv', 'linear_proj', 'linear_fc1', 'linear_fc2']. + # For canonical LoRA: ["linear_q", "linear_k", "linear_v", "linear_proj", "linear_fc1_up", "linear_fc1_gate", "linear_fc2"] + # - 'linear_qkv': Apply LoRA to the fused linear layer used for query, key, and value projections in self-attention + # - 'linear_proj': Apply LoRA to the linear layer used for projecting the output of self-attention + # - 'linear_fc1': Apply LoRA to the first fully-connected layer in MLP + # - 'linear_fc2': Apply LoRA to the second fully-connected layer in MLP + # Target modules can also contain wildcards. For example, you can specify + # target_modules=['*.layers.0.*.linear_qkv', '*.layers.1.*.linear_qkv'] to add LoRA to only linear_qkv on the first two layers + # + # Note: + # For MLA (e.g., DeepSeek), you should use ["linear_kv_down_proj","linear_kv_up_proj","linear_q_down_proj","linear_q_up_proj","linear_q_proj"] + # Instead of "linear_qkv" or ["linear_q","linear_k","linear_v"] + # By default, MoE routers are excluded from LoRA adaptation, and you will need to specify "router" in target_modules to include them. + target_modules: + - linear_qkv + - linear_proj + - linear_fc1 + - linear_fc2 + + # A list of module names not to apply LoRa to. It will match all nn.Linear & nn.Linear-adjacent modules whose name + # does not match any string in exclude_modules. If used, will require target_modules to be empty list or None + exclude_modules: [] + + # Position for applying dropout, can be 'pre' (before the low-rank projection) or 'post' (after). Defaults to 'pre' + dropout_position: pre + + # Initialization method for the low-rank matrix A. Defaults to "xavier". + lora_A_init_method: xavier + + # Initialization method for the low-rank matrix B. Defaults to "zero". 
+ lora_B_init_method: zero + + # Enables the experimental All-to-All (A2A) communication strategy. Defaults to False + a2a_experimental: False + + # Parameter data type for LoRA weights. Default to null, which will use model's dtype. + dtype: null + + # Path to pre-trained LoRA adapter weights (null to train from scratch) + adapter_path: null + + # VLMLoRA additionally allows the user to specify whether the language or vision models should be frozen. + # For example, a common finetuning workload for multimodal models is to apply adapters to language model and fully + # finetune the vision model. + freeze_vision_model: True + freeze_vision_projection: True + freeze_language_model: True + +LoRA training experiment with Qwen3-8B on 8 * H200 single node comparing FSDP and Megatron backend (script adapted from examples/grpo_trainer/run_qwen2-7b_math_megatron_lora.sh): + +.. image:: https://github.com/user-attachments/assets/0482f423-01a3-4e52-a7ee-8b9cd79b7b1a +.. image:: https://github.com/user-attachments/assets/6ce10400-8164-47d8-90a6-c1bf002fb9e8 +.. image:: https://github.com/user-attachments/assets/092d3a43-4eba-425e-a584-8d83c1f02de4 + + +Best Practices and Notes +------------------------- + +1. **Learning rate**: it is recommended to increase the value of learning rate by an order of magnitude. + +2. **LoRA Rank**: + +- Too small a rank can hurt convergence. +- LoRA rank recommendation from @thelongestusernameofall: + + - A very small lora_rank can lead to slower convergence or worse training performance. It is recommended to set lora_rank to be>=32. Tests have shown that for a 0.5B model, with lora_rank=32,the training convergence speed and final performance are almost identical to non-LoRA training + - For a 32B model,with lora_rank=128,the training convergence speed and final performance are also almost identical to non-LoRA training. + - More comprehensive reference results are coming soon. + +.. 
image:: https://github.com/eric-haibin-lin/verl-community/blob/f2b80b8b26829124dd393b7a795a0640eff11644/docs/lora.jpg?raw=true + +3. **FSDP-Specific:** Reference configuration for RL training with the Qwen2.5-72B model using 8 x 80GB GPUs (increase lora_rank if needed): + +.. code-block:: + + data.train_batch_size=64 \ + actor_rollout_ref.model.use_shm=True \ + actor_rollout_ref.model.lora_rank=32 \ + actor_rollout_ref.model.lora_alpha=32 \ + actor_rollout_ref.model.target_modules=all-linear \ + actor_rollout_ref.actor.optim.lr=3e-5 \ + actor_rollout_ref.actor.fsdp_config.fsdp_size=8 \ + actor_rollout_ref.actor.fsdp_config.param_offload=True \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \ + actor_rollout_ref.rollout.tensor_model_parallel_size=8 \ + actor_rollout_ref.rollout.name=vllm \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \ + actor_rollout_ref.rollout.n=5 \ + actor_rollout_ref.rollout.max_num_seqs=64 \ + actor_rollout_ref.rollout.max_model_len=1536 \ + actor_rollout_ref.rollout.max_num_batched_tokens=1536 \ + actor_rollout_ref.rollout.load_format=safetensors \ + actor_rollout_ref.rollout.layered_summon=True \ + actor_rollout_ref.ref.fsdp_config.param_offload=True \ + actor_rollout_ref.actor.ulysses_sequence_parallel_size=1 \ + +Example Scripts +------------------- + +For end-to-end examples, refer to the scripts below: + +**FSDP Examples:** + +- LoRA training from scratch: examples/grpo_trainer/run_qwen2_5-3b_gsm8k_grpo_lora.sh +- LoRA training from adapter path: examples/grpo_trainer/run_qwen2_5-3b_gsm8k_grpo_lora_from_adapter.sh + +**Megatron Examples:** + +- LoRA training with Dense: examples/grpo_trainer/run_qwen2-7b_math_megatron_lora.sh +- LoRA training with MoE: examples/grpo_trainer/run_qwen3moe-30b_megatron_lora.sh diff --git a/code/RL_model/verl/verl_train/docs/advance/reward_loop.rst b/code/RL_model/verl/verl_train/docs/advance/reward_loop.rst new file mode 100644 index 
0000000000000000000000000000000000000000..cb755d9c6044e14f59f1e88d476fa4dd526d3260 --- /dev/null +++ b/code/RL_model/verl/verl_train/docs/advance/reward_loop.rst @@ -0,0 +1,301 @@ +Reward Loop +=========== + +.. _yyding: https://yyding1.github.io + +Author: `Yuyang Ding `_ + +Last updated: 12/20/2025. + +.. warning:: + Reward Loop is ready for use, but the API may change in future releases. + User can set ``reward_model.use_reward_loop=True`` or ``False`` to control whether to enable reward loop. + +Reward Loop is designed to support flexible and user-friendly reward computation, with most implementation in ``verl/experimental/reward_loop``. + +Compared with the previous reward mechanism, the Reward Loop offers the following key features: + +1. provides a more flexible and user-friendly design for reward-model settings, enabling hybrid reward scenarios where multiple reward sources can be seamlessly integrated. +2. implements asynchronous reward computation instead of the previous batch-based computation, improving efficiency for both rule-based rewards and reward-model-based scenarios. + +Hybrid Reward Scenarios +----------------------- + +Reward Loop covers all typical reward-computation scenarios. + +- **Rule-based Reward**: The reward is determined by predefined rules, e.g., checking whether the predicted answer matches the ground truth via simple string matching. +- **Discriminative Reward Model (DisRM)**: The reward is produced by a specified discriminative reward model, such as ``Skywork/Skywork-Reward-Llama-3.1-8B-v0.2``. +- **Generative Reward Model (GenRM)**: The reward is obtained using a generative reward model, for example ``dyyyyyyyy/FAPO-GenRM-4B``. +- **Hybrid Reward Scenarios**: Reward Loop provides interfaces for plugging in reward models, allowing users to define custom reward logic based on their needs (e.g., combining rule-based methods with GenRM). 
+ +Rule-based Reward +~~~~~~~~~~~~~~~~~ + +If ``custom_reward_function`` is not provided, the reward loop will fall back to the default rule-based reward function. +Otherwise, only the user-defined reward function will be used. The files under ``verl/utils/reward_score/`` provide some examples. + +Reward Loop supports both synchronous and asynchronous user-defined reward functions. It automatically detects the function type and executes it accordingly, ensuring that reward computation remains non-blocking and efficient. + +Discriminative Reward Model (DisRM) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +For scenarios involving a discriminative reward model, users should provide ``reward_model.model.path`` to specify the reward model. + +The Reward Loop will pass the question and the model rollout as inputs to the reward model and obtain a reward score from its output. + +Generative Reward Model (GenRM) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +For generative reward model scenarios, users need to specify both ``reward_model.model.path`` and ``custom_reward_function``. + +The custom reward function should implement the following components: + +- Convert the question and the model rollout into a GenRM input prompt using a custom prompt template. +- Invoke the GenRM to perform generation with custom sampling parameters. For this purpose, the Reward Loop provides an HTTP interface (i.e., ``reward_router_address``) for interacting with GenRM. +- Parse the GenRM output using a custom parser and extract the reward score. + +As these steps are highly customizable and task-dependent, we offer this flexibility entirely to the user-defined reward function. + +Below we provide an example of a custom reward function using GenRM. + +.. 
code:: python + + async def compute_score_gsm8k( + data_source: str, + solution_str: str, + ground_truth: str, + extra_info: dict, + reward_router_address: str, # an HTTP router endpoint provided by Reward Loop + reward_model_tokenizer: PreTrainedTokenizer, + ): + """Compute the reward score.""" + + # Step 1: Prepare prompt and request payload + grm_prompt = GRM_PROMPT_TEMPLATE.format(problem=extra_info["question"], solution=solution_str) + messages = [{"role": "user", "content": grm_prompt}] + sampling_params = {"temperature": 0.7, "top_p": 0.8, "max_tokens": 4096} + chat_complete_request = {"messages": messages, **sampling_params} + + # Step 2: Send async request to the reward model + # here, chat_complete sends async http request to the router address + result = await chat_complete( + router_address=reward_router_address, + chat_complete_request=chat_complete_request, + ) + + # Step 3: Parse model response and extract score + grm_response = result.choices[0].message.content.strip() + try: + score_str = grm_response.split("\n\n")[-1].strip() + score = int(score_str) + except Exception: + score = 0 + + return {"score": score} + +Hybrid Reward Scenarios +~~~~~~~~~~~~~~~~~~~~~~~ + +For more complex application settings, such as combining rule-based rewards with GenRM, or mixing rule-based rewards with DisRM, users can also achieve this by specifying the ``reward_model.model.path`` together with the ``custom_reward_function``. +The implementation of the customized reward function follows the same pattern as illustrated above. + +A runnable and reproducible example that demonstrates how to use a rule-based reward function together with a GenRM is provided in the ``recipe/fapo`` directory for reference. Welcome to use and cite. + +Architecture Design +------------------- + +Reward Loop supports multiple execution modes for reward training: + +- **Colocate Mode**: The reward model shares the same resource pool as the actor/rollout/reference models. 
In this setup, all rollouts must complete first, after which the reward model is awakened to perform inference. +- **Standalone Mode**: The reward model runs on a separate resource pool, independent from the actor/rollout/reference models. In this setup, each sample is evaluated by the reward model immediately after its rollout finishes. + +.. image:: https://github.com/yyDing1/verl-materials/blob/main/reward_loop.svg?raw=true + +RewardLoopWorker +~~~~~~~~~~~~~~~~~ + +The ``RewardLoopWorker`` is responsible for handling batch-level reward computation, operating in an asynchronous manner. + +.. image:: https://github.com/yyDing1/verl-materials/blob/main/reward_loop_worker.svg?raw=true + +For each sample, the reward is computed according to the following logic: + +- if ``custom_reward_function`` is provided, we directly use user-customized reward function +- if ``custom_reward_function`` is not provided: + - **reward model is not enabled**: use default rule-based reward function + - **reward model is discriminative**: compute reward score using disrm + - **reward model is generative**: this is not permitted (user-customized reward func **must be** provided) + +In most cases, we encourage users to define and use their own customized reward functions. + +``RewardLoopWorker`` will initialize a ``RewardManager`` via ``_init_reward_fn()``. +Then the batch reward computation request of ``compute_score_batch`` will be processed asynchronously. + +.. 
code:: python + + @ray.remote + class RewardLoopWorker: + def __init__(self, config: DictConfig, reward_router_address: str = None): + self.config = config + self.reward_router_address = reward_router_address + self._init_reward_fn() + + def _init_reward_fn(self): + input_tokenizer_local_path = copy_to_local(self.config.actor_rollout_ref.model.path) + self.input_tokenizer = hf_tokenizer(input_tokenizer_local_path, trust_remote_code=True) + self.reward_model_tokenizer = None + if self.config.reward_model.enable: + reward_model_tokenizer_local_path = copy_to_local(self.config.reward_model.model.path) + self.reward_model_tokenizer = hf_tokenizer(reward_model_tokenizer_local_path, trust_remote_code=True) + self.reward_fn = get_custom_reward_fn(self.config) + reward_manager_cls = get_reward_manager_cls(self.config.reward_model.reward_manager) + self.reward_loop = reward_manager_cls( + self.config, self.input_tokenizer, self.reward_fn, self.reward_router_address, self.reward_model_tokenizer + ) + + async def compute_score_batch(self, data: DataProto) -> list[dict]: + tasks = [] + for i in range(len(data)): + tasks.append(asyncio.create_task(self.compute_score(data[i : i + 1]))) + outputs = await asyncio.gather(*tasks) + return outputs + + async def compute_score(self, data: DataProto) -> dict: + assert len(data) == 1, "RewardLoopWorker only support single data item" + if self.config.custom_reward_function.path is not None: + # directly use user-customized reward function + return await self.reward_loop.run_single(data) + else: + if self.config.reward_model.enable: + # we assume the rm is disrm + # genrm must set custom_reward_function + return await self.compute_score_disrm(data) + else: + return await self.reward_loop.run_single(data) + +RewardManager +~~~~~~~~~~~~~ + +Reward Loop refactors the previous reward manager, which processed rewards sequentially on batched inputs. 
+Instead, the Reward Loop performs reward computation asynchronously and in parallel at the per-sample level. + +In the ``RewardManager`` of Reward Loop, we implement a ``run_single`` function to compute the score for single sample. All the reward functions are executed by ``compute_score_fn``. The input should be a ``DataProto`` containing only one item. + +.. code:: python + + @register("naive") + class NaiveRewardManager(RewardManagerBase): + async def run_single(self, data: DataProto) -> dict: + assert len(data) == 1, "Only support single data item" + ... + +Commonly used reward managers, such as ``DAPORewardManager`` has been implemented in reward loop. +In addition, ``RateLimitRewardManager`` is also ready for use for external API-based reward computation scenarios like ChatGPT. + +Users can also customize their own ``RewardManager``, by adding the ``@register`` decorator, inheriting from ``RewardManagerBase``, and implementing the ``run_single`` function. +See ``verl/experimental/reward_manager/*`` for reference. + +.. code:: python + + @register("user_costomized") + class UserCostomizedRewardManager(RewardManagerBase): + async def run_single(self, data: DataProto) -> dict: + assert len(data) == 1, "Only support single data item" + # your own reward manager + ... + +After defining it, users can specify their custom reward manager by setting ``reward_model.reward_manager=user_costomized``. + +RewardLoopManager +~~~~~~~~~~~~~~~~~ + +To enable parallel reward computation, the Reward Loop launches multiple reward workers that handle reward computation requests concurrently. + +In **standalone mode**, we directly launch one ``RewardLoopWorker`` for each ``AgentLoopWorker`` to handle reward computation independently. + +In **colocate mode**, we launch a ``RewardLoopManager`` to + +1. launch reward model if enabled +2. manage multiple ``RewardLoopWorker`` instances to parallelize reward computation. 
+ +Users can specify the number of workers by setting ``reward_model.num_workers`` in colocate mode. + +.. code:: python + + class RewardLoopManager: + """ + RewardLoopManager run in single controller. + This class will create reward loop workers and manage them. + RewardLoopManager will deprecate fsdp/megatron RewardModelWorker in the future. + """ + def __init__(self, config: DictConfig, rm_resource_pool: RayResourcePool = None): + self.config = config + if self.config.reward_model.enable: + self.reward_model_manager = RewardModelManager(config.reward_model, rm_resource_pool) + self.reward_router_address = self.reward_model_manager.get_router_address() + else: + self.reward_model_manager = None + self.reward_router_address = None + + self._init_reward_loop_workers() + + def _init_reward_loop_workers(self): + self.reward_loop_workers = [] + num_workers = self.config.reward_model.get("num_workers", 1) + node_ids = [node["NodeID"] for node in ray.nodes() if node["Alive"] and node["Resources"].get("CPU", 0) > 0] + + for i in range(num_workers): + # Round-robin scheduling over the all nodes + node_id = node_ids[i % len(node_ids)] + self.reward_loop_workers.append( + RewardLoopWorker.options( + name=f"reward_loop_worker_{i}", + scheduling_strategy=ray.util.scheduling_strategies.NodeAffinitySchedulingStrategy( + node_id=node_id, + soft=True, + ), + ).remote(self.config, self.reward_router_address) + ) + + def compute_rm_score(self, data: DataProto) -> DataProto: + """ + Compute reward score for the given data. + """ + ... + + +RewardModelManager +~~~~~~~~~~~~~~~~~~ + +To support flexible and scalable reward model computation, Reward Loop implement a reward router that coordinates requests among multiple reward model servers. + +Each reward model runs as an independent server and is registered with the router. +This router will forward the requests to the registered reward servers with load balancing and return the results. 
+This design allows us to expose a single unified router address to user-defined reward functions, enabling them to access various reward models seamlessly through the same interface. + +.. image:: https://github.com/yyDing1/verl-materials/blob/main/reward_loop_full.svg?raw=true + +.. code:: python + + class RewardModelManager: + """Reward model manager.""" + + def __init__( + self, + config: RewardModelConfig, + resource_pool: RayResourcePool = None, + ): + """ + Initialize the reward model manager. + + Args: + config (RewardModelConfig): Reward model configuration. + resource_pool (RayResourcePool, optional): Resource pool. Defaults to None. + """ + self.config = config + self.resource_pool = resource_pool + self._initialize_llm_servers() + self._initialize_router() + assert self.config.rollout.skip_tokenizer_init is False, "Reward model should not skip tokenizer init." + if self.config.rollout.free_cache_engine: + self.sleep() diff --git a/code/RL_model/verl/verl_train/docs/advance/rollout_skip.rst b/code/RL_model/verl/verl_train/docs/advance/rollout_skip.rst new file mode 100644 index 0000000000000000000000000000000000000000..1839beed3e46805293cc7cdf9836571b4525c7fe --- /dev/null +++ b/code/RL_model/verl/verl_train/docs/advance/rollout_skip.rst @@ -0,0 +1,61 @@ +RolloutSkip Function Usage Documentation +======================================== + +Last updated: 08/01/2025. + +Applicable Scenarios +-------------------- + +The RolloutSkip functionality is designed to accelerate the rollout process in reinforcement learning training by caching and reusing previously generated sequences. This feature is particularly useful when: + +1. You need to repeatedly run experiments with the same configuration + +2. 
You want to save time by avoiding redundant sequence generation to come close to the optimal policy + + +API and Usage Example +---------------------- + +2.1 Trainer Adaptation +~~~~~~~~~~~~~~~~~~~~~~ + +Both`RayDAPOTrainer()` (in `verl/recipe/dapo/dapo_ray_trainer.py`) and `RayPPOTrainer()`(in `verl/trainer/ppo/ray_trainer.py``) have already been adapted. + +This is an example of how to patch rollout_skip in RayPPOTrainer. + +.. code-block:: python + + #* Import the RolloutSkip class + from verl.utils.rollout_skip import RolloutSkip + + ... + class RayPPOTrainer: + ... + def fit(self): + ... + + #* Add code as follow: + rollout_skip = RolloutSkip(self.config, self.actor_rollout_wg) + rollout_skip.wrap_generate_sequences() + + ... + + for epoch in range(self.config.trainer.total_epochs): + for batch_dict in self.train_dataloader: + ... + +2.2 Basic Configuration +~~~~~~~~~~~~~~~~~~~~~~~ + +Then, you should add the following parameters to your config to enable the RolloutSkip feature: + +.. code-block:: bash + + actor_rollout_ref.rollout.skip_rollout=True \ + actor_rollout_ref.rollout.skip_dump_dir="/tmp/rollout_dump" \ + + +Note: + +1. The `skip_dump_dir` is the directory where the cached sequences will be stored. Ensure that this directory is writable and accessible by your training process. And make sure that `skip_dump_dir` is not relative path because ray will store the data in `/tmp/ray/session_/` and the relative path will not be found in the worker. +2. The dumped data path follows this naming pattern `{experiment_name}_{project_name}_TrainGBS{train_gbs}__InferGBS{gen_gbs}__N{n}`, once you change the `experiment_name`, `project_name`, `train_gbs`, `gen_gbs`, or `n`, the cached data will be stored in a new directory. 
diff --git a/code/RL_model/verl/verl_train/docs/advance/rollout_trace.rst b/code/RL_model/verl/verl_train/docs/advance/rollout_trace.rst new file mode 100644 index 0000000000000000000000000000000000000000..5801353cb8c64ed741e0f2ecc54c4d5c0300f260 --- /dev/null +++ b/code/RL_model/verl/verl_train/docs/advance/rollout_trace.rst @@ -0,0 +1,146 @@ +Trace Function Usage Instructions +======================================== + +Last updated: 07/10/2025. + +Applicable Scenarios +-------------------- + +Agentic RL involves multiple turns of conversations, tool invocations, and user interactions during the rollout process. During the Model Training process, it is necessary to track function calls, inputs, and outputs to understand the flow path of data within the application. The Trace feature helps, in complex multi-round conversations, to view the transformation of data during each interaction and the entire process leading to the final output by recording the inputs, outputs, and corresponding timestamps of functions, which is conducive to understanding the details of how the model processes data and optimizing the training results. + +The Trace feature integrates commonly used Agent trace tools, including wandb weave and mlflow, which are already supported. Users can choose the appropriate trace tool according to their own needs and preferences. Here, we introduce the usage of each tool. + + +Trace Parameter Configuration +----------------------------- + +- ``actor_rollout_ref.rollout.trace.backend=mlflow|weave`` # the trace backend type +- ``actor_rollout_ref.rollout.trace.token2text=True`` # To show decoded text in trace view +- ``actor_rollout_ref.rollout.trace.max_samples_per_step_per_worker=N`` # Limit traces per worker (optional) + +Limiting Trace Volume +~~~~~~~~~~~~~~~~~~~~~~ + +By default, all samples are traced, which can generate large amounts of data and incur significant costs with trace backends like Weave or MLflow. 
To limit trace volume while maintaining representative coverage, use ``max_samples_per_step_per_worker``. + +Example configuration: + +.. code-block:: yaml + + actor_rollout_ref: + rollout: + trace: + backend: weave + token2text: False + max_samples_per_step_per_worker: 5 # Each worker traces 5 random samples + +Each agent loop worker independently selects up to N unique samples to trace per training step. For GRPO (``n > 1``), all rollouts for selected samples are traced. Total traces per step = max_samples_per_step_per_worker * num_workers * n. + +Example: With 4 workers, max_samples_per_step_per_worker=5, and GRPO n=4, you get 4 * 5 * 4 = 80 traces per step instead of tracing all samples. Set to null (default) to trace all samples. + + +Glossary +-------- + ++----------------+------------------------------------------------------------------------------------------------------+ +| Object | Explaination | ++================+======================================================================================================+ +| trajectory | A complete multi-turn conversation includes: | +| | 1. LLM output at least once | +| | 2. Tool Call | ++----------------+------------------------------------------------------------------------------------------------------+ +| step | The training step corresponds to the global_steps variable in the trainer | ++----------------+------------------------------------------------------------------------------------------------------+ +| sample_index | The identifier of the sample, defined in the extra_info.index of the dataset. It is usually a number,| +| | but may also be a uuid in some cases. | ++----------------+------------------------------------------------------------------------------------------------------+ +| rollout_n | In the GROP algorithm, each sample is rolled out n times. rollout_n represents the serial number of | +| | the rollout. 
| ++----------------+------------------------------------------------------------------------------------------------------+ +| validate | Whether the test dataset is used for evaluation? | ++----------------+------------------------------------------------------------------------------------------------------+ + +Rollout trace functions +----------------------- + +There are 2 functions used for tracing: + +1. ``rollout_trace_op``: This is a decorator function used to mark the functions to trace. In default, only few method has it, you can add it to more functions to trace more infor. +2. ``rollout_trace_attr``: This function is used to mark the entry of a trajectory and input some info to trace. If you add new type of agent, you may need to add it to enable trace. + + +Usage of wandb weave +-------------------- + +1.1 Basic Configuration +~~~~~~~~~~~~~~~~~~~~~~~ + +1. Set the ``WANDB_API_KEY`` environment variable +2. Configuration Parameters + + 1. ``actor_rollout_ref.rollout.trace.backend=weave`` + 2. ``trainer.logger=['console', 'wandb']``: This item is optional. Trace and logger are independent functions. When using Weave, it is recommended to also enable the wandb logger to implement both functions in one system. + 3. ``trainer.project_name=$project_name`` + 4. ``trainer.experiment_name=$experiment_name`` + 5. ``actor_rollout_ref.rollout.mode=async``: Since trace is mainly used for agentic RL, need to enable agent toop using async mode for either vllm or sglang. + +Note: +The Weave Free Plan comes with a default monthly network traffic allowance of 1GB. During the training process, the amount of trace data generated is substantial, reaching dozens of gigabytes per day, so it is necessary to select an appropriate wandb plan. + + +1.2 View Trace Logs +~~~~~~~~~~~~~~~~~~~ + +After executing the training, on the project page, you can see the WEAVE sidebar. Click Traces to view it. + +Each Trace project corresponds to a trajectory. 
You can filter and select the trajectories you need to view by step, sample_index, rollout_n, and experiment_name. + +After enabling token2text, prompt_text and response_text will be automatically added to the output of ToolAgentLoop.run, making it convenient to view the input and output content. + +.. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/weave_trace_list.png?raw=true + +1.3 Compare Trace Logs +~~~~~~~~~~~~~~~~~~~~~~ + +Weave can select multiple trace items and then compare the differences among them. + +.. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/weave_trace_compare.png?raw=true + +Usage of mlflow +--------------- + +1. Basic Configuration +~~~~~~~~~~~~~~~~~~~~~~ + +1. Set the ``MLFLOW_TRACKING_URI`` environment variable, which can be: + + 1. Http and https URLs corresponding to online services + 2. Local files or directories, such as ``sqlite:////tmp/mlruns.db``, indicate that data is stored in ``/tmp/mlruns.db``. When using local files, it is necessary to initialize the file first (e.g., start the UI: ``mlflow ui --backend-store-uri sqlite:////tmp/mlruns.db``) to avoid conflicts when multiple workers create files simultaneously. + +2. Configuration Parameters + + 1. ``actor_rollout_ref.rollout.trace.backend=mlflow`` + 2. ``trainer.logger=['console', 'mlflow']``. This item is optional. Trace and logger are independent functions. When using mlflow, it is recommended to also enable the mlflow logger to implement both functions in one system. + 3. ``trainer.project_name=$project_name`` + 4. ``trainer.experiment_name=$experiment_name`` + + +2. View Log +~~~~~~~~~~~ + +Since ``trainer.project_name`` corresponds to Experiments in mlflow, in the mlflow view, you need to select the corresponding project name, then click the "Traces" tab to view traces. 
Among them, ``trainer.experiment_name`` corresponds to the experiment_name of tags, and tags corresponding to step, sample_index, rollout_n, etc., are used for filtering and viewing. + +For example, searching for ``"tags.step = '1'"`` can display all trajectories of step 1. + +.. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/mlflow_trace_list.png?raw=true + +Opening one of the trajectories allows you to view each function call process within it. + +After enabling token2text, prompt_text and response_text will be automatically added to the output of ToolAgentLoop.run, making it convenient to view the content. + +.. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/mlflow_trace_view.png?raw=true + +Note: + +1. mlflow does not support comparing multiple traces +2. rollout_trace can not associate the mlflow trace with the run, so the trace content cannot be seen in the mlflow run logs. diff --git a/code/RL_model/verl/verl_train/docs/advance/rope.rst b/code/RL_model/verl/verl_train/docs/advance/rope.rst new file mode 100644 index 0000000000000000000000000000000000000000..9463549e47d055552a273e83a851fc76f93f9d1a --- /dev/null +++ b/code/RL_model/verl/verl_train/docs/advance/rope.rst @@ -0,0 +1,39 @@ +RoPE Scaling override +======================================= + +Last updated: 05/14/2025. + +Some models such as `Qwen/Qwen2.5-7B-Instruct `_ support RoPE Scaling but don't have it defined in their config.json file. +For example, this model supports this configuration: + +.. code:: python + + { + ..., + "rope_scaling": { + "factor": 4.0, + "original_max_position_embeddings": 32768, + "type": "yarn" + } + } + + + +In order to support a longer context for such models, you must override the model configs when starting the trainer. + +PPO example: + +.. 
code:: bash + + +actor_rollout_ref.model.override_config.rope_scaling.type=yarn \ + +actor_rollout_ref.model.override_config.rope_scaling.factor=4.0 \ + +actor_rollout_ref.model.override_config.rope_scaling.original_max_position_embeddings=32768 \ + + +And for the critic model + +.. code:: bash + + +critic.model.override_config.rope_scaling.type=yarn \ + +critic.model.override_config.rope_scaling.factor=4.0 \ + +critic.model.override_config.rope_scaling.original_max_position_embeddings=32768 \ diff --git a/code/RL_model/verl/verl_train/docs/algo/baseline.md b/code/RL_model/verl/verl_train/docs/algo/baseline.md new file mode 100644 index 0000000000000000000000000000000000000000..ca821865f44f9a3697688d43d80f501d9a771df7 --- /dev/null +++ b/code/RL_model/verl/verl_train/docs/algo/baseline.md @@ -0,0 +1,73 @@ +# Algorithm Baselines + +Last updated: 06/18/2025. + +## Math related datasets + +### GSM8k + +Assuming GSM8k/math dataset is preprocessed via: + +```bash +python3 examples/data_preprocess/*.py +``` + +Refer to the table below to reproduce RL training from different pre-trained checkpoints. Below is the performance on the GSM8k dataset if not specified otherwise. More comprehensive benchmark results areavailable in the recipe folder. 
+ +| Hardware | Model | Method | Test score | Details | +| ---------- | -------------------------------- | --------------- | ------------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| NVIDIA GPU | google/gemma-2-2b-it | hf checkpoint | 23.9 | [Huggingface](https://huggingface.co/google/gemma-2-2b-it#benchmark-results) | +| NVIDIA GPU | google/gemma-2-2b-it | SFT | 52.06 | [command and logs](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/gemma-2-2b-it-sft-0.411.log) | +| NVIDIA GPU | google/gemma-2-2b-it | SFT + PPO | 64.02 | [command and logs](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/gemma-2-2b-it-ppo-bsz512_4-prompt1024-resp-512-0.640.log), [wandb](https://api.wandb.ai/links/verl-team/h7ux8602) | +| NVIDIA GPU | Qwen/Qwen2.5-0.5B-Instruct | hf checkpoint | 49.6 | [Qwen blog](https://qwen.ai/blog?id=qwen2.5-llm) | +| NVIDIA GPU | Qwen/Qwen2.5-0.5B-Instruct | PPO | 56.7 | [command and log](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/Qwen2.5-0.5B-bsz256_2-prompt1024-resp512-0.567.log) | +| NVIDIA GPU | Qwen/Qwen2.5-0.5B-Instruct | PRIME | 58.7 | [script](https://github.com/verl-project/verl-recipe/blob/main//prime/run_prime_qwen.sh), [wandb](https://api.wandb.ai/links/zefan-wang-thu-tsinghua-university/rxd1btvb) | +| NVIDIA GPU | Qwen/Qwen2.5-0.5B-Instruct | GRPO-LoRA | 54.3 | [command and logs](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/Qwen2.5-0.5B-bsz64_2-prompt512-resp1024-lorarank32-score0.543.log) | +| NVIDIA GPU | Qwen/Qwen2.5-1.5B-Instruct | GRPO-LoRA | 77.9 | [command and logs](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/Qwen2.5-1.5B-bsz64_2-prompt512-resp1024-lorarank32-score0.779.log) | +| NVIDIA GPU | Qwen/Qwen2.5-3B-Instruct | GRPO-LoRA | 86.1 | [command 
and logs](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/Qwen2.5-3B-bsz64_2-prompt512-resp1024-lorarank32-score0.861.log) | +| NVIDIA GPU | deepseek-ai/deepseek-llm-7b-chat | PPO (Megatron) | 69.5 [1] | [log](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/deepseek-llm-7b-chat-megatron-bsz256_4-prompt512-resp512-0.695.log), [wandb](https://wandb.ai/verl-team/verl_megatron_gsm8k_examples/runs/10fetyr3) | +| NVIDIA GPU | Qwen/Qwen2-7B-Instruct | GRPO | 89 | [script](https://github.com/volcengine/verl/blob/a65c9157bc0b85b64cd753de19f94e80a11bd871/examples/grpo_trainer/run_qwen2-7b_seq_balance.sh) | +| NVIDIA GPU | Qwen/Qwen2-7B-Instruct | GRPO (FSDP2) | 89.8 | [log](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/qwen2-7b-fsdp2.log) | +| NVIDIA GPU | Qwen/Qwen2-7B-Instruct | GRPO (Megatron) | 89.6 | [log](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/qwen2-7b_math_megatron.log) | +| NVIDIA GPU | Qwen/Qwen2.5-7B-Instruct | ReMax | 97 | [script](https://github.com/eric-haibin-lin/verl/blob/main/examples/remax_trainer/run_qwen2.5-3b_seq_balance.sh), [wandb](https://wandb.ai/liziniu1997/verl_remax_example_gsm8k/runs/vxl10pln) | +| NVIDIA GPU | Qwen/Qwen2.5-7B-Instruct | SPPO | 65.6 (MATH) | [SPPO script](https://github.com/volcengine/verl-recipe/tree/main/sppo/README.md) | +| NVIDIA GPU | Qwen/Qwen2.5-7B-Instruct | GRPO-LoRA | 93.4 | [command and logs](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/Qwen2.5-7B-bsz64_8-prompt512-resp1024-lorarank32-score0.934.log) | +| NVIDIA GPU | Mixtral-8x22B-Instruct-v0.1 | Instruct model | 83.7 | [Qwen Blog](https://qwen.ai/blog?id=qwen2.5-llm) | +| NVIDIA GPU | Mixtral-8x22B-Instruct-v0.1 | RLOO (Megatron) | 92.3 | [wandb](https://api.wandb.ai/links/ppo_dev/sbuiuf2d) | +| NVIDIA GPU | Qwen/Qwen2.5-7B-Instruct | SPIN | 92 | [script](https://github.com/volcengine/verl-recipe/tree/main/spin/README.md) | +| NVIDIA GPU | 
Qwen/Qwen2-7B-Instruct | GPG | 88 | [log](https://github.com/diqiuzhuanzhuan/verldata/blob/main/run_logs/qwen2-7b_math.log), [wandb](https://wandb.ai/diqiuzhuanzhuan/verl_gpg_example_gsm8k_math/runs/ab86c4va) | +| NVIDIA GPU | Qwen/Qwen2-7B-Instruct | GPG (Megatron) | 88 | [log](https://github.com/diqiuzhuanzhuan/verldata/blob/main/run_logs/qwen2-7b_math_megatron.log), [wandb](https://wandb.ai/diqiuzhuanzhuan/verl_gpg_example_gsm8k_math/runs/yy8bheu8) | +| NVIDIA GPU | Qwen/Qwen2.5-VL-7B-Instruct | GRPO (Megatron) | 65.4 (GEO3k) | [script](https://github.com/volcengine/verl/blob/main/examples/grpo_trainer/run_qwen2_5_vl-7b-megatron.sh), [wandb](https://api.wandb.ai/links/megatron-core-moe-dev/1yngvkek) | +| AMD MI300 | deepseek-ai/deepseek-llm-7b-chat | PPO | 70.5 [1] | [log](https://github.com/yushengsu-thu/verl_training_log/blob/main/gsm8k/ppo_run_deepseek7b_llm.log) | +| AMD MI300 | deepseek-ai/deepseek-llm-7b-chat | GRPO | 71.4 [1] | [log](https://github.com/yushengsu-thu/verl_training_log/blob/main/gsm8k/grpo_run_deepseek7b_llm.log) | +| NVIDIA GPU | Qwen/Qwen2.5-14B-Instruct | GRPO-LoRA | 94.6 | [command and logs](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/Qwen2.5-14B-bsz64_8-prompt512-resp1024-lorarank32-score0.946.log) | +| NVIDIA GPU | Qwen/Qwen2.5-32B-Instruct | GRPO-LoRA | 95.8 | [command and logs](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/Qwen2.5-32B-bsz64_8-prompt512-resp1024-lorarank32-score0.958.log) | +| NVIDIA GPU | Qwen/Qwen2.5-72B-Instruct | GRPO-LoRA | 96.0 | [command and logs](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/Qwen2.5-72B-bs64_8-prompt512-resp1024-lorarank32-score0.960.log) | + +### DAPO math-17k + +- Training DAPO math-17k dataset: https://huggingface.co/datasets/BytedTsinghua-SIA/DAPO-Math-17k +- Testing: AIME'24: https://huggingface.co/datasets/BytedTsinghua-SIA/AIME-2024 + +Note: + +- For Qwen/Qwen2.5-Math-7B, we directly modify the max_position_embeddings 
to 32768 without observing performance degradation in order to train longer response length. + +| Hardware | Model | Method | Test score | Details | +| ---------- | -------------------------- | ----------------------- | ---------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| NVIDIA GPU | Qwen/Qwen2.5-Math-7B (32k) | DAPO | 36.3 | [command](https://github.com/verl-project/verl-recipe/blob/main//dapo/test_dapo_7b_math.sh), [logs](https://wandb.ai/verl-org/DAPO%20Reproduction%20on%20verl/runs/ow47vvon?nw=nwusertongyuxuan361) | +| NVIDIA GPU | Qwen/Qwen2.5-7B-Instruct | DAPO + Code Interpreter | 40.0 | [command](https://github.com/verl-project/verl-recipe/blob/main//retool/run_qwen2_7b_dapo.sh) | + +## Coding related datasets + +Below is the result on leetcode if not specified otherwise. + +| Hardware | Model | Method | Test score | Details | +| ---------- | ----------------------- | ------ | ---------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| NVIDIA GPU | PRIME-RL/Eurus-2-7B-SFT | RPIME | 36.1 | [script](https://github.com/verl-project/verl-recipe/blob/main//prime/run_prime_qwen_code.sh), [swanlab](https://swanlab.cn/@wangzefan/prime_example/runs/7f541qhspgmy8nmhdlx35/chart) | + +### Notes + +[1] During evaluation, we have only extracted answers following the format `"####"`. A more flexible answer extraction, longer response length, and better prompt engineering may lead to a higher score. + +[2] The default value of `actor_rollout_ref.actor.entropy_coeff` is set to `0.0` since verl 0.3.x on 2025-05-30, which is different from previous versions. 
diff --git a/code/RL_model/verl/verl_train/docs/algo/collabllm.md b/code/RL_model/verl/verl_train/docs/algo/collabllm.md new file mode 100644 index 0000000000000000000000000000000000000000..3279e0ff3a43b4154c9ee54ed80452ea997408e0 --- /dev/null +++ b/code/RL_model/verl/verl_train/docs/algo/collabllm.md @@ -0,0 +1,105 @@ +# Recipe: CollabLLM + +Last updated: 09/22/2025. + +> Open-Source Algorithm Implementation & Expriement Running: [Haiquan Chen](https://github.com/chenhaiq), [Shirley Wu](https://github.com/Wuyxin) + +🏠 [Homepage](https://aka.ms/CollabLLM) | 📝 [Paper](https://arxiv.org/pdf/2502.00640) | 🤗 [Datasets & Models](https://huggingface.co/collabllm) | ⭐️ [Original Implementation](https://github.com/Wuyxin/collabllm) + +`verl` provides a recipe for the Outstanding Paper at ICML 2025, **"CollabLLM: From Passive Responders to Active Collaborators"**. [CollabLLM](https://aka.ms/CollabLLM) is a unified fine-tuning framework that optimizes LLMs for effective and efficient multiturn collaboration with users. + +**Core Idea:** Models are rewarded based on how well their responses enable effective *future* collaboration with users. + +Paper Authors: [Shirley Wu](https://cs.stanford.edu/~shirwu/), [Michel Galley](https://www.microsoft.com/en-us/research/people/mgalley/), Baolin Peng, Hao Cheng, Gavin Li, Yao Dou, Weixin Cai, [James Zou](https://www.james-zou.com/), [Jure Leskovec](https://cs.stanford.edu/people/jure/), [Jianfeng Gao](https://www.microsoft.com/en-us/research/people/jfgao/) + + +--- +## Quick Start + +### 0. Environment +Make sure the required packages for `verl` are installed. Additionally, install `litellm` and export the required API keys. The API model will be used for user simulators and, optionally, LLM Judges (see the Configuration section below). + +### 1. Prepare Your Dataset + +First, process your dataset using the provided script (see example commands and usage in `process_dataset.py`): + +```bash +python process_dataset.py --dataset <> ... 
--dataset_type +``` + + +**Requirements:** +- Input: A Hugging Face multiturn dataset. Existing datasets: `collabllm/collabllm-multiturn-$DATASET`, with `DATASET` in one of [`math-hard(-large)`, `medium(-large)`, `bigcodebench(-large)`] (*-large are the datasets used in the CollabLLM paper) +- Example format: See [collabllm-multiturn-math-hard](https://huggingface.co/datasets/collabllm/collabllm-multiturn-math-hard) +- To generate your own dataset: Use [build_dataset.py](https://github.com/Wuyxin/collabllm/blob/main/scripts/engine/build_dataset.py) from the original CollabLLM repository + + +### 2. Train Your Model + +**(Optional) For Supervised Fine-Tuning (SFT):** +```bash +bash train_sft_collabllm.sh +``` + +**For Reinforcement Learning (RL):** + +```bash +bash train_rl_collabllm.sh +``` + +The RL script shows an example to train CollabLLM on `math-hard-large`. + +- The config to sample future conversations are in `recipe/collabllm/config/collabllm_interaction_config.yaml`. +- The Multiturn-aware Reward is aggregated from these three conversational-level rewards: + + ``` + +reward_model.reward_kwargs.metric_weights.accuracy=1 \ + +reward_model.reward_kwargs.metric_weights.interactivity=1 \ + +reward_model.reward_kwargs.metric_weights.token_amount=-0.0001 \ + ``` + + You can remove, add, or modify the weights depending on your task. A list of implemented metrics you can already add are under `recipe/collabllm/metrics`. For example, on `medium-large`, you can replace `accuracy` with `bleu_score` via + ``` + +reward_model.reward_kwargs.metric_weights.bleu_score=1 + ``` + which will instead apply bleu score on the sampled future conversations. + +## Algorithm + +| Step | Name | Description | +|------|-------------------------------|-----------------------------------------------------------------------------| +| 1 | Model response generation | The model generates multiple responses for each prompt in a batch. 
| +| 2 | Collaborative simulation | A user simulator (e.g., GPT or Claude) samples `num_repeat_rollouts` conversations for up to `max_user_turns` additional turns. | +| 3 | Compute Multiturn-aware Reward | Customized conversational reward functions are applied to the sampled conversations. Rewards are aggregated, then averaged across rollouts. | +| 4 | Update model | The model weights are updated using the computed multiturn-aware rewards. | + +--- + +## Configuration + +The primary configuration is managed through the launch script `train_rl_collabllm.sh` and the YAML file `recipe/collabllm/config/collabllm_interaction_config.yaml`. Key configuration sections: + +| Section | Key Parameters / Notes | +|----------------------|-----------------------------------------------------------------------------------------| +| `data` | Paths to training/validation files, batch sizes, sequence lengths. | +| `actor_rollout_ref` (common) | Base model path (used for actor + initial reference), FSDP settings, optimization (LR, scheduler). | +| `actor_rollout_ref` (CollabLLM-specific) | Hyperparameters under `actor_rollout_ref.rollout.multi_turn`: `max_user_turns`, `max_assistant_turns`, `num_repeat_rollouts`. | +| `interaction` | Defined in `collabllm_interaction_config.yaml`. Specifies user simulator and hyperparameters. Requires exported API keys. | +| `reward_model` | Manager set to `collabllm` by default. Modify `reward_model.reward_kwargs.metric_weights` for conversational rewards and weights. LLM Judge hyperparameters (e.g., `model`, `temperature`) go under `reward_model.reward_kwargs.llm_judge_kwargs`. | +| `algorithm` | GRPO-specific hyperparameters such as `actor_rollout_ref.rollout.n`. | +| `trainer` | Distributed training (nodes, GPUs per node), logging (WandB), checkpointing frequency. 
| + +--- + +## Key Files + +| File Path | Purpose | +|-----------|---------| +| `recipe/collabllm/collabllm_agent_loop.py` | Main logic to sample future conversations, using `CollabLLMInteraction` from `verl/interactions/collabllm_interaction.py`. | +| `verl/workers/reward_manager/collabllm.py` | Computes rewards for future conversations, leveraging `recipe/collabllm/reward_function.py` to apply each metric. | + +--- + +## Acknowledgement + +We sincerely thank the `verl` community and advisors for their contributions and guidance! diff --git a/code/RL_model/verl/verl_train/docs/algo/dapo.md b/code/RL_model/verl/verl_train/docs/algo/dapo.md new file mode 100644 index 0000000000000000000000000000000000000000..beb1ca5fb98d7dbc59e6044fd8fc34d67fab5da5 --- /dev/null +++ b/code/RL_model/verl/verl_train/docs/algo/dapo.md @@ -0,0 +1,187 @@ +# Recipe: Decoupled Clip and Dynamic Sampling Policy Optimization (DAPO) + +Last updated: 06/19/2025. + +> Open-Source Algorithm Implementation & Expriement Running: [Yuxuan Tong](https://tongyx361.github.io/), [Guangming Sheng](https://hk.linkedin.com/in/guangming-sheng-b50640211) + +🏠 [Homepage](https://dapo-sia.github.io/) | 📝 [Paper@arXiv](https://arxiv.org/abs/2503.14476) | 🤗 [Datasets&Models@HF](https://huggingface.co/collections/BytedTsinghua-SIA/dapo-67d7f1517ee33c8aed059da0) | 🐱 [Code@GitHub](https://github.com/verl-project/verl-recipe/tree/main/dapo/recipe/dapo) | 🐱 [Repo@GitHub](https://github.com/BytedTsinghua-SIA/DAPO) + +> We propose the **D**ecoupled Clip and Dynamic s**A**mpling **P**olicy **O**ptimization (DAPO) algorithm. By making our work publicly available, we provide the broader research community and society with practical access to scalable reinforcement learning, enabling all to benefit from these advancements. Our system is based on the awesome [verl](https://github.com/volcengine/verl) framework. Thanks for their great work! 
Applying DAPO training to the Qwen2.5-32B base model outperforms the previous state-of-the-art DeepSeek-R1-Zero-Qwen-32B on AIME 2024, achieving **50%** accuracy with **50%** fewer training steps.
| Hardware | Image | Commit | Environment Variables | Training Script | Training Record | +| -------------------------------------------- | -------------- | --------- | -------------------------------------------------------------------- | -------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------- | +| DAPO | 52% | 16x8xH800 | `hiyouga/verl:ngc-th2.6.0-cu126-vllm0.8.3-flashinfer0.2.2-cxx11abi0` | [`4f80e4`](https://github.com/volcengine/verl/tree/4f80e465c2ec79ab9c3c30ec74b9745de61d0490) | [runtime_env.yaml](https://github.com/volcengine/verl/blob/4f80e465c2ec79ab9c3c30ec74b9745de61d0490/recipe/dapo/runtime_env.yaml) | [run_dapo_qwen2.5_32b.sh](https://github.com/volcengine/verl/blob/4f80e465c2ec79ab9c3c30ec74b9745de61d0490/recipe/dapo/run_dapo_qwen2.5_32b.sh) | [W&B](https://wandb.ai/verl-org/DAPO%20Reproduction%20on%20verl/workspace?nw=wmb4qxfht0n) | +| DAPO w/o Dynamic Sampling | 50% | 16x8xH800 | `hiyouga/verl:ngc-th2.6.0-cu126-vllm0.8.3-flashinfer0.2.2-cxx11abi0` | [`4f80e4`](https://github.com/volcengine/verl/tree/4f80e465c2ec79ab9c3c30ec74b9745de61d0490) | [runtime_env.yaml](https://github.com/volcengine/verl/blob/4f80e465c2ec79ab9c3c30ec74b9745de61d0490/recipe/dapo/runtime_env.yaml) | [run_dapo_wo_ds_qwen2.5_32b.sh](https://github.com/volcengine/verl/blob/4f80e465c2ec79ab9c3c30ec74b9745de61d0490/recipe/dapo/run_dapo_wo_ds_qwen2.5_32b.sh) | [W&B](https://wandb.ai/verl-org/DAPO%20Reproduction%20on%20verl/workspace?nw=wmb4qxfht0n) | +| DAPO w/o Token-level Loss & Dynamic Sampling | 44% | 16x8xH20 | `hiyouga/verl:ngc-th2.5.1-cu120-vllm0.7.4-hotfix` | 
[`4f80e4`](https://github.com/volcengine/verl/tree/4f80e465c2ec79ab9c3c30ec74b9745de61d0490) | [runtime_env.yaml](https://github.com/volcengine/verl/blob/4f80e465c2ec79ab9c3c30ec74b9745de61d0490/recipe/dapo/runtime_env.yaml) | [run_dapo_early_qwen2.5_32b.sh](https://github.com/volcengine/verl/blob/4f80e465c2ec79ab9c3c30ec74b9745de61d0490/recipe/dapo/run_dapo_early_qwen2.5_32b.sh) | [W&B](https://wandb.ai/verl-org/DAPO%20Reproduction%20on%20verl/workspace?nw=wmb4qxfht0n) | + +> [!IMPORTANT] +> +> **📢 Call for Contribution!** +> +> Welcome to submit your reproduction runs and setups! + +## Configuration + +### Separated Clip Epsilons (-> Clip-Higher) + +An example configuration: + +```yaml +actor_rollout_ref: + actor: + clip_ratio_low: 0.2 + clip_ratio_high: 0.28 +``` + +`clip_ratio_low` and `clip_ratio_high` specify the $\varepsilon_{\text {low }}$ and $\varepsilon_{\text {high }}$ in the DAPO objective. + +Core relevant code: + +```python +pg_losses1 = -advantages * ratio +pg_losses2 = -advantages * torch.clamp(ratio, 1 - cliprange_low, 1 + cliprange_high) +pg_losses = torch.maximum(pg_losses1, pg_losses2) +``` + +### Dynamic Sampling (with Group Filtering) + +An example configuration: + +```yaml +data: + gen_batch_size: 1536 + train_batch_size: 512 +algorithm: + filter_groups: + enable: True + metric: acc # score / seq_reward / seq_final_reward / ... + max_num_gen_batches: 10 # Non-positive values mean no upper limit +``` + +Setting `filter_groups.enable` to `True` will filter out groups whose outputs' `metric` are all the same, e.g., for `acc`, groups whose outputs' accuracies are all 1 or 0. + +The trainer will repeat sampling with `gen_batch_size` until there are enough qualified groups for `train_batch_size` or reaching the upper limit specified by `max_num_gen_batches`. 
+ +Core relevant code: + +```python +prompt_bsz = self.config.data.train_batch_size +if num_prompt_in_batch < prompt_bsz: + print(f'{num_prompt_in_batch=} < {prompt_bsz=}') + num_gen_batches += 1 + max_num_gen_batches = self.config.algorithm.filter_groups.max_num_gen_batches + if max_num_gen_batches <= 0 or num_gen_batches < max_num_gen_batches: + print(f'{num_gen_batches=} < {max_num_gen_batches=}. Keep generating...') + continue + else: + raise ValueError( + f'{num_gen_batches=} >= {max_num_gen_batches=}. Generated too many. Please check your data.' + ) +else: + # Align the batch + traj_bsz = self.config.data.train_batch_size * self.config.actor_rollout_ref.rollout.n + batch = batch[:traj_bsz] +``` + +### Flexible Loss Aggregation Mode (-> Token-level Loss) + +An example configuration: + +```yaml +actor_rollout_ref: + actor: + loss_agg_mode: "token-mean" # / "seq-mean-token-sum" / "seq-mean-token-mean" + # NOTE: "token-mean" is the default behavior +``` + +Setting `loss_agg_mode` to `token-mean` will mean the (policy gradient) loss across all the tokens in all the sequences in a mini-batch. + +Core relevant code: + +```python +if loss_agg_mode == "token-mean": + loss = verl_F.masked_mean(loss_mat, loss_mask) +elif loss_agg_mode == "seq-mean-token-sum": + seq_losses = torch.sum(loss_mat * loss_mask, dim=-1) # token-sum + loss = torch.mean(seq_losses) # seq-mean +elif loss_agg_mode == "seq-mean-token-mean": + seq_losses = torch.sum(loss_mat * loss_mask, dim=-1) / torch.sum(loss_mask, dim=-1) # token-mean + loss = torch.mean(seq_losses) # seq-mean +else: + raise ValueError(f"Invalid loss_agg_mode: {loss_agg_mode}") +``` + +### Overlong Reward Shaping + +An example configuration: + +```yaml +data: + max_response_length: 20480 # 16384 + 4096 +reward_model: + overlong_buffer: + enable: True + len: 4096 + penalty_factor: 1.0 +``` + +Setting `overlong_buffer.enable` to `True` will penalize the outputs whose lengths are overlong but still within the hard context limit. 
+ +Specifically, the penalty increases linearly from `0` to `overlong_buffer.penalty_factor` when the length of the output exceeds the `max_response_length - overlong_buffer.len` by `0` to `overlong_buffer.len` tokens. + +Core relevant code: + +```python +if self.overlong_buffer_cfg.enable: + overlong_buffer_len = self.overlong_buffer_cfg.len + expected_len = self.max_resp_len - overlong_buffer_len + exceed_len = valid_response_length - expected_len + overlong_penalty_factor = self.overlong_buffer_cfg.penalty_factor + overlong_reward = min(-exceed_len / overlong_buffer_len * overlong_penalty_factor, 0) + reward += overlong_reward +``` + +## FAQ + +### Where is the "Overlong Filtering" in the paper? + +Most experiments in the paper, including the best-performant one, are run without Overlong Filtering because it's somehow overlapping with Overlong Reward Shaping in terms of properly learning from the longest outputs. So we don't implement it here. + +### What's the difference between [the `recipe/dapo` directory in the `main` branch](https://github.com/volcengine/verl-recipe/tree/main/dapo) and the [`recipe/dapo` branch](https://github.com/verl-project/verl-recipe/tree/main/dapo/recipe/dapo)? + +[The `recipe/dapo` branch](https://github.com/verl-project/verl-recipe/tree/main/dapo/recipe/dapo) is for **as-is reproduction** and thus won't be updated with new features. + +[The `recipe/dapo` directory in the `main` branch](https://github.com/volcengine/verl-recipe/tree/main/dapo) works as an example of how to extend the latest `verl` to implement an algorithm recipe, which will be maintained with new features. + +### Why can't I produce similar results after modifications? + +RL infrastructures nowadays still have inherent unrobustness, on which we are still working hard to improve. + +We strongly recommend to only modify one thing at a time. + +We also list some known problems here: + +1. 
Enabling CUDA graph (`enforce_eager=False`) might cause model performance degradation, whose cause is still under investigation. diff --git a/code/RL_model/verl/verl_train/docs/algo/entropy.md b/code/RL_model/verl/verl_train/docs/algo/entropy.md new file mode 100644 index 0000000000000000000000000000000000000000..46153b7e8558583c9d4a0201a1317f09c6c1ecb1 --- /dev/null +++ b/code/RL_model/verl/verl_train/docs/algo/entropy.md @@ -0,0 +1,115 @@ +# Recipe: Entropy Mechanism + +Last updated: 06/27/2025. + + +
+ + The Entropy Mechanism of Reinforcement Learning for Large Language Model Reasoning. + +[![Paper](https://img.shields.io/badge/paper-A42C25?style=for-the-badge&logo=arxiv&logoColor=white)](https://arxiv.org/pdf/2505.22617) [![Github](https://img.shields.io/badge/PRIME-000000?style=for-the-badge&logo=github&logoColor=000&logoColor=white)](https://github.com/PRIME-RL/Entropy-Mechanism-of-RL) [![alphaXiv](https://img.shields.io/badge/discussion-A42C25?style=for-the-badge&logo=arxiv&logoColor=white&color=blue +)](https://www.alphaxiv.org/abs/2505.22617) [![Twitter](https://img.shields.io/badge/Twitter-%23000000.svg?style=for-the-badge&logo=twitter&logoColor=white)](https://x.com/stingning/status/1928088554166505667) [![Twitter](https://img.shields.io/badge/Twitter-%23000000.svg?style=for-the-badge&logo=twitter&logoColor=white)](https://x.com/charlesfornlp/status/1928089451080585283) [![Twitter-ak](https://img.shields.io/badge/Twitter-%23000000.svg?style=for-the-badge&logo=twitter&logoColor=white)](https://x.com/_akhaliq/status/1928077929105268861) + + + + +
+ + +## 🎉News + +- **[2025/05/29]** 🎉 Ranked **#1** of the day on [Huggingface Daily Papers](https://huggingface.co/papers?date=2025-05-29). +- **[2025/05/29]** Released our Paper on arXiv. See [here](https://arxiv.org/pdf/2505.22617). We provide insights into the entropy mechanism of RL for LLMs and propose two simple yet effective strategies to alleviate the entropy collapse. + + + +## ✨Getting started + +After preparing the training data, for training Qwen2.5-7B on a single node, taking the KL-Cov approach as an example, you can simply run: + +``` +cd verl +conda activate your_env +bash recipe/dapo/7b_kl_cov.sh +``` + +While for training Qwen2.5-32B on multi nodes, you can run the following commands: + +``` +cd verl +conda activate your_env +bash recipe/dapo/32b_kl_cov.sh +``` + +## 📖Introduction + +
+ issue +
+ +This paper addresses the entropy collapse issue in scaling reinforcement learning (RL) for large language models (LLMs), where policy entropy drops sharply during training, leading to overconfidence and performance saturation. We empirically establish a relationship between entropy ($H$) and performance ($R$): $R=−aexp(H)+b$, showing performance is bottlenecked by entropy exhaustion. + +
+ issue +
+ +Theoretically, we find entropy changes are driven by the covariance between action probability and logit updates, which correlates with advantage in Policy Gradient methods. High-probability, high-advantage actions reduce entropy, while rare, high-advantage actions increase it. Empirically, the covariance term remains positive, explaining entropy’s monotonic decline. To mitigate this, we propose ​​Clip-Cov​​ and ​​KL-Cov​​, which restrict updates for high-covariance tokens. These methods effectively prevent entropy collapse, and improve performance. + +## 📃Evaluation + +
+ issue +
+ + +Our method is able to maintain a considerably higher level of entropy throughout training. For example, when the baseline's entropy reaches a plateau and can no longer be consumed, the KL-Cov method still sustains an entropy level over 10 times higher. Meanwhile, the response length of the policy model steadily increases, and its performance on the test set consistently surpasses that of the baseline. This indicates that our model is able to explore more freely during training, learning better policy through RL. +| **Method** | **AIME24** | **AIME25** | **AMC** | **MATH-500** | **OMNI-MATH** | **OlympiadBench** | **Minerva** | **Avg.** | +| ----------------- | ---------: | ---------: | -------: | -----------: | ------------: | ----------------: | ----------: | -------: | +| *Qwen2.5-7B* | | | | | | | | | +| GRPO | 21.2 | 9.6 | 58.7 | 78.8 | 27.9 | 40.7 | 36.7 | 38.6 | +| w. Clip-higher | 18.1 | 11.5 | 56.6 | 79.2 | 29.8 | 43.3 | 40.4 | 38.8 | +| w. **`CLIP-Cov`** | 22.1 | **15.8** | 58.2 | 80.4 | **30.5** | **44.1** | **41.1** | 40.4 | +| w. **`KL-Cov`** | **22.6** | 12.9 | **61.4** | **80.8** | 29.1 | 42.6 | 38.2 | **40.6** | +| *Qwen2.5-32B* | | | | | | | | | +| GRPO | 21.8 | 16.2 | 69.7 | 84.2 | 35.2 | 43.6 | 45.5 | 45.8 | +| w. Clip-higher | 35.6 | 22.3 | 69.5 | 77.2 | 35.1 | 42.5 | 43.0 | 47.2 | +| w. **`CLIP-Cov`** | 32.3 | 22.7 | 67.2 | **87.0** | **42.0** | **57.2** | 46.0 | 50.3 | +| w. **`KL-Cov`** | **36.8** | **30.8** | **74.5** | 84.6 | 39.1 | 49.0 | **46.3** | **52.2** | + +Our two approaches both achieve non-trivial improvements across all benchmarks. Compared to GRPO, our method outperforms it by 2.0% on average for the 7B model and by 6.4% for the 32B model. Moreover, we observe that our method yields more substantial gains on the larger Qwen2.5-32B. Specifically, our method achieves improvements of 15.0% and 14.6% compared to GRPO on the most challenging benchmarks, AIME24 and AIME25, respectively. 
+ + +## 🎈Citation +If you find this paper or repo helpful, please cite us. + +```bibtex +@article{cui2025entropy, + title={The Entropy Mechanism of Reinforcement Learning for Reasoning Language Models}, + author={Cui, Ganqu and Zhang, Yuchen and Chen, Jiacheng and Yuan, Lifan and Wang, Zhi and Zuo, Yuxin and Li, Haozhan and Fan, Yuchen and Chen, Huayu and Chen, Weize and others}, + journal={arXiv preprint arXiv:2505.22617}, + year={2025} +} +``` +## 🌻Acknowledgement +We implement our reinforcement learning algorithm extending from [verl](https://github.com/volcengine/verl). We utilize [vLLM](https://github.com/vllm-project/vllm) for inference. Our models are trained primarily on [Qwen2.5 family](https://github.com/QwenLM/Qwen2.5). Our training data is built from [DAPO-MATH](https://huggingface.co/datasets/BytedTsinghua-SIA/DAPO-Math-17k). Thanks for their great contributions! + +## 📬 Contact + +For questions, discussion, or collaboration opportunities, feel free to contact: +- Ganqu Cui: cuiganqu@pjlab.org.cn +- Yuchen Zhang: yuchen.zhang2003@gmail.com +- Jiacheng Chen: jackchan9345@gmail.com +- Ning Ding: ningding.cs@gmail.com + diff --git a/code/RL_model/verl/verl_train/docs/algo/gpg.md b/code/RL_model/verl/verl_train/docs/algo/gpg.md new file mode 100644 index 0000000000000000000000000000000000000000..36bede8c319040ae713ef335372f2caa40ce44a3 --- /dev/null +++ b/code/RL_model/verl/verl_train/docs/algo/gpg.md @@ -0,0 +1,36 @@ +# GPG: Group Policy Gradient + +Last updated: 07/03/2025. + +Group Policy Gradient (GPG) is a minimalist reinforcement learning (RL) method that enhances the reasoning ability of large language models without relying on supervised fine-tuning or complex tricks. GPG revisits traditional policy gradients and directly optimizes the RL objective—no surrogate losses, no KL penalties, no critic, and no reference model. Compared to GRPO, GPG is simpler, more efficient, and achieves better results on many tasks. 
For more details, please refer to the original paper [GPG: A Simple and Strong Reinforcement Learning Baseline for Model Reasoning +](https://arxiv.org/abs/2504.02546). + +## Key Components +- Use a corrected advantage function to improve policy gradient accuracy and training efficiency. +- By eliminating the critic and reference models, avoiding KL divergence constraints, significantly simplifies the training process compared to Group Relative Policy Optimization (GRPO) + +## Configuration +To configure GPG within the framework, use the following YAML settings. + +```yaml +algorithm: + adv_estimator: gpg +actor_rollout_ref: + actor: + policy_loss: + loss_mode: "gpg" +``` + +## Advanced Extensions +GPG is a simple and strong baseline for model reasoning. Although it avoids using KL loss in its original form, you can still use KL loss to further improve the performance. + +```yaml +algorithm: + adv_estimator: gpg +actor_rollout_ref: + actor: + use_kl_loss: True # enable kl regularization + kl_loss_coef: 0.01 + policy_loss: + loss_mode: "gpg" +``` \ No newline at end of file diff --git a/code/RL_model/verl/verl_train/docs/algo/grpo.md b/code/RL_model/verl/verl_train/docs/algo/grpo.md new file mode 100644 index 0000000000000000000000000000000000000000..c25f401f9045026d20c8446694702d1f9cbfbc3b --- /dev/null +++ b/code/RL_model/verl/verl_train/docs/algo/grpo.md @@ -0,0 +1,72 @@ +# Group Relative Policy Optimization (GRPO) + +Last updated: 05/31/2025. + +In reinforcement learning, classic algorithms like PPO rely on a "critic" model to estimate the value of actions, guiding the learning process. However, training this critic model can be resource-intensive. + +GRPO simplifies this process by eliminating the need for a separate critic model. Instead, it operates as follows: +- Group Sampling: For a given problem, the model generates multiple possible solutions, forming a "group" of outputs. 
+- Reward Assignment: Each solution is evaluated and assigned a reward based on its correctness or quality. +- Baseline Calculation: The average reward of the group serves as a baseline. +- Policy Update: The model updates its parameters by comparing each solution's reward to the group baseline, reinforcing better-than-average solutions and discouraging worse-than-average ones. + +This approach reduces computational overhead by avoiding the training of a separate value estimation model, making the learning process more efficient. For more details, refer to the original paper [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://arxiv.org/pdf/2402.03300) + +## Key Components + +- No Value Function (Critic-less): unlike PPO, GRPO does not train a separate value network (critic) +- Group Sampling (Grouped Rollouts): instead of evaluating one rollout per input, GRPO generates multiple completions (responses) from the current policy for each prompt. This set of completions is referred to as a group. +- Relative Rewards: within each group, completions are scored (e.g., based on correctness), and rewards are normalized relative to the group. + +## Configuration + +Note that all configs containing `micro_batch_size` are used to configure the maximum sample or token count per forward or backward pass to avoid GPU OOMs, whose value should not change algorithmic/convergence behavior. + +Despite that many configurations start with the `ppo_` prefix, they work across different RL algorithms in verl, as the GRPO training loop is similar to that of PPO (without critic). + +![image](https://github.com/user-attachments/assets/16aebad1-0da6-4eb3-806d-54a74e712c2d) + +- `actor_rollout.ref.rollout.n`: For each prompt, sample n times. Default to 1. For GRPO, please set it to a value larger than 1 for group sampling. + +- `data.train_batch_size`: The global batch size of prompts used to generate a set of sampled trajectories/rollouts. 
The number of responses/trajectories is `data.train_batch_size * actor_rollout_ref.rollout.n`
+
+- `actor_rollout_ref.actor.ppo_mini_batch_size`: The set of sampled trajectories is split into multiple mini-batches with batch_size=ppo_mini_batch_size for PPO actor updates. The ppo_mini_batch_size is a global size across all workers.
+
+- `actor_rollout_ref.actor.ppo_epochs`: Number of epochs for GRPO updates on one set of sampled trajectories for actor
+
+- `actor_rollout_ref.actor.clip_ratio`: The GRPO clip range. Default to 0.2
+
+- `algorithm.adv_estimator`: Default is gae. Please set it to grpo instead
+
+- `actor_rollout_ref.actor.loss_agg_mode`: Default is "token-mean". Options include "token-mean", "seq-mean-token-sum", "seq-mean-token-mean". The original GRPO paper takes the sample-level loss (seq-mean-token-mean), which may be unstable in long-CoT scenarios. All GRPO example scripts provided in verl use the default configuration "token-mean" for loss aggregation instead.
+
+Instead of adding KL penalty in the reward, GRPO regularizes by directly adding the KL divergence between the trained policy and the reference policy to the loss:
+
+- `actor_rollout_ref.actor.use_kl_loss`: To use kl loss in the actor. When used, we are not applying KL in the reward function. Default is False. Please set it to True for GRPO.
+
+- `actor_rollout_ref.actor.kl_loss_coef`: The coefficient of kl loss. Default is 0.001.
+
+- `actor_rollout_ref.actor.kl_loss_type`: Support kl(k1), abs, mse(k2), low_var_kl(k3) and full. Appending "+" in the end (e.g., 'k1+' and 'k3+') would apply straight through to employ k2 for unbiased gradient estimation, regardless of the kl value estimation (see https://github.com/volcengine/verl/pull/2953#issuecomment-3162113848 for more details). How to calculate the kl divergence between actor and reference policy. 
See this blog post for detailed analysis: http://joschu.net/blog/kl-approx.html + +## Advanced Extensions + +### DrGRPO + +[Understanding R1-Zero-Like Training: A Critical Perspective](https://arxiv.org/pdf/2503.20783) claims there's optimization bias in GRPO, which leads to artificially longer responses, especially for incorrect outputs. This inefficiency stems from the way GRPO calculates advantages using group-based reward normalization. Instead, DrGRPO aggregates token-level losses by normalizing with a global constant to eliminate length bias. + +Configure the following to enable DrGRPO, with all other parameters the same as GRPO's: + +- `actor_rollout_ref.actor.loss_agg_mode`: "seq-mean-token-sum-norm", which turns off seq-dim averaging +- `actor_rollout_ref.actor.loss_scale_factor`: (Optional) Set to a constant integer (e.g., max response length) to ensure consistent normalization throughout training. If not set, uses the current batch's response length. +- `actor_rollout_ref.actor.use_kl_loss`: Please set it to False for DrGRPO +- `algorithm.norm_adv_by_std_in_grpo`: False, which turns off standard deviation norm + +## Reference Example + +Qwen2.5 GRPO training log and commands: [link](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/qwen2-7b-fsdp2.log) + +```bash +bash examples/grpo_trainer/run_qwen3-8b.sh +``` + +For more reference performance, please see https://verl.readthedocs.io/en/latest/algo/baseline.html diff --git a/code/RL_model/verl/verl_train/docs/algo/opo.md b/code/RL_model/verl/verl_train/docs/algo/opo.md new file mode 100644 index 0000000000000000000000000000000000000000..338f3a762d9585c608af28cdf4e75837dbfe11e4 --- /dev/null +++ b/code/RL_model/verl/verl_train/docs/algo/opo.md @@ -0,0 +1,33 @@ +# On-Policy RL with Optimal Reward Baseline (OPO) + +Last updated: 06/02/2025. 
+
+Loose on-policy constraints and suboptimal baselines in reinforcement learning often lead to training instability such as large policy shifts and entropy collapse. OPO addresses these challenges by using exact on-policy training with the theoretically optimal reward baseline for advantage estimation. It achieves lower policy shifts and higher output entropy, encouraging more diverse and less repetitive responses.
+
+OPO uses group sampling to generate multiple outputs for each input like GRPO. Unlike group-based algorithms which typically use the mean reward of a group as its baseline, OPO employs a theoretically optimal baseline: the length-weighted reward of the group. It also omits the standard deviation normalization. By adopting these two key components, OPO enables the training of a single policy model with the objective of maximizing only the expected reward. For more details, refer to the original paper [On-Policy RL with Optimal Reward Baseline](https://arxiv.org/pdf/2505.23585).
+
+## Key Components
+
+- Exact On-Policy Training: always generates responses from the current policy, without using any pre-generated data or off-policy data.
+- Optimal Reward Baseline: uses a length-weighted reward of the group as the baseline for normalizing the rewards.
+
+## Configuration
+
+To configure OPO within the framework, use the following YAML settings. These parameters are crucial for enabling exact on-policy training and activating the optimal reward baseline.
+
+```yaml
+algorithm:
+  adv_estimator: opo # Use OPO for optimal reward baseline
+data:
+  train_batch_size: 1024
+actor_rollout_ref:
+  actor:
+    ppo_mini_batch_size: 1024 # ppo_mini_batch_size should equal to train_batch_size to enable exact on-policy training
+    entropy_coeff: 0 # disable entropy regularization
+    use_kl_loss: False # disable kl regularization
+    kl_loss_coef: 0
+```
+
+## Advanced Extensions
+
+OPO can also be extended to other algorithms like RLOO and Reinforce++. 
It just needs to adjust their configurations to enable exact on-policy training and incorporate the optimal length-weighted reward baseline with minimal modifications to their advantage estimation functions. diff --git a/code/RL_model/verl/verl_train/docs/algo/otb.md b/code/RL_model/verl/verl_train/docs/algo/otb.md new file mode 100644 index 0000000000000000000000000000000000000000..288eb71bd69cbe38a56b81e1d59b118be4a07a6d --- /dev/null +++ b/code/RL_model/verl/verl_train/docs/algo/otb.md @@ -0,0 +1,104 @@ +# Optimal Token Baseline (OTB) + +Last updated: 12/25/2025. + +Optimal Token Baseline (OTB) is dynamic token-level baseline for variance reduction. It weights updates based on "Realized Energy"—essentially, how much uncertainty has accumulated up to that specific token. It downweights the noisy parts and trusts the clear signals. Read [Optimal Token Baseline blog](https://richardli.xyz/optimal-token-baseline) for more details. + +## The method: OTB + +- OTB builds a _dynamic_ baseline that adapts to each token by tracking the “Realized Energy”—the uncertainty that has accumulated up to that token. It downweights the noisy parts and trusts the clear signals. +- Unlike standard group means (which average over the padding `EOS` token ineffectively), OTB handles this naturally by computing baselines only over valid tokens. + +## Logit-Gradient Proxy + +- Computing true uncertainty requires expensive backward passes (calculating gradient norms per token). Instead, OTB introduces the **Logit-Gradient Proxy**: the realized energy can be estimated entirely from forward probabilities. +- This means zero extra backward calls and effectively no additional runtime overhead. + +## Mechanics at a glance + +For each prompt group of size `N`, OTB computes rewards-to-go `G_t` and cumulative variance weights `W_t`. 
The optimal baseline per token is + +``` +B*_t = (Σ_i G_t^{(i)} · W_t^{(i)}) / (Σ_i W_t^{(i)} + ε), +W_t = Σ_{j=1}^t (1 - 2π_j + Σπ_j²), +Σπ_j² = exp(logsumexp(2·logits_j) - 2·logsumexp(logits_j)). +``` + +The final advantage is `(G_t - B*_t) · mask_t`, so padding tokens stay at zero. + +## Integration in VERL + +- `AdvantageEstimator.OPTIMAL_TOKEN_BASELINE` registers `compute_optimal_token_baseline_advantage`, invoked whenever `algorithm.adv_estimator` is set to `optimal_token_baseline`. +- `ActorRolloutRefWorker.compute_log_prob` emits an additional tensor `sum_pi_squared` (Σπ² per token) when `actor.calculate_sum_pi_squared=True`. This requires disabling fused log-prob kernels, because they do not surface logits. +- Trainers assert `sum_pi_squared` exists, regroup trajectories by `non_tensor_batch["uid"]`, and run the OTB calculation. If rollout IS is active, they rescale the weights by `rollout_is_weights**2` before aggregating. +- In Ulysses sequence-parallel setups, the actor gathers, unpads, and returns Σπ² in the same way it handles log-probabilities, so OTB supports sharded sequence-parallel models out of the box. +- `sum_pi_squared_checkpointing` is available to trade compute for memory when Σπ² tensors become large (e.g., lengthy chain-of-thought reasoning). + +## Configuration checklist + +- `actor_rollout_ref.actor.calculate_sum_pi_squared: true` (mandatory). +- `actor_rollout_ref.model.use_fused_kernels: false` (required until fused kernels emit logits). +- `algorithm.adv_estimator: optimal_token_baseline`. +- Group sampling (`actor_rollout_ref.rollout.n > 1`) to unlock OTB’s variance reduction; with `n=1` the baseline collapses to returns. + +Example OmegaConf overlay: + +```yaml +algorithm: + adv_estimator: optimal_token_baseline + +actor_rollout_ref: + actor: + calculate_sum_pi_squared: true + sum_pi_squared_checkpointing: false # optional memory saver + rollout: + n: 8 +``` + +## Example script + +- `examples/otb_trainer/run_qwen2_5-7b.sh`. 
+ +## Gradient Variance Proxy Metrics + +All gradient-variance analysis in the Optimal Token Baseline work starts from the variance identity + +``` +Var(ĝ) = E[||ĝ||²] - ||E[ĝ]||², +``` + +which states that the variance of any stochastic gradient equals the mean squared magnitude minus the squared norm of its expectation. + +For a trajectory `τ`, the policy-gradient estimator is + +``` +ĝ(τ) = ∇ log π_θ(τ) · A(τ), A(τ) = R(τ) - B. +``` + +The logit-gradient proxy approximates the squared gradient norm without an extra backward pass: + +``` +||ĝ(τ)||² ≈ Ŵ(τ) · A(τ)², +``` + +where `Ŵ(τ)` is the realized energy built. Given a mini-batch `{τ_i}` of size `N`, we decompose its statistics into three diagnostics: + +- **Signal strength (squared norm of the mean gradient)** + ``` + S = || (1/N) · Σ ĝ(τ_i) ||² + ``` +- **Total power (signal + noise)** + ``` + P_total = (1/N) · Σ Ŵ(τ_i) · A(τ_i)² + ``` +- **Pure noise (estimated variance of the batch mean)** + ``` + Var_proxy = (1/(N-1)) · (P_total - S) + ``` + +`verl/trainer/ppo/metric_utils.py#L306` implements these diagnostics via `compute_variance_proxy_metrics`, emitting +`variance_proxy/proxy1_signal_strength`, +`variance_proxy/proxy2_total_power`, and +`variance_proxy/proxy3_pure_noise`. + +Tracking these metrics provides a forward-only, low-overhead view of gradient health for any advantage estimator that supplies `sum_pi_squared`. diff --git a/code/RL_model/verl/verl_train/docs/algo/ppo.md b/code/RL_model/verl/verl_train/docs/algo/ppo.md new file mode 100644 index 0000000000000000000000000000000000000000..4740667218579bacf8ab7d1fa5723962c720304c --- /dev/null +++ b/code/RL_model/verl/verl_train/docs/algo/ppo.md @@ -0,0 +1,105 @@ +# Proximal Policy Optimization (PPO) + +Last updated: 06/19/2025. + +Proximal Policy Optimization (PPO) is a family of policy gradient methods for reinforcement learning, proposed by OpenAI in 2017. 
PPO strikes a balance between simplicity, stability, and performance, making it one of the most widely used algorithms in modern RL applications, including large-scale language model fine-tuning. + +Traditional policy gradient methods like REINFORCE or Vanilla Policy Gradient suffer from: + +- High variance and sample inefficiency. +- Instability due to large policy updates. + +PPO addresses this problem using a clipped surrogate objective that avoids overly large updates without requiring second-order derivatives. + +For more technical details regarding PPO, we suggest reading the introduction in the [OpenAI spinning up tutorial](https://spinningup.openai.com/en/latest/algorithms/ppo.html), and the paper [Proximal Policy Optimization Algorithms](https://arxiv.org/abs/1707.06347). + +## Key Components + +- Actor-Critic Architecture: PPO requires both an actor model (policy) and a critic model (value function). This differs from other algorithms like GRPO and RLOO that don't require a critic model. + +- Generalized Advantage Estimation (GAE): PPO uses GAE for computing advantage values, which helps reduce variance in policy gradient estimates while maintaining low bias. + +- Clipped Surrogate Objective: The core of PPO is implemented through the clipped surrogate objective function that limits policy updates. + +## Configuration + +Note that all configs containing `micro_batch_size` are used to configure the maximum sample or token count per forward or backward pass to avoid GPU OOMs, whose value should not change algorithmic/convergence behavior. + +Most critic configs are similar to those of actors. Note that the critic model is omitted from the figure below. + +![image](https://github.com/user-attachments/assets/16aebad1-0da6-4eb3-806d-54a74e712c2d) + +- `data.train_batch_size`: The global batch size of prompts used to generate a set of sampled trajectories/rollouts. 
The number of responses/trajectories is `data.train_batch_size * actor_rollout_ref.rollout.n`
+
+- `actor_rollout_ref.actor.ppo_mini_batch_size`: The set of sampled trajectories is split into multiple mini-batches with batch_size=ppo_mini_batch_size for PPO actor updates. The ppo_mini_batch_size is a global size across all workers
+
+- `critic.ppo_mini_batch_size`: The set of sampled trajectories is split into multiple mini-batches with batch_size=ppo_mini_batch_size for PPO critic updates. The ppo_mini_batch_size is a global size across all workers
+
+- `actor_rollout_ref.actor.clip_ratio`: The PPO clip range. Default to 0.2
+
+- `actor_rollout_ref.actor.ppo_epochs`: Number of epochs for PPO updates on one set of sampled trajectories for actor
+
+- `critic.ppo_epochs`: Number of epochs for PPO updates on one set of sampled trajectories for critic. Defaults to `actor_rollout_ref.actor.ppo_epochs`
+
+- `algorithm.gamma`: discount factor
+
+- `algorithm.lam`: The lambda term that trades off between bias and variance in the GAE estimator
+
+- `algorithm.adv_estimator`: Support gae, grpo, reinforce_plus_plus, reinforce_plus_plus_baseline, rloo
+
+## Advanced Extensions
+
+### KL Divergence Control
+
+Options to prevent the policy from diverging too far from a reference policy. Two mechanisms are available: KL reward penalty and KL loss. For more technical details, see [Training language models to follow instructions with human feedback](https://arxiv.org/abs/2203.02155)
+
+Options to use KL loss for KL divergence control:
+
+- `actor_rollout_ref.actor.use_kl_loss`: to use kl loss in the actor. When used, we are not applying KL in the reward function. Default is False
+
+- `actor_rollout_ref.actor.kl_loss_coef`: The coefficient of kl loss. Default is 0.001.
+
+- `actor_rollout_ref.actor.kl_loss_type`: Support kl(k1), abs, mse(k2), low_var_kl(k3) and full. 
Appending "+" in the end (e.g., 'k1+' and 'k3+') would apply straight through to employ k2 for unbiased gradient estimation, regardless of the kl value estimation (see https://github.com/volcengine/verl/pull/2953#issuecomment-3162113848 for more details). How to calculate the kl divergence between actor and reference policy. See this blog post for detailed analysis: http://joschu.net/blog/kl-approx.html
+
+Options to use KL penalty in the reward:
+
+- `algorithm.use_kl_in_reward`: Whether to enable in-reward kl penalty. Default is False.
+
+- `algorithm.kl_penalty`: Support kl(k1), abs, mse(k2), low_var_kl(k3) and full. This defines the way to calculate the kl divergence between actor and reference policy. For specific options, refer to `kl_penalty` in core_algos.py. See this blog post for detailed analysis: http://joschu.net/blog/kl-approx.html
+
+- `algorithm.kl_ctrl.kl_coef`: The (initial) coefficient of in-reward kl_penalty. Default is 0.001.
+- `algorithm.kl_ctrl.type`: 'fixed' for FixedKLController and 'adaptive' for AdaptiveKLController.
+- `algorithm.kl_ctrl.horizon`: See source code of AdaptiveKLController for details.
+- `algorithm.kl_ctrl.target_kl`: See source code of AdaptiveKLController for details.
+
+### Dual-clip PPO
+
+The Dual-Clip PPO introduces an approach that applies a lower bound to the policy objective when the advantage is less than zero, so that the objective, when multiplied by a large ratio, does not exceed a specified lower bound. 
+ +![image](https://github.com/user-attachments/assets/fc232181-d8b0-4307-8dd2-4dc0a4c1c139) + +- `actor_rollout_ref.actor.clip_ratio_c`: lower bound of the value for Dual-clip PPO, defaults to 3.0 + +## Reference Example + +Qwen2.5 training log and commands: [link](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/Qwen2.5-0.5B-bsz256_2-prompt1024-resp512-0.567.log) + +```bash +bash run_gemma.sh + trainer.n_gpus_per_node=1 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=1 \ + trainer.logger=console \ + critic.model.path=Qwen/Qwen2.5-0.5B-Instruct \ + actor_rollout_ref.model.path=Qwen/Qwen2.5-0.5B-Instruct \ + data.train_batch_size=256 \ + actor_rollout_ref.actor.ppo_mini_batch_size=64 \ + actor_rollout_ref.actor.ppo_micro_batch_size=2 \ + critic.ppo_micro_batch_size=2 +``` + +Reference performance with verl v0.2: + +| Model | Method | Score | Link | +|-------------------------------|------------------|-------|------------------------------------------------------------------------------------------------| +| Qwen/Qwen2.5-0.5B-Instruct | pretrained model | 36.4 | [Qwen Blog](https://qwenlm.github.io/blog/qwen2.5-llm/) | +| Qwen/Qwen2.5-0.5B-Instruct | PPO | 56.7 | [PPO Command and Logs](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/Qwen2.5-0.5B-bsz256_2-prompt1024-resp512-0.567.log) | diff --git a/code/RL_model/verl/verl_train/docs/algo/rollout_corr.md b/code/RL_model/verl/verl_train/docs/algo/rollout_corr.md new file mode 100644 index 0000000000000000000000000000000000000000..8569b243a9e2bedd33d02e8f53f39e09d046011a --- /dev/null +++ b/code/RL_model/verl/verl_train/docs/algo/rollout_corr.md @@ -0,0 +1,1313 @@ +# Rollout Correction + +**Author:** [Yingru Li](https://richardli.xyz/) + +Last updated: 10/30/2025. 
+ +--- + +> **📖 Documentation Structure** +> +> - **This document** - Practical usage guide: configurations, presets, troubleshooting +> - **[Mathematical Formulations](rollout_corr_math.md)** - Theoretical foundations, derivations, and algorithmic details +> +> Start here for implementation, refer to the math doc for theory and design rationale. + +--- + +This document provides a comprehensive overview of the Rollout Correction implementation in verl. + +**Note on Naming**: This feature is called "Rollout Correction" to reflect the complete functionality: importance sampling (IS) weights and rejection sampling (RS). The internal variable `rollout_is_weights` retains its name as it specifically refers to the IS weights component. + +### BibTeX Citation + +```bibtex +@online{liu-li-2025-rl-collapse, + title = {When Speed Kills Stability: Demystifying {RL} Collapse from the Training-Inference Mismatch}, + author = {Liu, Jiacai and Li, Yingru and Fu, Yuqian and Wang, Jiawei and Liu, Qian and Shen, Yu}, + year = {2025}, + month = sep, + url = {https://richardli.xyz/rl-collapse} +} +``` + +### Blog Series + +- Main blog post: https://richardli.xyz/rl-collapse +- [Part 1: Why Mismatch Breaks LLM-RL](https://richardli.xyz/rl-collapse-1) (analytical framework using TV distance for bias and χ²-divergence for variance) +- [Part 2: The Gradient Estimator Trials](https://richardli.xyz/rl-collapse-2) (token-level vs sequence-level correction bias-variance tradeoff) +- [Part 3: When Math Meets Reality—Toxic Tails and Length Traps](https://richardli.xyz/rl-collapse-3) (why rejection over clipping, and geometric-level RS) + +## Overview + +Rollout Correction provides a unified framework to handle **general off-policy problems** in RL training. Any scenario where the data collection distribution differs from the training distribution can benefit from these methods. + +**Common off-policy scenarios:** + +1. 
**Policy Mismatch** (Implementation Differences) + + - Different precision: FP8 vs FP16 vs BF16 vs FP32 + - Different backends: vLLM vs SGLang vs FSDP vs Megatron + - Different implementations even with identical weights + +2. **Temporal Lag** (Model Staleness) + + - Rollout uses older checkpoint while training has progressed + - Asynchronous rollout workers with stale parameters + - Common in distributed/async RL systems + +3. **Replay Buffers** + + - Training on historical trajectories from earlier iterations + - Experience replay from different policy versions + - Data augmentation or resampling strategies + +4. **Off-Policy Algorithms** + + - Behavioral cloning from expert demonstrations + - DAPO (data from auxiliary policies) + - Any algorithm using trajectories from a different policy + +5. **Data Quality Filtering** + - Reweighting or filtering collected data + - Preference learning with modified distributions + - Curriculum learning with distribution shifts + +These off-policy gaps can cause training instability and policy collapse. Rollout Correction uses importance sampling (IS) weights and rejection sampling (RS) to correct for any distribution shift between data collection and training. + +**Important Note on Common Implementation Mistakes:** + +Many LLM-RL implementations incorrectly apply PPO by **ignoring the actual rollout policy** π_rollout and assuming the training reference policy π_old is the behavior policy. This is mathematically incorrect when π_rollout ≠ π_old (which is typical in LLM-RL due to precision/backend differences between rollout and training). + +**This is not PPO's fault** - PPO itself is mathematically correct. The issue is the incorrect assumption that π_old = π_rollout in naive implementations. 
+ +This critical implementation mistake that leads to RL training collapse was identified in the blog post ["When Speed Kills Stability: Demystifying RL Collapse from the Training-Inference Mismatch"](https://richardli.xyz/rl-collapse) and motivated the development of this rollout correction framework. + +**Mathematically correct approaches:** + +- **Decoupled mode**: Three policies (π*rollout, π_old, π*θ) with IS correction from π_rollout to π_old +- **Bypass mode**: Two policies (π*rollout = π_old, π*θ) using actual rollout policy as PPO anchor +- **Bypass + Policy Gradient mode**: Two policies (π*rollout, π*θ) with IS/RS correction and no PPO clipping + +See [Mathematical Formulations](rollout_corr_math.md#38-common-implementation-mistake) for detailed explanation. + +### Key Design Principle: Separation of IS Weights and Rejection Sampling + +The implementation cleanly separates two orthogonal mechanisms: + +1. **IS Weights** (`rollout_is_weights`): Continuous reweighting for gradient correction + + - Policy ratio: π*old/π_rollout (decoupled) or π*θ/π_rollout (bypass) + - **Safety-bounded**: Clamped to [exp(-20), exp(20)] ≈ [2e-9, 5e8] to prevent overflow + - Token level: Bounds per-token ratios + - Sequence level: Bounds product of ratios (broadcast to all tokens) + - **Truncated**: Upper clamped via `.clamp(max=rollout_is_threshold)` (TIS: Truncated Importance Sampling) + - **Zeroed at padding**: Multiplied by response_mask to zero out padding positions + - Used to weight policy gradients (variance reduction) + +2. 
**Rejection Sampling** (`modified_response_mask`): Binary filtering for outlier exclusion + - Creates binary mask: 1 = keep, 0 = reject + - Rejects tokens/sequences with IS ratios outside [lower_threshold, upper_threshold] + - Modifies response_mask to exclude rejected samples from training + - Used for loss aggregation (rejected samples don't contribute to gradients) + +This separation ensures: + +- ✅ IS weights provide continuous reweighting (reduce variance) +- ✅ Rejection sampling provides hard filtering (remove extreme outliers) +- ✅ Both mechanisms can be enabled independently or together +- ✅ Safety bounds prevent numerical overflow in all cases + +## Quick Start: Using Verified Presets + +**NEW**: We now provide typed configuration with verified presets for common scenarios. These presets have been validated with tens of thousands of GPU hours across various models and training scenarios. + +### Python API + +```python +from verl.trainer.config.algorithm import RolloutCorrectionConfig + +# === Decoupled PPO mode (3 policies: π_rollout, π_old, π_θ) === +# IS weights correct for gap between π_old and π_rollout +config = RolloutCorrectionConfig.decoupled_token_is() # Token-TIS +config = RolloutCorrectionConfig.decoupled_seq_is() # Seq-TIS +config = RolloutCorrectionConfig.decoupled_seq_is_rs() # Seq-MIS +config = RolloutCorrectionConfig.decoupled_geo_rs() # Geo-RS (ratio mode) +config = RolloutCorrectionConfig.decoupled_geo_rs_token_tis() # Geo-RS + Token-TIS + +# === K3 KL Estimator presets (more stable for small KL) === +config = RolloutCorrectionConfig.decoupled_k3_rs() # K3-RS only +config = RolloutCorrectionConfig.decoupled_k3_rs_token_tis() # K3-RS + Token-TIS + +# === Bypass PPO mode (2 policies: π_rollout = π_old, π_θ) - fast === +# PPO ratio handles IS, so no explicit IS weights needed +config = RolloutCorrectionConfig.bypass_ppo_clip() # PPO-clip only +config = RolloutCorrectionConfig.bypass_ppo_clip_geo_rs() # PPO-clip + Geo-RS (ratio) +config = 
RolloutCorrectionConfig.bypass_ppo_clip_k3_rs() # PPO-clip + K3-RS + +# === Bypass PG mode (2 policies, no PPO clipping) - fast === +# IS weights computed on-the-fly as π_θ / π_rollout +config = RolloutCorrectionConfig.bypass_pg_is() # Seq-TIS + PG +config = RolloutCorrectionConfig.bypass_pg_geo_rs() # Geo-RS + PG (ratio) +config = RolloutCorrectionConfig.bypass_pg_geo_rs_token_tis() # Geo-RS + Token-TIS + PG + +# === Other === +config = RolloutCorrectionConfig.disabled() # Metrics only (no correction) +``` + +### YAML Configuration (Advanced) + +For advanced customization or YAML-based configs: + +```yaml +algorithm: + rollout_correction: + rollout_is: token # IS weights: "token", "sequence", or null + rollout_is_threshold: 2.0 # Upper threshold for IS weights + rollout_is_batch_normalize: false # Batch normalize IS weights to mean=1.0 + rollout_rs: null # Rejection sampling: comma-separated canonical options (e.g. "token_k1,seq_max_k2") + rollout_rs_threshold: null # Threshold spec: float(s) or "lower_upper" string(s) + bypass_mode: false # Skip old_log_prob computation (sets π_old = π_rollout) + loss_type: ppo_clip # Loss type in bypass mode: "ppo_clip" (default) or "reinforce" + +# REQUIRED: Enable log prob calculation +actor_rollout_ref: + rollout: + calculate_log_probs: true +``` + +## Files + +### **Core Implementation** + +- `verl/trainer/ppo/rollout_corr_helper.py` - Contains `compute_rollout_correction_and_rejection_mask()` and `compute_offpolicy_metrics()` +- `verl/trainer/ppo/core_algos.py` - Rollout Correction integration with PPO and REINFORCE modes (`compute_policy_loss_bypass_mode()`, `compute_policy_loss_reinforce()`) +- `verl/trainer/ppo/ray_trainer.py` - Bypass mode implementation (skips `old_log_prob` computation) +- `verl/workers/actor/dp_actor.py` - Mode selection logic and metrics collection + +### **Configuration Files** + +- `verl/trainer/config/algorithm.py` - Rollout Correction parameters in `AlgoConfig` +- `verl/workers/config/actor.py` 
- Rollout Correction parameters in `ActorConfig` +- `verl/trainer/config/actor/actor.yaml` - Rollout Correction configuration section +- `verl/trainer/config/ppo_trainer.yaml` - Algorithm config with Rollout Correction + +### **Documentation** + +- `docs/examples/config.rst` - Configuration parameter descriptions + +### **Example Scripts** + +- `recipe/dapo/run_dapo_qwen2.5_32b_rollout_corr.sh` - DAPO example with Rollout Correction +- `examples/rollout_correction/run_with_rollout_corr.sh` - Basic example +- `examples/rollout_correction/run_with_rollout_corr_multi_rs.sh` - Multi-RS example + +### **Tests** + +- `tests/trainer/ppo/test_rollout_corr.py` - Unit tests for IS/RS mechanisms +- `tests/trainer/ppo/test_rollout_corr_integration.py` - Integration tests + +## Configuration Parameters + +All parameters are under `algorithm.rollout_correction`: + +### `rollout_is` (str or null) + +Importance sampling weights aggregation level: + +- `null` = No IS weights computed (metrics-only mode) +- `"token"`: Per-token IS weights + - **Decoupled mode**: ρ_t = π_old(t)/π_rollout(t) + - **Bypass/Pure IS mode**: ρ_t = π_θ(t)/π_rollout(t) + - Independent truncation per token + - Typical threshold: 1.5 - 5.0 +- `"sequence"`: Per-sequence weight ρ_seq = ∏_t ρ_t + - Multiplicative aggregation across sequence + - Typical threshold: 2.0 - 10.0 + +All IS weights are safety-bounded to [exp(-20), exp(20)] ≈ [2e-9, 5e8] + +### `rollout_is_threshold` (float) + +Upper threshold for IS weight truncation. Default: `2.0` + +- Truncates IS weights via `.clamp(max=rollout_is_threshold)` (TIS: Truncated Importance Sampling) +- Applied to IS weights for variance reduction +- Separate from rejection sampling (controlled by `rollout_rs` parameters) + +### `rollout_is_batch_normalize` (bool) + +Apply batch normalization to IS weights. 
Default: `False` + +- `True`: Normalize IS weights to have mean=1.0 within each batch + - **Token-level IS**: Normalizes over all token weights + - **Sequence-level IS**: Normalizes over sequence means (one weight per sequence) +- `False`: Use raw (truncated) IS weights +- Reduces variance by ensuring average weight is 1.0 per batch +- Applied AFTER truncation to preserve truncation semantics +- Only affects IS weight values, not rejection sampling + +### `rollout_rs` (str or null) + +Rejection sampling aggregation modes. Supply a comma-separated string (spaces optional) using the canonical options implemented in `rollout_corr_helper`: + +- `token_k1`: Token-level rejection with `-log r` bounds (ratio thresholds supplied as `lower_upper`). Example: `"0.6_1.4"` +- `token_k2`: Token-level rejection with `0.5 * (log r)^2` (upper bound only) +- `token_k3`: Token-level rejection with `exp(log r) - 1 - log r` (upper bound only) +- `seq_sum_k1`: Sequence-level rejection with sum of `-log r` (ratio bounds) +- `seq_sum_k2`: Sequence-level rejection with sum of `0.5 * (log r)^2` (upper bound only) +- `seq_sum_k3`: Sequence-level rejection with sum of `exp(log r) - 1 - log r` (upper bound only) +- `seq_mean_k1`: Sequence-level rejection with mean of `-log r` (ratio bounds) +- `seq_mean_k2`: Sequence-level rejection with mean of `0.5 * (log r)^2` (upper bound only) +- `seq_mean_k3`: Sequence-level rejection with mean of `exp(log r) - 1 - log r` (upper bound only) +- `seq_max_k2`: Sequence-level rejection with max of `0.5 * (log r)^2` (upper bound only) +- `seq_max_k3`: Sequence-level rejection with max of `exp(log r) - 1 - log r` (upper bound only) + +### `rollout_rs_threshold` (str, float, or null) + +Threshold specification for rejection sampling. + +- Provide **one entry per option**, separated by commas. A single entry is broadcast to every option. +- **Ratio modes (`*k1`)**: Use `"lower_upper"` strings (e.g. `"0.7_1.3"`). 
Supplying a float implies only the upper bound; the lower bound defaults to its reciprocal. +- **Divergence modes (`*k2`/`*k3`)**: Supply positive upper bounds (float or numeric string). +- Set to `null` to disable thresholds entirely (only valid when `rollout_rs` is null). + +## Understanding the Framework: Components and Combinations + +The rollout correction framework is built from **orthogonal components** that can be combined flexibly. Understanding these components helps you choose the right configuration for your scenario. + +### Key Components + +1. **Operating Mode** (Section: [Operation Modes](#operation-modes)) + + - **Decoupled**: Three policies (π_rollout, π_old, π_θ) with separate π_old computation + - **Bypass**: Two policies (π_rollout = π_old, π_θ), skips π_old computation + +2. **Loss Function** (in bypass mode, controlled by `loss_type`) + + - **PPO-clip** (`loss_type="ppo_clip"`, default): PPO clipped objective (IS handled by ratio) + - **REINFORCE** (`loss_type="reinforce"`): Policy gradient with explicit IS weights (no clipping) + +3. **IS/RS Aggregation Level** + - **Token**: Per-token IS weights/rejection + - **Sequence**: Sequence-level IS weights/rejection + +See [Mathematical Formulations](rollout_corr_math.md#3-algorithmic-components-and-combinations) for detailed theory. + +--- + +## Preset Configuration Guide + +This section provides detailed guidance on choosing and using the verified presets. Each preset is a specific combination of components optimized for common scenarios. 
+ +### Understanding the Presets + +#### Available Preset Methods + +| Preset Method | Estimator | Mode | IS Level | RS Level | Properties | +| ------------------------------------------------------------------------------ | ---------------- | ------------------ | -------- | -------- | --------------------------------------- | +| **Decoupled PPO Mode** (3 policies: π*rollout, π_old, π*θ) | +| `decoupled_token_is()` | Token-TIS | Decoupled | token | - | Per-token IS weights | +| `decoupled_seq_is()` | Seq-TIS | Decoupled | sequence | - | Sequence-level IS weights | +| `decoupled_seq_is_rs()` | Seq-MIS | Decoupled | sequence | sequence | Sequence IS + sequence RS | +| `decoupled_geo_rs()` | Geo-RS | Decoupled | - | sequence | Geometric RS (ratio mode) | +| `decoupled_geo_rs_token_tis()` | Geo-RS-Token-TIS | Decoupled | token | sequence | Geometric filter + token clipped weight | +| **K3 KL Estimator** (more stable for small KL values) | +| `decoupled_k3_rs()` | K3-RS | Decoupled | - | k3 | K3 rejection, no IS weights | +| `decoupled_k3_rs_token_tis()` | K3-RS-Token-TIS | Decoupled | token | k3 | K3 filter + token clipped weight | +| **Bypass Mode (PPO-clip)** (2 policies; ratio handles IS, RS masks outliers) | +| `bypass_ppo_clip()` | - | Bypass (PPO-clip) | - | - | PPO-clip only | +| `bypass_ppo_clip_geo_rs()` | Geo-RS | Bypass (PPO-clip) | - | sequence | PPO-clip + Geo-RS (ratio) | +| `bypass_ppo_clip_k3_rs()` | K3-RS | Bypass (PPO-clip) | - | k3 | PPO-clip + K3-RS | +| **Bypass Mode (REINFORCE)** (2 policies; explicit IS weights, no PPO clipping) | +| `bypass_pg_is()` | Seq-TIS | Bypass (REINFORCE) | sequence | - | REINFORCE with explicit IS | +| `bypass_pg_geo_rs()` | Geo-RS | Bypass (REINFORCE) | - | sequence | REINFORCE with Geo-RS (ratio) | +| `bypass_pg_geo_rs_token_tis()` | Geo-RS-Token-TIS | Bypass (REINFORCE) | token | sequence | REINFORCE + Geo filter + token IS | +| **Other** | +| `disabled()` | - | - | - | - | Metrics only, no correction | + +**Note:** 
+ +- **Bypass mode** sets π_old = π_rollout and uses `loss_type` to select the loss function: + - `"ppo_clip"` (default): PPO clipped objective where ratio = π_θ/π_rollout already handles IS + - `"reinforce"`: REINFORCE with explicit IS weights as π_θ / π_rollout +- Both loss types benefit from rejection sampling (RS) which masks out-of-distribution samples. +- Estimators (Token-TIS, Seq-TIS, Seq-MIS, Geo-RS) are compatible with Decoupled and Bypass modes. + +#### Other Supported Combinations (Manual Configuration Required) + +**Other supported combinations without preset methods:** + +- Token IS + Token RS: Token-level IS weights + token-level RS mask +- Pure token RS: Token-level RS only, no IS weights +- Pure sequence RS: Sequence-level RS only, no IS weights + +See [detailed configuration examples below](#additional-useful-configurations-not-exposed-as-presets) for manual configurations. + +**Key properties:** + +- Any aggregation level (token/sequence/geometric) works in either decoupled or bypass mode +- All combinations are fully supported by the implementation +- Rejection sampling is independent of IS weighting +- Pure RS (`bypass_pg_rs`) uses bypass + geometric RS with `loss_type="reinforce"` (no IS weights) + +--- + +### 1. 
Decoupled Mode with Token-level Importance Sampling (`decoupled_token_is`) + +**Configuration:** + +```python +config = RolloutCorrectionConfig.decoupled_token_is(threshold=2.0) +``` + +**Components:** + +- **Operating Mode**: Decoupled (3 policies) +- **Loss**: PPO with clipping (only for the second drift correction) +- **IS Aggregation**: Token-level +- **RS**: None (can be added separately) + +**Equivalent YAML:** + +```yaml +algorithm: + rollout_correction: + rollout_is: token + rollout_is_threshold: 2.0 + rollout_rs: null + bypass_mode: false # Decoupled mode +``` + +**Properties:** + +- Independent truncation per token +- Lower variance than sequence-level (product of ratios bounded individually) +- Typical threshold: 1.5 - 5.0 + +**Theory:** See [rollout_corr_math.md §3.3.1](rollout_corr_math.md#331-token-level-aggregation) + +--- + +### 2. Decoupled Mode with Sequence-level Importance Sampling (`decoupled_seq_is`) + +**Also known as: Seq-TIS (Sequence-Level Truncated IS)** + +**Configuration:** + +```python +config = RolloutCorrectionConfig.decoupled_seq_is(threshold=2.0) +``` + +**Components:** + +- **Operating Mode**: Decoupled (3 policies) +- **Loss**: PPO with clipping (only for the second drift correction) +- **IS Aggregation**: Sequence-level (Seq-TIS) +- **RS**: None (can be added separately) + +**Equivalent YAML:** + +```yaml +algorithm: + rollout_correction: + rollout_is: sequence + rollout_is_threshold: 2.0 + rollout_rs: null + bypass_mode: false # Decoupled mode +``` + +**Properties:** + +- Multiplicative aggregation across sequence +- More sensitive to outliers than token-level +- Typical threshold: 2.0 - 10.0 (higher than token-level) + +**Theory:** See [rollout_corr_math.md §3.3.2](rollout_corr_math.md#332-sequence-level-aggregation) + +--- + +### 3. 
Decoupled Mode with Sequence-level IS + Rejection Sampling (`decoupled_seq_is_rs`) + +**Also known as: Seq-MIS (Sequence-Level Masked IS)** + +**Configuration:** + +```python +config = RolloutCorrectionConfig.decoupled_seq_is_rs(is_threshold=2.0, rs_threshold="0.5_2.0") +``` + +**Components:** + +- **Operating Mode**: Decoupled (3 policies) +- **Loss**: PPO with clipping (only for the second drift correction) +- **IS Aggregation**: Sequence-level (Seq-TIS) +- **RS**: Sequence-level rejection (Seq-MIS) + +**Equivalent YAML:** + +```yaml +algorithm: + rollout_correction: + rollout_is: sequence + rollout_is_threshold: 2.0 + rollout_rs: seq_sum_k1 + rollout_rs_threshold: 0.5_2.0 + bypass_mode: false # Decoupled mode +``` + +**Properties:** + +- Double mechanism: IS reweighting (Seq-TIS) + rejection filtering (Seq-MIS) +- Lower effective sample size (rejects outliers) +- For severe off-policy gaps or when the distribution tail is "toxic" (garbage/adversarial samples) + +**When to use Seq-MIS over Seq-TIS:** + +- **Seq-TIS (clipping only)**: Maximizes information efficiency; extracts signal from all samples. Use when data is clean and mismatch is moderate. +- **Seq-MIS (rejection)**: Maximizes safety; acts as a hard trust region filter. Use when mismatch is severe or when high-weight samples are likely garbage rather than signal. + +**Theory:** See [rollout_corr_math.md §3.4](rollout_corr_math.md#34-rejection-sampling-rs) + +--- + +### 6. 
Bypass Mode with PPO-clip (`bypass_ppo_clip`) + +**Configuration:** + +```python +config = RolloutCorrectionConfig.bypass_ppo_clip() +``` + +**Components:** + +- **Operating Mode**: Bypass (2 policies: π_rollout = π_old, π_θ) +- **Loss**: PPO-clip (IS handled by ratio, no explicit IS weights) +- **IS Aggregation**: None (PPO ratio handles it) +- **RS**: None + +**Equivalent YAML:** + +```yaml +algorithm: + rollout_correction: + rollout_is: null + rollout_rs: null + bypass_mode: true + loss_type: ppo_clip +``` + +**Properties:** + +- PPO clipped objective in bypass mode +- The PPO ratio = π_θ/π_rollout already handles IS (no explicit IS weights needed) +- Skips `actor.compute_log_prob()` forward pass (2 policies instead of 3) +- No rejection sampling - use `bypass_ppo_clip_geo_rs()` for RS + +**Configuration requirement:** + +- Set `actor_rollout_ref.rollout.calculate_log_probs: true` + +**Theory:** See [rollout_corr_math.md §3.1.2](rollout_corr_math.md#312-bypass-mode-two-policies) + +--- + +### 7. 
REINFORCE with IS (`bypass_pg_is`) + +**Configuration:** + +```python +config = RolloutCorrectionConfig.bypass_pg_is(threshold=2.0) +``` + +**Components:** + +- **Operating Mode**: Bypass (2 policies: π_rollout, π_θ) +- **Loss**: REINFORCE (policy gradient with explicit IS weights, no PPO clipping) +- **IS Aggregation**: Sequence-level +- **RS**: None + +**Equivalent YAML:** + +```yaml +algorithm: + rollout_correction: + rollout_is: sequence + rollout_is_threshold: 2.0 + rollout_rs: null + bypass_mode: true + loss_type: reinforce # REINFORCE with explicit IS weights +``` + +**Properties:** + +- REINFORCE loss with explicit IS weights (no PPO clipping) +- Single forward pass (skips old_log_prob computation) +- IS weights computed on-the-fly in loss function + +**Theory:** See [rollout_corr_math.md §3.2.2](rollout_corr_math.md#322-policy-gradient-loss-with-isrs-correction) + +--- + +## Additional Useful Configurations (Not Exposed as Presets) + +These configurations are **fully supported** but don't have convenience preset methods yet. + +### 1. Token IS + Token RS (`token_is_rs`) + +Token-level IS weights with token-level RS mask. + +**Python:** + +```python +config = RolloutCorrectionConfig( + rollout_is="token", + rollout_is_threshold=2.0, + rollout_rs="token_k1", + rollout_rs_threshold=2.0, +) +``` + +**Properties:** Per-token IS weights + per-token RS mask. + +### 2. Pure Token RS (`token_rs`) + +Token-level RS only, no IS weights. + +**Python:** + +```python +config = RolloutCorrectionConfig( + rollout_is=None, + rollout_rs="token_k1", + rollout_rs_threshold=2.0, +) +``` + +**Properties:** Token-level RS mask, no IS reweighting. + +### 3. Pure Sequence RS (`seq_rs`) + +Sequence-level RS only, no IS weights. + +**Python:** + +```python +config = RolloutCorrectionConfig( + rollout_is=None, + rollout_rs="seq_sum_k1", + rollout_rs_threshold="0.5_2.0", +) +``` + +**Properties:** Sequence-level RS mask, no IS reweighting. 
+ +--- + +### Summary: How IS Weights are Processed + +IS weights (`rollout_is_weights`) go through a fixed processing pipeline: + +**Stage 1: Safety Bound (Prevent Overflow)** + +- Token level: `exp(clamp(log_ratio, -20, 20))` per token → bounds each token to [2e-9, 5e8] +- Sequence level: `exp(clamp(sum(log_ratio), -20, 20))` → bounds product to [2e-9, 5e8], broadcast to all tokens + +**Stage 2: Truncation (Reduce Variance)** + +- `.clamp(max=rollout_is_threshold)` → caps weights at upper threshold (TIS: Truncated Importance Sampling) +- No lower truncation (preserves unbiasedness for small weights) + +**Stage 3: Padding Zeroing (Correct Aggregation)** + +- `weights * response_mask` → zeros out padding positions + +**Stage 4: Optional Batch Normalization** + +- If `rollout_is_batch_normalize=True`: Normalize weights to mean=1.0 within batch +- Applied after truncation to preserve truncation semantics + +**Rejection Sampling (Separate Mechanism)** + +Rejection sampling modifies `response_mask` (NOT weights) through `compute_rollout_rejection_mask()`: + +- Computes safety-bounded ratios independently +- Creates binary mask: tokens/sequences outside [lower_threshold, upper_threshold] → 0 (rejected) +- Modified mask used for loss aggregation (rejected samples excluded from training) + +## Operation Modes + +The framework provides **two operating modes** for computing π_old, which can be combined with different loss functions. 
+ +### Operating Modes and Configuration + +| Configuration | `bypass_mode` | `loss_type` | Operating Mode | Loss Function | Description | +| ---------------------- | ------------- | ---------------------- | -------------- | ------------- | ----------------------------------------------------------------- | +| **Decoupled** | `false` | N/A | Decoupled | PPO | Computes `old_log_prob` separately via `actor.compute_log_prob()` | +| **Bypass + PPO-clip** | `true` | `"ppo_clip"` (default) | Bypass | PPO-clip | PPO clipped objective (IS handled by ratio) | +| **Bypass + REINFORCE** | `true` | `"reinforce"` | Bypass | REINFORCE | Policy gradient with explicit IS weights (no PPO clipping) | + +### Operating Mode Details + +#### Decoupled Mode (Three Policies) + +**Policy setup:** + +- π_rollout: Behavior policy (data collection) +- π_old: Proximal policy (computed via `actor.compute_log_prob()` at start of training epoch) +- π_θ: Current policy (being updated) + +**Configuration:** `bypass_mode = false` + +**Properties:** + +- ✅ Achieves batch size invariance +- ✅ Separately corrects Drift 1 (rollout→old) and Drift 2 (old→current) +- ✅ Efficient stale data utilization +- ❌ Extra forward pass needed (`actor.compute_log_prob()`) + +**Theory:** See [rollout_corr_math.md §3.1.1](rollout_corr_math.md#311-decoupled-mode-three-policies) + +#### Bypass Mode (Two Policies) + +**Policy setup:** + +- π_rollout: Behavior policy (data collection) +- π_old = π_rollout: Proximal policy equals behavior policy +- π_θ: Current policy (being updated) + +**Configuration:** `bypass_mode = true` + +**Properties:** + +- ✅ Skips `actor.compute_log_prob()` call (faster) +- ✅ Handles off-policy correction via IS/RS (when using policy gradient with IS/RS) +- ✅ Uses two policies instead of three (π_rollout = π_old) +- ⚠️ Does not separate proximal policy from behavior policy (unlike decoupled mode) + +**Theory:** See [rollout_corr_math.md §3.1.2](rollout_corr_math.md#312-bypass-mode-two-policies) + 
+--- + +### IS/RS Aggregation Levels (Orthogonal to Operating Mode) + +The aggregation level can be chosen **independently** of the operating mode. Any aggregation level works in either decoupled or bypass mode. + +| `rollout_is` | `rollout_rs` | Behavior | +| ------------------------- | ------------------------------------------------------------------ | --------------------------------------------------------------------------------- | +| `null` | `null` | **Disabled**: No computation, no metrics, no rejection | +| `null` | `"token_k1"`, `"seq_sum_k1"`, `"seq_mean_k1"`, `"seq_max_k2"`, etc | **Rejection only**: Compute metrics, NO weight correction, YES rejection sampling | +| `"token"` or `"sequence"` | `null` | **IS weights only**: Weight correction enabled, NO rejection sampling | +| `"token"` or `"sequence"` | `"token_k1"`, `"seq_sum_k1"`, `"seq_mean_k1"`, `"seq_max_k2"`, etc | **Full correction**: Both weight correction and rejection sampling enabled | + +### Key Insights + +- ✅ Any IS/RS aggregation level (token/sequence/geometric) can be used in **either** decoupled or bypass mode +- ✅ You can use **rejection sampling alone** without IS weight correction (`rollout_is=null, rollout_rs="token_k1"`) +- ✅ You can use **IS weights alone** without outlier rejection (`rollout_is="token", rollout_rs=null`) +- ✅ You can use **both together** (`rollout_is="token", rollout_rs="token_k1"`) +- ✅ You can **monitor metrics only** without any correction by setting both to `null` but still providing rollout_log_probs + +**Theory:** See [rollout_corr_math.md §3.3](rollout_corr_math.md#33-isrs-aggregation-levels) for details on aggregation levels. + +### Example Workflow + +**Recommended: Bypass Mode** + +This workflow uses bypass mode for efficiency. + +1. 
**Start with metrics only** to understand the off-policy gap: + + ```yaml + algorithm: + rollout_correction: + rollout_is: null + rollout_rs: null + bypass_mode: true # Bypass mode (recommended) + loss_type: ppo_clip # Default: PPO clipped objective + ``` + + Monitor `rollout_corr/kl`, `rollout_corr/log_ppl_abs_diff`, `rollout_corr/chi2_token` to assess off-policy gap. + +2. **Enable rejection sampling** if you see high outlier fractions: + + ```yaml + algorithm: + rollout_correction: + rollout_is: null + rollout_rs: seq_sum_k1 # or "seq_mean_k1" (geometric) for higher sensitivity + rollout_rs_threshold: 2.0 + bypass_mode: true # Bypass mode + loss_type: ppo_clip # or "reinforce" for explicit IS weights + ``` + + This excludes outliers from training without modifying gradients. + +3. **Enable full IS correction** (with REINFORCE loss) once comfortable with metrics: + ```yaml + algorithm: + rollout_correction: + rollout_is: sequence # Recommended: unbiased, suitable for most cases + rollout_is_threshold: 2.0 + rollout_rs: seq_sum_k1 # or "seq_mean_k1" (geometric) for more aggressive filtering + rollout_rs_threshold: 2.0 + bypass_mode: true # Bypass mode + loss_type: reinforce # REINFORCE with explicit IS weights + ``` + +**Benefits of bypass mode:** + +- ✅ Skips expensive `actor.compute_log_prob()` forward pass (faster) +- ✅ `loss_type` controls the loss function: "ppo_clip" (default) or "reinforce" +- ✅ PPO-clip: IS handled by ratio (no explicit weights), RS mask applied +- ✅ REINFORCE: Explicit IS weights computed on-the-fly (π_θ / π_rollout) +- ✅ Both loss types work with all IS/RS combinations + +## Usage + +### Basic Setup + +```yaml +algorithm: + rollout_correction: + rollout_is: token # Enable IS weights at token level + rollout_is_threshold: 2.0 # Threshold for IS weights + rollout_rs: null # No rejection sampling + +actor_rollout_ref: + rollout: + calculate_log_probs: true # Required! +``` + +### Metrics + +All metrics are prefixed with `rollout_corr/` in logs. 
For example, `rollout_is_mean` appears as `rollout_corr/rollout_is_mean`. + +These metrics cover both: + +- **Diagnostic metrics**: KL divergence, perplexity differences (measuring off-policy gap) +- **Correction statistics**: IS weights, rejection rates (measuring correction applied) + +#### **Core IS Weight Metrics** + +- **`rollout_is_mean`**: Mean importance sampling weight across all valid tokens + + - Value close to 1.0 indicates minimal off-policy gap + +- **`rollout_is_std`**: Standard deviation of IS weights + + - Higher values indicate greater variance in IS weights + +- **`rollout_is_min`**: Minimum IS weight observed + + - Shows the most underweighted token/sequence + - For sequence/geometric: computed from unclamped log-space ratios (true minimum) + - For token: computed from safety-bounded weights + +- **`rollout_is_max`**: Maximum IS weight observed + - Shows the most overweighted token/sequence + - For sequence/geometric: computed from unclamped log-space ratios (true maximum before safety bound) + - For token: computed from safety-bounded weights (before threshold clamping) + - Compare with `rollout_is_threshold` to see truncation impact + +#### **Effective Sample Size** + +- **`rollout_is_eff_sample_size`**: Effective sample size after IS weighting + - **Formula**: `1 / mean(weights²)` where weights are normalized + - **Range**: 0.0 to 1.0 (as fraction of original batch) + - Lower values indicate weight concentration on fewer samples + +#### **Threshold Exceedance Metrics** + +- **`rollout_is_ratio_fraction_high`**: Fraction of weights exceeding upper threshold + + - Shows how often truncation/masking occurs on high end + - For sequence/geometric: computed from unclamped log-space ratios (true exceedance) + - For token: computed from safety-bounded weights (before threshold clamping) + +- **`rollout_is_ratio_fraction_low`**: Fraction of weights below lower threshold (1/upper_threshold) + - Diagnostic metric showing how many weights are below the 
reciprocal threshold + - For sequence/geometric: computed from unclamped log-space ratios (true exceedance) + - For token: computed from safety-bounded weights (before truncation) + +#### **Sequence-Level Metrics** (for sequence aggregation) + +- **`rollout_is_seq_mean`**: Mean IS weight at sequence level + + - Should match `rollout_is_mean` for sequence-level aggregation + +- **`rollout_is_seq_std`**: Standard deviation of sequence-level IS weights + +- **`rollout_is_seq_min`**: Minimum sequence-level IS weight + +- **`rollout_is_seq_max`**: Maximum sequence-level IS weight + +- **`rollout_is_seq_max_deviation`**: Maximum absolute deviation from 1.0 at sequence level + + - Shows worst-case sequence off-policy gap + +- **`rollout_is_seq_fraction_high`**: Fraction of sequences exceeding upper threshold + +- **`rollout_is_seq_fraction_low`**: Fraction of sequences below lower threshold + +#### **Rejection Sampling Metrics** (when `rollout_rs` is enabled) + +- **`rollout_rs_masked_fraction`**: Fraction of tokens rejected via rejection sampling + + - **Important**: Rejection sampling modifies `response_mask` (sets rejected tokens to 0) + - **Separate from IS weights**: IS weights are still truncated; rejection is an independent filtering step + - Only present when `rollout_rs` is enabled (token/sequence/geometric) + +- **`rollout_rs_seq_masked_fraction`**: Fraction of sequences with at least one rejected token + - Shows sequence-level impact of rejection sampling + - Token-level RS: sequence rejected if ANY token is outside [lower, upper] + - Sequence-level RS: entire sequence rejected or accepted based on sequence-level ratio + - Geometric RS: entire sequence rejected or accepted based on geometric mean + +#### **Off-Policy Diagnostic Metrics** (Training vs Rollout Policy) + +**Note on terminology:** These metrics use "training" to refer to the training reference policy and "rollout" to refer to π_rollout (the behavior policy used for data collection). 
+ +- **Decoupled mode**: "training" = π_old (computed at start of training epoch) +- **Bypass/Pure IS mode**: "training" = π_θ (current policy being trained) + +In bypass/pure IS mode, metrics measure the drift between π_θ and π_rollout directly. + +- **`training_ppl`**: Perplexity of training reference policy (π_old in decoupled mode, π_θ in bypass/pure IS mode) + + - **Formula**: `exp(-mean(log_probs))` + - Lower values indicate higher model confidence + +- **`rollout_ppl`**: Perplexity of rollout policy π_rollout (e.g., vLLM BF16) + +- **`ppl_ratio`**: Ratio of training PPL to rollout PPL + + - **Formula**: `exp(mean(log(training_ppl / rollout_ppl)))` + - **Meaning**: > 1.0 means training is less confident than rollout + +- **`training_log_ppl`**: Log perplexity of training policy + + - Useful for identifying trends (linear scale) + +- **`rollout_log_ppl`**: Log perplexity of rollout policy + +- **`log_ppl_diff`**: Mean difference in log perplexities + + - **Formula**: `mean(log_ppl_rollout - log_ppl_training)` + - Sign indicates which policy is more confident + +- **`log_ppl_abs_diff`**: Mean absolute log perplexity difference + + - Magnitude of off-policy gap regardless of direction + +- **`log_ppl_diff_max`**: Maximum log perplexity difference across sequences + + - Identifies worst-case sequence + +- **`log_ppl_diff_min`**: Minimum log perplexity difference across sequences + +- **`kl`**: KL divergence KL(π_rollout || π_training) + + - **Formula**: `mean(log_prob_rollout - log_prob_training)` + - **Note**: Can be negative (rollout is less confident) + +- **`k3_kl`**: K3 divergence (equals KL(π_rollout || π_training) in expectation) + + - **Formula**: `mean(exp(log_ratio) - log_ratio - 1)` + - More stable than direct KL (non-negative per token) + - Always >= 0 + +- **`chi2_token`**: Chi-squared divergence at token level + + - **Formula**: `mean(ratio²) - 1` where ratio = π_training/π_rollout + - Measures second moment of IS weight distribution + - Always 
non-negative + +- **`chi2_seq`**: Chi-squared divergence at sequence level + - **Formula**: `mean((∏_t ratio_t)²) - 1` + - Sequence-level second moment of IS weights + - More sensitive than token-level chi-squared + +#### **Example: Accessing Metrics in Code** + +```python +# Metrics are returned from compute_rollout_correction_and_rejection_mask +from verl.trainer.ppo.rollout_corr_helper import compute_rollout_correction_and_rejection_mask + +# Returns 3 values (weights, modified_response_mask, metrics) +weights_proto, modified_response_mask, metrics = compute_rollout_correction_and_rejection_mask( + old_log_prob=training_log_probs, # from training policy + rollout_log_prob=rollout_log_probs, # from rollout policy + response_mask=response_mask, + rollout_is="token", # Enable IS weights at token level + rollout_is_threshold=2.0, + rollout_rs="token_k1", + rollout_rs_threshold="0.5_2.0", +) + +# Extract IS weights (processed, zeroed at padding) +is_weights = weights_proto.batch["rollout_is_weights"] + +# IS weights processing (with IS enabled at token level): +# 1. Safety-bounded: exp(clamp(log_ratio, -20, 20)) per token +# 2. Truncated: .clamp(max=2.0) to cap extreme weights +# 3. Zeroed at padding positions +# Note: Truncation is ALWAYS applied to IS weights (TIS: Truncated Importance Sampling) + +# modified_response_mask has rejection applied (since rollout_rs="token_k1"): +# 1. 
RS rejection: tokens outside [0.5, 2.0] masked to 0 via response_mask +# Note: RS and IS are separate mechanisms - both can be enabled independently + +# All metrics have 'rollout_corr/' prefix +print(f"Mean IS weight: {metrics['rollout_corr/rollout_is_mean']:.3f}") +print(f"Effective sample size: {metrics['rollout_corr/rollout_is_eff_sample_size']:.3f}") +print(f"RS masked fraction: {metrics['rollout_corr/rollout_rs_masked_fraction']:.3f}") +print(f"KL divergence: {metrics['rollout_corr/kl']:.3f}") + +# Check IS weights for valid tokens (non-padding) +valid_weights = is_weights[response_mask.bool()] +print(f"\n✓ IS weights min (valid tokens): {valid_weights.min():.4f}") +print(f"✓ IS weights max (valid tokens): {valid_weights.max():.4f}") +print(f"✓ All valid IS weights > 0: {(valid_weights > 0).all()}") +print(f"✓ IS weights are capped at threshold: {(valid_weights <= 2.0).all()}") + +# Check rejection via response_mask +rejected_tokens = (response_mask == 1) & (modified_response_mask == 0) +print(f"\n✓ Rejected {rejected_tokens.sum()} tokens via response_mask") +print(f"✓ Rejection sampling modifies response_mask (separate from IS weight truncation)") +print(f"✓ IS weights are always truncated to [0, threshold] after safety bounding") + +# Check for warning conditions +if metrics['rollout_corr/rollout_is_mean'] < 0.5 or metrics['rollout_corr/rollout_is_mean'] > 2.0: + print("⚠️ Warning: Mean IS weight far from 1.0, significant off-policy gap detected") + +if metrics['rollout_corr/rollout_is_eff_sample_size'] < 0.3: + print("⚠️ Warning: Low effective sample size, high weight concentration") +``` + +#### **Example: Monitoring Metrics During Training** + +```python +# In your training loop +for epoch in range(num_epochs): + for batch_idx, batch in enumerate(dataloader): + # ... rollout phase ... 
+ + # Compute IS weights and get metrics + rollout_corr_config = config.algorithm.get("rollout_correction", None) + if rollout_corr_config is not None: + weights_proto, modified_response_mask, metrics = compute_rollout_correction_and_rejection_mask( + old_log_prob=batch.old_log_prob, + rollout_log_prob=batch.rollout_log_prob, + response_mask=batch.response_mask, + rollout_is=rollout_corr_config.get("rollout_is", None), + rollout_is_threshold=rollout_corr_config.get("rollout_is_threshold", 2.0), + rollout_rs=rollout_corr_config.get("rollout_rs", None), + rollout_rs_threshold=rollout_corr_config.get("rollout_rs_threshold", None), + ) + + # Log to tensorboard/wandb + for metric_name, metric_value in metrics.items(): + logger.log_scalar(metric_name, metric_value, step=global_step) + + # IMPORTANT: Update batch response_mask with rejection applied + batch.response_mask = modified_response_mask + + # Use IS weights in training (always safety-bounded, zeroed at padding) + is_weights = weights_proto.batch["rollout_is_weights"] + # ... apply weights to policy gradient ... 
+``` + +#### **Example: Conditional Alerting Based on Metrics** + +```python +def check_rollout_correction_health(metrics, config): + """Check if Rollout Correction metrics indicate healthy training.""" + warnings = [] + + # Check mean IS weight + mean_weight = metrics['rollout_corr/rollout_is_mean'] + if mean_weight < 0.5 or mean_weight > 2.0: + warnings.append(f"Mean IS weight {mean_weight:.3f} is far from 1.0") + + # Check effective sample size + ess = metrics['rollout_corr/rollout_is_eff_sample_size'] + if ess < 0.3: + warnings.append(f"Effective sample size {ess:.3f} is too low") + + # Check standard deviation + std = metrics['rollout_corr/rollout_is_std'] + if std > 1.0: + warnings.append(f"IS weight std {std:.3f} is too high") + + # Check KL divergence + kl = metrics['rollout_corr/kl'] + if abs(kl) > 0.1: + warnings.append(f"KL divergence {kl:.3f} indicates significant off-policy gap") + + # Check chi-squared divergence + if 'rollout_corr/chi2_token' in metrics: + chi2_token = metrics['rollout_corr/chi2_token'] + if chi2_token > 1.0: + warnings.append(f"Chi-squared divergence (token) {chi2_token:.3f} indicates severe distribution shift") + + if warnings: + print("⚠️ Rollout Correction Health Warnings:") + for warning in warnings: + print(f" - {warning}") + return False + else: + print("✅ Rollout Correction metrics look healthy") + return True + +# Use in training +_, _, metrics = compute_rollout_correction_and_rejection_mask(...) 
+is_healthy = check_rollout_correction_health(metrics, config)
+
+if not is_healthy:
+    # Consider adjusting config or investigating issues
+    print("Consider:")
+    print("  - Tightening rollout_is_threshold")
+    print("  - Switching to geometric aggregation level")
+    print("  - Checking if rollout and training policies are too different")
+```
+
+### Running Examples
+
+Start with the basic token-level truncate configuration:
+
+```bash
+bash examples/rollout_correction/run_with_rollout_corr.sh
+```
+
+Monitor metrics for 1-2 epochs before adjusting parameters.
+
+## Configuration Examples
+
+### Example 1: IS Weights Only (Token Level)
+
+```yaml
+algorithm:
+  rollout_correction:
+    rollout_is: token
+    rollout_is_threshold: 2.0
+    rollout_rs: null  # No rejection sampling
+```
+
+### Example 2: Rejection Sampling Only (No IS Weights)
+
+```yaml
+algorithm:
+  rollout_correction:
+    rollout_is: null  # No IS weights
+    rollout_rs: token_k1
+    rollout_rs_threshold: "0.5_2.0"
+```
+
+### Example 3: Both IS and RS (Token RS)
+
+```yaml
+algorithm:
+  rollout_correction:
+    rollout_is: token
+    rollout_is_threshold: 2.0
+    rollout_rs: token_k1
+    rollout_rs_threshold: "0.5_2.0"
+```
+
+### Example 4: Bypass Mode with PPO-clip (Default)
+
+```yaml
+algorithm:
+  rollout_correction:
+    rollout_is: token
+    rollout_is_threshold: 2.0
+    rollout_rs: token_k1
+    rollout_rs_threshold: "0.5_2.0"
+    bypass_mode: true  # Skip old_log_prob computation
+    loss_type: ppo_clip  # PPO clipped objective (default)
+```
+
+**Skips expensive `actor.compute_log_prob()` forward pass. PPO ratio = π_θ/π_rollout handles IS.**
+
+### Example 5: Bypass Mode with REINFORCE
+
+```yaml
+algorithm:
+  rollout_correction:
+    rollout_is: sequence  # Explicit IS correction in loss
+    rollout_is_threshold: 2.0
+    rollout_rs: null  # Optional: can add rejection sampling
+    bypass_mode: true
+    loss_type: reinforce  # REINFORCE with explicit IS weights
+```
+
+**No PPO clipping, pure policy gradient with IS correction.**
+
+### Example 6: Bypass Mode with PPO-clip + Rejection Sampling
+
+```yaml
+algorithm:
+  rollout_correction:
+    rollout_is: sequence  # Computed for metrics
+    rollout_is_threshold: 2.0
+    rollout_rs: seq_max_k2  # Sequence max χ²/2 guard
+    rollout_rs_threshold: 2.5
+    bypass_mode: true
+    loss_type: ppo_clip  # PPO clipped objective (IS handled by ratio)
+```
+
+**PPO clipping with rejection sampling. IS handled by PPO ratio (no explicit IS weights).**
+
+## Troubleshooting
+
+### Issue: High spread in IS weights
+
+**Symptoms:** `rollout_is_std` > 1.0, `rollout_is_eff_sample_size` < 0.3
+
+**Solutions:**
+
+1. Switch from `sequence` to `geometric` level
+2. Tighten thresholds
+3. Verify rollout and training aren't too different
+
+### Issue: Mean IS weight far from 1.0
+
+**Symptoms:** `rollout_is_mean` < 0.5 or > 2.0
+
+**Solutions:**
+
+1. Verify `calculate_log_probs=True` is set
+2. Check rollout_log_probs are correctly passed
+3. 
Check for systematic distribution shift + +### Debugging: Visualizing Metrics + +**Example: Plot IS weight distribution** + +```python +import matplotlib.pyplot as plt +import numpy as np + +def plot_is_metrics(metrics_history): + """Plot rollout IS metrics over training steps.""" + fig, axes = plt.subplots(2, 3, figsize=(15, 10)) + + # Plot 1: Mean IS weight over time + axes[0, 0].plot(metrics_history['rollout_corr/rollout_is_mean']) + axes[0, 0].axhline(y=1.0, color='r', linestyle='--', label='Ideal') + axes[0, 0].set_title('Mean IS Weight') + axes[0, 0].set_xlabel('Step') + axes[0, 0].legend() + + # Plot 2: Effective sample size + axes[0, 1].plot(metrics_history['rollout_corr/rollout_is_eff_sample_size']) + axes[0, 1].axhline(y=0.5, color='g', linestyle='--', label='Good') + axes[0, 1].axhline(y=0.3, color='r', linestyle='--', label='Warning') + axes[0, 1].set_title('Effective Sample Size') + axes[0, 1].set_xlabel('Step') + axes[0, 1].legend() + + # Plot 3: KL divergence over time + axes[1, 0].plot(metrics_history['rollout_corr/kl'], label='KL') + axes[1, 0].plot(metrics_history['rollout_corr/k3_kl'], label='K3 KL') + axes[1, 0].axhline(y=0, color='g', linestyle='--', alpha=0.3) + axes[1, 0].set_title('KL Divergence') + axes[1, 0].set_xlabel('Step') + axes[1, 0].legend() + + # Plot 4: PPL ratio over time + axes[1, 1].plot(metrics_history['rollout_corr/ppl_ratio']) + axes[1, 1].axhline(y=1.0, color='r', linestyle='--', label='Ideal') + axes[1, 1].set_title('PPL Ratio (Training/Rollout)') + axes[1, 1].set_xlabel('Step') + axes[1, 1].legend() + + # Plot 5: Chi-squared divergence + if 'rollout_corr/chi2_token' in metrics_history: + axes[1, 2].plot(metrics_history['rollout_corr/chi2_token'], label='Token-level') + if 'rollout_corr/chi2_seq' in metrics_history: + axes[1, 2].plot(metrics_history['rollout_corr/chi2_seq'], label='Seq-level') + axes[1, 2].axhline(y=1.0, color='r', linestyle='--', label='Warning') + axes[1, 2].set_title('Chi-squared Divergence') + axes[1, 
2].set_xlabel('Step') + axes[1, 2].legend() + else: + axes[1, 2].axis('off') + + plt.tight_layout() + plt.savefig('rollout_is_metrics.png', dpi=150) + print("Saved plot to rollout_is_metrics.png") +``` + +**Example: Metric collection during training** + +```python +# Collect metrics over time +metrics_history = { + 'rollout_corr/rollout_is_mean': [], + 'rollout_corr/rollout_is_eff_sample_size': [], + 'rollout_corr/kl': [], + 'rollout_corr/k3_kl': [], + 'rollout_corr/ppl_ratio': [], + 'rollout_corr/chi2_token': [], + 'rollout_corr/chi2_seq': [], +} + +# In training loop +for step in range(num_steps): + # ... compute IS weights and rejection mask ... + _, _, metrics = compute_rollout_correction_and_rejection_mask(...) + + # Store metrics + for key in metrics_history.keys(): + if key in metrics: + metrics_history[key].append(metrics[key]) + + # Plot every 100 steps + if step % 100 == 0: + plot_is_metrics(metrics_history) +``` + +## Performance Impact + +- **Memory overhead**: ~1% of model memory +- **Computational overhead**: 1-3% depending on level +- **Training stability**: Significantly improved when off-policy gap exists + +## Testing + +Run the test suite to verify everything works: + +```bash +# Basic unit tests +python test_rollout_corr.py + +# Integration tests (if pytest is available) +pytest tests/trainer/ppo/test_rollout_corr_integration.py -v +``` + +Expected output: All tests pass ✓ + +## Additional Resources + +- **Implementation**: `verl/trainer/ppo/rollout_corr_helper.py` +- **Examples**: `examples/rollout_correction/` +- **DAPO Example**: `recipe/dapo/run_dapo_qwen2.5_32b_rollout_corr.sh` + +## Summary + +Rollout Correction provides a unified framework for handling general off-policy problems in RL: + +- ✅ Corrects ANY distribution shift between data collection and training +- ✅ Supports diverse scenarios: policy mismatch, staleness, replay buffers, off-policy algorithms +- ✅ Numerical stability with safety bounds and rejection mechanisms +- ✅ 
Comprehensive diagnostics: KL, perplexity, χ² divergence +- ✅ Flexible methods from token-level to sequence-level aggregation +- ✅ Memory-efficient implementation + +## References + +- **[Mathematical Formulations](rollout_corr_math.md)** - Detailed mathematical theory and derivations for all rollout correction methods +- [When Speed Kills Stability: Demystifying RL Collapse from the Training-Inference Mismatch](https://richardli.xyz/rl-collapse) (see Blog Series above for parts 1-3) +- [Your Efficient RL Framework Secretly Brings You Off-Policy RL Training](https://fengyao.notion.site/off-policy-rl) diff --git a/code/RL_model/verl/verl_train/docs/algo/rollout_corr_math.md b/code/RL_model/verl/verl_train/docs/algo/rollout_corr_math.md new file mode 100644 index 0000000000000000000000000000000000000000..b0b0c13a29c072c179f89e23d2539cc06a8a52b1 --- /dev/null +++ b/code/RL_model/verl/verl_train/docs/algo/rollout_corr_math.md @@ -0,0 +1,954 @@ +# Mathematical Formulations of Rollout Correction Methods in `verl` + +**Author:** [Yingru Li](https://richardli.xyz) +**Last updated:** 2025-11-04 + +--- + +> **📖 Documentation Structure** +> - **This document** - Mathematical theory: formulations, derivations, and algorithmic foundations +> - **[Rollout Correction Usage Guide](rollout_corr.md)** - Practical implementation: configurations, presets, troubleshooting +> +> Start here for theory and design rationale, refer to the usage guide for implementation. 
+ +--- + +### BibTeX Citation + +```bibtex +@online{liu-li-2025-rl-collapse, + title = {When Speed Kills Stability: Demystifying {RL} Collapse from the Training-Inference Mismatch}, + author = {Liu, Jiacai and Li, Yingru and Fu, Yuqian and Wang, Jiawei and Liu, Qian and Shen, Yu}, + year = {2025}, + month = sep, + url = {https://richardli.xyz/rl-collapse} +} +``` + +### Blog Series + +- Main blog post: https://richardli.xyz/rl-collapse +- [Part 1: Why Mismatch Breaks LLM-RL](https://richardli.xyz/rl-collapse-1) (analytical framework using TV distance for bias and χ²-divergence for variance) +- [Part 2: The Gradient Estimator Trials](https://richardli.xyz/rl-collapse-2) (token-level vs sequence-level correction bias-variance tradeoff) +- [Part 3: When Math Meets Reality—Toxic Tails and Length Traps](https://richardli.xyz/rl-collapse-3) (why rejection over clipping, and geometric-level RS) + +## Abstract + +This document provides the definitive mathematical formulations for rollout correction methods in `verl`, following the natural progression from **REINFORCE** to **PPO** to **Decoupled PPO**. + +Rollout correction provides a unified framework to handle **general off-policy problems** in RL training - any scenario where the data collection distribution differs from the training distribution. + +**Applicable scenarios include:** +- **Policy mismatch**: Different precision (FP8 vs FP16 vs BF16 vs FP32), different backends (vLLM vs SGLang vs FSDP vs Megatron) +- **Temporal lag**: Model staleness, asynchronous rollout workers +- **Replay buffers**: Training on historical trajectories from earlier policy versions +- **Off-policy algorithms**: Behavioral cloning, DAPO, expert demonstrations +- **Data filtering**: Reweighting, preference learning, curriculum learning + +--- + +## Table of Contents + +1. [Theoretical Foundation: From REINFORCE to Decoupled PPO](#1-theoretical-foundation-from-reinforce-to-decoupled-ppo) +2. 
[Implementation in verl: The Three-Policy Framework](#2-implementation-in-verl-the-three-policy-framework) +3. [Algorithmic Components and Combinations](#3-algorithmic-components-and-combinations) +4. [Off-Policy Diagnostic Metrics](#4-off-policy-diagnostic-metrics) +5. [Summary and Decision Guide](#5-summary-and-decision-guide) +6. [Implementation References](#6-implementation-references) + +--- + +## 1. Theoretical Foundation: From REINFORCE to Decoupled PPO + +This section establishes the theoretical progression that `verl` implements. + +### 1.1 REINFORCE: Policy Gradient Baseline + +The REINFORCE algorithm ([Williams, 1992](https://doi.org/10.1007/BF00992696)) is the foundation of policy gradient methods. + +**Vanilla REINFORCE (On-Policy)** + +For trajectories $\tau = (s_0, a_0, s_1, a_1, \ldots, s_T, a_T)$ sampled from the current policy $\pi_\theta$, the policy gradient is: + +$$ +\nabla_\theta J(\theta) = \mathbb{E}_{\tau \sim \pi_\theta} \left[ \sum_{t=0}^T \nabla_\theta \log \pi_\theta(a_t|s_t) \cdot A_t \right] +$$ + +where $A_t$ is the advantage function at timestep $t$. + +**Off-Policy REINFORCE** + +When trajectories are sampled from a different behavior policy $\mu$, we apply importance sampling over the **joint trajectory distribution**: + +$$ +\nabla_\theta J(\theta) = \mathbb{E}_{\tau \sim \mu} \left[ \frac{P_{\pi_\theta}(\tau)}{P_\mu(\tau)} \sum_{t=0}^T \nabla_\theta \log \pi_\theta(a_t|s_t) \cdot A_t \right] +$$ + +where the trajectory-level importance weight is: + +$$ +\frac{P_{\pi_\theta}(\tau)}{P_\mu(\tau)} = \frac{p(s_0) \prod_{t=0}^T \pi_\theta(a_t|s_t) p(s_{t+1}|s_t, a_t)}{p(s_0) \prod_{t=0}^T \mu(a_t|s_t) p(s_{t+1}|s_t, a_t)} = \prod_{t=0}^T \frac{\pi_\theta(a_t|s_t)}{\mu(a_t|s_t)} +$$ + +The transition dynamics $p(s_{t+1}|s_t, a_t)$ and initial state $p(s_0)$ cancel out, leaving only the product of per-step action probability ratios. 
+ +**Key properties:** +- **Off-policy capable**: Can learn from any behavior policy via importance sampling +- **No trust region**: Policy updates not constrained + +**Implementation in verl:** The `bypass_pg_is` preset implements off-policy REINFORCE with truncated importance sampling. + +### 1.2 PPO: Adding Trust Region Control + +Proximal Policy Optimization ([Schulman et al., 2017](https://arxiv.org/abs/1707.06347)) adds a clipped surrogate objective: + +$$ +L_{\text{PPO}}(\theta) = -\mathbb{E}_{(s,a) \sim \mu} \left[ \min\left( r_t(\theta) A_t, \text{clip}(r_t(\theta), 1-\epsilon, 1+\epsilon) A_t \right) \right] +$$ + +where $r_t(\theta) = \frac{\pi_\theta(a_t|s_t)}{\mu(a_t|s_t)}$ and $\epsilon$ is the clip range (typically 0.2). + +**Key properties:** +- **Two policies**: $\mu$ (reference for clipping) and $\pi_\theta$ (being updated) +- **Trust region via clipping**: Limits policy update magnitude via ratio $r_t(\theta) = \frac{\pi_\theta}{\mu}$ + +### 1.3 Decoupled PPO: Achieving Batch Size Invariance + +Decoupled PPO ([Hilton et al., 2021](https://arxiv.org/abs/2110.00641)) solves PPO's batch size sensitivity by **decoupling two roles**: +1. **Proximal policy** $\pi_{\text{prox}}$: The anchor policy for PPO clipping (controls policy update size) +2. **Behavior policy** $\mu$: The policy that collected the data (for off-policy correction via importance sampling) + +**The problem**: Standard PPO controls policy update size via the ratio $\frac{\pi_\theta}{\pi_{\text{old}}}$, where $\pi_{\text{old}}$ is assumed to be both the proximal policy *and* the behavior policy. This coupling makes the algorithm sensitive to batch size because aggregating data from multiple workers or using replay buffers changes the effective behavior policy. 
+ +**The solution**: Decouple these two roles, leading to a **three-policy formulation**: + +$$ +L_{\text{DecoupledPPO}}(\theta) = -\mathbb{E}_{(s,a) \sim \mu} \left[ w_t \cdot \min\left( r_t(\theta) A_t, \text{clip}(r_t(\theta), 1-\epsilon, 1+\epsilon) A_t \right) \right] +$$ + +where: +- $w_t = \frac{\pi_{\text{prox}}(a_t|s_t)}{\mu(a_t|s_t)}$: Importance sampling weight (corrects for behavior policy $\mu$). Here $\pi_{\text{prox}}$ is frozen during training, so $w_t$ is constant (no stopgrad operator needed). +- $r_t(\theta) = \frac{\pi_\theta(a_t|s_t)}{\pi_{\text{prox}}(a_t|s_t)}$: PPO ratio (controls policy update size against proximal policy $\pi_{\text{prox}}$) + +**Key properties**: By decoupling: +- **Batch size invariance**: Policy update control (via $\pi_{\text{prox}}$) is independent of data aggregation +- **Flexible behavior policy**: Any $\mu$ can be used (different workers, replay buffers, or stale checkpoints) +- **Stale data utilization**: Older trajectories can be corrected via importance sampling +- **Clipping preserved**: Clipping against $\pi_{\text{prox}}$ limits update magnitude + +**This is the algorithm that `verl` implements via its three-policy framework.** + +--- + +## 2. Implementation in verl: The Three-Policy Framework + +The `verl` library implements decoupled PPO using three distinct policies, each serving a specific role. + +### 2.1 Policy Roles and Notation + +**$\pi_{\text{rollout}}$ (Behavior Policy $\mu$)** +The policy used for data collection. This is the behavior distribution $\mu$ from theory. 
+ +- **When created**: During rollout/data collection phase +- **Purpose**: Generate trajectories for training +- **Common sources**: + - Policy mismatch: Same weights, different implementation (precision, backend) + - Temporal lag: Stale checkpoint from async workers + - Replay buffer: Historical data from earlier iterations + - Off-policy algorithms: Expert demonstrations, auxiliary policies (DAPO) + - Data filtering: Reweighted or filtered data +- **Fixed**: Frozen during training on a batch + +**$\pi_{\text{old}}$ (Proximal Policy $\pi_{\text{prox}}$)** +The reference policy for PPO clipping. This is the "proximal policy" from decoupled PPO theory. + +- **When created**: + - **Decoupled mode**: Computed at start of training epoch via `actor.compute_log_prob()` + - **Bypass mode**: Set equal to $\pi_{\text{rollout}}$ (skips separate computation) +- **Purpose**: + - Anchor point for PPO clipping (controls policy update size) + - When separate from $\pi_{\text{rollout}}$: Enables batch size invariance and efficient use of stale data +- **Fixed**: Frozen during all PPO update epochs on the same batch + +**$\pi_{\theta}$ (Current Policy)** +The policy being actively optimized during training. 
+ +- **Updated**: Every gradient step +- **Purpose**: The policy we're improving + +### 2.2 Operating Modes + +The three-policy framework can operate in two modes: + +**Decoupled Mode (Three Policies)** +- Computes $\pi_{\text{old}}$ separately at the start of each training epoch +- **Algorithm**: Full decoupled PPO with three policies (mathematically correct) +- **Properties**: Achieves batch size invariance; separately corrects Drift 1 (rollout→old) and Drift 2 (old→current) + +**Bypass Mode (Two Policies)** +- Sets $\pi_{\text{old}} = \pi_{\text{rollout}}$ (skips separate computation) +- **Algorithm**: Uses $\pi_{\text{rollout}}$ as both behavior policy and proximal policy (mathematically correct) +- **Key difference**: Proximal policy equals behavior policy, so no IS correction needed between them +- **Properties**: Faster (skips `actor.compute_log_prob()` call); does not achieve batch size invariance + +### 2.3 Two Distribution Shifts + +The three-policy framework handles two types of distribution drift: + +**Drift 1: $\pi_{\text{rollout}} \to \pi_{\text{old}}$ (Off-Policy Gap)** + +This is the distribution shift between the data collection policy and the training reference policy. + +- **Nature**: Ranges from negligible (same checkpoint, minor differences) to severe (replay buffers, expert data) +- **Correction**: Importance sampling weight $w_t = \frac{\pi_{\text{old}}(a_t|s_t)}{\pi_{\text{rollout}}(a_t|s_t)}$ +- **Optional**: Can be ignored (bypass mode) when negligible + +**Drift 2: $\pi_{\text{old}} \to \pi_{\theta}$ (Policy Update Drift)** + +This is the drift from policy parameter updates during training. 
+ +- **Nature**: Occurs as $\pi_\theta$ is updated via gradient descent +- **Correction**: PPO clipping on ratio $r_t(\theta) = \frac{\pi_\theta(a_t|s_t)}{\pi_{\text{old}}(a_t|s_t)}$ +- **Universal**: Applies to both on-policy and off-policy training + +### 2.4 Notation Summary + +- $\pi_{\text{rollout}}$: Behavior policy (data collection) +- $\pi_{\text{old}}$: Proximal policy (PPO anchor) +- $\pi_{\theta}$: Current policy (being updated) +- $\rho_t = \frac{\pi_{\text{old}}(a_t|s_t)}{\pi_{\text{rollout}}(a_t|s_t)}$: Per-token IS ratio (corrects Drift 1) +- $r_t(\theta) = \frac{\pi_{\theta}(a_t|s_t)}{\pi_{\text{old}}(a_t|s_t)}$: PPO ratio (corrects Drift 2) +- $A_t$: Advantage at token $t$ +- $T$: Set of valid tokens in a sequence +- $C_{\text{IS}}$: Upper threshold for IS weights (e.g., 2.0) +- $C_{\text{RS-upper}}$: Upper threshold for RS mask (e.g., 2.0) +- $C_{\text{RS-lower}}$: Lower threshold for RS mask (typically $1/C_{\text{RS-upper}}$) +- $\epsilon$: PPO clip range (typically 0.2) + +--- + +## 3. Algorithmic Components and Combinations + +The rollout correction framework in `verl` is built from **orthogonal components** that can be combined flexibly: + +1. **Operating Mode**: How $\pi_{\text{old}}$ is computed (Decoupled vs Bypass) +2. **Loss Function**: PPO (with clipping) vs Pure IS (policy gradient only) +3. **IS/RS Aggregation Level**: Token, Sequence, or Geometric + +This section explains each component and their valid combinations. + +### 3.1 Operating Modes: Decoupled vs Bypass + +The operating mode determines how the proximal policy $\pi_{\text{old}}$ is computed. 
+ +#### 3.1.1 Decoupled Mode (Three Policies) + +**Configuration:** `bypass_mode = false` + +**Policy setup:** +- $\pi_{\text{rollout}}$: Behavior policy (data collection) +- $\pi_{\text{old}}$: Proximal policy (computed via `actor.compute_log_prob()` at start of training epoch) +- $\pi_{\theta}$: Current policy (being updated) + +**IS ratio:** $\rho_t = \frac{\pi_{\text{old}}(a_t|s_t)}{\pi_{\text{rollout}}(a_t|s_t)}$ (corrects Drift 1: rollout→old) + +**PPO ratio:** $r_t(\theta) = \frac{\pi_{\theta}(a_t|s_t)}{\pi_{\text{old}}(a_t|s_t)}$ (corrects Drift 2: old→current) + +**Properties:** +- ✅ Achieves batch size invariance +- ✅ Separately corrects two distribution drifts +- ✅ Efficient stale data utilization +- ❌ Extra forward pass needed (`actor.compute_log_prob()`) + +#### 3.1.2 Bypass Mode (Two Policies) + +**Configuration:** `bypass_mode = true` + +**Policy setup:** +- $\pi_{\text{rollout}}$: Behavior policy (data collection) +- $\pi_{\text{old}} = \pi_{\text{rollout}}$: Proximal policy equals behavior policy +- $\pi_{\theta}$: Current policy (being updated) + +**Ratios:** +- **With PPO-clip loss** (`loss_type = "ppo_clip"`, default): PPO ratio $r_t(\theta) = \frac{\pi_{\theta}(a_t|s_t)}{\pi_{\text{rollout}}(a_t|s_t)}$ clips against rollout policy (IS handled by ratio) +- **With REINFORCE loss** (`loss_type = "reinforce"`): IS ratio $\rho_t = \frac{\pi_{\theta}(a_t|s_t)}{\pi_{\text{rollout}}(a_t|s_t)}$ computed on-the-fly in loss function + +**Properties:** +- ✅ Skips `actor.compute_log_prob()` call (faster) +- ✅ Handles off-policy correction via IS/RS (when using policy gradient with IS/RS) +- ✅ Uses two policies instead of three (π_rollout = π_old) +- ⚠️ Does not separate proximal policy from behavior policy (unlike decoupled mode) + +--- + +### 3.2 Loss Functions: PPO vs Policy Gradient + +#### 3.2.1 PPO Loss (with Clipping) + +**Configuration:** `loss_type = "ppo_clip"` (default in bypass mode) + +**Loss function:** + +$$ +L_{\text{PPO}}(\theta) = 
-\mathbb{E}_t \left[ w_t \cdot \min\left( r_t(\theta) A_t, \text{clip}(r_t(\theta), 1-\epsilon, 1+\epsilon) A_t \right) \right] +$$ + +where: +- $w_t$: IS weight (depends on aggregation level, see Section 3.3). In decoupled mode, $w_t = \frac{\pi_{\text{old}}}{\pi_{\text{rollout}}}$ where $\pi_{\text{old}}$ is frozen, so $w_t$ is constant (no stopgrad needed). In bypass mode with PPO loss, no separate IS weights are typically computed. +- $r_t(\theta) = \frac{\pi_{\theta}(a_t|s_t)}{\pi_{\text{old}}(a_t|s_t)}$: PPO ratio +- $\epsilon$: Clip range (typically 0.2) + +**Properties:** +- Trust region control via clipping +- Limits policy update magnitude +- Standard in RL training + +#### 3.2.2 Policy Gradient Loss (with IS/RS Correction) + +**Configuration:** `loss_type = "reinforce"` (requires `bypass_mode = true`) + +**Loss function** (example with sequence-level IS): + +$$ +L_{\text{PG}}(\theta) = -\mathbb{E}_{(s,a) \sim \pi_{\text{rollout}}} \left[ \text{stopgrad}(w_{\text{seq}}(\theta)) \cdot \sum_{t \in T} \log \pi_{\theta}(a_t|s_t) \cdot A_t \right] +$$ + +where: +- $w_{\text{seq}}(\theta)$: Sample weight (IS or RS, see §3.3-3.4 for details) +- For IS: $w_{\text{seq}}(\theta) = \min\left( \prod_{t \in T} \frac{\pi_{\theta}(a_t|s_t)}{\pi_{\text{rollout}}(a_t|s_t)}, C_{\text{IS}} \right)$ +- For RS: $w_{\text{seq}}(\theta) \in \{0, 1\}$ (binary rejection mask) +- **stopgrad operator**: The weight $w_{\text{seq}}(\theta)$ is computed using $\pi_\theta$ but treated as a **constant coefficient** when computing $\nabla_\theta L$. This is essential for importance sampling correctness (see theoretical justification below). 
+ +**Effective gradient:** + +$$ +\nabla_\theta L_{\text{PG}} = -\mathbb{E}_{(s,a) \sim \pi_{\text{rollout}}} \left[ \text{stopgrad}(w_{\text{seq}}(\theta)) \cdot \sum_{t \in T} \nabla_\theta \log \pi_{\theta}(a_t|s_t) \cdot A_t \right] +$$ + +**Theoretical Justification for stopgrad:** + +The stopgrad operator is **mathematically required** by importance sampling theory, not an implementation detail. Here's why: + +**The fundamental principle**: Importance sampling is a technique to **change the measure** (reweight samples from one distribution to estimate expectations under another), not to optimize the reweighting function itself. + +**Formal derivation**: + +1. **Original objective**: We want to optimize $J(\theta) = \mathbb{E}_{\tau \sim \pi_\theta}[\sum_t A_t]$. + +2. **Off-policy setting**: We only have samples from $\pi_{\text{rollout}}$, so we use importance sampling: + $$ + J(\theta) = \mathbb{E}_{\tau \sim \pi_{\text{rollout}}} \left[ \underbrace{\frac{P_{\pi_\theta}(\tau)}{P_{\pi_{\text{rollout}}}(\tau)}}_{w(\tau;\theta)} \sum_t A_t \right] + $$ + +3. **Computing the policy gradient**: The correct gradient uses the **policy gradient theorem BEFORE importance sampling**: + $$ + \begin{aligned} + \nabla_\theta J(\theta) &= \nabla_\theta \mathbb{E}_{\tau \sim \pi_\theta}\left[\sum_t A_t\right] \\ + &= \mathbb{E}_{\tau \sim \pi_\theta} \left[\sum_t A_t \nabla_\theta \log \pi_\theta(a_t|s_t) \right] \quad \text{(policy gradient theorem)} \\ + &= \mathbb{E}_{\tau \sim \pi_{\text{rollout}}} \left[ w(\tau;\theta) \sum_t A_t \nabla_\theta \log \pi_\theta(a_t|s_t) \right] \quad \text{(change of measure)} + \end{aligned} + $$ + + In the final line, $w(\tau;\theta)$ appears as a **multiplicative coefficient** from the change of measure, not as something we differentiate. + +4. 
**What goes wrong without stopgrad**: If we naively compute $\nabla_\theta \left[w(\theta) \log \pi_\theta \right]$ in the loss, we get: + $$ + \nabla_\theta \left[w(\theta) \log \pi_\theta \right] = \underbrace{\log \pi_\theta \cdot \nabla_\theta w(\theta)}_{\text{WRONG: bias term}} + \underbrace{w(\theta) \cdot \nabla_\theta \log \pi_\theta}_{\text{CORRECT: IS-weighted gradient}} + $$ + + The first term $\log \pi_\theta \cdot \nabla_\theta w(\theta)$ is an artifact of the computational trick (using loss times log-prob), not part of the true policy gradient. It biases the gradient estimator and optimizes a different objective than $J(\theta)$. + +5. **Implementation requirement**: In PyTorch, to compute only the second term, we must use: + ```python + loss = -advantages * log_prob * rollout_is_weights.detach() # stopgrad on weights + ``` + Without `.detach()`, autograd computes both terms, giving an incorrect gradient. + +**Intuition**: The IS weight $w(\theta)$ tells us "how much to trust this sample" for estimating the gradient under $\pi_\theta$. We update $\theta$ to maximize the reweighted objective, but we don't update $\theta$ to maximize the weight itself—that would be circular reasoning (optimizing the correction factor instead of the actual objective). 
+ +**Properties:** +- **Algorithm**: Off-policy policy gradient with IS/RS correction +- **Loss types** (`loss_type` config option in bypass mode): + - `"ppo_clip"` (default): PPO clipped objective + - $L = -\mathbb{E}[\min(r \cdot A, \text{clip}(r) \cdot A)]$ where $r = \pi_\theta / \pi_{\text{rollout}}$ + - Note: IS weights NOT applied (PPO ratio already handles it; would be double-counting) + - `"reinforce"`: Pure policy gradient with explicit IS weights, no PPO clipping + - $L = -\mathbb{E}[w \cdot \log \pi_\theta(a|s) \cdot A]$ where $w = \pi_\theta / \pi_{\text{rollout}}$ +- **Always uses bypass mode**: Direct $\pi_\theta$ to $\pi_{\text{rollout}}$ comparison +- **Fast**: Single forward pass + +**Implementation:** `compute_policy_loss_bypass_mode()` and `compute_policy_loss_reinforce()` in [core_algos.py](../../verl/trainer/ppo/core_algos.py) + +--- + +### 3.3 IS/RS Aggregation Levels + +The aggregation level determines how per-token probability ratios are combined into IS weights and/or rejection masks. This choice is **orthogonal to the operating mode** - you can use any aggregation level in either decoupled or bypass mode. + +#### 3.3.1 Token-Level Aggregation + +**IS weights:** $w_t = \min(\rho_t, C_{\text{IS}})$ where $\rho_t = \frac{\pi_{\text{old}}(a_t|s_t)}{\pi_{\text{rollout}}(a_t|s_t)}$ (decoupled) or $\rho_t = \frac{\pi_{\theta}(a_t|s_t)}{\pi_{\text{rollout}}(a_t|s_t)}$ (bypass/pure IS) + +**Configuration:** +```python +rollout_is = "token" # IS weights +rollout_rs = "token_k1" # Optional: rejection sampling (ratio bounds) +``` + +**Properties:** +- Independent truncation per token +- Lower variance than sequence-level (product of ratios bounded individually) +- **Bias-variance tradeoff**: Token-level correction has $O(T^2 \Delta_{\max})$ bias where $T$ is sequence length and $\Delta_{\max}$ is maximum per-token policy divergence. This bias becomes significant when the rollout policy deviates substantially from the training policy. 
Sequence-level correction is unbiased but has higher variance.
+- Typical threshold: 1.5 - 5.0
+- Optional batch normalization (§3.4): Normalizes over all token weights to ensure $\mathbb{E}[\tilde{w}_t] = 1$ (reduces variance)
+- **When to use**: Token-level works well when rollout policy stays within the trust region of training policy. When mismatch is significant, the bias becomes intolerable and sequence-level correction is preferred.
+
+**Loss function (REINFORCE + Token IS):**
+
+$$
+L_{\text{REINFORCE+TIS}}(\theta) = -\mathbb{E}_t \left[ \text{stopgrad}(w_t) \cdot \log \pi_\theta(a_t|s_t) \cdot A_t \right]
+$$
+
+where $w_t = \min(\rho_t, C_{\text{IS}})$ are the truncated token-level IS weights. The stopgrad operator ensures that when computing $\nabla_\theta L$, the weights are treated as constants (see §3.2.2 for theoretical justification). This formulation can also be combined with PPO clipping by replacing the REINFORCE gradient with the clipped surrogate objective.
+
+**Implementation:**
+- IS weights: `compute_rollout_correction_weights()` in [rollout_corr_helper.py](../../verl/trainer/ppo/rollout_corr_helper.py#L325-L402)
+- Loss: `compute_policy_loss()` in [core_algos.py](../../verl/trainer/ppo/core_algos.py#L812-L884)
+
+#### 3.3.2 Sequence-Level Aggregation
+
+**IS weights:** $w_{\text{seq}} = \min\left( \prod_{t \in T} \rho_t, C_{\text{IS}} \right) = \min\left( \exp\left(\sum_{t \in T} \log \rho_t\right), C_{\text{IS}} \right)$ (broadcast to all tokens)
+
+**Configuration:**
+```python
+rollout_is = "sequence" # IS weights
+rollout_rs = "seq_sum_k1" # Optional: rejection sampling
+```
+
+**Properties:**
+- Multiplicative aggregation across sequence
+- More sensitive to outliers than token-level
+- Typical threshold: 2.0 - 10.0
+- Optional batch normalization (§3.4): Normalizes over sequence means (one weight per sequence)
+
+**Terminology Note:**
+- **Seq-TIS (Sequence-Level Truncated IS)**: Clips the sequence ratio $\rho(\tau) \to
\min(\rho(\tau), C)$. Maximizes information efficiency by extracting signal from all samples. Best for clean data with moderate mismatch. +- **Seq-MIS (Sequence-Level Masked IS)**: Rejects (masks) sequences with $\rho(\tau) > C$ instead of clipping. Acts as a hard trust region filter. Best for severe mismatch or when the distribution tail is "toxic" (contains garbage/adversarial samples rather than signal). + +**Loss function (REINFORCE + Sequence IS):** + +$$ +L_{\text{REINFORCE+SeqIS}}(\theta) = -\mathbb{E}_t \left[ \text{stopgrad}(w_{\text{seq}}) \cdot \log \pi_\theta(a_t|s_t) \cdot A_t \right] +$$ + +where $w_{\text{seq}}$ is broadcast to all tokens in the sequence. The stopgrad operator ensures correct IS gradient computation (see §3.2.2). This formulation can also be combined with PPO clipping. + +#### 3.3.3 Geometric Mean Aggregation (Geo-RS) + +**Geometric mean ratio:** $\rho_{\text{geo}} = \exp\left( \frac{1}{|T|} \sum_{t \in T} \log \rho_t \right) = \left(\prod_{t \in T} \rho_t\right)^{1/|T|}$ (broadcast to all tokens) + +**Configuration:** +```python +rollout_is = null # No IS weights, pure rejection +rollout_rs = "seq_mean_k1" # Geometric mean rejection sampling (ratio bounds) +``` + +**Properties:** +- Length-invariant (normalizes by sequence length) +- Ideal ratio = 1.0 (policies match) +- Typical bounds: `"0.999_1.001"` (~±0.1%) +- **Used for rejection sampling only, not IS weighting** + +**The Length Trap Problem:** + +Standard IS estimators have a systematic **length bias** that penalizes long sequences. 
The importance ratio $\rho(y)$ is multiplicative:
+
+$$
+\rho(y) = \prod_{t=1}^T \frac{\pi_{\text{old}}(y_t|y_{<t})}{\pi_{\text{rollout}}(y_t|y_{<t})}
+$$
+
+Because the ratio is a product over $T$ tokens, small per-token deviations compound exponentially with sequence length, so long sequences are disproportionately truncated or rejected even when their average per-token divergence is small. Geometric mean aggregation avoids this length bias by normalizing the summed log-ratio by $|T|$, making the criterion length-invariant.
+
+#### 3.3.4 K3 Divergence Aggregation (K3-RS)
+
+**K3 divergence:** $K3_t = \rho_t - \log \rho_t - 1$, averaged over the sequence: $K3_{\text{seq}} = \frac{1}{|T|} \sum_{t \in T} K3_t$
+
+**Configuration:**
+```python
+rollout_is = null # No IS weights, pure rejection
+rollout_rs = "seq_mean_k3" # K3 divergence rejection sampling
+rollout_rs_threshold = 0.01
+```
+
+**Properties:**
+- K3 >= 0 per token (equals 0 when ρ = 1)
+- More stable than geometric ratio checks because each token term is non-negative
+- Only upper threshold applies (no lower threshold since K3 >= 0)
+- Typical threshold: 0.001 - 0.01
+
+**Why K3 over geometric ratio?**
+- Geometric ratio uses average log-ratio; small numerical bias can flip sign
+- K3 = E[ρ - log ρ - 1] is non-negative per token, offering a smoother detector
+- Both estimate the same quantity: KL(π_rollout || π_old)
+- For small divergences, K3 ≈ 0.5 × Var(log_ratio)
+
+**Combined Estimator (K3-RS-Token-TIS):**
+
+For best results, combine K3 filter with token-level IS weights:
+
+$$
+\hat{g}_{\text{k3-rs-token-tis}}(y) = \underbrace{\mathbb{I}\left( K3_{\text{seq}} \le C_{\text{k3}} \right)}_{\text{K3 Filter}} \cdot \prod_t \min(\rho_t, C) \cdot f(y)
+$$
+
+This is implemented by combining `rollout_rs="k3"` with `rollout_is="token"`.
+
+
+---
+
+### 3.4 Batch Normalization
+
+An optional variance reduction technique that normalizes IS weights to have mean 1.0 within each batch.
+
+**Configuration:**
+```python
+rollout_is_batch_normalize = True # Default: False
+```
+
+**Normalization formula (aggregation-aware):**
+
+For **token-level IS** (§3.3.1):
+
+$$
+\tilde{w}_t = \frac{w_t}{\frac{1}{\sum_{i,t} m_{i,t}} \sum_{i,t} w_{i,t} \cdot m_{i,t}}
+$$
+
+where $w_{i,t}$ are truncated token IS weights, $m_{i,t}$ is the response mask, and normalization is over **all tokens**.
+
+For **sequence-level IS** (§3.3.2):
+
+$$
+\tilde{w}_i = \frac{w_i}{\frac{1}{B}\sum_{j=1}^B \bar{w}_j}
+$$
+
+where $\bar{w}_j = \frac{1}{T_j}\sum_{t=1}^{T_j} w_{j,t} \cdot m_{j,t}$ is the per-sequence mean (all tokens in a sequence have the same weight), and normalization is over **sequences**.
+ +**Properties:** +- Applied **after** truncation to preserve truncation semantics +- Ensures $\mathbb{E}[\tilde{w}] = 1$ within each batch +- **Aggregation-aware**: Token-level normalizes over tokens; sequence-level normalizes over sequences +- Uses `masked_mean` to respect padding tokens +- Reduces gradient magnitude variance by removing random batch-level scale fluctuations + +**Metrics:** +- `rollout_is_batch_norm_factor`: The normalization factor applied (batch mean before normalization) + +**Implementation:** [rollout_corr_helper.py](../../verl/trainer/ppo/rollout_corr_helper.py#L401-L421) + +--- + +### 3.5 Rejection Sampling (RS) + +Rejection sampling can be added to **any combination** of operating mode and aggregation level. It modifies the `response_mask` to exclude outlier tokens/sequences. + +**Configuration examples:** +```python +rollout_rs = "token_k1" # Token-level ratio bounds +rollout_rs_threshold = "0.6_1.6" + +rollout_rs = "seq_sum_k1" # Sequence sum of log ratios +rollout_rs_threshold = "0.5_2.0" + +rollout_rs = "seq_mean_k3" # Sequence mean of K3 divergence +rollout_rs_threshold = 0.01 +``` + +**Acceptance set:** +- **Token-level**: $\mathcal{A}_{\text{token}} = \{ t : C_{\text{RS-lower}} \leq \rho_t \leq C_{\text{RS-upper}} \}$ +- **Sequence-level**: $\mathcal{A}_{\text{seq}} = \{ \text{seq} : C_{\text{RS-lower}} \leq \prod_{t \in T} \rho_t \leq C_{\text{RS-upper}} \}$ +- **Geometric**: $\mathcal{A}_{\text{geo}} = \{ \text{seq} : C_{\text{RS-lower}} \leq \rho_{\text{geo}} \leq C_{\text{RS-upper}} \}$ + +**Properties:** +- Separate from IS weighting (can use RS without IS) +- Reduces effective sample size +- Filters extreme outliers + +**Implementation:** `compute_rollout_rejection_mask()` in [rollout_corr_helper.py](../../verl/trainer/ppo/rollout_corr_helper.py#L80-L188) + +--- + +### 3.6 Combination Matrix + +**Key insight:** Estimators (how IS/RS is computed) and operating modes (decoupled PPO vs bypass PG) are **orthogonal**. 
Any estimator can be combined with any operating mode. + +#### Estimator × Operating Mode + +| Estimator | Configuration | Compatible Modes | +|-----------|---------------|------------------| +| **Token-TIS** | `rollout_is="token"` | Decoupled PPO, Bypass PG | +| **Seq-TIS** | `rollout_is="sequence"` | Decoupled PPO, Bypass PG | +| **Seq-MIS** | `rollout_is="sequence"` + `rollout_rs="seq_sum_k1"` | Decoupled PPO, Bypass PG | +| **Geo-RS** | `rollout_rs="seq_mean_k1"` (geometric mean) | Decoupled PPO, Bypass PG | +| **Geo-RS-Token-TIS** | `rollout_is="token"` + `rollout_rs="seq_mean_k1"` | Decoupled PPO, Bypass PG | +| **K3-RS** | `rollout_rs="seq_mean_k3"` | Decoupled PPO, Bypass PG | +| **K3-RS-Token-TIS** | `rollout_is="token"` + `rollout_rs="seq_mean_k3"` | Decoupled PPO, Bypass PG | + +**Note:** In bypass mode, `loss_type` controls the loss function. Use "ppo_clip" (default) or "reinforce". + +#### Available Preset Methods + +| Preset Method | Estimator | Mode | Properties | +|---------------|-----------|------|------------| +| **Decoupled PPO Mode** (3 policies: π_rollout, π_old, π_θ) | +| `decoupled_token_is()` | Token-TIS | Decoupled PPO | Per-token IS weights | +| `decoupled_seq_is()` | Seq-TIS | Decoupled PPO | Sequence-level IS weights | +| `decoupled_seq_is_rs()` | Seq-MIS | Decoupled PPO | Sequence IS + sequence RS | +| `decoupled_geo_rs()` | Geo-RS | Decoupled PPO | Geometric RS + seq\_max\_k2 guard | +| `decoupled_geo_rs_token_tis()` | Geo-RS-Token-TIS | Decoupled PPO | Geometric filter + token IS | +| **K3 KL Estimator** (more stable for small KL values) | +| `decoupled_k3_rs()` | K3-RS | Decoupled PPO | K3 rejection, no IS weights | +| `decoupled_k3_rs_token_tis()` | K3-RS-Token-TIS | Decoupled PPO | K3 filter + token clipped weight | +| **Bypass Mode (PPO-clip)** (ratio handles IS, RS masks outliers) | +| `bypass_ppo_clip()` | - | Bypass (PPO-clip) | PPO-clip only | +| `bypass_ppo_clip_geo_rs()` | Geo-RS | Bypass (PPO-clip) | PPO-clip + Geo-RS 
(ratio) | +| `bypass_ppo_clip_k3_rs()` | K3-RS | Bypass (PPO-clip) | PPO-clip + K3-RS | +| **Bypass Mode (REINFORCE)** (explicit IS weights, no PPO clipping) | +| `bypass_pg_is()` | Seq-TIS | Bypass (REINFORCE) | REINFORCE + Seq IS | +| `bypass_pg_geo_rs()` | Geo-RS | Bypass (REINFORCE) | REINFORCE + Geo-RS (ratio) | +| `bypass_pg_geo_rs_token_tis()` | Geo-RS-Token-TIS | Bypass (REINFORCE) | REINFORCE + Geo filter + token IS | +| **Other** | +| `disabled()` | - | - | Metrics only | + +**Note:** Bypass mode sets π_old = π_rollout and uses `loss_type` to select the loss function. + +#### Additional Supported Combinations (Manual Configuration) + +These combinations are **fully supported** but require manual configuration: + +**1. Token IS + Token RS** +```python +config = RolloutCorrectionConfig( + rollout_is="token", + rollout_is_threshold=2.0, + rollout_rs="token_k1", + rollout_rs_threshold="0.5_2.0", +) +``` +**Properties:** Token-level IS weights + token-level RS mask. + +**2. Pure Token RS** +```python +config = RolloutCorrectionConfig( + rollout_is=None, + rollout_rs="token_k1", + rollout_rs_threshold="0.5_2.0", +) +``` +**Properties:** Token-level RS mask only, no IS weights. + +**3. Pure Sequence RS** +```python +config = RolloutCorrectionConfig( + rollout_is=None, + rollout_rs="seq_sum_k1", + rollout_rs_threshold="0.5_2.0", +) +``` +**Properties:** Sequence-level RS mask only, no IS weights. 
+ +**Key properties:** +- Any IS aggregation level (token/sequence) can be used in either decoupled or bypass mode +- Rejection sampling can be added to any combination +- Geometric aggregation is typically used for RS only (not IS weighting) +- Pure RS (`bypass_pg_rs`) uses bypass + geometric RS with `loss_type="reinforce"` for REINFORCE (no IS weights) +- All combinations in the table above are valid and supported by the implementation + +--- + +### 3.7 Common Implementation Mistake + +#### Incorrect LLM-RL Implementation (PPO Without Rollout Correction) + +**Theory:** Naive LLM-RL implementation that incorrectly applies PPO by **ignoring the actual rollout policy** and assuming $\pi_{\text{old}} = \pi_{\text{rollout}}$. + +**Note:** This incorrect implementation pattern was identified in [Liu, Li, et al. (2025)](https://richardli.xyz/rl-collapse) as a key cause of training instability in LLM-RL systems, motivating the development of this rollout correction framework. + +**Loss Function:** + +$$ +L_{\text{PPO}}(\theta) = -\mathbb{E}_t \left[ \min\left( r_t(\theta) A_t, \text{clip}(r_t(\theta), 1-\epsilon, 1+\epsilon) A_t \right) \right] +$$ + +where $r_t(\theta) = \frac{\pi_{\theta}(a_t|s_t)}{\pi_{\text{old}}(a_t|s_t)}$ (ignores $\pi_{\text{rollout}}$). + +**Why it's wrong:** +- **Ignores $\pi_{\text{rollout}}$**: Uses $\pi_{\text{old}}$ as behavior policy instead of actual $\pi_{\text{rollout}}$ +- **Policy mismatch**: In LLM-RL, rollout typically uses different precision/backend/checkpoint than training, causing $\pi_{\text{rollout}} \neq \pi_{\text{old}}$ even with same model weights +- **Not PPO's fault**: PPO itself is correct; the issue is the incorrect assumption + +**Correct alternatives:** +1. **Decoupled mode**: Three policies with IS correction from $\pi_{\text{rollout}}$ to $\pi_{\text{old}}$ +2. **Bypass mode**: Two policies using $\pi_{\text{rollout}}$ as both behavior policy and proximal policy +3. 
**Bypass + Policy Gradient mode**: Two policies with IS/RS correction and no PPO clipping + +**Implementation:** `compute_policy_loss()` in [core_algos.py](../../verl/trainer/ppo/core_algos.py#L812-L884) + +--- + +## 4. Off-Policy Diagnostic Metrics + +These metrics quantify the severity of off-policy drift. + +**Note on notation:** Metrics use $\rho_t = \frac{\pi_{\text{old}}(a_t|s_t)}{\pi_{\text{rollout}}(a_t|s_t)}$. In bypass mode, $\pi_{\text{old}} = \pi_{\text{rollout}}$, so metrics measure rollout→current drift using $\rho_t = \frac{\pi_{\theta}}{\pi_{\text{rollout}}}$ instead. + +### 4.1 KL Divergence + +**Direct KL estimator:** + +$$ +\text{KL}(\pi_{\text{rollout}} \| \pi_{\text{old}}) = \mathbb{E}_{t \sim \pi_{\text{rollout}}} \left[ \log \pi_{\text{rollout}}(a_t|s_t) - \log \pi_{\text{old}}(a_t|s_t) \right] +$$ + +**K3 KL estimator** (alternative formulation): + +$$ +\text{KL}_{\text{K3}} = \mathbb{E}_{t \sim \pi_{\text{rollout}}} \left[ \rho_t - \log \rho_t - 1 \right] +$$ + +where $\rho_t = \frac{\pi_{\text{old}}(a_t|s_t)}{\pi_{\text{rollout}}(a_t|s_t)}$. + +### 4.2 Perplexity + +**Old policy perplexity:** + +$$ +\text{PPL}_{\text{old}} = \exp\left( -\frac{1}{|T|} \sum_{t \in T} \log \pi_{\text{old}}(a_t|s_t) \right) +$$ + +**Rollout policy perplexity:** + +$$ +\text{PPL}_{\text{rollout}} = \exp\left( -\frac{1}{|T|} \sum_{t \in T} \log \pi_{\text{rollout}}(a_t|s_t) \right) +$$ + +**PPL ratio** (inverse of geometric mean IS weight): + +$$ +\text{PPL}_{\text{ratio}} = \frac{\text{PPL}_{\text{old}}}{\text{PPL}_{\text{rollout}}} = \exp\left( -\frac{1}{|T|} \sum_{t \in T} \log \rho_t \right) = \left(\prod_{t \in T} \rho_t\right)^{-1/|T|} +$$ + +**Interpretation:** Values > 1 mean $\pi_{\text{old}}$ assigns lower probability than $\pi_{\text{rollout}}$ to the observed actions (distribution shift). + +### 4.3 Chi-squared Divergence + +Measures the second moment of the IS weight distribution. 
+ +**Token-level:** + +$$ +\chi^2_{\text{token}} = \mathbb{E}_{t \sim \pi_{\text{rollout}}} \left[ \rho_t^2 \right] - 1 +$$ + +**Sequence-level:** + +$$ +\chi^2_{\text{seq}} = \mathbb{E}_{\text{seq} \sim \pi_{\text{rollout}}} \left[ \left(\prod_{t \in T} \rho_t\right)^2 \right] - 1 +$$ + +**Interpretation:** +- $\chi^2 = 0$: Policies are identical +- $\chi^2 > 0$: Higher values indicate more severe off-policy distribution shift + +**Implementation:** `compute_offpolicy_metrics()` in [rollout_corr_helper.py](../../verl/trainer/ppo/rollout_corr_helper.py#L670-L776) + +--- + +## 5. Summary and Decision Guide + +### 5.1 Method Summary Table + +| Method | Theory | Policies | PPO Clip | IS Correction | Correctness | Speed | +|--------|--------|----------|----------|---------------|-------------|-------| +| **Bypass Mode** (π_old = π_rollout, `loss_type` selects algorithm) | +| `loss_type="ppo_clip"` (default) | PPO (ratio = π_θ/π_rollout) | 2 (rollout, θ) | ✅ | RS mask only (ratio handles IS) | ✅ Correct | **Fast** | +| `loss_type="reinforce"` | Off-policy REINFORCE | 2 (rollout, θ) | ❌ | ✅ (explicit IS weights) | ✅ Correct | **Fast** | +| **Bypass Mode Presets (PPO-clip)** | +| `bypass_ppo_clip` | PPO only | 2 (rollout, θ) | ✅ | - | ✅ Correct | **Fast** | +| `bypass_ppo_clip_geo_rs` | PPO + Geo-RS | 2 (rollout, θ) | ✅ | Geo-RS mask (ratio) | ✅ Correct | **Fast** | +| **Bypass Mode Presets (REINFORCE)** | +| `bypass_pg_is` | REINFORCE + Seq-TIS | 2 (rollout, θ) | ❌ | ✅ Seq-TIS | ✅ Correct | **Fast** | +| `bypass_pg_geo_rs` | REINFORCE + Geo-RS | 2 (rollout, θ) | ❌ | Geo-RS only (ratio) | ✅ Correct | **Fast** | +| `bypass_pg_geo_rs_token_tis` | REINFORCE + Geo RS + Token IS | 2 (rollout, θ) | ❌ | ✅ Geo-RS-Token-TIS | ✅ Correct | **Fast** | +| **Decoupled PPO Mode** (IS weights = π_old / π_rollout) | +| `decoupled_token_is` | Decoupled PPO | 3 (rollout, old, θ) | ✅ | ✅ Token-TIS | ✅ Correct | Standard | +| `decoupled_seq_is` | Decoupled PPO | 3 (rollout, old, θ) | ✅ | ✅ 
Seq-TIS | ✅ Correct | Standard | +| `decoupled_seq_is_rs` | Decoupled PPO + RS | 3 (rollout, old, θ) | ✅ | ✅ Seq-MIS | ✅ Correct | Standard | +| `decoupled_geo_rs` | Decoupled PPO + Geo-RS | 3 (rollout, old, θ) | ✅ | Geo-RS only (ratio) | ✅ Correct | Standard | +| `decoupled_geo_rs_token_tis` | Decoupled PPO + Geo RS + Token IS | 3 (rollout, old, θ) | ✅ | ✅ Geo-RS-Token-TIS | ✅ Correct | Standard | +| **Incorrect (for reference)** | +| Naive LLM-RL | Incorrect PPO usage | 2 (old, θ) | ✅ | ❌ | ⚠️ Incorrect | Standard | + +**Notes:** +- **Bypass mode** sets π_old = π_rollout and uses `loss_type` to select the loss function: + - `"ppo_clip"` (default): PPO clipped ratio (IS handled by ratio = π_θ/π_rollout, no explicit IS weights to avoid double-counting) + - `"reinforce"`: Explicit IS weights applied as $w \cdot \log \pi \cdot A$ +- Both loss types benefit from rejection sampling (RS) which masks out-of-distribution samples + +### 5.2 Estimator Hierarchy + +These estimators define **how IS weights and rejection masks are computed**. They are orthogonal to the operating mode (decoupled PPO vs bypass policy gradient) and can be combined with either. 
+ +| Estimator | Configuration | Mechanism | Best For | +|-----------|---------------|-----------|----------| +| **Token-TIS** | `rollout_is="token"` | Clips per-token ratios | Lower variance IS with acceptable bias | +| **Seq-TIS** | `rollout_is="sequence"` | Clips sequence ratio $\rho(\tau) \to \min(\rho(\tau), C)$ | Clean data with moderate mismatch; unbiased | +| **Seq-MIS** | `rollout_is="sequence"` + `rollout_rs="seq_sum_k1"` | Rejects sequences with $\rho(\tau) > C$ | Severe mismatch; filters "toxic tail" (garbage data) | +| **Geo-RS** | `rollout_rs="seq_mean_k1"` | Rejects on geometric mean ratio exp(E[log(r)]) | Length-invariant trust region | +| **Geo-RS-Token-TIS** | `rollout_is="token"` + `rollout_rs="seq_mean_k1"` | Geometric filter + token IS weights | Ratio-based length normalization + lower variance IS | +| **K3-RS** | `rollout_rs="seq_mean_k3"` | Rejects on K3 KL divergence | Small KL values; smooth detector | +| **K3-RS-Token-TIS** | `rollout_is="token"` + `rollout_rs="seq_mean_k3"` | K3 filter + token IS weights | Small KL + lower variance IS | + +**Note:** Each estimator can be used with either: +- **Decoupled PPO** (`bypass_mode=false`): Three policies with PPO clipping +- **Bypass Mode** (`bypass_mode=true`): Two policies with configurable loss type + - `loss_type="ppo_clip"` (default): PPO clipped objective (IS via ratio, RS mask applied) + - `loss_type="reinforce"`: REINFORCE with explicit IS weights + +### 5.3 Method Characteristics by Scenario + +**Choosing estimator by off-policy severity:** +- **Negligible** (same checkpoint, minor differences): No IS correction needed; use bypass mode for efficiency +- **Moderate** (async workers, slight staleness): Token-TIS provides per-token IS correction with lower variance +- **Severe** (replay buffers, old data): Seq-TIS or Seq-MIS provides sequence-level IS correction; use Seq-MIS when high-weight samples are likely garbage + +**Choosing estimator by sequence length:** +- **Short sequences** 
(standard chat): Seq-TIS is optimal
+- **Long sequences** (CoT, agents): Geo-RS or Geo-RS-Token-TIS to avoid Length Trap
+
+**Choosing operating mode:**
+- **Batch size invariance needed**: Use decoupled mode (`bypass_mode=false`)
+- **Computational efficiency needed**: Use bypass mode (`bypass_mode=true`) to skip `old_log_prob` computation
+- **No PPO clipping**: Use bypass mode with `loss_type="reinforce"`
+
+### 5.4 Decoupled Mode vs Bypass Mode
+
+**Decoupled mode** (computes `old_log_prob` separately):
+- Implements full decoupled PPO with three policies (mathematically correct)
+- Separately measures and corrects Drift 1 (rollout→old) and Drift 2 (old→current)
+- Achieves batch size invariance and efficient stale data utilization
+- Enables accurate off-policy metrics monitoring
+
+**Bypass mode** (sets $\pi_{\text{old}} = \pi_{\text{rollout}}$):
+- Uses $\pi_{\text{rollout}}$ as both behavior policy and proximal policy (mathematically correct)
+- Computational efficiency: Skips separate `old_log_prob` computation
+- Does not achieve batch size invariance (proximal policy depends on data collection)
+
+---
+
+## 6. Implementation References
+
+- **[Rollout Correction Usage Guide](rollout_corr.md)** - Practical configuration and troubleshooting
+- **Config:** [verl/trainer/config/algorithm.py](../../verl/trainer/config/algorithm.py)
+- **IS/RS Helper:** [verl/trainer/ppo/rollout_corr_helper.py](../../verl/trainer/ppo/rollout_corr_helper.py)
+- **PPO Loss:** [verl/trainer/ppo/core_algos.py](../../verl/trainer/ppo/core_algos.py)
+- **Tests:** [tests/trainer/ppo/test_rollout_corr.py](../../tests/trainer/ppo/test_rollout_corr.py)
+
+---
+
+## References
+
+- **Williams, R. J. (1992).** "Simple statistical gradient-following algorithms for connectionist reinforcement learning." *Machine Learning*, 8(3-4), 229-256. https://doi.org/10.1007/BF00992696
+- **Schulman, J., Wolski, F., Dhariwal, P., Radford, A., & Klimov, O.
(2017).** "Proximal policy optimization algorithms." *arXiv preprint arXiv:1707.06347.* https://arxiv.org/abs/1707.06347 +- **Hilton, J., Cobbe, K., & Schulman, J. (2021).** "Batch size-invariance for policy optimization." *arXiv preprint arXiv:2110.00641.* https://arxiv.org/abs/2110.00641 + - Introduced decoupled PPO: separating proximal policy (for controlling policy update size) from behavior policy (for off-policy correction) to achieve batch size invariance +- **Liu, J., Li, Y., et al. (2025).** "When Speed Kills Stability: Demystifying RL Collapse from the Training-Inference Mismatch" + - Blog post: https://richardli.xyz/rl-collapse (see Blog Series above for parts 1-3) diff --git a/code/RL_model/verl/verl_train/docs/algo/spin.md b/code/RL_model/verl/verl_train/docs/algo/spin.md new file mode 100644 index 0000000000000000000000000000000000000000..9349cef976f551a1f60376585f88da2313bdc3f7 --- /dev/null +++ b/code/RL_model/verl/verl_train/docs/algo/spin.md @@ -0,0 +1,179 @@ +# Recipe: Self-Play Fine-Tuning (SPIN) + +Last updated: 05/31/2025. + +`verl` provides a recipe inspired by the paper **"Self-Play Fine-Tuning Converts Weak Language Models to Strong Language Models"** (SPIN). SPIN is a language model finetuning algorithm that enables iterative self-improvement through a self-play mechanism inspired by game theory. + +**Core Idea:** Models learn by playing against themselves, reducing reliance on external preference datasets or stronger teacher models: + +1. **Synthetic Data Generation:** The current model generates responses, creating its own training data from previous iterations. +2. **Two-Player Game Setup:** A game involving two players acted by a single LLM. +3. **Iterative Training:** The model progressively improves by refining its policy, with each iteration's model becoming the opponent for the next iteration. 
+ +Paper Authors: [Zixiang Chen](https://github.com/uclaml/SPIN)\*, [Yihe Deng](https://github.com/uclaml/SPIN)\*, [Huizhuo Yuan](https://scholar.google.com/citations?user=8foZzX4AAAAJ)\*, [Kaixuan Ji](https://scholar.google.com/citations?user=FOoKDukAAAAJ), [Quanquan Gu](https://web.cs.ucla.edu/~qgu/) + +[[Webpage](https://uclaml.github.io/SPIN/)] [[Huggingface](https://huggingface.co/papers/2401.01335)] [[Paper](https://arxiv.org/abs/2401.01335)] [[Original Implementation](https://github.com/uclaml/SPIN)] + +verl Implementation Authors: [Chendong Wang](https://cdwang96.github.io/), [Chenyang Zhao](https://github.com/zhaochenyang20) + +--- + +## Key Function (compute_online_dpo_loss) and Related works +SPIN (Chen et al., 2024) proposes an iterative self-play mechanism to fine-tune language models. In each iteration, SPIN's training objective, when using a logistic loss function, is equivalent to Direct Preference Optimization (DPO) loss (Rafailov et al., 2023). + +This `verl` recipe realizes SPIN's core concept by using DPO loss iteratively (Xu et al., 2023; Xiong et al., 2023; Snorkel AI, 2024). This means that in each iteration, we fine-tune the LLM using DPO loss for preference optimization. Notably, Xu et al. (2023) explored iterative preference optimization with pairwise cringe loss, while Xiong et al. (2023) discussed how to bridge theory and practice for RLHF under KL constraints using iterative training. The concept of iterative preference learning was also explored in online DPO (Guo et al., 2024), which focuses on direct alignment from online AI feedback. In online DPO, preference data is dynamically updated during training, allowing the model to learn from its own generated data. + +Specifically, we developed the **`compute_online_dpo_loss`** function and built this SPIN recipe on top of it. By incorporating online preference generation, this approach enables continuously refining language models without relying on fixed external preference datasets. 
+ +**Reference Papers:** +* [Self-Play Fine-Tuning Converts Weak Language Models to Strong Language Models](https://arxiv.org/abs/2401.01335) (Chen et al., 2024) +* [Direct Preference Optimization: Your Language Model is Secretly a Reward Model](https://arxiv.org/abs/2305.18290) (Rafailov et al., 2023) +* [Somethings are more cringe than others: Preference optimization with the pairwise cringe loss](https://arxiv.org/abs/2312.16682) (Xu et al., 2023) +* [Iterative preference learning from human feedback: Bridging theory and practice for rlhf under kl-constraint](https://arxiv.org/abs/2312.11456) (Xiong et al., 2023) +* [Snorkel-Mistral-PairRM-DPO](https://huggingface.co/snorkelai/Snorkel-Mistral-PairRM-DPO) (Snorkel AI, 2024) +* [Direct language model alignment from online ai feedback](https://arxiv.org/abs/2402.04792) (Guo et al., 2024) + + +## Our Online DPO Implementation + +Our `compute_online_dpo_loss` function adapts `verl`'s existing PPO infrastructure (based on `verl` v0.3.0.post1) for this iterative online DPO. Key aspects of our implementation include: + +* **No Critic:** Unlike PPO, we omit the value function critic. +* **Dynamic Reference Model:** An explicit reference policy (`ref_policy_wg`) is used for DPO loss. This reference model's weights can be periodically updated from the actor (`ref_update_freq`), providing a dynamic baseline. +* **Online Preference Generation:** The `compute_onlineDPO_pref` function (in `core_algos.py`) dynamically creates chosen/rejected pairs based on a reward source (e.g., rule-based ranking for math problems). +* **DPO Loss Integration:** We replace PPO's policy loss with our `compute_online_dpo_loss` (in `core_algos.py`) within the actor update (`dp_actor.py`), directly optimizing the policy using the generated preferences. 
+* **Iterative Training Orchestration:** The `SpinTrainer` (in `spin_trainer.py`) manages the entire self-play loop: generation, preference labeling, optional reference model updates, and policy updates, enabling continuous self-improvement aligned with SPIN's principles. + +--- +## Algorithm + +This recipe implements an Online algorithm adapted to the `verl` Reinforcement Learning framework, which provides an alternative to PPO for fine-tuning language models. + +**Online Loop:** Instead of maximizing a scalar reward signal in PPO, this approach directly optimizes the policy model to align with preference data generated *online* during training: + +1. **Generation:** The current model generates multiple responses for each prompt in a batch. +2. **Preference Labeling:** A function evaluates these generated responses to determine which one is preferred (chosen) and which is dispreferred (rejected). This can be done using a reward function or implicit ranking based on specific rules. (In this recipe, we use rule-based ranking on the math problem). +3. **Update:** This preference tuple (`prompt`, `chosen_response`, `rejected_response`) is used to update the actor model using `compute_online_dpo_loss`, comparing against a reference model. + +**Connection with SPIN:** +Instead of only using a fixed target data distribution, the online generation loop in step 2 will dynamically change the target data distribution by using a certain Preference Labeling method (rule-based ranking on the math problem by selecting the better one in this recipe). This explores the direction mentioned in SPIN's paper Section 7 about "dynamically changing target data distribution" to potentially elevate LLM performance beyond the fixed human-annotated data ceiling. + +--- + +## Reproduce the Experiment (Example Setup) + +The following steps outline how to set up the environment and run the SPIN recipe, based on the provided test log using GSM8K and Qwen2.5-3B-Instruct. + +1. 
**Setup Environment (Example using Docker):** + ```bash + # Start a container with GPU access and shared memory + docker run -it --name spin_test --gpus all \ + --shm-size=32g \ + --ipc=host \ + -v /path/to/host/.cache:/root/.cache \ + -e HF_TOKEN= \ + lmsysorg/sglang:latest \ + /bin/bash + + # Inside the container or on your host machine: + # Ensure /tmp is writable + mkdir -p /tmp + chmod 1777 /tmp + + # Install Python 3.10 (if not present) and venv + sudo apt update + sudo apt install -y python3.10 python3.10-venv tmux + python3 -m ensurepip --upgrade + + # Create and activate a virtual environment + python3 -m venv ~/.python/spin_env + source ~/.python/spin_env/bin/activate + + # Install uv (fast package installer) + python3 -m pip install uv + ``` + +2. **Install verl and Dependencies:** + ```bash + # Clone the verl repository and checkout the spin branch + cd ~ + git clone git@github.com:volcengine/verl.git && cd verl + + # Install flash-attn (handle potential build issues) + python3 -m uv pip install wheel packaging + python3 -m uv pip install flash-attn --no-build-isolation --no-deps + + # Install verl with sglang extras + python3 -m uv pip install -e ".[sglang]" + ``` + *Note: If `flash-attn` installation fails, try the manual steps again or consult its documentation.* + +3. **Login & Download Data/Model:** + ```bash + # Login to Weights & Biases (optional, for logging) + export WANDB_API_KEY= + # wandb login + + # Download the GSM8K dataset + python3 examples/data_preprocess/gsm8k.py --local_save_dir ~/data/gsm8k # Adjusted path + + # Download the base model (Example: Qwen2.5-3B-Instruct) + hf download Qwen/Qwen2.5-3B-Instruct --local-dir $HOME/models/Qwen2.5-3B-Instruct + ``` + +4. 
**Configure:** + * Modify the configuration file (e.g., `config/spin_trainer.yaml` or the one specified in the run script) with correct paths to your downloaded model, data, desired hyperparameters (`dpo_beta`, learning rate, etc.), and distributed training settings (nodes, GPUs per node). + * Pay attention to `actor_rollout_ref.model`, `data` paths, `reward_model` config (if using one), and `trainer.ref_update_freq`. + +5. **Run Training:** + ```bash + # Set CUDA visible devices (adjust based on your hardware and config) + export CUDA_VISIBLE_DEVICES=0,1,2,3 + + # Launch the training script (e.g., test.sh or a custom script) + # Ensure test.sh points to the correct config and main script + bash recipe/spin/run_spin.sh + ``` + +--- + +## Configuration + +* The primary configuration is typically managed through a YAML file specified in the launch script (e.g., `config/spin_trainer.yaml`). +* Key configuration sections: + * `data`: Paths to training/validation prompt files, batch sizes, sequence lengths. + * `actor_rollout_ref`: Paths to the base model (used for actor and initial reference), FSDP settings, optimization parameters (learning rate, scheduler). + * `reward_model`: Configuration for the reward model used for online preference labeling (path, batch size, etc.). Can be omitted if using a simpler reward function. + * `algorithm`: DPO-specific hyperparameters like `dpo_beta`, `dpo_loss_type`. + * `trainer`: Distributed training settings (nodes, GPUs per node), logging (WandB), checkpointing frequency, and `ref_update_freq` (set > 0 to enable periodic reference model updates from the actor). + +--- + +## Key Files + +* `main_spin.py`: Main entry point using Hydra to load the config and launch the `SpinTrainer`. +* `spin_trainer.py`: Defines the `SpinTrainer` class, orchestrating the Online DPO training loop. +* `fsdp_workers.py`: Implements Ray workers (Actor, Reference) potentially using FSDP. 
+* `dp_actor.py`: Contains the actor class, including the DPO policy update logic. +* `core_algos.py`: Includes helper functions for `compute_online_dpo_loss` and `compute_onlineDPO_pref`. +* `config/spin_trainer.yaml` (or similar): Main Hydra configuration file for the recipe. +* `run_spin.sh` (or similar): Example bash script for launching a training run. +* `README.md`: This file. + +--- + +## Acknowledgement + +We sincerely thank the contribution and guidance from the `verl` community and advisors, including (adapted from SPPO): + +* [Zixiang Chen](https://sites.google.com/view/zxchen) +* [Yuhao Yang](https://github.com/yhyang201) +* [Yifan Zhang](https://github.com/yifanzhang-pro) +* [Yongan Xiang](https://github.com/BearBiscuit05) +* [Junrong Lin](https://github.com/ocss884) +* [Yuxuan Tong](https://github.com/tongyx361) +* [Guangming Shen](https://github.com/PeterSH6) +* [Biao He](https://www.linkedin.com/in/biao-he/) +* [Qingquan Song](https://qingquansong.github.io/) +* [Chenyang Zhao](https://zhaochenyang20.github.io/Chayenne/) +* [Quanquan Gu](https://web.cs.ucla.edu/~qgu/) diff --git a/code/RL_model/verl/verl_train/docs/algo/sppo.md b/code/RL_model/verl/verl_train/docs/algo/sppo.md new file mode 100644 index 0000000000000000000000000000000000000000..ec9679987a1f1dde7163cc69c0a93c83d3811db7 --- /dev/null +++ b/code/RL_model/verl/verl_train/docs/algo/sppo.md @@ -0,0 +1,52 @@ +# Recipe: Self-Play Preference Optimization (SPPO) + +Last updated: 05/28/2025. + +verl provides a community recipe implementation for the paper [Self-Play Preference Optimization for Language Model Alignment](https://arxiv.org/abs/2405.00675). SPPO can significantly enhance the performance of an LLM without strong external signals such as responses or preferences from GPT-4. It can outperform the model trained with iterative direct preference optimization (DPO), among other methods. 
SPPO is theoretically grounded, ensuring that the LLM can converge to the von Neumann winner (i.e., Nash equilibrium) under general, potentially intransitive preference, and empirically validated through extensive evaluations on multiple datasets. + +Paper Authors: [Yue Wu](https://yuewu.us/)\*, [Zhiqing Sun](https://www.cs.cmu.edu/~zhiqings/)\*, [Huizhuo Yuan](https://scholar.google.com/citations?user=8foZzX4AAAAJ)\*, [Kaixuan Ji](https://scholar.google.com/citations?user=FOoKDukAAAAJ), [Yiming Yang](https://www.cs.cmu.edu/~yiming/), [Quanquan Gu](https://web.cs.ucla.edu/~qgu/) + +verl Implementation Authors: [Yuhao Yang](https://github.com/yhyang201), [Chenyang Zhao](https://github.com/zhaochenyang20) + +[[Webpage](https://uclaml.github.io/SPPO/)] [[Huggingface](https://huggingface.co/papers/2405.00675)] [[Paper](https://arxiv.org/abs/2405.00675)][[Original Implementation](https://github.com/uclaml/SPPO)] + +## Reproduce the Experiment + +We evaluate the performance of SPPO on the MATH dataset. Starting from an initial score of 46.6 with Qwen2.5-7B-Instruct, we achieve a score of 65.6 after 20 epochs of training, placing our model approximately in the top 20 on the [MATH leaderboard](https://paperswithcode.com/sota/math-word-problem-solving-on-math). It's important to note that verl's internal evaluation metrics may not perfectly align with the official evaluation methodology for Qwen2.5-7B-Instruct. Therefore, for consistency and fair comparison, we report only the results based on verl's evaluation framework. + +``` +git clone git@github.com:volcengine/verl.git +cd verl +python3 -m uv pip install -e ".[sglang]" + +export WANDB_API_KEY= + +python3 examples/data_preprocess/math_dataset.py --local_dir ~/data/math +hf download Qwen/Qwen2.5-7B-Instruct --local-dir $HOME/models/Qwen2.5-7B-Instruct + +export CUDA_VISIBLE_DEVICES=0,1,2,3 +bash recipe/sppo/run_qwen2.5-7b_rm.sh +``` + +Note that the installation would occasionally fail to install flash-attn. 
If this happens, you can install it manually by running: + +```bash +python3 -m uv pip install wheel +python3 -m uv pip install packaging +python3 -m uv pip install flash-attn --no-build-isolation --no-deps +``` + +## Acknowledgement + +We sincerely thank the contribution and guidance from: + +- [Yue Wu](https://yuewu.us/) +- [Chendong Wang](https://cdwang96.github.io/) +- [Yifan Zhang](https://github.com/yifanzhang-pro) +- [Yongan Xiang](https://github.com/BearBiscuit05) +- [Junrong Lin](https://github.com/ocss884) +- [Yuxuan Tong](https://github.com/tongyx361) +- [Guangming Shen](https://github.com/PeterSH6) +- [Biao He](https://www.linkedin.com/in/biao-he/) +- [Qingquan Song](https://qingquansong.github.io/) +- [Quanquan Gu](https://web.cs.ucla.edu/~qgu/) diff --git a/code/RL_model/verl/verl_train/docs/amd_tutorial/amd_build_dockerfile_page.rst b/code/RL_model/verl/verl_train/docs/amd_tutorial/amd_build_dockerfile_page.rst new file mode 100644 index 0000000000000000000000000000000000000000..fc462c17fbd8aab8aa57456b73bcf35e5aec5394 --- /dev/null +++ b/code/RL_model/verl/verl_train/docs/amd_tutorial/amd_build_dockerfile_page.rst @@ -0,0 +1,796 @@ +Getting started with AMD (ROCM Kernel) +===================================================== + +Last updated: 07/06/2025. + +Author: `Yusheng Su `_ + +Setup +----- + +If you run on AMD GPUs (MI300) with ROCM platform, you cannot use the previous quickstart to run verl. You should follow the following steps to build a docker and set ``RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES`` or ``RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES`` when starting ray in verl's RLHF training. + + +docker/Dockerfile.rocm +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
code-block:: bash + + FROM "rlsys/rocm-6.3.4-patch:rocm6.3.4-numa-patch_ubuntu-22.04" + + SHELL ["/bin/bash", "-ceuxo", "pipefail"] + + ENV MAX_JOBS=512 + + ENV PATH="/usr/local/python3.12/bin:$PATH" + RUN ln -sf /usr/bin/python3.12 /usr/bin/python && \ + ln -sf /usr/bin/pip3.12 /usr/bin/pip + + ############################################ + RUN apt-get update + RUN apt-get install -y pkg-config liblzma-dev + ############################################ + + ########################################### + ##########Install TransformerEngine######## + ########################################### + WORKDIR /workspace/ + # transformer-engine install + # https://github.com/ROCm/TransformerEngine + RUN rm -rf TransformerEngine + RUN git clone --recursive https://github.com/ROCm/TransformerEngine.git + WORKDIR /workspace/TransformerEngine + RUN git checkout 236178e5 + # git checkout bb061ade + # git checkout 864405c + ENV NVTE_FRAMEWORK=pytorch + ENV NVTE_ROCM_ARCH=gfx942 + ENV NVTE_USE_HIPBLASLT=1 + ENV NVTE_USE_ROCM=1 + # export CMAKE_PREFIX_PATH="/opt/rocm:/opt/rocm/hip:/usr/local:/usr:${CMAKE_PREFIX_PATH:-}" + ENV CMAKE_PREFIX_PATH="/opt/rocm:/opt/rocm/hip:/usr/local:/usr" + RUN MAX_JOBS=${MAX_JOBS} pip install .
-vvv + WORKDIR /workspace/ + ########################################### + ########################################### + ########################################### + + + + + + #################################################################################### + ################Install vllm - sglang require vllm 0.6.7 dependency################# + #################################################################################### + #### Require vllm 0.6.7 - checkout 113274a0 + WORKDIR /workspace/ + RUN rm -rf vllm + RUN pip uninstall -y vllm + # Refer to here (down-grade vllm to 0.6.3): https://docs.vllm.ai/en/v0.6.3/getting_started/amd-installation.html + RUN git clone https://github.com/ROCm/vllm.git + # git clone https://github.com/vllm-project/vllm.git + WORKDIR /workspace/vllm + RUN git checkout 113274a0 + ENV PYTORCH_ROCM_ARCH="gfx90a;gfx942" + #ENV MAX_JOBS=512 + ENV MAX_JOBS=${MAX_JOBS} + RUN pip install "boto3>=1.26.0" + RUN pip install setuptools_scm + # will add src into py. 
You can delete the repo + RUN python3 setup.py install + WORKDIR /workspace/ + #################################################################################### + #################################################################################### + #################################################################################### + + + + ########################################### + ############For hack docker################ + ########################################### + RUN pip install setuptools==75.8.0 + ########################################### + ########################################### + ########################################### + + + + ########################################### + ############build sgalng################### + ########################################### + # Set environment variables + ENV BASE_DIR=/sgl-workspace + ENV BUILD_TYPE=all + ENV SGL_REPO=https://github.com/sgl-project/sglang + ENV SGL_BRANCH=v0.4.6.post5 + ENV TRITON_REPO=https://github.com/ROCm/triton.git + ENV TRITON_COMMIT=improve_fa_decode_3.0.0 + ENV AITER_REPO=https://github.com/ROCm/aiter.git + ENV AITER_COMMIT=v0.1.2 + # v0.1.2 version - commit id: 9d11f47 + # ENV AITER_COMMIT=9d11f47 + ENV HIP_FORCE_DEV_KERNARG=1 + ENV HSA_NO_SCRATCH_RECLAIM=1 + ENV SGLANG_SET_CPU_AFFINITY=1 + ENV SGLANG_ALLOW_OVERWRITE_LONGER_CONTEXT_LEN=1 + ENV NCCL_MIN_NCHANNELS=112 + ENV MOE_PADDING=1 + ENV VLLM_FP8_PADDING=1 + ENV VLLM_FP8_ACT_PADDING=1 + ENV VLLM_FP8_WEIGHT_PADDING=1 + ENV VLLM_FP8_REDUCE_CONV=1 + ENV TORCHINDUCTOR_MAX_AUTOTUNE=1 + ENV TORCHINDUCTOR_MAX_AUTOTUNE_POINTWISE=1 + ENV HIPCC_COMPILE_FLAGS_APPEND="--offload-arch=gfx942" + ENV AMDGPU_TARGETS=gfx942 + ENV ROCM_ARCH=gfx942 + ENV PYTORCH_ROCM_ARCH="gfx90a;gfx942" + # Switch to working directory + WORKDIR /sgl-workspace + # Clean and create directory + RUN rm -rf /sgl-workspace && mkdir -p /sgl-workspace + + # Clone and build sglang + RUN git clone ${SGL_REPO} \ + && cd sglang \ + && git checkout ${SGL_BRANCH} || 
echo "Using default branch" \ + && cd sgl-kernel \ + && rm -f pyproject.toml \ + && mv pyproject_rocm.toml pyproject.toml \ + && python setup_rocm.py install \ + && cd .. \ + && if [ "$BUILD_TYPE" = "srt" ]; then \ + python -m pip --no-cache-dir install -e "python[srt_hip]"; \ + else \ + python -m pip --no-cache-dir install -e "python[all_hip]"; \ + fi \ + && cd /sgl-workspace \ + && cp -r /sgl-workspace/sglang /sglang \ + && python -m pip cache purge + + # Install common Python packages + RUN pip install IPython orjson python-multipart torchao pybind11 + # Rebuild Triton + RUN pip uninstall -y triton || true \ + && git clone ${TRITON_REPO} \ + && cd triton \ + && git checkout ${TRITON_COMMIT} \ + && cd python \ + && python3 setup.py install \ + && cd /sgl-workspace + # ENV HIPCC_COMPILE_FLAGS_APPEND="--offload-arch=gfx942 --amdgpu-lower-module-lds-strategy=1" + # ENV HIPCC_COMPILE_FLAGS_APPEND="--offload-arch=gfx942" + + # Build aiter + #version: Commit 9d11f47 + # && git checkout ${AITER_COMMIT} \ + RUN pip uninstall -y aiter || true + RUN git clone ${AITER_REPO} \ + && cd aiter \ + && git checkout ${AITER_COMMIT} \ + && git submodule sync \ + && git submodule update --init --recursive \ + && PREBUILD_KERNELS=1 GPU_ARCHS=gfx942 python3 setup.py install \ + && cd /sgl-workspace + + # Copy MI300X config + RUN find /sgl-workspace/sglang/python/sglang/srt/layers/quantization/configs/ \ + /sgl-workspace/sglang/python/sglang/srt/layers/moe/fused_moe_triton/configs/ \ + -type f -name '*MI300X*' | \ + xargs -I {} sh -c 'vf_config=$(echo "$1" | sed "s/MI300X/MI300X_VF/"); cp "$1" "$vf_config"' -- {} + + # Environment setup complete. + RUN echo "Environment setup complete." 
+ + WORKDIR /workspace/ + ########################################### + ########################################### + ########################################### + + + + + + + ########################################### + ###############vllm v0.8.5################# + ########################################### + WORKDIR /workspace/ + + ENV VLLM_TARGET_DEVICE=rocm + ENV ROCM_PATH=/opt/rocm + ENV SETUPTOOLS_SCM_PRETEND_VERSION=0.8.5.dev + # Find the repo path in: DockerFile/Dockerfile.rocm_yang + # RUN git clone https://github.com/RLFoundation/vllm-patch.git + RUN pip uninstall -y vllm || true + RUN rm -rf vllm-patch + RUN git clone https://github.com/RLFoundation/vllm-patch.git \ + && cd vllm-patch \ + && git checkout v0.8.5-sleep-numa \ + && rm -rf build/ dist/ *.egg-info \ + && ln -sf /opt/rocm/lib/libamdhip64.so /usr/lib/libamdhip64.so \ + && SETUPTOOLS_SCM_PRETEND_VERSION=0.8.5.dev PYTORCH_ROCM_ARCH="gfx90a;gfx942" MAX_JOBS=${MAX_JOBS} python3 setup.py install + # RUN SETUPTOOLS_SCM_PRETEND_VERSION=0.8.5.dev PYTORCH_ROCM_ARCH="gfx90a;gfx942" MAX_JOBS=${MAX_JOBS} python3 setup.py develop + WORKDIR /workspace/ + ########################################### + ########################################### + ########################################### + + + + + ######################################### + #### Install megatron-core############### + ######################################### + RUN pip uninstall -y megatron-core && \ + git clone https://github.com/yushengsu-thu/Megatron-LM-amd_version.git && \ + cd Megatron-LM-amd_version && \ + pip install -vvv -e . 
&& \ + cd /workspace/ + ######################################### + ######################################### + ######################################### + + + + + ####################################### + ################apex################### + ####################################### + WORKDIR /workspace/ + RUN pip uninstall -y apex && \ + git clone https://github.com/ROCm/apex.git && \ + cd apex && \ + python setup.py install && \ + cd /workspace/ + ####################################### + ####################################### + ####################################### + + + ################################################################################ + ###########################Add torch_memory_saver############################### + ################################################################################ + # Set environment variables + ENV HIPCC_COMPILE_FLAGS_APPEND="--amdgpu-target=gfx90a;gfx942 -D__HIP_PLATFORM_AMD__" + ENV CFLAGS="-D__HIP_PLATFORM_AMD__" + ENV CXXFLAGS="-D__HIP_PLATFORM_AMD__" + RUN pip install "git+https://github.com/YangWang92/torch_memory_saver_numa.git@numa" + ################################################################################ + ################################################################################ + ################################################################################ + + + + ######################################## + ######Install ray####################### + ######################################## + # need to add this patch: https://github.com/ray-project/ray/pull/53531/files + RUN pip uninstall ray -y + RUN pip install "ray[data,train,tune,serve]>=2.47.0" + ######################################## + ######################################## + ######################################## + + + ########################################## + #######Install other dependencies######### + ########################################## + RUN pip install "tensordict==0.6.2" --no-deps && \ + pip install
accelerate \ + codetiming \ + datasets \ + dill \ + hydra-core \ + liger-kernel \ + numpy \ + pandas \ + peft \ + "pyarrow>=15.0.0" \ + pylatexenc \ + torchdata \ + wandb \ + orjson \ + pybind11 + + WORKDIR /workspace/ + RUN git clone https://github.com/volcengine/verl.git && \ + cd verl && \ + pip install -e . + ########################################## + ########################################## + ########################################## + + WORKDIR /workspace/ + CMD ["/usr/bin/bash"] + + +Build the image: +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + docker build -f docker/Dockerfile.rocm -t verl-rocm . + +Run the container +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Note: You can pull the docker from this DockerHub: [RLSys Foundation](https://hub.docker.com/u/yushengsuthu) +Pull the image: +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + docker pull rlsys/verl:verl-0.4.1_ubuntu-22.04_rocm6.3.4-numa-patch_vllm0.8.5_sglang0.4.6.post4 + + docker tag rlsys/verl:verl-0.4.1_ubuntu-22.04_rocm6.3.4-numa-patch_vllm0.8.5_sglang0.4.6.post4 verl-rocm:latest + +Run the container +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + +Optional: Running without root and with user permissions +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. code-block:: bash + + docker run --rm -it \ + --device /dev/dri \ + --device /dev/kfd \ + -p 8265:8265 \ + --group-add video \ + --cap-add SYS_PTRACE \ + --security-opt seccomp=unconfined \ + --privileged \ + -v $HOME/.ssh:/root/.ssh \ + -v $HOME:$HOME \ + --shm-size 128G \ + -w $PWD \ + verl-rocm \ + /bin/bash + +(Optional): If you do not want to run in root mode and would rather run as your own user, +please add ``-e HOST_UID=$(id -u)`` and ``-e HOST_GID=$(id -g)`` to the above docker launch script. + +Example +------- + +Due to the special settings of AMD (ROCm) torch: +1. 
If your ``ray>=2.45.0`` (default), you need to set ``RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES`` when starting ray in verl's RLHF training and add this [patch](https://github.com/ray-project/ray/pull/53531/files). +2. If your ``ray<2.45.0``, you need to set ``RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES`` when starting ray in verl's RLHF training. +Inference ``$ENGINE`` can be ``vllm`` or ``sglang``. We choose ``vllm`` as default in the following examples. + + + +PPO +~~~ + +.. code-block:: bash + + YOUR_PROJECT_NAME=r1-verl-ppo-upstream + YOUR_RUN_NAME=r1-training_ppo-upstream + # export HYDRA_FULL_ERROR=1 + + export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + + # [ray] < 2.45.0 + #export RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1 + + # [ray] >= 2.45.0 + export RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES=1 # Patch with https://github.com/ray-project/ray/pull/52794 + + GPUS_PER_NODE=8 + MODEL_PATH=Qwen/Qwen2.5-0.5B-Instruct + python3 examples/data_preprocess/gsm8k.py --local_save_dir data/gsm8k + python3 -c "import transformers; transformers.pipeline('text-generation', model='$MODEL_PATH')" + ENGINE=vllm #sglang + + PYTHONUNBUFFERED=1 python3 -m verl.trainer.main_ppo \ + data.train_files=data/gsm8k/train.parquet \ + data.val_files=data/gsm8k/test.parquet \ + data.train_batch_size=256 \ + data.val_batch_size=1312 \ + data.max_prompt_length=512 \ + data.max_response_length=256 \ + actor_rollout_ref.model.path=$MODEL_PATH \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.ppo_mini_batch_size=64 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=1 \ + actor_rollout_ref.rollout.name=$ENGINE \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.8 \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \ + critic.optim.lr=1e-5 \ + critic.model.path=$MODEL_PATH \ + critic.ppo_micro_batch_size_per_gpu=4 \ + 
algorithm.kl_ctrl.kl_coef=0.001 \ + trainer.logger=console \ + trainer.project_name=$YOUR_PROJECT_NAME \ + trainer.experiment_name=$YOUR_RUN_NAME \ + trainer.val_before_train=False \ + trainer.n_gpus_per_node=$GPUS_PER_NODE \ + trainer.nnodes=1 \ + trainer.save_freq=10 \ + trainer.test_freq=10 \ + trainer.total_epochs=15 #2>&1 | tee verl_demo.log + +GRPO +~~~~ + +.. code-block:: bash + + YOUR_PROJECT_NAME=r1-verl-grpo-upstream + YOUR_RUN_NAME=r1-training_grpo-upstream + # export HYDRA_FULL_ERROR=1 + # export FSDP_VERBOSE=1 + + #export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + + # [ray] < 2.45.0 + #export RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1 + + # [ray] >= 2.45.0 + export RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES=1 # Patch with https://github.com/ray-project/ray/pull/52794 + + GPUS_PER_NODE=8 + MODEL_PATH=Qwen/Qwen2.5-0.5B-Instruct + # MODEL_PATH=Qwen/Qwen2-7B-Instruct + python3 examples/data_preprocess/gsm8k.py --local_save_dir data/gsm8k + python3 -c "import transformers; transformers.pipeline('text-generation', model='$MODEL_PATH')" + ENGINE=vllm #sglang + + python3 -m verl.trainer.main_ppo \ + algorithm.adv_estimator=grpo \ + data.train_files=data/gsm8k/train.parquet \ + data.val_files=data/gsm8k/test.parquet \ + data.train_batch_size=1024 \ + data.val_batch_size=1312 \ + data.max_prompt_length=512 \ + data.max_response_length=1024 \ + actor_rollout_ref.model.path=$MODEL_PATH \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.actor.ppo_mini_batch_size=256 \ + actor_rollout_ref.actor.use_dynamic_bsz=True \ + actor_rollout_ref.actor.ppo_max_token_len_per_gpu=24000 \ + actor_rollout_ref.actor.use_kl_loss=True \ + actor_rollout_ref.actor.kl_loss_coef=0.001 \ + actor_rollout_ref.actor.kl_loss_type=low_var_kl \ + actor_rollout_ref.model.enable_gradient_checkpointing=False \ + actor_rollout_ref.actor.fsdp_config.param_offload=False \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ + 
actor_rollout_ref.rollout.tensor_model_parallel_size=2 \ + actor_rollout_ref.rollout.name=$ENGINE \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.8 \ + actor_rollout_ref.rollout.n=5 \ + actor_rollout_ref.ref.fsdp_config.param_offload=False \ + algorithm.kl_ctrl.kl_coef=0.001 \ + trainer.critic_warmup=0 \ + trainer.logger=console \ + trainer.project_name=$YOUR_PROJECT_NAME \ + trainer.experiment_name=$YOUR_RUN_NAME \ + trainer.n_gpus_per_node=$GPUS_PER_NODE \ + trainer.val_before_train=False \ + trainer.nnodes=1 \ + trainer.save_freq=-1 \ + trainer.test_freq=10 \ + trainer.total_epochs=15 + + + +Multi-node training: slurm with Docker/Podman container +--------------------------------------------------------------------------------------- + +If you want to run multi-node training with slurm, you can use the following script. + +.. note:: + 1. You need to use ``podman`` or ``docker`` in the following script. We will release the apptainer script later. + 2. If you want to use ``podman``, you just replace ``docker`` with ``podman`` in the following script. + +The script includes the following steps: + +1. SLURM Configuration +2. Environment Setup +3. Docker/Podman Container Setup +4. Ray Cluster Initialization +5. Data Preprocessing +6. Model Setup +7. Training Launch + + +slurm_script.sh +~~~~~~~~~~~~~~~~~~~~ + +.. 
code-block:: bash + + #!/bin/bash + + #SBATCH --job-name=verl-ray-on-slurm + #SBATCH --nodes=2 + #SBATCH --ntasks-per-node=2 + #SBATCH --mem=200G + #SBATCH --time=30-00:00:00 + #SBATCH --gpus-per-node=8 + #SBATCH --cpus-per-task=28 + #SBATCH --output=../verl_log/slurm-%j.out + #SBATCH --error=../verl_log/slurm-%j.err + #SBATCH --nodelist=gpu-[0,1] + + + # load necessary modules + ### Run this setup + # [Cluster]: Use docker + # docker pull docker.io/rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4 + + + ########################################################################## + ###The following setting should be set in different project and cluster### + ########################################################################## + + ### Project + CONTAINER_NAME="multinode_verl_training" + IMG="verl.rocm" + DOCKERFILE="docker/Dockerfile.rocm" + # echo $PWD + verl_workdir="${HOME}/projects/verl_upstream" + export TRANSFORMERS_CACHE="${HOME}/.cache/huggingface" + export HF_HOME=$TRANSFORMERS_CACHE + + ### Cluster Network Setting + export NCCL_DEBUG=TRACE + export GPU_MAX_HW_QUEUES=2 + export TORCH_NCCL_HIGH_PRIORITY=1 + export NCCL_CHECKS_DISABLE=1 + # export NCCL_IB_HCA=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7 + export NCCL_IB_HCA=mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_8,mlx5_9 + export NCCL_IB_GID_INDEX=3 + export NCCL_CROSS_NIC=0 + export CUDA_DEVICE_MAX_CONNECTIONS=1 + export NCCL_PROTO=Simple + export RCCL_MSCCL_ENABLE=0 + export TOKENIZERS_PARALLELISM=false + export HSA_NO_SCRATCH_RECLAIM=1 + ########################################################################## + + ## Assign using GPUs + export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + + ### For rocm and training script + # [ray] < 2.45.0 + #export RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1 + + # [ray] >= 2.45.0 + export RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES=1 # Patch with https://github.com/ray-project/ray/pull/52794 + + + # Build and launch the Docker container + srun bash -c " + # 
Exit on any error + set -e + + # Clean up dangling images (images with tag) + docker image prune -f + + # Need to pull the docker first + docker pull rlsys/verl:verl-0.4.1_ubuntu-22.04_rocm6.3.4-numa-patch_vllm0.8.5_sglang0.4.6.post4 + + if ! docker images --format "{{.Repository}}:{{.Tag}}" | grep -q "${IMG}"; then + echo \"Building ${IMG} image...\" + docker build -f \"${DOCKERFILE}\" -t \"${IMG}\" . + else + echo \"${IMG} image already exists, skipping build\" + fi + + # Removing old container if exists + docker rm \"${CONTAINER_NAME}\" 2>/dev/null || true + + # Checking network devices + ibdev2netdev + + # Launch the docker + docker run --rm -d \ + -e HYDRA_FULL_ERROR=1 \ + -e RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1 \ + -e RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES=1 \ + -e NCCL_DEBUG=${NCCL_DEBUG} \ + -e GPU_MAX_HW_QUEUES=${GPU_MAX_HW_QUEUES} \ + -e TORCH_NCCL_HIGH_PRIORITY=${TORCH_NCCL_HIGH_PRIORITY} \ + -e NCCL_CHECKS_DISABLE=${NCCL_CHECKS_DISABLE} \ + -e NCCL_IB_HCA=${NCCL_IB_HCA} \ + -e NCCL_IB_GID_INDEX=${NCCL_IB_GID_INDEX} \ + -e NCCL_CROSS_NIC=${NCCL_CROSS_NIC} \ + -e CUDA_DEVICE_MAX_CONNECTIONS=${CUDA_DEVICE_MAX_CONNECTIONS} \ + -e NCCL_PROTO=${NCCL_PROTO} \ + -e RCCL_MSCCL_ENABLE=${RCCL_MSCCL_ENABLE} \ + -e TOKENIZERS_PARALLELISM=${TOKENIZERS_PARALLELISM} \ + -e HSA_NO_SCRATCH_RECLAIM=${HSA_NO_SCRATCH_RECLAIM} \ + -e TRANSFORMERS_CACHE=${TRANSFORMERS_CACHE} \ + -e HF_HOME=${HF_HOME} \ + --network host \ + --device /dev/dri \ + --device /dev/kfd \ + --device /dev/infiniband \ + --group-add video \ + --cap-add SYS_PTRACE \ + --security-opt seccomp=unconfined \ + --privileged \ + -v \${HOME}:\${HOME} \ + -v \${HOME}/.ssh:/root/.ssh \ + -w "${verl_workdir}" \ + --shm-size 128G \ + --name \"${CONTAINER_NAME}\" \ + \"${IMG}\" \ + tail -f /dev/null + + echo \"Container setup completed\" + " + # (Optional): If you do not want to root mode and require assign yuorself as the user + # Please add `-e HOST_UID=$(id -u)` and `-e HOST_GID=$(id -g)` into the 
above docker launch script. + + + + + + ### Ray launch the nodes before training + + # Getting the node names + nodes_array=($(scontrol show hostnames "$SLURM_JOB_NODELIST" | tr '\n' ' ')) + + head_node=${nodes_array[0]} + head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address) + + # if we detect a space character in the head node IP, we'll + # convert it to an ipv4 address. This step is optional. + if [[ "$head_node_ip" == *" "* ]]; then + IFS=' ' read -ra ADDR <<<"$head_node_ip" + if [[ ${#ADDR[0]} -gt 16 ]]; then + head_node_ip=${ADDR[1]} + else + head_node_ip=${ADDR[0]} + fi + echo "IPV6 address detected. We split the IPV4 address as $head_node_ip" + fi + + port=6379 + ip_head=$head_node_ip:$port + export ip_head + echo "IP Head: $ip_head" + + # make sure we set environment variables before Ray initialization + + # Print out all env variables + printenv + + echo "Starting HEAD at $head_node" + srun --nodes=1 --ntasks=1 -w "$head_node" \ + docker exec "${CONTAINER_NAME}" \ + ray start --head --node-ip-address="$head_node_ip" --port=$port \ + --dashboard-port=8266 \ + --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus "${SLURM_GPUS_PER_NODE}" --block & + # optional, though may be useful in certain versions of Ray < 1.0. + sleep 10 + + # number of nodes other than the head node + worker_num=$((SLURM_JOB_NUM_NODES - 1)) + + for ((i = 1; i <= worker_num; i++)); do + node_i=${nodes_array[$i]} + echo "Debug: Starting worker on node_i = ${node_i}" + if [ -z "$node_i" ]; then + echo "Error: Empty node name for worker $i" + continue + fi + echo "Starting WORKER $i at $node_i" + srun --nodes=1 --ntasks=1 -w "$node_i" \ + docker exec "${CONTAINER_NAME}" \ + ray start --address "$ip_head" --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus "${SLURM_GPUS_PER_NODE}" --block & + sleep 5 + done + + + + + # Ray initlization test (See whether any error in the above execution) + echo "Testing Ray initialization in the slurm nodes..." 
+ docker exec "${CONTAINER_NAME}" python3 -c ' + import ray + try: + ray.init(address="auto") + print("\n=== Ray Cluster Status ===") + print(f"Number of nodes: {len(ray.nodes())}") + for node in ray.nodes(): + print("Node: {}, Status: {}".format(node["NodeManagerHostname"], node["Alive"])) + # print(f"Node: {node}") + ray.shutdown() + print("Ray initialization successful!") + except Exception as e: + print(f"Ray initialization failed: {str(e)}") + ' + echo "=== Ray test completed ===" + ###### + + + + # Run data preprocessing + + echo "Starting data preprocessing..." + docker exec "${CONTAINER_NAME}" \ + python3 "examples/data_preprocess/gsm8k.py" "--local_save_dir" "../data/gsm8k" + + echo "Starting data preprocessing..." + docker exec "${CONTAINER_NAME}" \ + python3 "examples/data_preprocess/math_dataset.py" "--local_dir" "../data/math" + + train_files="../data/gsm8k/train.parquet" + val_files="../data/gsm8k/test.parquet" + + # Download and test model + echo "Loading model..." + docker exec "${CONTAINER_NAME}" \ + python3 -c "import transformers; transformers.pipeline('text-generation', model='Qwen/Qwen2-7B-Instruct')" + MODEL_PATH="Qwen/Qwen2-7B-Instruct" + + # Set model path after pipeline test + MODEL_PATH="Qwen/Qwen2.5-0.5B-Instruct" + + echo "== Data and model loading Done ==" + + echo "Start to train..." 
+ + docker exec "${CONTAINER_NAME}" \ + python3 -c "import transformers; transformers.pipeline('text-generation', model='Qwen/Qwen2-7B-Instruct')" + MODEL_PATH="Qwen/Qwen2-7B-Instruct" + + + PYTHONUNBUFFERED=1 srun --overlap --nodes=${SLURM_NNODES} --ntasks=1 -w "$head_node" \ + docker exec "${CONTAINER_NAME}" \ + python3 -m verl.trainer.main_ppo \ + data.train_files=$train_files \ + data.val_files=$val_files \ + data.train_batch_size=1024 \ + data.max_prompt_length=1024 \ + data.max_response_length=1024 \ + actor_rollout_ref.model.path=$MODEL_PATH \ + actor_rollout_ref.model.enable_gradient_checkpointing=False \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.actor.ppo_mini_batch_size=256 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=8 \ + actor_rollout_ref.model.enable_gradient_checkpointing=True \ + actor_rollout_ref.actor.fsdp_config.param_offload=False \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=2 \ + actor_rollout_ref.rollout.name=vllm \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \ + actor_rollout_ref.ref.fsdp_config.param_offload=True \ + critic.optim.lr=1e-5 \ + critic.model.use_remove_padding=True \ + critic.model.path=$MODEL_PATH \ + critic.model.enable_gradient_checkpointing=False \ + critic.ppo_micro_batch_size_per_gpu=8 \ + critic.model.fsdp_config.param_offload=False \ + critic.model.fsdp_config.optimizer_offload=False \ + algorithm.kl_ctrl.kl_coef=0.0001 \ + trainer.critic_warmup=0 \ + trainer.logger='["console","wandb"]' \ + trainer.project_name='verl_example' \ + trainer.experiment_name='Qwen2.5-32B-Instruct_function_rm' \ + trainer.n_gpus_per_node=${SLURM_GPUS_PER_NODE} \ + trainer.val_before_train=False \ + trainer.nnodes=${SLURM_NNODES} \ + 
trainer.save_freq=-1 \ + trainer.test_freq=10 \ + trainer.total_epochs=15 + + +Run slurm_script.sh +~~~~~~~~~~~~~~~~~~~~ +Just sbatch your slurm_script.sh + +.. code-block:: bash + + sbatch slurm_script.sh + diff --git a/code/RL_model/verl/verl_train/docs/amd_tutorial/amd_vllm_page.rst b/code/RL_model/verl/verl_train/docs/amd_tutorial/amd_vllm_page.rst new file mode 100644 index 0000000000000000000000000000000000000000..7c230acab8792406e0ecb82d1a4fb417ba027a2e --- /dev/null +++ b/code/RL_model/verl/verl_train/docs/amd_tutorial/amd_vllm_page.rst @@ -0,0 +1,41 @@ +verl performance tuning for AMD (ROCm Kernel) +===================================================== + +Last updated: 11/13/2025. + +Author: `Yang Wang `_, `Songlin Jiang `_ + +Use vLLM Sleep Mode for AMD MI3xx series GPUs +-------------------------------------------------------------- + +By default, verl requires vLLM to enable sleep mode, which allows vLLM to offload GPU memory to CPU memory after rollout. This feature has been merged into the main branch of vLLM for version later than 0.11.0. + +For now, you can use the vLLM main branch and build it from the source code, or you can directly install vLLM from the pre-built ROCm wheels for vLLM version later than 0.11.0 when it's available. + +1. Clone the vLLM repository and build it with the following commands: + +.. code-block:: bash + + git clone https://github.com/vllm-project/vllm.git + cd vllm + git reset --hard 4ca5cd5740c0cd7788cdfa8b7ec6a27335607a48 # You can also use a later commit as you wish + python -m pip install -r requirements/rocm.txt + VLLM_TARGET_DEVICE=rocm ROCM_PATH=/opt/rocm/ python3 setup.py develop + +2. Additionally, we recommend you to use the ROCm version later than or equal to ROCm 7.0. + +After the upgrade, you can verify whether sleep mode is working by trying out `these scripts `_. + +If sleep mode is working, you should see the memory usage reduce after sleep. 
+ +After applying the vLLM patch and completing the installation, you can enable sleep mode in verl to reduce memory overhead. This allows verl to offload unused GPU memory during rollout, significantly lowering the memory footprint during long-context training or multi-node reinforcement learning. + + +Enable CUDA Graph and Bypass ROCm-related issues +-------------------------------------------------------------- + +Due to potential issues with CUDA graph capture in ROCm, we've found that vLLM's CUDA graph feature cannot be enabled on multiple nodes in verl on AMD platforms with vLLM V1 mode. This leads to significantly slower rollout performance. + +Our investigation shows that ROCm may trigger an unexpected crash when attempting to capture large batches with CUDA graph. One workaround is to set ``actor_rollout_ref.rollout.cudagraph_capture_sizes`` to values such as ``[1, 2, 4, 8, 16, 32, 64]`` (change depending on your GPU memory size). + +Then, you can choose to enable CUDA graph by setting ``actor_rollout_ref.rollout.enforce_eager`` to ``False`` in your verl configuration file. diff --git a/code/RL_model/verl/verl_train/docs/api/data.rst b/code/RL_model/verl/verl_train/docs/api/data.rst new file mode 100644 index 0000000000000000000000000000000000000000..5baa5b51bfdb79f6ead72f1f46141720248bd813 --- /dev/null +++ b/code/RL_model/verl/verl_train/docs/api/data.rst @@ -0,0 +1,61 @@ +Data interface +========================= + +Last updated: 05/19/2025 (API docstrings are auto-generated). + +DataProto is the interface for data exchange. + +The :class:`verl.DataProto` class contains two key members: + +- batch: a :class:`tensordict.TensorDict` object for the actual data +- meta_info: a :class:`Dict` with additional meta information + +TensorDict +~~~~~~~~~~~~ + +:attr:`DataProto.batch` is built on top of :class:`tensordict`, a project in the PyTorch ecosystem. +A TensorDict is a dict-like container for tensors. 
To instantiate a TensorDict, you must specify key-value pairs as well as the batch size. + +.. code-block:: python + + >>> import torch + >>> from tensordict import TensorDict + >>> tensordict = TensorDict({"zeros": torch.zeros(2, 3, 4), "ones": torch.ones(2, 3, 5)}, batch_size=[2,]) + >>> tensordict["twos"] = 2 * torch.ones(2, 5, 6) + >>> zeros = tensordict["zeros"] + >>> tensordict + TensorDict( + fields={ + ones: Tensor(shape=torch.Size([2, 3, 5]), device=cpu, dtype=torch.float32, is_shared=False), + twos: Tensor(shape=torch.Size([2, 5, 6]), device=cpu, dtype=torch.float32, is_shared=False), + zeros: Tensor(shape=torch.Size([2, 3, 4]), device=cpu, dtype=torch.float32, is_shared=False)}, + batch_size=torch.Size([2]), + device=None, + is_shared=False) + +One can also index a tensordict along its batch_size. The contents of the TensorDict can be manipulated collectively as well. + +.. code-block:: python + + >>> tensordict[..., :1] + TensorDict( + fields={ + ones: Tensor(shape=torch.Size([1, 3, 5]), device=cpu, dtype=torch.float32, is_shared=False), + twos: Tensor(shape=torch.Size([1, 5, 6]), device=cpu, dtype=torch.float32, is_shared=False), + zeros: Tensor(shape=torch.Size([1, 3, 4]), device=cpu, dtype=torch.float32, is_shared=False)}, + batch_size=torch.Size([1]), + device=None, + is_shared=False) + >>> tensordict = tensordict.to("cuda:0") + >>> tensordict = tensordict.reshape(6) + +For more about :class:`tensordict.TensorDict` usage, see the official tensordict_ documentation. + +.. _tensordict: https://pytorch.org/tensordict/stable/overview.html + + +Core APIs +~~~~~~~~~~~~~~~~~ + +.. 
autoclass:: verl.DataProto + :members: to, select, union, make_iterator, concat diff --git a/code/RL_model/verl/verl_train/docs/api/single_controller.rst b/code/RL_model/verl/verl_train/docs/api/single_controller.rst new file mode 100644 index 0000000000000000000000000000000000000000..44ea366ffe4b12ce5293821877ce70a0073f2152 --- /dev/null +++ b/code/RL_model/verl/verl_train/docs/api/single_controller.rst @@ -0,0 +1,30 @@ +Single Controller interface +============================ + +Last updated: 05/27/2025 (API docstrings are auto-generated). + +The Single Controller provides a unified interface for managing distributed workers +using Ray or other backends and executing functions across them. +It simplifies the process of dispatching tasks and collecting results, particularly +when dealing with data parallelism or model parallelism. + + +Core APIs +~~~~~~~~~~~~~~~~~ + +.. autoclass:: verl.single_controller.Worker + :members: __init__, __new__, get_master_addr_port, get_cuda_visible_devices, world_size, rank + +.. autoclass:: verl.single_controller.WorkerGroup + :members: __init__, world_size + +.. autoclass:: verl.single_controller.ClassWithInitArgs + :members: __init__, __call__ + +.. autoclass:: verl.single_controller.ResourcePool + :members: __init__, world_size, local_world_size_list, local_rank_list + +.. autoclass:: verl.single_controller.ray.RayWorkerGroup + :members: __init__ + +.. autofunction:: verl.single_controller.ray.create_colocated_worker_cls \ No newline at end of file diff --git a/code/RL_model/verl/verl_train/docs/api/trainer.rst b/code/RL_model/verl/verl_train/docs/api/trainer.rst new file mode 100644 index 0000000000000000000000000000000000000000..abfa51f01a31606f436a95fde13770577b9ab540 --- /dev/null +++ b/code/RL_model/verl/verl_train/docs/api/trainer.rst @@ -0,0 +1,31 @@ +Trainer Interface +================================ + +Last updated: 06/08/2025 (API docstrings are auto-generated). + +Trainers drive the training loop. 
Introducing new trainer classes for new training paradigms is encouraged. + +.. autosummary:: + :nosignatures: + + verl.trainer.ppo.ray_trainer.RayPPOTrainer + + +Core APIs +~~~~~~~~~~~~~~~~~ + +.. autoclass:: verl.trainer.ppo.ray_trainer.RayPPOTrainer + :members: __init__, init_workers, fit + +.. automodule:: verl.utils.tokenizer + :members: hf_tokenizer + +.. automodule:: verl.trainer.ppo.core_algos + :members: agg_loss, kl_penalty, compute_policy_loss + +.. automodule:: verl.trainer.ppo.reward + :members: load_reward_manager, compute_reward, compute_reward_async + +.. autoclass:: verl.workers.reward_manager.NaiveRewardManager + +.. autoclass:: verl.workers.reward_manager.DAPORewardManager diff --git a/code/RL_model/verl/verl_train/docs/api/utils.rst b/code/RL_model/verl/verl_train/docs/api/utils.rst new file mode 100644 index 0000000000000000000000000000000000000000..e15e3a5a32bdbb129a25d93b12e751385caa30b5 --- /dev/null +++ b/code/RL_model/verl/verl_train/docs/api/utils.rst @@ -0,0 +1,76 @@ +Utilities +============ + +Last updated: 05/19/2025 (API docstrings are auto-generated). + +This section documents the utility functions and classes in the VERL library. + +Python Functional Utilities +------------------------------ + +.. automodule:: verl.utils.py_functional + :members: append_to_dict + +File System Utilities +------------------------ + +.. automodule:: verl.utils.fs + :members: copy_to_local + +Tracking Utilities +--------------------- + +.. automodule:: verl.utils.tracking + :members: Tracking + +Metrics Utilities +--------------------- + +.. automodule:: verl.utils.metric + :members: reduce_metrics + +Checkpoint Management +------------------------ + +.. automodule:: verl.utils.checkpoint.checkpoint_manager + :members: find_latest_ckpt_path + +.. automodule:: verl.utils.checkpoint.fsdp_checkpoint_manager + :members: FSDPCheckpointManager + +Dataset Utilities +--------------------- + +.. 
automodule:: verl.utils.dataset.rl_dataset + :members: RLHFDataset, collate_fn + +Torch Functional Utilities +----------------------------- + +.. automodule:: verl.utils.torch_functional + :members: get_constant_schedule_with_warmup, masked_whiten, masked_mean, logprobs_from_logits + +Sequence Length Balancing +---------------------------- + +.. automodule:: verl.utils.seqlen_balancing + :members: get_reverse_idx, rearrange_micro_batches + +Ulysses Utilities +-------------------- + +.. automodule:: verl.utils.ulysses + :members: gather_outputs_and_unpad, ulysses_pad_and_slice_inputs + +FSDP Utilities +------------------ + +.. automodule:: verl.utils.fsdp_utils + :members: get_fsdp_wrap_policy, get_init_weight_context_manager, init_fn, load_fsdp_model_to_gpu, load_fsdp_optimizer, offload_fsdp_model_to_cpu, offload_fsdp_optimizer, + +Debug Utilities +------------------- + +.. automodule:: verl.utils.profiler + :members: log_gpu_memory_usage, GPUMemoryLogger + diff --git a/code/RL_model/verl/verl_train/docs/ascend_tutorial/ascend_consistency.rst b/code/RL_model/verl/verl_train/docs/ascend_tutorial/ascend_consistency.rst new file mode 100644 index 0000000000000000000000000000000000000000..20aab3c7057fb70e6b2326f72dce4aeee4002703 --- /dev/null +++ b/code/RL_model/verl/verl_train/docs/ascend_tutorial/ascend_consistency.rst @@ -0,0 +1,50 @@ +Align the Inference results of the verl and vLLM frameworks on Ascend devices(zh) +==================================== + +在昇腾设备上对齐verl和vLLM两个框架下的推理结果。 + +Last updated: 11/17/2025. 
+ +这是一份在昇腾设备上对齐verl和vLLM两个框架下推理结果的教程。 + +环境变量配置 +~~~~~~~~~~~~ + +在多卡通信情况下: + +- HCCL通信下(默认场景): + + - export CLOSE_MATMUL_K_SHIFT=1 + - export ATB_MATMUL_SHUFFLE_K_ENABLE=0 + - export HCCL_DETERMINISTIC="true" + - export VLLM_ENABLE_V1_MULTIPROCESSING=0 + +- LCCL通信下(通过export HCCL_OP_EXPANSION_MODE="AIV"使能): + + - export CLOSE_MATMUL_K_SHIFT=1 + - export ATB_MATMUL_SHUFFLE_K_ENABLE=0 + - export LCCL_DETERMINISTIC=1 + - export ATB_LLM_LCOC_ENABLE=0 + - export VLLM_ENABLE_V1_MULTIPROCESSING=0 + +在单卡无通信情况下: + +- HCCL和LCCL通信下: + + - export CLOSE_MATMUL_K_SHIFT=1 + - export ATB_MATMUL_SHUFFLE_K_ENABLE=0 + - export VLLM_ENABLE_V1_MULTIPROCESSING=0 + +vLLM初始化参数 +~~~~~~~~~~~~ + +需要对 SamplingParams 参数里单独设置seed, 保持vLLM和verl推理结果一致, 举例修改如下: + +.. code:: yaml + + sampling_params = SamplingParams(n=1, + logprobs=0, # can be set to 0 and let actor to recompute + max_tokens=config.response_length, + repetition_penalty=config.get("repetition_penalty", 1.0), + seed=1234) + diff --git a/code/RL_model/verl/verl_train/docs/ascend_tutorial/ascend_profiling_en.rst b/code/RL_model/verl/verl_train/docs/ascend_tutorial/ascend_profiling_en.rst new file mode 100644 index 0000000000000000000000000000000000000000..aa9c9adc8fc001dc34c1e510abe993edaa7fe7fb --- /dev/null +++ b/code/RL_model/verl/verl_train/docs/ascend_tutorial/ascend_profiling_en.rst @@ -0,0 +1,403 @@ +Performance data collection based on FSDP or MindSpeed(Megatron) on Ascend devices(en) +========================================================================================== + +Last updated: 12/20/2025. + +This is a tutorial for data collection using the GRPO or DAPO algorithm +based on FSDP or MindSpeed(Megatron) on Ascend devices. 
+ +Configuration +------------- + +Leverage two levels of configuration to control data collection: + +- **Global profiler control**: Use parameters in ``verl/trainer/config/ppo_trainer.yaml`` (FSDP) or ``verl/trainer/config/ppo_megatron_trainer.yaml`` (MindSpeed) to control the collection mode and steps. +- **Role profile control**: Use parameters in each role's ``profile`` field to control various parameters. + +Global collection control +~~~~~~~~~~~~~~~~~~~~~~~~~ + +Use parameters in ppo_trainer.yaml to control the collection mode +and steps. + +- global_profiler: Control the ranks and mode of profiling + + - tool: The profiling tool to use, options are nsys, npu, torch, + torch_memory. + - steps: This parameter can be set as a list that has + collection steps, such as [2, 4], which means it will collect steps 2 + and 4. If set to null, no collection occurs. + - save_path: The path to save the collected data. Default is + "outputs/profile". + + +Role collection control +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In each role's ``profiler`` field, you can control the collection mode for that role. + +- enable: Whether to enable profiling for this role. +- all_ranks: Whether to collect data from all ranks. +- ranks: A list of ranks to collect data from. If empty, no data is collected. +- tool_config: Configuration for the profiling tool used by this role. + +Use parameters in each role's ``profiler.tool_config.npu`` to control npu profiler behavior: + +- level: Collection level—options are level_none, level0, level1, and + level2 + + - level_none: Disables all level-based data collection (turns off profiler_level). + - level0: Collect high-level application data, underlying NPU data, and operator execution details on NPU. After balancing data volume and analytical capability, Level 0 is recommended as the default configuration. + - level1: Extends level0 by adding CANN-layer AscendCL data and AI Core performance metrics on NPU. 
+ - level2: Extends level1 by adding CANN-layer Runtime data and AI CPU metrics. + +- contents: A list of options to control the collection content, such as + npu, cpu, memory, shapes, module, stack. + + - npu: Whether to collect device-side performance data. + - cpu: Whether to collect host-side performance data. + - memory: Whether to enable memory analysis. + - shapes: Whether to record tensor shapes. + - module: Whether to record framework-layer Python call stack information. It is recommended to use 'module' instead of 'stack' for recording call stack information, as it costs less performance overhead. + - stack: Whether to record operator call stack information. + +- analysis: Enables automatic data parsing. +- discrete: Whether to enable discrete mode. + + +Examples +-------- + +Disabling collection +~~~~~~~~~~~~~~~~~~~~ + +.. code:: yaml + + global_profiler: + steps: null # disable profile + +End-to-End collection +~~~~~~~~~~~~~~~~~~~~~ + +.. code:: yaml + + global_profiler: + steps: [1, 2, 5] + save_path: ./outputs/profile + actor_rollout_ref: + actor: # Set actor role profiler collection configuration parameters + profiler: + enable: True + all_ranks: True + tool_config: + npu: + discrete: False + contents: [npu, cpu] # Control collection list, default cpu, npu, can configure memory, shapes, module, etc. + # rollout & ref follow actor settings + + +Discrete Mode Collection +~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
code:: yaml + + global_profiler: + steps: [1, 2, 5] + save_path: ./outputs/profile + actor_rollout_ref: + actor: + profiler: + enable: True # Set to True to profile training + all_ranks: False + ranks: [0] # Global Rank 0 + tool_config: + npu: + discrete: True + contents: [npu, cpu] + rollout: + profiler: + enable: True # Set to True to profile inference + all_ranks: False + ranks: [0] # In Agent Loop mode, this is the Replica Rank (e.g., 0-th instance) + tool_config: + npu: + discrete: True # Must be enabled in Agent Loop mode + # ref follow actor settings + +**Agent Loop Scenario Description**: + +When Rollout runs in `Agent Loop <../advance/agent_loop.rst>`_ mode, performance data for the Rollout phase **must be collected using discrete mode**. At this time, the Profiler is triggered by the inference engine backend. + +1. **Rank Meaning**: ``ranks`` in the Rollout config refers to the **Replica Rank** (instance index), not the global rank. +2. **Inference Engine Setup**: + + - **vLLM Engine** + - **Must be configured via environment variables**: + - ``VLLM_TORCH_PROFILER_DIR``: Directory to save traces (**Required**). + - ``VLLM_TORCH_PROFILER_WITH_STACK``: Control stack tracing (1: on, 0: off, default: on). + - ``VLLM_TORCH_PROFILER_RECORD_SHAPES``: Set to 1 to record shapes of operator inputs. + - ``VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY``: Set to 1 to track tensor memory allocation/free. + - ``VLLM_TORCH_PROFILER_WITH_FLOPS``: Set to 1 to estimate FLOPS. + - *Note: vLLM ignores the save_path and contents in yaml.* + + - **SGLang Engine** + - **Zero Configuration**. Automatically reads configuration from ``ppo_trainer.yaml``. + + +Visualization +------------- + +Collected data is stored in the user-defined save_path and can be +visualized by using the `MindStudio Insight `_ tool. 
+ +Additionally, in a Linux environment, the MindStudio Insight tool is provided in the form of a `JupyterLab Plugin `_ ,offering a more intuitive and highly interactive user interface. The advantages of the JupyterLab plugin are as follows: + +- Seamless integration: Supports running the MindStudio Insight tool directly within the Jupyter environment, eliminating the need to switch platforms or copy data from the server, enabling data to be collected and used immediately. +- Fast startup: Allows MindStudio Insight to be launched quickly via the JupyterLab command line or graphical interface. +- Smooth operation: In a Linux environment, launching MindStudio Insight through JupyterLab effectively alleviates performance lag compared to the full-package communication mode, significantly improving the user experience. +- Remote access: Supports remotely launching MindStudio Insight. Users can connect to the service via a local browser for direct visual analysis, reducing the difficulty of uploading and downloading data during large-model training or inference. + +If the analysis parameter is set to False, offline parsing is required after data collection: + +.. code:: python + + import torch_npu + # Set profiler_path to the parent directory of the "localhost.localdomain___ascend_pt" folder + torch_npu.profiler.profiler.analyse(profiler_path=profiler_path) + + +Advanced Guide: Fine-grained Collection +--------------------------------------- + +Background and Challenges +~~~~~~~~~~~~~~~~~~~~~~~~~ + +Although the configuration-based collection method mentioned above is convenient, it faces challenges in training scenarios with **long sequences (Long Context)** or **large global batch sizes (Large Global Batch Size)**. Within a complete training step (Step), model computation exhibits high-frequency and repetitive characteristics: + +1. 
**Rollout phase**: Sequence generation (Generate Sequence) is an autoregressive process involving thousands of forward computations of the Decoder model. +2. **Training phase**: To control peak memory usage, verl typically adopts a Micro-Batch strategy, dividing large data streams into multiple micro-batches for computation. + + - **compute_log_prob (Actor/Ref)**: Involves multiple rounds of pure forward propagation. + - **update_policy (Actor/Critic)**: Involves multiple rounds of forward and backward propagation. + +This characteristic leads to massive and repetitive operator records from full profiling. As shown in the image below: + +.. image:: https://raw.githubusercontent.com/mengchengTang/verl-data/master/verl_ascend_profiler.png + +Even with ``discrete`` mode enabled, performance data files for a single stage can still reach several TB, leading to **parsing failures** or **visualization tool lag**. + +Solution: Critical Path Sampling +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +To solve the above problems, we can adopt a **critical path sampling** strategy: Based on the API interface provided by `torch_npu.profiler `_, directly modify Python source code to collect only representative data segments (such as specific Decode Steps or the first Micro-Batch). + + **Important Notes** + + 1. This chapter involves direct source code modification. It is recommended to back up files before modification and restore them after debugging. + 2. When using code instrumentation for collection, be sure to **disable global collection** (``global_profiler: steps: null``) in ``ppo_trainer.yaml`` or ``ppo_megatron_trainer.yaml`` to avoid Profiler conflicts. + +1. Fine-grained Collection in Rollout Phase +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +For vLLM or SGLang inference engines, we can control the ``schedule`` parameter to collect model forward propagation performance data for specific tokens. 
+ +**vLLM Engine** + +- **Reference Version**: vLLM v0.11.0, vLLM-Ascend v0.11.0rc1 +- **Modified File**: ``vllm-ascend/vllm_ascend/worker/worker_v1.py`` + +.. code-block:: diff + + class NPUWorker(WorkerBase): + + def __init__(self, *args, **kwargs): + # ... existing code ... + + + # Initialize profiler + + import torch_npu + + experimental_config = torch_npu.profiler._ExperimentalConfig( + + profiler_level=torch_npu.profiler.ProfilerLevel.Level1, + + export_type=torch_npu.profiler.ExportType.Db, # You can choose torch_npu.profiler.ExportType.Text format + + ) + + self.profiler_npu = torch_npu.profiler.profile( + + activities=[torch_npu.profiler.ProfilerActivity.CPU, torch_npu.profiler.ProfilerActivity.NPU], + + with_modules=False, # Collect call stack + + profile_memory=False, # Collect memory + + experimental_config=experimental_config, + + # Skip first step, warmup one step, collect 3 steps, repeat 1 time. If you want to collect decode steps 30~70, set schedule=torch_npu.profiler.schedule(wait=29, warmup=1, active=30, repeat=1) + + schedule=torch_npu.profiler.schedule(wait=1, warmup=1, active=3, repeat=1), + + on_trace_ready=torch_npu.profiler.tensorboard_trace_handler("./outputs/vllm_profile", analyse_flag=True) # Data save path and whether to parse online + + ) + + self.profiler_npu.start() + + # ... existing code ... + + def execute_model(self, scheduler_output=None, intermediate_tensors=None, **kwargs): + # ... existing code ... + output = self.model_runner.execute_model(scheduler_output, + intermediate_tensors) + + + self.profiler_npu.step() # Drive schedule to collect partial decode steps + + # ... existing code ... + +**SGLang Engine** + +- **Reference Version**: SGLang master branch +- **Modified File**: ``sglang/python/sglang/srt/model_executor/model_runner.py`` + +.. code-block:: diff + + # ... existing imports ... + + import torch_npu + + class ModelRunner: + + def __init__(self, *args, **kwargs): + # ... existing init code ... 
+ + + # Initialize profiler (same configuration as above, omitted) + + experimental_config = torch_npu.profiler._ExperimentalConfig(...) + + self.profiler_npu = torch_npu.profiler.profile( + + # ... + + # Skip first step, warmup one step, collect 3 steps, repeat 1 time. + + schedule=torch_npu.profiler.schedule(wait=1, warmup=1, active=3, repeat=1), + + on_trace_ready=torch_npu.profiler.tensorboard_trace_handler("./outputs/sglang_profile", analyse_flag=True) + + ) + + self.profiler_npu.start() + + def forward(self, forward_batch, **kwargs): + # ... existing code ... + + + self.profiler_npu.step() # Drive schedule to collect partial decode steps + return output + +2. Fine-grained Collection in compute_log_prob (Actor & Ref) Phase +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +This phase computes probability distributions for new and old policies. + +**FSDP Backend** + +The FSDP backend allows fine-grained control at the Micro-Batch level. + +- **Modified File**: ``verl/workers/actor/dp_actor.py`` + +.. code-block:: diff + + # ... import dependencies ... + + import torch_npu + + class DataParallelPPOActor(BasePPOActor): + + def compute_log_prob(self, data: DataProto, calculate_entropy=False) -> torch.Tensor: + + + role = "Ref" if self.actor_optimizer is None else "Actor" + + # Prepare profiler (same configuration as above, omitted) + + experimental_config = torch_npu.profiler._ExperimentalConfig(...) + + self.prof_npu = torch_npu.profiler.profile( + + # ... + + # wait=0, warmup=0, active=1: directly collect first micro-batch + + schedule=torch_npu.profiler.schedule(wait=0, warmup=0, active=1, repeat=1), + + on_trace_ready=torch_npu.profiler.tensorboard_trace_handler(f"./outputs/{role}_compute_log_prob", analyse_flag=True) + + ) + + + + # This function is shared by ref and actor, set role flag to distinguish. 
If you want to collect actor_compute_log_prob, set if role=="Actor": + + if role=="Ref": + + self.prof_npu.start() + + for micro_batch in micro_batches: + + # ... original computation logic ... + with torch.no_grad(): + entropy, log_probs = self._forward_micro_batch(...) + + + # Drive schedule to collect micro batch + + if role=="Ref": + + self.prof_npu.step() + + # ... + + +**Megatron Backend** + +The Micro-Batch scheduling in the Megatron backend is managed internally by the framework and does not currently support fine-grained collection at the Micro-Batch level through simple code instrumentation. It is recommended to use global configuration for collection. + +3. Fine-grained Collection in update_policy (Actor & Critic) Phase +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The Update phase includes forward and backward propagation. + +**FSDP Backend** + +The FSDP backend supports collection at both Mini-Batch and Micro-Batch granularities. + +- **Modified File**: ``verl/workers/actor/dp_actor.py`` + +.. code-block:: diff + + # ... import dependencies ... + + import torch_npu + + class DataParallelPPOActor(BasePPOActor): + + def update_policy(self, data: DataProto): + + + # Prepare profiler (same configuration as above, omitted) + + experimental_config = torch_npu.profiler._ExperimentalConfig(...) + + self.prof_npu = torch_npu.profiler.profile( + + # ... + + # Only collect first Mini Batch (including all Micro-Batch computations and one optimizer update) + + schedule=torch_npu.profiler.schedule(wait=0, warmup=0, active=1, repeat=1), + + on_trace_ready=torch_npu.profiler.tensorboard_trace_handler("./outputs/fsdp_actor_update_profile", analyse_flag=True) + + ) + + self.prof_npu.start() + + # ... PPO Epochs loop ... + for _ in range(self.config.ppo_epochs): + # ... Mini Batch loop ... + for batch_idx, mini_batch in enumerate(mini_batches): + # ... mini_batches split ... + + for i, micro_batch in enumerate(micro_batches): + # ... 
Original Forward & Backward logic ... + # ... loss.backward() ... + pass + + grad_norm = self._optimizer_step() + + + # Drive schedule to collect mini batch, if you want micro batch collection, move self.prof_npu.step() inside the micro_batch loop + + self.prof_npu.step() + + +**Megatron Backend** + +The Megatron backend supports collection at the Mini-Batch granularity. + +- **Modified File**: ``verl/workers/actor/megatron_actor.py`` + +.. code-block:: diff + + class MegatronPPOActor(BasePPOActor): + + def update_policy(self, dataloader: Iterable[DataProto]) -> dict: + # ... + + # Prepare profiler (same configuration as above, omitted) + + experimental_config = torch_npu.profiler._ExperimentalConfig(...) + + self.prof_npu = torch_npu.profiler.profile( + + # ... + + # Only collect computation of first Mini Batch (including all Micro-Batches) and one optimizer update + + schedule=torch_npu.profiler.schedule(wait=0, warmup=0, active=1, repeat=1), + + on_trace_ready=torch_npu.profiler.tensorboard_trace_handler("./outputs/megatron_actor_update_profile", analyse_flag=True) + + ) + + self.prof_npu.start() + + for data in dataloader: + # ... internally calls self.forward_backward_batch for computation ... + # ... metric_micro_batch = self.forward_backward_batch(...) + + # ... self.actor_optimizer.step() ... 
+ + + # Drive schedule to collect mini batch + + self.prof_npu.step() \ No newline at end of file diff --git a/code/RL_model/verl/verl_train/docs/ascend_tutorial/ascend_profiling_zh.rst b/code/RL_model/verl/verl_train/docs/ascend_tutorial/ascend_profiling_zh.rst new file mode 100644 index 0000000000000000000000000000000000000000..6f27f81bea2bb7543b8e21c2f7292e8842fe5b98 --- /dev/null +++ b/code/RL_model/verl/verl_train/docs/ascend_tutorial/ascend_profiling_zh.rst @@ -0,0 +1,398 @@ +Performance data collection based on FSDP or MindSpeed(Megatron) on Ascend devices(zh) +================================================================================== + +在昇腾设备上基于 FSDP 或 MindSpeed (Megatron) 后端进行性能数据采集 +---------------------------------------------------------------- + +Last updated: 12/20/2025. + +这是一份在昇腾设备上基于FSDP或MindSpeed(Megatron)后端,使用GRPO或DAPO算法进行数据采集的教程。 + +配置 +---- + +使用两级profile设置来控制数据采集 + +- 全局采集控制:使用verl/trainer/config/ppo_trainer.yaml(FSDP),或verl/trainer/config/ppo_megatron_trainer.yaml(MindSpeed)中的配置项控制采集的模式和步数。 +- 角色profile控制:通过每个角色中的配置项控制等参数。 + +全局采集控制 +~~~~~~~~~~~~ + +通过 ppo_trainer.yaml 中的参数控制采集步数和模式: + +- global_profiler: 控制采集的rank和模式 + + - tool: 使用的采集工具,选项有 nsys、npu、torch、torch_memory。 + - steps: 此参数可以设置为包含采集步数的列表,例如 [2, 4],表示将采集第2步和第4步。如果设置为 null,则不进行采集。 + - save_path: 保存采集数据的路径。默认值为 "outputs/profile"。 + +角色profiler控制 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +在每个角色的 ``profiler`` 字段中,您可以控制该角色的采集模式。 + +- enable: 是否为此角色启用性能分析。 +- all_ranks: 是否从所有rank收集数据。 +- ranks: 要收集数据的rank列表。如果为空,则不收集数据。 +- tool_config: 此角色使用的性能分析工具的配置。 + +通过每个角色的 ``profiler.tool_config.npu`` 中的参数控制具体采集行为: + +- level: 采集级别—选项有 level_none、level0、level1 和 level2 + + - level_none: 禁用所有基于级别的数据采集(关闭 profiler_level)。 + - level0: 采集高级应用数据、底层NPU数据和NPU上的算子执行详情。在权衡数据量和分析能力后,level0是推荐的默认配置。 + - level1: 在level0基础上增加CANN层AscendCL数据和NPU上的AI Core性能指标。 + - level2: 在level1基础上增加CANN层Runtime数据和AI CPU指标。 + +- contents: 控制采集内容的选项列表,例如 + npu、cpu、memory、shapes、module、stack。 + + - npu: 是否采集设备端性能数据。 + - cpu: 
是否采集主机端性能数据。 + - memory: 是否启用内存分析。 + - shapes: 是否记录张量形状。 + - module: 是否记录框架层Python调用栈信息。相较于stack,更推荐使用module记录调用栈信息,因其产生的性能膨胀更低。 + - stack: 是否记录算子调用栈信息。 + +- analysis: 启用自动数据解析。 +- discrete: 使用离散模式。 + +示例 +---- + +禁用采集 +~~~~~~~~~~~~~~~~~~~~ + +.. code:: yaml + + global_profiler: + steps: null # disable profile + +端到端采集 +~~~~~~~~~~~~~~~~~~~~~ + +.. code:: yaml + + global_profiler: + steps: [1, 2, 5] + save_path: ./outputs/profile + actor_rollout_ref: + actor: # 设置 actor role 的 profiler 采集配置参数 + profiler: + enable: True + all_ranks: True + tool_config: + npu: + discrete: False + contents: [npu, cpu] # 控制采集列表,默认cpu、npu,可配置memory、shapes、module等 + + # rollout & ref follow actor settings + + +离散模式采集 +~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code:: yaml + + global_profiler: + steps: [1, 2, 5] + save_path: ./outputs/profile + actor_rollout_ref: + actor: + profiler: + enable: True # 设置为 True 以采集训练阶段 + all_ranks: False + ranks: [0] # 全局 Rank 0 + tool_config: + npu: + discrete: True + contents: [npu, cpu] + rollout: + profiler: + enable: True # 设置为 True 以采集推理阶段 + all_ranks: False + ranks: [0] # 在 Agent Loop 模式下,此处指推理实例的 Replica Rank (例如第 0 个实例) + tool_config: + npu: + discrete: True # Agent Loop 模式下必须开启离散模式 + # ref follow actor settings + +**Agent Loop 场景说明**: + +当 Rollout 运行在 `Agent Loop <../advance/agent_loop.rst>`_ 模式时,Rollout 阶段的性能数据 **必须使用离散模式** 采集。此时 Profiler 由推理引擎后端触发,配置要求如下: + +1. **Rank 含义**:Rollout 配置中的 ``ranks`` 指代 **Replica Rank**(实例索引),而非全局 Rank。 +2. 
**推理引擎配置**: + + - **vLLM 引擎** + - **必须通过环境变量配置**: + - ``VLLM_TORCH_PROFILER_DIR``: 设置数据保存路径(**必选**)。 + - ``VLLM_TORCH_PROFILER_WITH_STACK``: 是否记录调用栈 (1开启, 0关闭,默认开启)。 + - ``VLLM_TORCH_PROFILER_RECORD_SHAPES``: 设置为 1 以记录形状。 + - ``VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY``: 设置为 1 以记录内存。 + - ``VLLM_TORCH_PROFILER_WITH_FLOPS``: 设置为 1 以估算 FLOPS。 + - *注意:vLLM 会忽略 yaml 中的 save_path 和 contents。* + + - **SGLang 引擎** + - **零配置**。自动读取 ``ppo_trainer.yaml`` 中的配置。 + + +可视化 +------ + +采集后的数据存放在用户设置的save_path下,可通过 `MindStudio Insight `_ 工具进行可视化。 + +另外在Linux环境下,MindStudio Insight工具提供了 `JupyterLab插件 `_ 形态,提供更直观和交互式强的操作界面。JupyterLab插件优势如下: + +- 无缝集成:支持在Jupyter环境中直接运行MindStudio Insight工具,无需切换平台,无需拷贝服务器上的数据,实现数据即采即用。 +- 快速启动:通过JupyterLab的命令行或图形界面,可快速启动MindStudio Insight工具。 +- 运行流畅:在Linux环境下,通过JupyterLab环境启动MindStudio Insight,相较于整包通信,有效解决了运行卡顿问题,操作体验显著提升。 +- 远程访问:支持远程启动MindStudio Insight,可通过本地浏览器远程连接服务直接进行可视化分析,缓解了大模型训练或推理数据上传和下载的困难。 + +如果analysis参数设置为False,采集之后需要进行离线解析: + +.. code:: python + + import torch_npu + # profiler_path请设置为"localhost.localdomain___ascend_pt"目录的上一级目录 + torch_npu.profiler.profiler.analyse(profiler_path=profiler_path) + + +进阶指南:精细化采集 +-------------------- + +背景与挑战 +~~~~~~~~~~ + +上述基于配置文件的采集方式虽然便捷,但在 **长序列 (Long Context)** 或 **大全局批量 (Large Global Batch Size)** 的训练场景中面临挑战。 +在一个完整的训练步 (Step) 内,模型计算呈现出高频次、重复性的特征: + +1. Rollout 阶段:序列生成 (Generate Sequence) 是一个自回归过程,涉及成千上万次 Decoder 模型的前向计算。 +2. Training 阶段:为了控制显存峰值,verl 通常采用 Micro-Batch 策略,将庞大的数据流切分为多个微批次进行计算。 + + - compute_log_prob (Actor/Ref):涉及多轮纯前向传播。 + - update_policy (Actor/Critic):涉及多轮前向与反向传播。 + +这种特性会导致全量 Profiling 产生海量且重复的算子记录。如下图所示: + +.. image:: https://raw.githubusercontent.com/mengchengTang/verl-data/master/verl_ascend_profiler.png + +即使使用了 ``discrete`` 模式,单个阶段的性能数据文件仍可能达到数 TB,导致 **解析失败** 或 **可视化工具卡顿** 。 + +解决方案:关键路径采样 +~~~~~~~~~~~~~~~~~~~~~~ + +为了解决上述问题,我们可以采用 **关键路径采样** 策略:基于 `torch_npu.profiler `_ 提供的API接口,直接修改 Python 源码,仅采集具有代表性的数据片段(如特定 Decode Step 或首个 Micro-Batch)。 + + **重要提示** + + 1. 
本章节涉及直接修改源码。建议修改前备份文件,调试完成后恢复。 + 2. 使用代码插桩采集时,请务必在 ``ppo_trainer.yaml`` 或 ``ppo_megatron_trainer.yaml`` 中**禁用全局采集** (``global_profiler: steps: null``),以避免 Profiler 冲突。 + +1. Rollout 阶段精细化采集 +~~~~~~~~~~~~~~~~~~~~~~~~~ + +对于 vLLM 或 SGLang 推理引擎,我们可以通过控制 ``schedule`` 参数来控制采集模型在特定token的前向传播性能数据。 + +**vLLM 引擎** + +- **参考版本**:vLLM v0.11.0, vLLM-Ascend v0.11.0rc1 +- **修改文件**:``vllm-ascend/vllm_ascend/worker/worker_v1.py`` + +.. code-block:: diff + + class NPUWorker(WorkerBase): + + def __init__(self, *args, **kwargs): + # ... existing code ... + + + # Initialize profiler + + import torch_npu + + experimental_config = torch_npu.profiler._ExperimentalConfig( + + profiler_level=torch_npu.profiler.ProfilerLevel.Level1, + + export_type=torch_npu.profiler.ExportType.Db, # 可选择torch_npu.profiler.ExportType.Text格式 + + ) + + self.profiler_npu = torch_npu.profiler.profile( + + activities=[torch_npu.profiler.ProfilerActivity.CPU, torch_npu.profiler.ProfilerActivity.NPU], + + with_modules=False, # 采集调用栈 + + profile_memory=False, # 采集内存 + + experimental_config=experimental_config, + + # 跳过第一步,warmup一步,采集3步,重复1次。如果想采集第30~70个decode step,可以设置为schedule=torch_npu.profiler.schedule(wait=29, warmup=1, active=30, repeat=1) + + schedule=torch_npu.profiler.schedule(wait=1, warmup=1, active=3, repeat=1), + + on_trace_ready=torch_npu.profiler.tensorboard_trace_handler("./outputs/vllm_profile", analyse_flag=True) # 采集数据保存路径,是否在线解析 + + ) + + self.profiler_npu.start() + + # ... existing code ... + + def execute_model(self, scheduler_output=None, intermediate_tensors=None, **kwargs): + # ... existing code ... + output = self.model_runner.execute_model(scheduler_output, + intermediate_tensors) + + + self.profiler_npu.step() # 驱动 schedule,对部分decode step进行采集 + + # ... existing code ... + +**SGLang 引擎** + +- **参考版本**:SGLang master 分支 +- **修改文件**:``sglang/python/sglang/srt/model_executor/model_runner.py`` + +.. code-block:: diff + + # ... existing imports ... 
+ + import torch_npu + + class ModelRunner: + + def __init__(self, *args, **kwargs): + # ... existing init code ... + + + # Initialize profiler (配置同上,略) + + experimental_config = torch_npu.profiler._ExperimentalConfig(...) + + self.profiler_npu = torch_npu.profiler.profile( + + # ... + + # 跳过第一步,warmup一步,采集3步,重复1次。 + + schedule=torch_npu.profiler.schedule(wait=1, warmup=1, active=3, repeat=1), + + on_trace_ready=torch_npu.profiler.tensorboard_trace_handler("./outputs/sglang_profile", analyse_flag=True) + + ) + + self.profiler_npu.start() + + def forward(self, forward_batch, **kwargs): + # ... existing code ... + + + self.profiler_npu.step() # 驱动 schedule,对部分decode step进行采集 + return output + +2. compute_log_prob (Actor & Ref) 阶段精细化采集 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +该阶段计算新旧策略的概率分布。 + +**FSDP 后端** + +FSDP 后端允许在 Micro-Batch 级别进行精细控制。 + +- **修改文件**:``verl/workers/actor/dp_actor.py`` + +.. code-block:: diff + + # ... 引入依赖 ... + + import torch_npu + + class DataParallelPPOActor(BasePPOActor): + + def compute_log_prob(self, data: DataProto, calculate_entropy=False) -> torch.Tensor: + + + role = "Ref" if self.actor_optimizer is None else "Actor" + + # 准备 profiler (配置同上,略) + + experimental_config = torch_npu.profiler._ExperimentalConfig(...) + + self.prof_npu = torch_npu.profiler.profile( + + # ... + + # wait=0, warmup=0, active=1: 直接采集第一个 micro-batch + + schedule=torch_npu.profiler.schedule(wait=0, warmup=0, active=1, repeat=1), + + on_trace_ready=torch_npu.profiler.tensorboard_trace_handler(f"./outputs/{role}_compute_log_prob", analyse_flag=True) + + ) + + + + # 此函数ref和actor共用,设置role标志位来区分。如果想采集actor_compute_log_prob,可设置if role=="Actor": + + if role=="Ref": + + self.prof_npu.start() + + for micro_batch in micro_batches: + + # ... 原始计算逻辑 ... + with torch.no_grad(): + entropy, log_probs = self._forward_micro_batch(...) + + + # 驱动 schedule,对micro batch进行采集 + + if role=="Ref": + + self.prof_npu.step() + + # ... 
+ + +**Megatron 后端** + +Megatron 后端的 Micro-Batch 调度由框架内部管理,暂不支持通过简单的代码插桩进行 Micro-Batch 级别的精细化采集。建议使用全局配置进行采集。 + +3. update_policy (Actor & Critic) 阶段精细化采集 +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Update 阶段包含前向和反向传播。 + +**FSDP 后端** + +FSDP 后端支持设置对 Mini-Batch 和 Micro-Batch 的粒度进行采集。 + +- **修改文件**:``verl/workers/actor/dp_actor.py`` + +.. code-block:: diff + + # ... 引入依赖 ... + + import torch_npu + + class DataParallelPPOActor(BasePPOActor): + + def update_policy(self, data: DataProto): + + + # 准备 profiler (配置同上,略) + + experimental_config = torch_npu.profiler._ExperimentalConfig(...) + + self.prof_npu = torch_npu.profiler.profile( + + # ... + + # 仅采集第一个 Mini Batch(包含所有 Micro-Batch 的计算和一次优化器更新) + + schedule=torch_npu.profiler.schedule(wait=0, warmup=0, active=1, repeat=1), + + on_trace_ready=torch_npu.profiler.tensorboard_trace_handler("./outputs/fsdp_actor_update_profile", analyse_flag=True) + + ) + + self.prof_npu.start() + + # ... PPO Epochs 循环 ... + for _ in range(self.config.ppo_epochs): + # ... Mini Batch 循环 ... + for batch_idx, mini_batch in enumerate(mini_batches): + # ... mini_batches 切分 ... + + for i, micro_batch in enumerate(micro_batches): + # ... 原始 Forward & Backward 逻辑 ... + # ... loss.backward() ... + pass + + grad_norm = self._optimizer_step() + + + # 驱动 schedule,对mini batch进行采集,如果想对micro batch进行,则将self.prof_npu.step()移动到micro_batch的循环内 + + self.prof_npu.step() + + +**Megatron 后端** + +Megatron 后端支持以 Mini-Batch 的粒度进行采集。 + +- **修改文件**:``verl/workers/actor/megatron_actor.py`` + +.. code-block:: diff + + class MegatronPPOActor(BasePPOActor): + + def update_policy(self, dataloader: Iterable[DataProto]) -> dict: + # ... + + # 准备 profiler (配置同上,略) + + experimental_config = torch_npu.profiler._ExperimentalConfig(...) + + self.prof_npu = torch_npu.profiler.profile( + + # ... 
+ + # 仅采集第一个 Mini Batch 的计算(含所有 Micro-Batch)和一次优化器更新 + + schedule=torch_npu.profiler.schedule(wait=0, warmup=0, active=1, repeat=1), + + on_trace_ready=torch_npu.profiler.tensorboard_trace_handler("./outputs/megatron_actor_update_profile", analyse_flag=True) + + ) + + self.prof_npu.start() + + for data in dataloader: + # ... 内部会调用 self.forward_backward_batch 进行计算 ... + # ... metric_micro_batch = self.forward_backward_batch(...) + + # ... self.actor_optimizer.step() ... + + + # 驱动 schedule,对mini batch进行采集 + + self.prof_npu.step() diff --git a/code/RL_model/verl/verl_train/docs/ascend_tutorial/ascend_quick_start.rst b/code/RL_model/verl/verl_train/docs/ascend_tutorial/ascend_quick_start.rst new file mode 100644 index 0000000000000000000000000000000000000000..1fa607befe48e402ca8c4f7dd03549ef5830ef4f --- /dev/null +++ b/code/RL_model/verl/verl_train/docs/ascend_tutorial/ascend_quick_start.rst @@ -0,0 +1,289 @@ +Ascend Quickstart +=================================== + +Last updated: 12/11/2025. + +我们在 verl 上增加对华为昇腾设备的支持。 + + +关键更新 +---------------------------------- + +2025/12/11:verl 存量场景目前支持自动识别 NPU 设备类型, GPU 脚本在昇腾上运行,原则上不再需要显式设置 trainer.device=npu 参数,新增特性通过设置 trainer.device 仍可优先使用,逐步适配自动识别能力。 + + [说明] 自动识别 NPU 设备类型的前提,是运行程序所在环境包含 torch_npu 软件包。如不包含该软件包,仍需显式指定 trainer.device=npu 参数。 + +硬件支持 +----------------------------------- + +Atlas 200T A2 Box16 + +Atlas 900 A2 PODc + +Atlas 800T A3 + + +安装流程 +----------------------------------- + + +DockerFile镜像构建 & 使用 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +如需要通过 DockerFile 构建镜像,或希望使用基于 verl 构建的镜像,请参考 `文档 `_ 。 + + +安装基础环境 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +1. 
基础环境涉及以下软件包,请参考 `文档 `_ 安装。 + + +---------------+----------------------+ + | software | version | + +---------------+----------------------+ + | Python | >= 3.10, <3.12 | + +---------------+----------------------+ + | CANN | == 8.3.RC1 | + +---------------+----------------------+ + | torch | == 2.7.1 | + +---------------+----------------------+ + | torch_npu | == 2.7.1 | + +---------------+----------------------+ + +2. (可选)在 x86 平台安装时,pip 需要配置额外的源,指令如下: + + .. code-block:: bash + + pip config set global.extra-index-url "https://download.pytorch.org/whl/cpu/" + + +安装其他软件包 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +基础环境准备完毕后,需要通过指令安装以下软件包: + + +---------------+----------------------+ + | torchvision | == 0.22.1 | + +---------------+----------------------+ + | triton-ascend | == 3.2.0rc4 | + +---------------+----------------------+ + | transformers | latest release | + +---------------+----------------------+ + + 安装指令: + + .. code-block:: bash + + # 安装torchvision,版本需要和torch匹配 + pip install torchvision==0.22.1 + + # 清理环境上可能存在的历史triton/triton-ascend软件包残留 + pip uninstall -y triton triton-ascend + + # 安装triton-ascend,不需要单独安装triton + pip install triton-ascend==3.2.0rc4 + + +安装 vllm & vllm-ascend +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +1. 需确保CANN ascend-toolkit 和 nnal 环境变量被激活,对于CANN默认安装路径 /usr/local/Ascend 而言,激活指令如下: + + .. code-block:: + + source /usr/local/Ascend/ascend-toolkit/set_env.sh + source /usr/local/Ascend/nnal/atb/set_env.sh + +2. vllm 源码安装指令: + + .. code-block:: bash + + git clone --depth 1 --branch v0.11.0 https://github.com/vllm-project/vllm.git + cd vllm && VLLM_TARGET_DEVICE=empty pip install -v -e . && cd .. + +3. vllm-ascend 源码安装指令: + + .. code-block:: bash + + git clone --depth 1 --branch v0.11.0rc1 https://github.com/vllm-project/vllm-ascend.git + cd vllm-ascend && pip install -v -e . && cd .. + + +安装 MindSpeed +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +MindSpeed 源码安装指令: + + .. 
code-block:: bash + + # 下载 MindSpeed,切换到指定commit-id,并下载 Megatron-LM + git clone https://gitcode.com/Ascend/MindSpeed.git + cd MindSpeed && git checkout f2b0977e && cd .. + git clone --depth 1 --branch core_v0.12.1 https://github.com/NVIDIA/Megatron-LM.git + + # 安装 MindSpeed & Megatron + pip install -e MindSpeed + + # 将 Megatron-LM 源码路径配置到 PYTHONPATH 环境变量中 + export PYTHONPATH=$PYTHONPATH:"$(pwd)/Megatron-LM" + + # (可选)如希望 shell 关闭,或系统重启后,PYTHONPATH 环境变量仍然生效,建议将它添加到 .bashrc 配置文件中 + echo "export PYTHONPATH=$PYTHONPATH:\"$(pwd)/Megatron-LM\"" >> ~/.bashrc + + # 安装 mbridge + pip install mbridge + +MindSpeed 对应 Megatron-LM 后端使用场景,使用方式如下: + + 1. 使能 verl worker 模型 ``strategy`` 配置为 ``megatron`` ,例如 ``actor_rollout_ref.actor.strategy=megatron``。 + + 2. MindSpeed 自定义入参可通过 ``override_transformer_config`` 参数传入,例如对 actor 模型开启 FA 特性可使用 ``+actor_rollout_ref.actor.megatron.override_transformer_config.use_flash_attn=True``。 + + 3. 更多特性信息可参考 `MindSpeed & verl 文档 `_ 。 + + +安装verl +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. code-block:: bash + + git clone --depth 1 https://github.com/volcengine/verl.git + cd verl && pip install -r requirements-npu.txt && pip install -v -e . && cd .. + + +昇腾暂不支持生态库说明 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +verl 中昇腾暂不支持生态库如下: + + +---------------+----------------+ + | software | description | + +---------------+----------------+ + | flash_attn | not supported | + +---------------+----------------+ + | liger-kernel | not supported | + +---------------+----------------+ + + 1. 不支持通过 flash_attn 使能 flash attention 加速,支持通过 transformers 使用。 + 2. 不支持 liger-kernel 使能。 + + +快速开始 +----------------------------------- +正式使用前,建议您通过对Qwen2.5-0.5B GRPO的训练尝试以检验环境准备和安装的正确性。 + +1.下载数据集并将数据集预处理为parquet格式,以便包含计算RL奖励所需的必要字段 + + .. code-block:: bash + + python3 examples/data_preprocess/gsm8k.py --local_save_dir ~/data/gsm8k + +2.执行训练 + + .. 
code-block:: bash + + set -x + + export VLLM_ATTENTION_BACKEND=XFORMERS + + python3 -m verl.trainer.main_ppo \ + algorithm.adv_estimator=grpo \ + data.train_files=$HOME/data/gsm8k/train.parquet \ + data.val_files=$HOME/data/gsm8k/test.parquet \ + data.train_batch_size=128 \ + data.max_prompt_length=512 \ + data.max_response_length=128 \ + data.filter_overlong_prompts=True \ + data.truncation='error' \ + actor_rollout_ref.model.path=Qwen/Qwen2.5-0.5B-Instruct \ + actor_rollout_ref.actor.optim.lr=5e-7 \ + actor_rollout_ref.model.use_remove_padding=False \ + actor_rollout_ref.actor.entropy_coeff=0.001 \ + actor_rollout_ref.actor.ppo_mini_batch_size=64 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=20 \ + actor_rollout_ref.actor.use_kl_loss=True \ + actor_rollout_ref.actor.kl_loss_coef=0.001 \ + actor_rollout_ref.actor.kl_loss_type=low_var_kl \ + actor_rollout_ref.model.enable_gradient_checkpointing=True \ + actor_rollout_ref.actor.fsdp_config.param_offload=False \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=40 \ + actor_rollout_ref.rollout.enable_chunked_prefill=False \ + actor_rollout_ref.rollout.tensor_model_parallel_size=2 \ + actor_rollout_ref.rollout.name=vllm \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \ + actor_rollout_ref.rollout.n=5 \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=40 \ + actor_rollout_ref.ref.fsdp_config.param_offload=True \ + algorithm.kl_ctrl.kl_coef=0.001 \ + trainer.critic_warmup=0 \ + trainer.logger=console \ + trainer.project_name='verl_grpo_example_gsm8k' \ + trainer.experiment_name='qwen2_7b_function_rm' \ + trainer.n_gpus_per_node=8 \ + trainer.nnodes=1 \ + trainer.save_freq=-1 \ + trainer.test_freq=5 \ + trainer.total_epochs=1 $@ + + + +算法支持现状 +----------------------------------- + +**表1** RL类算法 + + 
+-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+ + | algorithm | model | download link | actor.strategy | rollout.name | shell location | hardware | + +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+ + | GRPO | Qwen2.5-7B-instruct |`7B `_ | FSDP | vllm-ascend |`qwen2_5_7b_grpo_npu `_ | Atlas 200T A2 Box16 | + +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+ + | GRPO | Qwen2.5-32B-instruct |`32B `_ | FSDP | vllm-ascend |`qwen2_5_32b_grpo_npu `_ | Atlas 200T A2 Box16 | + +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+ + | GRPO | Qwen2.5-VL-3B-instruct |`3B `_ | FSDP | vllm-ascend |`qwen2_5_vl_3b_npu `_ | Atlas 200T A2 Box16 | + 
+-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+ + | GRPO | Qwen2.5-VL-7B-instruct |`7B `_ | FSDP | vllm-ascend |`qwen2_5_vl_7b_npu `_ | Atlas 200T A2 Box16 | + +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+ + | GRPO | Qwen2.5-VL-32B-instruct |`32B `_ | FSDP | vllm-ascend |`qwen2_5_vl_32b_npu `_ | Atlas 200T A2 Box16 | + +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+ + | GRPO | Qwen3-4B |`4B `_ | FSDP | vllm-ascend |`qwen3-4B_npu `_ | Atlas 800T A3 | + +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+ + | GRPO | Qwen3-8B |`8B `_ | FSDP | vllm-ascend |`qwen3_8b_vllm_npu `_ | Atlas 200T A2 Box16 | + 
+-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+ + | GRPO | Qwen3-8B |`8B `_ | FSDP | sglang |`qwen3_8b_sglang_npu `_ | Atlas 200T A2 Box16 | + +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+ + | GRPO | Qwen3-32B |`32B `_ | FSDP | vllm-ascend |`qwen3-32B_npu `_ | Atlas 200T A2 Box16 | + +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+ + | GRPO | DeepSeekv3-671B |`671B `_ | Megatron | vllm-ascend |`deepseek_v3_megatron_npu `_ | Atlas 200T A2 Box16 | + +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+ + | DAPO | Qwen2.5-7B-instruct |`7B `_ | FSDP | vllm-ascend |`qwen2.5_7b_npu `_ | Atlas 200T A2 Box16 | + 
+-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+ + | DAPO | Qwen2.5-32B |`32B `_ | FSDP | vllm-ascend |`qwen2.5_32b_npu `_ | Atlas 200T A2 Box16 | + +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+ + | DAPO | Qwen3-8B-base |`8B `_ | FSDP | vllm-ascend |`qwen3_8b_npu `_ | Atlas 200T A2 Box16 | + +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+ + | DAPO | Qwen3-14B-base |`14B `_ | FSDP | vllm-ascend |`qwen3_14b_npu `_ | Atlas 200T A2 Box16 | + +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+ + | DAPO | Qwen3-30B-A3B-base |`30B `_ | FSDP | vllm-ascend |`qwen3_30b_fsdp_npu `_ | Atlas 200T A2 Box16 | + 
+-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+ + | DAPO | Qwen3-30B-A3B-base |`30B `_ | Megatron | vllm-ascend |`qwen3_30b_megatron_npu `_ | Atlas 200T A2 Box16 | + +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+ + | PPO | Qwen3-8B |`8B `_ | FSDP | vllm-ascend |`qwen3_8b_ppo_npu `_ | Atlas 900 A2 PODc | + +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+ + | One_Step_Off_Policy | Qwen3-8B |`8B `_ | FSDP2 | vllm-ascend |`qwen3_8b_fsdp2_npu `_ | Atlas 800T A3 | + +-----------------------+-------------------------+------------------------------------------------------------------+-------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+ + +**表2** SFT类算法 + + +-----------+-------------------------+------------------------------------------------------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+----------------------+ + | algorithm | model | download link | 
actor.strategy | shell location | hardware | + +-----------+-------------------------+------------------------------------------------------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+----------------------+ + | SFT-PEFT | Qwen3-8B |`8B `_ | FSDP |`sft_peft_sp2_npu `_ | Atlas 900 A2 PODc | + +-----------+-------------------------+-------------------------+----------------------------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+----------------------+ + | ReTool-SFT| Qwen2-7B-instruct |`7B `_ | FSDP |`qwen2_7b_sft_npu `_ | Atlas 900 A2 PODc | + +-----------+-------------------------+-------------------------+----------------------------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------+----------------------+ + + +声明 +----------------------------------- +verl中提供的ascend支持代码、Dockerfile、镜像皆为参考样例,如在生产环境中使用请通过官方正式途径沟通,谢谢。 diff --git a/code/RL_model/verl/verl_train/docs/ascend_tutorial/ascend_sglang_quick_start.rst b/code/RL_model/verl/verl_train/docs/ascend_tutorial/ascend_sglang_quick_start.rst new file mode 100644 index 0000000000000000000000000000000000000000..8b1661cbbe4e6fc0b2eba6aeacc485dc8be7d99a --- /dev/null +++ b/code/RL_model/verl/verl_train/docs/ascend_tutorial/ascend_sglang_quick_start.rst @@ -0,0 +1,153 @@ +Ascend Quickstart with SGLang Backend +=================================== + +Last updated: 01/27/2026. 
+ +我们在 verl 上增加对华为昇腾设备的支持。 + +硬件支持 +----------------------------------- + +Atlas 200T A2 Box16 + +Atlas 900 A2 PODc + +Atlas 800T A3 + + +安装 +----------------------------------- +关键支持版本 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + ++-----------+-----------------+ +| software | version | ++===========+=================+ +| Python | == 3.11 | ++-----------+-----------------+ +| HDK | >= 25.3.RC1 | ++-----------+-----------------+ +| CANN | >= 8.3.RC1 | ++-----------+-----------------+ +| torch | >= 2.7.1 | ++-----------+-----------------+ +| torch_npu | >= 2.7.1.post2 | ++-----------+-----------------+ +| sglang | v0.5.8 | ++-----------+-----------------+ + +从 Docker 镜像进行安装 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +我们提供了DockerFile进行构建,详见 `dockerfile_build_guidance `_ ,请根据设备自行选择对应构建文件 + +从自定义环境安装 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +**1. 安装HDK&CANN依赖并激活** + +异构计算架构CANN(Compute Architecture for Neural Networks)是昇腾针对AI场景推出的异构计算架构, 为了使训练和推理引擎能够利用更好、更快的硬件支持, 我们需要安装以下 `先决条件 `_ + ++-----------+-------------+ +| HDK | >= 25.3.RC1 | ++-----------+-------------+ +| CANN | >= 8.3.RC1 | ++-----------+-------------+ +安装完成后请激活环境 + +.. code-block:: bash + + source /usr/local/Ascend/ascend-toolkit/set_env.sh + source /usr/local/Ascend/nnal/atb/set_env.sh + +**2. 创建conda环境** + +.. code-block:: bash + + # create conda env + conda create -n verl-sglang python==3.11 + conda activate verl-sglang + +**3. 然后,执行我们在 verl 中提供的脚本** `install_sglang_mcore_npu.sh `_ + +如果在此步骤中遇到错误,请检查脚本并手动按照脚本中的步骤操作。 + +.. code-block:: bash + + git clone https://github.com/volcengine/verl.git + # Make sure you have activated verl conda env + # NPU_DEVICE=A3 or A2 depends on your device + NPU_DEVICE=A3 bash verl/scripts/install_sglang_mcore_npu.sh + +**4. 安装verl** + +.. code-block:: bash + + cd verl + pip install --no-deps -e . + pip install -r requirements-npu.txt + + +快速开始 +----------------------------------- + +**1.当前NPU sglang脚本一览** + +.. 
_Qwen3-30B: https://github.com/verl-project/verl/blob/main/examples/grpo_trainer/run_qwen3moe-30b_sglang_megatron_npu.sh +.. _Qwen2.5-32B: https://github.com/verl-project/verl/blob/main/examples/grpo_trainer/run_qwen2-32b_sglang_fsdp_npu.sh +.. _Qwen3-8B-1k: https://github.com/verl-project/verl/blob/main/examples/grpo_trainer/run_qwen3_8b_grpo_sglang_1k_spmd_npu.sh +.. _Qwen3-8B-32k: https://github.com/verl-project/verl/blob/main/examples/grpo_trainer/run_qwen3_8b_grpo_sglang_32k_spmd_npu.sh + + +-----------------+----------------+----------+-------------------+ + | 模型 | 推荐NPU型号 | 节点数量 | 训推后端 | + +=================+================+==========+===================+ + | `Qwen3-30B`_ | Atlas 800T A3 | 1 | SGLang + Megatron | + +-----------------+----------------+----------+-------------------+ + | `Qwen2.5-32B`_ | Atlas 900 A2 | 2 | SGLang + FSDP | + +-----------------+----------------+----------+-------------------+ + | `Qwen3-8B-1k`_ | Atlas A3/A2 | 1 | SGLang + FSDP | + +-----------------+----------------+----------+-------------------+ + | `Qwen3-8B-32k`_ | Atlas A3/A2 | 1 | SGLang + FSDP | + +-----------------+----------------+----------+-------------------+ + +**2.最佳实践** + +我们提供基于verl+sglang `Qwen3-30B`_ 以及 `Qwen2.5-32B`_ 的 `最佳实践 `_ 作为参考 + +**3.环境变量与参数** + +当前NPU上支持sglang后端必须添加以下环境变量 + +.. code-block:: bash + + #支持NPU单卡多进程 https://www.hiascend.com/document/detail/zh/canncommercial/850/commlib/hcclug/hcclug_000091.html + export HCCL_HOST_SOCKET_PORT_RANGE=60000-60050 + export HCCL_NPU_SOCKET_PORT_RANGE=61000-61050 + #规避ray在device侧调用无法根据is_npu_available接口识别设备可用性 + export RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES=1 + #根据当前设备和需要卡数定义 + export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 + #使能推理EP时需要 + export SGLANG_DEEPEP_BF16_DISPATCH=1 + + + +当前verl已解析推理常见参数, 详见 `async_sglang_server.py `_ 中 ServerArgs初始化传参,其他 `sglang参数 `_ 均可通过engine_kwargs 进行参数传递 + +vllm后端推理脚本转换为sglang, 需要添加修改以下参数 + +.. 
code-block:: bash + + #必须 + actor_rollout_ref.rollout.name=sglang + +actor_rollout_ref.rollout.engine_kwargs.sglang.attention_backend="ascend" + #可选 + #使能推理EP,详细使用方法见 https://github.com/sgl-project/sgl-kernel-npu/blob/main/python/deep_ep/README_CN.md + ++actor_rollout_ref.rollout.engine_kwargs.sglang.deepep_mode="auto" + ++actor_rollout_ref.rollout.engine_kwargs.sglang.moe_a2a_backend="deepep" + #Moe模型多DP时必须设置为True,非Moe多DP场景保持False(本示例为False) + +actor_rollout_ref.rollout.engine_kwargs.sglang.enable_dp_attention=False + #chunked_prefill默认关闭 + +actor_rollout_ref.rollout.engine_kwargs.sglang.chunked_prefill_size=-1 + + + diff --git a/code/RL_model/verl/verl_train/docs/ascend_tutorial/dockerfile_build_guidance.rst b/code/RL_model/verl/verl_train/docs/ascend_tutorial/dockerfile_build_guidance.rst new file mode 100644 index 0000000000000000000000000000000000000000..e9624d7a6d5ad09ce95b633f8d09437c85d4e946 --- /dev/null +++ b/code/RL_model/verl/verl_train/docs/ascend_tutorial/dockerfile_build_guidance.rst @@ -0,0 +1,82 @@ +Ascend Dockerfile Build Guidance +=================================== + +Last updated: 12/4/2025. 
+ +我们在verl上增加对华为昇腾镜像构建的支持。 + + +镜像硬件支持 +----------------------------------- + +Atlas 200T A2 Box16 + +Atlas 900 A2 PODc + +Atlas 800T A3 + + +镜像内各组件版本信息清单 +-------------------------- + +================= ============ +组件              版本 +================= ============ +基础镜像          Ubuntu 22.04 +Python            3.11 +CANN              8.3.RC1 +torch             2.7.1 +torch_npu         2.7.1 +torchvision       0.22.1 +vLLM              0.11.0 +vLLM-ascend       0.11.0rc1 +Megatron-LM       v0.12.1 +MindSpeed         (f2b0977e) +triton-ascend     3.2.0rc4 +mbridge           latest version +SGLang            v0.5.8 +sgl-kernel-npu    (46b73de) +================= ============ + + +Dockerfile构建镜像脚本清单 +--------------------------- + +============== ============== ============== ============================================================== +设备类型       基础镜像版本   推理后端       参考文件 +============== ============== ============== ============================================================== +A2             8.2.RC1        vLLM           `Dockerfile.ascend_8.2.rc1_a2 `_ +A2             8.3.RC1        vLLM           `Dockerfile.ascend_8.3.rc1_a2 `_ +A2             8.3.RC1        SGLang         `Dockerfile.ascend.sglang_8.3.rc1_a2 `_ +A3             8.2.RC1        vLLM           `Dockerfile.ascend_8.2.rc1_a3 `_ +A3             8.3.RC1        vLLM           `Dockerfile.ascend_8.3.rc1_a3 `_ +A3             8.3.RC1        SGLang         `Dockerfile.ascend.sglang_8.3.rc1_a3 `_ +============== ============== ============== ============================================================== + + +镜像构建命令示例 +-------------------- + +.. code:: bash + + # Navigate to the directory containing the Dockerfile + cd {verl-root-path}/docker/ascend + + # Build the image + # vLLM + docker build -f Dockerfile.ascend_8.3.rc1_a2 -t verl-ascend:8.3.rc1-a2 . + # SGLang + docker build -f Dockerfile.ascend.sglang_8.3.rc1_a2 -t verl-ascend-sglang:8.3.rc1-a2 . 
+ +公开镜像地址 +-------------------- + +昇腾在 `quay.io/ascend/verl `_ 中托管每日构建的 A2/A3 镜像,基于上述 Dockerfile 构建。 + +每日构建镜像名格式:verl-{CANN版本}-{NPU设备类型}-{操作系统版本}-{python版本}-latest + +verl release版本镜像名格式:verl-{CANN版本}-{NPU设备类型}-{操作系统版本}-{python版本}-{verl release版本号} + +声明 +-------------------- +verl中提供的ascend相关Dockerfile、镜像皆为参考样例,可用于尝鲜体验,如在生产环境中使用请通过官方正式途径沟通,谢谢。 \ No newline at end of file diff --git a/code/RL_model/verl/verl_train/docs/ascend_tutorial/examples/ascend_sglang_best_practices.rst b/code/RL_model/verl/verl_train/docs/ascend_tutorial/examples/ascend_sglang_best_practices.rst new file mode 100644 index 0000000000000000000000000000000000000000..e7a11299fa356c33fa5a4e3f11b0f179663a41de --- /dev/null +++ b/code/RL_model/verl/verl_train/docs/ascend_tutorial/examples/ascend_sglang_best_practices.rst @@ -0,0 +1,296 @@ +Ascend SGLang Best Practice +=================================== + +Last updated: 01/27/2026. + +.. _Qwen3-30B: https://github.com/verl-project/verl/blob/main/examples/grpo_trainer/run_qwen3moe-30b_sglang_megatron_npu.sh +.. _Qwen2.5-32B: https://github.com/verl-project/verl/blob/main/examples/grpo_trainer/run_qwen2-32b_sglang_fsdp_npu.sh +引言 +---------------------------------- + +SGLang 是当前主流的高性能开源推理引擎, 昇腾已经全面原生支持该推理引擎在verl中使用, +仅需简单的构建流程,开发者即可完成环境构建,本文将提供两个经典用例来帮助开发者了解以下内容: + +1. 环境构建 +2. 模型训练与评估 +3. 
性能采集 + +两个用例模型脚本以及其需要的硬件条件各自如下: + ++----------------------+---------------------+----------+------------------------+ +| 模型                 | NPU型号             | 节点数量 | 训推后端               | ++======================+=====================+==========+========================+ +| `Qwen3-30B`_         | Atlas 800T A3       | 1        | SGLang + Megatron      | ++----------------------+---------------------+----------+------------------------+ +| `Qwen2.5-32B`_       | Atlas 900 A2        | 2        | SGLang + FSDP          | ++----------------------+---------------------+----------+------------------------+ + +环境构建 +----------------------------------- +我们在quickstart中提供了两种构建环境的方法, 1.从镜像文件DockerFile进行构建 2.从自定义Conda环境进行构建 + +在本实践中, 我们额外指定verl 的commit id 以避免引入其他问题 + +.. code-block:: bash + + cd verl + git checkout 772c224 + +模型训练与评估 +----------------------------------- +1.模型数据准备 +^^^^^^^^^^^^^^^ +`Qwen3-30B`_ +^^^^^^^^^^^^^^^ +**下载模型权重** + +--local-dir: 模型保存路径 + +.. code-block:: bash + + export HF_ENDPOINT=https://hf-mirror.com + hf download --resume-download Qwen/Qwen3-30B-A3B --local-dir /path/to/local_dir + +**下载数据集** + +.. code-block:: bash + + git clone https://www.modelscope.cn/datasets/AI-ModelScope/DAPO-Math-17k.git + +**HuggingFace To Megatron权重转换(可选)** + +.. code-block:: bash + + python scripts/converter_hf_to_mcore.py \ + --hf_model_path Qwen/Qwen3-30B-A3B \ + --output_path Qwen/Qwen3-30B-A3B-mcore \ + --use_cpu_initialization # Only work for MoE models + +*注:verl当前已支持mbridge进行灵活的hf和mcore之间的权重转换,可以修改以下相关参数直接加载hf权重* + +.. code-block:: bash + + actor_rollout_ref.actor.megatron.use_dist_checkpointing=False + actor_rollout_ref.actor.megatron.use_mbridge=True + +`Qwen2.5-32B`_ +^^^^^^^^^^^^^^^ +**下载模型权重** + +--local-dir: 模型保存路径 + +.. code-block:: bash + + export HF_ENDPOINT=https://hf-mirror.com + hf download --resume-download Qwen/Qwen2.5-32B --local-dir /path/to/local_dir + +**下载及处理数据集** + +.. 
code-block:: bash
+
+    wget https://huggingface.co/datasets/agentica-org/DeepScaleR-Preview-Dataset/resolve/main/deepscaler.json
+    python recipe/r1_ascend/json_to_parquet.py --output_dir ./data/deepscaler --json_path path/to/deepscaler.json --train_data_ratio 0.9
+
+2.训练
+^^^^^^^^^^^
+根据开发者实际路径配置情况修改模型训练脚本中的以下参数
+
+.. code-block:: bash
+
+    # Model Weights Paths
+    MODEL_PATH=Qwen/Qwen3-30B-A3B
+    MCORE_MODEL_PATH=Qwen/Qwen3-30B-A3B-mcore
+    RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
+    CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}
+
+    # File System Paths
+    TRAIN_FILE=$RAY_DATA_HOME/dataset/dapo-math-17k.parquet
+    TEST_FILE=$RAY_DATA_HOME/dataset/aime-2024.parquet
+
+    #保存频率,-1默认不保存,如需评测请修改此参数
+    trainer.save_freq=-1
+
+对于单机任务 `Qwen3-30B`_ , 可以直接bash执行verl仓上示例脚本
+
+.. code-block:: bash
+
+    bash examples/grpo_trainer/run_qwen3moe-30b_sglang_megatron_npu.sh
+
+对于多节点任务 `Qwen2.5-32B`_ ,我们推荐使用以下脚本进行大规模多节点训练拉起
+
+.. code-block:: bash
+
+    pkill -9 python
+    ray stop --force
+    rm -rf /tmp/ray
+    export RAY_DEDUP_LOGS=0
+    export HYDRA_FULL_ERROR=1
+    # TASK_QUEUE_ENABLE,下发优化,图模式设置为1,非图模式设置为2
+    export TASK_QUEUE_ENABLE=1
+    export HCCL_ASYNC_ERROR_HANDLING=0
+    export HCCL_EXEC_TIMEOUT=3600
+    export HCCL_CONNECT_TIMEOUT=3600
+
+    export HCCL_HOST_SOCKET_PORT_RANGE=60000-60050
+    export HCCL_NPU_SOCKET_PORT_RANGE=61000-61050
+    export RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES=1
+    # 每节点 8 卡(NPUS_PER_NODE=8),此处只应列出 0-7
+    export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+    # 修改为当前需要跑的用例路径
+    DEFAULT_SH="./run_*.sh"
+    echo "Use $DEFAULT_SH"
+
+    ulimit -n 32768
+    mkdir -p logs
+
+    NNODES=2
+    NPUS_PER_NODE=8
+    # 修改为对应主节点IP
+    MASTER_ADDR="IP FOR MASTER NODE"
+    # 修改为当前节点的通信网卡
+    SOCKET_IFNAME="Your SOCKET IFNAME"
+    export HCCL_SOCKET_IFNAME="SOCKET IFNAME FOR CURRENT NODE"
+    export GLOO_SOCKET_IFNAME="SOCKET IFNAME FOR CURRENT NODE"
+    # 获取当前IP
+    CURRENT_IP=$(ifconfig $SOCKET_IFNAME | grep -Eo 'inet (addr:)?([0-9]{1,3}\.){3}[0-9]{1,3}' | awk '{print $NF}')
+    if [ "$MASTER_ADDR" = "$CURRENT_IP" ]; then
# 主节点启动 + ray start --head --port 6766 --dashboard-host=$MASTER_ADDR --node-ip-address=$CURRENT_IP --dashboard-port=8260 --resources='{"NPU": '$NPUS_PER_NODE'}' + + while true; do + ray_status_output=$(ray status) + npu_count=$(echo "$ray_status_output" | grep -oP '(?<=/)\d+\.\d+(?=\s*NPU)' | head -n 1) + npu_count_int=$(echo "$npu_count" | awk '{print int($1)}') + device_count=$((npu_count_int / $NPUS_PER_NODE)) + + # 判断device_count 是否与 NNODES 相等 + if [ "$device_count" -eq "$NNODES" ]; then + echo "Ray cluster is ready with $device_count devices (from $npu_count NPU resources), starting Python script." + ray status + bash $DEFAULT_SH + break + else + echo "Waiting for Ray to allocate $NNODES devices. Current device count: $device_count" + sleep 5 + fi + done + else + # 子节点尝试往主节点注册 ray 直到成功 + while true; do + # 尝试连接 ray 集群 + ray start --address="$MASTER_ADDR:6766" --resources='{"NPU": '$NPUS_PER_NODE'}' --node-ip-address=$CURRENT_IP + + # 检查连接是否成功 + ray status + if [ $? -eq 0 ]; then + echo "Successfully connected to the Ray cluster!" + break + else + echo "Failed to connect to the Ray cluster. Retrying in 5 seconds..." + sleep 5 + fi + done + fi + + sleep 600 + +DEFAULT_SH:修改为训练所用配置 sh 文件路径。在此案例中修改为 `Qwen2.5-32B`_ 路径。 + +NNODES 和 NPUS_PER_NODE:修改为使用节点数量和每个节点 NPU 数量。在此案例中分别为2和8。 + +MASTER_ADDR:修改为对应主节点 IP。即所有节点的 MASTER_ADDR 应该相同。 + +SOCKET_IFNAME, HCCL_SOCKET_IFNAME, GLOO_SOCKET_IFNAME: 修改为对应通信网卡,通信网卡可以通过以下命令获取: + +.. code-block:: bash + + ifconfig |grep "$(hostname -I |awk '{print $1}'|awk -F '.' '{print $0}')" -B 1|awk -F ':' '{print$1}' | head -1 | tail -1 + +3.模型评估 +^^^^^^^^^^^ + +不同模型步骤一致,仅以Qwen3-30b为例列举 + +我们通过 AISBenchmark 评估模型,该工具支持vllm/sglang多种推理后端的评估 + +**安装方法** + +.. code-block:: bash + + git clone https://gitee.com/aisbench/benchmark.git + cd benchmark + pip install -e . + +**下载评估数据集** + +.. 
code-block:: bash + + cd path/to/benchmark/ais_bench/datasets + wget http://opencompass.oss-cn-shanghai.aliyuncs.com/datasets/data/math.zip + unzip math.zip + rm math.zip + +**修改AISBench配置代码使能sglang推理评测** + +打开 benchmark/ais_bench/benchmark/configs/models/vllm_api/vllm_api_stream_chat.py 文件,这是推理配置文件 + +.. code-block:: bash + + from ais_bench.benchmark.models import VLLMCustomAPIChatStream + from ais_bench.benchmark.utils.model_postprocessors import extract_non_reasoning_content + from ais_bench.benchmark.clients import OpenAIChatStreamClient, OpenAIChatStreamSglangClient + + models = [ + dict( + attr="service", + type=VLLMCustomAPIChatStream, + abbr='sgl-api-stream-chat', + path="/path/to/Qwen3-30B", # 修改为 Qwen3-30B 模型路径 + model="qwen3-30b", + request_rate = 0, + max_seq_len=2048, + retry = 2, + host_ip = "localhost", # 推理服务的IP + host_port = 8005, # 推理服务的端口 + max_out_len = 8192, # 最大输出tokens长度 + batch_size=48, # 推理的最大并发数 + trust_remote_code=False, + custom_client=dict(type=OpenAIChatStreamSglangClient), #使用sglang客户端 + generation_kwargs = dict( + temperature = 0, + seed = 1234, + ), + pred_postprocessor=dict(type=extract_non_reasoning_content) + ) + ] + + +**启动sglang_server服务** + +.. code-block:: bash + + python -m sglang.launch_server --model-path "/path/to/Qwen3-30B" --tp-size 4 --dp-size 1 --port 8005 + +**启动sglang_client评测** + +.. 
code-block:: bash + + ais_bench --models vllm_api_stream_chat --datasets math500_gen_0_shot_cot_chat_prompt + +**评测结果** + +经过训练,模型在Math-500上的评分显著上升 + ++------+----------------------+---------+----------+------+----------------------+ +| iter | dataset | version | metric | mode | sgl-api-stream-chat | ++======+======================+=========+==========+======+======================+ +| 0 | math_prm800k_500 | c4b6f0 | accuracy | gen | 84.4 | ++------+----------------------+---------+----------+------+----------------------+ +| 150 | math_prm800k_500 | c4b6f0 | accuracy | gen | 91.7 | ++------+----------------------+---------+----------+------+----------------------+ + +性能采集 +----------------------------------- +关于NPU profiling的详细文档请参考 `ascend_profiling_zh `_ + +在 `Qwen3-30B`_ 的脚本中提供了基本的采集性能选项PROF_CONFIG,默认设置 global_profiler.steps=null 关闭采集, 开发者可根据实际需要进行参数修改 + +采集完成后,开发者可以使用 `MindStudio Insight `_ 进行数据解析 + +注: verl框架侧进行采集全量 Profiling 产生海量且重复的算子记录,可以根据文档修改代码仅采集关键阶段 \ No newline at end of file diff --git a/code/RL_model/verl/verl_train/docs/ascend_tutorial/examples/dapo_multi_model_optimization_practice.md b/code/RL_model/verl/verl_train/docs/ascend_tutorial/examples/dapo_multi_model_optimization_practice.md new file mode 100644 index 0000000000000000000000000000000000000000..62b0cc15bc7b9bd2872f673cc9cfa8ec06d662cb --- /dev/null +++ b/code/RL_model/verl/verl_train/docs/ascend_tutorial/examples/dapo_multi_model_optimization_practice.md @@ -0,0 +1,324 @@ +# DAPO 介绍 + +Last updated: 01/27/2026. 
+
+DAPO的论文可以参考:[DAPO](https://arxiv.org/pdf/2503.14476),其中包含以下几个关键技术。
+
+* ​**Clip-Higher**​: 通过对重要性采样比的上限剪裁促进了系统的多样性并避免了熵坍缩(Entropy Collapse)。
+* ​**Dynamic Sampling**​: 提高了训练效率和稳定性。DAPO提出了一种执行动态采样的策略,并过滤掉准确率等于1和0的提示组,从而保持批次间具有有效梯度的提示数量一致。
+* ​**Token-level Policy Gradient Loss**​: 在长链思维强化学习 (long-CoT RL) 场景中至关重要。
+* ​**Overlong Reward Shaping**​: 减少奖励噪声并稳定了训练。
+
+在verl中,可以进行如下设置,从而进行DAPO算法的运行。
+
+- **奖励模型的管理策略为 DAPO**
+  在dapo算法中,必须配置成dapo。
+
+```
+reward_model.reward_manager=dapo
+```
+
+- **Clip-Higher 更高裁剪**
+  `clip_ratio_low` 和 `clip_ratio_high` 用于指定 DAPO 目标函数中的 $\varepsilon_{\text {low }}$ 和 $\varepsilon_{\text {high }}$。
+
+```
+clip_ratio_low=0.2 # 裁剪比例下限,默认值为0.2
+clip_ratio_high=0.28 # 裁剪比例上限,默认值为0.28
+```
+
+- **动态采样的相关配置**
+  将 `filter_groups.enable` 设置为 `True` 会过滤掉输出 `metric` 完全相同的组,例如对于 `acc` 指标,过滤掉输出准确率全部为 1 或 0 的组。
+  训练器会使用 `gen_batch_size` 进行重复采样,直到生成足够数量的符合条件的组,或者达到 `max_num_gen_batches` 所指定的上限为止。
+
+```
+data.gen_batch_size=${gen_prompt_bsz}
+algorithm.filter_groups.enable=${enable_filter_groups} # 动态采样开关
+algorithm.filter_groups.metric=${filter_groups_metric} # 使用准确率作为过滤标准
+algorithm.filter_groups.max_num_gen_batches=${max_num_gen_batches} # 最大生成批次数量,最多重复生成数据的次数
+```
+
+- **Token-level Loss**
+  将 `loss_agg_mode` 设置为 `token-mean` 意味着计算一个批次中所有序列内所有 token 的(策略梯度)损失的平均值。
+
+```
+actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode}
+#注意:“token-mean”是默认行为。
+```
+
+- **奖励模型对超长回答的惩罚配置**
+  将 `overlong_buffer.enable` 设置为 `True` 将对输出长度过长但仍未超过硬上下文限制的输出进行惩罚。具体来说,当输出的长度超过 `max_response_length - overlong_buffer.len` 且超出 `0` 到 `overlong_buffer.len` 个 token 时,惩罚值会从 `0` 线性增加到 `overlong_buffer.penalty_factor`。
+
+```
+reward_model.overlong_buffer.enable=${enable_overlong_buffer} # 启用超长缓冲区惩罚,开启对超长输出的惩罚机制
+reward_model.overlong_buffer.len=${overlong_buffer_len} # 缓冲区长度,定义缓冲区的 token 长度
+reward_model.overlong_buffer.penalty_factor=${overlong_penalty_factor} #惩罚因子,最大惩罚强度
+```
+
+相关参数涉及的代码可以参考:[Recipe: Decoupled Clip and Dynamic Sampling Policy Optimization 
(DAPO)](https://github.com/verl-project/verl-recipe/blob/main/dapo/README.md)
+
+# 硬件要求
+
+当前支持Atlas 800T A3 与 Atlas 900 A3 SuperPoD。完成本次最佳实践需要 2台Atlas 800T A3。关键软件版本可以参考:[Ascend Quickstart](https://github.com/volcengine/verl/blob/main/docs/ascend_tutorial/ascend_quick_start.rst)
+
+# 模型训练
+
+## 数据集准备
+
+Geometry3k 数据集是由加利福尼亚大学洛杉矶分校与浙江大学联合研发的几何领域专用数据集,核心面向视觉问答(VQA)任务展开研究与模型训练。该数据集总计包含 3002 个样本,采用图像和文本两种模态数据形式构建,其中文本模态涵盖各类几何问题描述,图像则以可视化图表呈现问题中的几何图形信息,包括三角形、圆形、四边形等基础几何形状,以及不同图形间的位置、嵌套、相交等关联关系。可以从Hugging Face库下载对应的原始数据集:[Geometry3k](https://huggingface.co/datasets/hiyouga/geometry3k)
+
+```python
+# 下载原始数据并预处理
+python ./examples/data_preprocess/geo3k.py --local_dir=./data/geo3k
+```
+
+## 权重下载
+
+从Hugging Face库下载对应的模型权重:[Qwen3-VL-30B-A3B-Instruct](https://huggingface.co/Qwen/Qwen3-VL-30B-A3B-Instruct/tree/main
+)
+
+## 全局变量导入
+
+- 为了确保 Ray 进程能够正常回收内存,需要安装并使能 jemalloc 库进行内存管理,用于更好管理内存,避免长跑过程中内存 OOM。
+
+```
+# 根据实际安装路径设置 jemalloc 环境变量
+export LD_PRELOAD=/usr/local/lib/libjemalloc.so.2
+```
+
+- 某些模型是通过 vllm ascend 进行优化的。但在某些情况下,优化后的模型可能并不适用。此时,将此值设置为 0 即可禁用优化后的模型。
+
+```
+export USE_OPTIMIZED_MODEL=0
+```
+
+- 启用vLLM V1
+
+```
+export VLLM_USE_V1=1
+```
+
+- 昇腾多卡通信的兜底配置,延长连接超时时间,避免集群环境下训练启动因连接慢而失败
+
+```
+export HCCL_CONNECT_TIMEOUT=5400
+```
+
+- 控制 vLLM 在昇腾芯片上是否启用NZ优化
+
+```
+export VLLM_ASCEND_ENABLE_NZ=0
+```
+
+- 根据使用机器的情况,修改相关配置, 例如单机 A2 可设置`trainer.nnodes`为 1 、`trainer.n_gpus_per_node`为8
+
+## 训练脚本
+
+基于以上修改,提供了示例配置文件,创建 run_dapo_qwen3_vl_30b.sh 文件。
+
+```bash
+set -xeuo pipefail
+
+export VLLM_USE_V1=1
+export HCCL_CONNECT_TIMEOUT=5400
+export VLLM_ASCEND_ENABLE_NZ=0
+export LD_PRELOAD=/usr/local/lib/libjemalloc.so.2
+# Some models are optimized by vllm ascend. While in some case, e.g. rlhf training,
+# the optimized model may not be suitable. In this case, set this value to 0 to disable the optimized model.
+export USE_OPTIMIZED_MODEL=0 + +project_name='DAPO' +exp_name='DAPO-Qwen3-vl-30B' + +adv_estimator=grpo + +use_kl_in_reward=False +kl_coef=0.0 +use_kl_loss=False +kl_loss_coef=0.0 + +clip_ratio_low=0.2 +clip_ratio_high=0.28 + +max_prompt_length=1024 +max_response_length=2048 +enable_overlong_buffer=False +overlong_buffer_len=$((1024 * 2)) +overlong_penalty_factor=1.0 + +loss_agg_mode="token-mean" + +enable_filter_groups=True +filter_groups_metric=acc +max_num_gen_batches=4 +train_prompt_bsz=64 +gen_prompt_bsz=$((train_prompt_bsz * 3)) +n_resp_per_prompt=8 +train_prompt_mini_bsz=16 + +# Ray +PWD=./ +RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"} +WORKING_DIR=${WORKING_DIR:-"${PWD}"} +RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"} + +# Paths +RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"} +MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen3-VL-30B-A3B-Instruct"} +CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"} +TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/geo3k/train.parquet"} +TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/geo3k/test.parquet"} + +# Algorithm +temperature=1.0 +top_p=1.0 +top_k=-1 # 0 for HF rollout, -1 for vLLM rollout +val_top_p=0.7 + +# Performance Related Parameter +sp_size=8 +use_dynamic_bsz=True +actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) / sp_size)) +infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) / sp_size)) +gen_tp=8 +fsdp_size=16 + +ray job submit --no-wait --runtime-env="${RUNTIME_ENV}" \ + --working-dir "${WORKING_DIR}" \ + --address "${RAY_ADDRESS}" \ + -- python3 -m recipe.dapo.main_dapo \ + data.train_files="${TRAIN_FILE}" \ + data.val_files="${TEST_FILE}" \ + data.prompt_key=prompt \ + data.truncation='left' \ + data.max_prompt_length=${max_prompt_length} \ + data.max_response_length=${max_response_length} \ + data.gen_batch_size=${gen_prompt_bsz} \ + data.train_batch_size=${train_prompt_bsz} \ + 
actor_rollout_ref.rollout.n=${n_resp_per_prompt} \ + algorithm.adv_estimator=${adv_estimator} \ + algorithm.use_kl_in_reward=${use_kl_in_reward} \ + algorithm.kl_ctrl.kl_coef=${kl_coef} \ + actor_rollout_ref.actor.use_kl_loss=${use_kl_loss} \ + actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef} \ + actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \ + actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \ + actor_rollout_ref.actor.clip_ratio_c=10.0 \ + algorithm.filter_groups.enable=${enable_filter_groups} \ + algorithm.filter_groups.max_num_gen_batches=${max_num_gen_batches} \ + algorithm.filter_groups.metric=${filter_groups_metric} \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2 \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=2 \ + actor_rollout_ref.model.path="${MODEL_PATH}" \ + actor_rollout_ref.model.enable_gradient_checkpointing=True \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.optim.lr_warmup_steps=10 \ + actor_rollout_ref.actor.optim.weight_decay=0.1 \ + actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \ + actor_rollout_ref.actor.fsdp_config.param_offload=False \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \ + actor_rollout_ref.actor.use_torch_compile=False \ + actor_rollout_ref.actor.entropy_coeff=0 \ + actor_rollout_ref.actor.grad_clip=1.0 \ + actor_rollout_ref.rollout.enforce_eager=True \ + actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ + actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size} \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.70 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ 
+ actor_rollout_ref.rollout.enable_chunked_prefill=True \ + actor_rollout_ref.rollout.temperature=${temperature} \ + actor_rollout_ref.rollout.top_p=${top_p} \ + actor_rollout_ref.rollout.top_k="${top_k}" \ + actor_rollout_ref.rollout.val_kwargs.temperature=${temperature} \ + actor_rollout_ref.rollout.val_kwargs.top_p=${val_top_p} \ + actor_rollout_ref.rollout.val_kwargs.top_k=${top_k} \ + actor_rollout_ref.rollout.val_kwargs.do_sample=True \ + actor_rollout_ref.rollout.val_kwargs.n=1 \ + actor_rollout_ref.rollout.expert_parallel_size=8 \ + actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} \ + actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ + actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ + actor_rollout_ref.rollout.name=vllm \ + +actor_rollout_ref.rollout.engine_kwargs.vllm.disable_mm_preprocessor_cache=True \ + actor_rollout_ref.actor.strategy=fsdp2 \ + actor_rollout_ref.ref.strategy=fsdp2 \ + critic.strategy=fsdp2 \ + actor_rollout_ref.ref.fsdp_config.param_offload=True \ + actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size} \ + actor_rollout_ref.actor.fsdp_config.fsdp_size=${fsdp_size} \ + reward_model.reward_manager=dapo \ + reward_model.overlong_buffer.enable=${enable_overlong_buffer} \ + reward_model.overlong_buffer.len=${overlong_buffer_len} \ + reward_model.overlong_buffer.penalty_factor=${overlong_penalty_factor} \ + trainer.logger=console \ + trainer.project_name="${project_name}" \ + trainer.experiment_name="${exp_name}" \ + trainer.n_gpus_per_node=8 \ + trainer.nnodes=2 \ + trainer.val_before_train=True \ + trainer.test_freq=1 \ + trainer.save_freq=20 \ + trainer.resume_mode=auto \ + trainer.device=npu \ + trainer.total_epochs=30 \ + trainer.total_training_steps=100 \ + trainer.default_local_dir="${CKPTS_DIR}" +``` + +# 优化参考 + +- **启动动态批次大小** + 根据单 GPU 的最大 Token 总数(ppo_max_token_len_per_gpu)动态调整批次大小 + +``` 
+actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz} +actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz} +actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz} +``` + +- **单个 GPU 能处理的最大 Token 总数** + 当`use_dynamic_bsz=True`时,单 GPU 在一个微批次中能处理的最大 Token 数量 + +``` +actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len} +actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} +actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} +``` + +- **单个 GPU 微批次大小** + 当`use_dynamic_bsz=True`时,框架会以该值为​初始批次大小​,再根据`ppo_max_token_len_per_gpu`向上 / 向下调整 + +``` +actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=2 +actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=2 +actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=2 +``` + +- **启用 FSDP2 框架** + “将模型参数、梯度、优化器状态分片存储在不同 GPU 上”,避免单卡加载全量模型导致显存溢出。 + +``` +# 启用 FSDP2 框架 +actor_rollout_ref.actor.strategy=fsdp2 +actor_rollout_ref.ref.strategy=fsdp2 +critic.strategy=fsdp2 + +# 仅用于 FSDP2:前向传播后重新分片以减少内存占用。 +actor_rollout_ref.actor.fsdp_config.reshard_after_forward=True +# 仅用于 FSDP2:是否在模型前向传播后重新分片以节省内存。 +actor_rollout_ref.ref.fsdp_config.reshard_after_forward=True +``` + +- **启用专家并行配置** + 指定有多少个 GPU用于并行计算不同的专家网络 + +``` +# MoE 架构 Actor 模型的专家并行配置 +actor_rollout_ref.rollout.expert_parallel_size=8 +``` + + diff --git a/code/RL_model/verl/verl_train/docs/ascend_tutorial/examples/gspo_optimization_practice.md b/code/RL_model/verl/verl_train/docs/ascend_tutorial/examples/gspo_optimization_practice.md new file mode 100644 index 0000000000000000000000000000000000000000..e943fcdbfff6b68b11a941990669b8cec8990391 --- /dev/null +++ b/code/RL_model/verl/verl_train/docs/ascend_tutorial/examples/gspo_optimization_practice.md @@ -0,0 +1,233 @@ +## NPU Qwen3-32B GSPO Optimization Practice + +Last updated: 01/27/2026. 
+ +本文章对应脚本地址:[qwen3_32b_gspo_npu](https://github.com/volcengine/verl/blob/main/examples/gspo_trainer/run_qwen3_32b_gspo_npu.sh) + +### 算法适配 + +GSPO通过将优化颗粒度从**token级**提升到**sequence级**,规避了GRPO会遇到的**方差急剧增大**导致训练不稳定的情况,增加了训练的稳定性,同时该算法也在一定程度上提升了算法的收敛速度。 + +想要成功在verl仓库中成功调用到GSPO算法,需要进行如下的必要配置 + +~~~python +# 核心算法配置 +algorithm.adv_estimator=grpo \ # 使用GRPO优势估计器 +algorithm.use_kl_in_reward=False \ # 不在奖励中添加KL惩罚 +# GSPO策略损失模式 +actor_rollout_ref.actor.policy_loss.loss_mode=gspo \ # 启用GSPO策略损失 +# 极小裁剪范围(GSPO特色) +actor_rollout_ref.actor.clip_ratio_low=0.0003 \ # 裁剪下界,论文推荐值 +actor_rollout_ref.actor.clip_ratio_high=0.0004 \ # 裁剪上界,论文推荐值 +# KL配置(GSPO不使用KL loss) +actor_rollout_ref.actor.use_kl_loss=False \ # 禁用KL损失 +actor_rollout_ref.actor.kl_loss_coef=0.0 \ # KL损失系数设为0 +# 序列级损失聚合模式(GSPO核心) +actor_rollout_ref.actor.loss_agg_mode=seq-mean-token-mean \ # 序列级平均,GSPO论文推荐 +# 批次配置 +actor_rollout_ref.rollout.n=16 \ # 每个prompt生成16个响应(组采样) +~~~ + +一般选择入口函数为`verl.trainer.main_ppo` + +### 性能调优 + +优化从训练、推理、调度和其他四个方面入手。 + +#### 训练 + +##### 动态bsz + +~~~bash +actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) / sp_size)) +infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) / sp_size)) +~~~ + +**这个优化点主要调整上面这两个参数,不过需要注意这两个参数调整的太大会导致OOM** + +**主要调整**`actor_ppo_max_token_len`,调大了会降低训练的耗时,调整`infer_ppo_max_token_len`没有明显的收益,可以不动 + +**这两个参数的作用介绍如下:** + +**这两个参数用于控制动态批处理(dynamic batch size)模式下每个GPU处理的最大token数量** + +- **`actor_ppo_max_token_len`**: Actor模型在PPO更新(前向+反向传播)时每个GPU能处理的最大token数 +- **`infer_ppo_max_token_len`**: 推理阶段(Reference policy和Rollout)计算log概率时每个GPU能处理的最大token数 + +#### 推理 + +##### ACLgraph+FULL_DECODE_ONLY + +推理算子下发方面的优化,平均能有`15%~20%`左右的性能收益。 + +先看单开**ACLgraph**,如下: + +~~~bash +# 开启ACLgraph+FULL_DECODE_ONLY(注意:当设置此参数为False时,TASK_QUEUE_ENABLE必须设置为1,不然会报错) +actor_rollout_ref.rollout.enforce_eager=False +actor_rollout_ref.rollout.engine_kwargs.vllm.compilation_config.cudagraph_capture_sizes='[8,16,32,64,128]' \ 
+actor_rollout_ref.rollout.engine_kwargs.vllm.compilation_config.cudagraph_mode='FULL_DECODE_ONLY' \ +~~~ + +`FULL_DECODE_ONLY`开启成功后有如下输出: + +![FULL_DECODE_ONLY result](https://github.com/wucong25/verl-data/blob/main/ascend_acl_graph.png) + +**`cudagraph_capture_sizes`参数设置指南** + +cudagraph_capture_sizes设置的值对应的是批大小,这里的批大小不是配置里的DP域对应的那个批次大小,这里是相较于vllm来说的批大小,单位为**token** + +默认生成的算法如下,可做参考 + +![cudagraph_capture_sizes](https://github.com/wucong25/verl-data/blob/main/ascend_set_cudagraph_sizes.png) + +##### 推理后端切换 + +使用方式:`export VLLM_ATTENTION_BACKEND=XFORMERS` + +![VLLM_ATTENTION_BACKEND](https://github.com/wucong25/verl-data/blob/main/ascend_vllm_attn_backend.png) + +注:需要注意某些后端在一些比较老的vllm-ascend版本内并不支持 + +##### 使能vllm v1版本 + +使用方式:`export VLLM_USE_V1=1` + +可以常开,一般都是正收益。 + +#### 调度 + +##### AIV + +打开方式:设置`export HCCL_OP_EXPANSION_MODE="AIV"` + +HCCL_OP_EXPANSION_MODE环境变量用于配置通信算法的编排展开位置,支持如下取值: + +- AI_CPU:代表通信算法的编排展开位置在Device侧的AI CPU计算单元。 +- AIV:代表通信算法的编排展开位置在Device侧的Vector Core计算单元。 +- HOST:代表通信算法的编排展开位置为Host侧CPU,Device侧根据硬件型号自动选择相应的调度器。 +- HOST_TS:代表通信算法的编排展开位置为Host侧CPU,Host向Device的Task Scheduler下发任务,Device的Task Scheduler进行任务调度执行。 + +下面介绍两种展开机制 + +###### HOST展开 + +image-20260113194257095 + +- 软件栈工作在hostcpu,通信算法展开一个个task +- 每个task调用runtime接口,下发到device的rtsqueue +- STARS从rstqueue上顺序拿取task +- 根据task类型分别调用掉SDMA和RDMA引擎。 + **单算子瓶颈**:hostbound 每个task提交是2~5us,一个通信算子有几百个task,单算子场景不会在device上缓存,下发一个执行一个 + +###### AICpu机制展开 + +image-20260113194333218 + +- host侧不下发一个个task,把通信算子作为一个个kernel,放在通信算子kernel的队列上去。 +- STARS调度kernel队列流上的kernel,把kernel放到AiCPU上去执行。 +- AICPU调用函数(kernel),用一个线程执行kernel 函数,在函数内把通信task展开,把task放到rstqueue上,STARS调用。 +- 降低host和aicpu交互,由几百次降低为一次。 +- task的提交在AICPU上提交,做了提交的部分合并。 + +##### TASK_QUEUE_ENABLE + +**使用方式:**`export TASK_QUEUE_ENABLE=2` + +TASK_QUEUE_ENABLE,下发优化,图模式设置为1(即开启图模式的时候这个要设置为1),非图模式设置为2 + +示意图: + +![ascend task queue](https://github.com/wucong25/verl-data/blob/main/ascend_task_queue2.png) + +##### 绑核优化 + +**使用方式:**`export CPU_AFFINITY_CONF=1` + 
+详细设置原理可看:https://www.hiascend.com/document/detail/zh/Pytorch/600/ptmoddevg/trainingmigrguide/performance_tuning_0059.html + +#### 其他 + +以下内容汇总了若干全局环境变量的调优配置。由于这些参数在训练阶段与推理阶段往往都能带来正向收益,且目前尚缺乏足够精细的消融实验来严格区分它们各自对训练或推理的贡献占比,故统一归拢在此,供后续持续监控与进一步拆解分析。 + +##### 使能jemalloc + +使用方式(注意需要先安装jemalloc库):`export LD_PRELOAD=/usr/local/lib/libjemalloc.so.2` + +**安装使用教程:**[MindSpeed-RL/docs/install_guide.md · Ascend/MindSpeed-RL - AtomGit | GitCode](https://gitcode.com/Ascend/MindSpeed-RL/blob/master/docs/install_guide.md#高性能内存库-jemalloc-安装) + +##### 多流复用 + +内存方面有优化 + +使能方式:`export MULTI_STREAM_MEMORY_REUSE=1` + +原理介绍:https://www.hiascend.com/document/detail/zh/Pytorch/600/ptmoddevg/trainingmigrguide/performance_tuning_0040.html + +##### VLLM_ASCEND_ENABLE_FLASHCOMM + +使用方式:`export VLLM_ASCEND_ENABLE_FLASHCOMM=1` + +启用昇腾 NPU 特有的FLASHCOMM高速通信优化技术 + +地址:https://vllm-ascend.readthedocs.io/zh-cn/latest/user_guide/release_notes.html + +##### VLLM_ASCEND_ENABLE_DENSE_OPTIMIZE + +使用方式:`export VLLM_ASCEND_ENABLE_DENSE_OPTIMIZE=1` + +启用昇腾 NPU针对大模型推理的稠密计算优化 + +地址:https://vllm-ascend.readthedocs.io/zh-cn/latest/user_guide/release_notes.html + +##### VLLM_ASCEND_ENABLE_PREFETCH_MLP + +使用方式:`export VLLM_ASCEND_ENABLE_PREFETCH_MLP=1` + +启用 MLP 层的权重预取机制 + +image-20251124173132677 + +##### verl框架参数设置 + +主要是内存方面的一些设置开关(注意,这个里面的优化都或多或少会导致吞吐量有一定程度的劣化) + +~~~bash +# 梯度检查点 (Gradient Checkpointing) +# 作用: 通过重新计算激活值来节省显存,以计算换内存。在前向传播时不保存中间激活值,反向传播时重新计算,可以显著降低显存占用,允许使用更大的batch size。 +actor_rollout_ref.model.enable_gradient_checkpointing=True + +# 参数卸载 (Parameter Offload) +# 作用: 将模型参数卸载到CPU内存,训练时再加载回GPU。 +actor_rollout_ref.actor.fsdp_config.param_offload=${offload} # True +actor_rollout_ref.ref.fsdp_config.param_offload=${offload} # True + +# 优化器状态卸载 (Optimizer Offload) +# 作用: 将优化器状态(如Adam的动量)卸载到CPU。优化器状态通常占用大量显存(对于Adam,每个参数需要额外8字节),卸载可以节省显存。 +actor_rollout_ref.actor.fsdp_config.optimizer_offload=${offload} # True + +# 释放推理引擎缓存 (Free Cache Engine) +# 作用: 在训练阶段释放推理引擎的KV 
cache和权重。这是3D-HybridEngine的核心优化,允许在同一GPU上交替进行推理和训练,显著降低显存需求。 +actor_rollout_ref.rollout.free_cache_engine=True + +# 熵计算优化 +# entropy_checkpointing: 在训练时对熵计算启用重计算,降低显存峰值 +# entropy_from_logits_with_chunking: 分块处理logits张量(如2048 tokens一组),避免一次性加载整个[bsz*seq_len, vocab]张量 +actor_rollout_ref.actor.entropy_checkpointing=True +actor_rollout_ref.ref.entropy_checkpointing=True +actor_rollout_ref.actor.entropy_from_logits_with_chunking=True +actor_rollout_ref.ref.entropy_from_logits_with_chunking=True + +# 推理引擎显存配置 +# gpu_memory_utilization: 控制vLLM使用的GPU显存比例(0.90 = 90%) +# enforce_eager=False: 启用CUDA graphs加速推理,但会占用额外显存 +actor_rollout_ref.rollout.gpu_memory_utilization=0.90 +actor_rollout_ref.rollout.enforce_eager=False +~~~ + +### NPU调优参考文章 + +环境变量相关:[环境变量列表-Ascend Extension for PyTorch6.0.0-昇腾社区](https://www.hiascend.com/document/detail/zh/Pytorch/600/apiref/Envvariables/Envir_001.html) + +社区性能调优教程:[性能调优流程-Ascend Extension for PyTorch6.0.0-昇腾社区](https://www.hiascend.com/document/detail/zh/Pytorch/600/ptmoddevg/trainingmigrguide/performance_tuning_0001.html) + diff --git a/code/RL_model/verl/verl_train/docs/blog/v0.7.md b/code/RL_model/verl/verl_train/docs/blog/v0.7.md new file mode 100644 index 0000000000000000000000000000000000000000..0bf3c31c3e9cd771451546a825cf9a74504c1cb7 --- /dev/null +++ b/code/RL_model/verl/verl_train/docs/blog/v0.7.md @@ -0,0 +1,274 @@ +# verl 0.7 release blog + +**Author:** verl team + +Last updated: 01/03/2026. + +## Overview +verl adopts a Hybrid-Controller architecture (also known as HybridFlow). Sharing design principles with asynchronous sharded dataflow systems like Google Pathways, verl models Reinforcement Learning (RL) algorithms, such as PPO, GRPO, DAPO, and others, as a multi-stage, multi-model and parallelizable dataflow graph. 
+ +To balance flexibility with performance, verl unifies two distinct programming models: + +**High-Level Single-Controller (MPMD)**: At the orchestration level, a single process `RLTrainer` manages the global computation graph. It handles macro-tasks such as scheduling rollout generation, triggering reward scoring, and dispatching distributed training jobs. + +**Internal Multi-Controller (SPMD)**: Internally, the Model Engine operates in standard distributed training mode. Workers execute identical programs, via trainer backends like FSDP, Megatron, or VeOmni, or rollout executors (not rollout server) like vLLM/SGLang/TensorRT-LLM, to perform heavy distributed computation, synchronizing via collective communication. + +
+ hybridflow.png +
+ +This hybrid approach offers significant advantages: + +**Flexible Orchestration**: The single-controller design allows verl to dynamically manage complex constraints within the computation graph, including flexible data dependencies, diverse resource allocation and model placement, and fine-grained asynchronous staleness control. + +**Abstraction of Complexity**: We encapsulate complex parallel strategies—such as 5D parallelism (DP, TP, CP, PP, and EP)—strictly within the Model Engine. This allows users to focus entirely on RL algorithm implementation without getting bogged down by the details of distributed training. + +Furthermore, leveraging Ray placement groups, verl provides `ResourcePool` and `WorkerGroup` abstractions. These enable flexible GPU sharing among the various roles in the RL process—such as actor, critic, reward, and rollout—allowing components to share resources efficiently while remaining isolated. + +As illustrated in the diagram below, the overall architecture of verl is divided into two layers: + +- **verl-core**: provides four components required for the RL pipeline: model engine, rollout engine, checkpoint engine, and transfer queue. Each component exposes abstract interfaces, making them both extensible and pluggable. +- **verl-trainer**: builds upon these components, construct various RL pipelines—such as on-policy, one-step-off-policy, and fully asynchronous—tailored to meet the demands of diverse scenarios. + +
+ verl-arch.png +
+ + +## verl-core +### Model Engine + +The Model Engine serves as verl's core training engine, defining a set of abstract interfaces that support pluggable backends. It operates in SPMD mode: +- SFT: Workers are launched via torchrun. +- RL: Workers are executed via the WorkerGroup API, invoked by the single-controller. + +The abstract interfaces include methods like `initialize`, `forward`, `optimizer_step`, and `load`/`offload`. Integrating a new training engine simply requires inheriting and implementing these interfaces. Crucially, because all backends adhere to this unified abstraction, adding a new Model Engine requires absolutely no code modification on the caller side. The RLTrainer remains completely agnostic to the backend's specific parallel strategy when calling these interfaces, while the WorkerGroup automatically handles data dispatch and collection based on the underlying parallelism. + +Currently, the Model Engine supports the following backends (more backend maybe supported in future, e.g torchtitan): +|Backend|Parallelism|Performance|Support Model|New Model Support Time +|-----|-----|----|----|----| +|FSDP| FSDP+SP|Dense medium/MoE low| all transformer models|Day 0 +|MCore| DP+TP+PP+EP+CP|High| see [Megatron-Bridge](https://github.com/NVIDIA-NeMo/Megatron-Bridge) support model list|few weeks or month +|VeOmni| FSDP+SP+EP|Medium| see [VeOmni](https://github.com/ByteDance-Seed/VeOmni) support model list|~1 week + +```python +class BaseEngine: + def initialize(self): + """Instantiate or load the model, optimizer, and learning rate scheduler.""" + raise NotImplementedError + + def optimizer_zero_grad(self): + """Zero the gradients of the optimizer.""" + raise NotImplementedError + + def optimizer_step(self): + """Perform an optimization step using the optimizer.""" + raise NotImplementedError + + def lr_scheduler_step(self): + """Advance the learning rate scheduler by one step.""" + raise NotImplementedError + + def forward_backward_batch(self, data: 
TensorDict, loss_function: Callable, forward_only=False) -> Any: + """Perform a forward pass and optionally a backward pass on a batch of data.""" + raise NotImplementedError + + def get_per_tensor_param(self) -> tuple[Generator[tuple[str, torch.Tensor], None, None], Optional[dict]]: + """Get a generator that yields per-tensor parameters and optional peft config.""" + raise NotImplementedError + + def to(self, device: str, model: bool = True, optimizer: bool = True, grad: bool = True): + """Move model parameters, optimizer states, or both to the specified device.""" + raise NotImplementedError +``` + + +### Rollout Engine +As LLM reinforcement learning evolves from single-turn, static tasks to multi-turn, dynamic, and interactive agentic tasks, the legacy SPMD rollout mode previously used by verl has become insufficient. Consequently, in verl v0.7, we have removed the SPMD rollout mode and switched to rollout server mode by default. + +
+ rollout_engine.png +
+ +In the server mode, the LLM server operates as online serving rather than the traditional offline batch inference. Clients send per-sample requests to the server, enabling the engine to utilize dynamic batching. This significantly enhances throughput efficiency for multi-turn conversation. Furthermore, the server-based approach eliminates the need for intrusive modifications to the LLM inference engine, allowing for the seamless integration of modern inference backends such as vLLM, SGLang, and TensorRT-LLM. + +On the client side, verl introduces an extensible **AgentLoop** abstraction designed to define custom agentic task loops. This abstraction manages the cycle of requesting responses from the LLM server and interacting with external environments to obtain feedback. We provide two default implementations: +- **SingleTurnAgentLoop**: Designed for standard single-turn tasks. +- **ToolAgentLoop**: Designed for classic ReAct architectures involving multi-turn tool invocation. + +Users can implement custom AgentLoop logic tailored to their specific needs, such as [SWEAgentLoop](https://github.com/volcengine/verl/pull/4080) or GUIAgentLoop. + +```python +class AgentLoopBase(ABC): + @abstractmethod + async def run(self, sampling_params: dict[str, Any], **kwargs) -> AgentLoopOutput: + """Run agent loop to interact with LLM server and environment. + + Args: + sampling_params (Dict[str, Any]): LLM sampling params. + **kwargs: dataset fields from `verl.utils.dataset.RLHFDataset`. + + Returns: + AgentLoopOutput: Agent loop output. + """ + raise NotImplementedError +``` + +### TransferQueue +As mentioned, verl uses a global single-controller RLTrainer to orchestrate the computation graph. A major limitation in the current implementation is that the RLTrainer handles both control and data flow, creating a bottleneck when dispatching data between components. 
This issue is amplified by the massive data volumes in multimodal training (images, video, audio) and complex algorithms like router replay, which requires transmitting large tensors per sample. Our earlier attempt to solve this using the Ray object store yielded poor performance due to the lack of tensor optimization and fine-grained column access. + +
+ transfer_queue.png +
+ +In v0.7, we experimentally introduced **TransferQueue** to decouple control flow from data flow. The RLTrainer now only dispatch instructions and metadata, while TransferQueue handles data transmission via reference passing. TransferQueue is specifically optimized for PyTorch tensors (supporting zero-copy and RDMA) and allows for backend extensions like ZeroMQ, NIXL, and Ray RDT. We plan to make this the default transmission method in v0.8. + +```python +# In PPOTrainer +def fit(self): + batch = next(dataloader) + gen_batch: BatchMeta = self.rollout_manager.generate_sequences(batch) + output: BatchMeta = self.actor_rollout_wg.compute_log_prob(gen_batch) + gen_batch = gen_batch.union(output) + output = self.actor_rollout_wg.update_actor(gen_batch) + +# In Worker +def compute_log_prob(self, batch: BatchMeta) -> BatchMeta: + data = tq.get(batch) + output = self.actor.infer_batch(data=data) + return tq.put(output) +``` + +### Checkpoint Engine + +With the increase in LLM context lengths and the evolution of agentic tasks, the "long-tail" problem in rollout has become prominent, limiting the overall efficiency of RL training. + +To mitigate this, a viable strategy is moving from on-policy synchronous training to off-policy asynchronous training, e.g [Laminar](https://arxiv.org/abs/2510.12633), [Areal](https://arxiv.org/abs/2505.24298), [StreamRL](https://arxiv.org/abs/2504.15930), [LlamaRL](https://arxiv.org/pdf/2505.24034), [PipelineRL](https://arxiv.org/abs/2509.19128). This involves separating the rollout and model engines onto different nodes (a disaggregated architecture, as opposed to colocated), with data transmitted via queues. This separation alleviates the rollout long-tail issue and enables rollout elastic scaling, fault tolerance, and heterogeneous hardware. However, it introduces a new challenge: efficient cross-node parameter synchronization. + +
+ checkpoint_engine.png +
+ +To address this, we introduce the Checkpoint Engine: a unified abstraction layer designed to synchronize weights between various training and inference backends. +- It provides three unified APIs to implement the streaming transmission of parameters. +- Users can extend the Transport Layer implementation based on their specific infrastructure requirements (device, network, local cache, etc.). + +Currently, we provide two transport backends: NCCL (for broadcast collective communication) and NIXL (for P2P point-to-point communication). + +```python +class CheckpointEngine(ABC): + @abstractmethod + async def send_weights(self, weights: Generator[tuple[str, torch.Tensor], None, None]): + """Send the weights of the model. + + Args: + weights: A generator that yields the name of the weight tensor and the tensor itself. + """ + raise NotImplementedError + + @abstractmethod + async def receive_weights(self) -> Generator[tuple[str, torch.Tensor], None, None]: + """Receive the weights of the model. + + Yields: + A tuple of the name of the weight tensor and the tensor itself. + """ + raise NotImplementedError +``` + +## verl-trainer +Building upon the four core components provided by verl-core, verl-trainer constructs several RL training pipelines tailored to specific scenarios. These pipelines are designed to address training efficiency challenges across varying scales and requirements: + +**On-policy (Synchronous)** + - Main Features: Executes rollout and training serially, typically sharing GPU resources (Colocate). It strictly adheres to standard on-policy algorithm definitions, where training must wait for all samples to be generated. + - Scenarios: Best for baseline implementations, scenarios where strict algorithmic correctness is prioritized over training throughput. + +**One-step-off-policy (Async)** + - Main Features: Parallelizes generation and training by overlapping the current training step with the next batch's generation. 
It employs resource isolation and uses parameters from the previous step for rollout to minimize GPU idle time. + - Scenarios: Ideal for scenarios requiring moderate efficiency gains (20%–40%) while maintaining training stability very close to strict on-policy methods. + +**Fully async (Decoupled & Streaming)** + - Main Features: Completely decouples the Trainer and Rollouter onto separate nodes. It utilizes streaming data transfer, staleness control, and partial rollout mechanisms to maximize throughput and mitigate long-tail generation latency. + - Scenarios: Essential for large-scale training (e.g., 128+ GPUs) or complex reasoning tasks (e.g., long chain-of-thought) where generation latency significantly bottlenecks performance. + +
+ fully_async.png +
+ +## Roadmap +### v0.7 release + +**Model Engine** +- Integrate Megatron-Bridge and support LoRA/PEFT, see blog post: [How We Build Trillion Parameter Reasoning RL with 10% GPUs](https://macaron.im/mindlab/research/building-trillion-parameter-reasoning-rl-with-10-gpus) +- Support experimental fp8 training for megatron backend +- Support new models for megatron backend: GPT-OSS, Qwen3-Next +- Comprehensive support for new model engine, FSDP and Megatron engine are production ready. + - Dispatch tensordict with nested tensor instead of padded DataProto + - Add TrainingWorker that resembles Tinker-like API + - Add VLM support for model engine, SFT and RL trainer + - Add model engine based critic model + - Implement ActorRolloutRefWorker by TrainingWorker, support different backend in one worker +- New VeOmni engine added, still in alpha status. + +**Rollout Engine** +- Remove SPMD rollout mode +- Support blockwise fp8 rollout for vllm and sglang; support online quant for vllm with torchao +- Experimental router replay support for vllm +- Optimize multi-modal data fetch and preprocess, support video input +- Upgrade to vllm==0.12.0; sglang==0.5.6 + +**Reward** +- Support hybrid reward scenarios, including generative, discriminative, rule-based rewards, and their combinations. +- Refactor reward models into server mode, supporting both colocated and standalone deployments. +- Introduce new reward managers to handle more complex scenarios, limited mode for request rate control and remote mode for CPU-intensive tasks. 
+ +**Algorithm** +- Add [CISPO](https://arxiv.org/pdf/2506.13585): Clipped IS-weight Policy Optimization +- Add [SAPO](https://arxiv.org/abs/2511.20347): Soft Adaptive Policy Optimization + +**Recipe** +- [NEW] VLA: add experimental support for VLA model +- [NEW] [rhymerl](https://arxiv.org/abs/2508.18588): History Rhymes: Accelerating LLM Reinforcement Learning with RhymeRL +- TransferQueue: support multiple data partition and optimize tensor zero-copy serialization +- One-step-off-policy/Fully async: optimize weight synchronization by checkpoint engine with bucket and pipeline support. + +### v0.8 + +**Model Engine** +- Deprecate DataProto by Tensordict for zero padding transmission +- Switch default to new model engine, mark legacy engine (fsdp_workers.py, megatron_workers.py) as deprecated +- Feature parity between new and legacy model engine: LoRA/PEFT, etc +- Polish VeOmni engine to production ready status +- Support MTP RL training +- Optimize GPU memory for long context: fine-grained activation recomputation/offload +- New model support: DeepSeek V3.2, etc + +**Rollout Engine** +- New rollout engine TensorRT-LLM +- Separate vllm worker from trainer process, update weights by cuda ipc + +**TransferQueue** +- Merge TransferQueue recipe into main +- Optimize e2e image/video vlm training pipeline by TransferQueue +- Optimize router replay transmission by TransferQueue + +**Checkpoint Engine** +- Add checkpoint engine abstract interface +- Add NCCL and NIXL transport backend +- Add more transport backend + +### v0.9 + +**Trainer** +- Merge Full async into main: refactor with verl-core component + +**Model Engine** +- Remove legacy model engine (fsdp_workers.py, megatron_workers.py) +- Support omni-model RL training: Qwen3-Omni, BAGEL, etc + +**Rollout Engine** +- New rollout engine vllm-omni + +**More agentic training recipe** +- SWEAgent +- GUIAgent diff --git a/code/RL_model/verl/verl_train/docs/data/transfer_queue.md 
b/code/RL_model/verl/verl_train/docs/data/transfer_queue.md new file mode 100644 index 0000000000000000000000000000000000000000..2775034029b8064995421d10b9f6a26c1a0cecf3 --- /dev/null +++ b/code/RL_model/verl/verl_train/docs/data/transfer_queue.md @@ -0,0 +1,290 @@ +# TransferQueue Data System + +Last updated: 01/07/2026. + +This doc introduce [TransferQueue](https://gitcode.com/Ascend/TransferQueue), an asynchronous streaming data management system for efficient post-training. + +🔥 **Now TransferQueue is formally open-sourced at [GitCode](https://gitcode.com/Ascend/TransferQueue). We will soon provide a [Github Mirror Repo](https://github.com/Ascend/TransferQueue) for community contributions. You are welcome to submit contributions or propose new ideas on either platform!** + + +> At the mean time, the early development history remains accessible at: https://github.com/TransferQueue/TransferQueue. + +

Overview

+ +TransferQueue is a high-performance data storage and transfer module with panoramic data visibility and streaming scheduling capabilities, optimized for efficient dataflow in post-training workflows. + +

+ +

+ +TransferQueue offers **fine-grained, sample-level** data management and **load-balancing** (on the way) capabilities, serving as a data gateway that decouples explicit data dependencies across computational tasks. This enables a divide-and-conquer approach, significantly simplifies the algorithm controller design. + +

+ +

+ +

Updates

+ + - **Dec 30, 2025**: **TransferQueue x verl** integration is tested with the DAPO algorithm at scale **(64 nodes, 1024 cards)**. It significantly optimizes host memory utilization and accelerates data transfers. Stay tuned for more details! + - **Dec 20, 2025**: 🔥 The official [tutorial](https://github.com/TransferQueue/TransferQueue/tree/main/tutorial) is released! Feel free to check it out. + - **Nov 10, 2025**: We disentangle the data retrieval logic from TransferQueueController [PR#101](https://github.com/TransferQueue/TransferQueue/pull/101). Now you can implement your own `Sampler` to control how to consume the data. + - **Nov 5, 2025**: We provide a `KVStorageManager` that simplifies the integration with KV-based storage backends [PR#96](https://github.com/TransferQueue/TransferQueue/pull/96). The first available KV-based backend is [Yuanrong](https://gitee.com/openeuler/yuanrong-datasystem). + - **Nov 4, 2025**: Data partition capability is available in [PR#98](https://github.com/TransferQueue/TransferQueue/pull/98). Now you can define logical data partitions to manage your train/val/test datasets. + - **Oct 25, 2025**: We make storage backends pluggable in [PR#66](https://github.com/TransferQueue/TransferQueue/pull/66). You can try to integrate your own storage backend with TransferQueue now! + - **Oct 21, 2025**: Official integration into verl is ready [verl/pulls/3649](https://github.com/volcengine/verl/pull/3649). Following PRs will optimize the single controller architecture by fully decoupling data & control flows. + - **July 22, 2025**: We present a series of Chinese blogs on Zhihu 1, 2. + - **July 21, 2025**: We started an RFC on verl community [verl/RFC#2662](https://github.com/volcengine/verl/discussions/2662). + - **July 2, 2025**: We publish the paper [AsyncFlow](https://arxiv.org/abs/2507.01663). + +

Components

+ +### Control Plane: Panoramic Data Management + +In the control plane, `TransferQueueController` tracks the **production status** and **consumption status** of each training sample as metadata. When all the required data fields are ready (i.e., written to the `TransferQueueStorageManager`), we know that this data sample can be consumed by downstream tasks. + +For consumption status, we record the consumption records for each computational task (e.g., `generate_sequences`, `compute_log_prob`, etc.). Therefore, even when different computation tasks require the same data field, they can consume the data independently without interfering with each other. + +

+ +

+ +To make the data retrieval process more customizable, we provide a `Sampler` class that allows users to define their own data retrieval and consumption logic. Refer to the [Customize](#customize) section for details. + +> In the future, we plan to support **load-balancing** and **dynamic batching** capabilities in the control plane. Additionally, we will support data management for disaggregated frameworks where each rank manages the data retrieval by itself, rather than coordinated by a single controller. + +### Data Plane: Distributed Data Storage + +In the data plane, we provide a pluggable design that enables TransferQueue to integrate with different storage backends according to user requirements. + +Specifically, we provide a `TransferQueueStorageManager` abstraction class that defines the core APIs as follows: + +- `async def put_data(self, data: TensorDict, metadata: BatchMeta) -> None` +- `async def get_data(self, metadata: BatchMeta) -> TensorDict` +- `async def clear_data(self, metadata: BatchMeta) -> None` + +This class encapsulates the core interaction logic within the TransferQueue system. You only need to write a simple subclass to integrate your own storage backend. Refer to the [Customize](#customize) section for details. + +Currently, we support the following storage backends: + +- SimpleStorageUnit: A basic CPU memory storage with minimal data format constraints and easy usability. +- [Yuanrong](https://gitcode.com/openeuler/yuanrong-datasystem) (beta, [#PR107](https://github.com/TransferQueue/TransferQueue/pull/107), [#PR96](https://github.com/TransferQueue/TransferQueue/pull/96)): An Ascend native data system that provides hierarchical storage interfaces including HBM/DRAM/SSD. +- [Mooncake Store](https://github.com/kvcache-ai/Mooncake) (alpha, [#PR162](https://github.com/TransferQueue/TransferQueue/pull/162)): A high-performance, KV-based hierarchical storage that supports RDMA transport between GPU and DRAM. 
+- [Ray Direct Transport](https://docs.ray.io/en/master/ray-core/direct-transport.html) (alpha, [#PR167](https://github.com/TransferQueue/TransferQueue/pull/167)): Ray's new feature that allows Ray to store and pass objects directly between Ray actors. + +Among them, `SimpleStorageUnit` serves as our default storage backend, coordinated by the `AsyncSimpleStorageManager` class. Each storage unit can be deployed on a separate node, allowing for distributed data management. + +`SimpleStorageUnit` employs a 2D data structure as follows: + +- Each row corresponds to a training sample, assigned a unique index within the corresponding global batch. +- Each column represents the input/output data fields for computational tasks. + +This data structure design is motivated by the computational characteristics of the post-training process, where each training sample is generated in a relayed manner across task pipelines. It provides an accurate addressing capability, which allows fine-grained, concurrent data read/write operations in a streaming manner. + +

+ +

+ +### User Interface: Asynchronous & Synchronous Client + +The interaction workflow of TransferQueue system is as follows: + +1. A process sends a read request to the `TransferQueueController`. +2. `TransferQueueController` scans the production and consumption metadata for each sample (row), and dynamically assembles a micro-batch metadata according to the load-balancing policy. This mechanism enables sample-level data scheduling. +3. The process retrieves the actual data from distributed storage units using the metadata provided by the controller. + +To simplify the usage of TransferQueue, we have encapsulated this process into `AsyncTransferQueueClient` and `TransferQueueClient`. These clients provide both asynchronous and synchronous interfaces for data transfer, allowing users to easily integrate TransferQueue into their framework. + +> In the future, we will provide a `StreamingDataLoader` interface for disaggregated frameworks as discussed in [issue#85](https://github.com/TransferQueue/TransferQueue/issues/85) and [verl/RFC#2662](https://github.com/volcengine/verl/discussions/2662). Leveraging this abstraction, each rank can automatically get its own data like `DataLoader` in PyTorch. The TransferQueue system will handle the underlying data scheduling and transfer logic caused by different parallelism strategies, significantly simplifying the design of disaggregated frameworks. + +

🔥 Showcases

+ +### General Usage + +The primary interaction points are `AsyncTransferQueueClient` and `TransferQueueClient`, serving as the communication interface with the TransferQueue system. + +Core interfaces: + +- `(async_)get_meta(data_fields: list[str], batch_size:int, partition_id: str, mode: str, task_name:str, sampling_config: Optional[dict[str, Any]]) -> BatchMeta` +- `(async_)get_data(metadata: BatchMeta) -> TensorDict` +- `(async_)put(data: TensorDict, metadata: Optional[BatchMeta], partition_id: Optional[str])` +- `(async_)clear_partition(partition_id: str)` and `(async_)clear_samples(metadata: BatchMeta)` + +**Refer to our [tutorial](https://github.com/TransferQueue/TransferQueue/tree/main/tutorial) for detailed examples.** + + +### verl Example + +The primary motivation for integrating TransferQueue to verl now is to **alleviate the data transfer bottleneck of the single controller `RayPPOTrainer`**. Currently, all `DataProto` objects must be routed through `RayPPOTrainer`, resulting in a single point bottleneck of the whole post-training system. + +![verl_dataflow_DataProto](https://github.com/TransferQueue/community_doc/blob/main/docs/verl_workflow.jpeg?raw=true) + + +Leveraging TransferQueue, we separate experience data transfer from metadata dispatch by + +- Replacing `DataProto` with `BatchMeta` (metadata) and `TensorDict` (actual data) structures +- Preserving verl's original Dispatch/Collect logic via BatchMeta (maintaining single-controller debuggability) +- Accelerating data transfer by TransferQueue's distributed storage units + +![verl_dataflow_TransferQueue](https://github.com/TransferQueue/community_doc/blob/main/docs/verl_workflow_with_tq.jpeg?raw=true) + + +You may refer to the [recipe](https://github.com/TransferQueue/TransferQueue/tree/dev/recipe/simple_use_case), where we mimic the verl usage in both async & sync scenarios. 
Official integration to verl is also available now at [verl/pulls/3649](https://github.com/volcengine/verl/pull/3649) (with subsequent PRs to further optimize the integration). + + +### Use Python package +```bash +pip install TransferQueue +``` + +### Build wheel package from source code + +Follow these steps to build and install: +1. Clone the source code from the GitHub repository + ```bash + git clone https://github.com/TransferQueue/TransferQueue/ + cd TransferQueue + ``` + +2. Install dependencies + ```bash + pip install -r requirements.txt + ``` + +3. Build and install + ```bash + python -m build --wheel + pip install dist/*.whl + ``` + +

📊 Performance

+ +

+ +

+ +> Note: The above benchmark for TransferQueue is based on our naive `SimpleStorageUnit` backend. By introducing high-performance storage backends and optimizing serialization/deserialization, we expect to achieve even better performance. Warmly welcome contributions from the community! + +For detailed performance benchmarks, please refer to [this blog](https://www.yuque.com/haomingzi-lfse7/hlx5g0/tml8ke0zkgn6roey?singleDoc#). + +We also provide a [stress test report](https://www.yuque.com/haomingzi-lfse7/hlx5g0/ydbwgo5k2umaag78?singleDoc#) that demonstrates **768 concurrent clients writing 1.4 TB of data** into TransferQueue across 4 nodes. The system remains stable without any crashes or data loss, achieving 80% bandwidth. + +

🛠️ Customize TransferQueue

+ +### Define your own data retrieval logic +We provide a `BaseSampler` abstraction class, which defines the following interface: + +```python3 +@abstractmethod +def sample( + self, + ready_indexes: list[int], + batch_size: int, + *args: Any, + **kwargs: Any, +) -> tuple[list[int], list[int]]: + """Sample a batch of indices from the ready indices. + + Args: + ready_indexes: List of global indices for which all required fields of the + corresponding samples have been produced, and the samples are not labeled as + consumed in the corresponding task. + batch_size: Number of samples to select + *args: Additional positional arguments for specific sampler implementations + **kwargs: Additional keyword arguments for specific sampler implementations + + Returns: + List of sampled global indices of length batch_size + List of global indices of length batch_size that should be labeled as consumed + (will never be retrieved in the future) + + Raises: + ValueError: If batch_size is invalid or ready_indexes is insufficient + """ + raise NotImplementedError("Subclasses must implement sample") +``` + +In this design, we separate data retrieval and data consumption through the two return values, which enables us to easily control sample replacement. We have implemented two reference designs: `SequentialSampler` and `GRPOGroupNSampler`. + +The `Sampler` class or instance should be passed to the `TransferQueueController` during initialization. During each `get_meta` call, you can provide dynamic sampling parameters to the `Sampler`. 
+ +```python3 +from transfer_queue import TransferQueueController, TransferQueueClient, GRPOGroupNSampler, process_zmq_server_info + +# Option 1: Pass the sampler class to the TransferQueueController +controller = TransferQueueController.remote(GRPOGroupNSampler) + +# Option 2: Pass the sampler instance to the TransferQueueController (if you need custom configuration) +your_own_sampler = YourOwnSampler(config) +controller = TransferQueueController.remote(your_own_sampler) + +# Use the sampler +batch_meta = client.get_meta( + data_fields=["input_ids", "attention_mask"], + batch_size=8, + partition_id="train_0", + task_name="generate_sequences", + sampling_config={"n_samples_per_prompt": 4} # Put the required sampling parameters here +) +``` + +**Refer to [tutorial/04_custom_sampler.py](https://github.com/TransferQueue/TransferQueue/blob/main/tutorial/04_custom_sampler.py) for more details.** + + +### How to integrate a new storage backend + +The data plane is organized as follows: +```text + transfer_queue/ + ├── storage/ + │ ├── __init__.py + │ │── simple_backend.py # Default distributed storage backend (SimpleStorageUnit) by TQ + │ ├── managers/ # Managers are upper level interfaces that encapsulate the interaction logic with TQ system. + │ │ ├── __init__.py + │ │ ├──base.py # TransferQueueStorageManager, KVStorageManager + │ │ ├──simple_backend_manager.py # AsyncSimpleStorageManager + │ │ ├──yuanrong_manager.py # YuanrongStorageManager + │ │ ├──mooncake_manager.py # MooncakeStorageManager + │ │ └──factory.py # TransferQueueStorageManagerFactory + │ └── clients/ # Clients are lower level interfaces that directly manipulate the target storage backend. 
+ │ │ ├── __init__.py + │ │ ├── base.py # TransferQueueStorageKVClient + │ │ ├── yuanrong_client.py # YuanrongStorageClient + │ │ ├── mooncake_client.py # MooncakeStorageClient + │ │ ├── ray_storage_client.py # RayStorageClient + │ │ └── factory.py # TransferQueueStorageClientFactory +``` + +To integrate TransferQueue with a custom storage backend, start by implementing a subclass that inherits from `TransferQueueStorageManager`. This subclass acts as an adapter between the TransferQueue system and the target storage backend. For KV-based storage backends, you can simply inherit from `KVStorageManager`, which can serve as the general manager for all KV-based backends. + +Distributed storage backends often come with their own native clients serving as the interface of the storage system. In such cases, a low-level adapter for this client can be written, following the examples provided in the `storage/clients` directory. + +Factory classes are provided for both `StorageManager` and `StorageClient` to facilitate easy integration. Adding necessary descriptions of required parameters in the factory class helps enhance the overall user experience. + +

✏️ Contribution Guide

+ +**Contributions are warmly welcome!** + +New ideas, feature suggestions, and user experience feedback are all encouraged—feel free to submit issues or PRs. We will respond as soon as possible. + +We recommend using pre-commit for better code format. + +```bash +# install pre-commit +pip install pre-commit + +# run the following command in your repo folder, then fix the check before committing your code +pre-commit install && pre-commit run --all-files --show-diff-on-failure --color=always +``` + + +

Citation

+Please kindly cite our paper if you find this repo is useful: + +```bibtex +@article{han2025asyncflow, + title={AsyncFlow: An Asynchronous Streaming RL Framework for Efficient LLM Post-Training}, + author={Han, Zhenyu and You, Ansheng and Wang, Haibo and Luo, Kui and Yang, Guang and Shi, Wenqi and Chen, Menglong and Zhang, Sicheng and Lan, Zeshun and Deng, Chunshi and others}, + journal={arXiv preprint arXiv:2507.01663}, + year={2025} +} +``` \ No newline at end of file diff --git a/code/RL_model/verl/verl_train/docs/examples/config.rst b/code/RL_model/verl/verl_train/docs/examples/config.rst new file mode 100644 index 0000000000000000000000000000000000000000..9909dd67581c3aa2d2ecb8b889e5955081cb24fc --- /dev/null +++ b/code/RL_model/verl/verl_train/docs/examples/config.rst @@ -0,0 +1,735 @@ +.. _config-explain-page: + +Config Explanation +=================== + +Last updated: 06/18/2025. + +ppo_trainer.yaml for RL FSDP Backend +------------------------------------- + +Data +~~~~ + +.. code:: yaml + + data: + tokenizer: null + train_files: ~/data/rlhf/gsm8k/train.parquet + val_files: ~/data/rlhf/gsm8k/test.parquet + train_max_samples: -1 # set to -1 to use full dataset + val_max_samples: -1 # set to -1 to use full dataset + prompt_key: prompt + max_prompt_length: 512 + max_response_length: 512 + train_batch_size: 1024 + return_raw_input_ids: False # This should be set to true when the tokenizer between policy and rm differs + return_raw_chat: False + return_full_prompt: False + shuffle: True + seed: 42 + filter_overlong_prompts: False + filter_overlong_prompts_workers: 1 + truncation: error + image_key: images + trust_remote_code: True + custom_cls: + path: null + name: null + +- ``data.train_files``: Training set parquet. Can be a list or a single + file. The program will read all files into memory, so it can't be too + large (< 100GB). The path can be either local path or HDFS path. 
For + HDFS path, we provide utils to download it to DRAM and convert the + HDFS path to local path. +- ``data.val_files``: Validation parquet. Can be a list or a single + file. +- ``data.train_max_samples``: Maximum number of samples to use from the + training dataset. Set to -1 to use the full dataset. +- ``data.val_max_samples``: Maximum number of samples to use from the + validation dataset. Set to -1 to use the full dataset. +- ``data.prompt_key``: The field in the dataset where the prompt is + located. Default is 'prompt'. +- ``data.max_prompt_length``: Maximum prompt length. All prompts will be + left-padded to this length. An error will be reported if the length is + too long +- ``data.max_response_length``: Maximum response length. Rollout in RL + algorithms (e.g. PPO) generates up to this length +- ``data.train_batch_size``: Batch size sampled for one training + iteration of different RL algorithms. +- ``data.return_raw_input_ids``: Whether to return the original + input_ids without adding chat template. This is mainly used to + accommodate situations where the reward model's chat template differs + from the policy. It needs to be decoded first, then apply the RM's + chat template. If using a model-based RM, and the policy and RM + chat_templates are different, this flag needs to be set +- ``data.return_raw_chat``: Whether to return the original chat (prompt) + without applying chat template. +- ``data.return_full_prompt``: Whether to return the full prompt with chat template +- ``data.shuffle``: Whether to shuffle the data in the dataloader. +- ``data.seed``: An integer seed to use when shuffling the data. If not set or set to + `null`, the data shuffling will not be seeded, resulting in a different data order on each run. +- ``data.filter_overlong_prompts``: Default don't filter. +- ``data.filter_overlong_prompts_workers``: For large-scale dataset, filtering + overlong prompts could be timeconsuming. 
You can set the ``filter_overlong_prompts_workers`` + to use multiprocessing for speed up. Default to 1. +- ``data.truncation``: Truncate the input_ids or prompt length if they + exceed max_prompt_length. Default is 'error', which does not allow exceeding the + max_prompt_length. The users should increase the max_prompt_length if + throwing the error. You can also set ``left``, ``right`` and ``middle``. + When ``middle`` is selected, the logic splits the allowed max length roughly in half + and keeps the head and tail of the sequence, effectively discarding the middle section. +- ``data.image_key``: The field in the multi-modal dataset where the image is + located. Default is 'images'. +- ``data.trust_remote_code``: If the remote tokenizer has a python file, we can use this field to allow + using remote tokenizer. For example: moonshotai/Moonlight-16B-A3B-Instruct + +Customized Dataset +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Customized dataset extension is implemented for the SFT trainer and can be extended to other trainers with similar changes. + +.. code:: yaml + + custom_cls: + path: null + name: null + +- ``data.custom_cls.path``: The path to the file containing your customized dataset class. If not specified, pre-implemented dataset will be used. +- ``data.custom_cls.name``: The name of the dataset class within the specified file. + +Actor/Rollout/Reference Policy +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
code:: yaml + + actor_rollout_ref: + hybrid_engine: True + model: + path: ~/models/deepseek-llm-7b-chat + external_lib: null + override_config: + attn_implementation: flash_attention_2 # or eager, sdpa - attention implementation override + model_config: {} + moe_config: # Megatron only, can adjust moe configuration + freeze_moe_router: False # Megatron only, can freeze moe router (no grad) + enable_gradient_checkpointing: False + enable_activation_offload: False + trust_remote_code: False + use_remove_padding: False + actor: + strategy: fsdp # This is for backward-compatibility + ppo_mini_batch_size: 256 + ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu + ppo_micro_batch_size_per_gpu: 8 + use_dynamic_bsz: False + ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length} + grad_clip: 1.0 + clip_ratio: 0.2 + entropy_coeff: 0.0 + use_kl_loss: False # True for GRPO + # Rollout Correction (corrects distribution mismatch between rollout and training) + rollout_correction: + rollout_is: token # IS weights: token/sequence/null + rollout_is_threshold: 2.0 # Upper threshold for IS weights + rollout_rs: null # Rejection sampling: token/sequence/geometric/null + rollout_rs_threshold: null # RS upper threshold + rollout_rs_threshold_lower: null # RS lower threshold + rollout_token_veto_threshold: null # Per-token veto (null to disable) + use_torch_compile: True # False to disable torch compile + kl_loss_coef: 0.001 # for grpo + kl_loss_type: low_var_kl # for grpo + ppo_epochs: 1 + data_loader_seed: null + shuffle: False + ulysses_sequence_parallel_size: 1 # sp size + optim: + lr: 1e-6 + lr_warmup_steps: -1 # Prioritized. Negative values mean delegating to lr_warmup_steps_ratio. + lr_warmup_steps_ratio: 0. 
# the total steps will be injected during runtime + min_lr_ratio: 0.0 # only used with cosine lr scheduler, default to 0.0 + num_cycles: 0.5 # only used with cosine lr scheduler, default to 0.5 + lr_scheduler_type: constant # select from constant/cosine + total_training_steps: -1 # must be override by program + fsdp_config: + wrap_policy: + # transformer_layer_cls_to_wrap: None + min_num_params: 0 + param_offload: False + optimizer_offload: False + fsdp_size: -1 + checkpoint: + # What to include in saved checkpoints + # with 'hf_model' you can save whole model as hf format, now only use sharded model checkpoint to save space + save_contents: ['model', 'optimizer', 'extra'] + # For more flexibility, you can specify the contents to load from the checkpoint. + load_contents: ${actor_rollout_ref.actor.checkpoint.save_contents} + ref: + fsdp_config: + param_offload: False + wrap_policy: + # transformer_layer_cls_to_wrap: None + min_num_params: 0 + log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu + log_prob_micro_batch_size_per_gpu: 16 + log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} + log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} + ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size + rollout: + name: vllm + temperature: 1.0 + top_k: -1 # 0 for hf rollout, -1 for vllm rollout + top_p: 1 + prompt_length: ${data.max_prompt_length} # not use for opensource + response_length: ${data.max_response_length} + # for vllm rollout + dtype: bfloat16 # should align with FSDP + gpu_memory_utilization: 0.5 + ignore_eos: False + enforce_eager: True + free_cache_engine: True + load_format: dummy_dtensor + tensor_model_parallel_size: 2 + max_num_batched_tokens: 8192 + max_num_seqs: 1024 + log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu + log_prob_micro_batch_size_per_gpu: 16 + 
log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} + log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} + # for hf rollout + do_sample: True + engine_kwargs: # inference engine parameters, please refer vllm/sglang official doc for detail + vllm: {} + sglang: {} + + n: 1 # for each prompt, sample n responses (i.e. num sample times). set it to values > 1 for grpo, rloo + calculate_log_probs: False # set to True for computing log probs via rollouts + val_kwargs: + # sampling parameters for validation + top_k: -1 # 0 for hf rollout, -1 for vllm rollout + top_p: 1.0 + temperature: 0 + n: 1 + do_sample: False # default eager for validation + + agent: + custom_async_server: # Use custom async server implementation for rollout + path: null + name: null + +**Common config for actor, rollout and reference model** + +- ``actor_rollout_ref.hybrid_engine``: Whether it's a hybrid engine, + currently only supports hybrid engine +- ``actor_rollout_ref.model.path``: Huggingface model path. This can be + either local path or HDFS path. For HDFS path, we provide utils to + download it to DRAM and convert the HDFS path to local path. +- ``actor_rollout_ref.model.external_libs``: Additional Python packages + that need to be imported. Used to register models or tokenizers into + the Huggingface system. +- ``actor_rollout_ref.model.override_config``: Used to override some of + the model's original configurations. Common overrides include: + + - ``attn_implementation``: Override the attention implementation. Default is ``flash_attention_2``. + Supported values: ``flash_attention_2``, ``eager``, ``sdpa``. Use ``eager`` for debugging or + compatibility issues. See :ref:`attention-implementation-override` for detailed usage. 
+ +- ``actor_rollout_ref.model.enable_gradient_checkpointing``: FSDP only, decide + Whether to enable gradient checkpointing for the actor, + Megatron uses recompute options in ``override_transformer_config`` to set this +- ``actor_rollout_ref.model.enable_activation_offload``: Whether to enable + activation offloading for the actor +- ``actor_rollout_ref.model.trust_remote_code``: Whether to enable loading + a remote code model +- ``actor_rollout_ref.model.use_fused_kernels``: Whether to use fused + kernels in the model. If set to True, the following parameters will be + used. + + - ``actor_rollout_ref.model.fused_kernel_options.impl_backend``: The + implementation backend for fused kernels. Options: "triton" or + "torch". Default is "torch". + While in megatron, we only support "triton" as the + implementation backend, so there is no need for this option. + +- ``actor_rollout_ref.model.use_remove_padding``: Whether to use remove + padding in the model. If set to True, the model will remove padding + tokens in the input_ids and response_ids. This helps a lot in improving model running efficiency. + +- ``actor_rollout_ref.model.tiled_mlp``: TiledMLP configuration for memory-efficient + MLP computation. Reduces peak memory by processing MLP forward/backward in tiles. + Only compatible with FSDP2 (requires ``actor_rollout_ref.actor.strategy=fsdp2``). + + - ``actor_rollout_ref.model.tiled_mlp.enabled``: Whether to enable TiledMLP. + Default is False. + - ``actor_rollout_ref.model.tiled_mlp.num_shards``: Number of shards to split + the input. Higher values reduce peak memory but may slightly impact performance. + Default is 4. + +**Actor model** + +- ``actor_rollout_ref.actor.strategy``: fsdp or megatron. In this + example, we use fsdp backend. + +- ``actor_rollout_ref.actor.ppo_mini_batch_size``: One sample is split + into multiple sub-batches with batch_size=ppo_mini_batch_size for PPO + updates. 
The ppo_mini_batch_size is a global num across all workers/gpus + +- ``actor_rollout_ref.actor.ppo_micro_batch_size``: [Will be deprecated, use ppo_micro_batch_size_per_gpu] + Similar to gradient accumulation, the micro_batch_size_per_gpu for one forward pass, + trading speed for GPU memory. The value represent the global view. + +- ``actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu``: Similar to gradient + accumulation, the micro_batch_size_per_gpu for one forward pass, trading speed + for GPU memory. The value represent the local num per gpu. + +- ``actor_rollout_ref.actor.grad_clip``: Gradient clipping for actor + updates +- ``actor_rollout_ref.actor.use_kl_loss``: to use kl loss in actor. When used, we are not applying KL in the reward function. + +- ``actor_rollout_ref.actor.clip_ratio``: PPO clip ratio + +- ``actor_rollout_ref.actor.use_torch_compile``: Whether to use torch compile in actor + +- ``actor_rollout_ref.actor.entropy_coeff``: The weight of entropy when + calculating PPO loss. The default value is changed to 0.0 since v0.3.x + +- ``actor_rollout_ref.actor.ppo_epochs``: Number of epochs for PPO + updates on one set of sampled data + +- ``actor_rollout_ref.actor.data_loader_seed``: From torch 2.6.0 Megatron backend can get wrong seed generated by pytorch + between cp ranks and cause misalignment between data on these ranks, so we shall manually set the seed to avoid hanging + issue. if ``actor_rollout_ref.actor.shuffle`` is not null, this must be set. + +- ``actor_rollout_ref.actor.shuffle``: Whether to shuffle data when + there are multiple epochs + +- ``actor_rollout_ref.actor.optim``: Actor's optimizer parameters + +- ``actor_rollout_ref.actor.fsdp_config``: FSDP config for actor + training + + - ``wrap_policy``: FSDP wrap policy. By default, it uses Huggingface's + wrap policy, i.e., wrapping by DecoderLayer + + - No need to set transformer_layer_cls_to_wrap, so we comment it. 
+ + - ``*_offload``: Whether to enable parameter, gradient and optimizer + offload + + - Trading speed for GPU memory. + +- ``actor_rollout_ref.actor.use_kl_loss``: Whether to enable kl loss. Default is False. + +- ``actor_rollout_ref.actor.kl_loss_coef``: The coefficient of kl loss. Default is 0.001. + +- ``actor_rollout_ref.actor.kl_loss_type``: Support ``kl`` (``k1``), ``abs``, ``mse`` (``k2``), ``low_var_kl`` (``k3``) and ``full``. Appending ``+`` in the end (e.g., ``k1+`` and ``k3+``) would use straight-through to employ ``k2`` for unbiased gradient estimation, regardless of the kl value estimation (see https://github.com/volcengine/verl/pull/2953#issuecomment-3162113848 for more details). How to calculate the kl divergence between actor and reference policy. For specific options, refer to `kl_penalty()` in `core_algos.py `_ . See this blog post for detailed analysis: http://joschu.net/blog/kl-approx.html + +- ``actor_rollout_ref.actor.checkpoint``: The configurations of checkpoint function in actor + + - ``save_contents``: The contents to save in the checkpoint. By default, we save model, optimizer and extra information in the checkpoint. + The extra information includes Rng states currently, FSDP supported lr_scheduler, and Megatron opt_param_scheduler will coming soon. + We do not store hf_model in checkpoint by default, but we provide a tool in ``scripts/model_merge.py`` to convert checkpoint format to hf format. + + - ``load_contents``: The contents to load in the checkpoint, you can specify different checkpoint loading contents. By default, it is the same with ``save_checkpoint``. + +**Reference Model** + +Reference model will be enabled when ``actor.use_kl_loss`` or/and ``algorithm.use_kl_in_reward`` is/are True. + +- ``actor_rollout_ref.ref``: FSDP config same as actor. 
**For models + larger than 7B, it's recommended to turn on offload for ref by + default** + +- ``actor_rollout_ref.ref.log_prob_micro_batch_size``: [Will be deprecate, use log_prob_micro_batch_size_per_gpu] + The batch size for one forward pass in the computation of ``ref_log_prob``. The value represent the global num. + +- ``actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu``: The batch size + for one forward pass in the computation of ``ref_log_prob``. The value represent the local num per gpu. + +**Rollout Model** + +- ``actor_rollout_ref.rollout.name``: hf/vllm/sglang. + +- Rollout (Auto-regressive) parameters. The key should be equal to the + property name in vLLM's ``SamplingParams``. + + - ``temperature``, ``top_k``, ``top_p`` and others: Sampling + parameters in ``SamplingParams``. + +- ``actor_rollout_ref.rollout.dtype``: Rollout model parameters type. This should be align with + the actor model parameter type in FSDP/Megatron backend. + +- ``actor_rollout_ref.rollout.gpu_memory_utilization``: + + - For vLLM v0.7.0 and later: The fraction of **total** GPU memory to be used for the vLLM instance. + - For SGLang: Corresponding to ``mem_fraction_static``, the fraction of the free GPU memory used for **static** memory like model weights and KV cache. + +- ``actor_rollout_ref.rollout.tensor_model_parallel_size``: TP size for rollout. Only effective + for vllm. + +- ``actor_rollout_ref.rollout.log_prob_micro_batch_size``: [Will be deprecate, use log_prob_micro_batch_size_per_gpu] + The batch size for one forward pass in the computation of ``log_prob``. The value represent the global num. + +- ``actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu``: Micro batch size per gpu (The batch size for + one forward pass) for recalculating ``log_prob``. The value represent the local num per gpu. + +- ``actor_rollout_ref.rollout.do_sample``: Whether to sample during training rollout. If set to False, the rollout model + will perform greedy sampling. 
+ +- ``actor_rollout_ref.rollout.val_kwargs``: Sampling parameters used specifically during validation. + + - ``top_k``: Top-k sampling parameter. Default to -1 for vLLM rollout or 0 for HF rollout. + - ``top_p``: Top-p sampling parameter. Default is 1.0 (disabled). + - ``temperature``: Sampling temperature. Default is 0 (deterministic greedy). + - ``n``: Number of responses to generate during validation. Default is 1. + - ``do_sample``: Whether to use sampling during validation. Default is False for + deterministic outputs. When set to True, the rollout will use the ``actor_rollout_ref.rollout.val_kwargs`` parameters + (top_k, top_p, temperature) to control the sampling behavior. + +- ``actor_rollout_ref.rollout.engine_kwargs.vllm``: extra vllm engine args, please refer to the vllm official doc for details + +- ``actor_rollout_ref.rollout.engine_kwargs.sglang``: extra sglang engine args, please refer to the sglang official doc for details + +- ``actor_rollout_ref.rollout.ignore_eos``: Whether to ignore the EOS + token and continue generating tokens after the EOS token is generated. + +- ``actor_rollout_ref.rollout.free_cache_engine``: Offload the KVCache + after rollout generation stage. Default is True. When set to True, + for vllm v0.5.4 and v0.6.3, we need to disable the usage of CUDAGraph + (set ``enforce_eager`` to True). + +- ``actor_rollout_ref.rollout.enforce_eager``: Whether to use CUDAGraph + in vLLM generation. Default set to True to disable CUDAGraph. + +- ``actor_rollout_ref.rollout.load_format``: Which weight loader to use + to load the actor model weights to the rollout model. + + - ``auto``: Use Megatron weight loader. + - ``megatron``: Use Megatron weight loader. Deployed with Megatron + backend. The input model ``state_dict()`` is already partitioned + along TP dimension and already gathered along PP dimension. This + weight loader requires that the Rollout model and Actor model's + parameters shape and name should be identical. 
+ - ``dtensor``: Default solution when using Huggingface weight loader. + Deployed with FSDP backend and the state_dict_type is + ``StateDictType.SHARDED_STATE_DICT``. Recommend to use this weight + loader + - ``hf``: Use Huggingface weight loader. Deployed with FSDP backend + and the state_dict_type is ``StateDictType.FULL_STATE_DICT``. This + solution doesn't need to rewrite the weight loader for each model + implemented in vLLM but it results in larger peak memory usage. + - ``dummy_hf``, ``dummy_megatron``, ``dummy_dtensor``: Random + initialization. + +.. note:: **NOTED**: In this config field, users only need to select from ``dummy_megatron``, ``dummy_dtensor``, ``dummy_hf`` for rollout initialization and our hybrid engine will select the corresponding weight loader (i.e., ``megatron``, ``dtensor``, ``hf``) during actor/rollout weight synchronization. + + +Megatron Optimizer and Optimizer Parameter Scheduler +____________________________________________________ + +.. code:: yaml + + optim: + optimizer: adam + lr: 1e-6 + clip_grad: 1.0 + total_training_steps: -1 # must be override by program + lr_warmup_init: 0.0 # initial learning rate for warmup, default to 0.0 + lr_warmup_steps: -1 # Prioritized. Negative values mean delegating to lr_warmup_steps_ratio. + lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime + lr_decay_steps: null + lr_decay_style: constant # select from constant/linear/cosine/inverse_square_root + min_lr: 0.0 # minimum learning rate, default to 0.0 + weight_decay: 0.01 + weight_decay_incr_style: constant # select from constant/linear/cosine + lr_wsd_decay_style: exponential # select from constant/exponential/cosine + lr_wsd_decay_steps: null + use_checkpoint_opt_param_scheduler: False # use checkpoint optimizer parameter scheduler + + +Notice that there are some differences in APIs between Megatron optimizer and FSDP optimizer. 
+ +- Megatron optimizer scheduler names the period after lr_warmup as lr_decay_steps, so the ``lr_scheduler_type`` actually means the style of lr decay after warmup. +- Megatron optimizer also support weight decay decay mechanism +- ``use_checkpoint_opt_param_scheduler`` determines whether to use the checkpoint optimizer parameter scheduler. If set to True, the optimizer parameter scheduler will be saved in the checkpoint and loaded from the checkpoint during resuming training. + +For learning rate decay, original Megatron pretrain default option of ``lr_decay_style`` is ``linear``, +meaning that the learning rate will be linearly decayed from the initial learning rate to ``min_lr`` within the +``lr_decay_steps``. However, in verl, to align with FSDP's default behavior, we set the default +``lr_decay_style`` to ``constant``, meaning that the learning rate will be kept constant after the warmup stage. + + +Critic Model +~~~~~~~~~~~~ + +Most parameters for Critic are similar to Actor Model. + +Reward Model +~~~~~~~~~~~~ + +.. code:: yaml + + reward_model: + enable: False + model: + input_tokenizer: ${actor_rollout_ref.model.path} # set this to null if the chat template is identical + path: ~/models/Anomy-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + trust_remote_code: False + fsdp_config: + min_num_params: 0 + param_offload: False + micro_batch_size_per_gpu: 16 + max_length: null + reward_manager: naive + +- ``reward_model.enable``: Whether to enable reward model. If False, we + compute the reward only with the user-defined reward functions. In + GSM8K and Math examples, we disable reward model. For RLHF alignment + example using full_hh_rlhf, we utilize reward model to assess the + responses. If False, the following parameters are not effective. +- ``reward_model.model`` + + - ``input_tokenizer``: Input tokenizer. 
If the reward model's chat + template is inconsistent with the policy, we need to first decode to + plaintext, then apply the rm's chat_template. Then score with RM. If + chat_templates are consistent, it can be set to null. + - ``path``: RM's HDFS path or local path. Note that RM only supports + AutoModelForSequenceClassification. Other model types need to define + their own RewardModelWorker and pass it from the code. + - ``trust_remote_code``: Whether to enable loading a remote code model, + default to False. +- ``reward_model.reward_manager``: Reward Manager. This defines the mechanism + of computing rule-based reward and handling different reward sources. Default + is ``naive``. If all verification functions are multiprocessing-safe, the reward + manager can be set to ``prime`` for parallel verification. + +Customized Reward Function +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code:: yaml + + custom_reward_function: + path: null + name: compute_score + +- ``custom_reward_function.path``: The path to the file containing your customized reward function. If not specified, pre-implemented reward functions will be used. +- ``custom_reward_function.name`` (Optional) : The name of the reward function within the specified file. Default is 'compute_score'. + +Algorithm +~~~~~~~~~ + +.. 
code:: yaml + + algorithm: + gamma: 1.0 + lam: 1.0 + adv_estimator: gae + use_kl_in_reward: False + kl_penalty: kl # how to estimate kl divergence + kl_ctrl: + type: fixed + kl_coef: 0.005 + horizon: 10000 + target_kl: 0.1 + # Rollout Correction + rollout_correction: + rollout_is: null # IS weights: token/sequence/null + rollout_is_threshold: 2.0 # Upper threshold for IS weights + rollout_rs: null # Rejection sampling: token/sequence/geometric/null + rollout_rs_threshold: null # RS upper threshold + rollout_rs_threshold_lower: null # RS lower threshold + rollout_token_veto_threshold: null # Per-token veto (null to disable) + +- ``gamma``: discount factor +- ``lam``: Trade-off between bias and variance in the GAE estimator +- ``adv_estimator``: Support ``gae``, ``grpo``, ``reinforce_plus_plus``, ``reinforce_plus_plus_baseline``, ``rloo``, ``rloo_vectorized``, ``grpo_vectorized`` +- ``use_kl_in_reward``: Whether to enable in-reward kl penalty. Default is False. +- ``kl_penalty``: Support ``kl``, ``abs``, ``mse``, ``low_var_kl`` and ``full``. How to + calculate the kl divergence between actor and reference policy. For + specific options, refer to `kl_penalty()` in `core_algos.py `_ . +- ``kl_ctrl``: Config for in-reward kl_penalty controller + + - ``kl_coef``: The (initial) coefficient of in-reward kl_penalty. Default is 0.001. + - ``type``: 'fixed' for FixedKLController and 'adaptive' for AdaptiveKLController. + - ``horizon`` and ``target_kl``: See source code of AdaptiveKLController for details. + +- ``rollout_correction``: Rollout Correction configuration (nested dict). Set to ``null`` to disable. + When enabled, contains: + + - ``rollout_is``: IS weights aggregation level: ``token``, ``sequence``, or ``null`` to disable IS weights. + - ``rollout_is_threshold``: Upper threshold for IS weights (e.g., 2.0). + - ``rollout_rs``: Rejection sampling mode: ``token``, ``sequence``, ``geometric``, or ``null`` to disable RS. + - ``rollout_rs_threshold``: RS upper threshold. 
+ - ``rollout_rs_threshold_lower``: RS lower threshold (null = auto-reciprocal). + - ``rollout_token_veto_threshold``: Per-token veto threshold for catastrophic outliers (null = disabled). + + Note: Rollout Correction requires setting ``actor_rollout_ref.rollout.calculate_log_probs=True``. + +Trainer +~~~~~~~ + +.. code:: yaml + + trainer: + total_epochs: 30 + project_name: verl_examples + experiment_name: gsm8k + logger: ['console', 'wandb'] + log_val_generations: 0 + nnodes: 1 + n_gpus_per_node: 8 + save_freq: -1 + val_before_train: True + test_freq: 2 + critic_warmup: 0 + default_hdfs_dir: null # hdfs checkpoint path + default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name} # local checkpoint path + resume_mode: auto # or disable or resume_path if resume_from_path is set + resume_from_path: null + remove_previous_ckpt_in_save: False + del_local_ckpt_after_load: False + ray_wait_register_center_timeout: 300 + +- ``trainer.total_epochs``: Number of epochs in training. +- ``trainer.project_name``: For wandb, swanlab, mlflow +- ``trainer.experiment_name``: For wandb, swanlab, mlflow +- ``trainer.logger``: Support console and wandb, swanlab, mlflow, tensorboard, trackio +- ``trainer.log_val_generations``: The number of logged generation during validation (default ``0``) +- ``trainer.nnodes``: Number of nodes used in the training. +- ``trainer.n_gpus_per_node``: Number of GPUs per node. +- ``trainer.save_freq``: The frequency (by iteration) to save checkpoint + of the actor and critic model. +- ``trainer.val_before_train``: Whether to run validation before training. +- ``trainer.test_freq``: The validation frequency (by iteration). +- ``trainer.critic_warmup``: The number of iteration to train the critic + model before actual policy learning. +- ``trainer.resume_mode``: The mode of resuming training. Support + ``disable``, ``auto`` and ``resume_path``. 
If set to ``auto`` as default, the + program will automatically resume from the latest checkpoint in the + ``default_local_dir``. If set to ``resume_path``, the program will resume + from the path specified in ``resume_from_path``. +- ``trainer.resume_from_path``: The path to resume training from. Only + effective when ``resume_mode`` is set to ``resume_path``. +- ``trainer.remove_previous_ckpt_in_save``: Whether to remove previous + checkpoints in the save directory. Default is False. +- ``trainer.del_local_ckpt_after_load``: Whether to delete local + checkpoints after loading them. Default is False. +- ``trainer.ray_wait_register_center_timeout``: The timeout for waiting + for the ray register center to be ready. Default is 300 seconds. + + +This figure illustrates how the configurations affect the training. + +https://excalidraw.com/#json=pfhkRmiLm1jnnRli9VFhb,Ut4E8peALlgAUpr7E5pPCA + +.. image:: https://github.com/user-attachments/assets/16aebad1-0da6-4eb3-806d-54a74e712c2d + + +evaluation.yaml +--------------- + +Data +~~~~ + +.. code:: yaml + + data: + path: /tmp/math_Qwen2-7B-Instruct.parquet + prompt_key: prompt + response_key: responses + data_source_key: data_source + reward_model_key: reward_model + +- ``data.path``: Path to the dataset file (Parquet format). +- ``data.prompt_key``: The field in the dataset where the prompt is located. Default is 'prompt'. +- ``data.response_key``: The key holds the generated responses. This should be a list of strings representing the responses. Default is 'responses'. +- ``data.data_source_key``: This is used to separate metric calculations for different data sources, ensuring that metrics are calculated independently for each source. +- ``data.reward_model_key``: The key holds the reference answers. These reference answers typically serve as the ground truth or test cases for the task. + +Customized Reward Function +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
code:: yaml + + custom_reward_function: + path: null + name: compute_score + +- ``custom_reward_function.path``: The path to the file containing your customized reward function. If not specified, pre-implemented reward functions will be used. +- ``custom_reward_function.name`` (Optional) : The name of the reward function within the specified file. Default is 'compute_score'. + +sft_trainer.yaml for SFT FSDP Backend +-------------------------------------- + + +Optim +~~~~~~~ + +.. code:: yaml + + optim: + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1e-5 + weight_decay: 0.01 + lr_warmup_steps_ratio: 0.1 + clip_grad: 1.0 + lr_scheduler: cosine + override_optimizer_config: null + +- ``optimizer``: Optimizer class name (e.g., ``"AdamW"``, ``"AdamW8bit"``, ``"_AdamW"``). The class name as it appears in the module. +- ``optimizer_impl``: Module path to import optimizer from (e.g., ``"torch.optim"``, ``"torchao.optim"``, ``"bitsandbytes.optim"``). +- ``optim.lr``: Learning rate for the optimizer. +- ``optim.weight_decay``: Weight decay for the optimizer. +- ``optim.lr_warmup_steps_ratio``: Ratio of warmup steps to total training steps. +- ``optim.clip_grad``: Gradient clipping value. +- ``optim.lr_scheduler``: Learning rate scheduler type. Options: + + - ``cosine``: Cosine learning rate scheduler with warmup (default). + - ``wsd``: Warmup-Stable-Decay scheduler that provides a stable learning rate phase between warmup and decay phases. + +- ``override_optimizer_config``: Dictionary of additional optimizer-specific keyword arguments. For example, to use ``torchao.optim``'s ``_AdamW`` with BF16 stochastic rounding: ``{"bf16_stochastic_round": true}`` + +Model +~~~~~~~~~~~~ + +Most parameters for Model are similar to Reward Model. + +.. 
code:: yaml + + model: + partial_pretrain: ~/models/gemma-1.1-7b-it + fsdp_config: + model_dtype: fp32 + wrap_policy: + min_num_params: 0 + cpu_offload: False + offload_params: False + external_lib: null + enable_gradient_checkpointing: False + trust_remote_code: False + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + use_liger: False + +- ``partial_pretrain``: HDFS path or local path for the pretrained model. +- ``fsdp_config`` + + - ``model_dtype``: Model parameters type, default to ``fp32``. + Support: ``bf16``, ``fp16``, ``fp32``. + - ``cpu_offload``: Whether to enable CPU offloading for FSDP. If True, + the offload_params will be used as argument. + - ``offload_params``: Whether to offload parameters to CPU + when not involved in computation. If True, then this offloads gradients + to CPU as well, meaning that the optimizer step runs on CPU. + +- ``lora_rank``: The rank of the LoRA model, default to 0. If ``lora_rank``>0, + we will train LoRA modules instead of tuning the full model. +- ``lora_alpha``: The alpha parameter for LoRA scaling, default to 16. +- ``target_modules``: The names of the modules to apply the adapter to, + default to ``all-linear``. See `peft docs `_ for detail. + +- ``use_liger``: Whether to enable Liger kernel, default to False. If True, + we apply Liger kernel to the model (depends on `liger-kernel`). diff --git a/code/RL_model/verl/verl_train/docs/examples/gsm8k_example.rst b/code/RL_model/verl/verl_train/docs/examples/gsm8k_example.rst new file mode 100644 index 0000000000000000000000000000000000000000..bc56497be64e578c6623fc917e34d376457b3676 --- /dev/null +++ b/code/RL_model/verl/verl_train/docs/examples/gsm8k_example.rst @@ -0,0 +1,190 @@ +GSM8K Example +============= + +Last updated: 03/25/2025. + +Introduction +------------ + +In this example, we train an LLM to tackle the GSM8k task. 
+ +Paper: https://arxiv.org/pdf/2110.14168 + +Dataset: https://huggingface.co/datasets/openai/gsm8k + +Note that the original paper mainly focuses on training a verifier (a +reward model) to solve math problems via Best-of-N sampling. In this +example, we train an RLHF agent using a rule-based reward model. + +Dataset Introduction +-------------------- + +GSM8k is a math problem dataset. The prompt is an elementary school +problem. The LLM model is required to answer the math problem. + +The training set contains 7473 samples and the test set contains 1319 +samples. + +**An example** + +Prompt + + Katy makes coffee using teaspoons of sugar and cups of water in the + ratio of 7:13. If she used a total of 120 teaspoons of sugar and cups + of water, calculate the number of teaspoonfuls of sugar she used. + +Solution + + The total ratio representing the ingredients she used to make the + coffee is 7+13 = <<7+13=20>>20 Since the fraction representing the + number of teaspoons she used is 7/20, she used 7/20\ *120 = + <<7/20*\ 120=42>>42 #### 42 + +Step 1: Prepare dataset +----------------------- + +.. code:: bash + + cd examples/data_preprocess + python3 gsm8k.py --local_save_dir ~/data/gsm8k + +Step 2: Download Model +---------------------- + +There're three ways to prepare the model checkpoints for post-training: + +- Download the required models from huggingface or modelscope + +.. code:: bash + + hf download deepseek-ai/deepseek-math-7b-instruct --local-dir ~/models/deepseek-math-7b-instruct --local-dir-use-symlinks False + # or + modelscope download --model deepseek-ai/deepseek-math-7b-instruct --local_dir ~/models/deepseek-math-7b-instruct + +- Already store your store model in the local directory or HDFS path. +- Also, you can directly use the model name in huggingface (e.g., + deepseek-ai/deepseek-math-7b-instruct) in + ``actor_rollout_ref.model.path`` and ``critic.model.path`` field in + the run script. 
You can also download models from modelscope by setting the environment variable ``VERL_USE_MODELSCOPE=True``. + See examples/ppo_trainer/run_deepseek7b_llm_modelscope.sh for example. + +Note that users should prepare checkpoints for actor, critic and reward +model. + +[Optional] Step 3: SFT your Model +--------------------------------- + +We provide an SFT Trainer using PyTorch FSDP in +`fsdp_sft_trainer.py `_. +Users can customize their own SFT +script using our FSDP SFT Trainer. + +We also provide various training scripts for SFT on the GSM8K dataset in the `gsm8k sft directory `_. + +.. code:: shell + + set -x + + torchrun -m verl.trainer.fsdp_sft_trainer \ + data.train_files=$HOME/data/gsm8k/train.parquet \ + data.val_files=$HOME/data/gsm8k/test.parquet \ + data.prompt_key=question \ + data.response_key=answer \ + data.micro_batch_size_per_gpu=8 \ + model.partial_pretrain=deepseek-ai/deepseek-coder-6.7b-instruct \ + trainer.project_name=gsm8k-sft \ + trainer.experiment_name=gsm8k-sft-deepseek-coder-6.7b-instruct \ + trainer.total_epochs=4 \ + trainer.logger='["console","wandb"]' + + +If you use AMD GPUs (ROCm kernel), you need to add the following environment variables into the run script: + + .. code-block:: bash + + export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + export ROCR_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES + export CUDA_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES + + +Step 4: Perform PPO training with your model on GSM8K Dataset +------------------------------------------------------------- + +- Prepare your own run.sh script. Here's an example for the GSM8k dataset + and deepseek-llm-7b-chat model. +- Users could replace the ``data.train_files``, ``data.val_files``, + ``actor_rollout_ref.model.path`` and ``critic.model.path`` based on + their environment. +- See :doc:`config` for detailed explanation of each config field. + +**Reward Model/Function** + +We use a rule-based reward model. We force the model to produce a final +answer following 4 “#” as shown in the solution. 
We extract the final +answer from both the solution and model's output using regular +expression matching. We compare them and assign a reward of 1 to correct +answer, 0.1 to incorrect answer and 0 to no answer. + +**Training Script** + +The training script example for FSDP and Megatron-LM backend are stored in examples/ppo_trainer directory. + +.. code:: bash + + cd ../ppo_trainer + bash run_deepseek7b_llm.sh + +The script of run_deepseek7b_llm.sh + +.. code:: bash + + set -x + + python3 -m verl.trainer.main_ppo \ + data.train_files=$HOME/data/gsm8k/train.parquet \ + data.val_files=$HOME/data/gsm8k/test.parquet \ + data.train_batch_size=1024 \ + data.max_prompt_length=512 \ + data.max_response_length=512 \ + actor_rollout_ref.model.path=deepseek-ai/deepseek-llm-7b-chat \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.actor.ppo_mini_batch_size=256 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \ + actor_rollout_ref.actor.fsdp_config.param_offload=False \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ + actor_rollout_ref.model.enable_gradient_checkpointing=True \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=4 \ + actor_rollout_ref.rollout.name=vllm \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.5 \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 \ + actor_rollout_ref.ref.fsdp_config.param_offload=True \ + critic.optim.lr=1e-5 \ + critic.model.use_remove_padding=True \ + critic.model.path=deepseek-ai/deepseek-llm-7b-chat \ + critic.model.enable_gradient_checkpointing=True \ + critic.ppo_micro_batch_size_per_gpu=32 \ + critic.model.fsdp_config.param_offload=False \ + critic.model.fsdp_config.optimizer_offload=False \ + algorithm.kl_ctrl.kl_coef=0.001 \ + trainer.critic_warmup=0 \ + trainer.logger='["console","wandb"]' \ + trainer.project_name='verl_example_gsm8k' \ + 
trainer.experiment_name='deepseek_llm_7b_function_rm' \ + trainer.n_gpus_per_node=8 \ + trainer.nnodes=1 \ + trainer.save_freq=-1 \ + trainer.test_freq=1 \ + trainer.total_epochs=15 $@ + + +If you use AMD GPUs (ROCm kernel), you need to add the following environment variables into the run script: + + .. code-block:: bash + + export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + export ROCR_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES + export CUDA_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES + +If you encounter any issues in using AMD GPUs running VeRL, feel free to contact me - `Yusheng Su `_. \ No newline at end of file diff --git a/code/RL_model/verl/verl_train/docs/examples/multi_modal_example.rst b/code/RL_model/verl/verl_train/docs/examples/multi_modal_example.rst new file mode 100644 index 0000000000000000000000000000000000000000..844005b66eac5a8b0543d3e67a722c0c11293c95 --- /dev/null +++ b/code/RL_model/verl/verl_train/docs/examples/multi_modal_example.rst @@ -0,0 +1,45 @@ +Multi-Modal Example Architecture +================================= + +Last updated: 04/28/2025. + +Introduction +------------ + +Now, verl supports multi-modal training. You can use fsdp and +vllm/sglang to start a multi-modal RL task. Megatron support is also +on the way. + +Follow the steps below to quickly start a multi-modal RL task. + +Step 1: Prepare dataset +----------------------- + +.. code:: python + + # it will be saved in the $HOME/data/geo3k folder + python examples/data_preprocess/geo3k.py + +Step 2: Download Model +---------------------- + +.. code:: bash + + # download the model from huggingface + python3 -c "import transformers; transformers.pipeline(model='Qwen/Qwen2.5-VL-7B-Instruct')" + +Step 3: Perform GRPO training with multi-modal model on Geo3K Dataset +--------------------------------------------------------------------- + +.. 
code:: bash + + # run the task + bash examples/grpo_trainer/run_qwen2_5_vl-7b.sh + + + + + + + + diff --git a/code/RL_model/verl/verl_train/docs/examples/ppo_code_architecture.rst b/code/RL_model/verl/verl_train/docs/examples/ppo_code_architecture.rst new file mode 100644 index 0000000000000000000000000000000000000000..94d62413a2a684385eae801281995d6a02f05b3a --- /dev/null +++ b/code/RL_model/verl/verl_train/docs/examples/ppo_code_architecture.rst @@ -0,0 +1,209 @@ +PPO Example Architecture +======================== + +Last updated: 02/17/2025. + +Let's start with the Proximal Policy Optimization algorithm, which is the +most widely used algorithm in LLM post-training. + +The main entry point of the PPO algorithm example is: +`main_ppo.py `_. +In this tutorial, we will go through the code architecture in `main_ppo.py `_. + +Define the data +--------------- + +Users need to preprocess and store the dataset in parquet files. +And we implement `RLHFDataset` to load and tokenize the parquet files. + +For ``RLHFDataset`` (Default), at least 1 field is required: + +- ``prompt``: Contains the string prompt + +We already provide some examples of processing the datasets to parquet +files in `data_preprocess directory `_. Currently, we support +preprocessing of GSM8k, MATH, Hellasage, Full_hh_rlhf datasets. See :doc:`../preparation/prepare_data` for +more information. + +Define the reward functions for different datasets +-------------------------------------------------- + +In this main entry point, the users only need to define their own reward +function based on the datasets (or applications) utilized in PPO +training. + +For example, we already provide reward functions for `GSM8k `_ +and `MATH `_ +datasets in the ``_select_rm_score_fn``. In the ``RewardManager``, we +will compute the reward score based on the data_source to select +corresponding reward functions. 
For some RLHF datasets (e.g., +full_hh_rlhf), the reward model is utilized to assess the responses +without any reward functions. In this case, the ``RewardManager`` will +return the ``rm_score`` computed by the reward model directly. + +See `reward functions `_ for detailed implementation. + +Define worker classes +--------------------- + +.. code:: python + + if config.actor_rollout_ref.actor.strategy in {"fsdp", "fsdp2"}: # for FSDP backend + assert config.critic.strategy in {"fsdp", "fsdp2"} + from verl.workers.fsdp_workers import ActorRolloutRefWorker, CriticWorker + from verl.single_controller.ray import RayWorkerGroup + ray_worker_group_cls = RayWorkerGroup + + elif config.actor_rollout_ref.actor.strategy == 'megatron': # for Megatron backend + assert config.actor_rollout_ref.actor.strategy == config.critic.strategy + from verl.workers.megatron_workers import ActorRolloutRefWorker, CriticWorker + from verl.single_controller.ray.megatron import NVMegatronRayWorkerGroup + ray_worker_group_cls = NVMegatronRayWorkerGroup # Ray worker class for Megatron-LM + + else: + raise NotImplementedError + + from verl.trainer.ppo.ray_trainer import ResourcePoolManager, Role + + role_worker_mapping = { + Role.ActorRollout: ActorRolloutRefWorker, + Role.Critic: CriticWorker, + Role.RefPolicy: ActorRolloutRefWorker + } + + global_pool_id = 'global_pool' + resource_pool_spec = { + global_pool_id: [config.trainer.n_gpus_per_node] * config.trainer.nnodes, + } + mapping = { + Role.ActorRollout: global_pool_id, + Role.Critic: global_pool_id, + Role.RefPolicy: global_pool_id, + } + +Step 1: Construct the mapping between roles and workers +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +A role represents a group of workers in the same process. We have +pre-defined several roles in `ray_trainer.py `_. + +.. 
code:: python + + class Role(Enum): + """ + To create more roles dynamically, you can subclass Role and add new members + """ + Actor = 0 # This worker only has Actor + Rollout = 1 # This worker only has Rollout + ActorRollout = 2 # This worker has both actor and rollout, it's a HybridEngine + Critic = 3 # This worker only has critic + RefPolicy = 4 # This worker only has reference policy + RewardModel = 5 # This worker only has reward model + ActorRolloutRef = 6 # This worker contains actor, rollout and reference policy simultaneously + +Step 2: Define the worker class corresponding to this role +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +- We have pre-implemented the ``ActorRolloutRefWorker``. Through + different configs, it can be a standalone actor, a standalone rollout, + an ActorRollout HybridEngine, or an ActorRolloutRef HybridEngine +- We also pre-implemented workers for ``Actor``, ``Rollout``, + ``Critic``, ``Reward Model`` and ``Reference model`` on two different + backend: PyTorch FSDP + and Megatron-LM. + See `FSDP Workers `_ + and `Megatron-LM Workers `_ + for more information. + +Step 3: Define resource pool id and resource pool spec +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +- Resource pool is a division of global GPU resources, + ``resource_pool_spec`` is a dict, mapping from id to # of GPUs + + - In the above example, we defined a global resource pool: + global_pool_id, and then put all roles on this one resource pool + with all the GPUs in this post-training task. This refers to + *co-locate* placement where all the models share the same set of + GPUs. + +- See resource pool and placement for advance usage. + +Defining reward model/function +------------------------------ + +.. 
code:: python + + # we should adopt a multi-source reward function here + # - for rule-based rm, we directly call a reward score + # - for model-based rm, we call a model + # - for code related prompt, we send to a sandbox if there are test cases + # - finally, we combine all the rewards together + # - The reward type depends on the tag of the data + if config.reward_model.enable: + from verl.workers.fsdp_workers import RewardModelWorker + role_worker_mapping[Role.RewardModel] = RewardModelWorker + mapping[Role.RewardModel] = global_pool_id + + reward_fn = RewardManager(tokenizer=tokenizer, num_examine=0) + + # Note that we always use function-based RM for validation + val_reward_fn = RewardManager(tokenizer=tokenizer, num_examine=1) + + resource_pool_manager = ResourcePoolManager(resource_pool_spec=resource_pool_spec, mapping=mapping) + +Since not all tasks use model-based RM, users need to define here +whether it's a model-based RM or a function-based RM. + +- If it's a model-based RM, directly add the ``RewardModel`` role in the + resource mapping and add it to the resource pool mapping. + + - Note that the pre-defined ``RewardModelWorker`` only supports models + with the structure of huggingface + ``AutoModelForSequenceClassification``. If it's not this model, you + need to define your own RewardModelWorker in `FSDP Workers `_ + and `Megatron-LM Workers `_. + +- If it's a function-based RM, the users are required to specify the + reward function for each dataset. + +.. code:: python + + def _select_rm_score_fn(data_source): + if data_source == 'openai/gsm8k': + return gsm8k.compute_score + elif data_source == 'lighteval/MATH': + return math.compute_score + else: + raise NotImplementedError + +See reward functions implemented in `directory `_ +for more information. + +Define, init and run the PPO Trainer +------------------------------------ + +.. 
code:: python + + trainer = RayPPOTrainer(config=config, + tokenizer=tokenizer, + role_worker_mapping=role_worker_mapping, + resource_pool_manager=resource_pool_manager, + ray_worker_group_cls=ray_worker_group_cls, + reward_fn=reward_fn, + val_reward_fn=val_reward_fn) + trainer.init_workers() + trainer.fit() + +- We first initialize the ``RayPPOTrainer`` with user config, tokenizer + and all the above worker mapping, resource pool, worker group and + reward functions +- We then call ``trainer.init_workers()`` to initialize the models + on the allocated GPUs (in the resource pool) + +- The actual PPO training will be executed in ``trainer.fit()`` + +verl can be easily extended to other RL algorithms by reusing the Ray +model workers, resource pool and reward functions. See :doc:`extension<../advance/dpo_extension>` for +more information. + +Details of the ``RayPPOTrainer`` are discussed in :doc:`Ray Trainer<../workers/ray_trainer>`. diff --git a/code/RL_model/verl/verl_train/docs/examples/sandbox_fusion_example.rst b/code/RL_model/verl/verl_train/docs/examples/sandbox_fusion_example.rst new file mode 100644 index 0000000000000000000000000000000000000000..f3359efda2e14fa6d869b9af21060d6053ac112e --- /dev/null +++ b/code/RL_model/verl/verl_train/docs/examples/sandbox_fusion_example.rst @@ -0,0 +1,54 @@ +Sandbox Fusion Example +============================ + +Last updated: 06/27/2025. + +Introduction +------------ + +Sandbox Fusion is a remote code sandbox service that provides a secure environment for running and evaluating code generated by Large Language Models (LLMs). This example demonstrates how to train an LLM and use Sandbox Fusion to verify generated code, enhancing both security and performance. + +By leveraging a remote code sandbox service with greater CPU resources for concurrent code verification, you can reduce the reward stage time by 10-30%, depending on the quality of the generated code. 
+ +Step 1: Prepare the Dataset +--------------------------- + +We use the Eurus-2-RL-Data dataset for training. This dataset combines math and code questions, making it suitable for LLM training tasks. You can download it from HuggingFace: `Eurus-2-RL-Data Dataset `_. + +Step 2: Set Up the Sandbox Fusion Service +----------------------------------------- + +Sandbox Fusion is a remote code sandbox service designed to securely run and evaluate LLM-generated code. To use it: + +1. **Access Full Documentation**: For detailed setup instructions, refer to the `Sandbox Fusion Documentation `_. +2. **Deploy the Service**: Choose one of the following deployment methods: + + - **Local Deployment**: Follow the guide `here `_. + - **FaaS Instance (Volcengine)**: Create an instance using the `Volcengine Documentation `_. + +After deployment, you will receive an API endpoint in the format: ``https:///run_code``. + +Step 3: Configure the Training Script +------------------------------------- + +To integrate Sandbox Fusion into your training script, configure the following parameters: + +**Key Settings for Sandbox Fusion** + +- ``reward_model.sandbox_fusion.url=''``: Enable Sandbox Fusion by specifying the API endpoint (must end with ``/run_code``). +- ``reward_model.sandbox_fusion.max_concurrent=256``: Set the maximum number of concurrent API requests to the Sandbox Fusion service. +- ``reward_model.sandbox_fusion.memory_limit_mb=1024``: Set the memory limit (in MB) for each sandbox instance. Defaults to 1024MB if not specified. + +**Additional Optimization** + +To further reduce code verification time, enable parallel processing with: + +- ``reward_model.reward_manager=prime``: The Prime reward manager verifies code across multiple subprocesses concurrently. 
+ +**Example Script** + +For a practical implementation, refer to the example script: + +``examples/ppo_trainer/run_deepseek7b_llm_sandbox_fusion.sh`` + +Once you’ve set your API endpoint in the script, you can start the training job. \ No newline at end of file diff --git a/code/RL_model/verl/verl_train/docs/examples/skypilot_examples.rst b/code/RL_model/verl/verl_train/docs/examples/skypilot_examples.rst new file mode 100644 index 0000000000000000000000000000000000000000..de91781be63290be6da5bf4b62624addb6446a2d --- /dev/null +++ b/code/RL_model/verl/verl_train/docs/examples/skypilot_examples.rst @@ -0,0 +1,146 @@ +SkyPilot Examples +================= + +Last updated: 09/04/2025. + +This guide provides examples of running VERL reinforcement learning training on Kubernetes clusters or cloud platforms with GPU nodes using `SkyPilot `_. + +Installation and Configuration +------------------------------- + +Step 1: Install SkyPilot +~~~~~~~~~~~~~~~~~~~~~~~~~ + +Choose the installation based on your target platform: + +.. code-block:: bash + + # For Kubernetes only + pip install "skypilot[kubernetes]" + + # For AWS + pip install "skypilot[aws]" + + # For Google Cloud Platform + pip install "skypilot[gcp]" + + # For Azure + pip install "skypilot[azure]" + + # For multiple platforms + pip install "skypilot[kubernetes,aws,gcp,azure]" + +Step 2: Configure Your Platform +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +See https://docs.skypilot.co/en/latest/getting-started/installation.html + +Step 3: Set Up Environment Variables +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Export necessary API keys for experiment tracking: + +.. code-block:: bash + + # For Weights & Biases tracking + export WANDB_API_KEY="your-wandb-api-key" + + # For HuggingFace gated models (if needed) + export HF_TOKEN="your-huggingface-token" + +Examples +-------- + +All example configurations are available in the `examples/skypilot/ `_ directory on GitHub. See the `README `_ for additional details. 
+ +PPO Training +~~~~~~~~~~~~ + +.. code-block:: bash + + sky launch -c verl-ppo verl-ppo.yaml --secret WANDB_API_KEY -y + +Runs PPO training on GSM8K dataset using Qwen2.5-0.5B-Instruct model across 2 nodes with H100 GPUs. Based on examples in ``examples/ppo_trainer/``. + +`View verl-ppo.yaml on GitHub `_ + +GRPO Training +~~~~~~~~~~~~~ + +.. code-block:: bash + + sky launch -c verl-grpo verl-grpo.yaml --secret WANDB_API_KEY -y + +Runs GRPO (Group Relative Policy Optimization) training on MATH dataset using Qwen2.5-7B-Instruct model. Memory-optimized configuration for 2 nodes. Based on examples in ``examples/grpo_trainer/``. + +`View verl-grpo.yaml on GitHub `_ + +Multi-turn Tool Usage Training +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + sky launch -c verl-multiturn verl-multiturn-tools.yaml \ + --secret WANDB_API_KEY --secret HF_TOKEN -y + +Single-node training with 8xH100 GPUs for multi-turn tool usage with Qwen2.5-3B-Instruct. Includes tool and interaction configurations for GSM8K. Based on examples in ``examples/sglang_multiturn/`` but uses vLLM instead of sglang. + +`View verl-multiturn-tools.yaml on GitHub `_ + +Configuration +------------- + +The example YAML files are pre-configured with: + +- **Infrastructure**: Kubernetes clusters (``infra: k8s``) - can be changed to ``infra: aws`` or ``infra: gcp``, etc. 
+- **Docker Image**: VERL's official Docker image with CUDA 12.6 support +- **Setup**: Automatically clones and installs VERL from source +- **Datasets**: Downloads required datasets during setup phase +- **Ray Cluster**: Configures distributed training across nodes +- **Logging**: Supports Weights & Biases via ``--secret WANDB_API_KEY`` +- **Models**: Supports gated HuggingFace models via ``--secret HF_TOKEN`` + +Launch Command Options +---------------------- + +- ``-c ``: Cluster name for managing the job +- ``--secret KEY``: Pass secrets for API keys (can be used multiple times) +- ``-y``: Skip confirmation prompt + +Monitoring Your Jobs +-------------------- + +Check Cluster Status +~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + sky status + +View Logs +~~~~~~~~~ + +.. code-block:: bash + + sky logs verl-ppo # View logs for the PPO job + +SSH into Head Node +~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + ssh verl-ppo + +Access Ray Dashboard +~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + sky status --endpoint 8265 verl-ppo # Get dashboard URL + +Stop a Cluster +~~~~~~~~~~~~~~ + +.. code-block:: bash + + sky down verl-ppo diff --git a/code/RL_model/verl/verl_train/docs/faq/faq.rst b/code/RL_model/verl/verl_train/docs/faq/faq.rst new file mode 100644 index 0000000000000000000000000000000000000000..aa150d65b1da895da0ae4b6780513be501cc0b52 --- /dev/null +++ b/code/RL_model/verl/verl_train/docs/faq/faq.rst @@ -0,0 +1,209 @@ +Frequently Asked Questions +==================================== + +Last updated: 09/24/2025. + +Ray related +------------ + +How to add breakpoint for debugging with distributed Ray? 
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Please checkout the official debugging guide from Ray: https://docs.ray.io/en/latest/ray-observability/ray-distributed-debugger.html + + +"Unable to register worker with raylet" +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The cause of this issue is due to some system setting, e.g., SLURM added some constraints on how the CPUs are shared on a node. +While `ray.init()` tries to launch as many worker processes as the number of CPU cores of the machine, +some constraints of SLURM restricts the `core-workers` seeing the `raylet` process, leading to the problem. + +To fix this issue, you can set the config term ``ray_init.num_cpus`` to a number allowed by your system. + +Distributed training +------------------------ + +How to run multi-node post-training with Ray? +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +You can start a ray cluster and submit a ray job, following the official guide from Ray: https://docs.ray.io/en/latest/ray-core/starting-ray.html + +Then in the configuration, set the ``trainer.nnode`` config to the number of machines for your job. + +How to use verl on a Slurm-managed cluster? +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Ray provides users with `this `_ official +tutorial to start a Ray cluster on top of Slurm. We have verified the :doc:`GSM8K example<../examples/gsm8k_example>` +on a Slurm cluster under a multi-node setting with the following steps. + +1. [Optional] If your cluster support `Apptainer or Singularity `_ and you wish +to use it, convert verl's Docker image to an Apptainer image. 
Alternatively, set up the environment with the package +manager available on your cluster or use other container runtimes (e.g. through `Slurm's OCI support `_) available to you. + +.. code:: bash + + apptainer pull /your/dest/dir/vemlp-th2.4.0-cu124-vllm0.6.3-ray2.10-te1.7-v0.0.3.sif docker://verlai/verl:vemlp-th2.4.0-cu124-vllm0.6.3-ray2.10-te1.7-v0.0.3 + +2. Follow :doc:`GSM8K example<../examples/gsm8k_example>` to prepare the dataset and model checkpoints. + +3. Modify `examples/slurm/ray_on_slurm.slurm `_ with your cluster's own information. + +4. Submit the job script to the Slurm cluster with `sbatch`. + +Please note that Slurm cluster setup may vary. If you encounter any issues, please refer to Ray's +`Slurm user guide `_ for common caveats. + +If you changed Slurm resource specifications, please make sure to update the environment variables in the job script if necessary. + + +Install related +------------------------ + +NotImplementedError: TensorDict does not support membership checks with the `in` keyword. +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Detail error information: + +.. code:: bash + + NotImplementedError: TensorDict does not support membership checks with the `in` keyword. If you want to check if a particular key is in your TensorDict, please use `key in tensordict.keys()` instead. + +Cause of the problem: There is no suitable version of tensordict package for the linux-arm64 platform. The confirmation method is as follows: + +.. code:: bash + + pip install tensordict==0.6.2 + +Output example: + +.. code:: bash + + ERROR: Could not find a version that satisfies the requirement tensordict==0.6.2 (from versions: 0.0.1a0, 0.0.1b0, 0.0.1rc0, 0.0.2a0, 0.0.2b0, 0.0.3, 0.1.0, 0.1.1, 0.1.2, 0.8.0, 0.8.1, 0.8.2, 0.8.3) + ERROR: No matching distribution found for tensordict==0.6.2 + +Solution 1st: + Install tensordict from source code: + +.. 
code:: bash + + pip uninstall tensordict + git clone https://github.com/pytorch/tensordict.git + cd tensordict/ + git checkout v0.6.2 + python setup.py develop + pip install -v -e . + +Solution 2nd: + Temporarily modify the code where the error occurs: tensordict_var -> tensordict_var.keys() + + +Illegal memory access +--------------------------------- + +If you encounter an error message like ``CUDA error: an illegal memory access was encountered`` during rollout, please check the vLLM documentation for troubleshooting steps specific to your vLLM version. + +Checkpoints +------------------------ + +If you want to convert the model checkpoint into huggingface safetensor format, please refer to ``verl/model_merger``. + + +Triton ``compile_module_from_src`` error +------------------------------------------------ + +If you encounter a Triton compilation error similar to the stacktrace below, please set the ``use_torch_compile`` flag according to +https://verl.readthedocs.io/en/latest/examples/config.html to disable just-in-time compilation for fused kernels. + +.. 
code:: bash + + File "/data/lbh/conda_envs/verl/lib/python3.10/site-packages/triton/runtime/jit.py", line 345, in + return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs) + File "/data/lbh/conda_envs/verl/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 338, in run + return self.fn.run(*args, **kwargs) + File "/data/lbh/conda_envs/verl/lib/python3.10/site-packages/triton/runtime/jit.py", line 607, in run + device = driver.active.get_current_device() + File "/data/lbh/conda_envs/verl/lib/python3.10/site-packages/triton/runtime/driver.py", line 23, in __getattr__ + self._initialize_obj() + File "/data/lbh/conda_envs/verl/lib/python3.10/site-packages/triton/runtime/driver.py", line 20, in _initialize_obj + self._obj = self._init_fn() + File "/data/lbh/conda_envs/verl/lib/python3.10/site-packages/triton/runtime/driver.py", line 9, in _create_driver + return actives[0]() + File "/data/lbh/conda_envs/verl/lib/python3.10/site-packages/triton/backends/nvidia/driver.py", line 371, in __init__ + self.utils = CudaUtils() # TODO: make static + File "/data/lbh/conda_envs/verl/lib/python3.10/site-packages/triton/backends/nvidia/driver.py", line 80, in __init__ + mod = compile_module_from_src(Path(os.path.join(dirname, "driver.c")).read_text(), "cuda_utils") + File "/data/lbh/conda_envs/verl/lib/python3.10/site-packages/triton/backends/nvidia/driver.py", line 57, in compile_module_from_src + so = _build(name, src_path, tmpdir, library_dirs(), include_dir, libraries) + File "/data/lbh/conda_envs/verl/lib/python3.10/site-packages/triton/runtime/build.py", line 48, in _build + ret = subprocess.check_call(cc_cmd) + File "/data/lbh/conda_envs/verl/lib/python3.10/subprocess.py", line 369, in check_call + raise CalledProcessError(retcode, cmd) + +What is the meaning of train batch size, mini batch size, and micro batch size? 
+------------------------------------------------------------------------------------------ + +This figure illustrates the relationship between different batch size configurations. + +https://excalidraw.com/#json=pfhkRmiLm1jnnRli9VFhb,Ut4E8peALlgAUpr7E5pPCA + +.. image:: https://github.com/user-attachments/assets/16aebad1-0da6-4eb3-806d-54a74e712c2d + +How to generate a ray timeline to analyse performance of a training job? +------------------------------------------------------------------------------------------ + +To generate the ray timeline file, you can set the config term ``ray_init.timeline_json_file`` to a json file path. +For example: + +.. code:: bash + + ray_init.timeline_json_file=/tmp/ray_timeline.json + +The file will be generated in the specified path at the end of a training job. +You can use tools like chrome://tracing or the Perfetto UI and view the ray timeline file. + +This figure shows the ray timeline file generated from a training job on 1 node with 4 GPUs. + +.. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/ray_timeline.png?raw=true + +How to set proxy only for wandb? +------------------------------------------------------------------------------------------ + +If you need a proxy to access wandb, you can add the config below in your training job script. +Compared to using the global https_proxy env variable, this approach won't mess up other http requests, such as ChatCompletionScheduler. + +.. code:: bash + + +trainer.wandb_proxy=http:// + +Mismatch between inference and training sequence (high actor/grad_norm) +------------------------------------------------------------------------------------------ + +If you encounter the issue of the actor/grad_norm metric continuously increasing during training, it might be caused by a significant precision mismatch between the inference engine and training. You can use the following parameter to confirm this: + +.. 
code:: bash + + actor_rollout_ref.rollout.calculate_log_probs=True + +This parameter will add metrics like training/rollout_probs_diff_mean, which can be used to verify if there is a precision difference between inference and training. + +Under normal circumstances, the value of training/rollout_probs_diff_mean should be below 0.005. If you observe this value to be higher than 0.01, it indicates a precision issue from the inference engine. +The precision issue is known to occur under the following conditions: + +1. Using non-Hopper architecture GPUs, such as A100, L20, B200, etc. + +2. Using vLLM `with issue 22103 `_ as the inference engine. + +3. The input and output texts are long, for example, in multi-turn scenarios using reasoning models like Qwen3 for RL training. + +If all three conditions above are met and you observe that rollout_probs_diff_mean is too high, it is recommended to add the following parameter to resolve the precision issue: + +.. code:: bash + + +actor_rollout_ref.rollout.engine_kwargs.vllm.disable_cascade_attn=True + +The root cause of this issue is a bug in the flash attention used by vLLM. Although it has been fixed, the fix has not yet been released in the latest version of vLLM (v0.10.2). +For a more detailed explanation of this issue, please refer to `Fix LSE output error in FA2 kv-split `_. + +Until vLLM releases a new version with this fix, it is recommended to use the configuration above to disable cascade attention as a workaround. diff --git a/code/RL_model/verl/verl_train/docs/perf/best_practices.rst b/code/RL_model/verl/verl_train/docs/perf/best_practices.rst new file mode 100644 index 0000000000000000000000000000000000000000..69d8286710ad01d04cf60366a52b398f3dfb7b6d --- /dev/null +++ b/code/RL_model/verl/verl_train/docs/perf/best_practices.rst @@ -0,0 +1,242 @@ +Verl LLM Best Practices (DAPO + Qwen3-235B) +=========================================== + +Last updated: 11/03/2025. 
+ +Purpose +------- + +This guide uses DAPO training on Qwen3-235B as a concrete example. We unpack every parameter that appears in the optimization objective, map it to Verl configuration entries, and share field-tested recommendations so you can derive sensible settings for your own workloads. + +.. note:: + + 1. The guide only covers the subset of parameters required to reproduce the DAPO experiments discussed here. For the full list, refer to the ``config`` components in the Verl source tree: https://github.com/volcengine/verl/tree/main/verl/trainer/config + 2. PPO and GRPO introduce KL-constrained policies. We therefore include that setup in the explanations below. You can treat all configurations mentioned here as a DAPO pipeline augmented with a KL penalty. + +Optimization Objectives +----------------------- + +DAPO objective +~~~~~~~~~~~~~~ + +.. math:: + + \begin{aligned} + \mathcal{J}_{\mathrm{DAPO}}(\theta)= & \mathbb{E}_{(q, a) \sim \mathcal{D},\left\{o_i\right\}_{i=1}^G \sim \pi_{\theta_{\text {old }}}(\cdot \mid q)} \ + {\left[\frac{1}{\sum_{i=1}^G\left|o_i\right|} \sum_{i=1}^G \sum_{t=1}^{\left|o_i\right|} \min \left(r_{i, t}(\theta) \hat{A}_{i, t}, \operatorname{clip}\left(r_{i, t}(\theta), 1-\varepsilon_{\text {low }}, 1+\varepsilon_{\text {high }}\right) \hat{A}_{i, t}\right)\right] } \\ + \end{aligned} + +.. math:: + \text { s.t. } \quad 0<\mid\left\{o_i \mid \text { is_equivalent }\left(a, o_i\right)\right\} \mid 2 * model_parameters`` (bf16/fp16). Increase TP gradually to expand KV cache capacity while watching communication cost—especially once TP > 8. + - ``actor_rollout_ref.rollout.temperature`` / ``top_p`` / ``top_k``: + Sampling knobs for rollout. Keep enough randomness; ``temperature=1.0``, ``top_p=1.0``, ``top_k=-1`` are good defaults. + - ``actor_rollout_ref.rollout.val_kwargs.temperature`` / ``top_p`` / ``top_k`` / ``do_sample`` / ``n``: + Sampling options for validation. Set ``temperature > 0`` to prevent repetitive thinking chains. 
For small test sets (e.g., AIME24) raise ``n`` (64 is a common choice) to reduce variance. A practical starting point is ``temperature=1.0``, ``top_p=0.7``, ``top_k=-1``, ``do_sample=True``, ``n=1`` and then increase ``n`` as needed. + - ``+actor_rollout_ref.rollout.engine_kwargs.vllm.*`` / ``+actor_rollout_ref.rollout.engine_kwargs.sglang.*``: + Extra backend options injected via the ``+`` syntax. Consult backend docs for exact semantics. Some switches (for example ``pipeline_parallel_size``) may not be supported yet; when TP=32, ``enable_expert_parallel=True`` can even slow down DeepSeek-V3 rollout, so benchmark carefully. + +:math:`\pi_\theta` + - ``data.train_batch_size``: + Total batch size per training iteration. Each rollout produces ``train_batch_size * n`` samples. Larger values reduce the number of rollouts but increase off-policy drift. + - ``actor_rollout_ref.actor.ppo_mini_batch_size``: + Mini-batch size per optimization step. Tune it the same way you would for standard deep learning workloads. + - ``actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu``: + Samples processed per forward pass on one GPU group (a Megatron group contains TP * PP * CP GPUs). Keep it ≤ ``ppo_mini_batch_size`` and as large as memory allows. + - ``actor_rollout_ref.actor.use_dynamic_bsz``: + Enable dynamic batch sizing to adapt to sequence length and improve throughput. + - ``actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu``: + Maximum tokens per GPU when computing log probabilities under dynamic batching. Set it to at least a multiple of ``max_prompt_length + max_response_length`` to prevent truncation. + - Megatron parallelism parameters (``pipeline_model_parallel_size`` / ``tensor_model_parallel_size`` / ``expert_model_parallel_size`` / ``expert_tensor_parallel_size`` / ``context_parallel_size``): + Balance PP/TP/EP/ETP/CP to match memory and network constraints. 
In bf16/fp16, each parameter consumes roughly ``2 / TP`` bytes; if you keep FP32 master weights or skip optimizer offload, reserve another 4–8 bytes for Adam. Activations scale with ``micro_batch_size × sequence_length × hidden_size`` and can be mitigated with gradient checkpointing, dynamic batches, or offload. Prefer increasing TP first, add PP when necessary, extend sequence capacity with CP, align EP/ETP with TP for MoE models, and keep DP minimal on constrained clusters while combining with offload. Always align the setup with hardware topology and communication cost. + - ``actor_rollout_ref.model.use_fused_kernels``: + Enable Verl’s fused kernels for supported models to squeeze out additional performance. + +:math:`\hat{A}_{i,t}` + - ``algorithm.adv_estimator``: + Advantage estimator. Set to ``grpo`` for DAPO/GRPO. + +:math:`R_i` + - ``reward_model.reward_manager``: + Reward aggregation strategy. Use ``dapo`` for DAPO and ``naive`` for GRPO. + +:math:`D_{KL}` + - ``algorithm.use_kl_in_reward``: + Whether to add a KL term to the reward. ``True`` for PPO, ``False`` for GRPO and DAPO. + - ``actor_rollout_ref.actor.use_kl_loss``: + Whether to include a KL loss term. ``False`` for PPO, ``True`` for GRPO, ``False`` for DAPO. + +:math:`\beta` + - ``actor_rollout_ref.actor.kl_loss_coef``: + Weight of the KL loss. Start around 0.001. Larger values curb reward hacking but reduce exploration. + - ``algorithm.kl_ctrl.kl_coef``: + KL coefficient applied within the reward. Adjust to match your tolerance for divergence. + +:math:`\pi_{old}` + - ``actor_rollout_ref.rollout.log_prob_use_dynamic_bsz``: + Enable dynamic batching when the old policy computes log-probabilities. Recommended. + +:math:`\pi_{ref}` + - ``actor_rollout_ref.ref.log_prob_use_dynamic_bsz``: + Enable dynamic batching for the reference policy. Recommended. 
+ - Reference Megatron parallelism: + Keep ``pipeline_model_parallel_size``, ``tensor_model_parallel_size``, ``expert_model_parallel_size``, ``expert_tensor_parallel_size``, and ``context_parallel_size`` in sync with the actor. + - ``actor_rollout_ref.ref.megatron.param_offload``: + Offload reference parameters to CPU when the actor does so. Even without gradients or optimizer states, parity helps with capacity planning. + +:math:`o_i` / :math:`|o_i|` + - ``actor_rollout_ref.actor.loss_agg_mode``: + Loss aggregation mode. Token-level ``token-mean`` matches the recommendations from Dr.GRPO and DAPO; use ``seq-mean-token-mean`` to reproduce the original GRPO behavior. + +:math:`\pi_\theta(o_{i,t} \mid q_i,o_{i,`_ + - `SimonHuang `_ + +1.5B +~~~ + +.. list-table:: + :widths: auto + :header-rows: 1 + + * - Tag + - Model + - Task + - Resource + - MaxBatch + - Train + - Infer + - Link + - Contributor + * - MIN + - Qwen2.5-1.5B + - GRPO-LoRA + - 1*H100 + - 128 + - fsdp + - vllm0.8.3 + - `qwen2-1.5b_grpo-lora_1_h100_fsdp_vllm.sh `_ + - `SimonHuang `_ + +3B +~~~ + +.. list-table:: + :widths: auto + :header-rows: 1 + + * - Tag + - Model + - Task + - Resource + - MaxBatch + - Train + - Infer + - Link + - Contributor + * - MIN + - Qwen2.5-3B + - GRPO-LoRA + - 1*H100 + - 62 + - fsdp + - vllm0.8.3 + - `qwen2-3b_grpo-lora_1_h100_fsdp_vllm.sh `_ + - `SimonHuang `_ + +7B +~~~ + +.. list-table:: + :widths: auto + :header-rows: 1 + + * - Tag + - Model + - Task + - Resource + - MaxBatch + - Train + - Infer + - Link + - Contributor + * - MIN + - Qwen2-7B + - GRPO + - 2*H800 + - \ + - fsdp + - vllm0.8.2 + - `qwen2-7b_grpo_2_h800_fsdp_vllm `_ + - `Xiangyongan `_ + * - MIN + - Qwen2.5-7B + - GRPO-LoRA + - 1*H100 + - 16 + - fsdp + - vllm0.8.3 + - `qwen2-7b_grpo-lora_1_h100_fsdp_vllm.sh `_ + - `SimonHuang `_ + +14B +~~~ + +.. 
list-table:: + :widths: auto + :header-rows: 1 + + * - Tag + - Model + - Task + - Resource + - MaxBatch + - Train + - Infer + - Link + - Contributor + * - MIN + - Qwen2-14B + - GRPO + - 4*H800 + - \ + - fsdp + - vllm0.8.2 + - `qwen2-14b_grpo_4_h800_fsdp_vllm `_ + - `Xiangyongan `_ + * - MIN + - Qwen2.5-14B + - GRPO-LoRA + - 2*H100 + - 116 + - fsdp + - vllm0.8.3 + - `qwen2-14b_grpo-lora_2_h100_fsdp_vllm.sh `_ + - `SimonHuang `_ + +32B +~~~ + +.. list-table:: + :widths: auto + :header-rows: 1 + + * - Tag + - Model + - Task + - Resource + - MaxBatch + - Train + - Infer + - Link + - Contributor + * - MIN + - Qwen2-32B + - GRPO + - 8*H20 + - \ + - megatron + - vllm0.8.2 + - `qwen2-32b_grpo_8_h20_megatron_vllm `_ + - `Xiangyongan `_ + * - MIN + - Qwen2.5-32B + - GRPO-LoRA + - 4*H100 + - 180 + - fsdp + - vllm0.8.3 + - `qwen2-32b_grpo-lora_4_h100_fsdp_vllm.sh `_ + - `SimonHuang `_ + +70B +~~~ + +.. list-table:: + :widths: auto + :header-rows: 1 + + * - Tag + - Model + - Task + - Resource + - MaxBatch + - Train + - Infer + - Link + - Contributor + * - MIN + - Qwen2-70B + - GRPO + - 32*H20 + - \ + - fsdp + - vllm0.8.2 + - `qwen2-70b_grpo_32_h20_fsdp_vllm `_ + - `Xiangyongan `_ + * - MIN + - Qwen2-70B + - GRPO + - 32*H800 + - \ + - fsdp + - vllm0.8.3 + - `qwen2-70b_grpo_32_h800_fsdp_vllm `_ + - `Xiangyongan `_ + * - MIN + - Qwen2.5-72B + - GRPO-LoRA + - 8*H100 + - 176 + - fsdp + - vllm0.8.3 + - `qwen2-72b_grpo-lora_8_h100_fsdp_vllm.sh `_ + - `SimonHuang `_ + +405B +~~~~ + +.. table:: + :widths: auto + + ====== ====== ====== ======== ======== ====== ====== ====== + tag model task resource MaxBatch train infer link + ====== ====== ====== ======== ======== ====== ====== ====== + \ \ \ \ \ \ \ + ====== ====== ====== ======== ======== ====== ====== ====== + +671B +~~~~ + +.. 
table:: + :widths: auto + + ====== ====== ====== ======== ======== ====== ====== ====== + tag model task resource MaxBatch train infer link + ====== ====== ====== ======== ======== ====== ====== ====== + \ \ \ \ \ \ \ + ====== ====== ====== ======== ======== ====== ====== ====== diff --git a/code/RL_model/verl/verl_train/docs/perf/dpsk.md b/code/RL_model/verl/verl_train/docs/perf/dpsk.md new file mode 100644 index 0000000000000000000000000000000000000000..7ea5bd196c3a63cc8d5e06189eb8dc92400136ce --- /dev/null +++ b/code/RL_model/verl/verl_train/docs/perf/dpsk.md @@ -0,0 +1,88 @@ +# Training DeepSeek 671b + +Last updated: 08/20/2025. + +verl integrates Megatron to support large MoE models such as `Qwen3-235B-A22B` and `deepseek-ai/DeepSeek-V3`. This is an ongoing community effort. + +In the journey the community added the following features and optimizations that enable verl with larger models: +- per tensor weight resharding between rollout and training +- context parallelism and expert parallelism enabled via megatron +- dynamic batch size (sequence balance) for megatron +- reduced ray-related serialization overhead +- optimizer offloading, recomputation, and efficient kernels +- various debugging metrics and utils +- hybrid optimizer + +and the megatron backend now has a wider list of models supported: +- DeepSeek-V3 +- Moonlight +- Qwen3 +- Qwen2.5-VL (to be merged soon) +- Qwen2 +- Mixtral + +## Getting Started + +### preparation +The recommended image with pre-built Megatron dependency is `verlai/verl:app-verl0.4-vllm0.8.5-mcore0.13.0-preview`, which is built using the Dockerfile at [docker/verl0.4-cu124-torch2.6-fa2.7.4/Dockerfile.app.vllm.mcore0.13.preview](https://github.com/volcengine/verl/blob/main/docker/verl0.4-cu124-torch2.6-fa2.7.4/Dockerfile.app.vllm.mcore0.13.preview). + +The image is built on Hopper GPUs with DeepEP. It does not support non-Hopper GPUs, such as A100. You may need to reinstall DeepEP to work with A100.
+ +With `OFFLOAD_FRACTION=1`, the system's minimum requirements are lowered. It can run on as few as 96 H20 (96GB) GPUs for DeepSeek-V3, and on as few as 32 H20 (96GB) GPUs for Qwen3-235B-A22B. However, this configuration will use 1.6TB CPU memory per node. If you run out of CPU memory or require faster training speed, you can add more nodes. + +### DeepSeek 671b + +For DeepSeek-V3 671b, please refer to [examples/grpo_trainer/run_deepseek671b_math_megatron_96gb.sh](https://github.com/volcengine/verl/blob/main/examples/grpo_trainer/run_deepseek671b_math_megatron_96gb.sh). + +MTP and quantization are disabled during RL training. + +To train your project, configure the following environment variables based on the number of available GPUs. These are recommended settings and can be adjusted based on your specific hardware. +| num gpus | NNODES | TP | PP | EP | OFFLOAD_FRACTION | OFFLOAD_OPTIM | LAST_LAYER | +| -- | -- | -- | -- | -- | -- | -- | -- | +| 96 | 12 | 8 | 12 | 8 | 1. | False | 6 | +| 128 | 16 | 8 | 16 | 8 | 0.5 | True | 1 | +| 256 | 32 | 8 | 16 | 8 | 0. | True | 1 | +| 512 | 64 | 1 | 16 | 32 | 0 | True | 1 | + +### Qwen3 235b + +For Qwen3-235b, please refer to [examples/grpo_trainer/run_qwen3-235b_megatron_96gb.sh](https://github.com/volcengine/verl/blob/main/examples/grpo_trainer/run_qwen3-235b_megatron_96gb.sh). + +To train your project, configure the following environment variables based on the number of available GPUs. These are recommended settings and can be adjusted based on your specific hardware. +| num gpus | NNODES | TP | PP | EP | OFFLOAD_FRACTION | OFFLOAD_OPTIM | LAST_LAYER | +| -- | -- | -- | -- | -- | -- | -- | -- | +| 32 | 4 | 4 | 8 | 4 | 1. | False | 6 | +| 64 | 8 | 4 | 8 | 4 | 0.5 | True | 6 | +| 128 | 16 | 4 | 8 | 4 | 0 | True | 6 | +| 256 | 32 | 4 | 8 | 4 | 0 | True | 6 | + +### Benchmark +Here are some benchmark results for DeepSeek / Qwen3-235B. All configurations match the recommended settings based on the number of GPUs.
+ +| model | num gpus | mean response length | rollout time(s) | GPU memory(GB) | CPU memory(GB) | MFU | step time(s) | +| -- | -- | -- | -- | -- | -- | -- | -- | +| DeepSeek 671b | 96 | 1960 | 1050 | 66 | 1500 | 0.19 | 1700 | + +### Qwen3-30B-A3B MOE + +For Qwen3-30b, please refer to [examples/grpo_trainer/run_qwen3moe-30b_megatron_96gb.sh](https://github.com/volcengine/verl/blob/main/examples/grpo_trainer/run_qwen3moe-30b_megatron_96gb.sh). + +To train your project, configure the following environment variables based on the number of available GPUs. These are recommended settings and can be adjusted based on your specific hardware. +| num gpus | NNODES | TP | PP | EP | OFFLOAD_FRACTION | OFFLOAD_OPTIM | MFU | +| -- | -- | -- | -- | -- | -- | -- | -- | +| 8 | 1 | 1 | 1 | 8 | 1. | True | 0.4 | +| 16 | 2 | 1 | 1 | 8 | 1. | True | 0.37 | +| 32 | 4 | 1 | 1 | 8 | 1. | True | 0.31 | + + +## Upcoming Optimizations + +The community continues to optimize large MoE models further; ongoing efforts include: +- further optimizing memory consumption, and providing recommended/tuned configurations with various machine types +- optimizing long context RL training performance +- performance improvement with SGLang x Megatron + +We invite the community to try and improve verl together. Get connected with us on [slack](https://join.slack.com/t/verlgroup/shared_invite/zt-2w5p9o4c3-yy0x2Q56s_VlGLsJ93A6vA)/[wechat](https://raw.githubusercontent.com/eric-haibin-lin/verl-community/refs/heads/main/WeChat.JPG)/[Github issues](https://github.com/volcengine/verl/issues/708)!
+ +## Acknowledgement +@vermouth1992 @ISEEKYAN @ETOgaosion @yzlnew @ShareLer @BearBiscuit05 @ccclyu @ann-qin-lu @SwordFaith @zzong2006 @zhaochenyang20 @ocss884 @eric-haibin-lin @chenhaiq @techkang diff --git a/code/RL_model/verl/verl_train/docs/perf/nsight_profiling.md b/code/RL_model/verl/verl_train/docs/perf/nsight_profiling.md new file mode 100644 index 0000000000000000000000000000000000000000..490de5e7e4f7b6ba6c0e372eb7c0c3bfce2a77b9 --- /dev/null +++ b/code/RL_model/verl/verl_train/docs/perf/nsight_profiling.md @@ -0,0 +1,94 @@ +# NVIDIA Nsight Systems profiling in verl + +Last updated: 06/20/2025. + +This guide explains how to use NVIDIA Nsight Systems for profiling verl training runs. + +## Configuration + +Profiling in verl can be configured through several parameters in the trainer configuration file (ppo_trainer.yaml or other files like dapo_trainer.yaml): + +### Prerequisites + +Nsight Systems version is important, please reference `docker/Dockerfile.vllm.sglang.megatron` for the version we used. + +### Global profiling control + +verl has one single controller process and multiple worker processes. Both controller and worker processes can be profiled. Since the controller process can be executed in any nodes in the cluster, there is a message printed in the logging to indicate the controller process node hostname and process id. + +In `global_profiler`, three new config entries control the profiler behaviors: + +* **`global_profiler.steps`**. List of step numbers at which profiling should be performed. For example: [1, 2, 5] will profile steps 1, 2, and 5. And ``null`` means no profiling. + +* **`global_profiler.profile_continuous_steps`**. If true, and the following `global_profiler.discrete==False`, then the continuous steps in `global_profiler.steps` will be combined into one database. For example the above step 1 and 2 are in one database, and 5 in another. If false, every step occupies at least one database. 
The reason for this config is to observe the program behaviors between steps. + +Nsys options in controller nodes and worker nodes are configured in `global_profiler.global_tool_config.nsys`: + +* **`global_profiler.global_tool_config.nsys.controller_nsight_options`**. This config group is for the single controller. All fields in this config group will be just sent to Nsight Systems when Ray starts the controller process. `ppo_trainer.yaml` provides a workable example. Users can reference [Nsight Systems manual](https://docs.nvidia.com/nsight-systems/UserGuide/index.html) and [Ray user guide](https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html) for more details. +* **`global_profiler.global_tool_config.nsys.worker_nsight_options`**. This config group is for the worker processes. Similarly all fields in this config group will be just sent to Nsight Systems when Ray starts the worker processes. Capture range is used to control when the profiler starts and stops, so `capture-range: "cudaProfilerApi"` is fixed and should not be changed. Users can change `capture-range-end` with some accurate calculation or just leave it `null`. + +### Worker process profiling + +Verl manages multiple RL roles, _Actor_, _Ref_, _Rollout_, _Critic_, _Reward_, which are implemented in different Worker classes. And these workers can be combined into one Ray Actor, running in a process group. Each RL role has its own profiling config group, `profiler`, which consists of three fields: + +* **`all_ranks` and `ranks`**. When `all_ranks` is set `True` then all ranks will be profiled; when set `False`, `ranks` will be profiled. By default, verl profiles the whole training process in a series of `worker_process_<PID>.<RID>.nsys-rep` files for each process rank. PID is the process ID; RID is the capture range ID. +* **`discrete`**. When set `False`, all the roles' actions in one training step will be dumped in one database.
When set `True`, the actions annotated by `DistProfiler.annotate` will be dumped into a discrete database. In this case, each role's action occupies one database. +* **Verl collocate mode**. Verl can combine two Worker subclasses into one Worker Actor. In this case, the user should take care that the combined Workers have consistent `discrete`. The Nsight Systems profiler uses a `torch.cuda.profiler.start()` and `stop()` pair to dump a database anyway. + +### where to find the profiling data + +By default the `*.nsys-rep` files are saved in the directory `/tmp/ray/session_latest/logs/nsight/` at each node. According to the Ray manual, this default directory is not changeable. ["however, Ray preserves the `--output` option of the default config"](https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html). + +Some users may think it is not convenient, but it is understandable that Ray may start hundreds of processes and it would be a big network file system pressure if we save the files in one central place. + +## Usage Example + +To enable profiling for specific components and steps, modify your ppo_trainer.yaml like this: + +### Disable profiler + +```yaml + profiler: + steps: null # disable profile +``` + +### Enable profiler and one database for one training step + +```yaml + global_profiler: + steps: [1, 2, 5] + discrete: False + actor_rollout_ref: + actor: + profiler: + enable: True + all_ranks: True + # rollout & ref follow actor settings + critic: + profiler: + enable: True + all_ranks: True + reward_model: + profiler: + enable: True + all_ranks: True +``` + +### Enable profiler and multiple databases for one training step + +```yaml + profiler: + steps: [1, 2, 5] + discrete: True +``` + +## Profiling Output + +When profiling is enabled, verl will generate Nsight Systems profiles for the specified components and steps.
The profiles will include: + +- CUDA kernel execution +- Memory operations +- CPU-GPU synchronization +- NVTX markers for key operations + +Nsight Systems supports multi-report view, to open multiple databases together. In this mode, different processes and steps can be aligned in one time line for better analysis. diff --git a/code/RL_model/verl/verl_train/docs/perf/perf_tuning.rst b/code/RL_model/verl/verl_train/docs/perf/perf_tuning.rst new file mode 100644 index 0000000000000000000000000000000000000000..b5edd50c4dfc88afdf18f2525c44fb882dc96eaf --- /dev/null +++ b/code/RL_model/verl/verl_train/docs/perf/perf_tuning.rst @@ -0,0 +1,224 @@ +Performance Tuning Guide +============================== + +Last updated: 07/17/2025. + +Author: `Guangming Sheng `_, `Jiali Zheng `_ + +In this section, we will discuss how to tune the performance of all the stages in verl, including: + +1. Rollout generation throughput. + +2. Enable ``use_remove_padding=True`` for sequence packing (i.e., data packing and remove padding). + +3. Batch size tuning for forward and backward computation + +4. Enable ``use_dynamic_bsz=True`` for higher throughput. + +5. Utilize Ulysses Sequence Parallel for Long Context Training + +6. LigerKernel for SFT performance optimization + +7. Forward prefetch in FSDP training backend + +8. Memory optimization for entropy calculation from logits + +Rollout Generation Tuning +-------------------------- + +verl currently supports two rollout backends: vLLM and TGI (with SGLang support coming soon). + +Below are key factors for tuning vLLM-based rollout. Before tuning, we recommend setting ``actor_rollout_ref.rollout.disable_log_stats=False`` so that rollout statistics are logged. + +- Increase ``gpu_memory_utilization``. + + - For vLLM v0.7.0 and later, the vLLM instance will only use gpu_memory_utilization of the **total** memory. + - For SGLang, it's the fraction of the free GPU memory used for **static** memory like model weights and KV cache. 
However, the remaining (1-gpu_memory_utilization) will also be used during inference. + + However, if model parameters and optimizer states are not offloaded, using too high a fraction can lead to OOM. + A value between 0.5 and 0.7 often strikes a good balance between high throughput and avoiding OOM. + + Note: since the definition of ``gpu_memory_utilization`` varies across inference engines, a value that works well for one engine may cause OOM for another. + +- Adjust ``max_num_seqs`` or ``max_num_batched_tokens``. + If the GPU cache utilization is relatively low in the log, increasing ``max_num_seqs`` or ``max_num_batched_tokens`` + can enlarge the effective batch size in the decoding stage, allowing more concurrent requests per batch. + We recommend setting ``max_num_batched_tokens > 2048`` for higher throughput. + +- Use a smaller ``tensor_parallel_size``. + When GPU resources allow, a smaller tensor parallel size spawns more vLLM replicas. + Data parallelism (DP) can yield higher throughput than tensor parallelism (TP), but also increases KVCache consumption. + Carefully balance the trade-off between more replicas and higher memory usage. + Our experiment in Sec. 8.4 of `HybridFlow paper `_ evaluates this trade-off. + +- Balance performance and memory using ``cudagraph_capture_sizes``. + If ``cudagraph_capture_sizes`` is set, vLLM will try to capture the model execution graph for different batch sizes. + Since CUDA graph memory cannot be offloaded to the CPU, it stays in GPU memory while the actor update is running. + Using smaller batch sizes can avoid OOM but slightly reduce throughput. + You must set ``enforce_eager=False`` to use ``cudagraph_capture_sizes``. + +More tuning details such as dealing with Preemption and Chunked-prefill +can be found in `vLLM official tuning guide `_ + +For optimal performance, we recommend using vLLM v0.8.3 or later. See https://github.com/volcengine/verl/blob/main/docs/README_vllm0.8.md for details.
+ +Enable remove padding (sequence packing) +----------------------------------------- + +Currently, for llama, mistral, gemma1 and qwen based models, users can enable `use_remove_padding=True` to utilize the +sequence packing implementation provided by transformers library. + +For other models, transformers library may also support it but we haven't tested it yet. +Users can add the desired model config to the `test_transformer.py `_ file. +And test its functionality by running the following command: + +.. code-block:: bash + + pytest -s tests/models/test_transformer.py + +If the test passes, you can add your desired model into the model `registry.py `_ file. +Then, you can enjoy the performance boost of sequence packing +and welcome to PR your tested model to verl! + + +Batch Size Tuning +----------------- + +To achieve higher throughput in experience preparation (i.e., model fwd) and model update (i.e., actor/critic fwd/bwd), +users may need to tune the ``*micro_batch_size_per_gpu`` for different computation. + +In verl, the core principle for setting batch sizes is: + +- **Algorithmic metrics** (train batch size, PPO mini-batch size) are *global* (from a single-controller perspective), + normalized in each worker. See the `normalization code `_. + +- **Performance-related parameters** (micro batch size, max token length for dynamic batch size) are *local* parameters that define the per-GPU data allocations. + See the `normalization code `_. + +.. note:: In your training script, please use ``*micro_batch_size_per_gpu`` instead of ``*micro_batch_size``. + So that you don't need to consider the normalization of the ``micro_batch_size`` and ``micro_batch_size`` will be deprecated. + +Batch Size Tuning tips +"""""""""""""""""""""" + +Therefore, users may need to tune the ``*micro_batch_size_per_gpu`` to accelerate training. Here're some tips: + +1. 
**Enable gradient checkpointing**: + Set ``actor_rollout_ref.model.enable_gradient_checkpointing=True`` and ``critic.model.enable_gradient_checkpointing=True``. + This often allows for larger micro-batch sizes and will be beneficial for large mini-batch training. + +2. Increase the ``*micro_batch_size_per_gpu`` as much as possible till equals to normalized ``mini_batch_size``. + +3. **Use larger forward-only parameters**: + Forward only parameter, such as ``actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu``, + ``actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu``, ``critic.forward_micro_batch_size_per_gpu`` could be larger (e.g., 2x) than training related micro batch sizes, + such as ``actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu``, ``critic.ppo_micro_batch_size_per_gpu``. + +4. **Allow larger micro-batch sizes for Critic and Reward models**: + micro batch size of Critic and Reward model could be larger than Actor model. This is because the actor model has much larger vocab size in the final layer. + +5. **Enable activation offloading**: + Set ``actor_rollout_ref.model.enable_activation_offload=True`` and ``critic.model.enable_activation_offload=True``. + This often works together with gradient checkpointing to get larger micro-batch sizes and it's only available in FSDP backend now. + +Tuning for Dynamic Batch Size +----------------------------- + +Dynamic batch size is a technique that allows the model to process similar number of tokens in a single forward pass (with different actual batch sizes). +This can significantly improve the training efficiency and reduce the memory usage. + +To utilize this technique, users can set ``use_dynamic_bsz=True`` in actor, ref, critic and reward models. +With ``use_dynamic_bsz=True``, users don't need to tune ``*micro_batch_size_per_gpu``. 
+ +Instead, users should tune the following parameters: + +- ``actor_rollout_ref.actor.ppo_max_token_len_per_gpu``, ``critic.ppo_max_token_len_per_gpu``: + The maximum number of tokens to be processed in fwd and bwd of ``update_policy`` and ``update_critic``. + +- ``actor_rollout_ref.ref.log_prob_max_token_len_per_gpu`` and ``actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu``: + The maximum number of tokens to be processed in the fwd computation of ``compute_log_prob`` and ``compute_ref_log_prob``. + +- ``critic.forward_micro_batch_size_per_gpu``, ``reward_model.forward_micro_batch_size_per_gpu``: + The maximum number of tokens to be processed in the fwd computation of ``compute_values``, ``compute_rm_score``. + +Dynamic Batch Size Tuning tips +"""""""""""""""""""""""""""""" + +Here are some tips to tune the above parameters: + +1. **Increase** ``actor_rollout_ref.actor.ppo_max_token_len_per_gpu`` + Make it at least 2 x (max_prompt_length + max_response_length). We set it to 3x in `run_qwen2-7b_rm_seq_balance.sh `_. + Try to increase it to get higher throughput. + +2. **Forward-only parameters can be larger**: + Similar to the non-dynamic-batch scenario, forward-only token limits can exceed those used in forward/backward operations. + +3. **Use larger limits for Critic and Reward models**: + Critic and Reward parameters can be set at least 2× the Actor’s limits. For instance, we set them to 4× here: + `run_qwen2-7b_rm_seq_balance.sh `_ + +.. :math:`\text{critic.ppo_max_token_len_per_gpu} = 2 \times \text{actor.ppo_max_token_len_per_gpu})`. + +Ulysses Sequence Parallel for Long Context Training +---------------------------------------------------- + +To utilize this technique, users can set ``ulysses_sequence_parallel_size>1`` in actor, ref, critic and reward models. + +We support different models using different ``ulysses_sequence_parallel_size`` values.
+ +To train long sequence (>32k), users may need to decrease the ``*micro_batch_size_per_gpu`` and ``*max_token_len_per_gpu`` to avoid OOM. + +LigerKernel for SFT +---------------------- + +LigerKernel is a high-performance kernel for Supervised Fine-Tuning (SFT) that can improve training efficiency. To enable LigerKernel in your SFT training: + +1. Install liger-kernel via ``pip3 install liger-kernel``. In your SFT configuration file (e.g., ``verl/trainer/config/sft_trainer.yaml``), set the ``use_liger`` parameter: + + .. code-block:: yaml + + model: + use_liger: True # Enable LigerKernel for SFT + +2. The default value is ``False``. Enable it only when you want to use LigerKernel's optimizations. + +3. LigerKernel is particularly useful for improving training performance in SFT scenarios. + +Forward prefetch in FSDP training backend +---------------------- + +During the training phase, users can enable forward prefetching in FSDP by setting ``fsdp_config.forward_prefetch=True``. For example, ``actor_rollout_ref.actor.fsdp_config.forward_prefetch=True``. This configuration prefetches the next forward-pass all-gather operation before completing the current forward computation, overlapping communication with computation and improving efficiency. For further details, refer to the `FSDP forward_prefetch `_ documentation. + +.. note:: + Backward prefetch is unsupported because the ``BACKWARD_POST`` policy may prefetch incorrectly in nested-module cases. For details, see the `FSDP documentation `_ + +Migrating to FSDP2 +---------------------- + +FSDP2 offers notable improvements over FSDP1. According to `PyTorch TorchTitan benchmarks `_: + +- 7% lower GPU memory usage on average +- 1.5% throughput improvement with BF16 training +- Better composability with DTensor and per-parameter sharding + +**Enabling FSDP2 in VERL:** + + .. code-block:: python + + # Enable FSDP2 in actor configuration + actor_rollout_ref.actor.strategy="fsdp2" + +.. 
note:: + FSDP2 requires PyTorch 2.1+ and is recommended for models with transformer architecture. + +Memory optimization for entropy calculation from logits +---------------------- + +The ``logits`` tensor (typically of shape ``[bsz*seq_len, voc]``) can consume significant memory. When using ``compute_entropy_from_logits``, memory usage reaches approximately ``[bsz*seq_len, voc] × (4 bytes (float32) + 2 bytes (autocast for softmax+logsumexp) + 1 byte (softmax output))``. + +To reduce this memory peak, enable chunked computation by setting: +``actor_rollout_ref.ref.entropy_from_logits_with_chunking = True`` +This processes the tensor in chunks of shape ``[chunk_size, voc]`` (e.g., 2048) rather than the full sequence length, exclusively during the model's forward pass. + +Additionally, during training, standard gradient checkpointing (``enable_gradient_checkpointing=True``) does not apply to entropy calculations. To reduce memory peaks in this context, set: +``actor_rollout_ref.actor.entropy_checkpointing = True`` +This enables entropy recomputation specifically for the entropy calculation, lowering memory usage during training. diff --git a/code/RL_model/verl/verl_train/docs/perf/torch_profiling.md b/code/RL_model/verl/verl_train/docs/perf/torch_profiling.md new file mode 100644 index 0000000000000000000000000000000000000000..3c2b67ea84881e2a5249f5b8f435d0cf80747289 --- /dev/null +++ b/code/RL_model/verl/verl_train/docs/perf/torch_profiling.md @@ -0,0 +1,117 @@ +# PyTorch Profiling in verl + +Last updated: 01/13/2026. + +This guide explains how to use the native [PyTorch Profiler](https://pytorch.org/tutorials/recipes/recipes/profiler_recipe.html) for profiling verl training runs. + +## Configuration + +Profiling in verl can be configured through parameters in the trainer configuration file (e.g., `ppo_trainer.yaml`). 
+ +### Global Profiling Control + +In `global_profiler`, you can control when and how profiling occurs globally: + +* **`global_profiler.steps`**: List of step numbers to profile. E.g., `[1, 2, 5]` profiles steps 1, 2, and 5. Set to `null` to disable. +* **`global_profiler.save_path`**: Directory to save the profiling results. Default is `outputs/profile`. + +### Role Profiling Control + +Each RL role (Actor, Critic, etc.) has its own `profiler` configuration: + +* **`enable`**: Whether to enable profiling for this role. +* **`all_ranks`**: If `True`, profiles all ranks. +* **`ranks`**: List of specific ranks to profile if `all_ranks` is `False`. +* **`tool_config.torch`**: Configuration specific to the PyTorch Profiler. + +#### PyTorch Profiler Options (`tool_config.torch`) + +You can customize the PyTorch Profiler behavior using the following fields under `tool_config.torch`: + +* **`contents`**: List of contents to profile. + * **`cpu`**: Profile CPU activities. + * **`cuda`**: Profile CUDA activities. + * **`memory`**: Track tensor memory allocation/free. + * **`shapes`**: Record shapes of operator inputs. + * **`stack`**: Record source code file and line number. +* **`schedule`**: (Advanced) configuration for `wait`, `warmup`, `active`, `repeat` cycles. + +## Examples + +### 1. End-to-End Collection + +Collects performance data for all steps in a single trace file. + +```yaml +global_profiler: + steps: [1, 2, 5] + save_path: ./outputs/profile + +actor_rollout_ref: + actor: + profiler: + enable: True + all_ranks: True + tool_config: + torch: + discrete: False + contents: [cpu, cuda] + # rollout & ref follow actor settings +``` + +### 2. Discrete Mode Collection + +Discrete mode saves separate trace files for each step. This is useful for detailed analysis and is **mandatory** when using Agent Loop. + +**Configuration Example** + +This configuration supports profiling both Training (Actor) and Inference (Rollout). You can enable/disable them independently. 
+ +```yaml +actor_rollout_ref: + actor: + profiler: + enable: True # Set to True to profile training + all_ranks: False + ranks: [0] # Global Rank 0 + tool_config: + torch: + discrete: True + contents: [cpu, cuda] + rollout: + profiler: + enable: True # Set to True to profile inference + all_ranks: False + ranks: [0] # In Agent Loop, this is the Replica Rank (e.g. 0-th instance) + tool_config: + torch: + discrete: True # REQUIRED + # ref follow actor settings +``` + +> **Note for Agent Loop Mode**: +> When using Agent Loop, `ranks` in rollout config refers to the **Replica Rank** (instance index), not the global rank. + +**Inference Backend Setup (for Agent Loop)** + +* **vLLM Engine**: + * **Environment Variables Required**: + * `VLLM_TORCH_PROFILER_DIR`: **(Required)** Directory to save traces (e.g., `/mnt/traces`). + * `VLLM_TORCH_PROFILER_WITH_STACK`: `1` to enable stack tracing (default). + * `VLLM_TORCH_PROFILER_RECORD_SHAPES`: `1` to record shapes of operator inputs. + * `VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY`: `1` to track tensor memory allocation/free. + * `VLLM_TORCH_PROFILER_WITH_FLOPS`: `1` to estimate FLOPS. + * *Note: vLLM ignores the `save_path` and `contents` in `ppo_trainer.yaml`.* + +* **SGLang Engine**: + * **Zero Configuration**: Automatically uses the settings from `ppo_trainer.yaml`. + +## Visualization + +Collected trace files (usually `.json` or `.json.gz`) are stored in the configured `save_path`. + +You can visualize them using: + +1. **Chrome Tracing**: Open `chrome://tracing` in a Chrome browser and load the JSON file. +2. **Perfetto**: Open [ui.perfetto.dev](https://ui.perfetto.dev/) and load the file (recommended for large traces). +3. **TensorBoard**: If using the TensorBoard plugin for PyTorch Profiler. 
diff --git a/code/RL_model/verl/verl_train/docs/perf/verl_profiler_system.md b/code/RL_model/verl/verl_train/docs/perf/verl_profiler_system.md new file mode 100644 index 0000000000000000000000000000000000000000..fc7ecc38eed92ca5e05274e23f40b6f1ce7033b0 --- /dev/null +++ b/code/RL_model/verl/verl_train/docs/perf/verl_profiler_system.md @@ -0,0 +1,36 @@ +# verl Profiler System + +Last updated: 08/18/2025. + +## Architecture + +The architecture of the verl profiler system is shown below: + +![verl-profiler-arch](https://raw.githubusercontent.com/eric-haibin-lin/verl-community/2bc7ed0ba2f37f21707bfac3b241eca4b86d1bc6/docs/verl_profiler_arch.png) + +There is a global profiler and tool configuration to set some common config at the single-controller level, deciding + +- `tool`: which tool to use +- `steps`: which steps to profile +- `save_path`: results saving path + +When a tool needs to profile the behavior of each role, role-level configurations are needed: + +- `tool`: which tool to use +- `enable`: whether to enable profiling for this role +- rank info: `all_ranks` and `rank` to decide which rank to profile or log output + +For the role-level tool config, there is some detailed behavior to control, like the `discrete` mode in the nsys profiler. + +Every role has a profiler config, and by default, rollout/ref/reward models follow the Actor's behavior. + +## To Add a new profiling tool + +A newly added profiling tool shall reuse the current APIs as much as possible. + +1. The logic of **whether to use the tool**: `tool == [new tool]`. +2. Add the global and local tool config to `ppo_trainer.yaml`/`ppo_megatron_trainer.yaml` and each `[role].yaml`, under `global_tool_config.[new tool]` and `tool_config.[new tool]` +3. The tool config should be implemented in `verl/utils/profiler/config.py`, inheriting the `BaseConfig` class. +4. 
Implement profiling tool initialization logic using configurations in `global_profiler.global_tool_config.[new tool]` and the results saving logics (can also save in role-level profile) +5. For role function-level profiling, please follow the nsys profiler way in `nvtx_profiler.py`: implement a profiler class that inherits `DistProfiler`, and import the new profiler in `verl/utils/profiler/__init__.py` +6. Add unit tests and examples for others to use conveniently. \ No newline at end of file diff --git a/code/RL_model/verl/verl_train/docs/preparation/prepare_data.rst b/code/RL_model/verl/verl_train/docs/preparation/prepare_data.rst new file mode 100644 index 0000000000000000000000000000000000000000..c429e4b167967652a0c3fb52d9e0029f1b9899d4 --- /dev/null +++ b/code/RL_model/verl/verl_train/docs/preparation/prepare_data.rst @@ -0,0 +1,128 @@ +Prepare Data for Post-Training +======================================== + +Last updated: 02/09/2025. + +Before starting the post-training job, we need to prepare the data for +the policy training. The data should be stored in the parquet format. + +We provide several data preprocess scripts for different datasets, +including GSM8K, MATH, Hellaswag, Full_hh_rlhf. To prepare other datasets, we need +to follow the following steps: The data preprocess script can be divided +into two parts: + +1. The first part is the common part, which loads the dataset from + huggingface's ``datasets`` package. Then preprocess the datasets with + the ``make_map_fn`` and then store in the parquet format. + +.. code:: python + + import re + import os + import datasets + + from verl.utils.hdfs_io import copy, makedirs + import argparse + + # To extract the solution for each prompt in the dataset + # def extract_solution(solution_str): + # ... 
+ + + if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--local_dir', default='/opt/tiger/gsm8k') + parser.add_argument('--hdfs_dir', default=None) + + args = parser.parse_args() + + num_few_shot = 5 + data_source = 'openai/gsm8k' + + dataset = datasets.load_dataset(data_source, 'main') + + train_dataset = dataset['train'] + test_dataset = dataset['test'] + + # Construct a `def make_map_fn(split)` for the corresponding datasets. + # ... + + train_dataset = train_dataset.map(function=make_map_fn('train'), with_indices=True) + test_dataset = test_dataset.map(function=make_map_fn('test'), with_indices=True) + + local_dir = args.local_dir + hdfs_dir = args.hdfs_dir + + train_dataset.to_parquet(os.path.join(local_dir, 'train.parquet')) + test_dataset.to_parquet(os.path.join(local_dir, 'test.parquet')) + + makedirs(hdfs_dir) + + copy(src=local_dir, dst=hdfs_dir) + +2. The users are required to implement the ``make_map_fn()`` function + (as well as the ``extract_solution``) on their own to support + different datasets or tasks. + +We already implemented the data preprocess of GSM8k, MATH, Hellaswag and Full_hh_rlhf +datasets. And we take the GSM8k dataset as an example: + +**GSM8K** + +In the ``make_map_fn``, each data field should consist of the following +5 fields: + +1. ``data_source``: The name of the dataset. To index the corresponding + reward function in the ``RewardModel`` +2. ``prompt``: This field should be constructed in the format of + huggingface chat_template. The tokenizer in ``RLHFDataset`` will + apply chat template and tokenize the prompt. +3. ``ability``: Define the task category. +4. ``reward_model``: Currently, we only utilize the ``ground_truth`` + field during evaluation. The ``ground_truth`` is computed by the + ``extract_solution`` function. **NOTED** that the implementation of + the corresponding reward function should align with this extracted + ``ground_truth``. +5. 
``extra_info``: Record some information of the current prompt. Not + use for now. + +.. code:: python + + def extract_solution(solution_str): + solution = re.search("#### (\\-?[0-9\\.\\,]+)", solution_str) # extract the solution after #### + assert solution is not None + final_solution = solution.group(0) + final_solution = final_solution.split('#### ')[1].replace(',', '') + return final_solution + + instruction_following = "Let's think step by step and output the final answer after \"####\"." + + # add a row to each data item that represents a unique id + def make_map_fn(split): + + def process_fn(example, idx): + question = example.pop('question') + + question = question + ' ' + instruction_following + + answer = example.pop('answer') + solution = extract_solution(answer) + data = { + "data_source": data_source, + "prompt": [{ + "role": "user", + "content": question + }], + "ability": "math", + "reward_model": { + "style": "rule", + "ground_truth": solution + }, + "extra_info": { + 'split': split, + 'index': idx + } + } + return data + + return process_fn diff --git a/code/RL_model/verl/verl_train/docs/preparation/reward_function.rst b/code/RL_model/verl/verl_train/docs/preparation/reward_function.rst new file mode 100644 index 0000000000000000000000000000000000000000..286e2aff49fea71e34ac706d509725cc94aece13 --- /dev/null +++ b/code/RL_model/verl/verl_train/docs/preparation/reward_function.rst @@ -0,0 +1,71 @@ +Implement Reward Function for Dataset +====================================== + +Last updated: 06/02/2025. + +For each dataset, we need to implement a reward function or utilize a reward model to compute the rewards for the generated responses. +We already pre-implemented some reward functions in `reward_score directory `_. +You can also use customized reward functions. + +Currently, we support reward functions for GSM8k and MATH datasets. 
For RLHF datasets (e.g., +full_hh_rlhf) and Code Generation (e.g., APPS), we utilize reward model +and SandBox (will opensource soon) for evaluation respectively. + +RewardManager +------------- + +In the entrypoint of the PPO Post-Training script `main_ppo.py `_, +we implement a ``RewardManager`` that utilize pre-implemented reward functions to compute the scores for each response. + +In the ``RewardManager``, we implemented a ``__call__`` function to +compute the score for each response. +All the reward functions are executed by ``compute_score_fn``. +The input is a ``DataProto``, which includes: + +- ``input_ids``, ``attention_mask``: ``input_ids`` and ``attention_mask`` after applying + chat_template, including prompt and response +- ``responses``: response tokens +- ``ground_truth``: The ground truth string of the current prompt. + Stored in ``non_tensor_batch`` in the ``DataProto``, which should be + preprocessed in the parquet files. +- ``data_source``: The dataset name of the current prompt. Stored in + ``non_tensor_batch`` in the ``DataProto``, which should be + preprocessed in the parquet files. + +After detokenize the responses, the responses string and the ground +truth string will be input to the ``compute_score_fn`` to compute the +score for each response. + +Reward Functions +---------------- + +Pre-implemented +~~~~~~~~~~~~~~~ + +We already pre-implemented some reward functions in `reward_score directory `_. + +- In the `GSM8k example `_, we + force the response to output the final answer after four ####, then + use string matching to compare with the ground truth. If completely + correct, score 1 point; if the format is correct, score 0.1 points; if + the format is incorrect, score 0 points. +- In the `MATH example `_, we follow + the implementation in `lm-evaluation-harness repository `_. 
+ +Customized +~~~~~~~~~~ + +You can implement customized reward functions in a separate file and specify them using ``custom_reward_function.path`` and ``custom_reward_function.name``. For the set of them, please refer to :ref:`config-explain-page`. + +The parameters of your reward function should be ``data_source``, ``solution_str``, ``ground_truth``, and ``extra_info``. +For example: + +.. code:: python + + def my_reward_fn(data_source, solution_str, ground_truth, extra_info=None): + return len(solution_str)/100 + +If you are testing only a single customized reward function, you can simply name it 'compute_score' and leave ``custom_reward_function.name`` unset. + +To run multiple tests with different customized reward functions, you can modify both ``custom_reward_function.path`` and ``custom_reward_function.name`` for each trial. +For instance, you might create a single `my_reward.py` file and implement multiple reward functions within it. This way, for different trials, you only need to adjust ``custom_reward_function.name``, making it more convenient to conduct multiple tests within scripts. diff --git a/code/RL_model/verl/verl_train/docs/sglang_multiturn/interaction_system.rst b/code/RL_model/verl/verl_train/docs/sglang_multiturn/interaction_system.rst new file mode 100644 index 0000000000000000000000000000000000000000..812a9484eb264d79500bd0aba9607d43146bd01c --- /dev/null +++ b/code/RL_model/verl/verl_train/docs/sglang_multiturn/interaction_system.rst @@ -0,0 +1,417 @@ +Interaction System for Multi-turn RL Training +============================================= + +Last updated: 06/25/2025. + +Overview +-------- + +The verl interaction system enables dynamic, multi-turn conversational feedback during reinforcement learning training. This system allows models to engage in iterative problem-solving scenarios where interaction agents can provide corrective feedback, guidance, or evaluation based on the model's responses. 
+ +**New in Multi-Interaction Support**: The system now supports multiple named interactions within a single training session, enabling sophisticated training scenarios where different samples can use different interaction strategies. This allows for curriculum learning, domain-specific feedback, and flexible agent switching at the sample level. + +Key features: + +- **Async-based Architecture**: Non-blocking interaction processing for distributed training +- **Instance Management**: Stateful session handling with unique instance IDs for concurrent interactions +- **SGLang Integration**: Seamless integration with SGLang rollout system for multi-turn conversations +- **Configuration-driven**: Dynamic agent loading via YAML configuration files +- **Multi-Interaction Support**: Registry system enabling multiple named interactions per rollout +- **Sample-Level Selection**: Each sample can specify which interaction to use via configuration +- **Reward Integration**: Turn-level scoring mechanism integrated with verl's reward system + +Architecture +------------ + +The interaction system follows a plugin-based architecture with clear separation of concerns: + +.. code-block:: + + Interaction Registry System + ↓ + BaseInteraction (Abstract Interface) + ↓ + Multiple Named Interactions (e.g., Gsm8kInteraction, CustomInteraction) + ↓ + SGLang Rollout Integration (interaction_map) + ↓ + Sample-Level Interaction Selection + ↓ + Async Request Lifecycle Management + +Core Components +~~~~~~~~~~~~~~~ + +**Interaction Registry System** + +The interaction registry system allows loading and managing multiple named interactions: + +.. 
code-block:: python + + from verl.interactions.utils.interaction_registry import initialize_interactions_from_config + + # Load multiple interactions from config + interaction_map = initialize_interactions_from_config("config.yaml") + + # Access specific interaction by name + gsm8k_interaction = interaction_map["gsm8k"] + custom_interaction = interaction_map["custom_solver"] + +**BaseInteraction Interface** + +All interaction agents must implement the ``BaseInteraction`` abstract class: + +.. code-block:: python + + from verl.interactions.base import BaseInteraction + from typing import Dict, Any, List, Tuple, Optional + + class BaseInteraction: + def __init__(self, config: Dict[str, Any]): + self.config = config + self.name: str = config.get("name", "interaction_agent") + + async def start_interaction(self, instance_id: Optional[str] = None, **kwargs) -> str: + """Initialize interaction session, return instance_id""" + + async def generate_response(self, instance_id: str, messages: List[Dict[str, Any]], **kwargs) -> Tuple[bool, str, float, Dict[str, Any]]: + """Generate response, return (should_terminate, response, score, metadata)""" + + async def calculate_score(self, instance_id: str, **kwargs) -> float: + """Calculate turn-level score for RL training""" + + async def finalize_interaction(self, instance_id: str, **kwargs) -> None: + """Clean up resources""" + +**Request Lifecycle** + +The interaction system integrates with SGLang's async rollout via state management: + +1. ``PENDING`` → Initialize interaction via ``start_interaction()`` +2. ``GENERATING`` → Model generates response +3. ``INTERACTING`` → Process response via ``generate_response()`` +4. ``GENERATING`` → Continue if not terminated, otherwise ``COMPLETED`` + +Configuration +------------- + +**Basic Setup** + +Enable interaction in your rollout configuration: + +.. 
code-block:: yaml + + actor_rollout_ref: + rollout: + multi_turn: + enable: true + interaction_config_path: "path/to/interaction_config.yaml" + max_user_turns: 10 + max_assistant_turns: 10 + +**Interaction Configuration File** + +Create an interaction configuration file (e.g., ``interaction_config.yaml``): + +**Single Interaction (Legacy Format)** + +.. code-block:: yaml + + interaction: + - name: "gsm8k" + class_name: "verl.interactions.gsm8k_interaction.Gsm8kInteraction" + config: {} + +**Multiple Interactions (New Format)** + +.. code-block:: yaml + + interaction: + - name: "gsm8k" + class_name: "verl.interactions.gsm8k_interaction.Gsm8kInteraction" + config: {} + - name: "custom_solver" + class_name: "custom.interactions.CustomInteraction" + config: + solver_type: "advanced" + timeout: 30 + - name: "code_verifier" + class_name: "verl.interactions.base.BaseInteraction" + config: + verification_mode: "strict" + +**Automatic Name Generation** + +If no ``name`` field is provided, the system will automatically generate one from the class name: + +.. code-block:: yaml + + interaction: + - class_name: "verl.interactions.gsm8k_interaction.Gsm8kInteraction" + config: {} + # Automatically generates name: "gsm8k" + +The system will dynamically load all specified interaction classes and make them available by name. + +Implementation Example: GSM8K +----------------------------- + +The GSM8K interaction demonstrates a complete implementation for math problem-solving scenarios: + +.. 
code-block:: python + + from verl.interactions.base import BaseInteraction + from verl.utils.reward_score import gsm8k + from uuid import uuid4 + + class Gsm8kInteraction(BaseInteraction): + def __init__(self, config: dict): + super().__init__(config) + self._instance_dict = {} + + async def start_interaction(self, instance_id=None, ground_truth=None, **kwargs): + if instance_id is None: + instance_id = str(uuid4()) + self._instance_dict[instance_id] = { + "response": "", + "ground_truth": ground_truth, + "reward": 0.0, + } + return instance_id + + async def generate_response(self, instance_id, messages, **kwargs): + # Extract last assistant message content + content = "" + for item in reversed(messages): + if item.get("role") == "assistant": + content = item.get("content", "") + break + + # Ensure GSM8K format (#### prefix) + self._instance_dict[instance_id]["response"] = content + + reward = await self.calculate_score(instance_id) + if reward == 1.0: + return True, "Your response is correct!", 1.0, {} + else: + return False, "Your response is incorrect! You need to reflect on your answer and try again.", 0.0, {} + + async def calculate_score(self, instance_id, **kwargs): + return gsm8k.compute_score( + self._instance_dict[instance_id]["response"], + self._instance_dict[instance_id]["ground_truth"], + method="strict", format_score=0.0, score=1.0, + ) + + async def finalize_interaction(self, instance_id, **kwargs): + del self._instance_dict[instance_id] + +Training Integration +-------------------- + +**Training Script Configuration** + +Include interaction configuration in your training command: + +.. 
code-block:: bash + + python3 -m verl.trainer.main_ppo \\ + --config-path="$CONFIG_PATH" \\ + --config-name='gsm8k_multiturn_grpo_w_interaction' \\ + algorithm.adv_estimator=grpo \\ + data.train_batch_size=512 \\ + data.return_raw_chat=True \\ + actor_rollout_ref.rollout.name=sglang \\ + actor_rollout_ref.rollout.multi_turn.interaction_config_path="$PROJECT_DIR/examples/sglang_multiturn/config/interaction_config/gsm8k_interaction_config.yaml" \\ + trainer.total_epochs=15 + +**Data Requirements** + +Ensure your dataset includes interaction parameters with the ``name`` field for interaction selection: + +.. code-block:: python + + # Dataset should include interaction_kwargs in non_tensor_batch + interaction_kwargs = [ + {"name": "gsm8k", "query": "What is 2+2?", "ground_truth": "4"}, + {"name": "custom_solver", "query": "Solve: x^2 + 5x + 6 = 0", "ground_truth": "x = -2, -3"}, + {"name": "gsm8k", "query": "What is 3+3?", "ground_truth": "6"}, + ] + +**Sample-Level Interaction Selection** + +Each sample can specify which interaction to use via the ``name`` field. This enables flexible training scenarios where different samples use different interaction strategies: + +.. code-block:: python + + # Example: Math problems use GSM8K interaction, code problems use code verifier + data_samples = [ + { + "prompt": "What is 15% of 200?", + "interaction_kwargs": { + "name": "gsm8k", + "query": "What is 15% of 200?", + "ground_truth": "30" + } + }, + { + "prompt": "Write a function to check if a number is prime", + "interaction_kwargs": { + "name": "code_verifier", + "code_type": "python", + "expected_behavior": "return True for prime numbers" + } + } + ] + +**Backward Compatibility** + +If no ``name`` field is provided in ``interaction_kwargs``, the system defaults to ``"gsm8k"`` for backward compatibility. 
+ +Best Practices +-------------- + +**Resource Management** + +- Always implement proper cleanup in ``finalize_interaction()`` +- Use unique instance IDs to avoid conflicts in concurrent training +- Handle edge cases like empty messages or malformed content + +**Performance Optimization** + +- Keep interaction logic lightweight to avoid blocking training +- Use async/await properly to maintain non-blocking behavior +- Consider caching expensive computations within interaction instances + +**Testing** + +Comprehensive testing is essential for interaction systems: + +.. code-block:: python + + import pytest + from unittest.mock import patch + + @pytest.mark.asyncio + async def test_interaction_workflow(): + interaction = YourInteraction({}) + + # Test complete workflow + instance_id = await interaction.start_interaction(ground_truth="expected_answer") + + + messages = [{"role": "user", "content": "user_content"}, {"role": "assistant", "content": "assistant_content"}] + should_terminate, response, reward, metadata = await interaction.generate_response(instance_id, messages) + + assert should_terminate in [True, False] + assert isinstance(reward, float) + + await interaction.finalize_interaction(instance_id) + +Advanced Usage +-------------- + +**Multi-Interaction Training Strategies** + +You can design sophisticated training scenarios using multiple interactions: + +.. 
code-block:: python + + # Example: Progressive difficulty with different interaction agents + class MathTrainingPipeline: + def create_interaction_config(self): + return { + "interaction": [ + { + "name": "basic_math", + "class_name": "verl.interactions.gsm8k_interaction.Gsm8kInteraction", + "config": {"difficulty": "easy"} + }, + { + "name": "advanced_math", + "class_name": "custom.interactions.AdvancedMathInteraction", + "config": {"difficulty": "hard", "allow_hints": True} + }, + { + "name": "competition_math", + "class_name": "custom.interactions.CompetitionMathInteraction", + "config": {"time_limit": 300, "show_steps": False} + } + ] + } + + def create_curriculum_data(self, epoch): + if epoch < 5: + return [{"name": "basic_math", ...} for _ in samples] + elif epoch < 10: + return [{"name": "advanced_math", ...} for _ in samples] + else: + return [{"name": "competition_math", ...} for _ in samples] + +**Custom Scoring Functions** + +You can integrate custom reward functions: + +.. code-block:: python + + async def calculate_score(self, instance_id, **kwargs): + response = self._instance_dict[instance_id]["response"] + ground_truth = self._instance_dict[instance_id]["ground_truth"] + + # Custom evaluation logic + if custom_evaluation_function(response, ground_truth): + return 1.0 + else: + return 0.0 + +**Multi-step Interactions** + +For complex scenarios requiring multiple feedback rounds: + +.. code-block:: python + + async def generate_response(self, instance_id, messages, **kwargs): + instance = self._instance_dict[instance_id] + instance["attempts"] += 1 + + # Evaluate current response + reward = await self.calculate_score(instance_id) + + if reward > 0.8: + return True, "Excellent work!", reward, {} + elif instance["attempts"] < 3: + return False, "Good attempt, but try to improve...", reward, {} + else: + return True, "Maximum attempts reached.", reward, {} + +Troubleshooting +--------------- + +**Common Issues** + +1. 
**Instance ID Conflicts**: Ensure unique instance IDs across concurrent sessions +2. **Memory Leaks**: Always call ``finalize_interaction()`` to clean up resources +3. **Blocking Operations**: Keep interaction logic async and non-blocking +4. **Configuration Errors**: Verify interaction config path and class name are correct +5. **Interaction Name Conflicts**: Ensure all interactions have unique names in the configuration +6. **Missing Interaction**: Verify the ``name`` field in ``interaction_kwargs`` matches available interactions +7. **Backward Compatibility**: When migrating from single to multi-interaction, add ``name`` fields to existing data + +**Debugging** + +Enable debug logging to trace interaction flow: + +.. code-block:: bash + + export VERL_LOGGING_LEVEL=DEBUG + +**Performance Monitoring** + +Monitor interaction performance impact on training throughput and adjust accordingly. + +Related Documentation +--------------------- + +- :doc:`multiturn`: Basic multi-turn rollout configuration +- :doc:`sandbox_fusion`: Tool integration with SGLang +- :doc:`search_tool_example`: Search tool implementation example \ No newline at end of file diff --git a/code/RL_model/verl/verl_train/docs/sglang_multiturn/multiturn.rst b/code/RL_model/verl/verl_train/docs/sglang_multiturn/multiturn.rst new file mode 100644 index 0000000000000000000000000000000000000000..54548316d14155434c937fb8c292cd4dec471b0c --- /dev/null +++ b/code/RL_model/verl/verl_train/docs/sglang_multiturn/multiturn.rst @@ -0,0 +1,354 @@ +Multi-turn Rollout Support +========================== + +Last updated: 06/27/2025. + +Basic Configuration +~~~~~~~~~~~~~~~~~~~ + +To enable multi-turn rollout, make sure to configure the following fields in your rollout configuration: + +.. code-block:: yaml + + actor_rollout_ref: + rollout: + multi_turn: True + name: "sglang" + +This configuration activates the sglang engine for multi-turn interaction during rollout. 
+ +Custom Tool Configuration +~~~~~~~~~~~~~~~~~~~~~~~~~ + +For custom environment interaction tools, you can implement your own tools based on ``verl.tools.base_tool.BaseTool``. Then, specify your tool configurations in a YAML file: + +.. code-block:: yaml + + tools: + - class_name: "" + config: + type: native + tool_schema: + +You may refer to GSM8KTool_example_configuration_, which is one example of the tool configurations. Its implementation can be found in gsm8k_tool.py_. + +Finally, set the ``tools_config_file`` in your rollout config: + +.. code-block:: yaml + + actor_rollout_ref: + rollout: + tool_kwargs: + tools_config_file: + +This allows integration of customized tool behaviors during actor rollout steps. + +If you want rollout with simulated interaction, you can set the ``interaction_config_file`` in your rollout config: + +.. code-block:: yaml + + interaction: + - class_name: "" + config: {} + +.. code-block:: yaml + + actor_rollout_ref: + rollout: + interaction_config_file: + +If your tool creates multi-modal inputs, you should return a list of multi-modal inputs in your tool.execute() implementation. + +Image and video should be processed before returning. For example, if you are using Qwen2.5-VL, you can use the following code to get the representations: + +.. code-block:: python + + async def create(self, ...) -> tuple[str, ToolResponse]: + ... + from verl.utils.dataset.vision_utils import process_image, process_video + + img1 = process_image(img1) + video1 = process_video(video1) + + # due to the (image | video) key is ("image" | "video") instead of ("images" | "videos") in vllm, we need to use ("image" | "video") to specify list of images/videos + # link: https://github.com/vllm-project/vllm/blob/3c545c0c3b98ee642373a308197d750d0e449403/vllm/multimodal/parse.py#L205 + return instance_id, ToolResponse(image=[img1, ...], video=[video1, ...], text="...") + + async def execute(self, ...) -> Tuple[str | Dict[str, Any], float, dict]: + ... 
+ from verl.utils.dataset.vision_utils import process_image, process_video + + img1 = process_image(img1) + video1 = process_video(video1) + + # due to the (image | video) key is ("image" | "video") instead of ("images" | "videos") in vllm, we need to use ("image" | "video") to specify list of images/videos + # link: https://github.com/vllm-project/vllm/blob/3c545c0c3b98ee642373a308197d750d0e449403/vllm/multimodal/parse.py#L205 + return ToolResponse(image=[img1, ...], video=[video1, ...], text="..."), 0, {} + +Remember to set ``return_multi_modal_inputs: False`` in your dataset config in order to process the multi-modal inputs in the rollout correctly. +Refer to the `Handling Multi-Modal Inputs in Datasets`_ section for more details. + +MCP Tool Configuration +~~~~~~~~~~~~~~~~~~~~~~ + +For MCP interaction tools, you can flexibly configure them using a YAML file. The typical setup is as follows: + +.. code-block:: yaml + + tools: + - class_name: "" + config: + type: mcp + mcp: + mcp_servers_config_path: ./mcp_server.json + tool_selected_list: {} + +The ``tool_selected_list`` field is optional and specifies which tools to use from the servers. If you want to enable all available tools, simply omit this attribute. Besides, ``mcp_servers_config_path`` points to a JSON file containing the MCP server configurations. For example: + +.. code-block:: json + + { + "mcpServers": { + "SSE Server": { + "url": "your_server_url", + "auth_token": "your_server_api_token" + }, + "STDIO Server": { + "command": "npx", + "args": ["-y", "server-mcp@0.2.1"], + "env": { + "SERVER_API_KEY": "your_server_api_token" + } + } + } + } + +Since the content formats returned by the MCP server may vary, users can inherit from ``MCPBaseTool`` and override the ``_parse_tool_result`` method to implement custom parsing logic. + +.. 
code-block:: python + + class MCPYourTool(MCPBaseTool): + def __init__(self, config: dict, tool_schema: OpenAIFunctionToolSchema): + super().__init__(config, tool_schema) + + def _parse_tool_result(self, content: list) -> Tuple[str, dict]: + ... + +Overall, you may refer to mcp_search_tool.py_ and mcp_tool_config.yaml_ for custom implementation and configuration. + +Multi-turn Tokenization +~~~~~~~~~~~~~~~~~~~~~~~ + +Tokenizing multi-turn rollouts poses a challenge: after applying the chat template and tokenizing the full message list, it's hard to identify which tokens belong to assistant messages. Since the token list is flat, it lacks direct alignment with the message roles. + +To address this, we adopt a **delta-based tokenization** strategy. Each time the LLM generates a new message, we: + +1. Apply the chat template to all prior messages (`messages[:i]`). +2. Apply the chat template again including the latest message (`messages[:i+1]`). +3. Tokenize only the *delta* between these two serialized message strings. + +This ensures that only tokens generated by the assistant are included in the loss mask. + +.. code-block:: python + + # When using tokenizer + # Exclude the assistant prompt (e.g., "<|im_start|>assistant") from the loss by setting add_generation_prompt=True + prev = tokenizer.apply_chat_template(messages[:i], add_generation_prompt=True, tokenize=False) + curr = tokenizer.apply_chat_template(messages[:i+1], add_generation_prompt=False, tokenize=False) + token_ids += tokenizer.encode(curr[len(prev):], add_special_tokens=False) + loss_mask += [1] * len(token_ids) # Mask only the new assistant tokens + +.. 
code-block:: python + + # When using processor + # Exclude the assistant prompt (e.g., "<|im_start|>assistant") from the loss by setting add_generation_prompt=True + prev = processor.apply_chat_template(messages[:i], add_generation_prompt=True, tokenize=False) + prev_model_inputs = processor(text=prev, images=images, videos=videos, return_tensors="pt")[0].tolist() + curr = processor.apply_chat_template(messages[:i+1], add_generation_prompt=False, tokenize=False) + curr_model_inputs = processor(text=curr, images=images, videos=videos, return_tensors="pt")[0].tolist() + token_ids += curr_model_inputs["input_ids"][len(prev_model_inputs["input_ids"]):] + loss_mask += [1] * len(token_ids) # Mask only the new assistant tokens + +While we've validated this produces consistent results with full message tokenization, future models' chat template could break compatibility. To guard against silent inconsistencies, we compare the delta-based tokenization with full-tokenization results by default at the end of each rollout. + +If you see the following warning, you can check the mismatched substring in the log: + +.. code-block:: + + Inconsistent training and inference tokenization detected. This may lead to unexpected behavior during training. Please review your chat template to determine if this is intentional. For more information, refer to the multiturn README.md. + +The tokenization sanity check mode can be configured using the ``actor_rollout_ref.rollout.multi_turn.tokenization_sanity_check_mode`` parameter, which accepts the following values: + +- ``strict`` (default): Performs strict comparison between delta-based and full tokenization results, raising warnings for any differences. + +- ``ignore_strippable``: Ignores differences in whitespace characters (``\n``, ``\t``, ``\r``, spaces) while still checking for meaningful text mismatches. This is useful when debugging chat template issues where whitespace variations are expected and acceptable. 
+ +- ``disable``: Completely disables the tokenization sanity check. Only use this if you have thoroughly validated that tokenization discrepancies are expected and won't impact training. + +Example configuration: + +.. code-block:: yaml + + actor_rollout_ref: + rollout: + multi_turn: + tokenization_sanity_check_mode: "ignore_strippable" # Choose from: "disable", "ignore_strippable", "strict" + +Handling Multi-Modal Inputs in Datasets +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +If your dataset includes multi-modal inputs (such as images or videos), you can control whether these are pre-processed and included in each sample by setting the return_multi_modal_inputs flag in your dataset config (used by RLHFDataset). + +- ``return_multi_modal_inputs: True`` (default): The dataset will pre-process and include a multi_modal_inputs dictionary for each sample. This dict contains the model-ready representations (e.g., image tensors, video tensors, etc.) as produced by your processor. This is useful for single-turn or SFT-style training, where the model expects all modalities to be present in the batch. + +- ``return_multi_modal_inputs: False``: The dataset will not include the multi_modal_inputs field. This is recommended for multi-turn RL or tool-augmented rollouts, where the model may generate new multi-modal inputs dynamically during rollout, and you want to avoid conflicts or redundant data in the batch. + + +Special Cases +^^^^^^^^^^^^^ + +Some models (e.g., Qwen/QwQ-32B and Qwen3 series) remove internal reasoning content during chat template rendering. As a result, the message content can vary across turns, making the delta-based tokenization inaccurate. + +For example, for the following conversation: + +.. code-block:: python + + messages = [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "What is 2 + 2?"}, + {"role": "assistant", "content": "user asked about a simple math question. 
2 + 2 = 4."}, + {"role": "user", "content": "Explain why."}, + {"role": "assistant", "content": "user wants to know the reasoning behind the answer. Search for a good explanation", + "tool_calls": [{"id": "tool1", "type": "search", "arguments": {"query": "Why is 2 + 2 = 4?"}}]}, + {"role": "tool", "content": "The sum of two and two is four because it is a basic arithmetic operation."}, + {"role": "assistant", "content": "The tool provided a good explanation.The sum of two and two is four because it is a basic arithmetic operation."} + ] + +1. Qwen/QwQ-32B will remove all reasoning content except the last assistant message after applying the chat template. + +.. code-block:: text + + <|im_start|>system + You are a helpful assistant.<|im_end|> + <|im_start|>user + What is 2 + 2?<|im_end|> + <|im_start|>assistant + 2 + 2 = 4.<|im_end|> + <|im_start|>user + Explain why.<|im_end|> + <|im_start|>assistant + + {"name": "", "arguments": {"query": "Why is 2 + 2 = 4?"}} + <|im_end|> + <|im_start|>user + + The sum of two and two is four because it is a basic arithmetic operation. + <|im_end|> + <|im_start|>assistant + The tool provided a good explanation. The sum of two and two is four because it is a basic arithmetic operation.<|im_end|> + +2. Qwen3 series will remove all reasoning content before the last user message. + +.. code-block:: text + + <|im_start|>system + You are a helpful assistant.<|im_end|> + <|im_start|>user + What is 2 + 2?<|im_end|> + <|im_start|>assistant + 2 + 2 = 4.<|im_end|> + <|im_start|>user + Explain why.<|im_end|> + <|im_start|>assistant + + user wants to know the reasoning behind the answer. Search for a good explanation + + + + {"name": "", "arguments": {"query": "Why is 2 + 2 = 4?"}} + <|im_end|> + <|im_start|>user + + The sum of two and two is four because it is a basic arithmetic operation. + <|im_end|> + <|im_start|>assistant + + The tool provided a good explanation. 
+ + + The sum of two and two is four because it is a basic arithmetic operation.<|im_end|> + +To handle this, we fall back to a **fixed base conversation** containing only a single system and user message. Since this base doesn't include assistant messages or reasoning content, it remains consistent across turns. + +.. code-block:: python + + BASE_CHAT_HISTORY = [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": "I am a user."} + ] + prev = tokenizer.apply_chat_template(BASE_CHAT_HISTORY, add_generation_prompt=True, tokenize=False) + curr = tokenizer.apply_chat_template([*BASE_CHAT_HISTORY, messages[i]], add_generation_prompt=False, tokenize=False) + token_ids += tokenizer.encode(curr[len(prev):], add_special_tokens=False) + loss_mask += [1] * len(token_ids) + +This method works well for Qwen3 series. However, Qwen/QwQ-32B currently has a bug in its chat template. A fix_ has been proposed but not yet adopted. Until then, use the following command to download the fixed model revision: + +.. code-block:: bash + + pip install huggingface_hub + hf download Qwen/QwQ-32B --revision refs/pr/81 + +.. _fix: https://huggingface.co/Qwen/QwQ-32B/discussions/81 + +Discrepancy Between Training and Inference Templates +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Although the above approach fixes the delta mismatch issue, the removal of reasoning content in the inference-time chat template introduces a new discrepancy: training uses the full reasoning content, while inference does not. + +This mismatch can affect model performance in unpredictable ways. To avoid it, we default to using the full response (including reasoning) for both training and rollout. + +However, this approach comes with trade-offs: + +1. Long reasoning contents can easily exceed the model's context window, especially in multi-turn rollout. +2. 
There's a mismatch between rollout and production environment now—models will not have reasoning content from past turns if you use the default chat template in production. + +We are still evaluating the impact of these issues. If you experience context length problems or prefer rollouts that match production (i.e., exclude reasoning), you can enable: + +``actor_rollout_ref.rollout.multi_turn.use_inference_chat_template = True`` + +GSM8K Multi-turn Training Performance +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +See the training performance of multi-turn rollout on the GSM8K task HERE_. + +.. _HERE: https://wandb.ai/zhaochenyang20/gsm8k_async_rl/runs/1ro1r7om?nw=nwuserzhaochenyang20 + +.. _GSM8KTool_example_configuration: https://github.com/volcengine/verl/blob/main/examples/sglang_multiturn/config/tool_config/gsm8k_tool_config.yaml + +.. _gsm8k_tool.py: https://github.com/volcengine/verl/blob/main/verl/tools/gsm8k_tool.py + +.. _mcp_search_tool.py: https://github.com/volcengine/verl/blob/main/verl/tools/mcp_search_tool.py + +.. _mcp_tool_config.yaml: https://github.com/volcengine/verl/blob/main/examples/sglang_multiturn/config/tool_config/mcp_tool_config.yaml + +Interaction System +~~~~~~~~~~~~~~~~~~ + +For dynamic conversational feedback during RL training, see: + +.. toctree:: + :maxdepth: 1 + + interaction_system + +Search Tool Integration +~~~~~~~~~~~~~~~~~~~~~~~ + +.. 
toctree:: + :maxdepth: 1 + + search_tool_example + +Code Walkthrough +~~~~~~~~~~~~~~~~~~~~~~~ +If you want to learn more in depth about the code execution flow, please read https://github.com/zhaochenyang20/Awesome-ML-SYS-Tutorial/tree/main/rlhf/verl/multi-turn/code-walk-through diff --git a/code/RL_model/verl/verl_train/docs/sglang_multiturn/sandbox_fusion.rst b/code/RL_model/verl/verl_train/docs/sglang_multiturn/sandbox_fusion.rst new file mode 100644 index 0000000000000000000000000000000000000000..94adb8a356cbe98309b9287b7b255767c2bcd860 --- /dev/null +++ b/code/RL_model/verl/verl_train/docs/sglang_multiturn/sandbox_fusion.rst @@ -0,0 +1,304 @@ +=============================== +Sandbox Fusion Tool Integration +=============================== + +Last updated: 06/10/2025. + +Motivations +=========== + +- As users of verl, we want to allow the model to call certain tools during Actor rollout, incorporating the results into the training process. +- A colleague from ByteDance proposed a paper aimed at enhancing model capability through code execution tools. +- We aim to support tool-calling capabilities of inference engines using `sandbox-fusion` as the code execution system, providing the community with a reimplementation of `retools`. + +Reward Compute with Sandbox Fusion + FaaS Integration +===================================================== + +- In current datasets and tasks, similar work already exists (e.g., Prime), which uses local processes as runners to execute model-generated code for reward computation. +- On this basis, #1429 has advanced the design by integrating FaaS as the runner for reward computation. + +Goals +===== + +- Adapt to the `sglang` tool-calling protocol and define tools for sandbox fusion. +- Integrate with the `async-rollout` process, ensuring sandbox fusion tools follow asyncIO conventions. +- Design and implement a basic rate limiter to prevent issues such as 429 errors. 
+ +Non-Goals +========= + +- Training effectiveness is out of scope. +- Observability metrics are not considered. +- Distributed failover and component fault tolerance are not addressed. + +Design Details +============== + +Tool Schema Definition +---------------------- + +- Currently, only code execution is considered, requiring a `code` field in the JSON from the model. +- Only Python code is supported for now, so no `language` parameter is defined. + +.. code-block:: python + + OpenAIFunctionToolSchema( + type="function", + function=OpenAIFunctionSchema( + name="code_interpreter", + description="A tool for executing code.", + parameters=OpenAIFunctionParametersSchema( + type="object", + properties={ + "code": OpenAIFunctionPropertySchema( + type="string", + description="The code to execute.", + enum=None, + ) + }, + required=["code"], + ), + strict=False, + ) + ) + +Configuration Parameters +-------------------------- + ++----------------------------+--------------------------------------------------------------+ +| Parameter Name | Description | ++============================+==============================================================+ +| `num_workers` | Number of worker threads/processes per DP to request runner. | ++----------------------------+--------------------------------------------------------------+ +| `rate_limit` | Global limit of concurrent code executions. Default: 10 | ++----------------------------+--------------------------------------------------------------+ +| `default_timeout` | Timeout (in seconds) for each code execution. Default: 30 | ++----------------------------+--------------------------------------------------------------+ +| `default_language` | Default programming language. Default: "python" | ++----------------------------+--------------------------------------------------------------+ +| `enable_global_rate_limit` | Whether to enable global rate limiting. 
Default: True | ++----------------------------+--------------------------------------------------------------+ +| `sandbox_fusion_url` | URL for the veFaas sandbox execution service | ++----------------------------+--------------------------------------------------------------+ + +Rate Limiting Design +----------------------- + +Objective: + +- Limit the number of inflight requests using a token bucket model. + +- Ensure ordered submission to code runners to avoid starvation due to backoff. + +Design Highlights: + +- Use Ray Global Actor as a singleton distributed counter at cluster level. + +- Semaphore used for counting, with `acquire` and `release` in separate thread pools to preserve order. + +- Use Ray’s cloud-pickle to serialize functions for decoupled `ExecutionWorker`. + +.. code-block:: python + + @ray.remote(concurrency_groups={"acquire": 1,"release": 10}) + class TokenBucketWorker: + def __init__(self, rate_limit: int): + self.rate_limit = rate_limit + self.current_count = 0 + self._semaphore = threading.Semaphore(rate_limit) + + @ray.method(concurrency_group="acquire") + def acquire(self): + self._semaphore.acquire() + self.current_count += 1 + + @ray.method(concurrency_group="release") + def release(self): + self._semaphore.release() + self.current_count -= 1 + + def get_current_count(self): + return self.current_count + + class ExecutionWorker: + def __init__(self, enable_global_rate_limit=True, rate_limit=10): + self.rate_limit_worker = self._init_rate_limit(rate_limit) if enable_global_rate_limit else None + + def _init_rate_limit(self, rate_limit): + return TokenBucketWorker.options(name="rate-limiter", get_if_exists=True).remote(rate_limit) + + def execute(self, fn: Callable[..., T], *fn_args, **fn_kwargs) -> T: + with ExitStack() as stack: + stack.callback(self.rate_limit_worker.release.remote) + ray.get(self.rate_limit_worker.acquire.remote()) + try: + return fn(*fn_args, **fn_kwargs) + except Exception as e: + logger.warning(f"Error when 
executing code: {e}") + + def init_execution_pool(num_workers: int, enable_global_rate_limit=True, rate_limit=10, mode: PoolMode=PoolMode.ThreadMode): + if mode == PoolMode.ThreadMode: + return ray.remote(ExecutionWorker).options(max_concurrency=num_workers).remote( + enable_global_rate_limit=enable_global_rate_limit, + rate_limit=rate_limit + ) + else: + raise NotImplementedError("Process mode is not implemented yet") + +Tool Implementation +------------------- + +- Use `instance_id` to identify requests across multiple dialogue rounds. + +- Use `execution_pool` to implement async invocation. + +- Cleanup state after rollout completion. + +.. code-block:: python + + class SandboxFusionTool(BaseTool): + def __init__(self, config: dict, tool_schema: OpenAIFunctionToolSchema): + ... + self.execution_pool = init_execution_pool(...) + ... + + async def create(self, instance_id: Optional[str] = None, ...): + ... + + async def execute(self, instance_id: str, parameters: dict[str, Any], **kwargs) -> Tuple[str, float, dict]: + code = parameters.get("code", "") + timeout = parameters.get("timeout", self.default_timeout) + language = parameters.get("language", self.default_language) + if not isinstance(code, str): + code = str(code) + + result = await self.execution_pool.execute.remote(self.execute_code,instance_id,code,timeout,language) + self._instance_dict[instance_id]["reward"].append(result.strip()) + + return result, result, {} + + def execute_code(self,instance_id,code,timeout=30,language="python"): + result_status, metadata = _process_single_case(0, None, None,self.sandbox_fusion_url, code, timeout, language) + # we should always expect this since we don't have correct answer + if metadata["run_status"] == "Finished": + actual_output = metadata["stdout"] if metadata["stdout"] is not None else "" + return actual_output + else: + return "no stdout here" + + async def calc_reward(self, instance_id: str, ...): + ... 
+ + async def release(self, instance_id: str, ...): + ... + +Test Plan +========= + +Unit Tests +---------- + +- **test_tools_registration**: Test tool registration and initialization. +- **test_rollout_req_creation**: Validate that `AsyncRolloutReq` is built correctly. +- **test_over_size_case**: Ensure rollout terminates early when exceeding `max_seq_len`. +- **test_tool_call_basic_case**: Mock `sglang` output, validate tool call and result. +- **test_tool_call_batch_case**: Test batch processing of tool calls. +- **test_basic_multi_process_init**: Validate Ray global actor behaves as singleton. +- **TestSingleNodeRateLimiterCase**: Verify rate limiter works in single-node mode. +- **test_rotten_execution**: Ensure rate limiter recovers from function errors. +- **TestMultiNodeRateLimiterCase**: Verify behavior in multi-node environments. + +e2e Tests +---------- +We provide e2e test scripts in the `tests/special_e2e` folder, named `tests/special_e2e/run_gsm8k_fsdp_sgl_multiturn_sf_tool.sh` + +By setting 'trainer.rollout_data_dir' you can dump the rollout data to local disk. Here is a sample taken from the rollout data: + +.. code-block:: python + + { + "input": " + + system\nYou are a math expert. You are given a question and you need to solve it step by step. Reasoning step by step before any tool call. You should use the `calc_gsm8k_reward` tool after step by step solving the question, before generate final answer at least once and refine your answer if necessary. 
Put your final answer in the format of `#### `.\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n\n{\"type\": \"function\", \"function\": {\"name\": \"code_interpreter\", \"description\": \"A tool for executing code.\", \"parameters\": {\"type\": \"object\", \"properties\": {\"code\": {\"type\": \"string\", \"description\": \"The code to execute.\", \"enum\": null}}, \"required\": [\"code\"]}, \"strict\": false}}\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n\n + + user\nJohn gets a bonus that's the same percentage every year. Last year he made $100,000 and got a $10,000 bonus. This year he makes $200,000. How much will John make this year when adding both his total pay and bonus together? Let's think step by step and output the final answer after `####`.\n + + assistant\n", + "output": "assistant\n\nOkay, let's see. John gets a bonus that's the same percentage every year. Last year, he made $100,000 and got a $10,000 bonus. This year, he makes $200,000. The question is asking how much he'll make this year when adding both his total pay and bonus together.\n\nFirst, I need to figure out the percentage bonus he gets. Since the bonus is the same percentage each year, if I can find that percentage based on last year's details, I can apply it to this year's earnings to find this year's bonus. Then, I'll add that bonus to his total pay this year.\n\nLast year, he earned $100,000 and got a $10,000 bonus. To find the percentage, I can divide the bonus by his total earnings. So, 10,000 divided by 100,000. Let me calculate that.\n\n10,000 / 100,000 equals 0.1, which is 10% as a percentage. So, his bonus is 10% of his pay each year.\n\nThis year, he makes $200,000. So, the bonus this year would be 10% of 200,000. 
Let me compute that using the code interpreter to ensure accuracy.\n + + \n{\"name\": \"code_interpreter\", \"arguments\": {\"code\": \"total_pay_this_year = 200000\\nbonus_percentage = 10 / 100\\nbonus_this_year = total_pay_this_year * bonus_percentage\\ntotal_income_this_year = total_pay_this_year + bonus_this_year\\nprint(total_income_this_year)\", \"executes\": \"True\"}}\n\n + + tool\n220000.0\n\n + + assistant\n\nSo the code shows that John's total income this year would be $220,000. This is consistent with the 10% bonus on last year's pay of $100,000, which was $10,000. Therefore, this year's bonus of $20,000 added to his total pay of $200,000 results in a total income of $220,000.\n\nI need to verify that there are no other factors at play. The problem states it's a constant percentage bonus each year, so there's no mention of changing percentages or other variables. Therefore, applying a straightforward percentage of 10% on this year's partial salary amount would result in $20,000. Adding that to his total pay gives $220,000. The calculation is correct and aligns with the question's parameters.\n\n\n#### 220000.0", + "score": 0, + "step": 1 + } + +here is the readable format version: + +.. code-block:: python + + [system] + + You are a math expert. You are given a question and you need to solve it step by step. Reasoning step by step before any tool call. You should use the `calc_gsm8k_reward` tool after step by step solving the question, before generate final answer at least once and refine your answer if necessary. Put your final answer in the format of `#### `. + + # Tools + + You may call one or more functions to assist with the user query. 
+ + You are provided with function signatures within XML tags: + + {"type": "function", "function": {"name": "code_interpreter", "description": "A tool for executing code.", "parameters": {"type": "object", "properties": {"code": {"type": "string", "description": "The code to execute.", "enum": null}}, "required": ["code"]}, "strict": false}} + + + For each function call, return a json object with function name and arguments within XML tags: + + {"name": , "arguments": } + + + [user] + + John gets a bonus that's the same percentage every year. Last year he made $100,000 and got a $10,000 bonus. This year he makes $200,000. How much will John make this year when adding both his total pay and bonus together? Let's think step by step and output the final answer after `####`. + + [assistant] + + + Okay, let's see. John gets a bonus that's the same percentage every year. Last year, he made $100,000 and got a $10,000 bonus. This year, he makes $200,000. The question is asking how much he'll make this year when adding both his total pay and bonus together. + + First, I need to figure out the percentage bonus he gets. Since the bonus is the same percentage each year, if I can find that percentage based on last year's details, I can apply it to this year's earnings to find this year's bonus. Then, I'll add that bonus to his total pay this year. + + Last year, he earned $100,000 and got a $10,000 bonus. To find the percentage, I can divide the bonus by his total earnings. So, 10,000 divided by 100,000. Let me calculate that. + + 10,000 / 100,000 equals 0.1, which is 10% as a percentage. So, his bonus is 10% of his pay each year. + + This year, he makes $200,000. So, the bonus this year would be 10% of 200,000. Let me compute that using the code interpreter to ensure accuracy. 
+ + {"name": "code_interpreter", "arguments": {"code": "total_pay_this_year = 200000\nbonus_percentage = 10 / 100\nbonus_this_year = total_pay_this_year * bonus_percentage\ntotal_income_this_year = total_pay_this_year + bonus_this_year\nprint(total_income_this_year)", "executes": "True"}} + + + [tool] + + 220000.0 + + [assistant] + + + So the code shows that John's total income this year would be $220,000. This is consistent with the 10% bonus on last year's pay of $100,000, which was $10,000. Therefore, this year's bonus of $20,000 added to his total pay of $200,000 results in a total income of $220,000. + + I need to verify that there are no other factors at play. The problem states it's a constant percentage bonus each year, so there's no mention of changing percentages or other variables. Therefore, applying a straightforward percentage of 10% on this year's partial salary amount would result in $20,000. Adding that to his total pay gives $220,000. The calculation is correct and aligns with the question's parameters. + + + #### 220000.0 + + +You can also use the `RolloutViewer` TUI tool to view the dumped rollout data: + + +.. code-block:: bash + + python scripts/rollout_viewer.py ${trainer.rollout_data_dir} + + +.. image:: https://github.com/user-attachments/assets/e34e5157-2880-4a21-afb2-73885d0dfb11 + :alt: RolloutViewer screenshot \ No newline at end of file diff --git a/code/RL_model/verl/verl_train/docs/sglang_multiturn/search_tool_example.rst b/code/RL_model/verl/verl_train/docs/sglang_multiturn/search_tool_example.rst new file mode 100644 index 0000000000000000000000000000000000000000..cbbdeb0d08e6102a00a85bd5544c345bb086969f --- /dev/null +++ b/code/RL_model/verl/verl_train/docs/sglang_multiturn/search_tool_example.rst @@ -0,0 +1,264 @@ +======================= +Search Tool Integration +======================= + +Last updated: 05/30/2025. 
+ +Introduction +------------ +- We have added a search tool calling function to Multi-Turn RL, enabling the model to initiate retrieval requests during Actor rollout and directly use retrieval results for training. **We support using a local dense retriever as the retrieval tool, as well as integrating with your own local retrieval engine.** + + + +Quick Reproduction +------------------ + +Create a New Docker Container +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code:: bash + + docker run \ + -it \ + --shm-size 32g \ + --gpus all \ + -v {Huggingface-Cache-Path}:/root/.cache \ + --ipc=host \ + --network=host \ + --privileged \ + --name sglang_{your-name} \ + lmsysorg/sglang:dev \ + /bin/zsh + +If you need to restart after exiting the container: + +.. code:: bash + + docker start -i sglang_{your-name} + +Update Python and Configure the Virtual Environment using uv +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code:: bash + + apt update + apt install -y python3.10 python3.10-venv + + # Create a virtual environment + python3 -m venv ~/.python/verl-multiturn-rollout + + # Activate the virtual environment + source ~/.python/verl-multiturn-rollout/bin/activate + + # Install uv + python3 -m pip install uv + +Install verl Upstream +~~~~~~~~~~~~~~~~~~~~~ + +.. code:: bash + + cd ~ + git clone https://github.com/volcengine/verl.git + cd verl + + # Install verl + python3 -m uv pip install . + python3 -m uv pip install -r ./requirements_sglang.txt + + # Manually install flash-attn + python3 -m uv pip install wheel + python3 -m uv pip install packaging + python3 -m uv pip install flash-attn --no-build-isolation --no-deps + +Set Up a Local Retrieval Engine +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +If you are using your own local retrieval service, you can skip this +step. We chose the local dense retriever provided in the search-R1 +example; detailed instructions are in the `searchR1 +docs `__. 
+In brief: + +- The GPU version offers higher accuracy and speed; each GPU uses about + 5–7 GB of memory. +- The CPU version can be used for simple testing but has lower + retrieval precision, which will degrade training performance. See the + `retriever + documentation `__ + in search-R1 for details. +- Recommend using Conda to install faiss-gpu=1.8.0; venv may cause errors. + +**Note**: To start both the training process and the local retrieval +service, we launch two separate Python environments. The training uses +uv in the verl-multiturn-rollout environment, while the retriever uses +conda to install ``faiss-gpu``. + +.. code:: bash + + # Download the Miniconda installer script + wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O ~/miniconda.sh + + # Install to $HOME/miniconda3 in batch mode + bash ~/miniconda.sh -b -p $HOME/miniconda3 + + # Activate conda (only in the current shell) + eval "$($HOME/miniconda3/bin/conda shell.bash hook)" + + # (Optional) Add conda to your default shell startup + conda init + + # Reload shell config + source ~/.bashrc + + # Create and activate the retriever environment with Python 3.10 + conda create -n retriever python=3.10 -y + conda activate retriever + + # Install PyTorch (with GPU support) and related libraries + conda install pytorch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 pytorch-cuda=12.1 -c pytorch -c nvidia -y + + # Install other Python packages + pip install transformers datasets pyserini huggingface_hub + + # Install the GPU version of faiss + conda install faiss-gpu=1.8.0 -c pytorch -c nvidia -y + + # Install the API service framework + pip install uvicorn fastapi + +Download the Indexing and Corpus +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The local retrieval files are large—prepare sufficient disk space. +Downloading is about 60–70 GB, and uncompressed takes about 132 GB: + +.. 
code:: bash + + conda activate retriever + + save_path=/the/path/to/save + python examples/sglang_multiturn/search_r1_like/local_dense_retriever/download.py --save_path $save_path + cat $save_path/part_* > $save_path/e5_Flat.index + gzip -d $save_path/wiki-18.jsonl.gz + +Start the Local flat e5 Retrieval Server +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +1. The first startup will download models and load the index. +2. Apart from the download, startup takes about 1–2 minutes. +3. After startup, each GPU uses about 5–7 GB of memory, leaving the rest + for multi-turn RL training. + +.. code:: bash + + conda activate retriever + + index_file=$save_path/e5_Flat.index + corpus_file=$save_path/wiki-18.jsonl + retriever_name=e5 + retriever_path=intfloat/e5-base-v2 + + python examples/sglang_multiturn/search_r1_like/local_dense_retriever/retrieval_server.py \ + --index_path $index_file \ + --corpus_path $corpus_file \ + --topk 3 \ + --retriever_name $retriever_name \ + --retriever_model $retriever_path \ + --faiss_gpu + +Set Up WANDB_API_KEY +~~~~~~~~~~~~~~~~~~~~ + +.. code:: bash + + export WANDB_API_KEY={YOUR_WANDB_API_KEY} + + # Define a timestamp function + function now() { + date '+%Y-%m-%d-%H-%M' + } + +**Preprocess the Dataset** +~~~~~~~~~~~~~~~~~~~~~~~~~~ + + **Note:** The following data processing and training commands must be + run in the verl-multiturn-rollout environment. + +.. code:: bash + + python3 examples/data_preprocess/preprocess_search_r1_dataset.py + +Testing on 8 x H20 +~~~~~~~~~~~~~~~~~~ + +.. 
code:: bash + + # Ensure the now() function is defined + # Create a logs directory + mkdir -p logs + + # Set GPUs and run with a suitable log path + export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + + nohup bash examples/sglang_multiturn/search_r1_like/run_qwen2.5-3b_instruct_search_multiturn.sh \ + trainer.experiment_name=qwen2.5-3b-it_rm-searchR1-like-sgl-multiturn-$(now) \ + > logs/searchR1-like$(now).log 2>&1 & + +Custom Search Configuration +--------------------------- + +To enable multi-turn reasoning, set the following fields in your config: + +.. code:: yaml + + actor_rollout_ref: + rollout: + name: "sglang" + multi_turn: + enable: True + +You must specify ``retrieval_service_url`` in ``examples/sglang_multiturn/config/tool_config/search_tool_config.yaml``, and properly configure concurrency. For more details on concurrency, refer to the Sandbox Fusion example: + +.. code:: yaml + + tools: + - class_name: verl.tools.search_tool.SearchTool + config: + retrieval_service_url: http://127.0.0.1:8000/retrieve + num_workers: 120 + rate_limit: 120 + timeout: 30 + +The retriever input/output formats are as follows. If your service +parameters match, only modify ``retrieval_service_url``. You can also +customize in ``search_r1_like_utils.py``. + +.. code:: python + + Input format: + { + "queries": ["What is Python?", "Tell me about neural networks."], + "topk": 3, + "return_scores": true + } + + Output format (when return_scores=True, similarity scores are returned): + { + "result": [ + [ # Results for each query + { + "document": doc, "score": score + }, + # ... more documents + ], + # ... results for other queries + ] + } + +Notes +----- + +1. The total training time is about 27 hours; meanwhile, the validation + dataset is very large (51 k), and each validation takes about 6000 s. 
+ (Therefore, ``val_before_train=False`` by default) diff --git a/code/RL_model/verl/verl_train/docs/start/agentic_rl.rst b/code/RL_model/verl/verl_train/docs/start/agentic_rl.rst new file mode 100644 index 0000000000000000000000000000000000000000..73c0a7ce1e1d8a43f9811b571b634fa94f162a10 --- /dev/null +++ b/code/RL_model/verl/verl_train/docs/start/agentic_rl.rst @@ -0,0 +1,133 @@ +Agentic RL Training +=================== + +Last updated: 07/15/2025. + +Overview +---------- +The goal of Agentic RL is to improve the performance of backend models from reinforcement learning to the Agent. During the training process, a series of features are developed: + +1. Server-based asynchronous rollout +2. Multi-turn conversations and tool calls +3. LangGraph-based Agent + + +This document explains the system principles and usage involved to help users implement Agentic RL. + + +Server-based Asynchronous Rollout +--------------------------------- + +Since Agents need to interact with the environment through various tool calls, in order to avoid GPU idling while waiting for tool call return results, an asyncio based co-routing mechanism is utilized to execute each rollout requests asynchronously, thereby improving training performance. To support asynchronous rollout, the inference engine (server) and the agent (client) are architecturally separated, implementing a server-based system with the following objectives: + +1. Enabling load balancing mechanisms to balance loads across multiple GPUs and reduce the impact of long-tail requests on performance. For this purpose, scheduling capabilities in stream mode (recipe\stream_mode) are implemented as a recipe. +2. Preventing agent specific features such as tracing from affecting the inference engine. + +System Architecture +~~~~~~~~~~~~~~~~~~~ + +.. 
image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/agent_loop.png?raw=true + +For more detail on internal design, please refer to :doc:`Agent Loop<../advance/agent_loop>`. + +System Components +~~~~~~~~~~~~~~~~~ + ++--------------------------+----------------------------------------------------------------------------+ +| Component | Role | ++==========================+============================================================================+ +| AgentLoop | Client, implements Agent functions | ++--------------------------+----------------------------------------------------------------------------+ +| AsyncLLMServerManager | Inference gateway, provides generate interface for AgentLoop | ++--------------------------+----------------------------------------------------------------------------+ +| AsyncServer | Server, each instance is connected to one DP group of the inference engine | ++--------------------------+----------------------------------------------------------------------------+ + +**"generate" Interface** + +The "generate" function based on ray actor is used between the Client and Server instead of the standard chat completion API. This is because the conversion between tokens and text can be irreversible. For example, the token converted from "" will be different from that generated by the LLM. During the training phase, it is necessary to strictly use the tokens generated by LLM inference to avoid inaccurate in computing advantage, which may affect model performance. Having the Server provide a token-based API helps the Client maintain the relationship between the text generated by tool calls and the tokens returned by the LLM, so as to output correct tokens for training. + + +**Inference Engine Adaptation** +AsyncServer uniformly provides a generate function to the upper layer, with separate implementations for SGLang and vLLM to hide underlying differences: + +1. 
The SGLang AsyncServer uses the async_generate interface of the SGLang engine, which is located on the first GPU of each TP group. Therefore, AsyncServer needs to remotely call async_generate through ray actor. +2. The vLLM AsyncServer uses the generate interface of the vLLM engine, which can communicate with the GPUs in the TP group through ZMQ and can be directly called in AsyncServer. + + +Usage Example +~~~~~~~~~~~~~ + +Follow :doc:`GSM8K example<../examples/gsm8k_example>` to prepare the dataset and model checkpoints. + +There are two options required to use agent loop: + +- `data.return_raw_chat=True` +- `actor_rollout_ref.rollout.mode=async` + +This example uses the sglang inference engine by default, and you can also modify rollout_name to use vllm. + +.. code-block:: bash + + bash examples/grpo_trainer/run_qwen2-7b_seq_balance.sh + + +Multi-turn Conversations and Tool Calls +--------------------------------------- + +Follow :doc:`Multi-turn Rollout Support<../sglang_multiturn/multiturn>` to prepare tool and configuration files. + +The Tool Agent Loop has an additional requirement: adding an "agent_name" field to the dataset. During rollout, it will choose to use tool_agent_loop or single_turn_agent (default) based on this field. + +Usage Example +~~~~~~~~~~~~~ + +.. code-block:: bash + + # install mlflow to view toolcall and llm trace + pip install mlflow + + # This will download and preprocess the GSM8K dataset into ~/data/gsm8k/ and add the "agent_name" field. 
+ python examples/data_preprocess/gsm8k_tool_agent_loop.py + + # Start training with tool calls and enabled mlflow based trace helping to debug the rollout details + bash examples/sglang_multiturn/run_qwen2.5-3b_gsm8k_tool_agent_mlflow.sh + + # When training is done, start a mlflow server to view trace + mlflow ui -h 0.0.0.0 -p 5000 --backend-store-uri sqlite:////tmp/mlruns.db + + # then you can open http://:5000 from browser to view trace + + +Note: During training, because the model may sometimes fail to generate correct toolcall tags, an error message "Failed to decode tool call" will be output to the console, which does not indicate an abnormality in training. + + +Follow :doc:`Rollout trace<../advance/rollout_trace>` to known more about trace feature. + + + +Agent Framework +--------------- + +System Architecture +~~~~~~~~~~~~~~~~~~~ + +.. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/langgraph_agent.png?raw=true + +System Components +~~~~~~~~~~~~~~~~~ + ++--------------------------+-----------------------------------------------------------------------------------------------+ +| Component | Role | ++==========================+===============================================================================================+ +| ChatModel | LLM object of LangChain, used to adapt to the “generate” api provided by AsyncLLMServerManager| ++--------------------------+-----------------------------------------------------------------------------------------------+ +| RectAgentLoop | Agent adaptation layer, which by default supports a naive LangGraph Agentic. | +| | New classes can be derived to support user-defined Agents, and the run function needs to be | +| | implemented to complete Agent calls. | ++--------------------------+-----------------------------------------------------------------------------------------------+ +| AsyncServer | Server, each instance is connected to one DP group of the inference engine. 
| ++--------------------------+-----------------------------------------------------------------------------------------------+ + + +Follow doc "recipe/langgraph_agent/example/README.md" for more details. \ No newline at end of file diff --git a/code/RL_model/verl/verl_train/docs/start/install.rst b/code/RL_model/verl/verl_train/docs/start/install.rst new file mode 100644 index 0000000000000000000000000000000000000000..2686713fbbef85c58da547fca27c42550748a684 --- /dev/null +++ b/code/RL_model/verl/verl_train/docs/start/install.rst @@ -0,0 +1,319 @@ +Installation +============ + +Requirements +------------ + +- **Python**: Version >= 3.10 +- **CUDA**: Version >= 12.8 + +verl supports various backends. Currently, the following configurations are available: + +- **FSDP** and **Megatron-LM** (optional) for training. +- **SGLang**, **vLLM** and **TGI** for rollout generation. + +Choices of Backend Engines +---------------------------- + +1. Training: + +We recommend using **FSDP** backend to investigate, research and prototype different models, datasets and RL algorithms. The guide for using FSDP backend can be found in :doc:`FSDP Workers<../workers/fsdp_workers>`. + +For users who pursue better scalability, we recommend using **Megatron-LM** backend. Currently, we support `Megatron-LM v0.13.1 `_. The guide for using Megatron-LM backend can be found in :doc:`Megatron-LM Workers<../workers/megatron_workers>`. + + +2. Inference: + +For inference, vllm 0.8.3 and later versions have been tested for stability. We recommend turning on env var `VLLM_USE_V1=1` for optimal performance. + +For SGLang, refer to the :doc:`SGLang Backend<../workers/sglang_worker>` for detailed installation and usage instructions. SGLang rollout is under extensive development and offers many advanced features and optimizations. We encourage users to report any issues or provide feedback via the `SGLang Issue Tracker `_. 
+ +For huggingface TGI integration, it is usually used for debugging and single GPU exploration. + +Install from docker image +------------------------- + +Start from v0.6.0, we use vllm and sglang release image as our base image. + +Base Image +:::::::::: + +- vLLM: https://hub.docker.com/r/vllm/vllm-openai +- SGLang: https://hub.docker.com/r/lmsysorg/sglang + +Application Image +::::::::::::::::: + +Upon base image, the following packages are added: + +- flash_attn +- Megatron-LM +- Apex +- TransformerEngine +- DeepEP + +Latest docker file: + +- `Dockerfile.stable.vllm `_ +- `Dockerfile.stable.sglang `_ + +All pre-built images are available in dockerhub: `verlai/verl `_. For example, ``verlai/verl:sgl055.latest``, ``verlai/verl:vllm011.latest``. + +You can find the latest images used for development and ci in our github workflows: + +- `.github/workflows/vllm.yml `_ +- `.github/workflows/sgl.yml `_ + + +Installation from Docker +:::::::::::::::::::::::: + +After pulling the desired Docker image and installing desired inference and training frameworks, you can run it with the following steps: + +1. Launch the desired Docker image and attach into it: + +.. code:: bash + + docker create --runtime=nvidia --gpus all --net=host --shm-size="10g" --cap-add=SYS_ADMIN -v .:/workspace/verl --name verl sleep infinity + docker start verl + docker exec -it verl bash + + +2. If you use the images provided, you only need to install verl itself without dependencies: + +.. code:: bash + + # install the nightly version (recommended) + git clone https://github.com/volcengine/verl && cd verl + pip3 install --no-deps -e . + +[Optional] If you hope to switch between different frameworks, you can install verl with the following command: + +.. 
code:: bash + + # install the nightly version (recommended) + git clone https://github.com/volcengine/verl && cd verl + pip3 install -e .[vllm] + pip3 install -e .[sglang] + + +Install from custom environment +--------------------------------------------- + +We recommend to use docker images for convenience. However, if your environment is not compatible with the docker image, you can also install verl in a python environment. + +.. note:: + + - Dockerfile provides more details than this installation instructions. You can find examples in each Dockerfile, for example `verl0.6-cu128-torch2.8.0-fa2.7.4 Dockerfile.base `_ . + + +Pre-requisites +:::::::::::::: + +For training and inference engines to utilize better and faster hardware support, CUDA/cuDNN and other dependencies are required, +and some of the dependencies are easy to be overridden when installing other packages, +so we put them in the :ref:`Post-installation` step. + +.. note:: + + - The installation steps below are recommended configurations for the latest version of verl. + + If you are trying to customize your own environment, please ignore the strict constraints. + +We need to install the following pre-requisites: + +- **CUDA**: Version >= 12.8 +- **cuDNN**: Version >= 9.10.0 +- **Apex** + +CUDA above 12.8 is recommended to use as the docker image, +please refer to `NVIDIA's official website `_ for other version of CUDA. + +.. 
code:: bash + + # change directory to anywher you like, in verl source code directory is not recommended + wget https://developer.download.nvidia.com/compute/cuda/12.8.1/local_installers/cuda-repo-ubuntu2204-12-8-local_12.8.1-570.124.06-1_amd64.deb + dpkg -i cuda-repo-ubuntu2204-12-8-local_12.8.1-570.124.06-1_amd64.deb + cp /var/cuda-repo-ubuntu2204-12-8-local/cuda-*-keyring.gpg /usr/share/keyrings/ + apt-get update + apt-get -y install cuda-toolkit-12-8 + update-alternatives --set cuda /usr/local/cuda-12-8 + + +cuDNN can be installed via the following command, +please refer to `NVIDIA's official website `_ for other version of cuDNN. + +.. code:: bash + + # change directory to anywher you like, in verl source code directory is not recommended + wget https://developer.download.nvidia.com/compute/cudnn/9.10.2/local_installers/cudnn-local-repo-ubuntu2204-9.10.2_1.0-1_amd64.deb + dpkg -i cudnn-local-repo-ubuntu2204-9.10.2_1.0-1_amd64.deb + cp /var/cudnn-local-repo-ubuntu2204-9.10.2/cudnn-*-keyring.gpg /usr/share/keyrings/ + apt-get update + apt-get -y install cudnn-cuda-12 + +Install dependencies +:::::::::::::::::::: + +.. note:: + + We recommend to use a fresh new conda environment to install verl and its dependencies. + + **Notice that the inference frameworks often strictly limit your pytorch version and will directly override your installed pytorch if not paying enough attention.** + + As a countermeasure, it is recommended to install inference frameworks first with the pytorch they needed. For vLLM, if you hope to use your existing pytorch, + please follow their official instructions + `Use an existing PyTorch installation `_ . + + +1. First of all, to manage environment, we recommend using conda: + +.. code:: bash + + conda create -n verl python==3.12 + conda activate verl + + +2. Then, execute the ``install.sh`` script that we provided in verl: + +.. 
code:: bash + + # Make sure you have activated verl conda env + # If you need to run with megatron + bash scripts/install_vllm_sglang_mcore.sh + # Or if you simply need to run with FSDP + USE_MEGATRON=0 bash scripts/install_vllm_sglang_mcore.sh + + +If you encounter errors in this step, please check the script and manually follow the steps in the script. + +[Optional] NVIDIA Apex is recommended for Megatron-LM training, but it's not needed if you only use FSDP backend. +You can install it via the following command, but notice that this steps can take a very long time. +It is recommended to set the ``MAX_JOBS`` environment variable to accelerate the installation process, +but do not set it too large, otherwise the memory will be overloaded and your machines may hang. + +.. code:: bash + + # change directory to anywher you like, in verl source code directory is not recommended + git clone https://github.com/NVIDIA/apex.git && \ + cd apex && \ + MAX_JOB=32 pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./ + +Install verl +:::::::::::: + +For installing the latest version of verl, the best way is to clone and +install it from source. Then you can modify our code to customize your +own post-training jobs. + +.. code:: bash + + git clone https://github.com/volcengine/verl.git + cd verl + pip install --no-deps -e . + + +Post-installation +::::::::::::::::: + +Please make sure that the installed packages are not overridden during the installation of other packages. + +The packages worth checking are: + +- **torch** and torch series +- **vLLM** +- **SGLang** +- **pyarrow** +- **tensordict** +- **nvidia-cudnn-cu12**: For Magetron backend + +If you encounter issues about package versions during running verl, please update the outdated ones. 
+ + +Install with AMD GPUs - ROCM kernel support +------------------------------------------------------------------ + +When you run on AMD GPUs (MI300) with ROCM platform, you cannot use the previous quickstart to run verl. You should follow the following steps to build a docker and run it. +If you encounter any issues in using AMD GPUs running verl, feel free to contact me - `Yusheng Su `_. + +Find the docker for AMD ROCm: `docker/Dockerfile.rocm `_ +:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: + +.. code-block:: bash + + # Build the docker in the repo dir: + # docker build -f docker/Dockerfile.rocm -t verl-rocm:03.04.2015 . + # docker images # you can find your built docker + FROM rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4 + + # Set working directory + # WORKDIR $PWD/app + + # Set environment variables + ENV PYTORCH_ROCM_ARCH="gfx90a;gfx942" + + # Install vllm + RUN pip uninstall -y vllm && \ + rm -rf vllm && \ + git clone -b v0.6.3 https://github.com/vllm-project/vllm.git && \ + cd vllm && \ + MAX_JOBS=$(nproc) python3 setup.py install && \ + cd .. && \ + rm -rf vllm + + # Copy the entire project directory + COPY . . + + # Install dependencies + RUN pip install "tensordict<0.6" --no-deps && \ + pip install accelerate \ + codetiming \ + datasets \ + dill \ + hydra-core \ + liger-kernel \ + numpy \ + pandas \ + datasets \ + peft \ + "pyarrow>=15.0.0" \ + pylatexenc \ + "ray[data,train,tune,serve]" \ + torchdata \ + transformers \ + wandb \ + orjson \ + pybind11 && \ + pip install -e . --no-deps + +Build the image +:::::::::::::::::::::::: + +.. code-block:: bash + + docker build -t verl-rocm . + +Launch the container +:::::::::::::::::::::::::::: + +.. 
code-block:: bash + + docker run --rm -it \ + --device /dev/dri \ + --device /dev/kfd \ + -p 8265:8265 \ + --group-add video \ + --cap-add SYS_PTRACE \ + --security-opt seccomp=unconfined \ + --privileged \ + -v $HOME/.ssh:/root/.ssh \ + -v $HOME:$HOME \ + --shm-size 128G \ + -w $PWD \ + verl-rocm \ + /bin/bash + +If you do not want to root mode and require assign yourself as the user, +Please add ``-e HOST_UID=$(id -u)`` and ``-e HOST_GID=$(id -g)`` into the above docker launch script. + +verl with AMD GPUs currently supports FSDP as the training engine, vLLM and SGLang as the inference engine. We will support Megatron in the future. diff --git a/code/RL_model/verl/verl_train/docs/start/more_resources.rst b/code/RL_model/verl/verl_train/docs/start/more_resources.rst new file mode 100644 index 0000000000000000000000000000000000000000..aa8cb2a62b46579ee4bef2880d7f62485175495e --- /dev/null +++ b/code/RL_model/verl/verl_train/docs/start/more_resources.rst @@ -0,0 +1,7 @@ +More Resources +============== + +Last updated: 06/30/2025. + +- Introduction to verl (`Slides `_) +- verl Code Walkthrough (`Slides `_, `Talk in Chinese `_) diff --git a/code/RL_model/verl/verl_train/docs/start/multinode.rst b/code/RL_model/verl/verl_train/docs/start/multinode.rst new file mode 100644 index 0000000000000000000000000000000000000000..4dd7d174aa465b966dfa41fff9c5d1fc1de0edff --- /dev/null +++ b/code/RL_model/verl/verl_train/docs/start/multinode.rst @@ -0,0 +1,821 @@ +Multinode Training +================== + +Last updated: 06/10/2025. + +.. _wuxibin89: https://github.com/wuxibin89 + +Author: `Xibin Wu `_, `Yusheng Su `_. + +Option 1: Launch Manually +------------------------------ + +Set up multinode ray cluster +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +1. Start head node with ``ray start --head --dashboard-host=0.0.0.0``, there're 2 address you should care about: + +- GCS address: ``ray start --address=
``, where worker nodes should connect to. +- Dashboard address: ``
:8265``, where you should submit jobs to the cluster. + +.. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/ray/head.png?raw=true + +2. Start worker node with ``ray start --address=
`` you get above. + +.. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/ray/worker.png?raw=true + +3. Now you should see that the cluster has 2 nodes with ``ray status``. + +.. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/ray/status.png?raw=true + +4. Additionally, you can access the dashboard in the browser with the address you get above. + +*Firewall rules may need to be configured to access the dashboard; if there is any trouble, please contact your network administrator.* + +.. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/ray/overview.png?raw=true + +Submit job to ray cluster +~~~~~~~~~~~~~~~~~~~~~~~~~ +1. Submit ray job to cluster with the dashboard address you get above. + +.. code-block:: bash + + ray job submit --address="http://127.0.0.1:8265" \ + --runtime-env=verl/trainer/runtime_env.yaml \ + --no-wait \ + -- \ + python3 -m verl.trainer.main_ppo \ + trainer.n_gpus_per_node=8 \ + trainer.nnodes=2 \ + ... + +.. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/ray/submit.png?raw=true + +2. Then you can check the job status with the following commands: + +- ray job list: list all jobs submitted to the cluster. +- ray job logs : query the logs of the job. +- ray job status : query the status of the job. +- ray job stop : request the job to be stopped. +- ray job list | grep submission_id | grep JobStatus | grep RUNNING | grep -oP 'raysubmit_[^'\''"]+' | head -n 1: get the latest job submission ID of the running job. +- ray job logs --follow: added ``--follow`` parameter to ray job logs command to enable continuous log streaming. + +3. You can also access driver/task/actor logs in ``/tmp/ray/session_latest/logs/``, driver log is ``job-driver-raysubmit_.log``. + +4. We strongly recommend viewing job details from the dashboard in multinode training, because it provides a more structured way to view the job information. + +..
image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/ray/job.png?raw=true +.. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/ray/job_detail.png?raw=true + +Option 2: Launch via SkyPilot on Kubernetes or clouds +------------------------------------------------------ + +.. note:: + Ready-to-use SkyPilot example configurations are available in the `examples/skypilot/ `_ directory: + + - ``verl-ppo.yaml`` - PPO training with GSM8K dataset + - ``verl-grpo.yaml`` - GRPO training with MATH dataset + - ``verl-multiturn-tools.yaml`` - Multi-turn tool usage training + + See the `SkyPilot examples README `_ for detailed usage instructions. + +Step 1: Setup SkyPilot +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +SkyPilot can support different clouds, here we use GCP as example. `install skypilot `_ + +.. code-block:: bash + + conda create -y -n sky python=3.10 + conda activate sky + pip install "skypilot[gcp]" + + conda install -c conda-forge google-cloud-sdk + gcloud init + + # Run this if you don't have a credential file. + # This will generate ~/.config/gcloud/application_default_credentials.json. + gcloud auth application-default login + + # Check if the GCP credential is correctly setup. + sky check gcp + +.. image:: https://github.com/yottalabsai/open-source/blob/main/static/verl/setup_skypilot.png?raw=true + +Step 2: Prepare dataset +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. code-block:: bash + + git clone https://github.com/volcengine/verl.git + cd examples/data_preprocess + python3 gsm8k.py --local_save_dir ~/data/gsm8k + + +Step 3: Submit a job with SkyPilot +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +1. Create a SkyPilot YAML ``verl-cluster.yml`` with the following content: + +.. parsed-literal:: workdir: . will sync all the data in the current dir to the remote cluster. + +.. 
code-block:: yaml + + resources: + accelerators: L4:1 # every node has 1 L4 GPU + image_id: docker:verlai/verl:base-verl0.5-cu126-cudnn9.8-torch2.7.0-fa2.7.4 + memory: 64+ # every node has 64 GB memory + ports: 8265 # expose port for ray dashboard + + num_nodes: 2 # cluster size + + # --------------- Work Directory Synchronization (workdir) --------------- + # Defines the local working directory to be synchronized to the remote cluster. + # Here, '.' means synchronizing the directory where the sky submit command is currently run. + workdir: . + + # --------------- (secrets) --------------- + secrets: + ## your wandb api key ## + WANDB_API_KEY: null + + # --------------- File Mounts/Data Upload (file_mounts) --------------- + # If your dataset (gsm8k folder) is local, it needs to be uploaded to the remote cluster. + file_mounts: + # Remote path (relative to remote user's home directory): Local path + # /remote/dir1/file: /local/dir1/file + data/gsm8k: ~/data/gsm8k + + # --------------- Environment Setup (setup) --------------- + # Commands run on each node of the remote cluster to set up the environment (e.g., install dependencies). These are run directly inside Docker. + setup: | + rm -rf verl + git clone https://github.com/volcengine/verl.git + cd verl + pip3 install -v -e .[vllm] + + # --------------- Run Command (run) --------------- + # The actual task commands to be executed on the remote cluster. + # This script will first start the Ray cluster (different ray start commands are executed on Head and Worker nodes). + # Then, your training script will only be run on the Head node (SKYPILOT_NODE_RANK == 0). + run: | + # Get the Head node's IP and total number of nodes (environment variables injected by SkyPilot). + head_ip=`echo "$SKYPILOT_NODE_IPS" | head -n1` + num_nodes=`echo "$SKYPILOT_NODE_IPS" | wc -l` # Here num_nodes should be equal to 2. 
+ + # login wandb + python3 -c "import wandb; wandb.login(relogin=True, key='$WANDB_API_KEY')" + + # Start Ray based on node role (Head=0, Worker>0). + # This logic is a standard Ray cluster startup script. + if [ "$SKYPILOT_NODE_RANK" == "0" ]; then + # Head node starts Ray Head. + echo "Starting Ray head node..." + # Check if a Ray Head is already running to avoid duplicate starts. + ps aux | grep ray | grep 6379 &> /dev/null || ray start --head --disable-usage-stats \ + --port=6379 \ + --dashboard-host=0.0.0.0 \ + --dashboard-port=8265 + + # Wait for all worker nodes to join the cluster. + while [ $(ray nodes | grep NODE_ID | wc -l) -lt $num_nodes ]; do + echo "Waiting for all nodes to join... ($(ray nodes | grep NODE_ID | wc -l)/$num_nodes)" + sleep 5 + done + + # Head node executes the training script. + echo "Executing training script on head node..." + + python3 -m verl.trainer.main_ppo \ + data.train_files=data/gsm8k/train.parquet \ + data.val_files=data/gsm8k/test.parquet \ + data.train_batch_size=256 \ + data.max_prompt_length=512 \ + data.max_response_length=256 \ + actor_rollout_ref.model.path=Qwen/Qwen2.5-0.5B-Instruct \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.ppo_mini_batch_size=64 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=1 \ + actor_rollout_ref.rollout.name=vllm \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \ + critic.optim.lr=1e-5 \ + critic.model.path=Qwen/Qwen2.5-0.5B-Instruct \ + critic.ppo_micro_batch_size_per_gpu=4 \ + algorithm.kl_ctrl.kl_coef=0.001 \ + trainer.logger=['console','wandb'] \ + trainer.val_before_train=False \ + trainer.default_hdfs_dir=null \ + trainer.n_gpus_per_node=1 \ + trainer.nnodes=2 \ + trainer.save_freq=20 \ + trainer.test_freq=20 \ + trainer.total_epochs=2 \ + 
trainer.project_name=verl_examples \ + trainer.experiment_name=experiment_name_gsm8k + + else + # Wait for Ray Head to start. + sleep 10 # Increase waiting time to ensure Head finishes starting. + # Worker node starts Ray Worker. + echo "Starting Ray worker node..." + + # Check if a Ray Worker is already running to avoid duplicate starts. + ps aux | grep ray | grep $head_ip:6379 &> /dev/null || ray start --address $head_ip:6379 --disable-usage-stats + + # Add sleep to after `ray start` to give ray enough time to daemonize + sleep 5 # Ensure Worker successfully connects to Head. + fi + + # No commands are added to the Worker node here; the Worker's main task is to start Ray and wait for the Head node to assign tasks. + echo "Node setup and Ray start script finished for rank $SKYPILOT_NODE_RANK." + + +.. code-block:: bash + + export WANDB_API_KEY= + sky launch -c verl --secret WANDB_API_KEY verl-cluster.yml + +.. image:: https://github.com/yottalabsai/open-source/blob/main/static/verl/running_job.png?raw=true +.. image:: https://github.com/yottalabsai/open-source/blob/main/static/verl/running_job_1.png?raw=true +.. image:: https://github.com/yottalabsai/open-source/blob/main/static/verl/finished.png?raw=true + +**Check the cluster on GCP** + +.. image:: https://github.com/yottalabsai/open-source/blob/main/static/verl/gcp_instances.png?raw=true + +**Check Ray Dashboard** + +We can see the cluster on the RAY Dashboard with the GCP head node: + +```console +$ sky status --endpoint 8265 verl +1.2.3.4:8265 +``` + +.. image:: https://github.com/yottalabsai/open-source/blob/main/static/verl/ray_dashboard_overview.png?raw=true +.. image:: https://github.com/yottalabsai/open-source/blob/main/static/verl/ray_dashboard_jobs.png?raw=true +.. image:: https://github.com/yottalabsai/open-source/blob/main/static/verl/ray_dashboard_cluster.png?raw=true + + +**Check the checkpoint of model** + +.. code-block:: bash + + # login the head node + ssh verl + # The global step will vary. 
Find the correct path from the training logs. + cd ~/sky_workdir/checkpoints/verl_examples/gsm8k/ + # Then list contents to find the checkpoint, e.g.: + ls -R . + +.. image:: https://github.com/yottalabsai/open-source/blob/main/static/verl/saved_model.png?raw=true + + +Option 3: Launch via Slurm +------------------------------ + +Ray provides users with `this `_ official +tutorial to start a Ray cluster on top of Slurm. We have verified the :doc:`GSM8K example<../examples/gsm8k_example>` +on a Slurm cluster under a multi-node setting with the following steps. + +1. [Optional] If your cluster support `Apptainer or Singularity `_ and you wish +to use it, convert verl's Docker image to an Apptainer image. Alternatively, set up the environment with the package +manager available on your cluster or use other container runtimes (e.g. through `Slurm's OCI support `_) available to you. + +.. code:: bash + + apptainer pull /your/dest/dir/vemlp-th2.4.0-cu124-vllm0.6.3-ray2.10-te1.7-v0.0.3.sif docker://verlai/verl:vemlp-th2.4.0-cu124-vllm0.6.3-ray2.10-te1.7-v0.0.3 + +2. Follow :doc:`GSM8K example<../examples/gsm8k_example>` to prepare the dataset and model checkpoints. + +3. Modify `examples/slurm/ray_on_slurm.slurm `_ with your cluster's own information. + +4. Submit the job script to the Slurm cluster with `sbatch`. + +Please note that Slurm cluster setup may vary. If you encounter any issues, please refer to Ray's +`Slurm user guide `_ for common caveats. + +If you changed Slurm resource specifications, please make sure to update the environment variables in the job script if necessary. + + +Option 4: Launch via dstack +------------------------------ + +`dstackai/dstack `_ is an open-source container orchestrator that simplifies distributed training across cloud providers and on-premises environments +without the need to use K8S or Slurm. + +Prerequisite +~~~~~~~~~~~~ +Once dstack is `installed `_, initialize the directory as a repo with ``dstack init``. + +.. 
code-block:: bash + + mkdir myproject && cd myproject + dstack init + +**Create a fleet** + +Before submitting distributed training jobs, create a `dstack` `fleet `_. + +Run a Ray cluster task +~~~~~~~~~~~~~~~~~~~~~~ + +Once the fleet is created, define a Ray cluster task, e.g. in ``ray-cluster.dstack.yml``: + +.. code-block:: yaml + + type: task + name: ray-verl-cluster + + nodes: 2 + + env: + - WANDB_API_KEY + - PYTHONUNBUFFERED=1 + - CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + + image: verlai/verl:app-verl0.6-transformers4.56.1-sglang0.5.2-mcore0.13.0-te2.2 + commands: + - git clone https://github.com/volcengine/verl + - cd verl + - pip install --no-deps -e . + - pip install hf_transfer hf_xet + - | + if [ $DSTACK_NODE_RANK = 0 ]; then + python3 examples/data_preprocess/gsm8k.py --local_save_dir ~/data/gsm8k + python3 -c "import transformers; transformers.pipeline('text-generation', model='Qwen/Qwen2.5-7B-Instruct')" + ray start --head --port=6379; + else + ray start --address=$DSTACK_MASTER_NODE_IP:6379 + fi + + # Expose Ray dashboard port + ports: + - 8265 + + resources: + gpu: 80GB:8 + shm_size: 128GB + + # Save checkpoints on the instance + volumes: + - /checkpoints:/checkpoints + +Now, if you run this task via `dstack apply`, it will automatically forward the Ray's dashboard port to `localhost:8265`. + +.. code-block:: bash + + dstack apply -f ray-cluster.dstack.yml + +As long as the `dstack apply` is attached, you can use `localhost:8265` to submit Ray jobs for execution + +Submit Ray jobs +~~~~~~~~~~~~~~~ + +Before you can submit Ray jobs, ensure to install `ray` locally: + +.. code-block:: shell + + pip install ray + +Now you can submit the training job to the Ray cluster which is available at ``localhost:8265``: + +.. 
code-block:: shell + + $ export RAY_ADDRESS=http://localhost:8265 + $ ray job submit \ + -- python3 -m verl.trainer.main_ppo \ + data.train_files=/root/data/gsm8k/train.parquet \ + data.val_files=/root/data/gsm8k/test.parquet \ + data.train_batch_size=256 \ + data.max_prompt_length=512 \ + data.max_response_length=256 \ + actor_rollout_ref.model.path=Qwen/Qwen2.5-7B-Instruct \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.ppo_mini_batch_size=64 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=1 \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \ + critic.optim.lr=1e-5 \ + critic.model.path=Qwen/Qwen2.5-7B-Instruct \ + critic.ppo_micro_batch_size_per_gpu=4 \ + algorithm.kl_ctrl.kl_coef=0.001 \ + trainer.project_name=ppo_training \ + trainer.experiment_name=qwen-2.5-7B \ + trainer.val_before_train=False \ + trainer.n_gpus_per_node=8 \ + trainer.nnodes=2 \ + trainer.default_local_dir=/checkpoints \ + trainer.save_freq=10 \ + trainer.test_freq=10 \ + trainer.resume_mode=disable \ + trainer.total_epochs=15 2>&1 | tee verl_demo.log + + +For more details on how `dstack` works, check out its `documentation `_. + +How to debug? +--------------------- + + +Ray Distributed Debugger VSCode Extension (Recommended) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +1. Starting with Ray 2.39, Anyscale has introduced the `Ray Distributed Debugger `_ VSCode extension. Follow the extension’s installation instructions, then add your cluster using the dashboard URL you obtained earlier. + + .. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/ray/debugger.png?raw=true + :alt: Ray Distributed Debugger VSCode extension screenshot + +2. Prerequisites. 
+ + Ensure the following are installed (see the extension README for more detail): + + - Visual Studio Code + - `ray[default]` >= 2.9.1 + - `debugpy` >= 1.8.0 + + .. image:: https://github.com/aoshen524/verl/blob/main/docs/start/c7098b755ff689859837773a916c857.png?raw=true + :alt: VSCode with Ray prerequisites + +3. Environment Variables. + + To enable post‑mortem debugging, set: + + .. code-block:: bash + + export RAY_DEBUG_POST_MORTEM=1 + + .. admonition:: Note + :class: important + + Be sure to remove any legacy flags before starting Ray: + + - `RAY_DEBUG=legacy` + - `--ray-debugger-external` + +4. Configuring Breakpoints. Set up breakpoint() in your code, and submit the job to the cluster. Then the extension will show the breakpoint information. + + + 1. Insert `breakpoint()` calls into your remote functions. + 2. Submit your job to the cluster. + + The extension will detect active breakpoints and display them in VSCode. + + .. image:: https://github.com/aoshen524/verl/blob/main/docs/start/4ddad74395c79a1402331c0ce73316f.png?raw=true + :alt: Detected breakpoint in VSCode + + **Note:** Breakpoints are only supported inside functions decorated with `@ray.remote`. + +5. Launching the Debugger. + + Run your job directly from the command line (do not use a `launch.json`): + + .. code-block:: bash + + python job.py + +6. Attaching to a Breakpoint. + + Once the process hits the first `breakpoint()`, click the Ray Distributed Debugger icon in the VSCode sidebar to attach the debugger. + + .. image:: https://github.com/aoshen524/verl/blob/main/docs/start/4ddad74395c79a1402331c0ce73316f.png?raw=true + :alt: Attaching VSCode debugger to Ray process + +7. Debugging With Multiple breakpoint(). + + For each subsequent task, first disconnect the current debugger session, then click the extension icon again to attach to the next breakpoint. + + .. 
image:: https://github.com/aoshen524/verl/blob/main/docs/start/6e83c910a62c82fecb89c6619e001cd.png?raw=true + :alt: Disconnecting and reconnecting the debugger + +Legacy Ray Debugger +~~~~~~~~~~~~~~~~~~~ +1. Ray has a builtin legacy `debugger `_ that allows you to debug your distributed applications. To enable debugger, start ray cluster with ``RAY_DEBUG=legacy`` and ``--ray-debugger-external``. + +.. code-block:: bash + + # start head node + RAY_DEBUG=legacy ray start --head --dashboard-host=0.0.0.0 --ray-debugger-external + # start worker node + RAY_DEBUG=legacy ray start --address='10.124.46.192:6379' --ray-debugger-external + +2. Set up breakpoint in your code, and submit job to cluster. Then run ``ray debug`` to wait breakpoint: + +.. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/ray/legacy.png?raw=true + + +Multi-node training on AMD clusters +--------------------------------------------------------------------------------------- + +If you want to run multi-node training with slurm with Docker/Podman container on AMD Cluster, you can use the following script. + +If you encounter any issues in using AMD GPUs running verl, please contact `Yusheng Su `_. + +.. note:: + 1. You need to use ``podman`` or ``docker`` in the following script. We will release the apptainer script later. + 2. If you want to use ``podman``, you just replace ``docker`` with ``podman`` in the following script. + +The script includes the following steps: + +1. SLURM Configuration +2. Environment Setup +3. Docker/Podman Container Setup +4. Ray Cluster Initialization +5. Data Preprocessing +6. Model Setup +7. Training Launch + + +slurm_script.sh +~~~~~~~~~~~~~~~~~~~~ + +.. 
code-block:: bash + + #!/bin/bash + + #SBATCH --job-name=verl-ray-on-slurm + #SBATCH --nodes=2 + #SBATCH --ntasks-per-node=2 + #SBATCH --mem=200G + #SBATCH --time=30-00:00:00 + #SBATCH --gpus-per-node=8 + #SBATCH --cpus-per-task=28 + #SBATCH --output=../verl_log/slurm-%j.out + #SBATCH --error=../verl_log/slurm-%j.err + #SBATCH --nodelist=gpu-[0,1] + + + # load necessary modules + ### Run this setup + # [Cluster]: Use docker + # docker pull docker.io/rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4 + + + ########################################################################## + ###The following setting should be set in different project and cluster### + ########################################################################## + + ### Project + CONTAINER_NAME="multinode_verl_training" + IMG="verl.rocm" + DOCKERFILE="docker/Dockerfile.rocm" + # echo $PWD + verl_workdir="${HOME}/projects/verl_upstream" + export TRANSFORMERS_CACHE="${HOME}/.cache/huggingface" + export HF_HOME=$TRANSFORMERS_CACHE + + ### Cluster Network Setting + export NCCL_DEBUG=TRACE + export GPU_MAX_HW_QUEUES=2 + export TORCH_NCCL_HIGH_PRIORITY=1 + export NCCL_CHECKS_DISABLE=1 + # export NCCL_IB_HCA=rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7 + export NCCL_IB_HCA=mlx5_0,mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_8,mlx5_9 + export NCCL_IB_GID_INDEX=3 + export NCCL_CROSS_NIC=0 + export CUDA_DEVICE_MAX_CONNECTIONS=1 + export NCCL_PROTO=Simple + export RCCL_MSCCL_ENABLE=0 + export TOKENIZERS_PARALLELISM=false + export HSA_NO_SCRATCH_RECLAIM=1 + ########################################################################## + + ### For rocm and training script + export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + export ROCR_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES + export CUDA_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES + + + # Build and launch the Docker container + srun bash -c " + # Exit on any error + set -e + + # Clean up dangling images (images with tag) + docker image prune -f + + # Need to pull the docker 
first + docker pull docker.io/rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4 + + if ! docker images --format "{{.Repository}}:{{.Tag}}" | grep -q "${IMG}"; then + echo \"Building ${IMG} image...\" + docker build -f \"${DOCKERFILE}\" -t \"${IMG}\" . + else + echo \"${IMG} image already exists, skipping build\" + fi + + # Removing old container if exists + docker rm \"${CONTAINER_NAME}\" 2>/dev/null || true + + # Checking network devices + ibdev2netdev + + # Launch the docker + docker run --rm -d \ + -e HYDRA_FULL_ERROR=1 \ + -e HIP_VISIBLE_DEVICES=${HIP_VISIBLE_DEVICES} \ + -e ROCR_VISIBLE_DEVICES=${ROCR_VISIBLE_DEVICES} \ + -e CUDA_VISIBLE_DEVICES=${CUDA_VISIBLE_DEVICES} \ + -e NCCL_DEBUG=${NCCL_DEBUG} \ + -e GPU_MAX_HW_QUEUES=${GPU_MAX_HW_QUEUES} \ + -e TORCH_NCCL_HIGH_PRIORITY=${TORCH_NCCL_HIGH_PRIORITY} \ + -e NCCL_CHECKS_DISABLE=${NCCL_CHECKS_DISABLE} \ + -e NCCL_IB_HCA=${NCCL_IB_HCA} \ + -e NCCL_IB_GID_INDEX=${NCCL_IB_GID_INDEX} \ + -e NCCL_CROSS_NIC=${NCCL_CROSS_NIC} \ + -e CUDA_DEVICE_MAX_CONNECTIONS=${CUDA_DEVICE_MAX_CONNECTIONS} \ + -e NCCL_PROTO=${NCCL_PROTO} \ + -e RCCL_MSCCL_ENABLE=${RCCL_MSCCL_ENABLE} \ + -e TOKENIZERS_PARALLELISM=${TOKENIZERS_PARALLELISM} \ + -e HSA_NO_SCRATCH_RECLAIM=${HSA_NO_SCRATCH_RECLAIM} \ + -e TRANSFORMERS_CACHE=${TRANSFORMERS_CACHE} \ + -e HF_HOME=${HF_HOME} \ + --network host \ + --device /dev/dri \ + --device /dev/kfd \ + --device /dev/infiniband \ + --group-add video \ + --cap-add SYS_PTRACE \ + --security-opt seccomp=unconfined \ + --privileged \ + -v \${HOME}:\${HOME} \ + -v \${HOME}/.ssh:/root/.ssh \ + -w "${verl_workdir}" \ + --shm-size 128G \ + --name \"${CONTAINER_NAME}\" \ + \"${IMG}\" \ + tail -f /dev/null + + echo \"Container setup completed\" + " + # (Optional): If you do not want to root mode and require assign yuorself as the user + # Please add `-e HOST_UID=$(id -u)` and `-e HOST_GID=$(id -g)` into the above docker launch script. 
+ + + + + + ### Ray launch the nodes before training + + # Getting the node names + nodes_array=($(scontrol show hostnames "$SLURM_JOB_NODELIST" | tr '\n' ' ')) + + head_node=${nodes_array[0]} + head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address) + + # if we detect a space character in the head node IP, we'll + # convert it to an ipv4 address. This step is optional. + if [[ "$head_node_ip" == *" "* ]]; then + IFS=' ' read -ra ADDR <<<"$head_node_ip" + if [[ ${#ADDR[0]} -gt 16 ]]; then + head_node_ip=${ADDR[1]} + else + head_node_ip=${ADDR[0]} + fi + echo "IPV6 address detected. We split the IPV4 address as $head_node_ip" + fi + + port=6379 + ip_head=$head_node_ip:$port + export ip_head + echo "IP Head: $ip_head" + + # make sure we set environment variables before Ray initialization + + # Print out all env variables + printenv + + echo "Starting HEAD at $head_node" + srun --nodes=1 --ntasks=1 -w "$head_node" \ + docker exec "${CONTAINER_NAME}" \ + ray start --head --node-ip-address="$head_node_ip" --port=$port \ + --dashboard-port=8266 \ + --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus "${SLURM_GPUS_PER_NODE}" --block & + # optional, though may be useful in certain versions of Ray < 1.0. + sleep 10 + + # number of nodes other than the head node + worker_num=$((SLURM_JOB_NUM_NODES - 1)) + + for ((i = 1; i <= worker_num; i++)); do + node_i=${nodes_array[$i]} + echo "Debug: Starting worker on node_i = ${node_i}" + if [ -z "$node_i" ]; then + echo "Error: Empty node name for worker $i" + continue + fi + echo "Starting WORKER $i at $node_i" + srun --nodes=1 --ntasks=1 -w "$node_i" \ + docker exec "${CONTAINER_NAME}" \ + ray start --address "$ip_head" --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus "${SLURM_GPUS_PER_NODE}" --block & + sleep 5 + done + + + + + # Ray initlization test (See whether any error in the above execution) + echo "Testing Ray initialization in the slurm nodes..." 
+ docker exec "${CONTAINER_NAME}" python3 -c ' + import ray + try: + ray.init(address="auto") + print("\n=== Ray Cluster Status ===") + print(f"Number of nodes: {len(ray.nodes())}") + for node in ray.nodes(): + print("Node: {}, Status: {}".format(node["NodeManagerHostname"], node["Alive"])) + # print(f"Node: {node}") + ray.shutdown() + print("Ray initialization successful!") + except Exception as e: + print(f"Ray initialization failed: {str(e)}") + ' + echo "=== Ray test completed ===" + ###### + + + + # Run data preprocessing + + echo "Starting data preprocessing..." + docker exec "${CONTAINER_NAME}" \ + python3 "examples/data_preprocess/gsm8k.py" "--local_save_dir" "../data/gsm8k" + + echo "Starting data preprocessing..." + docker exec "${CONTAINER_NAME}" \ + python3 "examples/data_preprocess/math_dataset.py" "--local_dir" "../data/math" + + train_files="../data/gsm8k/train.parquet" + val_files="../data/gsm8k/test.parquet" + + # Download and test model + echo "Loading model..." + docker exec "${CONTAINER_NAME}" \ + python3 -c "import transformers; transformers.pipeline('text-generation', model='Qwen/Qwen2-7B-Instruct')" + MODEL_PATH="Qwen/Qwen2-7B-Instruct" + + # Set model path after pipeline test + MODEL_PATH="Qwen/Qwen2.5-0.5B-Instruct" + + echo "== Data and model loading Done ==" + + echo "Start to train..." 
+ + docker exec "${CONTAINER_NAME}" \ + python3 -c "import transformers; transformers.pipeline('text-generation', model='Qwen/Qwen2-7B-Instruct')" + MODEL_PATH="Qwen/Qwen2-7B-Instruct" + + + PYTHONUNBUFFERED=1 srun --overlap --nodes=${SLURM_NNODES} --ntasks=1 -w "$head_node" \ + docker exec "${CONTAINER_NAME}" \ + python3 -m verl.trainer.main_ppo \ + data.train_files=$train_files \ + data.val_files=$val_files \ + data.train_batch_size=1024 \ + data.max_prompt_length=1024 \ + data.max_response_length=1024 \ + actor_rollout_ref.model.path=$MODEL_PATH \ + actor_rollout_ref.model.enable_gradient_checkpointing=False \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.actor.ppo_mini_batch_size=256 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=8 \ + actor_rollout_ref.model.enable_gradient_checkpointing=True \ + actor_rollout_ref.actor.fsdp_config.param_offload=False \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=2 \ + actor_rollout_ref.rollout.name=vllm \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.9 \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \ + actor_rollout_ref.ref.fsdp_config.param_offload=True \ + critic.optim.lr=1e-5 \ + critic.model.use_remove_padding=True \ + critic.model.path=$MODEL_PATH \ + critic.model.enable_gradient_checkpointing=False \ + critic.ppo_micro_batch_size_per_gpu=8 \ + critic.model.fsdp_config.param_offload=False \ + critic.model.fsdp_config.optimizer_offload=False \ + algorithm.kl_ctrl.kl_coef=0.0001 \ + trainer.critic_warmup=0 \ + trainer.logger='["console","wandb"]' \ + trainer.project_name='verl_example' \ + trainer.experiment_name='Qwen2.5-32B-Instruct_function_rm' \ + trainer.n_gpus_per_node=${SLURM_GPUS_PER_NODE} \ + trainer.val_before_train=False \ + trainer.nnodes=${SLURM_NNODES} \ + 
trainer.save_freq=-1 \ + trainer.test_freq=10 \ + trainer.total_epochs=15 + + +Run multi-node training with above slurm_script.sh +~~~~~~~~~~~~~~~~~~~~ +Just sbatch your slurm_script.sh + +.. code-block:: bash + + sbatch slurm_script.sh + diff --git a/code/RL_model/verl/verl_train/docs/start/quickstart.rst b/code/RL_model/verl/verl_train/docs/start/quickstart.rst new file mode 100644 index 0000000000000000000000000000000000000000..c0be6a6b30b4d988eba7aa66cc0a0100476aacef --- /dev/null +++ b/code/RL_model/verl/verl_train/docs/start/quickstart.rst @@ -0,0 +1,151 @@ +.. _quickstart: + +========================================================= +Quickstart: PPO training on GSM8K dataset +========================================================= + +Post-train a LLM using GSM8K dataset. + +Introduction +------------ + +.. _hf_dataset_gsm8k: https://huggingface.co/datasets/openai/gsm8k + +In this example, we train an LLM to tackle the `GSM8k `_ task with function-based rewards. [1]_ + +Prerequisite: + +- the latest version of ``verl`` and its dependencies installed following the installation guide. Using the docker image is recommended. + +- a GPU with at least 24 GB HBM + + +Dataset Introduction +-------------------- + +GSM8k is a math problem dataset. The prompt is an elementary school +problem. The LLM model is asked to solve the math problem. Below is an example: + +Prompt + + Katy makes coffee using teaspoons of sugar and cups of water in the + ratio of 7:13. If she used a total of 120 teaspoons of sugar and cups + of water, calculate the number of teaspoonfuls of sugar she used. 
+ +Solution + + The total ratio representing the ingredients she used to make the + coffee is 7+13 = <<7+13=20>>20 Since the fraction representing the + number of teaspoons she used is 7/20, she used 7/20\ *120 = + <<7/20*\ 120=42>>42 #### 42 + +Step 1: Prepare the dataset +---------------------------- + +We preprocess the dataset in parquet format so that (1) it contains necessary fields for computing RL rewards and (2) is faster to read. + +.. code-block:: bash + + python3 examples/data_preprocess/gsm8k.py --local_save_dir ~/data/gsm8k + +Step 2: Download a model for post-training +------------------------------------------- + +In this example, we start with the ``Qwen2.5-0.5B-Instruct`` model. + +If you want to perform SFT before RL, refer to the :doc:`Complete GSM8K Example<../examples/gsm8k_example>`, the `sft directory `_ and `SFT Trainer `_ for further details. + +.. code-block:: bash + + python3 -c "import transformers; transformers.pipeline('text-generation', model='Qwen/Qwen2.5-0.5B-Instruct')" + +Step 3: Perform PPO training with the instruct model +---------------------------------------------------------------------- + +**Reward Model/Function** + +We use a pre-defined rule-based reward model. We force the model to produce a final +answer following 4 “#” as shown in the solution. We extract the final +answer from both the solution and model's output using regular +expression matching. We assign a reward of 1 to correct +answer, 0.0 to incorrect answer and 0 to no answer. + +For more details, please refer to `verl/utils/reward_score/gsm8k.py `_. + +**Training Script** + +Now let's run PPO training with the dataset and model above. [2]_ + + +Set the ``data.train_files`` ,\ ``data.val_files``, ``actor_rollout_ref.model.path`` and ``critic.model.path`` based on your dataset and model names or paths. +You may set ``VERL_USE_MODELSCOPE=True`` to download models from `modelscope `_ instead of `huggingface `_. + +.. 
code-block:: bash + + PYTHONUNBUFFERED=1 python3 -m verl.trainer.main_ppo \ + data.train_files=$HOME/data/gsm8k/train.parquet \ + data.val_files=$HOME/data/gsm8k/test.parquet \ + data.train_batch_size=256 \ + data.max_prompt_length=512 \ + data.max_response_length=512 \ + actor_rollout_ref.model.path=Qwen/Qwen2.5-0.5B-Instruct \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.ppo_mini_batch_size=64 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \ + actor_rollout_ref.rollout.name=vllm \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=1 \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \ + critic.optim.lr=1e-5 \ + critic.model.path=Qwen/Qwen2.5-0.5B-Instruct \ + critic.ppo_micro_batch_size_per_gpu=4 \ + algorithm.kl_ctrl.kl_coef=0.001 \ + trainer.logger=console \ + trainer.val_before_train=False \ + trainer.n_gpus_per_node=1 \ + trainer.nnodes=1 \ + trainer.save_freq=10 \ + trainer.test_freq=10 \ + trainer.total_epochs=15 2>&1 | tee verl_demo.log + +You are expected to see the following logs, indicating training in progress. The key metric ``val/test_score/openai/gsm8k`` is computed every ``trainer.test_freq`` steps: + +.. 
code-block:: bash + + step:0 - timing/gen:21.470 - timing/ref:4.360 - timing/values:5.800 - actor/reward_kl_penalty:0.000 - actor/reward_kl_penalty_coeff:0.001 - timing/adv:0.109 - timing/update_critic:15.664 - critic/vf_loss:14.947 - critic/vf_clipfrac:0.000 - critic/vpred_mean:-2.056 - critic/grad_norm:1023.278 - critic/lr(1e-4):0.100 - timing/update_actor:20.314 - actor/entropy_loss:0.433 - actor/pg_loss:-0.005 - actor/pg_clipfrac:0.000 - actor/ppo_kl:0.000 - actor/grad_norm:1.992 - actor/lr(1e-4):0.010 - critic/score/mean:0.004 - critic/score/max:1.000 - critic/score/min:0.000 - critic/rewards/mean:0.004 - critic/rewards/max:1.000 - critic/rewards/min:0.000 - critic/advantages/mean:-0.000 - critic/advantages/max:2.360 - critic/advantages/min:-2.280 - critic/returns/mean:0.003 - critic/returns/max:0.000 - critic/returns/min:0.000 - critic/values/mean:-2.045 - critic/values/max:9.500 - critic/values/min:-14.000 - response_length/mean:239.133 - response_length/max:256.000 - response_length/min:77.000 - prompt_length/mean:104.883 - prompt_length/max:175.000 - prompt_length/min:68.000 + step:1 - timing/gen:23.020 - timing/ref:4.322 - timing/values:5.953 - actor/reward_kl_penalty:0.000 - actor/reward_kl_penalty:0.001 - timing/adv:0.118 - timing/update_critic:15.646 - critic/vf_loss:18.472 - critic/vf_clipfrac:0.384 - critic/vpred_mean:1.038 - critic/grad_norm:942.924 - critic/lr(1e-4):0.100 - timing/update_actor:20.526 - actor/entropy_loss:0.440 - actor/pg_loss:0.000 - actor/pg_clipfrac:0.002 - actor/ppo_kl:0.000 - actor/grad_norm:2.060 - actor/lr(1e-4):0.010 - critic/score/mean:0.000 - critic/score/max:0.000 - critic/score/min:0.000 - critic/rewards/mean:0.000 - critic/rewards/max:0.000 - critic/rewards/min:0.000 - critic/advantages/mean:0.000 - critic/advantages/max:2.702 - critic/advantages/min:-2.616 - critic/returns/mean:0.000 - critic/returns/max:0.000 - critic/returns/min:0.000 - critic/values/mean:-2.280 - critic/values/max:11.000 - critic/values/min:-16.000 
- response_length/mean:232.242 - response_length/max:256.000 - response_length/min:91.000 - prompt_length/mean:102.398 - prompt_length/max:185.000 - prompt_length/min:70.000 + +Checkout ``Algorithm Baselines`` page for full training and validation logs for reference. + +The checkpoint is saved at the following dir by default: ``checkpoints/${trainer.project_name}/${trainer.experiment_name}``. You can merge the saved checkpoints to huggingface model using ``verl.model_merger`` module, for example: + +.. code-block:: bash + + python3 -m verl.model_merger merge \ + --backend fsdp \ + --local_dir checkpoints/${trainer.project_name}/${trainer.experiment_name}/global_step_1/actor \ + --target_dir checkpoints/${trainer.project_name}/${trainer.experiment_name}/global_step_1/actor/huggingface + +For more details about checkpoint and model merging, please refer to :ref:`checkpoint-page`. + +To enable ``wandb`` for experiment tracking, set the following configs: + +.. code-block:: bash + + trainer.logger='["console","wandb"]' \ + trainer.project_name=$YOUR_PROJECT_NAME \ + trainer.experiment_name=$YOUR_RUN_NAME \ + +If you encounter out of memory issues with HBM less than 32GB, enable the following configs would help: + +.. code-block:: bash + + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \ + critic.ppo_micro_batch_size_per_gpu=1 \ + +For the full set of configs, please refer to :ref:`config-explain-page` for detailed explanation and performance tuning. + + +.. [1] The original paper (https://arxiv.org/pdf/2110.14168) mainly focuses on training a verifier (a reward model) to solve math problems via Best-of-N sampling. In this example, we train an RL agent using a rule-based reward model. +.. [2] More training script examples for FSDP and Megatron-LM backend are stored in `examples/ppo_trainer `_ directory. 
diff --git a/code/RL_model/verl/verl_train/docs/start/ray_debug_tutorial.rst b/code/RL_model/verl/verl_train/docs/start/ray_debug_tutorial.rst new file mode 100644 index 0000000000000000000000000000000000000000..9e7c87dfaee0c04f24bdb6921717b8068d1ee6a2 --- /dev/null +++ b/code/RL_model/verl/verl_train/docs/start/ray_debug_tutorial.rst @@ -0,0 +1,96 @@ +Ray Debug Tutorial +================== + +Last updated: 04/23/2025 + + +.. _wuxibin89: https://github.com/wuxibin89 + +Author: `Ao Shen `_. + +How to debug? +--------------------- + + +Ray Distributed Debugger VSCode Extension (Recommended) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +1. Starting with Ray 2.39, Anyscale has introduced the `Ray Distributed Debugger `_ VSCode extension. Follow the extension’s installation instructions, then add your cluster using the dashboard URL you obtained earlier. + + .. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/ray/debugger.png?raw=true + :alt: Ray Distributed Debugger VSCode extension screenshot + +2. Prerequisites. + + Ensure the following are installed (see the extension README for more detail): + + - Visual Studio Code + - `ray[default]` >= 2.9.1 + - `debugpy` >= 1.8.0 + + .. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/ray/readme.png?raw=true + :alt: VSCode with Ray prerequisites + +3. Environment Variables. + + To enable post‑mortem debugging, set: + + .. code-block:: bash + + export RAY_DEBUG_POST_MORTEM=1 + + .. admonition:: Note + :class: important + + Be sure to remove any legacy flags before starting Ray: + + - `RAY_DEBUG=legacy` + - `--ray-debugger-external` + +4. Configuring Breakpoints. Set up breakpoint() in your code, and submit the job to the cluster. Then the extension will show the breakpoint information. + + + 1. Insert `breakpoint()` calls into your remote functions. + 2. Submit your job to the cluster. + + The extension will detect active breakpoints and display them in VSCode. 
+ + **Note:** Breakpoints are only supported inside functions decorated with `@ray.remote`. + +5. Launching the Debugger. + + Run your job directly from the command line (do not use a `launch.json`): + + .. code-block:: bash + + python job.py + +6. Attaching to a Breakpoint. + + Once the process hits the first `breakpoint()`, click the Ray Distributed Debugger icon in the VSCode sidebar to attach the debugger. + + .. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/ray/launch.png?raw=true + :alt: Attaching VSCode debugger to Ray process + +7. Debugging With Multiple breakpoint(). + + For each subsequent task, first disconnect the current debugger session, then click the extension icon again to attach to the next breakpoint. + + .. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/ray/disconnect.png?raw=true + :alt: Disconnecting and reconnecting the debugger + +Legacy Ray Debugger +~~~~~~~~~~~~~~~~~~~ +1. Ray has a builtin legacy `debugger `_ that allows you to debug your distributed applications. To enable debugger, start ray cluster with ``RAY_DEBUG=legacy`` and ``--ray-debugger-external``. + +.. code-block:: bash + + # start head node + RAY_DEBUG=legacy ray start --head --dashboard-host=0.0.0.0 --ray-debugger-external + # start worker node + RAY_DEBUG=legacy ray start --address='10.124.46.192:6379' --ray-debugger-external + +2. Set up breakpoint in your code, and submit job to cluster. Then run ``ray debug`` to wait breakpoint: + +.. 
image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/ray/legacy.png?raw=true + diff --git a/code/RL_model/verl/verl_train/docs/workers/fsdp_workers.rst b/code/RL_model/verl/verl_train/docs/workers/fsdp_workers.rst new file mode 100644 index 0000000000000000000000000000000000000000..03bde11376c21be4bd8d83218278dc479700b543 --- /dev/null +++ b/code/RL_model/verl/verl_train/docs/workers/fsdp_workers.rst @@ -0,0 +1,140 @@ +PyTorch FSDP Backend +====================== + +Last updated: 12/01/2025. + +We support PyTorch FSDP Backend by implementing various workers for +actor, critic, reference, rollout and reward models. + +**Pros** + +- Readily support various models. + + - Users only need to implement the corresponding + ``dtensor_weight_loader`` for weight synchronization between FSDP + and vLLM. While for ``hf_weight_loader``, users can directly apply + any models supported both in HF and vLLM without any code change. + +- Easy to organize the forward and backward computation for each model. + +**Cons** + +- Poor scalability when it comes to large-scale models (e.g. Llama 70B + and 405B) +- The resharding overhead between actor and rollout could be larger than + Megatron-LM backend. + +Due to the simplicity, we recommend using FSDP backend for algorithm +research and prototyping. + +FSDP Workers +-------------- + +ActorRolloutRefWorker +^^^^^^^^^^^^^^^^^^^^^ + +Actor/Rollout HybridEngine +'''''''''''''''''''''''''' + +1. HybridEngine, Actor and Rollout initialization API. + +.. code:: python + + @register(dispatch_mode=Dispatch.ONE_TO_ALL) + def init_model(self): + +``ONE_TO_ALL``: when calling the ``init_model`` function from the driver +process, each worker (on a GPU) will execute the following model +initialization process. + +The initialization details of HybridEngine, Actor and Rollout are +highlighted below: + +1. 
``DataParallelPPOActor`` implements the simple PPO computation logics + when the model is built with FSDP, including compute log prob, model + update. +2. ``vLLMRollout`` support generation with vLLM. We modify the vLLM + Engine and make it executed under SPMD to fit into our + ``WorkerGroup`` design. + +See `source code `_. for more information. + +1. Generate sequence and recompute log prob + +.. code:: python + + @register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO) + def generate_sequences(self, prompts: DataProto): + +- ``Dispatch.DP_COMPUTE_PROTO``: The data will be dispatched and + collected along the DP dimension + +- In this function, the rollout model will perform auto-regressive + generation and the actor model will recompute the old log prob for the + generated response. + +3. Update actor model + +.. code:: python + + @register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO) + def update_actor(self, data: DataProto): + +- Update the actor model weight using PPO & entropy loss. + +ReferenceModel +'''''''''''''' + +1. Reference model initialization + +The reference model is initialized using the same function as the actor +model without initializing the HybridEngine and Optimizer. Then the +actor model is also wrapped by the ``DataParallelPPOActor``. + +2. Compute reference log prob + +.. code:: python + + @register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO) + def compute_ref_log_prob(self, data: DataProto): + +- In this function, the reference model will call the compute log prob + function in ``DataParallelPPOActor`` to compute the reference log + prob. + +CriticWorker and RewardWorker +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +1. Model initialization + +Quite similar to reference model. The CriticWorker will perform +additional initialization for the Optimizer. + +2. Compute Values for CriticWorker + +.. code:: python + + @register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO) + def compute_values(self, data: DataProto): + +3. Update Critic + +.. 
code:: python + + @register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO) + def update_critic(self, data: DataProto): + +4. Compute Reward + +.. code:: python + + @register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO) + def compute_rm_score(self, data: DataProto): + + +HybridShard +------------ + +We didn't support FSDP `HybridShard`. To support this, we may need to +construct a 2D device mesh and test the corresponding +``dtensor_weight_loader`` and ``hf_weight_loader`` for each model. diff --git a/code/RL_model/verl/verl_train/docs/workers/megatron_workers.rst b/code/RL_model/verl/verl_train/docs/workers/megatron_workers.rst new file mode 100644 index 0000000000000000000000000000000000000000..91452c7dc51f1d654ca3dc5039ef6d373e23b176 --- /dev/null +++ b/code/RL_model/verl/verl_train/docs/workers/megatron_workers.rst @@ -0,0 +1,276 @@ +Megatron-LM Backend +=================== + +Last updated: 12/01/2025. + +We support Megatron Backend by implementing various workers for actor, +critic, reference, rollout and reward models. We also implement the +``3DHybridEngine`` using Megatron-LM and vLLM/SGLang in +`megatron_vllm.py `_ +and `megatron_sglang.py `_. + +**Pros** + +- Support 5D parallelism (TP, EP, CP, DP, PP) and sequence parallelism + for best scalablility and throughput. +- 3D HybridEngine can significantly reduce peak memory usage and reduce + weight synchronize overhead between actor and rollout. + +**Cons** + +- Huggingface Models and Megatron checkpoints need tools for conversion. + + +Development Progress +-------------------- + + +Note that [Deprecated] means that the feature is not supported in the latest +version of verl. +[To-Optimize] means that the feature is implemented but not optimized yet. +[WIP] means that the feature is working in progress. +[In-Release] means that the feature is ready and in review process, +coming at any time. 
+ + ++---------------+-----------------------------------------------------------+ +| [Deprecated] | Megatron 3D Parallelism with custom models | ++---------------+-----------------------------------------------------------+ +| [Done] | Megatron 0.11.0 ``GPTModel`` support | ++---------------+-----------------------------------------------------------+ +| [Done] | Megatron GRPO support | ++---------------+-----------------------------------------------------------+ +| [Done] | Megatron with vLLM 0.8.2, with per-tensor weights loading | ++---------------+-----------------------------------------------------------+ +| [Done] | Megatron with Context Parallel | ++---------------+-----------------------------------------------------------+ +| [Done] | Qwen2MoE model support | ++---------------+-----------------------------------------------------------+ +| [To-Optimize] | Megatron dist Checkpoint | ++---------------+-----------------------------------------------------------+ +| [To-Optimize] | Huggingface and Megatron Checkpoint Converter | ++---------------+-----------------------------------------------------------+ +| [To-Optimize] | Efficient fused linear, entropy and cross entropy | ++---------------+-----------------------------------------------------------+ +| [Done] | Megatron offload(param, grad, optimizer) | ++---------------+-----------------------------------------------------------+ +| [Done] | Megatron Profiler | ++---------------+-----------------------------------------------------------+ +| [In-Release] | Megatron 0.12.0, TE 2.2 with vLLM 0.8.3 and Fused Attn | ++---------------+-----------------------------------------------------------+ +| [WIP] | Moonlight/DeepSeek-V3 model support | ++---------------+-----------------------------------------------------------+ +| [WIP] | Expert Parallel support | ++---------------+-----------------------------------------------------------+ +| [WIP] | Megatron support dynamic batch size | 
++---------------+-----------------------------------------------------------+ +| [To-Do] | Performance tuning | ++---------------+-----------------------------------------------------------+ +| [MileStone] | Runnable with DeepSeek-V3 671B post-training | ++---------------+-----------------------------------------------------------+ + + + +Utils of Megatron Workers +------------------------- + +MegatronWorker +^^^^^^^^^^^^^^ + +``MegatronWorker`` is the base class of different megatron worker +classes. In this class, the ``get_megatron_global_info`` and +``get_megatron_rank_info`` functions retrieve the 3D parallel world +size and rank of each ``Worker`` running on a specific GPU. This information +will be used in transfer protocol for Megatron Backend. + +The following ``Worker`` class for different models will be utilized to +construct the ``WorkerGroup`` . + +We implement various APIs for each ``Worker`` class decorated by the +``@register(dispatch_mode=)`` . These APIs can be called by the ray +driver process. The data can be correctly collected and dispatched following +the ``dispatch_mode`` on each function. The supported dispatch_mode +(i.e., transfer protocols) can be found in `decorator.py `_. + +ActorRolloutRefWorker +^^^^^^^^^^^^^^^^^^^^^ + +This class is implemented for Actor/Rollout HybridEngine or for the +reference model to initialize their model and perform computation. + +Actor/Rollout HybridEngine +'''''''''''''''''''''''''' + +1. HybridEngine, Actor and Rollout initialization API. + +.. code:: python + + @register(dispatch_mode=Dispatch.ONE_TO_ALL) + def init_model(self): + +``ONE_TO_ALL``: when calling the ``init_model`` function from the driver +process, each worker (on a GPU) will execute the following model +initialization process. + +The initialization details of HybridEngine, Actor and Rollout are +highlighted below: + +1.
``MegatronPPOActor`` implements the simple PPO computation logics + when the model is built with Megatron, including compute log prob, + model update. +2. ``vLLMRollout`` support generation with vLLM. We modify the vLLM + Engine and make it executed under SPMD to fit into our + ``WorkerGroup`` design. + +See `source code `_ for more information. + +.. code:: python + + # build actor model + self.actor = MegatronPPOActor(config=self.config.actor, + model_config=self.actor_model_config, + megatron_config=megatron_config, + actor_module=self.actor_module, + actor_optimizer=self.actor_optimizer, + actor_optimizer_config=self.actor_optim_config) + + # build rollout + # rollout initialization + rollout = vLLMRollout(actor_module=params, + config=self.config.rollout, + tokenizer=self.tokenizer, + model_hf_config=self.actor_model_config, + train_tp=mpu.get_tensor_model_parallel_world_size()) + ... + +1. Generate sequence and recompute log prob + +.. code:: python + + @register(dispatch_mode=Dispatch.MEGATRON_PP_AS_DP_PROTO) + def generate_sequences(self, prompts: DataProto): + +- ``Dispatch.MEGATRON_PP_AS_DP_PROTO``: The PP dimension of the actor + model will be regarded as DP dimension. Then the driver process will + dispatch and collect the data according to this reorganization. This + is because, in HybridEngine, the actor weight, which usually applied + larger 3D parallel sizes, will be gathered along the PP dimension and + TP dimension. Therefore, the corresponding data should be dispatched + and collected through the 3D parallel group of the rollout model, + rather than the actor model. However, the world_size and rank + information can only be retrieved from ``get_megatron_global_info`` and + ``get_megatron_rank_info``, which records the 3D information for the + actor model. Moreover, the data resharding inside TP dimension will be + processed within the HybridEngine. 
+ +- In this function, the rollout model will perform auto-regressive + generation and the actor model will recompute the old log prob for the + generated response. + +2. Update actor model + +.. code:: python + + @register(dispatch_mode=Dispatch.MEGATRON_COMPUTE_PROTO) + def update_actor(self, data: DataProto): + +- ``Dispatch.MEGATRON_COMPUTE_PROTO``: User passes the data partitioned + by DP dimension. The data is dispatched to all tp/pp ranks within the + same dp group, and ultimately only collects output data from tp=0 and + the last pp. +- Update the actor model weight using PPO & entropy loss. + + +.. note:: + + Currently, training Tensor Parallel Size can be different from inference + Tensor Parallel Size. + + +ReferenceModel +'''''''''''''' + +1. Reference model initialization + +The reference model is initialized using the same function as the actor +model without initializing the HybridEngine and Optimizer. Then the +actor model is also wrapped by the ``MegatronPPOActor``. + +2. Compute reference log prob + +.. code:: python + + @register(dispatch_mode=Dispatch.MEGATRON_COMPUTE_PROTO) + def compute_ref_log_prob(self, data: DataProto): + +- In this function, the reference model will call the compute log prob + function in ``MegatronPPOActor`` to compute the reference log prob. + +CriticWorker and RewardWorker +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +1. Model initialization + +Quite similar to reference model. The CriticWorker will perform +additional initialization for the Optimizer. + +2. Compute Values for CriticWorker + +.. code:: python + + @register(dispatch_mode=Dispatch.MEGATRON_COMPUTE_PROTO) + def compute_values(self, data: DataProto): + +3. Update Critic + +.. code:: python + + @register(dispatch_mode=Dispatch.MEGATRON_COMPUTE_PROTO) + def update_critic(self, data: DataProto): + +4. Compute Reward + +..
code:: python + + @register(dispatch_mode=Dispatch.MEGATRON_COMPUTE_PROTO) + def compute_rm_score(self, data: DataProto): + + +Utils of Train Optimization +--------------------------- + +Offload +^^^^^^^ +When resources are tight, the offload method can lower GPU memory +usage, helping training and inference frameworks work well under verl. +It moves parameters, gradients, and optimizers to CPU memory and only +loads them back to the GPU when needed. + +If you want to use the offload, you can add the following parameters +for the actor and ref separately. + +.. code:: python + + # For the actor + actor_rollout_ref.actor.megatron.param_offload=True \ + actor_rollout_ref.actor.megatron.grad_offload=True \ + actor_rollout_ref.actor.megatron.optimizer_offload=True \ + # For the ref w/o grad and optimizer + actor_rollout_ref.ref.megatron.param_offload=True \ + + +For the critic, you can include these parameters. + +.. code:: python + + # For the critic + critic.megatron.param_offload=True \ + critic.megatron.grad_offload=True \ + critic.megatron.optimizer_offload=True \ + + +Related MCore Document +---------------------- + +There is also a detailed document of using MCore to train different +kinds of models, please refer to `MCore Document `_. diff --git a/code/RL_model/verl/verl_train/docs/workers/model_engine.rst b/code/RL_model/verl/verl_train/docs/workers/model_engine.rst new file mode 100644 index 0000000000000000000000000000000000000000..6642242bc3cde037ace437927fcf5da1dadb7b3e --- /dev/null +++ b/code/RL_model/verl/verl_train/docs/workers/model_engine.rst @@ -0,0 +1,125 @@ +Model Engine +============ + +.. _vermouth: https://github.com/vermouth1992 + +Author: `Chi Zhang `_ + +Last updated: 09/25/2025. 
+ +Current Support Matrix +---------------------- + ++----------+-----------+--------------+-------------+--------------------------+ +| Backends | Model | Scalability | Model | Pain points | +| | Supported | | Definition | | +| | | | | | ++==========+===========+==============+=============+==========================+ +| FSDP | Day 1 | - Dense is OK| Huggingface | Monkey patch can be | +| + | support | | + monkey | easily impacted by | +| ulysses | HF model | - MoE is bad | patch | transformers version | ++----------+-----------+--------------+-------------+--------------------------+ +| MCore | Limited | Best | GPTModel | Supporting new models is | +| | | | (One model | difficult | +| | | | for all) | | ++----------+-----------+--------------+-------------+--------------------------+ + +- We monkey patch attention function to support ulysses +- We monkey patch VLM models to support FSDP with mixed data with and + without images + +Class Hierarchy +--------------- + +Note that all the workers and trainers run in **SPMD** mode. SFT/DPO/RM +trainer is directly invoked by ``torchrun``. The Actor/Critic worker can +also be invoked by a RayWorkerGroup and provides APIs to a single +controller. + +- Base Engine level: implement model init, optimizer init, lr scheduler + init, sharding, checkpoint manager. +- Full Engine level: subclass base engine and implement + ``forward_step``. +- Worker/SPMD trainer level: **engine agnostic**, implement training + logics using abstract engine APIs + +RL trainer utilizes workers to construct HybridFlow program. This is out +of the scope of model engine. 
+ +Existing Model Types +-------------------- + +========== ====================== ====================== +Model type Language model Value model +========== ====================== ====================== +Input text/image/video/audio text/image/video/audio +Output logits for next token logits as value +========== ====================== ====================== + +Currently, we have two model types: language model and value model. We +expect to expand the category to include Qwen-Omni family (output both +text and audio) and VLA models. + +Data Format +----------- + +Currently, verl adopts left-right padding data format in RL trainer. +This creates massive padding when the discrepancy between response +length is large. We will start to implement no-padding format throughout +the whole system. + +.. image:: https://github.com/vermouth1992/verl-data/blob/master/images/data_format.png?raw=true + :alt: Data Format + +Here is the migration plan: +- Implement no-padding format in engine +- Add a transformation layer in Actor/Critic worker. +- Replace Actor/Critic Worker in RL trainer +- Implement no-padding throughput system + +Checkpoint System +----------------- + +.. image:: https://github.com/vermouth1992/verl-data/blob/master/images/verl-ckpt.png?raw=true + :alt: Model Engine Checkpoint System + +The engine constructs the model using huggingface config, then load +weights from huggingface checkpoint. If the engine directly uses +huggingface model definition, it can use function provided by +``transformers``. Otherwise, each engine has to write their own +checkpoint load logic (e.g., +`mbridge `__). During model +training, each engine has to implement save_checkpoint and +load_checkpoint that save/load intermediate sharded checkpoint including +model, optimizer and lr scheduler states. Each engine has to implement a +checkpoint merge script, that merges the intermediate sharded checkpoint +back to huggingface format. 
+ +API +--- + +A tentative model engine API can be found: +https://github.com/volcengine/verl/blob/main/verl/workers/engine/base.py#L24 + +Extension +--------- + +Add a new backend +~~~~~~~~~~~~~~~~~ + +- Start a new folder under ``verl/workers/engine``. Then, implement + ``transformer_impl.py``. If you want to implement a non-transformer + model, please contact us in advance. +- Add the engine config to the GSM8k SFT trainer script: + https://github.com/volcengine/verl/blob/main/tests/special_e2e/sft/run_sft_engine_gsm8k.sh +- Invoke the tests with your backend: + https://github.com/volcengine/verl/blob/main/tests/special_e2e/sft/test_sft_engine_all.sh. + This test script will run various backends and various + configurations, and compare the loss and grad norm of the first step + to make sure they are close. + +Add a new model type +~~~~~~~~~~~~~~~~~~~~ + +- This is mainly reserved for models whose output is not just text + (e.g., Qwen3-Omni). Please discuss with us before you proceed. diff --git a/code/RL_model/verl/verl_train/docs/workers/ray_trainer.rst b/code/RL_model/verl/verl_train/docs/workers/ray_trainer.rst new file mode 100644 index 0000000000000000000000000000000000000000..9c482d39a4223ca292029325db3d064a417c9ba1 --- /dev/null +++ b/code/RL_model/verl/verl_train/docs/workers/ray_trainer.rst @@ -0,0 +1,241 @@ +PPO Ray Trainer +=============== + +Last updated: 02/12/2025. + +We implement the RayPPOTrainer, which is a trainer that runs on the driver +process on a single CPU/GPU node (default is CPU). + +The PPORayTrainer includes 3 core functions for data preparation, +WorkerGroup initialization and PPO training loop. + +Data Preparation +---------------- + +The ``PPORayTrainer``, as a single process, is responsible for loading a +complete batch of samples (prompts) from the dataset and then dispatching +them to different worker_groups running on different GPUs.
+ +To generalize the data loading, we implement the ``RLHFDataset`` class +to load the preprocessed parquet files, apply chat templates to the +prompts, add padding, truncate prompts that exceed max prompt length and +then tokenize. + +.. code:: python + + self.train_dataset = RLHFDataset(data_files=self.config.data.train_files, + tokenizer=self.tokenizer, + config=self.config.data) + +Then, the dataloader will iterate the dataset under PPO mini batch size. + +WorkerGroup Initialization +-------------------------- + +We first introduce a basic implementation of initializing the +``WorkerGroup`` of the actor model on a given set of GPUs. + +.. code:: python + + # max_colocate_count means the number of WorkerGroups (i.e. processes) in each RayResourcePool + # For FSDP backend, we recommend using max_colocate_count=1 that merges all WorkerGroups into one. + # For Megatron backend, we recommend using max_colocate_count>1 that can utilize different WorkerGroup for different models + resource_pool = RayResourcePool(process_on_nodes=[config.trainer.n_gpus_per_node] * config.trainer.nnodes, + use_gpu=True, + max_colocate_count=1) + # define actor rollout cls to be init on remote + actor_rollout_cls = RayClassWithInitArgs(cls=ActorRolloutWorker) + # define actor_rollout worker group + actor_rollout_worker_group = MegatronRayWorkerGroup(resource_pool=resource_pool, + ray_cls_with_init=actor_rollout_cls, + default_megatron_kwargs=config.actor_rollout.megatron) + +Different WorkerGroups, like ``actor_rollout_worker_group`` , +``critic_worker_group`` and ``ref_worker_group`` lie on separate +processes in the above implementation. + +The driver process can then call the distributed compute function within +the ``actor_rollout_worker_group`` and other roles to construct the RL +training loop. + +For models colocated in the same set of GPUs, we further provide a +fine-grained optimization, which merges the ``worker_group`` of different roles +in the same process.
This optimization can save the redundant +CUDA/distributed context in different processes. + +.. code:: python + + # initialize WorkerGroup + # NOTE: if you want to use a different resource pool for each role, which can support different parallel size, + # you should not use `create_colocated_worker_cls`. Instead, directly pass different resource pool to different worker groups. + # See TODO(url) for more information. + all_wg = {} + for resource_pool, class_dict in self.resource_pool_to_cls.items(): + worker_dict_cls = create_colocated_worker_cls(class_dict=class_dict) + wg_dict = self.ray_worker_group_cls(resource_pool=resource_pool, ray_cls_with_init=worker_dict_cls) + spawn_wg = wg_dict.spawn(prefix_set=class_dict.keys()) + all_wg.update(spawn_wg) + + if self.use_critic: + self.critic_wg = all_wg['critic'] + self.critic_wg.init_model() + + if self.use_reference_policy: + self.ref_policy_wg = all_wg['ref'] + self.ref_policy_wg.init_model() + + if self.use_rm: + self.rm_wg = all_wg['rm'] + self.rm_wg.init_model() + + # we should create rollout at the end so that vllm can have a better estimation of kv cache memory + self.actor_rollout_wg = all_wg['actor_rollout'] + self.actor_rollout_wg.init_model() + +.. note:: For megatron backend, if we merge the ``worker_groups`` into the same processes, all the roles will utilize the same 3D parallel size. To optimize this, we may need to maintain several 3D process groups for each role in the same distributed context. If you want to use different 3D parallel size for different roles, please follow the similar architecture of the first code block to initialize each role's ``worker_group`` + + +PPO Training Loop +----------------- + +We implement the PPO training loop by calling the functions in +worker_group of each role. The input and output data of each function is +a ``DataProto`` object implemented in `protocol.py `_. 
In the training +loop, trainer will dispatch/collect the data to/from different GPUs +following the transfer protocols wrapped in the workers' functions. The +computation of PPO micro batches is processed in ``update_actor`` and +``update_critic`` functions. + +To extend to other RLHF algorithms, such as DPO, GRPO, please refer to +:doc:`../advance/dpo_extension`. + +.. code:: python + + def fit(self): + """ + The training loop of PPO. + The driver process only need to call the compute functions of the worker group through RPC to construct the PPO dataflow. + The light-weight advantage computation is done on the driver process. + """ + from verl.utils.tracking import Tracking + from omegaconf import OmegaConf + + logger = Tracking(project_name=self.config.trainer.project_name, + experiment_name=self.config.trainer.experiment_name, + default_backend=self.config.trainer.logger, + config=OmegaConf.to_container(self.config, resolve=True)) + + global_steps = 0 + + # perform validation before training + # currently, we only support validation using the reward_function. 
+ if self.val_reward_fn is not None: + val_metrics = self._validate() + pprint(f'Initial validation metrics: {val_metrics}') + + for epoch in range(self.config.trainer.total_epochs): + for batch_dict in self.train_dataloader: + metrics = {} + + batch: DataProto = DataProto.from_single_dict(batch_dict) + # batch = batch.to('cuda') + + # pop those keys for generation + gen_batch = batch.pop(batch_keys=['input_ids', 'attention_mask', 'position_ids']) + + # generate a batch + with Timer(name='gen', logger=None) as timer: + gen_batch_output = self.actor_rollout_wg.generate_sequences(gen_batch) + metrics['timing/gen'] = timer.last + + batch = batch.union(gen_batch_output) + + if self.use_reference_policy: + # compute reference log_prob + with Timer(name='ref', logger=None) as timer: + ref_log_prob = self.ref_policy_wg.compute_ref_log_prob(batch) + batch = batch.union(ref_log_prob) + metrics['timing/ref'] = timer.last + + # compute values + with Timer(name='values', logger=None) as timer: + values = self.critic_wg.compute_values(batch) + batch = batch.union(values) + metrics['timing/values'] = timer.last + + with Timer(name='adv', logger=None) as timer: + # compute scores. Support both model and function-based. + # We first compute the scores using reward model. Then, we call reward_fn to combine + # the results from reward model and rule-based results. + if self.use_rm: + # we first compute reward model score + reward_tensor = self.rm_wg.compute_rm_score(batch) + batch = batch.union(reward_tensor) + + # we combine with rule-based rm + reward_tensor = self.reward_fn(batch) + batch.batch['token_level_scores'] = reward_tensor + + # compute rewards. 
apply_kl_penalty if available + batch, kl_metrics = apply_kl_penalty(batch, + kl_ctrl=self.kl_ctrl_in_reward, + kl_penalty=self.config.algorithm.kl_penalty) + metrics.update(kl_metrics) + + # compute advantages, executed on the driver process + batch = compute_advantage(batch, + self.config.algorithm.gamma, + self.config.algorithm.lam, + adv_estimator=self.config.algorithm.adv_estimator) + metrics['timing/adv'] = timer.last + + # update critic + if self.use_critic: + with Timer(name='update_critic', logger=None) as timer: + critic_output = self.critic_wg.update_critic(batch) + metrics['timing/update_critic'] = timer.last + critic_output_metrics = reduce_metrics(critic_output.meta_info['metrics']) + metrics.update(critic_output_metrics) + + # implement critic warmup + if self.config.trainer.critic_warmup <= global_steps: + # update actor + with Timer(name='update_actor', logger=None) as timer: + actor_output = self.actor_rollout_wg.update_actor(batch) + metrics['timing/update_actor'] = timer.last + actor_output_metrics = reduce_metrics(actor_output.meta_info['metrics']) + metrics.update(actor_output_metrics) + + # validate + if self.val_reward_fn is not None and (global_steps + 1) % self.config.trainer.test_freq == 0: + with Timer(name='testing', logger=None) as timer: + val_metrics: dict = self._validate() + val_metrics = {f'val/{key}': val for key, val in val_metrics.items()} + metrics['timing/testing'] = timer.last + metrics.update(val_metrics) + + # collect metrics + data_metrics = compute_data_metrics(batch=batch) + metrics.update(data_metrics) + + # TODO: make a canonical logger that supports various backend + logger.log(data=metrics, step=global_steps) + + if self.config.trainer.save_freq > 0 and (global_steps + 1) % self.config.trainer.save_freq == 0: + actor_local_path = os.path.join(self.config.trainer.default_local_dir, 'actor', + f'global_step_{global_steps}') + actor_remote_path = os.path.join(self.config.trainer.default_hdfs_dir, 'actor') + 
self.actor_rollout_wg.save_checkpoint(actor_local_path, actor_remote_path) + + if self.use_critic: + critic_local_path = os.path.join(self.config.trainer.default_local_dir, 'critic', + f'global_step_{global_steps}') + critic_remote_path = os.path.join(self.config.trainer.default_hdfs_dir, 'critic') + self.critic_wg.save_checkpoint(critic_local_path, critic_remote_path) + + global_steps += 1 + + # perform validation after training + if self.val_reward_fn is not None: + val_metrics = self._validate() + pprint(f'Final validation metrics: {val_metrics}') diff --git a/code/RL_model/verl/verl_train/docs/workers/sglang_worker.rst b/code/RL_model/verl/verl_train/docs/workers/sglang_worker.rst new file mode 100644 index 0000000000000000000000000000000000000000..08cc48a075d3f3a2abc131e881f186c0f0df8fed --- /dev/null +++ b/code/RL_model/verl/verl_train/docs/workers/sglang_worker.rst @@ -0,0 +1,237 @@ +SGLang Backend +============== + +Last updated: 05/31/2025. + +**Authored By SGLang RL Team and listed alphabetically by last name** + +`Jingyi Chen `_, `Yitong Guan `_, `Zhuobin Huang `_, `Jiajun Li `_, `Ji Li `_, `Shenggui Li `_, `Junrong Lin `_, `Xiang Long `_, `Rui Lu `_, `Jin Pan `_, `Shuai Shi `_, `Yushen Su `_, `Xinyuan Tong `_, `Chendong Wang `_, `Hanchen Zhang `_, `Haoran Wang `_, `Yongan Xiang `_, `Chengxing Xie `_, `Yuhao Yang `_, `Jinwei Yao `_, `Qiaolin Yu `_, `Yuzhen Zhou `_, `Chenyang Zhao `_ + + + +Introduction +------------ +`SGLang `_ is an open-source state-of-the-art inference service engine, fully adopted by xAI to support all inference needs of Grok during research and serving processes. + +Currently, verl fully supports using SGLang as the inference engine during the rollout phase. As a rollout engine, SGLang provides the same feature coverage as vLLM., including memory saving and multi-node rollout features. 
After installing verl and SGLang, simply add ``actor_rollout_ref.rollout.name=sglang`` at startup script to seamlessly switch between the two inference frameworks. + +In addition, the SGLang team is actively working on supporting features such as Multi-Turn Agentic RL, VLM RLHF, Server-Based RLHF, and Partial Rollout. You can track the related development progress in the `Tracking Roadmap `_. + +Installation +------------ +Please always follow the following command to install SGLang with verl. + +.. code-block:: bash + + pip install --upgrade pip + # Currently 0.4.8, subject to updates at any time, please refer to the latest version specified in `setup.py` + pip install -e ".[sglang]" + +You can check the following dependencies are in your environment: + +.. note:: + + - **PyTorch**: 2.6.0+cu124 + - **CUDA**: 12.4 + - **flashinfer-python**: 0.2.5+cu124torch2.6 + - **SGLang**: 0.4.6.post5 + - **sgl-kernel**: 0.1.4 + +Using SGLang as the Inference Backend for PPO Training on a Single Machine +------------------------------------------------------------------------- +We use Qwen/Qwen2-7B-Instruct on the gsm8k dataset for a simple test. + +1. Run the following command to prepare the gsm8k dataset: + +.. code-block:: bash + + python3 examples/data_preprocess/gsm8k.py + +2. Run the following script to conduct a PPO experiment on a single machine with 4 GPUs: + +.. 
code-block:: bash + + export SGL_DISABLE_TP_MEMORY_INBALANCE_CHECK=True + PYTHONUNBUFFERED=1 python3 -m verl.trainer.main_ppo \ + data.train_files=$HOME/data/gsm8k/train.parquet \ + data.val_files=$HOME/data/gsm8k/test.parquet \ + data.train_batch_size=4096 \ + data.max_prompt_length=4096 \ + data.max_response_length=4096 \ + actor_rollout_ref.rollout.name=sglang \ + actor_rollout_ref.model.path=Qwen/Qwen2-7B-Instruct \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.actor.ppo_mini_batch_size=64 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8 \ + actor_rollout_ref.model.enable_gradient_checkpointing=True \ + actor_rollout_ref.actor.fsdp_config.param_offload=True \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \ + actor_rollout_ref.rollout.tensor_model_parallel_size=2 \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.8 \ + actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \ + critic.optim.lr=1e-5 \ + critic.model.path=Qwen/Qwen2-7B-Instruct \ + critic.ppo_micro_batch_size_per_gpu=4 \ + critic.model.fsdp_config.param_offload=True \ + critic.model.fsdp_config.optimizer_offload=True \ + algorithm.kl_ctrl.kl_coef=0.001 \ + trainer.logger=console \ + trainer.val_before_train=False \ + trainer.n_gpus_per_node=4 \ + trainer.nnodes=1 \ + trainer.save_freq=-1 \ + trainer.test_freq=10 \ + trainer.total_epochs=15 2>&1 | tee verl_demo.log + +Why export SGL_DISABLE_TP_MEMORY_INBALANCE_CHECK? +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +1. ``verl`` initializes a ``SGLangRollout`` module during rollout, which is used to evaluate/generate samples. + +2. ``SGLangRollout`` will initialize ``Engine``, and further initialize a ``torch.distributed.DeviceMesh``, used to support Tensor Parallel (TP). + +3. ``DeviceMesh.init()`` internally checks the free GPU memory of all participating devices. 
If the difference is too large (more than ~10%), it directly reports an error to avoid initialization failures or deadlocks. + +Why might there be inconsistent GPU memory? +""""""""""""""""""""""""""""""""""""""""""" + +**1. Ray Distributed Actor loads the model at different times** + +``verl`` uses Ray-based multi-process, multi-GPU concurrent training. Each ``WorkerDict`` may be called at different times: + +.. code-block:: python + + self.rollout = SGLangRollout(...) + +Different workers initialize the model at different times → different memory usage. + +**2. Delayed initialization causes memory bias** + +Some workers start model loading/inference (e.g., ``generate_sequences()``, ``compute_log_prob()``) earlier than others. +Early workers already use up GPU memory → late workers still have empty memory → memory difference appears. + +**3. SGLang's TP init uses "all-device broadcast", but there's no uniform release timing** + +Although ``SGLangRollout`` may only involve subset of GPUs, its ``Engine`` initialization calls ``torch.distributed.init_process_group()`` and broadcasts weights, so: + +- Non-rollout GPUs also join the communication. +- Later on, ``DeviceMesh`` init will fail due to "inconsistent memory". + +**4. Different FSDP/TP loading behaviors also lead to mismatch** + +If using: + +.. code-block:: bash + + actor.fsdp_config.param_offload=True + ref.fsdp_config.param_offload=True + +Then some workers keep params on CPU while others already sharded to GPU → leads to asymmetric memory layout. + +Using SGLang as the Inference Backend for PPO Training Across Multiple Machines +------------------------------------------------------------------------------ +SGLang also supports running verl's RAY-based cross-machine inference in IPv4 and IPv6 scenarios. In the script below, we use TP=16 for cross-machine inference. Suppose we have two interconnected machines: node0 with IP 10.94.16.4 and node1 with IP 10.94.16.5. + +1. Start Ray on node0: + +.. 
code-block:: bash + + ray start --head --dashboard-host=0.0.0.0 + +You will see the following prompt: + +.. code-block:: bash + + Usage stats collection is enabled. To disable this, add `--disable-usage-stats` to the command that starts the cluster, or run the following command: `ray disable-usage-stats` before starting the cluster. See https://docs.ray.io/en/master/cluster/usage-stats.html for more details. + + Local node IP: 10.94.16.4 + + -------------------- + Ray runtime started. + -------------------- + + Next steps + To add another node to this Ray cluster, run + ray start --address='10.94.16.4:6379' + +2. Have node1 join the Ray cluster: + +Run the following command on node1: + +.. code-block:: bash + + ray start --address='10.94.16.4:6379' + +Run the following command to confirm that the Ray cluster now has two nodes: + +.. code-block:: bash + + ray status + +You can see that the cluster has two nodes with 16 GPUs: + +.. code-block:: bash + + ======== Autoscaler status: 2025-04-09 09:25:37.694016 ======== + Node status + --------------------------------------------------------------- + Active: + 1 node_ef382ffd687d8f6b060c1b68e63ada7341b936fe5b1901dd04de1027 + 1 node_1eb4d7d07e793114c23a89d1a41f1f76acf6ef5b35af844a4ee8e4ba + Pending: + (no pending nodes) + Recent failures: + (no failures) + + Resources + --------------------------------------------------------------- + Usage: + 0.0/360.0 CPU + 0.0/16.0 GPU + 0B/3.39TiB memory + 0B/372.53GiB object_store_memory + +3. Run the following script to train meta-llama/Llama-3.1-8B-Instruct with TP=16 across 2 machines using 16 GPUs: + +.. 
code-block:: bash + + DATA_DIR=$HOME/data/gsm8k + + python3 -m verl.trainer.main_ppo \ + actor_rollout_ref.rollout.name=sglang \ + data.train_files=$DATA_DIR/train.parquet \ + data.val_files=$DATA_DIR/test.parquet \ + data.train_batch_size=4096 \ + data.max_prompt_length=4096 \ + data.max_response_length=4096 \ + actor_rollout_ref.model.path=meta-llama/Llama-3.1-8B-Instruct \ + actor_rollout_ref.actor.optim.lr=1e-6 \ + actor_rollout_ref.model.use_remove_padding=True \ + actor_rollout_ref.actor.ppo_mini_batch_size=64 \ + actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \ + actor_rollout_ref.model.enable_gradient_checkpointing=True \ + actor_rollout_ref.actor.fsdp_config.param_offload=True \ + actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \ + actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \ + actor_rollout_ref.rollout.tensor_model_parallel_size=16 \ + actor_rollout_ref.rollout.gpu_memory_utilization=0.8 \ + actor_rollout_ref.rollout.free_cache_engine=True \ + actor_rollout_ref.ref.log_prob_micro_batch_size=16 \ + actor_rollout_ref.ref.fsdp_config.param_offload=True \ + critic.optim.lr=1e-5 \ + critic.model.use_remove_padding=True \ + critic.model.path=meta-llama/Llama-3.1-8B-Instruct \ + critic.model.enable_gradient_checkpointing=True \ + critic.ppo_micro_batch_size=16 \ + critic.model.fsdp_config.param_offload=True \ + critic.model.fsdp_config.optimizer_offload=True \ + algorithm.kl_ctrl.kl_coef=0.001 \ + trainer.critic_warmup=0 \ + trainer.logger=console \ + trainer.val_before_train=True \ + trainer.n_gpus_per_node=8 \ + trainer.nnodes=2 \ + trainer.save_freq=-1 \ + trainer.test_freq=10 \ + trainer.total_epochs=15 2>&1 | tee verl_demo.log diff --git a/code/RL_model/verl/verl_train/docs/workers/trtllm_worker.rst b/code/RL_model/verl/verl_train/docs/workers/trtllm_worker.rst new file mode 100644 index 0000000000000000000000000000000000000000..ad6781f5e3bdd32f37b50a040eeb217291731715 --- /dev/null +++ 
b/code/RL_model/verl/verl_train/docs/workers/trtllm_worker.rst @@ -0,0 +1,62 @@ +TensorRT-LLM Backend +==================== + +Last updated: 12/31/2025. + +**Authored By TensorRT-LLM Team** + +Introduction +------------ +`TensorRT-LLM `_ is a high-performance LLM inference engine with state-of-the-art optimizations for NVIDIA GPUs. +The verl integration of TensorRT-LLM is based on TensorRT-LLM's `Ray orchestrator `_. This integration is in its early stage, with more features and optimizations to come. + +The TensorRT-LLM rollout engine primarily targets the colocated mode. Instead of relying purely on standard colocated mode, we adopted a mixed design combining aspects of the hybrid engine and colocated mode. + +Installation +------------ +We provide ``docker/Dockerfile.stable.trtllm`` for building a docker image with TensorRT-LLM pre-installed. The verl integration is supported from ``nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc6``, and you can choose other TensorRT-LLM versions via ``TRTLLM_BASE_IMAGE`` from the `NGC Catalog `_. + +Alternatively, refer to the `TensorRT-LLM installation guide `_ for compatible environments if you want to build your own. + +Install verl with TensorRT-LLM: + +.. code-block:: bash + + pip install --upgrade pip + pip install -e ".[trtllm]" + +.. note:: + + Using the TensorRT-LLM rollout requires setting the following environment variables before launching the Ray cluster. These have been included in all the example scripts: + + .. code-block:: bash + + # Clean all SLURM/MPI/PMIx env to avoid PMIx mismatch error. + for v in $(env | awk -F= '/^(PMI|PMIX|MPI|OMPI|SLURM)_/{print $1}'); do + unset "$v" + done + +Using TensorRT-LLM as the Rollout Engine for GRPO +------------------------------------------------- + +We provide the following GRPO recipe scripts for you to test the performance and accuracy curve of TensorRT-LLM as the rollout engine: + +.. 
code-block:: bash + + ## For FSDP training engine + bash examples/grpo_trainer/run_qwen2-7b_math_trtllm.sh + ## For Megatron-Core training engine + bash examples/grpo_trainer/run_qwen2-7b_math_megatron_trtllm.sh + +Using TensorRT-LLM as the Rollout Engine for DAPO +------------------------------------------------- + +We provide a DAPO recipe script ``recipe/dapo/test_dapo_7b_math_trtllm.sh``. + +.. code-block:: bash + + ## For FSDP training engine + bash recipe/dapo/test_dapo_7b_math_trtllm.sh + ## For Megatron-Core training engine + TRAIN_ENGINE=megatron bash recipe/dapo/test_dapo_7b_math_trtllm.sh + diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-10/00-17-52/.hydra/config.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-10/00-17-52/.hydra/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6a4d73e1cdafc11d4b34715fd5846cb4651b4b5b --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-10/00-17-52/.hydra/config.yaml @@ -0,0 +1,649 @@ +actor_rollout_ref: + actor: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-06 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + 
rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + ppo_mini_batch_size: 256 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: 32 + use_dynamic_bsz: false + ppo_max_token_len_per_gpu: 16384 + clip_ratio: 0.2 + clip_ratio_low: 0.2 + clip_ratio_high: 0.2 + tau_pos: 1.0 + tau_neg: 1.05 + freeze_vision_tower: false + policy_loss: + _target_: verl.workers.config.PolicyLossConfig + loss_mode: vanilla + clip_cov_ratio: 0.0002 + clip_cov_lb: 1.0 + clip_cov_ub: 5.0 + kl_cov_ratio: 0.0002 + ppo_kl_coef: 0.1 + clip_ratio_c: 3.0 + loss_agg_mode: token-mean + loss_scale_factor: null + entropy_coeff: 0 + calculate_entropy: false + use_kl_loss: true + use_prefix_grouper: false + use_torch_compile: true + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + ppo_epochs: 1 + shuffle: false + data_loader_seed: 42 + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: 
verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + grad_clip: 1.0 + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false} + calculate_sum_pi_squared: false + sum_pi_squared_checkpointing: false + ref: + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: ${actor_rollout_ref.actor.strategy} + use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true} + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + 
offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: true + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1} + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + rollout: + _target_: verl.workers.config.RolloutConfig + name: vllm + mode: async + temperature: 1.0 + top_k: -1 + top_p: 1 + prompt_length: ${oc.select:data.max_prompt_length,512} + response_length: ${oc.select:data.max_response_length,512} + dtype: bfloat16 + gpu_memory_utilization: 0.6 + ignore_eos: false + enforce_eager: false + cudagraph_capture_sizes: null + free_cache_engine: true + tensor_model_parallel_size: 1 + data_parallel_size: 1 + expert_parallel_size: 1 + pipeline_model_parallel_size: 1 + max_num_batched_tokens: 8192 + max_model_len: 8192 + max_num_seqs: 1024 + enable_chunked_prefill: true + enable_prefix_caching: true + logprobs_mode: processed_logprobs + scheduling_policy: fcfs + load_format: dummy + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + disable_log_stats: true + do_sample: true + 'n': 3 + over_sample_rate: 0 + multi_stage_wake_up: false + engine_kwargs: + vllm: {} + sglang: {} + trtllm: {} + val_kwargs: + _target_: verl.workers.config.SamplingConfig + top_k: -1 + top_p: 1.0 + temperature: 0 + 'n': 1 + do_sample: false + multi_turn: + _target_: verl.workers.config.MultiTurnConfig + enable: false + max_assistant_turns: null + tool_config_path: 
null + max_user_turns: null + max_parallel_calls: 1 + max_tool_response_length: 256 + tool_response_truncate_side: middle + interaction_config_path: null + use_inference_chat_template: false + tokenization_sanity_check_mode: strict + format: hermes + num_repeat_rollouts: null + calculate_log_probs: false + agent: + _target_: verl.workers.config.AgentLoopConfig + num_workers: 8 + default_agent_loop: single_turn_agent + agent_loop_config_path: null + custom_async_server: + _target_: verl.workers.config.CustomAsyncServerConfig + path: null + name: null + checkpoint_engine: + _target_: verl.workers.config.CheckpointEngineConfig + backend: naive + update_weights_bucket_megabytes: 2048 + engine_kwargs: {} + trace: + _target_: verl.workers.config.TraceConfig + backend: null + token2text: false + max_samples_per_step_per_worker: null + skip_rollout: false + skip_dump_dir: /tmp/rollout_dump + skip_tokenizer_init: true + enable_rollout_routing_replay: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false} + all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false} + ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]} + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + prometheus: + _target_: verl.workers.config.PrometheusConfig + enable: false + port: 9090 + file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml + served_model_name: ${oc.select:actor_rollout_ref.model.path,null} + quantization: null + quantization_config_file: null + mtp: ${oc.select:actor_rollout_ref.model.mtp, null} + layered_summon: false + model: + _target_: verl.workers.config.HFModelConfig + path: Qwen/Qwen3-4B-Instruct-2507 + hf_config_path: null + tokenizer_path: null + use_shm: false + trust_remote_code: false + custom_chat_template: null + external_lib: null 
+ override_config: {} + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: true + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + exclude_modules: null + lora_adapter_path: null + use_liger: false + use_fused_kernels: false + fused_kernel_options: + impl_backend: torch + tiled_mlp: + enabled: false + num_shards: 4 + mtp: + _target_: verl.workers.config.MtpConfig + enable: false + enable_train: false + enable_rollout: false + detach_encoder: false + mtp_loss_scaling_factor: 0.1 + speculative_algorithm: EAGLE + speculative_num_steps: 3 + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + method: mtp + num_speculative_tokens: 1 + hybrid_engine: true + nccl_timeout: 600 +data: + tokenizer: null + use_shm: false + train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + train_max_samples: -1 + val_max_samples: -1 + prompt_key: prompt + reward_fn_key: data_source + max_prompt_length: 1024 + max_response_length: 2048 + train_batch_size: 512 + val_batch_size: null + tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path, + null} + return_raw_input_ids: false + return_raw_chat: true + return_full_prompt: false + shuffle: true + seed: null + dataloader_num_workers: 8 + image_patch_size: 14 + validation_shuffle: false + filter_overlong_prompts: true + filter_overlong_prompts_workers: 1 + truncation: error + image_key: images + video_key: videos + trust_remote_code: false + custom_cls: + path: null + name: null + return_multi_modal_inputs: true + sampler: + class_path: null + class_name: null + datagen: + path: null + name: null + apply_chat_template_kwargs: {} +reward_manager: + _target_: verl.trainer.config.config.RewardManagerConfig + source: register + name: ${oc.select:reward_model.reward_manager,naive} + module: + _target_: 
verl.trainer.config.config.ModuleConfig + path: null + name: custom_reward_manager +critic: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-05 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + model: + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"} + override_config: {} + external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null} + trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false} + _target_: verl.workers.config.FSDPCriticModelCfg + use_shm: false + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: false + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + _target_: verl.workers.config.FSDPCriticConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + enable: null + ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256} + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null} + use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + 
ppo_max_token_len_per_gpu: 32768 + forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu} + ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1} + shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false} + data_loader_seed: 42 + cliprange_value: 0.5 + loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean} + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null} + forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null} + ulysses_sequence_parallel_size: 1 + grad_clip: 1.0 +reward_model: + enable: false + enable_resource_pool: false + n_gpus_per_node: 8 + nnodes: 0 + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + trust_remote_code: false + override_config: {} + use_shm: false + use_remove_padding: false + use_fused_kernels: 
${actor_rollout_ref.model.use_fused_kernels} + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + micro_batch_size: null + micro_batch_size_per_gpu: null + max_length: null + use_dynamic_bsz: ${critic.use_dynamic_bsz} + forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} + reward_manager: naive + reward_loop_source: register + reward_loop_module_path: null + reward_loop_class_name: null + launch_reward_fn_async: false + sandbox_fusion: + url: null + max_concurrent: 64 + memory_limit_mb: 1024 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + ulysses_sequence_parallel_size: 1 + use_reward_loop: true + num_workers: 1 + rollout: + _target_: verl.workers.config.RolloutConfig + name: ??? 
+ dtype: bfloat16 + gpu_memory_utilization: 0.5 + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + data_parallel_size: 1 + expert_parallel_size: 1 + tensor_model_parallel_size: 2 + max_num_batched_tokens: 8192 + max_model_len: null + max_num_seqs: 1024 + load_format: auto + engine_kwargs: {} + limit_images: null + enable_chunked_prefill: true + enable_prefix_caching: true + disable_log_stats: true + skip_tokenizer_init: false + prompt_length: 2048 + response_length: 2048 +algorithm: + rollout_correction: + rollout_is: null + rollout_is_threshold: 2.0 + rollout_rs: null + rollout_rs_threshold: null + bypass_mode: false + loss_type: ppo_clip + rollout_is_batch_normalize: false + _target_: verl.trainer.config.AlgoConfig + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + norm_adv_by_std_in_grpo: true + use_kl_in_reward: false + kl_penalty: kl + kl_ctrl: + _target_: verl.trainer.config.KLControlConfig + type: fixed + kl_coef: 0.001 + horizon: 10000 + target_kl: 0.1 + use_pf_ppo: false + pf_ppo: + reweight_method: pow + weight_pow: 2.0 +custom_reward_function: + path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + name: compute_score +trainer: + balance_batch: true + total_epochs: 15 + total_training_steps: null + project_name: readctrl-verl + experiment_name: qwen3-4b-instruct-en + logger: + - console + - wandb + log_val_generations: 0 + rollout_data_dir: null + validation_data_dir: null + nnodes: 1 + n_gpus_per_node: 2 + save_freq: 5 + esi_redundant_time: 0 + resume_mode: auto + resume_from_path: null + val_before_train: true + val_only: false + test_freq: 10 + critic_warmup: 0 + default_hdfs_dir: null + del_local_ckpt_after_load: false + default_local_dir: /home/mshahidul/readctrl/code/RL_model/train_v2 + max_actor_ckpt_to_keep: 1 + max_critic_ckpt_to_keep: 1 + ray_wait_register_center_timeout: 300 + device: cuda + use_legacy_worker_impl: auto + remove_previous_ckpt_in_save: true +global_profiler: + _target_: 
verl.utils.profiler.ProfilerConfig + tool: null + steps: null + profile_continuous_steps: false + save_path: outputs/profile + global_tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + controller_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + worker_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + capture-range: cudaProfilerApi + capture-range-end: null + kill: none + torch_memory: + trace_alloc_max_entries: 100000 + stack_depth: 32 + context: all + stacks: all + kw_args: {} +transfer_queue: + enable: false +ray_kwargs: + ray_init: + num_cpus: null + timeline_json_file: null diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-10/00-17-52/.hydra/hydra.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-10/00-17-52/.hydra/hydra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..cf50238cf026f93f3dc76cf752f16ba146aa0a0c --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-10/00-17-52/.hydra/hydra.yaml @@ -0,0 +1,211 @@ +hydra: + run: + dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: + app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. 
+ + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - algorithm.adv_estimator=grpo + - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + - 
data.train_batch_size=512 + - data.max_prompt_length=1024 + - data.max_response_length=2048 + - data.filter_overlong_prompts=True + - data.truncation=error + - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 + - actor_rollout_ref.actor.optim.lr=1e-6 + - actor_rollout_ref.model.use_remove_padding=True + - actor_rollout_ref.actor.ppo_mini_batch_size=256 + - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 + - actor_rollout_ref.actor.use_kl_loss=True + - actor_rollout_ref.actor.kl_loss_coef=0.001 + - actor_rollout_ref.actor.kl_loss_type=low_var_kl + - actor_rollout_ref.actor.entropy_coeff=0 + - actor_rollout_ref.model.enable_gradient_checkpointing=True + - actor_rollout_ref.actor.fsdp_config.param_offload=False + - actor_rollout_ref.actor.fsdp_config.optimizer_offload=False + - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.rollout.tensor_model_parallel_size=1 + - actor_rollout_ref.rollout.name=vllm + - actor_rollout_ref.rollout.gpu_memory_utilization=0.6 + - actor_rollout_ref.rollout.max_model_len=8192 + - actor_rollout_ref.rollout.n=3 + - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.ref.fsdp_config.param_offload=False + - algorithm.use_kl_in_reward=False + - trainer.critic_warmup=0 + - trainer.logger=["console","wandb"] + - trainer.project_name=readctrl-verl + - trainer.experiment_name=qwen3-4b-instruct-en + - trainer.n_gpus_per_node=2 + - trainer.nnodes=1 + - trainer.save_freq=5 + - trainer.test_freq=10 + - +trainer.remove_previous_ckpt_in_save=true + - trainer.max_actor_ckpt_to_keep=1 + - trainer.max_critic_ckpt_to_keep=1 + - trainer.resume_mode=auto + - trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2 + - trainer.total_epochs=15 + job: + name: main_ppo + chdir: null + override_dirname: 
+trainer.remove_previous_ckpt_in_save=true,actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=False,actor_rollout_ref.actor.fsdp_config.param_offload=False,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32,actor_rollout_ref.actor.ppo_mini_batch_size=256,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=False,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.gpu_memory_utilization=0.6,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=1024,data.max_response_length=2048,data.train_batch_size=512,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2,trainer.experiment_name=qwen3-4b-instruct-en,trainer.logger=["console","wandb"],trainer.max_actor_ckpt_to_keep=1,trainer.max_critic_ckpt_to_keep=1,trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.resume_mode=auto,trainer.save_freq=5,trainer.test_freq=10,trainer.total_epochs=15 + id: ??? + num: ??? 
+ config_name: ppo_trainer + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-10/00-17-52 + choices: + algorithm@algorithm.rollout_correction: rollout_correction + reward_model: dp_reward_loop + critic: dp_critic + critic/../engine@critic.model.fsdp_config: fsdp + critic/../optim@critic.optim: fsdp + model@actor_rollout_ref.model: hf_model + rollout@actor_rollout_ref.rollout: rollout + ref@actor_rollout_ref.ref: dp_ref + ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp + data: legacy_data + actor@actor_rollout_ref.actor: dp_actor + actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp + actor/../optim@actor_rollout_ref.actor.optim: fsdp + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-10/00-17-52/.hydra/overrides.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-10/00-17-52/.hydra/overrides.yaml new file mode 100644 index 0000000000000000000000000000000000000000..db8b3a32660528290cf5decb6a60a49f6f52151f --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-10/00-17-52/.hydra/overrides.yaml @@ -0,0 +1,44 @@ +- algorithm.adv_estimator=grpo +- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet +- 
data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet +- custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py +- data.train_batch_size=512 +- data.max_prompt_length=1024 +- data.max_response_length=2048 +- data.filter_overlong_prompts=True +- data.truncation=error +- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 +- actor_rollout_ref.actor.optim.lr=1e-6 +- actor_rollout_ref.model.use_remove_padding=True +- actor_rollout_ref.actor.ppo_mini_batch_size=256 +- actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 +- actor_rollout_ref.actor.use_kl_loss=True +- actor_rollout_ref.actor.kl_loss_coef=0.001 +- actor_rollout_ref.actor.kl_loss_type=low_var_kl +- actor_rollout_ref.actor.entropy_coeff=0 +- actor_rollout_ref.model.enable_gradient_checkpointing=True +- actor_rollout_ref.actor.fsdp_config.param_offload=False +- actor_rollout_ref.actor.fsdp_config.optimizer_offload=False +- actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.rollout.tensor_model_parallel_size=1 +- actor_rollout_ref.rollout.name=vllm +- actor_rollout_ref.rollout.gpu_memory_utilization=0.6 +- actor_rollout_ref.rollout.max_model_len=8192 +- actor_rollout_ref.rollout.n=3 +- actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.ref.fsdp_config.param_offload=False +- algorithm.use_kl_in_reward=False +- trainer.critic_warmup=0 +- trainer.logger=["console","wandb"] +- trainer.project_name=readctrl-verl +- trainer.experiment_name=qwen3-4b-instruct-en +- trainer.n_gpus_per_node=2 +- trainer.nnodes=1 +- trainer.save_freq=5 +- trainer.test_freq=10 +- +trainer.remove_previous_ckpt_in_save=true +- trainer.max_actor_ckpt_to_keep=1 +- trainer.max_critic_ckpt_to_keep=1 +- trainer.resume_mode=auto +- trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2 +- trainer.total_epochs=15 diff --git 
a/code/RL_model/verl/verl_train/outputs/2026-02-10/00-19-10/.hydra/config.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-10/00-19-10/.hydra/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..6a4d73e1cdafc11d4b34715fd5846cb4651b4b5b --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-10/00-19-10/.hydra/config.yaml @@ -0,0 +1,649 @@ +actor_rollout_ref: + actor: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-06 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + ppo_mini_batch_size: 256 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: 32 + use_dynamic_bsz: false + ppo_max_token_len_per_gpu: 16384 + clip_ratio: 0.2 + clip_ratio_low: 0.2 + clip_ratio_high: 0.2 + tau_pos: 1.0 + tau_neg: 1.05 + freeze_vision_tower: false + policy_loss: + _target_: verl.workers.config.PolicyLossConfig + loss_mode: vanilla + clip_cov_ratio: 0.0002 + clip_cov_lb: 1.0 + clip_cov_ub: 5.0 + kl_cov_ratio: 0.0002 + ppo_kl_coef: 0.1 + clip_ratio_c: 3.0 + loss_agg_mode: token-mean + loss_scale_factor: null + entropy_coeff: 0 
+ calculate_entropy: false + use_kl_loss: true + use_prefix_grouper: false + use_torch_compile: true + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + ppo_epochs: 1 + shuffle: false + data_loader_seed: 42 + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + grad_clip: 1.0 + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false} + calculate_sum_pi_squared: false + sum_pi_squared_checkpointing: false + ref: + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: ${actor_rollout_ref.actor.strategy} + use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true} + log_prob_micro_batch_size: null + 
log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: true + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1} + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + rollout: + 
_target_: verl.workers.config.RolloutConfig + name: vllm + mode: async + temperature: 1.0 + top_k: -1 + top_p: 1 + prompt_length: ${oc.select:data.max_prompt_length,512} + response_length: ${oc.select:data.max_response_length,512} + dtype: bfloat16 + gpu_memory_utilization: 0.6 + ignore_eos: false + enforce_eager: false + cudagraph_capture_sizes: null + free_cache_engine: true + tensor_model_parallel_size: 1 + data_parallel_size: 1 + expert_parallel_size: 1 + pipeline_model_parallel_size: 1 + max_num_batched_tokens: 8192 + max_model_len: 8192 + max_num_seqs: 1024 + enable_chunked_prefill: true + enable_prefix_caching: true + logprobs_mode: processed_logprobs + scheduling_policy: fcfs + load_format: dummy + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + disable_log_stats: true + do_sample: true + 'n': 3 + over_sample_rate: 0 + multi_stage_wake_up: false + engine_kwargs: + vllm: {} + sglang: {} + trtllm: {} + val_kwargs: + _target_: verl.workers.config.SamplingConfig + top_k: -1 + top_p: 1.0 + temperature: 0 + 'n': 1 + do_sample: false + multi_turn: + _target_: verl.workers.config.MultiTurnConfig + enable: false + max_assistant_turns: null + tool_config_path: null + max_user_turns: null + max_parallel_calls: 1 + max_tool_response_length: 256 + tool_response_truncate_side: middle + interaction_config_path: null + use_inference_chat_template: false + tokenization_sanity_check_mode: strict + format: hermes + num_repeat_rollouts: null + calculate_log_probs: false + agent: + _target_: verl.workers.config.AgentLoopConfig + num_workers: 8 + default_agent_loop: single_turn_agent + agent_loop_config_path: null + custom_async_server: + _target_: verl.workers.config.CustomAsyncServerConfig + path: null + name: null + checkpoint_engine: + _target_: 
verl.workers.config.CheckpointEngineConfig + backend: naive + update_weights_bucket_megabytes: 2048 + engine_kwargs: {} + trace: + _target_: verl.workers.config.TraceConfig + backend: null + token2text: false + max_samples_per_step_per_worker: null + skip_rollout: false + skip_dump_dir: /tmp/rollout_dump + skip_tokenizer_init: true + enable_rollout_routing_replay: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false} + all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false} + ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]} + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + prometheus: + _target_: verl.workers.config.PrometheusConfig + enable: false + port: 9090 + file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml + served_model_name: ${oc.select:actor_rollout_ref.model.path,null} + quantization: null + quantization_config_file: null + mtp: ${oc.select:actor_rollout_ref.model.mtp, null} + layered_summon: false + model: + _target_: verl.workers.config.HFModelConfig + path: Qwen/Qwen3-4B-Instruct-2507 + hf_config_path: null + tokenizer_path: null + use_shm: false + trust_remote_code: false + custom_chat_template: null + external_lib: null + override_config: {} + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: true + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + exclude_modules: null + lora_adapter_path: null + use_liger: false + use_fused_kernels: false + fused_kernel_options: + impl_backend: torch + tiled_mlp: + enabled: false + num_shards: 4 + mtp: + _target_: verl.workers.config.MtpConfig + enable: false + enable_train: false + enable_rollout: false + detach_encoder: false + mtp_loss_scaling_factor: 0.1 + speculative_algorithm: EAGLE + 
speculative_num_steps: 3 + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + method: mtp + num_speculative_tokens: 1 + hybrid_engine: true + nccl_timeout: 600 +data: + tokenizer: null + use_shm: false + train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + train_max_samples: -1 + val_max_samples: -1 + prompt_key: prompt + reward_fn_key: data_source + max_prompt_length: 1024 + max_response_length: 2048 + train_batch_size: 512 + val_batch_size: null + tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path, + null} + return_raw_input_ids: false + return_raw_chat: true + return_full_prompt: false + shuffle: true + seed: null + dataloader_num_workers: 8 + image_patch_size: 14 + validation_shuffle: false + filter_overlong_prompts: true + filter_overlong_prompts_workers: 1 + truncation: error + image_key: images + video_key: videos + trust_remote_code: false + custom_cls: + path: null + name: null + return_multi_modal_inputs: true + sampler: + class_path: null + class_name: null + datagen: + path: null + name: null + apply_chat_template_kwargs: {} +reward_manager: + _target_: verl.trainer.config.config.RewardManagerConfig + source: register + name: ${oc.select:reward_model.reward_manager,naive} + module: + _target_: verl.trainer.config.config.ModuleConfig + path: null + name: custom_reward_manager +critic: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-05 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + model: + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + 
param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"} + override_config: {} + external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null} + trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false} + _target_: verl.workers.config.FSDPCriticModelCfg + use_shm: false + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: false + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + _target_: verl.workers.config.FSDPCriticConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + enable: null + ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256} + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null} + use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + ppo_max_token_len_per_gpu: 32768 + forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu} + ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1} + shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false} + data_loader_seed: 42 + cliprange_value: 0.5 + loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean} + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: 
${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null} + forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null} + ulysses_sequence_parallel_size: 1 + grad_clip: 1.0 +reward_model: + enable: false + enable_resource_pool: false + n_gpus_per_node: 8 + nnodes: 0 + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + trust_remote_code: false + override_config: {} + use_shm: false + use_remove_padding: false + use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels} + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + micro_batch_size: null + micro_batch_size_per_gpu: null + max_length: null + use_dynamic_bsz: ${critic.use_dynamic_bsz} + forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} + reward_manager: naive + reward_loop_source: register + reward_loop_module_path: null + reward_loop_class_name: null + launch_reward_fn_async: false + 
sandbox_fusion: + url: null + max_concurrent: 64 + memory_limit_mb: 1024 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + ulysses_sequence_parallel_size: 1 + use_reward_loop: true + num_workers: 1 + rollout: + _target_: verl.workers.config.RolloutConfig + name: ??? + dtype: bfloat16 + gpu_memory_utilization: 0.5 + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + data_parallel_size: 1 + expert_parallel_size: 1 + tensor_model_parallel_size: 2 + max_num_batched_tokens: 8192 + max_model_len: null + max_num_seqs: 1024 + load_format: auto + engine_kwargs: {} + limit_images: null + enable_chunked_prefill: true + enable_prefix_caching: true + disable_log_stats: true + skip_tokenizer_init: false + prompt_length: 2048 + response_length: 2048 +algorithm: + rollout_correction: + rollout_is: null + rollout_is_threshold: 2.0 + rollout_rs: null + rollout_rs_threshold: null + bypass_mode: false + loss_type: ppo_clip + rollout_is_batch_normalize: false + _target_: verl.trainer.config.AlgoConfig + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + norm_adv_by_std_in_grpo: true + use_kl_in_reward: false + kl_penalty: kl + kl_ctrl: + _target_: verl.trainer.config.KLControlConfig + type: fixed + kl_coef: 0.001 + horizon: 10000 + target_kl: 0.1 + use_pf_ppo: false + pf_ppo: + reweight_method: pow + weight_pow: 2.0 +custom_reward_function: + path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + name: compute_score +trainer: + balance_batch: true + total_epochs: 15 + total_training_steps: null + project_name: readctrl-verl + experiment_name: qwen3-4b-instruct-en + logger: + - console + - wandb + log_val_generations: 0 + rollout_data_dir: null + validation_data_dir: null + nnodes: 1 + 
n_gpus_per_node: 2 + save_freq: 5 + esi_redundant_time: 0 + resume_mode: auto + resume_from_path: null + val_before_train: true + val_only: false + test_freq: 10 + critic_warmup: 0 + default_hdfs_dir: null + del_local_ckpt_after_load: false + default_local_dir: /home/mshahidul/readctrl/code/RL_model/train_v2 + max_actor_ckpt_to_keep: 1 + max_critic_ckpt_to_keep: 1 + ray_wait_register_center_timeout: 300 + device: cuda + use_legacy_worker_impl: auto + remove_previous_ckpt_in_save: true +global_profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: null + steps: null + profile_continuous_steps: false + save_path: outputs/profile + global_tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + controller_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + worker_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + capture-range: cudaProfilerApi + capture-range-end: null + kill: none + torch_memory: + trace_alloc_max_entries: 100000 + stack_depth: 32 + context: all + stacks: all + kw_args: {} +transfer_queue: + enable: false +ray_kwargs: + ray_init: + num_cpus: null + timeline_json_file: null diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-10/00-19-10/.hydra/hydra.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-10/00-19-10/.hydra/hydra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..47dd51c678f1f17a7fb1df5faa32cfd10d8b5b91 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-10/00-19-10/.hydra/hydra.yaml @@ -0,0 +1,211 @@ +hydra: + run: + dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: 
+ app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. + + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - algorithm.adv_estimator=grpo + - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + - 
custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + - data.train_batch_size=512 + - data.max_prompt_length=1024 + - data.max_response_length=2048 + - data.filter_overlong_prompts=True + - data.truncation=error + - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 + - actor_rollout_ref.actor.optim.lr=1e-6 + - actor_rollout_ref.model.use_remove_padding=True + - actor_rollout_ref.actor.ppo_mini_batch_size=256 + - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 + - actor_rollout_ref.actor.use_kl_loss=True + - actor_rollout_ref.actor.kl_loss_coef=0.001 + - actor_rollout_ref.actor.kl_loss_type=low_var_kl + - actor_rollout_ref.actor.entropy_coeff=0 + - actor_rollout_ref.model.enable_gradient_checkpointing=True + - actor_rollout_ref.actor.fsdp_config.param_offload=False + - actor_rollout_ref.actor.fsdp_config.optimizer_offload=False + - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.rollout.tensor_model_parallel_size=1 + - actor_rollout_ref.rollout.name=vllm + - actor_rollout_ref.rollout.gpu_memory_utilization=0.6 + - actor_rollout_ref.rollout.max_model_len=8192 + - actor_rollout_ref.rollout.n=3 + - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.ref.fsdp_config.param_offload=False + - algorithm.use_kl_in_reward=False + - trainer.critic_warmup=0 + - trainer.logger=["console","wandb"] + - trainer.project_name=readctrl-verl + - trainer.experiment_name=qwen3-4b-instruct-en + - trainer.n_gpus_per_node=2 + - trainer.nnodes=1 + - trainer.save_freq=5 + - trainer.test_freq=10 + - +trainer.remove_previous_ckpt_in_save=true + - trainer.max_actor_ckpt_to_keep=1 + - trainer.max_critic_ckpt_to_keep=1 + - trainer.resume_mode=auto + - trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2 + - trainer.total_epochs=15 + job: + name: main_ppo + chdir: null + override_dirname: 
+trainer.remove_previous_ckpt_in_save=true,actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=False,actor_rollout_ref.actor.fsdp_config.param_offload=False,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32,actor_rollout_ref.actor.ppo_mini_batch_size=256,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=False,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.gpu_memory_utilization=0.6,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=1024,data.max_response_length=2048,data.train_batch_size=512,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2,trainer.experiment_name=qwen3-4b-instruct-en,trainer.logger=["console","wandb"],trainer.max_actor_ckpt_to_keep=1,trainer.max_critic_ckpt_to_keep=1,trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.resume_mode=auto,trainer.save_freq=5,trainer.test_freq=10,trainer.total_epochs=15 + id: ??? + num: ??? 
+ config_name: ppo_trainer + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-10/00-19-10 + choices: + algorithm@algorithm.rollout_correction: rollout_correction + reward_model: dp_reward_loop + critic: dp_critic + critic/../engine@critic.model.fsdp_config: fsdp + critic/../optim@critic.optim: fsdp + model@actor_rollout_ref.model: hf_model + rollout@actor_rollout_ref.rollout: rollout + ref@actor_rollout_ref.ref: dp_ref + ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp + data: legacy_data + actor@actor_rollout_ref.actor: dp_actor + actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp + actor/../optim@actor_rollout_ref.actor.optim: fsdp + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-10/00-19-10/.hydra/overrides.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-10/00-19-10/.hydra/overrides.yaml new file mode 100644 index 0000000000000000000000000000000000000000..db8b3a32660528290cf5decb6a60a49f6f52151f --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-10/00-19-10/.hydra/overrides.yaml @@ -0,0 +1,44 @@ +- algorithm.adv_estimator=grpo +- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet +- 
data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet +- custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py +- data.train_batch_size=512 +- data.max_prompt_length=1024 +- data.max_response_length=2048 +- data.filter_overlong_prompts=True +- data.truncation=error +- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 +- actor_rollout_ref.actor.optim.lr=1e-6 +- actor_rollout_ref.model.use_remove_padding=True +- actor_rollout_ref.actor.ppo_mini_batch_size=256 +- actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 +- actor_rollout_ref.actor.use_kl_loss=True +- actor_rollout_ref.actor.kl_loss_coef=0.001 +- actor_rollout_ref.actor.kl_loss_type=low_var_kl +- actor_rollout_ref.actor.entropy_coeff=0 +- actor_rollout_ref.model.enable_gradient_checkpointing=True +- actor_rollout_ref.actor.fsdp_config.param_offload=False +- actor_rollout_ref.actor.fsdp_config.optimizer_offload=False +- actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.rollout.tensor_model_parallel_size=1 +- actor_rollout_ref.rollout.name=vllm +- actor_rollout_ref.rollout.gpu_memory_utilization=0.6 +- actor_rollout_ref.rollout.max_model_len=8192 +- actor_rollout_ref.rollout.n=3 +- actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.ref.fsdp_config.param_offload=False +- algorithm.use_kl_in_reward=False +- trainer.critic_warmup=0 +- trainer.logger=["console","wandb"] +- trainer.project_name=readctrl-verl +- trainer.experiment_name=qwen3-4b-instruct-en +- trainer.n_gpus_per_node=2 +- trainer.nnodes=1 +- trainer.save_freq=5 +- trainer.test_freq=10 +- +trainer.remove_previous_ckpt_in_save=true +- trainer.max_actor_ckpt_to_keep=1 +- trainer.max_critic_ckpt_to_keep=1 +- trainer.resume_mode=auto +- trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2 +- trainer.total_epochs=15 diff --git 
a/code/RL_model/verl/verl_train/outputs/2026-02-10/10-10-33/.hydra/config.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-10/10-10-33/.hydra/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e51541488045d836eeaaf3c50aa151b8d93b7e34 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-10/10-10-33/.hydra/config.yaml @@ -0,0 +1,649 @@ +actor_rollout_ref: + actor: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-06 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + ppo_mini_batch_size: 256 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: 8 + use_dynamic_bsz: false + ppo_max_token_len_per_gpu: 16384 + clip_ratio: 0.2 + clip_ratio_low: 0.2 + clip_ratio_high: 0.2 + tau_pos: 1.0 + tau_neg: 1.05 + freeze_vision_tower: false + policy_loss: + _target_: verl.workers.config.PolicyLossConfig + loss_mode: vanilla + clip_cov_ratio: 0.0002 + clip_cov_lb: 1.0 + clip_cov_ub: 5.0 + kl_cov_ratio: 0.0002 + ppo_kl_coef: 0.1 + clip_ratio_c: 3.0 + loss_agg_mode: token-mean + loss_scale_factor: null + entropy_coeff: 0 
+ calculate_entropy: false + use_kl_loss: true + use_prefix_grouper: false + use_torch_compile: true + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + ppo_epochs: 1 + shuffle: false + data_loader_seed: 42 + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + grad_clip: 1.0 + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false} + calculate_sum_pi_squared: false + sum_pi_squared_checkpointing: false + ref: + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: ${actor_rollout_ref.actor.strategy} + use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true} + log_prob_micro_batch_size: null + 
log_prob_micro_batch_size_per_gpu: 8 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: true + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1} + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + rollout: + 
_target_: verl.workers.config.RolloutConfig + name: vllm + mode: async + temperature: 1.0 + top_k: -1 + top_p: 1 + prompt_length: ${oc.select:data.max_prompt_length,512} + response_length: ${oc.select:data.max_response_length,512} + dtype: bfloat16 + gpu_memory_utilization: 0.4 + ignore_eos: false + enforce_eager: false + cudagraph_capture_sizes: null + free_cache_engine: true + tensor_model_parallel_size: 1 + data_parallel_size: 1 + expert_parallel_size: 1 + pipeline_model_parallel_size: 1 + max_num_batched_tokens: 8192 + max_model_len: 8192 + max_num_seqs: 1024 + enable_chunked_prefill: true + enable_prefix_caching: true + logprobs_mode: processed_logprobs + scheduling_policy: fcfs + load_format: dummy + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 8 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + disable_log_stats: true + do_sample: true + 'n': 3 + over_sample_rate: 0 + multi_stage_wake_up: false + engine_kwargs: + vllm: {} + sglang: {} + trtllm: {} + val_kwargs: + _target_: verl.workers.config.SamplingConfig + top_k: -1 + top_p: 1.0 + temperature: 0 + 'n': 1 + do_sample: false + multi_turn: + _target_: verl.workers.config.MultiTurnConfig + enable: false + max_assistant_turns: null + tool_config_path: null + max_user_turns: null + max_parallel_calls: 1 + max_tool_response_length: 256 + tool_response_truncate_side: middle + interaction_config_path: null + use_inference_chat_template: false + tokenization_sanity_check_mode: strict + format: hermes + num_repeat_rollouts: null + calculate_log_probs: false + agent: + _target_: verl.workers.config.AgentLoopConfig + num_workers: 8 + default_agent_loop: single_turn_agent + agent_loop_config_path: null + custom_async_server: + _target_: verl.workers.config.CustomAsyncServerConfig + path: null + name: null + checkpoint_engine: + _target_: 
verl.workers.config.CheckpointEngineConfig + backend: naive + update_weights_bucket_megabytes: 2048 + engine_kwargs: {} + trace: + _target_: verl.workers.config.TraceConfig + backend: null + token2text: false + max_samples_per_step_per_worker: null + skip_rollout: false + skip_dump_dir: /tmp/rollout_dump + skip_tokenizer_init: true + enable_rollout_routing_replay: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false} + all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false} + ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]} + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + prometheus: + _target_: verl.workers.config.PrometheusConfig + enable: false + port: 9090 + file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml + served_model_name: ${oc.select:actor_rollout_ref.model.path,null} + quantization: null + quantization_config_file: null + mtp: ${oc.select:actor_rollout_ref.model.mtp, null} + layered_summon: false + model: + _target_: verl.workers.config.HFModelConfig + path: Qwen/Qwen3-4B-Instruct-2507 + hf_config_path: null + tokenizer_path: null + use_shm: false + trust_remote_code: false + custom_chat_template: null + external_lib: null + override_config: {} + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: true + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + exclude_modules: null + lora_adapter_path: null + use_liger: false + use_fused_kernels: false + fused_kernel_options: + impl_backend: torch + tiled_mlp: + enabled: false + num_shards: 4 + mtp: + _target_: verl.workers.config.MtpConfig + enable: false + enable_train: false + enable_rollout: false + detach_encoder: false + mtp_loss_scaling_factor: 0.1 + speculative_algorithm: EAGLE + 
speculative_num_steps: 3 + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + method: mtp + num_speculative_tokens: 1 + hybrid_engine: true + nccl_timeout: 600 +data: + tokenizer: null + use_shm: false + train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + train_max_samples: -1 + val_max_samples: -1 + prompt_key: prompt + reward_fn_key: data_source + max_prompt_length: 1024 + max_response_length: 2048 + train_batch_size: 512 + val_batch_size: null + tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path, + null} + return_raw_input_ids: false + return_raw_chat: true + return_full_prompt: false + shuffle: true + seed: null + dataloader_num_workers: 8 + image_patch_size: 14 + validation_shuffle: false + filter_overlong_prompts: true + filter_overlong_prompts_workers: 1 + truncation: error + image_key: images + video_key: videos + trust_remote_code: false + custom_cls: + path: null + name: null + return_multi_modal_inputs: true + sampler: + class_path: null + class_name: null + datagen: + path: null + name: null + apply_chat_template_kwargs: {} +reward_manager: + _target_: verl.trainer.config.config.RewardManagerConfig + source: register + name: ${oc.select:reward_model.reward_manager,naive} + module: + _target_: verl.trainer.config.config.ModuleConfig + path: null + name: custom_reward_manager +critic: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-05 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + model: + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + 
param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"} + override_config: {} + external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null} + trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false} + _target_: verl.workers.config.FSDPCriticModelCfg + use_shm: false + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: false + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + _target_: verl.workers.config.FSDPCriticConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + enable: null + ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256} + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null} + use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + ppo_max_token_len_per_gpu: 32768 + forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu} + ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1} + shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false} + data_loader_seed: 42 + cliprange_value: 0.5 + loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean} + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: 
${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null} + forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null} + ulysses_sequence_parallel_size: 1 + grad_clip: 1.0 +reward_model: + enable: false + enable_resource_pool: false + n_gpus_per_node: 8 + nnodes: 0 + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + trust_remote_code: false + override_config: {} + use_shm: false + use_remove_padding: false + use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels} + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + micro_batch_size: null + micro_batch_size_per_gpu: null + max_length: null + use_dynamic_bsz: ${critic.use_dynamic_bsz} + forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} + reward_manager: naive + reward_loop_source: register + reward_loop_module_path: null + reward_loop_class_name: null + launch_reward_fn_async: false + 
sandbox_fusion: + url: null + max_concurrent: 64 + memory_limit_mb: 1024 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + ulysses_sequence_parallel_size: 1 + use_reward_loop: true + num_workers: 1 + rollout: + _target_: verl.workers.config.RolloutConfig + name: ??? + dtype: bfloat16 + gpu_memory_utilization: 0.5 + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + data_parallel_size: 1 + expert_parallel_size: 1 + tensor_model_parallel_size: 2 + max_num_batched_tokens: 8192 + max_model_len: null + max_num_seqs: 1024 + load_format: auto + engine_kwargs: {} + limit_images: null + enable_chunked_prefill: true + enable_prefix_caching: true + disable_log_stats: true + skip_tokenizer_init: false + prompt_length: 2048 + response_length: 2048 +algorithm: + rollout_correction: + rollout_is: null + rollout_is_threshold: 2.0 + rollout_rs: null + rollout_rs_threshold: null + bypass_mode: false + loss_type: ppo_clip + rollout_is_batch_normalize: false + _target_: verl.trainer.config.AlgoConfig + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + norm_adv_by_std_in_grpo: true + use_kl_in_reward: false + kl_penalty: kl + kl_ctrl: + _target_: verl.trainer.config.KLControlConfig + type: fixed + kl_coef: 0.001 + horizon: 10000 + target_kl: 0.1 + use_pf_ppo: false + pf_ppo: + reweight_method: pow + weight_pow: 2.0 +custom_reward_function: + path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + name: compute_score +trainer: + balance_batch: true + total_epochs: 15 + total_training_steps: null + project_name: readctrl-verl + experiment_name: qwen3-4b-instruct-en + logger: + - console + - wandb + log_val_generations: 0 + rollout_data_dir: null + validation_data_dir: null + nnodes: 1 + 
n_gpus_per_node: 2 + save_freq: 5 + esi_redundant_time: 0 + resume_mode: auto + resume_from_path: null + val_before_train: true + val_only: false + test_freq: 10 + critic_warmup: 0 + default_hdfs_dir: null + del_local_ckpt_after_load: false + default_local_dir: /home/mshahidul/readctrl/code/RL_model/train_v2 + max_actor_ckpt_to_keep: 1 + max_critic_ckpt_to_keep: 1 + ray_wait_register_center_timeout: 300 + device: cuda + use_legacy_worker_impl: auto + remove_previous_ckpt_in_save: true +global_profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: null + steps: null + profile_continuous_steps: false + save_path: outputs/profile + global_tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + controller_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + worker_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + capture-range: cudaProfilerApi + capture-range-end: null + kill: none + torch_memory: + trace_alloc_max_entries: 100000 + stack_depth: 32 + context: all + stacks: all + kw_args: {} +transfer_queue: + enable: false +ray_kwargs: + ray_init: + num_cpus: null + timeline_json_file: null diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-10/10-10-33/.hydra/hydra.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-10/10-10-33/.hydra/hydra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ddbc6430da5598bff1cf1fb9a362d594332115aa --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-10/10-10-33/.hydra/hydra.yaml @@ -0,0 +1,211 @@ +hydra: + run: + dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: 
+ app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. + + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - algorithm.adv_estimator=grpo + - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + - 
custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + - data.train_batch_size=512 + - data.max_prompt_length=1024 + - data.max_response_length=2048 + - data.filter_overlong_prompts=True + - data.truncation=error + - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 + - actor_rollout_ref.actor.optim.lr=1e-6 + - actor_rollout_ref.model.use_remove_padding=True + - actor_rollout_ref.actor.ppo_mini_batch_size=256 + - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=8 + - actor_rollout_ref.actor.use_kl_loss=True + - actor_rollout_ref.actor.kl_loss_coef=0.001 + - actor_rollout_ref.actor.kl_loss_type=low_var_kl + - actor_rollout_ref.actor.entropy_coeff=0 + - actor_rollout_ref.model.enable_gradient_checkpointing=True + - actor_rollout_ref.actor.fsdp_config.param_offload=False + - actor_rollout_ref.actor.fsdp_config.optimizer_offload=False + - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8 + - actor_rollout_ref.rollout.tensor_model_parallel_size=1 + - actor_rollout_ref.rollout.name=vllm + - actor_rollout_ref.rollout.gpu_memory_utilization=0.4 + - actor_rollout_ref.rollout.max_model_len=8192 + - actor_rollout_ref.rollout.n=3 + - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=8 + - actor_rollout_ref.ref.fsdp_config.param_offload=False + - algorithm.use_kl_in_reward=False + - trainer.critic_warmup=0 + - trainer.logger=["console","wandb"] + - trainer.project_name=readctrl-verl + - trainer.experiment_name=qwen3-4b-instruct-en + - trainer.n_gpus_per_node=2 + - trainer.nnodes=1 + - trainer.save_freq=5 + - trainer.test_freq=10 + - +trainer.remove_previous_ckpt_in_save=true + - trainer.max_actor_ckpt_to_keep=1 + - trainer.max_critic_ckpt_to_keep=1 + - trainer.resume_mode=auto + - trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2 + - trainer.total_epochs=15 + job: + name: main_ppo + chdir: null + override_dirname: 
+trainer.remove_previous_ckpt_in_save=true,actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=False,actor_rollout_ref.actor.fsdp_config.param_offload=False,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=8,actor_rollout_ref.actor.ppo_mini_batch_size=256,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=False,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=8,actor_rollout_ref.rollout.gpu_memory_utilization=0.4,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=1024,data.max_response_length=2048,data.train_batch_size=512,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2,trainer.experiment_name=qwen3-4b-instruct-en,trainer.logger=["console","wandb"],trainer.max_actor_ckpt_to_keep=1,trainer.max_critic_ckpt_to_keep=1,trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.resume_mode=auto,trainer.save_freq=5,trainer.test_freq=10,trainer.total_epochs=15 + id: ??? + num: ??? 
+ config_name: ppo_trainer + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-10/10-10-33 + choices: + algorithm@algorithm.rollout_correction: rollout_correction + reward_model: dp_reward_loop + critic: dp_critic + critic/../engine@critic.model.fsdp_config: fsdp + critic/../optim@critic.optim: fsdp + model@actor_rollout_ref.model: hf_model + rollout@actor_rollout_ref.rollout: rollout + ref@actor_rollout_ref.ref: dp_ref + ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp + data: legacy_data + actor@actor_rollout_ref.actor: dp_actor + actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp + actor/../optim@actor_rollout_ref.actor.optim: fsdp + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-10/10-10-33/.hydra/overrides.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-10/10-10-33/.hydra/overrides.yaml new file mode 100644 index 0000000000000000000000000000000000000000..81da8be7bc51fa6795af461878b701a9a4d877ea --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-10/10-10-33/.hydra/overrides.yaml @@ -0,0 +1,44 @@ +- algorithm.adv_estimator=grpo +- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet +- 
data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet +- custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py +- data.train_batch_size=512 +- data.max_prompt_length=1024 +- data.max_response_length=2048 +- data.filter_overlong_prompts=True +- data.truncation=error +- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 +- actor_rollout_ref.actor.optim.lr=1e-6 +- actor_rollout_ref.model.use_remove_padding=True +- actor_rollout_ref.actor.ppo_mini_batch_size=256 +- actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=8 +- actor_rollout_ref.actor.use_kl_loss=True +- actor_rollout_ref.actor.kl_loss_coef=0.001 +- actor_rollout_ref.actor.kl_loss_type=low_var_kl +- actor_rollout_ref.actor.entropy_coeff=0 +- actor_rollout_ref.model.enable_gradient_checkpointing=True +- actor_rollout_ref.actor.fsdp_config.param_offload=False +- actor_rollout_ref.actor.fsdp_config.optimizer_offload=False +- actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8 +- actor_rollout_ref.rollout.tensor_model_parallel_size=1 +- actor_rollout_ref.rollout.name=vllm +- actor_rollout_ref.rollout.gpu_memory_utilization=0.4 +- actor_rollout_ref.rollout.max_model_len=8192 +- actor_rollout_ref.rollout.n=3 +- actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=8 +- actor_rollout_ref.ref.fsdp_config.param_offload=False +- algorithm.use_kl_in_reward=False +- trainer.critic_warmup=0 +- trainer.logger=["console","wandb"] +- trainer.project_name=readctrl-verl +- trainer.experiment_name=qwen3-4b-instruct-en +- trainer.n_gpus_per_node=2 +- trainer.nnodes=1 +- trainer.save_freq=5 +- trainer.test_freq=10 +- +trainer.remove_previous_ckpt_in_save=true +- trainer.max_actor_ckpt_to_keep=1 +- trainer.max_critic_ckpt_to_keep=1 +- trainer.resume_mode=auto +- trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2 +- trainer.total_epochs=15 diff --git 
a/code/RL_model/verl/verl_train/outputs/2026-02-10/10-43-36/.hydra/config.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-10/10-43-36/.hydra/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d97f64afc220df0d4f4397e5dc39a39fa52103cd --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-10/10-43-36/.hydra/config.yaml @@ -0,0 +1,649 @@ +actor_rollout_ref: + actor: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-06 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + ppo_mini_batch_size: 256 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: 32 + use_dynamic_bsz: false + ppo_max_token_len_per_gpu: 16384 + clip_ratio: 0.2 + clip_ratio_low: 0.2 + clip_ratio_high: 0.2 + tau_pos: 1.0 + tau_neg: 1.05 + freeze_vision_tower: false + policy_loss: + _target_: verl.workers.config.PolicyLossConfig + loss_mode: vanilla + clip_cov_ratio: 0.0002 + clip_cov_lb: 1.0 + clip_cov_ub: 5.0 + kl_cov_ratio: 0.0002 + ppo_kl_coef: 0.1 + clip_ratio_c: 3.0 + loss_agg_mode: token-mean + loss_scale_factor: null + entropy_coeff: 0 
+ calculate_entropy: false + use_kl_loss: true + use_prefix_grouper: false + use_torch_compile: true + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + ppo_epochs: 1 + shuffle: false + data_loader_seed: 42 + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + grad_clip: 1.0 + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false} + calculate_sum_pi_squared: false + sum_pi_squared_checkpointing: false + ref: + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: ${actor_rollout_ref.actor.strategy} + use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true} + log_prob_micro_batch_size: null + 
log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: true + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1} + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + rollout: + 
_target_: verl.workers.config.RolloutConfig + name: vllm + mode: async + temperature: 1.0 + top_k: -1 + top_p: 1 + prompt_length: ${oc.select:data.max_prompt_length,512} + response_length: ${oc.select:data.max_response_length,512} + dtype: bfloat16 + gpu_memory_utilization: 0.6 + ignore_eos: false + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + tensor_model_parallel_size: 1 + data_parallel_size: 1 + expert_parallel_size: 1 + pipeline_model_parallel_size: 1 + max_num_batched_tokens: 8192 + max_model_len: 8192 + max_num_seqs: 1024 + enable_chunked_prefill: true + enable_prefix_caching: true + logprobs_mode: processed_logprobs + scheduling_policy: fcfs + load_format: dummy + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + disable_log_stats: true + do_sample: true + 'n': 3 + over_sample_rate: 0 + multi_stage_wake_up: false + engine_kwargs: + vllm: {} + sglang: {} + trtllm: {} + val_kwargs: + _target_: verl.workers.config.SamplingConfig + top_k: -1 + top_p: 1.0 + temperature: 0 + 'n': 1 + do_sample: false + multi_turn: + _target_: verl.workers.config.MultiTurnConfig + enable: false + max_assistant_turns: null + tool_config_path: null + max_user_turns: null + max_parallel_calls: 1 + max_tool_response_length: 256 + tool_response_truncate_side: middle + interaction_config_path: null + use_inference_chat_template: false + tokenization_sanity_check_mode: strict + format: hermes + num_repeat_rollouts: null + calculate_log_probs: false + agent: + _target_: verl.workers.config.AgentLoopConfig + num_workers: 8 + default_agent_loop: single_turn_agent + agent_loop_config_path: null + custom_async_server: + _target_: verl.workers.config.CustomAsyncServerConfig + path: null + name: null + checkpoint_engine: + _target_: 
verl.workers.config.CheckpointEngineConfig + backend: naive + update_weights_bucket_megabytes: 2048 + engine_kwargs: {} + trace: + _target_: verl.workers.config.TraceConfig + backend: null + token2text: false + max_samples_per_step_per_worker: null + skip_rollout: false + skip_dump_dir: /tmp/rollout_dump + skip_tokenizer_init: true + enable_rollout_routing_replay: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false} + all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false} + ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]} + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + prometheus: + _target_: verl.workers.config.PrometheusConfig + enable: false + port: 9090 + file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml + served_model_name: ${oc.select:actor_rollout_ref.model.path,null} + quantization: null + quantization_config_file: null + mtp: ${oc.select:actor_rollout_ref.model.mtp, null} + layered_summon: false + model: + _target_: verl.workers.config.HFModelConfig + path: Qwen/Qwen3-4B-Instruct-2507 + hf_config_path: null + tokenizer_path: null + use_shm: false + trust_remote_code: false + custom_chat_template: null + external_lib: null + override_config: {} + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: true + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + exclude_modules: null + lora_adapter_path: null + use_liger: false + use_fused_kernels: false + fused_kernel_options: + impl_backend: torch + tiled_mlp: + enabled: false + num_shards: 4 + mtp: + _target_: verl.workers.config.MtpConfig + enable: false + enable_train: false + enable_rollout: false + detach_encoder: false + mtp_loss_scaling_factor: 0.1 + speculative_algorithm: EAGLE + 
speculative_num_steps: 3 + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + method: mtp + num_speculative_tokens: 1 + hybrid_engine: true + nccl_timeout: 600 +data: + tokenizer: null + use_shm: false + train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + train_max_samples: -1 + val_max_samples: -1 + prompt_key: prompt + reward_fn_key: data_source + max_prompt_length: 1024 + max_response_length: 2048 + train_batch_size: 512 + val_batch_size: null + tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path, + null} + return_raw_input_ids: false + return_raw_chat: true + return_full_prompt: false + shuffle: true + seed: null + dataloader_num_workers: 8 + image_patch_size: 14 + validation_shuffle: false + filter_overlong_prompts: true + filter_overlong_prompts_workers: 1 + truncation: error + image_key: images + video_key: videos + trust_remote_code: false + custom_cls: + path: null + name: null + return_multi_modal_inputs: true + sampler: + class_path: null + class_name: null + datagen: + path: null + name: null + apply_chat_template_kwargs: {} +reward_manager: + _target_: verl.trainer.config.config.RewardManagerConfig + source: register + name: ${oc.select:reward_model.reward_manager,naive} + module: + _target_: verl.trainer.config.config.ModuleConfig + path: null + name: custom_reward_manager +critic: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-05 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + model: + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + 
param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"} + override_config: {} + external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null} + trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false} + _target_: verl.workers.config.FSDPCriticModelCfg + use_shm: false + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: false + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + _target_: verl.workers.config.FSDPCriticConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + enable: null + ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256} + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null} + use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + ppo_max_token_len_per_gpu: 32768 + forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu} + ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1} + shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false} + data_loader_seed: 42 + cliprange_value: 0.5 + loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean} + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: 
${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null} + forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null} + ulysses_sequence_parallel_size: 1 + grad_clip: 1.0 +reward_model: + enable: false + enable_resource_pool: false + n_gpus_per_node: 8 + nnodes: 0 + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + trust_remote_code: false + override_config: {} + use_shm: false + use_remove_padding: false + use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels} + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + micro_batch_size: null + micro_batch_size_per_gpu: null + max_length: null + use_dynamic_bsz: ${critic.use_dynamic_bsz} + forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} + reward_manager: naive + reward_loop_source: register + reward_loop_module_path: null + reward_loop_class_name: null + launch_reward_fn_async: false + 
sandbox_fusion: + url: null + max_concurrent: 64 + memory_limit_mb: 1024 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + ulysses_sequence_parallel_size: 1 + use_reward_loop: true + num_workers: 1 + rollout: + _target_: verl.workers.config.RolloutConfig + name: ??? + dtype: bfloat16 + gpu_memory_utilization: 0.5 + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + data_parallel_size: 1 + expert_parallel_size: 1 + tensor_model_parallel_size: 2 + max_num_batched_tokens: 8192 + max_model_len: null + max_num_seqs: 1024 + load_format: auto + engine_kwargs: {} + limit_images: null + enable_chunked_prefill: true + enable_prefix_caching: true + disable_log_stats: true + skip_tokenizer_init: false + prompt_length: 2048 + response_length: 2048 +algorithm: + rollout_correction: + rollout_is: null + rollout_is_threshold: 2.0 + rollout_rs: null + rollout_rs_threshold: null + bypass_mode: false + loss_type: ppo_clip + rollout_is_batch_normalize: false + _target_: verl.trainer.config.AlgoConfig + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + norm_adv_by_std_in_grpo: true + use_kl_in_reward: false + kl_penalty: kl + kl_ctrl: + _target_: verl.trainer.config.KLControlConfig + type: fixed + kl_coef: 0.001 + horizon: 10000 + target_kl: 0.1 + use_pf_ppo: false + pf_ppo: + reweight_method: pow + weight_pow: 2.0 +custom_reward_function: + path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + name: compute_score +trainer: + balance_batch: true + total_epochs: 15 + total_training_steps: null + project_name: readctrl-verl + experiment_name: qwen3-4b-instruct-en + logger: + - console + - wandb + log_val_generations: 0 + rollout_data_dir: null + validation_data_dir: null + nnodes: 1 + 
n_gpus_per_node: 2 + save_freq: 5 + esi_redundant_time: 0 + resume_mode: auto + resume_from_path: null + val_before_train: true + val_only: false + test_freq: 10 + critic_warmup: 0 + default_hdfs_dir: null + del_local_ckpt_after_load: false + default_local_dir: /home/mshahidul/readctrl/code/RL_model/train_v2 + max_actor_ckpt_to_keep: 1 + max_critic_ckpt_to_keep: 1 + ray_wait_register_center_timeout: 300 + device: cuda + use_legacy_worker_impl: auto + remove_previous_ckpt_in_save: true +global_profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: null + steps: null + profile_continuous_steps: false + save_path: outputs/profile + global_tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + controller_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + worker_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + capture-range: cudaProfilerApi + capture-range-end: null + kill: none + torch_memory: + trace_alloc_max_entries: 100000 + stack_depth: 32 + context: all + stacks: all + kw_args: {} +transfer_queue: + enable: false +ray_kwargs: + ray_init: + num_cpus: null + timeline_json_file: null diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-10/10-43-36/.hydra/hydra.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-10/10-43-36/.hydra/hydra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..3303afb0b603dab31220073dfc9457778d0248bc --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-10/10-43-36/.hydra/hydra.yaml @@ -0,0 +1,212 @@ +hydra: + run: + dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: 
+ app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. + + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - algorithm.adv_estimator=grpo + - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + - 
custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + - data.train_batch_size=512 + - data.max_prompt_length=1024 + - data.max_response_length=2048 + - data.filter_overlong_prompts=True + - data.truncation=error + - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 + - actor_rollout_ref.actor.optim.lr=1e-6 + - actor_rollout_ref.model.use_remove_padding=True + - actor_rollout_ref.actor.ppo_mini_batch_size=256 + - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 + - actor_rollout_ref.actor.use_kl_loss=True + - actor_rollout_ref.actor.kl_loss_coef=0.001 + - actor_rollout_ref.actor.kl_loss_type=low_var_kl + - actor_rollout_ref.actor.entropy_coeff=0 + - actor_rollout_ref.model.enable_gradient_checkpointing=True + - actor_rollout_ref.actor.fsdp_config.param_offload=False + - actor_rollout_ref.actor.fsdp_config.optimizer_offload=False + - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.rollout.tensor_model_parallel_size=1 + - actor_rollout_ref.rollout.name=vllm + - actor_rollout_ref.rollout.gpu_memory_utilization=0.6 + - actor_rollout_ref.rollout.enforce_eager=True + - actor_rollout_ref.rollout.max_model_len=8192 + - actor_rollout_ref.rollout.n=3 + - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.ref.fsdp_config.param_offload=False + - algorithm.use_kl_in_reward=False + - trainer.critic_warmup=0 + - trainer.logger=["console","wandb"] + - trainer.project_name=readctrl-verl + - trainer.experiment_name=qwen3-4b-instruct-en + - trainer.n_gpus_per_node=2 + - trainer.nnodes=1 + - trainer.save_freq=5 + - trainer.test_freq=10 + - +trainer.remove_previous_ckpt_in_save=true + - trainer.max_actor_ckpt_to_keep=1 + - trainer.max_critic_ckpt_to_keep=1 + - trainer.resume_mode=auto + - trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2 + - trainer.total_epochs=15 + job: + name: main_ppo + chdir: null + override_dirname: 
+trainer.remove_previous_ckpt_in_save=true,actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=False,actor_rollout_ref.actor.fsdp_config.param_offload=False,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32,actor_rollout_ref.actor.ppo_mini_batch_size=256,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=False,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.enforce_eager=True,actor_rollout_ref.rollout.gpu_memory_utilization=0.6,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=1024,data.max_response_length=2048,data.train_batch_size=512,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2,trainer.experiment_name=qwen3-4b-instruct-en,trainer.logger=["console","wandb"],trainer.max_actor_ckpt_to_keep=1,trainer.max_critic_ckpt_to_keep=1,trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.resume_mode=auto,trainer.save_freq=5,trainer.test_freq=10,trainer.total_epochs=15 + id: ??? 
+ num: ??? + config_name: ppo_trainer + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-10/10-43-36 + choices: + algorithm@algorithm.rollout_correction: rollout_correction + reward_model: dp_reward_loop + critic: dp_critic + critic/../engine@critic.model.fsdp_config: fsdp + critic/../optim@critic.optim: fsdp + model@actor_rollout_ref.model: hf_model + rollout@actor_rollout_ref.rollout: rollout + ref@actor_rollout_ref.ref: dp_ref + ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp + data: legacy_data + actor@actor_rollout_ref.actor: dp_actor + actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp + actor/../optim@actor_rollout_ref.actor.optim: fsdp + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-10/10-43-36/.hydra/overrides.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-10/10-43-36/.hydra/overrides.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ea4b10cf1c3a94a724a9c102ee6594eb1cf94e70 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-10/10-43-36/.hydra/overrides.yaml @@ -0,0 +1,45 @@ +- algorithm.adv_estimator=grpo +- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet +- 
data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet +- custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py +- data.train_batch_size=512 +- data.max_prompt_length=1024 +- data.max_response_length=2048 +- data.filter_overlong_prompts=True +- data.truncation=error +- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 +- actor_rollout_ref.actor.optim.lr=1e-6 +- actor_rollout_ref.model.use_remove_padding=True +- actor_rollout_ref.actor.ppo_mini_batch_size=256 +- actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 +- actor_rollout_ref.actor.use_kl_loss=True +- actor_rollout_ref.actor.kl_loss_coef=0.001 +- actor_rollout_ref.actor.kl_loss_type=low_var_kl +- actor_rollout_ref.actor.entropy_coeff=0 +- actor_rollout_ref.model.enable_gradient_checkpointing=True +- actor_rollout_ref.actor.fsdp_config.param_offload=False +- actor_rollout_ref.actor.fsdp_config.optimizer_offload=False +- actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.rollout.tensor_model_parallel_size=1 +- actor_rollout_ref.rollout.name=vllm +- actor_rollout_ref.rollout.gpu_memory_utilization=0.6 +- actor_rollout_ref.rollout.enforce_eager=True +- actor_rollout_ref.rollout.max_model_len=8192 +- actor_rollout_ref.rollout.n=3 +- actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.ref.fsdp_config.param_offload=False +- algorithm.use_kl_in_reward=False +- trainer.critic_warmup=0 +- trainer.logger=["console","wandb"] +- trainer.project_name=readctrl-verl +- trainer.experiment_name=qwen3-4b-instruct-en +- trainer.n_gpus_per_node=2 +- trainer.nnodes=1 +- trainer.save_freq=5 +- trainer.test_freq=10 +- +trainer.remove_previous_ckpt_in_save=true +- trainer.max_actor_ckpt_to_keep=1 +- trainer.max_critic_ckpt_to_keep=1 +- trainer.resume_mode=auto +- trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2 +- trainer.total_epochs=15 diff --git 
a/code/RL_model/verl/verl_train/outputs/2026-02-10/13-06-12/.hydra/config.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-10/13-06-12/.hydra/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1cfb0f135024f1f50aaca1311ef1461802759f0b --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-10/13-06-12/.hydra/config.yaml @@ -0,0 +1,649 @@ +actor_rollout_ref: + actor: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-06 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: true + optimizer_offload: true + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + ppo_mini_batch_size: 256 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: 16 + use_dynamic_bsz: false + ppo_max_token_len_per_gpu: 16384 + clip_ratio: 0.2 + clip_ratio_low: 0.2 + clip_ratio_high: 0.2 + tau_pos: 1.0 + tau_neg: 1.05 + freeze_vision_tower: false + policy_loss: + _target_: verl.workers.config.PolicyLossConfig + loss_mode: vanilla + clip_cov_ratio: 0.0002 + clip_cov_lb: 1.0 + clip_cov_ub: 5.0 + kl_cov_ratio: 0.0002 + ppo_kl_coef: 0.1 + clip_ratio_c: 3.0 + loss_agg_mode: token-mean + loss_scale_factor: null + entropy_coeff: 0 + 
calculate_entropy: false + use_kl_loss: true + use_prefix_grouper: false + use_torch_compile: true + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + ppo_epochs: 1 + shuffle: false + data_loader_seed: 42 + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + grad_clip: 1.0 + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false} + calculate_sum_pi_squared: false + sum_pi_squared_checkpointing: false + ref: + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: ${actor_rollout_ref.actor.strategy} + use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true} + log_prob_micro_batch_size: null + 
log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: true + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: true + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1} + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + rollout: + 
_target_: verl.workers.config.RolloutConfig + name: vllm + mode: async + temperature: 1.0 + top_k: -1 + top_p: 1 + prompt_length: ${oc.select:data.max_prompt_length,512} + response_length: ${oc.select:data.max_response_length,512} + dtype: bfloat16 + gpu_memory_utilization: 0.4 + ignore_eos: false + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + tensor_model_parallel_size: 1 + data_parallel_size: 1 + expert_parallel_size: 1 + pipeline_model_parallel_size: 1 + max_num_batched_tokens: 8192 + max_model_len: 8192 + max_num_seqs: 1024 + enable_chunked_prefill: true + enable_prefix_caching: true + logprobs_mode: processed_logprobs + scheduling_policy: fcfs + load_format: dummy + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + disable_log_stats: true + do_sample: true + 'n': 3 + over_sample_rate: 0 + multi_stage_wake_up: false + engine_kwargs: + vllm: {} + sglang: {} + trtllm: {} + val_kwargs: + _target_: verl.workers.config.SamplingConfig + top_k: -1 + top_p: 1.0 + temperature: 0 + 'n': 1 + do_sample: false + multi_turn: + _target_: verl.workers.config.MultiTurnConfig + enable: false + max_assistant_turns: null + tool_config_path: null + max_user_turns: null + max_parallel_calls: 1 + max_tool_response_length: 256 + tool_response_truncate_side: middle + interaction_config_path: null + use_inference_chat_template: false + tokenization_sanity_check_mode: strict + format: hermes + num_repeat_rollouts: null + calculate_log_probs: false + agent: + _target_: verl.workers.config.AgentLoopConfig + num_workers: 8 + default_agent_loop: single_turn_agent + agent_loop_config_path: null + custom_async_server: + _target_: verl.workers.config.CustomAsyncServerConfig + path: null + name: null + checkpoint_engine: + _target_: 
verl.workers.config.CheckpointEngineConfig + backend: naive + update_weights_bucket_megabytes: 2048 + engine_kwargs: {} + trace: + _target_: verl.workers.config.TraceConfig + backend: null + token2text: false + max_samples_per_step_per_worker: null + skip_rollout: false + skip_dump_dir: /tmp/rollout_dump + skip_tokenizer_init: true + enable_rollout_routing_replay: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false} + all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false} + ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]} + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + prometheus: + _target_: verl.workers.config.PrometheusConfig + enable: false + port: 9090 + file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml + served_model_name: ${oc.select:actor_rollout_ref.model.path,null} + quantization: null + quantization_config_file: null + mtp: ${oc.select:actor_rollout_ref.model.mtp, null} + layered_summon: false + model: + _target_: verl.workers.config.HFModelConfig + path: Qwen/Qwen3-4B-Instruct-2507 + hf_config_path: null + tokenizer_path: null + use_shm: false + trust_remote_code: false + custom_chat_template: null + external_lib: null + override_config: {} + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: true + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + exclude_modules: null + lora_adapter_path: null + use_liger: false + use_fused_kernels: false + fused_kernel_options: + impl_backend: torch + tiled_mlp: + enabled: false + num_shards: 4 + mtp: + _target_: verl.workers.config.MtpConfig + enable: false + enable_train: false + enable_rollout: false + detach_encoder: false + mtp_loss_scaling_factor: 0.1 + speculative_algorithm: EAGLE + 
speculative_num_steps: 3 + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + method: mtp + num_speculative_tokens: 1 + hybrid_engine: true + nccl_timeout: 600 +data: + tokenizer: null + use_shm: false + train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + train_max_samples: -1 + val_max_samples: -1 + prompt_key: prompt + reward_fn_key: data_source + max_prompt_length: 1024 + max_response_length: 2048 + train_batch_size: 512 + val_batch_size: null + tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path, + null} + return_raw_input_ids: false + return_raw_chat: true + return_full_prompt: false + shuffle: true + seed: null + dataloader_num_workers: 8 + image_patch_size: 14 + validation_shuffle: false + filter_overlong_prompts: true + filter_overlong_prompts_workers: 1 + truncation: error + image_key: images + video_key: videos + trust_remote_code: false + custom_cls: + path: null + name: null + return_multi_modal_inputs: true + sampler: + class_path: null + class_name: null + datagen: + path: null + name: null + apply_chat_template_kwargs: {} +reward_manager: + _target_: verl.trainer.config.config.RewardManagerConfig + source: register + name: ${oc.select:reward_model.reward_manager,naive} + module: + _target_: verl.trainer.config.config.ModuleConfig + path: null + name: custom_reward_manager +critic: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-05 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + model: + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + 
param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"} + override_config: {} + external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null} + trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false} + _target_: verl.workers.config.FSDPCriticModelCfg + use_shm: false + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: false + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + _target_: verl.workers.config.FSDPCriticConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + enable: null + ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256} + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null} + use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + ppo_max_token_len_per_gpu: 32768 + forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu} + ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1} + shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false} + data_loader_seed: 42 + cliprange_value: 0.5 + loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean} + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: 
${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null} + forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null} + ulysses_sequence_parallel_size: 1 + grad_clip: 1.0 +reward_model: + enable: false + enable_resource_pool: false + n_gpus_per_node: 8 + nnodes: 0 + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + trust_remote_code: false + override_config: {} + use_shm: false + use_remove_padding: false + use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels} + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + micro_batch_size: null + micro_batch_size_per_gpu: null + max_length: null + use_dynamic_bsz: ${critic.use_dynamic_bsz} + forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} + reward_manager: naive + reward_loop_source: register + reward_loop_module_path: null + reward_loop_class_name: null + launch_reward_fn_async: false + 
sandbox_fusion: + url: null + max_concurrent: 64 + memory_limit_mb: 1024 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + ulysses_sequence_parallel_size: 1 + use_reward_loop: true + num_workers: 1 + rollout: + _target_: verl.workers.config.RolloutConfig + name: ??? + dtype: bfloat16 + gpu_memory_utilization: 0.5 + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + data_parallel_size: 1 + expert_parallel_size: 1 + tensor_model_parallel_size: 2 + max_num_batched_tokens: 8192 + max_model_len: null + max_num_seqs: 1024 + load_format: auto + engine_kwargs: {} + limit_images: null + enable_chunked_prefill: true + enable_prefix_caching: true + disable_log_stats: true + skip_tokenizer_init: false + prompt_length: 2048 + response_length: 2048 +algorithm: + rollout_correction: + rollout_is: null + rollout_is_threshold: 2.0 + rollout_rs: null + rollout_rs_threshold: null + bypass_mode: false + loss_type: ppo_clip + rollout_is_batch_normalize: false + _target_: verl.trainer.config.AlgoConfig + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + norm_adv_by_std_in_grpo: true + use_kl_in_reward: false + kl_penalty: kl + kl_ctrl: + _target_: verl.trainer.config.KLControlConfig + type: fixed + kl_coef: 0.001 + horizon: 10000 + target_kl: 0.1 + use_pf_ppo: false + pf_ppo: + reweight_method: pow + weight_pow: 2.0 +custom_reward_function: + path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + name: compute_score +trainer: + balance_batch: true + total_epochs: 15 + total_training_steps: null + project_name: readctrl-verl + experiment_name: qwen3-4b-instruct-en + logger: + - console + - wandb + log_val_generations: 0 + rollout_data_dir: null + validation_data_dir: null + nnodes: 1 + 
n_gpus_per_node: 2 + save_freq: 5 + esi_redundant_time: 0 + resume_mode: auto + resume_from_path: null + val_before_train: true + val_only: false + test_freq: 10 + critic_warmup: 0 + default_hdfs_dir: null + del_local_ckpt_after_load: false + default_local_dir: /home/mshahidul/readctrl/code/RL_model/train_v2 + max_actor_ckpt_to_keep: 1 + max_critic_ckpt_to_keep: 1 + ray_wait_register_center_timeout: 300 + device: cuda + use_legacy_worker_impl: auto + remove_previous_ckpt_in_save: true +global_profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: null + steps: null + profile_continuous_steps: false + save_path: outputs/profile + global_tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + controller_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + worker_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + capture-range: cudaProfilerApi + capture-range-end: null + kill: none + torch_memory: + trace_alloc_max_entries: 100000 + stack_depth: 32 + context: all + stacks: all + kw_args: {} +transfer_queue: + enable: false +ray_kwargs: + ray_init: + num_cpus: null + timeline_json_file: null diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-10/13-06-12/.hydra/hydra.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-10/13-06-12/.hydra/hydra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e24bbea90a391ba0e84dd241372d7705c5376acf --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-10/13-06-12/.hydra/hydra.yaml @@ -0,0 +1,212 @@ +hydra: + run: + dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: 
+ app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. + + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - algorithm.adv_estimator=grpo + - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + - 
custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + - data.train_batch_size=512 + - data.max_prompt_length=1024 + - data.max_response_length=2048 + - data.filter_overlong_prompts=True + - data.truncation=error + - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 + - actor_rollout_ref.actor.optim.lr=1e-6 + - actor_rollout_ref.model.use_remove_padding=True + - actor_rollout_ref.actor.ppo_mini_batch_size=256 + - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 + - actor_rollout_ref.actor.use_kl_loss=True + - actor_rollout_ref.actor.kl_loss_coef=0.001 + - actor_rollout_ref.actor.kl_loss_type=low_var_kl + - actor_rollout_ref.actor.entropy_coeff=0 + - actor_rollout_ref.model.enable_gradient_checkpointing=True + - actor_rollout_ref.actor.fsdp_config.param_offload=True + - actor_rollout_ref.actor.fsdp_config.optimizer_offload=True + - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.rollout.tensor_model_parallel_size=1 + - actor_rollout_ref.rollout.name=vllm + - actor_rollout_ref.rollout.gpu_memory_utilization=0.4 + - actor_rollout_ref.rollout.enforce_eager=True + - actor_rollout_ref.rollout.max_model_len=8192 + - actor_rollout_ref.rollout.n=3 + - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.ref.fsdp_config.param_offload=True + - algorithm.use_kl_in_reward=False + - trainer.critic_warmup=0 + - trainer.logger=["console","wandb"] + - trainer.project_name=readctrl-verl + - trainer.experiment_name=qwen3-4b-instruct-en + - trainer.n_gpus_per_node=2 + - trainer.nnodes=1 + - trainer.save_freq=5 + - trainer.test_freq=10 + - +trainer.remove_previous_ckpt_in_save=true + - trainer.max_actor_ckpt_to_keep=1 + - trainer.max_critic_ckpt_to_keep=1 + - trainer.resume_mode=auto + - trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2 + - trainer.total_epochs=15 + job: + name: main_ppo + chdir: null + override_dirname: 
+trainer.remove_previous_ckpt_in_save=true,actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=True,actor_rollout_ref.actor.fsdp_config.param_offload=True,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16,actor_rollout_ref.actor.ppo_mini_batch_size=256,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=True,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.enforce_eager=True,actor_rollout_ref.rollout.gpu_memory_utilization=0.4,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=1024,data.max_response_length=2048,data.train_batch_size=512,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2,trainer.experiment_name=qwen3-4b-instruct-en,trainer.logger=["console","wandb"],trainer.max_actor_ckpt_to_keep=1,trainer.max_critic_ckpt_to_keep=1,trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.resume_mode=auto,trainer.save_freq=5,trainer.test_freq=10,trainer.total_epochs=15 + id: ??? + num: ??? 
+ config_name: ppo_trainer + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-10/13-06-12 + choices: + algorithm@algorithm.rollout_correction: rollout_correction + reward_model: dp_reward_loop + critic: dp_critic + critic/../engine@critic.model.fsdp_config: fsdp + critic/../optim@critic.optim: fsdp + model@actor_rollout_ref.model: hf_model + rollout@actor_rollout_ref.rollout: rollout + ref@actor_rollout_ref.ref: dp_ref + ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp + data: legacy_data + actor@actor_rollout_ref.actor: dp_actor + actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp + actor/../optim@actor_rollout_ref.actor.optim: fsdp + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-10/13-06-12/.hydra/overrides.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-10/13-06-12/.hydra/overrides.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4e460bb89d291402a8b1e4e2ed0fe7b22d0d0620 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-10/13-06-12/.hydra/overrides.yaml @@ -0,0 +1,45 @@ +- algorithm.adv_estimator=grpo +- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet +- 
data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet +- custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py +- data.train_batch_size=512 +- data.max_prompt_length=1024 +- data.max_response_length=2048 +- data.filter_overlong_prompts=True +- data.truncation=error +- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 +- actor_rollout_ref.actor.optim.lr=1e-6 +- actor_rollout_ref.model.use_remove_padding=True +- actor_rollout_ref.actor.ppo_mini_batch_size=256 +- actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 +- actor_rollout_ref.actor.use_kl_loss=True +- actor_rollout_ref.actor.kl_loss_coef=0.001 +- actor_rollout_ref.actor.kl_loss_type=low_var_kl +- actor_rollout_ref.actor.entropy_coeff=0 +- actor_rollout_ref.model.enable_gradient_checkpointing=True +- actor_rollout_ref.actor.fsdp_config.param_offload=True +- actor_rollout_ref.actor.fsdp_config.optimizer_offload=True +- actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.rollout.tensor_model_parallel_size=1 +- actor_rollout_ref.rollout.name=vllm +- actor_rollout_ref.rollout.gpu_memory_utilization=0.4 +- actor_rollout_ref.rollout.enforce_eager=True +- actor_rollout_ref.rollout.max_model_len=8192 +- actor_rollout_ref.rollout.n=3 +- actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.ref.fsdp_config.param_offload=True +- algorithm.use_kl_in_reward=False +- trainer.critic_warmup=0 +- trainer.logger=["console","wandb"] +- trainer.project_name=readctrl-verl +- trainer.experiment_name=qwen3-4b-instruct-en +- trainer.n_gpus_per_node=2 +- trainer.nnodes=1 +- trainer.save_freq=5 +- trainer.test_freq=10 +- +trainer.remove_previous_ckpt_in_save=true +- trainer.max_actor_ckpt_to_keep=1 +- trainer.max_critic_ckpt_to_keep=1 +- trainer.resume_mode=auto +- trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2 +- trainer.total_epochs=15 diff --git 
a/code/RL_model/verl/verl_train/outputs/2026-02-10/13-12-42/.hydra/config.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-10/13-12-42/.hydra/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..1cfb0f135024f1f50aaca1311ef1461802759f0b --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-10/13-12-42/.hydra/config.yaml @@ -0,0 +1,649 @@ +actor_rollout_ref: + actor: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-06 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: true + optimizer_offload: true + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + ppo_mini_batch_size: 256 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: 16 + use_dynamic_bsz: false + ppo_max_token_len_per_gpu: 16384 + clip_ratio: 0.2 + clip_ratio_low: 0.2 + clip_ratio_high: 0.2 + tau_pos: 1.0 + tau_neg: 1.05 + freeze_vision_tower: false + policy_loss: + _target_: verl.workers.config.PolicyLossConfig + loss_mode: vanilla + clip_cov_ratio: 0.0002 + clip_cov_lb: 1.0 + clip_cov_ub: 5.0 + kl_cov_ratio: 0.0002 + ppo_kl_coef: 0.1 + clip_ratio_c: 3.0 + loss_agg_mode: token-mean + loss_scale_factor: null + entropy_coeff: 0 + 
calculate_entropy: false + use_kl_loss: true + use_prefix_grouper: false + use_torch_compile: true + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + ppo_epochs: 1 + shuffle: false + data_loader_seed: 42 + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + use_fused_kernels: ${oc.select:actor_rollout_ref.model.use_fused_kernels,false} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + grad_clip: 1.0 + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + use_remove_padding: ${oc.select:actor_rollout_ref.model.use_remove_padding,false} + calculate_sum_pi_squared: false + sum_pi_squared_checkpointing: false + ref: + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: ${actor_rollout_ref.actor.strategy} + use_torch_compile: ${oc.select:actor_rollout_ref.actor.use_torch_compile,true} + log_prob_micro_batch_size: null + 
log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: true + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: true + strategy: fsdp + dtype: bfloat16 + _target_: verl.workers.config.FSDPActorConfig + ulysses_sequence_parallel_size: ${oc.select:actor_rollout_ref.actor.ulysses_sequence_parallel_size,1} + entropy_from_logits_with_chunking: false + entropy_checkpointing: false + rollout: + 
_target_: verl.workers.config.RolloutConfig + name: vllm + mode: async + temperature: 1.0 + top_k: -1 + top_p: 1 + prompt_length: ${oc.select:data.max_prompt_length,512} + response_length: ${oc.select:data.max_response_length,512} + dtype: bfloat16 + gpu_memory_utilization: 0.4 + ignore_eos: false + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + tensor_model_parallel_size: 1 + data_parallel_size: 1 + expert_parallel_size: 1 + pipeline_model_parallel_size: 1 + max_num_batched_tokens: 8192 + max_model_len: 8192 + max_num_seqs: 1024 + enable_chunked_prefill: true + enable_prefix_caching: true + logprobs_mode: processed_logprobs + scheduling_policy: fcfs + load_format: dummy + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + log_prob_max_token_len_per_gpu: ${oc.select:actor_rollout_ref.actor.ppo_max_token_len_per_gpu,16384} + disable_log_stats: true + do_sample: true + 'n': 3 + over_sample_rate: 0 + multi_stage_wake_up: false + engine_kwargs: + vllm: {} + sglang: {} + trtllm: {} + val_kwargs: + _target_: verl.workers.config.SamplingConfig + top_k: -1 + top_p: 1.0 + temperature: 0 + 'n': 1 + do_sample: false + multi_turn: + _target_: verl.workers.config.MultiTurnConfig + enable: false + max_assistant_turns: null + tool_config_path: null + max_user_turns: null + max_parallel_calls: 1 + max_tool_response_length: 256 + tool_response_truncate_side: middle + interaction_config_path: null + use_inference_chat_template: false + tokenization_sanity_check_mode: strict + format: hermes + num_repeat_rollouts: null + calculate_log_probs: false + agent: + _target_: verl.workers.config.AgentLoopConfig + num_workers: 8 + default_agent_loop: single_turn_agent + agent_loop_config_path: null + custom_async_server: + _target_: verl.workers.config.CustomAsyncServerConfig + path: null + name: null + checkpoint_engine: + _target_: 
verl.workers.config.CheckpointEngineConfig + backend: naive + update_weights_bucket_megabytes: 2048 + engine_kwargs: {} + trace: + _target_: verl.workers.config.TraceConfig + backend: null + token2text: false + max_samples_per_step_per_worker: null + skip_rollout: false + skip_dump_dir: /tmp/rollout_dump + skip_tokenizer_init: true + enable_rollout_routing_replay: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: ${oc.select:actor_rollout_ref.actor.profiler.enable,false} + all_ranks: ${oc.select:actor_rollout_ref.actor.profiler.all_ranks,false} + ranks: ${oc.select:actor_rollout_ref.actor.profiler.ranks,[]} + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + prometheus: + _target_: verl.workers.config.PrometheusConfig + enable: false + port: 9090 + file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml + served_model_name: ${oc.select:actor_rollout_ref.model.path,null} + quantization: null + quantization_config_file: null + mtp: ${oc.select:actor_rollout_ref.model.mtp, null} + layered_summon: false + model: + _target_: verl.workers.config.HFModelConfig + path: Qwen/Qwen3-4B-Instruct-2507 + hf_config_path: null + tokenizer_path: null + use_shm: false + trust_remote_code: false + custom_chat_template: null + external_lib: null + override_config: {} + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: true + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + exclude_modules: null + lora_adapter_path: null + use_liger: false + use_fused_kernels: false + fused_kernel_options: + impl_backend: torch + tiled_mlp: + enabled: false + num_shards: 4 + mtp: + _target_: verl.workers.config.MtpConfig + enable: false + enable_train: false + enable_rollout: false + detach_encoder: false + mtp_loss_scaling_factor: 0.1 + speculative_algorithm: EAGLE + 
speculative_num_steps: 3 + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + method: mtp + num_speculative_tokens: 1 + hybrid_engine: true + nccl_timeout: 600 +data: + tokenizer: null + use_shm: false + train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + train_max_samples: -1 + val_max_samples: -1 + prompt_key: prompt + reward_fn_key: data_source + max_prompt_length: 1024 + max_response_length: 2048 + train_batch_size: 512 + val_batch_size: null + tool_config_path: ${oc.select:actor_rollout_ref.rollout.multi_turn.tool_config_path, + null} + return_raw_input_ids: false + return_raw_chat: true + return_full_prompt: false + shuffle: true + seed: null + dataloader_num_workers: 8 + image_patch_size: 14 + validation_shuffle: false + filter_overlong_prompts: true + filter_overlong_prompts_workers: 1 + truncation: error + image_key: images + video_key: videos + trust_remote_code: false + custom_cls: + path: null + name: null + return_multi_modal_inputs: true + sampler: + class_path: null + class_name: null + datagen: + path: null + name: null + apply_chat_template_kwargs: {} +reward_manager: + _target_: verl.trainer.config.config.RewardManagerConfig + source: register + name: ${oc.select:reward_model.reward_manager,naive} + module: + _target_: verl.trainer.config.config.ModuleConfig + path: null + name: custom_reward_manager +critic: + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + optimizer: AdamW + optimizer_impl: torch.optim + lr: 1.0e-05 + lr_warmup_steps_ratio: 0.0 + total_training_steps: -1 + weight_decay: 0.01 + lr_warmup_steps: -1 + betas: + - 0.9 + - 0.999 + clip_grad: 1.0 + min_lr_ratio: 0.0 + num_cycles: 0.5 + lr_scheduler_type: constant + warmup_style: null + override_optimizer_config: null + model: + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + 
param_offload: false + optimizer_offload: false + offload_policy: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + model_dtype: fp32 + use_orig_params: false + seed: 42 + full_determinism: false + ulysses_sequence_parallel_size: 1 + entropy_from_logits_with_chunking: false + use_torch_compile: true + entropy_checkpointing: false + forward_only: false + strategy: fsdp + dtype: bfloat16 + path: ~/models/deepseek-llm-7b-chat + tokenizer_path: ${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"} + override_config: {} + external_lib: ${oc.select:actor_rollout_ref.model.external_lib,null} + trust_remote_code: ${oc.select:actor_rollout_ref.model.trust_remote_code,false} + _target_: verl.workers.config.FSDPCriticModelCfg + use_shm: false + enable_gradient_checkpointing: true + enable_activation_offload: false + use_remove_padding: false + lora_rank: 0 + lora_alpha: 16 + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + _target_: verl.workers.config.FSDPCriticConfig + rollout_n: ${oc.select:actor_rollout_ref.rollout.n,1} + strategy: fsdp + enable: null + ppo_mini_batch_size: ${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256} + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size,null} + use_dynamic_bsz: ${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false} + ppo_max_token_len_per_gpu: 32768 + forward_max_token_len_per_gpu: ${.ppo_max_token_len_per_gpu} + ppo_epochs: ${oc.select:actor_rollout_ref.actor.ppo_epochs,1} + shuffle: ${oc.select:actor_rollout_ref.actor.shuffle,false} + data_loader_seed: 42 + cliprange_value: 0.5 + loss_agg_mode: ${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean} + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + save_contents: + - model + - optimizer + - extra + load_contents: ${.save_contents} + async_save: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: 
${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: ${oc.select:global_profiler.global_tool_config.nsys.discrete} + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + contents: [] + level: level0 + analysis: true + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000} + stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32} + forward_micro_batch_size: ${oc.select:.ppo_micro_batch_size,null} + forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null} + ulysses_sequence_parallel_size: 1 + grad_clip: 1.0 +reward_model: + enable: false + enable_resource_pool: false + n_gpus_per_node: 8 + nnodes: 0 + strategy: fsdp + model: + input_tokenizer: ${actor_rollout_ref.model.path} + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + external_lib: ${actor_rollout_ref.model.external_lib} + trust_remote_code: false + override_config: {} + use_shm: false + use_remove_padding: false + use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels} + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + wrap_policy: + min_num_params: 0 + param_offload: false + reshard_after_forward: true + fsdp_size: -1 + forward_prefetch: false + micro_batch_size: null + micro_batch_size_per_gpu: null + max_length: null + use_dynamic_bsz: ${critic.use_dynamic_bsz} + forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu} + reward_manager: naive + reward_loop_source: register + reward_loop_module_path: null + reward_loop_class_name: null + launch_reward_fn_async: false + 
sandbox_fusion: + url: null + max_concurrent: 64 + memory_limit_mb: 1024 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: ${oc.select:global_profiler.tool,null} + enable: false + all_ranks: false + ranks: [] + save_path: ${oc.select:global_profiler.save_path,null} + tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null} + ulysses_sequence_parallel_size: 1 + use_reward_loop: true + num_workers: 1 + rollout: + _target_: verl.workers.config.RolloutConfig + name: ??? + dtype: bfloat16 + gpu_memory_utilization: 0.5 + enforce_eager: true + cudagraph_capture_sizes: null + free_cache_engine: true + data_parallel_size: 1 + expert_parallel_size: 1 + tensor_model_parallel_size: 2 + max_num_batched_tokens: 8192 + max_model_len: null + max_num_seqs: 1024 + load_format: auto + engine_kwargs: {} + limit_images: null + enable_chunked_prefill: true + enable_prefix_caching: true + disable_log_stats: true + skip_tokenizer_init: false + prompt_length: 2048 + response_length: 2048 +algorithm: + rollout_correction: + rollout_is: null + rollout_is_threshold: 2.0 + rollout_rs: null + rollout_rs_threshold: null + bypass_mode: false + loss_type: ppo_clip + rollout_is_batch_normalize: false + _target_: verl.trainer.config.AlgoConfig + gamma: 1.0 + lam: 1.0 + adv_estimator: grpo + norm_adv_by_std_in_grpo: true + use_kl_in_reward: false + kl_penalty: kl + kl_ctrl: + _target_: verl.trainer.config.KLControlConfig + type: fixed + kl_coef: 0.001 + horizon: 10000 + target_kl: 0.1 + use_pf_ppo: false + pf_ppo: + reweight_method: pow + weight_pow: 2.0 +custom_reward_function: + path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + name: compute_score +trainer: + balance_batch: true + total_epochs: 15 + total_training_steps: null + project_name: readctrl-verl + experiment_name: qwen3-4b-instruct-en + logger: + - console + - wandb + log_val_generations: 0 + rollout_data_dir: null + validation_data_dir: null + nnodes: 1 + 
n_gpus_per_node: 2 + save_freq: 5 + esi_redundant_time: 0 + resume_mode: auto + resume_from_path: null + val_before_train: true + val_only: false + test_freq: 10 + critic_warmup: 0 + default_hdfs_dir: null + del_local_ckpt_after_load: false + default_local_dir: /home/mshahidul/readctrl/code/RL_model/train_v2 + max_actor_ckpt_to_keep: 1 + max_critic_ckpt_to_keep: 1 + ray_wait_register_center_timeout: 300 + device: cuda + use_legacy_worker_impl: auto + remove_previous_ckpt_in_save: true +global_profiler: + _target_: verl.utils.profiler.ProfilerConfig + tool: null + steps: null + profile_continuous_steps: false + save_path: outputs/profile + global_tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + controller_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + worker_nsight_options: + trace: cuda,nvtx,cublas,ucx + cuda-memory-usage: 'true' + cuda-graph-trace: graph + capture-range: cudaProfilerApi + capture-range-end: null + kill: none + torch_memory: + trace_alloc_max_entries: 100000 + stack_depth: 32 + context: all + stacks: all + kw_args: {} +transfer_queue: + enable: false +ray_kwargs: + ray_init: + num_cpus: null + timeline_json_file: null diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-10/13-12-42/.hydra/hydra.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-10/13-12-42/.hydra/hydra.yaml new file mode 100644 index 0000000000000000000000000000000000000000..41eaa94f4422bb50c68219971dd9006accd933f2 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-10/13-12-42/.hydra/hydra.yaml @@ -0,0 +1,212 @@ +hydra: + run: + dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} + sweep: + dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S} + subdir: ${hydra.job.num} + launcher: + _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher + sweeper: + _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper + max_batch_size: null + params: null + help: 
+ app_name: ${hydra.job.name} + header: '${hydra.help.app_name} is powered by Hydra. + + ' + footer: 'Powered by Hydra (https://hydra.cc) + + Use --hydra-help to view Hydra specific help + + ' + template: '${hydra.help.header} + + == Configuration groups == + + Compose your configuration from those groups (group=option) + + + $APP_CONFIG_GROUPS + + + == Config == + + Override anything in the config (foo.bar=value) + + + $CONFIG + + + ${hydra.help.footer} + + ' + hydra_help: + template: 'Hydra (${hydra.runtime.version}) + + See https://hydra.cc for more info. + + + == Flags == + + $FLAGS_HELP + + + == Configuration groups == + + Compose your configuration from those groups (For example, append hydra/job_logging=disabled + to command line) + + + $HYDRA_CONFIG_GROUPS + + + Use ''--cfg hydra'' to Show the Hydra config. + + ' + hydra_help: ??? + hydra_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][HYDRA] %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + root: + level: INFO + handlers: + - console + loggers: + logging_example: + level: DEBUG + disable_existing_loggers: false + job_logging: + version: 1 + formatters: + simple: + format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' + handlers: + console: + class: logging.StreamHandler + formatter: simple + stream: ext://sys.stdout + file: + class: logging.FileHandler + formatter: simple + filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log + root: + level: INFO + handlers: + - console + - file + disable_existing_loggers: false + env: {} + mode: RUN + searchpath: [] + callbacks: {} + output_subdir: .hydra + overrides: + hydra: + - hydra.mode=RUN + task: + - algorithm.adv_estimator=grpo + - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + - 
custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py + - data.train_batch_size=512 + - data.max_prompt_length=1024 + - data.max_response_length=2048 + - data.filter_overlong_prompts=True + - data.truncation=error + - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 + - actor_rollout_ref.actor.optim.lr=1e-6 + - actor_rollout_ref.model.use_remove_padding=True + - actor_rollout_ref.actor.ppo_mini_batch_size=256 + - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 + - actor_rollout_ref.actor.use_kl_loss=True + - actor_rollout_ref.actor.kl_loss_coef=0.001 + - actor_rollout_ref.actor.kl_loss_type=low_var_kl + - actor_rollout_ref.actor.entropy_coeff=0 + - actor_rollout_ref.model.enable_gradient_checkpointing=True + - actor_rollout_ref.actor.fsdp_config.param_offload=True + - actor_rollout_ref.actor.fsdp_config.optimizer_offload=True + - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.rollout.tensor_model_parallel_size=1 + - actor_rollout_ref.rollout.name=vllm + - actor_rollout_ref.rollout.gpu_memory_utilization=0.4 + - actor_rollout_ref.rollout.enforce_eager=True + - actor_rollout_ref.rollout.max_model_len=8192 + - actor_rollout_ref.rollout.n=3 + - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 + - actor_rollout_ref.ref.fsdp_config.param_offload=True + - algorithm.use_kl_in_reward=False + - trainer.critic_warmup=0 + - trainer.logger=["console","wandb"] + - trainer.project_name=readctrl-verl + - trainer.experiment_name=qwen3-4b-instruct-en + - trainer.n_gpus_per_node=2 + - trainer.nnodes=1 + - trainer.save_freq=5 + - trainer.test_freq=10 + - +trainer.remove_previous_ckpt_in_save=true + - trainer.max_actor_ckpt_to_keep=1 + - trainer.max_critic_ckpt_to_keep=1 + - trainer.resume_mode=auto + - trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2 + - trainer.total_epochs=15 + job: + name: main_ppo + chdir: null + override_dirname: 
+trainer.remove_previous_ckpt_in_save=true,actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=True,actor_rollout_ref.actor.fsdp_config.param_offload=True,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16,actor_rollout_ref.actor.ppo_mini_batch_size=256,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=True,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.enforce_eager=True,actor_rollout_ref.rollout.gpu_memory_utilization=0.4,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=1024,data.max_response_length=2048,data.train_batch_size=512,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2,trainer.experiment_name=qwen3-4b-instruct-en,trainer.logger=["console","wandb"],trainer.max_actor_ckpt_to_keep=1,trainer.max_critic_ckpt_to_keep=1,trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.resume_mode=auto,trainer.save_freq=5,trainer.test_freq=10,trainer.total_epochs=15 + id: ??? + num: ??? 
+ config_name: ppo_trainer + env_set: {} + env_copy: [] + config: + override_dirname: + kv_sep: '=' + item_sep: ',' + exclude_keys: [] + runtime: + version: 1.3.2 + version_base: '1.3' + cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train + config_sources: + - path: hydra.conf + schema: pkg + provider: hydra + - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config + schema: file + provider: main + - path: '' + schema: structured + provider: schema + output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-10/13-12-42 + choices: + algorithm@algorithm.rollout_correction: rollout_correction + reward_model: dp_reward_loop + critic: dp_critic + critic/../engine@critic.model.fsdp_config: fsdp + critic/../optim@critic.optim: fsdp + model@actor_rollout_ref.model: hf_model + rollout@actor_rollout_ref.rollout: rollout + ref@actor_rollout_ref.ref: dp_ref + ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp + data: legacy_data + actor@actor_rollout_ref.actor: dp_actor + actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp + actor/../optim@actor_rollout_ref.actor.optim: fsdp + hydra/env: default + hydra/callbacks: null + hydra/job_logging: default + hydra/hydra_logging: default + hydra/hydra_help: default + hydra/help: default + hydra/sweeper: basic + hydra/launcher: basic + hydra/output: default + verbose: false diff --git a/code/RL_model/verl/verl_train/outputs/2026-02-10/13-12-42/.hydra/overrides.yaml b/code/RL_model/verl/verl_train/outputs/2026-02-10/13-12-42/.hydra/overrides.yaml new file mode 100644 index 0000000000000000000000000000000000000000..4e460bb89d291402a8b1e4e2ed0fe7b22d0d0620 --- /dev/null +++ b/code/RL_model/verl/verl_train/outputs/2026-02-10/13-12-42/.hydra/overrides.yaml @@ -0,0 +1,45 @@ +- algorithm.adv_estimator=grpo +- data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet +- 
data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet +- custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py +- data.train_batch_size=512 +- data.max_prompt_length=1024 +- data.max_response_length=2048 +- data.filter_overlong_prompts=True +- data.truncation=error +- actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507 +- actor_rollout_ref.actor.optim.lr=1e-6 +- actor_rollout_ref.model.use_remove_padding=True +- actor_rollout_ref.actor.ppo_mini_batch_size=256 +- actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 +- actor_rollout_ref.actor.use_kl_loss=True +- actor_rollout_ref.actor.kl_loss_coef=0.001 +- actor_rollout_ref.actor.kl_loss_type=low_var_kl +- actor_rollout_ref.actor.entropy_coeff=0 +- actor_rollout_ref.model.enable_gradient_checkpointing=True +- actor_rollout_ref.actor.fsdp_config.param_offload=True +- actor_rollout_ref.actor.fsdp_config.optimizer_offload=True +- actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.rollout.tensor_model_parallel_size=1 +- actor_rollout_ref.rollout.name=vllm +- actor_rollout_ref.rollout.gpu_memory_utilization=0.4 +- actor_rollout_ref.rollout.enforce_eager=True +- actor_rollout_ref.rollout.max_model_len=8192 +- actor_rollout_ref.rollout.n=3 +- actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 +- actor_rollout_ref.ref.fsdp_config.param_offload=True +- algorithm.use_kl_in_reward=False +- trainer.critic_warmup=0 +- trainer.logger=["console","wandb"] +- trainer.project_name=readctrl-verl +- trainer.experiment_name=qwen3-4b-instruct-en +- trainer.n_gpus_per_node=2 +- trainer.nnodes=1 +- trainer.save_freq=5 +- trainer.test_freq=10 +- +trainer.remove_previous_ckpt_in_save=true +- trainer.max_actor_ckpt_to_keep=1 +- trainer.max_critic_ckpt_to_keep=1 +- trainer.resume_mode=auto +- trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/train_v2 +- trainer.total_epochs=15 diff --git 
import os
import json
import re
import itertools

import dspy
from openai import OpenAI

# NOTE(review): the original `from cgi import print_arguments` was unused and
# `cgi` was removed from the stdlib in Python 3.13 (PEP 594), so it is dropped.

# Llama-3.1 chat template rendered by hand because we call the plain
# /completions endpoint (not /chat/completions) on the local vLLM server.
CHAT_TEMPLATE = (
    "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n"
    "Cutting Knowledge Date: December 2023\n"
    "Today Date: 26 July 2024\n\n"
    "<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n"
    "{user_prompt}"
    "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
)


class MedicalClaimVerifier:
    """Judges which subclaims are supported by a generated medical text.

    Sends a single prompt (passage + numbered subclaim list) to an
    OpenAI-compatible vLLM server and parses the JSON-array verdict.
    """

    def __init__(self):
        # Prefer local vLLM (OpenAI-compatible) server settings.
        self.model_name = os.getenv("VLLM_MODEL", "sc")
        self.base_url = os.getenv("VLLM_API_BASE", "http://172.16.34.22:3090/v1")
        self.client = OpenAI(api_key="EMPTY", base_url=self.base_url)

        # Keep completeness threshold fixed at 1.0.
        self.comp_thresholds = {
            "low": 1.0,
            "intermediate": 1.0,
            "proficient": 1.0,
        }
        # IQR ranges (lower, upper) for coverage, per target literacy level.
        self.cov_iqr_ranges = {
            "low": (0.1765, 0.3226),
            "intermediate": (0.1818, 0.4091),
            "proficient": (0.7725, 0.9347),
        }

    def build_user_prompt(self, text, subclaims):
        """Build the judge prompt: the passage plus a 1-indexed subclaim list."""
        numbered_subclaims = "\n".join(
            f"{idx + 1}. {subclaim}" for idx, subclaim in enumerate(subclaims)
        )
        return (
            "You are a medical evidence checker.\n"
            "Given a medical passage and a list of subclaims, return labels for each "
            "subclaim in the same order.\n\n"
            "Allowed labels: supported, not_supported.\n"
            "Output format: a JSON array of strings only.\n\n"
            f"Medical text:\n{text}\n\n"
            f"Subclaims:\n{numbered_subclaims}"
        )

    def render_chat_prompt(self, user_prompt):
        """Wrap the user prompt in the Llama-3.1 chat template."""
        return CHAT_TEMPLATE.format(user_prompt=user_prompt)

    def extract_label_list(self, text):
        """Parse judge output into a list.

        Tries the whole text as JSON first, then falls back to the first
        bracketed span found by regex; returns [] if nothing parses.
        """
        cleaned = text.strip()
        try:
            parsed = json.loads(cleaned)
            if isinstance(parsed, list):
                return parsed
        except json.JSONDecodeError:
            pass

        match = re.search(r"\[[\s\S]*\]", cleaned)
        if match:
            try:
                parsed = json.loads(match.group(0))
                if isinstance(parsed, list):
                    return parsed
            except json.JSONDecodeError:
                return []
        return []

    def check_support_api(self, context, subclaims):
        """Ask the judge which subclaims `context` supports.

        Returns lowercased labels; returns [] on any API failure so a flaky
        judge endpoint never crashes reward computation (deliberate
        best-effort behavior).
        """
        if not context or not subclaims:
            return []

        user_prompt = self.build_user_prompt(context, subclaims)
        prompt = self.render_chat_prompt(user_prompt)
        try:
            response = self.client.completions.create(
                model=self.model_name,
                prompt=prompt,
                max_tokens=256,
                temperature=0,  # keep it deterministic for evaluation
            )
            pred_text = response.choices[0].text.strip()
            labels = self.extract_label_list(pred_text)
            return [str(x).strip().lower() for x in labels]
        except Exception:
            return []

    def _average_supported(self, labels, expected_len):
        """Fraction of `expected_len` slots labelled exactly 'supported'.

        Short label lists are padded with 'invalid' (counted as unsupported);
        long lists are truncated, so malformed judge output only lowers the
        score rather than raising.
        """
        if expected_len <= 0:
            return 0.0
        normalized = [str(x).strip().lower() for x in labels]
        if len(normalized) < expected_len:
            normalized.extend(["invalid"] * (expected_len - len(normalized)))
        elif len(normalized) > expected_len:
            normalized = normalized[:expected_len]
        supported_count = sum(1 for item in normalized if item == "supported")
        return supported_count / expected_len

    def evaluate_level(self, gen_text, gold_subs, full_subs):
        """Return (completeness, coverage) fractions for `gen_text`.

        Completeness is judged against the summary subclaims, coverage
        against the full-text subclaims. (0.0, 0.0) when any input is empty.
        """
        if not gen_text or not gold_subs or not full_subs:
            return 0.0, 0.0

        # Match support-check format with test.py: one prompt with the text
        # plus the whole subclaim list, per metric.
        comp_labels = self.check_support_api(gen_text, gold_subs)
        cov_labels = self.check_support_api(gen_text, full_subs)

        comp_score = self._average_supported(comp_labels, len(gold_subs))
        cov_score = self._average_supported(cov_labels, len(full_subs))
        return comp_score, cov_score


# Process-wide judge instance (constructed once; the OpenAI client makes no
# network call at construction time).
verifier = MedicalClaimVerifier()

DEFAULT_API_BASE = "http://172.16.34.22:8040/v1"
LITERACY_LMS = [
    dspy.LM(
        model="openai/dspy",
        api_base=os.getenv("VLLM_API_BASE", DEFAULT_API_BASE),
        api_key="EMPTY",
        temperature=0.0,
    )
]
# Round-robin over the LM pool (currently a single endpoint; extend the list
# above to spread classifier calls across GPUs).
literacy_lm_cycle = itertools.cycle(LITERACY_LMS)

MODEL_PATH = os.environ.get(
    "HEALTH_LITERACY_MODEL_PATH",
    "/home/mshahidul/readctrl/code/text_classifier/"
    "dspy_model/vllm-Meta-Llama-3.1-8B-Instruct_teacher-gpt5_v1/model.json",
)


class HealthLiteracySignature(dspy.Signature):
    """
    Analyze the linguistic complexity, use of medical jargon, and sentence
    structure of 'generated_text' to determine the health literacy level.
    """

    generated_text = dspy.InputField(
        desc="A version of the source text rewritten for a specific audience."
    )
    literacy_label = dspy.OutputField(
        desc=(
            "Classification: low_health_literacy (simple words, no jargon), "
            "intermediate_health_literacy (moderate technicality), or "
            "proficient_health_literacy (highly technical/original level)."
        )
    )


class HealthLiteracyClassifier(dspy.Module):
    """Chain-of-thought classifier over HealthLiteracySignature."""

    def __init__(self):
        super().__init__()
        self.classifier = dspy.ChainOfThought(HealthLiteracySignature)

    def forward(self, generated_text):
        return self.classifier(generated_text=generated_text)


# Lazily-loaded singleton; populated by _get_classifier() on first use.
_COMPILED_CLASSIFIER = None


def _load_compiled_classifier(path):
    """Load the compiled DSPy program from `path`.

    Tries the modern `dspy.load` entry point first, then falls back to
    instantiating the module and calling its `.load`. Raises RuntimeError
    (chaining the original error) if both fail.
    """
    if hasattr(dspy, "load"):
        try:
            return dspy.load(path)
        except Exception:
            pass  # fall through to the instance-level loader below
    classifier = HealthLiteracyClassifier()
    try:
        classifier.load(path)
    except Exception as exc:
        raise RuntimeError(f"Failed to load compiled model from {path}") from exc
    return classifier


def _get_classifier():
    """Return the process-wide classifier, loading it on first call."""
    global _COMPILED_CLASSIFIER
    if _COMPILED_CLASSIFIER is None:
        if not os.path.exists(MODEL_PATH):
            raise FileNotFoundError(f"Model file not found: {MODEL_PATH}")
        _COMPILED_CLASSIFIER = _load_compiled_classifier(MODEL_PATH)
    return _COMPILED_CLASSIFIER


def _parse_solution_json(solution_str):
    """Parse the policy's output as JSON, stripping ``` / ```json fences.

    Returns the parsed object, or None on any failure.
    """
    try:
        cleaned_str = solution_str.strip()
        if "```json" in cleaned_str:
            cleaned_str = cleaned_str.split("```json")[1].split("```")[0].strip()
        elif "```" in cleaned_str:
            cleaned_str = cleaned_str.split("```")[1].split("```")[0].strip()
        return json.loads(cleaned_str)
    except Exception:
        return None


def _get_target_level(extra_info):
    """Extract the requested literacy level from `extra_info`, if present."""
    if not extra_info:
        return None
    return extra_info.get("target_level")


def _predict_label(generated_text):
    """Classify `generated_text`, rotating through the LM pool per call."""
    classifier = _get_classifier()
    # Pick the next endpoint from the pool; dspy.context ensures THIS
    # specific call uses the selected LM.
    current_lm = next(literacy_lm_cycle)
    with dspy.context(lm=current_lm):
        prediction = classifier(generated_text=generated_text)

    if not prediction or not hasattr(prediction, "literacy_label"):
        return ""
    return str(prediction.literacy_label).strip().lower()


def _compute_classifier_reward(target_level, gen_text):
    """1.0 if the predicted label contains `target_level`, else 0.0.

    Classifier failures score 0.0 rather than raising (best effort).
    """
    try:
        pred_label = _predict_label(gen_text)
    except Exception:
        return 0.0
    return 1.0 if target_level in pred_label else 0.0


def _score_flat_top_iqr(value, bounds, weight=1.0):
    """
    Flat-top reward: constant maximum `weight` inside [lower, upper], and a
    linear penalty outside — the further from the band, the lower the reward
    (it can go negative).
    """
    lower, upper = bounds
    if lower <= value <= upper:
        return weight  # maximum reward for being in the "Goldilocks" zone

    # Distance to the nearest bound drives the linear decay.
    distance = lower - value if value < lower else value - upper
    return weight - distance


def compute_score(data_source, solution_str, ground_truth, extra_info=None):
    """verl custom reward for literacy-controlled medical rewriting.

    Total = completeness (weight 1.5, linear in supported fraction)
          + coverage (flat-top IQR band per level, weight 1.0)
          + literacy-classifier match (weight 1.0).

    Sentinels: 0.0 for missing ground truth or unknown target level,
    -2.0 for unparseable / wrongly-shaped output, -1.0 for trivial text.
    """
    gold_subs = ground_truth.get('summary_subclaims', [])
    full_subs = ground_truth.get('fulltext_subclaims', [])

    # 1. Strict format & data validation.
    if not gold_subs or not full_subs:
        return 0.0

    data = _parse_solution_json(solution_str)
    # isinstance guard: valid JSON that is not an object (e.g. a bare list)
    # previously crashed on `.get`; treat it as a format failure, which is
    # penalized more than a content failure.
    if not data or not isinstance(data, dict):
        return -2.0

    target_level = _get_target_level(extra_info)
    level_map = {
        "low_health_literacy": "low",
        "intermediate_health_literacy": "intermediate",
        "proficient_health_literacy": "proficient",
    }
    level_key = level_map.get(target_level)

    if not target_level or not level_key:
        return 0.0

    gen_text = data.get(target_level, "")
    # Non-string values (e.g. a JSON number) previously crashed on .strip();
    # count them as trivial responses instead.
    if not isinstance(gen_text, str) or len(gen_text.strip()) < 10:
        return -1.0  # penalize empty or trivial responses

    # 2. Extract metrics from the claim verifier.
    comp_s, cov_s = verifier.evaluate_level(gen_text, gold_subs, full_subs)

    # 3. Component weights.
    W_COMPLETENESS = 1.5  # primary goal: don't lie/omit facts
    W_COVERAGE = 1.0      # secondary: match the intended information density
    W_CLASSIFIER = 1.0    # tertiary: match the linguistic style

    # Factual completeness: linear in the supported fraction.
    # (The old comment claimed "squared scaling", but the code has always
    # been linear — comment fixed to match behavior.)
    comp_reward = comp_s * W_COMPLETENESS

    # Information coverage: flat-top band avoids "pinching" the model onto
    # one specific number.
    cov_range = verifier.cov_iqr_ranges[level_key]
    cov_reward = _score_flat_top_iqr(cov_s, cov_range, weight=W_COVERAGE)

    # Literacy-style match.
    classifier_reward = _compute_classifier_reward(target_level, gen_text) * W_CLASSIFIER

    # 4. Total reward.
    return comp_reward + cov_reward + classifier_reward
+# generated_response = { +# "low_health_literacy": ( +# "This medicine is for your high blood pressure. It is a type of drug " +# "called an ACE inhibitor. It helps your heart work better." +# ) +# } + +# solution_str = f"```json\n{json.dumps(generated_response)}\n```" +# extra_info = {"target_level": "low_health_literacy"} + +# print("📡 Initializing actual API connection to 172.16.34.21...") +# start_time = time.time() + +# try: +# # 2. Execute the actual score logic +# # This will trigger the ThreadPoolExecutor and make actual HTTP calls to your vLLM +# score = compute_score( +# data_source="real_api_test", +# solution_str=solution_str, +# ground_truth=ground_truth, +# extra_info=extra_info +# ) + +# duration = time.time() - start_time +# print(f"\n✅ API Call Successful ({round(duration, 2)}s)") +# print("-" * 40) +# print(f"Target Level: {extra_info['target_level']}") +# print(f"Final Reward Score: {round(score, 4)}") +# print("-" * 40) + +# # Logic check for the user +# print("\nDEBUG INFO:") +# print("- Completeness: Checks if the 3 summary claims are in the 'Low' text.") +# print("- Coverage: Checks how many of the 6 full-text claims are present.") +# print(f"- Target Thresholds: Comp >= 1.0, Cov between 0.32 and 0.45") + +# except Exception as e: +# print(f"\n❌ API Call Failed!") +# print(f"Error Type: {type(e).__name__}") +# print(f"Details: {str(e)}") +# print("\nPossible fixes:") +# print("1. Check if the vLLM server at :8086 and :8034 are running.") +# print("2. 
Check if your API key in api_new.json is valid.") + +# if __name__ == "__main__": +# run_actual_api_test() \ No newline at end of file diff --git a/code/RL_model/verl/verl_train/reward_func/reward_func/reward_1.py b/code/RL_model/verl/verl_train/reward_func/reward_func/reward_1.py new file mode 100644 index 0000000000000000000000000000000000000000..40dc9357284427068c911ddea7f504c7d0720291 --- /dev/null +++ b/code/RL_model/verl/verl_train/reward_func/reward_func/reward_1.py @@ -0,0 +1,378 @@ +from cgi import print_arguments +import os +import json +import re +import dspy +from openai import OpenAI +import itertools + +CHAT_TEMPLATE = ( + "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n" + "Cutting Knowledge Date: December 2023\n" + "Today Date: 26 July 2024\n\n" + "<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n" + "{user_prompt}" + "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n" +) + + +class MedicalClaimVerifier: + def __init__(self): + # Prefer local vLLM (OpenAI-compatible) server settings + self.model_name = os.getenv("VLLM_MODEL", "sc") + self.base_url = os.getenv("VLLM_API_BASE", "http://172.16.34.22:3090/v1") + self.client = OpenAI(api_key="EMPTY", base_url=self.base_url) + + # Keep completeness threshold fixed at 1.0. + self.comp_thresholds = { + "low": 1.0, + "intermediate": 1.0, + "proficient": 1.0, + } + # Use IQR ranges (lower, upper) for coverage. + self.cov_iqr_ranges = { + "low": (0.1765, 0.3226), + "intermediate": (0.1818, 0.4091), + "proficient": (0.7725, 0.9347), + } + + def build_user_prompt(self, text, subclaims): + numbered_subclaims = "\n".join( + f"{idx + 1}. 
{subclaim}" for idx, subclaim in enumerate(subclaims) + ) + return ( + "You are a medical evidence checker.\n" + "Given a medical passage and a list of subclaims, return labels for each " + "subclaim in the same order.\n\n" + "Allowed labels: supported, not_supported.\n" + "Output format: a JSON array of strings only.\n\n" + f"Medical text:\n{text}\n\n" + f"Subclaims:\n{numbered_subclaims}" + ) + + def render_chat_prompt(self, user_prompt): + return CHAT_TEMPLATE.format(user_prompt=user_prompt) + + def extract_label_list(self, text): + cleaned = text.strip() + try: + parsed = json.loads(cleaned) + if isinstance(parsed, list): + return parsed + except json.JSONDecodeError: + pass + + match = re.search(r"\[[\s\S]*\]", cleaned) + if match: + try: + parsed = json.loads(match.group(0)) + if isinstance(parsed, list): + return parsed + except json.JSONDecodeError: + return [] + return [] + + def check_support_api(self, context, subclaims): + if not context or not subclaims: + return [] + + user_prompt = self.build_user_prompt(context, subclaims) + prompt = self.render_chat_prompt(user_prompt) + try: + response = self.client.completions.create( + model=self.model_name, + prompt=prompt, + max_tokens=256, + temperature=0, # Keep it deterministic for evaluation + ) + pred_text = response.choices[0].text.strip() + labels = self.extract_label_list(pred_text) + normalized = [str(x).strip().lower() for x in labels] + # print("--------------------------------") + # print(pred_text) + # print(normalized) + # print("--------------------------------") + return normalized + except Exception: + return [] + + def _average_supported(self, labels, expected_len): + if expected_len <= 0: + return 0.0 + normalized = [str(x).strip().lower() for x in labels] + if len(normalized) < expected_len: + normalized.extend(["invalid"] * (expected_len - len(normalized))) + elif len(normalized) > expected_len: + normalized = normalized[:expected_len] + supported_count = sum(1 for item in normalized if 
item == "supported") + return supported_count / expected_len + + def evaluate_level(self, gen_text, gold_subs, full_subs): + if not gen_text or not gold_subs or not full_subs: + return 0.0, 0.0 + + # Match support-check format with test.py: single prompt with text + list of subclaims. + comp_labels = self.check_support_api(gen_text, gold_subs) + cov_labels = self.check_support_api(gen_text, full_subs) + + comp_score = self._average_supported(comp_labels, len(gold_subs)) + cov_score = self._average_supported(cov_labels, len(full_subs)) + return comp_score, cov_score + +verifier = MedicalClaimVerifier() +DEFAULT_API_BASE = "http://172.16.34.22:8040/v1" +LITERACY_LMS = [ + dspy.LM( + model="openai/dspy", + api_base=os.getenv("VLLM_API_BASE", DEFAULT_API_BASE), + api_key="EMPTY", + temperature=0.0, + ) +] +literacy_lm_cycle = itertools.cycle(LITERACY_LMS) + +MODEL_PATH = os.environ.get( + "HEALTH_LITERACY_MODEL_PATH", + "/home/mshahidul/readctrl/code/text_classifier/" + "dspy_model/vllm-Meta-Llama-3.1-8B-Instruct_teacher-gpt5_v1/model.json", +) + + +# dspy.configure(lm=next(literacy_lm_cycle)) + + +class HealthLiteracySignature(dspy.Signature): + """ + Analyze the linguistic complexity, use of medical jargon, and sentence + structure of 'generated_text' to determine the health literacy level. + """ + + generated_text = dspy.InputField( + desc="A version of the source text rewritten for a specific audience." + ) + literacy_label = dspy.OutputField( + desc=( + "Classification: low_health_literacy (simple words, no jargon), " + "intermediate_health_literacy (moderate technicality), or " + "proficient_health_literacy (highly technical/original level)." 
+ ) + ) + + +class HealthLiteracyClassifier(dspy.Module): + def __init__(self): + super().__init__() + self.classifier = dspy.ChainOfThought(HealthLiteracySignature) + + def forward(self, generated_text): + return self.classifier(generated_text=generated_text) + + +_COMPILED_CLASSIFIER = None + + +def _load_compiled_classifier(path): + if hasattr(dspy, "load"): + try: + return dspy.load(path) + except Exception: + pass + classifier = HealthLiteracyClassifier() + try: + classifier.load(path) + except Exception as exc: + raise RuntimeError(f"Failed to load compiled model from {path}") from exc + return classifier + + +def _get_classifier(): + global _COMPILED_CLASSIFIER + if _COMPILED_CLASSIFIER is None: + if not os.path.exists(MODEL_PATH): + raise FileNotFoundError(f"Model file not found: {MODEL_PATH}") + _COMPILED_CLASSIFIER = _load_compiled_classifier(MODEL_PATH) + return _COMPILED_CLASSIFIER + +def _parse_solution_json(solution_str): + try: + cleaned_str = solution_str.strip() + if "```json" in cleaned_str: + cleaned_str = cleaned_str.split("```json")[1].split("```")[0].strip() + elif "```" in cleaned_str: + cleaned_str = cleaned_str.split("```")[1].split("```")[0].strip() + return json.loads(cleaned_str) + except Exception: + return None + + +def _get_target_level(extra_info): + if not extra_info: + return None + return extra_info.get("target_level") + + +def _predict_label(generated_text): + classifier = _get_classifier() + + # 2. Pick the next GPU/LM from the pool + current_lm = next(literacy_lm_cycle) + + # 3. 
Use dspy.context to ensure THIS specific call uses the selected GPU + with dspy.context(lm=current_lm): + prediction = classifier(generated_text=generated_text) + + if not prediction or not hasattr(prediction, "literacy_label"): + return "" + # import ipdb; ipdb.set_trace() + return str(prediction.literacy_label).strip().lower() + + +def _compute_classifier_reward(target_level, gen_text): + try: + pred_label = _predict_label(gen_text) + except Exception: + return 0.0 + return 1.0 if target_level in pred_label else 0.0 + +import numpy as np + +def _score_flat_top_iqr(value, bounds, weight=1.0): + """ + Provides a constant maximum reward within the range, + and a linear penalty outside of it. + """ + lower, upper = bounds + if lower <= value <= upper: + return weight # Maximum reward for being in the "Goldilocks" zone + + # Calculate distance to the nearest bound + distance = lower - value if value < lower else value - upper + # Linear decay: the further away, the lower the reward (can go negative) + return weight - distance + +def compute_score(data_source, solution_str, ground_truth, extra_info=None): + gold_subs = ground_truth.get('summary_subclaims', []) + full_subs = ground_truth.get('fulltext_subclaims', []) + + # 1. Strict Format & Data Validation + if not gold_subs or not full_subs: + return 0.0 + + data = _parse_solution_json(solution_str) + if not data: + return -2.0 # Penalize format failure more than content failure + + target_level = _get_target_level(extra_info) + level_map = { + "low_health_literacy": "low", + "intermediate_health_literacy": "intermediate", + "proficient_health_literacy": "proficient", + } + level_key = level_map.get(target_level) + + if not target_level or not level_key: + return 0.0 + + gen_text = data.get(target_level, "") + if not gen_text or len(gen_text.strip()) < 10: + return -1.0 # Penalize empty or trivial responses + + # 2. 
Extract Metrics from Verifier + comp_s, cov_s = verifier.evaluate_level(gen_text, gold_subs, full_subs) + + # 3. Component Weights + W_COMPLETENESS = 2.5 # Primary goal: Don't lie/omit facts + W_COVERAGE = 1.5 # Secondary: Match the intended information density + W_CLASSIFIER = 1.0 # Tertiary: Match the linguistic style + + # --- FACTUAL COMPLETENESS REWARD --- + # Use squared scaling: moving from 0.8 -> 0.9 is worth more than 0.1 -> 0.2 + # This prevents the model from "settling" for mediocre factual accuracy. + comp_reward = (comp_s ** 2) * W_COMPLETENESS + + # --- INFORMATION COVERAGE (IQR) REWARD --- + # We use flat-top to prevent "pinching" the model into one specific number. + cov_range = verifier.cov_iqr_ranges[level_key] + cov_reward = _score_flat_top_iqr(cov_s, cov_range, weight=W_COVERAGE) + + # --- LITERACY CLASSIFIER REWARD --- + classifier_reward = _compute_classifier_reward(target_level, gen_text) * W_CLASSIFIER + + # 4. Total Reward Calculation + total_score = comp_reward + cov_reward + classifier_reward + + + return total_score + + +# import os +# import json +# import time + +# def run_actual_api_test(): +# # 1. Prepare Real Medical Data +# # A summary vs a full text about Hypertension (Lisinopril) +# ground_truth = { +# "summary_subclaims": [ +# "Lisinopril is used to treat high blood pressure.", +# "It belongs to a class of drugs called ACE inhibitors.", +# "Common side effects include a dry cough." +# ], +# "fulltext_subclaims": [ +# "Lisinopril is used to treat high blood pressure.", +# "It belongs to a class of drugs called ACE inhibitors.", +# "Common side effects include a dry cough.", +# "It helps prevent heart attacks and strokes.", +# "Patients should have their kidney function monitored.", +# "Do not use if you are pregnant." +# ] +# } + +# # This is what the LLM generated for "low_health_literacy" +# # Note: It covers the first 2 subclaims but ignores the cough and pregnancy warnings. 
+# generated_response = { +# "low_health_literacy": ( +# "This medicine is for your high blood pressure. It is a type of drug " +# "called an ACE inhibitor. It helps your heart work better." +# ) +# } + +# solution_str = f"```json\n{json.dumps(generated_response)}\n```" +# extra_info = {"target_level": "low_health_literacy"} + +# print("📡 Initializing actual API connection to 172.16.34.21...") +# start_time = time.time() + +# try: +# # 2. Execute the actual score logic +# # This will trigger the ThreadPoolExecutor and make actual HTTP calls to your vLLM +# score = compute_score( +# data_source="real_api_test", +# solution_str=solution_str, +# ground_truth=ground_truth, +# extra_info=extra_info +# ) + +# duration = time.time() - start_time +# print(f"\n✅ API Call Successful ({round(duration, 2)}s)") +# print("-" * 40) +# print(f"Target Level: {extra_info['target_level']}") +# print(f"Final Reward Score: {round(score, 4)}") +# print("-" * 40) + +# # Logic check for the user +# print("\nDEBUG INFO:") +# print("- Completeness: Checks if the 3 summary claims are in the 'Low' text.") +# print("- Coverage: Checks how many of the 6 full-text claims are present.") +# print(f"- Target Thresholds: Comp >= 1.0, Cov between 0.32 and 0.45") + +# except Exception as e: +# print(f"\n❌ API Call Failed!") +# print(f"Error Type: {type(e).__name__}") +# print(f"Details: {str(e)}") +# print("\nPossible fixes:") +# print("1. Check if the vLLM server at :8086 and :8034 are running.") +# print("2. 
# NOTE(review): `from cgi import print_arguments` was removed -- it was unused,
# and the `cgi` module is deleted in Python 3.13, so importing it breaks this
# module outright.
import os
import json
import re
import itertools

import dspy
import numpy as np  # NOTE(review): unused in the visible code -- confirm before removing
from openai import OpenAI

# Llama-3 instruct chat scaffold; {user_prompt} is filled per request.
CHAT_TEMPLATE = (
    "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n"
    "Cutting Knowledge Date: December 2023\n"
    "Today Date: 26 July 2024\n\n"
    "<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n"
    "{user_prompt}"
    "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
)


class MedicalClaimVerifier:
    """Judges completeness/coverage of generated medical text against subclaim
    lists using a local vLLM (OpenAI-compatible) server."""

    def __init__(self):
        # Prefer local vLLM (OpenAI-compatible) server settings.
        self.model_name = os.getenv("VLLM_MODEL", "sc")
        self.base_url = os.getenv("VLLM_API_BASE", "http://172.16.34.22:3090/v1")
        self.client = OpenAI(api_key="EMPTY", base_url=self.base_url)

        # Keep completeness threshold fixed at 1.0.
        self.comp_thresholds = {
            "low": 1.0,
            "intermediate": 1.0,
            "proficient": 1.0,
        }
        # IQR ranges (lower, upper) for coverage, per literacy level.
        self.cov_iqr_ranges = {
            "low": (0.1765, 0.3226),
            "intermediate": (0.1818, 0.4091),
            "proficient": (0.7725, 0.9347),
        }

    def build_user_prompt(self, text, subclaims):
        """Render the adjudication prompt for one passage and its subclaims."""
        numbered_subclaims = "\n".join(
            f"{idx + 1}. {subclaim}" for idx, subclaim in enumerate(subclaims)
        )
        return (
            "You are an expert medical adjudicator. Determine if the 'Medical Passage' "
            "contains the core factual information of each 'Subclaim', even if the passage "
            "uses simpler language or layperson terms.\n\n"
            "Rules:\n"
            "- Label 'supported' if the essential meaning is present.\n"
            "- Label 'not_supported' only if the information is missing or contradicted.\n"
            "Output: JSON array of strings ['supported', 'not_supported', ...]\n\n"
            f"Medical Passage: {text}\n\n"
            f"Subclaims:\n{numbered_subclaims}"
        )

    def render_chat_prompt(self, user_prompt):
        """Wrap the user prompt in the Llama-3 chat template."""
        return CHAT_TEMPLATE.format(user_prompt=user_prompt)

    def extract_label_list(self, text):
        """Parse a JSON-ish list of labels from the judge output.

        BUG FIX: the previous regex ORed two alternatives across the whole
        pattern, so it matched an unterminated "[ 'supported" fragment and
        json.loads failed on every reply -- labels always came back empty.
        Now we grab the first bracketed span, normalise single quotes to
        double quotes, and parse that.
        """
        match = re.search(r"\[[\s\S]*?\]", text)
        if not match:
            return []
        candidate = match.group(0).replace("'", '"')  # single -> double quotes for JSON
        try:
            parsed = json.loads(candidate)
        except Exception:
            return []
        return parsed if isinstance(parsed, list) else []

    def check_support_api(self, context, subclaims):
        """Ask the vLLM judge which subclaims the passage supports.

        Returns lowercased labels; an empty list on any API or parse failure.
        """
        if not context or not subclaims:
            return []

        user_prompt = self.build_user_prompt(context, subclaims)
        prompt = self.render_chat_prompt(user_prompt)
        try:
            response = self.client.completions.create(
                model=self.model_name,
                prompt=prompt,
                max_tokens=256,
                temperature=0,  # Keep it deterministic for evaluation
            )
            pred_text = response.choices[0].text.strip()
            labels = self.extract_label_list(pred_text)
            return [str(x).strip().lower() for x in labels]
        except Exception:
            return []

    def _average_supported(self, labels, expected_len):
        """Fraction of `expected_len` slots labelled 'supported' (pad/truncate to fit)."""
        if expected_len <= 0:
            return 0.0
        normalized = [str(x).strip().lower() for x in labels]
        if len(normalized) < expected_len:
            normalized.extend(["invalid"] * (expected_len - len(normalized)))
        elif len(normalized) > expected_len:
            normalized = normalized[:expected_len]
        supported_count = sum(1 for item in normalized if item == "supported")
        return supported_count / expected_len

    def evaluate_level(self, gen_text, gold_subs, full_subs):
        """Score one generation: (completeness vs summary subs, coverage vs full-text subs)."""
        if not gen_text or not gold_subs or not full_subs:
            return 0.0, 0.0

        # Single prompt with text + subclaim list, matching test.py's format.
        comp_labels = self.check_support_api(gen_text, gold_subs)
        cov_labels = self.check_support_api(gen_text, full_subs)

        comp_score = self._average_supported(comp_labels, len(gold_subs))
        cov_score = self._average_supported(cov_labels, len(full_subs))
        return comp_score, cov_score


verifier = MedicalClaimVerifier()
DEFAULT_API_BASE = "http://172.16.34.22:8040/v1"
LITERACY_LMS = [
    dspy.LM(
        model="openai/dspy",
        api_base=os.getenv("VLLM_API_BASE", DEFAULT_API_BASE),
        api_key="EMPTY",
        temperature=0.0,
    )
]
literacy_lm_cycle = itertools.cycle(LITERACY_LMS)

MODEL_PATH = os.environ.get(
    "HEALTH_LITERACY_MODEL_PATH",
    "/home/mshahidul/readctrl/code/text_classifier/"
    "dspy_model/vllm-Meta-Llama-3.1-8B-Instruct_teacher-gpt5_v1/model.json",
)


class HealthLiteracySignature(dspy.Signature):
    """
    Analyze the linguistic complexity, use of medical jargon, and sentence
    structure of 'generated_text' to determine the health literacy level.
    """

    generated_text = dspy.InputField(
        desc="A version of the source text rewritten for a specific audience."
    )
    literacy_label = dspy.OutputField(
        desc=(
            "Classification: low_health_literacy (simple words, no jargon), "
            "intermediate_health_literacy (moderate technicality), or "
            "proficient_health_literacy (highly technical/original level)."
        )
    )


class HealthLiteracyClassifier(dspy.Module):
    """Chain-of-thought wrapper around HealthLiteracySignature."""

    def __init__(self):
        super().__init__()
        self.classifier = dspy.ChainOfThought(HealthLiteracySignature)

    def forward(self, generated_text):
        return self.classifier(generated_text=generated_text)


_COMPILED_CLASSIFIER = None


def _load_compiled_classifier(path):
    """Load the compiled dspy program; prefer dspy.load, fall back to Module.load."""
    if hasattr(dspy, "load"):
        try:
            return dspy.load(path)
        except Exception:
            pass  # older artifact layout -- fall through to Module.load below
    classifier = HealthLiteracyClassifier()
    try:
        classifier.load(path)
    except Exception as exc:
        raise RuntimeError(f"Failed to load compiled model from {path}") from exc
    return classifier


def _get_classifier():
    """Lazily load and memoise the compiled literacy classifier."""
    global _COMPILED_CLASSIFIER
    if _COMPILED_CLASSIFIER is None:
        if not os.path.exists(MODEL_PATH):
            raise FileNotFoundError(f"Model file not found: {MODEL_PATH}")
        _COMPILED_CLASSIFIER = _load_compiled_classifier(MODEL_PATH)
    return _COMPILED_CLASSIFIER


def _parse_solution_json(solution_str):
    """Strip optional ``` fences and parse the policy output as JSON (None on failure)."""
    try:
        cleaned_str = solution_str.strip()
        if "```json" in cleaned_str:
            cleaned_str = cleaned_str.split("```json")[1].split("```")[0].strip()
        elif "```" in cleaned_str:
            cleaned_str = cleaned_str.split("```")[1].split("```")[0].strip()
        return json.loads(cleaned_str)
    except Exception:
        return None


def _get_target_level(extra_info):
    """Read the requested literacy level out of extra_info (None when absent)."""
    if not extra_info:
        return None
    return extra_info.get("target_level")


def _predict_label(generated_text):
    """Classify generated_text's literacy level via the next LM in the pool."""
    classifier = _get_classifier()
    current_lm = next(literacy_lm_cycle)
    # dspy.context pins THIS call to the selected backend.
    with dspy.context(lm=current_lm):
        prediction = classifier(generated_text=generated_text)

    if not prediction or not hasattr(prediction, "literacy_label"):
        return ""
    return str(prediction.literacy_label).strip().lower()


def _compute_classifier_reward(target_level, gen_text):
    """1.0 when the predicted label contains target_level, else 0.0 (0.0 on errors)."""
    try:
        pred_label = _predict_label(gen_text)
    except Exception:
        return 0.0
    return 1.0 if target_level in pred_label else 0.0


def _score_flat_top_iqr(value, bounds, weight=1.0):
    """Flat maximum inside [lower, upper]; linear decay to 0 over a 0.20-wide buffer."""
    lower, upper = bounds
    # 1. Optimal zone: maximum reward.
    if lower <= value <= upper:
        return weight
    # 2. Buffer zone: partial credit, decaying linearly over `buffer`.
    buffer = 0.20
    distance = lower - value if value < lower else value - upper
    return max(0, weight * (1 - (distance / buffer)))


def compute_completeness_reward(comp_s, weight=3.0):
    """Piecewise completeness shaping: bonus above 0.9, linear 0.7-0.9, penalised below."""
    if comp_s >= 0.9:
        return weight * 1.2  # 20% bonus for landing in the 'Good' range
    if comp_s >= 0.7:
        return weight * comp_s
    # Below 0.7 the text omits too much medical information; shift the reward
    # down so the policy prioritises facts over style.
    return (comp_s * weight) - 1.0


def compute_score(data_source, solution_str, ground_truth, extra_info=None):
    """VERL reward: shaped completeness + buffered flat-top coverage + style match."""
    gold_subs = ground_truth.get('summary_subclaims', [])
    full_subs = ground_truth.get('fulltext_subclaims', [])

    # 1. Strict format & data validation.
    if not gold_subs or not full_subs:
        return 0.0

    data = _parse_solution_json(solution_str)
    if not data:
        return -2.0  # Penalize format failure more than content failure

    target_level = _get_target_level(extra_info)
    level_map = {
        "low_health_literacy": "low",
        "intermediate_health_literacy": "intermediate",
        "proficient_health_literacy": "proficient",
    }
    level_key = level_map.get(target_level)
    if not target_level or not level_key:
        return 0.0

    gen_text = data.get(target_level, "")
    if not gen_text or len(gen_text.strip()) < 10:
        return -1.0  # Penalize empty or trivial responses

    comp_s, cov_s = verifier.evaluate_level(gen_text, gold_subs, full_subs)

    # 2. Re-balanced weights (facts first).
    W_COMPLETENESS = 3.0
    W_COVERAGE = 1.5
    W_CLASSIFIER = 1.0

    comp_reward = compute_completeness_reward(comp_s, weight=W_COMPLETENESS)
    cov_reward = _score_flat_top_iqr(
        cov_s, verifier.cov_iqr_ranges[level_key], weight=W_COVERAGE
    )
    classifier_reward = _compute_classifier_reward(target_level, gen_text) * W_CLASSIFIER

    # 3. Short-text penalty stops "cheating" the coverage floor with tiny outputs.
    length_penalty = -1.0 if len(gen_text.split()) < 15 else 0.0
    return comp_reward + cov_reward + classifier_reward + length_penalty
import os
import json
import re
import argparse  # NOTE(review): unused in the visible code -- confirm before removing
try:
    import dspy
except ImportError:
    dspy = None  # classifier reward degrades gracefully when dspy is absent
from openai import OpenAI
from typing import Any


class MedicalClaimVerifier:
    """Judges completeness/coverage of generated medical text against subclaim
    lists using a local vLLM (OpenAI-compatible) chat endpoint."""

    def __init__(self):
        # Prefer local vLLM (OpenAI-compatible) server settings.
        self.model_name = os.getenv("VLLM_MODEL", "sc")
        self.base_url = os.getenv("VLLM_API_BASE", "http://172.16.34.22:3090/v1")
        self.client = OpenAI(api_key="EMPTY", base_url=self.base_url)
        self.valid_labels = {"supported", "not_supported"}
        # Common judge-output spellings mapped to the two canonical labels.
        self.label_aliases = {
            "supported": "supported",
            "support": "supported",
            "not_supported": "not_supported",
            "not supported": "not_supported",
            "not-supported": "not_supported",
            "unsupported": "not_supported",
        }

        # Keep completeness threshold fixed at 1.0.
        self.comp_thresholds = {
            "low": 1.0,
            "intermediate": 1.0,
            "proficient": 1.0,
        }
        # IQR ranges (lower, upper) for coverage, per literacy level.
        self.cov_iqr_ranges = {
            "low": (0.1765, 0.3226),
            "intermediate": (0.1818, 0.4091),
            "proficient": (0.7725, 0.9347),
        }

    def build_user_prompt(self, text, subclaims):
        """Render the strict-JSON adjudication prompt."""
        numbered_subclaims = "\n".join(
            f"{idx + 1}. {subclaim}" for idx, subclaim in enumerate(subclaims)
        )
        return (
            "You are an expert medical adjudicator.\n"
            "Determine whether each Subclaim is supported by the Medical Passage.\n\n"
            "Decision rules:\n"
            "- supported: the core meaning is present (paraphrase allowed).\n"
            "- not_supported: missing, contradicted, or materially incomplete.\n\n"
            "Return ONLY valid JSON in this exact shape:\n"
            "{\n"
            ' "labels": ["supported" | "not_supported", ...]\n'
            "}\n"
            "The labels array length must exactly equal the number of subclaims, in order.\n"
            "Do not add markdown, code fences, or extra keys.\n\n"
            f"Medical text: {text}\n\n"
            f"Subclaims:\n{numbered_subclaims}"
        )

    def _normalize_label(self, value: Any) -> str:
        """Lowercase/strip a raw label and map known aliases to canonical form."""
        text = str(value).strip().lower()
        return self.label_aliases.get(text, text)

    def _extract_labels(self, pred_text):
        """Parse the judge reply into a raw label list, or None on failure.

        BUG FIX: the previous code ran `pred_text.split("")[1]`; str.split with
        an empty separator raises ValueError unconditionally (a reasoning-tag
        separator was presumably lost in an edit -- TODO confirm), so every
        call fell into the except branch and returned all-'invalid' labels,
        collapsing the reward. We now parse the JSON object directly, with a
        brace-span fallback for chatty or fenced replies.
        """
        candidate = pred_text.strip()
        try:
            parsed = json.loads(candidate)
        except Exception:
            match = re.search(r"\{[\s\S]*\}", candidate)
            if not match:
                return None
            try:
                parsed = json.loads(match.group(0))
            except Exception:
                return None
        if not isinstance(parsed, dict) or not isinstance(parsed.get("labels"), list):
            return None
        return parsed["labels"]

    def check_support_api(self, context, subclaims):
        """Return one normalised label per subclaim; 'invalid' slots on failure."""
        if not context or not subclaims:
            return []

        user_prompt = self.build_user_prompt(context, subclaims)
        try:
            response = self.client.chat.completions.create(
                model=self.model_name,
                messages=[{"role": "user", "content": user_prompt}],
                max_tokens=256,
                temperature=0.0,
                timeout=300,
            )
        except Exception as exc:
            print(f"Warning: Reward API call failed/timed out: {exc}")
            return ["invalid"] * len(subclaims)

        pred_text = ""
        if response.choices:
            pred_text = (response.choices[0].message.content or "").strip()
        labels = self._extract_labels(pred_text)
        if labels is None:
            return ["invalid"] * len(subclaims)

        normalized = [self._normalize_label(label) for label in labels]
        # Force exact alignment with the requested subclaim count.
        if len(normalized) < len(subclaims):
            normalized.extend(["invalid"] * (len(subclaims) - len(normalized)))
        elif len(normalized) > len(subclaims):
            normalized = normalized[:len(subclaims)]
        return normalized

    def _average_supported(self, labels, expected_len):
        """Fraction of `expected_len` slots labelled 'supported' (pad/truncate to fit)."""
        if expected_len <= 0:
            return 0.0
        normalized = [str(x).strip().lower() for x in labels]
        if len(normalized) < expected_len:
            normalized.extend(["invalid"] * (expected_len - len(normalized)))
        elif len(normalized) > expected_len:
            normalized = normalized[:expected_len]
        supported_count = sum(1 for item in normalized if item == "supported")
        return supported_count / expected_len

    def evaluate_level(self, gen_text, gold_subs, full_subs):
        """Score one generation: (completeness vs summary subs, coverage vs full-text subs)."""
        if not gen_text or not gold_subs or not full_subs:
            return 0.0, 0.0

        # Single prompt with text + subclaim list, matching test.py's format.
        comp_labels = self.check_support_api(gen_text, gold_subs)
        cov_labels = self.check_support_api(gen_text, full_subs)

        comp_score = self._average_supported(comp_labels, len(gold_subs))
        cov_score = self._average_supported(cov_labels, len(full_subs))
        return comp_score, cov_score


verifier = MedicalClaimVerifier()
DEFAULT_API_BASE = "http://172.16.34.22:8040/v1"
if dspy is not None:
    LITERACY_LM = dspy.LM(
        model="openai/dspy",
        api_base=os.getenv("VLLM_API_BASE", DEFAULT_API_BASE),
        api_key="EMPTY",
        temperature=0.0,
        cache=False,  # Often helpful to disable during active training debugging
        timeout=300,  # Set a generous 5-minute timeout
    )
else:
    LITERACY_LM = None

MODEL_PATH = os.environ.get(
    "HEALTH_LITERACY_MODEL_PATH",
    "/home/mshahidul/readctrl/code/text_classifier/"
    "dspy_model/vllm-Meta-Llama-3.1-8B-Instruct_teacher-gpt5_v1/model.json",
)


if dspy is not None:
    class HealthLiteracySignature(dspy.Signature):
        """
        Analyze the linguistic complexity, use of medical jargon, and sentence
        structure of 'generated_text' to determine the health literacy level.
        """

        generated_text = dspy.InputField(
            desc="A version of the source text rewritten for a specific audience."
        )
        literacy_label = dspy.OutputField(
            desc=(
                "Classification: low_health_literacy (simple words, no jargon), "
                "intermediate_health_literacy (moderate technicality), or "
                "proficient_health_literacy (highly technical/original level)."
            )
        )

    class HealthLiteracyClassifier(dspy.Module):
        """Chain-of-thought wrapper around HealthLiteracySignature."""

        def __init__(self):
            super().__init__()
            self.classifier = dspy.ChainOfThought(HealthLiteracySignature)

        def forward(self, generated_text):
            return self.classifier(generated_text=generated_text)


_COMPILED_CLASSIFIER = None
_CLASSIFIER_ERROR_LOGGED = False


def _load_compiled_classifier(path):
    """Load the compiled dspy program; prefer dspy.load, fall back to Module.load."""
    if dspy is None:
        raise RuntimeError("dspy is not installed")
    if hasattr(dspy, "load"):
        try:
            return dspy.load(path)
        except Exception:
            pass  # older artifact layout -- fall through to Module.load below
    classifier = HealthLiteracyClassifier()
    try:
        classifier.load(path)
    except Exception as exc:
        raise RuntimeError(f"Failed to load compiled model from {path}") from exc
    return classifier


def _get_classifier():
    """Lazily load and memoise the compiled literacy classifier."""
    global _COMPILED_CLASSIFIER
    if _COMPILED_CLASSIFIER is None:
        if not os.path.exists(MODEL_PATH):
            raise FileNotFoundError(f"Model file not found: {MODEL_PATH}")
        _COMPILED_CLASSIFIER = _load_compiled_classifier(MODEL_PATH)
    return _COMPILED_CLASSIFIER


def _parse_solution_json(solution_str):
    """Parse the policy output as JSON; accepts pre-parsed dict/list, None on failure."""
    if isinstance(solution_str, (dict, list)):
        return solution_str
    try:
        cleaned_str = str(solution_str).strip()
        if "```json" in cleaned_str:
            cleaned_str = cleaned_str.split("```json")[1].split("```")[0].strip()
        elif "```" in cleaned_str:
            cleaned_str = cleaned_str.split("```")[1].split("```")[0].strip()
        return json.loads(cleaned_str)
    except Exception:
        return None


def _predict_label(generated_text):
    """Classify generated_text's literacy level; '' when unavailable (logged once)."""
    global _CLASSIFIER_ERROR_LOGGED
    if dspy is None:
        return ""
    try:
        classifier = _get_classifier()
        if LITERACY_LM is not None:
            with dspy.context(lm=LITERACY_LM):
                prediction = classifier(generated_text=generated_text)
        else:
            prediction = classifier(generated_text=generated_text)
    except Exception as exc:
        if not _CLASSIFIER_ERROR_LOGGED:
            print(f"Warning: literacy classifier unavailable, continuing without it: {exc}")
            _CLASSIFIER_ERROR_LOGGED = True
        return ""

    if not prediction or not hasattr(prediction, "literacy_label"):
        return ""
    return str(prediction.literacy_label).strip().lower()


def _compute_classifier_reward(target_level, gen_text):
    """Classifier reward is currently disabled; best-effort call kept for observability."""
    _predict_label(gen_text)
    return 0.0


def _score_flat_top_iqr(value, bounds, weight=1.0):
    """Flat maximum inside [lower, upper]; linear decay to 0 over a 0.20-wide buffer."""
    lower, upper = bounds
    # 1. Optimal zone: maximum reward.
    if lower <= value <= upper:
        return weight
    # 2. Buffer zone: partial credit, decaying linearly over `buffer`.
    buffer = 0.20
    distance = lower - value if value < lower else value - upper
    return max(0, weight * (1 - (distance / buffer)))


def compute_completeness_reward(comp_s, weight=3.0):
    """Piecewise completeness shaping: bonus above 0.9, linear 0.7-0.9, penalised below."""
    if comp_s >= 0.9:
        return weight * 1.2  # 20% bonus for landing in the 'Good' range
    if comp_s >= 0.7:
        return weight * comp_s
    # Below 0.7 the text omits too much medical information; shift the reward
    # down so the policy prioritises facts over style.
    return (comp_s * weight) - 1.0


def compute_score(data_source, solution_str, ground_truth, extra_info=None):
    """VERL reward: shaped completeness + buffered flat-top coverage + length penalty."""
    gold_subs = ground_truth.get('summary_subclaims', [])
    full_subs = ground_truth.get('fulltext_subclaims', [])

    # 1. Strict format & data validation.
    if not gold_subs or not full_subs:
        return 0.0

    data = _parse_solution_json(solution_str)
    if not data:
        return -2.0  # Penalize format failure more than content failure

    target_level = extra_info.get("target_level") if extra_info else None
    level_map = {
        "low_health_literacy": "low",
        "intermediate_health_literacy": "intermediate",
        "proficient_health_literacy": "proficient",
    }
    level_key = level_map.get(target_level)
    if not target_level or not level_key:
        return 0.0

    gen_text = data.get(target_level, "")
    if not gen_text or len(gen_text.strip()) < 10:
        return -1.0  # Penalize empty or trivial responses

    comp_s, cov_s = verifier.evaluate_level(gen_text, gold_subs, full_subs)

    # 2. Re-balanced weights (facts first).
    W_COMPLETENESS = 3.0
    W_COVERAGE = 1.5
    W_CLASSIFIER = 1.0

    comp_reward = compute_completeness_reward(comp_s, weight=W_COMPLETENESS)
    cov_reward = _score_flat_top_iqr(
        cov_s, verifier.cov_iqr_ranges[level_key], weight=W_COVERAGE
    )
    classifier_reward = _compute_classifier_reward(target_level, gen_text) * W_CLASSIFIER

    # 3. Short-text penalty stops "cheating" the coverage floor with tiny outputs.
    length_penalty = -1.0 if len(gen_text.split()) < 15 else 0.0
    return comp_reward + cov_reward + classifier_reward + length_penalty
# NOTE(review): this hunk began mid-__init__; the file prefix (imports, class
# header, __init__ head) is reconstructed verbatim from the visible text of the
# preceding hunk -- confirm no duplication when both hunks are applied.
import os
import json
import re
import argparse  # NOTE(review): unused in the visible code -- confirm before removing
try:
    import dspy
except ImportError:
    dspy = None  # classifier reward degrades gracefully when dspy is absent
from openai import OpenAI
from typing import Any


class MedicalClaimVerifier:
    """Testing copy of the v2 verifier: judges completeness/coverage of
    generated medical text via a local vLLM chat endpoint; unlike v2 it
    re-raises API failures instead of degrading to 'invalid' labels."""

    def __init__(self):
        # Prefer local vLLM (OpenAI-compatible) server settings.
        self.model_name = os.getenv("VLLM_MODEL", "sc")
        self.base_url = os.getenv("VLLM_API_BASE", "http://172.16.34.22:3090/v1")
        self.client = OpenAI(api_key="EMPTY", base_url=self.base_url)
        self.valid_labels = {"supported", "not_supported"}
        # Common judge-output spellings mapped to the two canonical labels.
        self.label_aliases = {
            "supported": "supported",
            "support": "supported",
            "not_supported": "not_supported",
            "not supported": "not_supported",
            "not-supported": "not_supported",
            "unsupported": "not_supported",
        }

        # Keep completeness threshold fixed at 1.0.
        self.comp_thresholds = {
            "low": 1.0,
            "intermediate": 1.0,
            "proficient": 1.0,
        }
        # IQR ranges (lower, upper) for coverage, per literacy level.
        self.cov_iqr_ranges = {
            "low": (0.1765, 0.3226),
            "intermediate": (0.1818, 0.4091),
            "proficient": (0.7725, 0.9347),
        }

    def build_user_prompt(self, text, subclaims):
        """Render the strict-JSON adjudication prompt."""
        numbered_subclaims = "\n".join(
            f"{idx + 1}. {subclaim}" for idx, subclaim in enumerate(subclaims)
        )
        return (
            "You are an expert medical adjudicator.\n"
            "Determine whether each Subclaim is supported by the Medical Passage.\n\n"
            "Decision rules:\n"
            "- supported: the core meaning is present (paraphrase allowed).\n"
            "- not_supported: missing, contradicted, or materially incomplete.\n\n"
            "Return ONLY valid JSON in this exact shape:\n"
            "{\n"
            ' "labels": ["supported" | "not_supported", ...]\n'
            "}\n"
            "The labels array length must exactly equal the number of subclaims, in order.\n"
            "Do not add markdown, code fences, or extra keys.\n\n"
            f"Medical text: {text}\n\n"
            f"Subclaims:\n{numbered_subclaims}"
        )

    def _normalize_label(self, value: Any) -> str:
        """Lowercase/strip a raw label and map known aliases to canonical form."""
        text = str(value).strip().lower()
        return self.label_aliases.get(text, text)

    def check_support_api(self, context, subclaims):
        """Return one normalised label per subclaim.

        API failures are re-raised as RuntimeError (testing variant); parse
        failures yield all-'invalid' slots.

        BUG FIX: the previous parsing used `pred_text.split("")[1]` -- str.split
        with an empty separator raises ValueError unconditionally (a separator
        token was presumably lost in an edit), so labels were always 'invalid'.
        The JSON object is now parsed directly with a brace-span fallback.
        """
        if not context or not subclaims:
            return []

        user_prompt = self.build_user_prompt(context, subclaims)

        try:
            response = self.client.chat.completions.create(
                model=self.model_name,
                messages=[{"role": "user", "content": user_prompt}],
                max_tokens=256,
                temperature=0.0,
            )
        except Exception as exc:
            raise RuntimeError(f"check_support_api error: {exc}") from exc

        try:
            pred_text = ""
            if response.choices:
                pred_text = (response.choices[0].message.content or "").strip()
            try:
                parsed = json.loads(pred_text)
            except Exception:
                match = re.search(r"\{[\s\S]*\}", pred_text)
                parsed = json.loads(match.group(0)) if match else {}
            labels = parsed["labels"]
            normalized = [self._normalize_label(label) for label in labels]
            # Force exact alignment with the requested subclaim count.
            if len(normalized) < len(subclaims):
                normalized.extend(["invalid"] * (len(subclaims) - len(normalized)))
            elif len(normalized) > len(subclaims):
                normalized = normalized[:len(subclaims)]
            return normalized
        except Exception:
            return ["invalid"] * len(subclaims)

    def _average_supported(self, labels, expected_len):
        """Fraction of `expected_len` slots labelled 'supported' (pad/truncate to fit)."""
        if expected_len <= 0:
            return 0.0
        normalized = [str(x).strip().lower() for x in labels]
        if len(normalized) < expected_len:
            normalized.extend(["invalid"] * (expected_len - len(normalized)))
        elif len(normalized) > expected_len:
            normalized = normalized[:expected_len]
        supported_count = sum(1 for item in normalized if item == "supported")
        return supported_count / expected_len

    def evaluate_level(self, gen_text, gold_subs, full_subs):
        """Score one generation: (completeness vs summary subs, coverage vs full-text subs)."""
        if not gen_text or not gold_subs or not full_subs:
            return 0.0, 0.0

        # Single prompt with text + subclaim list, matching test.py's format.
        comp_labels = self.check_support_api(gen_text, gold_subs)
        cov_labels = self.check_support_api(gen_text, full_subs)

        comp_score = self._average_supported(comp_labels, len(gold_subs))
        cov_score = self._average_supported(cov_labels, len(full_subs))
        return comp_score, cov_score


verifier = MedicalClaimVerifier()
DEFAULT_API_BASE = "http://172.16.34.22:8040/v1"
if dspy is not None:
    LITERACY_LM = dspy.LM(
        model="openai/dspy",
        api_base=os.getenv("VLLM_API_BASE", DEFAULT_API_BASE),
        api_key="EMPTY",
        temperature=0.0,
    )
else:
    LITERACY_LM = None

MODEL_PATH = os.environ.get(
    "HEALTH_LITERACY_MODEL_PATH",
    "/home/mshahidul/readctrl/code/text_classifier/"
    "dspy_model/vllm-Meta-Llama-3.1-8B-Instruct_teacher-gpt5_v1/model.json",
)
+ ) + ) + + + class HealthLiteracyClassifier(dspy.Module): + def __init__(self): + super().__init__() + self.classifier = dspy.ChainOfThought(HealthLiteracySignature) + + def forward(self, generated_text): + return self.classifier(generated_text=generated_text) + + +_COMPILED_CLASSIFIER = None + + +def _load_compiled_classifier(path): + if dspy is None: + raise RuntimeError("dspy is not installed") + if hasattr(dspy, "load"): + try: + return dspy.load(path) + except Exception: + pass + classifier = HealthLiteracyClassifier() + try: + classifier.load(path) + except Exception as exc: + raise RuntimeError(f"Failed to load compiled model from {path}") from exc + return classifier + + +def _get_classifier(): + global _COMPILED_CLASSIFIER + if _COMPILED_CLASSIFIER is None: + if not os.path.exists(MODEL_PATH): + raise FileNotFoundError(f"Model file not found: {MODEL_PATH}") + _COMPILED_CLASSIFIER = _load_compiled_classifier(MODEL_PATH) + return _COMPILED_CLASSIFIER + +def _parse_solution_json(solution_str): + # Accept pre-parsed JSON objects directly. 
+ if isinstance(solution_str, (dict, list)): + return solution_str + try: + cleaned_str = str(solution_str).strip() + if "```json" in cleaned_str: + cleaned_str = cleaned_str.split("```json")[1].split("```")[0].strip() + elif "```" in cleaned_str: + cleaned_str = cleaned_str.split("```")[1].split("```")[0].strip() + return json.loads(cleaned_str) + except Exception: + return None + + +def _predict_label(generated_text): + if dspy is None: + return "" + classifier = _get_classifier() + + if LITERACY_LM is not None: + with dspy.context(lm=LITERACY_LM): + prediction = classifier(generated_text=generated_text) + + else: + prediction = classifier(generated_text=generated_text) + # print(f"prediction: {prediction}") + + if not prediction or not hasattr(prediction, "literacy_label"): + return "" + # import ipdb; ipdb.set_trace() + # print("--------------------------------") + # print(f"literacy_label: {prediction.literacy_label}") + # print("--------------------------------") + return str(prediction.literacy_label).strip().lower() + + +def _compute_classifier_reward(target_level, gen_text): + # Keep API/model invocation for fail-fast behavior; on success, classifier reward is disabled. + _predict_label(gen_text) + return 0.0 + +def _score_flat_top_iqr(value, bounds, weight=1.0): + lower, upper = bounds + + # 1. Optimal Zone: Maximum Reward + if lower <= value <= upper: + return weight + + # 2. Buffer Zone: Partial Reward + # If the value is within 20% of the boundaries, give partial credit. 
+ buffer = 0.20 + if value < lower: + distance = lower - value + # Linear decay from weight to 0 over the buffer distance + return max(0, weight * (1 - (distance / buffer))) + else: + distance = value - upper + return max(0, weight * (1 - (distance / buffer))) + +def compute_completeness_reward(comp_s, weight=3.0): + # If the model is nearly perfect, give it a big boost + if comp_s >= 0.9: + return weight * 1.2 # 20% bonus for being in your 'Good' range + + # If it's between 0.7 and 0.9, give it a linear reward + if comp_s >= 0.7: + return weight * comp_s + + # Below 0.7, it's missing too much medical info. + # We penalize it to force it to prioritize facts over style. + return (comp_s * weight) - 1.0 + +def compute_score(data_source, solution_str, ground_truth, extra_info=None): + gold_subs = ground_truth.get('summary_subclaims', []) + full_subs = ground_truth.get('fulltext_subclaims', []) + + # 1. Strict Format & Data Validation + if not gold_subs or not full_subs: + return 0.0 + + data = _parse_solution_json(solution_str) + if not data: + return -2.0 # Penalize format failure more than content failure + + target_level = extra_info.get("target_level") if extra_info else None + level_map = { + "low_health_literacy": "low", + "intermediate_health_literacy": "intermediate", + "proficient_health_literacy": "proficient", + } + level_key = level_map.get(target_level) + + if not target_level or not level_key: + return 0.0 + + gen_text = data.get(target_level, "") + if not gen_text or len(gen_text.strip()) < 10: + return -1.0 # Penalize empty or trivial responses + + + comp_s, cov_s = verifier.evaluate_level(gen_text, gold_subs, full_subs) + + # 2. 
Re-balanced Weights + W_COMPLETENESS = 3.0 # Increased weight for facts + W_COVERAGE = 1.5 + W_CLASSIFIER = 1.0 + + comp_reward = compute_completeness_reward(comp_s, weight=W_COMPLETENESS) + + # --- UPDATED COVERAGE REWARD --- + cov_range = verifier.cov_iqr_ranges[level_key] + cov_reward = _score_flat_top_iqr(cov_s, cov_range, weight=W_COVERAGE) + + # --- CLASSIFIER REWARD --- + classifier_reward = _compute_classifier_reward(target_level, gen_text) * W_CLASSIFIER + + # 3. Total Calculation + # We add a small penalty for extremely short text to avoid "cheating" the coverage floor + length_penalty = -1.0 if len(gen_text.split()) < 15 else 0.0 + + return comp_reward + cov_reward + classifier_reward + length_penalty + +def _load_accuracy_examples(json_path): + with open(json_path, "r", encoding="utf-8") as f: + payload = json.load(f) + examples = payload.get("examples", []) + if not isinstance(examples, list): + raise ValueError("Invalid examples file: 'examples' must be a list") + return examples + + +def run_accuracy_check(json_path, use_actual_api=False): + examples = _load_accuracy_examples(json_path) + print(f"Loaded {len(examples)} examples from: {json_path}") + mode = "ACTUAL_API" if use_actual_api else "MOCKED_OR_ACTUAL_PER_EXAMPLE" + print(f"Mode: {mode}") + + original_eval = verifier.evaluate_level + original_classifier = _compute_classifier_reward + tolerance = 1e-6 + pass_count = 0 + examples=[examples[0], examples[1], examples[2],examples[4]] + + for idx, example in enumerate(examples, start=1): + print("--------------------------------") + name = example.get("name", f"example_{idx}") + data_source = example.get("data_source", "test") + ground_truth = example.get("ground_truth", {}) + solution_str = example.get("solution_str") + extra_info = example.get("extra_info", {}) + expected_score = example.get("expected_score") + expected_min = example.get("expected_min") + expected_max = example.get("expected_max") + mocked = example.get("mocked", {}) + + try: + # 
Optional deterministic mocking for accuracy checks. + if mocked and not use_actual_api: + comp_s = float(mocked.get("comp_s", 0.0)) + cov_s = float(mocked.get("cov_s", 0.0)) + classifier_match = bool(mocked.get("classifier_match", False)) + + def _mock_evaluate_level(_gen_text, _gold_subs, _full_subs, c=comp_s, v=cov_s): + return c, v + + def _mock_classifier_reward(_target_level, _gen_text, match=classifier_match): + return 1.0 if match else 0.0 + + verifier.evaluate_level = _mock_evaluate_level + globals()["_compute_classifier_reward"] = _mock_classifier_reward + else: + verifier.evaluate_level = original_eval + globals()["_compute_classifier_reward"] = original_classifier + + score = compute_score(data_source, solution_str, ground_truth, extra_info) + if expected_min is not None and expected_max is not None: + low = float(expected_min) + high = float(expected_max) + is_pass = low <= score <= high + status = "PASS" if is_pass else "FAIL" + print( + f"[{idx}] {name}: {status} | " + f"score={score:.6f}, expected_range=[{low:.6f}, {high:.6f}]" + ) + if is_pass: + pass_count += 1 + continue + + if expected_score is None: + print(f"[{idx}] {name}: score={score:.6f} (no expected_score provided)") + continue + + diff = abs(score - float(expected_score)) + is_pass = diff <= tolerance + status = "PASS" if is_pass else "FAIL" + print( + f"[{idx}] {name}: {status} | " + f"score={score:.6f}, expected={float(expected_score):.6f}, diff={diff:.6f}" + ) + if is_pass: + pass_count += 1 + except Exception as exc: + print(f"[{idx}] {name}: ERROR | {exc}") + raise + finally: + verifier.evaluate_level = original_eval + globals()["_compute_classifier_reward"] = original_classifier + + checked = sum( + 1 + for ex in examples + if ex.get("expected_score") is not None + or (ex.get("expected_min") is not None and ex.get("expected_max") is not None) + ) + print(f"\nAccuracy check done: {pass_count}/{checked} matched expected scores.") + + + +if __name__ == "__main__": + parser = 
argparse.ArgumentParser(description="Reward function accuracy checker") + parser.add_argument( + "--examples", + default=None, + help="Path to JSON file containing reward test examples", + ) + parser.add_argument( + "--actual-api", + action="store_true", + help="Force real API path and ignore mocked values", + ) + args = parser.parse_args() + + here = os.path.dirname(os.path.abspath(__file__)) + examples_path = args.examples or os.path.join(here, "reward_accuracy_examples.json") + run_accuracy_check(examples_path, use_actual_api=args.actual_api) \ No newline at end of file diff --git a/code/RL_model/verl/verl_train/reward_func/reward_func/testing/reward_accuracy_examples.json b/code/RL_model/verl/verl_train/reward_func/reward_func/testing/reward_accuracy_examples.json new file mode 100644 index 0000000000000000000000000000000000000000..6fa4ca048be3a6256a1057c937de18dc763d1fde --- /dev/null +++ b/code/RL_model/verl/verl_train/reward_func/reward_func/testing/reward_accuracy_examples.json @@ -0,0 +1,177 @@ +{ + "examples": [ + { + "name": "perfect_proficient_long_text", + "data_source": "test", + "ground_truth": { + "summary_subclaims": [ + "Statins reduce LDL cholesterol." + ], + "fulltext_subclaims": [ + "Statins reduce LDL cholesterol.", + "Statins inhibit HMG-CoA reductase." + ] + }, + "solution_str": { + "proficient_health_literacy": "Statins are lipid-lowering medications that reduce LDL cholesterol by inhibiting HMG-CoA reductase and are broadly used in cardiovascular risk management." + }, + "extra_info": { + "target_level": "proficient_health_literacy" + }, + "mocked": { + "comp_s": 1.0, + "cov_s": 0.85, + "classifier_match": true + }, + "expected_score": 6.1 + }, + { + "name": "medium_completeness_near_cov_floor", + "data_source": "test", + "ground_truth": { + "summary_subclaims": [ + "Statins reduce LDL cholesterol." + ], + "fulltext_subclaims": [ + "Statins reduce LDL cholesterol.", + "Statins inhibit HMG-CoA reductase." 
+ ] + }, + "solution_str": { + "proficient_health_literacy": "Statins lower LDL levels and are common in practice, though this answer omits deeper mechanism details and some context for complete explanation." + }, + "extra_info": { + "target_level": "proficient_health_literacy" + }, + "mocked": { + "comp_s": 0.8, + "cov_s": 0.7, + "classifier_match": true + }, + "expected_score": 4.35625 + }, + { + "name": "low_completeness_cov_out_of_range", + "data_source": "test", + "ground_truth": { + "summary_subclaims": [ + "Statins reduce LDL cholesterol." + ], + "fulltext_subclaims": [ + "Statins reduce LDL cholesterol.", + "Statins inhibit HMG-CoA reductase." + ] + }, + "solution_str": { + "proficient_health_literacy": "Statins are medications, but this response is incomplete and misses key medical claims and mechanistic evidence in detail." + }, + "extra_info": { + "target_level": "proficient_health_literacy" + }, + "mocked": { + "comp_s": 0.5, + "cov_s": 0.2, + "classifier_match": false + }, + "expected_score": 0.5 + }, + { + "name": "high_quality_but_short_text_penalty", + "data_source": "test", + "ground_truth": { + "summary_subclaims": [ + "Statins reduce LDL cholesterol." + ], + "fulltext_subclaims": [ + "Statins reduce LDL cholesterol.", + "Statins inhibit HMG-CoA reductase." + ] + }, + "solution_str": { + "proficient_health_literacy": "Statins lower LDL by HMG-CoA inhibition and support prevention in adults." + }, + "extra_info": { + "target_level": "proficient_health_literacy" + }, + "mocked": { + "comp_s": 1.0, + "cov_s": 0.85, + "classifier_match": true + }, + "expected_score": 5.1 + }, + { + "name": "format_failure_invalid_json_string", + "data_source": "test", + "ground_truth": { + "summary_subclaims": [ + "Statins reduce LDL cholesterol." + ], + "fulltext_subclaims": [ + "Statins reduce LDL cholesterol.", + "Statins inhibit HMG-CoA reductase." 
+ ] + }, + "solution_str": "not a json payload", + "extra_info": { + "target_level": "proficient_health_literacy" + }, + "expected_score": -2.0 + }, + { + "name": "target_text_too_short", + "data_source": "test", + "ground_truth": { + "summary_subclaims": [ + "Statins reduce LDL cholesterol." + ], + "fulltext_subclaims": [ + "Statins reduce LDL cholesterol.", + "Statins inhibit HMG-CoA reductase." + ] + }, + "solution_str": { + "proficient_health_literacy": "Too short" + }, + "extra_info": { + "target_level": "proficient_health_literacy" + }, + "expected_score": -1.0 + }, + { + "name": "invalid_target_level", + "data_source": "test", + "ground_truth": { + "summary_subclaims": [ + "Statins reduce LDL cholesterol." + ], + "fulltext_subclaims": [ + "Statins reduce LDL cholesterol.", + "Statins inhibit HMG-CoA reductase." + ] + }, + "solution_str": { + "proficient_health_literacy": "Statins are lipid-lowering medications with strong evidence in LDL reduction and related cardiovascular risk reduction outcomes." + }, + "extra_info": { + "target_level": "expert_health_literacy" + }, + "expected_score": 0.0 + }, + { + "name": "missing_subclaims_in_ground_truth", + "data_source": "test", + "ground_truth": { + "summary_subclaims": [], + "fulltext_subclaims": [] + }, + "solution_str": { + "proficient_health_literacy": "Statins reduce LDL cholesterol and inhibit HMG-CoA reductase, with broad clinical evidence." 
+ }, + "extra_info": { + "target_level": "proficient_health_literacy" + }, + "expected_score": 0.0 + } + ] +} diff --git a/code/RL_model/verl/verl_train/reward_func/reward_func/testing/reward_accuracy_examples_actual_api.json b/code/RL_model/verl/verl_train/reward_func/reward_func/testing/reward_accuracy_examples_actual_api.json new file mode 100644 index 0000000000000000000000000000000000000000..ad0ae64c08d243745fae416412e7c42d59b94d78 --- /dev/null +++ b/code/RL_model/verl/verl_train/reward_func/reward_func/testing/reward_accuracy_examples_actual_api.json @@ -0,0 +1,105 @@ +{ + "examples": [ + { + "name": "strong_supported_proficient", + "data_source": "test", + "ground_truth": { + "summary_subclaims": [ + "Statins reduce LDL cholesterol." + ], + "fulltext_subclaims": [ + "Statins reduce LDL cholesterol.", + "Statins inhibit HMG-CoA reductase." + ] + }, + "solution_str": { + "proficient_health_literacy": "Statins are lipid-lowering agents that reduce LDL cholesterol through inhibition of HMG-CoA reductase and are foundational in cardiovascular risk reduction." + }, + "extra_info": { + "target_level": "proficient_health_literacy" + }, + "expected_min": 3.0, + "expected_max": 6.2 + }, + { + "name": "partially_supported_proficient", + "data_source": "test", + "ground_truth": { + "summary_subclaims": [ + "Statins reduce LDL cholesterol." + ], + "fulltext_subclaims": [ + "Statins reduce LDL cholesterol.", + "Statins inhibit HMG-CoA reductase." + ] + }, + "solution_str": { + "proficient_health_literacy": "Statins lower LDL cholesterol and are commonly prescribed, but this response does not explain the precise biochemical mechanism." + }, + "extra_info": { + "target_level": "proficient_health_literacy" + }, + "expected_min": 1.0, + "expected_max": 5.5 + }, + { + "name": "not_supported_content_proficient", + "data_source": "test", + "ground_truth": { + "summary_subclaims": [ + "Statins reduce LDL cholesterol." 
+ ], + "fulltext_subclaims": [ + "Statins reduce LDL cholesterol.", + "Statins inhibit HMG-CoA reductase." + ] + }, + "solution_str": { + "proficient_health_literacy": "Statins mainly increase HDL and have little role in lowering LDL, and their primary mechanism is unrelated to HMG-CoA reductase inhibition." + }, + "extra_info": { + "target_level": "proficient_health_literacy" + }, + "expected_min": -1.0, + "expected_max": 3.0 + }, + { + "name": "invalid_format_string", + "data_source": "test", + "ground_truth": { + "summary_subclaims": [ + "Statins reduce LDL cholesterol." + ], + "fulltext_subclaims": [ + "Statins reduce LDL cholesterol.", + "Statins inhibit HMG-CoA reductase." + ] + }, + "solution_str": "this is not valid json", + "extra_info": { + "target_level": "proficient_health_literacy" + }, + "expected_score": -2.0 + }, + { + "name": "too_short_generation", + "data_source": "test", + "ground_truth": { + "summary_subclaims": [ + "Statins reduce LDL cholesterol." + ], + "fulltext_subclaims": [ + "Statins reduce LDL cholesterol.", + "Statins inhibit HMG-CoA reductase." + ] + }, + "solution_str": { + "proficient_health_literacy": "short" + }, + "extra_info": { + "target_level": "proficient_health_literacy" + }, + "expected_score": -1.0 + } + ] +} diff --git a/code/RL_model/verl/verl_train/scripts/veomni/moe_merge.py b/code/RL_model/verl/verl_train/scripts/veomni/moe_merge.py new file mode 100644 index 0000000000000000000000000000000000000000..aa1c57d42e4de0a0665bc39466c8af498a0ef36a --- /dev/null +++ b/code/RL_model/verl/verl_train/scripts/veomni/moe_merge.py @@ -0,0 +1,121 @@ +# Copyright 2025 Bytedance Ltd. and/or its affiliates + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Merge individual MoE expert weights into stacked tensors for efficient loading. + +This script takes a HuggingFace checkpoint with individual expert weights +(e.g., model.layers.{i}.mlp.experts.{j}.gate_proj.weight) and merges them +into stacked tensors (e.g., model.layers.{i}.mlp.experts.gate_proj) for +faster loading and better memory efficiency in VeOmni. + +The merging process: +1. Loads individual expert weights from the HF checkpoint +2. Stacks them into single tensors for each projection type +3. Handles all three projection types: gate_proj, up_proj, down_proj +4. Supports both Qwen3-MoE (num_experts) and DeepSeek (n_routed_experts) formats +5. 
Handles models with initial dense layers (first_k_dense_replace) + +Usage: python moe_merge.py --raw_hf_path --merge_hf_path +""" + +import os +from argparse import ArgumentParser +from dataclasses import dataclass +from glob import glob +from typing import Generator + +import torch +from safetensors.torch import safe_open +from tqdm import tqdm +from transformers import AutoConfig +from veomni.models import build_tokenizer, save_model_weights + + +@dataclass +class StateDictIterator: + filepath: str + + def __iter__(self) -> Generator[tuple[str, "torch.Tensor"], None, None]: + if self.filepath.endswith(".safetensors"): + with safe_open(self.filepath, framework="pt", device="cpu") as f: + for key in f.keys(): + yield key, f.get_tensor(key) + + else: + state_dict = torch.load(self.filepath, map_location="cpu", weights_only=True, mmap=True) + for key in state_dict.keys(): + yield key, state_dict[key] + + +def main(raw_hf_path, merge_hf_path): + torch.set_default_dtype(torch.bfloat16) + os.makedirs(merge_hf_path, exist_ok=True) + + config = AutoConfig.from_pretrained(raw_hf_path) + tokenizer = build_tokenizer(raw_hf_path) + + safetensor_files = list(glob(os.path.join(raw_hf_path, "*.safetensors"))) + safetensor_files.sort() + state_dict_iterators = [StateDictIterator(shard_file) for shard_file in safetensor_files] + new_state_dict = {} + for state_dict_iterator in tqdm(state_dict_iterators, desc="Loading checkpoint shards"): + for name, tensor in state_dict_iterator: + new_state_dict[name] = tensor.cpu() + + print(new_state_dict.keys()) + + if hasattr(config, "num_experts"): + # qwen3moe + num_experts = config.num_experts + elif hasattr(config, "n_routed_experts"): + # deepseek + num_experts = config.n_routed_experts + else: + raise RuntimeError("could not find how many experts to assign") + num_hidden_layers = config.num_hidden_layers + + if hasattr(config, "first_k_dense_replace"): + # deepseek first k dense layer + moe_layer_start_idx = config.first_k_dense_replace 
+ else: + # moe layer only in the model + moe_layer_start_idx = 0 + + for i in range(moe_layer_start_idx, num_hidden_layers): + gate_proj = [] + for j in range(num_experts): + gate_proj.append(new_state_dict.pop(f"model.layers.{i}.mlp.experts.{j}.gate_proj.weight")) + + new_state_dict[f"model.layers.{i}.mlp.experts.gate_proj"] = torch.stack(gate_proj) + up_proj = [] + for j in range(num_experts): + up_proj.append(new_state_dict.pop(f"model.layers.{i}.mlp.experts.{j}.up_proj.weight")) + + new_state_dict[f"model.layers.{i}.mlp.experts.up_proj"] = torch.stack(up_proj) + down_proj = [] + for j in range(num_experts): + down_proj.append(new_state_dict.pop(f"model.layers.{i}.mlp.experts.{j}.down_proj.weight")) + + new_state_dict[f"model.layers.{i}.mlp.experts.down_proj"] = torch.stack(down_proj) + + model_assets = [config, tokenizer] + save_model_weights(merge_hf_path, new_state_dict, model_assets=model_assets) + + +if __name__ == "__main__": + parser = ArgumentParser() + parser.add_argument("--raw_hf_path", type=str, required=True) + parser.add_argument("--merge_hf_path", type=str, required=True) + args = parser.parse_args() + main(args.raw_hf_path, args.merge_hf_path) diff --git a/code/RL_model/verl/verl_train/scripts/veomni/moe_split.py b/code/RL_model/verl/verl_train/scripts/veomni/moe_split.py new file mode 100644 index 0000000000000000000000000000000000000000..f38a990466e87eb34aa68eaca71d8b2a38cb3ba4 --- /dev/null +++ b/code/RL_model/verl/verl_train/scripts/veomni/moe_split.py @@ -0,0 +1,96 @@ +# Copyright 2025 Bytedance Ltd. and/or its affiliates + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Reverse process of moe_merge.py - splits merged MoE expert weights back to individual experts. + +This script takes a HF checkpoint that has been processed by moe_merge.py (where expert weights +are stacked into single tensors) and splits them back to the original format with individual +expert weights. + +The process reverses the merging by: +1. Loading stacked tensors like model.layers.{i}.mlp.experts.gate_proj +2. Unstacking them back to individual experts model.layers.{i}.mlp.experts.{j}.gate_proj.weight +3. Handling all three projection types: gate_proj, up_proj, down_proj + +Usage: python moe_split.py --merge_hf_path --split_hf_path +""" + +import os +from argparse import ArgumentParser +from dataclasses import dataclass +from glob import glob +from typing import Generator + +import torch +from safetensors.torch import safe_open +from tqdm import tqdm +from transformers import AutoConfig +from veomni.models import build_tokenizer, save_model_weights + + +@dataclass +class StateDictIterator: + filepath: str + + def __iter__(self) -> Generator[tuple[str, "torch.Tensor"], None, None]: + if self.filepath.endswith(".safetensors"): + with safe_open(self.filepath, framework="pt", device="cpu") as f: + for key in f.keys(): + yield key, f.get_tensor(key) + + else: + state_dict = torch.load(self.filepath, map_location="cpu", weights_only=True, mmap=True) + for key in state_dict.keys(): + yield key, state_dict[key] + + +def main(merge_hf_path, split_hf_path): + torch.set_default_dtype(torch.bfloat16) + os.makedirs(split_hf_path, exist_ok=True) + + config = 
AutoConfig.from_pretrained(merge_hf_path) + tokenizer = build_tokenizer(merge_hf_path) + + safetensor_files = list(glob(os.path.join(merge_hf_path, "*.safetensors"))) + safetensor_files.sort() + state_dict_iterators = [StateDictIterator(shard_file) for shard_file in safetensor_files] + new_state_dict = {} + for state_dict_iterator in tqdm(state_dict_iterators, desc="Loading checkpoint shards"): + for name, tensor in state_dict_iterator: + new_state_dict[name] = tensor.cpu() + + num_experts = config.num_experts + num_hidden_layers = config.num_hidden_layers + for i in range(num_hidden_layers): + print(f"Converting layer {i}") + for proj_name in ["gate_proj", "up_proj", "down_proj"]: + stacked_key = f"model.layers.{i}.mlp.experts.{proj_name}" + if stacked_key in new_state_dict: + stacked_tensor = new_state_dict.pop(stacked_key) + for j in range(num_experts): + expert_key = f"model.layers.{i}.mlp.experts.{j}.{proj_name}.weight" + new_state_dict[expert_key] = stacked_tensor[j] + + model_assets = [config, tokenizer] + + print("Saving to safetensors") + save_model_weights(split_hf_path, new_state_dict, model_assets=model_assets) + + +if __name__ == "__main__": + parser = ArgumentParser() + parser.add_argument("--merge_hf_path", type=str, required=True) + parser.add_argument("--split_hf_path", type=str, required=True) + args = parser.parse_args() + main(args.merge_hf_path, args.split_hf_path) diff --git a/code/RL_model/verl/verl_train/wandb/run-20260202_002102-yafkkssq/run-yafkkssq.wandb b/code/RL_model/verl/verl_train/wandb/run-20260202_002102-yafkkssq/run-yafkkssq.wandb new file mode 100644 index 0000000000000000000000000000000000000000..22ecee240b4f90ea9cf45a0d3f475f2a146ca301 Binary files /dev/null and b/code/RL_model/verl/verl_train/wandb/run-20260202_002102-yafkkssq/run-yafkkssq.wandb differ diff --git a/code/RL_model/verl/verl_train/wandb/run-20260202_003348-yi6yp3s2/run-yi6yp3s2.wandb 
b/code/RL_model/verl/verl_train/wandb/run-20260202_003348-yi6yp3s2/run-yi6yp3s2.wandb new file mode 100644 index 0000000000000000000000000000000000000000..b2501f7ef948933f14b9ed9d962f1fdf875debd5 Binary files /dev/null and b/code/RL_model/verl/verl_train/wandb/run-20260202_003348-yi6yp3s2/run-yi6yp3s2.wandb differ diff --git a/code/RL_model/verl/verl_train/wandb/run-20260202_010157-i30r1tt3/files/config.yaml b/code/RL_model/verl/verl_train/wandb/run-20260202_010157-i30r1tt3/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..df7d07761e7fbd809313a9ab1c91fe4c72c64b0f --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260202_010157-i30r1tt3/files/config.yaml @@ -0,0 +1,813 @@ +_wandb: + value: + cli_version: 0.24.1 + e: + fuf68lnlibccqvxk70m1032ahqwkzxbk: + args: + - --node-ip-address=172.16.34.29 + - --node-manager-port=41231 + - --object-store-name=/tmp/ray/session_2026-02-02_00-55-57_658139_2118443/sockets/plasma_store + - --raylet-name=/tmp/ray/session_2026-02-02_00-55-57_658139_2118443/sockets/raylet + - --redis-address=None + - --metrics-agent-port=46572 + - --logging-rotate-bytes=536870912 + - --logging-rotate-backup-count=5 + - --runtime-env-agent-port=53339 + - --gcs-address=172.16.34.29:64292 + - --session-name=session_2026-02-02_00-55-57_658139_2118443 + - --temp-dir=/tmp/ray + - --webui=127.0.0.1:8301 + - --cluster-id=3fb45a5efab9c85f2d2dc92d825cba072fe6d77d03b5cf1471358a50 + - --startup-token=128 + - --worker-launch-time-ms=1770011772784 + - --node-id=fba45097558b85364cb7345825a4d246ab3119c5c2a395a3e02fd1c9 + - --runtime-env-hash=1096984665 + cpu_count: 64 + cpu_count_logical: 128 + cudaVersion: "13.0" + disk: + /: + total: "3766429188096" + used: "182816346112" + email: shahidulshakib034@gmail.com + executable: /home/mshahidul/miniconda3/envs/verl2/bin/python3 + git: + commit: d9939add7a2a01923a9088891f913a5d20c4e622 + remote: https://github.com/verl-project/verl + gpu: NVIDIA A100 80GB PCIe + gpu_count: 6 + 
gpu_nvidia: + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "85899345920" + name: NVIDIA A100 80GB PCIe + uuid: GPU-4a3678c7-34a9-356f-f7b7-7f7e2f44b596 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "85899345920" + name: NVIDIA A100 80GB PCIe + uuid: GPU-df506764-0db5-91b4-8ec9-154a3bb8123f + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "85899345920" + name: NVIDIA A100 80GB PCIe + uuid: GPU-2c3dbd62-b384-2996-a0f6-b32dcfcc3538 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "85899345920" + name: NVIDIA A100 80GB PCIe + uuid: GPU-1ff3dabe-4b9a-ea62-5cc3-01f12f32d328 + - architecture: Hopper + cudaCores: 14592 + memoryTotal: "85520809984" + name: NVIDIA H100 PCIe + uuid: GPU-eefc4b8c-0e79-c1d6-a9ff-8325040572eb + - architecture: Hopper + cudaCores: 14592 + memoryTotal: "85520809984" + name: NVIDIA H100 PCIe + uuid: GPU-d42b6057-13e8-1e88-6aa1-9307df72dece + host: gamma + memory: + total: "1081814863872" + os: Linux-5.15.0-160-generic-x86_64-with-glibc2.35 + program: /home/mshahidul/miniconda3/envs/verl2/lib/python3.12/site-packages/ray/_private/workers/default_worker.py + python: CPython 3.12.12 + root: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train + startedAt: "2026-02-02T06:01:57.990006Z" + writerId: fuf68lnlibccqvxk70m1032ahqwkzxbk + m: [] + python_version: 3.12.12 + t: + "1": + - 1 + - 11 + - 30 + - 41 + - 49 + - 50 + - 51 + - 71 + - 95 + - 98 + - 105 + "2": + - 1 + - 11 + - 30 + - 41 + - 49 + - 50 + - 51 + - 71 + - 95 + - 98 + - 105 + "3": + - 2 + - 13 + - 16 + "4": 3.12.12 + "5": 0.24.1 + "6": 4.56.1 + "12": 0.24.1 + "13": linux-x86_64 +actor_rollout_ref: + value: + actor: + _target_: verl.workers.config.FSDPActorConfig + calculate_entropy: false + calculate_sum_pi_squared: false + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + async_save: false + load_contents: + - model + - optimizer + - extra + save_contents: + - model + - optimizer + - extra + clip_ratio: 0.2 + clip_ratio_c: 3 + 
clip_ratio_high: 0.2 + clip_ratio_low: 0.2 + data_loader_seed: 42 + entropy_checkpointing: false + entropy_coeff: 0 + entropy_from_logits_with_chunking: false + freeze_vision_tower: false + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + dtype: bfloat16 + entropy_checkpointing: false + entropy_from_logits_with_chunking: false + forward_only: false + forward_prefetch: false + fsdp_size: -1 + full_determinism: false + model_dtype: fp32 + offload_policy: false + optimizer_offload: false + param_offload: false + reshard_after_forward: true + seed: 42 + strategy: fsdp + ulysses_sequence_parallel_size: 1 + use_orig_params: false + use_torch_compile: true + wrap_policy: + min_num_params: 0 + grad_clip: 1 + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + loss_agg_mode: token-mean + loss_scale_factor: null + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + betas: + - 0.9 + - 0.999 + clip_grad: 1 + lr: 1e-06 + lr_scheduler_type: constant + lr_warmup_steps: -1 + lr_warmup_steps_ratio: 0 + min_lr_ratio: 0 + num_cycles: 0.5 + optimizer: AdamW + optimizer_impl: torch.optim + override_optimizer_config: null + total_training_steps: 90 + warmup_style: null + weight_decay: 0.01 + policy_loss: + _target_: verl.workers.config.PolicyLossConfig + clip_cov_lb: 1 + clip_cov_ratio: 0.0002 + clip_cov_ub: 5 + kl_cov_ratio: 0.0002 + loss_mode: vanilla + ppo_kl_coef: 0.1 + ppo_epochs: 1 + ppo_max_token_len_per_gpu: 16384 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: 32 + ppo_mini_batch_size: 256 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + all_ranks: false + enable: false + ranks: [] + save_path: outputs/profile + tool: null + tool_config: + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + analysis: true + contents: [] + discrete: false + level: level0 + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + 
discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + stack_depth: 32 + trace_alloc_max_entries: 100000 + rollout_n: 3 + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + shuffle: false + strategy: fsdp + sum_pi_squared_checkpointing: false + tau_neg: 1.05 + tau_pos: 1 + ulysses_sequence_parallel_size: 1 + use_dynamic_bsz: false + use_fused_kernels: false + use_kl_loss: true + use_prefix_grouper: false + use_remove_padding: true + use_torch_compile: true + hybrid_engine: true + model: + _target_: verl.workers.config.HFModelConfig + custom_chat_template: null + enable_activation_offload: false + enable_gradient_checkpointing: true + exclude_modules: null + external_lib: null + fused_kernel_options: + impl_backend: torch + hf_config_path: null + lora_adapter_path: null + lora_alpha: 16 + lora_rank: 0 + mtp: + _target_: verl.workers.config.MtpConfig + detach_encoder: false + enable: false + enable_rollout: false + enable_train: false + method: mtp + mtp_loss_scaling_factor: 0.1 + num_speculative_tokens: 1 + speculative_algorithm: EAGLE + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + speculative_num_steps: 3 + path: Qwen/Qwen3-4B-Instruct-2507 + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + tokenizer_path: null + trust_remote_code: false + use_fused_kernels: false + use_liger: false + use_remove_padding: true + use_shm: false + nccl_timeout: 600 + ref: + _target_: verl.workers.config.FSDPActorConfig + entropy_checkpointing: false + entropy_from_logits_with_chunking: false + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + dtype: bfloat16 + entropy_checkpointing: false + entropy_from_logits_with_chunking: false + forward_only: true + forward_prefetch: false + fsdp_size: -1 + full_determinism: false + model_dtype: fp32 + offload_policy: false + optimizer_offload: false + param_offload: false + 
reshard_after_forward: true + seed: 42 + strategy: fsdp + ulysses_sequence_parallel_size: 1 + use_orig_params: false + use_torch_compile: true + wrap_policy: + min_num_params: 0 + log_prob_max_token_len_per_gpu: 16384 + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + all_ranks: false + enable: false + ranks: [] + save_path: outputs/profile + tool: null + tool_config: + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + analysis: true + contents: [] + discrete: false + level: level0 + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + stack_depth: 32 + trace_alloc_max_entries: 100000 + rollout_n: 3 + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + strategy: fsdp + ulysses_sequence_parallel_size: 1 + use_torch_compile: true + rollout: + _target_: verl.workers.config.RolloutConfig + agent: + _target_: verl.workers.config.AgentLoopConfig + agent_loop_config_path: null + custom_async_server: + _target_: verl.workers.config.CustomAsyncServerConfig + name: null + path: null + default_agent_loop: single_turn_agent + num_workers: 8 + calculate_log_probs: false + checkpoint_engine: + _target_: verl.workers.config.CheckpointEngineConfig + backend: naive + update_weights_bucket_megabytes: 2048 + cudagraph_capture_sizes: null + data_parallel_size: 1 + disable_log_stats: true + do_sample: true + dtype: bfloat16 + enable_chunked_prefill: true + enable_prefix_caching: true + enable_rollout_routing_replay: false + enforce_eager: false + expert_parallel_size: 1 + free_cache_engine: true + gpu_memory_utilization: 0.6 + ignore_eos: false + layered_summon: false + 
load_format: dummy + log_prob_max_token_len_per_gpu: 16384 + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: false + logprobs_mode: processed_logprobs + max_model_len: 8192 + max_num_batched_tokens: 8192 + max_num_seqs: 1024 + mode: async + mtp: + _target_: verl.workers.config.MtpConfig + detach_encoder: false + enable: false + enable_rollout: false + enable_train: false + method: mtp + mtp_loss_scaling_factor: 0.1 + num_speculative_tokens: 1 + speculative_algorithm: EAGLE + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + speculative_num_steps: 3 + multi_stage_wake_up: false + multi_turn: + _target_: verl.workers.config.MultiTurnConfig + enable: false + format: hermes + interaction_config_path: null + max_assistant_turns: null + max_parallel_calls: 1 + max_tool_response_length: 256 + max_user_turns: null + num_repeat_rollouts: null + tokenization_sanity_check_mode: strict + tool_config_path: null + tool_response_truncate_side: middle + use_inference_chat_template: false + "n": 3 + name: vllm + over_sample_rate: 0 + pipeline_model_parallel_size: 1 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + all_ranks: false + enable: false + ranks: [] + save_path: outputs/profile + tool: null + tool_config: + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + analysis: true + contents: [] + discrete: false + level: level0 + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + stack_depth: 32 + trace_alloc_max_entries: 100000 + prometheus: + _target_: verl.workers.config.PrometheusConfig + enable: false + file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml + port: 9090 + served_model_name: Qwen/Qwen3-4B-Instruct-2507 + prompt_length: 512 + quantization: null + 
quantization_config_file: null + response_length: 768 + scheduling_policy: fcfs + skip_dump_dir: /tmp/rollout_dump + skip_rollout: false + skip_tokenizer_init: true + temperature: 1 + tensor_model_parallel_size: 1 + top_k: -1 + top_p: 1 + trace: + _target_: verl.workers.config.TraceConfig + backend: null + max_samples_per_step_per_worker: null + token2text: false + val_kwargs: + _target_: verl.workers.config.SamplingConfig + do_sample: false + "n": 1 + temperature: 0 + top_k: -1 + top_p: 1 +algorithm: + value: + _target_: verl.trainer.config.AlgoConfig + adv_estimator: grpo + gamma: 1 + kl_ctrl: + _target_: verl.trainer.config.KLControlConfig + horizon: 10000 + kl_coef: 0.001 + target_kl: 0.1 + type: fixed + kl_penalty: kl + lam: 1 + norm_adv_by_std_in_grpo: true + pf_ppo: + reweight_method: pow + weight_pow: 2 + rollout_correction: + bypass_mode: false + loss_type: ppo_clip + rollout_is: null + rollout_is_batch_normalize: false + rollout_is_threshold: 2 + rollout_rs: null + rollout_rs_threshold: null + use_kl_in_reward: false + use_pf_ppo: false +critic: + value: + _target_: verl.workers.config.FSDPCriticConfig + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + async_save: false + load_contents: + - model + - optimizer + - extra + save_contents: + - model + - optimizer + - extra + cliprange_value: 0.5 + data_loader_seed: 42 + enable: null + forward_max_token_len_per_gpu: 32768 + forward_micro_batch_size: null + forward_micro_batch_size_per_gpu: null + grad_clip: 1 + loss_agg_mode: token-mean + model: + _target_: verl.workers.config.FSDPCriticModelCfg + enable_activation_offload: false + enable_gradient_checkpointing: true + external_lib: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + dtype: bfloat16 + entropy_checkpointing: false + entropy_from_logits_with_chunking: false + forward_only: false + forward_prefetch: false + fsdp_size: -1 + full_determinism: false + model_dtype: fp32 + offload_policy: false + optimizer_offload: 
false + param_offload: false + reshard_after_forward: true + seed: 42 + strategy: fsdp + ulysses_sequence_parallel_size: 1 + use_orig_params: false + use_torch_compile: true + wrap_policy: + min_num_params: 0 + lora_alpha: 16 + lora_rank: 0 + path: ~/models/deepseek-llm-7b-chat + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + tokenizer_path: Qwen/Qwen3-4B-Instruct-2507 + trust_remote_code: false + use_remove_padding: false + use_shm: false + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + betas: + - 0.9 + - 0.999 + clip_grad: 1 + lr: 1e-05 + lr_scheduler_type: constant + lr_warmup_steps: -1 + lr_warmup_steps_ratio: 0 + min_lr_ratio: 0 + num_cycles: 0.5 + optimizer: AdamW + optimizer_impl: torch.optim + override_optimizer_config: null + total_training_steps: 90 + warmup_style: null + weight_decay: 0.01 + ppo_epochs: 1 + ppo_max_token_len_per_gpu: 32768 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: null + ppo_mini_batch_size: 256 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + all_ranks: false + enable: false + ranks: [] + save_path: outputs/profile + tool: null + tool_config: + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + analysis: true + contents: [] + discrete: false + level: level0 + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + stack_depth: 32 + trace_alloc_max_entries: 100000 + rollout_n: 3 + shuffle: false + strategy: fsdp + ulysses_sequence_parallel_size: 1 + use_dynamic_bsz: false +custom_reward_function: + value: + name: compute_score + path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py +data: + value: + custom_cls: + name: null + path: null + datagen: + name: null + path: null + dataloader_num_workers: 8 + 
filter_overlong_prompts: true + filter_overlong_prompts_workers: 1 + image_key: images + image_patch_size: 14 + max_prompt_length: 512 + max_response_length: 768 + prompt_key: prompt + return_full_prompt: false + return_multi_modal_inputs: true + return_raw_chat: true + return_raw_input_ids: false + reward_fn_key: data_source + sampler: + class_name: null + class_path: null + seed: null + shuffle: true + tokenizer: null + tool_config_path: null + train_batch_size: 512 + train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + train_max_samples: -1 + truncation: error + trust_remote_code: false + use_shm: false + val_batch_size: null + val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + val_max_samples: -1 + validation_shuffle: false + video_key: videos +global_profiler: + value: + _target_: verl.utils.profiler.ProfilerConfig + global_tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + controller_nsight_options: + cuda-graph-trace: graph + cuda-memory-usage: "true" + trace: cuda,nvtx,cublas,ucx + discrete: false + worker_nsight_options: + capture-range: cudaProfilerApi + capture-range-end: null + cuda-graph-trace: graph + cuda-memory-usage: "true" + kill: none + trace: cuda,nvtx,cublas,ucx + torch_memory: + context: all + stack_depth: 32 + stacks: all + trace_alloc_max_entries: 100000 + profile_continuous_steps: false + save_path: outputs/profile + steps: null + tool: null +ray_kwargs: + value: + ray_init: + num_cpus: null + timeline_json_file: null +reward_manager: + value: + _target_: verl.trainer.config.config.RewardManagerConfig + module: + _target_: verl.trainer.config.config.ModuleConfig + name: custom_reward_manager + path: null + name: naive + source: register +reward_model: + value: + enable: false + enable_resource_pool: false + forward_max_token_len_per_gpu: 32768 + launch_reward_fn_async: false + max_length: null + micro_batch_size: null + 
micro_batch_size_per_gpu: null + model: + external_lib: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + forward_prefetch: false + fsdp_size: -1 + param_offload: false + reshard_after_forward: true + wrap_policy: + min_num_params: 0 + input_tokenizer: Qwen/Qwen3-4B-Instruct-2507 + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + trust_remote_code: false + use_fused_kernels: false + use_remove_padding: false + use_shm: false + n_gpus_per_node: 8 + nnodes: 0 + num_workers: 1 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + all_ranks: false + enable: false + ranks: [] + save_path: outputs/profile + tool: null + tool_config: + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + analysis: true + contents: [] + discrete: false + level: level0 + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + stack_depth: 32 + trace_alloc_max_entries: 100000 + reward_loop_class_name: null + reward_loop_module_path: null + reward_loop_source: register + reward_manager: naive + rollout: + _target_: verl.workers.config.RolloutConfig + cudagraph_capture_sizes: null + data_parallel_size: 1 + disable_log_stats: true + dtype: bfloat16 + enable_chunked_prefill: true + enable_prefix_caching: true + enforce_eager: true + expert_parallel_size: 1 + free_cache_engine: true + gpu_memory_utilization: 0.5 + limit_images: null + load_format: auto + max_model_len: null + max_num_batched_tokens: 8192 + max_num_seqs: 1024 + name: ??? 
+ prompt_length: 2048 + response_length: 2048 + skip_tokenizer_init: false + tensor_model_parallel_size: 2 + sandbox_fusion: + max_concurrent: 64 + memory_limit_mb: 1024 + url: null + strategy: fsdp + ulysses_sequence_parallel_size: 1 + use_dynamic_bsz: false + use_reward_loop: true +trainer: + value: + balance_batch: true + critic_warmup: 0 + default_hdfs_dir: null + default_local_dir: checkpoints/readctrl-verl/qwen3-4b-instruct-optimized-multiclinsum-gs + del_local_ckpt_after_load: false + device: cuda + esi_redundant_time: 0 + experiment_name: qwen3-4b-instruct-optimized-multiclinsum-gs + log_val_generations: 0 + logger: + - console + - wandb + max_actor_ckpt_to_keep: null + max_critic_ckpt_to_keep: null + n_gpus_per_node: 2 + nnodes: 1 + project_name: readctrl-verl + ray_wait_register_center_timeout: 300 + resume_from_path: null + resume_mode: auto + rollout_data_dir: null + save_freq: 20 + test_freq: 5 + total_epochs: 15 + total_training_steps: null + use_legacy_worker_impl: auto + val_before_train: true + val_only: false + validation_data_dir: null +transfer_queue: + value: + enable: false diff --git a/code/RL_model/verl/verl_train/wandb/run-20260202_010157-i30r1tt3/files/output.log b/code/RL_model/verl/verl_train/wandb/run-20260202_010157-i30r1tt3/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..25215981b2894427eeca8423d2d416d8ae0ac1e6 --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260202_010157-i30r1tt3/files/output.log @@ -0,0 +1,6 @@ +wandb: Detected [openai] in use. +wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script. 
+wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/ +Checkpoint tracker file does not exist: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/checkpoints/readctrl-verl/qwen3-4b-instruct-optimized-multiclinsum-gs/latest_checkpointed_iteration.txt +Training from scratch +test_gen_batch meta info: {'eos_token_id': 151645, 'pad_token_id': 151643, 'recompute_log_prob': False, 'do_sample': False, 'validate': True, 'global_steps': 0} diff --git a/code/RL_model/verl/verl_train/wandb/run-20260202_010157-i30r1tt3/files/requirements.txt b/code/RL_model/verl/verl_train/wandb/run-20260202_010157-i30r1tt3/files/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..c168a7097ef00c84a34fe1db69f95d6eaafb03f2 --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260202_010157-i30r1tt3/files/requirements.txt @@ -0,0 +1,269 @@ +verl==0.8.0.dev0 +psutil==7.1.3 +colorama==0.4.6 +annotated-doc==0.0.4 +sentry-sdk==2.51.0 +requests==2.32.5 +nvidia-cufile-cu12==1.13.1.3 +ml_dtypes==0.5.4 +xformers==0.0.32.post1 +sglang==0.5.2 +multidict==6.7.1 +typing_extensions==4.15.0 +nvidia-cusparselt-cu12==0.7.1 +openai-harmony==0.0.4 +transformers==4.56.1 +Werkzeug==3.1.5 +identify==2.6.16 +pytest==9.0.2 +nvidia-cuda-runtime-cu12==12.8.90 +GitPython==3.1.46 +cupy-cuda12x==13.6.0 +tokenizers==0.22.2 +pybind11==3.0.1 +google-api-core==2.29.0 +partial-json-parser==0.2.1.1.post7 +aiohttp-cors==0.8.1 +sniffio==1.3.1 +tensordict==0.10.0 +smart_open==7.5.0 +cffi==2.0.0 +wcwidth==0.5.3 +asttokens==3.0.1 +opencensus==0.11.4 +rpds-py==0.30.0 +py-spy==0.4.1 +gguf==0.17.1 +nvidia-nvjitlink-cu12==12.8.93 +httpx==0.28.1 +cuda-python==13.1.1 +annotated-types==0.7.0 +regex==2026.1.15 +vllm==0.11.0 +idna==3.11 +fsspec==2025.10.0 +parso==0.8.5 +pydantic-extra-types==2.11.0 +MarkupSafe==3.0.3 +cryptography==46.0.4 +filelock==3.20.3 +modelscope==1.34.0 +outlines==0.1.11 +dnspython==2.8.0 +scipy==1.17.0 +zipp==3.23.0 +PyYAML==6.0.3 
+onnx==1.20.1 +torchdata==0.11.0 +cuda-pathfinder==1.3.3 +verl==0.8.0.dev0 +httptools==0.7.1 +opencv-python-headless==4.13.0.90 +importlib_metadata==8.7.1 +peft==0.18.1 +opentelemetry-sdk==1.39.1 +python-json-logger==4.0.0 +cuda-bindings==13.1.1 +mdurl==0.1.2 +referencing==0.37.0 +xxhash==3.6.0 +interegular==0.3.3 +fastapi-cli==0.0.20 +uv==0.9.28 +tensorboard==2.20.0 +nvidia-cublas-cu12==12.8.4.1 +sentencepiece==0.2.1 +rich-toolkit==0.18.1 +numpy==2.2.0 +yarl==1.22.0 +opencv-fixer==0.2.5 +tqdm==4.67.2 +python-dotenv==1.2.1 +timm==1.0.16 +aiohappyeyeballs==2.6.1 +decord==0.6.0 +jiter==0.12.0 +airportsdata==20250909 +markdown-it-py==4.0.0 +nvidia-cusolver-cu12==11.7.3.90 +pyarrow==23.0.0 +opentelemetry-proto==1.39.1 +anyio==4.12.1 +pycryptodomex==3.23.0 +prometheus_client==0.24.1 +aiohttp==3.13.3 +urllib3==2.6.3 +pexpect==4.9.0 +pydantic-settings==2.12.0 +distro==1.9.0 +av==16.1.0 +cloudpickle==3.1.2 +mpmath==1.3.0 +certifi==2026.1.4 +antlr4-python3-runtime==4.9.3 +torchvision==0.23.0 +accelerate==1.12.0 +watchfiles==1.1.1 +ruff==0.14.14 +wheel==0.46.3 +omegaconf==2.3.0 +nvidia-cufft-cu12==11.3.3.83 +multiprocess==0.70.18 +frozendict==2.4.7 +sympy==1.14.0 +setproctitle==1.3.7 +setuptools==79.0.1 +py-cpuinfo==9.0.0 +ipython_pygments_lexers==1.1.1 +rich==14.3.2 +uvicorn==0.40.0 +openai==1.99.1 +outlines_core==0.2.11 +llvmlite==0.44.0 +nvidia-cuda-cupti-cu12==12.8.90 +attrs==25.4.0 +anthropic==0.77.0 +packaging==25.0 +fastrlock==0.8.3 +astor==0.8.1 +pluggy==1.6.0 +nvidia-cuda-nvrtc-cu12==12.8.93 +psutil==7.2.2 +virtualenv==20.36.1 +cbor2==5.8.0 +compressed-tensors==0.11.0 +nvidia-cusparse-cu12==12.5.8.93 +networkx==3.6.1 +httpcore==1.0.9 +onnxscript==0.3.1 +smmap==5.0.2 +opencv-python==4.13.0.90 +traitlets==5.14.3 +python-multipart==0.0.22 +pyvers==0.1.0 +huggingface-hub==0.36.0 +pillow==12.1.0 +jsonschema==4.26.0 +cfgv==3.5.0 +optree==0.18.0 +email-validator==2.3.0 +tabulate==0.9.0 +pre_commit==4.5.1 +msgpack==1.1.2 +depyf==0.19.0 +numba==0.61.2 +six==1.17.0 
+aiosignal==1.4.0 +nvidia-nvtx-cu12==12.8.90 +propcache==0.4.1 +torch_memory_saver==0.0.8 +h11==0.16.0 +frozenlist==1.8.0 +websockets==16.0 +nvidia-cudnn-frontend==1.18.0 +build==1.4.0 +google-auth==2.48.0 +pycountry==24.6.1 +stack-data==0.6.3 +typing-inspection==0.4.2 +googleapis-common-protos==1.72.0 +pandas==3.0.0 +typer==0.21.1 +protobuf==6.33.5 +fastapi==0.128.0 +blake3==1.0.8 +opentelemetry-semantic-conventions==0.60b1 +opentelemetry-exporter-prometheus==0.60b1 +nvidia-cudnn-cu12==9.10.2.21 +Markdown==3.10.1 +liger_kernel==0.6.4 +nodeenv==1.10.0 +prompt_toolkit==3.0.52 +torchaudio==2.8.0 +codetiming==1.4.0 +platformdirs==4.5.1 +jsonschema-specifications==2025.9.1 +hydra-core==1.3.2 +tensorboard-data-server==0.7.2 +lm-format-enforcer==0.11.3 +pyasn1_modules==0.4.2 +tiktoken==0.12.0 +starlette==0.50.0 +pyproject_hooks==1.2.0 +flash_attn==2.8.1 +rsa==4.9.1 +ray==2.53.0 +nest-asyncio==1.6.0 +lark==1.2.2 +fastar==0.8.0 +orjson==3.11.6 +prometheus-fastapi-instrumentator==7.1.0 +opentelemetry-api==1.39.1 +mathruler==0.1.0 +pydantic_core==2.41.5 +fastapi-cloud-cli==0.11.0 +pynvml==13.0.1 +loguru==0.7.3 +torch==2.8.0 +msgspec==0.20.0 +nvidia-curand-cu12==10.3.9.90 +blobfile==3.0.0 +gitdb==4.0.12 +llguidance==0.7.30 +hf_transfer==0.1.9 +nvidia-nccl-cu12==2.27.3 +qwen-vl-utils==0.0.14 +ptyprocess==0.7.0 +ipdb==0.13.13 +opencensus-context==0.1.3 +jedi==0.19.2 +click==8.3.1 +datasets==4.5.0 +soxr==1.0.0 +sgl-kernel==0.3.9.post2 +colorful==0.5.8 +pyasn1==0.6.2 +charset-normalizer==3.4.4 +nvidia-ml-py==13.590.48 +hf-xet==1.2.0 +dill==0.4.0 +absl-py==2.4.0 +pydantic==2.12.5 +wrapt==2.1.0 +flashinfer-python==0.3.1 +python-dateutil==2.9.0.post0 +torchao==0.9.0 +cachetools==7.0.0 +soundfile==0.13.1 +diskcache==5.6.3 +onnx-ir==0.1.15 +docstring_parser==0.17.0 +matplotlib-inline==0.2.1 +Pygments==2.19.2 +wandb==0.24.1 +pure_eval==0.2.3 +ninja==1.13.0 +proto-plus==1.27.0 +pyzmq==27.1.0 +iniconfig==2.3.0 +Jinja2==3.1.6 +megatron-core==0.13.1 +uvloop==0.22.1 +pycparser==3.0 
+pylatexenc==2.10 +decorator==5.2.1 +shellingham==1.5.4 +lxml==6.0.2 +safetensors==0.7.0 +xgrammar==0.1.25 +pybase64==1.4.3 +ipython==9.9.0 +mistral_common==1.9.0 +rignore==0.7.6 +einops==0.8.2 +distlib==0.4.0 +triton==3.4.0 +executing==2.2.1 +grpcio==1.76.0 +pip==25.3 +verl==0.8.0.dev0 diff --git a/code/RL_model/verl/verl_train/wandb/run-20260202_010157-i30r1tt3/files/wandb-metadata.json b/code/RL_model/verl/verl_train/wandb/run-20260202_010157-i30r1tt3/files/wandb-metadata.json new file mode 100644 index 0000000000000000000000000000000000000000..7cedb2d254237655940edeaeacdce77b6cb61643 --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260202_010157-i30r1tt3/files/wandb-metadata.json @@ -0,0 +1,93 @@ +{ + "os": "Linux-5.15.0-160-generic-x86_64-with-glibc2.35", + "python": "CPython 3.12.12", + "startedAt": "2026-02-02T06:01:57.990006Z", + "args": [ + "--node-ip-address=172.16.34.29", + "--node-manager-port=41231", + "--object-store-name=/tmp/ray/session_2026-02-02_00-55-57_658139_2118443/sockets/plasma_store", + "--raylet-name=/tmp/ray/session_2026-02-02_00-55-57_658139_2118443/sockets/raylet", + "--redis-address=None", + "--metrics-agent-port=46572", + "--logging-rotate-bytes=536870912", + "--logging-rotate-backup-count=5", + "--runtime-env-agent-port=53339", + "--gcs-address=172.16.34.29:64292", + "--session-name=session_2026-02-02_00-55-57_658139_2118443", + "--temp-dir=/tmp/ray", + "--webui=127.0.0.1:8301", + "--cluster-id=3fb45a5efab9c85f2d2dc92d825cba072fe6d77d03b5cf1471358a50", + "--startup-token=128", + "--worker-launch-time-ms=1770011772784", + "--node-id=fba45097558b85364cb7345825a4d246ab3119c5c2a395a3e02fd1c9", + "--runtime-env-hash=1096984665" + ], + "program": "/home/mshahidul/miniconda3/envs/verl2/lib/python3.12/site-packages/ray/_private/workers/default_worker.py", + "git": { + "remote": "https://github.com/verl-project/verl", + "commit": "d9939add7a2a01923a9088891f913a5d20c4e622" + }, + "email": "shahidulshakib034@gmail.com", + "root": 
"/data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train", + "host": "gamma", + "executable": "/home/mshahidul/miniconda3/envs/verl2/bin/python3", + "cpu_count": 64, + "cpu_count_logical": 128, + "gpu": "NVIDIA A100 80GB PCIe", + "gpu_count": 6, + "disk": { + "/": { + "total": "3766429188096", + "used": "182816346112" + } + }, + "memory": { + "total": "1081814863872" + }, + "gpu_nvidia": [ + { + "name": "NVIDIA A100 80GB PCIe", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-4a3678c7-34a9-356f-f7b7-7f7e2f44b596" + }, + { + "name": "NVIDIA A100 80GB PCIe", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-df506764-0db5-91b4-8ec9-154a3bb8123f" + }, + { + "name": "NVIDIA A100 80GB PCIe", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-2c3dbd62-b384-2996-a0f6-b32dcfcc3538" + }, + { + "name": "NVIDIA A100 80GB PCIe", + "memoryTotal": "85899345920", + "cudaCores": 6912, + "architecture": "Ampere", + "uuid": "GPU-1ff3dabe-4b9a-ea62-5cc3-01f12f32d328" + }, + { + "name": "NVIDIA H100 PCIe", + "memoryTotal": "85520809984", + "cudaCores": 14592, + "architecture": "Hopper", + "uuid": "GPU-eefc4b8c-0e79-c1d6-a9ff-8325040572eb" + }, + { + "name": "NVIDIA H100 PCIe", + "memoryTotal": "85520809984", + "cudaCores": 14592, + "architecture": "Hopper", + "uuid": "GPU-d42b6057-13e8-1e88-6aa1-9307df72dece" + } + ], + "cudaVersion": "13.0", + "writerId": "fuf68lnlibccqvxk70m1032ahqwkzxbk" +} \ No newline at end of file diff --git a/code/RL_model/verl/verl_train/wandb/run-20260202_010157-i30r1tt3/files/wandb-summary.json b/code/RL_model/verl/verl_train/wandb/run-20260202_010157-i30r1tt3/files/wandb-summary.json new file mode 100644 index 0000000000000000000000000000000000000000..c90edda74f6e5a2a5eb008d0c2f1dd0de08aa4f7 --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260202_010157-i30r1tt3/files/wandb-summary.json @@ -0,0 +1 @@ 
+{"_runtime":82,"_wandb":{"runtime":82}} \ No newline at end of file diff --git a/code/RL_model/verl/verl_train/wandb/run-20260202_010157-i30r1tt3/logs/debug-core.log b/code/RL_model/verl/verl_train/wandb/run-20260202_010157-i30r1tt3/logs/debug-core.log new file mode 100644 index 0000000000000000000000000000000000000000..fbe160fb08386e559dffef07bae3054c0f7d2985 --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260202_010157-i30r1tt3/logs/debug-core.log @@ -0,0 +1,11 @@ +{"time":"2026-02-02T01:01:58.156691718-05:00","level":"INFO","msg":"main: starting server","port-filename":"/tmp/tmpolpydbzo/port-2126993.txt","pid":2126993,"log-level":0,"disable-analytics":false,"shutdown-on-parent-exit":false,"enable-dcgm-profiling":false} +{"time":"2026-02-02T01:01:58.157903644-05:00","level":"INFO","msg":"server: will exit if parent process dies","ppid":2126993} +{"time":"2026-02-02T01:01:58.157883049-05:00","level":"INFO","msg":"server: accepting connections","addr":{"Name":"/tmp/wandb-2126993-2137614-1550517905/socket","Net":"unix"}} +{"time":"2026-02-02T01:01:58.311753734-05:00","level":"INFO","msg":"connection: ManageConnectionData: new connection created","id":"1(@)"} +{"time":"2026-02-02T01:01:58.325882885-05:00","level":"INFO","msg":"handleInformInit: received","streamId":"i30r1tt3","id":"1(@)"} +{"time":"2026-02-02T01:02:00.237452689-05:00","level":"INFO","msg":"handleInformInit: stream started","streamId":"i30r1tt3","id":"1(@)"} +{"time":"2026-02-02T01:02:07.147226508-05:00","level":"INFO","msg":"connection: cancelling request","id":"1(@)","requestId":"z0k9sho6yl49"} +{"time":"2026-02-02T01:03:22.63651528-05:00","level":"INFO","msg":"connection: cancelling request","id":"1(@)","requestId":"z0k9sho6yl49"} +{"time":"2026-02-02T01:03:23.241240894-05:00","level":"INFO","msg":"connection: cancelling request","id":"1(@)","requestId":"z0k9sho6yl49"} +{"time":"2026-02-02T01:03:23.243936572-05:00","level":"INFO","msg":"handleInformFinish: finish message 
received","streamId":"i30r1tt3","id":"1(@)"} +{"time":"2026-02-02T01:03:23.246321952-05:00","level":"INFO","msg":"handleInformFinish: stream closed","streamId":"i30r1tt3","id":"1(@)"} diff --git a/code/RL_model/verl/verl_train/wandb/run-20260202_010157-i30r1tt3/logs/debug-internal.log b/code/RL_model/verl/verl_train/wandb/run-20260202_010157-i30r1tt3/logs/debug-internal.log new file mode 100644 index 0000000000000000000000000000000000000000..e54375be33a3ea82ebea37eb7253e19a34d5f9ea --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260202_010157-i30r1tt3/logs/debug-internal.log @@ -0,0 +1,12 @@ +{"time":"2026-02-02T01:01:58.327001572-05:00","level":"INFO","msg":"stream: starting","core version":"0.24.1"} +{"time":"2026-02-02T01:02:00.234182291-05:00","level":"INFO","msg":"stream: created new stream","id":"i30r1tt3"} +{"time":"2026-02-02T01:02:00.234395295-05:00","level":"INFO","msg":"handler: started","stream_id":"i30r1tt3"} +{"time":"2026-02-02T01:02:00.237397058-05:00","level":"INFO","msg":"stream: started","id":"i30r1tt3"} +{"time":"2026-02-02T01:02:00.237426491-05:00","level":"INFO","msg":"writer: started","stream_id":"i30r1tt3"} +{"time":"2026-02-02T01:02:00.237461995-05:00","level":"INFO","msg":"sender: started","stream_id":"i30r1tt3"} +{"time":"2026-02-02T01:03:23.096185701-05:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2026-02-02T01:03:23.235995802-05:00","level":"INFO","msg":"handler: operation stats","stats":{}} +{"time":"2026-02-02T01:03:23.243976554-05:00","level":"INFO","msg":"stream: closing","id":"i30r1tt3"} +{"time":"2026-02-02T01:03:23.244006004-05:00","level":"INFO","msg":"handler: closed","stream_id":"i30r1tt3"} +{"time":"2026-02-02T01:03:23.245634572-05:00","level":"INFO","msg":"sender: closed","stream_id":"i30r1tt3"} +{"time":"2026-02-02T01:03:23.245676977-05:00","level":"INFO","msg":"stream: closed","id":"i30r1tt3"} diff --git 
a/code/RL_model/verl/verl_train/wandb/run-20260202_010157-i30r1tt3/logs/debug.log b/code/RL_model/verl/verl_train/wandb/run-20260202_010157-i30r1tt3/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..fb78edb84ceff7e5ad76c345220e0a9063904372 --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260202_010157-i30r1tt3/logs/debug.log @@ -0,0 +1,24 @@ +2026-02-02 01:01:58,009 INFO MainThread:2126993 [wandb_setup.py:_flush():81] Current SDK version is 0.24.1 +2026-02-02 01:01:58,010 INFO MainThread:2126993 [wandb_setup.py:_flush():81] Configure stats pid to 2126993 +2026-02-02 01:01:58,010 INFO MainThread:2126993 [wandb_setup.py:_flush():81] Loading settings from environment variables +2026-02-02 01:01:58,010 INFO MainThread:2126993 [wandb_init.py:setup_run_log_directory():717] Logging user logs to /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/wandb/run-20260202_010157-i30r1tt3/logs/debug.log +2026-02-02 01:01:58,010 INFO MainThread:2126993 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/wandb/run-20260202_010157-i30r1tt3/logs/debug-internal.log +2026-02-02 01:01:58,010 INFO MainThread:2126993 [wandb_init.py:init():844] calling init triggers +2026-02-02 01:01:58,012 INFO MainThread:2126993 [wandb_init.py:init():849] wandb.init called with sweep_config: {} +config: {'actor_rollout_ref': {'actor': {'optim': {'_target_': 'verl.workers.config.FSDPOptimizerConfig', 'optimizer': 'AdamW', 'optimizer_impl': 'torch.optim', 'lr': 1e-06, 'lr_warmup_steps_ratio': 0.0, 'total_training_steps': 90, 'weight_decay': 0.01, 'lr_warmup_steps': -1, 'betas': [0.9, 0.999], 'clip_grad': 1.0, 'min_lr_ratio': 0.0, 'num_cycles': 0.5, 'lr_scheduler_type': 'constant', 'warmup_style': None, 'override_optimizer_config': None}, 'fsdp_config': {'_target_': 'verl.workers.config.FSDPEngineConfig', 'wrap_policy': {'min_num_params': 0}, 'param_offload': False, 
'optimizer_offload': False, 'offload_policy': False, 'reshard_after_forward': True, 'fsdp_size': -1, 'forward_prefetch': False, 'model_dtype': 'fp32', 'use_orig_params': False, 'seed': 42, 'full_determinism': False, 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'use_torch_compile': True, 'entropy_checkpointing': False, 'forward_only': False, 'strategy': 'fsdp', 'dtype': 'bfloat16'}, '_target_': 'verl.workers.config.FSDPActorConfig', 'rollout_n': 3, 'strategy': 'fsdp', 'ppo_mini_batch_size': 256, 'ppo_micro_batch_size': None, 'ppo_micro_batch_size_per_gpu': 32, 'use_dynamic_bsz': False, 'ppo_max_token_len_per_gpu': 16384, 'clip_ratio': 0.2, 'clip_ratio_low': 0.2, 'clip_ratio_high': 0.2, 'tau_pos': 1.0, 'tau_neg': 1.05, 'freeze_vision_tower': False, 'policy_loss': {'_target_': 'verl.workers.config.PolicyLossConfig', 'loss_mode': 'vanilla', 'clip_cov_ratio': 0.0002, 'clip_cov_lb': 1.0, 'clip_cov_ub': 5.0, 'kl_cov_ratio': 0.0002, 'ppo_kl_coef': 0.1}, 'clip_ratio_c': 3.0, 'loss_agg_mode': 'token-mean', 'loss_scale_factor': None, 'entropy_coeff': 0, 'calculate_entropy': False, 'use_kl_loss': True, 'use_prefix_grouper': False, 'use_torch_compile': True, 'kl_loss_coef': 0.001, 'kl_loss_type': 'low_var_kl', 'ppo_epochs': 1, 'shuffle': False, 'data_loader_seed': 42, 'checkpoint': {'_target_': 'verl.trainer.config.CheckpointConfig', 'save_contents': ['model', 'optimizer', 'extra'], 'load_contents': ['model', 'optimizer', 'extra'], 'async_save': False}, 'use_fused_kernels': False, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 
'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'router_replay': {'_target_': 'verl.workers.config.RouterReplayConfig', 'mode': 'disabled', 'record_file': None, 'replay_file': None}, 'grad_clip': 1.0, 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'entropy_checkpointing': False, 'use_remove_padding': True, 'calculate_sum_pi_squared': False, 'sum_pi_squared_checkpointing': False}, 'ref': {'rollout_n': 3, 'strategy': 'fsdp', 'use_torch_compile': True, 'log_prob_micro_batch_size': None, 'log_prob_micro_batch_size_per_gpu': 32, 'log_prob_use_dynamic_bsz': False, 'log_prob_max_token_len_per_gpu': 16384, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'router_replay': {'_target_': 'verl.workers.config.RouterReplayConfig', 'mode': 'disabled', 'record_file': None, 'replay_file': None}, 'fsdp_config': {'_target_': 'verl.workers.config.FSDPEngineConfig', 'wrap_policy': {'min_num_params': 0}, 'param_offload': False, 'optimizer_offload': False, 'offload_policy': False, 'reshard_after_forward': True, 'fsdp_size': -1, 'forward_prefetch': False, 'model_dtype': 'fp32', 'use_orig_params': False, 'seed': 42, 'full_determinism': False, 'ulysses_sequence_parallel_size': 1, 
'entropy_from_logits_with_chunking': False, 'use_torch_compile': True, 'entropy_checkpointing': False, 'forward_only': True, 'strategy': 'fsdp', 'dtype': 'bfloat16'}, '_target_': 'verl.workers.config.FSDPActorConfig', 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'entropy_checkpointing': False}, 'rollout': {'_target_': 'verl.workers.config.RolloutConfig', 'name': 'vllm', 'mode': 'async', 'temperature': 1.0, 'top_k': -1, 'top_p': 1, 'prompt_length': 512, 'response_length': 768, 'dtype': 'bfloat16', 'gpu_memory_utilization': 0.6, 'ignore_eos': False, 'enforce_eager': False, 'cudagraph_capture_sizes': None, 'free_cache_engine': True, 'tensor_model_parallel_size': 1, 'data_parallel_size': 1, 'expert_parallel_size': 1, 'pipeline_model_parallel_size': 1, 'max_num_batched_tokens': 8192, 'max_model_len': 8192, 'max_num_seqs': 1024, 'enable_chunked_prefill': True, 'enable_prefix_caching': True, 'logprobs_mode': 'processed_logprobs', 'scheduling_policy': 'fcfs', 'load_format': 'dummy', 'log_prob_micro_batch_size': None, 'log_prob_micro_batch_size_per_gpu': 32, 'log_prob_use_dynamic_bsz': False, 'log_prob_max_token_len_per_gpu': 16384, 'disable_log_stats': True, 'do_sample': True, 'n': 3, 'over_sample_rate': 0, 'multi_stage_wake_up': False, 'engine_kwargs': {'vllm': {}, 'sglang': {}, 'trtllm': {}}, 'val_kwargs': {'_target_': 'verl.workers.config.SamplingConfig', 'top_k': -1, 'top_p': 1.0, 'temperature': 0, 'n': 1, 'do_sample': False}, 'multi_turn': {'_target_': 'verl.workers.config.MultiTurnConfig', 'enable': False, 'max_assistant_turns': None, 'tool_config_path': None, 'max_user_turns': None, 'max_parallel_calls': 1, 'max_tool_response_length': 256, 'tool_response_truncate_side': 'middle', 'interaction_config_path': None, 'use_inference_chat_template': False, 'tokenization_sanity_check_mode': 'strict', 'format': 'hermes', 'num_repeat_rollouts': None}, 'calculate_log_probs': False, 'agent': {'_target_': 'verl.workers.config.AgentLoopConfig', 
'num_workers': 8, 'default_agent_loop': 'single_turn_agent', 'agent_loop_config_path': None, 'custom_async_server': {'_target_': 'verl.workers.config.CustomAsyncServerConfig', 'path': None, 'name': None}}, 'checkpoint_engine': {'_target_': 'verl.workers.config.CheckpointEngineConfig', 'backend': 'naive', 'update_weights_bucket_megabytes': 2048, 'engine_kwargs': {}}, 'trace': {'_target_': 'verl.workers.config.TraceConfig', 'backend': None, 'token2text': False, 'max_samples_per_step_per_worker': None}, 'skip_rollout': False, 'skip_dump_dir': '/tmp/rollout_dump', 'skip_tokenizer_init': True, 'enable_rollout_routing_replay': False, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'prometheus': {'_target_': 'verl.workers.config.PrometheusConfig', 'enable': False, 'port': 9090, 'file': '/tmp/ray/session_latest/metrics/prometheus/prometheus.yml', 'served_model_name': 'Qwen/Qwen3-4B-Instruct-2507'}, 'quantization': None, 'quantization_config_file': None, 'mtp': {'_target_': 'verl.workers.config.MtpConfig', 'enable': False, 'enable_train': False, 'enable_rollout': False, 'detach_encoder': False, 'mtp_loss_scaling_factor': 0.1, 'speculative_algorithm': 'EAGLE', 'speculative_num_steps': 3, 'speculative_eagle_topk': 1, 'speculative_num_draft_tokens': 4, 'method': 'mtp', 'num_speculative_tokens': 1}, 'layered_summon': False}, 'model': {'_target_': 'verl.workers.config.HFModelConfig', 
'path': 'Qwen/Qwen3-4B-Instruct-2507', 'hf_config_path': None, 'tokenizer_path': None, 'use_shm': False, 'trust_remote_code': False, 'custom_chat_template': None, 'external_lib': None, 'override_config': {}, 'enable_gradient_checkpointing': True, 'enable_activation_offload': False, 'use_remove_padding': True, 'lora_rank': 0, 'lora_alpha': 16, 'target_modules': 'all-linear', 'exclude_modules': None, 'lora_adapter_path': None, 'use_liger': False, 'use_fused_kernels': False, 'fused_kernel_options': {'impl_backend': 'torch'}, 'tiled_mlp': {'enabled': False, 'num_shards': 4}, 'mtp': {'_target_': 'verl.workers.config.MtpConfig', 'enable': False, 'enable_train': False, 'enable_rollout': False, 'detach_encoder': False, 'mtp_loss_scaling_factor': 0.1, 'speculative_algorithm': 'EAGLE', 'speculative_num_steps': 3, 'speculative_eagle_topk': 1, 'speculative_num_draft_tokens': 4, 'method': 'mtp', 'num_speculative_tokens': 1}}, 'hybrid_engine': True, 'nccl_timeout': 600}, 'data': {'tokenizer': None, 'use_shm': False, 'train_files': '/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet', 'val_files': '/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet', 'train_max_samples': -1, 'val_max_samples': -1, 'prompt_key': 'prompt', 'reward_fn_key': 'data_source', 'max_prompt_length': 512, 'max_response_length': 768, 'train_batch_size': 512, 'val_batch_size': None, 'tool_config_path': None, 'return_raw_input_ids': False, 'return_raw_chat': True, 'return_full_prompt': False, 'shuffle': True, 'seed': None, 'dataloader_num_workers': 8, 'image_patch_size': 14, 'validation_shuffle': False, 'filter_overlong_prompts': True, 'filter_overlong_prompts_workers': 1, 'truncation': 'error', 'image_key': 'images', 'video_key': 'videos', 'trust_remote_code': False, 'custom_cls': {'path': None, 'name': None}, 'return_multi_modal_inputs': True, 'sampler': {'class_path': None, 'class_name': None}, 'datagen': {'path': None, 'name': None}, 
'apply_chat_template_kwargs': {}}, 'reward_manager': {'_target_': 'verl.trainer.config.config.RewardManagerConfig', 'source': 'register', 'name': 'naive', 'module': {'_target_': 'verl.trainer.config.config.ModuleConfig', 'path': None, 'name': 'custom_reward_manager'}}, 'critic': {'optim': {'_target_': 'verl.workers.config.FSDPOptimizerConfig', 'optimizer': 'AdamW', 'optimizer_impl': 'torch.optim', 'lr': 1e-05, 'lr_warmup_steps_ratio': 0.0, 'total_training_steps': 90, 'weight_decay': 0.01, 'lr_warmup_steps': -1, 'betas': [0.9, 0.999], 'clip_grad': 1.0, 'min_lr_ratio': 0.0, 'num_cycles': 0.5, 'lr_scheduler_type': 'constant', 'warmup_style': None, 'override_optimizer_config': None}, 'model': {'fsdp_config': {'_target_': 'verl.workers.config.FSDPEngineConfig', 'wrap_policy': {'min_num_params': 0}, 'param_offload': False, 'optimizer_offload': False, 'offload_policy': False, 'reshard_after_forward': True, 'fsdp_size': -1, 'forward_prefetch': False, 'model_dtype': 'fp32', 'use_orig_params': False, 'seed': 42, 'full_determinism': False, 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'use_torch_compile': True, 'entropy_checkpointing': False, 'forward_only': False, 'strategy': 'fsdp', 'dtype': 'bfloat16'}, 'path': '~/models/deepseek-llm-7b-chat', 'tokenizer_path': 'Qwen/Qwen3-4B-Instruct-2507', 'override_config': {}, 'external_lib': None, 'trust_remote_code': False, '_target_': 'verl.workers.config.FSDPCriticModelCfg', 'use_shm': False, 'enable_gradient_checkpointing': True, 'enable_activation_offload': False, 'use_remove_padding': False, 'lora_rank': 0, 'lora_alpha': 16, 'target_modules': 'all-linear', 'tiled_mlp': {'enabled': False, 'num_shards': 4}}, '_target_': 'verl.workers.config.FSDPCriticConfig', 'rollout_n': 3, 'strategy': 'fsdp', 'enable': None, 'ppo_mini_batch_size': 256, 'ppo_micro_batch_size': None, 'ppo_micro_batch_size_per_gpu': None, 'use_dynamic_bsz': False, 'ppo_max_token_len_per_gpu': 32768, 
'forward_max_token_len_per_gpu': 32768, 'ppo_epochs': 1, 'shuffle': False, 'data_loader_seed': 42, 'cliprange_value': 0.5, 'loss_agg_mode': 'token-mean', 'checkpoint': {'_target_': 'verl.trainer.config.CheckpointConfig', 'save_contents': ['model', 'optimizer', 'extra'], 'load_contents': ['model', 'optimizer', 'extra'], 'async_save': False}, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'forward_micro_batch_size': None, 'forward_micro_batch_size_per_gpu': None, 'ulysses_sequence_parallel_size': 1, 'grad_clip': 1.0}, 'reward_model': {'enable': False, 'enable_resource_pool': False, 'n_gpus_per_node': 8, 'nnodes': 0, 'strategy': 'fsdp', 'model': {'input_tokenizer': 'Qwen/Qwen3-4B-Instruct-2507', 'path': '~/models/FsfairX-LLaMA3-RM-v0.1', 'external_lib': None, 'trust_remote_code': False, 'override_config': {}, 'use_shm': False, 'use_remove_padding': False, 'use_fused_kernels': False, 'fsdp_config': {'_target_': 'verl.workers.config.FSDPEngineConfig', 'wrap_policy': {'min_num_params': 0}, 'param_offload': False, 'reshard_after_forward': True, 'fsdp_size': -1, 'forward_prefetch': False}}, 'micro_batch_size': None, 'micro_batch_size_per_gpu': None, 'max_length': None, 'use_dynamic_bsz': False, 'forward_max_token_len_per_gpu': 32768, 'reward_manager': 'naive', 'reward_loop_source': 'register', 'reward_loop_module_path': None, 'reward_loop_class_name': None, 
'launch_reward_fn_async': False, 'sandbox_fusion': {'url': None, 'max_concurrent': 64, 'memory_limit_mb': 1024}, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'ulysses_sequence_parallel_size': 1, 'use_reward_loop': True, 'num_workers': 1, 'rollout': {'_target_': 'verl.workers.config.RolloutConfig', 'name': '???', 'dtype': 'bfloat16', 'gpu_memory_utilization': 0.5, 'enforce_eager': True, 'cudagraph_capture_sizes': None, 'free_cache_engine': True, 'data_parallel_size': 1, 'expert_parallel_size': 1, 'tensor_model_parallel_size': 2, 'max_num_batched_tokens': 8192, 'max_model_len': None, 'max_num_seqs': 1024, 'load_format': 'auto', 'engine_kwargs': {}, 'limit_images': None, 'enable_chunked_prefill': True, 'enable_prefix_caching': True, 'disable_log_stats': True, 'skip_tokenizer_init': False, 'prompt_length': 2048, 'response_length': 2048}}, 'algorithm': {'rollout_correction': {'rollout_is': None, 'rollout_is_threshold': 2.0, 'rollout_rs': None, 'rollout_rs_threshold': None, 'bypass_mode': False, 'loss_type': 'ppo_clip', 'rollout_is_batch_normalize': False}, '_target_': 'verl.trainer.config.AlgoConfig', 'gamma': 1.0, 'lam': 1.0, 'adv_estimator': 'grpo', 'norm_adv_by_std_in_grpo': True, 'use_kl_in_reward': False, 'kl_penalty': 'kl', 'kl_ctrl': {'_target_': 'verl.trainer.config.KLControlConfig', 'type': 'fixed', 'kl_coef': 0.001, 'horizon': 10000, 'target_kl': 0.1}, 'use_pf_ppo': 
False, 'pf_ppo': {'reweight_method': 'pow', 'weight_pow': 2.0}}, 'custom_reward_function': {'path': '/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py', 'name': 'compute_score'}, 'trainer': {'balance_batch': True, 'total_epochs': 15, 'total_training_steps': None, 'project_name': 'readctrl-verl', 'experiment_name': 'qwen3-4b-instruct-optimized-multiclinsum-gs', 'logger': ['console', 'wandb'], 'log_val_generations': 0, 'rollout_data_dir': None, 'validation_data_dir': None, 'nnodes': 1, 'n_gpus_per_node': 2, 'save_freq': 20, 'esi_redundant_time': 0, 'resume_mode': 'auto', 'resume_from_path': None, 'val_before_train': True, 'val_only': False, 'test_freq': 5, 'critic_warmup': 0, 'default_hdfs_dir': None, 'del_local_ckpt_after_load': False, 'default_local_dir': 'checkpoints/readctrl-verl/qwen3-4b-instruct-optimized-multiclinsum-gs', 'max_actor_ckpt_to_keep': None, 'max_critic_ckpt_to_keep': None, 'ray_wait_register_center_timeout': 300, 'device': 'cuda', 'use_legacy_worker_impl': 'auto'}, 'global_profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'steps': None, 'profile_continuous_steps': False, 'save_path': 'outputs/profile', 'global_tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False, 'controller_nsight_options': {'trace': 'cuda,nvtx,cublas,ucx', 'cuda-memory-usage': 'true', 'cuda-graph-trace': 'graph'}, 'worker_nsight_options': {'trace': 'cuda,nvtx,cublas,ucx', 'cuda-memory-usage': 'true', 'cuda-graph-trace': 'graph', 'capture-range': 'cudaProfilerApi', 'capture-range-end': None, 'kill': 'none'}}, 'torch_memory': {'trace_alloc_max_entries': 100000, 'stack_depth': 32, 'context': 'all', 'stacks': 'all', 'kw_args': {}}}}, 'transfer_queue': {'enable': False}, 'ray_kwargs': {'ray_init': {'num_cpus': None}, 'timeline_json_file': None}, '_wandb': {}} +2026-02-02 01:01:58,012 INFO MainThread:2126993 [wandb_init.py:init():892] starting backend +2026-02-02 01:01:58,312 INFO 
MainThread:2126993 [wandb_init.py:init():895] sending inform_init request +2026-02-02 01:01:58,320 INFO MainThread:2126993 [wandb_init.py:init():903] backend started and connected +2026-02-02 01:01:58,333 INFO MainThread:2126993 [wandb_init.py:init():973] updated telemetry +2026-02-02 01:01:58,360 INFO MainThread:2126993 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout +2026-02-02 01:02:00,586 INFO MainThread:2126993 [wandb_init.py:init():1042] starting run threads in backend +2026-02-02 01:02:02,118 INFO MainThread:2126993 [wandb_run.py:_console_start():2529] atexit reg +2026-02-02 01:02:02,118 INFO MainThread:2126993 [wandb_run.py:_redirect():2377] redirect: wrap_raw +2026-02-02 01:02:02,119 INFO MainThread:2126993 [wandb_run.py:_redirect():2446] Wrapping output streams. +2026-02-02 01:02:02,119 INFO MainThread:2126993 [wandb_run.py:_redirect():2469] Redirects installed. +2026-02-02 01:02:02,129 INFO MainThread:2126993 [wandb_init.py:init():1082] run started, returning control to user process +2026-02-02 01:03:22,632 INFO MainThread:2126993 [wandb_run.py:_finish():2295] finishing run shahidulshakib034-khulna-university-of-engineering-techn/readctrl-verl/i30r1tt3 +2026-02-02 01:03:22,633 INFO MainThread:2126993 [wandb_run.py:_atexit_cleanup():2494] got exitcode: 0 +2026-02-02 01:03:22,634 INFO MainThread:2126993 [wandb_run.py:_restore():2476] restore +2026-02-02 01:03:22,635 INFO MainThread:2126993 [wandb_run.py:_restore():2482] restore done +2026-02-02 01:03:23,240 INFO MainThread:2126993 [wandb_run.py:_footer_sync_info():3871] logging synced files diff --git a/code/RL_model/verl/verl_train/wandb/run-20260202_010157-i30r1tt3/run-i30r1tt3.wandb b/code/RL_model/verl/verl_train/wandb/run-20260202_010157-i30r1tt3/run-i30r1tt3.wandb new file mode 100644 index 0000000000000000000000000000000000000000..e9083ae61577c444c4e180ce249e0606e1b57eb6 Binary files /dev/null and 
b/code/RL_model/verl/verl_train/wandb/run-20260202_010157-i30r1tt3/run-i30r1tt3.wandb differ diff --git a/code/RL_model/verl/verl_train/wandb/run-20260202_092950-nfoupjps/files/output.log b/code/RL_model/verl/verl_train/wandb/run-20260202_092950-nfoupjps/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..8c4c4c7448e73ff9601cde3741747ad83fae3182 --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260202_092950-nfoupjps/files/output.log @@ -0,0 +1,14 @@ +wandb: Detected [openai] in use. +wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script. +wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/ +Checkpoint tracker file does not exist: /home/mshahidul/readctrl/code/RL_model/RL_model/latest_checkpointed_iteration.txt +Training from scratch +test_gen_batch meta info: {'eos_token_id': 151645, 'pad_token_id': 151643, 'recompute_log_prob': False, 'do_sample': False, 'validate': True, 'global_steps': 0} +validation generation end +("Initial validation metrics: {'val-aux/multiclinsum/reward/mean@1': " + "np.float64(-4.696390149172615), 'val-core/multiclinsum/acc/mean@1': " + "np.float64(-4.696390188374168), 'val-aux/num_turns/min': np.int32(2), " + "'val-aux/num_turns/max': np.int32(2), 'val-aux/num_turns/mean': " + 'np.float64(2.0)}') +step:0 - val-aux/multiclinsum/reward/mean@1:np.float64(-4.696390149172615) - val-core/multiclinsum/acc/mean@1:np.float64(-4.696390188374168) - val-aux/num_turns/min:np.int32(2) - val-aux/num_turns/max:np.int32(2) - val-aux/num_turns/mean:np.float64(2.0) +Training Progress: 0%| | 0/90 [00:00\n\n502 Server Error\n\n\n

Error: Server Error

\n

The server encountered a temporary error and could not complete your request.

Please try again in 30 seconds.

\n

\n\n"} +{"time":"2026-02-12T19:59:33.769556165-05:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/shahidulshakib034-khulna-university-of-engineering-techn/readctrl-verl/cje0bmdl/file_stream","body":"\n\n\n502 Server Error\n\n\n

Error: Server Error

\n

The server encountered a temporary error and could not complete your request.

Please try again in 30 seconds.

\n

\n\n"} +{"time":"2026-02-12T20:34:48.778448264-05:00","level":"INFO","msg":"api: retrying HTTP error","status":502,"url":"https://api.wandb.ai/files/shahidulshakib034-khulna-university-of-engineering-techn/readctrl-verl/cje0bmdl/file_stream","body":"\n\n\n502 Server Error\n\n\n

Error: Server Error

\n

The server encountered a temporary error and could not complete your request.

Please try again in 30 seconds.

\n

\n\n"} +{"time":"2026-02-12T23:45:58.600140119-05:00","level":"INFO","msg":"fileTransfer: Close: file transfer manager closed"} +{"time":"2026-02-12T23:45:58.759458619-05:00","level":"INFO","msg":"handler: operation stats","stats":{}} +{"time":"2026-02-12T23:45:58.771998992-05:00","level":"INFO","msg":"stream: closing","id":"cje0bmdl"} +{"time":"2026-02-12T23:45:58.77201155-05:00","level":"INFO","msg":"handler: closed","stream_id":"cje0bmdl"} +{"time":"2026-02-12T23:45:58.774612639-05:00","level":"INFO","msg":"sender: closed","stream_id":"cje0bmdl"} +{"time":"2026-02-12T23:45:58.774635954-05:00","level":"INFO","msg":"stream: closed","id":"cje0bmdl"} diff --git a/code/RL_model/verl/verl_train/wandb/run-20260211_190231-cje0bmdl/logs/debug.log b/code/RL_model/verl/verl_train/wandb/run-20260211_190231-cje0bmdl/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..863920e7baffbd9cf54287c6f19a9b4ea4c4ea58 --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260211_190231-cje0bmdl/logs/debug.log @@ -0,0 +1,24 @@ +2026-02-11 19:02:31,742 INFO MainThread:638572 [wandb_setup.py:_flush():81] Current SDK version is 0.24.1 +2026-02-11 19:02:31,742 INFO MainThread:638572 [wandb_setup.py:_flush():81] Configure stats pid to 638572 +2026-02-11 19:02:31,742 INFO MainThread:638572 [wandb_setup.py:_flush():81] Loading settings from environment variables +2026-02-11 19:02:31,742 INFO MainThread:638572 [wandb_init.py:setup_run_log_directory():717] Logging user logs to /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/wandb/run-20260211_190231-cje0bmdl/logs/debug.log +2026-02-11 19:02:31,742 INFO MainThread:638572 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/wandb/run-20260211_190231-cje0bmdl/logs/debug-internal.log +2026-02-11 19:02:31,742 INFO MainThread:638572 [wandb_init.py:init():844] calling init triggers +2026-02-11 19:02:31,743 INFO 
MainThread:638572 [wandb_init.py:init():849] wandb.init called with sweep_config: {} +config: {'actor_rollout_ref': {'actor': {'optim': {'_target_': 'verl.workers.config.FSDPOptimizerConfig', 'optimizer': 'AdamW', 'optimizer_impl': 'torch.optim', 'lr': 1e-06, 'lr_warmup_steps_ratio': 0.0, 'total_training_steps': 45, 'weight_decay': 0.01, 'lr_warmup_steps': -1, 'betas': [0.9, 0.999], 'clip_grad': 1.0, 'min_lr_ratio': 0.0, 'num_cycles': 0.5, 'lr_scheduler_type': 'constant', 'warmup_style': None, 'override_optimizer_config': None}, 'fsdp_config': {'_target_': 'verl.workers.config.FSDPEngineConfig', 'wrap_policy': {'min_num_params': 0}, 'param_offload': True, 'optimizer_offload': True, 'offload_policy': False, 'reshard_after_forward': True, 'fsdp_size': -1, 'forward_prefetch': False, 'model_dtype': 'fp32', 'use_orig_params': False, 'seed': 42, 'full_determinism': False, 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'use_torch_compile': True, 'entropy_checkpointing': False, 'forward_only': False, 'strategy': 'fsdp', 'dtype': 'bfloat16'}, '_target_': 'verl.workers.config.FSDPActorConfig', 'rollout_n': 3, 'strategy': 'fsdp', 'ppo_mini_batch_size': 256, 'ppo_micro_batch_size': None, 'ppo_micro_batch_size_per_gpu': 16, 'use_dynamic_bsz': False, 'ppo_max_token_len_per_gpu': 16384, 'clip_ratio': 0.2, 'clip_ratio_low': 0.2, 'clip_ratio_high': 0.2, 'tau_pos': 1.0, 'tau_neg': 1.05, 'freeze_vision_tower': False, 'policy_loss': {'_target_': 'verl.workers.config.PolicyLossConfig', 'loss_mode': 'vanilla', 'clip_cov_ratio': 0.0002, 'clip_cov_lb': 1.0, 'clip_cov_ub': 5.0, 'kl_cov_ratio': 0.0002, 'ppo_kl_coef': 0.1}, 'clip_ratio_c': 3.0, 'loss_agg_mode': 'token-mean', 'loss_scale_factor': None, 'entropy_coeff': 0, 'calculate_entropy': False, 'use_kl_loss': True, 'use_prefix_grouper': False, 'use_torch_compile': True, 'kl_loss_coef': 0.001, 'kl_loss_type': 'low_var_kl', 'ppo_epochs': 1, 'shuffle': False, 'data_loader_seed': 42, 'checkpoint': 
{'_target_': 'verl.trainer.config.CheckpointConfig', 'save_contents': ['model', 'optimizer', 'extra'], 'load_contents': ['model', 'optimizer', 'extra'], 'async_save': False}, 'use_fused_kernels': False, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'router_replay': {'_target_': 'verl.workers.config.RouterReplayConfig', 'mode': 'disabled', 'record_file': None, 'replay_file': None}, 'grad_clip': 1.0, 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'entropy_checkpointing': False, 'use_remove_padding': True, 'calculate_sum_pi_squared': False, 'sum_pi_squared_checkpointing': False}, 'ref': {'rollout_n': 3, 'strategy': 'fsdp', 'use_torch_compile': True, 'log_prob_micro_batch_size': None, 'log_prob_micro_batch_size_per_gpu': 32, 'log_prob_use_dynamic_bsz': False, 'log_prob_max_token_len_per_gpu': 16384, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 
'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'router_replay': {'_target_': 'verl.workers.config.RouterReplayConfig', 'mode': 'disabled', 'record_file': None, 'replay_file': None}, 'fsdp_config': {'_target_': 'verl.workers.config.FSDPEngineConfig', 'wrap_policy': {'min_num_params': 0}, 'param_offload': True, 'optimizer_offload': False, 'offload_policy': False, 'reshard_after_forward': True, 'fsdp_size': -1, 'forward_prefetch': False, 'model_dtype': 'fp32', 'use_orig_params': False, 'seed': 42, 'full_determinism': False, 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'use_torch_compile': True, 'entropy_checkpointing': False, 'forward_only': True, 'strategy': 'fsdp', 'dtype': 'bfloat16'}, '_target_': 'verl.workers.config.FSDPActorConfig', 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'entropy_checkpointing': False}, 'rollout': {'_target_': 'verl.workers.config.RolloutConfig', 'name': 'vllm', 'mode': 'async', 'temperature': 1.0, 'top_k': -1, 'top_p': 1, 'prompt_length': 1024, 'response_length': 2048, 'dtype': 'bfloat16', 'gpu_memory_utilization': 0.4, 'ignore_eos': False, 'enforce_eager': True, 'cudagraph_capture_sizes': None, 'free_cache_engine': True, 'tensor_model_parallel_size': 1, 'data_parallel_size': 1, 'expert_parallel_size': 1, 'pipeline_model_parallel_size': 1, 'max_num_batched_tokens': 8192, 'max_model_len': 8192, 'max_num_seqs': 1024, 'enable_chunked_prefill': True, 'enable_prefix_caching': True, 'logprobs_mode': 'processed_logprobs', 'scheduling_policy': 'fcfs', 'load_format': 'dummy', 'log_prob_micro_batch_size': None, 'log_prob_micro_batch_size_per_gpu': 32, 'log_prob_use_dynamic_bsz': False, 'log_prob_max_token_len_per_gpu': 16384, 'disable_log_stats': True, 'do_sample': True, 'n': 3, 'over_sample_rate': 0, 'multi_stage_wake_up': False, 'engine_kwargs': {'vllm': {}, 'sglang': {}, 'trtllm': {}}, 'val_kwargs': {'_target_': 
'verl.workers.config.SamplingConfig', 'top_k': -1, 'top_p': 1.0, 'temperature': 0, 'n': 1, 'do_sample': False}, 'multi_turn': {'_target_': 'verl.workers.config.MultiTurnConfig', 'enable': False, 'max_assistant_turns': None, 'tool_config_path': None, 'max_user_turns': None, 'max_parallel_calls': 1, 'max_tool_response_length': 256, 'tool_response_truncate_side': 'middle', 'interaction_config_path': None, 'use_inference_chat_template': False, 'tokenization_sanity_check_mode': 'strict', 'format': 'hermes', 'num_repeat_rollouts': None}, 'calculate_log_probs': False, 'agent': {'_target_': 'verl.workers.config.AgentLoopConfig', 'num_workers': 8, 'default_agent_loop': 'single_turn_agent', 'agent_loop_config_path': None, 'custom_async_server': {'_target_': 'verl.workers.config.CustomAsyncServerConfig', 'path': None, 'name': None}}, 'checkpoint_engine': {'_target_': 'verl.workers.config.CheckpointEngineConfig', 'backend': 'naive', 'update_weights_bucket_megabytes': 2048, 'engine_kwargs': {}}, 'trace': {'_target_': 'verl.workers.config.TraceConfig', 'backend': None, 'token2text': False, 'max_samples_per_step_per_worker': None}, 'skip_rollout': False, 'skip_dump_dir': '/tmp/rollout_dump', 'skip_tokenizer_init': True, 'enable_rollout_routing_replay': False, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'prometheus': {'_target_': 'verl.workers.config.PrometheusConfig', 'enable': False, 
'port': 9090, 'file': '/tmp/ray/session_latest/metrics/prometheus/prometheus.yml', 'served_model_name': 'Qwen/Qwen3-4B-Instruct-2507'}, 'quantization': None, 'quantization_config_file': None, 'mtp': {'_target_': 'verl.workers.config.MtpConfig', 'enable': False, 'enable_train': False, 'enable_rollout': False, 'detach_encoder': False, 'mtp_loss_scaling_factor': 0.1, 'speculative_algorithm': 'EAGLE', 'speculative_num_steps': 3, 'speculative_eagle_topk': 1, 'speculative_num_draft_tokens': 4, 'method': 'mtp', 'num_speculative_tokens': 1}, 'layered_summon': False}, 'model': {'_target_': 'verl.workers.config.HFModelConfig', 'path': 'Qwen/Qwen3-4B-Instruct-2507', 'hf_config_path': None, 'tokenizer_path': None, 'use_shm': False, 'trust_remote_code': False, 'custom_chat_template': None, 'external_lib': None, 'override_config': {}, 'enable_gradient_checkpointing': True, 'enable_activation_offload': False, 'use_remove_padding': True, 'lora_rank': 0, 'lora_alpha': 16, 'target_modules': 'all-linear', 'exclude_modules': None, 'lora_adapter_path': None, 'use_liger': False, 'use_fused_kernels': False, 'fused_kernel_options': {'impl_backend': 'torch'}, 'tiled_mlp': {'enabled': False, 'num_shards': 4}, 'mtp': {'_target_': 'verl.workers.config.MtpConfig', 'enable': False, 'enable_train': False, 'enable_rollout': False, 'detach_encoder': False, 'mtp_loss_scaling_factor': 0.1, 'speculative_algorithm': 'EAGLE', 'speculative_num_steps': 3, 'speculative_eagle_topk': 1, 'speculative_num_draft_tokens': 4, 'method': 'mtp', 'num_speculative_tokens': 1}}, 'hybrid_engine': True, 'nccl_timeout': 600}, 'data': {'tokenizer': None, 'use_shm': False, 'train_files': '/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet', 'val_files': '/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet', 'train_max_samples': -1, 'val_max_samples': -1, 'prompt_key': 'prompt', 'reward_fn_key': 'data_source', 'max_prompt_length': 1024, 'max_response_length': 2048, 
'train_batch_size': 512, 'val_batch_size': None, 'tool_config_path': None, 'return_raw_input_ids': False, 'return_raw_chat': True, 'return_full_prompt': False, 'shuffle': True, 'seed': None, 'dataloader_num_workers': 8, 'image_patch_size': 14, 'validation_shuffle': False, 'filter_overlong_prompts': True, 'filter_overlong_prompts_workers': 1, 'truncation': 'error', 'image_key': 'images', 'video_key': 'videos', 'trust_remote_code': False, 'custom_cls': {'path': None, 'name': None}, 'return_multi_modal_inputs': True, 'sampler': {'class_path': None, 'class_name': None}, 'datagen': {'path': None, 'name': None}, 'apply_chat_template_kwargs': {}}, 'reward_manager': {'_target_': 'verl.trainer.config.config.RewardManagerConfig', 'source': 'register', 'name': 'naive', 'module': {'_target_': 'verl.trainer.config.config.ModuleConfig', 'path': None, 'name': 'custom_reward_manager'}}, 'critic': {'optim': {'_target_': 'verl.workers.config.FSDPOptimizerConfig', 'optimizer': 'AdamW', 'optimizer_impl': 'torch.optim', 'lr': 1e-05, 'lr_warmup_steps_ratio': 0.0, 'total_training_steps': 45, 'weight_decay': 0.01, 'lr_warmup_steps': -1, 'betas': [0.9, 0.999], 'clip_grad': 1.0, 'min_lr_ratio': 0.0, 'num_cycles': 0.5, 'lr_scheduler_type': 'constant', 'warmup_style': None, 'override_optimizer_config': None}, 'model': {'fsdp_config': {'_target_': 'verl.workers.config.FSDPEngineConfig', 'wrap_policy': {'min_num_params': 0}, 'param_offload': False, 'optimizer_offload': False, 'offload_policy': False, 'reshard_after_forward': True, 'fsdp_size': -1, 'forward_prefetch': False, 'model_dtype': 'fp32', 'use_orig_params': False, 'seed': 42, 'full_determinism': False, 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'use_torch_compile': True, 'entropy_checkpointing': False, 'forward_only': False, 'strategy': 'fsdp', 'dtype': 'bfloat16'}, 'path': '~/models/deepseek-llm-7b-chat', 'tokenizer_path': 'Qwen/Qwen3-4B-Instruct-2507', 'override_config': {}, 'external_lib': None, 
'trust_remote_code': False, '_target_': 'verl.workers.config.FSDPCriticModelCfg', 'use_shm': False, 'enable_gradient_checkpointing': True, 'enable_activation_offload': False, 'use_remove_padding': False, 'lora_rank': 0, 'lora_alpha': 16, 'target_modules': 'all-linear', 'tiled_mlp': {'enabled': False, 'num_shards': 4}}, '_target_': 'verl.workers.config.FSDPCriticConfig', 'rollout_n': 3, 'strategy': 'fsdp', 'enable': None, 'ppo_mini_batch_size': 256, 'ppo_micro_batch_size': None, 'ppo_micro_batch_size_per_gpu': None, 'use_dynamic_bsz': False, 'ppo_max_token_len_per_gpu': 32768, 'forward_max_token_len_per_gpu': 32768, 'ppo_epochs': 1, 'shuffle': False, 'data_loader_seed': 42, 'cliprange_value': 0.5, 'loss_agg_mode': 'token-mean', 'checkpoint': {'_target_': 'verl.trainer.config.CheckpointConfig', 'save_contents': ['model', 'optimizer', 'extra'], 'load_contents': ['model', 'optimizer', 'extra'], 'async_save': False}, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'forward_micro_batch_size': None, 'forward_micro_batch_size_per_gpu': None, 'ulysses_sequence_parallel_size': 1, 'grad_clip': 1.0}, 'reward_model': {'enable': False, 'enable_resource_pool': False, 'n_gpus_per_node': 8, 'nnodes': 0, 'strategy': 'fsdp', 'model': {'input_tokenizer': 'Qwen/Qwen3-4B-Instruct-2507', 'path': '~/models/FsfairX-LLaMA3-RM-v0.1', 'external_lib': None, 'trust_remote_code': False, 'override_config': 
{}, 'use_shm': False, 'use_remove_padding': False, 'use_fused_kernels': False, 'fsdp_config': {'_target_': 'verl.workers.config.FSDPEngineConfig', 'wrap_policy': {'min_num_params': 0}, 'param_offload': False, 'reshard_after_forward': True, 'fsdp_size': -1, 'forward_prefetch': False}}, 'micro_batch_size': None, 'micro_batch_size_per_gpu': None, 'max_length': None, 'use_dynamic_bsz': False, 'forward_max_token_len_per_gpu': 32768, 'reward_manager': 'naive', 'reward_loop_source': 'register', 'reward_loop_module_path': None, 'reward_loop_class_name': None, 'launch_reward_fn_async': False, 'sandbox_fusion': {'url': None, 'max_concurrent': 64, 'memory_limit_mb': 1024}, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'ulysses_sequence_parallel_size': 1, 'use_reward_loop': True, 'num_workers': 1, 'rollout': {'_target_': 'verl.workers.config.RolloutConfig', 'name': '???', 'dtype': 'bfloat16', 'gpu_memory_utilization': 0.5, 'enforce_eager': True, 'cudagraph_capture_sizes': None, 'free_cache_engine': True, 'data_parallel_size': 1, 'expert_parallel_size': 1, 'tensor_model_parallel_size': 2, 'max_num_batched_tokens': 8192, 'max_model_len': None, 'max_num_seqs': 1024, 'load_format': 'auto', 'engine_kwargs': {}, 'limit_images': None, 'enable_chunked_prefill': True, 'enable_prefix_caching': True, 'disable_log_stats': True, 'skip_tokenizer_init': False, 'prompt_length': 2048, 
'response_length': 2048}}, 'algorithm': {'rollout_correction': {'rollout_is': None, 'rollout_is_threshold': 2.0, 'rollout_rs': None, 'rollout_rs_threshold': None, 'bypass_mode': False, 'loss_type': 'ppo_clip', 'rollout_is_batch_normalize': False}, '_target_': 'verl.trainer.config.AlgoConfig', 'gamma': 1.0, 'lam': 1.0, 'adv_estimator': 'grpo', 'norm_adv_by_std_in_grpo': True, 'use_kl_in_reward': False, 'kl_penalty': 'kl', 'kl_ctrl': {'_target_': 'verl.trainer.config.KLControlConfig', 'type': 'fixed', 'kl_coef': 0.001, 'horizon': 10000, 'target_kl': 0.1}, 'use_pf_ppo': False, 'pf_ppo': {'reweight_method': 'pow', 'weight_pow': 2.0}}, 'custom_reward_function': {'path': '/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py', 'name': 'compute_score'}, 'trainer': {'balance_batch': True, 'total_epochs': 15, 'total_training_steps': None, 'project_name': 'readctrl-verl', 'experiment_name': 'qwen3-4b-instruct-en', 'logger': ['console', 'wandb'], 'log_val_generations': 0, 'rollout_data_dir': None, 'validation_data_dir': None, 'nnodes': 1, 'n_gpus_per_node': 2, 'save_freq': 5, 'esi_redundant_time': 0, 'resume_mode': 'auto', 'resume_from_path': None, 'val_before_train': True, 'val_only': False, 'test_freq': 10, 'critic_warmup': 0, 'default_hdfs_dir': None, 'del_local_ckpt_after_load': False, 'default_local_dir': '/home/mshahidul/readctrl/code/RL_model/RL_model_subclaim_classifier', 'max_actor_ckpt_to_keep': 1, 'max_critic_ckpt_to_keep': 1, 'ray_wait_register_center_timeout': 300, 'device': 'cuda', 'use_legacy_worker_impl': 'auto', 'remove_previous_ckpt_in_save': True}, 'global_profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'steps': None, 'profile_continuous_steps': False, 'save_path': 'outputs/profile', 'global_tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False, 'controller_nsight_options': {'trace': 'cuda,nvtx,cublas,ucx', 'cuda-memory-usage': 'true', 'cuda-graph-trace': 
'graph'}, 'worker_nsight_options': {'trace': 'cuda,nvtx,cublas,ucx', 'cuda-memory-usage': 'true', 'cuda-graph-trace': 'graph', 'capture-range': 'cudaProfilerApi', 'capture-range-end': None, 'kill': 'none'}}, 'torch_memory': {'trace_alloc_max_entries': 100000, 'stack_depth': 32, 'context': 'all', 'stacks': 'all', 'kw_args': {}}}}, 'transfer_queue': {'enable': False}, 'ray_kwargs': {'ray_init': {'num_cpus': None}, 'timeline_json_file': None}, '_wandb': {}} +2026-02-11 19:02:31,743 INFO MainThread:638572 [wandb_init.py:init():892] starting backend +2026-02-11 19:02:32,025 INFO MainThread:638572 [wandb_init.py:init():895] sending inform_init request +2026-02-11 19:02:32,031 INFO MainThread:638572 [wandb_init.py:init():903] backend started and connected +2026-02-11 19:02:32,037 INFO MainThread:638572 [wandb_init.py:init():973] updated telemetry +2026-02-11 19:02:32,055 INFO MainThread:638572 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout +2026-02-11 19:02:33,716 INFO MainThread:638572 [wandb_init.py:init():1042] starting run threads in backend +2026-02-11 19:02:34,758 INFO MainThread:638572 [wandb_run.py:_console_start():2529] atexit reg +2026-02-11 19:02:34,758 INFO MainThread:638572 [wandb_run.py:_redirect():2377] redirect: wrap_raw +2026-02-11 19:02:34,759 INFO MainThread:638572 [wandb_run.py:_redirect():2446] Wrapping output streams. +2026-02-11 19:02:34,759 INFO MainThread:638572 [wandb_run.py:_redirect():2469] Redirects installed. 
+2026-02-11 19:02:34,776 INFO MainThread:638572 [wandb_init.py:init():1082] run started, returning control to user process +2026-02-12 23:45:58,010 INFO MainThread:638572 [wandb_run.py:_finish():2295] finishing run shahidulshakib034-khulna-university-of-engineering-techn/readctrl-verl/cje0bmdl +2026-02-12 23:45:58,011 INFO MainThread:638572 [wandb_run.py:_atexit_cleanup():2494] got exitcode: 0 +2026-02-12 23:45:58,013 INFO MainThread:638572 [wandb_run.py:_restore():2476] restore +2026-02-12 23:45:58,013 INFO MainThread:638572 [wandb_run.py:_restore():2482] restore done +2026-02-12 23:45:58,769 INFO MainThread:638572 [wandb_run.py:_footer_sync_info():3871] logging synced files diff --git a/code/RL_model/verl/verl_train/wandb/run-20260213_012459-7qz9wu2i/files/output.log b/code/RL_model/verl/verl_train/wandb/run-20260213_012459-7qz9wu2i/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..fc57f5d3541010cfee0b9b9e49629816d0f96e41 --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260213_012459-7qz9wu2i/files/output.log @@ -0,0 +1,14 @@ +wandb: Detected [dspy, litellm, openai] in use. +wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script. 
+wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/ +Checkpoint tracker file does not exist: /home/mshahidul/readctrl/code/RL_model/models/RL_model_subclaim_classifier_v2/latest_checkpointed_iteration.txt +Training from scratch +test_gen_batch meta info: {'eos_token_id': 151645, 'pad_token_id': 151643, 'recompute_log_prob': False, 'do_sample': False, 'validate': True, 'global_steps': 0} +validation generation end +("Initial validation metrics: {'val-aux/multiclinsum/reward/mean@1': " + "np.float64(2.61803122623238), 'val-core/multiclinsum/acc/mean@1': " + "np.float64(2.6180312253643105), 'val-aux/num_turns/min': np.int32(2), " + "'val-aux/num_turns/max': np.int32(2), 'val-aux/num_turns/mean': " + 'np.float64(2.0)}') +step:0 - val-aux/multiclinsum/reward/mean@1:np.float64(2.61803122623238) - val-core/multiclinsum/acc/mean@1:np.float64(2.6180312253643105) - val-aux/num_turns/min:np.int32(2) - val-aux/num_turns/max:np.int32(2) - val-aux/num_turns/mean:np.float64(2.0) +Training Progress: 0%| | 0/45 [00:00\n\n502 Server Error\n\n\n

Error: Server Error

\n

The server encountered a temporary error and could not complete your request.

Please try again in 30 seconds.

\n

\n\n"} diff --git a/code/RL_model/verl/verl_train/wandb/run-20260213_024109-70p0ly3w/logs/debug.log b/code/RL_model/verl/verl_train/wandb/run-20260213_024109-70p0ly3w/logs/debug.log new file mode 100644 index 0000000000000000000000000000000000000000..9c91a0448d9ab5752b27ae9a355627fb0e0ae4ff --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260213_024109-70p0ly3w/logs/debug.log @@ -0,0 +1,19 @@ +2026-02-13 02:41:09,153 INFO MainThread:4029501 [wandb_setup.py:_flush():81] Current SDK version is 0.24.1 +2026-02-13 02:41:09,153 INFO MainThread:4029501 [wandb_setup.py:_flush():81] Configure stats pid to 4029501 +2026-02-13 02:41:09,153 INFO MainThread:4029501 [wandb_setup.py:_flush():81] Loading settings from environment variables +2026-02-13 02:41:09,153 INFO MainThread:4029501 [wandb_init.py:setup_run_log_directory():717] Logging user logs to /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/wandb/run-20260213_024109-70p0ly3w/logs/debug.log +2026-02-13 02:41:09,153 INFO MainThread:4029501 [wandb_init.py:setup_run_log_directory():718] Logging internal logs to /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/wandb/run-20260213_024109-70p0ly3w/logs/debug-internal.log +2026-02-13 02:41:09,153 INFO MainThread:4029501 [wandb_init.py:init():844] calling init triggers +2026-02-13 02:41:09,154 INFO MainThread:4029501 [wandb_init.py:init():849] wandb.init called with sweep_config: {} +config: {'actor_rollout_ref': {'actor': {'optim': {'_target_': 'verl.workers.config.FSDPOptimizerConfig', 'optimizer': 'AdamW', 'optimizer_impl': 'torch.optim', 'lr': 1e-06, 'lr_warmup_steps_ratio': 0.0, 'total_training_steps': 45, 'weight_decay': 0.01, 'lr_warmup_steps': -1, 'betas': [0.9, 0.999], 'clip_grad': 1.0, 'min_lr_ratio': 0.0, 'num_cycles': 0.5, 'lr_scheduler_type': 'constant', 'warmup_style': None, 'override_optimizer_config': None}, 'fsdp_config': {'_target_': 'verl.workers.config.FSDPEngineConfig', 'wrap_policy': {'min_num_params': 0}, 
'param_offload': True, 'optimizer_offload': True, 'offload_policy': False, 'reshard_after_forward': True, 'fsdp_size': -1, 'forward_prefetch': False, 'model_dtype': 'fp32', 'use_orig_params': False, 'seed': 42, 'full_determinism': False, 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'use_torch_compile': True, 'entropy_checkpointing': False, 'forward_only': False, 'strategy': 'fsdp', 'dtype': 'bfloat16'}, '_target_': 'verl.workers.config.FSDPActorConfig', 'rollout_n': 3, 'strategy': 'fsdp', 'ppo_mini_batch_size': 256, 'ppo_micro_batch_size': None, 'ppo_micro_batch_size_per_gpu': 16, 'use_dynamic_bsz': False, 'ppo_max_token_len_per_gpu': 16384, 'clip_ratio': 0.2, 'clip_ratio_low': 0.2, 'clip_ratio_high': 0.2, 'tau_pos': 1.0, 'tau_neg': 1.05, 'freeze_vision_tower': False, 'policy_loss': {'_target_': 'verl.workers.config.PolicyLossConfig', 'loss_mode': 'vanilla', 'clip_cov_ratio': 0.0002, 'clip_cov_lb': 1.0, 'clip_cov_ub': 5.0, 'kl_cov_ratio': 0.0002, 'ppo_kl_coef': 0.1}, 'clip_ratio_c': 3.0, 'loss_agg_mode': 'token-mean', 'loss_scale_factor': None, 'entropy_coeff': 0, 'calculate_entropy': False, 'use_kl_loss': True, 'use_prefix_grouper': False, 'use_torch_compile': True, 'kl_loss_coef': 0.001, 'kl_loss_type': 'low_var_kl', 'ppo_epochs': 1, 'shuffle': False, 'data_loader_seed': 42, 'checkpoint': {'_target_': 'verl.trainer.config.CheckpointConfig', 'save_contents': ['model', 'optimizer', 'extra'], 'load_contents': ['model', 'optimizer', 'extra'], 'async_save': False}, 'use_fused_kernels': False, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 
'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'router_replay': {'_target_': 'verl.workers.config.RouterReplayConfig', 'mode': 'disabled', 'record_file': None, 'replay_file': None}, 'grad_clip': 1.0, 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'entropy_checkpointing': False, 'use_remove_padding': True, 'calculate_sum_pi_squared': False, 'sum_pi_squared_checkpointing': False}, 'ref': {'rollout_n': 3, 'strategy': 'fsdp', 'use_torch_compile': True, 'log_prob_micro_batch_size': None, 'log_prob_micro_batch_size_per_gpu': 32, 'log_prob_use_dynamic_bsz': False, 'log_prob_max_token_len_per_gpu': 16384, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'router_replay': {'_target_': 'verl.workers.config.RouterReplayConfig', 'mode': 'disabled', 'record_file': None, 'replay_file': None}, 'fsdp_config': {'_target_': 'verl.workers.config.FSDPEngineConfig', 'wrap_policy': {'min_num_params': 0}, 'param_offload': True, 'optimizer_offload': False, 'offload_policy': False, 'reshard_after_forward': True, 'fsdp_size': -1, 'forward_prefetch': False, 'model_dtype': 'fp32', 'use_orig_params': False, 'seed': 42, 'full_determinism': False, 'ulysses_sequence_parallel_size': 1, 
'entropy_from_logits_with_chunking': False, 'use_torch_compile': True, 'entropy_checkpointing': False, 'forward_only': True, 'strategy': 'fsdp', 'dtype': 'bfloat16'}, '_target_': 'verl.workers.config.FSDPActorConfig', 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'entropy_checkpointing': False}, 'rollout': {'_target_': 'verl.workers.config.RolloutConfig', 'name': 'vllm', 'mode': 'async', 'temperature': 1.0, 'top_k': -1, 'top_p': 1, 'prompt_length': 1024, 'response_length': 2048, 'dtype': 'bfloat16', 'gpu_memory_utilization': 0.4, 'ignore_eos': False, 'enforce_eager': True, 'cudagraph_capture_sizes': None, 'free_cache_engine': True, 'tensor_model_parallel_size': 1, 'data_parallel_size': 1, 'expert_parallel_size': 1, 'pipeline_model_parallel_size': 1, 'max_num_batched_tokens': 8192, 'max_model_len': 8192, 'max_num_seqs': 1024, 'enable_chunked_prefill': True, 'enable_prefix_caching': True, 'logprobs_mode': 'processed_logprobs', 'scheduling_policy': 'fcfs', 'load_format': 'dummy', 'log_prob_micro_batch_size': None, 'log_prob_micro_batch_size_per_gpu': 32, 'log_prob_use_dynamic_bsz': False, 'log_prob_max_token_len_per_gpu': 16384, 'disable_log_stats': True, 'do_sample': True, 'n': 3, 'over_sample_rate': 0, 'multi_stage_wake_up': False, 'engine_kwargs': {'vllm': {}, 'sglang': {}, 'trtllm': {}}, 'val_kwargs': {'_target_': 'verl.workers.config.SamplingConfig', 'top_k': -1, 'top_p': 1.0, 'temperature': 0, 'n': 1, 'do_sample': False}, 'multi_turn': {'_target_': 'verl.workers.config.MultiTurnConfig', 'enable': False, 'max_assistant_turns': None, 'tool_config_path': None, 'max_user_turns': None, 'max_parallel_calls': 1, 'max_tool_response_length': 256, 'tool_response_truncate_side': 'middle', 'interaction_config_path': None, 'use_inference_chat_template': False, 'tokenization_sanity_check_mode': 'strict', 'format': 'hermes', 'num_repeat_rollouts': None}, 'calculate_log_probs': False, 'agent': {'_target_': 
'verl.workers.config.AgentLoopConfig', 'num_workers': 8, 'default_agent_loop': 'single_turn_agent', 'agent_loop_config_path': None, 'custom_async_server': {'_target_': 'verl.workers.config.CustomAsyncServerConfig', 'path': None, 'name': None}}, 'checkpoint_engine': {'_target_': 'verl.workers.config.CheckpointEngineConfig', 'backend': 'naive', 'update_weights_bucket_megabytes': 2048, 'engine_kwargs': {}}, 'trace': {'_target_': 'verl.workers.config.TraceConfig', 'backend': None, 'token2text': False, 'max_samples_per_step_per_worker': None}, 'skip_rollout': False, 'skip_dump_dir': '/tmp/rollout_dump', 'skip_tokenizer_init': True, 'enable_rollout_routing_replay': False, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'prometheus': {'_target_': 'verl.workers.config.PrometheusConfig', 'enable': False, 'port': 9090, 'file': '/tmp/ray/session_latest/metrics/prometheus/prometheus.yml', 'served_model_name': 'Qwen/Qwen3-4B-Instruct-2507'}, 'quantization': None, 'quantization_config_file': None, 'mtp': {'_target_': 'verl.workers.config.MtpConfig', 'enable': False, 'enable_train': False, 'enable_rollout': False, 'detach_encoder': False, 'mtp_loss_scaling_factor': 0.1, 'speculative_algorithm': 'EAGLE', 'speculative_num_steps': 3, 'speculative_eagle_topk': 1, 'speculative_num_draft_tokens': 4, 'method': 'mtp', 'num_speculative_tokens': 1}, 'layered_summon': False}, 'model': {'_target_': 
'verl.workers.config.HFModelConfig', 'path': 'Qwen/Qwen3-4B-Instruct-2507', 'hf_config_path': None, 'tokenizer_path': None, 'use_shm': False, 'trust_remote_code': False, 'custom_chat_template': None, 'external_lib': None, 'override_config': {}, 'enable_gradient_checkpointing': True, 'enable_activation_offload': False, 'use_remove_padding': True, 'lora_rank': 0, 'lora_alpha': 16, 'target_modules': 'all-linear', 'exclude_modules': None, 'lora_adapter_path': None, 'use_liger': False, 'use_fused_kernels': False, 'fused_kernel_options': {'impl_backend': 'torch'}, 'tiled_mlp': {'enabled': False, 'num_shards': 4}, 'mtp': {'_target_': 'verl.workers.config.MtpConfig', 'enable': False, 'enable_train': False, 'enable_rollout': False, 'detach_encoder': False, 'mtp_loss_scaling_factor': 0.1, 'speculative_algorithm': 'EAGLE', 'speculative_num_steps': 3, 'speculative_eagle_topk': 1, 'speculative_num_draft_tokens': 4, 'method': 'mtp', 'num_speculative_tokens': 1}}, 'hybrid_engine': True, 'nccl_timeout': 600}, 'data': {'tokenizer': None, 'use_shm': False, 'train_files': '/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet', 'val_files': '/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet', 'train_max_samples': -1, 'val_max_samples': -1, 'prompt_key': 'prompt', 'reward_fn_key': 'data_source', 'max_prompt_length': 1024, 'max_response_length': 2048, 'train_batch_size': 512, 'val_batch_size': None, 'tool_config_path': None, 'return_raw_input_ids': False, 'return_raw_chat': True, 'return_full_prompt': False, 'shuffle': True, 'seed': None, 'dataloader_num_workers': 8, 'image_patch_size': 14, 'validation_shuffle': False, 'filter_overlong_prompts': True, 'filter_overlong_prompts_workers': 1, 'truncation': 'error', 'image_key': 'images', 'video_key': 'videos', 'trust_remote_code': False, 'custom_cls': {'path': None, 'name': None}, 'return_multi_modal_inputs': True, 'sampler': {'class_path': None, 'class_name': None}, 'datagen': {'path': 
None, 'name': None}, 'apply_chat_template_kwargs': {}}, 'reward_manager': {'_target_': 'verl.trainer.config.config.RewardManagerConfig', 'source': 'register', 'name': 'naive', 'module': {'_target_': 'verl.trainer.config.config.ModuleConfig', 'path': None, 'name': 'custom_reward_manager'}}, 'critic': {'optim': {'_target_': 'verl.workers.config.FSDPOptimizerConfig', 'optimizer': 'AdamW', 'optimizer_impl': 'torch.optim', 'lr': 1e-05, 'lr_warmup_steps_ratio': 0.0, 'total_training_steps': 45, 'weight_decay': 0.01, 'lr_warmup_steps': -1, 'betas': [0.9, 0.999], 'clip_grad': 1.0, 'min_lr_ratio': 0.0, 'num_cycles': 0.5, 'lr_scheduler_type': 'constant', 'warmup_style': None, 'override_optimizer_config': None}, 'model': {'fsdp_config': {'_target_': 'verl.workers.config.FSDPEngineConfig', 'wrap_policy': {'min_num_params': 0}, 'param_offload': False, 'optimizer_offload': False, 'offload_policy': False, 'reshard_after_forward': True, 'fsdp_size': -1, 'forward_prefetch': False, 'model_dtype': 'fp32', 'use_orig_params': False, 'seed': 42, 'full_determinism': False, 'ulysses_sequence_parallel_size': 1, 'entropy_from_logits_with_chunking': False, 'use_torch_compile': True, 'entropy_checkpointing': False, 'forward_only': False, 'strategy': 'fsdp', 'dtype': 'bfloat16'}, 'path': '~/models/deepseek-llm-7b-chat', 'tokenizer_path': 'Qwen/Qwen3-4B-Instruct-2507', 'override_config': {}, 'external_lib': None, 'trust_remote_code': False, '_target_': 'verl.workers.config.FSDPCriticModelCfg', 'use_shm': False, 'enable_gradient_checkpointing': True, 'enable_activation_offload': False, 'use_remove_padding': False, 'lora_rank': 0, 'lora_alpha': 16, 'target_modules': 'all-linear', 'tiled_mlp': {'enabled': False, 'num_shards': 4}}, '_target_': 'verl.workers.config.FSDPCriticConfig', 'rollout_n': 3, 'strategy': 'fsdp', 'enable': None, 'ppo_mini_batch_size': 256, 'ppo_micro_batch_size': None, 'ppo_micro_batch_size_per_gpu': None, 'use_dynamic_bsz': False, 'ppo_max_token_len_per_gpu': 32768, 
'forward_max_token_len_per_gpu': 32768, 'ppo_epochs': 1, 'shuffle': False, 'data_loader_seed': 42, 'cliprange_value': 0.5, 'loss_agg_mode': 'token-mean', 'checkpoint': {'_target_': 'verl.trainer.config.CheckpointConfig', 'save_contents': ['model', 'optimizer', 'extra'], 'load_contents': ['model', 'optimizer', 'extra'], 'async_save': False}, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'forward_micro_batch_size': None, 'forward_micro_batch_size_per_gpu': None, 'ulysses_sequence_parallel_size': 1, 'grad_clip': 1.0}, 'reward_model': {'enable': False, 'enable_resource_pool': False, 'n_gpus_per_node': 8, 'nnodes': 0, 'strategy': 'fsdp', 'model': {'input_tokenizer': 'Qwen/Qwen3-4B-Instruct-2507', 'path': '~/models/FsfairX-LLaMA3-RM-v0.1', 'external_lib': None, 'trust_remote_code': False, 'override_config': {}, 'use_shm': False, 'use_remove_padding': False, 'use_fused_kernels': False, 'fsdp_config': {'_target_': 'verl.workers.config.FSDPEngineConfig', 'wrap_policy': {'min_num_params': 0}, 'param_offload': False, 'reshard_after_forward': True, 'fsdp_size': -1, 'forward_prefetch': False}}, 'micro_batch_size': None, 'micro_batch_size_per_gpu': None, 'max_length': None, 'use_dynamic_bsz': False, 'forward_max_token_len_per_gpu': 32768, 'reward_manager': 'naive', 'reward_loop_source': 'register', 'reward_loop_module_path': None, 'reward_loop_class_name': None, 
'launch_reward_fn_async': False, 'sandbox_fusion': {'url': None, 'max_concurrent': 64, 'memory_limit_mb': 1024}, 'profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'enable': False, 'all_ranks': False, 'ranks': [], 'save_path': 'outputs/profile', 'tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False}, 'npu': {'_target_': 'verl.utils.profiler.config.NPUToolConfig', 'contents': [], 'level': 'level0', 'analysis': True, 'discrete': False}, 'torch': {'_target_': 'verl.utils.profiler.config.TorchProfilerToolConfig', 'contents': [], 'discrete': False}, 'torch_memory': {'_target_': 'verl.utils.profiler.config.TorchMemoryToolConfig', 'trace_alloc_max_entries': 100000, 'stack_depth': 32}}}, 'ulysses_sequence_parallel_size': 1, 'use_reward_loop': True, 'num_workers': 1, 'rollout': {'_target_': 'verl.workers.config.RolloutConfig', 'name': '???', 'dtype': 'bfloat16', 'gpu_memory_utilization': 0.5, 'enforce_eager': True, 'cudagraph_capture_sizes': None, 'free_cache_engine': True, 'data_parallel_size': 1, 'expert_parallel_size': 1, 'tensor_model_parallel_size': 2, 'max_num_batched_tokens': 8192, 'max_model_len': None, 'max_num_seqs': 1024, 'load_format': 'auto', 'engine_kwargs': {}, 'limit_images': None, 'enable_chunked_prefill': True, 'enable_prefix_caching': True, 'disable_log_stats': True, 'skip_tokenizer_init': False, 'prompt_length': 2048, 'response_length': 2048}}, 'algorithm': {'rollout_correction': {'rollout_is': None, 'rollout_is_threshold': 2.0, 'rollout_rs': None, 'rollout_rs_threshold': None, 'bypass_mode': False, 'loss_type': 'ppo_clip', 'rollout_is_batch_normalize': False}, '_target_': 'verl.trainer.config.AlgoConfig', 'gamma': 1.0, 'lam': 1.0, 'adv_estimator': 'grpo', 'norm_adv_by_std_in_grpo': True, 'use_kl_in_reward': False, 'kl_penalty': 'kl', 'kl_ctrl': {'_target_': 'verl.trainer.config.KLControlConfig', 'type': 'fixed', 'kl_coef': 0.001, 'horizon': 10000, 'target_kl': 0.1}, 'use_pf_ppo': 
False, 'pf_ppo': {'reweight_method': 'pow', 'weight_pow': 2.0}}, 'custom_reward_function': {'path': '/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward_func/reward.py', 'name': 'compute_score'}, 'trainer': {'balance_batch': True, 'total_epochs': 15, 'total_training_steps': None, 'project_name': 'readctrl-verl', 'experiment_name': 'qwen3-4b-instruct-en', 'logger': ['console', 'wandb'], 'log_val_generations': 0, 'rollout_data_dir': None, 'validation_data_dir': None, 'nnodes': 1, 'n_gpus_per_node': 2, 'save_freq': 5, 'esi_redundant_time': 0, 'resume_mode': 'auto', 'resume_from_path': None, 'val_before_train': True, 'val_only': False, 'test_freq': 10, 'critic_warmup': 0, 'default_hdfs_dir': None, 'del_local_ckpt_after_load': False, 'default_local_dir': '/home/mshahidul/readctrl/code/RL_model/models/RL_model_subclaim_classifier_v2', 'max_actor_ckpt_to_keep': 1, 'max_critic_ckpt_to_keep': 1, 'ray_wait_register_center_timeout': 300, 'device': 'cuda', 'use_legacy_worker_impl': 'auto', 'remove_previous_ckpt_in_save': True}, 'global_profiler': {'_target_': 'verl.utils.profiler.ProfilerConfig', 'tool': None, 'steps': None, 'profile_continuous_steps': False, 'save_path': 'outputs/profile', 'global_tool_config': {'nsys': {'_target_': 'verl.utils.profiler.config.NsightToolConfig', 'discrete': False, 'controller_nsight_options': {'trace': 'cuda,nvtx,cublas,ucx', 'cuda-memory-usage': 'true', 'cuda-graph-trace': 'graph'}, 'worker_nsight_options': {'trace': 'cuda,nvtx,cublas,ucx', 'cuda-memory-usage': 'true', 'cuda-graph-trace': 'graph', 'capture-range': 'cudaProfilerApi', 'capture-range-end': None, 'kill': 'none'}}, 'torch_memory': {'trace_alloc_max_entries': 100000, 'stack_depth': 32, 'context': 'all', 'stacks': 'all', 'kw_args': {}}}}, 'transfer_queue': {'enable': False}, 'ray_kwargs': {'ray_init': {'num_cpus': None}, 'timeline_json_file': None}, '_wandb': {}} +2026-02-13 02:41:09,154 INFO MainThread:4029501 [wandb_init.py:init():892] starting backend 
+2026-02-13 02:41:09,439 INFO MainThread:4029501 [wandb_init.py:init():895] sending inform_init request +2026-02-13 02:41:09,449 INFO MainThread:4029501 [wandb_init.py:init():903] backend started and connected +2026-02-13 02:41:09,461 INFO MainThread:4029501 [wandb_init.py:init():973] updated telemetry +2026-02-13 02:41:09,491 INFO MainThread:4029501 [wandb_init.py:init():997] communicating run to backend with 90.0 second timeout +2026-02-13 02:41:11,023 INFO MainThread:4029501 [wandb_init.py:init():1042] starting run threads in backend +2026-02-13 02:41:11,833 INFO MainThread:4029501 [wandb_run.py:_console_start():2529] atexit reg +2026-02-13 02:41:11,834 INFO MainThread:4029501 [wandb_run.py:_redirect():2377] redirect: wrap_raw +2026-02-13 02:41:11,834 INFO MainThread:4029501 [wandb_run.py:_redirect():2446] Wrapping output streams. +2026-02-13 02:41:11,834 INFO MainThread:4029501 [wandb_run.py:_redirect():2469] Redirects installed. +2026-02-13 02:41:11,844 INFO MainThread:4029501 [wandb_init.py:init():1082] run started, returning control to user process diff --git a/code/RL_model/verl/verl_train/wandb/run-20260215_022720-l2pbuwit/files/config.yaml b/code/RL_model/verl/verl_train/wandb/run-20260215_022720-l2pbuwit/files/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..decdf38ce1af4f7f9e6da4c0791511513c0bbbfb --- /dev/null +++ b/code/RL_model/verl/verl_train/wandb/run-20260215_022720-l2pbuwit/files/config.yaml @@ -0,0 +1,819 @@ +_wandb: + value: + cli_version: 0.24.1 + e: + wp5pmeliennks5qkojngvwh9a8uzjc9c: + args: + - --node-ip-address=172.16.34.29 + - --node-manager-port=39511 + - --object-store-name=/tmp/ray/session_2026-02-15_02-22-37_205252_3325228/sockets/plasma_store + - --raylet-name=/tmp/ray/session_2026-02-15_02-22-37_205252_3325228/sockets/raylet + - --redis-address=None + - --metrics-agent-port=61413 + - --logging-rotate-bytes=536870912 + - --logging-rotate-backup-count=5 + - --runtime-env-agent-port=64723 + - 
--gcs-address=172.16.34.29:60097 + - --session-name=session_2026-02-15_02-22-37_205252_3325228 + - --temp-dir=/tmp/ray + - --webui=127.0.0.1:8297 + - --cluster-id=5504f58ff048c9f7f62ad2f8778cb34d6f9d254ebad862b6d823e9e0 + - --startup-token=128 + - --worker-launch-time-ms=1771140170294 + - --node-id=5ef9bb9e2fac4b8223fbe3c0bc2f2170d9c493a8c14e4dac39152109 + - --runtime-env-hash=1096984665 + cpu_count: 64 + cpu_count_logical: 128 + cudaVersion: "13.0" + disk: + /: + total: "3766429188096" + used: "214758481920" + email: shahidulshakib034@gmail.com + executable: /home/mshahidul/miniconda3/envs/verl2/bin/python3 + git: + commit: d9939add7a2a01923a9088891f913a5d20c4e622 + remote: https://github.com/verl-project/verl + gpu: NVIDIA A100 80GB PCIe + gpu_count: 6 + gpu_nvidia: + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "85899345920" + name: NVIDIA A100 80GB PCIe + uuid: GPU-4a3678c7-34a9-356f-f7b7-7f7e2f44b596 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "85899345920" + name: NVIDIA A100 80GB PCIe + uuid: GPU-df506764-0db5-91b4-8ec9-154a3bb8123f + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "85899345920" + name: NVIDIA A100 80GB PCIe + uuid: GPU-2c3dbd62-b384-2996-a0f6-b32dcfcc3538 + - architecture: Ampere + cudaCores: 6912 + memoryTotal: "85899345920" + name: NVIDIA A100 80GB PCIe + uuid: GPU-1ff3dabe-4b9a-ea62-5cc3-01f12f32d328 + - architecture: Hopper + cudaCores: 14592 + memoryTotal: "85520809984" + name: NVIDIA H100 PCIe + uuid: GPU-eefc4b8c-0e79-c1d6-a9ff-8325040572eb + - architecture: Hopper + cudaCores: 14592 + memoryTotal: "85520809984" + name: NVIDIA H100 PCIe + uuid: GPU-d42b6057-13e8-1e88-6aa1-9307df72dece + host: gamma + memory: + total: "1081814863872" + os: Linux-5.15.0-160-generic-x86_64-with-glibc2.35 + program: /home/mshahidul/miniconda3/envs/verl2/lib/python3.12/site-packages/ray/_private/workers/default_worker.py + python: CPython 3.12.12 + root: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train + 
startedAt: "2026-02-15T07:27:20.374663Z" + writerId: wp5pmeliennks5qkojngvwh9a8uzjc9c + m: [] + python_version: 3.12.12 + t: + "1": + - 1 + - 11 + - 30 + - 35 + - 41 + - 49 + - 50 + - 51 + - 71 + - 95 + - 98 + - 105 + - 107 + "2": + - 1 + - 11 + - 30 + - 35 + - 41 + - 49 + - 50 + - 51 + - 71 + - 95 + - 98 + - 105 + - 107 + "3": + - 2 + - 13 + - 16 + - 61 + "4": 3.12.12 + "5": 0.24.1 + "6": 4.56.1 + "12": 0.24.1 + "13": linux-x86_64 +actor_rollout_ref: + value: + actor: + _target_: verl.workers.config.FSDPActorConfig + calculate_entropy: false + calculate_sum_pi_squared: false + checkpoint: + _target_: verl.trainer.config.CheckpointConfig + async_save: false + load_contents: + - model + - optimizer + - extra + save_contents: + - model + - optimizer + - extra + clip_ratio: 0.2 + clip_ratio_c: 3 + clip_ratio_high: 0.2 + clip_ratio_low: 0.2 + data_loader_seed: 42 + entropy_checkpointing: false + entropy_coeff: 0 + entropy_from_logits_with_chunking: false + freeze_vision_tower: false + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + dtype: bfloat16 + entropy_checkpointing: false + entropy_from_logits_with_chunking: false + forward_only: false + forward_prefetch: false + fsdp_size: -1 + full_determinism: false + model_dtype: fp32 + offload_policy: false + optimizer_offload: true + param_offload: true + reshard_after_forward: true + seed: 42 + strategy: fsdp + ulysses_sequence_parallel_size: 1 + use_orig_params: false + use_torch_compile: true + wrap_policy: + min_num_params: 0 + grad_clip: 1 + kl_loss_coef: 0.001 + kl_loss_type: low_var_kl + loss_agg_mode: token-mean + loss_scale_factor: null + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + betas: + - 0.9 + - 0.999 + clip_grad: 1 + lr: 1e-06 + lr_scheduler_type: constant + lr_warmup_steps: -1 + lr_warmup_steps_ratio: 0 + min_lr_ratio: 0 + num_cycles: 0.5 + optimizer: AdamW + optimizer_impl: torch.optim + override_optimizer_config: null + total_training_steps: 45 + warmup_style: null + 
weight_decay: 0.01 + policy_loss: + _target_: verl.workers.config.PolicyLossConfig + clip_cov_lb: 1 + clip_cov_ratio: 0.0002 + clip_cov_ub: 5 + kl_cov_ratio: 0.0002 + loss_mode: vanilla + ppo_kl_coef: 0.1 + ppo_epochs: 1 + ppo_max_token_len_per_gpu: 16384 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: 16 + ppo_mini_batch_size: 256 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + all_ranks: false + enable: false + ranks: [] + save_path: outputs/profile + tool: null + tool_config: + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + analysis: true + contents: [] + discrete: false + level: level0 + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + stack_depth: 32 + trace_alloc_max_entries: 100000 + rollout_n: 3 + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + shuffle: false + strategy: fsdp + sum_pi_squared_checkpointing: false + tau_neg: 1.05 + tau_pos: 1 + ulysses_sequence_parallel_size: 1 + use_dynamic_bsz: false + use_fused_kernels: false + use_kl_loss: true + use_prefix_grouper: false + use_remove_padding: true + use_torch_compile: true + hybrid_engine: true + model: + _target_: verl.workers.config.HFModelConfig + custom_chat_template: null + enable_activation_offload: false + enable_gradient_checkpointing: true + exclude_modules: null + external_lib: null + fused_kernel_options: + impl_backend: torch + hf_config_path: null + lora_adapter_path: null + lora_alpha: 16 + lora_rank: 0 + mtp: + _target_: verl.workers.config.MtpConfig + detach_encoder: false + enable: false + enable_rollout: false + enable_train: false + method: mtp + mtp_loss_scaling_factor: 0.1 + num_speculative_tokens: 1 + speculative_algorithm: EAGLE + speculative_eagle_topk: 1 
+ speculative_num_draft_tokens: 4 + speculative_num_steps: 3 + path: Qwen/Qwen3-4B-Instruct-2507 + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + tokenizer_path: null + trust_remote_code: false + use_fused_kernels: false + use_liger: false + use_remove_padding: true + use_shm: false + nccl_timeout: 600 + ref: + _target_: verl.workers.config.FSDPActorConfig + entropy_checkpointing: false + entropy_from_logits_with_chunking: false + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + dtype: bfloat16 + entropy_checkpointing: false + entropy_from_logits_with_chunking: false + forward_only: true + forward_prefetch: false + fsdp_size: -1 + full_determinism: false + model_dtype: fp32 + offload_policy: false + optimizer_offload: false + param_offload: true + reshard_after_forward: true + seed: 42 + strategy: fsdp + ulysses_sequence_parallel_size: 1 + use_orig_params: false + use_torch_compile: true + wrap_policy: + min_num_params: 0 + log_prob_max_token_len_per_gpu: 16384 + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: false + profiler: + _target_: verl.utils.profiler.ProfilerConfig + all_ranks: false + enable: false + ranks: [] + save_path: outputs/profile + tool: null + tool_config: + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + analysis: true + contents: [] + discrete: false + level: level0 + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + stack_depth: 32 + trace_alloc_max_entries: 100000 + rollout_n: 3 + router_replay: + _target_: verl.workers.config.RouterReplayConfig + mode: disabled + record_file: null + replay_file: null + strategy: fsdp + ulysses_sequence_parallel_size: 1 + use_torch_compile: true + rollout: + _target_: 
verl.workers.config.RolloutConfig + agent: + _target_: verl.workers.config.AgentLoopConfig + agent_loop_config_path: null + custom_async_server: + _target_: verl.workers.config.CustomAsyncServerConfig + name: null + path: null + default_agent_loop: single_turn_agent + num_workers: 8 + calculate_log_probs: false + checkpoint_engine: + _target_: verl.workers.config.CheckpointEngineConfig + backend: naive + update_weights_bucket_megabytes: 2048 + cudagraph_capture_sizes: null + data_parallel_size: 1 + disable_log_stats: true + do_sample: true + dtype: bfloat16 + enable_chunked_prefill: true + enable_prefix_caching: true + enable_rollout_routing_replay: false + enforce_eager: true + expert_parallel_size: 1 + free_cache_engine: true + gpu_memory_utilization: 0.4 + ignore_eos: false + layered_summon: false + load_format: dummy + log_prob_max_token_len_per_gpu: 16384 + log_prob_micro_batch_size: null + log_prob_micro_batch_size_per_gpu: 32 + log_prob_use_dynamic_bsz: false + logprobs_mode: processed_logprobs + max_model_len: 8192 + max_num_batched_tokens: 8192 + max_num_seqs: 1024 + mode: async + mtp: + _target_: verl.workers.config.MtpConfig + detach_encoder: false + enable: false + enable_rollout: false + enable_train: false + method: mtp + mtp_loss_scaling_factor: 0.1 + num_speculative_tokens: 1 + speculative_algorithm: EAGLE + speculative_eagle_topk: 1 + speculative_num_draft_tokens: 4 + speculative_num_steps: 3 + multi_stage_wake_up: false + multi_turn: + _target_: verl.workers.config.MultiTurnConfig + enable: false + format: hermes + interaction_config_path: null + max_assistant_turns: null + max_parallel_calls: 1 + max_tool_response_length: 256 + max_user_turns: null + num_repeat_rollouts: null + tokenization_sanity_check_mode: strict + tool_config_path: null + tool_response_truncate_side: middle + use_inference_chat_template: false + "n": 3 + name: vllm + over_sample_rate: 0 + pipeline_model_parallel_size: 1 + profiler: + _target_: 
verl.utils.profiler.ProfilerConfig + all_ranks: false + enable: false + ranks: [] + save_path: outputs/profile + tool: null + tool_config: + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + analysis: true + contents: [] + discrete: false + level: level0 + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + stack_depth: 32 + trace_alloc_max_entries: 100000 + prometheus: + _target_: verl.workers.config.PrometheusConfig + enable: false + file: /tmp/ray/session_latest/metrics/prometheus/prometheus.yml + port: 9090 + served_model_name: Qwen/Qwen3-4B-Instruct-2507 + prompt_length: 1024 + quantization: null + quantization_config_file: null + response_length: 2048 + scheduling_policy: fcfs + skip_dump_dir: /tmp/rollout_dump + skip_rollout: false + skip_tokenizer_init: true + temperature: 1 + tensor_model_parallel_size: 1 + top_k: -1 + top_p: 1 + trace: + _target_: verl.workers.config.TraceConfig + backend: null + max_samples_per_step_per_worker: null + token2text: false + val_kwargs: + _target_: verl.workers.config.SamplingConfig + do_sample: false + "n": 1 + temperature: 0 + top_k: -1 + top_p: 1 +algorithm: + value: + _target_: verl.trainer.config.AlgoConfig + adv_estimator: grpo + gamma: 1 + kl_ctrl: + _target_: verl.trainer.config.KLControlConfig + horizon: 10000 + kl_coef: 0.001 + target_kl: 0.1 + type: fixed + kl_penalty: kl + lam: 1 + norm_adv_by_std_in_grpo: true + pf_ppo: + reweight_method: pow + weight_pow: 2 + rollout_correction: + bypass_mode: false + loss_type: ppo_clip + rollout_is: null + rollout_is_batch_normalize: false + rollout_is_threshold: 2 + rollout_rs: null + rollout_rs_threshold: null + use_kl_in_reward: false + use_pf_ppo: false +critic: + value: + _target_: verl.workers.config.FSDPCriticConfig + checkpoint: + _target_: 
verl.trainer.config.CheckpointConfig + async_save: false + load_contents: + - model + - optimizer + - extra + save_contents: + - model + - optimizer + - extra + cliprange_value: 0.5 + data_loader_seed: 42 + enable: null + forward_max_token_len_per_gpu: 32768 + forward_micro_batch_size: null + forward_micro_batch_size_per_gpu: null + grad_clip: 1 + loss_agg_mode: token-mean + model: + _target_: verl.workers.config.FSDPCriticModelCfg + enable_activation_offload: false + enable_gradient_checkpointing: true + external_lib: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + dtype: bfloat16 + entropy_checkpointing: false + entropy_from_logits_with_chunking: false + forward_only: false + forward_prefetch: false + fsdp_size: -1 + full_determinism: false + model_dtype: fp32 + offload_policy: false + optimizer_offload: false + param_offload: false + reshard_after_forward: true + seed: 42 + strategy: fsdp + ulysses_sequence_parallel_size: 1 + use_orig_params: false + use_torch_compile: true + wrap_policy: + min_num_params: 0 + lora_alpha: 16 + lora_rank: 0 + path: ~/models/deepseek-llm-7b-chat + target_modules: all-linear + tiled_mlp: + enabled: false + num_shards: 4 + tokenizer_path: Qwen/Qwen3-4B-Instruct-2507 + trust_remote_code: false + use_remove_padding: false + use_shm: false + optim: + _target_: verl.workers.config.FSDPOptimizerConfig + betas: + - 0.9 + - 0.999 + clip_grad: 1 + lr: 1e-05 + lr_scheduler_type: constant + lr_warmup_steps: -1 + lr_warmup_steps_ratio: 0 + min_lr_ratio: 0 + num_cycles: 0.5 + optimizer: AdamW + optimizer_impl: torch.optim + override_optimizer_config: null + total_training_steps: 45 + warmup_style: null + weight_decay: 0.01 + ppo_epochs: 1 + ppo_max_token_len_per_gpu: 32768 + ppo_micro_batch_size: null + ppo_micro_batch_size_per_gpu: null + ppo_mini_batch_size: 256 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + all_ranks: false + enable: false + ranks: [] + save_path: outputs/profile + tool: null + 
tool_config: + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + analysis: true + contents: [] + discrete: false + level: level0 + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + stack_depth: 32 + trace_alloc_max_entries: 100000 + rollout_n: 3 + shuffle: false + strategy: fsdp + ulysses_sequence_parallel_size: 1 + use_dynamic_bsz: false +custom_reward_function: + value: + name: compute_score + path: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward_func/reward_new_v2.py +data: + value: + custom_cls: + name: null + path: null + datagen: + name: null + path: null + dataloader_num_workers: 8 + filter_overlong_prompts: true + filter_overlong_prompts_workers: 1 + image_key: images + image_patch_size: 14 + max_prompt_length: 1024 + max_response_length: 2048 + prompt_key: prompt + return_full_prompt: false + return_multi_modal_inputs: true + return_raw_chat: true + return_raw_input_ids: false + reward_fn_key: data_source + sampler: + class_name: null + class_path: null + seed: null + shuffle: true + tokenizer: null + tool_config_path: null + train_batch_size: 512 + train_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet + train_max_samples: -1 + truncation: error + trust_remote_code: false + use_shm: false + val_batch_size: null + val_files: /home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet + val_max_samples: -1 + validation_shuffle: false + video_key: videos +global_profiler: + value: + _target_: verl.utils.profiler.ProfilerConfig + global_tool_config: + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + controller_nsight_options: + cuda-graph-trace: graph + cuda-memory-usage: "true" + trace: cuda,nvtx,cublas,ucx + discrete: false + 
worker_nsight_options: + capture-range: cudaProfilerApi + capture-range-end: null + cuda-graph-trace: graph + cuda-memory-usage: "true" + kill: none + trace: cuda,nvtx,cublas,ucx + torch_memory: + context: all + stack_depth: 32 + stacks: all + trace_alloc_max_entries: 100000 + profile_continuous_steps: false + save_path: outputs/profile + steps: null + tool: null +ray_kwargs: + value: + ray_init: + num_cpus: null + timeline_json_file: null +reward_manager: + value: + _target_: verl.trainer.config.config.RewardManagerConfig + module: + _target_: verl.trainer.config.config.ModuleConfig + name: custom_reward_manager + path: null + name: naive + source: register +reward_model: + value: + enable: false + enable_resource_pool: false + forward_max_token_len_per_gpu: 32768 + launch_reward_fn_async: false + max_length: null + micro_batch_size: null + micro_batch_size_per_gpu: null + model: + external_lib: null + fsdp_config: + _target_: verl.workers.config.FSDPEngineConfig + forward_prefetch: false + fsdp_size: -1 + param_offload: false + reshard_after_forward: true + wrap_policy: + min_num_params: 0 + input_tokenizer: Qwen/Qwen3-4B-Instruct-2507 + path: ~/models/FsfairX-LLaMA3-RM-v0.1 + trust_remote_code: false + use_fused_kernels: false + use_remove_padding: false + use_shm: false + n_gpus_per_node: 8 + nnodes: 0 + num_workers: 1 + profiler: + _target_: verl.utils.profiler.ProfilerConfig + all_ranks: false + enable: false + ranks: [] + save_path: outputs/profile + tool: null + tool_config: + npu: + _target_: verl.utils.profiler.config.NPUToolConfig + analysis: true + contents: [] + discrete: false + level: level0 + nsys: + _target_: verl.utils.profiler.config.NsightToolConfig + discrete: false + torch: + _target_: verl.utils.profiler.config.TorchProfilerToolConfig + contents: [] + discrete: false + torch_memory: + _target_: verl.utils.profiler.config.TorchMemoryToolConfig + stack_depth: 32 + trace_alloc_max_entries: 100000 + reward_loop_class_name: null + 
reward_loop_module_path: null + reward_loop_source: register + reward_manager: naive + rollout: + _target_: verl.workers.config.RolloutConfig + cudagraph_capture_sizes: null + data_parallel_size: 1 + disable_log_stats: true + dtype: bfloat16 + enable_chunked_prefill: true + enable_prefix_caching: true + enforce_eager: true + expert_parallel_size: 1 + free_cache_engine: true + gpu_memory_utilization: 0.5 + limit_images: null + load_format: auto + max_model_len: null + max_num_batched_tokens: 8192 + max_num_seqs: 1024 + name: ??? + prompt_length: 2048 + response_length: 2048 + skip_tokenizer_init: false + tensor_model_parallel_size: 2 + sandbox_fusion: + max_concurrent: 64 + memory_limit_mb: 1024 + url: null + strategy: fsdp + ulysses_sequence_parallel_size: 1 + use_dynamic_bsz: false + use_reward_loop: true +trainer: + value: + balance_batch: true + critic_warmup: 0 + default_hdfs_dir: null + default_local_dir: /home/mshahidul/readctrl/code/RL_model/models/RL_model_subclaim_classifier_v2 + del_local_ckpt_after_load: false + device: cuda + esi_redundant_time: 0 + experiment_name: qwen3-4b-instruct-en + log_val_generations: 0 + logger: + - console + - wandb + max_actor_ckpt_to_keep: 1 + max_critic_ckpt_to_keep: 1 + n_gpus_per_node: 2 + nnodes: 1 + project_name: readctrl-verl + ray_wait_register_center_timeout: 300 + remove_previous_ckpt_in_save: true + resume_from_path: null + resume_mode: auto + rollout_data_dir: null + save_freq: 5 + test_freq: 10 + total_epochs: 15 + total_training_steps: null + use_legacy_worker_impl: auto + val_before_train: true + val_only: false + validation_data_dir: null +transfer_queue: + value: + enable: false diff --git a/code/RL_model/verl/verl_train/wandb/run-20260215_022720-l2pbuwit/files/output.log b/code/RL_model/verl/verl_train/wandb/run-20260215_022720-l2pbuwit/files/output.log new file mode 100644 index 0000000000000000000000000000000000000000..6c009f8c554597c03a29263204db31e4948103f2 --- /dev/null +++ 
b/code/RL_model/verl/verl_train/wandb/run-20260215_022720-l2pbuwit/files/output.log @@ -0,0 +1,14 @@ +wandb: Detected [dspy, litellm, openai] in use. +wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script. +wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/ +Checkpoint tracker file does not exist: /home/mshahidul/readctrl/code/RL_model/models/RL_model_subclaim_classifier_v2/latest_checkpointed_iteration.txt +Training from scratch +test_gen_batch meta info: {'eos_token_id': 151645, 'pad_token_id': 151643, 'recompute_log_prob': False, 'do_sample': False, 'validate': True, 'global_steps': 0} +validation generation end +("Initial validation metrics: {'val-aux/multiclinsum/reward/mean@1': " + "np.float64(3.476556508484001), 'val-core/multiclinsum/acc/mean@1': " + "np.float64(3.4765565582656657), 'val-aux/num_turns/min': np.int32(2), " + "'val-aux/num_turns/max': np.int32(2), 'val-aux/num_turns/mean': " + 'np.float64(2.0)}') +step:0 - val-aux/multiclinsum/reward/mean@1:np.float64(3.476556508484001) - val-core/multiclinsum/acc/mean@1:np.float64(3.4765565582656657) - val-aux/num_turns/min:np.int32(2) - val-aux/num_turns/max:np.int32(2) - val-aux/num_turns/mean:np.float64(2.0) +Training Progress: 0%| | 0/45 [00:00