shahidul034 committed
Commit a3bbd91 · verified · 1 Parent(s): d76c61c

Add files using upload-large-folder tool

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. code/RL_model/verl/verl_train/outputs/2026-02-11/17-42-24/main_ppo.log +0 -0
  2. code/RL_model/verl/verl_train/outputs/2026-02-11/17-44-32/main_ppo.log +0 -0
  3. code/RL_model/verl/verl_train/outputs/2026-02-11/18-09-37/.hydra/hydra.yaml +212 -0
  4. code/RL_model/verl/verl_train/outputs/2026-02-11/18-09-37/main_ppo.log +0 -0
  5. code/RL_model/verl/verl_train/outputs/2026-02-11/18-29-53/main_ppo.log +0 -0
  6. code/RL_model/verl/verl_train/outputs/2026-02-11/18-56-56/main_ppo.log +0 -0
  7. code/RL_model/verl/verl_train/tests/experimental/reward_loop/reward_fn.py +100 -0
  8. code/RL_model/verl/verl_train/tests/experimental/reward_loop/test_reward_model_genrm.py +156 -0
  9. code/RL_model/verl/verl_train/tests/trainer/config/legacy_ppo_megatron_trainer.yaml +471 -0
  10. code/RL_model/verl/verl_train/tests/trainer/config/legacy_ppo_trainer.yaml +1126 -0
  11. code/RL_model/verl/verl_train/tests/trainer/config/test_algo_config_on_cpu.py +204 -0
  12. code/RL_model/verl/verl_train/tests/trainer/config/test_legacy_config_on_cpu.py +176 -0
  13. code/RL_model/verl/verl_train/tests/trainer/ppo/__init__.py +16 -0
  14. code/RL_model/verl/verl_train/tests/trainer/ppo/test_core_algos_on_cpu.py +317 -0
  15. code/RL_model/verl/verl_train/tests/trainer/ppo/test_metric_utils_on_cpu.py +489 -0
  16. code/RL_model/verl/verl_train/tests/trainer/ppo/test_rollout_corr.py +386 -0
  17. code/RL_model/verl/verl_train/tests/trainer/ppo/test_rollout_corr_integration.py +262 -0
  18. data/extracting_subclaim/old/extracted_subclaims_classified_multiclinsum_test_en_en.json +0 -0
  19. data/extracting_subclaim/subset/extracted_subclaims_0_100.json +0 -0
  20. data/extracting_subclaim/subset/extracted_subclaims_100_200.json +0 -0
  21. data/extracting_subclaim/subset/extracted_subclaims_200_300.json +0 -0
  22. data/extracting_subclaim/subset/extracted_subclaims_300_400.json +0 -0
  23. data/extracting_subclaim/subset/extracted_subclaims_400_500.json +0 -0
  24. data/extracting_subclaim/subset/extracted_subclaims_500_-1.json +0 -0
  25. data/extracting_subclaim/subset_testset/extracted_subclaims_multiclinsum_test_en_1500_2000.json +0 -0
  26. data/extracting_subclaim/subset_testset/extracted_subclaims_multiclinsum_test_en_2000_2500.json +0 -0
  27. data/extracting_subclaim/subset_testset/extracted_subclaims_multiclinsum_test_en_2500_3000.json +0 -0
  28. data/extracting_subclaim/subset_testset/extracted_subclaims_multiclinsum_test_en_500_1000.json +0 -0
  29. data/test_raw_data/pt_test/multiclinsum_test_pt/summaries/multiclinsum_test_1018_pt_sum.txt +1 -0
  30. data/test_raw_data/pt_test/multiclinsum_test_pt/summaries/multiclinsum_test_1021_pt_sum.txt +1 -0
  31. data/test_raw_data/pt_test/multiclinsum_test_pt/summaries/multiclinsum_test_1034_pt_sum.txt +1 -0
  32. data/test_raw_data/pt_test/multiclinsum_test_pt/summaries/multiclinsum_test_1074_pt_sum.txt +1 -0
  33. data/test_raw_data/pt_test/multiclinsum_test_pt/summaries/multiclinsum_test_1081_pt_sum.txt +1 -0
  34. data/test_raw_data/pt_test/multiclinsum_test_pt/summaries/multiclinsum_test_1097_pt_sum.txt +1 -0
  35. data/test_raw_data/pt_test/multiclinsum_test_pt/summaries/multiclinsum_test_10_pt_sum.txt +1 -0
  36. data/test_raw_data/pt_test/multiclinsum_test_pt/summaries/multiclinsum_test_1106_pt_sum.txt +1 -0
  37. data/test_raw_data/pt_test/multiclinsum_test_pt/summaries/multiclinsum_test_1111_pt_sum.txt +1 -0
  38. data/test_raw_data/pt_test/multiclinsum_test_pt/summaries/multiclinsum_test_1114_pt_sum.txt +1 -0
  39. data/test_raw_data/pt_test/multiclinsum_test_pt/summaries/multiclinsum_test_1116_pt_sum.txt +1 -0
  40. data/test_raw_data/pt_test/multiclinsum_test_pt/summaries/multiclinsum_test_1146_pt_sum.txt +1 -0
  41. data/test_raw_data/pt_test/multiclinsum_test_pt/summaries/multiclinsum_test_1158_pt_sum.txt +1 -0
  42. data/test_raw_data/pt_test/multiclinsum_test_pt/summaries/multiclinsum_test_1195_pt_sum.txt +1 -0
  43. data/test_raw_data/pt_test/multiclinsum_test_pt/summaries/multiclinsum_test_1235_pt_sum.txt +1 -0
  44. data/test_raw_data/pt_test/multiclinsum_test_pt/summaries/multiclinsum_test_1298_pt_sum.txt +1 -0
  45. data/test_raw_data/pt_test/multiclinsum_test_pt/summaries/multiclinsum_test_1494_pt_sum.txt +1 -0
  46. data/test_raw_data/pt_test/multiclinsum_test_pt/summaries/multiclinsum_test_1520_pt_sum.txt +1 -0
  47. data/test_raw_data/pt_test/multiclinsum_test_pt/summaries/multiclinsum_test_155_pt_sum.txt +1 -0
  48. data/test_raw_data/pt_test/multiclinsum_test_pt/summaries/multiclinsum_test_1582_pt_sum.txt +1 -0
  49. data/test_raw_data/pt_test/multiclinsum_test_pt/summaries/multiclinsum_test_1642_pt_sum.txt +1 -0
  50. data/test_raw_data/pt_test/multiclinsum_test_pt/summaries/multiclinsum_test_1665_pt_sum.txt +1 -0
code/RL_model/verl/verl_train/outputs/2026-02-11/17-42-24/main_ppo.log ADDED
File without changes
code/RL_model/verl/verl_train/outputs/2026-02-11/17-44-32/main_ppo.log ADDED
File without changes
code/RL_model/verl/verl_train/outputs/2026-02-11/18-09-37/.hydra/hydra.yaml ADDED
@@ -0,0 +1,212 @@
+ hydra:
+   run:
+     dir: outputs/${now:%Y-%m-%d}/${now:%H-%M-%S}
+   sweep:
+     dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
+     subdir: ${hydra.job.num}
+   launcher:
+     _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
+   sweeper:
+     _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
+     max_batch_size: null
+     params: null
+   help:
+     app_name: ${hydra.job.name}
+     header: '${hydra.help.app_name} is powered by Hydra.
+
+       '
+     footer: 'Powered by Hydra (https://hydra.cc)
+
+       Use --hydra-help to view Hydra specific help
+
+       '
+     template: '${hydra.help.header}
+
+       == Configuration groups ==
+
+       Compose your configuration from those groups (group=option)
+
+
+       $APP_CONFIG_GROUPS
+
+
+       == Config ==
+
+       Override anything in the config (foo.bar=value)
+
+
+       $CONFIG
+
+
+       ${hydra.help.footer}
+
+       '
+   hydra_help:
+     template: 'Hydra (${hydra.runtime.version})
+
+       See https://hydra.cc for more info.
+
+
+       == Flags ==
+
+       $FLAGS_HELP
+
+
+       == Configuration groups ==
+
+       Compose your configuration from those groups (For example, append hydra/job_logging=disabled
+       to command line)
+
+
+       $HYDRA_CONFIG_GROUPS
+
+
+       Use ''--cfg hydra'' to Show the Hydra config.
+
+       '
+     hydra_help: ???
+   hydra_logging:
+     version: 1
+     formatters:
+       simple:
+         format: '[%(asctime)s][HYDRA] %(message)s'
+     handlers:
+       console:
+         class: logging.StreamHandler
+         formatter: simple
+         stream: ext://sys.stdout
+     root:
+       level: INFO
+       handlers:
+       - console
+     loggers:
+       logging_example:
+         level: DEBUG
+     disable_existing_loggers: false
+   job_logging:
+     version: 1
+     formatters:
+       simple:
+         format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
+     handlers:
+       console:
+         class: logging.StreamHandler
+         formatter: simple
+         stream: ext://sys.stdout
+       file:
+         class: logging.FileHandler
+         formatter: simple
+         filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
+     root:
+       level: INFO
+       handlers:
+       - console
+       - file
+     disable_existing_loggers: false
+   env: {}
+   mode: RUN
+   searchpath: []
+   callbacks: {}
+   output_subdir: .hydra
+   overrides:
+     hydra:
+     - hydra.mode=RUN
+     task:
+     - algorithm.adv_estimator=grpo
+     - data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet
+     - data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet
+     - custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py
+     - data.train_batch_size=512
+     - data.max_prompt_length=1024
+     - data.max_response_length=2048
+     - data.filter_overlong_prompts=True
+     - data.truncation=error
+     - actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507
+     - actor_rollout_ref.actor.optim.lr=1e-6
+     - actor_rollout_ref.model.use_remove_padding=True
+     - actor_rollout_ref.actor.ppo_mini_batch_size=256
+     - actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16
+     - actor_rollout_ref.actor.use_kl_loss=True
+     - actor_rollout_ref.actor.kl_loss_coef=0.001
+     - actor_rollout_ref.actor.kl_loss_type=low_var_kl
+     - actor_rollout_ref.actor.entropy_coeff=0
+     - actor_rollout_ref.model.enable_gradient_checkpointing=True
+     - actor_rollout_ref.actor.fsdp_config.param_offload=True
+     - actor_rollout_ref.actor.fsdp_config.optimizer_offload=True
+     - actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32
+     - actor_rollout_ref.rollout.tensor_model_parallel_size=1
+     - actor_rollout_ref.rollout.name=vllm
+     - actor_rollout_ref.rollout.gpu_memory_utilization=0.4
+     - actor_rollout_ref.rollout.enforce_eager=True
+     - actor_rollout_ref.rollout.max_model_len=8192
+     - actor_rollout_ref.rollout.n=3
+     - actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32
+     - actor_rollout_ref.ref.fsdp_config.param_offload=True
+     - algorithm.use_kl_in_reward=False
+     - trainer.critic_warmup=0
+     - trainer.logger=["console","wandb"]
+     - trainer.project_name=readctrl-verl
+     - trainer.experiment_name=qwen3-4b-instruct-en
+     - trainer.n_gpus_per_node=2
+     - trainer.nnodes=1
+     - trainer.save_freq=5
+     - trainer.test_freq=10
+     - +trainer.remove_previous_ckpt_in_save=true
+     - trainer.max_actor_ckpt_to_keep=1
+     - trainer.max_critic_ckpt_to_keep=1
+     - trainer.resume_mode=auto
+     - trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/RL_model_subclaim_classifier
+     - trainer.total_epochs=15
+   job:
+     name: main_ppo
+     chdir: null
+     override_dirname: +trainer.remove_previous_ckpt_in_save=true,actor_rollout_ref.actor.entropy_coeff=0,actor_rollout_ref.actor.fsdp_config.optimizer_offload=True,actor_rollout_ref.actor.fsdp_config.param_offload=True,actor_rollout_ref.actor.kl_loss_coef=0.001,actor_rollout_ref.actor.kl_loss_type=low_var_kl,actor_rollout_ref.actor.optim.lr=1e-6,actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16,actor_rollout_ref.actor.ppo_mini_batch_size=256,actor_rollout_ref.actor.use_kl_loss=True,actor_rollout_ref.model.enable_gradient_checkpointing=True,actor_rollout_ref.model.path=Qwen/Qwen3-4B-Instruct-2507,actor_rollout_ref.model.use_remove_padding=True,actor_rollout_ref.ref.fsdp_config.param_offload=True,actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.enforce_eager=True,actor_rollout_ref.rollout.gpu_memory_utilization=0.4,actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32,actor_rollout_ref.rollout.max_model_len=8192,actor_rollout_ref.rollout.n=3,actor_rollout_ref.rollout.name=vllm,actor_rollout_ref.rollout.tensor_model_parallel_size=1,algorithm.adv_estimator=grpo,algorithm.use_kl_in_reward=False,custom_reward_function.path=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/reward_func/reward.py,data.filter_overlong_prompts=True,data.max_prompt_length=1024,data.max_response_length=2048,data.train_batch_size=512,data.train_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/train.parquet,data.truncation=error,data.val_files=/home/mshahidul/readctrl/code/RL_model/verl/verl_train/dataset/test.parquet,trainer.critic_warmup=0,trainer.default_local_dir=/home/mshahidul/readctrl/code/RL_model/RL_model_subclaim_classifier,trainer.experiment_name=qwen3-4b-instruct-en,trainer.logger=["console","wandb"],trainer.max_actor_ckpt_to_keep=1,trainer.max_critic_ckpt_to_keep=1,trainer.n_gpus_per_node=2,trainer.nnodes=1,trainer.project_name=readctrl-verl,trainer.resume_mode=auto,trainer.save_freq=5,trainer.test_freq=10,trainer.total_epochs=15
+     id: ???
+     num: ???
+     config_name: ppo_trainer
+     env_set: {}
+     env_copy: []
+     config:
+       override_dirname:
+         kv_sep: '='
+         item_sep: ','
+         exclude_keys: []
+   runtime:
+     version: 1.3.2
+     version_base: '1.3'
+     cwd: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train
+     config_sources:
+     - path: hydra.conf
+       schema: pkg
+       provider: hydra
+     - path: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/verl/trainer/config
+       schema: file
+       provider: main
+     - path: ''
+       schema: structured
+       provider: schema
+     output_dir: /data/home_beta/mshahidul/readctrl/code/RL_model/verl/verl_train/outputs/2026-02-11/18-09-37
+     choices:
+       algorithm@algorithm.rollout_correction: rollout_correction
+       reward_model: dp_reward_loop
+       critic: dp_critic
+       critic/../engine@critic.model.fsdp_config: fsdp
+       critic/../optim@critic.optim: fsdp
+       model@actor_rollout_ref.model: hf_model
+       rollout@actor_rollout_ref.rollout: rollout
+       ref@actor_rollout_ref.ref: dp_ref
+       ref/../engine@actor_rollout_ref.ref.fsdp_config: fsdp
+       data: legacy_data
+       actor@actor_rollout_ref.actor: dp_actor
+       actor/../engine@actor_rollout_ref.actor.fsdp_config: fsdp
+       actor/../optim@actor_rollout_ref.actor.optim: fsdp
+       hydra/env: default
+       hydra/callbacks: null
+       hydra/job_logging: default
+       hydra/hydra_logging: default
+       hydra/hydra_help: default
+       hydra/help: default
+       hydra/sweeper: basic
+       hydra/launcher: basic
+       hydra/output: default
+   verbose: false
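The `${now:...}` interpolations in `hydra.run.dir` are what produce timestamped output directories like `outputs/2026-02-11/18-09-37` in this commit. As a minimal illustration (plain stdlib only, no Hydra dependency; the helper name is ours, not Hydra's), the resolver behaves like an strftime expansion:

```python
from datetime import datetime

def hydra_run_dir(ts: datetime) -> str:
    """Mimic Hydra's outputs/${now:%Y-%m-%d}/${now:%H-%M-%S} run-dir pattern."""
    return f"outputs/{ts.strftime('%Y-%m-%d')}/{ts.strftime('%H-%M-%S')}"

# The directory recorded in this config corresponds to this timestamp:
print(hydra_run_dir(datetime(2026, 2, 11, 18, 9, 37)))  # outputs/2026-02-11/18-09-37
```

Each launch therefore lands in a fresh directory, which is why this commit contains several sibling `outputs/2026-02-11/*` folders.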
code/RL_model/verl/verl_train/outputs/2026-02-11/18-09-37/main_ppo.log ADDED
File without changes
code/RL_model/verl/verl_train/outputs/2026-02-11/18-29-53/main_ppo.log ADDED
File without changes
code/RL_model/verl/verl_train/outputs/2026-02-11/18-56-56/main_ppo.log ADDED
File without changes
code/RL_model/verl/verl_train/tests/experimental/reward_loop/reward_fn.py ADDED
@@ -0,0 +1,100 @@
+ # Copyright 2025 Bytedance Ltd. and/or its affiliates
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import json
+ import os
+
+ import aiohttp
+ from openai.types.chat import ChatCompletion
+ from transformers import PreTrainedTokenizer
+
+ GRM_PROMPT_TEMPLATE = """
+ You are given a problem and a proposed solution.
+
+ Problem:
+ {problem}
+
+ Solution:
+ {solution}
+
+ Please evaluate how well the solution addresses the problem.
+ Give a score from 1 to 10, where:
+ - 1 means the solution is completely irrelevant or incorrect.
+ - 5 means the solution is partially correct but incomplete or not well reasoned.
+ - 10 means the solution is fully correct, well-reasoned, and directly solves the problem.
+
+ Only output the score as a single number (integer).
+ """.strip()
+
+
+ async def chat_complete(router_address: str, chat_complete_request: dict):
+     url = f"http://{router_address}/v1/chat/completions"
+     # Use the session as an async context manager so it is always closed,
+     # even when constructing it or issuing the request raises.
+     timeout = aiohttp.ClientTimeout(total=None)
+     async with aiohttp.ClientSession(timeout=timeout) as session:
+         async with session.post(url, json=chat_complete_request) as resp:
+             output = json.loads(await resp.text())
+             return ChatCompletion(**output)
+
+
+ async def compute_score_gsm8k(
+     data_source: str,
+     solution_str: str,
+     ground_truth: str,
+     extra_info: dict,
+     reward_router_address: str,
+     reward_model_tokenizer: PreTrainedTokenizer,
+ ):
+     """Compute the reward score via a generative reward model."""
+
+     grm_prompt = GRM_PROMPT_TEMPLATE.format(problem=extra_info["question"], solution=solution_str)
+     messages = [{"role": "user", "content": grm_prompt}]
+     sampling_params = {"temperature": 0.7, "top_p": 0.8, "max_tokens": 4096}
+     model_name = os.path.expanduser("~/models/Qwen/Qwen2.5-1.5B-Instruct")
+     chat_complete_request = {
+         "messages": messages,
+         "model": model_name,
+         **sampling_params,
+     }
+     result = await chat_complete(
+         router_address=reward_router_address,
+         chat_complete_request=chat_complete_request,
+     )
+     grm_response = result.choices[0].message.content
+     try:
+         # The prompt asks for a bare integer; take the last paragraph of the reply.
+         score = int(grm_response.split("\n\n")[-1].strip())
+     except Exception:
+         score = 0
+     return {"score": score, "acc": score == 10, "genrm_response": grm_response}
+
+
+ def compute_score_math_verify(
+     data_source: str,
+     solution_str: str,
+     ground_truth: str,
+     extra_info: dict,
+     **kwargs,
+ ):
+     """Compute the reward score."""
+     from verl.utils.reward_score.math_verify import compute_score
+
+     return compute_score(
+         model_output=solution_str,
+         ground_truth=ground_truth,
+     )
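The generative reward model's reply is scored by taking the last blank-line-separated chunk of the response and coercing it to an integer, with 0 as the fallback on any parse failure. A standalone sketch of that parsing step (mirroring the logic in `compute_score_gsm8k` above; the function name here is ours):

```python
def parse_grm_score(grm_response: str) -> int:
    """Parse the final paragraph of a GRM reply as an integer score.

    Any failure (prose instead of a number, empty reply) falls back to 0,
    matching the try/except behavior in compute_score_gsm8k.
    """
    try:
        return int(grm_response.split("\n\n")[-1].strip())
    except (ValueError, AttributeError):
        return 0

print(parse_grm_score("The solution is fully correct.\n\n10"))  # 10
print(parse_grm_score("I cannot score this."))  # 0
```

Because `acc` is defined as `score == 10`, only a perfect verdict from the reward model counts as correct.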
code/RL_model/verl/verl_train/tests/experimental/reward_loop/test_reward_model_genrm.py ADDED
@@ -0,0 +1,156 @@
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import os
+
+ import ray
+ import torch
+ from hydra import compose, initialize_config_dir
+
+ from verl.experimental.reward_loop import RewardLoopManager
+ from verl.protocol import DataProto
+ from verl.utils import hf_tokenizer
+ from verl.utils.model import compute_position_id_with_mask
+
+
+ def create_data_samples(tokenizer) -> tuple[DataProto, list]:
+     convs = [
+         [
+             {
+                 "role": "user",
+                 "content": "What is the range of the numeric output of a sigmoid node in a neural network?",
+             },
+             {"role": "assistant", "content": "Between -1 and 1."},
+         ],
+         [
+             {
+                 "role": "user",
+                 "content": "What is the range of the numeric output of a sigmoid node in a neural network?",
+             },
+             {"role": "assistant", "content": "Between 0 and 1."},
+         ],
+         [
+             {"role": "user", "content": "What is the capital of Australia?"},
+             {
+                 "role": "assistant",
+                 "content": "Canberra is the capital city of Australia.",
+             },
+         ],
+         [
+             {"role": "user", "content": "What is the capital of Australia?"},
+             {
+                 "role": "assistant",
+                 "content": "Sydney is the capital of Australia.",
+             },
+         ],
+     ]
+     raw_prompt = [conv[:1] for conv in convs]
+     data_source = ["gsm8k"] * len(convs)
+     reward_info = [{"ground_truth": "Not Used"}] * len(convs)
+     extra_info = [{"question": conv[0]["content"]} for conv in convs]
+
+     prompt_length, response_length = 1024, 4096
+     pad_token_id = tokenizer.pad_token_id
+     prompts, responses, input_ids, attention_masks = [], [], [], []
+     for conv in convs:
+         prompt_tokens = tokenizer.apply_chat_template(conv[:1], tokenize=True)
+         response_tokens = tokenizer.apply_chat_template(conv, tokenize=True)[len(prompt_tokens) :]
+
+         # Left-pad the prompt, right-pad the response; mask real tokens only.
+         padded_prompt = [pad_token_id] * (prompt_length - len(prompt_tokens)) + prompt_tokens
+         padded_response = response_tokens + [pad_token_id] * (response_length - len(response_tokens))
+         attention_mask = (
+             [0] * (prompt_length - len(prompt_tokens))
+             + [1] * len(prompt_tokens)
+             + [1] * len(response_tokens)
+             + [0] * (response_length - len(response_tokens))
+         )
+         prompts.append(torch.tensor(padded_prompt))
+         responses.append(torch.tensor(padded_response))
+         input_ids.append(torch.tensor(padded_prompt + padded_response))
+         attention_masks.append(torch.tensor(attention_mask))
+
+     prompts = torch.stack(prompts)
+     responses = torch.stack(responses)
+     input_ids = torch.stack(input_ids)
+     attention_masks = torch.stack(attention_masks)
+     position_ids = compute_position_id_with_mask(attention_masks)
+
+     data = DataProto.from_dict(
+         tensors={
+             "prompts": prompts,
+             "responses": responses,
+             "input_ids": input_ids,
+             "attention_mask": attention_masks,
+             "position_ids": position_ids,
+         },
+         non_tensors={
+             "data_source": data_source,
+             "reward_model": reward_info,
+             "raw_prompt": raw_prompt,
+             "extra_info": extra_info,
+         },
+     )
+     return data, convs
+
+
+ def test_reward_model_manager():
+     ray.init(
+         runtime_env={
+             "env_vars": {
+                 "TOKENIZERS_PARALLELISM": "true",
+                 "NCCL_DEBUG": "WARN",
+                 "VLLM_LOGGING_LEVEL": "INFO",
+                 "VLLM_USE_V1": "1",
+             }
+         }
+     )
+     with initialize_config_dir(config_dir=os.path.abspath("verl/trainer/config")):
+         config = compose(config_name="ppo_trainer")
+
+     rollout_model_name = os.path.expanduser("~/models/Qwen/Qwen2.5-0.5B-Instruct")
+     reward_model_name = os.path.expanduser("~/models/Qwen/Qwen2.5-1.5B-Instruct")
+
+     config.actor_rollout_ref.model.path = rollout_model_name
+     config.custom_reward_function.path = "tests/experimental/reward_loop/reward_fn.py"
+     config.custom_reward_function.name = "compute_score_gsm8k"
+     config.reward_model.reward_manager = "dapo"
+     config.reward_model.enable = True
+     config.reward_model.enable_resource_pool = True
+     config.reward_model.n_gpus_per_node = 8
+     config.reward_model.nnodes = 1
+     config.reward_model.model.path = reward_model_name
+     config.reward_model.rollout.name = os.getenv("ROLLOUT_NAME", "vllm")
+     config.reward_model.rollout.gpu_memory_utilization = 0.9
+     config.reward_model.rollout.tensor_model_parallel_size = 2
+     config.reward_model.rollout.skip_tokenizer_init = False
+     config.reward_model.rollout.prompt_length = 2048
+     config.reward_model.rollout.response_length = 4096
+
+     # 1. init reward model manager
+     reward_loop_manager = RewardLoopManager(config)
+
+     # 2. init test data
+     rollout_tokenizer = hf_tokenizer(rollout_model_name)
+     data, convs = create_data_samples(rollout_tokenizer)
+
+     # 3. compute reward model scores
+     outputs = reward_loop_manager.compute_rm_score(data)
+
+     for idx, (conv, output) in enumerate(zip(convs, outputs, strict=True)):
+         print(f"Problem {idx}:\n{conv[0]['content']}\n")
+         print(f"AI Solution {idx}:\n{conv[1]['content']}\n")
+         print(f"GRM Response {idx}:\n{output.non_tensor_batch['genrm_response']}\n")
+         print("=" * 50 + "\n")
+
+     ray.shutdown()
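The batch construction in `create_data_samples` above left-pads each prompt to 1024 tokens, right-pads each response to 4096, and marks only real tokens in the attention mask. A minimal list-based sketch of that layout (pad id 0 and the tiny lengths are illustrative; the test uses the tokenizer's actual pad token and the full 1024/4096 lengths):

```python
def pad_pair(prompt_tokens, response_tokens, prompt_length, response_length, pad_id=0):
    """Left-pad the prompt and right-pad the response, as in create_data_samples:
    [PAD ... prompt][response ... PAD], with the attention mask 1 on real tokens only."""
    padded_prompt = [pad_id] * (prompt_length - len(prompt_tokens)) + prompt_tokens
    padded_response = response_tokens + [pad_id] * (response_length - len(response_tokens))
    attention_mask = (
        [0] * (prompt_length - len(prompt_tokens))
        + [1] * (len(prompt_tokens) + len(response_tokens))
        + [0] * (response_length - len(response_tokens))
    )
    return padded_prompt, padded_response, attention_mask

p, r, m = pad_pair([5, 6, 7], [8, 9], prompt_length=4, response_length=3)
print(p)  # [0, 5, 6, 7]
print(r)  # [8, 9, 0]
print(m)  # [0, 1, 1, 1, 1, 1, 0]
```

Left-padding the prompt keeps the prompt/response boundary at a fixed position across the batch, which is what lets `input_ids` be a simple concatenation of the two padded halves.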
code/RL_model/verl/verl_train/tests/trainer/config/legacy_ppo_megatron_trainer.yaml ADDED
@@ -0,0 +1,471 @@
+ data:
+   tokenizer: null
+   train_files: ~/data/rlhf/gsm8k/train.parquet
+   val_files: ~/data/rlhf/gsm8k/test.parquet
+   train_max_samples: -1  # set to -1 to use the full dataset
+   val_max_samples: -1  # set to -1 to use the full dataset
+   prompt_key: prompt
+   reward_fn_key: data_source
+   max_prompt_length: 512
+   max_response_length: 512
+   train_batch_size: 1024
+   val_batch_size: null  # DEPRECATED: validation datasets are sent to the inference engines as a whole batch, which schedule memory themselves
+   return_raw_input_ids: False  # set to True when the policy and RM tokenizers differ
+   return_raw_chat: True
+   return_full_prompt: False
+   shuffle: True
+   seed: null  # integer seed for data shuffling; if unset or null, shuffling is unseeded and the data order differs on each run
+   filter_overlong_prompts: False  # for large-scale datasets, filtering overlong prompts can be time-consuming; set filter_overlong_prompts_workers to use multiprocessing to speed it up
+   filter_overlong_prompts_workers: 1
+   truncation: error
+   trust_remote_code: False  # main_ppo checks this config to decide whether to trust remote code for the tokenizer
+   custom_cls:
+     path: null
+     name: null
+   sampler:
+     class_path: null
+     class_name: null
+   dataloader_num_workers: 8
+   return_multi_modal_inputs: True
+
+ actor_rollout_ref:
+   hybrid_engine: True
+   nccl_timeout: 600  # seconds; torch defaults to 10 minutes. Increase for long-running operations, e.g. 32B or 72B models under Megatron
+   model:
+     path: ~/models/deepseek-llm-7b-chat
+     custom_chat_template: null
+     external_lib: null
+     override_config:
+       model_config: {}
+       moe_config:
+         freeze_moe_router: False
+     enable_gradient_checkpointing: True
+     gradient_checkpointing_kwargs:
+       ## Activation Checkpointing
+       activations_checkpoint_method: null  # 'uniform', 'block'; not used with 'selective'
+       # 'uniform' divides the total number of transformer layers and checkpoints the input activation of each chunk
+       # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity
+       activations_checkpoint_granularity: null  # 'selective' or 'full'
+       # 'full' checkpoints the entire transformer layer; 'selective' only checkpoints the memory-intensive part of attention
+       activations_checkpoint_num_layers: null  # not used with 'selective'
+     trust_remote_code: False
+   actor:
+     strategy: megatron  # for backward compatibility
+     ppo_mini_batch_size: 256
+     ppo_micro_batch_size: null  # will be deprecated; use ppo_micro_batch_size_per_gpu
+     ppo_micro_batch_size_per_gpu: null
+     use_dynamic_bsz: False
+     ppo_max_token_len_per_gpu: 16384  # n * ${data.max_prompt_length} + ${data.max_response_length}
+     use_torch_compile: True  # False to disable torch compile
+     # pg_losses2 = -advantages * torch.clamp(ratio, 1 - cliprange_low, 1 + cliprange_high)
+     clip_ratio: 0.2  # default value if clip_ratio_low and clip_ratio_high are not specified
+     clip_ratio_low: 0.2
+     clip_ratio_high: 0.2
+     clip_ratio_c: 3.0  # lower bound of the value for dual-clip PPO, from https://arxiv.org/pdf/1912.09729
+     loss_agg_mode: "token-mean"  # or "seq-mean-token-sum" / "seq-mean-token-mean" / "seq-mean-token-sum-norm"
+     # NOTE: "token-mean" is the default behavior
+     loss_scale_factor: null  # scale factor for "seq-mean-token-sum-norm" mode; if null, uses response_length
+     entropy_coeff: 0
+     use_kl_loss: False  # True for GRPO
+     kl_loss_coef: 0.001  # for GRPO
+     kl_loss_type: low_var_kl  # for GRPO
+     ppo_epochs: 1
+     data_loader_seed: 42
+     shuffle: False
+     policy_loss:  # policy loss config
+       loss_mode: "vanilla"  # loss function mode: vanilla / clip-cov / kl-cov / gpg, from https://arxiv.org/abs/2505.22617
+       clip_cov_ratio: 0.0002  # ratio of tokens to be clipped for clip-cov loss
+       clip_cov_lb: 1.0  # lower bound for clip-cov loss
+       clip_cov_ub: 5.0  # upper bound for clip-cov loss
+       kl_cov_ratio: 0.0002  # ratio of tokens the KL penalty is applied to for kl-cov loss
+       ppo_kl_coef: 0.1  # KL divergence penalty coefficient
+     optim:
+       optimizer: adam
+       lr: 1e-6
+       clip_grad: 1.0
+       total_training_steps: -1  # must be overridden by the program
+       lr_warmup_init: 0.0  # initial learning rate for warmup; defaults to 0.0
+       lr_warmup_steps: null  # takes priority; null, 0, or negative values delegate to lr_warmup_steps_ratio
+       lr_warmup_steps_ratio: 0.  # the total steps will be injected at runtime
+       lr_decay_steps: null
+       lr_decay_style: constant  # select from constant/linear/cosine/inverse_square_root
+       min_lr: 0.0  # minimum learning rate; defaults to 0.0
+       weight_decay: 0.01
+       weight_decay_incr_style: constant  # select from constant/linear/cosine
+       lr_wsd_decay_style: exponential  # select from constant/exponential/cosine
+       lr_wsd_decay_steps: null
+       use_checkpoint_opt_param_scheduler: False  # use the checkpoint's optimizer parameter scheduler
+     megatron:
+       param_offload: False
+       grad_offload: False
+       optimizer_offload: False
+       tensor_model_parallel_size: 1
+       expert_model_parallel_size: 1
+       expert_tensor_parallel_size: null
+       pipeline_model_parallel_size: 1
+       virtual_pipeline_model_parallel_size: null  # change the VPP interface for parallelism tests
+       context_parallel_size: 1
+       sequence_parallel: True
+       use_distributed_optimizer: True
+       use_dist_checkpointing: False
+       dist_checkpointing_path: null
+       seed: 42
+       override_transformer_config: {}  # additional transformer config, e.g. num_layers_in_first(/last)_pipeline_stage
+       use_mbridge: True
+       vanilla_mbridge: True
+     profile:  # profile the actor model in `update_policy`
+       use_profile: False  # enable when you want to profile the actor model
+       profile_ranks: null  # list; you can specify the ranks to profile
+       step_start: -1  # start step in update_policy
+       step_end: -1  # end step
+       save_path: null  # path to save the profile result
+     load_weight: True
+     checkpoint:
+       async_save: False  # save checkpoints asynchronously
+       # What to include in saved checkpoints.
+       # With 'hf_model' you can save the whole model in HF format; by default only sharded model checkpoints are saved to save space.
+       save_contents: ['model', 'optimizer', 'extra']
+       # For more flexibility, you can specify the contents to load from the checkpoint.
+       load_contents: ${actor_rollout_ref.actor.checkpoint.save_contents}
+   ref:
+     strategy: ${actor_rollout_ref.actor.strategy}
+     use_torch_compile: ${actor_rollout_ref.actor.use_torch_compile}
+     megatron:
+       param_offload: False
+       tensor_model_parallel_size: 1
+       expert_model_parallel_size: 1
+       expert_tensor_parallel_size: null
+       pipeline_model_parallel_size: 1
+       virtual_pipeline_model_parallel_size: null  # change the VPP interface for parallelism tests
+       context_parallel_size: 1
+       sequence_parallel: True
+       use_distributed_optimizer: True
+       use_dist_checkpointing: False
+       dist_checkpointing_path: null
+       seed: ${actor_rollout_ref.actor.megatron.seed}
+       override_transformer_config: ${actor_rollout_ref.actor.megatron.override_transformer_config}
+       use_mbridge: ${actor_rollout_ref.actor.megatron.use_mbridge}
+       vanilla_mbridge: ${actor_rollout_ref.actor.megatron.vanilla_mbridge}
+     profile:
+       use_profile: False
+       profile_ranks: null
+       step_start: -1
+       step_end: -1
+       save_path: null
+     load_weight: True
+     log_prob_micro_batch_size: null  # will be deprecated; use log_prob_micro_batch_size_per_gpu
+     log_prob_micro_batch_size_per_gpu: null
+     log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
+     log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
+   rollout:
+     name: vllm
+     mode: async  # sync: LLM, async: AsyncLLM
+     temperature: 1.0
+     top_k: -1  # 0 for hf rollout, -1 for vllm rollout
+     top_p: 1
+     prompt_length: ${data.max_prompt_length}  # for xperf_gpt
+     response_length: ${data.max_response_length}
+     # for vllm rollout
+     dtype: bfloat16  # should align with FSDP
+     gpu_memory_utilization: 0.5
+     ignore_eos: False
+     enforce_eager: False
173
+ free_cache_engine: True
174
+ load_format: dummy
175
+ tensor_model_parallel_size: 2
176
+ max_num_batched_tokens: 8192
177
+ max_model_len: null
178
+ max_num_seqs: 1024
179
+ log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
180
+ log_prob_micro_batch_size_per_gpu: null
181
+ log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
182
+ log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
183
+ disable_log_stats: True
184
+ enable_chunked_prefill: True # could get higher throughput
185
+ # for hf rollout
186
+ do_sample: True
187
+ layer_name_map:
188
+ qkv_layer_name: qkv
189
+ gate_proj_layer_name: gate_up
190
+ # number of responses (i.e. num sample times)
191
+ n: 1
192
+ engine_kwargs: # inference engine parameters, please refer vllm/sglang official doc for detail
193
+ vllm: {}
194
+ sglang: {}
195
+ val_kwargs:
196
+ # sampling parameters for validation
197
+ top_k: -1 # 0 for hf rollout, -1 for vllm rollout
198
+ top_p: 1.0
199
+ temperature: 0
200
+ n: 1
201
+ do_sample: False # default eager for validation
202
+
203
+ # Multi-turn interaction config for tools or chat.
204
+ multi_turn:
205
+ # set to True for multi-turn tool interaction tasks; should set rollout.name to sglang as well
206
+ enable: False
207
+
208
+ # null for no limit (default max_length // 3)
209
+ max_assistant_turns: null
210
+
211
+ # null for no tool
212
+ tool_config_path: null
213
+
214
+ # null for no limit (default max_length // 3)
215
+ max_user_turns: null
216
+
217
+ # max parallel call for tools in single turn
218
+ max_parallel_calls: 1
219
+
220
+ # max length of tool response
221
+ max_tool_response_length: 256
222
+
223
+ # truncate side of tool response: left, middle, right
224
+ tool_response_truncate_side: middle
225
+
226
+ # null for no interaction
227
+ interaction_config_path: null
228
+
229
+ # - When set to True, the model's default chat template is used for multi-turn rollout, which typically matches production behavior.
230
+ # - When set to False, the token ids recorded for training are used instead; unlike the default chat template, these always include the model's full output,
231
+ # which may contain additional content such as reasoning content. This maintains the consistency between training and rollout, but it will lead to longer prompts.
232
+ use_inference_chat_template: False
233
+
234
+ # Tokenization is performed turn by turn and the resulting token ids are concatenated to form the full conversation.
235
+ # To ensure this matches the result of tokenizing the entire conversation at once, a sanity check is run at the end of each multi-turn rollout to compare the two sets of token ids.
236
+ # Some models are known to produce different tokenization results when tokenizing turn by turn vs. all at once. aThis behavior has already been validated for them.
237
+ # To reduce excessive warnings, you can turn off the sanity check for these models if you are using their default chat template:
238
+ # Qwen/QwQ-32B, Qwen/Qwen3-xxB
239
+ # - disable: disable tokenization sanity check
240
+ # - strict: enable strict tokenization sanity check (default)
241
+ # - ignore_strippable: ignore strippable tokens when checking tokenization sanity
242
+ tokenization_sanity_check_mode: strict
243
+
244
+ # Format of the multi-turn interaction. Options: hermes, llama3_json, ...
245
+ format: hermes
246
+
247
+ # [Experimental] agent loop based rollout configs
248
+ agent:
249
+
250
+ # Number of agent loop workers
251
+ num_workers: 8
252
+
253
+ custom_async_server:
254
+ path: null
255
+ name: null
256
+
257
+ # support logging rollout prob for debugging purpose
258
+ calculate_log_probs: False
259
+ # Nsight system profiler configs
260
+ profiler:
261
+ # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
262
+ _target_: verl.utils.profiler.ProfilerConfig
263
+ discrete: False
264
+ all_ranks: False
265
+ ranks: []
266
+
267
+ critic:
268
+ rollout_n: ${actor_rollout_ref.rollout.n}
269
+ strategy: ${actor_rollout_ref.actor.strategy}
270
+ nccl_timeout: 600 # seconds, default is 10 minutes for torch, you can set it to a larger value if you have long-running operations like 32B or 72B model using megatron
271
+ optim:
272
+ optimizer: adam
273
+ lr: 1e-6
274
+ clip_grad: 1.0
275
+ total_training_steps: -1 # must be override by program
276
+ lr_warmup_init: 0.0 # initial learning rate for warmup, default to 0.0
277
+ lr_warmup_steps: null # Prioritized. None, 0 or Negative values mean delegating to lr_warmup_steps_ratio.
278
+ lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
279
+ lr_decay_steps: null
280
+ lr_decay_style: constant # select from constant/linear/cosine/inverse_square_root
281
+ min_lr: 0.0 # minimum learning rate, default to 0.0
282
+ weight_decay: 0.01
283
+ weight_decay_incr_style: constant # select from constant/linear/cosine
284
+ lr_wsd_decay_style: exponential # select from constant/exponential/cosine
285
+ lr_wsd_decay_steps: null
286
+ use_checkpoint_opt_param_scheduler: False # use checkpoint optimizer parameter scheduler
287
+ model:
288
+ path: ~/models/deepseek-llm-7b-chat
289
+ tokenizer_path: ${actor_rollout_ref.model.path}
290
+ override_config:
291
+ model_config: {}
292
+ moe_config:
293
+ freeze_moe_router: False
294
+ external_lib: ${actor_rollout_ref.model.external_lib}
295
+ trust_remote_code: False
296
+ enable_gradient_checkpointing: True
297
+ gradient_checkpointing_kwargs:
298
+ ## Activation Checkpointing
299
+ activations_checkpoint_method: null
300
+ activations_checkpoint_granularity: null
301
+ activations_checkpoint_num_layers: null
302
+ megatron:
303
+ param_offload: False
304
+ grad_offload: False
305
+ optimizer_offload: False
306
+ tensor_model_parallel_size: 1
307
+ expert_model_parallel_size: 1
308
+ expert_tensor_parallel_size: null
309
+ pipeline_model_parallel_size: 1
310
+ virtual_pipeline_model_parallel_size: null # change VPP interface for parallelism tests
311
+ context_parallel_size: 1
312
+ sequence_parallel: True
313
+ use_distributed_optimizer: True
314
+ use_dist_checkpointing: False
315
+ dist_checkpointing_path: null
316
+ seed: ${actor_rollout_ref.actor.megatron.seed}
317
+ override_transformer_config: ${actor_rollout_ref.actor.megatron.override_transformer_config}
318
+ use_mbridge: ${actor_rollout_ref.actor.megatron.use_mbridge}
319
+ vanilla_mbridge: ${actor_rollout_ref.actor.megatron.vanilla_mbridge}
320
+ load_weight: True
321
+ ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
322
+ ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
323
+ ppo_micro_batch_size_per_gpu: null
324
+ use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
325
+ ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2
326
+ forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu}
327
+ ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs}
328
+ data_loader_seed: ${actor_rollout_ref.actor.data_loader_seed}
329
+ shuffle: ${actor_rollout_ref.actor.shuffle}
330
+ cliprange_value: 0.5
331
+ loss_agg_mode: ${actor_rollout_ref.actor.loss_agg_mode}
332
+ checkpoint:
333
+ async_save: False # save checkpoint asynchronously
334
+ # What to include in saved checkpoints
335
+ # with 'hf_model' you can save whole model as hf format, now only use sharded model checkpoint to save space
336
+ save_contents: ['model', 'optimizer', 'extra']
337
+ load_contents: ${critic.checkpoint.save_contents}
338
+ # Nsight system profiler configs
339
+ profiler:
340
+ # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
341
+ _target_: verl.utils.profiler.ProfilerConfig
342
+ discrete: False
343
+ all_ranks: False
344
+ ranks: []
345
+ reward_model:
346
+ enable: False
347
+ strategy: ${actor_rollout_ref.actor.strategy}
348
+ nccl_timeout: 600 # seconds, default is 10 minutes for torch, you can set it to a larger value if you have long-running operations like 32B or 72B model using megatron
349
+ megatron:
350
+ param_offload: False
351
+ tensor_model_parallel_size: 1
352
+ expert_model_parallel_size: 1
353
+ expert_tensor_parallel_size: null
354
+ pipeline_model_parallel_size: 1
355
+ virtual_pipeline_model_parallel_size: null # change VPP interface for parallelism tests
356
+ context_parallel_size: 1
357
+ sequence_parallel: True
358
+ use_distributed_optimizer: False
359
+ use_dist_checkpointing: False
360
+ dist_checkpointing_path: null
361
+ seed: ${actor_rollout_ref.actor.megatron.seed}
362
+ override_transformer_config: {}
363
+ use_mbridge: ${actor_rollout_ref.actor.megatron.use_mbridge}
364
+ vanilla_mbridge: ${actor_rollout_ref.actor.megatron.vanilla_mbridge}
365
+ model:
366
+ input_tokenizer: ${actor_rollout_ref.model.path} # set this to null if the chat template is identical
367
+ path: ~/models/FsfairX-LLaMA3-RM-v0.1
368
+ trust_remote_code: False
369
+ external_lib: ${actor_rollout_ref.model.external_lib}
370
+ load_weight: True
371
+ micro_batch_size: null # will be deprecated, use micro_batch_size_per_gpu
372
+ micro_batch_size_per_gpu: null
373
+ use_dynamic_bsz: ${critic.use_dynamic_bsz}
374
+ forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
375
+ max_length: null
376
+ reward_manager: naive
377
+ launch_reward_fn_async: False # custom reward function executed async on CPU, during log_prob
378
+ sandbox_fusion:
379
+ url: null # faas url to run code in cloud sandbox
380
+ max_concurrent: 64 # max concurrent requests to sandbox
381
+ memory_limit_mb: 1024 # Max memory limit for each sandbox process in MB
382
+ # Nsight system profiler configs
383
+ profiler:
384
+ # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
385
+ _target_: verl.utils.profiler.ProfilerConfig
386
+ discrete: False
387
+ all_ranks: False
388
+ ranks: []
389
+
390
+ custom_reward_function:
391
+ path: null
392
+ name: compute_score
393
+
394
+ algorithm:
395
+ # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
396
+ _target_: verl.trainer.config.AlgoConfig
397
+ gamma: 1.0
398
+ lam: 1.0
399
+ adv_estimator: gae
400
+ norm_adv_by_std_in_grpo: True
401
+ use_kl_in_reward: False
402
+ kl_penalty: kl # how to estimate kl divergence
403
+ kl_ctrl:
404
+ # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
405
+ _target_: verl.trainer.config.KLControlConfig
406
+ type: fixed
407
+ kl_coef: 0.001
408
+ horizon: 10000
409
+ target_kl: 0.1
410
+ use_pf_ppo: False
411
+ pf_ppo:
412
+ reweight_method: pow # ["pow", "max_min", "max_random"]
413
+ weight_pow: 2.0
414
+
415
+ trainer:
416
+ balance_batch: True
417
+ total_epochs: 30
418
+ total_training_steps: null
419
+ profile_steps: null # [1,2,5] or [] or null
420
+ project_name: verl_examples
421
+ experiment_name: gsm8k
422
+ logger: ['console', 'wandb']
423
+ log_val_generations: 0
424
+ nnodes: 1
425
+ n_gpus_per_node: 8
426
+ save_freq: -1
427
+ esi_redundant_time: 0
428
+
429
+ # auto: find the last ckpt to resume. If can't find, start from scratch
430
+ resume_mode: auto # or disable or resume_path if resume_from_path is set
431
+ resume_from_path: null
432
+ del_local_ckpt_after_load: False
433
+ val_before_train: True
434
+ test_freq: -1
435
+ critic_warmup: 0
436
+ default_hdfs_dir: null
437
+ default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
438
+ max_actor_ckpt_to_keep: null
439
+ max_critic_ckpt_to_keep: null
440
+ # The timeout for ray worker group to wait for the register center to be ready
441
+ ray_wait_register_center_timeout: 300
442
+ device: cuda
443
+ # see ppo_trainer.yaml for more details
444
+ controller_nsight_options:
445
+ trace: "cuda,nvtx,cublas,ucx"
446
+ cuda-memory-usage: "true"
447
+ cuda-graph-trace: "graph"
448
+ worker_nsight_options:
449
+ trace: "cuda,nvtx,cublas,ucx"
450
+ cuda-memory-usage: "true"
451
+ cuda-graph-trace: "graph"
452
+ capture-range: "cudaProfilerApi"
453
+ capture-range-end: null
454
+ kill: none
455
+ npu_profile:
456
+ options:
457
+ save_path: ./profiler_data
458
+ roles: ["all"]
459
+ level: level0
460
+ with_memory: False
461
+ record_shapes: False
462
+ with_npu: True
463
+ with_cpu: True
464
+ with_module: False
465
+ with_stack: False
466
+ analysis: True
467
+
468
+ ray_kwargs:
469
+ ray_init:
470
+ num_cpus: null # `None` means using all CPUs, which might cause hang if limited in systems like SLURM. Please set to a number allowed then.
471
+ timeline_json_file: null
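The config above leans heavily on OmegaConf-style `${a.b.c}` interpolations, e.g. `load_contents: ${actor_rollout_ref.actor.checkpoint.save_contents}`, which resolve to the value at the dotted path when the config is read. Hydra/OmegaConf implement this natively; the sketch below is only an illustrative, stdlib-only model of how a plain dotted-path reference resolves (it does not cover nested or string-embedded interpolations, and `resolve`/`_lookup` are hypothetical helper names, not verl or OmegaConf APIs):

```python
# Minimal sketch of OmegaConf-style "${a.b.c}" interpolation resolution.
# Illustration only: handles whole-string dotted references, nothing more.
import re

_INTERP = re.compile(r"^\$\{([\w.]+)\}$")

def _lookup(cfg: dict, dotted: str):
    """Walk a nested dict following a dotted path like 'a.b.c'."""
    node = cfg
    for part in dotted.split("."):
        node = node[part]
    return node

def resolve(cfg: dict, value):
    """Follow ${a.b.c} references until a concrete value is reached."""
    while isinstance(value, str) and (m := _INTERP.match(value)):
        value = _lookup(cfg, m.group(1))
    return value

cfg = {
    "actor_rollout_ref": {
        "actor": {"checkpoint": {"save_contents": ["model", "optimizer", "extra"]}},
    },
    "critic": {
        "checkpoint": {
            "load_contents": "${actor_rollout_ref.actor.checkpoint.save_contents}",
        },
    },
}

# The critic's load_contents resolves to the actor's save_contents list.
print(resolve(cfg, cfg["critic"]["checkpoint"]["load_contents"]))
# → ['model', 'optimizer', 'extra']
```

This is why changing `actor_rollout_ref.actor.checkpoint.save_contents` in one place automatically propagates to every field that interpolates it.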
code/RL_model/verl/verl_train/tests/trainer/config/legacy_ppo_trainer.yaml ADDED
@@ -0,0 +1,1126 @@
1
+ # Format checks enforced on CI:
2
+ # 1. Comments must appear above each field.
3
+ # 2. There must be a blank line between each field.
4
+ # 3. Inline comments (after a field on the same line) are not allowed.
5
+ # 4. Indentation level is respected for nested fields.
6
+
7
+ # dataset config
8
+ data:
9
+
10
+ # Tokenizer class or path. If null, it will be inferred from the model.
11
+ tokenizer: null
12
+
13
+ # Whether to use shared memory for data loading.
14
+ use_shm: False
15
+
16
+ # Training set parquet. Can be a list or a single file.
17
+ # The program will read all files into memory, so it can't be too large (< 100GB).
18
+ # The path can be either a local path or an HDFS path.
19
+ # For HDFS path, we provide utils to download it to DRAM and convert it to a local path.
20
+ train_files: ~/data/rlhf/gsm8k/train.parquet
21
+
22
+ # Validation parquet. Can be a list or a single file.
23
+ val_files: ~/data/rlhf/gsm8k/test.parquet
24
+
25
+ # Maximum sample length to be used.
26
+ # Set to -1 to use full dataset, otherwise, randomly
27
+ # select the specified number of samples from train dataset
28
+ train_max_samples: -1
29
+
30
+ # Maximum sample length to be used.
31
+ # Set to -1 to use full dataset, otherwise, randomly
32
+ # select the specified number of samples from val dataset
33
+ val_max_samples: -1
34
+
35
+ # The field in the dataset where the prompt is located. Default is 'prompt'.
36
+ prompt_key: prompt
37
+
38
+ # The field used to select the reward function (if using different ones per example).
39
+ reward_fn_key: data_source
40
+
41
+ # Maximum prompt length. All prompts will be left-padded to this length.
42
+ # An error will be reported if the length is too long.
43
+ max_prompt_length: 512
44
+
45
+ # Maximum response length. Rollout in RL algorithms (e.g. PPO) generates up to this length.
46
+ max_response_length: 512
47
+
48
+ # Batch size sampled for one training iteration of different RL algorithms.
49
+ train_batch_size: 1024
50
+
51
+ # Batch size used during validation. Can be null.
52
+ val_batch_size: null
53
+
54
+ # Whether to return the original input_ids without adding chat template.
55
+ # This is used when the reward model's chat template differs from the policy.
56
+ # If using a model-based RM with different templates, this should be True.
57
+ return_raw_input_ids: False
58
+
59
+ # Whether to return the original chat (prompt) without applying chat template.
60
+ return_raw_chat: True
61
+
62
+ # Whether to return the full prompt with chat template.
63
+ return_full_prompt: False
64
+
65
+ # Whether to shuffle the data in the dataloader.
66
+ shuffle: True
67
+
68
+ # An integer seed to use when shuffling the data. If not set or set to
69
+ # `null`, the data shuffling will not be seeded, resulting in a different data order on each run.
70
+ seed: null
71
+
72
+ # num dataloader workers
73
+ dataloader_num_workers: 8
74
+
75
+ # Whether to shuffle the validation set.
76
+ validation_shuffle: False
77
+
78
+ # Whether to filter overlong prompts.
79
+ filter_overlong_prompts: False
80
+
81
+ # Number of workers for filtering overlong prompts.
82
+ # For large-scale datasets, filtering can be time-consuming.
83
+ # Use multiprocessing to speed up. Default is 1.
84
+ filter_overlong_prompts_workers: 1
85
+
86
+ # Truncate the input_ids or prompt if they exceed max_prompt_length.
87
+ # Options: 'error', 'left', or 'right'. Default is 'error'.
88
+ truncation: error
89
+
90
+ # The field in the multi-modal dataset where the image is located. Default is 'images'.
91
+ image_key: images
92
+
93
+ # The field in the multi-modal dataset where the video is located.
94
+ video_key: videos
95
+
96
+ # If the remote tokenizer has a Python file, this flag determines whether to allow using it.
97
+ trust_remote_code: False
98
+
99
+ # Optional: specify a custom dataset class path and name if overriding default loading behavior.
100
+ custom_cls:
101
+
102
+ # The path to the file containing your customized dataset class. If not specified, pre-implemented dataset will be used.
103
+ path: null
104
+
105
+ # The name of the dataset class within the specified file.
106
+ name: null
107
+
108
+ # Whether to return multi-modal inputs in the dataset. Set to False if rollout generates new multi-modal inputs.
109
+ return_multi_modal_inputs: True
110
+
111
+ # Data generation configuration for augmenting the dataset.
112
+ datagen:
113
+
114
+ # The path to the file containing your customized data generation class.
115
+ # E.g. 'pkg://verl.experimental.dynamic_dataset.dynamicgen_dataset'
116
+ path: null
117
+
118
+ # The class name of the data generation class within the specified file.
119
+ # E.g. 'MockDataGenerator'
120
+ name: null
121
+
122
+ # settings related to data sampler
123
+ sampler:
124
+
125
+ # the path to the module containing a curriculum class which implements the
126
+ # AbstractSampler interface
127
+ class_path: null
128
+
129
+ # the name of the curriculum class like `MySampler`
130
+ class_name: null
131
+
132
+ # Additional kwargs when calling tokenizer.apply_chat_template
133
+ apply_chat_template_kwargs: {}
134
+
135
+ # config for actor, rollout and reference model
136
+ actor_rollout_ref:
137
+
138
+ # Whether it's a hybrid engine, currently only supports hybrid engine
139
+ hybrid_engine: true
140
+
141
+ # common configs for the model
142
+ model:
143
+
144
+ _target_: verl.workers.config.HFModelConfig
145
+
146
+ # Huggingface model path. This can be either local path or HDFS path.
147
+ path: ~/models/deepseek-llm-7b-chat
148
+
149
+ # Custom chat template for the model.
150
+ custom_chat_template: null
151
+
152
+ # Whether to use shared memory (SHM) for accelerating the loading of model weights
153
+ use_shm: false
154
+
155
+ # Additional Python packages to register huggingface models/tokenizers.
156
+ external_lib: null
157
+
158
+ # Used to override model's original configurations, mainly dropout
159
+ override_config: {}
160
+
161
+ # Enable gradient checkpointing for actor
162
+ enable_gradient_checkpointing: true
163
+
164
+ # Enable activation offloading for actor
165
+ enable_activation_offload: false
166
+
167
+ # Whether to remove padding tokens in inputs during training
168
+ use_remove_padding: true
169
+
170
+ # Set to positive value to enable LoRA (e.g., 32)
171
+ lora_rank: 0
172
+
173
+ # LoRA scaling factor
174
+ lora_alpha: 16
175
+
176
+ # Target modules to apply LoRA. Options: "all-linear" (not recommended for VLMs) or
177
+ # [q_proj,k_proj,v_proj,o_proj,gate_proj,up_proj,down_proj]
178
+ target_modules: all-linear
179
+
180
+ # Exclude modules from applying Lora. Similar usage to target_modules and Peft.
181
+ # Example: '.*visual.*' for excluding the ViT in Qwen2.5-VL, as currently vllm does not support ViT Lora.
182
+ exclude_modules: null
183
+
184
+ # Whether to use Liger for linear layer fusion
185
+ use_liger: false
186
+
187
+ # Whether to use custom fused kernels (e.g., FlashAttention, fused MLP)
188
+ use_fused_kernels: false
189
+
190
+ # Options for fused kernels. If use_fused_kernels is true, this will be used.
191
+ fused_kernel_options:
192
+
193
+ # Implementation backend for fused kernels. Options: "triton" or "torch".
194
+ impl_backend: torch
195
+
196
+ # Whether to enable loading a remote code model
197
+ trust_remote_code: false
198
+
199
+ # actor configs
200
+ actor:
201
+
202
+ # fsdp, fsdp2 or megatron. fsdp backend used here.
203
+ strategy: fsdp
204
+
205
+ # Split each sample into sub-batches of this size for PPO
206
+ ppo_mini_batch_size: 256
207
+
208
+ # [Deprecated] Global micro batch size
209
+ ppo_micro_batch_size: null
210
+
211
+ # Local per-GPU micro batch size
212
+ ppo_micro_batch_size_per_gpu: null
213
+
214
+ # Whether to automatically adjust batch size at runtime
215
+ use_dynamic_bsz: false
216
+
217
+ # Max tokens per GPU in one PPO batch; affects gradient accumulation
218
+ # Typically it should be: n * ${data.max_prompt_length} + ${data.max_response_length}
219
+ ppo_max_token_len_per_gpu: 16384
220
+
221
+ # Gradient clipping for actor updates
222
+ grad_clip: 1.0
223
+
224
+ # PPO clip ratio
225
+ clip_ratio: 0.2
226
+
227
+ # Lower bound for asymmetric clipping (used in dual-clip PPO)
228
+ clip_ratio_low: 0.2
229
+
230
+ # Upper bound for asymmetric clipping (used in dual-clip PPO)
231
+ clip_ratio_high: 0.2
232
+
233
+ # policy loss config
234
+ policy_loss:
235
+
236
+ # Loss function mode: vanilla / clip-cov / kl-cov /gpg from https://arxiv.org/abs/2505.22617
237
+ loss_mode: "vanilla"
238
+
239
+ # Ratio of tokens to be clipped for clip-cov loss
240
+ clip_cov_ratio: 0.0002
241
+
242
+ # Lower bound for clip-cov loss
243
+ clip_cov_lb: 1.0
244
+
245
+ # Upper bound for clip-cov loss
246
+ clip_cov_ub: 5.0
247
+
248
+ # Ratio of tokens to be applied kl penalty for kl-cov loss
249
+ kl_cov_ratio: 0.0002
250
+
251
+ # KL divergence penalty coefficient
252
+ ppo_kl_coef: 0.1
253
+
254
+ # Constant C in Dual-clip PPO; clips when advantage < 0 and ratio > C
255
+ clip_ratio_c: 3.0
256
+
257
+ # Loss aggregation mode: "token-mean", "seq-mean-token-sum", "seq-mean-token-mean", or "seq-mean-token-sum-norm"
258
+ loss_agg_mode: token-mean
259
+
260
+ # Scale factor for "seq-mean-token-sum-norm" loss aggregation mode.
261
+ # If null, uses response_length. Set to a constant to ensure consistent normalization.
262
+ loss_scale_factor: null
263
+
264
+ # Entropy regularization coefficient in PPO loss
265
+ entropy_coeff: 0
266
+
267
+ # Whether to use KL loss instead of KL reward penalty. True for GRPO
268
+ use_kl_loss: false
269
+
270
+ # Whether to use torch.compile()
271
+ use_torch_compile: true
272
+
273
+ # KL loss coefficient when use_kl_loss is enabled. For GRPO
274
+ kl_loss_coef: 0.001
275
+
276
+ # Type of KL divergence loss. Options: "kl"(k1), "abs", "mse"(k2), "low_var_kl"(k3), "full"
277
+ kl_loss_type: low_var_kl
278
+
279
+ # Number of PPO epochs per batch
280
+ ppo_epochs: 1
281
+
282
+ # Shuffle training data across PPO epochs
283
+ shuffle: false
284
+
285
+ # Sequence parallelism size for Ulysses-style model parallelism
286
+ ulysses_sequence_parallel_size: 1
287
+
288
+ # calculate entropy with chunking to reduce memory peak
289
+ entropy_from_logits_with_chunking: False
290
+
291
+ # recompute entropy
292
+ entropy_checkpointing: False
293
+
294
+ # checkpoint configs
295
+ checkpoint:
296
+
297
+ # What to include in saved checkpoints
298
+ # with 'hf_model' you can save whole model as hf format, now only use sharded model checkpoint to save space
299
+ save_contents: ['model', 'optimizer', 'extra']
300
+
301
+ # For more flexibility, you can specify the contents to load from the checkpoint.
302
+ load_contents: ${actor_rollout_ref.actor.checkpoint.save_contents}
303
+
304
+ # optimizer configs
305
+ optim:
306
+
307
+ # Learning rate
308
+ lr: 1e-6
309
+
310
+ # Warmup steps; negative value delegates to lr_warmup_steps_ratio
311
+ lr_warmup_steps: -1
312
+
313
+ # Warmup steps ratio (used if lr_warmup_steps is negative)
314
+ lr_warmup_steps_ratio: 0.0
315
+
316
+ # Minimum LR ratio for cosine schedule
317
+ min_lr_ratio: 0.0
318
+
319
+ # Number of cosine cycles in LR schedule
320
+ num_cycles: 0.5
321
+
322
+ # LR scheduler type: "constant" or "cosine"
323
+ lr_scheduler_type: constant
324
+
325
+ # Total training steps (must be overridden at runtime)
326
+ total_training_steps: -1
327
+
328
+ # Weight decay
329
+ weight_decay: 0.01
330
+
331
+ # configs for FSDP
332
+ fsdp_config:
333
+
334
+ # policy for wrapping the model
335
+ wrap_policy:
336
+
337
+ # Minimum number of parameters to trigger wrapping a layer with FSDP
338
+ min_num_params: 0
339
+
340
+ # Whether to offload model parameters to CPU (trades speed for memory)
341
+ param_offload: false
342
+
343
+ # Whether to offload optimizer state to CPU
344
+ optimizer_offload: false
345
+
346
+ # Only for FSDP2: offload param/grad/optimizer during train
347
+ offload_policy: false
348
+
349
+ # Only for FSDP2: Reshard after forward pass to reduce memory footprint
350
+ reshard_after_forward: true
351
+
352
+ # Number of GPUs in each FSDP shard group; -1 means auto
353
+ fsdp_size: -1
354
+
355
+ # Only for FSDP1: FSDP1 configuration, prefetch the next forward-pass all-gather
356
+ # before the current forward computation.
357
+ forward_prefetch: False
358
+
359
+ # Reference model config.
360
+ # Reference model will be enabled when actor.use_kl_loss or/and algorithm.use_kl_in_reward is/are True.
361
+ ref:
362
+
363
+ # actor_rollout_ref.ref: FSDP config same as actor. For models larger than 7B, it’s recommended to turn on offload for ref by default
364
+ strategy: ${actor_rollout_ref.actor.strategy}
365
+
366
+ # config for FSDP strategy
367
+ fsdp_config:
368
+
369
+ # whether to offload parameters in FSDP
370
+ param_offload: False
371
+
372
+ # whether to perform reshard after model forward to save memory.
373
+ # only for fsdp2, [True, False, int between 1 and fsdp_size]
374
+ reshard_after_forward: True
375
+
376
+ # Only for FSDP1: FSDP1 configuration, prefetch the next forward-pass all-gather
377
+ # before the current forward computation.
378
+ forward_prefetch: False
379
+
380
+ # the wrap policy for FSDP model
381
+ wrap_policy:
382
+
383
+ # minimum number of params in a wrapped module
384
+ min_num_params: 0
385
+
386
+ # whether to enable torch.compile
387
+ use_torch_compile: ${actor_rollout_ref.actor.use_torch_compile}
388
+
389
+ # [Will be deprecated, use log_prob_micro_batch_size_per_gpu]
390
+ # The batch size for one forward pass in the computation of log_prob. Global batch size.
391
+ log_prob_micro_batch_size: null
392
+
393
+ # The batch size for one forward pass in the computation of log_prob. Local batch size per GPU.
394
+ log_prob_micro_batch_size_per_gpu: null
395
+
396
+ # enable dynamic batch size (sequence packing) for log_prob computation
397
+ log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
398
+
399
+ # the max token length per GPU
400
+ log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
401
+
402
+ # sequence parallel size
403
+ ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size}
404
+
405
+ # calculate entropy with chunking to reduce memory peak
406
+ entropy_from_logits_with_chunking: False
407
+
+ # recompute entropy
+ entropy_checkpointing: False
+
+ # Rollout model config.
+ rollout:
+
+ # actor_rollout_ref.rollout.name: hf/vllm/sglang.
+ name: vllm
+
+ # sync: LLM, async: AsyncLLM
+ mode: async
+
+ # Sampling temperature for rollout.
+ temperature: 1.0
+
+ # Top-k sampling parameter. -1 for vLLM rollout, 0 for HF rollout.
+ top_k: -1
+
+ # Top-p sampling parameter. Default 1.0.
+ top_p: 1
+
+ # typically the same as data max prompt length
+ prompt_length: ${data.max_prompt_length}
+
+ # typically the same as data max response length
+ response_length: ${data.max_response_length}
+
+ # for vllm rollout
+ # Rollout model parameters type. Align with actor model's FSDP/Megatron type.
+ dtype: bfloat16
+
+ # Fraction of GPU memory used by vLLM/SGLang for KV cache.
+ gpu_memory_utilization: 0.5
+
+ # Whether to ignore EOS and continue generating after EOS is hit.
+ ignore_eos: False
+
+ # Whether to disable CUDA graph. Default True to allow cache freeing.
+ enforce_eager: False
+
+ # Whether to free engine KVCache after generation. Set enforce_eager=True when enabled.
+ free_cache_engine: True
+
+ # Which loader to use for rollout model weights: dummy_dtensor, hf, megatron, etc.
+ # safetensors (for huge models; also set use_shm=True); dummy_dtensor: randomly init model weights
+ load_format: dummy
+
+ # for huge models, layered summon can save memory (prevent OOM) but makes it slower
+ layered_summon: False
+
+ # TP size for rollout. Only effective for vLLM.
+ tensor_model_parallel_size: 2
+
+ # max number of tokens in a batch
+ max_num_batched_tokens: 8192
+
+ # max length for rollout
+ max_model_len: null
+
+ # max number of sequences
+ max_num_seqs: 1024
+
+ # [Will be deprecated, use log_prob_micro_batch_size_per_gpu] The batch size for one forward pass in the computation of log_prob. Global batch size.
+ log_prob_micro_batch_size: null
+
+ # The batch size for one forward pass in the computation of log_prob. Local batch size per GPU.
+ log_prob_micro_batch_size_per_gpu: null
+
+ # enable dynamic batch size (sequence packing) for log_prob computation
+ log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
+
+ # max token length for log_prob computation
+ log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
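The dynamic-batch-size options above pack variable-length sequences into micro-batches by total token count rather than by sequence count, so each forward pass stays under `log_prob_max_token_len_per_gpu`. A minimal sketch of the idea (greedy, order-preserving first-fit; verl's actual packing logic is more involved and the function name is illustrative):

```python
def pack_by_token_budget(seq_lens, max_tokens_per_batch):
    """Greedily group sequence indices into micro-batches whose summed
    length stays under the token budget. A single sequence longer than
    the budget still gets its own batch."""
    batches, current, current_tokens = [], [], 0
    for idx, n in enumerate(seq_lens):
        if current and current_tokens + n > max_tokens_per_batch:
            batches.append(current)
            current, current_tokens = [], 0
        current.append(idx)
        current_tokens += n
    if current:
        batches.append(current)
    return batches

# budget 8: indices [0, 1] fit (5+3), then [2, 3] (6+2), then [4]
assert pack_by_token_budget([5, 3, 6, 2, 4], 8) == [[0, 1], [2, 3], [4]]
```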
+
+ # disable logging statistics
+ disable_log_stats: True
+
+ # may get higher throughput when set to True. When activated, please increase max_num_batched_tokens or decrease max_model_len.
+ enable_chunked_prefill: True
+
+ # for hf rollout
+ # Whether to sample during training rollout. False uses greedy sampling.
+ do_sample: True
+
+ # number of responses (i.e. num sample times). > 1 for grpo
+ n: 1
+
+ # Whether to wake up the inference engine in multiple stages to reduce peak memory during the training-rollout transition.
+ multi_stage_wake_up: false
+
+ # Extra inference engine arguments; please refer to the vllm/sglang official docs for details
+ engine_kwargs:
+
+ # vllm engine config
+ vllm: {}
+
+ # sglang engine config
+ sglang: {}
+
+ # Sampling parameters used during validation.
+ val_kwargs:
+
+ # sampling parameters for validation
+ # Top-k sampling parameter. -1 for vLLM rollout, 0 for HF rollout.
+ top_k: -1
+
+ # Top-p sampling parameter. Default 1.0.
+ top_p: 1.0
+
+ # Sampling temperature for rollout.
+ temperature: 0
+
+ # whether to repeat n times for validation
+ n: 1
+
+ # Whether to sample during training rollout. False uses greedy sampling.
+ do_sample: False
+
+ # Multi-turn interaction config for tools or chat.
+ multi_turn:
+
+ # set to True for multi-turn tool interaction tasks; should set rollout.name to sglang as well
+ enable: False
+
+ # null for no limit (default max_length // 3)
+ max_assistant_turns: null
+
+ # null for no tool
+ tool_config_path: null
+
+ # null for no limit (default max_length // 3)
+ max_user_turns: null
+
+ # max parallel calls for tools in a single turn
+ max_parallel_calls: 1
+
+ # max length of tool response
+ max_tool_response_length: 256
+
+ # truncate side of tool response: left, middle, right
+ tool_response_truncate_side: middle
+
+ # null for no interaction
+ interaction_config_path: null
+
+ # - When set to True, the model's default chat template is used for multi-turn rollout, which typically matches production behavior.
+ # - When set to False, the token ids recorded for training are used instead; unlike the default chat template, these always include the model's full output,
+ #   which may contain additional content such as reasoning content. This maintains the consistency between training and rollout, but it will lead to longer prompts.
+ use_inference_chat_template: False
+
+ # Tokenization is performed turn by turn and the resulting token ids are concatenated to form the full conversation.
+ # To ensure this matches the result of tokenizing the entire conversation at once, a sanity check is run at the end of each multi-turn rollout to compare the two sets of token ids.
+ # Some models are known to produce different tokenization results when tokenizing turn by turn vs. all at once. This behavior has already been validated for them.
+ # To reduce excessive warnings, you can turn off the sanity check for these models if you are using their default chat template:
+ # Qwen/QwQ-32B, Qwen/Qwen3-xxB
+ # - disable: disable tokenization sanity check
+ # - strict: enable strict tokenization sanity check (default)
+ # - ignore_strippable: ignore strippable tokens when checking tokenization sanity
+ tokenization_sanity_check_mode: strict
+
+ # Format of the multi-turn interaction. Options: hermes, llama3_json, ...
+ format: hermes
+
+ # support logging rollout prob for debugging purposes
+ calculate_log_probs: False
+
+ # [Experimental] agent loop based rollout configs
+ agent:
+
+ # Number of agent loop workers
+ num_workers: 8
+
+ # custom async server configs
+ custom_async_server:
+
+ # Path to the custom async server implementation
+ path: null
+
+ # Class name of the custom async server class (e.g. AsyncvLLMServer)
+ name: null
+
+ # profiler configs
+ profiler:
+
+ # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
+ _target_: verl.utils.profiler.ProfilerConfig
+
+ # If True, each task has its own database; if False, all tasks in one training step share one database.
+ discrete: False
+
+ # Whether to profile all ranks.
+ all_ranks: False
+
+ # The ranks that will be profiled. [] or [0,1,...]
+ ranks: []
+
+ # configs for the critic
+ critic:
+
+ # Number of rollouts per update (mirrors actor rollout_n)
+ rollout_n: ${actor_rollout_ref.rollout.n}
+
+ # fsdp or fsdp2 strategy used for critic model training
+ strategy: ${actor_rollout_ref.actor.strategy}
+
+ # optimizer configs
+ optim:
+
+ # Learning rate
+ lr: 1e-5
+
+ # Warmup steps ratio; total steps will be injected at runtime
+ lr_warmup_steps_ratio: 0.
+
+ # Minimum LR ratio for cosine schedule
+ min_lr_ratio: 0.0
+
+ # LR scheduler type: "constant" or "cosine"
+ lr_scheduler_type: constant
+
+ # Total training steps (must be overridden at runtime)
+ total_training_steps: -1
+
+ # Weight decay
+ weight_decay: 0.01
+
+ # model config for the critic
+ model:
+
+ # Path to pretrained model weights
+ path: ~/models/deepseek-llm-7b-chat
+
+ # Whether to use shared memory for loading the model
+ use_shm: False
+
+ # Tokenizer path (defaults to actor's model path)
+ tokenizer_path: ${actor_rollout_ref.model.path}
+
+ # Hugging Face config override
+ override_config: { }
+
+ # External model implementation (optional)
+ external_lib: ${actor_rollout_ref.model.external_lib}
+
+ # Enable gradient checkpointing to save memory
+ enable_gradient_checkpointing: True
+
+ # Offload activations to CPU to reduce GPU memory usage
+ enable_activation_offload: False
+
+ # Use remove padding optimization (saves compute)
+ use_remove_padding: False
+
+ # Whether to trust remote code from Hugging Face models
+ trust_remote_code: ${actor_rollout_ref.model.trust_remote_code}
+
+ # FSDP-specific config
+ fsdp_config:
+
+ # Whether to offload model parameters to CPU
+ param_offload: False
+
+ # Whether to offload optimizer state to CPU
+ optimizer_offload: False
+
+ # Only for FSDP2: offload param/grad/optimizer during train
+ offload_policy: False
+
+ # Only for FSDP2: Reshard after forward pass to reduce memory footprint
+ reshard_after_forward: True
+
+ # Policy for wrapping layers with FSDP
+ wrap_policy:
+
+ # Minimum number of parameters to trigger wrapping
+ min_num_params: 0
+
+ # Number of GPUs in each FSDP shard group; -1 means auto
+ fsdp_size: -1
+
+ # Only for FSDP1: FSDP1 configuration, prefetch the next forward-pass all-gather
+ # before the current forward computation.
+ forward_prefetch: False
+
+ # Set to positive value to enable LoRA (e.g., 32)
+ lora_rank: 0
+
+ # LoRA scaling factor
+ lora_alpha: 16
+
+ # LoRA target modules: "all-linear" or list of linear projection layers
+ target_modules: all-linear
+
+ # PPO mini-batch size per update
+ ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
+
+ # [Deprecated] Global micro batch size
+ ppo_micro_batch_size: null
+
+ # Local per-GPU micro batch size
+ ppo_micro_batch_size_per_gpu: null
+
+ # Forward-only batch size (global)
+ forward_micro_batch_size: ${critic.ppo_micro_batch_size}
+
+ # Forward-only batch size (per GPU)
+ forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu}
+
+ # Whether to automatically adjust batch size at runtime
+ use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
+
+ # Max tokens per GPU in one PPO batch (doubled for critic)
+ ppo_max_token_len_per_gpu: 32768
+
+ # Max token length per GPU in forward pass
+ forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu}
+
+ # Sequence parallelism size for Ulysses-style model parallelism
+ ulysses_sequence_parallel_size: 1
+
+ # Number of PPO epochs per batch
+ ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs}
+
+ # Shuffle training data across PPO epochs
+ shuffle: ${actor_rollout_ref.actor.shuffle}
+
+ # Gradient clipping for critic updates
+ grad_clip: 1.0
+
+ # PPO value function clipping range
+ cliprange_value: 0.5
+
+ # Loss aggregation mode: "token-mean", "seq-mean-token-sum", or "seq-mean-token-mean"
+ loss_agg_mode: ${actor_rollout_ref.actor.loss_agg_mode}
+
+ # checkpoint configs
+ checkpoint:
+
+ # What to include in saved checkpoints
+ # with 'hf_model' you can save the whole model in hf format; currently only sharded model checkpoints are saved, to save space
+ save_contents: ['model', 'optimizer', 'extra']
+
+ # What to include when loading checkpoints
+ load_contents: ${critic.checkpoint.save_contents}
+
+ # profiler configs
+ # the corresponding dataclass is verl.utils.profiler.ProfilerConfig.
+ profiler:
+
+ # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
+ _target_: verl.utils.profiler.ProfilerConfig
+
+ # If True, each task has its own database; if False, all tasks in one training step share one database.
+ discrete: False
+
+ # Whether to profile all ranks.
+ all_ranks: False
+
+ # The ranks that will be profiled. [] or [0,1,...]
+ ranks: []
+
+ # configs for the reward model
+ reward_model:
+
+ # Whether to enable reward model. If False, we compute the reward only with the user-defined reward functions.
+ # In GSM8K and Math examples, we disable reward model.
+ # For the RLHF alignment example using full_hh_rlhf, we utilize a reward model to assess the responses.
+ # If False, the following parameters are not effective
+ enable: False
+
+ # FSDP strategy: "fsdp" or "fsdp2"
+ strategy: ${actor_rollout_ref.actor.strategy}
+
+ # model config for reward scoring
+ model:
+
+ # Input tokenizer. If the reward model's chat template is inconsistent with the policy,
+ # we need to first decode to plaintext, then apply the rm's chat_template.
+ # Then score with RM. If chat_templates are consistent, it can be set to null.
+ input_tokenizer: ${actor_rollout_ref.model.path}
+
+ # RM's HDFS path or local path. Note that RM only supports AutoModelForSequenceClassification.
+ # Other model types need to define their own RewardModelWorker and pass it from the code.
+ path: ~/models/FsfairX-LLaMA3-RM-v0.1
+
+ # Whether to use shared memory for loading the model
+ use_shm: False
+
+ # External model implementation (optional)
+ external_lib: ${actor_rollout_ref.model.external_lib}
+
+ # Use remove padding optimization (saves compute)
+ use_remove_padding: False
+
+ # Whether to use fused reward kernels for speedup
+ use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels}
+
+ # Whether to enable loading a remote code model, default to False
+ trust_remote_code: False
+
+ # FSDP-specific config
+ fsdp_config:
+
+ # Policy for wrapping layers with FSDP
+ wrap_policy:
+
+ # Minimum number of parameters to trigger wrapping
+ min_num_params: 0
+
+ # Whether to offload model parameters to CPU
+ param_offload: False
+
+ # Only for FSDP2: Reshard after forward pass to reduce memory footprint
+ reshard_after_forward: True
+
+ # Number of GPUs in each FSDP shard group; -1 means auto
+ fsdp_size: -1
+
+ # Only for FSDP1: FSDP1 configuration, prefetch the next forward-pass all-gather
+ # before the current forward computation.
+ forward_prefetch: False
+
+ # [Deprecated] Global micro batch size
+ micro_batch_size: null
+
+ # Local per-GPU micro batch size
+ micro_batch_size_per_gpu: null
+
+ # Maximum sequence length to process for scoring
+ max_length: null
+
+ # Sequence parallelism size for Ulysses-style model parallelism
+ ulysses_sequence_parallel_size: 1
+
+ # Whether to dynamically adjust batch size at runtime
+ use_dynamic_bsz: ${critic.use_dynamic_bsz}
+
+ # Maximum number of tokens per GPU in one forward pass
+ forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
+
+ # Reward Manager. This defines the mechanism of computing rule-based reward and handling different reward sources.
+ # Default is naive. If all verification functions are multiprocessing-safe,
+ # the reward manager can be set to prime for parallel verification.
+ reward_manager: naive
+
+ # Whether to launch custom reward function asynchronously during log_prob
+ launch_reward_fn_async: False
+
+ # Cloud/local sandbox fusion configuration for custom reward logic
+ sandbox_fusion:
+
+ # Cloud/local function URL for sandbox execution
+ url: null
+
+ # Max concurrent requests allowed to sandbox
+ max_concurrent: 64
+
+ # Max memory limit for each sandbox process in MB
+ memory_limit_mb: 1024
+
+ # profiler configs
+ profiler:
+
+ # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
+ _target_: verl.utils.profiler.ProfilerConfig
+
+ # If True, each task has its own database; if False, all tasks in one training step share one database.
+ discrete: False
+
+ # Whether to profile all ranks.
+ all_ranks: False
+
+ # The ranks that will be profiled. [] or [0,1,...]
+ ranks: []
+
+ # custom reward function definition
+ custom_reward_function:
+
+ # The path to the file containing your customized reward function.
+ # If not specified, pre-implemented reward functions will be used.
+ path: null
+
+ # The name of the reward function within the specified file. Default is 'compute_score'.
+ name: compute_score
+
+ # config for the algorithm
+ algorithm:
+
+ # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
+ _target_: verl.trainer.config.AlgoConfig
+
+ # Discount factor for future rewards
+ gamma: 1.0
+
+ # Trade-off between bias and variance in the GAE estimator
+ lam: 1.0
+
+ # Advantage estimator type: "gae", "grpo", "reinforce_plus_plus", etc.
+ adv_estimator: gae
+
+ # Whether to normalize advantages by std (specific to GRPO)
+ norm_adv_by_std_in_grpo: True
+
+ # Whether to enable in-reward KL penalty
+ use_kl_in_reward: False
+
+ # How to estimate KL divergence: "kl", "abs", "mse", "low_var_kl", or "full"
+ kl_penalty: kl
+
+ # KL control configuration
+ kl_ctrl:
+
+ # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
+ _target_: verl.trainer.config.KLControlConfig
+
+ # KL control type: "fixed" or "adaptive"
+ type: fixed
+
+ # Initial coefficient for KL penalty
+ kl_coef: 0.001
+
+ # Horizon value for adaptive controller (if enabled)
+ horizon: 10000
+
+ # Target KL divergence (used for adaptive controller)
+ target_kl: 0.1
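When `type: adaptive`, the KL coefficient is nudged toward keeping the observed KL near `target_kl` over `horizon` sampled steps. A sketch of the classic adaptive controller from Ziegler et al. (verl's own implementation may differ in details; the class name here is illustrative):

```python
class AdaptiveKLController:
    """Drift kl_coef so the observed KL tracks target_kl over `horizon` steps."""

    def __init__(self, kl_coef: float, target_kl: float, horizon: int):
        self.kl_coef = kl_coef
        self.target_kl = target_kl
        self.horizon = horizon

    def update(self, current_kl: float, n_steps: int) -> None:
        # Clipped proportional error: too-high KL raises the coefficient,
        # too-low KL lowers it, by at most 20% per horizon's worth of steps.
        proportional_error = max(min(current_kl / self.target_kl - 1.0, 0.2), -0.2)
        self.kl_coef *= 1.0 + proportional_error * n_steps / self.horizon

ctl = AdaptiveKLController(kl_coef=0.001, target_kl=0.1, horizon=10000)
ctl.update(current_kl=0.5, n_steps=100)  # KL too high -> coefficient increases
assert ctl.kl_coef > 0.001
```

With `type: fixed`, the coefficient simply stays at `kl_coef`.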
+
+ # Whether to enable preference feedback PPO
+ use_pf_ppo: False
+
+ # Preference feedback PPO settings
+ pf_ppo:
+
+ # Method for reweighting samples: "pow", "max_min", or "max_random"
+ reweight_method: pow
+
+ # Power used for weight scaling in "pow" method
+ weight_pow: 2.0
+
+ # config for the trainer
+ trainer:
+
+ # Whether to balance batch sizes across distributed workers
+ balance_batch: True
+
+ # Number of epochs in training
+ total_epochs: 30
+
+ # Total training steps (can be set explicitly or derived from epochs)
+ total_training_steps: null
+
+ # The steps that will be profiled. null means no profiling. null or [1,2,5,...]
+ profile_steps: null
+
+ # controller Nvidia Nsight Systems options. Must be set when profile_steps is not None.
+ ## reference https://docs.nvidia.com/nsight-systems/UserGuide/index.html
+ ## reference https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html
+ controller_nsight_options:
+
+ # Select the API(s) to be traced.
+ trace: "cuda,nvtx,cublas,ucx"
+
+ # Track the GPU memory usage by CUDA kernels. Must be string type "true" or "false".
+ cuda-memory-usage: "true"
+
+ # CUDA graphs will be traced as a whole
+ cuda-graph-trace: "graph"
+
+ # worker Nvidia Nsight Systems options. Must be set when profile_steps is not None.
+ worker_nsight_options:
+
+ # Select the API(s) to be traced.
+ trace: "cuda,nvtx,cublas,ucx"
+
+ # Track the GPU memory usage by CUDA kernels. Must be string type "true" or "false".
+ cuda-memory-usage: "true"
+
+ # CUDA graphs will be traced as a whole
+ cuda-graph-trace: "graph"
+
+ # Profiling only in a range of torch.cuda.profiler.start and stop. Do not change this config.
+ capture-range: "cudaProfilerApi"
+
+ # Specify the desired behavior when a capture range ends.
+ # In verl we need the torch.cuda.profiler.start/stop pair to repeat n times.
+ # Valid values are "repeat-shutdown:n" or null.
+ # For normal whole-step profiling, n = len(profile_steps);
+ # for discrete profiling, n = len(profile_steps) * Number(subtasks).
+ # Or you can just leave it null and the program will use n = len(profile_steps) * 6.
+ capture-range-end: null
+
+ # Send signal to the target application's process group. We let the program exit by itself.
+ kill: none
+
+ # Config for npu profiler. Must be set when profile_steps is not None and torch_npu is available.
+ npu_profile:
+
+ # Options for the npu profiler
+ options:
+
+ # Storage path of collected data.
+ save_path: ./profiler_data
+
+ # The roles that will be profiled. Only takes effect in discrete mode.
+ # Optional values: all, rollout_generate, actor_compute_log_prob, actor_update and ref_compute_log_prob.
+ # "all" means all roles will be profiled.
+ roles: ["all"]
+
+ # Collection level, optional values: level_none, level0, level1, level2.
+ level: level0
+
+ # Whether to enable memory analysis.
+ with_memory: False
+
+ # Whether to record tensor shapes.
+ record_shapes: False
+
+ # Whether to record Device-side performance data.
+ with_npu: True
+
+ # Whether to record Host-side performance data.
+ with_cpu: True
+
+ # Whether to record Python call stack information.
+ with_module: False
+
+ # Whether to record operator call stack information.
+ with_stack: False
+
+ # Whether to automatically parse the data.
+ analysis: True
+
+ # Project name for experiment tracking (e.g., wandb)
+ project_name: verl_examples
+
+ # Experiment name for run identification in tracking tools
+ experiment_name: gsm8k
+
+ # Logging backends to use: "console", "wandb", etc.
+ logger: [ 'console', 'wandb' ]
+
+ # Number of generations to log during validation
+ log_val_generations: 0
+
+ # Directory for logging rollout data; no dump if null
+ rollout_data_dir: null
+
+ # Directory for logging validation data; no dump if null
+ validation_data_dir: null
+
+ # Number of nodes used in the training
+ nnodes: 1
+
+ # Number of GPUs per node
+ n_gpus_per_node: 8
+
+ # Save frequency (by iteration) for model checkpoints
+ save_freq: -1
+
+ # ESI refers to the elastic server instance used during training, similar to a training plan. For example,
+ # if you purchase 10 hours of computing power, the ESI will automatically shut down after 10 hours of training.
+ # To ensure a checkpoint is saved before the ESI shuts down, the system will start saving a checkpoint in advance.
+ # The advance time is calculated as: Advance Time = Longest historical step duration + Checkpoint save duration + esi_redundant_time.
+ # Here, esi_redundant_time is a user-defined value that further extends the advance time for added safety.
+ esi_redundant_time: 0
+
+ # Resume mode: "auto", "disable", or "resume_path"
+ # "auto": resume from last checkpoint if available
+ # "disable": start from scratch
+ # "resume_path": resume from a user-defined path
+ resume_mode: auto
+
+ # Path to resume training from (only used when resume_mode is "resume_path")
+ resume_from_path: null
+
+ # Whether to run validation before training begins
+ val_before_train: True
+
+ # Whether to run validation only
+ val_only: False
+
+ # Validation frequency (in training iterations)
+ test_freq: -1
+
+ # Number of iterations to warm up the critic before updating policy
+ critic_warmup: 0
+
+ # Default path to distributed filesystem for saving checkpoints
+ default_hdfs_dir: null
+
+ # Whether to delete local checkpoints after loading
+ del_local_ckpt_after_load: False
+
+ # Default local directory for saving checkpoints
+ default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
+
+ # Maximum number of actor checkpoints to keep
+ max_actor_ckpt_to_keep: null
+
+ # Maximum number of critic checkpoints to keep
+ max_critic_ckpt_to_keep: null
+
+ # Timeout (in seconds) for Ray worker to wait for registration
+ ray_wait_register_center_timeout: 300
+
+ # Device to run training on (e.g., "cuda", "cpu")
+ device: cuda
+
+ # configs related to ray
+ ray_kwargs:
+ # configs related to ray initialization
+ ray_init:
+
+ # Number of CPUs for Ray. Use a fixed number instead of null when using SLURM.
+ num_cpus: null
+
+ # Path to save Ray timeline JSON for performance profiling
+ timeline_json_file: null
code/RL_model/verl/verl_train/tests/trainer/config/test_algo_config_on_cpu.py ADDED
@@ -0,0 +1,204 @@
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import unittest
16
+
17
+ import numpy as np
18
+ import torch
19
+ from omegaconf import OmegaConf
20
+
21
+ from verl.trainer.config import AlgoConfig, KLControlConfig
22
+ from verl.trainer.ppo.core_algos import (
23
+ compute_gae_advantage_return,
24
+ compute_grpo_outcome_advantage,
25
+ get_adv_estimator_fn,
26
+ )
27
+ from verl.utils.config import omega_conf_to_dataclass
28
+
29
+
30
+ class TestAlgoConfig(unittest.TestCase):
31
+ """Test the AlgoConfig dataclass and its integration with core algorithms."""
32
+
33
+ def setUp(self):
34
+ """Set up test fixtures."""
35
+ # Create a sample algorithm config as DictConfig (similar to what comes from YAML)
36
+ self.config_dict = {
37
+ "_target_": "verl.trainer.config.AlgoConfig",
38
+ "gamma": 0.99,
39
+ "lam": 0.95,
40
+ "adv_estimator": "gae",
41
+ "norm_adv_by_std_in_grpo": True,
42
+ "use_kl_in_reward": True,
43
+ "kl_penalty": "kl",
44
+ "kl_ctrl": {
45
+ "_target_": "verl.trainer.config.KLControlConfig",
46
+ "type": "adaptive",
47
+ "kl_coef": 0.002,
48
+ "horizon": 5000,
49
+ "target_kl": 0.05,
50
+ },
51
+ "use_pf_ppo": True,
52
+ "pf_ppo": {"reweight_method": "max_min", "weight_pow": 3.0},
53
+ }
54
+ self.omega_config = OmegaConf.create(self.config_dict)
55
+
56
+ def test_dataclass_creation_from_dict(self):
57
+ """Test creating AlgoConfig from dictionary."""
58
+ config = omega_conf_to_dataclass(self.config_dict)
59
+
60
+ self.assertIsInstance(config, AlgoConfig)
61
+ self.assertEqual(config.gamma, 0.99)
62
+ self.assertEqual(config.lam, 0.95)
63
+ self.assertEqual(config.adv_estimator, "gae")
64
+ self.assertTrue(config.norm_adv_by_std_in_grpo)
65
+ self.assertTrue(config.use_kl_in_reward)
66
+ self.assertEqual(config.kl_penalty, "kl")
67
+ self.assertTrue(config.use_pf_ppo)
68
+
69
+ def test_dataclass_creation_from_omega_config(self):
70
+ """Test creating AlgoConfig from OmegaConf DictConfig."""
71
+ config = omega_conf_to_dataclass(self.omega_config)
72
+
73
+ self.assertIsInstance(config, AlgoConfig)
74
+ self.assertEqual(config.gamma, 0.99)
75
+ self.assertEqual(config.lam, 0.95)
76
+
77
+ def test_nested_configs(self):
78
+ """Test that nested configurations are properly converted."""
79
+ config = omega_conf_to_dataclass(self.omega_config)
80
+
81
+ # Test KL control config
82
+ self.assertIsInstance(config.kl_ctrl, KLControlConfig)
83
+ self.assertEqual(config.kl_ctrl.type, "adaptive")
84
+ self.assertEqual(config.kl_ctrl.kl_coef, 0.002)
85
+ self.assertEqual(config.kl_ctrl.horizon, 5000)
86
+ self.assertEqual(config.kl_ctrl.target_kl, 0.05)
87
+
88
+ # Test PF PPO config
89
+ self.assertEqual(config.pf_ppo.get("reweight_method"), "max_min")
90
+ self.assertEqual(config.pf_ppo.get("weight_pow"), 3.0)
91
+
92
+ def test_default_values(self):
93
+ """Test that default values are properly set."""
94
+ minimal_config = {"gamma": 0.8}
95
+ config = omega_conf_to_dataclass(minimal_config, AlgoConfig)
96
+
97
+ self.assertEqual(config.gamma, 0.8)
98
+ self.assertEqual(config.lam, 1.0) # default value
99
+ self.assertEqual(config.adv_estimator, "gae") # default value
100
+ self.assertTrue(config.norm_adv_by_std_in_grpo) # default value
101
+ self.assertFalse(config.use_kl_in_reward) # default value
102
+ self.assertEqual(config.kl_penalty, "kl") # default value
103
+ self.assertFalse(config.use_pf_ppo) # default value
104
+
105
+ def test_get_method_backward_compatibility(self):
106
+ """Test the get method for backward compatibility."""
107
+ config = omega_conf_to_dataclass(self.omega_config)
108
+
109
+ # Test existing attribute
110
+ self.assertEqual(config.get("gamma"), 0.99)
111
+ self.assertEqual(config.get("gamma", 1.0), 0.99)
112
+
113
+ # Test non-existing attribute
114
+ self.assertIsNone(config.get("non_existing"))
115
+ self.assertEqual(config.get("non_existing", "default"), "default")
116
+
117
+ def test_post_init_nested_configs(self):
118
+ """Test that __post_init__ properly initializes nested configs when None."""
119
+ # Create config without nested configs
120
+ minimal_config = AlgoConfig(gamma=0.9)
121
+
122
+ # Check that nested configs are initialized
123
+ self.assertIsNotNone(minimal_config.kl_ctrl)
124
+ self.assertIsInstance(minimal_config.kl_ctrl, KLControlConfig)
125
+ assert not minimal_config.pf_ppo
126
+
127
+ def test_config_init_from_yaml(self):
128
+ import os
129
+
130
+ from hydra import compose, initialize_config_dir
131
+
132
+ with initialize_config_dir(config_dir=os.path.abspath("verl/trainer/config")):
133
+ cfg = compose(config_name="ppo_trainer")
134
+ algo_config = omega_conf_to_dataclass(cfg.algorithm)
135
+ from verl.trainer.config import AlgoConfig
136
+
137
+ assert isinstance(algo_config, AlgoConfig)
138
+
139
+
140
+ class TestAlgoCompute(unittest.TestCase):
141
+ """Test the AlgoConfig dataclass and its integration with core algorithms."""
142
+
143
+ def setUp(self):
144
+ """Set up test fixtures."""
145
+ self.algo_config = AlgoConfig(
146
+ gamma=0.99,
147
+ lam=0.95,
148
+ adv_estimator="gae",
149
+ norm_adv_by_std_in_grpo=True,
150
+ use_kl_in_reward=True,
151
+ kl_penalty="kl",
152
+ kl_ctrl=KLControlConfig(type="adaptive", kl_coef=0.002, horizon=5000, target_kl=0.05),
153
+ use_pf_ppo=True,
154
+ pf_ppo={"reweight_method": "max_min", "weight_pow": 3.0},
155
+ )
156
+
157
+ def test_advantage_estimator_with_cfg(self):
158
+ """Test integration with advantage estimators from core_algos."""
159
+ config = self.algo_config
160
+
161
+ # Test GAE advantage estimator
162
+ adv_fn = get_adv_estimator_fn(config.adv_estimator)
163
+ self.assertIsNotNone(adv_fn)
164
+
165
+ # Test with actual GAE computation
166
+ batch_size, seq_len = 2, 5
167
+ token_level_rewards = torch.randn(batch_size, seq_len)
168
+ values = torch.randn(batch_size, seq_len)
169
+ response_mask = torch.ones(batch_size, seq_len)
170
+
171
+ advantages, returns = compute_gae_advantage_return(
172
+ token_level_rewards=token_level_rewards,
173
+ values=values,
174
+ response_mask=response_mask,
175
+ gamma=config.gamma,
176
+ lam=config.lam,
177
+ )
178
+
179
+ self.assertEqual(advantages.shape, (batch_size, seq_len))
180
+ self.assertEqual(returns.shape, (batch_size, seq_len))
181
+
182
+ def test_grpo_advantage_estimator_with_cfg(self):
183
+ """Test integration with GRPO advantage estimator."""
184
+ grpo_config = AlgoConfig(adv_estimator="grpo", norm_adv_by_std_in_grpo=True)
185
+
186
+ # Test GRPO advantage computation
187
+ batch_size, seq_len = 4, 3
188
+ token_level_rewards = torch.tensor([[1.0, 0.5, 0.0], [2.0, 1.0, 0.0], [0.5, 0.2, 0.0], [1.5, 0.8, 0.0]])
189
+ response_mask = torch.ones(batch_size, seq_len)
190
+ index = np.array([0, 0, 1, 1]) # Two groups
191
+
192
+ advantages, returns = compute_grpo_outcome_advantage(
193
+ token_level_rewards=token_level_rewards,
194
+ response_mask=response_mask,
195
+ index=index,
196
+ norm_adv_by_std_in_grpo=grpo_config.norm_adv_by_std_in_grpo,
197
+ )
198
+
199
+ self.assertEqual(advantages.shape, (batch_size, seq_len))
200
+ self.assertEqual(returns.shape, (batch_size, seq_len))
201
+
202
+
203
+ if __name__ == "__main__":
204
+ unittest.main()
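The `__post_init__` behavior that `test_post_init_nested_configs` exercises (auto-creating nested configs when the caller omits them) can be sketched independently of verl. `SimpleAlgoConfig` and `SimpleKLControl` below are hypothetical stand-ins for `AlgoConfig` and `KLControlConfig`, not verl's actual classes:

```python
from dataclasses import dataclass
from typing import Optional


@dataclass
class SimpleKLControl:
    # hypothetical stand-in for KLControlConfig
    type: str = "fixed"
    kl_coef: float = 0.001


@dataclass
class SimpleAlgoConfig:
    # hypothetical stand-in for AlgoConfig
    gamma: float = 1.0
    kl_ctrl: Optional[SimpleKLControl] = None

    def __post_init__(self):
        # initialize nested configs when the caller omits them,
        # mirroring what the test above checks on AlgoConfig
        if self.kl_ctrl is None:
            self.kl_ctrl = SimpleKLControl()


cfg = SimpleAlgoConfig(gamma=0.9)
print(cfg.kl_ctrl.type)  # nested config was auto-created
```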
code/RL_model/verl/verl_train/tests/trainer/config/test_legacy_config_on_cpu.py ADDED
@@ -0,0 +1,176 @@
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import os
16
+ import unittest
17
+ import warnings
18
+
19
+ from hydra import compose, initialize_config_dir
20
+ from hydra.core.global_hydra import GlobalHydra
21
+ from omegaconf import OmegaConf
22
+
23
+ _BREAKING_CHANGES = [
24
+ "critic.optim.lr", # mcore critic lr init value 1e-6 -> 1e-5
25
+ "actor_rollout_ref.actor.optim.lr_warmup_steps", # None -> -1
26
+ "critic.optim.lr_warmup_steps", # None -> -1
27
+ "actor_rollout_ref.rollout.name", # vllm -> ???
28
+ "actor_rollout_ref.actor.megatron.expert_tensor_parallel_size",
29
+ "actor_rollout_ref.ref.megatron.expert_tensor_parallel_size",
30
+ "critic.megatron.expert_tensor_parallel_size",
31
+ "reward_model.megatron.expert_tensor_parallel_size",
32
+ ]
33
+
34
+
35
+ class TestConfigComparison(unittest.TestCase):
36
+ """Test that current configs match their legacy counterparts exactly."""
37
+
38
+ ignored_keys = [
39
+ "enable_gradient_checkpointing",
40
+ "gradient_checkpointing_kwargs",
41
+ "activations_checkpoint_method",
42
+ "activations_checkpoint_granularity",
43
+ "activations_checkpoint_num_layers",
44
+ "discrete",
45
+ "profiler",
46
+ "profile",
47
+ "use_profile",
48
+ "npu_profile",
49
+ "profile_steps",
50
+ "worker_nsight_options",
51
+ "controller_nsight_options",
52
+ ]
53
+
54
+ def _compare_configs_recursively(
55
+ self, current_config, legacy_config, path="", legacy_allow_missing=True, current_allow_missing=False
56
+ ):
57
+ """Recursively compare two OmegaConf configs and assert they are identical.
58
+
59
+ Args:
60
+ legacy_allow_missing (bool): the legacy megatron config sometimes contains fewer
+ keys; when True, warn instead of failing on keys missing from the legacy config
+ current_allow_missing (bool): when True, warn instead of failing on keys
+ missing from the current config
62
+ """
63
+ if isinstance(current_config, dict) and isinstance(legacy_config, dict):
64
+ current_keys = set(current_config.keys())
65
+ legacy_keys = set(legacy_config.keys())
66
+
67
+ missing_in_current = legacy_keys - current_keys
68
+ missing_in_legacy = current_keys - legacy_keys
69
+
70
+ # Ignore specific keys that are allowed to be missing
71
+ for key in self.ignored_keys:
72
+ if key in missing_in_current:
73
+ missing_in_current.remove(key)
74
+ if key in missing_in_legacy:
75
+ missing_in_legacy.remove(key)
76
+
77
+ if missing_in_current:
78
+ msg = f"Keys missing in current config at {path}: {missing_in_current}"
79
+ if current_allow_missing:
80
+ warnings.warn(msg, stacklevel=1)
81
+ else:
82
+ self.fail(msg)
83
+ if missing_in_legacy:
84
+ # if the legacy
85
+ msg = f"Keys missing in legacy config at {path}: {missing_in_legacy}"
86
+ if legacy_allow_missing:
87
+ warnings.warn(msg, stacklevel=1)
88
+ else:
89
+ self.fail(msg)
90
+
91
+ for key in current_keys:
92
+ current_path = f"{path}.{key}" if path else key
93
+ if key in legacy_config:
94
+ self._compare_configs_recursively(current_config[key], legacy_config[key], current_path)
95
+ elif isinstance(current_config, list) and isinstance(legacy_config, list):
96
+ self.assertEqual(
97
+ len(current_config),
98
+ len(legacy_config),
99
+ f"List lengths differ at {path}: current={len(current_config)}, legacy={len(legacy_config)}",
100
+ )
101
+ for i, (current_item, legacy_item) in enumerate(zip(current_config, legacy_config, strict=True)):
102
+ self._compare_configs_recursively(current_item, legacy_item, f"{path}[{i}]")
103
+ elif path not in _BREAKING_CHANGES:
104
+ self.assertEqual(
105
+ current_config,
106
+ legacy_config,
107
+ f"Values differ at {path}: current={current_config}, legacy={legacy_config}",
108
+ )
109
+
110
+ def test_ppo_trainer_config_matches_legacy(self):
111
+ """Test that ppo_trainer.yaml matches legacy_ppo_trainer.yaml exactly."""
112
117
+ GlobalHydra.instance().clear()
118
+
119
+ try:
120
+ with initialize_config_dir(config_dir=os.path.abspath("verl/trainer/config")):
121
+ current_config = compose(config_name="ppo_trainer")
122
+
123
+ legacy_config = OmegaConf.load("tests/trainer/config/legacy_ppo_trainer.yaml")
124
+ current_dict = OmegaConf.to_container(current_config, resolve=True)
125
+ legacy_dict = OmegaConf.to_container(legacy_config, resolve=True)
126
+
127
+ if "defaults" in current_dict:
128
+ del current_dict["defaults"]
129
+
130
+ self._compare_configs_recursively(current_dict, legacy_dict)
131
+ finally:
132
+ GlobalHydra.instance().clear()
133
+
134
+ def test_ppo_megatron_trainer_config_matches_legacy(self):
135
+ """Test that ppo_megatron_trainer.yaml matches legacy_ppo_megatron_trainer.yaml exactly."""
136
+
137
+ GlobalHydra.instance().clear()
138
+
139
+ try:
140
+ with initialize_config_dir(config_dir=os.path.abspath("verl/trainer/config")):
141
+ current_config = compose(config_name="ppo_megatron_trainer")
142
+
143
+ legacy_config = OmegaConf.load("tests/trainer/config/legacy_ppo_megatron_trainer.yaml")
144
+ current_dict = OmegaConf.to_container(current_config, resolve=True)
145
+ legacy_dict = OmegaConf.to_container(legacy_config, resolve=True)
146
+
147
+ if "defaults" in current_dict:
148
+ del current_dict["defaults"]
149
+
150
+ self._compare_configs_recursively(
151
+ current_dict, legacy_dict, legacy_allow_missing=True, current_allow_missing=False
152
+ )
153
+ finally:
154
+ GlobalHydra.instance().clear()
155
+
156
+ def test_load_component(self):
157
+ """Test that ppo_megatron_trainer.yaml matches legacy_ppo_megatron_trainer.yaml exactly."""
158
+
159
+ GlobalHydra.instance().clear()
160
+ configs_to_load = [
161
+ ("verl/trainer/config/actor", "dp_actor"),
162
+ ("verl/trainer/config/actor", "megatron_actor"),
163
+ ("verl/trainer/config/ref", "dp_ref"),
164
+ ("verl/trainer/config/ref", "megatron_ref"),
165
+ ("verl/trainer/config/rollout", "rollout"),
166
+ ]
167
+ for config_dir, config_file in configs_to_load:
168
+ try:
169
+ with initialize_config_dir(config_dir=os.path.abspath(config_dir)):
170
+ compose(config_name=config_file)
171
+ finally:
172
+ GlobalHydra.instance().clear()
173
+
174
+
175
+ if __name__ == "__main__":
176
+ unittest.main()
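The recursive comparison driving these tests can be sketched outside unittest: walk two nested dicts in parallel and collect every path where a key or value disagrees. This is a simplified standalone version, not `_compare_configs_recursively` itself (no warn-vs-fail distinction, single ignore set):

```python
def compare_recursively(current, legacy, path="", ignored=frozenset()):
    """Collect paths where two nested dict configs disagree (sketch)."""
    diffs = []
    if isinstance(current, dict) and isinstance(legacy, dict):
        for key in set(current) | set(legacy):
            if key in ignored:
                continue
            sub_path = f"{path}.{key}" if path else key
            if key not in current or key not in legacy:
                # key present on only one side
                diffs.append(f"missing: {sub_path}")
            else:
                diffs.extend(compare_recursively(current[key], legacy[key], sub_path, ignored))
    elif current != legacy:
        # leaf values differ
        diffs.append(f"value: {path} {current!r} != {legacy!r}")
    return diffs


a = {"optim": {"lr": 1e-5}, "name": "vllm"}
b = {"optim": {"lr": 1e-6}, "name": "vllm", "extra": 1}
print(compare_recursively(a, b))
```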
code/RL_model/verl/verl_train/tests/trainer/ppo/__init__.py ADDED
@@ -0,0 +1,16 @@
1
+ # Copyright 2025 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """
15
+ Tests for the PPO trainer module.
16
+ """
code/RL_model/verl/verl_train/tests/trainer/ppo/test_core_algos_on_cpu.py ADDED
@@ -0,0 +1,317 @@
1
+ # Copyright 2025 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import random
16
+ import unittest
17
+
18
+ import numpy as np
19
+ import pytest
20
+ import torch
21
+
22
+ import verl.trainer.ppo.core_algos
23
+ from verl.trainer.ppo.core_algos import (
24
+ compute_gae_advantage_return,
25
+ compute_grpo_outcome_advantage,
26
+ compute_grpo_vectorized_outcome_advantage,
27
+ compute_rloo_outcome_advantage,
28
+ compute_rloo_vectorized_outcome_advantage,
29
+ get_adv_estimator_fn,
30
+ register_adv_est,
31
+ )
32
+
33
+
34
+ def mock_test_fn():
35
+ pass
36
+
37
+
38
+ class TestRegisterAdvEst(unittest.TestCase):
39
+ def setUp(self):
40
+ """Clear the registry before each test"""
41
+ verl.trainer.ppo.core_algos.ADV_ESTIMATOR_REGISTRY.clear()
42
+ verl.trainer.ppo.core_algos.ADV_ESTIMATOR_REGISTRY = {
43
+ "gae": lambda x: x * 2,
44
+ "vtrace": lambda x: x + 1,
45
+ }
46
+ self.ADV_ESTIMATOR_REGISTRY = verl.trainer.ppo.core_algos.ADV_ESTIMATOR_REGISTRY
47
+
48
+ def tearDown(self) -> None:
49
+ verl.trainer.ppo.core_algos.ADV_ESTIMATOR_REGISTRY.clear()
50
+ return super().tearDown()
51
+
52
+ def test_register_new_function(self):
53
+ """Test registering a new function with a string name"""
54
+
55
+ @register_adv_est("test_estimator")
56
+ def test_fn():
57
+ pass
58
+
59
+ self.assertIn("test_estimator", self.ADV_ESTIMATOR_REGISTRY)
60
+ self.assertEqual(self.ADV_ESTIMATOR_REGISTRY["test_estimator"], test_fn)
61
+
62
+ def test_register_with_enum(self):
63
+ """Test registering with an enum value (assuming AdvantageEstimator exists)"""
64
+ from enum import Enum
65
+
66
+ class AdvantageEstimator(Enum):
67
+ TEST = "test_enum_estimator"
68
+
69
+ @register_adv_est(AdvantageEstimator.TEST)
70
+ def test_fn():
71
+ pass
72
+
73
+ self.assertIn("test_enum_estimator", self.ADV_ESTIMATOR_REGISTRY)
74
+ self.assertEqual(self.ADV_ESTIMATOR_REGISTRY["test_enum_estimator"], test_fn)
75
+
76
+ def test_duplicate_registration_same_function(self):
77
+ """Test that registering the same function twice doesn't raise an error"""
78
+ register_adv_est("duplicate_test")(mock_test_fn)
79
+ register_adv_est("duplicate_test")(mock_test_fn)
80
+
81
+ self.assertEqual(self.ADV_ESTIMATOR_REGISTRY["duplicate_test"], mock_test_fn)
82
+
83
+ def test_duplicate_registration_different_function(self):
84
+ """Test that registering different functions with same name raises ValueError"""
85
+
86
+ @register_adv_est("conflict_test")
87
+ def test_fn1():
88
+ pass
89
+
90
+ with self.assertRaises(ValueError):
91
+
92
+ @register_adv_est("conflict_test")
93
+ def test_fn2():
94
+ pass
95
+
96
+ def test_decorator_preserves_function(self):
97
+ """Test that the decorator returns the original function"""
98
+
99
+ def test_fn():
100
+ return "original"
101
+
102
+ decorated = register_adv_est("preserve_test")(test_fn)
103
+ self.assertEqual(decorated(), "original")
104
+
105
+ def test_multiple_registrations(self):
106
+ """Test registering multiple different functions"""
107
+ init_adv_count = len(self.ADV_ESTIMATOR_REGISTRY)
108
+
109
+ @register_adv_est("estimator1")
110
+ def fn1():
111
+ pass
112
+
113
+ @register_adv_est("estimator2")
114
+ def fn2():
115
+ pass
116
+
117
+ self.assertEqual(len(self.ADV_ESTIMATOR_REGISTRY), 2 + init_adv_count)
118
+ self.assertEqual(self.ADV_ESTIMATOR_REGISTRY["estimator1"], fn1)
119
+ self.assertEqual(self.ADV_ESTIMATOR_REGISTRY["estimator2"], fn2)
120
+
121
+ def test_get_adv_estimator_fn_valid_names(self):
122
+ """Test that valid names return the correct function from registry."""
123
+ # Test GAE
124
+ gae_fn = get_adv_estimator_fn("gae")
125
+ assert gae_fn(5) == 10 # 5 * 2 = 10
126
+
127
+ # Test Vtrace
128
+ vtrace_fn = get_adv_estimator_fn("vtrace")
129
+ assert vtrace_fn(5) == 6 # 5 + 1 = 6
130
+
131
+ def test_get_adv_estimator_fn_invalid_name(self):
132
+ """Test that invalid names raise ValueError."""
133
+ with pytest.raises(ValueError) as excinfo:
134
+ get_adv_estimator_fn("invalid_name")
135
+ assert "Unknown advantage estimator simply: invalid_name" in str(excinfo.value)
136
+
137
+ def test_get_adv_estimator_fn_case_sensitive(self):
138
+ """Test that name lookup is case-sensitive."""
139
+ with pytest.raises(ValueError):
140
+ get_adv_estimator_fn("GAE") # Different case
141
+
142
+
143
+ def test_multi_turn_compute_gae_advantage_return():
144
+ """Test multi-turn GAE skip observation tokens."""
145
+ gamma = random.uniform(0.0, 1.0)
146
+ lam = random.uniform(0.0, 1.0)
147
+
148
+ rewards = torch.tensor([[0.0, 0.0, 0.1, 0.1, 0.1, 0.0, 0.0, 0.1, 1.0, 0.0, 0.0]], dtype=torch.float)
149
+
150
+ values1 = torch.tensor(
151
+ [
152
+ [
153
+ random.uniform(-100.0, 100.0),
154
+ random.random(),
155
+ 4.0,
156
+ 5.0,
157
+ 6.0,
158
+ random.uniform(-100.0, 0),
159
+ random.random(),
160
+ 7.0,
161
+ 9.0,
162
+ 0.0,
163
+ 0.0,
164
+ ]
165
+ ],
166
+ dtype=torch.float,
167
+ )
168
+
169
+ values2 = torch.tensor(
170
+ [
171
+ [
172
+ random.random(),
173
+ random.uniform(-100.0, 100.0),
174
+ 4.0,
175
+ 5.0,
176
+ 6.0,
177
+ random.random(),
178
+ random.uniform(0.0, 100.0),
179
+ 7.0,
180
+ 9.0,
181
+ 0.0,
182
+ 0.0,
183
+ ]
184
+ ],
185
+ dtype=torch.float,
186
+ )
187
+
188
+ response_mask = torch.tensor([[0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0]], dtype=torch.float)
189
+
190
+ adv1, ret1 = compute_gae_advantage_return(rewards, values1, response_mask, gamma, lam)
191
+ adv2, ret2 = compute_gae_advantage_return(rewards, values2, response_mask, gamma, lam)
192
+
193
+ ret1 *= response_mask
194
+ ret2 *= response_mask
195
+ assert torch.equal(adv1, adv2), f"{adv1=}, {adv2=}"
196
+ assert torch.equal(ret1, ret2), f"{ret1=}, {ret2=}"
197
+ print(f" [CORRECT] \n\n{adv1=}, \n\n{ret1=}")
198
+
199
+
200
+ def _make_group_index(batch_size: int, num_groups: int) -> np.ndarray:
201
+ """Create a numpy index array ensuring each group has at least 2 samples."""
202
+ assert num_groups * 2 <= batch_size, "batch_size must allow >=2 samples per group"
203
+ counts: list[int] = [2] * num_groups
204
+ remaining = batch_size - 2 * num_groups
205
+ for _ in range(remaining):
206
+ counts[random.randrange(num_groups)] += 1
207
+ index = []
208
+ for gid, c in enumerate(counts):
209
+ index.extend([gid] * c)
210
+ random.shuffle(index)
211
+ return np.asarray(index, dtype=np.int64)
212
+
213
+
214
+ def _rand_mask(batch_size: int, seq_len: int) -> torch.Tensor:
215
+ mask = torch.randint(0, 2, (batch_size, seq_len), dtype=torch.int64).float()
216
+ rows_without_one = (mask.sum(dim=-1) == 0).nonzero(as_tuple=True)[0]
217
+ if len(rows_without_one) > 0:
218
+ mask[rows_without_one, -1] = 1.0
219
+ return mask
220
+
221
+
222
+ @pytest.mark.parametrize(
223
+ "batch_size,seq_len,num_groups,seed",
224
+ [
225
+ (64, 128, 5, 0),
226
+ (128, 256, 8, 1),
227
+ (512, 512, 10, 2),
228
+ ],
229
+ )
230
+ def test_rloo_and_vectorized_equivalence(batch_size: int, seq_len: int, num_groups: int, seed: int):
231
+ torch.manual_seed(seed)
232
+ random.seed(seed)
233
+ np.random.seed(seed)
234
+ index = _make_group_index(batch_size, num_groups)
235
+ response_mask = _rand_mask(batch_size, seq_len)
236
+ base_rewards = torch.randn(batch_size, seq_len, dtype=torch.float32)
237
+ token_level_rewards = base_rewards * response_mask
238
+ adv1, ret1 = compute_rloo_outcome_advantage(
239
+ token_level_rewards=token_level_rewards,
240
+ response_mask=response_mask,
241
+ index=index,
242
+ )
243
+ adv2, ret2 = compute_rloo_vectorized_outcome_advantage(
244
+ token_level_rewards=token_level_rewards,
245
+ response_mask=response_mask,
246
+ index=index,
247
+ )
248
+ # Print concise diagnostics for visibility during test runs
249
+ adv_max_diff = (adv1 - adv2).abs().max().item()
250
+ ret_max_diff = (ret1 - ret2).abs().max().item()
251
+ total_mask_tokens = int(response_mask.sum().item())
252
+ print(
253
+ f"[RLOO] seed={seed} groups={num_groups} shape={adv1.shape} "
254
+ f"mask_tokens={total_mask_tokens} adv_max_diff={adv_max_diff:.3e} ret_max_diff={ret_max_diff:.3e}"
255
+ )
256
+ assert adv1.shape == adv2.shape == (batch_size, seq_len)
257
+ assert ret1.shape == ret2.shape == (batch_size, seq_len)
258
+ assert torch.allclose(adv1, adv2, rtol=1e-5, atol=1e-6)
259
+ assert torch.allclose(ret1, ret2, rtol=1e-5, atol=1e-6)
260
+
261
+
262
+ @pytest.mark.parametrize(
263
+ "batch_size,seq_len,num_groups,seed",
264
+ [
265
+ (64, 128, 5, 0),
266
+ (128, 256, 8, 1),
267
+ (512, 512, 10, 2),
268
+ ],
269
+ )
270
+ def test_grpo_and_vectorized_equivalence(batch_size: int, seq_len: int, num_groups: int, seed: int):
271
+ # Set seeds for reproducibility
272
+ torch.manual_seed(seed)
273
+ random.seed(seed)
274
+ np.random.seed(seed)
275
+
276
+ # Generate group indices (numpy array of shape [batch_size])
277
+ index = _make_group_index(batch_size, num_groups)
278
+
279
+ # Generate binary response mask (at least one valid token per row)
280
+ response_mask = _rand_mask(batch_size, seq_len)
281
+
282
+ # Generate token-level rewards and apply mask
283
+ base_rewards = torch.randn(batch_size, seq_len, dtype=torch.float32)
284
+ token_level_rewards = base_rewards * response_mask
285
+
286
+ # Compute GRPO outcome advantage (original implementation)
287
+ adv1, ret1 = compute_grpo_outcome_advantage(
288
+ token_level_rewards=token_level_rewards,
289
+ response_mask=response_mask,
290
+ index=index,
291
+ )
292
+
293
+ # Compute GRPO outcome advantage (vectorized implementation)
294
+ adv2, ret2 = compute_grpo_vectorized_outcome_advantage(
295
+ token_level_rewards=token_level_rewards,
296
+ response_mask=response_mask,
297
+ index=index,
298
+ )
299
+
300
+ # Diagnostic info for visibility (same style as RLOO test)
301
+ adv_max_diff = (adv1 - adv2).abs().max().item()
302
+ ret_max_diff = (ret1 - ret2).abs().max().item()
303
+ total_mask_tokens = int(response_mask.sum().item())
304
+ print(
305
+ f"[GRPO] seed={seed} groups={num_groups} shape={adv1.shape} "
306
+ f"mask_tokens={total_mask_tokens} adv_max_diff={adv_max_diff:.3e} ret_max_diff={ret_max_diff:.3e}"
307
+ )
308
+
309
+ # Assert shape and numerical equivalence
310
+ assert adv1.shape == adv2.shape == (batch_size, seq_len)
311
+ assert ret1.shape == ret2.shape == (batch_size, seq_len)
312
+ assert torch.allclose(adv1, adv2, rtol=1e-5, atol=1e-6)
313
+ assert torch.allclose(ret1, ret2, rtol=1e-5, atol=1e-6)
314
+
315
+
316
+ if __name__ == "__main__":
317
+ unittest.main()
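The GAE recursion these tests exercise can be written out in a few lines. This is a pure-Python sketch of the textbook recursion (delta_t = r_t + gamma * V_{t+1} - V_t, A_t = delta_t + gamma * lam * A_{t+1}) for a single trajectory; verl's `compute_gae_advantage_return` additionally handles batched tensors and skips masked observation tokens:

```python
def gae(rewards, values, gamma, lam):
    """Plain GAE over one trajectory (no response mask) -- a minimal sketch."""
    T = len(rewards)
    adv = [0.0] * T
    last = 0.0
    for t in reversed(range(T)):
        # bootstrap value is 0 past the end of the trajectory
        next_v = values[t + 1] if t + 1 < T else 0.0
        delta = rewards[t] + gamma * next_v - values[t]
        last = delta + gamma * lam * last
        adv[t] = last
    # returns are advantages plus the value baseline
    returns = [a + v for a, v in zip(adv, values)]
    return adv, returns


# with gamma = lam = 1 the advantage is total-future-reward minus value
adv, ret = gae([0.0, 0.0, 1.0], [0.5, 0.5, 0.5], gamma=1.0, lam=1.0)
# adv == [0.5, 0.5, 0.5], ret == [1.0, 1.0, 1.0]
```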
code/RL_model/verl/verl_train/tests/trainer/ppo/test_metric_utils_on_cpu.py ADDED
@@ -0,0 +1,489 @@
1
+ # Copyright 2025 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """
15
+ Tests for the metric utilities in verl.trainer.ppo.metric_utils.
16
+ """
17
+
18
+ import unittest
19
+ from unittest.mock import MagicMock, patch
20
+
21
+ import numpy as np
22
+ import torch
23
+
24
+ from verl.trainer.ppo.metric_utils import (
25
+ bootstrap_metric,
26
+ calc_maj_val,
27
+ compute_data_metrics,
28
+ compute_throughout_metrics,
29
+ compute_timing_metrics,
30
+ process_validation_metrics,
31
+ )
32
+ from verl.utils.metric import (
33
+ reduce_metrics,
34
+ )
35
+ from verl.utils.metric.utils import (
36
+ AggregationType,
37
+ Metric,
38
+ )
39
+
40
+
41
+ class TestReduceMetrics(unittest.TestCase):
42
+ """Tests for the reduce_metrics function."""
43
+
44
+ def test_reduce_metrics_basic(self):
45
+ """Test that reduce_metrics correctly computes means."""
46
+ metrics = {
47
+ "loss": [1.0, 2.0, 3.0],
48
+ "accuracy": [0.0, 0.5, 1.0],
49
+ }
50
+ result = reduce_metrics(metrics)
51
+
52
+ self.assertEqual(result["loss"], 2.0)
53
+ self.assertEqual(result["accuracy"], 0.5)
54
+
55
+ def test_reduce_metrics_empty(self):
56
+ """Test that reduce_metrics handles empty lists."""
57
+ metrics = {
58
+ "empty": [],
59
+ }
60
+ result = reduce_metrics(metrics)
61
+
62
+ self.assertTrue(np.isnan(result["empty"]))
63
+
64
+ def test_reduce_metrics_single_value(self):
65
+ """Test that reduce_metrics works with single values."""
66
+ metrics = {
67
+ "single": [5.0],
68
+ }
69
+ result = reduce_metrics(metrics)
70
+
71
+ self.assertEqual(result["single"], 5.0)
72
+
73
+
74
+ class TestMetric(unittest.TestCase):
75
+ """Tests for the Metric class."""
76
+
77
+ def test_init_with_string_aggregation(self):
78
+ """Test Metric initialization with string aggregation type."""
79
+ metric = Metric(aggregation="mean")
80
+ self.assertEqual(metric.aggregation, AggregationType.MEAN)
81
+ self.assertEqual(metric.values, [])
82
+
83
+ def test_init_with_enum_aggregation(self):
84
+ """Test Metric initialization with AggregationType enum."""
85
+ metric = Metric(aggregation=AggregationType.SUM)
86
+ self.assertEqual(metric.aggregation, AggregationType.SUM)
87
+ self.assertEqual(metric.values, [])
88
+
89
+ def test_init_with_value(self):
90
+ """Test Metric initialization with an initial value."""
91
+ metric = Metric(aggregation="mean", value=5.0)
92
+ self.assertEqual(metric.values, [5.0])
93
+
94
+ def test_init_with_invalid_aggregation(self):
95
+ """Test Metric initialization with invalid aggregation type."""
96
+ with self.assertRaises(ValueError):
97
+ Metric(aggregation="invalid")
98
+
99
+ def test_append_float(self):
100
+ """Test appending float values."""
101
+ metric = Metric(aggregation="mean")
102
+ metric.append(1.0)
103
+ metric.append(2.0)
104
+ self.assertEqual(metric.values, [1.0, 2.0])
105
+
106
+ def test_append_int(self):
107
+ """Test appending int values."""
108
+ metric = Metric(aggregation="mean")
109
+ metric.append(1)
110
+ metric.append(2)
111
+ self.assertEqual(metric.values, [1, 2])
112
+
113
+ def test_append_tensor(self):
114
+ """Test appending scalar tensor values."""
115
+ metric = Metric(aggregation="mean")
116
+ metric.append(torch.tensor(3.0))
117
+ metric.append(torch.tensor(4.0))
118
+ self.assertEqual(metric.values, [3.0, 4.0])
119
+
120
+ def test_append_non_scalar_tensor_raises(self):
121
+ """Test that appending non-scalar tensor raises ValueError."""
122
+ metric = Metric(aggregation="mean")
123
+ with self.assertRaises(ValueError):
124
+ metric.append(torch.tensor([1.0, 2.0]))
125
+
126
+ def test_append_metric(self):
127
+ """Test appending another Metric extends values."""
128
+ metric1 = Metric(aggregation="mean", value=1.0)
129
+ metric1.append(2.0)
130
+
131
+ metric2 = Metric(aggregation="mean", value=3.0)
132
+ metric2.append(metric1)
133
+
134
+ self.assertEqual(metric2.values, [3.0, 1.0, 2.0])
135
+
136
+ def test_extend_with_list(self):
137
+ """Test extending with a list of values."""
138
+ metric = Metric(aggregation="mean")
139
+ metric.extend([1.0, 2.0, 3.0])
140
+ self.assertEqual(metric.values, [1.0, 2.0, 3.0])
141
+
142
+ def test_extend_with_metric(self):
143
+ """Test extending with another Metric."""
144
+ metric1 = Metric(aggregation="mean")
145
+ metric1.extend([1.0, 2.0])
146
+
147
+ metric2 = Metric(aggregation="mean")
148
+ metric2.extend([3.0, 4.0])
149
+ metric2.extend(metric1)
150
+
151
+ self.assertEqual(metric2.values, [3.0, 4.0, 1.0, 2.0])
152
+
153
+ def test_extend_aggregation_mismatch_raises(self):
154
+ """Test that extending with mismatched aggregation raises ValueError."""
155
+ metric1 = Metric(aggregation="mean")
156
+ metric2 = Metric(aggregation="sum")
157
+
158
+ with self.assertRaises(ValueError):
159
+ metric1.extend(metric2)
160
+
161
+ def test_aggregate_mean(self):
162
+ """Test aggregation with mean."""
163
+ metric = Metric(aggregation="mean")
164
+ metric.extend([1.0, 2.0, 3.0, 4.0])
165
+ self.assertEqual(metric.aggregate(), 2.5)
166
+
167
+ def test_aggregate_sum(self):
168
+ """Test aggregation with sum."""
169
+ metric = Metric(aggregation="sum")
170
+ metric.extend([1.0, 2.0, 3.0, 4.0])
171
+ self.assertEqual(metric.aggregate(), 10.0)
172
+
173
+ def test_aggregate_min(self):
174
+ """Test aggregation with min."""
175
+ metric = Metric(aggregation="min")
176
+ metric.extend([3.0, 1.0, 4.0, 2.0])
177
+ self.assertEqual(metric.aggregate(), 1.0)
178
+
179
+ def test_aggregate_max(self):
180
+ """Test aggregation with max."""
181
+ metric = Metric(aggregation="max")
182
+ metric.extend([3.0, 1.0, 4.0, 2.0])
183
+ self.assertEqual(metric.aggregate(), 4.0)
184
+
185
+ def test_chain_multiple_metrics(self):
186
+ """Test chain combines multiple Metrics."""
187
+ metric1 = Metric(aggregation="sum")
188
+ metric1.extend([1.0, 2.0])
189
+
190
+ metric2 = Metric(aggregation="sum")
191
+ metric2.extend([3.0, 4.0])
192
+
193
+ chained = Metric.chain([metric1, metric2])
194
+
195
+ self.assertEqual(chained.aggregation, AggregationType.SUM)
196
+ self.assertEqual(chained.values, [1.0, 2.0, 3.0, 4.0])
197
+ self.assertEqual(chained.aggregate(), 10.0)
198
+
199
+ def test_from_dict(self):
200
+ """Test from_dict creates Metrics from dictionary."""
201
+ data = {"loss": 1.0, "accuracy": 0.9}
202
+ metrics = Metric.from_dict(data, aggregation="mean")
203
+
204
+ self.assertIn("loss", metrics)
205
+ self.assertIn("accuracy", metrics)
206
+ self.assertEqual(metrics["loss"].values, [1.0])
207
+ self.assertEqual(metrics["accuracy"].values, [0.9])
208
+ self.assertEqual(metrics["loss"].aggregation, AggregationType.MEAN)
209
+
210
+ def test_init_list(self):
211
+ """Test init_list creates new empty Metric with same aggregation."""
212
+ metric = Metric(aggregation="max")
213
+ metric.extend([1.0, 2.0])
214
+
215
+ new_metric = metric.init_list()
216
+
217
+ self.assertEqual(new_metric.aggregation, AggregationType.MAX)
218
+ self.assertEqual(new_metric.values, [])
219
+
220
+ def test_reduce_metrics_with_metric(self):
221
+ """Test reduce_metrics correctly handles Metric objects."""
222
+ metric = Metric(aggregation="mean")
223
+ metric.extend([1.0, 2.0, 3.0])
224
+
225
+ metrics = {
226
+ "custom_metric": metric,
227
+ "list_metric": [4.0, 5.0, 6.0],
228
+ }
229
+        result = reduce_metrics(metrics)
+
+        self.assertEqual(result["custom_metric"], 2.0)
+        self.assertEqual(result["list_metric"], 5.0)
+
+
+class TestComputeDataMetrics(unittest.TestCase):
+    """Tests for the compute_data_metrics function."""
+
+    def setUp(self):
+        """Set up common test data."""
+        # Create a mock DataProto object
+        self.batch = MagicMock()
+        self.batch.batch = {
+            "token_level_scores": torch.tensor([[1.0, 2.0], [3.0, 4.0]]),
+            "token_level_rewards": torch.tensor([[0.5, 1.0], [1.5, 2.0]]),
+            "advantages": torch.tensor([[0.1, 0.2], [0.3, 0.4]]),
+            "returns": torch.tensor([[1.1, 1.2], [1.3, 1.4]]),
+            "responses": torch.zeros((2, 2)),  # 2 samples, 2 tokens each
+            "attention_mask": torch.tensor(
+                [
+                    [1, 1, 1, 1],  # 2 prompt tokens, 2 response tokens
+                    [1, 1, 1, 1],
+                ]
+            ),
+            "response_mask": torch.tensor(
+                [
+                    [1, 1],  # 2 response tokens
+                    [1, 1],
+                ]
+            ),
+            "values": torch.tensor([[0.9, 1.0], [1.1, 1.2]]),
+        }
+
+    def test_compute_data_metrics_with_critic(self):
+        """Test compute_data_metrics with critic enabled."""
+        metrics = compute_data_metrics(self.batch, use_critic=True)
+
+        # Check that all expected metrics are present
+        self.assertIn("critic/score/mean", metrics)
+        self.assertIn("critic/rewards/mean", metrics)
+        self.assertIn("critic/advantages/mean", metrics)
+        self.assertIn("critic/returns/mean", metrics)
+        self.assertIn("critic/values/mean", metrics)
+        self.assertIn("critic/vf_explained_var", metrics)
+        self.assertIn("response_length/mean", metrics)
+        self.assertIn("prompt_length/mean", metrics)
+
+        # Check some specific values
+        self.assertAlmostEqual(metrics["critic/score/mean"], 5.0)  # Mean of per-sample score sums: (3.0 + 7.0) / 2
+        self.assertAlmostEqual(metrics["critic/rewards/mean"], 2.5)  # Mean of per-sample reward sums: (1.5 + 3.5) / 2
+
+    def test_compute_data_metrics_without_critic(self):
+        """Test compute_data_metrics with critic disabled."""
+        metrics = compute_data_metrics(self.batch, use_critic=False)
+
+        # Check that critic-specific metrics are not present
+        self.assertNotIn("critic/values/mean", metrics)
+        self.assertNotIn("critic/vf_explained_var", metrics)
+
+        # Check that other metrics are still present
+        self.assertIn("critic/score/mean", metrics)
+        self.assertIn("critic/rewards/mean", metrics)
+        self.assertIn("response_length/mean", metrics)
+
+
+class TestComputeTimingMetrics(unittest.TestCase):
+    """Tests for the compute_timing_metrics function."""
+
+    def setUp(self):
+        """Set up common test data."""
+        # Create a mock DataProto object
+        self.batch = MagicMock()
+        self.batch.batch = {
+            "responses": torch.zeros((2, 3)),  # 2 samples, 3 response tokens each
+            "attention_mask": torch.tensor(
+                [
+                    [1, 1, 1, 1, 1, 1],  # 3 prompt tokens, 3 response tokens
+                    [1, 1, 1, 1, 1, 1],
+                ]
+            ),
+        }
+
+        # Mock the _compute_response_info function to return known values
+        self.response_info = {
+            "prompt_length": torch.tensor([3.0, 3.0]),
+            "response_length": torch.tensor([3.0, 3.0]),
+            "response_mask": torch.ones((2, 3)),
+        }
+
+    @patch("verl.trainer.ppo.metric_utils._compute_response_info")
+    def test_compute_timing_metrics(self, mock_compute_response_info):
+        """Test compute_timing_metrics with various timing data."""
+        mock_compute_response_info.return_value = self.response_info
+
+        timing_raw = {
+            "gen": 0.5,  # 500 ms
+            "ref": 0.3,  # 300 ms
+            "values": 0.2,  # 200 ms
+        }
+
+        metrics = compute_timing_metrics(self.batch, timing_raw)
+
+        # Check raw timing metrics
+        self.assertEqual(metrics["timing_s/gen"], 0.5)
+        self.assertEqual(metrics["timing_s/ref"], 0.3)
+        self.assertEqual(metrics["timing_s/values"], 0.2)
+
+        # Check per-token timing metrics
+        # gen uses only response tokens (6 tokens)
+        self.assertAlmostEqual(metrics["timing_per_token_ms/gen"], 0.5 * 1000 / 6, places=5)
+
+        # ref and values use all tokens (12 tokens)
+        self.assertAlmostEqual(metrics["timing_per_token_ms/ref"], 0.3 * 1000 / 12, places=5)
+        self.assertAlmostEqual(metrics["timing_per_token_ms/values"], 0.2 * 1000 / 12, places=5)
+
+
+class TestComputeThroughputMetrics(unittest.TestCase):
+    """Tests for the compute_throughout_metrics function."""
+
+    def setUp(self):
+        """Set up common test data."""
+        # Create a mock DataProto object
+        self.batch = MagicMock()
+        self.batch.meta_info = {
+            "global_token_num": [100, 200, 300],  # 600 tokens total
+        }
+
+    def test_compute_throughout_metrics(self):
+        """Test compute_throughout_metrics with various timing data."""
+        timing_raw = {
+            "step": 2.0,  # 2 seconds per step
+        }
+
+        # Test with 1 GPU
+        metrics = compute_throughout_metrics(self.batch, timing_raw, n_gpus=1)
+
+        self.assertEqual(metrics["perf/total_num_tokens"], 600)
+        self.assertEqual(metrics["perf/time_per_step"], 2.0)
+        self.assertEqual(metrics["perf/throughput"], 600 / 2.0)  # 300 tokens/sec
+
+        # Test with 2 GPUs
+        metrics = compute_throughout_metrics(self.batch, timing_raw, n_gpus=2)
+
+        self.assertEqual(metrics["perf/total_num_tokens"], 600)
+        self.assertEqual(metrics["perf/time_per_step"], 2.0)
+        self.assertEqual(metrics["perf/throughput"], 600 / (2.0 * 2))  # 150 tokens/sec/GPU
+
+
+class TestBootstrapMetric(unittest.TestCase):
+    """Tests for the bootstrap_metric function."""
+
+    def test_bootstrap_metric_basic(self):
+        """Test bootstrap_metric with simple data and functions."""
+        data = [1, 2, 3, 4, 5]
+        reduce_fns = [np.mean, np.max]
+
+        # Use a fixed seed for reproducibility
+        result = bootstrap_metric(data, subset_size=3, reduce_fns=reduce_fns, n_bootstrap=100, seed=42)
+
+        # Check that we get two results (one for each reduce_fn)
+        self.assertEqual(len(result), 2)
+
+        # Each result should be a tuple of (mean, std)
+        mean_result, max_result = result
+        self.assertEqual(len(mean_result), 2)
+        self.assertEqual(len(max_result), 2)
+
+        # The mean of means should be close to the true mean (3.0)
+        self.assertAlmostEqual(mean_result[0], 3.0, delta=0.3)
+
+        # The mean of maxes should be close to the expected value for samples of size 3:
+        # for samples of size 3 drawn from [1, 2, 3, 4, 5], the expected max is around 4.0-4.5
+        self.assertGreater(max_result[0], 3.5)
+        self.assertLess(max_result[0], 5.0)
+
+    def test_bootstrap_metric_empty(self):
+        """Test bootstrap_metric with empty data."""
+        with self.assertRaises(ValueError):
+            bootstrap_metric([], subset_size=1, reduce_fns=[np.mean])
+
+
+class TestCalcMajVal(unittest.TestCase):
+    """Tests for the calc_maj_val function."""
+
+    def test_calc_maj_val_basic(self):
+        """Test calc_maj_val with simple data."""
+        data = [
+            {"pred": "A", "val": 0.9},
+            {"pred": "B", "val": 0.8},
+            {"pred": "A", "val": 0.7},
+        ]
+
+        result = calc_maj_val(data, vote_key="pred", val_key="val")
+
+        # "A" is the majority vote, so we should get the first "val" for "A"
+        self.assertEqual(result, 0.9)
+
+    def test_calc_maj_val_tie(self):
+        """Test calc_maj_val with tied votes."""
+        data = [
+            {"pred": "A", "val": 0.9},
+            {"pred": "B", "val": 0.8},
+            {"pred": "B", "val": 0.7},
+            {"pred": "A", "val": 0.6},
+        ]
+
+        # In case of a tie, which candidate wins depends on iteration order,
+        # so we only verify that the value of one of the tied candidates is returned
+        result = calc_maj_val(data, vote_key="pred", val_key="val")
+
+        self.assertIn(result, [0.9, 0.8])
+
+
+class TestProcessValidationMetrics(unittest.TestCase):
+    """Tests for the process_validation_metrics function."""
+
+    def test_process_validation_metrics_basic(self):
+        """Test process_validation_metrics with simple data."""
+        data_sources = ["source1", "source1", "source2"]
+        sample_inputs = ["prompt1", "prompt1", "prompt2"]
+        infos_dict = {
+            "score": [0.8, 0.9, 0.7],
+        }
+
+        result = process_validation_metrics(data_sources, sample_inputs, infos_dict, seed=42)
+
+        # Check the structure of the result
+        self.assertIn("source1", result)
+        self.assertIn("source2", result)
+
+        # Check that source1 has metrics for score
+        self.assertIn("score", result["source1"])
+
+        # Check that mean@2 is present for source1/score
+        self.assertIn("mean@2", result["source1"]["score"])
+
+        # Check the value of mean@2 for source1/score
+        self.assertAlmostEqual(result["source1"]["score"]["mean@2"], 0.85)
+
+    def test_process_validation_metrics_with_pred(self):
+        """Test process_validation_metrics with prediction data."""
+        data_sources = ["source1", "source1", "source1"]
+        sample_inputs = ["prompt1", "prompt1", "prompt1"]
+        infos_dict = {
+            "score": [0.8, 0.9, 0.7],
+            "pred": ["A", "B", "A"],
+        }
+
+        result = process_validation_metrics(data_sources, sample_inputs, infos_dict, seed=42)
+
+        # Check that majority-voting metrics are present
+        self.assertIn("maj@2/mean", result["source1"]["score"])
+
+        # For bootstrap with n=2, the majority vote could be either A or B
+        # depending on the random sampling, so we don't check the exact value
+
+
+if __name__ == "__main__":
+    unittest.main()
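The `TestBootstrapMetric` assertions above check resampling statistics. As a minimal standalone sketch of the same idea — a hypothetical `bootstrap_estimate` helper written here for illustration, not verl's actual `bootstrap_metric` — the procedure resamples with replacement and reports the mean and spread of the reduced statistic:

```python
import random
import statistics

def bootstrap_estimate(data, subset_size, reduce_fn, n_bootstrap=100, seed=42):
    """Draw `subset_size` items with replacement `n_bootstrap` times and
    return (mean, std) of `reduce_fn` applied to each resample."""
    rng = random.Random(seed)
    stats = [
        reduce_fn([rng.choice(data) for _ in range(subset_size)])
        for _ in range(n_bootstrap)
    ]
    return statistics.mean(stats), statistics.stdev(stats)

# The mean of resampled means concentrates near the true mean (3.0 here),
# which is what the delta=0.3 assertion in the test relies on.
mean_of_means, std = bootstrap_estimate(
    [1, 2, 3, 4, 5], subset_size=3, reduce_fn=lambda xs: sum(xs) / len(xs)
)
assert 2.5 < mean_of_means < 3.5
```

The fixed seed mirrors the `seed=42` argument in the tests: without it, bootstrap statistics vary run to run and exact-value assertions would be flaky.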
code/RL_model/verl/verl_train/tests/trainer/ppo/test_rollout_corr.py ADDED
@@ -0,0 +1,386 @@
+#!/usr/bin/env python3
+# Copyright 2025 Bytedance Ltd. and/or its affiliates
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Quick Sanity Test for Rollout Correction
+
+This is a standalone test script that can be run without pytest to quickly verify
+that the rollout correction implementation is working correctly. For comprehensive
+integration tests, see: tests/trainer/ppo/test_rollout_corr_integration.py
+
+Usage:
+    python test_rollout_corr.py
+
+This tests:
+    - Basic rollout correction functionality (IS weights + rejection sampling)
+    - Metrics completeness (IS metrics + rejection metrics + off-policy metrics)
+    - Edge cases
+"""
+
+import pytest
+import torch
+
+from verl.trainer.ppo.rollout_corr_helper import (
+    SUPPORTED_ROLLOUT_RS_OPTIONS,
+    compute_offpolicy_metrics,
+    compute_rollout_correction_and_rejection_mask,
+)
+
+
+def test_basic_rollout_correction():
+    """Test basic rollout correction functionality."""
+    print("Testing basic rollout correction functionality...")
+
+    # Create test data
+    batch_size, seq_length = 4, 10
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+
+    # Create slightly different log probs (simulating a BF16 vs FP32 mismatch)
+    old_log_prob = torch.randn(batch_size, seq_length, device=device)
+    rollout_log_prob = old_log_prob + torch.randn(batch_size, seq_length, device=device) * 0.1
+    eos_mask = torch.ones(batch_size, seq_length, device=device)
+
+    # Test token-level truncate mode
+    print("\n1. Testing token-level truncate mode...")
+    weights_proto, modified_response_mask, metrics = compute_rollout_correction_and_rejection_mask(
+        old_log_prob=old_log_prob,
+        rollout_log_prob=rollout_log_prob,
+        response_mask=eos_mask,
+        rollout_is="token",  # Compute IS weights at token level
+        rollout_is_threshold=2.0,
+        rollout_rs=None,  # No rejection sampling (truncate mode)
+    )
+
+    weights = weights_proto.batch["rollout_is_weights"]
+    print(f"  Weights shape: {weights.shape}")
+    print(f"  Mean weight: {metrics['rollout_corr/rollout_is_mean']:.4f}")
+    print(f"  Max weight: {metrics['rollout_corr/rollout_is_max']:.4f}")
+    print(f"  Min weight: {metrics['rollout_corr/rollout_is_min']:.4f}")
+    assert weights.shape == old_log_prob.shape
+    assert weights.max() <= 2.0, "Weights should be capped at threshold"
+    print("  ✓ Token-level truncate mode passed")
+
+    # Test sequence-level mode
+    print("\n2. Testing sequence-level mode...")
+    weights_seq_proto, _, metrics_seq = compute_rollout_correction_and_rejection_mask(
+        old_log_prob=old_log_prob,
+        rollout_log_prob=rollout_log_prob,
+        response_mask=eos_mask,
+        rollout_is="sequence",  # Compute IS weights at sequence level
+        rollout_is_threshold=5.0,
+        rollout_rs=None,  # No rejection sampling (truncate mode)
+    )
+
+    weights_seq = weights_seq_proto.batch["rollout_is_weights"]
+    print(f"  Mean weight: {metrics_seq['rollout_corr/rollout_is_mean']:.4f}")
+    print(f"  Effective sample size: {metrics_seq['rollout_corr/rollout_is_eff_sample_size']:.4f}")
+    # Check that all tokens in a sequence share the same weight
+    for i in range(batch_size):
+        seq_weights = weights_seq[i, eos_mask[i].bool()]
+        assert torch.allclose(seq_weights, seq_weights[0]), "All tokens in a sequence should have the same weight"
+    print("  ✓ Sequence-level mode passed")
+
+    # Test K1 sequence-mean rejection sampling (mask mode)
+    print("\n3. Testing K1 (sequence mean) rejection sampling...")
+    _, _, metrics_geo = compute_rollout_correction_and_rejection_mask(
+        old_log_prob=old_log_prob,
+        rollout_log_prob=rollout_log_prob,
+        response_mask=eos_mask,
+        rollout_is=None,  # No IS weights (pure mask mode)
+        rollout_rs="seq_mean_k1",  # Rejection sampling with sequence-mean log-ratio bounds
+        rollout_rs_threshold="0.5_1.5",
+    )
+
+    print(f"  Masked fraction: {metrics_geo['rollout_corr/rollout_rs_masked_fraction']:.4f}")
+    print("  ✓ K1 sequence mean rejection sampling passed")
+
+    # Test disabled IS (rollout_is=None, rollout_rs=None)
+    print("\n4. Testing disabled IS...")
+    weights_disabled, modified_response_mask_disabled, metrics_disabled = compute_rollout_correction_and_rejection_mask(
+        old_log_prob=old_log_prob,
+        rollout_log_prob=rollout_log_prob,
+        response_mask=eos_mask,
+        rollout_is=None,
+        rollout_rs=None,
+    )
+
+    assert weights_disabled is None, "Should return None when IS is disabled"
+    assert torch.equal(modified_response_mask_disabled, eos_mask), "Should return the original mask unchanged"
+    # Note: off-policy metrics are still computed even when IS/RS are disabled
+    assert "rollout_corr/kl" in metrics_disabled, "Should still compute off-policy metrics"
+    print("  ✓ Disabled IS passed")
+
+    print("\n✓ All tests passed!")
+
+
+@pytest.mark.parametrize(
+    ("option", "threshold"),
+    [
+        ("token_k1", "0.5_1.5"),
+        ("token_k2", 2.0),
+        ("token_k3", 2.0),
+        ("seq_sum_k1", "0.6_1.4"),
+        ("seq_sum_k2", 2.5),
+        ("seq_sum_k3", 2.5),
+        ("seq_mean_k1", "0.5_1.5"),
+        ("seq_mean_k2", 2.0),
+        ("seq_mean_k3", 2.0),
+        ("seq_max_k2", 2.0),
+        ("seq_max_k3", 2.0),
+    ],
+)
+def test_each_supported_rollout_rs_option(option: str, threshold):
+    """Ensure every supported RS option produces metrics without error."""
+    assert option in SUPPORTED_ROLLOUT_RS_OPTIONS
+
+    batch_size, seq_length = 3, 7
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+
+    old_log_prob = torch.randn(batch_size, seq_length, device=device)
+    rollout_log_prob = old_log_prob + torch.randn(batch_size, seq_length, device=device) * 0.15
+    response_mask = torch.ones(batch_size, seq_length, device=device)
+
+    _, modified_mask, metrics = compute_rollout_correction_and_rejection_mask(
+        old_log_prob=old_log_prob,
+        rollout_log_prob=rollout_log_prob,
+        response_mask=response_mask,
+        rollout_is=None,
+        rollout_rs=option,
+        rollout_rs_threshold=threshold,
+    )
+
+    expected_key = f"rollout_corr/rollout_rs_{option}_mean"
+    assert expected_key in metrics, f"Missing metric for {option}"
+    assert modified_mask.shape == response_mask.shape
+
+
+def test_rollout_rs_multiple_options():
+    """Verify multiple RS options with mixed threshold formats."""
+    batch_size, seq_length = 2, 6
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+
+    old_log_prob = torch.randn(batch_size, seq_length, device=device)
+    rollout_log_prob = old_log_prob + torch.randn(batch_size, seq_length, device=device) * 0.2
+    response_mask = torch.ones(batch_size, seq_length, device=device)
+
+    rollout_rs = "token_k1,seq_max_k3"
+    rollout_rs_threshold = "0.4_1.8,3.0"
+
+    _, _, metrics = compute_rollout_correction_and_rejection_mask(
+        old_log_prob=old_log_prob,
+        rollout_log_prob=rollout_log_prob,
+        response_mask=response_mask,
+        rollout_is=None,
+        rollout_rs=rollout_rs,
+        rollout_rs_threshold=rollout_rs_threshold,
+    )
+
+    for option in rollout_rs.split(","):
+        key = f"rollout_corr/rollout_rs_{option}_mean"
+        assert key in metrics, f"Metrics missing for chained option {option}"
+
+
+def test_metrics_completeness():
+    """Test that all expected metrics are returned."""
+    print("\nTesting metrics completeness...")
+
+    batch_size, seq_length = 3, 8
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+
+    old_log_prob = torch.randn(batch_size, seq_length, device=device)
+    rollout_log_prob = old_log_prob + torch.randn(batch_size, seq_length, device=device) * 0.2
+    eos_mask = torch.ones(batch_size, seq_length, device=device)
+
+    _, _, metrics = compute_rollout_correction_and_rejection_mask(
+        old_log_prob=old_log_prob,
+        rollout_log_prob=rollout_log_prob,
+        response_mask=eos_mask,
+        rollout_is="token",
+        rollout_is_threshold=2.5,
+        rollout_rs=None,
+    )
+
+    # Expected IS metrics
+    expected_is_metrics = [
+        "rollout_corr/rollout_is_mean",
+        "rollout_corr/rollout_is_max",
+        "rollout_corr/rollout_is_min",
+        "rollout_corr/rollout_is_std",
+        "rollout_corr/rollout_is_eff_sample_size",
+        "rollout_corr/rollout_is_ratio_fraction_high",
+        "rollout_corr/rollout_is_ratio_fraction_low",
+    ]
+
+    # Expected off-policy diagnostic metrics (also included now)
+    expected_offpolicy_metrics = [
+        "rollout_corr/training_ppl",
+        "rollout_corr/training_log_ppl",
+        "rollout_corr/kl",
+        "rollout_corr/k3_kl",
+        "rollout_corr/rollout_ppl",
+        "rollout_corr/rollout_log_ppl",
+        "rollout_corr/log_ppl_diff",
+        "rollout_corr/log_ppl_abs_diff",
+        "rollout_corr/log_ppl_diff_max",
+        "rollout_corr/log_ppl_diff_min",
+        "rollout_corr/ppl_ratio",
+        "rollout_corr/chi2_token",
+        "rollout_corr/chi2_seq",
+    ]
+
+    expected_metrics = expected_is_metrics + expected_offpolicy_metrics
+
+    # Assert rather than return a bool: under pytest a returned False would pass silently
+    missing_metrics = [m for m in expected_metrics if m not in metrics]
+    assert not missing_metrics, f"Missing metrics: {missing_metrics}"
+
+    print(f"  ✓ All {len(expected_metrics)} expected metrics present")
+    print(f"  Total metrics returned: {len(metrics)}")
+
+
+def test_offpolicy_metrics():
+    """Test off-policy metrics computation."""
+    print("\nTesting off-policy metrics computation...")
+
+    batch_size, seq_length = 4, 12
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+
+    # Create test data with some mismatch
+    old_log_prob = torch.randn(batch_size, seq_length, device=device) - 2.0  # training policy
+    rollout_log_prob = torch.randn(batch_size, seq_length, device=device) - 1.5  # rollout policy (more confident)
+    response_mask = torch.ones(batch_size, seq_length, device=device)
+
+    # Test with rollout log probs
+    metrics = compute_offpolicy_metrics(
+        old_log_prob=old_log_prob,
+        rollout_log_prob=rollout_log_prob,
+        response_mask=response_mask,
+    )
+
+    expected_metrics = [
+        "training_ppl",
+        "training_log_ppl",
+        "kl",
+        "k3_kl",
+        "rollout_ppl",
+        "rollout_log_ppl",
+        "log_ppl_diff",
+        "log_ppl_abs_diff",
+        "log_ppl_diff_max",
+        "log_ppl_diff_min",
+        "ppl_ratio",
+        "chi2_token",
+        "chi2_seq",
+    ]
+
+    for metric in expected_metrics:
+        assert metric in metrics, f"Missing metric: {metric}"
+
+    print(f"  Training PPL: {metrics['training_ppl']:.4f}")
+    print(f"  Rollout PPL: {metrics['rollout_ppl']:.4f}")
+    print(f"  KL divergence: {metrics['kl']:.6f}")
+    print(f"  K3 KL: {metrics['k3_kl']:.6f}")
+    print(f"  PPL ratio: {metrics['ppl_ratio']:.4f}")
+    print(f"  ✓ All {len(expected_metrics)} off-policy metrics present")
+
+    # Test without rollout log probs
+    metrics_no_rollout = compute_offpolicy_metrics(
+        old_log_prob=old_log_prob,
+        rollout_log_prob=None,
+        response_mask=response_mask,
+    )
+
+    assert "training_ppl" in metrics_no_rollout
+    assert "rollout_ppl" not in metrics_no_rollout
+    print("  ✓ Off-policy metrics work without rollout log probs")
+
+
+def test_mask_mode():
+    """Test that mask mode applies rejection via response_mask but keeps true IS weights."""
+    print("\nTesting mask mode behavior...")
+
+    batch_size = 2
+    seq_length = 5
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+
+    # Sequence 0: ratio ≈ 0.37 (below 0.5, should be rejected)
+    # Sequence 1: ratio ≈ 1.65 (in [0.5, 2.0], should be accepted)
+    old_log_prob = torch.tensor([[-2.0] * seq_length, [-2.0] * seq_length], device=device)
+    rollout_log_prob = torch.tensor(
+        [
+            [-1.0] * seq_length,  # exp(-2.0 - (-1.0)) = exp(-1.0) ≈ 0.37
+            [-2.5] * seq_length,  # exp(-2.0 - (-2.5)) = exp(0.5) ≈ 1.65
+        ],
+        device=device,
+    )
+    response_mask = torch.ones(batch_size, seq_length, device=device)
+
+    weights_proto, modified_response_mask, metrics = compute_rollout_correction_and_rejection_mask(
+        old_log_prob=old_log_prob,
+        rollout_log_prob=rollout_log_prob,
+        response_mask=response_mask,
+        rollout_is="token",  # Compute IS weights
+        rollout_is_threshold=2.0,
+        rollout_rs="token_k1",  # Also apply rejection sampling (mask mode)
+        rollout_rs_threshold="0.5_2.0",
+    )
+
+    weights = weights_proto.batch["rollout_is_weights"]
+
+    # Weights should remain safety-bounded ratios (NOT zeroed by rejection)
+    assert torch.all(weights[0, :] > 0), "Weights should remain safety-bounded ratios (not zeroed)"
+    assert torch.allclose(weights[0, 0], torch.tensor(0.368, device=device), atol=0.01), (
+        "First seq ratio should be ≈0.37"
+    )
+    assert torch.allclose(weights[1, 0], torch.tensor(1.649, device=device), atol=0.01), (
+        "Second seq ratio should be ≈1.65"
+    )
+
+    # Rejection should be applied via response_mask
+    assert torch.all(modified_response_mask[0, :] == 0), "First sequence should be rejected via mask"
+    assert torch.all(modified_response_mask[1, :] == 1), "Second sequence should be accepted"
+
+    # Verify rejection-sampling metrics exist
+    assert "rollout_corr/rollout_rs_masked_fraction" in metrics, "Should have rollout_rs_masked_fraction metric"
+    assert abs(metrics["rollout_corr/rollout_rs_masked_fraction"] - 0.5) < 0.01, "Should reject 50% of tokens"
+
+    print(f"  First seq IS weight: {weights[0, 0]:.4f} (expected ≈0.37)")
+    print(f"  Second seq IS weight: {weights[1, 0]:.4f} (expected ≈1.65)")
+    print(f"  First seq mask: {modified_response_mask[0, 0]:.0f} (expected 0 - rejected)")
+    print(f"  Second seq mask: {modified_response_mask[1, 0]:.0f} (expected 1 - accepted)")
+    print(f"  Masked fraction: {metrics['rollout_corr/rollout_rs_masked_fraction']:.2f}")
+    print("  ✓ Mask mode correctly separates IS weights from rejection")
+
+
+if __name__ == "__main__":
+    print("=" * 60)
+    print("Rollout Correction Test Suite")
+    print("=" * 60)
+
+    try:
+        test_basic_rollout_correction()
+        test_metrics_completeness()
+        test_offpolicy_metrics()
+        test_mask_mode()
+        print("\n" + "=" * 60)
+        print("ALL TESTS PASSED ✓")
+        print("=" * 60)
+    except Exception as e:
+        print(f"\n✗ Test failed with error: {e}")
+        import traceback
+
+        traceback.print_exc()
+        exit(1)
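The mask-mode test above pins down the arithmetic it expects: a per-token importance ratio `exp(old_log_prob - rollout_log_prob)`, truncated from above in "truncate" mode. A minimal dependency-free sketch of that computation — a hypothetical `token_is_weights` helper for illustration, not verl's `compute_rollout_correction_and_rejection_mask` — makes the expected 0.37 / 1.65 values easy to reproduce by hand:

```python
import math

def token_is_weights(old_log_prob, rollout_log_prob, threshold):
    """Per-token importance weight pi_train / pi_rollout = exp(old - rollout),
    truncated from above at `threshold` (the 'truncate' mode in the tests)."""
    return [
        [min(math.exp(o - r), threshold) for o, r in zip(row_old, row_roll)]
        for row_old, row_roll in zip(old_log_prob, rollout_log_prob)
    ]

# Same log probs as the mask-mode test:
# exp(-2.0 - (-1.0)) = exp(-1.0) ≈ 0.368, exp(-2.0 - (-2.5)) = exp(0.5) ≈ 1.649
w = token_is_weights([[-2.0, -2.0]], [[-1.0, -2.5]], threshold=2.0)
```

A ratio of exp(5.0) ≈ 148 would be clamped to the 2.0 threshold here, which is the "weights should be capped at threshold" property asserted in the token-level truncate test.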
code/RL_model/verl/verl_train/tests/trainer/ppo/test_rollout_corr_integration.py ADDED
@@ -0,0 +1,262 @@
1
+ # Copyright 2025 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """Integration tests for Rollout Correction."""
15
+
16
+ import pytest
17
+ import torch
18
+
19
+ from verl.trainer.config.algorithm import RolloutCorrectionConfig
20
+ from verl.trainer.ppo.core_algos import compute_policy_loss_vanilla
21
+ from verl.trainer.ppo.rollout_corr_helper import (
22
+ compute_offpolicy_metrics,
23
+ compute_rollout_correction_and_rejection_mask,
24
+ )
25
+ from verl.workers.config.actor import ActorConfig
26
+
27
+
28
+ class TestRolloutISIntegration:
29
+ """Integration tests for Rollout Correction with PPO."""
30
+
31
+ @pytest.fixture
32
+ def sample_data(self):
33
+ """Create sample training data."""
34
+ batch_size, seq_length = 4, 16
35
+ device = "cuda" if torch.cuda.is_available() else "cpu"
36
+
37
+ return {
38
+ "old_log_prob": torch.randn(batch_size, seq_length, device=device),
39
+ "log_prob": torch.randn(batch_size, seq_length, device=device),
40
+ "rollout_log_prob": torch.randn(batch_size, seq_length, device=device),
41
+ "advantages": torch.randn(batch_size, seq_length, device=device),
42
+ "response_mask": torch.ones(batch_size, seq_length, device=device),
43
+ }
44
+
45
+ @pytest.fixture
46
+ def config_with_rollout_is(self):
47
+ """Create config for policy loss computation.
48
+
49
+ Note: rollout_is config has been moved to algorithm config.
50
+ This config only needs fields used by policy loss (clip_ratio, etc).
51
+ """
52
+ config = ActorConfig(
53
+ strategy="fsdp",
54
+ rollout_n=1,
55
+ ppo_micro_batch_size=2,
56
+ clip_ratio=0.2,
57
+ )
58
+ return config
59
+
60
+ def test_policy_loss_with_rollout_is(self, sample_data, config_with_rollout_is):
61
+ """Test that policy loss computation works with rollout correction weights.
62
+
63
+ Note: In production, IS weights are computed centrally in the trainer
64
+ (before advantage computation) and passed to policy loss.
65
+ This test simulates that workflow.
66
+ """
67
+ # First compute IS weights (as trainer would do centrally)
68
+ rollout_is_weights_proto, _, _ = compute_rollout_correction_and_rejection_mask(
69
+ old_log_prob=sample_data["old_log_prob"],
70
+ rollout_log_prob=sample_data["rollout_log_prob"],
71
+ response_mask=sample_data["response_mask"],
72
+ rollout_is="token",
73
+ rollout_is_threshold=2.0,
74
+ rollout_rs=None,
75
+ )
76
+
77
+ rollout_is_weights = rollout_is_weights_proto.batch["rollout_is_weights"]
78
+
79
+ # Policy loss function receives pre-computed IS weights
80
+ pg_loss, _ = compute_policy_loss_vanilla(
81
+ old_log_prob=sample_data["old_log_prob"],
82
+ log_prob=sample_data["log_prob"],
83
+ advantages=sample_data["advantages"],
84
+ response_mask=sample_data["response_mask"],
85
+ loss_agg_mode="token-mean",
86
+ config=config_with_rollout_is,
87
+ rollout_is_weights=rollout_is_weights,
88
+ )
89
+
90
+ # Check loss is valid
91
+ assert isinstance(pg_loss, torch.Tensor)
92
+ assert pg_loss.ndim == 0 # Scalar
93
+ assert not torch.isnan(pg_loss)
94
+ assert not torch.isinf(pg_loss)
95
+
96
+ def test_rollout_is_weights_computation(self, sample_data):
97
+ """Test rollout correction weights and metrics computation."""
98
+ weights_proto, _, metrics = compute_rollout_correction_and_rejection_mask(
99
+ old_log_prob=sample_data["old_log_prob"],
100
+ rollout_log_prob=sample_data["rollout_log_prob"],
101
+ response_mask=sample_data["response_mask"],
102
+ rollout_is="token",
103
+ rollout_is_threshold=2.0,
104
+ rollout_rs=None,
105
+ )
106
+
107
+ # Check weights
108
+ from verl.protocol import DataProto
109
+
110
+ assert isinstance(weights_proto, DataProto)
111
+ weights = weights_proto.batch["rollout_is_weights"]
112
+ assert isinstance(weights, torch.Tensor)
113
+ assert weights.shape == sample_data["old_log_prob"].shape
114
+
115
+ # Check metrics are returned
116
+ assert isinstance(metrics, dict)
117
+ assert len(metrics) > 0
118
+ assert "rollout_corr/rollout_is_mean" in metrics
119
+
120
+ def test_all_aggregation_levels(self, sample_data):
121
+ """Test all aggregation levels (token, sequence for IS; K1 for RS)."""
122
+ # Test IS weight levels
123
+ is_levels = ["token", "sequence"]
124
+ for level in is_levels:
125
+ _, _, metrics = compute_rollout_correction_and_rejection_mask(
126
+ old_log_prob=sample_data["old_log_prob"],
127
+ rollout_log_prob=sample_data["rollout_log_prob"],
128
+ response_mask=sample_data["response_mask"],
129
+ rollout_is=level,
130
+ rollout_is_threshold=2.0,
131
+ rollout_rs=None,
132
+ )
133
+ assert "rollout_corr/rollout_is_mean" in metrics
134
+
135
+ # Test rejection sampling with K1 sequence mean level
136
+ _, _, metrics_geo = compute_rollout_correction_and_rejection_mask(
137
+ old_log_prob=sample_data["old_log_prob"],
138
+ rollout_log_prob=sample_data["rollout_log_prob"],
139
+ response_mask=sample_data["response_mask"],
140
+ rollout_is=None,
141
+ rollout_rs="seq_mean_k1",
142
+ rollout_rs_threshold="0.999_1.001",
143
+ )
144
+ assert "rollout_corr/rollout_rs_seq_mean_k1_mean" in metrics_geo
145
+
146
+ def test_both_bounding_modes(self, sample_data):
147
+ """Test both truncate and mask modes."""
148
+ # Test truncate mode (IS weights only)
149
+ _, _, metrics_truncate = compute_rollout_correction_and_rejection_mask(
150
+ old_log_prob=sample_data["old_log_prob"],
151
+ rollout_log_prob=sample_data["rollout_log_prob"],
152
+ response_mask=sample_data["response_mask"],
153
+ rollout_is="token",
154
+ rollout_is_threshold=2.0,
155
+ rollout_rs=None,
156
+ )
157
+ assert "rollout_corr/rollout_is_mean" in metrics_truncate
158
+
159
+ # Test mask mode (rejection sampling)
160
+ _, _, metrics_mask = compute_rollout_correction_and_rejection_mask(
161
+ old_log_prob=sample_data["old_log_prob"],
162
+ rollout_log_prob=sample_data["rollout_log_prob"],
163
+ response_mask=sample_data["response_mask"],
164
+ rollout_is="token", # Can also compute IS weights in mask mode
165
+ rollout_is_threshold=2.0,
166
+ rollout_rs="token_k1", # Enable rejection sampling
167
+ rollout_rs_threshold=1.3, # Float upper bound (lower inferred automatically)
168
+ )
169
+ assert "rollout_corr/rollout_is_mean" in metrics_mask
170
+ assert "rollout_corr/rollout_rs_token_k1_mean" in metrics_mask
171
+
172
+    def test_offpolicy_metrics(self, sample_data):
+        """Test off-policy diagnostic metrics computation."""
+        metrics = compute_offpolicy_metrics(
+            old_log_prob=sample_data["old_log_prob"],
+            rollout_log_prob=sample_data["rollout_log_prob"],
+            response_mask=sample_data["response_mask"],
+        )
+
+        # Check key metrics are present
+        assert "training_ppl" in metrics
+        assert "rollout_ppl" in metrics
+        assert "kl" in metrics
+        assert isinstance(metrics["kl"], float)
+
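The diagnostics asserted above reduce to masked means of the two log-prob tensors. The exact estimators verl uses may differ; the perplexity and k1-style KL definitions below are assumptions for illustration:

```python
import math

def offpolicy_diagnostics(old_log_prob, rollout_log_prob, mask):
    """Hypothetical sketch of off-policy diagnostics from per-token log-probs."""
    n = max(sum(mask), 1)
    mean_old = sum(o * m for o, m in zip(old_log_prob, mask)) / n
    mean_roll = sum(r * m for r, m in zip(rollout_log_prob, mask)) / n
    return {
        "training_ppl": math.exp(-mean_old),   # perplexity under the training policy
        "rollout_ppl": math.exp(-mean_roll),   # perplexity under the rollout policy
        "kl": mean_old - mean_roll,            # simple k1 estimate of KL divergence
    }

m = offpolicy_diagnostics([-0.5, -1.0], [-0.6, -1.2], [1, 1])
```

A positive `kl` here indicates the training policy assigns higher likelihood to the sampled tokens than the rollout policy did.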
+    def test_metrics_only_mode(self, sample_data, config_with_rollout_is):
+        """Test metrics-only mode: compute IS weights/metrics but don't apply to loss.
+
+        This tests the use case where rollout_is_threshold is set (enables computation)
+        but rollout_is=False (disables weight application to policy loss).
+        """
+        # Compute IS weights (as trainer would do)
+        rollout_is_weights_proto, _, is_metrics = compute_rollout_correction_and_rejection_mask(
+            old_log_prob=sample_data["old_log_prob"],
+            rollout_log_prob=sample_data["rollout_log_prob"],
+            response_mask=sample_data["response_mask"],
+            rollout_is="token",
+            rollout_is_threshold=2.0,
+            rollout_rs=None,
+        )
+
+        # Metrics should be computed
+        assert len(is_metrics) > 0
+        assert "rollout_corr/rollout_is_mean" in is_metrics
+
+        # In metrics-only mode, we compute loss WITHOUT applying weights
+        # (simulating rollout_is=False)
+        pg_loss_no_weights, _ = compute_policy_loss_vanilla(
+            old_log_prob=sample_data["old_log_prob"],
+            log_prob=sample_data["log_prob"],
+            advantages=sample_data["advantages"],
+            response_mask=sample_data["response_mask"],
+            loss_agg_mode="token-mean",
+            config=config_with_rollout_is,
+            rollout_is_weights=None,  # Don't apply weights
+        )
+
+        # Compare to loss WITH weights (rollout_is=True)
+        rollout_is_weights = rollout_is_weights_proto.batch["rollout_is_weights"]
+        pg_loss_with_weights, _ = compute_policy_loss_vanilla(
+            old_log_prob=sample_data["old_log_prob"],
+            log_prob=sample_data["log_prob"],
+            advantages=sample_data["advantages"],
+            response_mask=sample_data["response_mask"],
+            loss_agg_mode="token-mean",
+            config=config_with_rollout_is,
+            rollout_is_weights=rollout_is_weights,
+        )
+
+        # Losses should be different (weights have an effect)
+        assert not torch.allclose(pg_loss_no_weights, pg_loss_with_weights)
+
+
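The effect checked by the final assertion (weighted vs. unweighted token-mean loss) can be reduced to a few lines. The per-token loss terms and IS weights below are hypothetical values, not verl's actual loss computation:

```python
# Hypothetical per-token policy-gradient loss terms and rollout IS weights.
pg_per_token = [0.4, -0.2, 0.1]
mask = [1, 1, 1]
is_weights = [1.0, 0.5, 2.0]

def token_mean(values, mask):
    """Aggregate per-token values over valid tokens ("token-mean" mode)."""
    return sum(v * m for v, m in zip(values, mask)) / max(sum(mask), 1)

loss_no_weights = token_mean(pg_per_token, mask)
loss_with_weights = token_mean(
    [w * v for w, v in zip(is_weights, pg_per_token)], mask
)
```

Unless every weight is exactly 1, the two aggregates differ, which is what `torch.allclose` returning `False` verifies in the test.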
+class TestRolloutCorrectionConfigNormalization:
+    """Unit tests for RolloutCorrectionConfig canonicalization logic."""
+
+    def test_alias_normalization_and_threshold_parsing(self):
+        config = RolloutCorrectionConfig(
+            rollout_is="token",
+            rollout_is_threshold=2.5,
+            rollout_rs="seq_mean_k1,seq_max_k3",
+            rollout_rs_threshold="0.8_1.2,3.0",
+        )
+
+        assert config.rollout_is == "token"
+        assert config.rollout_is_threshold == pytest.approx(2.5)
+        assert config.rollout_rs == "seq_mean_k1,seq_max_k3"
+        assert config.rollout_rs_threshold == "0.8_1.2,3.0"
+
+    def test_missing_threshold_raises(self):
+        config = RolloutCorrectionConfig(rollout_rs="token_k1")
+        assert config.rollout_rs == "token_k1"
+        assert config.rollout_rs_threshold is None
+
+    def test_float_threshold_conversion_in_factory(self):
+        config = RolloutCorrectionConfig.decoupled_geo_rs_seq_tis(rs_threshold=1.001)
+        assert config.rollout_rs == "seq_mean_k1"
+        assert config.rollout_rs_threshold == 1.001
+
+
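The threshold grammar exercised above (comma-separated per-estimator entries, each either a `low_high` pair or a lone float upper bound) can be parsed as below. `parse_rs_thresholds` is a hypothetical helper for illustration, not verl's implementation; a lone float is returned with `None` for the lower bound, leaving inference to the caller:

```python
def parse_rs_thresholds(spec):
    """Parse a spec like "0.8_1.2,3.0" into per-estimator (low, high) bounds."""
    bounds = []
    for part in str(spec).split(","):
        if "_" in part:
            low, high = part.split("_")
            bounds.append((float(low), float(high)))
        else:
            # Lone float: upper bound only; lower bound left to be inferred.
            bounds.append((None, float(part)))
    return bounds

bounds = parse_rs_thresholds("0.8_1.2,3.0")
```

Each entry then pairs positionally with one estimator in a list like `"seq_mean_k1,seq_max_k3"`.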
+if __name__ == "__main__":
+    pytest.main([__file__, "-v", "-s"])
data/extracting_subclaim/old/extracted_subclaims_classified_multiclinsum_test_en_en.json ADDED
The diff for this file is too large to render. See raw diff
 
data/extracting_subclaim/subset/extracted_subclaims_0_100.json ADDED
The diff for this file is too large to render. See raw diff
 
data/extracting_subclaim/subset/extracted_subclaims_100_200.json ADDED
The diff for this file is too large to render. See raw diff
 
data/extracting_subclaim/subset/extracted_subclaims_200_300.json ADDED
The diff for this file is too large to render. See raw diff
 
data/extracting_subclaim/subset/extracted_subclaims_300_400.json ADDED
The diff for this file is too large to render. See raw diff
 
data/extracting_subclaim/subset/extracted_subclaims_400_500.json ADDED
The diff for this file is too large to render. See raw diff
 
data/extracting_subclaim/subset/extracted_subclaims_500_-1.json ADDED
The diff for this file is too large to render. See raw diff
 
data/extracting_subclaim/subset_testset/extracted_subclaims_multiclinsum_test_en_1500_2000.json ADDED
The diff for this file is too large to render. See raw diff
 
data/extracting_subclaim/subset_testset/extracted_subclaims_multiclinsum_test_en_2000_2500.json ADDED
The diff for this file is too large to render. See raw diff
 
data/extracting_subclaim/subset_testset/extracted_subclaims_multiclinsum_test_en_2500_3000.json ADDED
The diff for this file is too large to render. See raw diff
 
data/extracting_subclaim/subset_testset/extracted_subclaims_multiclinsum_test_en_500_1000.json ADDED
The diff for this file is too large to render. See raw diff
 
data/test_raw_data/pt_test/multiclinsum_test_pt/summaries/multiclinsum_test_1018_pt_sum.txt ADDED
@@ -0,0 +1 @@
+ Um homem afro-americano de 61 anos com histórico médico de hipertensão e esquizofrenia apresentou-se ao pronto-socorro após 2 episódios de síncope e histórico de 3 meses de massa progressiva no pescoço. A tomografia computadorizada do pescoço, abdômen e pelve mostrou uma massa voluminosa no pescoço esquerdo, supraclavicular e axilar, massa na face anterior do coração e múltiplas massas renais sólidas no lado esquerdo e uma provável massa renal no lado direito. O ecocardiograma revelou uma grande massa no VE com deformação da parede livre do VE sugerindo um crescimento maligno. A biópsia do núcleo da massa superficial glútea direita revelou um carcinoma metastático pouco diferenciado de provável origem renal, com a possibilidade de um CCR não classificado. Devido à extensão e carga da metástase, o paciente e os familiares concordaram com um tratamento conservador e avaliação para cuidados paliativos.
data/test_raw_data/pt_test/multiclinsum_test_pt/summaries/multiclinsum_test_1021_pt_sum.txt ADDED
@@ -0,0 +1 @@
+ Este caso descreve um paciente diagnosticado com gengivite lignificada durante a infância, originada por deficiência de plasminogênio e que progrediu para periodontite. O teste genético revelou uma suspeita de associação com a mutação de parada-ganho PLG c.1468C > T (p.Arg490*). A condição periodontal do paciente permaneceu estável com breves intervalos de terapia periodontal de apoio. No entanto, o aparecimento da doença de Behçet induziu inflamação sistêmica aguda, necessitando de hospitalização e tratamento com esteróides. Durante a hospitalização, a abordagem dentária focou-se na manutenção da higiene oral e no alívio da dor relacionada com o contacto. A saúde geral do paciente melhorou com o tratamento hospitalar e os tecidos periodontais deterioraram-se.
data/test_raw_data/pt_test/multiclinsum_test_pt/summaries/multiclinsum_test_1034_pt_sum.txt ADDED
@@ -0,0 +1 @@
+ Aqui, relatamos um caso de PJS em uma mulher de 24 anos com múltiplas máculas mucocutâneas negras que se queixou de corrimento vaginal e menorragia. Além disso, descrevemos pela primeira vez as manifestações ultrassonográficas multimodais de G-EAC correlacionadas com PJS. A vista reconstruída tridimensional de G-EAC em 3D realisticVue exibiu um distintivo "padrão de cosmos" que se assemelha a características em imagens de ressonância magnética, e o ultra-som com contraste exibiu um "padrão de aceleração e desaceleração" dos componentes sólidos dentro dos ecos cervicais mistos. Relatamos as características ultrassonográficas multimodais de um caso de G-EAC relacionado com PJS, bem como revisamos a literatura relacionada com PJS e as características de imagem médica e características clínicas de G-EAC para fornecer insights sobre a viabilidade e potencial de utilização da ultrassonografia multimodal para o diagnóstico de G-EAC.
data/test_raw_data/pt_test/multiclinsum_test_pt/summaries/multiclinsum_test_1074_pt_sum.txt ADDED
@@ -0,0 +1 @@
+ Uma mulher de 52 anos apresentou uma história de dormência e parestesia na mão direita. Os sinais, sintomas, exame físico e eletrodiagnóstico do nervo da paciente sugeriam uma compressão do nervo mediano ao nível do túnel do carpo. No entanto, uma ressonância magnética confirmativa do pulso mostrou uma lesão calcária localizada no túnel do carpo. Subsequentemente, a liberação do túnel do carpo e a excisão em massa foram realizadas com sucesso sem recorrência num intervalo de 3 meses.
data/test_raw_data/pt_test/multiclinsum_test_pt/summaries/multiclinsum_test_1081_pt_sum.txt ADDED
@@ -0,0 +1 @@
+ Uma recém-nascida de 2690 g com mielomeningocele sofreu fraturas bilaterais do fêmur durante a cesariana. A cura completa foi obtida sem sequelas após 21 dias de imobilização com talas de perna longa.
data/test_raw_data/pt_test/multiclinsum_test_pt/summaries/multiclinsum_test_1097_pt_sum.txt ADDED
@@ -0,0 +1 @@
+ Uma mulher de 54 anos foi internada no hospital porque tinha sintomas de expectoração com sangue por mais de 4 meses e hemoptise por 1 semana. As imagens de tomografia computadorizada mostraram atrofia acompanhada por infecções no lobo médio do pulmão direito. Além disso, numerosos nódulos foram identificados no lobo médio do pulmão direito. A paciente foi submetida a pneumonectomia toracoscópica do lobo médio do pulmão direito, e a massa ressecada foi confirmada patologicamente como tendo bronquiectasia, hiperplasia NEC multifocal acompanhada por tumor e PSP.
data/test_raw_data/pt_test/multiclinsum_test_pt/summaries/multiclinsum_test_10_pt_sum.txt ADDED
@@ -0,0 +1 @@
+ Um homem de 39 anos com histórico de doença de VHL e histórico familiar positivo apresentou-se com icterícia e prurido. Ele tinha histórico de craniotomia três vezes. O trabalho de laboratório revelou um nível elevado de bilirrubina total com bilirrubina conjugada predominante. A ressonância magnética com contraste mostrou dilatação da árvore biliar com suspeita de obstrução parcial por múltiplos cistos no pâncreas, com ±0.5-5 cm de diâmetro. Um exame PET/CT mostrou múltiplas lesões correspondentes à doença de VHL. O paciente foi submetido a uma pancreatoduodenectomia total. O achado histopatológico foi um hamartoma pancreático multicístico com hiperplasia de células neuroendócrinas.
data/test_raw_data/pt_test/multiclinsum_test_pt/summaries/multiclinsum_test_1106_pt_sum.txt ADDED
@@ -0,0 +1 @@
+ Os dados clínicos de uma paciente adulta de 19 anos de idade com atresia anal congênita acompanhada por fístula rectovestibular como a principal manifestação foram analisados retrospectivamente. O diagnóstico foi feito com base nos sintomas clínicos da paciente, sinais, imagem que mostra a fístula, raio-x e resultados de imagem de ressonância magnética. O exame pré-operatório foi melhorado. Anorectoplastia foi realizada. A paciente apresentou uma melhora na qualidade de vida e não apresentou evidência de incontinência fecal durante os 6 meses de acompanhamento.
data/test_raw_data/pt_test/multiclinsum_test_pt/summaries/multiclinsum_test_1111_pt_sum.txt ADDED
@@ -0,0 +1 @@
+ Este caso de síndrome de regressão caudal tipo 1 no contexto de diabetes mellitus pré-gestacional materna resultou em natimorto. A mãe era uma mulher caucasiana primigesta de 29 anos de idade com histórico médico de diabetes tipo 2 mal controlada, tratada com metformina antes da gravidez, o que levou à admissão para gestão de glicose e início de insulina às 13 semanas. A hemoglobina A1c de base foi elevada a 8,0%. O ultrassom fetal às 22 semanas foi notável por agenesia sacral grave, dilatação da pelve renal bilateral, artéria umbilical única e hipoplasia pulmonar. A ressonância magnética fetal às 29 semanas mostrou ausência dos dois terços inferiores da coluna vertebral com correspondente anomalia da medula espinal compatível com síndrome de regressão caudal tipo 1. A mãe deu à luz um natimorto do sexo masculino às 39 e 3/7 semanas. A ressonância magnética fetal pós-morte minimamente invasiva e a autópsia por tomografia computadorizada foram realizadas para confirmar os achados clínicos quando a família recusou a autópsia convencional. A etiologia da agenesia sacral foi atribuída a diabetes materna mal controlada no início da gestação.
data/test_raw_data/pt_test/multiclinsum_test_pt/summaries/multiclinsum_test_1114_pt_sum.txt ADDED
@@ -0,0 +1 @@
+ Relatamos um caso de um adenocarcinoma do estômago em um homem japonês de 53 anos com neurofibromatose tipo 1. Uma tomografia computadorizada abdominal e ultrassonografia mostraram tumores em seu fígado. A fibroscopia gástrica revelou um tumor tipo III de Borrmann em seu cárdia que se espalhou para seu esôfago e era altamente suspeito de malignidade. Múltiplas biópsias mostraram um adenocarcinoma do estômago, que foi avaliado como câncer gástrico, estágio IV. A quimioterapia com TS-1 foi realizada. Nosso paciente morreu quatro semanas após a admissão inicial. O exame histológico de uma biópsia de agulha hepática mostrou adenocarcinoma metastático em seu fígado.
data/test_raw_data/pt_test/multiclinsum_test_pt/summaries/multiclinsum_test_1116_pt_sum.txt ADDED
@@ -0,0 +1 @@
+ Descrevemos o caso de um homem caucasiano de 39 anos com intoxicação por teixo comum, para quem a ressuscitação cardiopulmonar, embora atrasada e prolongada, foi bem-sucedida.
data/test_raw_data/pt_test/multiclinsum_test_pt/summaries/multiclinsum_test_1146_pt_sum.txt ADDED
@@ -0,0 +1 @@
+ Uma mulher de 20 anos apresentou-se com visão reduzida (20/100) no seu olho esquerdo (LE). Com base num exame oftalmológico completo, a paciente foi diagnosticada com ASs e MEWDS coincidentes. Duas semanas depois, a melhor acuidade visual corrigida (BCVA) melhorou até 20/25 e os achados de MEWDS quase desapareceram. Dois meses depois, a BCVA voltou a cair (20/100) devido ao desenvolvimento de CNV, que foi tratada com uma única injeção intravitreal de ranibizumab (0,5 mg/0,05 ml). Um mês depois, a BCVA melhorou até 20/40, e houve regressão da CNV. Não houve necessidade de retratamento na última visita de acompanhamento, um ano após a injeção de ranibizumab, quando a paciente apresentou uma recuperação adicional da BCVA até 20/25.
data/test_raw_data/pt_test/multiclinsum_test_pt/summaries/multiclinsum_test_1158_pt_sum.txt ADDED
@@ -0,0 +1 @@
+ Uma mulher negra camaronesa de 25 anos de idade, de origem Bakossi, grávida de 30 semanas, apresentou um teste serológico positivo para rubéola e imunoglobulina G toxoplasma às 21 semanas de gravidez; não pôde beneficiar de um ultrassom morfológico fetal, em parte porque não havia nenhum no local da sua clínica pré-natal e porque havia restrições de acessibilidade para chegar ao hospital de referência mais próximo, a aproximadamente 100 km de distância. Ela voltou ao hospital com dores de parto 14 semanas depois e, após exame, foi observada a dilatação cervical quase completa e teve um natimorto alguns minutos depois; um menino que pesava 1600 g com anencefalia. Os pais devastados do bebê foram aconselhados e receberam apoio psicológico. Ela foi dispensada do hospital 3 dias depois e agora beneficia de acompanhamento contínuo como paciente externo. Foi aconselhada a consultar um ginecologista-obstetra antes da sua próxima gravidez.
data/test_raw_data/pt_test/multiclinsum_test_pt/summaries/multiclinsum_test_1195_pt_sum.txt ADDED
@@ -0,0 +1 @@
+ Relatamos o caso de uma mulher de 46 anos que apresentava uma massa indolor no lado direito do pescoço e dispneia subaguda. As tomografias computadorizadas (TC) do pescoço e tórax mostraram uma grande massa da tiróide que causava estenose traqueal e múltiplas lesões císticas em ambos os pulmões. A tireoidectomia subtotal com uma ressecção do segmento traqueal e análise histológica confirmou o diagnóstico de doença de Rosai-Dorfman (RDD) nodal e extra-nodal (tiróide, traqueal e provavelmente pulmão) com a presença de um número aumentado de células plasmáticas portadoras de IgG4. O acompanhamento clínico, funcional e radiológico 4 anos após a cirurgia sem tratamento médico não mostrou qualquer progressão da doença.
data/test_raw_data/pt_test/multiclinsum_test_pt/summaries/multiclinsum_test_1235_pt_sum.txt ADDED
@@ -0,0 +1 @@
+ Um homem de 71 anos de idade, com histórico de múltiplas admissões por insuficiência cardíaca, foi encaminhado ao nosso instituto após uma ressuscitação cardiopulmonar bem-sucedida. A ecocardiografia transtorácica mostrou a sobreposição da aorta em um grande defeito do septo ventricular e hipertrofia ventricular direita, juntamente com estenose pulmonar grave (PS), tudo o que indicou TOF não reparado. A tomografia computadorizada revelou um shunt de Blalock-Taussig patente, que foi construído aos 19 anos de idade. A angiografia coronária revelou estenoses coronárias multivasculares. Embora o reparo intracardíaco radical não tenha sido realizado devido às suas múltiplas comorbidades, os seus sintomas de insuficiência cardíaca foram significativamente melhorados devido à titulação adequada da medicação. Um ano após a alta, o paciente estava bem e gostava de jogar golfe.
data/test_raw_data/pt_test/multiclinsum_test_pt/summaries/multiclinsum_test_1298_pt_sum.txt ADDED
@@ -0,0 +1 @@
+ Relatamos o caso de uma mulher caucasiana de 60 anos de idade, assintomática, na qual foram descobertos linfomas discordantes quando se observou uma ligeira linfocitose e uma conspícua esplenomegalia. As diferentes características morfológicas, imunofenotípicas e imuno-histoquímicas encontradas nas diferentes amostras patológicas obtidas a partir de sangue periférico, medula óssea e seções do baço, possibilitaram a diferenciação de dois tipos de linfoma de células B não-Hodgkin: um linfoma de células do manto que infiltrou o baço e um linfoma de zona marginal que envolveu tanto a medula óssea como o sangue periférico. Como foi encontrado um rearranjo semelhante do gene IgH tanto na medula óssea como no baço, a hipótese de uma origem comum, seguida por uma seleção clonal diferente dos linfócitos neoplásicos, pode ser tomada em consideração.
data/test_raw_data/pt_test/multiclinsum_test_pt/summaries/multiclinsum_test_1494_pt_sum.txt ADDED
@@ -0,0 +1 @@
+ Um paciente do sexo masculino de 68 anos apresentou anemia e seu exame de sangue oculto nas fezes foi positivo. Um exame endoscópico foi realizado, que revelou uma lesão hemorrágica, irregular e saliente no estômago. A lesão foi diagnosticada como um adenocarcinoma por exame histopatológico da amostra de biópsia e uma gastrectomia segmentar foi realizada. Observou-se uma lesão saliente de 41 x 29 x 18 mm<sup>3</sup> na amostra de ressecção e foi confirmado histologicamente que se tratava de um carcinoma gástrico com composição mista de adenocarcinoma e sarcoma. A invasão do tumor foi limitada à submucosa. Além da porção adenocarcinomatosa, a diferenciação neuroendócrina e o carcinoma gástrico AFP-positivo estavam presentes na porção carcinomatosa do tumor; na porção sarcomatosa, observaram-se componentes condrossarcomatosos, leiomiossarcomatosos e rabdomiossarcomatosos, além do componente sarcomatoso indiferenciado. Além disso, o tumor incluía células semelhantes a células germinativas positivas para SALL4. Apesar da detecção em estágio inicial, o câncer recidivou localmente 14 meses após a ressecção do tumor, o que exigiu uma gastrectomia total. No seguimento de 2 meses após a gastrectomia total, o paciente estava vivo. Este paciente desenvolveu um carcinoma esofágico de células escamosas e um carcinoma adenoescamoso pulmonar primário, ambos os quais foram ressecados.
data/test_raw_data/pt_test/multiclinsum_test_pt/summaries/multiclinsum_test_1520_pt_sum.txt ADDED
@@ -0,0 +1 @@
+ Um homem de 45 anos com dissecção aórtica tipo-II de DeBakey, 7 dias após o procedimento de Bentall, apresentou-se com falta de ar súbita e choque persistente apesar da terapia. A avaliação inicial dirigida para embolia pulmonar foi apoiada por sinais de imagem de referência de raio-X e avaliação de ecocardiografia transtorácica. No entanto, os resultados da tomografia computadorizada foram sugestivos de tamponamento cardíaco, principalmente acumulando-se no lado direito do coração, comprimindo a artéria pulmonar e a veia cava, o que foi confirmado por ecocardiografia transesofágica, imitando assim os achados de embolia pulmonar. Após o procedimento de evacuação do coágulo, o paciente melhorou clinicamente e foi dispensado na semana seguinte.
data/test_raw_data/pt_test/multiclinsum_test_pt/summaries/multiclinsum_test_155_pt_sum.txt ADDED
@@ -0,0 +1 @@
+ Descrevemos um caso de uma mulher de 65 anos com osteoporose que foi submetida a redução aberta e fixação interna de uma fratura proximal do úmero, complicada por uma fratura iatrogénica incomum do úmero ao nível da inserção do parafuso distal, provavelmente secundária à inserção dos parafusos de bloqueio proximais sob pressão.
data/test_raw_data/pt_test/multiclinsum_test_pt/summaries/multiclinsum_test_1582_pt_sum.txt ADDED
@@ -0,0 +1 @@
+ Um homem caucasiano de 65 anos apresentou múltiplas convulsões, disartria e distúrbios comportamentais de etiologia não esclarecida, com paragem cardíaca assístólica associada. O teste de anticorpos mostrou anticorpos anti-ácido gama-aminobutírico-B e anti-Hu no soro e anticorpos anti-ácido gama-aminobutírico-B no fluido cerebrospinal. O diagnóstico de cancro do pulmão de pequenas células foi subsequentemente feito após biópsia pulmonar, e o paciente apresentou melhoria com quimioterapia e imunoglobulina intravenosa.
data/test_raw_data/pt_test/multiclinsum_test_pt/summaries/multiclinsum_test_1642_pt_sum.txt ADDED
@@ -0,0 +1 @@
+ Um homem de 55 anos de idade, que vive com o vírus da imunodeficiência humana, apresentou-se para triagem de cancro anal. O seu exame físico revelou uma pápula cor de carne na margem anal. O diagnóstico diferencial inicial incluiu molusco contagioso, condiloma anal e carcinoma basocelular. A lesão foi excisada para obter um diagnóstico definitivo e descobriu-se que era EA.
data/test_raw_data/pt_test/multiclinsum_test_pt/summaries/multiclinsum_test_1665_pt_sum.txt ADDED
@@ -0,0 +1 @@
+ Apresentamos o caso de um homem de 59 anos de idade, proveniente de uma área rural da Colômbia, que foi admitido na unidade de cuidados intensivos devido a uma insuficiência cardíaca descompensada, que foi difícil de gerir clinicamente, com desenvolvimento de choque séptico e isolamento de Prototheca wickerhamii a partir da cultura de sangue. Fluconazol e Anfotericina B foram administrados com sucesso.