shahidul034 commited on
Commit
7b53b83
·
verified ·
1 Parent(s): c3dfccb

Add files using upload-large-folder tool

Browse files
Files changed (33) hide show
  1. code/RL_model/verl/verl_train/tests/single_controller/__init__.py +13 -0
  2. code/RL_model/verl/verl_train/tests/single_controller/test_auto_padding_on_cpu.py +152 -0
  3. code/RL_model/verl/verl_train/tests/single_controller/test_colocated_workers.py +86 -0
  4. code/RL_model/verl/verl_train/tests/single_controller/test_colocated_workers_fused.py +86 -0
  5. code/RL_model/verl/verl_train/tests/single_controller/test_data_transfer.py +109 -0
  6. code/RL_model/verl/verl_train/tests/single_controller/test_decorator_on_cpu.py +200 -0
  7. code/RL_model/verl/verl_train/tests/single_controller/test_device_mesh_register.py +158 -0
  8. code/RL_model/verl/verl_train/tests/single_controller/test_driverfunc_to_worker.py +85 -0
  9. code/RL_model/verl/verl_train/tests/single_controller/test_fused_workers_on_cpu.py +90 -0
  10. code/RL_model/verl/verl_train/tests/single_controller/test_high_level_scheduling_api.py +103 -0
  11. code/RL_model/verl/verl_train/tests/single_controller/test_rvdz.py +51 -0
  12. code/RL_model/verl/verl_train/tests/single_controller/test_worker_group_torch.py +116 -0
  13. code/RL_model/verl/verl_train/tests/special_e2e/README.md +1 -0
  14. code/RL_model/verl/verl_train/tests/utils/test_activation_offload.py +175 -0
  15. code/RL_model/verl/verl_train/tests/utils/test_check_ipc_version_support_on_npu.py +231 -0
  16. code/RL_model/verl/verl_train/tests/utils/test_config_on_cpu.py +97 -0
  17. code/RL_model/verl/verl_train/tests/utils/test_flops_counter.py +480 -0
  18. code/RL_model/verl/verl_train/tests/utils/test_fs_on_cpu.py +94 -0
  19. code/RL_model/verl/verl_train/tests/utils/test_groupwise.py +98 -0
  20. code/RL_model/verl/verl_train/tests/utils/test_import_utils_on_cpu.py +97 -0
  21. code/RL_model/verl/verl_train/tests/utils/test_linear_cross_entropy.py +361 -0
  22. code/RL_model/verl/verl_train/tests/utils/test_mlflow_key_sanitization.py +64 -0
  23. code/RL_model/verl/verl_train/tests/utils/test_model_on_cpu.py +52 -0
  24. code/RL_model/verl/verl_train/tests/utils/test_nvtx_profile.py +168 -0
  25. code/RL_model/verl/verl_train/tests/utils/test_rollout_skip_on_cpu.py +142 -0
  26. code/RL_model/verl/verl_train/tests/utils/test_rollout_trace_on_cpu.py +246 -0
  27. code/RL_model/verl/verl_train/tests/utils/test_seqlen_balancing.py +278 -0
  28. code/RL_model/verl/verl_train/tests/utils/test_shared_memory.py +260 -0
  29. code/RL_model/verl/verl_train/tests/utils/test_special_linear_cross_entropy_tp.py +514 -0
  30. code/RL_model/verl/verl_train/tests/utils/test_special_mstx_profile.py +274 -0
  31. code/RL_model/verl/verl_train/tests/utils/test_temp_env_on_cpu.py +143 -0
  32. code/RL_model/verl/verl_train/tests/utils/test_timeout_decorator_cpu.py +238 -0
  33. code/RL_model/verl/verl_train/tests/utils/test_torch_functional.py +152 -0
code/RL_model/verl/verl_train/tests/single_controller/__init__.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2025 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
code/RL_model/verl/verl_train/tests/single_controller/test_auto_padding_on_cpu.py ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2025 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import numpy as np
16
+ import ray
17
+ import torch
18
+
19
+ from verl import DataProto
20
+ from verl.protocol import DataProtoConfig
21
+ from verl.single_controller.base import Worker
22
+ from verl.single_controller.base.decorator import Dispatch, register
23
+ from verl.single_controller.ray.base import RayClassWithInitArgs, RayResourcePool, RayWorkerGroup
24
+
25
# Globally enable DataProto auto-padding for every proto built in this module.
# or set env var VERL_AUTO_PADDING = "1" / "true"
DataProtoConfig.auto_padding = True
27
+
28
+
29
@ray.remote
class Actor(Worker):
    """Minimal test worker: offsets the "a" tensor of each DP shard by its rank."""

    def __init__(self) -> None:
        super().__init__()

    @register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO)
    def add(self, data: DataProto):
        # In-place add: the shard processed by rank r comes back shifted by r,
        # which lets the driver verify how data was dispatched.
        data.batch["a"] += self.rank
        return data
38
+
39
+
40
def test_auto_padding():
    """End-to-end check of DataProto auto-padding.

    Verifies that (1) manual ``DataProto.padding`` extends the proto to the
    next multiple of the chunk size, (2) ``chunk`` then yields evenly sized
    shards, and (3) a worker-group method registered with
    ``Dispatch.DP_COMPUTE_PROTO`` transparently pads/unpads inputs whose
    length is not divisible by the worker count -- both via the global
    ``DataProtoConfig.auto_padding`` flag and via the per-proto
    ``auto_padding`` constructor argument.
    """
    ray.init(num_cpus=100)

    chunk_size = 4
    actor_cls = RayClassWithInitArgs(cls=Actor)
    resource_pool = RayResourcePool(process_on_nodes=[chunk_size], use_gpu=False)
    actor_wg = RayWorkerGroup(resource_pool=resource_pool, ray_cls_with_init=actor_cls)

    # test locally first
    for test_size in range(4, 20):
        local_data = DataProto.from_dict({"a": torch.zeros(test_size)}, {"na": np.zeros(test_size, dtype=object)})
        padding_size = (chunk_size - (test_size % chunk_size)) if (test_size % chunk_size > 0) else 0
        local_data.padding(padding_size)
        # BUGFIX: the previous assertion compared len(local_data) against
        # len(local_data) + len(local_data) % chunk_size, which is always true
        # after padding (the remainder is 0), so it could never fail.
        # Compare against the original size plus the computed pad instead.
        assert len(local_data) == test_size + padding_size, (
            f"expecting padded length to be {test_size + padding_size}, but got {len(local_data)}"
        )
        chunked = local_data.chunk(chunk_size)
        assert len(chunked) == chunk_size, f"during test_size = {test_size}, expecting {chunk_size}, got {chunked}"
        expected_dp_len = test_size // chunk_size + bool(test_size % chunk_size)
        for dp in chunked:
            assert len(dp) == expected_dp_len, (
                f"test size = {test_size}, expecting dp to be length of {expected_dp_len}, but got {len(dp)}: {dp} {chunked}"
            )

    def _check(size: int, output: DataProto, how: str) -> None:
        # The dispatched output must be unpadded back to the request size.
        print(output.batch["a"])
        assert len(output) == size, f"Failed in {how} split and padding."

    def _make(size: int, **kwargs) -> DataProto:
        # Fresh proto per call: workers mutate their shards in place.
        return DataProto.from_dict(
            {"a": torch.zeros(size)}, {"na": np.array([str(i) for i in range(size)], dtype=object)}, **kwargs
        )

    def _make_single(size: int, **kwargs) -> DataProto:
        return DataProto.from_single_dict(
            {"a": torch.zeros(size), "na": np.array([str(i) for i in range(size)], dtype=object)}, **kwargs
        )

    # test with RayWorkerGroup method decorated as dispatch_mode=Dispatch.DP_COMPUTE_PROTO,
    # exercising both positional and keyword argument dispatch.
    for size in (10, 1, 8):
        _check(size, actor_wg.add(_make(size)), "args")
        _check(size, actor_wg.add(data=_make(size)), "kwargs")

    # test data proto specific config: the per-proto auto_padding flag must
    # still work once the global flag is disabled.
    DataProtoConfig.auto_padding = False

    _check(10, actor_wg.add(_make(10, auto_padding=True)), "args")
    _check(10, actor_wg.add(data=_make(10, auto_padding=True)), "kwargs")

    _check(1, actor_wg.add(_make_single(1, auto_padding=True)), "args")
    _check(1, actor_wg.add(data=_make_single(1, auto_padding=True)), "kwargs")

    # size 8 divides evenly, so no padding is needed even without the flag.
    _check(8, actor_wg.add(_make_single(8)), "args")
    _check(8, actor_wg.add(data=_make_single(8)), "kwargs")

    ray.shutdown()
149
+
150
+
151
# Allow running this test directly without pytest.
if __name__ == "__main__":
    test_auto_padding()
code/RL_model/verl/verl_train/tests/single_controller/test_colocated_workers.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import ray
16
+
17
+ from verl import DataProto
18
+ from verl.single_controller.base import Worker
19
+ from verl.single_controller.base.decorator import Dispatch, register
20
+ from verl.single_controller.ray.base import (
21
+ RayClassWithInitArgs,
22
+ RayResourcePool,
23
+ RayWorkerGroup,
24
+ create_colocated_worker_cls,
25
+ )
26
+ from verl.utils.device import get_device_name
27
+
28
+
29
@ray.remote
class Actor(Worker):
    """Minimal test worker: offsets the "a" tensor of each DP shard by its rank."""

    def __init__(self) -> None:
        super().__init__()

    @register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO)
    def add(self, data: DataProto):
        # In-place add: shards come back shifted by the processing worker's rank.
        data.batch["a"] += self.rank
        return data
38
+
39
+
40
@ray.remote
class Critic(Worker):
    """Test worker that subtracts a configured constant from the "a" tensor.

    NOTE(review): ``sub`` is an ``async def`` here while the fused-colocation
    variant of this test uses a plain ``def`` -- presumably to exercise async
    method dispatch through ``register``; confirm against the decorator docs.
    """

    def __init__(self, config) -> None:
        super().__init__()
        # Expects a mapping with key "b": the amount to subtract.
        self.config = config

    @register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO)
    async def sub(self, data: DataProto):
        data.batch["a"] -= self.config["b"]
        return data
50
+
51
+
52
def test_colocated_workers():
    """Colocated actor/critic worker groups must produce bitwise-identical
    results to the same roles running as standalone worker groups."""
    ray.init()

    import torch

    data = DataProto.from_dict({"a": torch.zeros(10)})

    # First, run the two roles as independent worker groups on a shared pool.
    actor_cls = RayClassWithInitArgs(cls=Actor)
    critic_cls = RayClassWithInitArgs(cls=Critic, config={"b": 10})
    resource_pool = RayResourcePool(process_on_nodes=[2])

    device = get_device_name()
    actor_wg = RayWorkerGroup(resource_pool=resource_pool, ray_cls_with_init=actor_cls, device_name=device)
    critic_wg = RayWorkerGroup(resource_pool=resource_pool, ray_cls_with_init=critic_cls, device_name=device)

    expected_actor_output = actor_wg.add(data)
    expected_critic_output = critic_wg.sub(data)

    # Now fuse both roles into one colocated worker class and spawn
    # per-role facades from the combined group.
    cls_dict = {"actor": actor_cls, "critic": critic_cls}
    wg_dict = RayWorkerGroup(
        resource_pool=resource_pool,
        ray_cls_with_init=create_colocated_worker_cls(cls_dict),
        device_name=device,
    )
    spawn_wg = wg_dict.spawn(prefix_set=cls_dict.keys())

    actor_output = spawn_wg["actor"].add(data)
    critic_output = spawn_wg["critic"].sub(data)

    # Exact equality: colocation must not perturb numerics at all.
    torch.testing.assert_close(expected_actor_output.batch, actor_output.batch, atol=0, rtol=0)
    torch.testing.assert_close(expected_critic_output.batch, critic_output.batch, atol=0, rtol=0)

    ray.shutdown()
code/RL_model/verl/verl_train/tests/single_controller/test_colocated_workers_fused.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import ray
16
+
17
+ from verl import DataProto
18
+ from verl.single_controller.base import Worker
19
+ from verl.single_controller.base.decorator import Dispatch, register
20
+ from verl.single_controller.ray.base import (
21
+ RayClassWithInitArgs,
22
+ RayResourcePool,
23
+ RayWorkerGroup,
24
+ create_colocated_worker_cls_fused,
25
+ )
26
+ from verl.utils.device import get_device_name
27
+
28
+
29
@ray.remote
class Actor(Worker):
    """Minimal test worker: offsets the "a" tensor of each DP shard by its rank."""

    def __init__(self) -> None:
        super().__init__()

    @register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO)
    def add(self, data: DataProto):
        # In-place add: shards come back shifted by the processing worker's rank.
        data.batch["a"] += self.rank
        return data
38
+
39
+
40
@ray.remote
class Critic(Worker):
    """Test worker that subtracts a configured constant from the "a" tensor."""

    def __init__(self, config) -> None:
        super().__init__()
        # Expects a mapping with key "b": the amount to subtract.
        self.config = config

    @register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO)
    def sub(self, data: DataProto):
        data.batch["a"] -= self.config["b"]
        return data
50
+
51
+
52
def test_colocated_workers_fused():
    """Fused colocated actor/critic worker groups must produce bitwise-identical
    results to the same roles running as standalone worker groups."""
    ray.init()

    import torch

    data = DataProto.from_dict({"a": torch.zeros(10)})

    # First, run the two roles as independent worker groups on a shared pool.
    actor_cls = RayClassWithInitArgs(cls=Actor)
    critic_cls = RayClassWithInitArgs(cls=Critic, config={"b": 10})
    resource_pool = RayResourcePool(process_on_nodes=[2])

    device = get_device_name()
    actor_wg = RayWorkerGroup(resource_pool=resource_pool, ray_cls_with_init=actor_cls, device_name=device)
    critic_wg = RayWorkerGroup(resource_pool=resource_pool, ray_cls_with_init=critic_cls, device_name=device)

    expected_actor_output = actor_wg.add(data)
    expected_critic_output = critic_wg.sub(data)

    # Now fuse both roles into one colocated worker class (fused variant)
    # and spawn per-role facades from the combined group.
    cls_dict = {"actor": actor_cls, "critic": critic_cls}
    wg_dict = RayWorkerGroup(
        resource_pool=resource_pool,
        ray_cls_with_init=create_colocated_worker_cls_fused(cls_dict),
        device_name=device,
    )
    spawn_wg = wg_dict.spawn(prefix_set=cls_dict.keys())

    actor_output = spawn_wg["actor"].add(data)
    critic_output = spawn_wg["critic"].sub(data)

    # Exact equality: fused colocation must not perturb numerics at all.
    torch.testing.assert_close(expected_actor_output.batch, actor_output.batch, atol=0, rtol=0)
    torch.testing.assert_close(expected_critic_output.batch, critic_output.batch, atol=0, rtol=0)

    ray.shutdown()
code/RL_model/verl/verl_train/tests/single_controller/test_data_transfer.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """
15
+ In this test, we instantiate a data parallel worker with 8 GPUs
16
+ """
17
+
18
+ import ray
19
+ import tensordict
20
+ import torch
21
+ from codetiming import Timer
22
+ from packaging import version
23
+ from torch import distributed as dist
24
+
25
+ from verl import DataProto
26
+ from verl.single_controller.base import Worker
27
+ from verl.single_controller.base.decorator import Dispatch, register
28
+ from verl.single_controller.ray import RayClassWithInitArgs, RayResourcePool, RayWorkerGroup
29
+ from verl.utils.device import get_device_name
30
+ from verl.utils.ray_utils import parallel_put
31
+
32
+
33
@ray.remote
class DummyWorker(Worker):
    """Test worker that increments every tensor in the incoming batch by 1."""

    def __init__(self):
        super().__init__()
        dist.init_process_group()

    @register(dispatch_mode=Dispatch.DP_COMPUTE, blocking=False)
    def do_nothing(self, data):
        # Despite the name, this mutates each tensor so the driver can verify
        # that data made a round trip through the worker.
        for key in data.batch.keys():
            data.batch[key] += 1
        # consolidate() before returning on tensordict >= 0.5 --
        # NOTE(review): presumably required for efficient serialization back
        # to the driver (mirrors the driver-side consolidate); confirm.
        if version.parse(tensordict.__version__) >= version.parse("0.5.0"):
            data.batch = data.batch.consolidate()
        return data
46
+
47
+
48
def test_data_transfer():
    """Time serialization/put/dispatch/get of large DataProto chunks across a
    data-parallel worker group, and verify the round-tripped values."""
    ray.init()

    # Build an 8-process worker group on a single node.
    resource_pool = RayResourcePool([8])
    cls_with_init = RayClassWithInitArgs(cls=DummyWorker)
    wg = RayWorkerGroup(resource_pool, cls_with_init, device_name=get_device_name())

    # Realistic dataset dimensions.
    batch_size = 4096
    seqlen = 32768

    data_dict = {str(idx): torch.randint(0, 10000, (batch_size, seqlen)) for idx in range(2)}
    data = DataProto.from_dict(tensors=data_dict)
    print(data)

    # Split manually: one chunk per worker.
    data_list = data.chunk(wg.world_size)

    # consolidate is necessary on tensordict >= 0.5; the check is loop-invariant.
    if version.parse(tensordict.__version__) >= version.parse("0.5.0"):
        for chunk in data_list:
            chunk.batch = chunk.batch.consolidate()

    with Timer(name="ray.pickle", initial_text=True):
        for chunk in data_list:
            ray.cloudpickle.pickle.dumps(chunk)

    with Timer(name="raw.pickle", initial_text=True):
        import pickle

        for chunk in data_list:
            pickle.dumps(chunk)

    # Put the chunks into the object store up front (takes around 40 seconds).
    with Timer(name="put", initial_text=True):
        data_list_ref = parallel_put(data_list)

    with Timer(name="launch", initial_text=True):
        output_ref = wg.do_nothing(data_list_ref)

    # Fetch the results (takes around 40 seconds).
    with Timer(name="get", initial_text=True):
        output_lst = ray.get(output_ref)

    # Every tensor must come back incremented by exactly 1.
    for input_data, output_data in zip(data_list, output_lst, strict=True):
        for key in input_data.batch.keys():
            assert torch.all(torch.eq(input_data.batch[key] + 1, output_data.batch[key])), (
                input_data.batch[key],
                output_data.batch[key],
                key,
            )

    ray.shutdown()
code/RL_model/verl/verl_train/tests/single_controller/test_decorator_on_cpu.py ADDED
@@ -0,0 +1,200 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import asyncio
16
+ import time
17
+
18
+ import pytest
19
+ import ray
20
+ import torch
21
+ from tensordict import TensorDict
22
+
23
+ from verl.protocol import DataProto, DataProtoFuture
24
+ from verl.single_controller.base.decorator import Dispatch, make_nd_compute_dataproto_dispatch_fn, register
25
+ from verl.single_controller.base.worker import Worker
26
+ from verl.single_controller.ray import RayClassWithInitArgs, RayResourcePool, RayWorkerGroup
27
+ from verl.utils import tensordict_utils as tu
28
+
29
+
30
# Pytest fixture for Ray setup/teardown
@pytest.fixture
def ray_init_shutdown():
    # 100 logical CPUs so the small resource pools below are never starved.
    ray.init(num_cpus=100)
    yield
    ray.shutdown()
36
+
37
+
38
# Define a simple worker for testing
@ray.remote
class DecoratorTestWorker(Worker):
    """Worker exercising the @register decorator: sync DP dispatch, async DP
    dispatch, and mesh-based TensorDict dispatch."""

    def __init__(self, initial_value=0):
        super().__init__()
        # Constant added to every input; lets tests distinguish worker config.
        self.value = initial_value
        # Simulate some setup if needed
        time.sleep(0.1)  # Ensure worker init completes

        # Register this rank on the "train" mesh so mesh-based dispatch works.
        self._register_dispatch_collect_info(mesh_name="train", dp_rank=self.rank, is_collect=True)

    # Test method for synchronous DP compute (default behavior)
    @register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO)
    def dp_compute(self, data: DataProto) -> DataProto:
        time.sleep(0.1)  # Simulate work
        rank_value = torch.tensor(self.rank, device=data.batch["input"].device, dtype=data.batch["input"].dtype)
        data.batch["output"] = data.batch["input"] + self.value + rank_value
        return data

    # Test async def method with DP compute (default behavior)
    @register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO, blocking=False)
    async def async_dp_compute(self, data: DataProto) -> DataProto:
        await asyncio.sleep(0.1)  # Simulate async work
        rank_value = torch.tensor(self.rank, device=data.batch["input"].device, dtype=data.batch["input"].dtype)
        # Distinct formula (input * 2 + ...) so sync/async paths are distinguishable.
        data.batch["output_async"] = data.batch["input"] * 2 + self.value + rank_value
        return data

    @register(dispatch_mode=make_nd_compute_dataproto_dispatch_fn(mesh_name="train"), blocking=False)
    def dp_compute_td(self, data: TensorDict) -> TensorDict:
        # note that we have to call contiguous so that we can modify data in place
        data = tu.contiguous(data)
        rank_value = torch.tensor(self.rank, device=data["input"].device, dtype=data["input"].dtype)
        data["output"] = data["input"] + self.value + rank_value
        position_ids = data.pop("position_ids")
        # NOTE(review): touches a private nested-tensor field to restore the
        # ragged dimension after transfer -- presumably lost in serialization;
        # confirm against torch nested-tensor internals.
        position_ids._ragged_idx = 2

        # Sample i on rank r originally had sequence length 4 + 2r + i;
        # verify the jagged position_ids survived dispatch intact.
        for i, position_id in enumerate(position_ids.unbind(dim=0)):
            assert (position_id == torch.arange(4 + rank_value * 2 + i).expand(position_id.shape)).all()

        return data
79
+
80
+
81
+ # Test function for synchronous DP compute
82
def test_decorator_dp_compute(ray_init_shutdown):
    """
    Tests the default behavior of a synchronous decorated method with DP_COMPUTE_PROTO.
    Verifies the result correctness.
    """
    world_size = 2
    pool = RayResourcePool([world_size], use_gpu=False, max_colocate_count=1)  # CPU-only pool
    worker_cls = RayClassWithInitArgs(cls=DecoratorTestWorker, initial_value=10)
    wg = RayWorkerGroup(pool, worker_cls, name_prefix=f"decorator_test_sync_dp_{int(time.time())}")

    # Four samples, split two per worker.
    data = DataProto(batch=TensorDict({"input": torch.arange(4, dtype=torch.float32)}, batch_size=[4]))

    output = wg.dp_compute(data)

    assert isinstance(output, DataProto), "Expected DataProto result"
    assert "output" in output.batch.keys()
    assert len(output) == len(data), "Output length should match input length"

    # Worker r holds samples [2r, 2r+1] and adds initial_value(10) + rank(r).
    expected_output = torch.cat(
        [torch.tensor(half, dtype=torch.float32) + 10 + rank for rank, half in enumerate(([0, 1], [2, 3]))]
    )

    torch.testing.assert_close(output.batch["output"], expected_output, msg="Sync DP compute output data mismatch")
115
+
116
+
117
+ # Test function for async def method with DP compute
118
def test_decorator_async_function(ray_init_shutdown):
    """
    Tests the decorator with an `async def` method using DP_COMPUTE_PROTO.
    Verifies that the call returns a future and the result is correct after .get().
    """
    world_size = 2
    pool = RayResourcePool([world_size], use_gpu=False, max_colocate_count=1)
    worker_cls = RayClassWithInitArgs(cls=DecoratorTestWorker, initial_value=5)
    wg = RayWorkerGroup(pool, worker_cls, name_prefix=f"decorator_test_async_dp_{int(time.time())}")

    # Four samples, split two per worker.
    data = DataProto(batch=TensorDict({"input": torch.arange(4, dtype=torch.float32)}, batch_size=[4]))

    # A non-blocking async method hands back a future, not a DataProto.
    future_output: DataProtoFuture = wg.async_dp_compute(data)
    assert isinstance(future_output, DataProtoFuture), "Expected DataProtoFuture for async def call"

    # Resolving the future blocks until all workers finish.
    result_data = future_output.get()

    assert isinstance(result_data, DataProto)
    assert "output_async" in result_data.batch.keys()
    assert len(result_data) == len(data), "Output length should match input length"

    # Worker r holds samples [2r, 2r+1] and computes input * 2 + initial_value(5) + rank(r).
    expected_output = torch.cat(
        [torch.tensor(half, dtype=torch.float32) * 2 + 5 + rank for rank, half in enumerate(([0, 1], [2, 3]))]
    )

    torch.testing.assert_close(
        result_data.batch["output_async"], expected_output, msg="Async DP compute output data mismatch"
    )
159
+
160
+
161
def test_decorator_dp_compute_td(ray_init_shutdown):
    """Like test_decorator_dp_compute, but dispatches a raw TensorDict (with a
    jagged nested position_ids tensor) through the registered "train" mesh."""
    world_size = 2
    pool = RayResourcePool([world_size], use_gpu=False, max_colocate_count=1)  # CPU-only pool
    worker_cls = RayClassWithInitArgs(cls=DecoratorTestWorker, initial_value=10)
    wg = RayWorkerGroup(pool, worker_cls, name_prefix=f"decorator_test_sync_dp_{int(time.time())}")

    # Four samples; sample i carries a (4, 4 + i) position_ids block, packed
    # as a jagged nested tensor.
    input_tensor = torch.arange(4, dtype=torch.float32)
    position_ids = torch.nested.as_nested_tensor(
        [torch.arange(seq_len).expand(4, seq_len).contiguous() for seq_len in range(4, 8)],
        layout=torch.jagged,
    )
    data = TensorDict({"input": input_tensor, "position_ids": position_ids}, batch_size=[4])

    # Non-blocking dispatch returns a future; resolve it right away.
    output = wg.dp_compute_td(data).get()

    assert isinstance(output, TensorDict), "Expected DataProto result"
    assert "output" in output.keys()
    assert len(output) == len(data), "Output length should match input length"

    # Worker r holds samples [2r, 2r+1] and adds initial_value(10) + rank(r).
    expected_output = torch.cat(
        [torch.tensor(half, dtype=torch.float32) + 10 + rank for rank, half in enumerate(([0, 1], [2, 3]))]
    )

    torch.testing.assert_close(output["output"], expected_output, msg="Sync DP compute output data mismatch")
code/RL_model/verl/verl_train/tests/single_controller/test_device_mesh_register.py ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2025 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ import numpy as np
17
+ import ray
18
+ import torch
19
+ from tensordict import TensorDict
20
+
21
+ import verl.utils.tensordict_utils as tu
22
+ from verl import DataProto
23
+ from verl.single_controller.base import Worker
24
+ from verl.single_controller.base.decorator import make_nd_compute_dataproto_dispatch_fn, register
25
+ from verl.utils.device import get_device_name, get_nccl_backend
26
+
27
+
28
@ray.remote
class TestActor(Worker):
    """Worker registering two dispatch meshes: an inference mesh (dp=2, tp=4)
    and a training mesh (pp=2, dp=2, tp=2)."""

    def __init__(self):
        super().__init__()

        import torch.distributed

        torch.distributed.init_process_group(backend=get_nccl_backend())
        mesh_init = torch.distributed.device_mesh.init_device_mesh
        self.infer_device_mesh = mesh_init(
            device_type=get_device_name(), mesh_shape=[2, 4], mesh_dim_names=["dp", "tp"]
        )
        self.train_device_mesh = mesh_init(
            device_type=get_device_name(), mesh_shape=[2, 2, 2], mesh_dim_names=["pp", "dp", "tp"]
        )

        # Inference results are collected from tp rank 0 of every dp group.
        self._register_dispatch_collect_info(
            "infer",
            dp_rank=self.infer_device_mesh["dp"].get_local_rank(),
            is_collect=self.infer_device_mesh["tp"].get_local_rank() == 0,
        )
        # Training results are collected from tp rank 0 on pp stage 1.
        self._register_dispatch_collect_info(
            "train",
            dp_rank=self.train_device_mesh["dp"].get_local_rank(),
            is_collect=self.train_device_mesh["tp"].get_local_rank() == 0
            and self.train_device_mesh["pp"].get_local_rank() == 1,
        )

    @register(dispatch_mode=make_nd_compute_dataproto_dispatch_fn(mesh_name="infer"))
    def generate_data_proto(self, data: DataProto):
        """Shift this shard's "a" by (tp_rank + 1) * dp_rank."""
        tp = self.infer_device_mesh["tp"].get_local_rank()
        dp = self.infer_device_mesh["dp"].get_local_rank()
        data.batch["a"] += (tp + 1) * dp
        return data

    @register(dispatch_mode=make_nd_compute_dataproto_dispatch_fn(mesh_name="infer"))
    def generate_tensordict(self, data: TensorDict):
        """Same as generate_data_proto, but on a raw TensorDict."""
        tp = self.infer_device_mesh["tp"].get_local_rank()
        dp = self.infer_device_mesh["dp"].get_local_rank()
        data["a"] += (tp + 1) * dp
        return data

    @register(dispatch_mode=make_nd_compute_dataproto_dispatch_fn(mesh_name="train"))
    def train_data_proto(self, data: DataProto):
        """Shift "a" by (tp+1)*(dp+2)*(pp+3) on the training mesh."""
        tp = self.train_device_mesh["tp"].get_local_rank()
        dp = self.train_device_mesh["dp"].get_local_rank()
        pp = self.train_device_mesh["pp"].get_local_rank()
        data.batch["a"] += (tp + 1) * (dp + 2) * (pp + 3)
        # On the collect ranks (tp=0, pp=1): dp 0 adds 8 (3 + 8 = 11),
        # dp 1 adds 12 (4 + 12 = 16).
        return data

    @register(dispatch_mode=make_nd_compute_dataproto_dispatch_fn(mesh_name="train"))
    def train_tensordict(self, data: TensorDict):
        """TensorDict twin of train_data_proto."""
        tp = self.train_device_mesh["tp"].get_local_rank()
        dp = self.train_device_mesh["dp"].get_local_rank()
        pp = self.train_device_mesh["pp"].get_local_rank()
        data["a"] += (tp + 1) * (dp + 2) * (pp + 3)
        # Same expected collect-rank additions as train_data_proto: 11 and 16.
        return data

    @register(dispatch_mode=make_nd_compute_dataproto_dispatch_fn(mesh_name="infer"))
    def generate_nested_tensor(self, data: TensorDict):
        """Shift a jagged input_ids shard by tp_rank + dp_rank."""
        tp = self.infer_device_mesh["tp"].get_local_rank()
        dp = self.infer_device_mesh["dp"].get_local_rank()
        assert data.shape[0] == 8
        data["input_ids"] += tp + dp

        print(data)
        return data
98
+
99
+
100
def test_dist_global_info_wg():
    """End-to-end check of mesh-based dispatch/collect.

    Creates a worker group of size 8, registers an "infer" mesh (tp=4, dp=2)
    and a "train" mesh (tp=2, dp=2, pp=2), then verifies that DataProto,
    TensorDict, and jagged nested-tensor payloads are split, computed, and
    collected correctly.

    Fix: wrap the body in try/finally so ``ray.shutdown()`` also runs when an
    assertion fails, instead of leaking the Ray session into later tests.
    """
    from verl.single_controller.ray import RayClassWithInitArgs, RayResourcePool, RayWorkerGroup

    ray.init()
    try:
        ray_cls = RayClassWithInitArgs(TestActor)
        resource_pool = RayResourcePool(process_on_nodes=[8])
        wg = RayWorkerGroup(resource_pool=resource_pool, ray_cls_with_init=ray_cls, device_name=get_device_name())

        # --- infer mesh (dp=2, tp=4): two dp groups of four tp ranks each ---
        infer_input_data_proto = DataProto.from_single_dict(data={"a": torch.tensor([1, 2])})
        infer_output_data_proto = wg.generate_data_proto(infer_input_data_proto)

        assert wg._dispatch_info["infer"] == [0, 0, 0, 0, 1, 1, 1, 1]

        # dp group 0 adds 0, dp group 1 adds 1 (collected at tp rank 0).
        assert torch.all(torch.eq(infer_output_data_proto.batch["a"], torch.tensor([1, 3])))

        infer_input_tensordict = infer_input_data_proto.to_tensordict()
        infer_output_tensordict = wg.generate_tensordict(infer_input_tensordict)
        assert torch.all(torch.eq(infer_output_tensordict["a"], torch.tensor([1, 3])))

        # --- train mesh (pp=2, dp=2, tp=2) ---
        train_input_data_proto = DataProto.from_single_dict(data={"a": torch.tensor([3, 4])})
        train_output_data_proto = wg.train_data_proto(train_input_data_proto)

        assert wg._dispatch_info["train"] == [0, 0, 1, 1, 0, 0, 1, 1]

        # Collect ranks (tp=0, pp=1): 3 + 8 = 11 and 4 + 12 = 16.
        assert torch.all(torch.eq(train_output_data_proto.batch["a"], torch.tensor([11, 16])))

        train_input_tensordict = train_input_data_proto.to_tensordict()
        train_output_tensordict = wg.train_tensordict(train_input_tensordict)
        assert torch.all(torch.eq(train_output_tensordict["a"], torch.tensor([11, 16])))

        # --- jagged nested tensors through the infer mesh ---
        input_ids = [
            torch.randint(low=0, high=128, size=(np.random.randint(low=1, high=10, dtype=np.int64),))
            for _ in range(16)
        ]
        input_ids = torch.nested.as_nested_tensor(input_ids, layout=torch.jagged)
        data = tu.get_tensordict(tensor_dict={"input_ids": input_ids})
        output = wg.generate_nested_tensor(data)

        # Each dp half is shifted by tp_rank + dp_rank; collected at tp rank 0,
        # so half 0 gains 0 and half 1 gains 1.
        input_ids_chunked = list(input_ids.chunk(2))
        input_ids_chunked[0] += 0
        input_ids_chunked[1] += 1

        expected = tu.concat_nested_tensors(input_ids_chunked)

        assert torch.all(torch.eq(output["input_ids"].values(), expected.values()))
    finally:
        # Always tear down Ray, even when an assertion above fails.
        ray.shutdown()


if __name__ == "__main__":
    test_dist_global_info_wg()
code/RL_model/verl/verl_train/tests/single_controller/test_driverfunc_to_worker.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import os
16
+
17
+ import ray
18
+ import torch
19
+ from tensordict import TensorDict
20
+
21
+ from verl import DataProto
22
+ from verl.single_controller.base.worker import Worker
23
+ from verl.single_controller.ray import RayWorkerGroup
24
+ from verl.single_controller.ray.base import RayClassWithInitArgs, RayResourcePool
25
+ from verl.utils.device import get_device_name
26
+
27
+ os.environ["RAY_DEDUP_LOGS"] = "0"
28
+ os.environ["NCCL_DEBUG"] = "WARN"
29
+
30
+
31
@ray.remote
class ModelActor(Worker):
    """Minimal Worker used purely as a remote execution target."""

    def __init__(self):
        # NOTE(review): deliberately does not call super().__init__() —
        # presumably the test only needs a bare actor; confirm before reuse.
        pass
35
+
36
+
37
class HackSelf:
    """Empty stand-in object passed as ``self`` when executing the driver
    function locally for comparison against the worker-group result."""

    def __init__(self):
        pass
40
+
41
+
42
def get_aux_metrics(self, test_proto):
    """Count the elements in each row of ``sequence_ids`` and return both the
    ids and the per-row counts as a DataProto."""
    sequence_ids = test_proto.batch["sequence_ids"]
    # One entry per row: the length of that row's token list.
    counts = [len(row.tolist()) for row in sequence_ids]
    batch = TensorDict(
        {"sequence_ids": sequence_ids, "decode_count": torch.tensor(counts)},
        batch_size=sequence_ids.size(0),
    )
    return DataProto(batch=batch)
53
+
54
+
55
def test():
    """Run a driver-defined function on remote workers and compare against
    executing the same function locally on the driver.

    Fix: move ``ray.shutdown()`` into a finally block so the Ray session is
    released even when the comparison fails.
    """
    ray.init()
    try:
        # create 2 workers, each hold a GPU
        resource_pool = RayResourcePool([2], use_gpu=True, name_prefix="a")

        class_with_args = RayClassWithInitArgs(cls=ModelActor)
        shard_wg = RayWorkerGroup(resource_pool, class_with_args, device_name=get_device_name())

        test_bs = 8
        test_proto = DataProto(
            TensorDict(
                {
                    "sequence_ids": torch.ones([test_bs, 2048], dtype=torch.int64),
                },
                batch_size=test_bs,
            ),
            meta_info={"query_length": 1536},
        )

        # Sharding among different ranks
        ret_proto1 = shard_wg.execute_with_func_generator(get_aux_metrics, test_proto)

        # compare against executing on the driver
        hs = HackSelf()
        ret_proto2 = get_aux_metrics(hs, test_proto)

        torch.testing.assert_close(ret_proto1.batch["decode_count"], ret_proto2.batch["decode_count"])
    finally:
        # Always release Ray, even if the assertion above fails.
        ray.shutdown()
code/RL_model/verl/verl_train/tests/single_controller/test_fused_workers_on_cpu.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import ray
16
+
17
+ from verl.single_controller.base import Worker
18
+ from verl.single_controller.base.decorator import Dispatch, register
19
+ from verl.single_controller.ray.base import (
20
+ RayClassWithInitArgs,
21
+ RayResourcePool,
22
+ RayWorkerGroup,
23
+ create_colocated_worker_raw_cls,
24
+ )
25
+
26
+
27
@ray.remote
class Actor(Worker):
    """Worker whose ``add`` shifts its input by the worker's rank."""

    def __init__(self) -> None:
        super().__init__()

    @register(dispatch_mode=Dispatch.ONE_TO_ALL)
    def add(self, x):
        # x is a number, so this is equivalent to the in-place form.
        return x + self.rank
36
+
37
+
38
@ray.remote
class Critic(Worker):
    """Worker whose ``sub`` subtracts a fixed offset supplied at init."""

    def __init__(self, val) -> None:
        super().__init__()
        self.val = val

    @register(dispatch_mode=Dispatch.ALL_TO_ALL)
    def sub(self, x):
        return x - self.val
48
+
49
+
50
# Build the raw (non-actor) fused base class that colocates an Actor and a
# Critic inside one process; HybridWorker below derives from it and the test
# accesses the sub-workers through the "actor"/"critic" keys of cls_dict.
actor_cls = RayClassWithInitArgs(cls=Actor)
critic_cls = RayClassWithInitArgs(cls=Critic, val=10)
cls_dict = {"actor": actor_cls, "critic": critic_cls}
FusedBaseClass = create_colocated_worker_raw_cls(cls_dict)
54
+
55
+
56
@ray.remote
class HybridWorker(FusedBaseClass):
    """Fused worker exposing a combined add-then-sub operation."""

    @register(dispatch_mode=Dispatch.ONE_TO_ALL)
    def foo(self, x):
        # Chain the two colocated sub-workers: actor first, then critic.
        added = self.actor.add(x)
        return self.critic.sub(added)
61
+
62
+
63
def test_fused_workers():
    """Check that calls routed through the fused worker's sub-workers match a
    method defined directly on the fused class.

    Fix: wrap the body in try/finally so ``ray.shutdown()`` runs even when an
    assertion fails, and drop the debug prints.
    """
    ray.init(num_cpus=100)
    try:
        # create separate workers on the same resource pool (one node, 2 CPUs)
        process_on_nodes = [2]
        resource_pool = RayResourcePool(process_on_nodes=process_on_nodes, use_gpu=False)

        # create colocated workers
        hybrid_cls_with_init = RayClassWithInitArgs(cls=HybridWorker)
        hybrid_cls_with_init.fused_worker_used = True

        fused_wg = RayWorkerGroup(resource_pool=resource_pool, ray_cls_with_init=hybrid_cls_with_init)
        fused_wg.fuse(cls_dict.keys())

        # Two-step path: actor.add then critic.sub ...
        x = fused_wg.actor.add(0.1)
        y = fused_wg.critic.sub(x)
        # ... must agree element-wise with the fused single call.
        z = fused_wg.foo(0.1)
        for i, j in zip(y, z, strict=True):
            assert i == j
    finally:
        # Always tear down Ray so later tests start from a clean state.
        ray.shutdown()


if __name__ == "__main__":
    test_fused_workers()
code/RL_model/verl/verl_train/tests/single_controller/test_high_level_scheduling_api.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import gc
15
+ import time
16
+
17
+ import ray
18
+
19
+ from verl.single_controller.base.worker import Worker
20
+ from verl.single_controller.ray.base import RayClassWithInitArgs, RayResourcePool, RayWorkerGroup, merge_resource_pool
21
+ from verl.utils.device import get_device_name
22
+
23
+
24
@ray.remote
class TestActor(Worker):
    """Worker that reports its host node id for placement checks."""

    # TODO: pass *args and **kwargs is bug prone and not very convincing
    def __init__(self, cuda_visible_devices=None) -> None:
        super().__init__(cuda_visible_devices)

    def get_node_id(self):
        """Return the Ray node id this actor is scheduled on."""
        return ray.get_runtime_context().get_node_id()
32
+
33
+
34
def test():
    """Exercise resource-pool scheduling.

    Phase 1: four worker groups share one 8-GPU pool. Phase 2: two groups on
    disjoint 4-GPU pools that are merged into an 8-GPU pool for two more
    groups.

    Fixes: ``ray.shutdown()`` now runs in a finally block even when an
    assertion fails; the placement-group cleanup uses a plain loop instead of
    a side-effect list comprehension; a typo in a progress message is fixed.
    """
    ray.init()
    try:
        # test single-node-no-partition
        print("test single-node-no-partition")
        resource_pool = RayResourcePool([8], use_gpu=True)

        class_with_args = RayClassWithInitArgs(cls=TestActor)

        print("create actor worker group")
        actor_wg = RayWorkerGroup(
            resource_pool, class_with_args, name_prefix="high_level_api_actor", device_name=get_device_name()
        )
        print("create critic worker group")
        # NOTE(review): "hight_level_api_critic" looks like a typo, but it only
        # needs to be a unique prefix, so it is preserved as-is.
        critic_wg = RayWorkerGroup(
            resource_pool, class_with_args, name_prefix="hight_level_api_critic", device_name=get_device_name()
        )
        print("create rm worker group")
        rm_wg = RayWorkerGroup(
            resource_pool, class_with_args, name_prefix="high_level_api_rm", device_name=get_device_name()
        )
        print("create ref worker group")
        ref_wg = RayWorkerGroup(
            resource_pool, class_with_args, name_prefix="high_level_api_ref", device_name=get_device_name()
        )

        # All four groups share the same 8 GPUs.
        assert actor_wg.execute_all_sync("get_cuda_visible_devices") == [str(i) for i in range(8)]
        assert critic_wg.execute_all_sync("get_cuda_visible_devices") == [str(i) for i in range(8)]
        assert rm_wg.execute_all_sync("get_cuda_visible_devices") == [str(i) for i in range(8)]
        assert ref_wg.execute_all_sync("get_cuda_visible_devices") == [str(i) for i in range(8)]

        del actor_wg
        del critic_wg
        del rm_wg
        del ref_wg
        gc.collect()  # make sure ray actors are deleted

        for pg in resource_pool.get_placement_groups():
            ray.util.remove_placement_group(pg)
        print("wait 5s to remove placement_group")
        time.sleep(5)

        # test single-node-multi-partition
        print("test single-node-multi-partition")
        rm_resource_pool = RayResourcePool([4], use_gpu=True, name_prefix="rm")
        ref_resource_pool = RayResourcePool([4], use_gpu=True, name_prefix="ref")
        total_resource_pool = merge_resource_pool(rm_resource_pool, ref_resource_pool)

        assert rm_resource_pool.world_size == 4
        assert ref_resource_pool.world_size == 4
        assert total_resource_pool.world_size == 8

        actor_wg = RayWorkerGroup(
            total_resource_pool, class_with_args, name_prefix="high_level_api_actor", device_name=get_device_name()
        )
        critic_wg = RayWorkerGroup(
            total_resource_pool, class_with_args, name_prefix="high_level_api_critic", device_name=get_device_name()
        )
        rm_wg = RayWorkerGroup(
            rm_resource_pool, class_with_args, name_prefix="high_level_api_rm", device_name=get_device_name()
        )
        ref_wg = RayWorkerGroup(
            ref_resource_pool, class_with_args, name_prefix="high_level_api_ref", device_name=get_device_name()
        )

        # Merged pool spans all 8 GPUs; the partitions get their own halves.
        assert actor_wg.execute_all_sync("get_cuda_visible_devices") == [str(i) for i in range(8)]
        assert critic_wg.execute_all_sync("get_cuda_visible_devices") == [str(i) for i in range(8)]
        assert rm_wg.execute_all_sync("get_cuda_visible_devices") == [str(i) for i in range(4)]
        assert ref_wg.execute_all_sync("get_cuda_visible_devices") == [str(i) for i in range(4, 8)]
    finally:
        # Always tear down Ray, even when an assertion above fails.
        ray.shutdown()
code/RL_model/verl/verl_train/tests/single_controller/test_rvdz.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import ray
16
+
17
+
18
@ray.remote
class TestWorker:
    """Plain Ray actor that joins an NCCL group via the rendezvous backend."""

    def __init__(self, rank, world_size, group_name):
        self.rank = rank
        self.world_size = world_size
        self.group_name = group_name
        self.communicator = None

    def init(self):
        """Create the NCCL communicator through the Ray rendezvous helper."""
        from verl.utils.rendezvous.ray_backend import create_nccl_communicator_in_ray

        self.communicator = create_nccl_communicator_in_ray(self.rank, self.world_size, self.group_name)

    def test(self):
        """Return this worker's communicator rank, or None if uninitialized."""
        return None if self.communicator is None else self.communicator.rank_id()
35
+
36
+
37
def test_rvdz():
    """Two single-GPU actors rendezvous into one NCCL group and report ranks.

    Fix: ``ray.shutdown()`` now runs in a finally block so the GPU actors are
    released even when the assertion fails.
    """
    ray.init()
    try:
        group_name = "test_group"
        world_size = 2

        workers = [TestWorker.options(num_gpus=1).remote(rank, world_size, group_name) for rank in range(world_size)]

        ray.get([worker.init.remote() for worker in workers])

        ranks = ray.get([worker.test.remote() for worker in workers])

        assert ranks == [0, 1], f"expecting [0, 1], got {ranks}"
    finally:
        ray.shutdown()
code/RL_model/verl/verl_train/tests/single_controller/test_worker_group_torch.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import os
16
+
17
+ os.environ["RAY_DEDUP_LOGS"] = "0"
18
+ os.environ["NCCL_DEBUG"] = "WARN"
19
+
20
+ import ray
21
+ import torch
22
+ import torch.distributed
23
+
24
+ from verl.single_controller.base.worker import Worker
25
+ from verl.single_controller.ray.base import RayClassWithInitArgs, RayResourcePool, RayWorkerGroup
26
+ from verl.utils.device import get_device_name
27
+
28
+
29
@ray.remote
class TestAllGatherActor(Worker):
    """Worker with a two-phase setup: construct first, then call ``init`` to
    join the process group and build a rank-valued tensor."""

    def __init__(self, size) -> None:
        super().__init__()
        self.size = size

    def init(self):
        torch.distributed.init_process_group()
        # Each rank holds a vector filled with its own rank id.
        self.tensor = torch.zeros(size=(self.size,), dtype=torch.int64, device=get_device_name())
        self.tensor += self.rank

    def all_gather(self):
        n = self._world_size
        gathered = torch.zeros(
            size=(self.tensor.shape[0] * n,), dtype=self.tensor.dtype, device=self.tensor.device
        )
        torch.distributed.all_gather_into_tensor(gathered, self.tensor, async_op=False)
        return gathered
47
+
48
+
49
@ray.remote
class TestAllGatherActorV2(Worker):
    """Same as TestAllGatherActor, but joins the process group directly in
    ``__init__`` instead of a separate ``init`` call."""

    def __init__(self, size) -> None:
        super().__init__()
        self.size = size

        torch.distributed.init_process_group()
        # Each rank holds a vector filled with its own rank id.
        self.tensor = torch.zeros(size=(self.size,), dtype=torch.int64, device=get_device_name())
        self.tensor += self.rank

    def all_gather(self):
        n = self._world_size
        gathered = torch.zeros(
            size=(self.tensor.shape[0] * n,), dtype=self.tensor.dtype, device=self.tensor.device
        )
        torch.distributed.all_gather_into_tensor(gathered, self.tensor, async_op=False)
        return gathered
66
+
67
+
68
def test_all_gather_torch():
    """
    In this test, we instantiate 4 GPUs in a group and test the all_gather.

    Fix: ``ray.shutdown()`` moved into a finally block so a failed assertion
    does not leak the Ray session; debug print removed.
    """
    ray.init()
    try:
        # create 4 workers, each hold a GPU
        resource_pool = RayResourcePool([4], use_gpu=True)
        class_with_args = RayClassWithInitArgs(cls=TestAllGatherActor, size=2)

        worker_group = RayWorkerGroup(
            resource_pool, class_with_args, name_prefix="worker_group_torch", device_name=get_device_name()
        )

        worker_group.execute_all_sync("init")
        output = worker_group.execute_all_sync("all_gather")
        # Every rank must observe an identical gathered tensor.
        for i in range(1, len(output)):
            assert torch.all(output[i] == output[0])

        output = output[0].cpu()
        assert torch.all(output == torch.tensor([0, 0, 1, 1, 2, 2, 3, 3], dtype=torch.int64))
    finally:
        ray.shutdown()
92
+
93
+
94
def test_all_gather_torch_v2():
    """
    In this test, we instantiate 4 GPUs in a group and test the all_gather
    (process group joined inside the actor's __init__).

    Fix: ``ray.shutdown()`` moved into a finally block so a failed assertion
    does not leak the Ray session; debug print removed.
    """
    ray.init()
    try:
        # create 4 workers, each hold a GPU
        resource_pool = RayResourcePool([4], use_gpu=True)
        class_with_args = RayClassWithInitArgs(cls=TestAllGatherActorV2, size=2)

        worker_group = RayWorkerGroup(
            resource_pool, class_with_args, name_prefix="worker_group_torch", device_name=get_device_name()
        )

        output = worker_group.execute_all_sync("all_gather")
        # Every rank must observe an identical gathered tensor.
        for i in range(1, len(output)):
            assert torch.all(output[i] == output[0])

        output = output[0].cpu()
        assert torch.all(output == torch.tensor([0, 0, 1, 1, 2, 2, 3, 3], dtype=torch.int64))
    finally:
        ray.shutdown()
code/RL_model/verl/verl_train/tests/special_e2e/README.md ADDED
@@ -0,0 +1 @@
 
 
1
+ This folder is reserved for end-to-end tests that typically require multiple GPUs.
code/RL_model/verl/verl_train/tests/utils/test_activation_offload.py ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2025 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import os
15
+ import shutil
16
+ import tempfile
17
+
18
+ import pytest
19
+ import torch
20
+ import torch.distributed
21
+ import torch.multiprocessing as mp
22
+ from torch.distributed import init_device_mesh
23
+ from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
24
+ from torch.distributed.fsdp import MixedPrecision, ShardingStrategy
25
+ from transformers import AutoModelForCausalLM, AutoTokenizer, Qwen2Config
26
+
27
+ from verl.utils.activation_offload import enable_activation_offloading
28
+ from verl.utils.checkpoint.fsdp_checkpoint_manager import FSDPCheckpointManager
29
+ from verl.utils.device import get_device_name, get_nccl_backend, get_torch_device
30
+ from verl.utils.fsdp_utils import MixedPrecisionPolicy, apply_fsdp2, get_fsdp_wrap_policy
31
+
32
+
33
def create_random_input_ids(batch_size, seq_len, vocab_size):
    """Build packed (unpadded) input_ids and position_ids from a random mask."""
    # unpad_input comes from flash-attn on CUDA and verl's fallback on NPU.
    if get_device_name() == "cuda":
        from flash_attn.bert_padding import unpad_input
    elif get_device_name() == "npu":
        from verl.utils.attention_utils import unpad_input
    from verl.utils.model import compute_position_id_with_mask, create_random_mask

    token_ids = torch.randint(0, vocab_size, (batch_size, seq_len), device=get_device_name())

    mask = create_random_mask(
        token_ids, max_ratio_of_left_padding=0.1, min_ratio_of_valid_token=0.5, max_ratio_of_valid_token=0.7
    )
    pos_ids = compute_position_id_with_mask(mask)

    # Drop padded positions and flatten into a single packed sequence.
    packed_ids = unpad_input(token_ids.unsqueeze(-1), mask)[0].transpose(0, 1)
    packed_pos = unpad_input(pos_ids.unsqueeze(-1), mask)[0].transpose(0, 1)
    return packed_ids, packed_pos
50
+
51
+
52
def _fsdp_activation_offloading_test(rank, world_size, rendezvous_file, strategy="fsdp"):
    """Per-rank body: verify that enabling activation offloading leaves a
    training step bit-identical.

    Sequence: run one optimizer step and checkpoint; run a second step and
    record logits; enable activation offloading, restore the checkpoint,
    repeat the second step, and require the logits to match exactly.

    Fix: corrected the misspelled success message ("Activaiton").
    """
    get_torch_device().set_device(rank)
    torch.distributed.init_process_group(
        backend=get_nccl_backend(),
        init_method=f"file://{rendezvous_file}",
        rank=rank,
        world_size=world_size,
    )
    device_mesh = init_device_mesh(get_device_name(), mesh_shape=(world_size,), mesh_dim_names=("dp",))

    model_name = os.path.expanduser("~/models/Qwen/Qwen2.5-0.5B-Instruct")
    config = Qwen2Config(num_hidden_layers=4)  # tiny model keeps the test fast

    with torch.device(get_device_name()):
        model = AutoModelForCausalLM.from_config(
            config=config, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2"
        )
        model = model.to(device=get_device_name())

    # Wrap model with FSDP (v1) or FSDP2 depending on the strategy.
    mixed_precision = MixedPrecision(param_dtype=torch.bfloat16, reduce_dtype=torch.float32, buffer_dtype=torch.float32)

    if strategy == "fsdp":
        model = FSDP(
            model,
            use_orig_params=False,
            device_id=get_torch_device().current_device(),
            sharding_strategy=ShardingStrategy.FULL_SHARD,
            mixed_precision=mixed_precision,
            device_mesh=device_mesh,
            auto_wrap_policy=get_fsdp_wrap_policy(module=model),
        )
    else:
        mp_policy = MixedPrecisionPolicy(
            param_dtype=torch.bfloat16, reduce_dtype=torch.float32, cast_forward_inputs=True
        )
        fsdp_kwargs = {
            "mesh": device_mesh,
            "mp_policy": mp_policy,
        }
        apply_fsdp2(model, fsdp_kwargs, {})

    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
    lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)

    # Create checkpoint manager
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    checkpoint_manager = FSDPCheckpointManager(
        model=model, optimizer=optimizer, lr_scheduler=lr_scheduler, tokenizer=tokenizer
    )

    # Generate sample input
    batch_size = 2
    seq_len = 32
    vocab_size = 32000
    # First input for initial update
    input_ids1, position_ids1 = create_random_input_ids(batch_size, seq_len, vocab_size)

    # Second input for verification
    input_ids2, position_ids2 = create_random_input_ids(batch_size, seq_len, vocab_size)

    # Step 1: Initial update and save checkpoint
    outputs1 = model(input_ids=input_ids1, position_ids=position_ids1)
    loss1 = outputs1.logits.mean()
    loss1.backward()
    optimizer.step()
    lr_scheduler.step()
    optimizer.zero_grad()

    # Save checkpoint after first update
    temp_dir = tempfile.mkdtemp()
    checkpoint_path = os.path.join(temp_dir, "checkpoint")
    checkpoint_manager.save_checkpoint(local_path=checkpoint_path, hdfs_path=None, global_step=0)

    # Step 2: Second update and forward pass
    outputs2 = model(input_ids=input_ids2, position_ids=position_ids2)
    loss2 = outputs2.logits.mean()
    loss2.backward()
    optimizer.step()
    lr_scheduler.step()
    optimizer.zero_grad()

    # Record logits after second update (reference, no offloading)
    with torch.no_grad():
        logits_without_offloading = model(input_ids=input_ids2, position_ids=position_ids2).logits

    # Step 3: wrap module with activation offloading and load checkpoint
    enable_activation_offloading(model, strategy=strategy)
    checkpoint_manager.load_checkpoint(checkpoint_path)

    # Step 4: Repeat the second update with same input
    outputs3 = model(input_ids=input_ids2, position_ids=position_ids2)
    loss3 = outputs3.logits.mean()
    loss3.backward()
    optimizer.step()
    lr_scheduler.step()
    optimizer.zero_grad()

    # Record logits after loaded checkpoint and update
    with torch.no_grad():
        logits_with_offloading = model(input_ids=input_ids2, position_ids=position_ids2).logits

    # Step 5: Verify outputs match exactly (bitwise, hence atol=rtol=0)
    torch.testing.assert_close(logits_without_offloading, logits_with_offloading, atol=0.0, rtol=0.0)
    print(f"Activation offloading for {strategy} test passed on {world_size} GPUs!")

    # Cleanup
    shutil.rmtree(temp_dir)
    torch.distributed.barrier()
    torch.distributed.destroy_process_group()
162
+
163
+
164
+ @pytest.mark.parametrize("world_size", (2, 4))
165
+ @pytest.mark.parametrize("strategy", ("fsdp", "fsdp2"))
166
+ def test_activation_offloading(world_size, strategy, tmp_path):
167
+ rendezvous_file = str(tmp_path / "rdzv_file")
168
+ os.makedirs(os.path.dirname(rendezvous_file), exist_ok=True)
169
+
170
+ mp.spawn(
171
+ fn=_fsdp_activation_offloading_test,
172
+ args=(world_size, rendezvous_file, strategy),
173
+ nprocs=world_size,
174
+ join=True,
175
+ )
code/RL_model/verl/verl_train/tests/utils/test_check_ipc_version_support_on_npu.py ADDED
@@ -0,0 +1,231 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2025 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # This code is licensed under the MIT-style license found in the
4
+ # LICENSE file in the root directory of this source tree.
5
+
6
+ import logging
7
+ import unittest
8
+ from unittest.mock import Mock, mock_open, patch
9
+
10
+ from verl.utils.device import check_ipc_version_support, get_npu_versions
11
+
12
+
13
class TestCheckIPCVersionSupport(unittest.TestCase):
    """Unit tests for check_ipc_version_support.

    IPC is reported as supported when the NPU driver/software version is
    >= 25.3.rc1 AND the CANN toolkit version is >= 8.3.rc1.
    """

    def setUp(self):
        """Silence INFO-level messages emitted while checking versions."""
        logging.disable(logging.INFO)

    def tearDown(self):
        """Re-enable all logging levels."""
        logging.disable(logging.NOTSET)

    def test_standard_version_with_support(self):
        """Standard versions above both minimums are supported."""
        self.assertTrue(check_ipc_version_support("25.5.0", "8.3.0"))

    def test_standard_version_newer(self):
        """Strictly newer standard versions are supported."""
        self.assertTrue(check_ipc_version_support("26.0.0", "9.0.0"))

    def test_rc_version_format(self):
        """RC versions carrying extra trailing parts reduce to their first three parts."""
        self.assertTrue(check_ipc_version_support("25.3.rc1.2", "8.3.rc1.2"))

    def test_exact_rc_version(self):
        """Versions exactly at the minimums (25.3.rc1 / 8.3.rc1) are supported."""
        self.assertTrue(check_ipc_version_support("25.3.rc1", "8.3.rc1"))

    def test_t_suffix_version(self):
        """A lowercase `t` suffix in the software version is tolerated."""
        # 25.5.t3.b001 is treated as 25.5, which clears 25.3.rc1.
        self.assertTrue(check_ipc_version_support("25.5.t3.b001", "8.3.rc1"))

    def test_t_suffix_version_older(self):
        """A passing software version cannot compensate for an old CANN."""
        # Software side passes (25.5 >= 25.3.rc1) but CANN 8.2.rc1 < 8.3.rc1.
        self.assertFalse(check_ipc_version_support("25.5.t3.b001", "8.2.rc1"))

    def test_software_version_below_minimum(self):
        """Software below 25.3.rc1 is rejected even with a new CANN."""
        self.assertFalse(check_ipc_version_support("25.2.0", "8.3.0"))

    def test_cann_version_below_minimum(self):
        """CANN below 8.3.rc1 is rejected even with new software."""
        self.assertFalse(check_ipc_version_support("25.5.0", "8.2.0"))

    def test_both_versions_below_minimum(self):
        """Both components below minimum are rejected."""
        self.assertFalse(check_ipc_version_support("25.2.0", "8.2.0"))

    def test_invalid_software_version(self):
        """An unparseable software version raises RuntimeError."""
        with self.assertRaises(RuntimeError) as context:
            check_ipc_version_support("invalid.version", "8.3.0")
        self.assertIn("Invalid software version format", str(context.exception))

    def test_invalid_cann_version(self):
        """An unparseable CANN version raises RuntimeError."""
        with self.assertRaises(RuntimeError) as context:
            check_ipc_version_support("25.5.0", "invalid.version")
        self.assertIn("Invalid CANN version format", str(context.exception))

    def test_rc_with_more_parts(self):
        """Only the first three parts of a long RC version are compared."""
        self.assertTrue(check_ipc_version_support("25.3.rc1.2.3.4", "8.3.rc1.2.3.4"))

    def test_standard_with_more_parts(self):
        """Only the first three parts of a long standard version are compared."""
        self.assertTrue(check_ipc_version_support("25.5.0.1.2.3", "8.3.0.1.2.3"))

    def test_rc_edge_case_versions(self):
        """rc1 is the minimum accepted RC level; rc0 falls just below it."""
        self.assertTrue(check_ipc_version_support("25.3.rc1", "8.3.rc1"))
        self.assertFalse(check_ipc_version_support("25.3.rc0", "8.3.rc1"))

    def test_major_version_differences(self):
        """Major-version jumps dominate the comparison in both directions."""
        self.assertTrue(check_ipc_version_support("30.0.0", "10.0.0"))
        self.assertFalse(check_ipc_version_support("24.0.0", "7.0.0"))
128
+
129
+
130
class TestGetNPUVersions(unittest.TestCase):
    """Test cases for the get_npu_versions function."""

    # NOTE: @patch decorators are applied bottom-up, so mock parameters arrive
    # in reverse decorator order: (mock_file, mock_exists, mock_machine, mock_run).

    @patch("subprocess.run")
    @patch("platform.machine")
    @patch("os.path.exists")
    @patch("builtins.open", new_callable=mock_open, read_data="version=8.3.rc1\n")
    def test_get_npu_versions_success(self, mock_file, mock_exists, mock_machine, mock_run):
        """Test successful retrieval of versions."""
        # Mock npu-smi output
        mock_run.return_value = Mock(stdout="Software Version : 25.5.0\nOther Info\n", check=True)

        # Mock architecture
        mock_machine.return_value = "x86_64"

        # Mock path exists
        mock_exists.return_value = True

        software_version, cann_version = get_npu_versions()

        # Software version comes from the npu-smi stdout; CANN version from the
        # mocked info file contents ("version=8.3.rc1").
        self.assertEqual(software_version, "25.5.0")
        self.assertEqual(cann_version, "8.3.rc1")

    @patch("subprocess.run")
    def test_get_npu_versions_missing_software_version(self, mock_run):
        """Test error when Software Version is missing."""
        # npu-smi output lacks the "Software Version" line entirely.
        mock_run.return_value = Mock(stdout="Other Info Without Software Version\n", check=True)

        with self.assertRaises(RuntimeError) as context:
            get_npu_versions()

        self.assertIn("Could not find Software Version", str(context.exception))

    @patch("subprocess.run")
    @patch("platform.machine")
    @patch("os.path.exists")
    @patch("builtins.open", new_callable=mock_open, read_data="version=8.3.rc1\n")
    def test_get_npu_versions_unsupported_architecture(self, mock_file, mock_exists, mock_machine, mock_run):
        """Test error with unsupported architecture."""
        mock_run.return_value = Mock(stdout="Software Version : 25.5.0\n", check=True)

        mock_machine.return_value = "armv7l"  # Unsupported architecture
        mock_exists.return_value = True

        with self.assertRaises(RuntimeError) as context:
            get_npu_versions()

        self.assertIn("Unsupported architecture", str(context.exception))

    @patch("subprocess.run")
    @patch("platform.machine")
    @patch("os.path.exists")
    @patch("builtins.open", new_callable=mock_open, read_data="version=8.3.rc1\n")
    def test_get_npu_versions_cann_path_not_exists(self, mock_file, mock_exists, mock_machine, mock_run):
        """Test error when CANN path doesn't exist."""
        mock_run.return_value = Mock(stdout="Software Version : 25.5.0\n", check=True)

        mock_machine.return_value = "x86_64"
        mock_exists.return_value = False  # Path doesn't exist

        with self.assertRaises(RuntimeError) as context:
            get_npu_versions()

        self.assertIn("CANN toolkit path does not exist", str(context.exception))

    @patch("subprocess.run")
    @patch("platform.machine")
    @patch("os.path.exists")
    @patch("builtins.open")
    def test_get_npu_versions_info_file_not_exists(self, mock_file, mock_exists, mock_machine, mock_run):
        """Test error when CANN info file doesn't exist."""
        mock_run.return_value = Mock(stdout="Software Version : 25.5.0\n", check=True)

        mock_machine.return_value = "x86_64"

        # First call is for CANN path exists, second call is for info file exists
        # (side_effect ordering is load-bearing: get_npu_versions must call
        # os.path.exists exactly in that order for this test to be meaningful).
        mock_exists.side_effect = [True, False]

        with self.assertRaises(RuntimeError) as context:
            get_npu_versions()

        self.assertIn("CANN toolkit info file does not exist", str(context.exception))

    @patch("subprocess.run")
    @patch("platform.machine")
    @patch("os.path.exists")
    @patch("builtins.open", new_callable=mock_open, read_data="other_info=no_version\n")
    def test_get_npu_versions_missing_cann_version(self, mock_file, mock_exists, mock_machine, mock_run):
        """Test error when CANN version is missing from info file."""
        mock_run.return_value = Mock(stdout="Software Version : 25.5.0\n", check=True)

        mock_machine.return_value = "x86_64"
        mock_exists.return_value = True

        with self.assertRaises(RuntimeError) as context:
            get_npu_versions()

        self.assertIn("Could not find version in CANN toolkit info file", str(context.exception))
228
+
229
+
230
# Allow running this test module directly, outside a pytest invocation.
if __name__ == "__main__":
    unittest.main()
code/RL_model/verl/verl_train/tests/utils/test_config_on_cpu.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2025 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import unittest
16
+ from dataclasses import dataclass, field
17
+
18
+ from omegaconf import OmegaConf
19
+
20
+ from verl.base_config import BaseConfig
21
+ from verl.utils import omega_conf_to_dataclass
22
+
23
+
24
@dataclass
class TestDataclass(BaseConfig):
    """Minimal leaf config used to exercise omega_conf_to_dataclass."""

    # NOTE(review): the Test* prefix makes pytest attempt to collect this
    # dataclass as a test class; harmless, but may emit a collection warning.
    hidden_size: int = 0
    activation: str = "relu"
28
+
29
+
30
@dataclass
class TestTrainConfig(BaseConfig):
    """Nested config fixture: a scalar, a sub-dataclass, and a free-form dict."""

    batch_size: int = 0
    # default_factory is required for mutable defaults on dataclass fields.
    model: TestDataclass = field(default_factory=TestDataclass)
    override_config: dict = field(default_factory=dict)
35
+
36
+
37
# YAML fixture parsed by OmegaConf in the tests below; `_target_` is the dotted
# path of the dataclass that omega_conf_to_dataclass should instantiate.
_cfg_str = """train_config:
  _target_: tests.utils.test_config_on_cpu.TestTrainConfig
  batch_size: 32
  model:
    hidden_size: 768
    activation: relu
  override_config: {}"""
44
+
45
+
46
class TestConfigOnCPU(unittest.TestCase):
    """Test cases for configuration utilities on CPU.

    Test Plan:
    1. Test basic OmegaConf to dataclass conversion for simple nested structures
    2. Test nested OmegaConf to dataclass conversion for complex hierarchical configurations
    3. Verify all configuration values are correctly converted and accessible
    """

    def setUp(self):
        # Parse the shared YAML fixture once per test; each test reads a sub-tree.
        self.config = OmegaConf.create(_cfg_str)

    def test_omega_conf_to_dataclass(self):
        """A flat sub-config converts to the leaf dataclass with values intact."""
        sub_cfg = self.config.train_config.model
        cfg = omega_conf_to_dataclass(sub_cfg, TestDataclass)
        # Use unittest assertions throughout: the original bare `assert`
        # statements are silently stripped when Python runs with -O.
        self.assertIsInstance(cfg, TestDataclass)
        self.assertEqual(cfg.hidden_size, 768)
        self.assertEqual(cfg.activation, "relu")

    def test_nested_omega_conf_to_dataclass(self):
        """A nested config converts recursively, including the inner dataclass."""
        cfg = omega_conf_to_dataclass(self.config.train_config, TestTrainConfig)
        self.assertIsInstance(cfg, TestTrainConfig)
        self.assertIsInstance(cfg.model, TestDataclass)
        self.assertEqual(cfg.batch_size, 32)
        self.assertEqual(cfg.model.hidden_size, 768)
        self.assertEqual(cfg.model.activation, "relu")
72
+
73
+
74
class TestPrintCfgCommand(unittest.TestCase):
    """Test suite for the print_cfg.py command-line tool."""

    def test_command_with_override(self):
        """Smoke-test that `python3 scripts/print_cfg.py` runs and prints the config.

        NOTE(review): despite the test name, no config overrides are passed on
        the command line here; only the default invocation is exercised.
        Assumes the test runs from the repository root (relative script path).
        """
        import subprocess

        # Run the command
        result = subprocess.run(
            ["python3", "scripts/print_cfg.py"],
            capture_output=True,
            text=True,
        )

        # Verify the command exited successfully
        self.assertEqual(result.returncode, 0, f"Command failed with stderr: {result.stderr}")

        # Verify the output contains expected config information
        self.assertIn("critic", result.stdout)
        self.assertIn("profiler", result.stdout)
94
+
95
+
96
# Allow running this test module directly, outside a pytest invocation.
if __name__ == "__main__":
    unittest.main()
code/RL_model/verl/verl_train/tests/utils/test_flops_counter.py ADDED
@@ -0,0 +1,480 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import math
16
+
17
+ import pytest
18
+
19
+ from verl.utils.flops_counter import FlopsCounter
20
+
21
# Model types the FLOPs counter has dedicated formulas for. Fixed: this set was
# missing "gpt_oss", "qwen3_vl" and "qwen3_vl_moe", all of which appear in the
# CONFIG table and the pytest parametrization below — keep the three in sync.
VALID_CONFIG_TYPE = {
    "llama",
    "qwen2",
    "qwen3",
    "qwen3_moe",
    "deepseek_v3",
    "mistral",
    "gemma3_text",
    "apertus",
    "gpt_oss",
    "qwen3_vl",
    "qwen3_vl_moe",
}
22
+
23
+
24
class Config:
    """Recursively expose a plain dict's entries as object attributes.

    Nested dicts become nested Config instances, mimicking just enough of a
    HuggingFace PretrainedConfig for FlopsCounter to consume.
    """

    def __init__(self, config_dict):
        for name, entry in config_dict.items():
            wrapped = Config(entry) if isinstance(entry, dict) else entry
            setattr(self, name, wrapped)
30
+
31
+
32
+ CONFIG = {
33
+ "llama": {
34
+ "config": { # llama2-7B
35
+ "model_type": "llama",
36
+ "vocab_size": 32000,
37
+ "hidden_size": 4096,
38
+ "intermediate_size": 11008,
39
+ "num_hidden_layers": 32,
40
+ "num_attention_heads": 32,
41
+ "num_key_value_heads": 32,
42
+ },
43
+ "batch_seqlens_tuple": ([512, 1024, 2048], [4096, 4096, 4096]),
44
+ # 6*(vocab*hidden*2+layer*(hidden*(q+k+v+head*head_dim)+ hidden*inter*3))*token_sum +
45
+ # 6*sum(seqlen^2)*layer*head*head_dim
46
+ # 6*(32000*4096*2+32*(4096*4096*4+4096*11008*3))*(512+1024+2048) +
47
+ # 6*(512*512+1024*1024+2048*2048)*32*4096
48
+ # 6*(32000*4096*2+32*(4096*4096*4+4096*11008*3))*(4096+4096+4096) +
49
+ # 6*(4096*4096+4096*4096+4096*4096)*32*4096
50
+ "expected_flops_tuple": (149226491215872 / 1e12, 536372695793664 / 1e12),
51
+ },
52
+ "qwen2": {
53
+ "config": { # Qwen/Qwen2.5-7B-Instruct
54
+ "model_type": "qwen2",
55
+ "vocab_size": 152064,
56
+ "hidden_size": 3584,
57
+ "intermediate_size": 18944,
58
+ "num_hidden_layers": 28,
59
+ "num_attention_heads": 28,
60
+ "num_key_value_heads": 4,
61
+ },
62
+ "batch_seqlens_tuple": ([512, 1024, 2048], [4096, 4096, 4096]),
63
+ # 6*(vocab*hidden*2+layer*(hidden*(q+k+v+head*head_dim)+ hidden*inter*3))*token_sum +
64
+ # 6*sum(seqlen^2)*layer*head*head_dim
65
+ # 6*(152064*3584*2+28*(3584*(3584+512+512+3584)+3584*18944*3))*(512+1024+2048) +
66
+ # 6*(512*512+1024*1024+2048*2048)*28*3584
67
+ # 6*(152064*3584*2+28*(3584*(3584+512+512+3584)+3584*18944*3))*(4096+4096+4096) +
68
+ # 6*(4096*4096+4096*4096+4096*4096)*28*3584
69
+ "expected_flops_tuple": (167073690943488 / 1e12, 591764889010176 / 1e12),
70
+ },
71
+ "qwen3": {
72
+ "config": { # Qwen/Qwen3-8B
73
+ "model_type": "qwen3",
74
+ "vocab_size": 151936,
75
+ "hidden_size": 4096,
76
+ "intermediate_size": 12288,
77
+ "num_hidden_layers": 36,
78
+ "num_attention_heads": 32,
79
+ "num_key_value_heads": 8,
80
+ "head_dim": 128,
81
+ },
82
+ "batch_seqlens_tuple": ([512, 1024, 2048], [4096, 4096, 4096]),
83
+ # 6*(vocab*hidden*2+layer*(hidden*(q+k+v+head*head_dim)+ hidden*inter*3))*token_sum +
84
+ # 6*sum(seqlen^2)*layer*head*head_dim
85
+ # 6*(151936*4096*2+36*(4096*(128*32+128*8*2+128*32)+4096*12288*3))*(512+1024+2048) +
86
+ # 6*(512*512+1024*1024+2048*2048)*36*128*32
87
+ # 6*(151936*4096*2+36*(4096*(128*32+128*8*2+128*32)+4096*12288*3))*(4096+4096+4096) +
88
+ # 6*(4096*4096+4096*4096+4096*4096)*36*128*32
89
+ "expected_flops_tuple": (180997438046208 / 1e12, 648394032807936 / 1e12),
90
+ },
91
+ "qwen3_moe": {
92
+ "config": { # Qwen/Qwen3-30B-A3B-Base
93
+ "model_type": "qwen3_moe",
94
+ "hidden_size": 2048,
95
+ "vocab_size": 151936,
96
+ "num_hidden_layers": 48,
97
+ "num_key_value_heads": 4,
98
+ "num_attention_heads": 32,
99
+ "head_dim": 128,
100
+ "moe_intermediate_size": 768,
101
+ "num_experts_per_tok": 8,
102
+ "num_experts": 128,
103
+ },
104
+ "batch_seqlens_tuple": ([512, 1024, 2048], [4096, 4096, 4096]),
105
+ # 6*(vocab*hidden*2+layer*(hidden*(q+k+v+head*head_dim)+hidden*inter*top_k_exp*3 +
106
+ # hidden*num_experts))*token_sum + 6*sum(seqlen^2)*layer*head*head_dim
107
+ # 6*(151936*2048*2+48*(2048*(128*32+128*4*2+128*32)+2048*768*8*3+2048*128))*(512+1024+2048) +
108
+ # 6*(512*512+1024*1024+2048*2048)*48*128*32
109
+ # 6*(151936*2048*2+48*(2048*(128*32+128*4*2+128*32)+2048*768*8*3+2048*128))*(4096+4096+4096) +
110
+ # 6*(4096*4096+4096*4096+4096*4096)*48*128*32
111
+ "expected_flops_tuple": (78593069678592 / 1e12, 306570470621184 / 1e12),
112
+ },
113
+ "deepseek_v3": {
114
+ "config": { # deepseek-ai/DeepSeek-Prover-V2-671B
115
+ "model_type": "deepseek_v3",
116
+ "hidden_size": 7168,
117
+ "vocab_size": 129280,
118
+ "moe_intermediate_size": 2048,
119
+ "num_hidden_layers": 61,
120
+ "first_k_dense_replace": 3,
121
+ "num_attention_heads": 128,
122
+ "n_routed_experts": 256,
123
+ "num_experts_per_tok": 8,
124
+ "n_shared_experts": 1,
125
+ "kv_lora_rank": 512,
126
+ "qk_rope_head_dim": 64,
127
+ "v_head_dim": 128,
128
+ "intermediate_size": 18432,
129
+ "qk_nope_head_dim": 128,
130
+ "q_lora_rank": 1536,
131
+ },
132
+ "batch_seqlens_tuple": ([512, 1024, 2048], [4096, 4096, 4096]),
133
+ # (1536*7168+128*192*1536+7168*(512+64)+128*(128+128)*512+128*128*7168) = 187105280
134
+ # 6*(129280*7168*2+ 3*(7168*18432*3+187105280)+ 58*(187105280+7168*256+7168*2048*9*3))*(512+1024+2048) +
135
+ # 3*(512*512+1024*1024+2048*2048)*61*(192+128)*128
136
+ # 6*(129280*7168*2+ 3*(7168*18432*3+187105280)+ 58*(187105280+7168*256+7168*2048*9*3))*(4096+4096+4096) +
137
+ # 3*(4096*4096+4096*4096+4096*4096)*61*(192+128)*128
138
+ "expected_flops_tuple": (848766538088448 / 1e12, 3145850406567936 / 1e12),
139
+ },
140
+ "mistral": {
141
+ "config": { # mistralai/Mistral-Small-24B-Instruct-2501
142
+ "model_type": "mistral",
143
+ "vocab_size": 131072,
144
+ "hidden_size": 5120,
145
+ "intermediate_size": 32768,
146
+ "num_hidden_layers": 40,
147
+ "num_attention_heads": 32,
148
+ "num_key_value_heads": 8,
149
+ "head_dim": 128,
150
+ },
151
+ "batch_seqlens_tuple": ([512, 1024, 2048], [4096, 4096, 4096]),
152
+ # Mistral uses same architecture as Llama, with GQA
153
+ # 6*(vocab*hidden*2+layer*(hidden*(q+k+v+head*head_dim)+ hidden*inter*3))*token_sum +
154
+ # 12*sum(seqlen^2)*layer*head*head_dim
155
+ # vocab part: 131072*5120*2 = 1342177280
156
+ # attn part per layer: 5120*(128*32+128*8+128*8+128*32) = 5120*10240 = 52428800
157
+ # mlp part per layer: 5120*32768*3 = 503316480
158
+ # total per layer: 52428800 + 503316480 = 555745280
159
+ # all layers: 1342177280 + 40*555745280 = 23571988480
160
+ # For batch [512, 1024, 2048], tokens_sum = 3584:
161
+ # dense flops: 6 * 23571988480 * 3584 = 506892040273920
162
+ # attn flops: 6 * 5505024 * 40 * 128 * 32 = 10823317585920
163
+ # total: 517715357859840 / 1e12 = 517.71535785984
164
+ # For batch [4096, 4096, 4096], tokens_sum = 12288:
165
+ # dense flops: 6 * 23571988480 * 12288 = 1737915566653440
166
+ # attn flops: 6 * 50331648 * 40 * 128 * 32 = 98956046499840
167
+ # total: 1836871613153280 / 1e12 = 1836.87161315328
168
+ "expected_flops_tuple": (512303699066880 / 1e12, 1787393589903360 / 1e12),
169
+ },
170
+ "gemma3_text": {
171
+ "config": { # Gemma3-12B-IT-TextOnly
172
+ "model_type": "gemma3_text",
173
+ "vocab_size": 262208,
174
+ "hidden_size": 3840,
175
+ "intermediate_size": 15360,
176
+ "num_hidden_layers": 48,
177
+ "num_attention_heads": 16,
178
+ "num_key_value_heads": 8,
179
+ "head_dim": 256,
180
+ "sliding_window": 1024,
181
+ "layer_types": None,
182
+ # Will be auto-generated based on sliding_window_pattern
183
+ "sliding_window_pattern": 6,
184
+ # Every 6th layer is full attention
185
+ },
186
+ "batch_seqlens_tuple": ([512, 1024, 2048], [4096, 4096, 4096]),
187
+ # Gemma3 has alternating sliding window attention
188
+ # With sliding_window_pattern=6: layers 5,11,17,23,29,35,41,47 use full attention (8 layers)
189
+ # Other 40 layers use sliding window attention with window_size=1024
190
+ #
191
+ # Non-attention FLOPs:
192
+ # vocab part: 262208*3840*2 = 2013757440
193
+ # attn part per layer: 3840*(256*16+256*8+256*8+256*16) = 3840*12288 = 47185920
194
+ # mlp part per layer: 3840*15360*3 = 176947200
195
+ # total per layer: 47185920 + 176947200 = 224133120
196
+ # all layers: 2013757440 + 48*224133120 = 12772147200
197
+ #
198
+ # For batch [512, 1024, 2048], tokens_sum = 3584:
199
+ # dense flops: 6 * 12772147200 * 3584 = 274652253388800
200
+ # seqlen_square_sum: 180355072 (calculated with sliding window logic)
201
+ # attn flops: 6 * 180355072 * 256 * 16 = 8864812498944
202
+ # total: 283517065887744 / 1e12 = 283.517065887744
203
+ #
204
+ # For batch [4096, 4096, 4096], tokens_sum = 12288:
205
+ # dense flops: 6 * 12772147200 * 12288 = 941664868761600
206
+ # seqlen_square_sum: 905969664 (calculated with sliding window logic)
207
+ # attn flops: 6 * 905969664 * 256 * 16 = 44530220924928
208
+ # total: 986195089686528 / 1e12 = 986.195089686528
209
+ "expected_flops_tuple": (279084659638272 / 1e12, 963929979224064 / 1e12),
210
+ },
211
+ "gpt_oss": {
212
+ "config": {
213
+ "model_type": "gpt_oss",
214
+ "vocab_size": 201088,
215
+ "hidden_size": 2880,
216
+ "num_hidden_layers": 24,
217
+ "num_attention_heads": 64,
218
+ "num_key_value_heads": 8,
219
+ "head_dim": 64,
220
+ "intermediate_size": 2880,
221
+ "num_local_experts": 32,
222
+ "num_experts_per_tok": 4,
223
+ "sliding_window": 128,
224
+ "layer_types": [
225
+ "sliding_attention",
226
+ "full_attention",
227
+ "sliding_attention",
228
+ "full_attention",
229
+ "sliding_attention",
230
+ "full_attention",
231
+ "sliding_attention",
232
+ "full_attention",
233
+ "sliding_attention",
234
+ "full_attention",
235
+ "sliding_attention",
236
+ "full_attention",
237
+ "sliding_attention",
238
+ "full_attention",
239
+ "sliding_attention",
240
+ "full_attention",
241
+ "sliding_attention",
242
+ "full_attention",
243
+ "sliding_attention",
244
+ "full_attention",
245
+ "sliding_attention",
246
+ "full_attention",
247
+ "sliding_attention",
248
+ "full_attention",
249
+ ],
250
+ },
251
+ "batch_seqlens_tuple": ([512, 1024, 2048], [4096, 4096, 4096]),
252
+ # GPT-OSS has alternating sliding / full attention
253
+ # Even layers (12 layers) use sliding window attention with window_size = 128
254
+ # Odd layers (12 layers) use full attention
255
+ #
256
+ # Non-attention FLOPs:
257
+ # vocab part: 201088 * 2880 * 2 = 1158266880
258
+ # attn linear part per layer:
259
+ # Q: 2880 * (64 * 64) = 11796480
260
+ # K: 2880 * (8 * 64) = 1474560
261
+ # V: 2880 * (8 * 64) = 1474560
262
+ # O: (64 * 64) * 2880 = 11796480
263
+ # attn linear total = 26542080
264
+ # mlp (MoE, SwiGLU) part per layer:
265
+ # gate: 2880 * 32 = 92160
266
+ # active experts: 3 * 2880 * 2880 * 4 = 99532800
267
+ # mlp total = 99624960
268
+ # total per layer: 26542080 + 99624960 = 126167040
269
+ # all layers:
270
+ # 126167040 * 24 = 3028008960
271
+ # total dense params:
272
+ # 3028008960 + 1158266880 = 4186275840
273
+ #
274
+ # For batch [512, 1024, 2048], tokens_sum = 3584:
275
+ # dense flops: 6 * 4186275840 * 3584 = 90021675663360
276
+ # seqlen_square_sum: 71565312 (calculated with sliding window logic)
277
+ # attn flops: 6 * 71565312 * 64 * 64 = 3517578215424
278
+ # total: 93539253878784 / 1e12 = 93.539253878784
279
+ #
280
+ # For batch [4096, 4096, 4096], tokens_sum = 12288:
281
+ # dense flops: 6 * 4186275840 * 12288 = 308646629068800
282
+ # seqlen_square_sum: 622854144 (calculated with sliding window logic)
283
+ # attn flops: 6 * 622854144 * 64 * 64 = 30613642948608
284
+ # total: 339260272017408 / 1e12 = 339.260272017408
285
+ "expected_flops_tuple": (91780464771072 / 1e12, 323953008574464 / 1e12),
286
+ },
287
+ "apertus": {
288
+ "config": { # swiss-ai/Apertus-8B
289
+ "model_type": "apertus",
290
+ "vocab_size": 131072,
291
+ "hidden_size": 4096,
292
+ "intermediate_size": 21504,
293
+ "num_hidden_layers": 32,
294
+ "num_attention_heads": 32,
295
+ "num_key_value_heads": 32,
296
+ "hidden_act": "xielu",
297
+ # head_dim will be derived as 4096 / 32 = 128
298
+ },
299
+ "batch_seqlens_tuple": ([512, 1024, 2048], [4096, 4096, 4096]),
300
+ # Calculation for Apertus (hidden_act="xielu" -> MLP uses [k_mlp=2]*H*I params; qk_norm=True -> [k_qkn=2]*H):
301
+ # V=131072, H=4096, I=21504, L=32, k_mlp=2 (XIELU), k_qkn=2 (QK norm), S=6
302
+ # S*(2*V*H + L*(4*H**2 + k_mlp*H*I + k_qkn*H)) * (SUM[seqlen]) + 6*SUM[seqlen**2]*L*H
303
+ "expected_flops_tuple": (194825353691136 / 1e12, 692711652851712 / 1e12),
304
+ },
305
+ "qwen3_vl": {
306
+ "config": { # Qwen/Qwen3-VL-8B
307
+ "model_type": "qwen3_vl",
308
+ # -------- Text config --------
309
+ "text_config": {
310
+ "vocab_size": 151936,
311
+ "hidden_size": 4096,
312
+ "intermediate_size": 12288,
313
+ "num_hidden_layers": 36,
314
+ "num_attention_heads": 32,
315
+ "num_key_value_heads": 8,
316
+ "head_dim": 128,
317
+ },
318
+ # -------- Vision config (ViT) --------
319
+ "vision_config": {
320
+ "deepstack_visual_indexes": [8, 16, 24],
321
+ "num_heads": 16,
322
+ "depth": 27,
323
+ "hidden_size": 1152,
324
+ "intermediate_size": 4304,
325
+ "out_hidden_size": 4096,
326
+ "spatial_merge_size": 2,
327
+ "temporal_patch_size": 2,
328
+ "in_channels": 3,
329
+ "patch_size": 16,
330
+ },
331
+ },
332
+ "batch_seqlens_tuple": (
333
+ [512, 1024, 2048],
334
+ [4096, 4096, 4096],
335
+ ),
336
+ "images_seqlens_tuple": ([512, 1024, 2048], [4096, 4096, 4096]),
337
+ # -----Text-----
338
+ # 6*(vocab*hidden*2
339
+ # + layer*(hidden*(q+k+v+o) + hidden*inter*3)
340
+ # )*token_sum
341
+ # + 6*sum(seqlen^2)*layer*hidden
342
+ #
343
+ # -----ViT-----
344
+ # patch_embed_N =hidden*temporal_patch_size*in_channels* patch_size^2
345
+ # attn_linear_N =hidden*(4*hidden)
346
+ # mlp_N =hidden*inter*2
347
+ # merger_N =((o+hidden*spatial_merge_size^2) * (hidden*spatial_merge_size^2))
348
+ # deepstack_merger_N =merger_N * 3
349
+ # dense_N =patch_embed_N + (attn_linear_N + mlp_N) * 27 + deepstack_merger_N + merger_N
350
+ #
351
+ # 6*(151936*4096*2
352
+ # + 36*(4096*(4096+1024+1024+4096) + 4096*12288*3)
353
+ # )*(512+1024+2048)
354
+ # + 12*(512*512+1024*1024+2048*2048)*36*4096
355
+ # + 6 * dense_N * (512 + 1024 + 2048)
356
+ # + 12 * (512**2 + 1024**2 + 2048**2) * 27 * 16 * 72
357
+ #
358
+ # 6*(151936*4096*2
359
+ # + 36*(4096*(4096+1024+1024+4096) + 4096*12288*3)
360
+ # )*(4096+4096+4096)
361
+ # + 12*(4096*4096+4096*4096+4096*4096)*36*4096
362
+ # + 6 * dense_N * (4096 + 4096 + 2048)
363
+ # + 12 * (4096**2 + 4096**2 + 4096**2) * 27 * 16 * 72
364
+ "expected_flops_tuple": (
365
+ 195379819708416 / 1e12,
366
+ 709446422495232 / 1e12,
367
+ ),
368
+ },
369
+ "qwen3_vl_moe": {
370
+ "config": { # Qwen/Qwen3-VL-30B-A3B
371
+ "model_type": "qwen3_vl_moe",
372
+ # -------- Text config --------
373
+ "text_config": {
374
+ "vocab_size": 151936,
375
+ "hidden_size": 2048,
376
+ "num_hidden_layers": 48,
377
+ "num_attention_heads": 32,
378
+ "num_key_value_heads": 4,
379
+ "head_dim": 128,
380
+ "moe_intermediate_size": 768,
381
+ "num_experts": 128,
382
+ "num_experts_per_tok": 8,
383
+ },
384
+ # -------- Vision config (ViT) --------
385
+ "vision_config": {
386
+ "deepstack_visual_indexes": [8, 16, 24],
387
+ "num_heads": 16,
388
+ "depth": 27,
389
+ "hidden_size": 1152,
390
+ "intermediate_size": 4304,
391
+ "out_hidden_size": 4096,
392
+ "spatial_merge_size": 2,
393
+ "temporal_patch_size": 2,
394
+ "in_channels": 3,
395
+ "patch_size": 16,
396
+ },
397
+ },
398
+ "batch_seqlens_tuple": (
399
+ [512, 1024, 2048],
400
+ [4096, 4096, 4096],
401
+ ),
402
+ "images_seqlens_tuple": ([512, 1024, 2048], [4096, 4096, 4096]),
403
+ # -----Text-----
404
+ # 6*(vocab*hidden*2
405
+ # + layer*(hidden*(q+k+v+head*head_dim)+hidden*inter*top_k_exp*3+hidden*num_experts)
406
+ # )*token_sum
407
+ # + 6*sum(seqlen^2)*layer*hidden
408
+ #
409
+ # -----ViT-----
410
+ # patch_embed_N =hidden*temporal_patch_size*in_channels* patch_size^2
411
+ # attn_linear_N =hidden*(4*hidden)
412
+ # mlp_N =hidden*inter*2
413
+ # merger_N =((o+hidden*spatial_merge_size^2) * (hidden*spatial_merge_size^2))
414
+ # deepstack_merger_N =merger_N * 3
415
+ # dense_N =patch_embed_N + (attn_linear_N + mlp_N) * 27 + deepstack_merger_N + merger_N
416
+ #
417
+ # 6*(151936*2048*2
418
+ # + 48*(2048*(128*32+128*4*2+128*32)+2048*768*8*3+2048*128)
419
+ # )*(512+1024+2048)
420
+ # + 12*(512*512+1024*1024+2048*2048)*48*4096
421
+ # + 6 * dense_N * (512 + 1024 + 2048)
422
+ # + 12 * (512**2 + 1024**2 + 2048**2) * 27 * 16 * 72
423
+ #
424
+ # 6*(151936*2048*2
425
+ # 48*(2048*(128*32+128*4*2+128*32)+2048*768*8*3+2048*128)
426
+ # )*(4096+4096+4096)
427
+ # + 12*(4096*4096+4096*4096+4096*4096)*48*4096
428
+ # + 6 * dense_N * (4096 + 4096 + 2048)
429
+ # + 12 * (4096**2 + 4096**2 + 4096**2) * 27 * 16 * 72
430
+ "expected_flops_tuple": (
431
+ 92975451340800 / 1e12,
432
+ 367622860308480 / 1e12,
433
+ ),
434
+ },
435
+ }
436
+
437
+
438
@pytest.mark.parametrize(
    "config_type",
    [
        "llama",
        "qwen2",
        "qwen3",
        "qwen3_moe",
        "deepseek_v3",
        "mistral",
        "gemma3_text",
        "apertus",
        "gpt_oss",
        "qwen3_vl",
        "qwen3_vl_moe",
    ],
)
def test_flops_counter(config_type: str):
    """FlopsCounter.estimate_flops matches the hand-derived FLOPs for each architecture."""
    case = CONFIG[config_type]
    counter = FlopsCounter(Config(case["config"]))

    if "images_seqlens_tuple" in case:
        # Vision-language configs additionally supply per-image sequence lengths.
        batches = zip(
            case["batch_seqlens_tuple"],
            case["images_seqlens_tuple"],
            case["expected_flops_tuple"],
            strict=True,
        )
        for batch_seqlens, images_seqlens, expected_flops in batches:
            # set delta time to 1 to get the flops
            counted_flops, _ = counter.estimate_flops(batch_seqlens, 1, images_seqlens=images_seqlens)
            print(f"Expect flops for {case['config']} is {expected_flops}, but get {counted_flops}")
            assert math.isclose(counted_flops, expected_flops), (
                f"Expect flops for {case['config']} is {expected_flops}, but get {counted_flops}"
            )
    else:
        batches = zip(case["batch_seqlens_tuple"], case["expected_flops_tuple"], strict=True)
        for batch_seqlens, expected_flops in batches:
            # set delta time to 1 to get the flops
            counted_flops, _ = counter.estimate_flops(batch_seqlens, 1)
            print(f"Expect flops for {case['config']} is {expected_flops}, but get {counted_flops}")
            assert math.isclose(counted_flops, expected_flops), (
                f"Expect flops for {case['config']} is {expected_flops}, but get {counted_flops}"
            )
code/RL_model/verl/verl_train/tests/utils/test_fs_on_cpu.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import os
16
+ from pathlib import Path
17
+
18
+ import verl.utils.fs as fs
19
+
20
+
21
def test_record_and_check_directory_structure(tmp_path):
    """A directory snapshot stays valid until the tree's contents change."""
    root = tmp_path / "test_dir"
    root.mkdir()
    (root / "file1.txt").write_text("test")
    sub = root / "subdir"
    sub.mkdir()
    (sub / "file2.txt").write_text("test")

    # Snapshot the current structure to a record file.
    record = fs._record_directory_structure(root)
    assert os.path.exists(record)

    # Unchanged tree -> check passes.
    assert fs._check_directory_structure(root, record) is True

    # Any new entry invalidates the snapshot.
    (root / "new_file.txt").write_text("test")
    assert fs._check_directory_structure(root, record) is False
41
+
42
+
43
def test_copy_from_hdfs_with_mocks(tmp_path, monkeypatch):
    """copy_to_local should land a remote file at <cache>/<md5(src)>/<basename>."""
    # Force the "remote path" branch.
    monkeypatch.setattr(fs, "is_non_local", lambda path: True)

    def fake_copy(src: str, dst: str, *args, **kwargs):
        # Stand-in for the real HDFS copy: materialize an empty file at dst.
        target = Path(dst)
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_bytes(b"")

    monkeypatch.setattr(fs, "copy", fake_copy)

    cache_dir = tmp_path / "cache"
    remote = "hdfs://test/path/file.txt"

    local_path = fs.copy_to_local(remote, cache_dir=cache_dir)
    want = os.path.join(cache_dir, fs.md5_encode(remote), os.path.basename(remote))
    assert local_path == want
    assert os.path.exists(local_path)
64
+
65
+
66
def test_always_recopy_flag(tmp_path, monkeypatch):
    """always_recopy=True must re-download even when the cache entry exists."""
    monkeypatch.setattr(fs, "is_non_local", lambda path: True)

    calls = []

    def fake_copy(src: str, dst: str, *args, **kwargs):
        # Count invocations and drop an empty file where the copy would land.
        calls.append(src)
        target = Path(dst)
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_bytes(b"")

    monkeypatch.setattr(fs, "copy", fake_copy)

    cache_dir = tmp_path / "cache"
    remote = "hdfs://test/path/file.txt"

    # Cold cache -> one real copy.
    fs.copy_to_local(remote, cache_dir=cache_dir)
    assert len(calls) == 1

    # Forced -> copies again despite the warm cache.
    fs.copy_to_local(remote, cache_dir=cache_dir, always_recopy=True)
    assert len(calls) == 2

    # Warm cache without the flag -> no new copy.
    fs.copy_to_local(remote, cache_dir=cache_dir)
    assert len(calls) == 2
code/RL_model/verl/verl_train/tests/utils/test_groupwise.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ # Copyright 2023-2024 SGLang Team
3
+ # Copyright 2025 ModelBest Inc. and/or its affiliates
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ import os
17
+
18
+ os.environ.setdefault("VERL_FORCE_DEVICE", "cpu") # ensure CPU for tests
19
+
20
+ import numpy as np
21
+ import pytest
22
+ import torch
23
+
24
+ from verl.utils import as_torch_index, group_mean_std
25
+
26
+
27
+ def test_as_torch_index_basic_integers():
28
+ g = as_torch_index([2, 2, 5, 7, 5, 2])
29
+ assert g.dtype == torch.long
30
+ assert g.device.type == "cpu"
31
+ # Values should be contiguous 0..G-1, keeping equal labels equal
32
+ assert g.tolist()[0] == g.tolist()[1]
33
+ assert len(torch.unique(g)) == 3 # {2,5,7} -> 3 groups
34
+
35
+
36
+ def test_as_torch_index_near_integer_floats():
37
+ arr = np.array([1.0000001, 2.0, 1.0, 3.0000000001], dtype=np.float64)
38
+ g = as_torch_index(arr) # should round to integers then factorize
39
+ assert g.dtype == torch.long
40
+ assert len(torch.unique(g)) == 3 # {1,2,3}
41
+
42
+
43
+ def test_as_torch_index_factorization_mixed():
44
+ labels = ["a", "b", "a", "c", "0042", 42]
45
+ g = as_torch_index(labels)
46
+ # "0042" and 42 should NOT be the same group (strings are not coerced here)
47
+ assert g.tolist()[4] != g.tolist()[5]
48
+ assert len(torch.unique(g)) == 5
49
+
50
+
51
+ def test_group_mean_std_simple():
52
+ # groups: 0 -> [1, 3], 1 -> [2]
53
+ scores = torch.tensor([1.0, 2.0, 3.0], dtype=torch.float32)
54
+ gidx = as_torch_index([0, 1, 0])
55
+
56
+ mean_g, std_g, cnt_g = group_mean_std(scores, gidx)
57
+ # group 0: mean = (1+3)/2 = 2
58
+ # sample std (unbiased) = sqrt( (sum(x^2) - (sum(x)^2)/n) / (n-1) )
59
+ # = sqrt( (1^2+3^2) - (1+3)^2/2 ) / (2-1) = sqrt(10 - 16/2) = sqrt(2)
60
+ assert torch.allclose(mean_g, torch.tensor([2.0, 0.0]))
61
+ assert torch.allclose(cnt_g, torch.tensor([2.0, 1.0]))
62
+ # singleton group -> std = 1.0
63
+ assert mean_g[1].item() == 0.0
64
+ assert std_g[1].item() == 1.0
65
+ assert pytest.approx(std_g[0].item(), rel=1e-6) == (2.0**0.5)
66
+
67
+
68
+ def test_group_mean_std_empty():
69
+ scores = torch.tensor([], dtype=torch.float32)
70
+ gidx = torch.tensor([], dtype=torch.long)
71
+ mean_g, std_g, cnt_g = group_mean_std(scores, gidx)
72
+ assert mean_g.numel() == 0 and std_g.numel() == 0 and cnt_g.numel() == 0
73
+
74
+
75
+ def test_group_mean_std_default_device_no_force_env(monkeypatch):
76
+ """
77
+ Regression test:
78
+ - group_mean_std(device=None) must not pass a device *module* (e.g., torch.cuda)
79
+ into Tensor.to(device=...), which crashes with:
80
+ TypeError: to() received an invalid combination of arguments - got (..., device=module, ...)
81
+ """
82
+ # Simulate a non-pytest environment (training code path) while keeping the test CPU-only.
83
+ monkeypatch.delenv("VERL_FORCE_DEVICE", raising=False)
84
+ monkeypatch.delenv("PYTEST_CURRENT_TEST", raising=False)
85
+
86
+ # Force device selection to CPU even if CUDA is available on the test machine.
87
+ import verl.utils.device as device_mod
88
+
89
+ monkeypatch.setattr(device_mod, "is_cuda_available", False)
90
+ monkeypatch.setattr(device_mod, "is_npu_available", False)
91
+
92
+ scores = torch.tensor([1.0, 2.0, 3.0], dtype=torch.float32)
93
+ gidx = torch.tensor([0, 1, 0], dtype=torch.long)
94
+
95
+ mean_g, std_g, cnt_g = group_mean_std(scores, gidx)
96
+ assert mean_g.device.type == "cpu"
97
+ assert std_g.device.type == "cpu"
98
+ assert cnt_g.device.type == "cpu"
code/RL_model/verl/verl_train/tests/utils/test_import_utils_on_cpu.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import os
16
+
17
+ import pytest
18
+
19
+ from verl.utils.import_utils import load_extern_object
20
+
21
# Path to the companion fixture module (_test_module.py, next to this file)
# from which the loader tests pull a class, a function, and a constant.
TEST_MODULE_PATH = os.path.join(os.path.dirname(__file__), "_test_module.py")
23
+
24
+
25
def test_load_extern_object_class():
    """A class can be loaded from an external file and instantiated."""
    loaded_cls = load_extern_object(TEST_MODULE_PATH, "TestClass")
    assert loaded_cls is not None
    assert loaded_cls.__name__ == "TestClass"

    # Default construction.
    assert loaded_cls().value == "default"
    # Construction with an explicit value, read back through the accessor.
    assert loaded_cls("custom").get_value() == "custom"
40
+
41
+
42
def test_load_extern_object_function():
    """A function can be loaded from an external file and called."""
    fn = load_extern_object(TEST_MODULE_PATH, "test_function")
    assert fn is not None
    assert callable(fn)
    # Calling the loaded function returns its canned result.
    assert fn() == "test_function_result"
53
+
54
+
55
def test_load_extern_object_constant():
    """A module-level constant can be loaded from an external file."""
    value = load_extern_object(TEST_MODULE_PATH, "TEST_CONSTANT")
    assert value is not None
    assert value == "test_constant_value"
62
+
63
+
64
def test_load_extern_object_nonexistent_file():
    """Loading from a missing file surfaces as FileNotFoundError."""
    with pytest.raises(FileNotFoundError):
        load_extern_object("/nonexistent/path.py", "SomeType")
68
+
69
+
70
def test_load_extern_object_nonexistent_type():
    """Requesting a name absent from the module raises AttributeError."""
    with pytest.raises(AttributeError):
        load_extern_object(TEST_MODULE_PATH, "NonExistentType")
74
+
75
+
76
def test_load_extern_object_none_path():
    """A None file path raises AttributeError."""
    with pytest.raises(AttributeError):
        load_extern_object(None, "SomeType")
80
+
81
+
82
def test_load_extern_object_invalid_module():
    """A module that fails to parse surfaces as RuntimeError."""
    import tempfile

    # Write a throwaway .py file containing invalid syntax.
    with tempfile.NamedTemporaryFile(suffix=".py", mode="w+", delete=False) as handle:
        handle.write("This is not valid Python syntax :")
        broken_path = handle.name

    try:
        with pytest.raises(RuntimeError):
            load_extern_object(broken_path, "SomeType")
    finally:
        # Always remove the temporary file, pass or fail.
        if os.path.exists(broken_path):
            os.remove(broken_path)
code/RL_model/verl/verl_train/tests/utils/test_linear_cross_entropy.py ADDED
@@ -0,0 +1,361 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ #
17
+
18
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
19
+ #
20
+ # Licensed under the Apache License, Version 2.0 (the "License");
21
+ # you may not use this file except in compliance with the License.
22
+ # You may obtain a copy of the License at
23
+ #
24
+ # http://www.apache.org/licenses/LICENSE-2.0
25
+ #
26
+ # Unless required by applicable law or agreed to in writing, software
27
+ # distributed under the License is distributed on an "AS IS" BASIS,
28
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
29
+ # See the License for the specific language governing permissions and
30
+ # limitations under the License.
31
+
32
+ import os
33
+
34
+ import torch
35
+
36
+ import verl.utils.torch_functional as verl_F
37
+ from verl.utils.experimental.torch_functional import FusedLinearForPPO
38
+ from verl.utils.kernel.linear_cross_entropy import linear_cross_entropy
39
+ from verl.utils.torch_functional import logprobs_from_logits
40
+
41
# One-time module-level setup: compile verl's entropy kernel and the fused
# linear+PPO helper with dynamic shapes so every benchmark iteration below
# reuses the compiled graphs instead of recompiling per shape.
compute_entropy_from_logits = torch.compile(verl_F.entropy_from_logits, dynamic=True)
fused_linear_for_ppo = FusedLinearForPPO()
fused_linear_for_ppo.compile(dynamic=True)
44
+
45
# Coerce to int: os.environ.get returns a *string* whenever the variable is
# set, which would break `range(MAX_TEST_CASES)` in __main__ and make the
# `MAX_TEST_CASES <= 5` sanity check raise TypeError.
MAX_TEST_CASES = int(os.environ.get("MAX_TEST_CASES", 5))
46
+
47
+
48
def run_torch_entropy(
    hidden: torch.Tensor, weight: torch.Tensor, labels: torch.Tensor, temperature: float, reduction="none"
) -> list[torch.Tensor]:
    """Reference fp32 path: project hidden states to logits, then return
    (per-token label log-probs, per-token entropy)."""
    h32 = hidden.squeeze(0).to(torch.float32)  # [num_tokens, hidden_size]
    w32 = weight.transpose(0, 1).to(torch.float32)  # [hidden_size, vocab_size]
    logits = torch.matmul(h32, w32)  # [num_tokens, vocab_size]
    logits /= temperature
    # Entropy per token: logsumexp(z) - sum(softmax(z) * z).
    probs = torch.nn.functional.softmax(logits, dim=-1)  # [num_tokens, vocab_size]
    lse = torch.logsumexp(logits, dim=-1)  # [num_tokens]
    expected_logit = torch.sum(probs * logits, dim=-1)  # [num_tokens]
    entropy = lse - expected_logit
    # Label log-prob = -cross_entropy (reduction defaults to per-token values).
    nll = torch.nn.functional.cross_entropy(logits, labels.squeeze(0), reduction=reduction)
    logprobs = torch.neg(nll)
    return logprobs, entropy
62
+
63
+
64
def run_verl_original_entropy(
    hidden: torch.Tensor,
    weight: torch.Tensor,
    labels: torch.Tensor,
    temperature: float,
) -> list[torch.Tensor]:
    """verl's production path: fp32 matmul logits, compiled entropy kernel,
    and logprobs_from_logits for the label log-probs."""
    flat_hidden = hidden.squeeze(0).to(torch.float32)
    proj = weight.transpose(0, 1).to(torch.float32)
    logits = torch.matmul(flat_hidden, proj)  # [num_tokens, vocab_size]
    logits /= temperature
    # Entropy via the module-level torch.compile'd kernel
    # (shape is ((total_nnz / sp) + pad) under sequence parallel).
    entropy = compute_entropy_from_logits(logits)
    # if use_sp: ((total_nnz / sp) + pad) ; if not use_sp: (batch, seqlen).
    # inplace_backward disabled so autograd records plain ops.
    logprobs = logprobs_from_logits(logits=logits, labels=labels, inplace_backward=False)
    return logprobs, entropy
79
+
80
+
81
+ # To be tested
82
# To be tested
def run_verl_torch_fused_entropy(
    hidden: torch.Tensor,
    weight: torch.Tensor,
    labels: torch.Tensor,
    temperature: float,
):
    """Fused linear + PPO-loss path via the module-level compiled helper."""
    logprobs, entropy = fused_linear_for_ppo(
        hidden.to(torch.float32),
        weight.to(torch.float32),
        labels,
        temperature=temperature,
    )
    # Drop the leading batch dim of 1 to match the other implementations.
    return logprobs.squeeze(0), entropy.squeeze(0)
97
+
98
+
99
class TestLinearCrossEntropy:
    """Benchmark/correctness harness (driven from __main__, not pytest) that
    compares four (logprobs, entropy) implementations on CUDA: plain torch,
    verl's compiled path, verl's fused-linear path, and the custom kernel.
    """

    def __init__(self, test_case_idx: int, temperature: float = 1.5) -> None:
        # test_case_idx selects one of the fixed shape presets in generate_hyper().
        self.test_case_idx = test_case_idx
        self.temperature = temperature

    def cleanup(self):
        """Release cached CUDA memory and reset peak-memory statistics."""
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()
        import gc

        gc.collect()
        torch.cuda.synchronize()

    def generate_hyper(self):
        """Populate dtype and batch/token/hidden/vocab sizes for the chosen case."""
        global MAX_TEST_CASES

        self.dtype = torch.bfloat16
        if self.test_case_idx == 0:
            self.batch_size = 1
            self.num_tokens = 1937
            self.hidden_size = 3584
            self.vocab_size = 152064
        elif self.test_case_idx == 1:
            self.batch_size = 1
            self.num_tokens = 2169
            self.hidden_size = 896
            self.vocab_size = 151936
        elif self.test_case_idx == 2:
            self.batch_size = 1
            self.num_tokens = 1530
            self.hidden_size = 2048
            self.vocab_size = 32256
        elif self.test_case_idx == 3:
            self.batch_size = 1
            self.num_tokens = 1388
            self.hidden_size = 4096
            self.vocab_size = 102400
        elif self.test_case_idx == 4:
            self.batch_size = 1
            self.num_tokens = 8192
            self.hidden_size = 4096
            self.vocab_size = 102400
        else:
            raise ValueError(f"Invalid test case index: {self.test_case_idx}")
        assert MAX_TEST_CASES <= 5, "MAX_TEST_CASES should be less than or equal to 5."

    def generate_forward_inputs(self):
        """Return fresh random (hidden, weight, labels) CUDA tensors; hidden and
        weight require grad so backward timing/correctness can be measured."""
        hidden = (
            torch.empty((self.batch_size, self.num_tokens, self.hidden_size), dtype=self.dtype, device="cuda")
            .uniform_(-0.5, 0.5)
            .requires_grad_()
        )
        weight = (
            torch.empty((self.vocab_size, self.hidden_size), dtype=self.dtype, device="cuda")
            .uniform_(-0.5, 0.5)
            .requires_grad_()
        )
        labels = torch.randint(0, self.vocab_size, (self.batch_size, self.num_tokens), device="cuda")
        return hidden, weight, labels

    def generate_backward_inputs(self):
        """Random upstream gradients for (entropy, logprobs)."""
        g_entropy = torch.empty((self.num_tokens,), dtype=self.dtype, device="cuda").uniform_(-0.5, 0.5)
        g_logprobs = torch.empty((self.num_tokens,), dtype=self.dtype, device="cuda").uniform_(-1, 1)
        return g_entropy, g_logprobs

    def verify_correctness(self, iterations=5):
        """Run all four implementations, cross-check outputs and gradients, and
        print average forward/backward latency (first iteration discarded)."""
        self.cleanup()
        self.generate_hyper()

        torch_forward_latency = list()
        torch_backward_latency = list()
        verl_forward_latency = list()
        verl_backward_latency = list()
        verl_fused_forward_latency = list()
        verl_fused_backward_latency = list()
        kernel_forward_latency = list()
        kernel_backward_latency = list()

        # CUDA events measure device-side time; host timers would include gaps.
        start_event = torch.cuda.Event(enable_timing=True)
        end_event = torch.cuda.Event(enable_timing=True)

        for i in range(iterations):
            print(f"[INFO]: Iteration {i + 1} / {iterations}...", end="\r")
            hidden, weight, labels = self.generate_forward_inputs()

            start_event.record()
            (torch_logprobs, torch_entropy) = run_torch_entropy(hidden, weight, labels, self.temperature)
            end_event.record()
            torch.cuda.synchronize()
            torch_forward_latency.append(start_event.elapsed_time(end_event))

            start_event.record()
            (verl_logprobs, verl_entropy) = run_verl_original_entropy(hidden, weight, labels, self.temperature)
            end_event.record()
            torch.cuda.synchronize()
            verl_forward_latency.append(start_event.elapsed_time(end_event))

            start_event.record()
            (verl_fused_logprobs, verl_fused_entropy) = run_verl_torch_fused_entropy(
                hidden, weight, labels, self.temperature
            )
            end_event.record()
            torch.cuda.synchronize()
            verl_fused_forward_latency.append(start_event.elapsed_time(end_event))

            start_event.record()
            (kernel_logprobs, kernel_entropy) = linear_cross_entropy(hidden, weight, labels, self.temperature)
            end_event.record()
            torch.cuda.synchronize()
            kernel_forward_latency.append(start_event.elapsed_time(end_event))

            # The three torch-based paths are held to tight tolerances.
            torch.testing.assert_close(torch_logprobs, verl_logprobs, atol=1e-4, rtol=1e-4)
            torch.testing.assert_close(torch_entropy, verl_entropy, atol=1e-4, rtol=1e-4)

            torch.testing.assert_close(torch_logprobs, verl_fused_logprobs, atol=1e-4, rtol=1e-4)
            torch.testing.assert_close(torch_entropy, verl_fused_entropy, atol=1e-4, rtol=1e-4)
            torch.testing.assert_close(verl_logprobs, verl_fused_logprobs, atol=1e-4, rtol=1e-4)
            torch.testing.assert_close(verl_entropy, verl_fused_entropy, atol=1e-4, rtol=1e-4)

            # The custom kernel gets looser tolerances against all references.
            torch.testing.assert_close(torch_logprobs, kernel_logprobs, atol=1e-3, rtol=2e-4)
            torch.testing.assert_close(torch_entropy, kernel_entropy, atol=5e-3, rtol=5e-4)
            torch.testing.assert_close(verl_logprobs, kernel_logprobs, atol=1e-3, rtol=2e-4)
            torch.testing.assert_close(verl_entropy, kernel_entropy, atol=5e-3, rtol=5e-4)
            torch.testing.assert_close(verl_fused_logprobs, kernel_logprobs, atol=1e-3, rtol=2e-4)
            torch.testing.assert_close(verl_fused_entropy, kernel_entropy, atol=5e-3, rtol=5e-4)

            # backward
            g_entropy, g_logprobs = self.generate_backward_inputs()

            start_event.record()
            (d_torch_hidden, d_torch_weight) = torch.autograd.grad(
                (torch_entropy, torch_logprobs), (hidden, weight), (g_entropy, g_logprobs), retain_graph=False
            )
            end_event.record()
            torch.cuda.synchronize()
            torch_backward_latency.append(start_event.elapsed_time(end_event))

            start_event.record()
            (d_verl_hidden, d_verl_weight) = torch.autograd.grad(
                (verl_entropy, verl_logprobs), (hidden, weight), (g_entropy, g_logprobs), retain_graph=False
            )
            end_event.record()
            torch.cuda.synchronize()
            verl_backward_latency.append(start_event.elapsed_time(end_event))

            start_event.record()
            (d_verl_fused_hidden, d_verl_fused_weight) = torch.autograd.grad(
                (verl_fused_entropy, verl_fused_logprobs), (hidden, weight), (g_entropy, g_logprobs), retain_graph=False
            )
            end_event.record()
            torch.cuda.synchronize()
            verl_fused_backward_latency.append(start_event.elapsed_time(end_event))

            start_event.record()
            (d_kernel_hidden, d_kernel_weight) = torch.autograd.grad(
                (kernel_entropy, kernel_logprobs), (hidden, weight), (g_entropy, g_logprobs), retain_graph=False
            )
            end_event.record()
            torch.cuda.synchronize()
            kernel_backward_latency.append(start_event.elapsed_time(end_event))

            # Gradients: torch-based paths against each other (tight-ish).
            torch.testing.assert_close(d_torch_hidden, d_verl_hidden, atol=1e-2, rtol=1e-4)
            torch.testing.assert_close(d_torch_weight, d_verl_weight, atol=1e-2, rtol=1e-4)

            torch.testing.assert_close(d_torch_hidden, d_verl_fused_hidden, atol=1e-2, rtol=1e-4)
            torch.testing.assert_close(d_torch_weight, d_verl_fused_weight, atol=1e-2, rtol=1e-4)
            torch.testing.assert_close(d_verl_hidden, d_verl_fused_hidden, atol=1e-2, rtol=1e-4)
            torch.testing.assert_close(d_verl_weight, d_verl_fused_weight, atol=1e-2, rtol=1e-4)
            torch.testing.assert_close(d_torch_hidden, d_verl_hidden, atol=1e-2, rtol=1e-4)
            torch.testing.assert_close(d_torch_weight, d_verl_weight, atol=1e-2, rtol=1e-4)

            # Gradients: kernel path, with the loosest tolerances.
            torch.testing.assert_close(d_torch_hidden, d_kernel_hidden, atol=2e-2, rtol=4e-2)
            torch.testing.assert_close(d_torch_weight, d_kernel_weight, atol=2e-2, rtol=4e-2)
            torch.testing.assert_close(d_verl_hidden, d_kernel_hidden, atol=2e-2, rtol=4e-2)
            torch.testing.assert_close(d_verl_weight, d_kernel_weight, atol=2e-2, rtol=4e-2)
            torch.testing.assert_close(d_verl_fused_hidden, d_kernel_hidden, atol=2e-2, rtol=4e-2)
            torch.testing.assert_close(d_verl_fused_weight, d_kernel_weight, atol=2e-2, rtol=4e-2)

        # remove first latency (warm-up/compile iteration would skew averages)
        torch_forward_latency = torch_forward_latency[1:]
        torch_backward_latency = torch_backward_latency[1:]
        verl_forward_latency = verl_forward_latency[1:]
        verl_backward_latency = verl_backward_latency[1:]
        verl_fused_forward_latency = verl_fused_forward_latency[1:]
        verl_fused_backward_latency = verl_fused_backward_latency[1:]
        kernel_forward_latency = kernel_forward_latency[1:]
        kernel_backward_latency = kernel_backward_latency[1:]

        print("\n[INFO]: Verified forward & backward correctness.")

        print(
            f"[INFO]: Forward pass: Torch implementation average time: "
            f"{sum(torch_forward_latency) / len(torch_forward_latency):.2f} ms"
        )
        print(
            f"[INFO]: Backward pass: torch implementation average time: "
            f"{sum(torch_backward_latency) / len(torch_backward_latency):.2f} ms"
        )
        print(
            f"[INFO]: Forward pass: VeRL implementation average time: "
            f"{sum(verl_forward_latency) / len(verl_forward_latency):.2f} ms"
        )
        print(
            f"[INFO]: Backward pass: VeRL implementation average time: "
            f"{sum(verl_backward_latency) / len(verl_backward_latency):.2f} ms"
        )
        print(
            f"[INFO]: Forward pass: VeRL Fused Entropy implementation average time: "
            f"{sum(verl_fused_forward_latency) / len(verl_fused_forward_latency):.2f} ms"
        )
        print(
            f"[INFO]: Backward pass: VeRL Fused Entropy implementation average time: "
            f"{sum(verl_fused_backward_latency) / len(verl_fused_backward_latency):.2f} ms"
        )
        print(
            f"[INFO]: Forward pass: Kernel implementation average time: "
            f"{sum(kernel_forward_latency) / len(kernel_forward_latency):.2f} ms"
        )
        print(
            f"[INFO]: Backward pass: kernel implementation average time: "
            f"{sum(kernel_backward_latency) / len(kernel_backward_latency):.2f} ms"
        )

    def check_storage(self, method_name, run_forward):
        """Print peak CUDA memory for one forward and one backward pass of the
        implementation given by ``run_forward``."""
        self.cleanup()
        self.generate_hyper()

        hidden, weight, labels = self.generate_forward_inputs()

        torch.cuda.reset_peak_memory_stats()
        (logprobs, entropy) = run_forward(hidden, weight, labels, self.temperature)
        torch.cuda.synchronize()
        torch_max_memory = torch.cuda.max_memory_allocated() / 1024 / 1024
        print(f"[INFO]: {method_name} Forward pass peak memory: {torch_max_memory:.2f} MB")

        g_entropy, g_logprobs = self.generate_backward_inputs()

        torch.cuda.reset_peak_memory_stats()
        (d_torch_hidden, d_torch_weight) = torch.autograd.grad(
            (entropy, logprobs), (hidden, weight), (g_entropy, g_logprobs), retain_graph=False
        )
        torch.cuda.synchronize()
        torch_backward_max_memory = torch.cuda.max_memory_allocated() / 1024 / 1024
        print(f"[INFO]: {method_name} Backward pass peak memory: {torch_backward_max_memory:.2f} MB")

    def check_storage_all(self):
        """Run the peak-memory report for every implementation."""
        self.check_storage("Torch", run_torch_entropy)
        self.check_storage("VeRL", run_verl_original_entropy)
        self.check_storage("VeRL Torch Fused", run_verl_torch_fused_entropy)
        self.check_storage("Kernel", linear_cross_entropy)
349
+
350
+
351
+ if __name__ == "__main__":
352
+ # torch.cuda.memory._record_memory_history()
353
+
354
+ for test_case_idx in range(MAX_TEST_CASES):
355
+ print(f"[INFO] Running test case {test_case_idx}")
356
+ test = TestLinearCrossEntropy(test_case_idx)
357
+
358
+ test.verify_correctness()
359
+ test.check_storage_all()
360
+
361
+ # torch.cuda.memory._dump_snapshot("test_linear_cross_entropy.pkl")
code/RL_model/verl/verl_train/tests/utils/test_mlflow_key_sanitization.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import unittest
16
+ from unittest.mock import patch
17
+
18
+ from verl.utils.tracking import _MlflowLoggingAdapter
19
+
20
+
21
class TestMlflowLoggingAdapter(unittest.TestCase):
    """Unit tests for metric-key sanitization in the MLflow logging adapter."""

    def test_sanitize_key_and_warning(self):
        """Invalid characters and repeated slashes are rewritten, with warnings."""
        adapter = _MlflowLoggingAdapter()
        metrics_in = {
            "valid_key": 1.0,
            "invalid@key!": 2.0,
            "another/valid-key": 3.0,
            "bad key#": 4.0,
            "val-aux//reward/mean_at_1": 5.0,
            "val-core///acc/best_at_5": 6.0,
            "metric////with/many////slashes": 7.0,
        }
        # Intercept both the outgoing mlflow call and the adapter's logger.
        with (
            patch("mlflow.log_metrics") as log_metrics_mock,
            patch.object(adapter, "logger") as logger_mock,
        ):
            adapter.log(metrics_in, step=5)

            sent = log_metrics_mock.call_args[1]["metrics"]
            # '@' becomes '_at_'; '!' and '#' become '_'; originals are gone.
            self.assertIn("invalid_at_key_", sent)
            self.assertIn("bad key_", sent)
            self.assertNotIn("invalid@key!", sent)
            self.assertNotIn("bad key#", sent)
            # Runs of '/' collapse to a single separator.
            self.assertIn("val-aux/reward/mean_at_1", sent)
            self.assertIn("val-core/acc/best_at_5", sent)
            self.assertIn("metric/with/many/slashes", sent)
            self.assertNotIn("val-aux//reward/mean_at_1", sent)
            self.assertNotIn("val-core///acc/best_at_5", sent)
            # Every rewritten key should have produced a warning.
            warnings_seen = [str(call) for call in logger_mock.warning.call_args_list]
            self.assertTrue(any("invalid@key!" in msg and "invalid_at_key_" in msg for msg in warnings_seen))
            self.assertTrue(any("bad key#" in msg and "bad key_" in msg for msg in warnings_seen))
            self.assertTrue(any("val-aux//reward/mean_at_1" in msg for msg in warnings_seen))
            self.assertTrue(any("val-core///acc/best_at_5" in msg for msg in warnings_seen))
            self.assertTrue(any("metric////with/many////slashes" in msg for msg in warnings_seen))
61
+
62
+
63
+ if __name__ == "__main__":
64
+ unittest.main()
code/RL_model/verl/verl_train/tests/utils/test_model_on_cpu.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2025 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from types import SimpleNamespace # Or use a mock object library
16
+
17
+ import pytest
18
+
19
+ from verl.utils.model import update_model_config
20
+
21
+
22
+ # Parametrize with different override scenarios
23
@pytest.mark.parametrize(
    "override_kwargs",
    [
        {"param_a": 5, "new_param": "plain_added"},
        {"param_a": 2, "nested_params": {"sub_param_x": "updated_x", "sub_param_z": True}},
    ],
)
def test_update_model_config(override_kwargs):
    """update_model_config applies plain and nested overrides in place,
    leaving untouched attributes intact."""
    # Fresh mock config object per parametrized case.
    cfg = SimpleNamespace(
        param_a=1, nested_params=SimpleNamespace(sub_param_x="original_x", sub_param_y=100), other_param="keep_me"
    )
    update_model_config(cfg, override_kwargs)

    nested = cfg.nested_params
    if "nested_params" not in override_kwargs:
        # Plain override: the nested namespace must be left untouched.
        assert nested.sub_param_x == "original_x", "Nested sub_param_x should be unchanged"
        assert nested.sub_param_y == 100, "Nested sub_param_y should be unchanged"
        assert not hasattr(nested, "sub_param_z"), "Nested sub_param_z should not exist"
    else:
        # Nested override: requested keys change, others survive, new keys appear.
        wanted = override_kwargs["nested_params"]
        assert nested.sub_param_x == wanted["sub_param_x"], "Nested sub_param_x mismatch"
        assert nested.sub_param_y == 100, "Nested sub_param_y should be unchanged"
        assert hasattr(nested, "sub_param_z"), "Expected nested sub_param_z to be added"
        assert nested.sub_param_z == wanted["sub_param_z"], "Value of sub_param_z mismatch"
code/RL_model/verl/verl_train/tests/utils/test_nvtx_profile.py ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import unittest
17
+ from unittest.mock import MagicMock, patch
18
+
19
+ from verl.utils import omega_conf_to_dataclass
20
+ from verl.utils.profiler.config import NsightToolConfig, ProfilerConfig
21
+ from verl.utils.profiler.profile import DistProfiler
22
+
23
+
24
class TestProfilerConfig(unittest.TestCase):
    """Tests ProfilerConfig construction from Hydra/OmegaConf and its immutability."""

    def test_config_init(self):
        """Every role-level profiler section of ppo_trainer.yaml converts to ProfilerConfig.

        NOTE(review): relies on the test process running from the repo root so
        that ``verl/trainer/config`` resolves — confirm in the CI working dir.
        """
        import os

        from hydra import compose, initialize_config_dir

        with initialize_config_dir(config_dir=os.path.abspath("verl/trainer/config")):
            cfg = compose(config_name="ppo_trainer")
            for config in [
                cfg.actor_rollout_ref.actor.profiler,
                cfg.actor_rollout_ref.rollout.profiler,
                cfg.actor_rollout_ref.ref.profiler,
                cfg.critic.profiler,
                cfg.reward_model.profiler,
            ]:
                profiler_config = omega_conf_to_dataclass(config)
                # Field-by-field equality between the dataclass and the raw config node.
                self.assertEqual(profiler_config.tool, config.tool)
                self.assertEqual(profiler_config.enable, config.enable)
                self.assertEqual(profiler_config.all_ranks, config.all_ranks)
                self.assertEqual(profiler_config.ranks, config.ranks)
                self.assertEqual(profiler_config.save_path, config.save_path)
                self.assertEqual(profiler_config.ranks, config.ranks)
                assert isinstance(profiler_config, ProfilerConfig)
                # Attribute access on a missing key raises, while dict-style .get
                # mirrors OmegaConf's behavior (None / supplied default).
                with self.assertRaises(AttributeError):
                    _ = profiler_config.non_existing_key
                assert config.get("non_existing_key") == profiler_config.get("non_existing_key")
                assert config.get("non_existing_key", 1) == profiler_config.get("non_existing_key", 1)

    def test_frozen_config(self):
        """Test that modifying frozen keys in ProfilerConfig raises exceptions."""
        from dataclasses import FrozenInstanceError

        from verl.utils.profiler.config import ProfilerConfig

        # Create a new ProfilerConfig instance
        config = ProfilerConfig(all_ranks=False, ranks=[0])

        # Attribute assignment is blocked by the frozen-dataclass machinery.
        with self.assertRaises(FrozenInstanceError):
            config.all_ranks = True

        with self.assertRaises(FrozenInstanceError):
            config.ranks = [1, 2, 3]

        # Item assignment is not supported on the dataclass at all.
        with self.assertRaises(TypeError):
            config["all_ranks"] = True

        with self.assertRaises(TypeError):
            config["ranks"] = [1, 2, 3]
72
+
73
+
74
class TestNsightSystemsProfiler(unittest.TestCase):
    """Test suite for NsightSystemsProfiler functionality.

    Test Plan:
    1. Initialization: Verify profiler state after creation
    2. Basic Profiling: Test start/stop functionality
    3. Discrete Mode: TODO: Test discrete profiling behavior
    4. Annotation: Test the annotate decorator in both normal and discrete modes
    5. Config Validation: Verify proper config initialization from OmegaConf
    """

    def setUp(self):
        # Non-discrete nsys profiling, enabled on all ranks; tests run as rank 0.
        self.config = ProfilerConfig(tool="nsys", enable=True, all_ranks=True)
        self.rank = 0
        self.profiler = DistProfiler(self.rank, self.config, tool_config=NsightToolConfig(discrete=False))

    def test_initialization(self):
        # all_ranks=True: this rank participates; no step is active until start().
        self.assertEqual(self.profiler.check_this_rank(), True)
        self.assertEqual(self.profiler.check_this_step(), False)

    def test_start_stop_profiling(self):
        """start()/stop() toggle the per-step flag and drive torch.cuda.profiler once each."""
        with patch("torch.cuda.profiler.start") as mock_start, patch("torch.cuda.profiler.stop") as mock_stop:
            # Test start
            self.profiler.start()
            self.assertTrue(self.profiler.check_this_step())
            mock_start.assert_called_once()

            # Test stop
            self.profiler.stop()
            self.assertFalse(self.profiler.check_this_step())
            mock_stop.assert_called_once()

    # def test_discrete_profiling(self):
    #     discrete_config = ProfilerConfig(discrete=True, all_ranks=True)
    #     profiler = NsightSystemsProfiler(self.rank, discrete_config)

    #     with patch("torch.cuda.profiler.start") as mock_start, patch("torch.cuda.profiler.stop") as mock_stop:
    #         profiler.start()
    #         self.assertTrue(profiler.this_step)
    #         mock_start.assert_not_called()  # Shouldn't start immediately in discrete mode

    #         profiler.stop()
    #         self.assertFalse(profiler.this_step)
    #         mock_stop.assert_not_called()  # Shouldn't stop immediately in discrete mode

    def test_annotate_decorator(self):
        """In non-discrete mode annotate() opens/closes an NVTX range but never toggles the CUDA profiler."""
        mock_self = MagicMock()
        mock_self.profiler = self.profiler
        mock_self.profiler.start()
        decorator = mock_self.profiler.annotate(message="test")

        @decorator
        def test_func(self, *args, **kwargs):
            return "result"

        with (
            patch("torch.cuda.profiler.start") as mock_start,
            patch("torch.cuda.profiler.stop") as mock_stop,
            patch("verl.utils.profiler.nvtx_profile.mark_start_range") as mock_start_range,
            patch("verl.utils.profiler.nvtx_profile.mark_end_range") as mock_end_range,
        ):
            result = test_func(mock_self)
            self.assertEqual(result, "result")
            mock_start_range.assert_called_once()
            mock_end_range.assert_called_once()
            mock_start.assert_not_called()  # Not discrete mode
            mock_stop.assert_not_called()  # Not discrete mode

    # def test_annotate_discrete_mode(self):
    #     discrete_config = ProfilerConfig(discrete=True, all_ranks=True)
    #     profiler = NsightSystemsProfiler(self.rank, discrete_config)
    #     mock_self = MagicMock()
    #     mock_self.profiler = profiler
    #     mock_self.profiler.this_step = True

    #     @NsightSystemsProfiler.annotate(message="test")
    #     def test_func(self, *args, **kwargs):
    #         return "result"

    #     with (
    #         patch("torch.cuda.profiler.start") as mock_start,
    #         patch("torch.cuda.profiler.stop") as mock_stop,
    #         patch("verl.utils.profiler.nvtx_profile.mark_start_range") as mock_start_range,
    #         patch("verl.utils.profiler.nvtx_profile.mark_end_range") as mock_end_range,
    #     ):
    #         result = test_func(mock_self)
    #         self.assertEqual(result, "result")
    #         mock_start_range.assert_called_once()
    #         mock_end_range.assert_called_once()
    #         mock_start.assert_called_once()  # Should start in discrete mode
    #         mock_stop.assert_called_once()  # Should stop in discrete mode
165
+
166
+
167
+ if __name__ == "__main__":
168
+ unittest.main()
code/RL_model/verl/verl_train/tests/utils/test_rollout_skip_on_cpu.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2025 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ import shutil
15
+ import tempfile
16
+ from pathlib import Path
17
+ from unittest.mock import MagicMock
18
+
19
+ import pytest
20
+ import torch
21
+
22
+ from verl.utils.rollout_skip import DataProto, RolloutSkip
23
+
24
+ len_prompt = 50
25
+ len_response = 100
26
+
27
+
28
def temp_dir():
    """Yield a freshly created temporary directory; remove it when resumed.

    NOTE(review): not decorated with ``@pytest.fixture`` — callers in this file
    drive it manually with ``next(...)``, in which case the cleanup line after
    the ``yield`` never runs. Confirm whether that leak is intended.
    """
    workdir = Path(tempfile.mkdtemp())
    yield workdir
    # Cleanup runs only when the generator is advanced past the yield.
    shutil.rmtree(workdir)
34
+
35
+
36
def build_generate_fn(gen_bs, n):
    """Return a mock ``generate_sequences`` function producing random DataProto batches.

    Each call returns fresh random prompts (each prompt repeated ``n`` times)
    and random responses, mimicking a rollout inference engine.
    """
    vocab_size = 1024

    def _stream():
        # Infinite stream of random rollout batches.
        while True:
            prompts = torch.randint(vocab_size, size=(gen_bs, len_prompt)).repeat_interleave(n, dim=0)
            responses = torch.randint(vocab_size, size=(gen_bs * n, len_response))
            yield DataProto.from_dict(tensors={"prompt": prompts, "response": responses})

    engine = _stream()

    def fn(batch, **kwargs):
        # The incoming batch is ignored; just emit the next pre-generated rollout.
        return next(engine)

    return fn
53
+
54
+
55
@pytest.fixture(params=[(32, 4), (64, 4), (64, 8)])
def mock_rollout_wg(request):
    """Yield a mocked ``(config, rollout_wg)`` pair for RolloutSkip tests.

    Parametrized over ``(gen_batch_size, rollout n)``. The ``skip_dump_dir`` is
    a per-test temporary directory that is reliably removed on teardown.
    """
    gen_bs, n = request.param
    rollout_wg = MagicMock()
    rollout_wg.generate_sequences = build_generate_fn(gen_bs, n)

    # Create the dump directory directly so teardown removes the *same* path.
    # (The previous implementation called next(temp_dir()) both here and in
    # teardown, which created a second fresh directory at teardown, deleted
    # that one, and leaked the directory actually used by the test.)
    dump_dir = Path(tempfile.mkdtemp())

    config = MagicMock()
    config.actor_rollout_ref.rollout = {
        "n": n,
        "skip_dump_dir": dump_dir,
    }
    config.data = {"gen_batch_size": gen_bs}

    yield config, rollout_wg

    # Cleanup: remove the directory handed to the test, tolerating prior removal.
    shutil.rmtree(dump_dir, ignore_errors=True)
72
+
73
+
74
class TestRolloutSkip:
    """End-to-end tests of RolloutSkip's dump/load patching of generate_sequences."""

    def test_initialization(self, capsys):
        """Test that RolloutSkip initializes correctly"""
        config = MagicMock()
        config.actor_rollout_ref.rollout = {
            "n": 16,
            "skip_dump_dir": "tmp/rollout_dump",
        }
        config.data = {"gen_batch_size": 128}
        mock_rollout_wg = MagicMock()
        skip = RolloutSkip(config, mock_rollout_wg)

        # Config values must be mirrored onto the RolloutSkip instance.
        assert skip.n == 16
        assert skip.gbs == 128
        assert str(skip.dumped_dir) == "tmp/rollout_dump"

        assert skip._rollout_wg == mock_rollout_wg
        # Patching prints a confirmation message to stdout.
        skip.wrap_generate_sequences()
        captured = capsys.readouterr()
        assert "Successfully patched" in captured.out

    def test_generate_without_wrap(self, mock_rollout_wg):
        """Test that generate_sequences works without wrapping"""

        config, rollout_wg = mock_rollout_wg
        _ = RolloutSkip(config, rollout_wg)

        _result = rollout_wg.generate_sequences(MagicMock())
        for _ in range(10):
            result = rollout_wg.generate_sequences(MagicMock())
            assert isinstance(result, DataProto)
            # * make sure the data is different
            # Unwrapped, every call must return freshly generated random data.
            assert torch.abs(_result.batch["prompt"] - result.batch["prompt"]).sum() > 0
            assert torch.abs(_result.batch["response"] - result.batch["response"]).sum() > 0
            _result = result

    def test_dump(self, mock_rollout_wg, capsys):
        """First wrapped call must dump generated data to disk and report it."""
        config, rollout_wg = mock_rollout_wg
        skip = RolloutSkip(config, rollout_wg)
        skip.wrap_generate_sequences()

        result = rollout_wg.generate_sequences(MagicMock())
        # * check if dump is OK
        assert skip.curr_path_dump.exists()
        captured = capsys.readouterr()
        assert "Successfully dump data in" in captured.out
        # * get file size, estimate file size
        # Lower bound: raw token payload (prompt+response tokens * batch * n * itemsize).
        file_size = skip.curr_path_dump.stat().st_size
        est_file_size = (len_prompt + len_response) * skip.gbs * skip.n * result.batch["prompt"].dtype.itemsize
        assert file_size >= est_file_size, "Dumped file size is smaller than expected"

    def test_generate_with_wrap(self, mock_rollout_wg, capsys):
        """Test that wrapped generate_sequences replays the dumped data."""

        config, rollout_wg = mock_rollout_wg
        skip = RolloutSkip(config, rollout_wg)
        skip.wrap_generate_sequences()

        _result = rollout_wg.generate_sequences(MagicMock())

        for _ in range(10):
            result = rollout_wg.generate_sequences(MagicMock())
            assert isinstance(result, DataProto)
            # * make sure the data is different
            # Wrapped, every subsequent call must replay the *same* dumped data.
            assert torch.abs(_result.batch["prompt"] - result.batch["prompt"]).sum() == 0
            assert torch.abs(_result.batch["response"] - result.batch["response"]).sum() == 0
            captured = capsys.readouterr()
            assert "Successfully load pre-generated data from" in captured.out
            _result = result
code/RL_model/verl/verl_train/tests/utils/test_rollout_trace_on_cpu.py ADDED
@@ -0,0 +1,246 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2025 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import os
16
+ import sys
17
+ from unittest.mock import MagicMock, patch
18
+
19
+ import pytest
20
+
21
+ from verl.utils.rollout_trace import RolloutTraceConfig, rollout_trace_attr, rollout_trace_op
22
+
23
+
24
@pytest.fixture(autouse=True)
def reset_rollout_trace_config_singleton():
    """Fixture to reset the RolloutTraceConfig singleton before each test.

    autouse ensures no tracing configuration leaks between tests in this module.
    """
    RolloutTraceConfig.reset()
28
+
29
+
30
@pytest.fixture
def mock_weave_client():
    """Install a fake ``weave`` module in sys.modules and yield its mock client.

    The client's ``create_call`` returns a pre-built mock call object so tests
    can assert on create/finish interactions without the real weave package.
    """
    fake_weave = MagicMock()
    client = MagicMock()
    client.create_call.return_value = MagicMock()
    fake_weave.init.return_value = client

    # The decorator may enter weave's call context internally; mock it too.
    fake_weave.trace.context.call_context.return_value = MagicMock()

    patched_modules = {
        "weave": fake_weave,
        "weave.trace.context": fake_weave.trace.context,
    }
    with patch.dict(sys.modules, patched_modules):
        yield client
44
+
45
+
46
class TracedClass:
    """Async methods decorated with @rollout_trace_op, used as tracing targets.

    The commented-out decorators (@weave.op / @mlflow.trace) document the real
    backend decorators these tests emulate through mocks.
    """

    @rollout_trace_op
    # @weave.op
    # @mlflow.trace
    async def my_method(self, a, b="default"):
        # Traced leaf call: simply echoes its arguments.
        return f"result: {a}, {b}"

    @rollout_trace_op
    # @weave.op
    # @mlflow.trace
    async def middle_method(self, a, b="default"):
        # Traced call that awaits another traced call (exercises nesting).
        await self.my_method("test_a1", b="test_b1")
        return f"result: {a}, {b}"

    @rollout_trace_op
    # @mlflow.trace
    async def my_method_with_exception(self):
        # Traced call that always raises, for exception-propagation tests.
        raise ValueError("Test Exception")

    async def upper_method(self):
        # Untraced entry point fanning out to traced calls.
        await self.my_method("test_a0", b="test_b0")
        await self.middle_method("test_a2", b="test_b2")
        return True
69
+
70
+
71
class UntracedClass:
    """Decorated class used to verify @rollout_trace_op is a no-op without a backend."""

    @rollout_trace_op
    async def my_method(self, x):
        return x * 2
75
+
76
+
77
async def test_rollout_trace_on_untraced_class():
    """The decorator must be a transparent pass-through when no backend is configured."""
    obj = UntracedClass()
    result = await obj.my_method(10)
    assert result == 20
81
+
82
+
83
async def test_rollout_trace_with_tracer(mock_weave_client):
    """With the weave backend active, a traced call is created and finished once."""
    RolloutTraceConfig.init(project_name="my-project", experiment_name="my-experiment", backend="weave")
    obj = TracedClass()
    assert RolloutTraceConfig.get_client() is mock_weave_client

    result = await obj.my_method("test_a", b="test_b")
    assert result == "result: test_a, test_b"

    # Exactly one call recorded, named after class+method, with bound arguments.
    mock_weave_client.create_call.assert_called_once()
    recorded = mock_weave_client.create_call.call_args.kwargs
    assert recorded["op"] == "TracedClass.my_method"
    assert recorded["inputs"] == {"a": "test_a", "b": "test_b"}

    # The same call object is finished with the method's return value.
    pending_call = mock_weave_client.create_call.return_value
    mock_weave_client.finish_call.assert_called_once_with(pending_call, output=result)
100
+
101
+
102
async def test_rollout_trace_with_exception(mock_weave_client):
    """finish_call must receive the raised exception instead of an output."""
    RolloutTraceConfig.init(project_name="my-project", experiment_name="my-experiment", backend="weave")
    obj = TracedClass()

    with pytest.raises(ValueError, match="Test Exception"):
        await obj.my_method_with_exception()

    mock_weave_client.create_call.assert_called_once()
    mock_weave_client.finish_call.assert_called_once()

    # Signature observed: finish_call(call, exception=ValueError(...)).
    args, kwargs = mock_weave_client.finish_call.call_args
    assert args[0] == mock_weave_client.create_call.return_value
    assert "exception" in kwargs
    assert isinstance(kwargs["exception"], ValueError)
119
+
120
+
121
async def test_rollout_trace_with_dummy_backend(mock_weave_client):
    """The 'dummy' backend must never touch the weave client."""
    RolloutTraceConfig.init(project_name="my-project", experiment_name="my-experiment", backend="dummy")
    obj = TracedClass()

    await obj.my_method("test_a")

    mock_weave_client.create_call.assert_not_called()
129
+
130
+
131
async def test_trace_disabled_with_trace_false(mock_weave_client):
    """rollout_trace_attr(trace=False) must suppress tracing inside its scope."""
    RolloutTraceConfig.init(
        project_name="my-project",
        experiment_name="my-experiment",
        backend="weave",
    )
    obj = TracedClass()

    assert RolloutTraceConfig.get_backend() == "weave"

    # trace=False: the traced method runs normally but records nothing.
    with rollout_trace_attr(step=1, sample_index=0, rollout_n=0, trace=False):
        outcome = await obj.my_method("test_a", b="test_b")
        assert outcome == "result: test_a, test_b"

    mock_weave_client.create_call.assert_not_called()

    # Default trace=True: tracing works again after leaving the disabled scope.
    with rollout_trace_attr(step=1, sample_index=0, rollout_n=0):
        outcome = await obj.my_method("test_a", b="test_b")
        assert outcome == "result: test_a, test_b"

    assert mock_weave_client.create_call.call_count == 1
155
+
156
+
157
async def test_trace_false_disables_nested_trace_ops(mock_weave_client):
    """trace=False must also disable every nested @rollout_trace_op call."""
    RolloutTraceConfig.init(
        project_name="my-project",
        experiment_name="my-experiment",
        backend="weave",
    )
    obj = TracedClass()

    with rollout_trace_attr(step=1, sample_index=0, rollout_n=0, trace=False):
        # upper_method fans out into my_method and middle_method, all of which
        # carry @rollout_trace_op — none may be recorded.
        assert await obj.upper_method() is True

    mock_weave_client.create_call.assert_not_called()

    # Tracing is back on in a fresh default scope.
    with rollout_trace_attr(step=1, sample_index=0, rollout_n=0):
        outcome = await obj.my_method("test_a", b="test_b")
        assert outcome == "result: test_a, test_b"

    assert mock_weave_client.create_call.call_count == 1
180
+
181
+
182
async def test_trace_enabled_restored_after_exception(mock_weave_client):
    """The trace-enabled state must be restored even if trace=False scope raises."""
    RolloutTraceConfig.init(
        project_name="my-project",
        experiment_name="my-experiment",
        backend="weave",
    )
    obj = TracedClass()

    assert RolloutTraceConfig.get_backend() == "weave"

    # Raise inside a trace=False scope; the exception is expected and swallowed.
    with pytest.raises(RuntimeError):
        with rollout_trace_attr(step=1, sample_index=0, rollout_n=0, trace=False):
            raise RuntimeError("Test exception with trace disabled")

    # A subsequent default scope must trace normally.
    with rollout_trace_attr(step=1, sample_index=0, rollout_n=0):
        outcome = await obj.my_method("test_a", b="test_b")
        assert outcome == "result: test_a, test_b"

    assert mock_weave_client.create_call.call_count == 1
205
+
206
+
207
@pytest.mark.skipif(
    os.environ.get("RUN_WEAVE_INTEGRATION_TESTS", "false").lower() != "true",
    reason="Skipping weave integration test. Set RUN_WEAVE_INTEGRATION_TESTS=true to run.",
)
async def test_rollout_trace_with_real_weave_backend():
    """Integration test with a real weave backend.

    Opt-in via RUN_WEAVE_INTEGRATION_TESTS=true; assumes the weave project is
    already configured in the environment.
    """
    RolloutTraceConfig.init(project_name="my-project", experiment_name="my-experiment", backend="weave")

    instance = TracedClass()

    # Nested traced calls under explicit trace attributes.
    with rollout_trace_attr(step=1, sample_index=2, rollout_n=3):
        await instance.upper_method()

    # A traced call that raises must still propagate its exception.
    with pytest.raises(ValueError, match="Test Exception"):
        await instance.my_method_with_exception()

    print("\nWeave integration test ran successfully. Check your weave project for the trace.")
226
+
227
+
228
@pytest.mark.skipif(
    os.environ.get("RUN_MLFLOW_INTEGRATION_TESTS", "false").lower() != "true",
    reason="Skipping mlflow integration test. Set RUN_MLFLOW_INTEGRATION_TESTS=true to run.",
)
async def test_rollout_trace_with_real_mlflow_backend():
    """Integration test with a real mlflow backend.

    Opt-in via RUN_MLFLOW_INTEGRATION_TESTS=true; assumes the mlflow tracking
    environment is already configured.
    """
    RolloutTraceConfig.init(project_name="my-project", experiment_name="my-experiment", backend="mlflow")

    instance = TracedClass()

    with rollout_trace_attr(step=1, sample_index=2, rollout_n=3, name="agent_run"):
        assert await instance.upper_method()

    # TODO(review): exception path is disabled for mlflow; re-enable once the
    # backend surfaces exceptions the same way the weave path does.
    # with pytest.raises(ValueError, match="Test Exception"):
    #     await instance.my_method_with_exception()

    # Fix: this message previously claimed to be the *weave* integration test.
    print("\nMlflow integration test ran successfully. Check your mlflow project for the trace.")
code/RL_model/verl/verl_train/tests/utils/test_seqlen_balancing.py ADDED
@@ -0,0 +1,278 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2025 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import torch
16
+ import torch.distributed as dist
17
+ import torch.multiprocessing as mp
18
+
19
+ from verl import DataProto
20
+ from verl.utils.device import get_device_name, get_nccl_backend, get_torch_device
21
+ from verl.utils.model import create_random_mask
22
+ from verl.utils.seqlen_balancing import (
23
+ ceildiv,
24
+ get_reverse_idx,
25
+ prepare_dynamic_batch,
26
+ rearrange_micro_batches,
27
+ restore_dynamic_batch,
28
+ )
29
+
30
+
31
def test_seqlen_balancing():
    """Micro-batch rearrangement must be exactly invertible via get_reverse_idx."""
    tokens = torch.randint(low=0, high=10, size=(20, 100))
    mask = create_random_mask(
        input_ids=tokens, max_ratio_of_left_padding=0.1, max_ratio_of_valid_token=0.9, min_ratio_of_valid_token=0.5
    )
    proto = DataProto.from_single_dict({"input_ids": tokens, "attention_mask": mask})

    micro_batches, index_lists = rearrange_micro_batches(proto.batch, max_token_len=300)

    # Flatten the per-micro-batch index lists, invert the permutation, and
    # confirm the concatenated micro-batches map back onto the original batch.
    flat_indices = [i for chunk in index_lists for i in chunk]
    inverse = torch.tensor(get_reverse_idx(flat_indices))
    restored = torch.cat(micro_batches)[inverse]
    torch.testing.assert_close(restored, proto.batch)
48
+
49
+
50
def test_dynamic_batch():
    """prepare_dynamic_batch must be undone exactly by restore_dynamic_batch."""
    tokens = torch.randint(low=0, high=10, size=(20, 100))
    mask = create_random_mask(
        input_ids=tokens, max_ratio_of_left_padding=0.1, max_ratio_of_valid_token=0.9, min_ratio_of_valid_token=0.5
    )
    proto = DataProto.from_single_dict({"input_ids": tokens, "attention_mask": mask})

    micro_batches, index_lists = prepare_dynamic_batch(proto, max_token_len=300)

    # Re-concatenate the micro-batches, then undo the permutation.
    merged = torch.cat([mb.batch["input_ids"] for mb in micro_batches], dim=0)
    restored = restore_dynamic_batch(merged, index_lists)
    torch.testing.assert_close(restored, proto.batch["input_ids"])
62
+
63
+
64
def _worker(rank, world_size, init_method, max_token_len, use_same_dp, min_mb):
    """Per-rank body for the distributed rearrange_micro_batches tests.

    Spawned via torch.multiprocessing; each rank builds a different-sized
    random batch so that the natural micro-batch counts disagree across DP
    ranks — exactly what same_micro_num_in_dp / min_num_micro_batch reconcile.
    """
    # 1) init process group & CUDA
    get_torch_device().set_device(rank)
    dist.init_process_group(
        backend=get_nccl_backend(),
        init_method=init_method,
        world_size=world_size,
        rank=rank,
    )

    # 2) build a small random batch (each rank different length to force mismatch)
    torch.manual_seed(42 + rank)
    input_ids = torch.randint(0, 10, (20 + rank * 5, 100), device=f"{get_device_name()}:{rank}")
    attention_mask = create_random_mask(
        input_ids=input_ids,
        max_ratio_of_left_padding=0.1,
        max_ratio_of_valid_token=0.9,
        min_ratio_of_valid_token=0.5,
    )
    dp = {"input_ids": input_ids, "attention_mask": attention_mask}
    proto = DataProto.from_single_dict(dp)
    batch = proto.batch

    # 3) call rearrange_micro_batches with one of the two params under test
    micros, idx_lst = rearrange_micro_batches(
        batch,
        max_token_len=max_token_len,
        dp_group=dist.group.WORLD,
        same_micro_num_in_dp=use_same_dp,
        min_num_micro_batch=min_mb,
    )

    # 4) check the enforced counts
    # "Natural" local count: one micro-batch per max_token_len worth of valid
    # tokens, capped at the number of samples.
    seq_len_effective: torch.Tensor = batch["attention_mask"].sum(dim=1)
    total_seqlen = seq_len_effective.sum().item()
    local = min(len(seq_len_effective), ceildiv(total_seqlen, max_token_len))

    if min_mb is not None:
        # min_num_micro_batch only raises the count, never lowers it.
        expected = max(local, min_mb)
        assert len(micros) == expected
    if use_same_dp:
        # gather all local_counts
        counts = [torch.zeros(1, device=f"{get_device_name()}:{rank}") for _ in range(world_size)]
        counts[rank].fill_(local)
        dist.all_gather(counts, counts[rank])
        # Every rank must settle on the max natural count across the DP group.
        expected = max(int(c.item()) for c in counts)
        assert len(micros) == expected
    else:
        # if neither, we get the local natural count
        # NOTE(review): this else also runs when min_mb is set (it pairs with
        # the `if use_same_dp`), so it only passes while local >= min_mb for
        # the batch sizes used here — confirm this is intended.
        assert len(micros) == local

    # 5) reconstruction sanity: concat→reverse_idx→orig
    flat = torch.cat(micros, dim=0)
    idx = []
    for sub in idx_lst:
        idx.extend(sub)
    inv = get_reverse_idx(idx)
    inv = torch.tensor(inv, device=flat.device)
    reconstructed = flat[inv]
    torch.testing.assert_close(reconstructed, batch)

    dist.destroy_process_group()
126
+
127
+
128
def test_dataproto_split_uneven():
    """DataProto.split must handle remainders, oversized chunks and non-tensor data."""
    ids = torch.randint(low=0, high=10, size=(10, 5))
    mask = torch.ones(10, 5)
    proto = DataProto.from_single_dict({"input_ids": ids, "attention_mask": mask})

    # 10 items split by 3 -> chunk sizes [3, 3, 3, 1].
    chunks = proto.split(3)
    assert [len(c) for c in chunks] == [3, 3, 3, 1]

    rebuilt = DataProto.concat(chunks)
    torch.testing.assert_close(rebuilt.batch["input_ids"], proto.batch["input_ids"])
    torch.testing.assert_close(rebuilt.batch["attention_mask"], proto.batch["attention_mask"])

    # Split size equal to the length -> a single full chunk.
    chunks = proto.split(10)
    assert [len(c) for c in chunks] == [10]

    # Split size larger than the length -> still one chunk with everything.
    chunks = proto.split(15)
    assert [len(c) for c in chunks] == [10]

    # Non-tensor (object ndarray) data must survive split + concat.
    import numpy as np

    labels = np.array([f"label_{i}" for i in range(10)], dtype=object)
    proto_obj = DataProto.from_single_dict(
        {"input_ids": ids, "attention_mask": mask, "labels": labels}
    )

    chunks = proto_obj.split(3)
    assert [len(c) for c in chunks] == [3, 3, 3, 1]

    rebuilt = DataProto.concat(chunks)
    np.testing.assert_array_equal(
        rebuilt.non_tensor_batch["labels"], proto_obj.non_tensor_batch["labels"]
    )
180
+
181
+
182
def test_seqlen_balancing_distributed_params(tmp_path):
    """Spawn a 2-rank group and exercise min_num_micro_batch / same_micro_num_in_dp."""
    world_size = 2
    init_file = tmp_path / "dist_init"
    init_file.write_text("")  # file:// rendezvous requires the file to exist
    init_method = f"file://{init_file}"

    # One spawn per parameter under test:
    #   (same_micro_num_in_dp, min_num_micro_batch)
    scenarios = [
        (False, 4),    # min_num_micro_batch only
        (True, None),  # same_micro_num_in_dp only
    ]
    for use_same_dp, min_mb in scenarios:
        mp.spawn(
            _worker,
            args=(world_size, init_method, 300, use_same_dp, min_mb),
            nprocs=world_size,
            join=True,
        )
203
+
204
+
205
def test_group_balanced_partitions():
    """Group-level balancing must never split samples sharing a uid."""
    from verl.utils.seqlen_balancing import get_group_balanced_partitions

    # Four groups of four samples each, with distinct per-group sequence lengths:
    # uid 0 -> 100s, uid 1 -> 200s, uid 2 -> 150s, uid 3 -> 50s.
    seqlens = [100] * 4 + [200] * 4 + [150] * 4 + [50] * 4
    uids = [0] * 4 + [1] * 4 + [2] * 4 + [3] * 4

    partitions = get_group_balanced_partitions(seqlens, uids, k_partitions=2)

    assert len(partitions) == 2

    # Every index must be covered by the union of the partitions.
    covered = {i for part in partitions for i in part}
    assert covered == set(range(16))

    # All samples of any uid present in a partition must live in that partition.
    for part in partitions:
        members = set(part)
        for uid in {uids[i] for i in part}:
            group = [i for i, u in enumerate(uids) if u == uid]
            assert all(i in members for i in group), f"uid {uid} samples split across partitions"
235
+
236
+
237
def test_group_balanced_partitions_single_sample_groups():
    """Group balancing degenerates gracefully when each sample is its own group (n=1)."""
    from verl.utils.seqlen_balancing import get_group_balanced_partitions

    lengths = [100, 200, 150, 50, 300, 250]
    uids = list(range(6))

    partitions = get_group_balanced_partitions(lengths, uids, k_partitions=2)

    assert len(partitions) == 2
    covered = {i for part in partitions for i in part}
    assert covered == set(range(6))
252
+
253
+
254
def test_group_balanced_partitions_equal_size():
    """Group balancing across 4 partitions (simulating world_size=4)."""
    from verl.utils.seqlen_balancing import get_group_balanced_partitions

    # Eight 2-sample groups; expand per-group lengths/uids to per-sample lists.
    group_lengths = [100, 200, 150, 50, 300, 250, 180, 120]
    seqlens = [length for length in group_lengths for _ in range(2)]
    uids = [gid for gid in range(8) for _ in range(2)]

    partitions = get_group_balanced_partitions(seqlens, uids, k_partitions=4)

    assert len(partitions) == 4

    # Every index must be covered by the union of the partitions.
    covered = {i for part in partitions for i in part}
    assert covered == set(range(16))

    # No uid's samples may straddle two partitions.
    for part in partitions:
        members = set(part)
        for uid in {uids[i] for i in part}:
            group = [i for i, u in enumerate(uids) if u == uid]
            assert all(i in members for i in group)
code/RL_model/verl/verl_train/tests/utils/test_shared_memory.py ADDED
@@ -0,0 +1,260 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
import multiprocessing
import sys
import unittest
from multiprocessing import shared_memory

import torch

from verl.workers.rollout.vllm_rollout.utils import create_shared_memory, rebuild_shared_memory
22
+
23
+
24
class TestSharedMemory(unittest.TestCase):
    """Test cases for shared memory utility functions.

    These tests exercise ``create_shared_memory`` / ``rebuild_shared_memory``
    in-process. Reference lifetime matters throughout: tensors produced by
    ``torch.frombuffer`` hold views into ``shm.buf``, so they are explicitly
    deleted before the shared-memory handles go away.
    """

    def setUp(self):
        """Set up test fixtures before each test method."""
        # Use short unique names to avoid POSIX shared memory name length limits
        import uuid

        short_id = uuid.uuid4().hex[:8]
        self.test_name = f"shm_{short_id}"

    def tearDown(self):
        """Clean up shared memory after each test method."""
        # Note: We're relying on the OS to clean up shared memory
        # as we properly delete all references in the tests
        # NOTE(review): segments are never unlink()ed here, so on Linux the
        # backing /dev/shm files may persist until reboot — confirm this is
        # acceptable for CI environments.
        pass

    def test_create_shared_memory_new(self):
        """Test creating new shared memory with unique name."""
        size = 1024

        shm = create_shared_memory(size, self.test_name)

        # Verify shared memory object is created correctly
        self.assertIsNotNone(shm)
        # Note: shared memory may have system-dependent size rounding
        self.assertGreaterEqual(shm.size, size)
        self.assertEqual(shm.name, self.test_name)

        # Clean up - delete tensor references first
        del shm

    def test_create_shared_memory_attach_existing(self):
        """Test that create_shared_memory attaches to existing shared memory when FileExistsError occurs."""
        size = 2048

        # First, create shared memory
        shm1 = create_shared_memory(size, self.test_name)
        self.assertGreaterEqual(shm1.size, size)

        # Second call should attach to existing memory
        shm2 = create_shared_memory(size, self.test_name)

        # Verify we attached to the same shared memory
        self.assertIsNotNone(shm2)
        self.assertGreaterEqual(shm2.size, size)
        self.assertEqual(shm2.name, self.test_name)

        # Both should reference the same shared memory
        self.assertEqual(shm1.name, shm2.name)

        # Clean up
        del shm1, shm2

    def test_rebuild_shared_memory_default_dtype(self):
        """Test rebuilding tensor from shared memory with default dtype (uint8)."""
        size = 1024

        # Create and write to shared memory
        shm = create_shared_memory(size, self.test_name)
        test_data = torch.arange(size, dtype=torch.uint8)
        shm.buf[:size] = test_data.numpy().tobytes()

        # Rebuild tensor from shared memory
        tensor, _ = rebuild_shared_memory(self.test_name, size)

        # Verify tensor properties
        self.assertEqual(tensor.dtype, torch.uint8)
        self.assertEqual(len(tensor), size)

        # Verify data integrity
        # NOTE(review): `reconstructed` is read back from the same shm buffer
        # the rebuilt tensor maps, so this primarily checks that both views
        # agree byte-for-byte with what was written above.
        reconstructed = torch.frombuffer(shm.buf[:size], dtype=torch.uint8)
        self.assertTrue(torch.equal(tensor, reconstructed))

        # Clean up - delete references before closing
        del tensor, reconstructed

    def test_rebuild_shared_memory_custom_dtype(self):
        """Test rebuilding tensor from shared memory with custom dtype."""
        size = 256  # 256 bytes = 64 float32 values

        # Create and write to shared memory
        shm = create_shared_memory(size, self.test_name)
        test_data = torch.arange(64, dtype=torch.float32)
        shm.buf[:size] = test_data.numpy().tobytes()

        # Rebuild tensor with custom dtype
        tensor, _ = rebuild_shared_memory(self.test_name, size, dtype=torch.float32)

        # Verify tensor properties: byte count is reinterpreted per-dtype,
        # so 256 bytes yields 64 float32 elements.
        self.assertEqual(tensor.dtype, torch.float32)
        self.assertEqual(len(tensor), 64)

        # Verify data integrity
        reconstructed = torch.frombuffer(shm.buf[:size], dtype=torch.float32)
        self.assertTrue(torch.equal(tensor, reconstructed))

        # Clean up - delete references before closing
        del tensor, reconstructed

    def test_shared_memory_data_integrity(self):
        """Test that data remains intact between create and rebuild operations."""
        size = 512

        # Create test data with various patterns
        test_data = torch.randint(0, 256, (size,), dtype=torch.uint8)

        # Create shared memory and write data
        shm = create_shared_memory(size, self.test_name)
        shm.buf[:size] = test_data.numpy().tobytes()

        # Rebuild tensor
        tensor, _ = rebuild_shared_memory(self.test_name, size)

        # Verify data integrity against the original random payload
        reconstructed = torch.frombuffer(shm.buf[:size], dtype=torch.uint8)
        self.assertTrue(torch.equal(test_data, reconstructed))

        # Clean up - delete references before closing
        del tensor, reconstructed

    def test_shared_memory_different_dtypes(self):
        """Test shared memory operations with different tensor dtypes."""
        # (dtype, byte size, expected element count) — element count is
        # size // itemsize for each dtype.
        test_cases = [
            (torch.float32, 256, 64),  # 256 bytes / 4 bytes = 64 values
            (torch.float64, 256, 32),  # 256 bytes / 8 bytes = 32 values
            (torch.int32, 256, 64),  # 256 bytes / 4 bytes = 64 values
            (torch.int64, 256, 32),  # 256 bytes / 8 bytes = 32 values
            (torch.uint8, 256, 256),  # 256 bytes / 1 byte = 256 values
        ]

        for dtype, size, expected_len in test_cases:
            # Create test data
            test_data = torch.arange(expected_len, dtype=dtype)

            # Create shared memory and write data
            # NOTE(review): the same self.test_name is reused on every loop
            # iteration without closing the previous handle — relies on
            # create_shared_memory attaching to the existing segment.
            shm = create_shared_memory(size, self.test_name)
            shm.buf[:size] = test_data.numpy().tobytes()

            # Rebuild tensor
            tensor, _ = rebuild_shared_memory(self.test_name, size, dtype=dtype)

            # Verify properties and data
            self.assertEqual(tensor.dtype, dtype)
            self.assertEqual(len(tensor), expected_len)

            reconstructed = torch.frombuffer(shm.buf[:size], dtype=dtype)
            self.assertTrue(torch.equal(test_data, reconstructed))

            # Clean up - delete references before closing
            del tensor, reconstructed

    def test_shared_memory_multiple_operations(self):
        """Test multiple create/rebuild operations with the same name."""
        size = 512

        # First iteration
        test_data1 = torch.arange(size, dtype=torch.uint8)
        shm1 = create_shared_memory(size, self.test_name)
        shm1.buf[:size] = test_data1.numpy().tobytes()
        tensor1, _ = rebuild_shared_memory(self.test_name, size)
        reconstructed1 = torch.frombuffer(shm1.buf[:size], dtype=torch.uint8)
        self.assertTrue(torch.equal(test_data1, reconstructed1))
        del tensor1, reconstructed1, shm1

        # Second iteration with different data
        # (uint8 arithmetic wraps modulo 256 for values >= 128 * 2)
        test_data2 = torch.arange(size, dtype=torch.uint8) * 2
        shm2 = create_shared_memory(size, self.test_name)
        shm2.buf[:size] = test_data2.numpy().tobytes()
        tensor2, _ = rebuild_shared_memory(self.test_name, size)
        reconstructed2 = torch.frombuffer(shm2.buf[:size], dtype=torch.uint8)
        self.assertTrue(torch.equal(test_data2, reconstructed2))
        del tensor2, reconstructed2, shm2
197
+
198
+
199
+ # Module-level function for cross-process testing
200
+ def child_process_function(name, size, test_data_bytes):
201
+ """Child process function to rebuild and verify tensor."""
202
+ shm = None
203
+ tensor = None
204
+ test_data = None
205
+ try:
206
+ # Convert bytes back to tensor
207
+ test_data = torch.frombuffer(test_data_bytes, dtype=torch.uint8)
208
+
209
+ # Attach to shared memory
210
+ shm = shared_memory.SharedMemory(name=name)
211
+
212
+ # Rebuild tensor from shared memory
213
+ tensor = torch.frombuffer(shm.buf[:size], dtype=torch.uint8)
214
+
215
+ # Verify data integrity
216
+ assert torch.equal(test_data, tensor), "Data mismatch in child process"
217
+ return True
218
+ except Exception as e:
219
+ print(f"Error in child process: {e}")
220
+ return False
221
+ finally:
222
+ # Clean up shared memory in child process
223
+ # Delete all references first
224
+ del tensor, test_data
225
+ if shm is not None:
226
+ shm.close()
227
+ # Note: Don't unlink in child process, parent will clean up
228
+
229
+
230
class TestSharedMemoryIntegration(unittest.TestCase):
    """Integration tests for shared memory operations across process boundaries."""

    def test_cross_process_shared_memory(self):
        """Test shared memory can be created in one process and accessed in another."""
        size = 1024
        test_data = torch.arange(size, dtype=torch.uint8)

        # Create shared memory in parent process
        # NOTE(review): fixed segment name — a stale segment from a crashed
        # earlier run could collide; confirm create_shared_memory attaches in
        # that case.
        shm = create_shared_memory(size, "test_cross_proc")
        shm.buf[:size] = test_data.numpy().tobytes()

        # Convert tensor to bytes for passing to child process
        test_data_bytes = test_data.numpy().tobytes()

        # Start child process
        process = multiprocessing.Process(
            target=child_process_function, args=("test_cross_proc", size, test_data_bytes)
        )
        process.start()
        process.join(timeout=5)

        # Verify child process completed successfully.
        # A timeout leaves exitcode as None, which also fails this assertion.
        self.assertEqual(process.exitcode, 0, "Child process failed")

        # Clean up
        del shm
257
+
258
+
259
if __name__ == "__main__":
    # Allow running this test module directly: `python test_shared_memory.py`.
    unittest.main()
code/RL_model/verl/verl_train/tests/utils/test_special_linear_cross_entropy_tp.py ADDED
@@ -0,0 +1,514 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #
2
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ #
17
+
18
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
19
+ #
20
+ # Licensed under the Apache License, Version 2.0 (the "License");
21
+ # you may not use this file except in compliance with the License.
22
+ # You may obtain a copy of the License at
23
+ #
24
+ # http://www.apache.org/licenses/LICENSE-2.0
25
+ #
26
+ # Unless required by applicable law or agreed to in writing, software
27
+ # distributed under the License is distributed on an "AS IS" BASIS,
28
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
29
+ # See the License for the specific language governing permissions and
30
+ # limitations under the License.
31
+
32
+ import os
33
+
34
+ import torch
35
+ import torch.distributed as dist
36
+
37
+ try:
38
+ from verl.utils.kernel.linear_cross_entropy import linear_cross_entropy
39
+ except ImportError:
40
+ # FIXME: remove these manually included paths
41
+ import sys
42
+
43
+ sys.path.append(os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), "../../")))
44
+ finally:
45
+ from verl.utils.kernel.linear_cross_entropy import linear_cross_entropy
46
+
47
+ import verl.utils.torch_functional as verl_F
48
+
49
+ compute_entropy_from_logits = torch.compile(verl_F.entropy_from_logits, dynamic=True)
50
+
51
+ MAX_TEST_CASES = os.environ.get("MAX_TEST_CASES", 5)
52
+ VERIFY_TORCH_SELF = os.environ.get("VERIFY_TORCH_SELF", False)
53
+ LOW_MEMORY = os.environ.get("LOW_MEMORY", False)
54
+ LOW_MEMORY_DIV_FACTOR = os.environ.get("LOW_MEMORY_DIV_FACTOR", 16)
55
+
56
+
57
+ def run_torch_entropy(
58
+ hidden: torch.Tensor, weight: torch.Tensor, labels: torch.Tensor, temperature: float, reduction="none"
59
+ ) -> list[torch.Tensor]:
60
+ # [num_tokens, vocab_size]
61
+ if len(hidden.shape) > 2:
62
+ hidden = hidden.view(-1, hidden.shape[-1]) # [num_tokens, hidden_size]
63
+ if len(labels.shape) > 1:
64
+ labels = labels.view(-1)
65
+ logits = torch.matmul(
66
+ hidden.to(torch.float32),
67
+ weight.to(torch.float32) if weight.size(0) == hidden.size(1) else weight.T.to(torch.float32),
68
+ )
69
+ logits /= temperature
70
+ pd = torch.nn.functional.softmax(logits, dim=-1) # [num_tokens, vocab_size]
71
+ entropy_a = torch.logsumexp(logits, dim=-1) # [num_tokens]
72
+ entropy_b = torch.sum(pd * logits, dim=-1) # [num_tokens]
73
+ entropy = entropy_a - entropy_b
74
+ logprobs = torch.nn.functional.cross_entropy(logits, labels, reduction=reduction) # [num_tokens]
75
+ logprobs = torch.neg(logprobs)
76
+ return logprobs, entropy
77
+
78
+
79
+ class TorchEntropyTP(torch.autograd.Function):
80
+ """
81
+ it is used for testing the correctness of the kernel
82
+ it is not efficient and is not recommended to use in practice
83
+ """
84
+
85
+ @staticmethod
86
+ def forward(
87
+ ctx,
88
+ hidden: torch.Tensor,
89
+ weight: torch.Tensor,
90
+ labels: torch.Tensor,
91
+ temperature: float,
92
+ dist_process_group: torch.distributed.ProcessGroup,
93
+ ):
94
+ # weight has shape [vocab_size, hidden_size], hidden has shape [num_tokens, hidden_size]
95
+ ctx.original_hidden_shape = hidden.shape
96
+ if len(hidden.shape) > 2:
97
+ hidden = hidden.view(-1, hidden.shape[-1]) # [num_tokens, hidden_size]
98
+ if len(labels.shape) > 1:
99
+ labels = labels.view(-1)
100
+
101
+ logits = torch.matmul(hidden.to(torch.float32), weight.to(torch.float32).T) # [num_tokens, vocab_size]
102
+ logits /= temperature
103
+ whole_logits = torch.empty(
104
+ (logits.shape[0], logits.shape[1] * dist.get_world_size(dist_process_group)),
105
+ dtype=logits.dtype,
106
+ device=logits.device,
107
+ )
108
+ whole_logits_ref = [
109
+ whole_logits[:, i * logits.shape[1] : (i + 1) * logits.shape[1]]
110
+ for i in range(dist.get_world_size(dist_process_group))
111
+ ]
112
+ dist.all_gather(whole_logits_ref, logits, group=dist_process_group)
113
+
114
+ pd = torch.nn.functional.softmax(whole_logits, dim=-1)
115
+ entropy_a = torch.logsumexp(whole_logits, dim=-1) # [num_tokens]
116
+ entropy_b = torch.sum(pd * whole_logits, dim=-1) # [num_tokens]
117
+ entropy = entropy_a - entropy_b
118
+
119
+ logprobs = torch.nn.functional.cross_entropy(whole_logits, labels, reduction="none")
120
+ logprobs = torch.neg(logprobs)
121
+
122
+ ctx.save_for_backward(hidden, weight, labels, whole_logits, entropy_b)
123
+ ctx.dist_process_group = dist_process_group
124
+ ctx.temperature = temperature
125
+ return logprobs, entropy
126
+
127
+ @staticmethod
128
+ def backward(ctx, g_logprobs: torch.Tensor, g_entropy: torch.Tensor):
129
+ hidden, weight, labels, whole_logits, entropy_b = ctx.saved_tensors
130
+ dist_process_group = ctx.dist_process_group
131
+ temperature = ctx.temperature
132
+ batch_size, hidden_size = hidden.shape
133
+ vocab_size, hidden_size = weight.shape
134
+ rank = dist.get_rank(dist_process_group)
135
+
136
+ # Compute softmax probabilities
137
+ maximum, _ = torch.max(whole_logits, dim=-1, keepdim=True)
138
+ exp_logits = torch.exp(whole_logits - maximum)
139
+ accumulate = exp_logits.sum(dim=-1, keepdim=True)
140
+ pd = exp_logits / accumulate
141
+
142
+ # Gradient for entropy
143
+ # entropy = entropy_a - entropy_b
144
+ # entropy_a = log(sum(exp(logits)))
145
+ # entropy_b = sum(pd * logits)
146
+ # d_entropy_a/d_logits = pd
147
+ # d_entropy_b/d_logits = pd * (logits - b.unsqueeze(1) + 1)
148
+ # d_entropy/d_logits = d_entropy_a - d_entropy_b
149
+ # d_entropy/d_logits = pd - pd * (logits - b.unsqueeze(1) + 1)
150
+ # d_entropy/d_logits = -pd * (logits - b.unsqueeze(1))
151
+ d_logits_entropy = g_entropy.unsqueeze(1) * (-pd * (whole_logits - entropy_b.unsqueeze(1)))
152
+
153
+ # Gradient for logprobs
154
+ # logprobs = -cross_entropy = -log(pd[labels])
155
+ # d_logprobs/d_logits = (pd - one_hot(labels))
156
+ one_hot = torch.zeros_like(whole_logits)
157
+ one_hot.scatter_(1, labels.unsqueeze(1), 1)
158
+ g_logprobs = torch.neg(g_logprobs)
159
+ d_logits_logprobs = g_logprobs.unsqueeze(1) * (pd - one_hot)
160
+ # NOTE: This will lead to wrong result
161
+ # d_logits_logprobs = g_logprobs.unsqueeze(1) * (pd - 1) * one_hot
162
+
163
+ # Combine gradients
164
+ d_logits = d_logits_entropy + d_logits_logprobs
165
+ d_logits /= temperature
166
+
167
+ # Get local slice of gradients
168
+ local_d_logits = d_logits[:, rank * vocab_size : (rank + 1) * vocab_size]
169
+
170
+ # Compute gradients for hidden and weight
171
+ d_hidden = torch.matmul(local_d_logits, weight.to(torch.float32))
172
+ d_weight = torch.matmul(local_d_logits.T, hidden.to(torch.float32))
173
+ d_hidden = d_hidden.view(ctx.original_hidden_shape)
174
+
175
+ return d_hidden, d_weight, None, None, None
176
+
177
+
178
+ run_torch_entropy_tp = TorchEntropyTP.apply
179
+
180
+
181
+ class TestLinearCrossEntropy_TensorParallel:
182
+ def __init__(self):
183
+ dist.init_process_group(backend="nccl")
184
+ self.group = dist.group.WORLD
185
+
186
+ self.local_rank = dist.get_rank(self.group)
187
+ self.world_size = dist.get_world_size(self.group)
188
+ device = torch.device(f"cuda:{self.local_rank}")
189
+ torch.cuda.set_device(device)
190
+ print(f"[INFO]: Local rank: {self.local_rank}, World size: {self.world_size}")
191
+
192
+ def initialize(self, test_case_idx: int, temperature: float = 1.5):
193
+ self.test_case_idx = test_case_idx
194
+ self.temperature = temperature
195
+
196
+ def shutdown(self):
197
+ dist.destroy_process_group()
198
+
199
+ def cleanup(self):
200
+ torch.cuda.empty_cache()
201
+ torch.cuda.reset_peak_memory_stats()
202
+ import gc
203
+
204
+ gc.collect()
205
+ torch.cuda.synchronize()
206
+
207
+ def generate_hyper(self):
208
+ global LOW_MEMORY, LOW_MEMORY_DIV_FACTOR, MAX_TEST_CASES
209
+
210
+ self.dtype = torch.bfloat16
211
+ if self.test_case_idx == 0:
212
+ self.batch_size = 1
213
+ self.num_tokens = 1937
214
+ self.hidden_size = 3584
215
+ self.vocab_size = 152064
216
+ elif self.test_case_idx == 1:
217
+ self.batch_size = 1
218
+ self.num_tokens = 2169
219
+ self.hidden_size = 896
220
+ self.vocab_size = 151936
221
+ elif self.test_case_idx == 2:
222
+ self.batch_size = 1
223
+ self.num_tokens = 1530
224
+ self.hidden_size = 2048
225
+ self.vocab_size = 32256
226
+ elif self.test_case_idx == 3:
227
+ self.batch_size = 1
228
+ self.num_tokens = 1388
229
+ self.hidden_size = 4096
230
+ self.vocab_size = 102400
231
+ elif self.test_case_idx == 4:
232
+ self.batch_size = 1
233
+ self.num_tokens = 8192
234
+ self.hidden_size = 4096
235
+ self.vocab_size = 102400
236
+ else:
237
+ raise ValueError(f"Invalid test case index: {self.test_case_idx}")
238
+ if LOW_MEMORY:
239
+ self.vocab_size = int(self.vocab_size / LOW_MEMORY_DIV_FACTOR)
240
+ assert MAX_TEST_CASES <= 5, "MAX_TEST_CASES should be less than or equal to 5."
241
+
242
+ def generate_forward_inputs(self):
243
+ hidden = (
244
+ torch.empty((self.batch_size, self.num_tokens, self.hidden_size), dtype=self.dtype, device="cuda")
245
+ .uniform_(-0.5, 0.5)
246
+ .requires_grad_()
247
+ )
248
+ weight = (
249
+ torch.empty((self.vocab_size, self.hidden_size), dtype=self.dtype, device="cuda")
250
+ .uniform_(-0.5, 0.5)
251
+ .requires_grad_()
252
+ )
253
+ labels = torch.randint(0, self.vocab_size, (self.batch_size, self.num_tokens), device="cuda")
254
+ return hidden, weight, labels
255
+
256
+ def generate_backward_inputs(self):
257
+ g_entropy = torch.empty((self.num_tokens,), dtype=self.dtype, device="cuda").uniform_(-0.5, 0.5)
258
+ g_logprobs = torch.empty((self.num_tokens,), dtype=self.dtype, device="cuda").uniform_(-1, 1)
259
+ return g_entropy, g_logprobs
260
+
261
+ def verify_torch_itself(self, iterations: int = 5):
262
+ self.cleanup()
263
+ self.generate_hyper()
264
+
265
+ for i in range(iterations):
266
+ hidden, weight, labels = self.generate_forward_inputs()
267
+
268
+ # NOTE: we need to manually synchronize hidden and labels among Process Group
269
+ dist.broadcast(hidden, src=0, group=self.group)
270
+ dist.broadcast(labels, src=0, group=self.group)
271
+
272
+ # forward pass
273
+ # Create a tensor to hold the gathered weights from all ranks
274
+ # weight has shape [vocab_size, hidden_size]
275
+ # We want to gather along the first dimension to get [vocab_size * world_size, hidden_size]
276
+
277
+ # Create a single contiguous tensor to hold all gathered weights
278
+ whole_weight = torch.empty(
279
+ (self.vocab_size * self.world_size, self.hidden_size), dtype=weight.dtype, device=weight.device
280
+ )
281
+
282
+ # Create views into the tensor for each rank's portion
283
+ whole_weight_views = [
284
+ whole_weight[i * self.vocab_size : (i + 1) * self.vocab_size] for i in range(self.world_size)
285
+ ]
286
+
287
+ # Perform all_gather operation using the views
288
+ dist.all_gather(whole_weight_views, weight, group=self.group)
289
+
290
+ # Set requires_grad for autograd
291
+ whole_weight.requires_grad_()
292
+
293
+ (single_logprobs, single_entropy) = run_torch_entropy(hidden, whole_weight, labels, self.temperature)
294
+
295
+ (tp_logprobs, tp_entropy) = run_torch_entropy_tp(hidden, weight, labels, self.temperature, self.group)
296
+
297
+ torch.testing.assert_close(single_logprobs, tp_logprobs, atol=1e-4, rtol=1e-4)
298
+ torch.testing.assert_close(single_entropy, tp_entropy, atol=1e-4, rtol=1e-4)
299
+
300
+ # backward pass
301
+ g_entropy, g_logprobs = self.generate_backward_inputs()
302
+ # NOTE: we need to manually synchronize g_entropy and g_logprobs among Process Group
303
+ dist.broadcast(g_entropy, src=0, group=self.group)
304
+ dist.broadcast(g_logprobs, src=0, group=self.group)
305
+
306
+ (single_d_hidden, single_d_weight) = torch.autograd.grad(
307
+ (single_entropy, single_logprobs), (hidden, whole_weight), (g_entropy, g_logprobs), retain_graph=False
308
+ )
309
+
310
+ (tp_d_hidden, tp_d_weight) = torch.autograd.grad(
311
+ (tp_entropy, tp_logprobs), (hidden, weight), (g_entropy, g_logprobs), retain_graph=False
312
+ )
313
+ # NOTE: all-reduce on hidden is conducted outside the kernel
314
+ dist.all_reduce(tp_d_hidden, op=dist.ReduceOp.SUM, group=self.group)
315
+
316
+ torch.testing.assert_close(tp_d_hidden, single_d_hidden, atol=1e-2, rtol=1e-4)
317
+ # Extract the corresponding slice from single_d_weight for comparison
318
+ # tp_d_weight has shape [vocab_size, hidden_size]
319
+ # single_d_weight has shape [vocab_size * world_size, hidden_size]
320
+ torch.testing.assert_close(
321
+ tp_d_weight,
322
+ single_d_weight[self.local_rank * self.vocab_size : (self.local_rank + 1) * self.vocab_size],
323
+ atol=1e-2,
324
+ rtol=1e-4,
325
+ )
326
+
327
+ # atol=1e-3, rtol=1e-4)
328
+ if self.local_rank == 0:
329
+ print("[PASS] torch TP correctness is verified")
330
+
331
+ def check_torch_storage(self):
332
+ self.cleanup()
333
+ self.generate_hyper()
334
+
335
+ hidden, weight, labels = self.generate_forward_inputs()
336
+
337
+ # NOTE: we need to manually synchronize hidden and labels among Process Group
338
+ dist.broadcast(hidden, src=0, group=self.group)
339
+ dist.broadcast(labels, src=0, group=self.group)
340
+
341
+ torch.cuda.reset_peak_memory_stats()
342
+ (tp_logprobs, tp_entropy) = run_torch_entropy_tp(hidden, weight, labels, self.temperature, self.group)
343
+ torch.cuda.synchronize()
344
+ forward_max_memory = torch.cuda.max_memory_allocated() / 1024 / 1024
345
+
346
+ g_entropy, g_logprobs = self.generate_backward_inputs()
347
+ # NOTE: we need to manually synchronize g_entropy and g_logprobs among Process Group
348
+ dist.broadcast(g_entropy, src=0, group=self.group)
349
+ dist.broadcast(g_logprobs, src=0, group=self.group)
350
+
351
+ torch.cuda.reset_peak_memory_stats()
352
+ (d_tp_hidden, d_tp_weight) = torch.autograd.grad(
353
+ (tp_entropy, tp_logprobs), (hidden, weight), (g_entropy, g_logprobs), retain_graph=False
354
+ )
355
+ torch.cuda.synchronize()
356
+ backward_max_memory = torch.cuda.max_memory_allocated() / 1024 / 1024
357
+ # NOTE: all-reduce on hidden is conducted outside the kernel
358
+ dist.all_reduce(d_tp_hidden, op=dist.ReduceOp.SUM, group=self.group)
359
+
360
+ if self.local_rank == 0:
361
+ print(f"[INFO]: Torch Forward pass peak memory: {forward_max_memory:.2f} MB")
362
+ print(f"[INFO]: Torch Backward pass peak memory: {backward_max_memory:.2f} MB")
363
+
364
+ def verify_kernel_correctness(self, iterations: int = 5):
365
+ self.cleanup()
366
+ self.generate_hyper()
367
+
368
+ torch_forward_latency = list()
369
+ torch_backward_latency = list()
370
+ kernel_forward_latency = list()
371
+ kernel_backward_latency = list()
372
+
373
+ start_event = torch.cuda.Event(enable_timing=True)
374
+ end_event = torch.cuda.Event(enable_timing=True)
375
+
376
+ for i in range(iterations):
377
+ hidden, weight, labels = self.generate_forward_inputs()
378
+
379
+ # NOTE: we need to manually synchronize hidden and labels among Process Group
380
+ dist.broadcast(hidden, src=0, group=self.group)
381
+ dist.broadcast(labels, src=0, group=self.group)
382
+
383
+ start_event.record()
384
+ (torch_logprobs, torch_entropy) = run_torch_entropy_tp(hidden, weight, labels, self.temperature, self.group)
385
+ end_event.record()
386
+ torch.cuda.synchronize()
387
+ torch_forward_latency.append(start_event.elapsed_time(end_event))
388
+
389
+ start_event.record()
390
+ (kernel_logprobs, kernel_entropy) = linear_cross_entropy(
391
+ hidden, weight, labels, self.temperature, "none", self.group
392
+ )
393
+ end_event.record()
394
+ torch.cuda.synchronize()
395
+ kernel_forward_latency.append(start_event.elapsed_time(end_event))
396
+
397
+ torch.testing.assert_close(torch_logprobs, kernel_logprobs, atol=1e-1, rtol=1e-2)
398
+ torch.testing.assert_close(torch_entropy, kernel_entropy, atol=1e-1, rtol=1e-2)
399
+
400
+ # backward pass
401
+ g_entropy, g_logprobs = self.generate_backward_inputs()
402
+ # NOTE: we need to manually synchronize g_entropy and g_logprobs among Process Group
403
+ dist.broadcast(g_entropy, src=0, group=self.group)
404
+ dist.broadcast(g_logprobs, src=0, group=self.group)
405
+
406
+ start_event.record()
407
+ (torch_d_hidden, torch_d_weight) = torch.autograd.grad(
408
+ (torch_entropy, torch_logprobs), (hidden, weight), (g_entropy, g_logprobs), retain_graph=False
409
+ )
410
+ end_event.record()
411
+ torch.cuda.synchronize()
412
+ torch_backward_latency.append(start_event.elapsed_time(end_event))
413
+ # NOTE: all-reduce on hidden is conducted outside the kernel
414
+ dist.all_reduce(torch_d_hidden, op=dist.ReduceOp.SUM, group=self.group)
415
+
416
+ start_event.record()
417
+ (kernel_d_hidden, kernel_d_weight) = torch.autograd.grad(
418
+ (kernel_entropy, kernel_logprobs), (hidden, weight), (g_entropy, g_logprobs), retain_graph=False
419
+ )
420
+ end_event.record()
421
+ torch.cuda.synchronize()
422
+ kernel_backward_latency.append(start_event.elapsed_time(end_event))
423
+ # NOTE: all-reduce on hidden is conducted outside the kernel
424
+ dist.all_reduce(kernel_d_hidden, op=dist.ReduceOp.SUM, group=self.group)
425
+
426
+ torch.testing.assert_close(torch_d_hidden, kernel_d_hidden, atol=2e-2, rtol=4e-2)
427
+ torch.testing.assert_close(torch_d_weight, kernel_d_weight, atol=2e-2, rtol=4e-2)
428
+
429
+ # remove first latency
430
+ torch_forward_latency = torch_forward_latency[1:]
431
+ torch_backward_latency = torch_backward_latency[1:]
432
+ kernel_forward_latency = kernel_forward_latency[1:]
433
+ kernel_backward_latency = kernel_backward_latency[1:]
434
+
435
+ if self.local_rank == 0:
436
+ print("\n[PASS]: Verified kernel forward & backward correctness.")
437
+
438
+ print(
439
+ f"[INFO]: Forward pass: Torch implementation average time: "
440
+ f"{sum(torch_forward_latency) / len(torch_forward_latency):.2f} ms"
441
+ )
442
+ print(
443
+ f"[INFO]: Backward pass: torch implementation average time: "
444
+ f"{sum(torch_backward_latency) / len(torch_backward_latency):.2f} ms"
445
+ )
446
+ print(
447
+ f"[INFO]: Forward pass: Kernel implementation average time: "
448
+ f"{sum(kernel_forward_latency) / len(kernel_forward_latency):.2f} ms"
449
+ )
450
+ print(
451
+ f"[INFO]: Backward pass: kernel implementation average time: "
452
+ f"{sum(kernel_backward_latency) / len(kernel_backward_latency):.2f} ms"
453
+ )
454
+
455
+ def check_kernel_storage(self):
456
+ self.cleanup()
457
+ self.generate_hyper()
458
+
459
+ hidden, weight, labels = self.generate_forward_inputs()
460
+
461
+ # NOTE: we need to manually synchronize hidden and labels among Process Group
462
+ dist.broadcast(hidden, src=0, group=self.group)
463
+ dist.broadcast(labels, src=0, group=self.group)
464
+
465
+ torch.cuda.reset_peak_memory_stats()
466
+ (kernel_logprobs, kernel_entropy) = linear_cross_entropy(
467
+ hidden, weight, labels, self.temperature, "none", self.group
468
+ )
469
+ torch.cuda.synchronize()
470
+ kernel_max_memory = torch.cuda.max_memory_allocated() / 1024 / 1024
471
+
472
+ g_entropy, g_logprobs = self.generate_backward_inputs()
473
+ # NOTE: we need to manually synchronize g_entropy and g_logprobs among Process Group
474
+ dist.broadcast(g_entropy, src=0, group=self.group)
475
+ dist.broadcast(g_logprobs, src=0, group=self.group)
476
+
477
+ torch.cuda.reset_peak_memory_stats()
478
+ (d_kernel_hidden, d_kernel_weight) = torch.autograd.grad(
479
+ (kernel_entropy, kernel_logprobs), (hidden, weight), (g_entropy, g_logprobs), retain_graph=False
480
+ )
481
+ torch.cuda.synchronize()
482
+ kernel_backward_max_memory = torch.cuda.max_memory_allocated() / 1024 / 1024
483
+ # NOTE: all-reduce on hidden is conducted outside the kernel
484
+ dist.all_reduce(d_kernel_hidden, op=dist.ReduceOp.SUM, group=self.group)
485
+
486
+ if self.local_rank == 0:
487
+ print(f"[INFO]: Kernel Forward pass peak memory: {kernel_max_memory:.2f} MB")
488
+ print(f"[INFO]: Kernel Backward pass peak memory: {kernel_backward_max_memory:.2f} MB")
489
+
490
+
491
+ if __name__ == "__main__":
492
+ # TP command: torchrun --standalone --nnodes=1 --nproc-per-node=2 tests/kernels/test_linear_cross_entropy_tp.py
493
+
494
+ # Check if running with torchrun (distributed mode)
495
+ assert int(os.environ["WORLD_SIZE"]) > 1, (
496
+ "[ERROR]: This test is designed to run in distributed mode with torchrun. Please use torchrun to "
497
+ "execute this script."
498
+ )
499
+ torch.manual_seed(233376 + int(os.environ.get("RANK", 0)))
500
+
501
+ # set_backward_method(BackwardEnum._Total_Fuse_MN)
502
+ # set_backward_method(BackwardEnum._Split_Dlogits_N)
503
+
504
+ test = TestLinearCrossEntropy_TensorParallel()
505
+ for test_case_idx in range(MAX_TEST_CASES):
506
+ print(f"[INFO] Running test case {test_case_idx}")
507
+ test.initialize(test_case_idx)
508
+ if VERIFY_TORCH_SELF:
509
+ test.verify_torch_itself()
510
+ test.check_torch_storage()
511
+ test.verify_kernel_correctness()
512
+ test.check_kernel_storage()
513
+
514
+ test.shutdown()
code/RL_model/verl/verl_train/tests/utils/test_special_mstx_profile.py ADDED
@@ -0,0 +1,274 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import unittest
16
+ from unittest.mock import MagicMock, patch
17
+
18
+ from verl.utils.profiler.config import NPUToolConfig, ProfilerConfig
19
+ from verl.utils.profiler.mstx_profile import NPUProfiler
20
+ from verl.utils.profiler.profile import DistProfiler
21
+
22
+
23
class TestNPUProfilerInitialization(unittest.TestCase):
    """Construction-time checks for DistProfiler with the NPU (mstx) tool."""

    def setUp(self):
        # _define_count is class-level state shared by all NPUProfiler instances;
        # reset it so tests stay independent.
        NPUProfiler._define_count = 0

    def test_init_with_default_config(self):
        profiler = DistProfiler(rank=0, config=ProfilerConfig(tool="npu"), tool_config=NPUToolConfig())
        self.assertFalse(profiler.check_enable())

    def test_init_with_disabled_config(self):
        profiler = DistProfiler(rank=0, config=ProfilerConfig(enable=False, tool="npu"), tool_config=NPUToolConfig())
        self.assertFalse(profiler.check_enable())

    def test_init_with_all_ranks_true(self):
        cfg = ProfilerConfig(enable=True, all_ranks=True, tool="npu")
        profiler = DistProfiler(rank=0, config=cfg, tool_config=NPUToolConfig())
        self.assertTrue(profiler.check_this_rank())

    def test_init_with_ranks_list(self):
        cfg = ProfilerConfig(enable=True, ranks=[1, 2], tool="npu")
        profiler = DistProfiler(rank=1, config=cfg, tool_config=NPUToolConfig())
        self.assertTrue(profiler.check_this_rank())

    def test_init_with_rank_not_in_ranks(self):
        cfg = ProfilerConfig(enable=True, ranks=[1, 2], tool="npu")
        profiler = DistProfiler(rank=3, config=cfg, tool_config=NPUToolConfig())
        self.assertFalse(profiler.check_this_rank())
56
+
57
+
58
class TestNPUProfilerStart(unittest.TestCase):
    """Behavior of DistProfiler.start() for the NPU (mstx) backend."""

    def setUp(self):
        # Shared class-level counter; reset between tests.
        NPUProfiler._define_count = 0
        self.config = ProfilerConfig(enable=True, ranks=[0], tool="npu")
        self.tool_config = NPUToolConfig(discrete=False)

    @patch("verl.utils.profiler.mstx_profile.get_npu_profiler")
    def test_start_when_enabled_and_this_rank(self, get_profiler_mock):
        profiler = DistProfiler(rank=0, config=self.config, tool_config=self.tool_config)
        profiler.start(role="worker", profile_step="1")
        self.assertTrue(profiler.check_this_step())
        self.assertEqual(NPUProfiler._define_count, 1)
        get_profiler_mock.assert_called_once()

    @patch("verl.utils.profiler.mstx_profile.get_npu_profiler")
    def test_start_when_not_this_rank(self, get_profiler_mock):
        # Rank 1 is not in ranks=[0], so start() must be a no-op.
        profiler = DistProfiler(rank=1, config=self.config, tool_config=self.tool_config)
        profiler.start()
        self.assertFalse(profiler.check_this_step())
        self.assertEqual(NPUProfiler._define_count, 0)
        get_profiler_mock.assert_not_called()

    @patch("verl.utils.profiler.mstx_profile.get_npu_profiler")
    def test_start_discrete_mode_does_not_increase_count(self, get_profiler_mock):
        discrete_cfg = NPUToolConfig(discrete=True)
        profiler = DistProfiler(rank=0, config=self.config, tool_config=discrete_cfg)
        profiler.start()
        self.assertEqual(NPUProfiler._define_count, 0)
        get_profiler_mock.assert_not_called()

    @patch("verl.utils.profiler.mstx_profile.get_npu_profiler")
    def test_multiple_start_calls_do_not_increase_count(self, get_profiler_mock):
        profiler = DistProfiler(rank=0, config=self.config, tool_config=self.tool_config)
        for _ in range(2):
            profiler.start()
        # Only the first start() should define and launch a profiler.
        self.assertEqual(NPUProfiler._define_count, 1)
        get_profiler_mock.assert_called_once()
95
+
96
+
97
class TestNPUProfilerStartStopInteraction(unittest.TestCase):
    """start()/stop() interplay and the shared class-level define count."""

    def setUp(self):
        NPUProfiler._define_count = 0
        self.config = ProfilerConfig(enable=True, ranks=[0], tool="npu")
        self.tool_config = NPUToolConfig(discrete=False)

    @patch("verl.utils.profiler.mstx_profile.get_npu_profiler")
    def test_start_stop_cycle(self, get_profiler_mock):
        npu_profile = MagicMock()
        get_profiler_mock.return_value = npu_profile

        profiler = DistProfiler(rank=0, config=self.config, tool_config=self.tool_config)
        profiler.start()
        self.assertEqual(NPUProfiler._define_count, 1)
        self.assertEqual(npu_profile.start.call_count, 1)

        profiler.stop()
        # stop() steps and stops the underlying profiler and releases the count.
        self.assertEqual(NPUProfiler._define_count, 0)
        self.assertEqual(npu_profile.step.call_count, 1)
        self.assertEqual(npu_profile.stop.call_count, 1)

    @patch("verl.utils.profiler.mstx_profile.get_npu_profiler")
    def test_multiple_instances_share_define_count(self, get_profiler_mock):
        npu_profile = MagicMock()
        get_profiler_mock.return_value = npu_profile

        first = DistProfiler(rank=0, config=self.config, tool_config=self.tool_config)
        second = DistProfiler(rank=0, config=self.config, tool_config=self.tool_config)
        first.start()
        second.start()
        # The second instance sees the count already taken and does nothing.
        self.assertEqual(NPUProfiler._define_count, 1)
        self.assertEqual(npu_profile.start.call_count, 1)
        first.stop()
        self.assertEqual(NPUProfiler._define_count, 0)
130
+
131
+
132
class TestNPUProfilerAnnotate(unittest.TestCase):
    """Tests for the DistProfiler.annotate decorator backed by the NPU (mstx) tool."""

    def setUp(self):
        self.config = ProfilerConfig(enable=True, all_ranks=True, tool="npu")
        self.tool_config = NPUToolConfig(discrete=False)
        self.rank = 0

    def test_annotate_decorator_applied_correctly(self):
        # Non-discrete mode: annotate only marks a range; no per-call profiler.
        mock_worker = MagicMock()
        mock_worker.profiler = DistProfiler(rank=self.rank, config=self.config, tool_config=self.tool_config)
        # Manually set private attribute for testing annotation in active step
        mock_worker.profiler._this_step = True

        mock_mark_range = "mocked_range_handle"

        with (
            patch("verl.utils.profiler.mstx_profile.mark_start_range") as mock_start_patch,
            patch("verl.utils.profiler.mstx_profile.mark_end_range") as mock_end_patch,
        ):
            mock_start_patch.return_value = mock_mark_range

            with patch("verl.utils.profiler.mstx_profile.get_npu_profiler") as mock_get_profiler:
                decorator = mock_worker.profiler.annotate(message="test")

                @decorator
                def test_func(self, *args, **kwargs):
                    return "result"

                result = test_func(mock_worker)

                # Range marked around the call; no profiler instantiated.
                self.assertEqual(result, "result")
                mock_start_patch.assert_called_once_with(message="test")
                mock_end_patch.assert_called_once_with(mock_mark_range)
                mock_get_profiler.assert_not_called()

    def test_annotate_when_profiler_disabled(self):
        # With enable=False the decorator must be a pure pass-through.
        disabled_config = ProfilerConfig(enable=False, tool="npu")
        mock_worker = MagicMock()
        mock_worker.profiler = DistProfiler(rank=self.rank, config=disabled_config, tool_config=self.tool_config)

        with (
            patch("verl.utils.profiler.mstx_profile.mark_start_range") as mock_start_patch,
            patch("verl.utils.profiler.mstx_profile.mark_end_range") as mock_end_patch,
            patch("verl.utils.profiler.mstx_profile.get_npu_profiler") as mock_get_profiler,
        ):
            decorator = mock_worker.profiler.annotate(message="test")

            @decorator
            def test_func(self, *args, **kwargs):
                return "result"

            result = test_func(mock_worker)

            self.assertEqual(result, "result")
            mock_start_patch.assert_not_called()
            mock_end_patch.assert_not_called()
            mock_get_profiler.assert_not_called()

    def test_annotate_when_this_step_disabled(self):
        # Outside an active profiling step, annotate must also be a no-op.
        mock_worker = MagicMock()
        mock_worker.profiler = DistProfiler(rank=self.rank, config=self.config, tool_config=self.tool_config)
        mock_worker.profiler._this_step = False

        with (
            patch("verl.utils.profiler.mstx_profile.mark_start_range") as mock_start_patch,
            patch("verl.utils.profiler.mstx_profile.mark_end_range") as mock_end_patch,
            patch("verl.utils.profiler.mstx_profile.get_npu_profiler") as mock_get_profiler,
        ):
            decorator = mock_worker.profiler.annotate(message="test")

            @decorator
            def test_func(self, *args, **kwargs):
                return "result"

            result = test_func(mock_worker)

            self.assertEqual(result, "result")
            mock_start_patch.assert_not_called()
            mock_end_patch.assert_not_called()
            mock_get_profiler.assert_not_called()

    def test_annotate_discrete_mode_enabled(self):
        # Discrete mode: each annotated call gets its own profiler start/step/stop
        # in addition to the range markers.
        discrete_tool_config = NPUToolConfig(discrete=True)
        mock_worker = MagicMock()
        mock_worker.profiler = DistProfiler(rank=self.rank, config=self.config, tool_config=discrete_tool_config)
        mock_worker.profiler._this_step = True

        mock_mark_range = "mocked_range_handle"
        mock_profile_npu = MagicMock()

        with (
            patch("verl.utils.profiler.mstx_profile.mark_start_range") as mock_start_patch,
            patch("verl.utils.profiler.mstx_profile.mark_end_range") as mock_end_patch,
            patch("verl.utils.profiler.mstx_profile.get_npu_profiler") as mock_get_profiler,
        ):
            mock_start_patch.return_value = mock_mark_range
            mock_get_profiler.return_value = mock_profile_npu
            decorator = mock_worker.profiler.annotate(message="test", role="test_role")

            @decorator
            def test_func(self, *args, **kwargs):
                return "result"

            result = test_func(mock_worker)

            self.assertEqual(result, "result")
            mock_start_patch.assert_called_once_with(message="test")
            mock_end_patch.assert_called_once_with(mock_mark_range)
            # The per-call profiler is created from the tool config plus the role.
            mock_get_profiler.assert_called_once_with(
                contents=mock_worker.profiler._impl.profile_contents,
                profile_level=mock_worker.profiler._impl.profile_level,
                profile_save_path=mock_worker.profiler._impl.profile_save_path,
                analysis=mock_worker.profiler._impl.analysis,
                role="test_role",
            )
            mock_profile_npu.start.assert_called_once()
            mock_profile_npu.step.assert_called_once()
            mock_profile_npu.stop.assert_called_once()

    def test_annotate_with_default_message(self):
        # Without an explicit message, the wrapped function's name is used.
        mock_worker = MagicMock()
        mock_worker.profiler = DistProfiler(rank=self.rank, config=self.config, tool_config=self.tool_config)
        mock_worker.profiler._this_step = True

        mock_mark_range = "mocked_range_handle"
        with (
            patch("verl.utils.profiler.mstx_profile.mark_start_range") as mock_start_patch,
            patch("verl.utils.profiler.mstx_profile.mark_end_range") as mock_end_patch,
        ):
            mock_start_patch.return_value = mock_mark_range
            decorator = mock_worker.profiler.annotate()

            @decorator
            def test_func(self, *args, **kwargs):
                return "result"

            test_func(mock_worker)

            mock_start_patch.assert_called_once_with(message="test_func")
            mock_end_patch.assert_called_once_with(mock_mark_range)
271
+
272
+
273
if __name__ == "__main__":
    # Allow running this test module directly, outside a pytest/CI runner.
    unittest.main()
code/RL_model/verl/verl_train/tests/utils/test_temp_env_on_cpu.py ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import os
16
+
17
+ import pytest
18
+
19
+ from verl.utils.py_functional import temp_env_var
20
+
21
+
22
@pytest.fixture(autouse=True)
def clean_env():
    """Snapshot os.environ around each test and scrub the test variables first."""
    # Full snapshot of the pre-test environment.
    snapshot = dict(os.environ)

    # Remove any test variable that might leak in from elsewhere.
    for name in ("TEST_VAR", "TEST_VAR_2", "EXISTING_VAR"):
        os.environ.pop(name, None)

    yield

    # Restore the exact pre-test environment after the test.
    os.environ.clear()
    os.environ.update(snapshot)
40
+
41
+
42
def test_set_new_env_var():
    """A previously-unset variable exists inside the context and is gone after."""
    assert "TEST_VAR" not in os.environ

    with temp_env_var("TEST_VAR", "test_value"):
        assert "TEST_VAR" in os.environ
        assert os.environ["TEST_VAR"] == "test_value"

    assert "TEST_VAR" not in os.environ
54
+
55
+
56
def test_restore_existing_env_var():
    """Overriding an existing variable restores its prior value on exit."""
    key = "EXISTING_VAR"
    os.environ[key] = "original_value"

    with temp_env_var(key, "temporary_value"):
        assert os.environ[key] == "temporary_value"

    assert os.environ[key] == "original_value"
67
+
68
+
69
def test_env_var_restored_on_exception():
    """The original value comes back even when the context body raises."""
    key = "EXISTING_VAR"
    os.environ[key] = "original_value"

    with pytest.raises(ValueError):
        with temp_env_var(key, "temporary_value"):
            assert os.environ[key] == "temporary_value"
            raise ValueError("Test exception")

    # Restoration must survive the exception.
    assert os.environ[key] == "original_value"
83
+
84
+
85
def test_nested_context_managers():
    """Nested temp_env_var contexts unwind like a stack."""
    os.environ["TEST_VAR"] = "original"

    def current():
        return os.environ["TEST_VAR"]

    with temp_env_var("TEST_VAR", "level1"):
        assert current() == "level1"
        with temp_env_var("TEST_VAR", "level2"):
            assert current() == "level2"
        # Inner exit restores the outer context's value.
        assert current() == "level1"

    # Outer exit restores the original value.
    assert current() == "original"
101
+
102
+
103
def test_multiple_different_vars():
    """Two different variables can be managed by simultaneous contexts."""
    os.environ["EXISTING_VAR"] = "existing_value"

    # Single with-statement is equivalent to the nested form (same exit order).
    with temp_env_var("EXISTING_VAR", "modified"), temp_env_var("TEST_VAR", "new_value"):
        assert os.environ["EXISTING_VAR"] == "modified"
        assert os.environ["TEST_VAR"] == "new_value"

    assert os.environ["EXISTING_VAR"] == "existing_value"
    assert "TEST_VAR" not in os.environ
116
+
117
+
118
def test_empty_string_value():
    """An empty string is a legitimate value, distinct from 'unset'."""
    with temp_env_var("TEST_VAR", ""):
        assert "TEST_VAR" in os.environ
        assert os.environ["TEST_VAR"] == ""

    assert "TEST_VAR" not in os.environ
126
+
127
+
128
def test_overwrite_with_empty_string():
    """Overwriting with '' still restores the original value afterwards."""
    key = "EXISTING_VAR"
    os.environ[key] = "original"

    with temp_env_var(key, ""):
        assert os.environ[key] == ""

    assert os.environ[key] == "original"
137
+
138
+
139
def test_context_manager_returns_none():
    """The context manager yields None — there is nothing useful to bind."""
    with temp_env_var("TEST_VAR", "value") as yielded:
        assert yielded is None
        assert os.environ["TEST_VAR"] == "value"
code/RL_model/verl/verl_train/tests/utils/test_timeout_decorator_cpu.py ADDED
@@ -0,0 +1,238 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import multiprocessing
16
+ import sys
17
+ import threading
18
+ import time
19
+
20
+ import pytest # Import pytest
21
+
22
+ from verl.utils.py_functional import timeout_limit as timeout
23
+
24
# --- Test Task Functions ---
# Shared timing knobs for every timeout test in this module.
TEST_TIMEOUT_SECONDS = 1.5  # Timeout duration for tests
LONG_TASK_DURATION = TEST_TIMEOUT_SECONDS + 0.5  # Duration slightly longer than timeout
27
+
28
+
29
@timeout(seconds=TEST_TIMEOUT_SECONDS)  # Keep global decorator for mp tests
def quick_task(x):
    """A task that completes quickly (well inside TEST_TIMEOUT_SECONDS).

    ``x`` is unused; it keeps the task signatures uniform for the tests.
    """
    time.sleep(0.1)
    return "quick_ok"
34
+
35
+
36
@timeout(seconds=TEST_TIMEOUT_SECONDS)  # Keep global decorator for mp tests
def slow_task(x):
    """A task that takes longer than the timeout.

    ``x`` is unused; it keeps the task signatures uniform for the tests.
    """
    time.sleep(LONG_TASK_DURATION)
    return "slow_finished"  # This return value indicates it didn't time out
41
+
42
+
43
# Intentionally NOT decorated: tests apply the timeout decorator dynamically.
def task_raises_value_error():
    """A task whose only job is to raise a ValueError."""
    failure = ValueError("Specific value error from task")
    raise failure
47
+
48
+
49
# --- Top-level function for signal test in subprocess ---
# Keep this decorated globally for the specific subprocess test case
@timeout(seconds=TEST_TIMEOUT_SECONDS, use_signals=True)
def top_level_decorated_quick_task_signal():
    """A pickleable top-level function decorated with signal timeout."""
    # Mirrors quick_task's behavior, with a distinct return value for clarity.
    time.sleep(0.1)
    return "quick_ok_signal_subprocess"  # Different return for clarity if needed
57
+
58
+
59
# --- Top-level function for signal test in subprocess ---
# Keep this decorated globally for the specific subprocess test case
@timeout(seconds=TEST_TIMEOUT_SECONDS, use_signals=True)
def top_level_decorated_slow_task_signal():
    """A pickleable top-level function decorated with signal timeout.

    Sleeps past the limit; returning "slow_finished" means no timeout fired.
    """
    time.sleep(LONG_TASK_DURATION)
    return "slow_finished"
66
+
67
+
68
# --- Top-level helper function to run a target inside a child process ---
def run_target_and_put_in_queue(target_func, q):
    """Run ``target_func`` and report its outcome through ``q``.

    Puts ``("success", result)`` on success, ``("error", exc)`` on any
    exception. Defined at module level so it is pickleable and can serve
    as a ``multiprocessing.Process`` target.
    """
    try:
        q.put(("success", target_func()))
    except Exception as exc:
        q.put(("error", exc))
79
+
80
+
81
# Module-level fixture that pins the multiprocessing start method on macOS.
@pytest.fixture(scope="module", autouse=True)
def set_macos_start_method():
    """Force the 'fork' start method on macOS.

    Avoids pickling issues with globally decorated functions when the tests
    run under pytest discovery. No-op on other platforms.
    """
    if sys.platform != "darwin":
        return
    # None != "fork" is also true, so one comparison covers "unset" as well.
    if multiprocessing.get_start_method(allow_none=True) != "fork":
        try:
            multiprocessing.set_start_method("fork", force=True)
        except RuntimeError:
            # The context may already be started; ignore in that case.
            pass
95
+
96
+
97
def test_quick_task():
    """A fast task completes and returns its normal value."""
    assert quick_task(1) == "quick_ok"
102
+
103
+
104
def test_slow_task_timeout():
    """A task that outlives the limit must raise TimeoutError."""
    with pytest.raises(TimeoutError) as excinfo:
        slow_task(1)
    # The multiprocessing implementation embeds the limit in the message.
    expected_fragment = f"timed out after {TEST_TIMEOUT_SECONDS} seconds"
    assert expected_fragment in str(excinfo.value)
111
+
112
+
113
def test_internal_exception():
    """Exceptions raised inside the task must propagate unchanged."""
    # Decorate the plain task dynamically with the default (mp) timeout.
    wrapped = timeout(seconds=TEST_TIMEOUT_SECONDS)(task_raises_value_error)
    with pytest.raises(ValueError) as excinfo:
        wrapped()
    assert str(excinfo.value) == "Specific value error from task"
120
+
121
+
122
+ # --- Test the signal implementation (use_signals=True) ---
123
+ # Note: As per py_functional.py, use_signals=True currently falls back to
124
+ # multiprocessing on POSIX. These tests verify that behavior.
125
+
126
+
127
def test_signal_quick_task_main_process():
    """Signal-based timeout passes a fast task through in the main process."""

    def plain_quick_task_logic():
        time.sleep(0.1)
        return "quick_ok_signal"

    wrapped = timeout(seconds=TEST_TIMEOUT_SECONDS, use_signals=True)(plain_quick_task_logic)
    assert wrapped() == "quick_ok_signal"
137
+
138
+
139
def test_signal_slow_task_main_process_timeout():
    """Signal-based timeout raises TimeoutError for a slow task in the main process."""

    def plain_slow_task_logic():
        time.sleep(LONG_TASK_DURATION)
        return "slow_finished_signal"

    wrapped = timeout(seconds=TEST_TIMEOUT_SECONDS, use_signals=True)(plain_slow_task_logic)
    with pytest.raises(TimeoutError) as excinfo:
        wrapped()
    # use_signals=True falls back to multiprocessing on POSIX, hence this message.
    assert f"timed out after {TEST_TIMEOUT_SECONDS} seconds" in str(excinfo.value)
152
+
153
+
154
@pytest.mark.skip(reason="this test won't pass. Just to show why use_signals should not be used")
def test_signal_in_thread_does_not_timeout():
    """
    Tests that signal-based timeout does NOT work reliably in a child thread.
    The TimeoutError from the signal handler is not expected to be raised.
    """
    result_container = []  # Use a list to store result from thread
    exception_container = []  # Use a list to store exception from thread

    @timeout(seconds=TEST_TIMEOUT_SECONDS, use_signals=True)
    def slow_task_in_thread():
        try:
            print("Thread: Starting slow task...")
            time.sleep(LONG_TASK_DURATION)
            print("Thread: Slow task finished.")
            return "slow_finished_in_thread"
        except Exception as e:
            # Catch any exception within the thread's target function
            print(f"Thread: Caught exception: {e}")
            exception_container.append(e)
            return None  # Indicate failure

    def thread_target():
        try:
            # Run the decorated function inside the thread
            res = slow_task_in_thread()
            if res is not None:
                result_container.append(res)
        except Exception as e:
            # This might catch exceptions happening *outside* the decorated function
            # but still within the thread target, though less likely here.
            print(f"Thread Target: Caught exception: {e}")
            exception_container.append(e)

    thread = threading.Thread(target=thread_target)
    print("Main: Starting thread...")
    thread.start()
    # Wait longer than the timeout + task duration to ensure the thread finishes
    # regardless of whether timeout worked or not.
    thread.join(timeout=LONG_TASK_DURATION + 1)

    # These assertions state the *desired* behavior; the skip marker above
    # records that signals cannot deliver it inside a child thread.
    assert len(exception_container) == 1
    assert isinstance(exception_container[0], TimeoutError)
    assert not result_container
198
+
199
+
200
def test_in_thread_timeout():
    """The multiprocessing-based timeout (use_signals=False) works from a child thread."""
    results = []  # Filled by the worker thread on (unexpected) success.
    errors = []  # Filled by the worker thread with any caught exception.

    @timeout(seconds=TEST_TIMEOUT_SECONDS, use_signals=False)
    def slow_task_in_thread():
        try:
            print("Thread: Starting slow task...")
            time.sleep(LONG_TASK_DURATION)
            print("Thread: Slow task finished.")
            return "slow_finished_in_thread"
        except Exception as e:
            # The decorator's TimeoutError lands here, inside the thread.
            print(f"Thread: Caught exception: {e}")
            errors.append(e)
            return None

    def thread_target():
        try:
            res = slow_task_in_thread()
            if res is not None:
                results.append(res)
        except Exception as e:
            # Exceptions escaping the decorated call would land here instead.
            print(f"Thread Target: Caught exception: {e}")
            errors.append(e)

    worker = threading.Thread(target=thread_target)
    print("Main: Starting thread...")
    worker.start()
    # Give the thread enough time to finish whether or not the timeout fires.
    worker.join(timeout=LONG_TASK_DURATION + 1)

    assert len(errors) == 1
    assert isinstance(errors[0], TimeoutError)
    assert not results
code/RL_model/verl/verl_train/tests/utils/test_torch_functional.py ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import os
16
+
17
+ import pytest
18
+ import torch
19
+ import torch.distributed as dist
20
+ import torch.multiprocessing as mp
21
+
22
+ from verl.utils.device import get_device_name, get_nccl_backend, get_torch_device
23
+ from verl.utils.torch_functional import (
24
+ distributed_masked_mean,
25
+ distributed_mean_max_min_std,
26
+ expand_as_nested,
27
+ masked_mean,
28
+ )
29
+
30
+
31
def _worker_mean(rank: int, world_size: int, rendezvous_file: str):
    """Per-rank worker: rank r contributes the value r+1 and every rank checks
    the globally reduced mean/max/min/std against host-side references."""
    # Pin this process to its device, then join the group via file rendezvous.
    get_torch_device().set_device(rank)
    dist.init_process_group(
        backend=get_nccl_backend(),
        init_method=f"file://{rendezvous_file}",
        rank=rank,
        world_size=world_size,
    )

    local = torch.tensor([float(rank + 1)], device=f"{get_device_name()}:{rank}")
    mean, gmax, gmin, gstd = distributed_mean_max_min_std(local, True, True, True)

    # Reference statistics computed locally over all ranks' contributions.
    values = [float(i + 1) for i in range(world_size)]
    exp_mean = sum(values) / len(values)
    # Sample standard deviation (ddof=1).
    exp_std = (sum((v - exp_mean) ** 2 for v in values) / (len(values) - 1)) ** 0.5

    # Every rank must observe identical reduced results.
    assert torch.allclose(mean.cpu(), torch.tensor(exp_mean)), f"mean@{rank}"
    assert torch.allclose(gmax.cpu(), torch.tensor(max(values))), f"max@{rank}"
    assert torch.allclose(gmin.cpu(), torch.tensor(min(values))), f"min@{rank}"
    assert torch.allclose(gstd.cpu(), torch.tensor(exp_std)), f"std@{rank}"

    dist.destroy_process_group()
58
+
59
+
60
@pytest.mark.parametrize(
    "value,mask,gt",
    [
        ([1.0, 2.0, 3.0, 4.0], [1, 0, 0, 1], 2.5),
        ([1.0, 2.0, float("nan"), 4.0], [1, 0, 0, 1], 2.5),
        ([1.0, 2.0, float("nan"), 4.0], [1, 0, 1, 0], float("nan")),
    ],
)
def test_masked_mean(value, mask, gt):
    """masked_mean averages unmasked entries; a NaN under a zero mask must not
    poison the result, while a NaN under a one mask propagates."""
    actual = masked_mean(torch.tensor(value), torch.tensor(mask))
    expected = torch.tensor(gt)
    # Pass when values agree, or when both are NaN (allclose rejects NaN pairs).
    assert torch.allclose(actual, expected) or (torch.isnan(actual) and torch.isnan(expected))
72
+
73
+
74
+ @pytest.mark.parametrize("world_size", [2, 4])
75
+ def test_distributed_mean_max_min_std(world_size, tmp_path):
76
+ rendezvous_file = str(tmp_path / "rdzv_mean")
77
+ os.makedirs(os.path.dirname(rendezvous_file), exist_ok=True)
78
+
79
+ mp.spawn(
80
+ fn=_worker_mean,
81
+ args=(world_size, rendezvous_file),
82
+ nprocs=world_size,
83
+ join=True,
84
+ )
85
+
86
+
87
def _worker_mask(rank: int, world_size: int, rendezvous_file: str):
    """Per-rank worker: checks distributed_masked_mean with a different mask on
    rank 0 than on all other ranks."""
    get_torch_device().set_device(rank)
    dist.init_process_group(
        backend=get_nccl_backend(),
        init_method=f"file://{rendezvous_file}",
        rank=rank,
        world_size=world_size,
    )

    device = f"{get_device_name()}:{rank}"
    # Rank r holds [2r+1, 2r+2]; rank 0 masks in its first entry, every other
    # rank masks in its second.
    local_tensor = torch.tensor([rank * 2 + 1.0, rank * 2 + 2.0], device=device)
    selected = [1, 0] if rank == 0 else [0, 1]
    mask = torch.tensor(selected, device=device, dtype=torch.float32)

    gmean = distributed_masked_mean(local_tensor, mask)

    # Surviving values: 1.0 from rank 0 and 2i+2 from each rank i >= 1.
    valid_values = [1.0] + [2 * i + 2.0 for i in range(1, world_size)]
    expected_mean = sum(valid_values) / len(valid_values)
    assert torch.allclose(gmean.cpu(), torch.tensor(expected_mean)), f"masked_mean@{rank}"

    dist.destroy_process_group()
110
+
111
+
112
+ @pytest.mark.parametrize("world_size", [2, 4])
113
+ def test_distributed_masked_mean(world_size, tmp_path):
114
+ rendezvous_file = str(tmp_path / "rdzv_mask")
115
+ os.makedirs(os.path.dirname(rendezvous_file), exist_ok=True)
116
+
117
+ mp.spawn(
118
+ fn=_worker_mask,
119
+ args=(world_size, rendezvous_file),
120
+ nprocs=world_size,
121
+ join=True,
122
+ )
123
+
124
+
125
def test_expand_as_nested():
    """expand_as_nested repeats element i of a flat tensor once per element of
    row i of a jagged nested tensor, and rejects shape-incompatible inputs."""
    rows = [torch.randn(n) for n in (2, 3, 4)]
    nested_tensor = torch.nested.as_nested_tensor(rows, layout=torch.jagged)
    tensor = torch.tensor([1, 2, 3])

    output = expand_as_nested(tensor, nested_tensor)

    # Element i repeats len(rows[i]) times: 1 x2, 2 x3, 3 x4.
    assert output.values().tolist() == [1, 1, 2, 2, 2, 3, 3, 3, 3]
    # The jagged structure of the target must be preserved.
    assert torch.all(output.offsets() == nested_tensor.offsets()).item()

    # Invalid combinations must raise.
    with pytest.raises(AssertionError):
        expand_as_nested(tensor, tensor)  # second arg is a regular tensor

    with pytest.raises(AssertionError):
        expand_as_nested(torch.tensor([1, 2, 3, 4]), nested_tensor)  # 4 vs 3 rows

    with pytest.raises(AssertionError):
        expand_as_nested(torch.tensor([[1, 2, 3]]), nested_tensor)  # 2-D source

    with pytest.raises(AssertionError):
        expand_as_nested(tensor, nested_tensor.unsqueeze(-1))  # extra trailing dim