yuccaaa commited on
Commit
9828e9e
·
verified ·
1 Parent(s): 9440cb3

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. EasyR1-new/verl.egg-info/PKG-INFO +270 -0
  2. EasyR1-new/verl.egg-info/SOURCES.txt +72 -0
  3. EasyR1-new/verl.egg-info/dependency_links.txt +1 -0
  4. EasyR1-new/verl.egg-info/requires.txt +24 -0
  5. EasyR1-new/verl.egg-info/top_level.txt +1 -0
  6. EasyR1-new/verl/ProtT3/__pycache__/blip2.cpython-310.pyc +0 -0
  7. EasyR1-new/verl/ProtT3/__pycache__/blip2_opt.cpython-310.pyc +0 -0
  8. EasyR1-new/verl/ProtT3/__pycache__/blip2_stage2.cpython-310.pyc +0 -0
  9. EasyR1-new/verl/ProtT3/__pycache__/help_funcs.cpython-310.pyc +0 -0
  10. EasyR1-new/verl/ProtT3/__pycache__/opt_flash_attention.cpython-310.pyc +0 -0
  11. EasyR1-new/verl/__pycache__/__init__.cpython-310.pyc +0 -0
  12. EasyR1-new/verl/__pycache__/protocol.cpython-310.pyc +0 -0
  13. EasyR1-new/verl/models/__init__.py +13 -0
  14. EasyR1-new/verl/models/__pycache__/__init__.cpython-310.pyc +0 -0
  15. EasyR1-new/verl/models/__pycache__/monkey_patch.cpython-310.pyc +0 -0
  16. EasyR1-new/verl/models/monkey_patch.py +63 -0
  17. EasyR1-new/verl/models/transformers/__init__.py +13 -0
  18. EasyR1-new/verl/models/transformers/__pycache__/__init__.cpython-310.pyc +0 -0
  19. EasyR1-new/verl/models/transformers/__pycache__/flash_attention_utils.cpython-310.pyc +0 -0
  20. EasyR1-new/verl/models/transformers/__pycache__/qwen2_vl.cpython-310.pyc +0 -0
  21. EasyR1-new/verl/models/transformers/flash_attention_utils.py +183 -0
  22. EasyR1-new/verl/models/transformers/qwen2_vl.py +356 -0
  23. EasyR1-new/verl/single_controller/__init__.py +13 -0
  24. EasyR1-new/verl/single_controller/__pycache__/__init__.cpython-310.pyc +0 -0
  25. EasyR1-new/verl/single_controller/base/__init__.py +19 -0
  26. EasyR1-new/verl/single_controller/base/__pycache__/__init__.cpython-310.pyc +0 -0
  27. EasyR1-new/verl/single_controller/base/__pycache__/decorator.cpython-310.pyc +0 -0
  28. EasyR1-new/verl/single_controller/base/__pycache__/worker.cpython-310.pyc +0 -0
  29. EasyR1-new/verl/single_controller/base/__pycache__/worker_group.cpython-310.pyc +0 -0
  30. EasyR1-new/verl/single_controller/base/decorator.py +213 -0
  31. EasyR1-new/verl/single_controller/base/register_center/__init__.py +13 -0
  32. EasyR1-new/verl/single_controller/base/register_center/__pycache__/__init__.cpython-310.pyc +0 -0
  33. EasyR1-new/verl/single_controller/base/register_center/__pycache__/ray.cpython-310.pyc +0 -0
  34. EasyR1-new/verl/single_controller/base/register_center/ray.py +28 -0
  35. EasyR1-new/verl/single_controller/base/worker.py +202 -0
  36. EasyR1-new/verl/single_controller/base/worker_group.py +194 -0
  37. EasyR1-new/verl/single_controller/ray/__init__.py +18 -0
  38. EasyR1-new/verl/single_controller/ray/__pycache__/__init__.cpython-310.pyc +0 -0
  39. EasyR1-new/verl/single_controller/ray/__pycache__/base.cpython-310.pyc +0 -0
  40. EasyR1-new/verl/single_controller/ray/base.py +493 -0
  41. EasyR1-new/verl/trainer/__init__.py +13 -0
  42. EasyR1-new/verl/trainer/__pycache__/__init__.cpython-310.pyc +0 -0
  43. EasyR1-new/verl/trainer/__pycache__/config.cpython-310.pyc +0 -0
  44. EasyR1-new/verl/trainer/__pycache__/core_algos.cpython-310.pyc +0 -0
  45. EasyR1-new/verl/trainer/__pycache__/data_loader.cpython-310.pyc +0 -0
  46. EasyR1-new/verl/trainer/__pycache__/main.cpython-310.pyc +0 -0
  47. EasyR1-new/verl/trainer/__pycache__/metrics.cpython-310.pyc +0 -0
  48. EasyR1-new/verl/trainer/__pycache__/ray_trainer.cpython-310.pyc +0 -0
  49. EasyR1-new/verl/trainer/config.py +179 -0
  50. EasyR1-new/verl/trainer/core_algos.py +495 -0
EasyR1-new/verl.egg-info/PKG-INFO ADDED
@@ -0,0 +1,270 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Metadata-Version: 2.4
2
+ Name: verl
3
+ Version: 0.3.2.dev0
4
+ Summary: An Efficient, Scalable, Multi-Modality RL Training Framework based on veRL
5
+ Home-page: https://github.com/volcengine/verl
6
+ Author: verl
7
+ Author-email: zhangchi.usc1992@bytedance.com, gmsheng@connect.hku.hk, hiyouga@buaa.edu.cn
8
+ License: Apache 2.0 License
9
+ Platform: UNKNOWN
10
+ Requires-Python: >=3.9.0
11
+ Description-Content-Type: text/markdown
12
+ License-File: LICENSE
13
+ Requires-Dist: accelerate
14
+ Requires-Dist: codetiming
15
+ Requires-Dist: datasets
16
+ Requires-Dist: flash-attn>=2.4.3
17
+ Requires-Dist: liger-kernel
18
+ Requires-Dist: mathruler
19
+ Requires-Dist: numpy
20
+ Requires-Dist: omegaconf
21
+ Requires-Dist: pandas
22
+ Requires-Dist: peft
23
+ Requires-Dist: pillow
24
+ Requires-Dist: pyarrow>=15.0.0
25
+ Requires-Dist: pylatexenc
26
+ Requires-Dist: qwen-vl-utils
27
+ Requires-Dist: ray[default]
28
+ Requires-Dist: tensordict
29
+ Requires-Dist: torchdata
30
+ Requires-Dist: transformers<4.53.0,>=4.51.0
31
+ Requires-Dist: vllm>=0.8.0
32
+ Requires-Dist: wandb
33
+ Provides-Extra: dev
34
+ Requires-Dist: pre-commit; extra == "dev"
35
+ Requires-Dist: ruff; extra == "dev"
36
+ Dynamic: author
37
+ Dynamic: author-email
38
+ Dynamic: description
39
+ Dynamic: description-content-type
40
+ Dynamic: home-page
41
+ Dynamic: license
42
+ Dynamic: license-file
43
+ Dynamic: provides-extra
44
+ Dynamic: requires-dist
45
+ Dynamic: requires-python
46
+ Dynamic: summary
47
+
48
+ # EasyR1: An Efficient, Scalable, Multi-Modality RL Training Framework
49
+
50
+ [![GitHub Repo stars](https://img.shields.io/github/stars/hiyouga/EasyR1)](https://github.com/hiyouga/EasyR1/stargazers)
51
+ [![Twitter](https://img.shields.io/twitter/follow/llamafactory_ai)](https://twitter.com/llamafactory_ai)
52
+
53
+ ### Used by [Amazon Web Services](https://aws.amazon.com/cn/blogs/china/building-llm-model-hub-based-on-llamafactory-and-easyr1/)
54
+
55
+ This project is a clean fork of the original [veRL](https://github.com/volcengine/verl) project to support vision language models. We thank all the authors for providing such a high-performance RL training framework.
56
+
57
+ EasyR1 is efficient and scalable due to the design of **[HybridEngine](https://arxiv.org/abs/2409.19256)** and the latest release of **[vLLM](https://github.com/vllm-project/vllm)**'s SPMD mode.
58
+
59
+ ## Features
60
+
61
+ - Supported models
62
+ - Llama3/Qwen2/Qwen2.5/Qwen3 language models
63
+ - Qwen2/Qwen2.5-VL vision language models
64
+ - DeepSeek-R1 distill models
65
+
66
+ - Supported algorithms
67
+ - GRPO
68
+ - DAPO
69
+ - Reinforce++
70
+ - ReMax
71
+ - RLOO
72
+
73
+ - Supported datasets
74
+ - Any text, vision-text dataset in a [specific format](#custom-dataset)
75
+
76
+ - Supported tricks
77
+ - Padding-free training
78
+ - Resuming from checkpoint
79
+ - Wandb & SwanLab & Mlflow & Tensorboard tracking
80
+
81
+ ## Requirements
82
+
83
+ ### Software Requirements
84
+
85
+ - Python 3.9+
86
+ - transformers>=4.51.0
87
+ - flash-attn>=2.4.3
88
+ - vllm>=0.8.3
89
+
90
+ We provide a [Dockerfile](./Dockerfile) to easily build environments.
91
+
92
+ We recommend using the [pre-built docker image](https://hub.docker.com/r/hiyouga/verl) in EasyR1.
93
+
94
+ ```bash
95
+ docker pull hiyouga/verl:ngc-th2.7.0-cu12.6-vllm0.9.1
96
+ ```
97
+
98
+ ### Hardware Requirements
99
+
100
+ \* *estimated*
101
+
102
+ | Method | Bits | 1.5B | 3B | 7B | 32B | 72B |
103
+ | ------------------------ | ---- | ------ | ------ | ------ | ------- | ------- |
104
+ | GRPO Full Fine-Tuning | AMP | 2*24GB | 4*40GB | 8*40GB | 16*80GB | 32*80GB |
105
+ | GRPO Full Fine-Tuning | BF16 | 1*24GB | 1*40GB | 4*40GB | 8*80GB | 16*80GB |
106
+
107
+ > [!NOTE]
108
+ > Use `worker.actor.fsdp.torch_dtype=bf16` and `worker.actor.optim.strategy=adamw_bf16` to enable bf16 training.
109
+ >
110
+ > We are working hard to reduce the VRAM in RL training, LoRA support will be integrated in next updates.
111
+
112
+ ## Tutorial: Run Qwen2.5-VL GRPO on [Geometry3K](https://huggingface.co/datasets/hiyouga/geometry3k) Dataset in Just 3 Steps
113
+
114
+ ![image](assets/qwen2_5_vl_7b_geo.png)
115
+
116
+ ### Installation
117
+
118
+ ```bash
119
+ git clone https://github.com/hiyouga/EasyR1.git
120
+ cd EasyR1
121
+ pip install -e .
122
+ ```
123
+
124
+ ### GRPO Training
125
+
126
+ ```bash
127
+ bash examples/qwen2_5_vl_7b_geo3k_grpo.sh
128
+ ```
129
+
130
+ ### Merge Checkpoint in Hugging Face Format
131
+
132
+ ```bash
133
+ python3 scripts/model_merger.py --local_dir checkpoints/easy_r1/exp_name/global_step_1/actor
134
+ ```
135
+
136
+ > [!TIP]
137
+ > If you encounter issues with connecting to Hugging Face, consider using `export HF_ENDPOINT=https://hf-mirror.com`.
138
+ >
139
+ > If you want to use SwanLab logger, consider using `bash examples/qwen2_5_vl_7b_geo3k_swanlab.sh`.
140
+
141
+ ## Custom Dataset
142
+
143
+ Please refer to the example datasets to prepare your own dataset.
144
+
145
+ - Text dataset: https://huggingface.co/datasets/hiyouga/math12k
146
+ - Image-text dataset: https://huggingface.co/datasets/hiyouga/geometry3k
147
+ - Multi-image-text dataset: https://huggingface.co/datasets/hiyouga/journeybench-multi-image-vqa
148
+ - Text-image mixed dataset: https://huggingface.co/datasets/hiyouga/rl-mixed-dataset
149
+
150
+ ## How to Understand GRPO in EasyR1
151
+
152
+ ![image](assets/easyr1_grpo.png)
153
+
154
+ - To learn about the GRPO algorithm, you can refer to [Hugging Face's blog](https://huggingface.co/docs/trl/v0.16.1/en/grpo_trainer).
155
+
156
+ ## How to Run 70B+ Model in Multi-node Environment
157
+
158
+ 1. Start the Ray head node.
159
+
160
+ ```bash
161
+ ray start --head --port=6379 --dashboard-host=0.0.0.0
162
+ ```
163
+
164
+ 2. Start the Ray worker node and connect to the head node.
165
+
166
+ ```bash
167
+ ray start --address=<head_node_ip>:6379
168
+ ```
169
+
170
+ 3. Check the Ray resource pool.
171
+
172
+ ```bash
173
+ ray status
174
+ ```
175
+
176
+ 4. Run training script on the Ray head node only.
177
+
178
+ ```bash
179
+ bash examples/qwen2_5_vl_7b_geo3k_grpo.sh
180
+ ```
181
+
182
+ See the **[veRL's official doc](https://verl.readthedocs.io/en/latest/start/multinode.html)** for more details about multi-node training and Ray debugger.
183
+
184
+ ## Other Baselines
185
+
186
+ We also reproduced the following two baselines of the [R1-V](https://github.com/deep-agent/R1-V) project.
187
+ - [CLEVR-70k-Counting](examples/baselines/qwen2_5_vl_3b_clevr.sh): Train the Qwen2.5-VL-3B-Instruct model on counting problem.
188
+ - [GeoQA-8k](examples/baselines/qwen2_5_vl_3b_geoqa8k.sh): Train the Qwen2.5-VL-3B-Instruct model on GeoQA problem.
189
+
190
+ ## Performance Baselines
191
+
192
+ See [baselines.md](assets/baselines.md).
193
+
194
+ ## Awesome Work using EasyR1
195
+
196
+ - **MMR1**: Advancing the Frontiers of Multimodal Reasoning. [![[code]](https://img.shields.io/github/stars/LengSicong/MMR1)](https://github.com/LengSicong/MMR1)
197
+ - **Vision-R1**: Incentivizing Reasoning Capability in Multimodal Large Language Models. [![[code]](https://img.shields.io/github/stars/Osilly/Vision-R1)](https://github.com/Osilly/Vision-R1) [![[arxiv]](https://img.shields.io/badge/arxiv-2503.06749-blue)](https://arxiv.org/abs/2503.06749)
198
+ - **Seg-Zero**: Reasoning-Chain Guided Segmentation via Cognitive Reinforcement. [![[code]](https://img.shields.io/github/stars/dvlab-research/Seg-Zero)](https://github.com/dvlab-research/Seg-Zero) [![[arxiv]](https://img.shields.io/badge/arxiv-2503.06520-blue)](https://arxiv.org/abs/2503.06520)
199
+ - **MetaSpatial**: Reinforcing 3D Spatial Reasoning in VLMs for the Metaverse. [![[code]](https://img.shields.io/github/stars/PzySeere/MetaSpatial)](https://github.com/PzySeere/MetaSpatial) [![[arxiv]](https://img.shields.io/badge/arxiv-2503.18470-blue)](https://arxiv.org/abs/2503.18470)
200
+ - **Temporal-R1**: Evolving Temporal Reasoning Capability into LMMs via Temporal Consistent Reward. [![[code]](https://img.shields.io/github/stars/appletea233/Temporal-R1)](https://github.com/appletea233/Temporal-R1)
201
+ - **NoisyRollout**: Reinforcing Visual Reasoning with Data Augmentation. [![[code]](https://img.shields.io/github/stars/John-AI-Lab/NoisyRollout)](https://github.com/John-AI-Lab/NoisyRollout) [![[arxiv]](https://img.shields.io/badge/arxiv-2504.13055-blue)](https://arxiv.org/pdf/2504.13055)
202
+ - **GUI-R1**: A Generalist R1-Style Vision-Language Action Model For GUI Agents. [![[code]](https://img.shields.io/github/stars/ritzz-ai/GUI-R1)](https://github.com/ritzz-ai/GUI-R1) [![[arxiv]](https://img.shields.io/badge/arxiv-2504.10458-blue)](https://arxiv.org/abs/2504.10458)
203
+ - **R1-Track**: Direct Application of MLLMs to Visual Object Tracking via Reinforcement Learning. [![[code]](https://img.shields.io/github/stars/Wangbiao2/R1-Track)](https://github.com/Wangbiao2/R1-Track)
204
+ - **VisionReasoner**: Unified Visual Perception and Reasoning via Reinforcement Learning. [![[code]](https://img.shields.io/github/stars/dvlab-research/VisionReasoner)](https://github.com/dvlab-research/VisionReasoner) [![[arxiv]](https://img.shields.io/badge/arxiv-2505.12081-blue)](https://arxiv.org/abs/2505.12081)
205
+ - **MM-UPT**: Unsupervised Post-Training for Multi-Modal LLM Reasoning via GRPO. [![[code]](https://img.shields.io/github/stars/waltonfuture/MM-UPT)](https://github.com/waltonfuture/MM-UPT) [![[arxiv]](https://img.shields.io/badge/arxiv-2505.22453-blue)](https://arxiv.org/pdf/2505.22453)
206
+ - **RL-with-Cold-Start**: Advancing Multimodal Reasoning via Reinforcement Learning with Cold Start. [![[code]](https://img.shields.io/github/stars/waltonfuture/RL-with-Cold-Start)](https://github.com/waltonfuture/RL-with-Cold-Start) [![[arxiv]](https://img.shields.io/badge/arxiv-2505.22334-blue)](https://arxiv.org/pdf/2505.22334)
207
+ - **ViGoRL**: Grounded Reinforcement Learning for Visual Reasoning. [![[code]](https://img.shields.io/github/stars/Gabesarch/grounded-rl)](https://github.com/Gabesarch/grounded-rl) [![[arxiv]](https://img.shields.io/badge/arxiv-2505.22334-blue)](https://arxiv.org/abs/2505.23678)
208
+ - **Revisual-R1**: Advancing Multimodal Reasoning: From Optimized Cold Start to Staged Reinforcement Learning. [![[code]](https://img.shields.io/github/stars/CSfufu/Revisual-R1)](https://github.com/CSfufu/Revisual-R1) [![[arxiv]](https://img.shields.io/badge/arxiv-2506.04207-blue)](https://arxiv.org/abs/2506.04207)
209
+ - **SophiaVL-R1**: Reinforcing MLLMs Reasoning with Thinking Reward. [![[code]](https://img.shields.io/github/stars/kxfan2002/SophiaVL-R1)](https://github.com/kxfan2002/SophiaVL-R1) [![[arxiv]](https://img.shields.io/badge/arxiv-2505.17018-blue)](https://arxiv.org/abs/2505.17018)
210
+ - **Vision-Matters**: Simple Visual Perturbations Can Boost Multimodal Math Reasoning. [![[code]](https://img.shields.io/github/stars/YutingLi0606/Vision-Matters)](https://github.com/YutingLi0606/Vision-Matters) [![[arxiv]](https://img.shields.io/badge/arxiv-2506.09736-blue)](https://arxiv.org/abs/2506.09736)
211
+ - **VTool-R1**: VLMs Learn to Think with Images via Reinforcement Learning on Multimodal Tool Use. [![[code]](https://img.shields.io/github/stars/VTOOL-R1/vtool-r1)](https://github.com/VTOOL-R1/vtool-r1) [![[arxiv]](https://img.shields.io/badge/arxiv-2505.19255-blue)](https://arxiv.org/abs/2505.19255)
212
+
213
+ ## TODO
214
+
215
+ - Support LoRA (high priority).
216
+ - Support ulysses parallelism for VLMs (middle priority).
217
+ - Support more VLM architectures.
218
+
219
+ > [!NOTE]
220
+ > We will not provide scripts for supervised fine-tuning and inference in this project. If you have such requirements, we recommend using [LLaMA-Factory](https://github.com/hiyouga/LLaMA-Factory).
221
+
222
+ ### Known bugs
223
+
224
+ These features are temporarily disabled for now; we plan to fix them one by one in future updates.
225
+
226
+ - Vision language models are not compatible with ulysses parallelism yet.
227
+
228
+ ## Discussion Group
229
+
230
+ 👋 Join our [WeChat group](assets/wechat.jpg).
231
+
232
+ ## FAQs
233
+
234
+ > ValueError: Image features and image tokens do not match: tokens: 8192, features 9800
235
+
236
+ Increase the `data.max_prompt_length` or reduce the `data.max_pixels`.
237
+
238
+ > RuntimeError: CUDA Error: out of memory at /workspace/csrc/cumem_allocator.cpp:62
239
+
240
+ Reduce the `worker.rollout.gpu_memory_utilization` and enable `worker.actor.offload.offload_params`.
241
+
242
+ > RuntimeError: 0 active drivers ([]). There should only be one.
243
+
244
+ Uninstall `deepspeed` from the current python environment.
245
+
246
+ ## Citation
247
+
248
+ Core contributors: [Yaowei Zheng](https://github.com/hiyouga), [Junting Lu](https://github.com/AL-377), [Shenzhi Wang](https://github.com/Shenzhi-Wang), [Zhangchi Feng](https://github.com/BUAADreamer), [Dongdong Kuang](https://github.com/Kuangdd01) and Yuwen Xiong
249
+
250
+ We also thank Guangming Sheng and Chi Zhang for helpful discussions.
251
+
252
+ ```bibtex
253
+ @misc{zheng2025easyr1,
254
+ title = {EasyR1: An Efficient, Scalable, Multi-Modality RL Training Framework},
255
+ author = {Yaowei Zheng, Junting Lu, Shenzhi Wang, Zhangchi Feng, Dongdong Kuang, Yuwen Xiong},
256
+ howpublished = {\url{https://github.com/hiyouga/EasyR1}},
257
+ year = {2025}
258
+ }
259
+ ```
260
+
261
+ We also recommend citing the original work.
262
+
263
+ ```bibtex
264
+ @article{sheng2024hybridflow,
265
+ title = {HybridFlow: A Flexible and Efficient RLHF Framework},
266
+ author = {Guangming Sheng and Chi Zhang and Zilingfeng Ye and Xibin Wu and Wang Zhang and Ru Zhang and Yanghua Peng and Haibin Lin and Chuan Wu},
267
+ year = {2024},
268
+ journal = {arXiv preprint arXiv: 2409.19256}
269
+ }
270
+ ```
EasyR1-new/verl.egg-info/SOURCES.txt ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ LICENSE
2
+ README.md
3
+ pyproject.toml
4
+ setup.py
5
+ ./verl/__init__.py
6
+ ./verl/protocol.py
7
+ ./verl/models/__init__.py
8
+ ./verl/models/monkey_patch.py
9
+ ./verl/models/transformers/__init__.py
10
+ ./verl/models/transformers/flash_attention_utils.py
11
+ ./verl/models/transformers/qwen2_vl.py
12
+ ./verl/single_controller/__init__.py
13
+ ./verl/single_controller/base/__init__.py
14
+ ./verl/single_controller/base/decorator.py
15
+ ./verl/single_controller/base/worker.py
16
+ ./verl/single_controller/base/worker_group.py
17
+ ./verl/single_controller/base/register_center/__init__.py
18
+ ./verl/single_controller/base/register_center/ray.py
19
+ ./verl/single_controller/ray/__init__.py
20
+ ./verl/single_controller/ray/base.py
21
+ ./verl/trainer/__init__.py
22
+ ./verl/trainer/config.py
23
+ ./verl/trainer/core_algos.py
24
+ ./verl/trainer/data_loader.py
25
+ ./verl/trainer/main.py
26
+ ./verl/trainer/metrics.py
27
+ ./verl/trainer/ray_trainer.py
28
+ ./verl/utils/__init__.py
29
+ ./verl/utils/dataset.py
30
+ ./verl/utils/flops_counter.py
31
+ ./verl/utils/fsdp_utils.py
32
+ ./verl/utils/model_utils.py
33
+ ./verl/utils/py_functional.py
34
+ ./verl/utils/seqlen_balancing.py
35
+ ./verl/utils/tokenizer.py
36
+ ./verl/utils/torch_dtypes.py
37
+ ./verl/utils/torch_functional.py
38
+ ./verl/utils/ulysses.py
39
+ ./verl/utils/checkpoint/__init__.py
40
+ ./verl/utils/checkpoint/checkpoint_manager.py
41
+ ./verl/utils/checkpoint/fsdp_checkpoint_manager.py
42
+ ./verl/utils/logger/__init__.py
43
+ ./verl/utils/logger/gen_logger.py
44
+ ./verl/utils/logger/logger.py
45
+ ./verl/workers/__init__.py
46
+ ./verl/workers/config.py
47
+ ./verl/workers/fsdp_workers.py
48
+ ./verl/workers/actor/__init__.py
49
+ ./verl/workers/actor/base.py
50
+ ./verl/workers/actor/config.py
51
+ ./verl/workers/actor/dp_actor.py
52
+ ./verl/workers/critic/__init__.py
53
+ ./verl/workers/critic/base.py
54
+ ./verl/workers/critic/config.py
55
+ ./verl/workers/critic/dp_critic.py
56
+ ./verl/workers/reward/__init__.py
57
+ ./verl/workers/reward/config.py
58
+ ./verl/workers/reward/function.py
59
+ ./verl/workers/rollout/__init__.py
60
+ ./verl/workers/rollout/base.py
61
+ ./verl/workers/rollout/config.py
62
+ ./verl/workers/rollout/vllm_rollout_spmd.py
63
+ ./verl/workers/rollout/vllm_rollout_spmd_new.py
64
+ ./verl/workers/sharding_manager/__init__.py
65
+ ./verl/workers/sharding_manager/base.py
66
+ ./verl/workers/sharding_manager/fsdp_ulysses.py
67
+ ./verl/workers/sharding_manager/fsdp_vllm.py
68
+ verl.egg-info/PKG-INFO
69
+ verl.egg-info/SOURCES.txt
70
+ verl.egg-info/dependency_links.txt
71
+ verl.egg-info/requires.txt
72
+ verl.egg-info/top_level.txt
EasyR1-new/verl.egg-info/dependency_links.txt ADDED
@@ -0,0 +1 @@
 
 
1
+
EasyR1-new/verl.egg-info/requires.txt ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ accelerate
2
+ codetiming
3
+ datasets
4
+ flash-attn>=2.4.3
5
+ liger-kernel
6
+ mathruler
7
+ numpy
8
+ omegaconf
9
+ pandas
10
+ peft
11
+ pillow
12
+ pyarrow>=15.0.0
13
+ pylatexenc
14
+ qwen-vl-utils
15
+ ray[default]
16
+ tensordict
17
+ torchdata
18
+ transformers<4.53.0,>=4.51.0
19
+ vllm>=0.8.0
20
+ wandb
21
+
22
+ [dev]
23
+ pre-commit
24
+ ruff
EasyR1-new/verl.egg-info/top_level.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ verl
EasyR1-new/verl/ProtT3/__pycache__/blip2.cpython-310.pyc ADDED
Binary file (3.18 kB). View file
 
EasyR1-new/verl/ProtT3/__pycache__/blip2_opt.cpython-310.pyc ADDED
Binary file (7.31 kB). View file
 
EasyR1-new/verl/ProtT3/__pycache__/blip2_stage2.cpython-310.pyc ADDED
Binary file (2.37 kB). View file
 
EasyR1-new/verl/ProtT3/__pycache__/help_funcs.cpython-310.pyc ADDED
Binary file (3.97 kB). View file
 
EasyR1-new/verl/ProtT3/__pycache__/opt_flash_attention.cpython-310.pyc ADDED
Binary file (7.21 kB). View file
 
EasyR1-new/verl/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (553 Bytes). View file
 
EasyR1-new/verl/__pycache__/protocol.cpython-310.pyc ADDED
Binary file (25.7 kB). View file
 
EasyR1-new/verl/models/__init__.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
EasyR1-new/verl/models/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (155 Bytes). View file
 
EasyR1-new/verl/models/__pycache__/monkey_patch.cpython-310.pyc ADDED
Binary file (1.62 kB). View file
 
EasyR1-new/verl/models/monkey_patch.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS
17
+
18
+ from ..utils.py_functional import is_transformers_version_greater_than
19
+ from .transformers.flash_attention_utils import flash_attention_forward
20
+ from .transformers.qwen2_vl import (
21
+ qwen2_vl_attn_forward,
22
+ qwen2_vl_base_forward_new,
23
+ qwen2_vl_forward_new,
24
+ qwen2_vl_forward_old,
25
+ )
26
+
27
+
28
+ def apply_ulysses_patch(model_type: str) -> None:
29
+ if model_type in ("llama", "gemma", "gemma2", "mistral", "qwen2", "qwen3", "qwen3_moe"):
30
+ ALL_ATTENTION_FUNCTIONS["flash_attention_2"] = flash_attention_forward
31
+ elif model_type in ("qwen2_vl", "qwen2_5_vl"):
32
+ if is_transformers_version_greater_than("4.53.0"):
33
+ from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import Qwen2_5_VLAttention
34
+ from transformers.models.qwen2_vl.modeling_qwen2_vl import Qwen2VLAttention
35
+
36
+ Qwen2VLAttention.forward = qwen2_vl_attn_forward
37
+ Qwen2_5_VLAttention.forward = qwen2_vl_attn_forward
38
+ else:
39
+ from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import Qwen2_5_VLFlashAttention2
40
+ from transformers.models.qwen2_vl.modeling_qwen2_vl import Qwen2VLFlashAttention2
41
+
42
+ Qwen2VLFlashAttention2.forward = qwen2_vl_attn_forward
43
+ Qwen2_5_VLFlashAttention2.forward = qwen2_vl_attn_forward
44
+
45
+ if is_transformers_version_greater_than("4.52.0"):
46
+ from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import (
47
+ Qwen2_5_VLForConditionalGeneration,
48
+ Qwen2_5_VLModel,
49
+ )
50
+ from transformers.models.qwen2_vl.modeling_qwen2_vl import Qwen2VLForConditionalGeneration, Qwen2VLModel
51
+
52
+ Qwen2VLModel.forward = qwen2_vl_base_forward_new
53
+ Qwen2_5_VLModel.forward = qwen2_vl_base_forward_new
54
+ Qwen2VLForConditionalGeneration.forward = qwen2_vl_forward_new
55
+ Qwen2_5_VLForConditionalGeneration.forward = qwen2_vl_forward_new
56
+ else:
57
+ from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import Qwen2_5_VLForConditionalGeneration
58
+ from transformers.models.qwen2_vl.modeling_qwen2_vl import Qwen2VLForConditionalGeneration
59
+
60
+ Qwen2VLForConditionalGeneration.forward = qwen2_vl_forward_old
61
+ Qwen2_5_VLForConditionalGeneration.forward = qwen2_vl_forward_old
62
+ else:
63
+ raise NotImplementedError(f"Model architecture {model_type} is not supported yet.")
EasyR1-new/verl/models/transformers/__init__.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
EasyR1-new/verl/models/transformers/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (168 Bytes). View file
 
EasyR1-new/verl/models/transformers/__pycache__/flash_attention_utils.cpython-310.pyc ADDED
Binary file (4.21 kB). View file
 
EasyR1-new/verl/models/transformers/__pycache__/qwen2_vl.cpython-310.pyc ADDED
Binary file (7.7 kB). View file
 
EasyR1-new/verl/models/transformers/flash_attention_utils.py ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 The Fairseq Authors and the HuggingFace Inc. team
2
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
3
+ # Based on https://github.com/huggingface/transformers/blob/v4.49.0/src/transformers/modeling_flash_attention_utils.py
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ import inspect
18
+ import os
19
+ from typing import Optional, Tuple
20
+
21
+ import torch
22
+ import torch.distributed as dist
23
+ from transformers.modeling_flash_attention_utils import _flash_attention_forward, fa_peft_integration_check
24
+ from transformers.utils import is_flash_attn_2_available, is_flash_attn_greater_or_equal_2_10
25
+
26
+ from ...utils.ulysses import (
27
+ gather_heads_scatter_seq,
28
+ gather_seq_scatter_heads,
29
+ get_ulysses_sequence_parallel_group,
30
+ get_ulysses_sequence_parallel_world_size,
31
+ )
32
+
33
+
34
+ if is_flash_attn_2_available():
35
+ from flash_attn import flash_attn_func, flash_attn_varlen_func
36
+
37
+ _flash_supports_window_size = "window_size" in inspect.signature(flash_attn_func).parameters
38
+ _flash_supports_deterministic = "deterministic" in inspect.signature(flash_attn_func).parameters
39
+ _flash_deterministic_enabled = os.getenv("FLASH_ATTENTION_DETERMINISTIC", "0") == "1"
40
+ _flash_use_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
41
+
42
+
43
+ def prepare_fa2_from_position_ids(
44
+ query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, position_ids: torch.Tensor
45
+ ):
46
+ query = query.view(-1, query.size(-2), query.size(-1))
47
+ key = key.contiguous().view(-1, key.size(-2), key.size(-1))
48
+ value = value.contiguous().view(-1, value.size(-2), value.size(-1))
49
+ position_ids = position_ids.flatten()
50
+ indices_q = torch.arange(position_ids.size(0), device=position_ids.device, dtype=torch.int32)
51
+ cu_seqlens = torch.cat(
52
+ (
53
+ indices_q[position_ids == 0],
54
+ torch.tensor(position_ids.size(), device=position_ids.device, dtype=torch.int32),
55
+ )
56
+ )
57
+ max_length = cu_seqlens.diff().max() # use cu_seqlens to infer max_length for qwen2vl mrope
58
+ return (query, key, value, indices_q, (cu_seqlens, cu_seqlens), (max_length, max_length))
59
+
60
+
61
+ def _custom_flash_attention_forward(
62
+ query_states: torch.Tensor,
63
+ key_states: torch.Tensor,
64
+ value_states: torch.Tensor,
65
+ attention_mask: Optional[torch.Tensor],
66
+ query_length: int,
67
+ is_causal: bool = True,
68
+ position_ids: Optional[torch.Tensor] = None,
69
+ sliding_window: Optional[int] = None,
70
+ use_top_left_mask: bool = False,
71
+ deterministic: Optional[bool] = None,
72
+ **kwargs,
73
+ ):
74
+ """
75
+ Patches flash attention forward to handle 3D position ids in mrope. (3, batch_size, seq_length)
76
+ """
77
+ # Assuming 4D tensors, key_states.shape[1] is the key/value sequence length (source length).
78
+ use_sliding_windows = (
79
+ _flash_supports_window_size and sliding_window is not None and key_states.shape[1] > sliding_window
80
+ )
81
+ flash_kwargs = {"window_size": (sliding_window, sliding_window)} if use_sliding_windows else {}
82
+
83
+ if _flash_supports_deterministic:
84
+ flash_kwargs["deterministic"] = deterministic if deterministic is not None else _flash_deterministic_enabled
85
+
86
+ if kwargs.get("softcap") is not None:
87
+ flash_kwargs["softcap"] = kwargs.pop("softcap")
88
+
89
+ query_states, key_states, value_states = fa_peft_integration_check(
90
+ query_states, key_states, value_states, target_dtype=torch.bfloat16
91
+ )
92
+
93
+ sp_size = get_ulysses_sequence_parallel_world_size()
94
+ if sp_size > 1:
95
+ # (batch_size, seq_length, num_head, head_size)
96
+ query_states = gather_seq_scatter_heads(query_states, seq_dim=1, head_dim=2)
97
+ key_states = gather_seq_scatter_heads(key_states, seq_dim=1, head_dim=2)
98
+ value_states = gather_seq_scatter_heads(value_states, seq_dim=1, head_dim=2)
99
+ position_ids_lst = [torch.empty_like(position_ids) for _ in range(sp_size)]
100
+ position_ids = dist.all_gather(position_ids_lst, position_ids, group=get_ulysses_sequence_parallel_group())
101
+ position_ids = torch.cat(position_ids_lst, dim=-1) # (..., batch_size, seq_length)
102
+
103
+ if position_ids is not None and query_length != 1 and not (torch.diff(position_ids, dim=-1) >= 0).all():
104
+ batch_size = query_states.size(0)
105
+ query_states, key_states, value_states, _, cu_seq_lens, max_seq_lens = prepare_fa2_from_position_ids(
106
+ query_states, key_states, value_states, position_ids
107
+ )
108
+ cu_seqlens_q, cu_seqlens_k = cu_seq_lens
109
+ max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
110
+ attn_output = flash_attn_varlen_func(
111
+ query_states,
112
+ key_states,
113
+ value_states,
114
+ cu_seqlens_q=cu_seqlens_q,
115
+ cu_seqlens_k=cu_seqlens_k,
116
+ max_seqlen_q=max_seqlen_in_batch_q,
117
+ max_seqlen_k=max_seqlen_in_batch_k,
118
+ dropout_p=kwargs.pop("dropout", 0.0),
119
+ softmax_scale=kwargs.pop("softmax_scale", None),
120
+ causal=is_causal,
121
+ **flash_kwargs,
122
+ )
123
+ attn_output = attn_output.view(batch_size, -1, attn_output.size(-2), attn_output.size(-1))
124
+ else:
125
+ attn_output = _flash_attention_forward(
126
+ query_states,
127
+ key_states,
128
+ value_states,
129
+ attention_mask,
130
+ query_length,
131
+ is_causal=is_causal,
132
+ sliding_window=sliding_window,
133
+ use_top_left_mask=use_top_left_mask,
134
+ deterministic=deterministic,
135
+ **kwargs,
136
+ ) # do not pass position_ids to old flash_attention_forward
137
+
138
+ if sp_size > 1:
139
+ # (batch_size, seq_length, num_head, head_size)
140
+ attn_output = gather_heads_scatter_seq(attn_output, head_dim=2, seq_dim=1)
141
+
142
+ return attn_output
143
+
144
+
145
+ def flash_attention_forward(
146
+ module: torch.nn.Module,
147
+ query: torch.Tensor,
148
+ key: torch.Tensor,
149
+ value: torch.Tensor,
150
+ attention_mask: Optional[torch.Tensor],
151
+ dropout: float = 0.0,
152
+ scaling: Optional[float] = None,
153
+ sliding_window: Optional[int] = None,
154
+ softcap: Optional[float] = None,
155
+ **kwargs,
156
+ ) -> Tuple[torch.Tensor, None]:
157
+ # This is before the transpose
158
+ q_len = query.shape[2]
159
+
160
+ # FA2 uses non-transposed inputs
161
+ query = query.transpose(1, 2)
162
+ key = key.transpose(1, 2)
163
+ value = value.transpose(1, 2)
164
+
165
+ # FA2 always relies on the value set in the module, so remove it if present in kwargs to avoid passing it twice
166
+ kwargs.pop("is_causal", None)
167
+
168
+ attn_output = _custom_flash_attention_forward(
169
+ query,
170
+ key,
171
+ value,
172
+ attention_mask,
173
+ query_length=q_len,
174
+ is_causal=module.is_causal,
175
+ dropout=dropout,
176
+ softmax_scale=scaling,
177
+ sliding_window=sliding_window,
178
+ softcap=softcap,
179
+ use_top_left_mask=_flash_use_top_left_mask,
180
+ **kwargs,
181
+ )
182
+
183
+ return attn_output, None
EasyR1-new/verl/models/transformers/qwen2_vl.py ADDED
@@ -0,0 +1,356 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team
2
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
3
+ # Based on:
4
+ # https://github.com/huggingface/transformers/blob/v4.49.0/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+
18
+ from typing import Optional, Tuple
19
+
20
+ import torch
21
+
22
+ from ...utils.py_functional import is_transformers_version_greater_than
23
+ from .flash_attention_utils import flash_attention_forward
24
+
25
+
26
+ if is_transformers_version_greater_than("4.52.0"):
27
+ from transformers.models.qwen2_vl.modeling_qwen2_vl import (
28
+ Qwen2VLAttention,
29
+ Qwen2VLCausalLMOutputWithPast,
30
+ Qwen2VLForConditionalGeneration,
31
+ Qwen2VLModel,
32
+ Qwen2VLModelOutputWithPast,
33
+ apply_multimodal_rotary_pos_emb,
34
+ repeat_kv,
35
+ )
36
+ from transformers.models.qwen2_vl.processing_qwen2_vl import Qwen2VLProcessor
37
+ else:
38
+ from transformers.models.qwen2_vl.modeling_qwen2_vl import (
39
+ Qwen2VLAttention,
40
+ Qwen2VLCausalLMOutputWithPast,
41
+ Qwen2VLForConditionalGeneration,
42
+ apply_multimodal_rotary_pos_emb,
43
+ repeat_kv,
44
+ )
45
+
46
+
47
def get_rope_index(
    processor: "Qwen2VLProcessor",
    input_ids: torch.Tensor,
    image_grid_thw: Optional[torch.Tensor] = None,
    video_grid_thw: Optional[torch.Tensor] = None,
    second_per_grid_ts: Optional[torch.Tensor] = None,
    attention_mask: Optional[torch.Tensor] = None,
) -> torch.Tensor:
    """
    Gets the position ids for Qwen2-VL, it should be generated before sharding the sequence.
    The batch dim has been removed and the input_ids should be a 1D tensor representing a single example.
    https://github.com/huggingface/transformers/blob/v4.52.4/src/transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py#L1405

    Returns a (3, seq_len) tensor of M-RoPE position ids: one row each for the
    temporal, height, and width axes. Text tokens advance all three axes
    together; vision tokens get grid-structured positions.
    """
    spatial_merge_size = processor.image_processor.merge_size
    # One temporal position step per 1/2 second of video (Qwen2-VL convention).
    tokens_per_second = 2
    image_token_id = processor.tokenizer.convert_tokens_to_ids("<|image_pad|>")
    video_token_id = processor.tokenizer.convert_tokens_to_ids("<|video_pad|>")
    vision_start_token_id = processor.tokenizer.convert_tokens_to_ids("<|vision_start|>")
    if input_ids is not None and (image_grid_thw is not None or video_grid_thw is not None):
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)

        position_ids = torch.ones(3, input_ids.size(0), dtype=input_ids.dtype, device=input_ids.device)  # (3, seqlen)
        image_index, video_index = 0, 0
        # Drop padding; positions are computed over real tokens only and
        # scattered back through the attention mask at the end.
        input_ids = input_ids[attention_mask == 1]
        image_nums, video_nums = 0, 0
        # The token right after each <|vision_start|> tells image vs. video.
        vision_start_indices = torch.argwhere(input_ids == vision_start_token_id)
        vision_tokens = input_ids[vision_start_indices + 1]
        image_nums = (vision_tokens == image_token_id).sum()
        video_nums = (vision_tokens == video_token_id).sum()
        input_tokens = input_ids.tolist()
        llm_pos_ids_list: list = []
        st = 0  # start index of the text segment currently being processed
        remain_images, remain_videos = image_nums, video_nums
        for _ in range(image_nums + video_nums):
            # Find the next image/video pad token; whichever comes first is
            # the next vision block to assign grid positions to.
            if image_token_id in input_tokens and remain_images > 0:
                ed_image = input_tokens.index(image_token_id, st)
            else:
                ed_image = len(input_tokens) + 1
            if video_token_id in input_tokens and remain_videos > 0:
                ed_video = input_tokens.index(video_token_id, st)
            else:
                ed_video = len(input_tokens) + 1
            if ed_image < ed_video:
                t, h, w = (
                    image_grid_thw[image_index][0],
                    image_grid_thw[image_index][1],
                    image_grid_thw[image_index][2],
                )
                # Images have no temporal extent, so the time scale is zero.
                second_per_grid_t = 0
                image_index += 1
                remain_images -= 1
                ed = ed_image
            else:
                t, h, w = (
                    video_grid_thw[video_index][0],
                    video_grid_thw[video_index][1],
                    video_grid_thw[video_index][2],
                )
                if second_per_grid_ts is not None:
                    second_per_grid_t = second_per_grid_ts[video_index]
                else:
                    second_per_grid_t = 1.0

                video_index += 1
                remain_videos -= 1
                ed = ed_video

            # LLM-side grid: H/W are downscaled by the vision merge size.
            llm_grid_t, llm_grid_h, llm_grid_w = (
                t.item(),
                h.item() // spatial_merge_size,
                w.item() // spatial_merge_size,
            )
            text_len = ed - st

            # Text before the vision block: all three axes advance in lockstep.
            st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
            llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)

            # Vision block: build per-axis indices over the (t, h, w) grid.
            t_index = torch.arange(llm_grid_t).view(-1, 1).expand(-1, llm_grid_h * llm_grid_w)
            t_index = (t_index * second_per_grid_t * tokens_per_second).long().flatten()
            h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(llm_grid_t, -1, llm_grid_w).flatten()
            w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(llm_grid_t, llm_grid_h, -1).flatten()
            llm_pos_ids_list.append(torch.stack([t_index, h_index, w_index]) + text_len + st_idx)
            st = ed + llm_grid_t * llm_grid_h * llm_grid_w

        # Trailing text after the last vision block.
        if st < len(input_tokens):
            st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0
            text_len = len(input_tokens) - st
            llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx)

        llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1)
        # Scatter positions back onto the unpadded slots.
        position_ids[..., attention_mask == 1] = llm_positions.to(position_ids.device)
    else:
        # Text-only path: plain incremental positions shared by all three axes.
        if attention_mask is not None:
            position_ids = attention_mask.long().cumsum(-1) - 1
            position_ids.masked_fill_(attention_mask == 0, 1)
            position_ids = position_ids.unsqueeze(0).expand(3, -1).to(input_ids.device)
        else:
            position_ids = torch.arange(input_ids.shape[1], device=input_ids.device).view(1, -1).expand(3, -1)

    return position_ids
148
+
149
+
150
def qwen2_vl_attn_forward(
    self: "Qwen2VLAttention",
    hidden_states: torch.Tensor,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # will become mandatory in v4.46
    **kwargs,
) -> Tuple[torch.Tensor, None, None]:
    """Patched Qwen2-VL attention forward that routes through the custom
    flash-attention path (passing position ids so the packed/sequence-parallel
    variant can be used). Returns (attn_output, None, None) to match the
    transformers attention interface."""
    bsz, q_len, _ = hidden_states.size()  # q_len = seq_length / sp_size
    query_states = self.q_proj(hidden_states)  # (batch_size, seq_length / sp_size, num_heads * head_size)
    key_states = self.k_proj(hidden_states)
    value_states = self.v_proj(hidden_states)

    # Split heads and move to (batch, heads, seq, head_dim) for RoPE.
    query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
    key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
    value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)

    # Because the input can be padded, the absolute sequence length depends on the max position id.
    cos, sin = position_embeddings
    query_states, key_states = apply_multimodal_rotary_pos_emb(
        query_states, key_states, cos, sin, self.rope_scaling["mrope_section"]
    )
    # Expand KV heads to match the query head count (GQA -> MHA layout).
    key_states = repeat_kv(key_states, self.num_key_value_groups)
    value_states = repeat_kv(value_states, self.num_key_value_groups)
    dropout_rate = 0.0 if not self.training else self.attention_dropout

    # Sliding-window attention only applies past max_window_layers when enabled.
    sliding_window = None
    if (
        self.config.use_sliding_window
        and getattr(self.config, "sliding_window", None) is not None
        and self.layer_idx >= self.config.max_window_layers
    ):
        sliding_window = self.config.sliding_window

    attn_output, _ = flash_attention_forward(
        self,
        query_states,
        key_states,
        value_states,
        attention_mask,
        dropout=dropout_rate,
        sliding_window=sliding_window,
        position_ids=position_ids[0],  # important: pass position ids
    )  # (batch_size, seq_length, num_head / sp_size, head_size)
    attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
    attn_output = self.o_proj(attn_output)
    return attn_output, None, None
197
+
198
+
199
def _get_input_embeds(
    model: "Qwen2VLModel",
    input_ids: torch.LongTensor,
    attention_mask: Optional[torch.Tensor] = None,
    pixel_values: Optional[torch.FloatTensor] = None,
    pixel_values_videos: Optional[torch.FloatTensor] = None,
    image_grid_thw: Optional[torch.LongTensor] = None,
    video_grid_thw: Optional[torch.LongTensor] = None,
):
    """Build the input embeddings for Qwen2-VL by encoding images/videos with
    the vision tower and scattering the resulting features into the text
    embeddings at the image/video pad token positions.

    Returns (inputs_embeds, attention_mask), with attention_mask moved to the
    embedding device when present.

    Raises:
        ValueError: if the number of image/video pad tokens does not match the
            number of vision features produced by the tower.
    """
    inputs_embeds = model.get_input_embeddings()(input_ids)
    if pixel_values is not None:
        pixel_values = pixel_values.type(model.visual.dtype)
        image_embeds = model.visual(pixel_values, grid_thw=image_grid_thw)
        n_image_tokens = (input_ids == model.config.image_token_id).sum().item()
        n_image_features = image_embeds.shape[0]
        if n_image_tokens != n_image_features:
            raise ValueError(
                f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
            )

        # Replace every image-pad token embedding with the corresponding
        # vision feature (positions selected via a broadcasted boolean mask).
        mask = input_ids == model.config.image_token_id
        mask_unsqueezed = mask.unsqueeze(-1)
        mask_expanded = mask_unsqueezed.expand_as(inputs_embeds)
        image_mask = mask_expanded.to(inputs_embeds.device)

        image_embeds = image_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
        inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds)

    if pixel_values_videos is not None:
        pixel_values_videos = pixel_values_videos.type(model.visual.dtype)
        video_embeds = model.visual(pixel_values_videos, grid_thw=video_grid_thw)
        n_video_tokens = (input_ids == model.config.video_token_id).sum().item()
        n_video_features = video_embeds.shape[0]
        if n_video_tokens != n_video_features:
            raise ValueError(
                f"Video features and video tokens do not match: tokens: {n_video_tokens}, features {n_video_features}"
            )

        # Same scatter as above, for video-pad tokens.
        mask = input_ids == model.config.video_token_id
        mask_unsqueezed = mask.unsqueeze(-1)
        mask_expanded = mask_unsqueezed.expand_as(inputs_embeds)
        video_mask = mask_expanded.to(inputs_embeds.device)

        video_embeds = video_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
        inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds)

    if pixel_values is None and pixel_values_videos is None:
        # Text-only batch: run the vision tower on a tiny dummy image and add a
        # zero-weighted contribution — presumably to keep the visual parameters
        # in the autograd graph (e.g. for FSDP/DDP unused-parameter handling);
        # the 0.0 factor leaves the embeddings numerically unchanged.
        pixel_values = torch.zeros((16, 1176), dtype=inputs_embeds.dtype, device=inputs_embeds.device)
        image_grid_thw = torch.tensor([[1, 4, 4]], dtype=torch.long, device=inputs_embeds.device)
        image_embeds = model.visual(pixel_values, grid_thw=image_grid_thw)
        inputs_embeds += 0.0 * image_embeds.mean()

    if attention_mask is not None:
        attention_mask = attention_mask.to(inputs_embeds.device)

    return inputs_embeds, attention_mask
255
+
256
+
257
def qwen2_vl_forward_old(
    self: "Qwen2VLForConditionalGeneration",
    input_ids: torch.LongTensor,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    labels: Optional[torch.LongTensor] = None,
    pixel_values: Optional[torch.FloatTensor] = None,
    pixel_values_videos: Optional[torch.FloatTensor] = None,
    image_grid_thw: Optional[torch.LongTensor] = None,
    video_grid_thw: Optional[torch.LongTensor] = None,
    **kwargs,
) -> "Qwen2VLCausalLMOutputWithPast":
    """Forward pass for older transformers versions: merge the vision features
    into the text embeddings here, run the backbone on embeddings only, and
    project to vocabulary logits. Loss/cache/attentions are not computed."""
    inputs_embeds, attention_mask = _get_input_embeds(
        self, input_ids, attention_mask, pixel_values, pixel_values_videos, image_grid_thw, video_grid_thw
    )
    backbone_outputs = self.model(
        input_ids=None,  # embeddings were already built above
        attention_mask=attention_mask,
        position_ids=position_ids,
        inputs_embeds=inputs_embeds,
        **kwargs,
    )
    logits = self.lm_head(backbone_outputs[0])

    return Qwen2VLCausalLMOutputWithPast(
        loss=None,
        logits=logits,
        past_key_values=None,
        hidden_states=None,
        attentions=None,
        rope_deltas=None,
    )
290
+
291
+
292
def qwen2_vl_base_forward_new(
    self: "Qwen2VLModel",
    input_ids: torch.LongTensor,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    labels: Optional[torch.LongTensor] = None,
    pixel_values: Optional[torch.FloatTensor] = None,
    pixel_values_videos: Optional[torch.FloatTensor] = None,
    image_grid_thw: Optional[torch.LongTensor] = None,
    video_grid_thw: Optional[torch.LongTensor] = None,
    **kwargs,
):
    """Base-model forward for transformers >= 4.52: merge vision features into
    the text embeddings, then run the language model on embeddings only and
    repackage its outputs (rope_deltas is always None here)."""
    inputs_embeds, attention_mask = _get_input_embeds(
        self, input_ids, attention_mask, pixel_values, pixel_values_videos, image_grid_thw, video_grid_thw
    )
    lm_outputs = self.language_model(
        input_ids=None,  # embeddings are supplied directly
        position_ids=position_ids,
        attention_mask=attention_mask,
        inputs_embeds=inputs_embeds,
        **kwargs,
    )

    return Qwen2VLModelOutputWithPast(
        last_hidden_state=lm_outputs.last_hidden_state,
        past_key_values=lm_outputs.past_key_values,
        hidden_states=lm_outputs.hidden_states,
        attentions=lm_outputs.attentions,
        rope_deltas=None,
    )
322
+
323
+
324
def qwen2_vl_forward_new(
    self: "Qwen2VLForConditionalGeneration",
    input_ids: torch.LongTensor,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    labels: Optional[torch.LongTensor] = None,
    pixel_values: Optional[torch.FloatTensor] = None,
    pixel_values_videos: Optional[torch.FloatTensor] = None,
    image_grid_thw: Optional[torch.LongTensor] = None,
    video_grid_thw: Optional[torch.LongTensor] = None,
    **kwargs,
) -> "Qwen2VLCausalLMOutputWithPast":
    """Forward pass for transformers >= 4.52: the base model handles the
    vision/text merge itself, so all multimodal inputs are forwarded to it and
    only the LM head projection happens here. Loss is not computed."""
    base_outputs = self.model(
        input_ids=input_ids,
        pixel_values=pixel_values,
        pixel_values_videos=pixel_values_videos,
        image_grid_thw=image_grid_thw,
        video_grid_thw=video_grid_thw,
        position_ids=position_ids,
        attention_mask=attention_mask,
        **kwargs,
    )
    logits = self.lm_head(base_outputs[0])

    return Qwen2VLCausalLMOutputWithPast(
        loss=None,
        logits=logits,
        past_key_values=None,
        hidden_states=None,
        attentions=None,
        rope_deltas=None,
    )
EasyR1-new/verl/single_controller/__init__.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
EasyR1-new/verl/single_controller/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (166 Bytes). View file
 
EasyR1-new/verl/single_controller/base/__init__.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from .worker import Worker
16
+ from .worker_group import ClassWithInitArgs, ResourcePool, WorkerGroup
17
+
18
+
19
+ __all__ = ["ClassWithInitArgs", "ResourcePool", "Worker", "WorkerGroup"]
EasyR1-new/verl/single_controller/base/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (348 Bytes). View file
 
EasyR1-new/verl/single_controller/base/__pycache__/decorator.cpython-310.pyc ADDED
Binary file (6.17 kB). View file
 
EasyR1-new/verl/single_controller/base/__pycache__/worker.cpython-310.pyc ADDED
Binary file (6.51 kB). View file
 
EasyR1-new/verl/single_controller/base/__pycache__/worker_group.cpython-310.pyc ADDED
Binary file (6.86 kB). View file
 
EasyR1-new/verl/single_controller/base/decorator.py ADDED
@@ -0,0 +1,213 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from enum import Enum, auto
16
+ from functools import wraps
17
+ from types import FunctionType
18
+ from typing import TYPE_CHECKING, Dict, List, Literal, Union
19
+
20
+ import ray
21
+
22
+ from ...protocol import DataProto, DataProtoFuture
23
+
24
+
25
+ if TYPE_CHECKING:
26
+ from .worker_group import WorkerGroup
27
+
28
+
29
+ # here we add a magic number of avoid user-defined function already have this attribute
30
+ MAGIC_ATTR = "attrs_3141562937"
31
+
32
+
33
class Dispatch(Enum):
    """Predefined argument-dispatch / result-collection strategies for methods
    registered on a Worker (see get_predefined_dispatch_fn for the mapping)."""

    RANK_ZERO = auto()  # NOTE: has no entry in get_predefined_dispatch_fn's table
    ONE_TO_ALL = auto()  # broadcast identical args to every worker
    ALL_TO_ALL = auto()  # pass args/outputs through unchanged
    DP_COMPUTE = auto()  # args are already sharded per worker (plain lists)
    DP_COMPUTE_PROTO = auto()  # chunk DataProto args, concat DataProto outputs
    DP_COMPUTE_PROTO_WITH_FUNC = auto()  # like DP_COMPUTE_PROTO, but args[0] is a broadcast function
    DP_COMPUTE_METRIC = auto()  # chunk DataProto args, keep per-worker outputs as a list
41
+
42
+
43
class Execute(Enum):
    """Which workers execute a registered method (see get_predefined_execute_fn)."""

    ALL = 0  # resolved to the "execute_all" method name
    RANK_ZERO = 1  # resolved to the "execute_rank_zero" method name
46
+
47
+
48
def _split_args_kwargs_data_proto(chunks: int, *args, **kwargs):
    """Chunk every DataProto/DataProtoFuture argument into ``chunks`` pieces.

    Returns (list_of_chunked_args, dict_of_chunked_kwargs); each value becomes
    a sequence with one chunk per worker.
    """
    def _chunk(value):
        assert isinstance(value, (DataProto, DataProtoFuture))
        return value.chunk(chunks=chunks)

    splitted_args = [_chunk(arg) for arg in args]
    splitted_kwargs = {key: _chunk(value) for key, value in kwargs.items()}
    return splitted_args, splitted_kwargs
60
+
61
+
62
def dispatch_one_to_all(worker_group: "WorkerGroup", *args, **kwargs):
    """Broadcast dispatch: replicate every argument once per worker.

    Each value is expanded into a list of length ``world_size`` so that all
    workers receive an identical copy.
    """
    world_size = worker_group.world_size
    replicated_args = tuple([value] * world_size for value in args)
    replicated_kwargs = {name: [value] * world_size for name, value in kwargs.items()}
    return replicated_args, replicated_kwargs
66
+
67
+
68
def dispatch_all_to_all(worker_group: "WorkerGroup", *args, **kwargs):
    """Identity dispatch: forward the arguments to the workers unchanged."""
    return args, kwargs


def collect_all_to_all(worker_group: "WorkerGroup", output):
    """Identity collect: return the per-worker outputs unchanged."""
    return output
74
+
75
+
76
def _concat_data_proto_or_future(outputs: "List[DataProto]") -> "DataProto":
    """Merge per-worker outputs back into a single object.

    All elements must share exactly the same type; DataProto lists are
    concatenated eagerly, while lists of ray object refs are wrapped into a
    DataProtoFuture.
    """
    # make sure all the elements in output has the same type
    first = outputs[0]
    for output in outputs:
        assert type(output) is type(first)

    if isinstance(first, DataProto):
        return DataProto.concat(outputs)
    if isinstance(first, ray.ObjectRef):
        return DataProtoFuture.concat(outputs)
    raise NotImplementedError
89
+
90
+
91
def dispatch_dp_compute(worker_group: "WorkerGroup", *args, **kwargs):
    """Validate pre-sharded arguments for data-parallel compute.

    Every positional and keyword argument must already be a list/tuple with
    exactly one entry per worker; the arguments are forwarded unchanged.
    """
    expected = worker_group.world_size
    for value in (*args, *kwargs.values()):
        assert isinstance(value, (tuple, list)) and len(value) == expected

    return args, kwargs
99
+
100
+
101
def collect_dp_compute(worker_group: "WorkerGroup", outputs: "List[DataProto]") -> "List[DataProto]":
    """Check that exactly one output per worker was received, then pass through."""
    expected = worker_group.world_size
    assert len(outputs) == expected
    return outputs
104
+
105
+
106
def dispatch_dp_compute_data_proto(worker_group: "WorkerGroup", *args, **kwargs):
    """Shard DataProto arguments across workers by chunking along the batch dim."""
    return _split_args_kwargs_data_proto(worker_group.world_size, *args, **kwargs)
109
+
110
+
111
def dispatch_dp_compute_data_proto_with_func(worker_group: "WorkerGroup", *args, **kwargs):
    """Like dispatch_dp_compute_data_proto, except args[0] is a plain function
    that gets replicated (not chunked) to every worker."""
    func = args[0]
    assert type(func) is FunctionType  # NOTE: The first one args is a function!
    splitted_args, splitted_kwargs = _split_args_kwargs_data_proto(worker_group.world_size, *args[1:], **kwargs)
    splitted_args_with_func = [[func] * worker_group.world_size] + splitted_args
    return splitted_args_with_func, splitted_kwargs
116
+
117
+
118
def collect_dp_compute_data_proto(worker_group: "WorkerGroup", outputs: "List[DataProto]") -> "DataProto":
    """Concatenate per-worker DataProto (or ray object ref) outputs into one result."""
    for output in outputs:
        assert isinstance(output, (DataProto, ray.ObjectRef)), f"Expect a DataProto, but got {type(output)}"

    checked_outputs = collect_dp_compute(worker_group, outputs)
    return _concat_data_proto_or_future(checked_outputs)
124
+
125
+
126
def get_predefined_dispatch_fn(dispatch_mode: "Dispatch"):
    """Map a Dispatch mode to its (dispatch_fn, collect_fn) pair.

    Raises KeyError for modes without a predefined pair (e.g. RANK_ZERO).
    """
    mode_to_fns = {
        Dispatch.ONE_TO_ALL: (dispatch_one_to_all, collect_all_to_all),
        Dispatch.ALL_TO_ALL: (dispatch_all_to_all, collect_all_to_all),
        Dispatch.DP_COMPUTE: (dispatch_dp_compute, collect_dp_compute),
        Dispatch.DP_COMPUTE_PROTO: (dispatch_dp_compute_data_proto, collect_dp_compute_data_proto),
        Dispatch.DP_COMPUTE_PROTO_WITH_FUNC: (dispatch_dp_compute_data_proto_with_func, collect_dp_compute_data_proto),
        Dispatch.DP_COMPUTE_METRIC: (dispatch_dp_compute_data_proto, collect_dp_compute),
    }
    dispatch_fn, collect_fn = mode_to_fns[dispatch_mode]
    return {"dispatch_fn": dispatch_fn, "collect_fn": collect_fn}
154
+
155
+
156
def get_predefined_execute_fn(execute_mode: "Execute"):
    """
    Note that here we only asks execute_all and execute_rank_zero to be implemented
    Leave the choice of how these two functions handle argument 'blocking' to users
    """
    fn_name = {
        Execute.ALL: "execute_all",
        Execute.RANK_ZERO: "execute_rank_zero",
    }[execute_mode]
    return {"execute_fn_name": fn_name}
166
+
167
+
168
def _check_dispatch_mode(dispatch_mode: "Union[Dispatch, Dict[Literal['dispatch_fn', 'collect_fn'], FunctionType]]"):
    """Validate that dispatch_mode is a Dispatch member or a dict providing
    both a dispatch_fn and a collect_fn."""
    assert isinstance(dispatch_mode, (Dispatch, dict)), (
        f"dispatch_mode must be a Dispatch or a Dict. Got {dispatch_mode}"
    )
    if isinstance(dispatch_mode, dict):
        for key in ("dispatch_fn", "collect_fn"):
            assert key in dispatch_mode, f"key {key} should be in dispatch_mode if it is a dictionary"
176
+
177
+
178
def _check_execute_mode(execute_mode: "Execute"):
    """Validate that execute_mode is an Execute enum member."""
    assert isinstance(execute_mode, Execute), f"execute_mode must be a Execute. Got {execute_mode}"
180
+
181
+
182
def _materialize_futures(*args, **kwargs):
    """Resolve any DataProtoFuture arguments into concrete values via .get().

    Positional args are returned as a new tuple; kwargs are resolved in place.
    """
    resolved_args = tuple(
        arg.get() if isinstance(arg, DataProtoFuture) else arg  # add more types to materialize here if needed
        for arg in args
    )

    for key in kwargs:
        value = kwargs[key]
        if isinstance(value, DataProtoFuture):
            kwargs[key] = value.get()

    return resolved_args, kwargs
196
+
197
+
198
def register(dispatch_mode=Dispatch.ALL_TO_ALL, execute_mode=Execute.ALL, blocking=True, materialize_futures=True):
    """Decorator that tags a Worker method with dispatch/execute metadata.

    The metadata dict is stored on the wrapper under MAGIC_ATTR and later read
    by the WorkerGroup machinery to decide how to dispatch arguments and
    collect results. When materialize_futures is True, DataProtoFuture
    arguments are resolved before the wrapped method runs.
    """
    _check_dispatch_mode(dispatch_mode=dispatch_mode)
    _check_execute_mode(execute_mode=execute_mode)

    def decorator(func):
        @wraps(func)
        def inner(*args, **kwargs):
            if materialize_futures:
                args, kwargs = _materialize_futures(*args, **kwargs)
            return func(*args, **kwargs)

        setattr(
            inner,
            MAGIC_ATTR,
            {"dispatch_mode": dispatch_mode, "execute_mode": execute_mode, "blocking": blocking},
        )
        return inner

    return decorator
EasyR1-new/verl/single_controller/base/register_center/__init__.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
EasyR1-new/verl/single_controller/base/register_center/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (187 Bytes). View file
 
EasyR1-new/verl/single_controller/base/register_center/__pycache__/ray.cpython-310.pyc ADDED
Binary file (882 Bytes). View file
 
EasyR1-new/verl/single_controller/base/register_center/ray.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import ray
16
+
17
+
18
@ray.remote
class WorkerGroupRegisterCenter:
    """Named ray actor holding the rank-0 rendezvous info of a worker group
    (e.g. MASTER_ADDR/MASTER_PORT), so other ranks can look it up by name."""

    def __init__(self, rank_zero_info):
        # rank_zero_info: dict supplied by rank 0 (see Worker._configure_before_init)
        self.rank_zero_info = rank_zero_info

    def get_rank_zero_info(self):
        """Return the stored rank-0 info dict."""
        return self.rank_zero_info


def create_worker_group_register_center(name, info):
    # The actor is registered under `name` so peers can retrieve it via ray's
    # named-actor lookup.
    return WorkerGroupRegisterCenter.options(name=name).remote(info)
EasyR1-new/verl/single_controller/base/worker.py ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """
15
+ the class for Worker
16
+ """
17
+
18
+ import os
19
+ import socket
20
+ from dataclasses import dataclass
21
+ from typing import Tuple
22
+
23
+ import ray
24
+ import torch
25
+
26
+ from .decorator import Dispatch, Execute, register
27
+ from .register_center.ray import create_worker_group_register_center
28
+
29
+
30
@dataclass
class DistRankInfo:
    """Per-worker parallel ranks of one process."""

    # rank along the tensor-parallel dimension
    tp_rank: int
    # rank along the data-parallel dimension
    dp_rank: int
    # rank along the pipeline-parallel dimension
    pp_rank: int


@dataclass
class DistGlobalInfo:
    """Global parallel-group sizes shared by all workers."""

    # tensor-parallel group size
    tp_size: int
    # data-parallel group size
    dp_size: int
    # pipeline-parallel group size
    pp_size: int
42
+
43
+
44
class WorkerHelper:
    """Networking helpers used when bootstrapping a worker group rendezvous."""

    def _get_node_ip(self) -> str:
        """Return this node's IP, preferring explicit env overrides over ray's view."""
        host_ipv4 = os.getenv("MY_HOST_IP", None)
        host_ipv6 = os.getenv("MY_HOST_IPV6", None)
        host_ip_by_env = host_ipv4 or host_ipv6
        # NOTE: relies on ray's private API; may break across ray versions.
        host_ip_by_sdk = ray._private.services.get_node_ip_address()

        host_ip = host_ip_by_env or host_ip_by_sdk
        return host_ip

    def _get_free_port(self) -> int:
        """Ask the OS for a currently-free TCP port (small race window before use)."""
        with socket.socket() as sock:
            sock.bind(("", 0))
            return sock.getsockname()[1]

    def get_available_master_addr_port(self) -> Tuple[str, str]:
        """Return (node_ip, free_port_str) suitable for MASTER_ADDR/MASTER_PORT."""
        return self._get_node_ip(), str(self._get_free_port())

    # Backward-compatible alias: existing callers use the historical misspelling.
    get_availale_master_addr_port = get_available_master_addr_port

    def _get_pid(self):
        # Intentionally returns None; kept for interface compatibility.
        return
64
+
65
+
66
class WorkerMeta:
    """Projects a worker's metadata store onto a fixed set of `_lowercase` keys."""

    # Environment-style keys tracked for every worker.
    keys = [
        "WORLD_SIZE",
        "RANK",
        "LOCAL_WORLD_SIZE",
        "LOCAL_RANK",
        "MASTER_ADDR",
        "MASTER_PORT",
        "CUDA_VISIBLE_DEVICES",
    ]

    def __init__(self, store) -> None:
        self._store = store

    def to_dict(self):
        """Return {"_world_size": ..., ...}; keys absent from the store map to None."""
        result = {}
        for key in WorkerMeta.keys:
            attr_name = f"_{key.lower()}"
            result[attr_name] = self._store.get(attr_name, None)
        return result
82
+
83
+
84
+ # we assume that in each WorkerGroup, there is a Master Worker
85
# we assume that in each WorkerGroup, there is a Master Worker
class Worker(WorkerHelper):
    """A (distributed) worker.

    Reads its distributed identity (rank, world size, master address/port)
    from environment variables and re-exports it via os.environ. Rank 0
    additionally publishes MASTER_ADDR/MASTER_PORT through a named ray
    register-center actor during __new__.
    """

    _world_size: int
    _rank: int
    _local_world_size: int
    _local_rank: int
    _master_addr: str
    _master_port: str
    _cuda_visible_devices: str

    def __new__(cls, *args, **kwargs):
        instance = super().__new__(cls)

        # note that here we use int to distinguish
        disable_worker_init = int(os.getenv("DISABLE_WORKER_INIT", 0))
        if disable_worker_init:
            return instance

        rank = os.getenv("RANK", None)
        worker_group_prefix = os.getenv("WG_PREFIX", None)

        # when decorator @ray.remote applies, __new__ will be called while we don't want to apply _configure_before_init
        if None not in [rank, worker_group_prefix] and "ActorClass(" not in cls.__name__:
            instance._configure_before_init(f"{worker_group_prefix}_register_center", int(rank))

        return instance

    def _configure_before_init(self, register_center_name: str, rank: int):
        """On rank 0 only: pick a master addr/port, publish it via a named ray
        actor, and export it into this process's environment."""
        assert isinstance(rank, int), f"rank must be int, instead of {type(rank)}"

        if rank == 0:
            master_addr, master_port = self.get_availale_master_addr_port()
            rank_zero_info = {
                "MASTER_ADDR": master_addr,
                "MASTER_PORT": master_port,
            }
            # Keep a reference so the named actor is not garbage-collected.
            self.register_center = create_worker_group_register_center(name=register_center_name, info=rank_zero_info)
            os.environ.update(rank_zero_info)

    def __init__(self, cuda_visible_devices=None) -> None:
        # construct a meta from environment variables. Note that the import must be inside the class because it is executed remotely
        world_size = int(os.getenv("WORLD_SIZE"))  # required; raises if unset
        rank = int(os.getenv("RANK"))  # required; raises if unset
        self._rank = rank
        self._world_size = world_size

        # AMD/ROCm: translate ray's ROCR/RAY env vars into the CUDA-style ones
        # the rest of the stack expects, then pin the device.
        if "AMD" in torch.cuda.get_device_name():
            os.environ["CUDA_VISIBLE_DEVICES"] = os.getenv("ROCR_VISIBLE_DEVICES")
            os.environ["LOCAL_RANK"] = os.getenv("RAY_LOCAL_RANK")
            cuda_visible_devices = os.getenv("LOCAL_RANK", "0")
            torch.cuda.set_device(int(cuda_visible_devices))

        master_addr = os.getenv("MASTER_ADDR")
        master_port = os.getenv("MASTER_PORT")

        local_world_size = int(os.getenv("LOCAL_WORLD_SIZE", "1"))
        local_rank = int(os.getenv("LOCAL_RANK", "0"))

        store = {
            "_world_size": world_size,
            "_rank": rank,
            "_local_world_size": local_world_size,
            "_local_rank": local_rank,
            "_master_addr": master_addr,
            "_master_port": master_port,
        }
        if cuda_visible_devices is not None:
            store["_cuda_visible_devices"] = cuda_visible_devices

        meta = WorkerMeta(store=store)
        self._configure_with_meta(meta=meta)

    def _configure_with_meta(self, meta: WorkerMeta):
        """
        This function should only be called inside by WorkerGroup
        """
        assert isinstance(meta, WorkerMeta)
        self.__dict__.update(meta.to_dict())  # this is hacky
        # print(f"__dict__: {self.__dict__}")
        # Re-export every known meta value back into the process environment.
        for key in WorkerMeta.keys:
            val = self.__dict__.get(f"_{key.lower()}", None)
            if val is not None:
                # print(f"set {key} to {val}")
                os.environ[key] = str(val)

        # Strip IPv6 brackets from the master address for the redis store host.
        os.environ["REDIS_STORE_SERVER_HOST"] = (
            str(self._master_addr).replace("[", "").replace("]", "") if self._master_addr else ""
        )

    def get_master_addr_port(self):
        """Return the (master_addr, master_port) pair this worker was configured with."""
        return self._master_addr, self._master_port

    def get_cuda_visible_devices(self):
        """Return CUDA_VISIBLE_DEVICES from the environment, or "not set"."""
        cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES", "not set")
        return cuda_visible_devices

    def print_rank0(self, *args, **kwargs):
        """Print only on the global rank-0 worker."""
        if self.rank == 0:
            print(*args, **kwargs)

    @property
    def world_size(self):
        # total number of workers in the group
        return self._world_size

    @property
    def rank(self):
        # this worker's global rank
        return self._rank

    @register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO_WITH_FUNC)
    def execute_with_func_generator(self, func, *args, **kwargs):
        """Run a broadcast function with this worker as its first argument."""
        ret_proto = func(self, *args, **kwargs)
        return ret_proto

    @register(dispatch_mode=Dispatch.ALL_TO_ALL, execute_mode=Execute.RANK_ZERO)
    def execute_func_rank_zero(self, func, *args, **kwargs):
        """Run an arbitrary function on rank 0 only and return its result."""
        result = func(*args, **kwargs)
        return result
EasyR1-new/verl/single_controller/base/worker_group.py ADDED
@@ -0,0 +1,194 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """
15
+ the class of WorkerGroup
16
+ """
17
+
18
+ import logging
19
+ import signal
20
+ import threading
21
+ import time
22
+ from typing import Any, Callable, Dict, List, Optional
23
+
24
+ from .decorator import MAGIC_ATTR, Dispatch, get_predefined_dispatch_fn, get_predefined_execute_fn
25
+
26
+
27
class ResourcePool:
    """The resource pool with meta info such as world size.

    The pool is a flat list of per-node process counts, e.g. ``[8, 8]`` means
    two nodes hosting 8 processes each.
    """

    def __init__(
        self, process_on_nodes: Optional[Any] = None, max_colocate_count: int = 10, n_gpus_per_node: int = 8
    ) -> None:
        # None default (not []) avoids sharing one mutable list across instances
        if process_on_nodes is None:
            process_on_nodes = []

        self._store = process_on_nodes
        self.max_colocate_count = max_colocate_count
        self.n_gpus_per_node = n_gpus_per_node  # this is left for future huawei GPU that contains 16 GPUs per node

    def add_node(self, process_count):
        """Append one node holding *process_count* processes."""
        self._store.append(process_count)

    @property
    def world_size(self):
        """Total number of processes across all nodes."""
        return sum(self._store)

    def __call__(self) -> Any:
        return self._store

    @property
    def store(self):
        return self._store

    def local_world_size_list(self) -> List[int]:
        """One entry per process: the local world size of that process's node."""
        # single flat comprehension instead of nested-list-then-flatten
        return [local_world_size for local_world_size in self._store for _ in range(local_world_size)]

    def local_rank_list(self) -> List[int]:
        """One entry per process: its local rank within its node."""
        return [local_rank for local_world_size in self._store for local_rank in range(local_world_size)]
63
+
64
+
65
class ClassWithInitArgs:
    """
    Stores a class constructor together with the positional/keyword arguments
    needed to build an instance later; used to instantiate the remote class.
    """

    def __init__(self, cls, *args, **kwargs) -> None:
        self.cls = cls
        self.args = args
        self.kwargs = kwargs

    def __call__(self) -> Any:
        # Deferred construction: the instance is only built when called.
        return self.cls(*self.args, **self.kwargs)
78
+
79
+
80
def check_workers_alive(workers: List, is_alive: Callable, gap_time: float = 1) -> None:
    """Poll *workers* forever; whenever a worker is found dead, log a warning
    and raise SIGABRT in the main thread, then keep polling."""
    while True:
        for worker in workers:
            if is_alive(worker):
                continue
            logging.warning(f"Worker {worker} is not alive, sending signal to main thread")
            signal.raise_signal(signal.SIGABRT)

        time.sleep(gap_time)
88
+
89
+
90
class WorkerGroup:
    """A group of workers"""

    def __init__(self, resource_pool: ResourcePool, **kwargs) -> None:
        # A None resource pool means we attach to already-running (detached) workers.
        self._is_init_with_detached_workers = resource_pool is None

        # NOTE(review): keeps the historical "_procecss" spelling on purpose —
        # external code may reference this attribute name.
        self._procecss_dispatch_config = resource_pool() if resource_pool is not None else None

        self._workers = []
        self._worker_names = []

        self._master_addr = None
        self._master_port = None

        self._checker_thread: threading.Thread = None

    def _is_worker_alive(self, worker):
        raise NotImplementedError("WorkerGroup._is_worker_alive called, should be implemented in derived class.")

    def _block_until_all_workers_alive(self) -> None:
        """Spin (1s period) until every worker reports alive."""
        while True:
            # query every worker each round (matches the original polling pattern)
            states = [self._is_worker_alive(worker) for worker in self._workers]
            if all(states):
                break
            time.sleep(1)

    def start_worker_aliveness_check(self, every_n_seconds=1) -> None:
        # before starting checking worker aliveness, make sure all workers are already alive
        self._block_until_all_workers_alive()

        self._checker_thread = threading.Thread(
            target=check_workers_alive, args=(self._workers, self._is_worker_alive, every_n_seconds)
        )
        self._checker_thread.start()

    @property
    def world_size(self):
        return len(self._workers)

    def _bind_worker_method(self, user_defined_cls, func_generator):
        """
        Bind the worker method to the WorkerGroup
        """
        for method_name in dir(user_defined_cls):
            try:
                method = getattr(user_defined_cls, method_name)
                assert callable(method), f"{method_name} in {user_defined_cls} is not callable"
            except Exception:
                # properties raise on class-level getattr; skip them
                continue

            if not hasattr(method, MAGIC_ATTR):
                continue

            # this method is decorated by register
            attribute = getattr(method, MAGIC_ATTR)
            assert isinstance(attribute, Dict), f"attribute must be a dictionary. Got {type(attribute)}"
            assert "dispatch_mode" in attribute, "attribute must contain dispatch_mode in its key"

            dispatch_mode = attribute["dispatch_mode"]
            execute_mode = attribute["execute_mode"]
            blocking = attribute["blocking"]

            # resolve the dispatch/collect function pair
            if isinstance(dispatch_mode, Dispatch):
                predefined = get_predefined_dispatch_fn(dispatch_mode=dispatch_mode)
                dispatch_fn, collect_fn = predefined["dispatch_fn"], predefined["collect_fn"]
            else:
                assert isinstance(dispatch_mode, dict)
                assert "dispatch_fn" in dispatch_mode
                assert "collect_fn" in dispatch_mode
                dispatch_fn, collect_fn = dispatch_mode["dispatch_fn"], dispatch_mode["collect_fn"]

            # resolve the execute function by name on this worker group
            execute_mode = get_predefined_execute_fn(execute_mode=execute_mode)
            wg_execute_fn_name = execute_mode["execute_fn_name"]

            try:
                execute_fn = getattr(self, wg_execute_fn_name)
                assert callable(execute_fn), "execute_fn must be callable"
            except Exception:
                print(f"execute_fn {wg_execute_fn_name} is invalid")
                raise

            # bind a new method to the RayWorkerGroup
            func = func_generator(
                self,
                method_name,
                dispatch_fn=dispatch_fn,
                collect_fn=collect_fn,
                execute_fn=execute_fn,
                blocking=blocking,
            )

            try:
                setattr(self, method_name, func)
            except Exception:
                raise ValueError(f"Fail to set method_name {method_name}")
EasyR1-new/verl/single_controller/ray/__init__.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ from .base import RayClassWithInitArgs, RayResourcePool, RayWorkerGroup, create_colocated_worker_cls
16
+
17
+
18
+ __all__ = ["RayClassWithInitArgs", "RayResourcePool", "RayWorkerGroup", "create_colocated_worker_cls"]
EasyR1-new/verl/single_controller/ray/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (327 Bytes). View file
 
EasyR1-new/verl/single_controller/ray/__pycache__/base.cpython-310.pyc ADDED
Binary file (18.1 kB). View file
 
EasyR1-new/verl/single_controller/ray/base.py ADDED
@@ -0,0 +1,493 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import os
16
+ import random
17
+ import re
18
+ import string
19
+ import time
20
+ from typing import Any, Dict, List, Optional, Tuple
21
+ from unittest.mock import patch
22
+
23
+ import ray
24
+ from ray.actor import ActorHandle
25
+ from ray.experimental.state.api import get_actor
26
+ from ray.util import list_named_actors
27
+ from ray.util.placement_group import PlacementGroup, placement_group
28
+ from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy, PlacementGroupSchedulingStrategy
29
+
30
+ from ..base import ClassWithInitArgs, ResourcePool, Worker, WorkerGroup
31
+ from ..base.decorator import MAGIC_ATTR
32
+
33
+
34
+ __all__ = ["Worker"]
35
+
36
+
37
def get_random_string(length: int) -> str:
    """Return a random alphanumeric string of the given length."""
    alphabet = string.ascii_letters + string.digits
    return "".join(random.choices(alphabet, k=length))
40
+
41
+
42
def func_generator(self, method_name, dispatch_fn, collect_fn, execute_fn, blocking):
    """Build a proxy: dispatch inputs -> execute remotely -> (optionally block) -> collect."""

    def func(*args, **kwargs):
        sharded_args, sharded_kwargs = dispatch_fn(self, *args, **kwargs)
        output = execute_fn(method_name, *sharded_args, **sharded_kwargs)
        if blocking:
            # resolve the ray object refs before collecting
            output = ray.get(output)
        return collect_fn(self, output)

    return func
52
+
53
+
54
def sort_placement_group_by_node_ip(pgs: List[PlacementGroup]) -> List[PlacementGroup]:
    """
    Sort the placement groups by node ip; all bundles in a single placement
    group are assumed to live on the same node.

    FSDPCheckpointManager saves sharded model/optimizer states in local
    storage, which requires RANK to be consistent across nodes when resuming
    from a checkpoint. With a single resource pool and no node change, this
    keeps RANK stable across ray jobs even after a full cluster restart.
    """
    node_ip_by_id = {node["NodeID"]: node["NodeManagerAddress"] for node in ray.nodes()}
    ip_by_pg_id = {}
    for pg in pgs:
        specs = ray._private.state.state.placement_group_table(pg.id)
        # all bundles should be on the same node, so bundle 0 is representative
        ip_by_pg_id[pg.id] = node_ip_by_id[specs["bundles_to_node_id"][0]]

    return sorted(pgs, key=lambda pg: ip_by_pg_id[pg.id])
73
+
74
+
75
class RayResourcePool(ResourcePool):
    """ResourcePool backed by ray placement groups (one group per node)."""

    def __init__(
        self,
        process_on_nodes: List[int] = None,
        use_gpu: bool = True,
        name_prefix: str = "",
        max_colocate_count: int = 5,
        detached: bool = False,
    ) -> None:
        super().__init__(process_on_nodes, max_colocate_count)
        self.use_gpu = use_gpu
        self.name_prefix = name_prefix
        self.pgs = None  # placement groups are created lazily
        self.detached = detached

    def get_placement_groups(self, strategy: str = "STRICT_PACK", name: Optional[str] = None) -> List[PlacementGroup]:
        """Create (once) and return one placement group per node; subsequent
        calls return the cached groups."""
        if self.pgs is not None:
            return self.pgs

        pg_name_prefix = (
            name if name else f"{self.name_prefix}verl_group_{'_'.join([str(count) for count in self._store])}:"
        )
        # one bundle dict per process on each node
        pg_scheme = []
        for process_count in self._store:
            if self.use_gpu:
                bundles = [{"CPU": self.max_colocate_count, "GPU": 1} for _ in range(process_count)]
            else:
                bundles = [{"CPU": self.max_colocate_count} for _ in range(process_count)]
            pg_scheme.append(bundles)

        lifetime = "detached" if self.detached else None

        pgs = [
            placement_group(bundles=bundles, strategy=strategy, name=pg_name_prefix + str(idx), lifetime=lifetime)
            for idx, bundles in enumerate(pg_scheme)
        ]

        # block until every placement group is scheduled
        ray.get([pg.ready() for pg in pgs])

        self.pgs = pgs
        return pgs
118
+
119
+
120
def extract_pg_from_exist(
    resource_pools: Dict[str, RayResourcePool], src_role_names: List[str], resource_pool: RayResourcePool
) -> List[PlacementGroup]:
    """Reuse placement groups owned by the pools whose role name is in
    *src_role_names* to satisfy the node requests of *resource_pool*."""
    # NOTE: get_placement_groups() is invoked on EVERY pool (it lazily creates
    # the groups), matching the original comprehension's evaluation order; the
    # role-name filter only decides which results are kept.
    src_pgs = []
    for role_name, pool in resource_pools.items():
        pgs = pool.get_placement_groups()
        if role_name in src_role_names:
            src_pgs.extend(pgs)

    # match the largest node requests against the largest available groups first
    sorted_src_pgs = sorted(src_pgs, key=lambda pg: pg.bundle_count, reverse=True)
    sorted_process_on_nodes = sorted([(val, idx) for idx, val in enumerate(resource_pool.store)], reverse=True)

    unsorted_pgs: List[Tuple[int, PlacementGroup]] = []
    for searching_idx, (request_process, original_idx) in enumerate(sorted_process_on_nodes):
        assert searching_idx < len(sorted_src_pgs), f"no enough nodes for request: searching {searching_idx} th node"
        candidate = sorted_src_pgs[searching_idx]
        assert request_process <= candidate.bundle_count, (
            f"requesting {request_process} processes, bundle count cannot satisfy"
        )
        unsorted_pgs.append((original_idx, candidate))

    # restore the caller's original node order
    return [pg for _, pg in sorted(unsorted_pgs)]
144
+
145
+
146
def merge_resource_pool(rp1: RayResourcePool, rp2: RayResourcePool) -> RayResourcePool:
    """Merge two compatible RayResourcePools into one pool that owns the
    placement groups of both, preserving rp1's nodes before rp2's.

    Raises:
        AssertionError: if the pools differ in GPU usage, max_colocate_count,
            n_gpus_per_node or detach mode.
    """
    assert rp1.use_gpu == rp2.use_gpu, "Both RayResourcePool must either use_gpu or not"
    assert rp1.max_colocate_count == rp2.max_colocate_count, (
        "Both RayResourcePool must has the same max_colocate_count"
    )
    assert rp1.n_gpus_per_node == rp2.n_gpus_per_node, "Both RayResourcePool must has the same n_gpus_per_node"
    assert rp1.detached == rp2.detached, "Detached ResourcePool cannot be merged with non-detached ResourcePool"

    new_store = rp1.store + rp2.store

    # BUGFIX: propagate max_colocate_count and detached to the merged pool.
    # Previously they were asserted equal above but then silently dropped,
    # so the merged pool fell back to the constructor defaults (5 / False).
    merged = RayResourcePool(
        new_store,
        rp1.use_gpu,
        f"{rp1.name_prefix}_{rp2.name_prefix}",
        max_colocate_count=rp1.max_colocate_count,
        detached=rp1.detached,
    )
    merged.pgs = rp1.get_placement_groups() + rp2.get_placement_groups()

    return merged
160
+
161
+
162
class RayClassWithInitArgs(ClassWithInitArgs):
    """ClassWithInitArgs specialized for ray actors: carries ray actor
    ``options`` plus extra custom resources, and knows how to schedule the
    actor either into a placement-group bundle or next to an existing worker.
    """

    def __init__(self, cls, *args, **kwargs) -> None:
        super().__init__(cls, *args, **kwargs)
        self._options = {}
        self._additional_resource = {}

    def set_additional_resource(self, additional_resource):
        """Set custom ray resources (e.g. ``{"TPU": 1}``) required by the actor."""
        self._additional_resource = additional_resource

    def update_options(self, options: Dict):
        """Merge extra ray actor options (runtime_env, name, lifetime, ...)."""
        self._options.update(options)

    def __call__(
        self,
        placement_group: PlacementGroup,
        placement_group_bundle_idx: int,
        use_gpu: bool = True,
        num_gpus: int = 1,
        sharing_with: Worker = None,
    ) -> Any:
        """Create the remote actor.

        If *sharing_with* is given, colocate the new actor on that worker's
        node (reusing its CUDA_VISIBLE_DEVICES); otherwise schedule it into
        the given placement-group bundle.
        """
        if sharing_with is not None:
            target_node_id = ray.get(sharing_with.get_node_id.remote())
            cuda_visible_devices = ray.get(sharing_with.get_cuda_visible_devices.remote())
            options = {"scheduling_strategy": NodeAffinitySchedulingStrategy(node_id=target_node_id, soft=False)}
            return self.cls.options(**options).remote(
                *self.args, cuda_visible_devices=cuda_visible_devices, **self.kwargs
            )

        options = {
            "scheduling_strategy": PlacementGroupSchedulingStrategy(
                placement_group=placement_group, placement_group_bundle_index=placement_group_bundle_idx
            )
        }
        options.update(self._options)

        if use_gpu:
            options["num_gpus"] = num_gpus

        # BUGFIX: was `len(self._additional_resource) > 1`, which silently
        # ignored the additional resources when exactly one entry was set.
        if len(self._additional_resource) >= 1:
            for k, v in self._additional_resource.items():
                options[k] = v

        return self.cls.options(**options).remote(*self.args, **self.kwargs)
209
+
210
+
211
class RayWorkerGroup(WorkerGroup):
    """A WorkerGroup whose workers are ray actors scheduled into placement groups."""

    def __init__(
        self,
        resource_pool: RayResourcePool = None,
        ray_cls_with_init: RayClassWithInitArgs = None,
        bin_pack: bool = True,
        name_prefix: str = None,
        detached: bool = False,
        worker_names: List[str] = None,
        **kwargs,
    ) -> None:
        """Spawn workers from *resource_pool*, or attach to existing detached
        actors when *worker_names* is given (and resource_pool is None)."""
        super().__init__(resource_pool=resource_pool, **kwargs)
        self.ray_cls_with_init = ray_cls_with_init
        self.name_prefix = get_random_string(length=6) if name_prefix is None else name_prefix

        if worker_names is not None:
            # explicit names are only meaningful when attaching to detached workers
            assert self._is_init_with_detached_workers
            self._worker_names = worker_names

        if self._is_init_with_detached_workers:
            self._init_with_detached_workers(worker_names=worker_names)
        else:
            self._init_with_resource_pool(
                resource_pool=resource_pool, ray_cls_with_init=ray_cls_with_init, bin_pack=bin_pack, detached=detached
            )

        if ray_cls_with_init is not None:
            self._bind_worker_method(self.ray_cls_with_init.cls, func_generator)

    def _is_worker_alive(self, worker: ActorHandle) -> bool:
        """Return True iff ray's state API reports the actor as ALIVE."""
        worker_state_dict = get_actor(worker._actor_id.hex())
        return worker_state_dict.get("state", "undefined") == "ALIVE" if worker_state_dict is not None else False

    def _init_with_detached_workers(self, worker_names: List[str]) -> None:
        # look up already-running named actors instead of creating new ones
        workers = [ray.get_actor(name=name) for name in worker_names]
        self._workers = workers
        self._world_size = len(worker_names)

    def _init_with_resource_pool(
        self, resource_pool: RayResourcePool, ray_cls_with_init: RayClassWithInitArgs, bin_pack: bool, detached: bool
    ):
        """Create one ray actor per process slot described by *resource_pool*."""
        use_gpu = resource_pool.use_gpu

        strategy = "PACK"
        if bin_pack:
            strategy = "STRICT_PACK"

        pgs = resource_pool.get_placement_groups(strategy=strategy)
        world_size = resource_pool.world_size
        self._world_size = world_size
        # fractional GPU so up to max_colocate_count actors can share one device
        num_gpus = 1 / resource_pool.max_colocate_count

        rank = -1
        local_world_size = resource_pool.store[0]
        for pg_idx, pg in enumerate(sort_placement_group_by_node_ip(pgs)):
            # BUGFIX: this assertion message used to be a truncated f-string
            # ("when generating for {...}, for the ").
            assert local_world_size <= pg.bundle_count, (
                f"when generating workers for {self.name_prefix}, placement group {pg_idx} has "
                f"{pg.bundle_count} bundles, fewer than the local world size {local_world_size}"
            )
            for local_rank in range(local_world_size):
                rank += 1

                # we pass in environment variable at option so that Worker can use environment variable to set
                env_vars = {
                    "WORLD_SIZE": str(world_size),
                    "RANK": str(rank),
                    "WG_PREFIX": self.name_prefix,
                    "WG_BACKEND": "ray",
                    "RAY_LOCAL_WORLD_SIZE": str(local_world_size),
                    "RAY_LOCAL_RANK": str(local_rank),
                }
                if rank != 0:
                    # non-zero ranks get the rendezvous address discovered from rank 0
                    env_vars["MASTER_ADDR"] = self._master_addr
                    env_vars["MASTER_PORT"] = self._master_port

                cia_name = type(ray_cls_with_init.cls).__name__
                match = re.search(r"ActorClass\(([^)]+)\)", cia_name)  # ray.remote(Obj) -> "ActorClass(Obj)"
                cia_name = match.group(1) if match else cia_name  # "ActorClass(Obj)" -> "Obj"
                name = f"{self.name_prefix}{cia_name}_{pg_idx}:{local_rank}"  # e.g. Worker_2:5

                ray_cls_with_init.update_options({"runtime_env": {"env_vars": env_vars}, "name": name})

                if detached:
                    ray_cls_with_init.update_options({"lifetime": "detached"})

                # create a worker
                worker = ray_cls_with_init(
                    placement_group=pg, placement_group_bundle_idx=local_rank, use_gpu=use_gpu, num_gpus=num_gpus
                )
                self._workers.append(worker)
                self._worker_names.append(name)

                if rank == 0:
                    # wait (up to ~120s) for rank 0's register center actor,
                    # then fetch MASTER_ADDR / MASTER_PORT from it
                    register_center_actor = None
                    for _ in range(120):
                        if f"{self.name_prefix}_register_center" not in list_named_actors():
                            time.sleep(1)
                        else:
                            register_center_actor = ray.get_actor(f"{self.name_prefix}_register_center")
                            break
                    assert register_center_actor is not None, (
                        f"failed to get register_center_actor: {self.name_prefix}_register_center in {list_named_actors(all_namespaces=True)}"
                    )
                    rank_zero_info = ray.get(register_center_actor.get_rank_zero_info.remote())
                    self._master_addr, self._master_port = rank_zero_info["MASTER_ADDR"], rank_zero_info["MASTER_PORT"]

    @property
    def worker_names(self):
        return self._worker_names

    @classmethod
    def from_detached(cls, worker_names=None, ray_cls_with_init=None):
        """Build a worker group attached to existing detached named actors."""
        worker_group = cls(
            resource_pool=None, ray_cls_with_init=ray_cls_with_init, name_prefix=None, worker_names=worker_names
        )
        return worker_group

    def spawn(self, prefix_set):
        """
        spawn to a dictionary of worker groups, each with a subset of method with prefix.
        """

        def _rebind_actor_methods(worker_group, actor_name):
            """Bind each '<actor_name>_<method>' back to its original '<method>' name."""
            prefix: str = actor_name + "_"
            for method_name in dir(worker_group):
                if method_name.startswith(prefix):
                    # only valid when Python >= 3.9
                    original_method_name = method_name.removeprefix(prefix)
                    method = getattr(worker_group, method_name)
                    setattr(worker_group, original_method_name, method)

        new_worker_group_dict = {}
        for prefix in prefix_set:
            new_worker_group = self.from_detached(
                worker_names=self._worker_names, ray_cls_with_init=self.ray_cls_with_init
            )

            _rebind_actor_methods(new_worker_group, prefix)
            new_worker_group_dict[prefix] = new_worker_group
        return new_worker_group_dict

    def execute_rank_zero_sync(self, method_name: str, *args, **kwargs):
        """Run *method_name* on the rank-0 worker and block for the result."""
        return ray.get(self.execute_rank_zero_async(method_name, *args, **kwargs))

    def execute_rank_zero_async(self, method_name: str, *args, **kwargs):
        """Run *method_name* on the rank-0 worker; returns an ObjectRef."""
        remote_call = getattr(self._workers[0], method_name)
        return remote_call.remote(*args, **kwargs)

    def execute_rank_zero(self, method_name: str, *args, **kwargs):
        # alias of the async variant
        return self.execute_rank_zero_async(method_name, *args, **kwargs)

    def execute_all(self, method_name: str, *args, **kwargs):
        # alias of the async variant
        return self.execute_all_async(method_name, *args, **kwargs)

    def execute_all_sync(self, method_name: str, *args, **kwargs):
        """Run *method_name* on every worker and block for all results."""
        return ray.get(self.execute_all_async(method_name, *args, **kwargs))

    def execute_all_async(self, method_name: str, *args, **kwargs):
        # Here we assume that if all the parameters in args and kwargs are lists,
        # and the lengths of all these lists are the same as len(self._workers),
        # then we will send each element in the list to the corresponding worker.
        length = len(self._workers)
        if all(isinstance(arg, list) for arg in args) and all(isinstance(kwarg, list) for kwarg in kwargs.values()):
            if all(len(arg) == length for arg in args) and all(len(kwarg) == length for kwarg in kwargs.values()):
                result = []
                for i in range(length):
                    sliced_args = tuple(arg[i] for arg in args)
                    sliced_kwargs = {k: v[i] for k, v in kwargs.items()}
                    remote_call = getattr(self._workers[i], method_name)
                    result.append(remote_call.remote(*sliced_args, **sliced_kwargs))
                return result

        return [getattr(worker, method_name).remote(*args, **kwargs) for worker in self._workers]

    @property
    def master_address(self):
        return self._master_addr

    @property
    def master_port(self):
        return self._master_port

    @property
    def workers(self):
        return self._workers

    @property
    def world_size(self):
        return self._world_size
406
+
407
+
408
+ """
409
+ Utilities that enables creating workers inside the same ray.Actor,
410
+ with code written in separate ray.Actors.
411
+ """
412
+
413
+
414
def _bind_workers_method_to_parent(cls, key, user_defined_cls):
    """
    Attach every register-decorated method of *user_defined_cls* to *cls*
    (the WorkerDict) under the name ``"<key>_<method>"``, delegating to the
    worker instance stored under ``key``.
    """
    for method_name in dir(user_defined_cls):
        try:
            method = getattr(user_defined_cls, method_name)
            assert callable(method), f"{method_name} in {user_defined_cls} is not callable"
        except Exception:
            # properties raise on class-level getattr; skip them
            continue

        if not hasattr(method, MAGIC_ATTR):
            continue

        def generate_function(name):
            def func(self, *args, **kwargs):
                # dispatch to the actual worker held in this WorkerDict
                return getattr(self.worker_dict[key], name)(*args, **kwargs)

            return func

        func = generate_function(method_name)
        # propagate MAGIC_ATTR so the outer worker group can re-dispatch it
        setattr(func, MAGIC_ATTR, getattr(method, MAGIC_ATTR))
        try:
            setattr(cls, key + "_" + method_name, func)
        except Exception:
            raise ValueError(f"Fail to set method_name {method_name}")
445
+
446
+
447
def _unwrap_ray_remote(cls):
    """Return the plain user class behind a ray.remote actor class; pass
    non-actor classes through unchanged."""
    return getattr(cls, "__ray_actor_class__", cls)
451
+
452
+
453
def create_colocated_worker_cls(class_dict: dict[str, RayClassWithInitArgs]):
    """
    Build a RayClassWithInitArgs whose single actor hosts one instance of
    every class in *class_dict* (same process), delegating calls to each.
    """
    cls_dict = {}
    init_args_dict = {}
    worker_cls = None
    for key, cls in class_dict.items():
        base = cls.cls.__ray_actor_class__.__base__
        if worker_cls is None:
            worker_cls = base
        else:
            # every colocated class must share one Worker base class
            assert worker_cls == base, "the worker class should be the same when share the same process"
        cls_dict[key] = cls.cls
        init_args_dict[key] = {"args": cls.args, "kwargs": cls.kwargs}

    assert cls_dict.keys() == init_args_dict.keys()

    # TODO: create a class with customizable name
    class WorkerDict(worker_cls):
        def __init__(self):
            super().__init__()
            self.worker_dict = {}
            for key, user_defined_cls in cls_dict.items():
                user_defined_cls = _unwrap_ray_remote(user_defined_cls)
                # instantiate in-process; suppress the worker's own distributed init
                with patch.dict(os.environ, {"DISABLE_WORKER_INIT": "1"}):
                    self.worker_dict[key] = user_defined_cls(
                        *init_args_dict[key].get("args", ()), **init_args_dict[key].get("kwargs", {})
                    )

    # monkey-patch each inner worker's registered methods onto WorkerDict
    for key, user_defined_cls in cls_dict.items():
        _bind_workers_method_to_parent(WorkerDict, key, _unwrap_ray_remote(user_defined_cls))

    return RayClassWithInitArgs(cls=ray.remote(WorkerDict))
EasyR1-new/verl/trainer/__init__.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
EasyR1-new/verl/trainer/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (156 Bytes). View file
 
EasyR1-new/verl/trainer/__pycache__/config.cpython-310.pyc ADDED
Binary file (5.08 kB). View file
 
EasyR1-new/verl/trainer/__pycache__/core_algos.cpython-310.pyc ADDED
Binary file (15.6 kB). View file
 
EasyR1-new/verl/trainer/__pycache__/data_loader.cpython-310.pyc ADDED
Binary file (2.79 kB). View file
 
EasyR1-new/verl/trainer/__pycache__/main.cpython-310.pyc ADDED
Binary file (3.28 kB). View file
 
EasyR1-new/verl/trainer/__pycache__/metrics.cpython-310.pyc ADDED
Binary file (3.74 kB). View file
 
EasyR1-new/verl/trainer/__pycache__/ray_trainer.cpython-310.pyc ADDED
Binary file (21.2 kB). View file
 
EasyR1-new/verl/trainer/config.py ADDED
@@ -0,0 +1,179 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """
15
+ PPO config
16
+ """
17
+
18
+ import os
19
+ from dataclasses import asdict, dataclass, field, fields, is_dataclass
20
+ from typing import Optional, Tuple
21
+
22
+ from ..workers.config import WorkerConfig
23
+
24
+
25
def recursive_post_init(dataclass_obj):
    """Invoke ``post_init`` on a dataclass, then recurse depth-first into all dataclass fields."""
    if hasattr(dataclass_obj, "post_init"):
        dataclass_obj.post_init()

    # descend into nested dataclass fields so every level gets its hook called
    for field_info in fields(dataclass_obj):
        child = getattr(dataclass_obj, field_info.name)
        if is_dataclass(child):
            recursive_post_init(child)
32
+
33
+
34
@dataclass
class DataConfig:
    """Dataset, length-budget and preprocessing options for RL training."""

    train_files: str = ""
    val_files: str = ""
    prompt_key: str = "prompt"
    answer_key: str = "answer"
    protein_key: str = "protein"
    image_key: str = "images"
    video_key: str = "videos"
    image_dir: Optional[str] = None
    video_fps: float = 2.0
    max_prompt_length: int = 512
    max_response_length: int = 512
    rollout_batch_size: int = 512
    mini_rollout_batch_size: Optional[int] = None
    val_batch_size: int = -1
    format_prompt: Optional[str] = None
    override_chat_template: Optional[str] = None
    shuffle: bool = True
    seed: int = 1
    min_pixels: Optional[int] = 262144
    max_pixels: Optional[int] = 4194304
    filter_overlong_prompts: bool = True
    filter_overlong_prompts_workers: int = 16

    def post_init(self):
        """Resolve optional paths to absolute form; drop paths that do not exist.

        Ray jobs require absolute paths, so resolution happens while the CWD is still valid.
        """
        if self.image_dir is not None:
            if not os.path.exists(self.image_dir):
                print(f"Image directory {self.image_dir} not found.")
                self.image_dir = None
            else:
                self.image_dir = os.path.abspath(self.image_dir)

        if self.format_prompt is not None:
            if not os.path.exists(self.format_prompt):
                print(f"Format prompt file {self.format_prompt} not found.")
                self.format_prompt = None
            else:
                self.format_prompt = os.path.abspath(self.format_prompt)
73
+
74
+
75
@dataclass
class AlgorithmConfig:
    """Advantage-estimation and KL-regularization hyperparameters."""

    # discount factor for ppo gae advantage estimator
    gamma: float = 1.0
    # lambda value for ppo gae advantage estimator
    lam: float = 1.0
    # advantage estimator, support `gae`, `grpo`, `reinforce_plus_plus`, `remax`, `rloo`
    adv_estimator: str = "grpo"
    # disable reference model
    disable_kl: bool = False
    # use kl loss instead of kl in reward
    use_kl_loss: bool = False
    # kl penalty type, support `kl`, `abs`, `mse`, `low_var_kl`, `full`
    kl_penalty: str = "kl"
    # kl coefficient
    kl_coef: float = 1e-3
    # kl controller type, support `fixed`, `adaptive`
    kl_type: str = "fixed"
    # kl horizon for adaptive kl controller
    kl_horizon: float = 10000.0
    # target kl for adaptive kl controller
    kl_target: float = 0.1
    # use online filtering
    online_filtering: bool = False
    # reward key for filtering samples
    filter_key: str = "overall"
    # filter out low reward samples if online filtering
    filter_low: float = 0.01
    # filter out high reward samples if online filtering
    filter_high: float = 0.99
105
+
106
+
107
@dataclass
class TrainerConfig:
    """Run-level options: epochs, logging, checkpointing and validation cadence."""

    total_epochs: int = 15
    """total epochs for training"""
    max_steps: Optional[int] = None
    """max steps for training, if specified, total_epochs is ignored"""
    project_name: str = "easy_r1"
    """project name for logger"""
    experiment_name: str = "demo"
    """experiment name for logger"""
    # FIX: annotation was Tuple[str] (exactly one element); the default has two,
    # so the correct variadic annotation is Tuple[str, ...]
    logger: Tuple[str, ...] = ("console", "wandb")
    """logger type, support `console`, `mlflow`, `swanlab`, `tensorboard`, `wandb`"""
    nnodes: int = 1
    """number of nodes for training"""
    n_gpus_per_node: int = 8
    """number of gpus per node for training"""
    max_try_make_batch: int = 20
    """max number of generations for online filtering, -1 means no limit"""
    critic_warmup: int = 0
    """critic warmup steps"""
    val_freq: int = -1
    """validation frequency, -1 means no validation"""
    val_before_train: bool = True
    """validate before training"""
    val_only: bool = False
    """validate only, skip training"""
    val_generations_to_log: int = 0
    """number of generations to log for validation"""
    save_freq: int = -1
    """save frequency, -1 means no saving"""
    save_limit: int = -1
    """max number of checkpoints to save, -1 means no limit"""
    save_model_only: bool = False
    """save model only, no optimizer state dict"""
    save_checkpoint_path: Optional[str] = None
    """save checkpoint path, if not specified, use `checkpoints/project_name/experiment_name`"""
    load_checkpoint_path: Optional[str] = None
    """load checkpoint path"""

    def post_init(self):
        """Derive the default save path and resolve both checkpoint paths to absolute form."""
        if self.save_checkpoint_path is None:
            self.save_checkpoint_path = os.path.join("checkpoints", self.project_name, self.experiment_name)

        # ray jobs need absolute paths, so resolve before the CWD can change
        self.save_checkpoint_path = os.path.abspath(self.save_checkpoint_path)
        if self.load_checkpoint_path is not None:
            if os.path.exists(self.load_checkpoint_path):
                self.load_checkpoint_path = os.path.abspath(self.load_checkpoint_path)
            else:
                # a missing resume path is non-fatal: warn and start fresh
                print(f"Model checkpoint {self.load_checkpoint_path} not found.")
                self.load_checkpoint_path = None
157
+
158
+
159
@dataclass
class PPOConfig:
    """Top-level training config aggregating the data, worker, algorithm and trainer sections."""

    data: DataConfig = field(default_factory=DataConfig)
    worker: WorkerConfig = field(default_factory=WorkerConfig)
    algorithm: AlgorithmConfig = field(default_factory=AlgorithmConfig)
    trainer: TrainerConfig = field(default_factory=TrainerConfig)

    def post_init(self):
        """Propagate cross-section settings so dependent sub-configs stay consistent."""
        # the rollout engine must generate within the data length budget
        self.worker.rollout.prompt_length = self.data.max_prompt_length
        self.worker.rollout.response_length = self.data.max_response_length
        self.worker.rollout.trust_remote_code = self.worker.actor.model.trust_remote_code
        # mirror the KL-related algorithm switches onto the actor worker
        self.worker.actor.disable_kl = self.algorithm.disable_kl
        self.worker.actor.use_kl_loss = self.algorithm.use_kl_loss
        self.worker.actor.kl_penalty = self.algorithm.kl_penalty
        self.worker.actor.kl_coef = self.algorithm.kl_coef

    def deep_post_init(self):
        """Run ``post_init`` recursively on this config and all nested dataclass fields."""
        recursive_post_init(self)

    def to_dict(self):
        """Serialize the full config tree to a plain nested dict."""
        return asdict(self)
EasyR1-new/verl/trainer/core_algos.py ADDED
@@ -0,0 +1,495 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2022 The HuggingFace Team
2
+ # Copyright 2024 Bytedance Ltd. and/or its affiliates
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """
16
+ Core functions to implement PPO algorithms.
17
+ The function implemented in this file should be used by trainer with different distributed strategies to
18
+ implement PPO
19
+ """
20
+
21
+ from abc import ABC, abstractmethod
22
+ from collections import defaultdict
23
+ from enum import Enum
24
+ from typing import TYPE_CHECKING, Dict, Literal, Tuple
25
+
26
+ import numpy as np
27
+ import torch
28
+ import torch.nn.functional as F
29
+
30
+ from ..utils import torch_functional as VF
31
+
32
+
33
+ if TYPE_CHECKING:
34
+ from .config import AlgorithmConfig
35
+
36
+
37
class KLController(ABC):
    """Abstract interface for controllers that schedule the KL penalty coefficient."""

    kl_coef: float
    """KL coefficient."""

    @abstractmethod
    def update(self, current_kl: float, n_steps: int):
        """Update kl_coef according to current KL."""
        ...
45
+
46
+
47
class AdaptiveKLController(KLController):
    """Adaptive KL controller described in: https://arxiv.org/pdf/1909.08593.pdf

    Copied from https://github.com/huggingface/trl/blob/v0.11.0/trl/trainer/utils.py#L54"""

    def __init__(self, init_kl_coef: float, target_kl: float, horizon: float):
        self.kl_coef = init_kl_coef
        self.target = target_kl
        self.horizon = horizon

    def update(self, current_kl: float, n_steps: int):
        # proportional error, clipped to +/-20% so a single update cannot jump too far
        error = np.clip(current_kl / self.target - 1, -0.2, 0.2)
        self.kl_coef = self.kl_coef * (1 + error * n_steps / self.horizon)
62
+
63
+
64
class FixedKLController(KLController):
    """Constant-coefficient KL controller.

    Copied from https://github.com/huggingface/trl/blob/v0.11.0/trl/trainer/utils.py#L72"""

    def __init__(self, init_kl_coef: float):
        self.kl_coef = init_kl_coef

    def update(self, current_kl: float, n_steps: int):
        # fixed schedule: the coefficient never changes
        return None
74
+
75
+
76
class AdvantageEstimator(str, Enum):
    """
    Using an enumeration class to avoid spelling errors in adv_estimator
    """

    # the str mixin lets plain config strings compare equal to enum members
    GAE = "gae"
    GRPO = "grpo"
    REINFORCE_PLUS_PLUS = "reinforce_plus_plus"
    REMAX = "remax"
    RLOO = "rloo"
86
+
87
+
88
def get_kl_controller(algorithm_config: "AlgorithmConfig") -> KLController:
    """Build the KL controller requested by the config (`fixed` or `adaptive`).

    Adapted from https://github.com/huggingface/trl/blob/v0.11.0/trl/trainer/ppo_trainer.py#L319
    """
    kl_type = algorithm_config.kl_type
    if kl_type == "fixed":
        return FixedKLController(init_kl_coef=algorithm_config.kl_coef)

    if kl_type == "adaptive":
        assert algorithm_config.kl_horizon > 0, f"horizon must be larger than 0. Got {algorithm_config.kl_horizon}."
        return AdaptiveKLController(
            init_kl_coef=algorithm_config.kl_coef,
            target_kl=algorithm_config.kl_target,
            horizon=algorithm_config.kl_horizon,
        )

    raise ValueError(f"Unknown kl type: {algorithm_config.kl_type}.")
103
+
104
+
105
@torch.no_grad()
def compute_gae_advantage_return(
    token_level_rewards: torch.Tensor,
    values: torch.Tensor,
    response_mask: torch.Tensor,
    gamma: torch.Tensor,
    lam: torch.Tensor,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Compute GAE advantages and returns via a backward scan over the response.

    Adapted from https://github.com/huggingface/trl/blob/v0.16.0/trl/trainer/ppo_trainer.py#L513

    Args:
        token_level_rewards: (bs, response_length) per-token rewards.
        values: (bs, response_length) critic value estimates.
        response_mask: (bs, response_length); tokens after eos are masked to zero.
        gamma: discount factor.
        lam: GAE lambda (https://arxiv.org/abs/1506.02438).

    Returns:
        advantages: (bs, response_length), whitened over the response mask.
        returns: (bs, response_length), advantages + values before whitening.
    """
    seq_len = token_level_rewards.shape[-1]
    running_gae = 0
    reversed_advantages = []
    # walk backwards: each step's advantage folds in the discounted next-step estimate
    for step in reversed(range(seq_len)):
        next_values = values[:, step + 1] if step + 1 < seq_len else 0.0
        td_error = token_level_rewards[:, step] + gamma * next_values - values[:, step]
        running_gae = td_error + gamma * lam * running_gae
        reversed_advantages.append(running_gae)

    advantages = torch.stack(list(reversed(reversed_advantages)), dim=1)
    returns = advantages + values
    advantages = VF.masked_whiten(advantages, response_mask)
    return advantages, returns
147
+
148
+
149
+ # NOTE(sgm): this implementation only consider outcome supervision, where the reward is a scalar.
150
@torch.no_grad()
def compute_grpo_outcome_advantage(
    token_level_rewards: torch.Tensor, response_mask: torch.Tensor, index: torch.Tensor, eps: float = 1e-6
) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Compute advantage for GRPO, operating only on Outcome reward
    (with only one scalar reward for each response).

    Args:
        token_level_rewards: `(torch.Tensor)`
            shape: (bs, response_length)
        response_mask: `(torch.Tensor)`
            shape: (bs, response_length)
        index: `(torch.Tensor)`
            shape: (bs,) — group id per sample; samples sharing an id are normalized together
        eps: `(float)`
            epsilon value to avoid division by zero

    Returns:
        advantages: `(torch.Tensor)`
            shape: (bs, response_length)
        returns: `(torch.Tensor)`
            shape: (bs, response_length)

    """
    # one scalar outcome score per response
    scores = token_level_rewards.sum(dim=-1)
    id2score = defaultdict(list)
    id2mean, id2std = {}, {}

    bsz = scores.shape[0]
    for i in range(bsz):
        id2score[index[i]].append(scores[i])

    for idx in id2score:
        assert len(id2score[idx]) > 1, "GRPO needs rollout.n > 1."
        # FIX: stack the 0-dim score tensors instead of torch.tensor(list_of_tensors),
        # which makes an extra Python-level copy and emits a UserWarning
        group_scores = torch.stack(id2score[idx])
        id2mean[idx] = torch.mean(group_scores)
        id2std[idx] = torch.std(group_scores)

    # z-score each sample within its group (group stats were captured above)
    for i in range(bsz):
        scores[i] = (scores[i] - id2mean[index[i]]) / (id2std[index[i]] + eps)

    # broadcast the scalar advantage across all response tokens
    returns = scores.unsqueeze(-1) * response_mask
    return returns, returns
193
+
194
+
195
@torch.no_grad()
def compute_rloo_outcome_advantage(
    token_level_rewards: torch.Tensor, response_mask: torch.Tensor, index: torch.Tensor
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Compute RLOO advantages (https://arxiv.org/abs/2402.14740): each sample's
    baseline is the mean score of the *other* samples in its group.

    Args:
        token_level_rewards: (bs, response_length) per-token rewards.
        response_mask: (bs, response_length).
        index: (bs,) group id per sample.

    Returns:
        advantages and returns, both (bs, response_length) and identical.
    """
    scores = token_level_rewards.sum(dim=-1)
    bsz = scores.shape[0]

    group_scores = defaultdict(list)
    for i in range(bsz):
        group_scores[index[i]].append(scores[i])

    # group sums are materialized before the in-place normalization below
    group_sums = {key: torch.sum(torch.tensor(vals)) for key, vals in group_scores.items()}

    for i in range(bsz):
        group_size = len(group_scores[index[i]])
        assert group_size > 1, "RLOO needs rollout.n > 1."
        # leave-one-out baseline: average of the remaining group members
        baseline = (group_sums[index[i]] - scores[i]) / (group_size - 1)
        scores[i] = scores[i] - baseline

    returns = scores.unsqueeze(-1) * response_mask
    return returns, returns
236
+
237
+
238
@torch.no_grad()
def compute_reinforce_plus_plus_outcome_advantage(
    token_level_rewards: torch.Tensor, response_mask: torch.Tensor, gamma: torch.Tensor
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Compute REINFORCE++ advantages (https://arxiv.org/abs/2501.03262): a discounted
    backward cumulative sum of rewards, whitened over the response mask.

    Args:
        token_level_rewards: (bs, response_length) per-token rewards.
        response_mask: (bs, response_length).
        gamma: discount factor.

    Returns:
        advantages: (bs, response_length) whitened returns.
        returns: (bs, response_length) raw discounted returns.
    """
    returns = torch.zeros_like(token_level_rewards)
    discounted = 0
    num_steps = token_level_rewards.shape[1]
    for step in range(num_steps - 1, -1, -1):
        discounted = token_level_rewards[:, step] + gamma * discounted
        returns[:, step] = discounted
        # zero the running return beyond each response's EOS
        discounted = discounted * response_mask[:, step]

    advantages = VF.masked_whiten(returns, response_mask)
    return advantages, returns
269
+
270
+
271
@torch.no_grad()
def compute_remax_outcome_advantage(
    token_level_rewards: torch.Tensor, reward_baselines: torch.Tensor, response_mask: torch.Tensor
) -> Tuple[torch.Tensor, torch.Tensor]:
    """Compute ReMax advantages (https://arxiv.org/abs/2310.10505): the outcome score
    minus a per-sample greedy-rollout baseline, broadcast over response tokens.

    Args:
        token_level_rewards: (bs, response_length) per-token rewards.
        reward_baselines: (bs,) baseline reward per sample.
        response_mask: (bs, response_length).

    Returns:
        advantages and returns, both (bs, response_length) and identical.
    """
    centered_scores = token_level_rewards.sum(dim=-1) - reward_baselines
    returns = centered_scores.unsqueeze(-1) * response_mask
    return returns, returns
298
+
299
+
300
def compute_rewards(
    token_level_scores: torch.Tensor,
    log_probs: torch.Tensor,
    ref_log_probs: torch.Tensor,
    kl_ratio: float,
) -> torch.Tensor:
    """Subtract the scaled KL penalty (log-ratio estimator vs. the reference policy)
    from the token-level reward scores."""
    return token_level_scores - kl_ratio * (log_probs - ref_log_probs)
308
+
309
+
310
def average_loss(
    values: torch.Tensor, mask: torch.Tensor, mode: Literal["token", "seq"], eps: float = 1e-8
) -> torch.Tensor:
    """Reduce a per-token loss to a scalar.

    Args:
        values: (bs, response_length) per-token loss values.
        mask: (bs, response_length) valid-token mask.
        mode: "token" averages over all unmasked tokens in the batch;
            "seq" averages within each sequence first, then across sequences.
        eps: denominator guard.

    Returns:
        scalar loss tensor.

    Raises:
        NotImplementedError: for any other mode.
    """
    if mode == "seq":
        per_sequence = (values * mask).sum(-1) / (mask.sum(-1) + eps)
        return per_sequence.mean()

    if mode == "token":
        return VF.masked_mean(values, mask, eps=eps)

    raise NotImplementedError(f"Unknown mode: {mode}.")
335
+
336
+
337
def compute_policy_loss(
    old_log_probs: torch.Tensor,
    log_probs: torch.Tensor,
    advantages: torch.Tensor,
    response_mask: torch.Tensor,
    clip_ratio_low: float,
    clip_ratio_high: float,
    clip_ratio_dual: float,
    loss_avg_mode: Literal["token", "seq"],
) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
    """Compute the clipped policy objective and related metrics for PPO.

    Adapted from https://github.com/huggingface/trl/blob/v0.15.0/trl/trainer/ppo_trainer.py#L568

    Args:
        old_log_probs: `(torch.Tensor)`
            shape: (bs, response_length)
        log_probs: `(torch.Tensor)`
            shape: (bs, response_length)
        advantages: `(torch.Tensor)`
            shape: (bs, response_length)
        response_mask: `(torch.Tensor)`
            shape: (bs, response_length)
        clip_ratio_low: (float)
            The lower clip range used in PPO. See https://arxiv.org/abs/1707.06347
        clip_ratio_high: (float)
            The higher clip range used in DAPO. See https://arxiv.org/pdf/2503.14476
        clip_ratio_dual: (float)
            The dual clip range used in Dual-clip PPO. See https://arxiv.org/pdf/1912.09729
        loss_avg_mode: (Literal["token", "seq"])
            "token": average the loss in the whole batch
            "seq": average the loss in each sequence then average the mean of the means

    Returns:
        pg_loss: `a scalar torch.Tensor`
            policy gradient loss computed via PPO
        pg_clipfrac_higher: (float)
            a float number indicating the fraction of policy gradient loss being clipped to a higher value
        pg_clipfrac_lower: (float)
            a float number indicating the fraction of policy gradient loss being clipped to a lower value
        ppo_kl: (float)
            a float number indicating the mean KL divergence between the old policy and the new policy
        entropy_loss: (float)
            a float number indicating the mean entropy loss

    """
    # log(pi_new / pi_old); its mean is the (negated) ppo_kl metric below
    negative_approx_kl = log_probs - old_log_probs
    # clamp negative_approx_kl to avoid nan kld
    negative_approx_kl = torch.clamp(negative_approx_kl, -20.0, 20.0)
    ratio = torch.exp(negative_approx_kl)
    # clamp the ratio before exp to avoid nan grad
    # see: https://github.com/pytorch/pytorch/issues/10729
    clipped_ratio = torch.exp(
        torch.clamp(negative_approx_kl, np.log(1.0 - clip_ratio_low), np.log(1.0 + clip_ratio_high))
    )

    # pg metrics
    metrics = {"ppo_kl": -negative_approx_kl}
    # use negative log probs as an estimator of entropy loss
    metrics["entropy_loss"] = average_loss(-log_probs, response_mask, mode=loss_avg_mode)

    pg_loss = -advantages * ratio  # -ratio * A
    pg_loss2 = -advantages * clipped_ratio  # -clip(ratio, 1-clip_low, 1+clip_high) * A
    pg_loss3 = -advantages * clip_ratio_dual  # -clip_dual * A

    # standard PPO clipping: take the pessimistic (larger-loss) branch
    clipped_pg_loss_higher = torch.max(pg_loss, pg_loss2)  # clip if pg_loss < pg_loss2
    metrics["pg_clipfrac_higher"] = (pg_loss < pg_loss2).float()
    # dual clip only applies where the advantage is negative (see Dual-clip PPO paper)
    clipped_pg_loss_lower = torch.min(clipped_pg_loss_higher, pg_loss3)  # clip if pg_loss > pg_loss3 and adv < 0
    final_pg_loss = torch.where(advantages < 0, clipped_pg_loss_lower, clipped_pg_loss_higher)
    metrics["pg_clipfrac_lower"] = (clipped_pg_loss_higher > pg_loss3).float() * (advantages < 0).float()

    final_pg_loss = average_loss(final_pg_loss, response_mask, mode=loss_avg_mode)
    # reduce every per-token metric to a masked scalar for logging
    metrics = {k: VF.masked_mean(v, response_mask).detach().item() for k, v in metrics.items()}
    return final_pg_loss, metrics
411
+
412
+
413
def compute_value_loss(
    vpreds: torch.Tensor,
    returns: torch.Tensor,
    values: torch.Tensor,
    response_mask: torch.Tensor,
    cliprange_value: float,
    loss_avg_mode: Literal["token", "seq"],
) -> Tuple[torch.Tensor, float]:
    """Compute the clipped value-function loss for PPO.

    Adapted from https://github.com/huggingface/trl/blob/v0.15.0/trl/trainer/ppo_trainer.py#L556

    Args:
        vpreds: (bs, response_length) new value-head predictions.
        returns: (bs, response_length) ground-truth returns.
        values: (bs, response_length) old value-head predictions.
        response_mask: (bs, response_length).
        cliprange_value: clip range for the value net (https://arxiv.org/abs/1707.06347).
        loss_avg_mode: "token" averages over the batch, "seq" per-sequence then across.

    Returns:
        vf_loss: scalar value-function loss.
        vf_clipfrac: fraction of positions where the clipped branch was taken.
    """
    # restrict predictions to a trust region around the old values
    vpreds_clipped = torch.clamp(vpreds, values - cliprange_value, values + cliprange_value)
    loss_unclipped = torch.square(vpreds - returns)
    loss_clipped = torch.square(vpreds_clipped - returns)
    # pessimistic max of the two squared errors, per token
    elementwise_loss = torch.max(loss_unclipped, loss_clipped)
    vf_loss = 0.5 * average_loss(elementwise_loss, response_mask, mode=loss_avg_mode)
    vf_clipfrac = VF.masked_mean((loss_unclipped < loss_clipped).float(), response_mask).detach().item()
    return vf_loss, vf_clipfrac
454
+
455
+
456
def compute_kl(
    log_probs: torch.FloatTensor,
    ref_log_probs: torch.FloatTensor,
    kl_penalty: Literal["kl", "abs", "mse", "low_var_kl", "full"],
) -> torch.Tensor:
    """Compute a per-token KL penalty between the policy and the reference policy.

    Adapted from https://github.com/huggingface/trl/blob/v0.11.0/trl/trainer/ppo_trainer.py#L1150

    Args:
        log_probs: policy log-probabilities.
        ref_log_probs: reference-policy log-probabilities.
        kl_penalty: one of "kl", "abs", "mse", "low_var_kl", "full".

    Returns:
        per-token KL divergence tensor.

    Raises:
        NotImplementedError: for an unknown penalty type.
    """
    log_probs = log_probs.float()
    ref_log_probs = ref_log_probs.float()
    delta = log_probs - ref_log_probs

    if kl_penalty == "kl":
        return delta
    if kl_penalty == "abs":
        return delta.abs()
    if kl_penalty == "mse":
        return 0.5 * delta.square()
    if kl_penalty == "low_var_kl":
        # J. Schulman. Approximating kl divergence, 2020.
        # URL http://joschu.net/blog/kl-approx.html
        # clamp the log-ratio and the estimate itself for numerical stability
        log_ratio = (-delta).clamp(-20.0, 20.0)
        kld = (log_ratio.exp() - log_ratio - 1).contiguous()
        return torch.clamp(kld, min=-10.0, max=10.0)
    if kl_penalty == "full":
        return F.kl_div(ref_log_probs, log_probs, log_target=True, reduction="none").sum(-1)

    raise NotImplementedError(f"Unknown KL penalty: {kl_penalty}.")