koichi12 commited on
Commit
c27e68a
·
verified ·
1 Parent(s): db5dd97

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because the commit contains too many changes. See the raw diff for the complete list.
Files changed (50) hide show
  1. .gitattributes +1 -0
  2. .venv/lib/python3.11/site-packages/msgpack/_cmsgpack.cpython-311-x86_64-linux-gnu.so +3 -0
  3. .venv/lib/python3.11/site-packages/vllm/__pycache__/__init__.cpython-311.pyc +0 -0
  4. .venv/lib/python3.11/site-packages/vllm/__pycache__/_custom_ops.cpython-311.pyc +0 -0
  5. .venv/lib/python3.11/site-packages/vllm/__pycache__/_ipex_ops.cpython-311.pyc +0 -0
  6. .venv/lib/python3.11/site-packages/vllm/__pycache__/_version.cpython-311.pyc +0 -0
  7. .venv/lib/python3.11/site-packages/vllm/__pycache__/beam_search.cpython-311.pyc +0 -0
  8. .venv/lib/python3.11/site-packages/vllm/__pycache__/connections.cpython-311.pyc +0 -0
  9. .venv/lib/python3.11/site-packages/vllm/__pycache__/envs.cpython-311.pyc +0 -0
  10. .venv/lib/python3.11/site-packages/vllm/__pycache__/forward_context.cpython-311.pyc +0 -0
  11. .venv/lib/python3.11/site-packages/vllm/__pycache__/logger.cpython-311.pyc +0 -0
  12. .venv/lib/python3.11/site-packages/vllm/__pycache__/logits_process.cpython-311.pyc +0 -0
  13. .venv/lib/python3.11/site-packages/vllm/__pycache__/outputs.cpython-311.pyc +0 -0
  14. .venv/lib/python3.11/site-packages/vllm/__pycache__/pooling_params.cpython-311.pyc +0 -0
  15. .venv/lib/python3.11/site-packages/vllm/__pycache__/sampling_params.cpython-311.pyc +0 -0
  16. .venv/lib/python3.11/site-packages/vllm/__pycache__/scalar_type.cpython-311.pyc +0 -0
  17. .venv/lib/python3.11/site-packages/vllm/__pycache__/scripts.cpython-311.pyc +0 -0
  18. .venv/lib/python3.11/site-packages/vllm/__pycache__/sequence.cpython-311.pyc +0 -0
  19. .venv/lib/python3.11/site-packages/vllm/__pycache__/tracing.cpython-311.pyc +0 -0
  20. .venv/lib/python3.11/site-packages/vllm/__pycache__/version.cpython-311.pyc +0 -0
  21. .venv/lib/python3.11/site-packages/vllm/core/__init__.py +0 -0
  22. .venv/lib/python3.11/site-packages/vllm/core/__pycache__/__init__.cpython-311.pyc +0 -0
  23. .venv/lib/python3.11/site-packages/vllm/core/__pycache__/block_manager.cpython-311.pyc +0 -0
  24. .venv/lib/python3.11/site-packages/vllm/core/__pycache__/evictor.cpython-311.pyc +0 -0
  25. .venv/lib/python3.11/site-packages/vllm/core/__pycache__/interfaces.cpython-311.pyc +0 -0
  26. .venv/lib/python3.11/site-packages/vllm/core/__pycache__/placeholder_block_space_manager.cpython-311.pyc +0 -0
  27. .venv/lib/python3.11/site-packages/vllm/core/__pycache__/scheduler.cpython-311.pyc +0 -0
  28. .venv/lib/python3.11/site-packages/vllm/core/block/__init__.py +0 -0
  29. .venv/lib/python3.11/site-packages/vllm/core/block/__pycache__/__init__.cpython-311.pyc +0 -0
  30. .venv/lib/python3.11/site-packages/vllm/core/block/__pycache__/block_table.cpython-311.pyc +0 -0
  31. .venv/lib/python3.11/site-packages/vllm/core/block/__pycache__/common.cpython-311.pyc +0 -0
  32. .venv/lib/python3.11/site-packages/vllm/core/block/__pycache__/cpu_gpu_block_allocator.cpython-311.pyc +0 -0
  33. .venv/lib/python3.11/site-packages/vllm/core/block/__pycache__/interfaces.cpython-311.pyc +0 -0
  34. .venv/lib/python3.11/site-packages/vllm/core/block/__pycache__/naive_block.cpython-311.pyc +0 -0
  35. .venv/lib/python3.11/site-packages/vllm/core/block/__pycache__/prefix_caching_block.cpython-311.pyc +0 -0
  36. .venv/lib/python3.11/site-packages/vllm/core/block/__pycache__/utils.cpython-311.pyc +0 -0
  37. .venv/lib/python3.11/site-packages/vllm/core/block/block_table.py +398 -0
  38. .venv/lib/python3.11/site-packages/vllm/core/block/common.py +370 -0
  39. .venv/lib/python3.11/site-packages/vllm/core/block/cpu_gpu_block_allocator.py +438 -0
  40. .venv/lib/python3.11/site-packages/vllm/core/block/interfaces.py +318 -0
  41. .venv/lib/python3.11/site-packages/vllm/core/block/naive_block.py +465 -0
  42. .venv/lib/python3.11/site-packages/vllm/core/block/prefix_caching_block.py +1134 -0
  43. .venv/lib/python3.11/site-packages/vllm/core/block/utils.py +27 -0
  44. .venv/lib/python3.11/site-packages/vllm/core/interfaces.py +134 -0
  45. .venv/lib/python3.11/site-packages/vllm/core/placeholder_block_space_manager.py +99 -0
  46. .venv/lib/python3.11/site-packages/vllm/core/scheduler.py +1840 -0
  47. .venv/lib/python3.11/site-packages/vllm/device_allocator/__pycache__/cumem.cpython-311.pyc +0 -0
  48. .venv/lib/python3.11/site-packages/vllm/distributed/device_communicators/__pycache__/__init__.cpython-311.pyc +0 -0
  49. .venv/lib/python3.11/site-packages/vllm/distributed/device_communicators/__pycache__/cuda_wrapper.cpython-311.pyc +0 -0
  50. .venv/lib/python3.11/site-packages/vllm/distributed/device_communicators/__pycache__/custom_all_reduce_utils.cpython-311.pyc +0 -0
.gitattributes CHANGED
@@ -200,3 +200,4 @@ tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/_
200
  .venv/lib/python3.11/site-packages/google/protobuf/__pycache__/descriptor_pb2.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
201
  .venv/lib/python3.11/site-packages/jinja2/__pycache__/compiler.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
202
  .venv/lib/python3.11/site-packages/grpc/__pycache__/_channel.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
 
 
200
  .venv/lib/python3.11/site-packages/google/protobuf/__pycache__/descriptor_pb2.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
201
  .venv/lib/python3.11/site-packages/jinja2/__pycache__/compiler.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
202
  .venv/lib/python3.11/site-packages/grpc/__pycache__/_channel.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
203
+ .venv/lib/python3.11/site-packages/msgpack/_cmsgpack.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
.venv/lib/python3.11/site-packages/msgpack/_cmsgpack.cpython-311-x86_64-linux-gnu.so ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8da100e7b8957b1fbf02ef3114676091bdd6d861169f948bbbeaf0fceade5992
3
+ size 1296528
.venv/lib/python3.11/site-packages/vllm/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (1.97 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/__pycache__/_custom_ops.cpython-311.pyc ADDED
Binary file (57.4 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/__pycache__/_ipex_ops.cpython-311.pyc ADDED
Binary file (11.4 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/__pycache__/_version.cpython-311.pyc ADDED
Binary file (635 Bytes). View file
 
.venv/lib/python3.11/site-packages/vllm/__pycache__/beam_search.cpython-311.pyc ADDED
Binary file (3.98 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/__pycache__/connections.cpython-311.pyc ADDED
Binary file (9.88 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/__pycache__/envs.cpython-311.pyc ADDED
Binary file (27.6 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/__pycache__/forward_context.cpython-311.pyc ADDED
Binary file (5.77 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/__pycache__/logger.cpython-311.pyc ADDED
Binary file (9.24 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/__pycache__/logits_process.cpython-311.pyc ADDED
Binary file (5.47 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/__pycache__/outputs.cpython-311.pyc ADDED
Binary file (24.4 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/__pycache__/pooling_params.cpython-311.pyc ADDED
Binary file (1.42 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/__pycache__/sampling_params.cpython-311.pyc ADDED
Binary file (26.9 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/__pycache__/scalar_type.cpython-311.pyc ADDED
Binary file (14.4 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/__pycache__/scripts.cpython-311.pyc ADDED
Binary file (8.83 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/__pycache__/sequence.cpython-311.pyc ADDED
Binary file (77.6 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/__pycache__/tracing.cpython-311.pyc ADDED
Binary file (6.92 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/__pycache__/version.cpython-311.pyc ADDED
Binary file (622 Bytes). View file
 
.venv/lib/python3.11/site-packages/vllm/core/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/vllm/core/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (182 Bytes). View file
 
.venv/lib/python3.11/site-packages/vllm/core/__pycache__/block_manager.cpython-311.pyc ADDED
Binary file (24.7 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/core/__pycache__/evictor.cpython-311.pyc ADDED
Binary file (8.26 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/core/__pycache__/interfaces.cpython-311.pyc ADDED
Binary file (6.89 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/core/__pycache__/placeholder_block_space_manager.cpython-311.pyc ADDED
Binary file (5.79 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/core/__pycache__/scheduler.cpython-311.pyc ADDED
Binary file (73.9 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/core/block/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/vllm/core/block/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (188 Bytes). View file
 
.venv/lib/python3.11/site-packages/vllm/core/block/__pycache__/block_table.cpython-311.pyc ADDED
Binary file (18.8 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/core/block/__pycache__/common.cpython-311.pyc ADDED
Binary file (19.2 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/core/block/__pycache__/cpu_gpu_block_allocator.cpython-311.pyc ADDED
Binary file (22.8 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/core/block/__pycache__/interfaces.cpython-311.pyc ADDED
Binary file (16.3 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/core/block/__pycache__/naive_block.cpython-311.pyc ADDED
Binary file (22.5 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/core/block/__pycache__/prefix_caching_block.cpython-311.pyc ADDED
Binary file (47.4 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/core/block/__pycache__/utils.cpython-311.pyc ADDED
Binary file (1.33 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/core/block/block_table.py ADDED
@@ -0,0 +1,398 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+
3
+ import math
4
+ from typing import List, Optional
5
+
6
+ from vllm.core.block.common import BlockList
7
+ from vllm.core.block.interfaces import Block, DeviceAwareBlockAllocator
8
+ from vllm.utils import Device, cdiv, chunk_list
9
+
10
+
11
class BlockTable:
    """A class to manage blocks for a specific sequence.

    The BlockTable maps a sequence of tokens to a list of blocks, where each
    block represents a contiguous memory allocation for a portion of the
    sequence. The blocks are managed by a DeviceAwareBlockAllocator, which is
    responsible for allocating and freeing memory for the blocks.

    Args:
        block_size (int): The maximum number of tokens that can be stored in a
            single block.
        block_allocator (DeviceAwareBlockAllocator): The block allocator used to
            manage memory for the blocks.
        _blocks (Optional[List[Block]], optional): An optional list of existing
            blocks to initialize the BlockTable with. If not provided, an empty
            BlockTable is created.
        max_block_sliding_window (Optional[int], optional): The number of
            blocks to keep around for each sequence. If None, all blocks
            are kept (e.g., when sliding window is not used).
            It should at least fit the sliding window size of the model.

    Attributes:
        _block_size (int): The maximum number of tokens that can be stored in a
            single block.
        _allocator (DeviceAwareBlockAllocator): The block allocator used to
            manage memory for the blocks.
        _blocks (Optional[List[Block]]): The list of blocks managed by this
            BlockTable.
        _num_full_slots (int): The number of tokens currently stored in the
            blocks.
    """

    def __init__(
        self,
        block_size: int,
        block_allocator: DeviceAwareBlockAllocator,
        _blocks: Optional[List[Block]] = None,
        max_block_sliding_window: Optional[int] = None,
    ):
        self._block_size = block_size
        self._allocator = block_allocator
        if _blocks is None:
            _blocks = []
        self._blocks: BlockList = BlockList(_blocks)

        self._max_block_sliding_window = max_block_sliding_window
        # When pre-existing blocks are supplied, the slot count is derived
        # from the tokens they already hold.
        self._num_full_slots = self._get_num_token_ids()

    @staticmethod
    def get_num_required_blocks(token_ids: List[int],
                                block_size: int,
                                num_lookahead_slots: int = 0) -> int:
        """Calculates the minimum number of blocks required to store a given
        sequence of token IDs along with any look-ahead slots that may be
        required (like in multi-step + chunked-prefill).

        This assumes worst-case scenario, where every block requires a new
        allocation (e.g. ignoring prefix caching).

        Args:
            token_ids (List[int]): The sequence of token IDs to be stored.
            block_size (int): The maximum number of tokens that can be stored in
                a single block.
            num_lookahead_slots (int): look-ahead slots that the sequence may
                require.

        Returns:
            int: The minimum number of blocks required to store the given
                sequence of token IDs along with any required look-ahead slots.
        """
        return cdiv(len(token_ids) + num_lookahead_slots, block_size)

    def allocate(self,
                 token_ids: List[int],
                 device: Device = Device.GPU,
                 extra_hash: Optional[int] = None) -> None:
        """Allocates memory blocks for storing the given sequence of token IDs.

        This method allocates the required number of blocks to store the given
        sequence of token IDs.

        Args:
            token_ids (List[int]): The sequence of token IDs to be stored.
            device (Device, optional): The device on which the blocks should be
                allocated. Defaults to Device.GPU.
            extra_hash (Optional[int]): The hash value of additional
                factors, such as adapters, that influence the block hash
                in the prefix caching block.
        """
        assert not self._is_allocated
        assert token_ids
        blocks = self._allocate_blocks_for_token_ids(prev_block=None,
                                                     token_ids=token_ids,
                                                     device=device,
                                                     extra_hash=extra_hash)
        self.update(blocks)
        self._num_full_slots = len(token_ids)

    def update(self, blocks: List[Block]) -> None:
        """Resets the table to the newly provided blocks
        (with their corresponding block ids)
        """
        self._blocks.update(blocks)

    def append_token_ids(self,
                         token_ids: List[int],
                         num_lookahead_slots: int = 0,
                         num_computed_slots: Optional[int] = None,
                         extra_hash: Optional[int] = None) -> None:
        """Appends a sequence of token IDs to the existing blocks in the
        BlockTable.

        This method appends the given sequence of token IDs to the existing
        blocks in the BlockTable. If there is not enough space in the existing
        blocks, new blocks are allocated using the `ensure_num_empty_slots`
        method to accommodate the additional tokens.

        The token IDs are divided into chunks of size `block_size` (except for
        the first chunk, which may be smaller), and each chunk is appended to a
        separate block.

        Args:
            token_ids (List[int]): The sequence of token IDs to be appended.
            num_computed_slots (Optional[int]): The number of KV cache slots
                that are already filled (computed).
                When sliding window is enabled, this is used to compute how many
                blocks to drop at the front of the sequence.
                Without sliding window, None can be passed.
                Without chunked prefill, it should be the same as
                _num_full_slots.
            extra_hash (Optional[int]): The hash value of additional
                factors such as adapters that influence the block, apart
                from the token_ids.
        """
        assert self._is_allocated, "no blocks have been allocated"
        assert len(self._blocks) > 0

        # Drop blocks that are no longer needed due to sliding window
        if self._max_block_sliding_window is not None:
            null_block = self._allocator.allocate_or_get_null_block()
            assert num_computed_slots is not None
            # Blocks entirely below the window are released and replaced by
            # the shared null block placeholder.
            end_block_idx = (num_computed_slots //
                             self._block_size) - self._max_block_sliding_window
            for idx in range(0, end_block_idx):
                b = self._blocks[idx]
                if b is not null_block:
                    self._allocator.free(b)
                self._blocks[idx] = null_block

        # Ensure there are enough empty slots for the new tokens plus
        # lookahead slots
        self.ensure_num_empty_slots(num_empty_slots=len(token_ids) +
                                    num_lookahead_slots,
                                    extra_hash=extra_hash)

        # Update the blocks with the new tokens
        first_block_idx = self._num_full_slots // self._block_size
        token_blocks = self._chunk_token_blocks_for_append(token_ids)

        for i, token_block in enumerate(token_blocks):
            self._blocks.append_token_ids(first_block_idx + i, token_block)

        self._num_full_slots += len(token_ids)

    def ensure_num_empty_slots(self,
                               num_empty_slots: int,
                               extra_hash: Optional[int] = None) -> None:
        """Ensures that the BlockTable has at least the specified number of
        empty slots available.

        This method checks if the BlockTable has enough empty slots (i.e.,
        available space) to accommodate the requested number of tokens. If not,
        it allocates additional blocks on the GPU to ensure that the required
        number of empty slots is available.

        Args:
            num_empty_slots (int): The minimum number of empty slots required.
            extra_hash (Optional[int]): The hash value of additional
                factors such as adapters that influence the block, apart
                from the token_ids.
        """
        # Currently the block table only supports
        # appending tokens to GPU blocks.
        device = Device.GPU
        assert self._is_allocated

        if self._num_empty_slots >= num_empty_slots:
            return

        slots_to_allocate = num_empty_slots - self._num_empty_slots
        blocks_to_allocate = cdiv(slots_to_allocate, self._block_size)

        for _ in range(blocks_to_allocate):
            assert len(self._blocks) > 0
            self._blocks.append(
                self._allocator.allocate_mutable_block(
                    prev_block=self._blocks[-1],
                    device=device,
                    extra_hash=extra_hash))

    def fork(self) -> "BlockTable":
        """Creates a new BlockTable instance with a copy of the blocks from the
        current instance.

        This method creates a new BlockTable instance with the same block size,
        block allocator, and a copy of the blocks from the current instance. The
        new BlockTable has its own independent set of blocks, but shares the
        same underlying memory allocation with the original BlockTable.

        Returns:
            BlockTable: A new BlockTable instance with a copy of the blocks from
                the current instance.
        """
        assert self._is_allocated
        assert len(self._blocks) > 0
        forked_blocks = self._allocator.fork(self._blocks[-1])
        return BlockTable(
            block_size=self._block_size,
            block_allocator=self._allocator,
            _blocks=forked_blocks,
            max_block_sliding_window=self._max_block_sliding_window,
        )

    def free(self) -> None:
        """Frees the memory occupied by the blocks in the BlockTable.

        This method iterates over all the blocks in the `_blocks` list and calls
        the `free` method of the `_allocator` object to release the memory
        occupied by each block. After freeing all the blocks, the `_blocks` list
        is set to `None`.
        """
        for block in self.blocks:
            self._allocator.free(block)
        self._blocks.reset()

    @property
    def physical_block_ids(self) -> List[int]:
        """Returns a list of physical block indices for the blocks in the
        BlockTable.

        This property returns a list of integers, where each integer represents
        the physical block index of a corresponding block in the `_blocks` list.
        The physical block index is a unique identifier for the memory location
        occupied by the block.

        Returns:
            List[int]: A list of physical block indices for the blocks in the
                BlockTable.
        """
        return self._blocks.ids()

    def get_unseen_token_ids(self, sequence_token_ids: List[int]) -> List[int]:
        """Get the number of "unseen" tokens in the sequence.

        Unseen tokens are tokens in the sequence corresponding to this block
        table, but are not yet appended to this block table.

        Args:
            sequence_token_ids (List[int]): The list of token ids in the
                sequence.

        Returns:
            List[int]: The postfix of sequence_token_ids that has not yet been
                appended to the block table.
        """

        # Since the block table is append-only, the unseen token ids are the
        # ones after the appended ones.
        return sequence_token_ids[self.num_full_slots:]

    def _allocate_blocks_for_token_ids(
            self,
            prev_block: Optional[Block],
            token_ids: List[int],
            device: Device,
            extra_hash: Optional[int] = None) -> List[Block]:
        # Full chunks become immutable (hashable) blocks; at most one
        # partial tail chunk goes into a mutable block.
        blocks: List[Block] = []

        block_token_ids = []
        tail_token_ids = []
        for cur_token_ids in chunk_list(token_ids, self._block_size):
            if len(cur_token_ids) == self._block_size:
                block_token_ids.append(cur_token_ids)
            else:
                tail_token_ids.append(cur_token_ids)

        if block_token_ids:
            blocks.extend(
                self._allocator.allocate_immutable_blocks(
                    prev_block,
                    block_token_ids=block_token_ids,
                    device=device,
                    extra_hash=extra_hash))
            prev_block = blocks[-1]

        if tail_token_ids:
            # chunk_list yields at most one undersized (final) chunk.
            assert len(tail_token_ids) == 1
            cur_token_ids = tail_token_ids[0]

            block = self._allocator.allocate_mutable_block(
                prev_block=prev_block, device=device, extra_hash=extra_hash)
            block.append_token_ids(cur_token_ids)

            blocks.append(block)

        return blocks

    def _get_all_token_ids(self) -> List[int]:
        # NOTE: This function is O(seq_len); use sparingly.
        token_ids: List[int] = []

        if not self._is_allocated:
            return token_ids

        for block in self.blocks:
            token_ids.extend(block.token_ids)

        return token_ids

    def _get_num_token_ids(self) -> int:
        res = 0
        for block in self.blocks:
            res += len(block.token_ids)

        return res

    @property
    def _is_allocated(self) -> bool:
        return len(self._blocks) > 0

    @property
    def blocks(self) -> List[Block]:
        return self._blocks.list()

    @property
    def _num_empty_slots(self) -> int:
        assert self._is_allocated
        return len(self._blocks) * self._block_size - self._num_full_slots

    @property
    def num_full_slots(self) -> int:
        """Returns the total number of tokens currently stored in the
        BlockTable.

        Returns:
            int: The total number of tokens currently stored in the BlockTable.
        """
        return self._num_full_slots

    def get_num_blocks_touched_by_append_slots(
            self, token_ids: List[int], num_lookahead_slots: int) -> int:
        """Determine how many blocks will be "touched" by appending the token
        ids.

        This is required for the scheduler to determine whether a sequence can
        continue generation, or if it must be preempted.
        """
        # Math below is equivalent to:
        # all_token_ids = token_ids + [-1] * num_lookahead_slots
        # token_blocks = self._chunk_token_blocks_for_append(all_token_ids)
        # return len(token_blocks)

        num_token_ids = len(token_ids) + num_lookahead_slots
        first_chunk_size = self._block_size - (self._num_full_slots %
                                               self._block_size)
        num_token_blocks = (1 + math.ceil(
            (num_token_ids - first_chunk_size) / self._block_size))
        return num_token_blocks

    def _chunk_token_blocks_for_append(
            self, token_ids: List[int]) -> List[List[int]]:
        """Split the token ids into block-sized chunks so they can be easily
        appended to blocks. The first such "token block" may have less token ids
        than the block size, since the last allocated block may be partially
        full.

        If no token ids are provided, then no chunks are returned.
        """

        if not token_ids:
            return []

        first_chunk_size = self._block_size - (self._num_full_slots %
                                               self._block_size)
        token_blocks = [token_ids[:first_chunk_size]]
        token_blocks.extend(
            chunk_list(token_ids[first_chunk_size:], self._block_size))
        return token_blocks
.venv/lib/python3.11/site-packages/vllm/core/block/common.py ADDED
@@ -0,0 +1,370 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+
3
+ from collections import deque
4
+ from dataclasses import dataclass
5
+ from typing import Deque, Dict, Iterable, List, Optional, Protocol, Tuple
6
+
7
+ from vllm.core.block.interfaces import Block, BlockAllocator
8
+
9
+ BlockId = int
10
+ RefCount = int
11
+
12
+
class RefCounterProtocol(Protocol):
    """Structural interface for reference-count bookkeeping on block ids.

    Concrete implementations must provide increment, decrement, and read
    operations keyed by block id.
    """

    def incr(self, block_id: BlockId) -> RefCount:
        """Increment the reference count of *block_id* and return it."""
        raise NotImplementedError

    def decr(self, block_id: BlockId) -> RefCount:
        """Decrement the reference count of *block_id* and return it."""
        raise NotImplementedError

    def get(self, block_id: BlockId) -> RefCount:
        """Return the current reference count of *block_id*."""
        raise NotImplementedError
23
+
24
+
25
+ class RefCounter(RefCounterProtocol):
26
+ """A class for managing reference counts for a set of block indices.
27
+
28
+ The RefCounter class maintains a dictionary that maps block indices to their
29
+ corresponding reference counts. It provides methods to increment, decrement,
30
+ and retrieve the reference count for a given block index.
31
+
32
+ Args:
33
+ all_block_indices (Iterable[BlockId]): An iterable of block indices
34
+ to initialize the reference counter with.
35
+ """
36
+
37
+ def __init__(self, all_block_indices: Iterable[BlockId]):
38
+ deduped = set(all_block_indices)
39
+ self._refcounts: Dict[BlockId, RefCount] = {
40
+ index: 0
41
+ for index in deduped
42
+ }
43
+
44
+ def incr(self, block_id: BlockId) -> RefCount:
45
+ assert block_id in self._refcounts
46
+ pre_incr_refcount = self._refcounts[block_id]
47
+
48
+ assert pre_incr_refcount >= 0
49
+
50
+ post_incr_refcount = pre_incr_refcount + 1
51
+ self._refcounts[block_id] = post_incr_refcount
52
+ return post_incr_refcount
53
+
54
+ def decr(self, block_id: BlockId) -> RefCount:
55
+ assert block_id in self._refcounts
56
+ refcount = self._refcounts[block_id]
57
+
58
+ assert refcount > 0
59
+ refcount -= 1
60
+
61
+ self._refcounts[block_id] = refcount
62
+
63
+ return refcount
64
+
65
+ def get(self, block_id: BlockId) -> RefCount:
66
+ assert block_id in self._refcounts
67
+ return self._refcounts[block_id]
68
+
69
+ def as_readonly(self) -> "ReadOnlyRefCounter":
70
+ return ReadOnlyRefCounter(self)
71
+
72
+
73
+ class ReadOnlyRefCounter(RefCounterProtocol):
74
+ """A read-only view of the RefCounter class.
75
+
76
+ The ReadOnlyRefCounter class provides a read-only interface to access the
77
+ reference counts maintained by a RefCounter instance. It does not allow
78
+ modifications to the reference counts.
79
+
80
+ Args:
81
+ refcounter (RefCounter): The RefCounter instance to create a read-only
82
+ view for.
83
+ """
84
+
85
+ def __init__(self, refcounter: RefCounter):
86
+ self._refcounter = refcounter
87
+
88
+ def incr(self, block_id: BlockId) -> RefCount:
89
+ raise ValueError("Incr not allowed")
90
+
91
+ def decr(self, block_id: BlockId) -> RefCount:
92
+ raise ValueError("Decr not allowed")
93
+
94
+ def get(self, block_id: BlockId) -> RefCount:
95
+ return self._refcounter.get(block_id)
96
+
97
+
98
+ class CopyOnWriteTracker:
99
+ """A class for tracking and managing copy-on-write operations for blocks.
100
+
101
+ The CopyOnWriteTracker class maintains a mapping of source block indices to
102
+ their corresponding copy-on-write destination block indices. It works in
103
+ conjunction with a RefCounter.
104
+
105
+ Args:
106
+ refcounter (RefCounter): The reference counter used to track block
107
+ reference counts.
108
+ """
109
+
110
+ def __init__(self, refcounter: RefCounterProtocol):
111
+ self._copy_on_writes: List[Tuple[BlockId, BlockId]] = []
112
+ self._refcounter = refcounter
113
+
114
+ def is_appendable(self, block: Block) -> bool:
115
+ """Checks if the block is shared or not. If shared, then it cannot
116
+ be appended and needs to be duplicated via copy-on-write
117
+ """
118
+ block_id = block.block_id
119
+ if block_id is None:
120
+ return True
121
+
122
+ refcount = self._refcounter.get(block_id)
123
+ return refcount <= 1
124
+
125
+ def record_cow(self, src_block_id: Optional[BlockId],
126
+ trg_block_id: Optional[BlockId]) -> None:
127
+ """Records a copy-on-write operation from source to target block id
128
+ Args:
129
+ src_block_id (BlockId): The source block id from which to copy
130
+ the data
131
+ trg_block_id (BlockId): The target block id to which the data
132
+ is copied
133
+ """
134
+ assert src_block_id is not None
135
+ assert trg_block_id is not None
136
+ self._copy_on_writes.append((src_block_id, trg_block_id))
137
+
138
+ def clear_cows(self) -> List[Tuple[BlockId, BlockId]]:
139
+ """Clears the copy-on-write tracking information and returns the current
140
+ state.
141
+
142
+ This method returns a list mapping source block indices to
143
+ destination block indices for the current copy-on-write operations.
144
+ It then clears the internal tracking information.
145
+
146
+ Returns:
147
+ List[Tuple[BlockId, BlockId]]: A list mapping source
148
+ block indices to destination block indices for the
149
+ current copy-on-write operations.
150
+ """
151
+ cows = self._copy_on_writes
152
+ self._copy_on_writes = []
153
+ return cows
154
+
155
+
156
class BlockPool:
    """Used to pre-allocate block objects, in order to avoid excessive python
    object allocations/deallocations.
    The pool starts from "pool_size" objects and will increase to more objects
    if necessary

    Note that multiple block objects may point to the same physical block id,
    which is why this pool is needed, so that it will be easier to support
    prefix caching and more complicated sharing of physical blocks.
    """

    def __init__(self, block_size: int, create_block: "Block.Factory",
                 allocator: "BlockAllocator", pool_size: int):
        self._block_size = block_size
        self._create_block = create_block
        self._allocator = allocator
        self._pool_size = pool_size
        assert self._pool_size >= 0

        # Pool slot ids that are currently unused; popped/pushed at the left.
        self._free_ids: Deque[int] = deque(range(self._pool_size))
        # Pooled blocks start detached (no id, no tokens) and are
        # re-initialized in place by init_block().
        self._pool: List["Block"] = [
            self._new_detached_block() for _ in range(self._pool_size)
        ]

    def _new_detached_block(self) -> "Block":
        """Create one empty, unattached block object for the pool."""
        return self._create_block(prev_block=None,
                                  token_ids=[],
                                  block_size=self._block_size,
                                  allocator=self._allocator,
                                  block_id=None,
                                  extra_hash=None)

    def increase_pool(self):
        """Doubles the internal pool size.

        A pool created with pool_size == 0 grows to 1: doubling zero would
        otherwise add no capacity and init_block() could never make progress.
        """
        cur_pool_size = self._pool_size
        # max(..., 1) fixes the degenerate pool_size == 0 case allowed by
        # the constructor's `>= 0` assertion.
        new_pool_size = max(cur_pool_size * 2, 1)
        self._pool_size = new_pool_size

        self._free_ids += deque(range(cur_pool_size, new_pool_size))

        self._pool.extend(self._new_detached_block()
                          for _ in range(cur_pool_size, new_pool_size))

    def init_block(self,
                   prev_block: Optional["Block"],
                   token_ids: List[int],
                   block_size: int,
                   physical_block_id: Optional[int],
                   extra_hash: Optional[int] = None) -> "Block":
        """Take a pooled block object and re-initialize it in place.

        Args:
            prev_block: The previous block in the sequence, if any.
            token_ids: Token ids to store in the block.
            block_size: Capacity of the block in tokens.
            physical_block_id: Physical block id to attach, or None.
            extra_hash: Optional extra hash factored into the block hash.

        Returns:
            Block: The re-initialized pooled block. Its `pool_id` attribute
            records the pool slot so free_block() can recycle it.
        """
        if len(self._free_ids) == 0:
            self.increase_pool()
            assert len(self._free_ids) > 0

        pool_id = self._free_ids.popleft()

        block = self._pool[pool_id]
        # Re-run __init__ on the existing object instead of allocating a
        # new one; avoiding that allocation is the point of the pool.
        block.__init__(  # type: ignore[misc]
            prev_block=prev_block,
            token_ids=token_ids,
            block_size=block_size,
            allocator=block._allocator,  # type: ignore[attr-defined]
            block_id=physical_block_id,
            extra_hash=extra_hash)
        block.pool_id = pool_id  # type: ignore[attr-defined]
        return block

    def free_block(self, block: "Block") -> None:
        """Return the block's pool slot so the object can be reused."""
        self._free_ids.appendleft(block.pool_id)  # type: ignore[attr-defined]
229
+
230
+
231
class BlockList:
    """This class is an optimization to allow fast-access to physical
    block ids. It maintains a block id list that is updated with the
    block list and this avoids the need to reconstruct the block id
    list on every iteration of the block manager
    """

    def __init__(self, blocks: List["Block"]):
        self._blocks: List["Block"] = []
        self._block_ids: List[int] = []

        self.update(blocks)

    def _add_block_id(self, block_id: Optional["BlockId"]) -> None:
        # Every tracked block must be backed by a physical id.
        assert block_id is not None
        self._block_ids.append(block_id)

    def _update_block_id(self, block_index: int,
                         new_block_id: Optional["BlockId"]) -> None:
        assert new_block_id is not None
        self._block_ids[block_index] = new_block_id

    def update(self, blocks: List["Block"]):
        """Replace the tracked blocks and rebuild the cached id list."""
        self._blocks = blocks

        # Cache block ids for fast query
        self._block_ids = []
        for tracked in blocks:
            self._add_block_id(tracked.block_id)

    def append_token_ids(self, block_index: int, token_ids: List[int]) -> None:
        """Append tokens to the block at `block_index`, keeping ids in sync."""
        target = self._blocks[block_index]
        id_before = target.block_id

        target.append_token_ids(token_ids)

        # CoW or promotion may update the internal block_id
        id_after = target.block_id
        if id_before != id_after:
            self._update_block_id(block_index, id_after)

    def append(self, new_block: "Block"):
        """Track one more block at the end of the list."""
        self._blocks.append(new_block)
        self._add_block_id(new_block.block_id)

    def __len__(self) -> int:
        return len(self._blocks)

    def __getitem__(self, block_index: int) -> "Block":
        return self._blocks[block_index]

    def __setitem__(self, block_index: int, new_block: "Block") -> None:
        self._blocks[block_index] = new_block
        self._update_block_id(block_index, new_block.block_id)

    def reset(self):
        """Forget every tracked block and cached id."""
        self._blocks = []
        self._block_ids = []

    def list(self) -> List["Block"]:
        return self._blocks

    def ids(self) -> List[int]:
        return self._block_ids
294
+
295
+
296
@dataclass
class CacheMetricData:
    """A utility dataclass to maintain cache metric.
    To avoid overflow, we maintain the hit rate in block granularity, so that
    we can maintain a single hit rate for n_completed_block x block_size,
    and calculate the real time hit rate by the following:
    BS = The number of queries per block.
    nB = The number of completed blocks.
    HR = hit rate of (nB x BS) queries.
    Q = current number of queries (< BS).
    H = current number of hits (< BS).
    hit rate = ((HR x nB) + (H / Q) x (Q / BS)) / (nB + Q / BS)
    """
    num_completed_blocks: int = 0
    completed_block_cache_hit_rate: float = 0.0
    num_incompleted_block_queries: int = 0
    num_incompleted_block_hit: int = 0
    block_size: int = 1000

    def query(self, hit: bool):
        """Record one cache query; fold stats when a block completes."""
        self.num_incompleted_block_queries += 1
        if hit:
            self.num_incompleted_block_hit += 1

        if self.num_incompleted_block_queries != self.block_size:
            return
        # The in-progress block is complete: merge its hit rate into the
        # running per-block average and restart the partial counters.
        block_hit_rate = (self.num_incompleted_block_hit /
                          self.num_incompleted_block_queries)
        n_done = self.num_completed_blocks
        self.completed_block_cache_hit_rate = (
            self.completed_block_cache_hit_rate * n_done +
            block_hit_rate) / (n_done + 1)
        self.num_incompleted_block_queries = 0
        self.num_incompleted_block_hit = 0
        self.num_completed_blocks = n_done + 1

    def get_hit_rate(self):
        """Return the overall hit rate (0.0 when nothing was queried)."""
        partial_weight = self.num_incompleted_block_queries / self.block_size
        total_weight = self.num_completed_blocks + partial_weight
        if total_weight == 0:
            return 0.0

        completed_contrib, partial_contrib = 0.0, 0.0
        if self.num_completed_blocks > 0:
            completed_contrib = (self.completed_block_cache_hit_rate *
                                 self.num_completed_blocks)
        if self.num_incompleted_block_queries > 0:
            partial_rate = (self.num_incompleted_block_hit /
                            self.num_incompleted_block_queries)
            partial_contrib = partial_rate * partial_weight
        return (completed_contrib + partial_contrib) / total_weight
346
+
347
+
348
def get_all_blocks_recursively(last_block: "Block") -> List["Block"]:
    """Retrieves all the blocks in a sequence starting from the last block.

    Walks the `prev_block` chain iteratively (the previous recursive
    implementation could exceed Python's recursion limit on very long
    sequences) and returns all blocks in the order they appear, oldest
    first and `last_block` last.

    Args:
        last_block (Block): The last block in the sequence.

    Returns:
        List[Block]: A list of all the blocks in the sequence, in the order
            they appear.
    """
    all_blocks: List["Block"] = []
    block: Optional["Block"] = last_block
    while block is not None:
        all_blocks.append(block)
        block = block.prev_block
    # Collected last-to-first; callers expect forward order.
    all_blocks.reverse()
    return all_blocks
.venv/lib/python3.11/site-packages/vllm/core/block/cpu_gpu_block_allocator.py ADDED
@@ -0,0 +1,438 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+
3
+ from typing import Dict, FrozenSet, List, Optional, Tuple
4
+
5
+ from vllm.core.block.interfaces import (Block, BlockAllocator, BlockId,
6
+ DeviceAwareBlockAllocator)
7
+ from vllm.core.block.naive_block import NaiveBlock, NaiveBlockAllocator
8
+ from vllm.core.block.prefix_caching_block import PrefixCachingBlockAllocator
9
+ from vllm.platforms import current_platform
10
+ from vllm.utils import Device
11
+
12
+
13
class CpuGpuBlockAllocator(DeviceAwareBlockAllocator):
    """A block allocator that can allocate blocks on both CPU and GPU memory.

    This class implements the `DeviceAwareBlockAllocator` interface and provides
    functionality for allocating and managing blocks of memory on both CPU and
    GPU devices.

    The `CpuGpuBlockAllocator` maintains separate memory pools for CPU and GPU
    blocks, and allows for allocation, deallocation, forking, and swapping of
    blocks across these memory pools.
    """

    @staticmethod
    def create(
        allocator_type: str,
        num_gpu_blocks: int,
        num_cpu_blocks: int,
        block_size: int,
    ) -> DeviceAwareBlockAllocator:
        """Creates a CpuGpuBlockAllocator instance with the specified
        configuration.

        This static method creates and returns a CpuGpuBlockAllocator instance
        based on the provided parameters. It initializes the CPU and GPU block
        allocators with the specified number of blocks, block size, and
        allocator type.

        Args:
            allocator_type (str): The type of block allocator to use for CPU
                and GPU blocks. Currently supported values are "naive" and
                "prefix_caching".
            num_gpu_blocks (int): The number of blocks to allocate for GPU
                memory.
            num_cpu_blocks (int): The number of blocks to allocate for CPU
                memory.
            block_size (int): The size of each block in number of tokens.

        Returns:
            DeviceAwareBlockAllocator: A CpuGpuBlockAllocator instance with the
                specified configuration.

        Notes:
            - The block IDs are assigned contiguously, with GPU block IDs coming
              before CPU block IDs.
        """
        # For HPU, block id 0 is used only for padding
        reserved_blocks = 1 if current_platform.is_hpu() else 0
        block_ids = list(
            range(reserved_blocks, num_gpu_blocks + num_cpu_blocks))
        num_gpu_blocks -= reserved_blocks
        gpu_block_ids = block_ids[:num_gpu_blocks]
        cpu_block_ids = block_ids[num_gpu_blocks:]

        if allocator_type == "naive":
            gpu_allocator: BlockAllocator = NaiveBlockAllocator(
                create_block=NaiveBlock,  # type: ignore
                num_blocks=num_gpu_blocks,
                block_size=block_size,
                block_ids=gpu_block_ids,
            )

            cpu_allocator: BlockAllocator = NaiveBlockAllocator(
                create_block=NaiveBlock,  # type: ignore
                num_blocks=num_cpu_blocks,
                block_size=block_size,
                block_ids=cpu_block_ids,
            )
        elif allocator_type == "prefix_caching":
            gpu_allocator = PrefixCachingBlockAllocator(
                num_blocks=num_gpu_blocks,
                block_size=block_size,
                block_ids=gpu_block_ids,
            )

            cpu_allocator = PrefixCachingBlockAllocator(
                num_blocks=num_cpu_blocks,
                block_size=block_size,
                block_ids=cpu_block_ids,
            )
        else:
            raise ValueError(f"Unknown allocator type {allocator_type=}")

        return CpuGpuBlockAllocator(
            cpu_block_allocator=cpu_allocator,
            gpu_block_allocator=gpu_allocator,
        )

    def __init__(self, cpu_block_allocator: BlockAllocator,
                 gpu_block_allocator: BlockAllocator):
        assert not (
            cpu_block_allocator.all_block_ids
            & gpu_block_allocator.all_block_ids
        ), "cpu and gpu block allocators can't have intersection of block ids"

        self._allocators = {
            Device.CPU: cpu_block_allocator,
            Device.GPU: gpu_block_allocator,
        }

        # Accumulated swap mapping across moves; drained by
        # get_and_reset_swaps().
        self._swap_mapping: Dict[int, int] = {}
        self._null_block: Optional[Block] = None

        # Reverse index: physical block id -> owning allocator.
        self._block_ids_to_allocator: Dict[int, BlockAllocator] = {}
        for allocator in self._allocators.values():
            for block_id in allocator.all_block_ids:
                self._block_ids_to_allocator[block_id] = allocator

    def allocate_or_get_null_block(self) -> Block:
        """Return the single shared null block, allocating it lazily."""
        if self._null_block is None:
            self._null_block = NullBlock(
                self.allocate_mutable_block(None, Device.GPU))
        return self._null_block

    def allocate_mutable_block(self,
                               prev_block: Optional[Block],
                               device: Device,
                               extra_hash: Optional[int] = None) -> Block:
        """Allocates a new mutable block on the specified device.

        Args:
            prev_block (Optional[Block]): The previous block to in the sequence.
                Used for prefix hashing.
            device (Device): The device on which to allocate the new block.
            extra_hash (Optional[int]): The hash value of additional
                factors, such as adapters, that influence the block hash
                in the prefix caching block.

        Returns:
            Block: The newly allocated mutable block.
        """
        return self._allocators[device].allocate_mutable_block(
            prev_block, extra_hash=extra_hash)

    def allocate_immutable_blocks(
            self,
            prev_block: Optional[Block],
            block_token_ids: List[List[int]],
            device: Device,
            extra_hash: Optional[int] = None) -> List[Block]:
        """Allocates a new group of immutable blocks with the provided block
        token IDs on the specified device.

        Args:
            prev_block (Optional[Block]): The previous block in the sequence.
                Used for prefix hashing.
            block_token_ids (List[int]): The list of block token IDs to be
                stored in the new blocks.
            device (Device): The device on which to allocate the new block.
            extra_hash (Optional[int]): The hash value of additional
                factors, such as adapters, that influence the block hash
                in the prefix caching block.

        Returns:
            List[Block]: The newly allocated list of immutable blocks
                containing the provided block token IDs.
        """
        return self._allocators[device].allocate_immutable_blocks(
            prev_block, block_token_ids, extra_hash=extra_hash)

    def allocate_immutable_block(self,
                                 prev_block: Optional[Block],
                                 token_ids: List[int],
                                 device: Device,
                                 extra_hash: Optional[int] = None) -> Block:
        """Allocates a new immutable block with the provided token IDs on the
        specified device.

        Args:
            prev_block (Optional[Block]): The previous block in the sequence.
                Used for prefix hashing.
            token_ids (List[int]): The list of token IDs to be stored in the new
                block.
            device (Device): The device on which to allocate the new block.
            extra_hash (Optional[int]): The hash value of additional
                factors, such as adapters, that influence the block hash
                in the prefix caching block.

        Returns:
            Block: The newly allocated immutable block containing the provided
                token IDs.
        """
        return self._allocators[device].allocate_immutable_block(
            prev_block, token_ids, extra_hash=extra_hash)

    def free(self, block: Block) -> None:
        """Frees the memory occupied by the given block.

        Args:
            block (Block): The block to be freed.
        """
        # Null block should never be freed
        if isinstance(block, NullBlock):
            return
        block_id = block.block_id
        assert block_id is not None
        allocator = self._block_ids_to_allocator[block_id]
        allocator.free(block)

    def fork(self, last_block: Block) -> List[Block]:
        """Creates a new sequence of blocks that shares the same underlying
        memory as the original sequence.

        Args:
            last_block (Block): The last block in the original sequence.

        Returns:
            List[Block]: A new list of blocks that shares the same memory as the
                original sequence.
        """
        # do not attempt to fork the null block
        assert not isinstance(last_block, NullBlock)
        block_id = last_block.block_id
        assert block_id is not None
        allocator = self._block_ids_to_allocator[block_id]
        return allocator.fork(last_block)

    def get_num_free_blocks(self, device: Device) -> int:
        """Returns the number of free blocks available on the specified device.

        Args:
            device (Device): The device for which to query the number of free
                blocks. AssertionError is raised if None is passed.

        Returns:
            int: The number of free blocks available on the specified device.
        """
        return self._allocators[device].get_num_free_blocks()

    def get_num_total_blocks(self, device: Device) -> int:
        """Return the total block count managed for the given device."""
        return self._allocators[device].get_num_total_blocks()

    def get_physical_block_id(self, device: Device, absolute_id: int) -> int:
        """Returns the zero-offset block id on certain device given the
        absolute block id.

        Args:
            device (Device): The device for which to query relative block id.
            absolute_id (int): The absolute block id for the block in
                whole allocator.

        Returns:
            int: The zero-offset block id on certain device.
        """
        return self._allocators[device].get_physical_block_id(absolute_id)

    def swap(self, blocks: List[Block], src_device: Device,
             dst_device: Device) -> Dict[int, int]:
        """Execute the swap for the given blocks from source_device
        on to dest_device, save the current swap mapping and append
        them to the accumulated `self._swap_mapping` for each
        scheduling move.

        Args:
            blocks: List of blocks to be swapped.
            src_device (Device): Device to swap the 'blocks' from.
            dst_device (Device): Device to swap the 'blocks' to.

        Returns:
            Dict[int, int]: Swap mapping from source_device
                on to dest_device.
        """
        # Capture ids before the swap mutates them, then again after.
        src_block_ids = [block.block_id for block in blocks]
        self._allocators[src_device].swap_out(blocks)
        self._allocators[dst_device].swap_in(blocks)
        dst_block_ids = [block.block_id for block in blocks]

        current_swap_mapping: Dict[int, int] = {}
        for src_block_id, dst_block_id in zip(src_block_ids, dst_block_ids):
            if src_block_id is not None and dst_block_id is not None:
                self._swap_mapping[src_block_id] = dst_block_id
                current_swap_mapping[src_block_id] = dst_block_id
        return current_swap_mapping

    def get_num_full_blocks_touched(self, blocks: List[Block],
                                    device: Device) -> int:
        """Returns the number of full blocks that will be touched by
        swapping in/out the given blocks on to the 'device'.

        Args:
            blocks: List of blocks to be swapped.
            device (Device): Device to swap the 'blocks' on.

        Returns:
            int: the number of full blocks that will be touched by
                swapping in/out the given blocks on to the 'device'.
                Non full blocks are ignored when deciding the number
                of blocks to touch.
        """
        return self._allocators[device].get_num_full_blocks_touched(blocks)

    def clear_copy_on_writes(self) -> List[Tuple[int, int]]:
        """Clears the copy-on-write (CoW) state and returns the mapping of
        source to destination block IDs.

        Returns:
            List[Tuple[int, int]]: A list mapping source block IDs to
                destination block IDs.
        """
        # CoW only supported on GPU
        device = Device.GPU
        return self._allocators[device].clear_copy_on_writes()

    def mark_blocks_as_accessed(self, block_ids: List[int],
                                now: float) -> None:
        """Mark blocks as accessed, only use for prefix caching."""
        # Prefix caching only supported on GPU.
        device = Device.GPU
        return self._allocators[device].mark_blocks_as_accessed(block_ids, now)

    def mark_blocks_as_computed(self, block_ids: List[int]) -> None:
        """Mark blocks as computed, only use for prefix caching."""
        # Prefix caching only supported on GPU.
        device = Device.GPU
        return self._allocators[device].mark_blocks_as_computed(block_ids)

    def get_common_computed_block_ids(
            self, computed_seq_block_ids: List[List[int]]) -> List[int]:
        """Delegate to the GPU allocator (prefix caching is GPU-only)."""
        # Prefix caching only supported on GPU.
        device = Device.GPU
        return self._allocators[device].get_common_computed_block_ids(
            computed_seq_block_ids)

    @property
    def all_block_ids(self) -> FrozenSet[int]:
        """Union of the block ids owned by the CPU and GPU allocators."""
        return frozenset(self._block_ids_to_allocator.keys())

    def get_prefix_cache_hit_rate(self, device: Device) -> float:
        """Prefix cache hit rate. -1 means not supported or disabled."""
        assert device in self._allocators
        return self._allocators[device].get_prefix_cache_hit_rate()

    def reset_prefix_cache(self) -> bool:
        """Reset prefix cache for all devices."""
        # Evaluate every allocator eagerly: combining with a short-circuit
        # `and` would skip resetting the remaining devices after the first
        # failure.
        results = [
            allocator.reset_prefix_cache()
            for allocator in self._allocators.values()
        ]
        return all(results)

    def get_and_reset_swaps(self) -> List[Tuple[int, int]]:
        """Returns and clears the mapping of source to destination block IDs.
        Will be called after every swapping operations for now, and after every
        schedule when BlockManagerV2 become default. Currently not useful.

        Returns:
            List[Tuple[int, int]]: A mapping of source to destination block IDs.
        """
        mapping = self._swap_mapping.copy()
        self._swap_mapping.clear()
        return list(mapping.items())

    def find_cached_blocks_prefix(
        self,
        block_hashes: List[int],
        device: Device = Device.GPU,
    ) -> List[int]:
        """Delegate the cached-prefix lookup to the device's allocator."""
        return self._allocators[device].find_cached_blocks_prefix(block_hashes)
369
+
370
+
371
class NullBlock(Block):
    """
    Null blocks are used as a placeholders for KV cache blocks that have
    been dropped due to sliding window.
    This implementation just wraps an ordinary block and prevents it from
    being modified. It also allows for testing if a block is NullBlock
    via isinstance().
    """

    def __init__(self, proxy: Block):
        super().__init__()
        # The real allocated block this placeholder wraps; all read-only
        # queries below delegate to it.
        self._proxy = proxy

    def append_token_ids(self, token_ids: List[BlockId]):
        # The null block is a shared placeholder, so writes are forbidden.
        raise ValueError("null block should not be modified")

    @property
    def block_id(self):
        # Delegated to the wrapped block.
        return self._proxy.block_id

    @block_id.setter
    def block_id(self, value: Optional[BlockId]):
        # Reassigning the underlying physical block is forbidden.
        raise ValueError("null block should not be modified")

    @property
    def token_ids(self) -> List[BlockId]:
        return self._proxy.token_ids

    @property
    def num_tokens_total(self) -> int:
        # Deliberately unsupported for the placeholder.
        raise NotImplementedError(
            "num_tokens_total is not used for null block")

    @property
    def num_empty_slots(self) -> BlockId:
        return self._proxy.num_empty_slots

    @property
    def is_full(self):
        return self._proxy.is_full

    @property
    def prev_block(self):
        return self._proxy.prev_block

    @property
    def extra_hash(self):
        # The null block never reports an extra hash.
        return None

    @property
    def computed(self):
        return self._proxy.computed

    @computed.setter
    def computed(self, value):
        # Unlike block_id, the computed flag may be updated via the proxy.
        self._proxy.computed = value

    @property
    def last_accessed(self) -> float:
        return self._proxy.last_accessed

    @last_accessed.setter
    def last_accessed(self, last_accessed_ts: float):
        self._proxy.last_accessed = last_accessed_ts

    @property
    def content_hash(self):
        return self._proxy.content_hash
.venv/lib/python3.11/site-packages/vllm/core/block/interfaces.py ADDED
@@ -0,0 +1,318 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+
3
+ from abc import ABC, abstractmethod
4
+ from typing import Dict, FrozenSet, List, Optional, Protocol, Tuple
5
+
6
+ from vllm.utils import Device
7
+
8
+ BlockId = int
9
+
10
+
11
class Block(ABC):
    """Abstract interface for a single block of token ids.

    Concrete implementations store token ids, link to the previous block in
    the sequence, and may carry a physical ``block_id`` assigned by a
    ``BlockAllocator``.
    """

    @abstractmethod
    def append_token_ids(self, token_ids: List[int]) -> None:
        """Append the given token ids to this block."""
        pass

    @property
    @abstractmethod
    def block_id(self) -> Optional[int]:
        """The physical block id backing this block, or None."""
        pass

    @block_id.setter
    @abstractmethod
    def block_id(self, value: Optional[int]) -> None:
        """NOTE: Do not use this API outside Block."""
        self._block_id = value

    @property
    @abstractmethod
    def token_ids(self) -> List[int]:
        """The token ids currently stored in this block."""
        pass

    @property
    @abstractmethod
    def num_tokens_total(self) -> int:
        """The number of tokens till the current block (inclusive)
        """
        pass

    @property
    @abstractmethod
    def num_empty_slots(self) -> int:
        """The number of token slots still free in this block."""
        pass

    @property
    @abstractmethod
    def is_full(self) -> bool:
        """Whether the block has no empty slots left."""
        pass

    @property
    @abstractmethod
    def prev_block(self) -> Optional["Block"]:
        """The previous block in the sequence, or None."""
        pass

    @property
    @abstractmethod
    def extra_hash(self) -> Optional[int]:
        """Hash of additional factors influencing the block hash, or None."""
        return None

    @property
    @abstractmethod
    def computed(self) -> bool:
        """Whether this block has been marked as computed."""
        raise NotImplementedError

    @computed.setter
    @abstractmethod
    def computed(self, value) -> bool:
        """Should be only used by PrefixCachingAllocator"""
        raise NotImplementedError

    @property
    @abstractmethod
    def last_accessed(self) -> float:
        """Timestamp of the most recent access to this block."""
        raise NotImplementedError

    @last_accessed.setter
    @abstractmethod
    def last_accessed(self, last_accessed_ts: float):
        raise NotImplementedError

    class Factory(Protocol):
        """Callable protocol for constructing ``Block`` instances."""

        @abstractmethod
        def __call__(
            self,
            prev_block: Optional["Block"],
            token_ids: List[int],
            block_size: int,
            allocator: "BlockAllocator",
            block_id: Optional[int] = None,
            computed: bool = False,
            extra_hash: Optional[int] = None,
        ) -> "Block":
            pass

    @property
    @abstractmethod
    def content_hash(self) -> Optional[int]:
        """Return the content-based hash of the current block, or None if it is
        not yet defined or not supported.

        For the content-based hash to be defined, the current block must be
        full.
        """
        return None
106
+
107
+
108
class BlockAllocator(ABC):
    """Abstract interface for allocating, freeing, forking and swapping
    blocks drawn from a single pool of physical block ids.
    """

    @abstractmethod
    def allocate_mutable_block(self, prev_block: Optional[Block],
                               extra_hash: Optional[int]) -> Block:
        """Allocate a block whose token ids can still be appended to."""
        pass

    @abstractmethod
    def allocate_immutable_block(self, prev_block: Optional[Block],
                                 token_ids: List[int],
                                 extra_hash: Optional[int]) -> Block:
        """Allocate a block pre-populated with the given token ids."""
        pass

    @abstractmethod
    def allocate_immutable_blocks(self, prev_block: Optional[Block],
                                  block_token_ids: List[List[int]],
                                  extra_hash: Optional[int]) -> List[Block]:
        """Allocate one immutable block per entry of block_token_ids."""
        pass

    @abstractmethod
    def free(self, block: Block) -> None:
        """Release the given block back to the allocator."""
        pass

    @abstractmethod
    def fork(self, last_block: Block) -> List[Block]:
        """Create a new block sequence sharing memory with last_block's
        sequence."""
        pass

    @abstractmethod
    def get_num_total_blocks(self) -> int:
        """Total number of blocks managed by this allocator."""
        pass

    @abstractmethod
    def get_num_free_blocks(self) -> int:
        """Number of blocks currently available for allocation."""
        pass

    @abstractmethod
    def get_physical_block_id(self, absolute_id: int) -> int:
        """Translate an absolute block id to this allocator's zero-offset
        id."""
        pass

    @abstractmethod
    def swap_out(self, blocks: List[Block]) -> None:
        """Handle the given blocks being swapped out of this allocator."""
        pass

    @abstractmethod
    def swap_in(self, blocks: List[Block]) -> None:
        """Handle the given blocks being swapped in to this allocator."""
        pass

    @property
    @abstractmethod
    def all_block_ids(self) -> FrozenSet[int]:
        """The full set of physical block ids owned by this allocator."""
        pass

    @abstractmethod
    def clear_copy_on_writes(self) -> List[Tuple[int, int]]:
        """Return and clear the pending (src, dst) copy-on-write pairs."""
        pass

    @abstractmethod
    def mark_blocks_as_accessed(self, block_ids: List[int],
                                now: float) -> None:
        """Record an access time for the given blocks (prefix caching)."""
        pass

    @abstractmethod
    def mark_blocks_as_computed(self, block_ids: List[int]) -> None:
        """Mark the given blocks as computed (prefix caching)."""
        pass

    @abstractmethod
    def get_common_computed_block_ids(
            self, computed_seq_block_ids: List[List[int]]) -> List[int]:
        """Return the computed block ids shared by all given sequences."""
        pass

    @abstractmethod
    def cow_block_if_not_appendable(self, block: Block) -> BlockId:
        """NOTE: This should not be used besides Block"""
        pass

    @abstractmethod
    def promote_to_immutable_block(self, block: Block) -> BlockId:
        """NOTE: This should not be used besides Block"""
        pass

    @abstractmethod
    def get_num_full_blocks_touched(self, blocks: List[Block]) -> int:
        """Number of full blocks touched when swapping the given blocks."""
        pass

    @abstractmethod
    def get_prefix_cache_hit_rate(self) -> float:
        """Prefix cache hit rate. -1 means not supported or disabled."""
        pass

    @abstractmethod
    def reset_prefix_cache(self) -> bool:
        """Reset prefix cache."""
        pass

    class NoFreeBlocksError(ValueError):
        """Raised when an allocation is requested but no block is free."""
        pass

    @abstractmethod
    def find_cached_blocks_prefix(
        self,
        block_hashes: List[int],
    ) -> List[int]:
        """Return the prefix of block_hashes whose blocks are cached."""
        pass
211
+
212
+
213
class DeviceAwareBlockAllocator(ABC):
    """Abstract interface for a block allocator spanning multiple devices.

    Most methods mirror ``BlockAllocator`` but take an explicit ``Device``
    selecting which per-device pool to operate on.
    """

    @abstractmethod
    def allocate_mutable_block(self,
                               prev_block: Optional[Block],
                               device: Device,
                               extra_hash: Optional[int] = None) -> Block:
        """Allocate an appendable block on the given device."""
        pass

    @abstractmethod
    def allocate_immutable_block(self,
                                 prev_block: Optional[Block],
                                 token_ids: List[int],
                                 device: Device,
                                 extra_hash: Optional[int] = None) -> Block:
        """Allocate a block pre-populated with token_ids on the device."""
        pass

    @abstractmethod
    def allocate_immutable_blocks(
        self,
        prev_block: Optional[Block],
        block_token_ids: List[List[int]],
        device: Device,
        extra_hash: Optional[int] = None,
    ) -> List[Block]:
        """Allocate one immutable block per entry of block_token_ids."""
        pass

    @abstractmethod
    def get_num_free_blocks(self, device: Device) -> int:
        """Number of free blocks available on the given device."""
        pass

    @abstractmethod
    def get_num_total_blocks(self, device: Device) -> int:
        """Total number of blocks managed for the given device."""
        pass

    @abstractmethod
    def free(self, block: Block) -> None:
        """Release the given block back to its owning allocator."""
        pass

    @abstractmethod
    def fork(self, last_block: Block) -> List[Block]:
        """Create a new block sequence sharing memory with last_block's
        sequence."""
        pass

    @property
    @abstractmethod
    def all_block_ids(self) -> FrozenSet[int]:
        """The set of physical block ids across all devices."""
        pass

    @abstractmethod
    def clear_copy_on_writes(self) -> List[Tuple[int, int]]:
        """Return and clear the pending (src, dst) copy-on-write pairs."""
        pass

    @abstractmethod
    def mark_blocks_as_accessed(self, block_ids: List[int],
                                now: float) -> None:
        """Record an access time for the given blocks (prefix caching)."""
        pass

    @abstractmethod
    def mark_blocks_as_computed(self, block_ids: List[int]) -> None:
        """Mark the given blocks as computed (prefix caching)."""
        pass

    @abstractmethod
    def get_common_computed_block_ids(
            self, computed_seq_block_ids: List[List[int]]) -> List[int]:
        """Return the computed block ids shared by all given sequences."""
        pass

    @abstractmethod
    def get_num_full_blocks_touched(self, blocks: List[Block],
                                    device: Device) -> int:
        """Number of full blocks touched when swapping blocks on device."""
        pass

    @abstractmethod
    def swap(self, blocks: List[Block], src_device: Device,
             dst_device: Device) -> Dict[int, int]:
        """Swap blocks from src_device to dst_device; return the id map."""
        pass

    @abstractmethod
    def get_physical_block_id(self, device: Device, absolute_id: int) -> int:
        """Translate an absolute block id to the device's zero-offset id."""
        pass

    @abstractmethod
    def allocate_or_get_null_block(self) -> Block:
        """
        Null blocks are used as a placeholders for KV cache blocks that have
        been dropped due to sliding window.
        There is at most one null block per allocator.
        """
        pass

    @abstractmethod
    def get_prefix_cache_hit_rate(self, device: Device) -> float:
        """Prefix cache hit rate. -1 means not supported or disabled."""
        pass

    @abstractmethod
    def reset_prefix_cache(self) -> bool:
        """Reset prefix cache."""
        pass

    @abstractmethod
    def find_cached_blocks_prefix(
        self,
        block_hashes: List[int],
        device: Device = Device.GPU,
    ) -> List[int]:
        """Return the prefix of block_hashes cached on the given device."""
        pass
.venv/lib/python3.11/site-packages/vllm/core/block/naive_block.py ADDED
@@ -0,0 +1,465 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+
3
+ from collections import deque
4
+ from typing import Deque, FrozenSet, Iterable, List, Optional, Tuple, Union
5
+
6
+ from vllm.core.block.common import (BlockPool, CopyOnWriteTracker, RefCounter,
7
+ get_all_blocks_recursively)
8
+ from vllm.core.block.interfaces import Block, BlockAllocator, BlockId, Device
9
+
10
+ Refcount = int
11
+
12
+
13
+ class NaiveBlockAllocator(BlockAllocator):
14
+ """A simple block allocator that manages blocks of memory without prefix
15
+ caching.
16
+
17
+ Args:
18
+ create_block (Block.Factory): A factory function for creating new
19
+ blocks. This is used when a NaiveBlockAllocator is composed within
20
+ a prefix caching allocator -- the naive block allocator must
21
+ construct prefix caching blocks (but shouldn't know anything else
22
+ about them).
23
+ num_blocks (int): The total number of blocks to manage.
24
+ block_size (int): The size of each block in tokens.
25
+ block_ids (Optional[Iterable[int]], optional): An optional iterable of
26
+ block IDs. If not provided, block IDs will be assigned sequentially
27
+ from 0 to num_blocks - 1.
28
+ """
29
+
30
+ def __init__(
31
+ self,
32
+ create_block: Block.Factory,
33
+ num_blocks: int,
34
+ block_size: int,
35
+ block_ids: Optional[Iterable[int]] = None,
36
+ block_pool: Optional[BlockPool] = None,
37
+ ):
38
+ if block_ids is None:
39
+ block_ids = range(num_blocks)
40
+
41
+ self._free_block_indices: Deque[BlockId] = deque(block_ids)
42
+ self._all_block_indices = frozenset(block_ids)
43
+ assert len(self._all_block_indices) == num_blocks
44
+
45
+ self._refcounter = RefCounter(
46
+ all_block_indices=self._free_block_indices)
47
+ self._block_size = block_size
48
+
49
+ self._cow_tracker = CopyOnWriteTracker(
50
+ refcounter=self._refcounter.as_readonly())
51
+
52
+ if block_pool is None:
53
+ extra_factor = 4
54
+ # Pre-allocate "num_blocks * extra_factor" block objects.
55
+ # The "* extra_factor" is a buffer to allow more block objects
56
+ # than physical blocks
57
+ self._block_pool = BlockPool(self._block_size, create_block, self,
58
+ num_blocks * extra_factor)
59
+ else:
60
+ # In this case, the block pool is provided by the caller,
61
+ # which means that there is most likely a need to share
62
+ # a block pool between allocators
63
+ self._block_pool = block_pool
64
+
65
+ def allocate_immutable_block(self,
66
+ prev_block: Optional[Block],
67
+ token_ids: List[int],
68
+ extra_hash: Optional[int] = None,
69
+ device: Optional[Device] = None) -> Block:
70
+ """Allocates a new immutable block with the given token IDs, linked to
71
+ the previous block.
72
+
73
+ Args:
74
+ prev_block (Optional[Block]): The previous block in the sequence. If
75
+ None, then the block to be allocated is the first block in the
76
+ sequence.
77
+ token_ids (List[int]): The token IDs to be stored in the new block.
78
+
79
+ Returns:
80
+ Block: The newly allocated immutable block.
81
+ """
82
+ assert device is None
83
+ block = self.allocate_mutable_block(prev_block=prev_block)
84
+ block.append_token_ids(token_ids)
85
+ return block
86
+
87
+ def allocate_immutable_blocks(
88
+ self,
89
+ prev_block: Optional[Block],
90
+ block_token_ids: List[List[int]],
91
+ extra_hash: Optional[int] = None,
92
+ device: Optional[Device] = None) -> List[Block]:
93
+ assert device is None
94
+ num_blocks = len(block_token_ids)
95
+
96
+ block_ids = []
97
+ for i in range(num_blocks):
98
+ block_ids.append(self._allocate_block_id())
99
+
100
+ blocks = []
101
+ for i in range(num_blocks):
102
+ prev_block = self._block_pool.init_block(
103
+ prev_block=prev_block,
104
+ token_ids=block_token_ids[i],
105
+ block_size=self._block_size,
106
+ physical_block_id=block_ids[i])
107
+ blocks.append(prev_block)
108
+
109
+ return blocks
110
+
111
+ def allocate_mutable_block(self,
112
+ prev_block: Optional[Block],
113
+ extra_hash: Optional[int] = None,
114
+ device: Optional[Device] = None) -> Block:
115
+ """Allocates a new mutable block, linked to the previous block.
116
+
117
+ Args:
118
+ prev_block (Optional[Block]): The previous block in the sequence. If
119
+ None, then the block to be allocated is the first block in the
120
+ sequence.
121
+
122
+ Returns:
123
+ Block: The newly allocated mutable block.
124
+ """
125
+ assert device is None
126
+ block_id = self._allocate_block_id()
127
+ block = self._block_pool.init_block(prev_block=prev_block,
128
+ token_ids=[],
129
+ block_size=self._block_size,
130
+ physical_block_id=block_id)
131
+ return block
132
+
133
+ def _allocate_block_id(self) -> BlockId:
134
+ if not self._free_block_indices:
135
+ raise BlockAllocator.NoFreeBlocksError()
136
+
137
+ block_id = self._free_block_indices.popleft()
138
+ self._refcounter.incr(block_id)
139
+ return block_id
140
+
141
+ def _free_block_id(self, block: Union[Block, BlockId]) -> None:
142
+ if isinstance(block, Block):
143
+ block_id = block.block_id
144
+ block.block_id = None
145
+ else:
146
+ block_id = block
147
+ assert block_id is not None
148
+
149
+ refcount = self._refcounter.decr(block_id)
150
+ if refcount == 0:
151
+ self._free_block_indices.appendleft(block_id)
152
+
153
+ def free(self, block: Block, keep_block_object: bool = False) -> None:
154
+ # Release the physical block id
155
+ self._free_block_id(block)
156
+
157
+ # Release the block object
158
+ if not keep_block_object:
159
+ self._block_pool.free_block(block)
160
+
161
+ def free_block_id(self, block_id: BlockId) -> None:
162
+ self._free_block_id(block_id)
163
+
164
+ def fork(self, last_block: Block) -> List[Block]:
165
+ """Creates a new sequence of blocks that shares the same underlying
166
+ memory as the original sequence.
167
+
168
+ Args:
169
+ last_block (Block): The last block in the original sequence.
170
+
171
+ Returns:
172
+ List[Block]: The new sequence of blocks that shares the same memory
173
+ as the original sequence.
174
+ """
175
+ source_blocks = get_all_blocks_recursively(last_block)
176
+
177
+ forked_blocks: List[Block] = []
178
+ prev_block = None
179
+ for block in source_blocks:
180
+
181
+ # Increment refcount for each block.
182
+ assert block.block_id is not None
183
+ refcount = self._refcounter.incr(block.block_id)
184
+ assert refcount != 1, "can't fork free'd block"
185
+
186
+ forked_block = self._block_pool.init_block(
187
+ prev_block=prev_block,
188
+ token_ids=block.token_ids,
189
+ block_size=self._block_size,
190
+ physical_block_id=block.block_id)
191
+
192
+ forked_blocks.append(forked_block)
193
+ prev_block = forked_blocks[-1]
194
+
195
+ return forked_blocks
196
+
197
+ def get_num_free_blocks(self) -> int:
198
+ return len(self._free_block_indices)
199
+
200
+ def get_num_total_blocks(self) -> int:
201
+ return len(self._all_block_indices)
202
+
203
+ def get_physical_block_id(self, absolute_id: int) -> int:
204
+ """Returns the zero-offset block id on certain block allocator
205
+ given the absolute block id.
206
+
207
+ Args:
208
+ absolute_id (int): The absolute block id for the block
209
+ in whole allocator.
210
+
211
+ Returns:
212
+ int: The zero-offset block id on certain device.
213
+ """
214
+ return sorted(self._all_block_indices).index(absolute_id)
215
+
216
+ @property
217
+ def refcounter(self):
218
+ return self._refcounter
219
+
220
+ @property
221
+ def all_block_ids(self) -> FrozenSet[int]:
222
+ return self._all_block_indices
223
+
224
+ def cow_block_if_not_appendable(self, block: Block) -> BlockId:
225
+ """Performs a copy-on-write operation on the given block if it is not
226
+ appendable.
227
+
228
+ Args:
229
+ block (Block): The block to check for copy-on-write.
230
+
231
+ Returns:
232
+ BlockId: The block index of the new block if a copy-on-write
233
+ operation was performed, or the original block index if
234
+ no copy-on-write was necessary.
235
+ """
236
+ src_block_id = block.block_id
237
+ assert src_block_id is not None
238
+
239
+ if self._cow_tracker.is_appendable(block):
240
+ return src_block_id
241
+
242
+ self._free_block_id(block)
243
+ trg_block_id = self._allocate_block_id()
244
+
245
+ self._cow_tracker.record_cow(src_block_id, trg_block_id)
246
+
247
+ return trg_block_id
248
+
249
+ def clear_copy_on_writes(self) -> List[Tuple[BlockId, BlockId]]:
250
+ """Returns the copy-on-write source->destination mapping and clears it.
251
+
252
+ Returns:
253
+ List[Tuple[BlockId, BlockId]]: A list mapping source
254
+ block indices to destination block indices.
255
+ """
256
+ return self._cow_tracker.clear_cows()
257
+
258
+ def mark_blocks_as_accessed(self, block_ids: List[int],
259
+ now: float) -> None:
260
+ """Mark blocks as accessed, used in prefix caching.
261
+
262
+ Since the naive allocator does not implement prefix caching, we do
263
+ nothing.
264
+ """
265
+ pass
266
+
267
+ def mark_blocks_as_computed(self, block_ids: List[int]) -> None:
268
+ """Mark blocks as computed, used in prefix caching.
269
+
270
+ Since the naive allocator does not implement prefix caching, we do
271
+ nothing.
272
+ """
273
+ pass
274
+
275
+ def get_common_computed_block_ids(
276
+ self, computed_seq_block_ids: List[List[int]]) -> List[int]:
277
+ """Determine blocks that can be skipped in prefill.
278
+
279
+ Since the naive allocator does not support prefix caching, always return
280
+ an empty list.
281
+ """
282
+ return []
283
+
284
+ def promote_to_immutable_block(self, block: Block) -> BlockId:
285
+ raise NotImplementedError("There is no promotion for naive blocks")
286
+
287
+ def get_num_full_blocks_touched(self, blocks: List[Block]) -> int:
288
+ """Returns the number of full blocks that will be touched by
289
+ swapping in/out.
290
+
291
+ Args:
292
+ blocks: List of blocks to be swapped.
293
+ Returns:
294
+ int: the number of full blocks that will be touched by
295
+ swapping in/out the given blocks. Non full blocks are ignored
296
+ when deciding the number of blocks to touch.
297
+ """
298
+ # NOTE: for naive block, we use set to eliminate common blocks among
299
+ # seqs, also we compare the empty slots in the mutable blocks with
300
+ # lookahead slots to get the number of unique new block that are
301
+ # needed.
302
+ old_block_set = set()
303
+ for block in blocks:
304
+ if block.is_full:
305
+ old_block_set.add(block)
306
+ return len(old_block_set)
307
+
308
+ def swap_out(self, blocks: List[Block]) -> None:
309
+ for block in blocks:
310
+ self._free_block_id(block)
311
+
312
+ def swap_in(self, blocks: List[Block]) -> None:
313
+ for block in blocks:
314
+ # Here we allocate either immutable or mutable block and then
315
+ # extract its block_id. Note that the block object is released
316
+ # and the block_id is assigned to "block" to allow reusing the
317
+ # existing "block" object
318
+ if block.is_full:
319
+ tmp_block = self.allocate_immutable_block(
320
+ prev_block=block.prev_block, token_ids=block.token_ids)
321
+ else:
322
+ tmp_block = self.allocate_mutable_block(
323
+ prev_block=block.prev_block)
324
+ tmp_block.append_token_ids(block.token_ids)
325
+
326
+ block_id = tmp_block.block_id
327
+ tmp_block.block_id = None
328
+ self._block_pool.free_block(tmp_block)
329
+
330
+ block.block_id = block_id # Assign block_id
331
+
332
+ def get_prefix_cache_hit_rate(self) -> float:
333
+ return -1
334
+
335
+ def reset_prefix_cache(self) -> bool:
336
+ """No prefix cache for naive block allocator."""
337
+ return True
338
+
339
+ def find_cached_blocks_prefix(self, block_hashes: List[int]) -> List[int]:
340
+ # Not applicable for naive block allocator.
341
+ return []
342
+
343
+
344
+ class NaiveBlock(Block):
345
+ """An implementation of the Block class that does not support prefix
346
+ caching.
347
+
348
+ The NaiveBlock class represents a block of token IDs with a fixed size. It
349
+ provides methods for appending token IDs to the block and manages copy-on
350
+ -write operations when necessary.
351
+
352
+ Args:
353
+ prev_block (Block): The previous block in the sequence.
354
+ token_ids (List[int]): The initial token IDs to be stored in the block.
355
+ block_size (int): The maximum number of token IDs that can be stored in
356
+ the block.
357
+ allocator (BlockAllocator): The block allocator associated with this
358
+ block.
359
+ block_id (Optional[int], optional): The physical block index
360
+ of this block. Defaults to None, which means no allocation has been
361
+ made.
362
+ _cow_target (Optional[Block], optional): The copy-on-write target block.
363
+ If not provided, it defaults to self.
364
+ """
365
+
366
+ def __init__(self,
367
+ prev_block: Optional[Block],
368
+ token_ids: List[int],
369
+ block_size: int,
370
+ allocator: BlockAllocator,
371
+ block_id: Optional[int] = None,
372
+ _cow_target: Optional[Block] = None,
373
+ extra_hash: Optional[int] = None):
374
+ self._token_ids: List[int] = []
375
+ self._block_size = block_size
376
+ self._prev_block = prev_block
377
+ self._block_id = block_id
378
+ self._allocator = allocator
379
+ self._cow_target = _cow_target if _cow_target is not None else self
380
+
381
+ self._append_token_ids_no_cow(token_ids)
382
+
383
+ def append_token_ids(self, token_ids: List[int]) -> None:
384
+ """Appends the given token IDs to the block and performs a
385
+ copy-on-write if necessary.
386
+
387
+ Args:
388
+ token_ids (Optional[List[int]]): The token IDs to be appended
389
+ to the block.
390
+ """
391
+ self._append_token_ids_no_cow(token_ids)
392
+
393
+ if self._block_id is not None:
394
+ self._block_id = (self._allocator.cow_block_if_not_appendable(
395
+ self._cow_target))
396
+
397
+ def _append_token_ids_no_cow(self, token_ids: List[int]) -> None:
398
+ """Appends the given token IDs to the block
399
+
400
+ Args:
401
+ token_ids (List[int]): The token IDs to be appended to the block.
402
+ """
403
+ if len(token_ids) == 0:
404
+ return
405
+
406
+ assert len(token_ids) <= self.num_empty_slots
407
+
408
+ self._token_ids.extend(token_ids)
409
+
410
+ @property
411
+ def computed(self) -> bool:
412
+ raise NotImplementedError
413
+
414
+ @computed.setter
415
+ def computed(self, value) -> None:
416
+ raise NotImplementedError
417
+
418
+ @property
419
+ def last_accessed(self) -> float:
420
+ raise NotImplementedError
421
+
422
+ @last_accessed.setter
423
+ def last_accessed(self, last_accessed_ts: float):
424
+ raise NotImplementedError
425
+
426
+ @property
427
+ def block_id(self) -> Optional[int]:
428
+ return self._block_id
429
+
430
+ @block_id.setter
431
+ def block_id(self, value: Optional[int]) -> None:
432
+ self._block_id = value
433
+
434
+ @property
435
+ def is_full(self) -> bool:
436
+ return self.num_empty_slots == 0
437
+
438
+ @property
439
+ def num_empty_slots(self) -> int:
440
+ return self._block_size - len(self.token_ids)
441
+
442
+ @property
443
+ def token_ids(self) -> List[int]:
444
+ return self._token_ids
445
+
446
+ @property
447
+ def num_tokens_total(self) -> int:
448
+ raise NotImplementedError(
449
+ "num_tokens_total is not used for naive block")
450
+
451
+ @property
452
+ def block_size(self) -> int:
453
+ return self._block_size
454
+
455
+ @property
456
+ def prev_block(self) -> Optional["Block"]:
457
+ return self._prev_block
458
+
459
+ @property
460
+ def extra_hash(self):
461
+ return None
462
+
463
+ @property
464
+ def content_hash(self) -> Optional[int]:
465
+ return None
.venv/lib/python3.11/site-packages/vllm/core/block/prefix_caching_block.py ADDED
@@ -0,0 +1,1134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ """Token blocks."""
3
+ import sys
4
+ from bisect import bisect_left
5
+ from os.path import commonprefix
6
+ from typing import (Callable, Dict, FrozenSet, Iterable, List, Optional, Set,
7
+ Tuple)
8
+
9
+ from vllm.core.block.common import (CacheMetricData, CopyOnWriteTracker,
10
+ get_all_blocks_recursively)
11
+ from vllm.core.block.interfaces import (Block, BlockAllocator, BlockId, Device,
12
+ DeviceAwareBlockAllocator)
13
+ from vllm.core.block.naive_block import (BlockPool, NaiveBlock,
14
+ NaiveBlockAllocator)
15
+ from vllm.core.evictor import EvictionPolicy, Evictor, make_evictor
16
+ from vllm.logger import init_logger
17
+ from vllm.sequence import Sequence
18
+
19
+ PrefixHash = int
20
+
21
+ # By default, we init our block access time as _DEFAULT_LAST_ACCESSED_TIME
22
+ # so that if we find one block is still hold _DEFAULT_LAST_ACCESSED_TIME,
23
+ # then we know this block hasn't been accessed yet.
24
+ _DEFAULT_LAST_ACCESSED_TIME = -1
25
+
26
+ logger = init_logger(__name__)
27
+
28
+
29
+ class BlockTracker:
30
+ """Used to track the status of a block inside the prefix caching allocator
31
+ """
32
+ __slots__ = ("active", "last_accessed", "computed")
33
+
34
+ def reset(self):
35
+ self.last_accessed: float = _DEFAULT_LAST_ACCESSED_TIME
36
+ self.computed: bool = False
37
+
38
+ def __init__(self):
39
+ self.active: bool = False
40
+ self.reset()
41
+
42
+ def enable(self):
43
+ assert not self.active
44
+ self.active = True
45
+ self.reset()
46
+
47
+ def disable(self):
48
+ assert self.active
49
+ self.active = False
50
+ self.reset()
51
+
52
+
53
+ class PrefixCachingBlockAllocator(BlockAllocator):
54
+ """A block allocator that implements prefix caching.
55
+
56
+ The PrefixCachingBlockAllocator maintains a cache of blocks based on their
57
+ content hash. It reuses blocks with the same content hash to avoid redundant
58
+ memory allocation. The allocator also supports copy-on-write operations.
59
+
60
+ Args:
61
+ num_blocks (int): The total number of blocks to manage.
62
+ block_size (int): The size of each block in tokens.
63
+ block_ids(Optional[Iterable[int]], optional): An optional iterable of
64
+ block IDs. If not provided, block IDs will be assigned sequentially
65
+ from 0 to num_blocks - 1.
66
+ """
67
+
68
+ # Note that we use 'None' as a string here instead of None because
69
+ # as of Python 3.12, hash(None) returns a constant predictable value.
70
+ # This could possibly make it easier to find and exploit hash
71
+ # collisions. 'None' as a string will be hashed differently per process,
72
+ # but consistently within the same process. This is the same as the
73
+ # behavior of None prior to Python 3.12.
74
+ _none_hash: int = hash('None')
75
+
76
+ # Implements Block.Factory.
77
+ def __init__(
78
+ self,
79
+ num_blocks: int,
80
+ block_size: int,
81
+ block_ids: Optional[Iterable[int]] = None,
82
+ eviction_policy: EvictionPolicy = EvictionPolicy.LRU,
83
+ ):
84
+ if block_ids is None:
85
+ block_ids = range(num_blocks)
86
+
87
+ self._block_size = block_size
88
+
89
+ # A mapping of prefix hash to block index. All blocks which have a
90
+ # prefix hash will be in this dict, even if they have refcount 0.
91
+ self._cached_blocks: Dict[PrefixHash, BlockId] = {}
92
+
93
+ # A list of immutable block IDs that have been touched by scheduler
94
+ # and should be marked as computed after an entire batch of sequences
95
+ # are scheduled.
96
+ self._touched_blocks: Set[BlockId] = set()
97
+
98
+ # Used to track status of each physical block id
99
+ self._block_tracker: Dict[BlockId, BlockTracker] = {}
100
+ for block_id in block_ids:
101
+ self._block_tracker[block_id] = BlockTracker()
102
+
103
+ # Pre-allocate "num_blocks * extra_factor" block objects.
104
+ # The "* extra_factor" is a buffer to allow more block objects
105
+ # than physical blocks
106
+ extra_factor = 4
107
+ self._block_pool = BlockPool(self._block_size, self._create_block,
108
+ self, num_blocks * extra_factor)
109
+
110
+ # An allocator for blocks that do not have prefix hashes.
111
+ self._hashless_allocator = NaiveBlockAllocator(
112
+ create_block=self._create_block, # type: ignore
113
+ num_blocks=num_blocks,
114
+ block_size=block_size,
115
+ block_ids=block_ids,
116
+ block_pool=self._block_pool, # Share block pool here
117
+ )
118
+
119
+ # Evitor used to maintain how we want to handle those computed blocks
120
+ # if we find memory pressure is high.
121
+ self.eviction_policy = eviction_policy
122
+ self.evictor: Evictor = make_evictor(self.eviction_policy)
123
+
124
+ # We share the refcounter between allocators. This allows us to promote
125
+ # blocks originally allocated in the hashless allocator to immutable
126
+ # blocks.
127
+ self._refcounter = self._hashless_allocator.refcounter
128
+
129
+ self._cow_tracker = CopyOnWriteTracker(
130
+ refcounter=self._refcounter.as_readonly())
131
+
132
+ self.metric_data = CacheMetricData()
133
+
134
+ def _create_block(
135
+ self,
136
+ prev_block: Optional[Block],
137
+ token_ids: List[int],
138
+ block_size: int,
139
+ allocator: BlockAllocator,
140
+ block_id: Optional[int] = None,
141
+ computed: bool = False,
142
+ extra_hash: Optional[int] = None,
143
+ ) -> Block:
144
+ # Bind block to self.
145
+ allocator = self
146
+
147
+ return PrefixCachingBlock(
148
+ prev_block=prev_block,
149
+ token_ids=token_ids,
150
+ block_size=block_size,
151
+ block_id=block_id,
152
+ allocator=allocator,
153
+ computed=computed,
154
+ extra_hash=extra_hash,
155
+ )
156
+
157
+ def allocate_immutable_block(self,
158
+ prev_block: Optional[Block],
159
+ token_ids: List[int],
160
+ extra_hash: Optional[int] = None,
161
+ device: Optional[Device] = None) -> Block:
162
+ """Allocates an immutable block with the given token IDs, reusing cached
163
+ blocks if possible.
164
+
165
+ Args:
166
+ prev_block (Optional[Block]): The previous block in the sequence.
167
+ token_ids (List[int]): The token IDs to be stored in the block.
168
+
169
+ Returns:
170
+ Block: The allocated immutable block.
171
+ """
172
+ assert device is None
173
+ assert_prefix_caching_block_or_none(prev_block)
174
+
175
+ # First, try to create a block that points to cached data
176
+ block = self._block_pool.init_block(prev_block=prev_block,
177
+ token_ids=token_ids,
178
+ block_size=self._block_size,
179
+ physical_block_id=None,
180
+ extra_hash=extra_hash)
181
+ assert block.content_hash is not None
182
+
183
+ cached_block_id = self._cached_blocks.get(block.content_hash, None)
184
+ if cached_block_id is not None:
185
+ self.metric_data.query(hit=True)
186
+ block.block_id = cached_block_id
187
+ self._incr_refcount_cached_block(block)
188
+ return block
189
+ self.metric_data.query(hit=False)
190
+ self._block_pool.free_block(block)
191
+
192
+ # No cached block => Allocate a new block
193
+ block = self.allocate_mutable_block(prev_block, extra_hash=extra_hash)
194
+ block.append_token_ids(token_ids)
195
+ return block
196
+
197
+ def allocate_immutable_blocks(
198
+ self,
199
+ prev_block: Optional[Block],
200
+ block_token_ids: List[List[int]],
201
+ extra_hash: Optional[int] = None,
202
+ device: Optional[Device] = None) -> List[Block]:
203
+ blocks = []
204
+ for token_ids in block_token_ids:
205
+ prev_block = self.allocate_immutable_block(prev_block=prev_block,
206
+ token_ids=token_ids,
207
+ device=device,
208
+ extra_hash=extra_hash)
209
+ blocks.append(prev_block)
210
+ return blocks
211
+
212
+ def allocate_mutable_block(self,
213
+ prev_block: Optional[Block],
214
+ extra_hash: Optional[int] = None,
215
+ device: Optional[Device] = None) -> Block:
216
+ """Allocates a mutable block. If there are no free blocks, this will
217
+ evict unused cached blocks.
218
+
219
+ Args:
220
+ prev_block (Block): The previous block in the sequence.
221
+ None is not allowed unlike it is super class.
222
+
223
+ Returns:
224
+ Block: The allocated mutable block.
225
+ """
226
+ assert device is None
227
+ assert_prefix_caching_block_or_none(prev_block)
228
+
229
+ block_id = self._allocate_block_id()
230
+ block = self._block_pool.init_block(prev_block=prev_block,
231
+ token_ids=[],
232
+ block_size=self._block_size,
233
+ physical_block_id=block_id,
234
+ extra_hash=extra_hash)
235
+ assert not block.computed
236
+ assert block.content_hash is None
237
+ return block
238
+
239
+ def _incr_refcount_cached_block(self, block: Block) -> None:
240
+ # Set this block to be "computed" since it is pointing to a
241
+ # cached block id (which was already computed)
242
+ block.computed = True
243
+
244
+ block_id = block.block_id
245
+ assert block_id is not None
246
+
247
+ refcount = self._refcounter.incr(block_id)
248
+ if refcount == 1:
249
+ # In case a cached block was evicted, restore its tracking
250
+ if block_id in self.evictor:
251
+ self.evictor.remove(block_id)
252
+
253
+ self._track_block_id(block_id, computed=True)
254
+
255
+ def _decr_refcount_cached_block(self, block: Block) -> None:
256
+ # Ensure this is immutable/cached block
257
+ assert block.content_hash is not None
258
+
259
+ block_id = block.block_id
260
+ assert block_id is not None
261
+
262
+ refcount = self._refcounter.decr(block_id)
263
+ if refcount > 0:
264
+ block.block_id = None
265
+ return
266
+ else:
267
+ assert refcount == 0
268
+
269
+ # No longer used
270
+ assert block.content_hash in self._cached_blocks
271
+
272
+ # Add the cached block to the evictor
273
+ # (This keeps the cached block around so it can be reused)
274
+ self.evictor.add(block_id, block.content_hash, block.num_tokens_total,
275
+ self._block_tracker[block_id].last_accessed)
276
+
277
+ # Stop tracking the block
278
+ self._untrack_block_id(block_id)
279
+
280
+ block.block_id = None
281
+
282
+ def _decr_refcount_hashless_block(self, block: Block) -> None:
283
+ block_id = block.block_id
284
+ assert block_id is not None
285
+
286
+ # We may have a fork case where block is shared,
287
+ # in which case, we cannot remove it from tracking
288
+ refcount = self._refcounter.get(block_id)
289
+ if refcount == 1:
290
+ self._untrack_block_id(block_id)
291
+
292
+ # Decrement refcount of the block_id, but do not free the block object
293
+ # itself (will be handled by the caller)
294
+ self._hashless_allocator.free(block, keep_block_object=True)
295
+
296
+ def _allocate_block_id(self) -> BlockId:
297
+ """First tries to allocate a block id from the hashless allocator,
298
+ and if there are no blocks, then tries to evict an unused cached block.
299
+ """
300
+ hashless_block_id = self._maybe_allocate_hashless_block_id()
301
+ if hashless_block_id is not None:
302
+ return hashless_block_id
303
+
304
+ evicted_block_id = self._maybe_allocate_evicted_block_id()
305
+ if evicted_block_id is not None:
306
+ return evicted_block_id
307
+
308
+ # No block available in hashless allocator, nor in unused cache blocks.
309
+ raise BlockAllocator.NoFreeBlocksError()
310
+
311
+ def _maybe_allocate_hashless_block_id(self) -> Optional[BlockId]:
312
+ try:
313
+ # Allocate mutable block and extract its block_id
314
+ block = self._hashless_allocator.allocate_mutable_block(
315
+ prev_block=None)
316
+ block_id = block.block_id
317
+ self._block_pool.free_block(block)
318
+
319
+ self._track_block_id(block_id, computed=False)
320
+ return block_id
321
+ except BlockAllocator.NoFreeBlocksError:
322
+ return None
323
+
324
+ def _maybe_allocate_evicted_block_id(self) -> Optional[BlockId]:
325
+ if self.evictor.num_blocks == 0:
326
+ return None
327
+
328
+ # Here we get an evicted block, which is only added
329
+ # into evictor if its ref counter is 0
330
+ # and since its content would be changed, we need
331
+ # to remove it from _cached_blocks's tracking list
332
+ block_id, content_hash_to_evict = self.evictor.evict()
333
+
334
+ # Sanity checks
335
+ assert content_hash_to_evict in self._cached_blocks
336
+ _block_id = self._cached_blocks[content_hash_to_evict]
337
+ assert self._refcounter.get(_block_id) == 0
338
+ assert _block_id == block_id
339
+
340
+ self._cached_blocks.pop(content_hash_to_evict)
341
+
342
+ self._refcounter.incr(block_id)
343
+ self._track_block_id(block_id, computed=False)
344
+
345
+ return block_id
346
+
347
+ def _free_block_id(self, block: Block) -> None:
348
+ """Decrements the refcount of the block. The block may be in two
349
+ possible states: (1) immutable/cached or (2) mutable/hashless.
350
+ In the first case, the refcount is decremented directly and the block
351
+ may be possibly added to the evictor. In other case, hashless
352
+ allocator free(..) with keep_block_object=True is called to only free
353
+ the block id (since the block object may be reused by the caller)
354
+ """
355
+ block_id = block.block_id
356
+ assert block_id is not None, "Freeing unallocated block is undefined"
357
+
358
+ if block.content_hash is not None:
359
+ # Immutable: This type of block is always cached, and we want to
360
+ # keep it in the evictor for future reuse
361
+ self._decr_refcount_cached_block(block)
362
+ else:
363
+ # Mutable: This type of block is not cached, so we release it
364
+ # directly to the hashless allocator
365
+ self._decr_refcount_hashless_block(block)
366
+
367
+ assert block.block_id is None
368
+
369
+ def free(self, block: Block, keep_block_object: bool = False) -> None:
370
+ """Release the block (look at free_block_id(..) docs)
371
+ """
372
+ # Release the physical block index
373
+ self._free_block_id(block)
374
+
375
+ # Release the block object to the pool
376
+ if not keep_block_object:
377
+ self._block_pool.free_block(block)
378
+
379
    def fork(self, last_block: Block) -> List[Block]:
        """Creates a new sequence of blocks that shares the same underlying
        memory as the original sequence.

        Args:
            last_block (Block): The last block in the original sequence.

        Returns:
            List[Block]: The new sequence of blocks that shares the same memory
                as the original sequence.
        """
        source_blocks = get_all_blocks_recursively(last_block)

        forked_blocks: List[Block] = []
        prev_block = None
        for block in source_blocks:
            block_id = block.block_id
            assert block_id is not None

            # Each forked block shares the physical block, so bump the
            # refcount; a refcount of 1 after incr means the source was
            # already freed, which is a caller error.
            refcount = self._refcounter.incr(block_id)
            assert refcount != 1, "can't fork free'd block_id = {}".format(
                block_id)

            forked_block = self._block_pool.init_block(
                prev_block=prev_block,
                token_ids=block.token_ids,
                block_size=self._block_size,
                physical_block_id=block_id,
                extra_hash=block.extra_hash)

            forked_blocks.append(forked_block)
            prev_block = forked_blocks[-1]

        return forked_blocks
413
+
414
+ def get_num_free_blocks(self, device: Optional[Device] = None) -> int:
415
+ assert device is None
416
+ # The number of free blocks is the number of hashless free blocks
417
+ # plus the number of blocks evictor could free from its list.
418
+ return self._hashless_allocator.get_num_free_blocks(
419
+ ) + self.evictor.num_blocks
420
+
421
    def get_num_total_blocks(self) -> int:
        """Total number of blocks managed by this allocator (free or used)."""
        return self._hashless_allocator.get_num_total_blocks()
423
+
424
    def get_physical_block_id(self, absolute_id: int) -> int:
        """Returns the zero-offset block id on certain block allocator
        given the absolute block id.

        Args:
            absolute_id (int): The absolute block id for the block
                in whole allocator.

        Returns:
            int: The zero-offset block id on certain device.
        """
        # NOTE: sorts the full id set on every call (O(N log N)); fine for
        # the current call sites but not intended for hot paths.
        return sorted(self.all_block_ids).index(absolute_id)
436
+
437
    @property
    def all_block_ids(self) -> FrozenSet[int]:
        """All block ids managed here; delegates to the hashless allocator,
        which owns the full id space."""
        return self._hashless_allocator.all_block_ids
440
+
441
    def get_prefix_cache_hit_rate(self) -> float:
        """Prefix-cache hit rate as tracked by the allocator's metric data."""
        return self.metric_data.get_hit_rate()
443
+
444
    def reset_prefix_cache(self) -> bool:
        """Reset prefix cache. This function may be used in RLHF
        flows to invalidate prefix caching after the weights are updated,
        or used for resetting prefix caching status for benchmarking.

        Returns:
            bool: True if the prefix cache is successfully reset,
            False otherwise.
        """
        # Refuse to reset while any block is still referenced; otherwise
        # in-flight sequences would observe stale/rewritten cache state.
        num_used_blocks = (self.get_num_total_blocks() -
                           self.get_num_free_blocks())
        if num_used_blocks > 0:
            logger.warning(
                "Failed to reset prefix cache because some "
                "blocks (%d) are not freed yet", num_used_blocks)
            return False

        # Free all blocks in the evictor. Each reclaimed id is handed back
        # to the hashless allocator so it becomes plainly free again.
        while (block_id :=
               self._maybe_allocate_evicted_block_id()) is not None:
            self._hashless_allocator.free_block_id(block_id)

        # Should not have any cached blocks because all blocks are evicted.
        assert not self._cached_blocks

        # Reset the evictor.
        self.evictor = make_evictor(self.eviction_policy)

        # Reset the block tracker.
        for block_id in self._block_tracker:
            self._block_tracker[block_id] = BlockTracker()

        # Reset the metrics.
        self.metric_data = CacheMetricData()

        logger.info("Successfully reset prefix cache")
        return True
481
+
482
+ def is_block_cached(self, block: Block) -> bool:
483
+ assert block.content_hash is not None
484
+ return block.content_hash in self._cached_blocks
485
+
486
    def promote_to_immutable_block(self, block: Block) -> BlockId:
        """Once a mutable block is full, it can be promoted to an immutable
        block. This means that its content can be referenced by future blocks
        having the same prefix.

        Note that if we already have a cached block with the same content, we
        will replace the newly-promoted block's mapping with the existing cached
        block id.

        Args:
            block: The mutable block to be promoted.

        Returns:
            BlockId: Either the original block index, or the block index of
                the previously cached block matching the same content.
        """
        # Ensure block can be promoted
        assert block.content_hash is not None
        assert block.block_id is not None
        assert self._refcounter.get(block.block_id) > 0

        if block.content_hash not in self._cached_blocks:
            # No cached content hash => Set this block as cached.
            # Note that this block cannot be marked as computed yet
            # because other sequences in the same batch cannot reuse
            # this block.
            self._cached_blocks[block.content_hash] = block.block_id
            # Mark this block as touched so that it can be marked as
            # computed after the entire batch of sequences are scheduled.
            self._touched_blocks.add(block.block_id)
            return block.block_id

        # Reuse the cached content hash: drop our hashless id first, then
        # point the block at the cached physical block.
        self._decr_refcount_hashless_block(block)
        block.block_id = self._cached_blocks[block.content_hash]

        # Increment refcount of the cached block and (possibly) restore
        # it from the evictor.
        # Note that in this case, the block is marked as computed
        self._incr_refcount_cached_block(block)

        return block.block_id
528
+
529
    def cow_block_if_not_appendable(self, block: Block) -> BlockId:
        """Performs a copy-on-write operation on the given block if it is not
        appendable.

        Args:
            block (Block): The block to check for copy-on-write.

        Returns:
            BlockId: The block index of the new block if a copy-on-write
                operation was performed, or the original block index if
                no copy-on-write was necessary.
        """
        src_block_id = block.block_id
        assert src_block_id is not None

        if self._cow_tracker.is_appendable(block):
            return src_block_id

        # Shared block: release our reference and allocate a private copy;
        # the tracker records the src->dst pair for the actual data copy.
        self._free_block_id(block)
        trg_block_id = self._allocate_block_id()

        self._cow_tracker.record_cow(src_block_id, trg_block_id)

        return trg_block_id
553
+
554
    def clear_copy_on_writes(self) -> List[Tuple[BlockId, BlockId]]:
        """Returns the copy-on-write source->destination mapping and clears it.

        Returns:
            List[Tuple[BlockId, BlockId]]: A list mapping source
                block indices to destination block indices.
        """
        # Delegated entirely to the CoW tracker.
        return self._cow_tracker.clear_cows()
562
+
563
    def mark_blocks_as_accessed(self, block_ids: List[int],
                                now: float) -> None:
        """Mark blocks as accessed, used in prefix caching.

        If the block is added into evictor, we need to update corresponding
        info in evictor's metadata.
        """

        for block_id in block_ids:
            if self._block_tracker[block_id].active:
                # Block is live: record the timestamp on the tracker.
                self._block_tracker[block_id].last_accessed = now
            elif block_id in self.evictor:
                # Block is parked in the evictor: refresh its eviction
                # metadata instead.
                self.evictor.update(block_id, now)
            else:
                raise ValueError(
                    "Mark block as accessed which is not belonged to GPU")
579
+
580
    def mark_blocks_as_computed(self, block_ids: List[int]) -> None:
        """Mark all blocks touched since the last call as computed.

        NOTE: ``block_ids`` is intentionally ignored; the set of blocks to
        mark comes from ``self._touched_blocks`` (populated at promotion
        time), which is then cleared.
        """
        # Mark all touched blocks as computed.
        for block_id in self._touched_blocks:
            self._block_tracker[block_id].computed = True
        self._touched_blocks.clear()
585
+
586
    def _track_block_id(self, block_id: Optional[BlockId],
                        computed: bool) -> None:
        """Activate tracking for ``block_id`` and set its computed flag."""
        assert block_id is not None
        self._block_tracker[block_id].enable()
        self._block_tracker[block_id].computed = computed
591
+
592
    def _untrack_block_id(self, block_id: Optional[BlockId]) -> None:
        """Deactivate tracking for ``block_id`` (e.g. when it is freed)."""
        assert block_id is not None
        self._block_tracker[block_id].disable()
595
+
596
+ def block_is_computed(self, block_id: int) -> bool:
597
+ if self._block_tracker[block_id].active:
598
+ return self._block_tracker[block_id].computed
599
+ else:
600
+ return block_id in self.evictor
601
+
602
    def get_common_computed_block_ids(
            self, computed_seq_block_ids: List[List[int]]) -> List[int]:
        """Return the block ids that are common for a given sequence group.

        Only those blocks that are immutable and already marked
        computed are taken into consideration.
        """

        # NOTE We exclude the last block to avoid the case where the entire
        # prompt is cached. This would cause erroneous behavior in model
        # runner.

        # It returns a list of int although type annotation says list of string.
        if len(computed_seq_block_ids) == 1:
            return computed_seq_block_ids[0]

        return commonprefix([
            ids for ids in computed_seq_block_ids  # type: ignore
            if ids
        ])
622
+
623
    def get_num_full_blocks_touched(self, blocks: List[Block]) -> int:
        """Returns the number of full blocks that will be touched by
        swapping in/out.

        Args:
            blocks: List of blocks to be swapped.
        Returns:
            int: the number of full blocks that will be touched by
                swapping in/out the given blocks. Non full blocks are ignored
                when deciding the number of blocks to touch.
        """
        num_touched_blocks: int = 0
        for block in blocks:
            # If the block has a match in the cache and the cached
            # block is not referenced, then we still count it as a
            # touched block: restoring it from the evictor requires a
            # real allocation.
            if block.is_full and (not self.is_block_cached(block) or \
                (block.content_hash is not None and \
                 self._cached_blocks[block.content_hash] in \
                 self.evictor)):
                num_touched_blocks += 1
        return num_touched_blocks
645
+
646
+ def swap_out(self, blocks: List[Block]) -> None:
647
+ """Execute the swap out actions. Basically just free the
648
+ given blocks.
649
+
650
+ Args:
651
+ blocks: List of blocks to be swapped out.
652
+ """
653
+ for block in blocks:
654
+ self._free_block_id(block)
655
+
656
    def swap_in(self, blocks: List[Block]) -> None:
        """Execute the swap in actions. Change the block id from
        old allocator to current allocator for each block to finish
        the block table update.

        Args:
            blocks: List of blocks to be swapped in.
        """
        for block in blocks:
            # Here we allocate either immutable or mutable block and then
            # extract its block_id. Note that the block object is released
            # and the block_id is assigned to "block" to allow reusing the
            # existing "block" object
            if block.is_full:
                # Full blocks can be allocated immutably (hash known).
                tmp_block = self.allocate_immutable_block(
                    prev_block=block.prev_block,
                    token_ids=block.token_ids,
                    extra_hash=block.extra_hash)
            else:
                # Partial blocks stay mutable; replay the tokens into them.
                tmp_block = self.allocate_mutable_block(
                    prev_block=block.prev_block, extra_hash=block.extra_hash)
                tmp_block.append_token_ids(block.token_ids)

            block_id = tmp_block.block_id
            self._block_pool.free_block(tmp_block)

            block.block_id = block_id  # Assign block_id
684
    def find_cached_blocks_prefix(self, block_hashes: List[int]) -> List[int]:
        """
        Given a list of block hashes, return the prefix of the block hashes that
        are all cached.

        Since a block's block hash includes the hashes of all previous blocks,
        and we only allocate/deallocate blocks in the entire sequence, so if a
        block is cached, then all previous blocks are also cached. With this
        property, we can use binary search to find the prefix of cached blocks.

        Args:
            block_hashes (List[int]): The list of block hashes.

        Returns:
            List[int]: The prefix of the `block_hashes` that are cached.
        """

        def _block_is_cached(block_hash: PrefixHash) -> bool:
            if block_hash not in self._cached_blocks:
                return False

            cached_block_id = self._cached_blocks[block_hash]
            # We only consider the blocks that are marked as computed.
            return self.block_is_computed(cached_block_id)

        def _bisect_left(a, x, key: Callable[[PrefixHash], bool]) -> int:

            # python < 3.10 doesn't have the key argument for bisect_left,
            # so fall back to materializing the mapped list.
            if sys.version_info < (3, 10):
                a = [key(e) for e in a]
                return bisect_left(a, x)
            else:
                return bisect_left(a, x, key=key)

        # Look for the first block that's not cached, and returns the prefix
        # i.e. blocks that are cached.
        idx = _bisect_left(block_hashes,
                           True,
                           key=lambda x: not _block_is_cached(x))
        return block_hashes[:idx]
724
+
725
+
726
class PrefixCachingBlock(Block):
    """A block implementation that supports prefix caching.

    The PrefixCachingBlock class represents a block of token IDs with prefix
    caching capabilities. It wraps a NaiveBlock internally and provides
    additional functionality for content hashing and promoting immutable blocks
    with the prefix caching allocator.

    Args:
        prev_block (Optional[PrefixCachingBlock]): The previous block in the
            sequence.
        token_ids (List[int]): The initial token IDs to be stored in the block.
        block_size (int): The maximum number of token IDs that can be stored in
            the block.
        allocator (BlockAllocator): The prefix
            caching block allocator associated with this block.
        block_id (Optional[int], optional): The physical block index
            of this block. Defaults to None.
        extra_hash (Optional[int]): The hash value of additional factors
            such as adapters that influence the block, apart from the token_ids.
    """

    # Note that we use 'None' as a string here instead of None because
    # as of Python 3.12, hash(None) returns a constant predictable value.
    # This could possibly make it easier to find and exploit hash
    # collisions. 'None' as a string will be hashed differently per process,
    # but consistently within the same process. This is the same as the
    # behavior of None prior to Python 3.12.
    _none_hash: int = hash('None')

    def __init__(
        self,
        prev_block: Optional[Block],
        token_ids: List[int],
        block_size: int,
        allocator: BlockAllocator,
        block_id: Optional[int] = None,
        computed: bool = False,
        extra_hash: Optional[int] = None,
    ):
        assert isinstance(allocator, PrefixCachingBlockAllocator), (
            "Currently this class is only tested with "
            "PrefixCachingBlockAllocator. Got instead allocator = {}".format(
                allocator))
        assert_prefix_caching_block_or_none(prev_block)

        self._prev_block = prev_block
        self._cached_content_hash: Optional[int] = None
        self._cached_num_tokens_total: int = 0
        self._allocator = allocator
        self._last_accessed: float = _DEFAULT_LAST_ACCESSED_TIME
        self._computed = computed
        self._extra_hash = extra_hash

        # On the first time, we create the block object, and next we only
        # reinitialize it (instances are pooled and __init__ is re-invoked
        # on reuse).
        if hasattr(self, "_block"):
            self._block.__init__(  # type: ignore[has-type]
                prev_block=prev_block,
                token_ids=token_ids,
                block_size=block_size,
                block_id=block_id,
                allocator=self._allocator)
        else:
            self._block = NaiveBlock(prev_block=prev_block,
                                     token_ids=token_ids,
                                     block_size=block_size,
                                     block_id=block_id,
                                     allocator=self._allocator)

        self._update_num_tokens_total()

    def _update_num_tokens_total(self):
        """Incrementally computes the number of tokens that there is
        till the current block (included)
        """
        res = 0

        # Add all previous blocks
        if self._prev_block is not None:
            res += self._prev_block.num_tokens_total

        # Add current block
        res += len(self.token_ids)

        self._cached_num_tokens_total = res

    @property
    def computed(self) -> bool:
        return self._computed

    @computed.setter
    def computed(self, value) -> None:
        self._computed = value

    @property
    def last_accessed(self) -> float:
        return self._last_accessed

    @last_accessed.setter
    def last_accessed(self, last_accessed_ts: float):
        self._last_accessed = last_accessed_ts

    def append_token_ids(self, token_ids: List[int]) -> None:
        """Appends the given token IDs to the block and registers the block as
        immutable if the block becomes full.

        Args:
            token_ids (List[int]): The token IDs to be appended to the block.
        """
        # Ensure this is mutable block (not promoted)
        assert self.content_hash is None
        assert not self.computed

        if len(token_ids) == 0:
            return

        # Ensure there are input tokens
        assert token_ids, "Got token_ids = {}".format(token_ids)

        # Naive block handles CoW.
        self._block.append_token_ids(token_ids)
        self._update_num_tokens_total()

        # If the content hash is present, then the block can be made immutable.
        # Register ourselves with the allocator, potentially replacing the
        # physical block index.
        if self.content_hash is not None:
            self.block_id = self._allocator.promote_to_immutable_block(self)

    @property
    def block_id(self) -> Optional[int]:
        return self._block.block_id

    @block_id.setter
    def block_id(self, value) -> None:
        self._block.block_id = value

    @property
    def is_full(self) -> bool:
        return self._block.is_full

    @property
    def num_empty_slots(self) -> int:
        return self._block.num_empty_slots

    @property
    def num_tokens_total(self) -> int:
        return self._cached_num_tokens_total

    @property
    def block_size(self) -> int:
        return self._block.block_size

    @property
    def token_ids(self) -> List[int]:
        return self._block.token_ids

    @property
    def prev_block(self) -> Optional[Block]:
        return self._prev_block

    @property
    def extra_hash(self) -> Optional[int]:
        return self._extra_hash

    @property
    def content_hash(self) -> Optional[int]:
        """Return the content-based hash of the current block, or None if it is
        not yet defined.

        For the content-based hash to be defined, the current block must be
        full.
        """
        # If the hash is already computed, return it.
        if self._cached_content_hash is not None:
            return self._cached_content_hash

        # We cannot compute a hash for the current block because it is not full.
        if not self.is_full:
            return None

        is_first_block = self._prev_block is None
        prev_block_hash = (
            self._none_hash if is_first_block else
            self._prev_block.content_hash  # type: ignore
        )

        # Previous block exists but does not yet have a hash.
        # Return no hash in this case.
        if prev_block_hash == self._none_hash and not is_first_block:
            return None

        self._cached_content_hash = PrefixCachingBlock.hash_block_tokens(
            is_first_block,
            prev_block_hash,
            cur_block_token_ids=self.token_ids,
            extra_hash=self._extra_hash)
        return self._cached_content_hash

    @classmethod
    def hash_block_tokens(cls,
                          is_first_block: bool,
                          prev_block_hash: Optional[int],
                          cur_block_token_ids: List[int],
                          extra_hash: Optional[int] = None) -> int:
        """Computes a hash value corresponding to the contents of a block and
        the contents of the preceding block(s). The hash value is used for
        prefix caching.

        Parameters:
        - is_first_block (bool): A flag indicating if the block is the first in
            the sequence.
        - prev_block_hash (Optional[int]): The hash of the previous block. None
            if this is the first block.
        - cur_block_token_ids (List[int]): A list of token ids in the current
            block. The current block is assumed to be full.
        - extra_hash (Optional[int]): The hash value of additional factors
            such as adapters that influence the block, apart from the token_ids.

        Returns:
        - int: The computed hash value for the block.
        """
        if is_first_block and prev_block_hash is None:
            prev_block_hash = cls._none_hash
        return hash((is_first_block, prev_block_hash, *cur_block_token_ids,
                     extra_hash))
953
+
954
+
955
class ComputedBlocksTracker:
    """
    Tracks the computed blocks for each sequence.

    Internally, it maintains a map from sequence id to the list of block hashes
    for the sequence. We cache the hashes of the full blocks for each sequence,
    and make sure the hash is calculated in the same way as the allocator.
    When a sequence is being decoded, we also update the sequence's hash
    accordingly and incrementally.

    From the sequence hash, with prefix caching enabled, we could also calculate
    the number of cached tokens for the sequence by looking up the number of
    cached block hashes in the allocator.
    """

    # Note that we use 'None' as a string here instead of None because
    # as of Python 3.12, hash(None) returns a constant predictable value.
    # This could possibly make it easier to find and exploit hash
    # collisions. 'None' as a string will be hashed differently per process,
    # but consistently within the same process. This is the same as the
    # behavior of None prior to Python 3.12.
    _none_hash: int = hash('None')

    def __init__(
        self,
        allocator: DeviceAwareBlockAllocator,
        block_size: int,
        enable_caching: bool,
    ):
        self._allocator = allocator
        self._block_size = block_size
        self._enable_caching = enable_caching

        # A map from seq_id to the list of block hashes for the
        # sequence. This is so that we don't have to recompute the block hashes
        # for the sequence when we need to check if the sequence is cached.
        # Note a block that's not full will not have its hash calculated and
        # recorded.
        self._seq_id_to_blocks_hashes: Dict[int, List[int]] = {}

        # A map from seq_id to the number of tokens that are cached for the
        # sequence.
        # We need this so that a sequence in continuous prefill doesn't
        # accidentally see its cached token count change. See comments in
        # `get_num_cached_tokens` for more details.
        self._seq_id_to_num_tokens_computed: Dict[int, int] = {}

    def _update_seq_hashes(self, seq: Sequence) -> None:
        """Incrementally update the sequence's block hashes and record them."""
        assert self._enable_caching

        block_hashes_recorded = self._seq_id_to_blocks_hashes.get(
            seq.seq_id, [])
        cur_num_blocks_recorded = len(block_hashes_recorded)
        token_ids = seq.get_token_ids()
        assert len(token_ids) >= cur_num_blocks_recorded * self._block_size, (
            f"The sequence has {len(token_ids)} tokens, but"
            f" already recorded {cur_num_blocks_recorded} blocks. "
            "This should not happen since we assume blocks are "
            "only appended other than recomputation. When the sequence is "
            "recomputed, we should have removed the info of the old blocks.")
        # Update the computed block hashes for the sequence. Since only full
        # blocks are considered as "computed", we take floor here.
        num_computed_blocks = len(token_ids) // self._block_size

        # We need to know the hash of the previous block to compute the hash of
        # the current block so that blocks could be uniquely identified across
        # sequences of prefixes.
        prev_block_hash = (self._none_hash if cur_num_blocks_recorded == 0 else
                           block_hashes_recorded[-1])
        # Only update the computed block hashes for the new blocks
        for i in range(cur_num_blocks_recorded, num_computed_blocks):
            assert len(token_ids) >= (i + 1) * self._block_size
            block_token_ids = token_ids[i * self._block_size:(i + 1) *
                                        self._block_size]

            # NOTE: If there are any factors affecting the block besides
            # token_ids, they should be added as input to extra_hash.
            extra_hash = seq.extra_hash()

            # This has to be kept in sync with the allocator's hash
            # calculation.
            block_hash = PrefixCachingBlock.hash_block_tokens(
                is_first_block=prev_block_hash == self._none_hash,
                prev_block_hash=prev_block_hash,
                cur_block_token_ids=block_token_ids,
                extra_hash=extra_hash,
            )
            block_hashes_recorded.append(block_hash)
            prev_block_hash = block_hash

        self._seq_id_to_blocks_hashes[seq.seq_id] = block_hashes_recorded

    def get_num_cached_tokens(self, seq: Sequence) -> int:
        """Return the number of tokens of ``seq`` that hit the prefix cache.

        Returns 0 when caching is disabled. For sequences still in prefill,
        the previously computed count is returned unchanged (see note below).
        """
        if not self._enable_caching:
            return 0

        # We always try to update the sequence hashes on the fly.
        # This is to ensure that we don't miss any cached tokens for the
        # sequence during decode.
        # This routine should only update hash for any new blocks too.
        self._update_seq_hashes(seq)

        num_computed_tokens_prev = self._seq_id_to_num_tokens_computed.get(
            seq.seq_id, None)

        # TODO(rickyx): This hack could be removed once we mark blocks as
        # computed correctly with chunked prefills.
        if num_computed_tokens_prev is not None and seq.is_prefill():
            # For a sequence that is still in prefill, we don't
            # recompute the number of cached tokens.
            # This also handles correctly chunked prefill since currently
            # we mark blocks as computed even if the sequence is still partially
            # prefilled. So a continuously prefilled sequence should not
            # see its cached token count change while running.
            return num_computed_tokens_prev

        block_hashes = self._seq_id_to_blocks_hashes[seq.seq_id]

        # This is O(logN), where N is the number of blocks.
        num_cached_blocks = len(
            self._allocator.find_cached_blocks_prefix(block_hashes))
        num_cached_tokens = num_cached_blocks * self._block_size
        self._seq_id_to_num_tokens_computed[seq.seq_id] = num_cached_tokens
        return num_cached_tokens

    def remove_seq(self, seq_id: int) -> None:
        """Stop tracking the sequence."""
        if not self._enable_caching:
            return
        assert seq_id in self._seq_id_to_blocks_hashes
        del self._seq_id_to_blocks_hashes[seq_id]

        assert seq_id in self._seq_id_to_num_tokens_computed
        del self._seq_id_to_num_tokens_computed[seq_id]
1090
+
1091
+
1092
class LastAccessBlocksTracker:
    """Tracks per-sequence last-access timestamps so that the allocator's
    per-block last-access times can be refreshed in one batched call.
    """

    def __init__(self, allocator):
        self._allocator = allocator
        # seq_id -> last access timestamp (None until one is recorded).
        self._seq_last_access: Dict[int, Optional[float]] = {}

    def add_seq(self, seq_id: int) -> None:
        """Begin tracking the given sequence."""
        assert seq_id not in self._seq_last_access
        self._seq_last_access[seq_id] = None

    def remove_seq(self, seq_id: int) -> None:
        """Stop tracking the given sequence."""
        assert seq_id in self._seq_last_access
        del self._seq_last_access[seq_id]

    def update_last_access(self, seq_id: int, time: float) -> None:
        """Record ``time`` as the sequence's most recent access."""
        assert seq_id in self._seq_last_access
        self._seq_last_access[seq_id] = time

    def update_seq_blocks_last_access(self, seq_id: int,
                                      block_ids: List[int]) -> None:
        """Propagate the sequence's recorded timestamp to its blocks."""
        assert seq_id in self._seq_last_access

        recorded_ts = self._seq_last_access[seq_id]
        if recorded_ts is None:
            # No last access was recorded; nothing to propagate.
            return

        self._allocator.mark_blocks_as_accessed(block_ids, recorded_ts)
1128
+
1129
+
1130
def assert_prefix_caching_block_or_none(block: Optional[Block]):
    """Assert that ``block`` is either None or a PrefixCachingBlock."""
    if block is not None:
        assert isinstance(
            block, PrefixCachingBlock), "Got block = {}".format(block)
.venv/lib/python3.11/site-packages/vllm/core/block/utils.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ """Block manager utils."""
3
+ from vllm.sequence import SequenceGroup
4
+ from vllm.utils import (STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE,
5
+ STR_NOT_IMPL_ENC_DEC_SWA)
6
+
7
+
8
def check_no_caching_or_swa_for_blockmgr_encdec(
        block_mgr, seq_group: SequenceGroup) -> None:
    '''
    Enforce that prefix caching & sliding-window attention (SWA)
    are currently unsupported *specifically* for encoder/decoder models.

    Raises NotImplementedError if unsupported scenario is detected.

    Arguments:

    * block_mgr: BlockSpaceManager instance
    * seq_group: SequenceGroup passed to block_mgr
    '''
    # Decoder-only models are unaffected by these restrictions.
    if not seq_group.is_encoder_decoder():
        return

    if block_mgr.max_block_sliding_window is not None:
        raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_SWA)

    if block_mgr.enable_caching:
        raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE)
.venv/lib/python3.11/site-packages/vllm/core/interfaces.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+
3
+ import enum
4
+ from abc import ABC, abstractmethod
5
+ from typing import List
6
+ from typing import Sequence as GenericSequence
7
+ from typing import Tuple
8
+
9
+ from vllm.sequence import Sequence, SequenceGroup
10
+ from vllm.utils import Device
11
+
12
+
13
class AllocStatus(enum.Enum):
    """Result for BlockSpaceManager.can_allocate

    1. Ok: seq_group can be allocated now.
    2. Later: seq_group cannot be allocated.
       The capacity of allocator is larger than seq_group required.
    3. Never: seq_group can never be allocated.
       The seq_group is too large to be allocated in GPU.
    """
    OK = enum.auto()
    LATER = enum.auto()
    NEVER = enum.auto()
+
26
+
27
+ class BlockSpaceManager(ABC):
28
+
29
+ @staticmethod
30
+ def get_block_space_manager_class(version: str):
31
+ version = version.lower()
32
+
33
+ if version == "selfattn":
34
+ from vllm.core.block_manager import SelfAttnBlockSpaceManager
35
+ return SelfAttnBlockSpaceManager
36
+
37
+ if version == "placeholder":
38
+ from vllm.core.placeholder_block_space_manager import (
39
+ PlaceholderBlockSpaceManager)
40
+ return PlaceholderBlockSpaceManager
41
+
42
+ raise ValueError(f"Unknown version {version=}")
43
+
44
+ @abstractmethod
45
+ def can_allocate(self,
46
+ seq_group: SequenceGroup,
47
+ num_lookahead_slots: int = 0) -> AllocStatus:
48
+ pass
49
+
50
+ @abstractmethod
51
+ def allocate(self, seq_group: SequenceGroup) -> None:
52
+ pass
53
+
54
+ @abstractmethod
55
+ def can_append_slots(self, seq_group: SequenceGroup,
56
+ num_lookahead_slots: int) -> bool:
57
+ pass
58
+
59
+ @abstractmethod
60
+ def append_slots(
61
+ self,
62
+ seq: Sequence,
63
+ num_lookahead_slots: int,
64
+ ) -> List[Tuple[int, int]]:
65
+ pass
66
+
67
+ @abstractmethod
68
+ def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None:
69
+ pass
70
+
71
+ @abstractmethod
72
+ def can_swap_in(self, seq_group: SequenceGroup,
73
+ num_lookahead_slots: int) -> AllocStatus:
74
+ pass
75
+
76
+ @abstractmethod
77
+ def swap_in(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]:
78
+ pass
79
+
80
+ @abstractmethod
81
+ def can_swap_out(self, seq_group: SequenceGroup) -> bool:
82
+ pass
83
+
84
+ @abstractmethod
85
+ def swap_out(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]:
86
+ pass
87
+
88
+ @abstractmethod
89
+ def free(self, seq: Sequence) -> None:
90
+ pass
91
+
92
+ @abstractmethod
93
+ def get_block_table(self, seq: Sequence) -> List[int]:
94
+ pass
95
+
96
+ @abstractmethod
97
+ def get_num_free_gpu_blocks(self) -> int:
98
+ pass
99
+
100
+ @abstractmethod
101
+ def get_num_free_cpu_blocks(self) -> int:
102
+ pass
103
+
104
+ @abstractmethod
105
+ def access_all_blocks_in_seq(
106
+ self,
107
+ seq: Sequence,
108
+ access_time: float,
109
+ ) -> None:
110
+ pass
111
+
112
+ @abstractmethod
113
+ def get_common_computed_block_ids(
114
+ self, seqs: List[Sequence]) -> GenericSequence[int]:
115
+ pass
116
+
117
+ @abstractmethod
118
+ def mark_blocks_as_computed(self, seq_group: SequenceGroup,
119
+ token_chunk_size: int):
120
+ pass
121
+
122
+ @abstractmethod
123
+ def get_prefix_cache_hit_rate(self, device: Device) -> float:
124
+ """Prefix cache hit rate. -1 means not supported or disabled."""
125
+ pass
126
+
127
+ @abstractmethod
128
+ def reset_prefix_cache(self) -> bool:
129
+ """Reset prefix cache for all devices."""
130
+ pass
131
+
132
+ @abstractmethod
133
+ def get_num_cached_tokens(self, seq: Sequence) -> int:
134
+ pass
.venv/lib/python3.11/site-packages/vllm/core/placeholder_block_space_manager.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+
3
+ from typing import List, Tuple
4
+
5
+ from vllm.core.interfaces import AllocStatus, BlockSpaceManager
6
+ from vllm.sequence import Sequence, SequenceGroup
7
+ from vllm.utils import Device
8
+
9
+
10
class PlaceholderBlockSpaceManager(BlockSpaceManager):
    """A no-op BlockSpaceManager for runs that need no block accounting.

    For example: pooling models or attention-free models like Mamba.

    Every method either does nothing or hands back a trivial answer
    (``True``, ``AllocStatus.OK``, an empty list, ...), so the scheduler
    can keep a single code path while paying zero block-management cost,
    such as in an embedding environment.
    """

    def __init__(
        self,
        **kwargs,
    ) -> None:
        # Nothing to initialize: there is no block pool to track.
        pass

    def can_allocate(self,
                     seq_group: SequenceGroup,
                     num_lookahead_slots: int = 0) -> AllocStatus:
        # Allocation can never fail when nothing is actually allocated.
        return AllocStatus.OK

    def allocate(self, seq_group: SequenceGroup) -> None:
        # No allocation work to perform.
        pass

    def can_append_slots(self, seq_group: SequenceGroup,
                         num_lookahead_slots: int) -> bool:
        return True

    def append_slots(
        self,
        seq: Sequence,
        num_lookahead_slots: int,
    ) -> List[Tuple[int, int]]:
        # No copy-on-write block copies are ever required.
        return []

    def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None:
        pass

    def can_swap_in(self, seq_group: SequenceGroup,
                    num_lookahead_slots: int) -> AllocStatus:
        return AllocStatus.OK

    def swap_in(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]:
        # NOTE(review): returns None despite the List annotation — callers
        # are presumed never to inspect this for the placeholder manager.
        return None  # type: ignore

    def can_swap_out(self, seq_group: SequenceGroup) -> bool:
        return True

    def swap_out(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]:
        # Same deliberate None-return as swap_in.
        return None  # type: ignore

    def free(self, seq: Sequence) -> None:
        # Nothing was allocated, so nothing needs freeing.
        return

    def get_block_table(self, seq: Sequence) -> List[int]:
        # No block table exists for placeholder-managed sequences.
        return None  # type: ignore

    def get_num_free_gpu_blocks(self) -> int:
        return 1

    def get_num_free_cpu_blocks(self) -> int:
        return 1

    def access_all_blocks_in_seq(
        self,
        seq: Sequence,
        access_time: float,
    ) -> None:
        pass

    def get_common_computed_block_ids(self,
                                      seq_group: List[Sequence]) -> List[int]:
        # No prefix caching: no computed blocks are ever shared.
        return []

    def mark_blocks_as_computed(self, seq_group: SequenceGroup,
                                token_chunk_size: int):
        pass

    def get_prefix_cache_hit_rate(self, device: Device) -> float:
        # -1 signals "not supported or disabled".
        return -1

    def reset_prefix_cache(self) -> bool:
        # Trivially succeeds: there is no cache to reset.
        return True

    def get_num_cached_tokens(self, seq: Sequence) -> int:
        return 0
.venv/lib/python3.11/site-packages/vllm/core/scheduler.py ADDED
@@ -0,0 +1,1840 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+
3
+ import enum
4
+ import os
5
+ import random
6
+ import time
7
+ from collections import deque
8
+ from dataclasses import dataclass, field
9
+ from typing import Callable, Deque, Dict, Iterable, List, Optional
10
+ from typing import Sequence as GenericSequence
11
+ from typing import Set, Tuple, Union
12
+
13
+ from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig
14
+ from vllm.core.interfaces import AllocStatus, BlockSpaceManager
15
+ from vllm.logger import init_logger
16
+ from vllm.lora.request import LoRARequest
17
+ from vllm.prompt_adapter.request import PromptAdapterRequest
18
+ from vllm.sequence import (Sequence, SequenceData, SequenceGroup,
19
+ SequenceGroupMetadata, SequenceGroupMetadataDelta,
20
+ SequenceStatus)
21
+ from vllm.utils import Device, PyObjectCache
22
+
23
+ logger = init_logger(__name__)
24
+
25
+ # Test-only. If configured, decode is preempted with
26
+ # ARTIFICIAL_PREEMPTION_PROB% probability.
27
+ ENABLE_ARTIFICIAL_PREEMPT = bool(
28
+ os.getenv("VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT", False)) # noqa
29
+ ARTIFICIAL_PREEMPTION_PROB = 0.5
30
+ ARTIFICIAL_PREEMPTION_MAX_CNT = 500
31
+
32
+
33
class PreemptionMode(enum.Enum):
    """How a preempted sequence group's state is preserved.

    1. Swapping: Swap out the blocks of the preempted sequences to CPU memory
    and swap them back in when the sequences are resumed.
    2. Recomputation: Discard the blocks of the preempted sequences and
    recompute them when the sequences are resumed, treating the sequences as
    new prompts.
    """

    SWAP = enum.auto()
    RECOMPUTE = enum.auto()
44
+
45
+
46
@dataclass
class SchedulingBudget:
    """The available token and sequence slots for one scheduling step.

    TODO(sang): Right now, the budget is request_id-aware meaning it can ignore
    budget update from the same request_id. It is because in normal scheduling
    path, we update RUNNING num_seqs ahead of time, meaning it could be
    updated more than once when scheduling RUNNING requests. Since this won't
    happen if we only have chunked prefill scheduling, we can remove this
    feature from the API when chunked prefill is enabled by default.
    """
    token_budget: int
    max_num_seqs: int
    _request_ids_num_batched_tokens: Set[str] = field(default_factory=set)
    _request_ids_num_curr_seqs: Set[str] = field(default_factory=set)
    # Number of cached tokens in the batch.
    _num_cached_tokens: int = 0
    # Number of actual non-cached tokens in the batch.
    _num_batched_tokens: int = 0
    _num_curr_seqs: int = 0

    def can_schedule(self, *, num_new_tokens: int, num_new_seqs: int):
        """Whether both the token and the sequence budget can absorb the
        proposed additions."""
        # num_new_tokens may legitimately be 0 when the entire sequence
        # has been served from the cache.
        assert num_new_tokens >= 0
        assert num_new_seqs != 0
        fits_tokens = (self.num_batched_tokens + num_new_tokens
                       <= self.token_budget)
        fits_seqs = self.num_curr_seqs + num_new_seqs <= self.max_num_seqs
        return fits_tokens and fits_seqs

    def remaining_token_budget(self):
        """Tokens still available in this step."""
        return self.token_budget - self.num_batched_tokens

    def add_num_batched_tokens(self,
                               req_id: str,
                               num_batched_tokens: int,
                               num_cached_tokens: int = 0):
        """Charge a request's tokens once; repeated calls for the same
        request id are no-ops (see class TODO)."""
        if req_id in self._request_ids_num_batched_tokens:
            return
        assert num_cached_tokens >= 0
        assert num_batched_tokens >= 0

        self._request_ids_num_batched_tokens.add(req_id)
        self._num_batched_tokens += num_batched_tokens
        self._num_cached_tokens += num_cached_tokens

    def subtract_num_batched_tokens(self, req_id: str,
                                    num_batched_tokens: int):
        """Refund a request's token charge; no-op if it was never charged."""
        if req_id in self._request_ids_num_batched_tokens:
            self._request_ids_num_batched_tokens.discard(req_id)
            self._num_batched_tokens -= num_batched_tokens

    def add_num_seqs(self, req_id: str, num_curr_seqs: int):
        """Charge a request's sequence count once per request id."""
        if req_id in self._request_ids_num_curr_seqs:
            return
        self._request_ids_num_curr_seqs.add(req_id)
        self._num_curr_seqs += num_curr_seqs

    def subtract_num_seqs(self, req_id: str, num_curr_seqs: int):
        """Refund a request's sequence charge; no-op if never charged."""
        if req_id in self._request_ids_num_curr_seqs:
            self._request_ids_num_curr_seqs.discard(req_id)
            self._num_curr_seqs -= num_curr_seqs

    @property
    def num_batched_tokens(self):
        return self._num_batched_tokens

    @property
    def num_curr_seqs(self):
        return self._num_curr_seqs

    @property
    def num_cached_tokens(self):
        return self._num_cached_tokens
120
+
121
+
122
@dataclass
class ScheduledSequenceGroup:
    """A sequence group paired with the number of tokens to process for it
    in the next iteration."""
    # A sequence group that's scheduled.
    seq_group: SequenceGroup
    # The total chunk size (number of tokens) to process for next iteration.
    # 1 for decoding. Same as prompt tokens for prefill, but if prefill is
    # chunked, it can be smaller than that.
    token_chunk_size: int
130
+
131
+
132
@dataclass
class SchedulerOutputs:
    """The scheduling decision made from a scheduler."""
    # Scheduled sequence groups.
    scheduled_seq_groups: GenericSequence[ScheduledSequenceGroup]
    # Number of prefill groups scheduled.
    num_prefill_groups: int
    # Total number of batched tokens.
    num_batched_tokens: int
    # Blocks to swap in. List of CPU -> GPU block number.
    blocks_to_swap_in: List[Tuple[int, int]]
    # Blocks to swap out. List of GPU -> CPU block number.
    blocks_to_swap_out: List[Tuple[int, int]]
    # Blocks to copy. Source to dest block.
    blocks_to_copy: List[Tuple[int, int]]
    # Sequence groups that are going to be ignored.
    ignored_seq_groups: List[SequenceGroup]
    # The number of slots for lookahead decoding.
    num_lookahead_slots: int
    # The number of requests in the running queue
    running_queue_size: int
    # Count of sequence groups preempted during this scheduling step.
    preempted: int

    def __post_init__(self):
        # Swap in and swap out should never happen at the same time.
        assert not (self.blocks_to_swap_in and self.blocks_to_swap_out)

        # num_loras is derived from the lora_requests property below,
        # which scans scheduled_seq_groups.
        self.num_loras: int = len(self.lora_requests)
        if self.num_loras > 0:
            self._sort_by_lora_ids()

        self.num_prompt_adapters: int = len(self.prompt_adapter_requests)

    def is_empty(self) -> bool:
        # NOTE: We do not consider the ignored sequence groups.
        return (not self.scheduled_seq_groups and not self.blocks_to_swap_in
                and not self.blocks_to_swap_out and not self.blocks_to_copy)

    def _sort_by_lora_ids(self):
        # Reorders scheduled_seq_groups in place by (lora_int_id,
        # request_id), keeping prefills ahead of decodes when the batch
        # mixes both (required by chunked prefill).
        assert 0 <= self.num_prefill_groups <= len(self.scheduled_seq_groups)

        def key_fn(group: ScheduledSequenceGroup):
            key = (group.seq_group.lora_int_id, group.seq_group.request_id)
            if 0 < self.num_prefill_groups < len(self.scheduled_seq_groups):
                # Sort sequence groups so that all prefills come before all
                # decodes as required by chunked prefill.
                # False < True, so prefill groups (not is_prefill() ==
                # False) sort first.
                return (not group.seq_group.is_prefill(), *key)
            return key

        self.scheduled_seq_groups = sorted(self.scheduled_seq_groups,
                                           key=key_fn)

    @property
    def lora_requests(self) -> Set[LoRARequest]:
        # Distinct LoRA requests across all scheduled groups.
        return {
            g.seq_group.lora_request
            for g in self.scheduled_seq_groups
            if g.seq_group.lora_request is not None
        }

    @property
    def prompt_adapter_requests(self) -> Set[PromptAdapterRequest]:
        # Distinct prompt-adapter requests across all scheduled groups.
        return {
            g.seq_group.prompt_adapter_request
            for g in self.scheduled_seq_groups
            if g.seq_group.prompt_adapter_request is not None
        }
199
+
200
+
201
@dataclass
class SchedulerRunningOutputs:
    """Result of scheduling the running queue.

    Could contain prefill (prefill that's chunked) or decodes. If there's not
    enough memory, it can be preempted (for recompute) or swapped out.
    """
    # Running groups scheduled in the decoding phase.
    decode_seq_groups: List[ScheduledSequenceGroup]
    # Running groups still in a prefill phase (i.e. the prefill was chunked).
    prefill_seq_groups: List[ScheduledSequenceGroup]
    # Groups preempted for recomputation.
    preempted: List[SequenceGroup]
    # Groups swapped out to CPU memory.
    swapped_out: List[SequenceGroup]
    # GPU -> CPU block moves to perform before execution.
    blocks_to_swap_out: List[Tuple[int, int]]
    # Source -> destination block copies.
    blocks_to_copy: List[Tuple[int, int]]
    # The number of slots for lookahead decoding.
    num_lookahead_slots: int

    # Optimization for fast-access to seq_group lists
    decode_seq_groups_list: List[SequenceGroup]
    prefill_seq_groups_list: List[SequenceGroup]

    @classmethod
    def create_empty(cls) -> "SchedulerRunningOutputs":
        """Build an output object with every collection empty."""
        return cls(
            decode_seq_groups=[],
            prefill_seq_groups=[],
            preempted=[],
            swapped_out=[],
            blocks_to_swap_out=[],
            blocks_to_copy=[],
            num_lookahead_slots=0,
            decode_seq_groups_list=[],
            prefill_seq_groups_list=[],
        )
241
+
242
+
243
@dataclass
class SchedulerSwappedInOutputs:
    """Result of scheduling the swapped queue.

    Could contain prefill (prefill that's chunked) or decodes.
    """
    # Groups to swap back in that are in the decoding phase.
    decode_seq_groups: List[ScheduledSequenceGroup]
    # Groups to swap back in that are still in a (chunked) prefill phase.
    prefill_seq_groups: List[ScheduledSequenceGroup]
    # CPU -> GPU block moves to perform before execution.
    blocks_to_swap_in: List[Tuple[int, int]]
    # Source -> destination block copies.
    blocks_to_copy: List[Tuple[int, int]]
    # The number of slots for lookahead decoding.
    num_lookahead_slots: int
    # Groups that can never fit and are therefore aborted.
    infeasible_seq_groups: List[SequenceGroup]

    @classmethod
    def create_empty(cls) -> "SchedulerSwappedInOutputs":
        """Build an output object with every collection empty."""
        return cls(
            decode_seq_groups=[],
            prefill_seq_groups=[],
            blocks_to_swap_in=[],
            blocks_to_copy=[],
            num_lookahead_slots=0,
            infeasible_seq_groups=[],
        )
274
+
275
+
276
@dataclass
class SchedulerPrefillOutputs:
    """Result of scheduling the waiting queue.

    Could contain a fresh prefill requests or preempted requests that need
    to be recomputed from scratch.
    """
    # Groups selected for prefill this step.
    seq_groups: List[ScheduledSequenceGroup]
    # Groups that will be skipped (e.g. prompt too long).
    ignored_seq_groups: List[SequenceGroup]
    # The number of slots for lookahead decoding.
    num_lookahead_slots: int

    @classmethod
    def create_empty(cls) -> "SchedulerPrefillOutputs":
        """Build an output object with every collection empty."""
        return cls(
            seq_groups=[],
            ignored_seq_groups=[],
            num_lookahead_slots=0,
        )
296
+
297
+
298
def seq_group_metadata_builder():
    """Factory for blank SequenceGroupMetadata objects, used to pre-populate
    the scheduler's PyObjectCache pools."""
    return SequenceGroupMetadata(
        request_id="",
        is_prompt=False,
        seq_data={},
        sampling_params=None,
        block_tables={},
    )
304
+
305
+
306
def scheduler_running_outputs_builder():
    """Factory for empty SchedulerRunningOutputs objects, used to
    pre-populate the scheduler's PyObjectCache pools."""
    return SchedulerRunningOutputs(
        decode_seq_groups=[],
        prefill_seq_groups=[],
        preempted=[],
        swapped_out=[],
        blocks_to_swap_out=[],
        blocks_to_copy=[],
        num_lookahead_slots=0,
        prefill_seq_groups_list=[],
        decode_seq_groups_list=[],
    )
316
+
317
+
318
def scheduled_seq_group_builder():
    """Factory for placeholder ScheduledSequenceGroup objects for the
    PyObjectCache pools.

    SequenceGroup.__new__ skips __init__ on purpose: the cached object is a
    shell whose fields are overwritten before use by _schedule_running.
    """
    return ScheduledSequenceGroup(SequenceGroup.__new__(SequenceGroup),
                                  token_chunk_size=0)
322
+
323
+
324
+ class Scheduler:
325
+
326
    def __init__(
        self,
        scheduler_config: SchedulerConfig,
        cache_config: CacheConfig,
        lora_config: Optional[LoRAConfig],
        pipeline_parallel_size: int = 1,
        output_proc_callback: Optional[Callable] = None,
    ) -> None:
        """Initialize the scheduler and its block space manager.

        Args:
            scheduler_config: Scheduling policy knobs (chunking, preemption
                mode, max model length, runner type, ...).
            cache_config: KV-cache configuration; block counts here are
                divided by ``pipeline_parallel_size``.
            lora_config: LoRA configuration, or None when LoRA is disabled.
            pipeline_parallel_size: Number of pipeline stages sharing the
                cache blocks.
            output_proc_callback: When set, enables async output processing
                and double-buffered object caches.
        """
        self.scheduler_config = scheduler_config
        self.cache_config = cache_config
        # Note for LoRA scheduling: the current policy is extremely
        # simple and NOT fair. It can lead to starvation of some
        # LoRAs. This should be improved in the future.
        self.lora_config = lora_config

        # Pooling and attention-free models need no real block accounting,
        # so they get the placeholder manager.
        version = "selfattn"
        if (self.scheduler_config.runner_type == "pooling"
                or self.cache_config.is_attention_free):
            version = "placeholder"

        BlockSpaceManagerImpl = BlockSpaceManager.get_block_space_manager_class(
            version)

        # Each pipeline stage owns an equal share of the blocks.
        num_gpu_blocks = cache_config.num_gpu_blocks
        if num_gpu_blocks:
            num_gpu_blocks //= pipeline_parallel_size

        num_cpu_blocks = cache_config.num_cpu_blocks
        if num_cpu_blocks:
            num_cpu_blocks //= pipeline_parallel_size

        # Create the block space manager.
        self.block_manager = BlockSpaceManagerImpl(
            block_size=self.cache_config.block_size,
            num_gpu_blocks=num_gpu_blocks,
            num_cpu_blocks=num_cpu_blocks,
            sliding_window=self.cache_config.sliding_window,
            enable_caching=self.cache_config.enable_prefix_caching)

        # Sequence groups in the WAITING state.
        # Contain new prefill or preempted requests.
        self.waiting: Deque[SequenceGroup] = deque()
        # Sequence groups in the RUNNING state.
        # Contain decode requests.
        self.running: Deque[SequenceGroup] = deque()
        # Sequence groups in the SWAPPED state.
        # Contain decode requests that are swapped out.
        self.swapped: Deque[SequenceGroup] = deque()
        # Sequence groups finished requests ids since last step iteration.
        # It lets the model know that any state associated with these requests
        # can and must be released after the current step.
        # This is used to evict the finished requests from the Mamba cache.
        self._finished_requests_ids: List[str] = list()
        # Time at previous scheduling step
        self.prev_time = 0.0
        # Did we schedule a prompt at previous step?
        self.prev_prompt = False
        # Latency of the last prompt step
        self.last_prompt_latency = 0.0
        # preemption mode, RECOMPUTE or SWAP
        self.user_specified_preemption_mode = scheduler_config.preemption_mode

        # The following field is test-only. It is used to inject artificial
        # preemption.
        self.enable_artificial_preemption = ENABLE_ARTIFICIAL_PREEMPT
        self.artificial_preempt_cnt = (ARTIFICIAL_PREEMPTION_MAX_CNT
                                       if self.enable_artificial_preemption
                                       else 0)
        self.num_cumulative_preemption: int = 0

        # Used to cache python objects
        self._seq_group_metadata_cache: List[PyObjectCache] = []
        self._scheduler_running_outputs_cache: List[PyObjectCache] = []
        self._scheduled_seq_group_cache: List[PyObjectCache] = []

        # For async output processing, we need to swap cache buffers between
        # iterations. I.e. since the output processing is lagged one step,
        # we cannot reuse the cached objects immediately when the schedule()
        # is called again, but only when schedule() is called the second time.
        self.output_proc_callback = output_proc_callback
        self.use_async_output_proc = self.output_proc_callback is not None
        self.num_cache_iters = 2 if self.use_async_output_proc else 1

        self.cache_id = 0
        for i in range(self.num_cache_iters):
            self._seq_group_metadata_cache.append(
                PyObjectCache(seq_group_metadata_builder))
            self._scheduler_running_outputs_cache.append(
                PyObjectCache(scheduler_running_outputs_builder))
            self._scheduled_seq_group_cache.append(
                PyObjectCache(scheduled_seq_group_builder))

        # For async postprocessor, the extra decode run cannot be done
        # when the request reaches max_model_len. In this case, the request
        # will be stopped during schedule() call and added to this stop list
        # for processing and deallocation by the free_finished_seq_groups()
        self._async_stopped: List[SequenceGroup] = []
423
+
424
    @property
    def next_cache_id(self):
        """Index of the object-cache buffer the following step will use
        (alternates between the two buffers under async output proc)."""
        return (self.cache_id + 1) % self.num_cache_iters
427
+
428
    @property
    def lora_enabled(self) -> bool:
        """Whether LoRA scheduling is active (a LoRA config was supplied)."""
        return bool(self.lora_config)
431
+
432
    @property
    def num_decoding_tokens_per_seq(self) -> int:
        """The number of new tokens generated per sequence per decode step."""
        return 1
436
+
437
    def add_seq_group(self, seq_group: SequenceGroup) -> None:
        """Enqueue a new sequence group onto the WAITING queue."""
        # Add sequence groups to the waiting queue.
        self.waiting.append(seq_group)
440
+
441
    def _add_seq_group_to_running(self, seq_group: SequenceGroup) -> None:
        """Append directly to the RUNNING queue, bypassing scheduling."""
        # Add sequence groups to the running queue.
        # Only for testing purposes.
        self.running.append(seq_group)
445
+
446
    def _add_seq_group_to_swapped(self, seq_group: SequenceGroup) -> None:
        """Append directly to the SWAPPED queue, bypassing scheduling."""
        # Add sequence groups to the swapped queue.
        # Only for testing purposes.
        self.swapped.append(seq_group)
450
+
451
    def abort_seq_group(self, request_id: Union[str, Iterable[str]]) -> None:
        """Aborts a sequence group with the given ID.

        Check if the sequence group with the given ID
        is present in any of the state queue.
        If present, remove the sequence group from the state queue.
        Also, if any of the sequences in the sequence group is not finished,
        free the sequence with status `FINISHED_ABORTED`.
        Otherwise, do nothing.

        Args:
            request_id: The ID(s) of the sequence group to abort.
        """
        if isinstance(request_id, str):
            request_id = (request_id, )
        request_ids = set(request_id)
        for state_queue in [self.waiting, self.running, self.swapped]:
            # Collect matches first; removing from a deque while iterating
            # it would corrupt the iteration.
            aborted_groups: List[SequenceGroup] = []
            for seq_group in state_queue:
                if not request_ids:
                    # Using 'break' here may add two extra iterations,
                    # but is acceptable to reduce complexity.
                    break
                if seq_group.request_id in request_ids:
                    # Appending aborted group into pending list.
                    aborted_groups.append(seq_group)
                    # Each id is consumed once: the same request cannot sit
                    # in two state queues.
                    request_ids.remove(seq_group.request_id)
            for aborted_group in aborted_groups:
                # Remove the sequence group from the state queue.
                state_queue.remove(aborted_group)
                # Remove the aborted request from the Mamba cache.
                self._finished_requests_ids.append(aborted_group.request_id)
                for seq in aborted_group.get_seqs():
                    if seq.is_finished():
                        continue
                    seq.status = SequenceStatus.FINISHED_ABORTED
                    self.free_seq(seq)

                self._free_seq_group_cross_attn_blocks(aborted_group)
490
+
491
    def _free_seq_group_cross_attn_blocks(
        self,
        seq_group: SequenceGroup,
    ) -> None:
        """
        Free a sequence group from a cross-attention block table.
        Has no effect on decoder-only models.
        """
        # Only encoder-decoder models allocate cross-attention blocks.
        if seq_group.is_encoder_decoder():
            self.block_manager.free_cross(seq_group)
501
+
502
+ def has_unfinished_seqs(self) -> bool:
503
+ return len(self.waiting) != 0 or len(self.running) != 0 or len(
504
+ self.swapped) != 0
505
+
506
    def get_prefix_cache_hit_rate(self, device: Device) -> float:
        """Delegate to the block manager; -1 means unsupported/disabled."""
        return self.block_manager.get_prefix_cache_hit_rate(device)
508
+
509
    def reset_prefix_cache(self) -> bool:
        """Delegate a full prefix-cache reset to the block manager."""
        return self.block_manager.reset_prefix_cache()
511
+
512
+ def get_num_unfinished_seq_groups(self) -> int:
513
+ return len(self.waiting) + len(self.running) + len(self.swapped)
514
+
515
+ def get_and_reset_finished_requests_ids(self) -> List[str]:
516
+ """Flushes the list of request ids of previously finished seq_groups."""
517
+ finished_requests_ids = self._finished_requests_ids
518
+ self._finished_requests_ids = list()
519
+ return finished_requests_ids
520
+
521
    def _schedule_running(
        self,
        budget: SchedulingBudget,
        curr_loras: Optional[Set[int]],
        enable_chunking: bool = False,
    ) -> SchedulerRunningOutputs:
        """Schedule sequence groups that are running.

        Running queue should include decode and chunked prefill requests.

        Args:
            budget: The scheduling budget. The argument is in-place updated
                when any decodes are preempted.
            curr_loras: Currently batched lora request ids. The argument is
                in-place updated when any decodes are preempted.
            enable_chunking: If True, seq group can be chunked and only a
                chunked number of tokens are scheduled if
                `budget.num_batched_tokens` has not enough capacity to schedule
                all tokens.

        Returns:
            SchedulerRunningOutputs.
        """
        # Reuse a pooled output object instead of allocating a new one;
        # clear every collection left over from the previous step.
        ret: SchedulerRunningOutputs = \
            self._scheduler_running_outputs_cache[self.cache_id].get_object()
        ret.blocks_to_swap_out.clear()
        ret.blocks_to_copy.clear()
        ret.decode_seq_groups.clear()
        ret.prefill_seq_groups.clear()
        ret.preempted.clear()
        ret.swapped_out.clear()

        ret.num_lookahead_slots = self._get_num_lookahead_slots(
            is_prefill=False, enable_chunking=enable_chunking)

        ret.decode_seq_groups_list.clear()
        ret.prefill_seq_groups_list.clear()

        # Blocks that need to be swapped or copied before model execution.
        blocks_to_swap_out: List[Tuple[int, int]] = ret.blocks_to_swap_out
        blocks_to_copy: List[Tuple[int, int]] = ret.blocks_to_copy

        decode_seq_groups: List[ScheduledSequenceGroup] = ret.decode_seq_groups
        prefill_seq_groups: List[
            ScheduledSequenceGroup] = ret.prefill_seq_groups
        preempted: List[SequenceGroup] = ret.preempted
        swapped_out: List[SequenceGroup] = ret.swapped_out

        running_queue = self.running
        assert len(self._async_stopped) == 0
        while running_queue:
            # Peek first; only popleft once we know the group has budget.
            seq_group = running_queue[0]
            # We discard the cached tokens info here because we don't need it
            # for running sequence:
            # 1. If a sequence is running with chunked prefill, the cached
            #    tokens info was already used for the first prefill.
            # 2. If a sequence is running with non-chunked prefill, then
            #    there it's a decoding sequence, and the cached tokens info is
            #    irrelevant.
            num_uncached_new_tokens, _ = (
                self._get_num_new_uncached_and_cached_tokens(
                    seq_group, SequenceStatus.RUNNING, enable_chunking,
                    budget))

            num_running_tokens = num_uncached_new_tokens
            if num_running_tokens == 0:
                # No budget => Stop
                break

            running_queue.popleft()

            # With async postprocessor, an extra decode run is done
            # to process the final tokens. The check below avoids this extra
            # decode run when the model max len is reached, in order to avoid
            # a memory overflow.
            if self.use_async_output_proc and seq_group.seqs[0].get_len(
            ) > self.scheduler_config.max_model_len:
                self._async_stopped.append(seq_group)
                continue

            # NOTE(woosuk): Preemption happens only when there is no available
            # slot to keep all the sequence groups in the RUNNING state.
            while not self._can_append_slots(seq_group, enable_chunking):
                # Refund this group's budget before evicting someone.
                budget.subtract_num_batched_tokens(seq_group.request_id,
                                                   num_running_tokens)
                num_running_seqs = seq_group.get_max_num_running_seqs()
                budget.subtract_num_seqs(seq_group.request_id,
                                         num_running_seqs)

                if (curr_loras is not None and seq_group.lora_int_id > 0
                        and seq_group.lora_int_id in curr_loras):
                    curr_loras.remove(seq_group.lora_int_id)

                # Determine victim sequence
                cont_loop = True
                if running_queue:
                    # Preempt the lowest-priority sequence group.
                    victim_seq_group = running_queue.pop()
                else:
                    # No other sequence group can be preempted.
                    # Preempt the current sequence group.
                    # Note: This is also where we stop this loop
                    # (since there is nothing else to preempt)
                    victim_seq_group = seq_group
                    cont_loop = False

                # With async postprocessor, before preempting a sequence
                # we need to ensure it has no pending async postprocessor
                do_preempt = True
                if self.use_async_output_proc:
                    assert self.output_proc_callback is not None
                    self.output_proc_callback(
                        request_id=victim_seq_group.request_id)

                    # It may be that the async pending "victim_seq_group"
                    # becomes finished, in which case we simply free it.
                    if victim_seq_group.is_finished():
                        self._free_finished_seq_group(victim_seq_group)
                        do_preempt = False

                # Do preemption
                if do_preempt:
                    preempted_mode = self._preempt(victim_seq_group,
                                                   blocks_to_swap_out)
                    if preempted_mode == PreemptionMode.RECOMPUTE:
                        preempted.append(victim_seq_group)
                    else:
                        swapped_out.append(victim_seq_group)

                if not cont_loop:
                    break
            else:
                # while-else: runs only when the loop exited because slots
                # became available (not via break after self-preemption).
                self._append_slots(seq_group, blocks_to_copy, enable_chunking)
                is_prefill = seq_group.is_prefill()

                scheduled_seq_group: ScheduledSequenceGroup = \
                    self._scheduled_seq_group_cache[self.cache_id].get_object()
                scheduled_seq_group.seq_group = seq_group
                if is_prefill:
                    scheduled_seq_group.token_chunk_size = num_running_tokens
                    prefill_seq_groups.append(scheduled_seq_group)
                    ret.prefill_seq_groups_list.append(seq_group)
                else:
                    scheduled_seq_group.token_chunk_size = 1
                    decode_seq_groups.append(scheduled_seq_group)
                    ret.decode_seq_groups_list.append(seq_group)

                budget.add_num_batched_tokens(seq_group.request_id,
                                              num_running_tokens)
                # OPTIMIZATION: Note that get_max_num_running_seqs is
                # expensive. For the default scheduling chase where
                # enable_chunking is False, num_seqs are updated before running
                # this method, so we don't have to update it again here.
                if enable_chunking:
                    num_running_seqs = seq_group.get_max_num_running_seqs()
                    budget.add_num_seqs(seq_group.request_id, num_running_seqs)
                if curr_loras is not None and seq_group.lora_int_id > 0:
                    curr_loras.add(seq_group.lora_int_id)

        self._scheduler_running_outputs_cache[self.next_cache_id].reset()
        self._scheduled_seq_group_cache[self.next_cache_id].reset()

        return ret
684
+
685
+ def _schedule_swapped(
686
+ self,
687
+ budget: SchedulingBudget,
688
+ curr_loras: Optional[Set[int]],
689
+ enable_chunking: bool = False,
690
+ ) -> SchedulerSwappedInOutputs:
691
+ """Schedule sequence groups that are swapped out.
692
+
693
+ It schedules swapped requests as long as it fits `budget` and
694
+ curr_loras <= max_lora from the scheduling config. The input arguments
695
+ `budget` and `curr_loras` are updated based on scheduled seq_groups.
696
+
697
+ Args:
698
+ budget: The scheduling budget. The argument is in-place updated
699
+ when any requests are swapped in.
700
+ curr_loras: Currently batched lora request ids. The argument is
701
+ in-place updated when any requests are swapped in.
702
+ enable_chunking: If True, seq group can be chunked and only a
703
+ chunked number of tokens are scheduled if
704
+ `budget.num_batched_tokens` has not enough capacity to schedule
705
+ all tokens.
706
+
707
+ Returns:
708
+ SchedulerSwappedInOutputs.
709
+ """
710
+ # Blocks that need to be swapped or copied before model execution.
711
+ blocks_to_swap_in: List[Tuple[int, int]] = []
712
+ blocks_to_copy: List[Tuple[int, int]] = []
713
+ decode_seq_groups: List[ScheduledSequenceGroup] = []
714
+ prefill_seq_groups: List[ScheduledSequenceGroup] = []
715
+ infeasible_seq_groups: List[SequenceGroup] = []
716
+
717
+ swapped_queue = self.swapped
718
+
719
+ leftover_swapped: Deque[SequenceGroup] = deque()
720
+ while swapped_queue:
721
+ seq_group = swapped_queue[0]
722
+
723
+ # If the sequence group cannot be swapped in, stop.
724
+ is_prefill = seq_group.is_prefill()
725
+ alloc_status = self.block_manager.can_swap_in(
726
+ seq_group,
727
+ self._get_num_lookahead_slots(is_prefill, enable_chunking))
728
+ if alloc_status == AllocStatus.LATER:
729
+ break
730
+ elif alloc_status == AllocStatus.NEVER:
731
+ logger.warning(
732
+ "Failing the request %s because there's not enough kv "
733
+ "cache blocks to run the entire sequence.",
734
+ seq_group.request_id)
735
+ for seq in seq_group.get_seqs():
736
+ seq.status = SequenceStatus.FINISHED_IGNORED
737
+ infeasible_seq_groups.append(seq_group)
738
+ swapped_queue.popleft()
739
+ continue
740
+
741
+ lora_int_id = 0
742
+ if self.lora_enabled:
743
+ lora_int_id = seq_group.lora_int_id
744
+ assert curr_loras is not None
745
+ assert self.lora_config is not None
746
+ if (lora_int_id > 0 and (lora_int_id not in curr_loras)
747
+ and len(curr_loras) >= self.lora_config.max_loras):
748
+ # We don't have a space for another LoRA, so
749
+ # we ignore this request for now.
750
+ leftover_swapped.appendleft(seq_group)
751
+ swapped_queue.popleft()
752
+ continue
753
+
754
+ # The total number of sequences in the RUNNING state should not
755
+ # exceed the maximum number of sequences.
756
+ num_new_seqs = seq_group.get_max_num_running_seqs()
757
+ num_new_tokens_uncached, num_new_tokens_cached = (
758
+ self._get_num_new_uncached_and_cached_tokens(
759
+ seq_group, SequenceStatus.SWAPPED, enable_chunking,
760
+ budget))
761
+
762
+ if num_new_tokens_uncached == 0 or not budget.can_schedule(
763
+ num_new_tokens=num_new_tokens_uncached,
764
+ num_new_seqs=num_new_seqs,
765
+ ):
766
+ break
767
+
768
+ if lora_int_id > 0 and curr_loras is not None:
769
+ curr_loras.add(lora_int_id)
770
+ swapped_queue.popleft()
771
+ self._swap_in(seq_group, blocks_to_swap_in)
772
+ self._append_slots(seq_group, blocks_to_copy, enable_chunking)
773
+ is_prefill = seq_group.is_prefill()
774
+ if is_prefill:
775
+ prefill_seq_groups.append(
776
+ ScheduledSequenceGroup(
777
+ seq_group,
778
+ token_chunk_size=num_new_tokens_uncached +
779
+ num_new_tokens_cached,
780
+ ))
781
+ else:
782
+ decode_seq_groups.append(
783
+ ScheduledSequenceGroup(seq_group, token_chunk_size=1))
784
+ budget.add_num_batched_tokens(
785
+ seq_group.request_id,
786
+ num_batched_tokens=num_new_tokens_uncached,
787
+ num_cached_tokens=num_new_tokens_cached,
788
+ )
789
+ budget.add_num_seqs(seq_group.request_id, num_new_seqs)
790
+
791
+ swapped_queue.extendleft(leftover_swapped)
792
+
793
+ return SchedulerSwappedInOutputs(
794
+ decode_seq_groups=decode_seq_groups,
795
+ prefill_seq_groups=prefill_seq_groups,
796
+ blocks_to_swap_in=blocks_to_swap_in,
797
+ blocks_to_copy=blocks_to_copy,
798
+ num_lookahead_slots=self._get_num_lookahead_slots(
799
+ is_prefill=False, enable_chunking=enable_chunking),
800
+ infeasible_seq_groups=infeasible_seq_groups,
801
+ )
802
+
803
+ def _get_prompt_limit(self, seq_group: SequenceGroup) -> int:
804
+ if self.scheduler_config.chunked_prefill_enabled and \
805
+ not self.scheduler_config.is_multi_step:
806
+ prompt_limit = self.scheduler_config.max_model_len
807
+ else:
808
+ prompt_limit = min(self.scheduler_config.max_model_len,
809
+ self.scheduler_config.max_num_batched_tokens)
810
+
811
+ # Model is fine tuned with long context. Return the fine tuned max_len.
812
+ if (seq_group.lora_request
813
+ and seq_group.lora_request.long_lora_max_len):
814
+ assert prompt_limit <= seq_group.lora_request.long_lora_max_len
815
+ return seq_group.lora_request.long_lora_max_len
816
+ else:
817
+ return prompt_limit
818
+
819
+ def _get_priority(self,
820
+ seq_group: SequenceGroup) -> Tuple[Optional[int], float]:
821
+ """ Get the priority of the sequence group.
822
+ Highest preference to user-defined priority, followed by arrival time.
823
+ Args:
824
+ seq_group: The sequence group input.
825
+ Returns:
826
+ The priority of the sequence group.
827
+ """
828
+ return seq_group.priority, seq_group.arrival_time
829
+
830
+ def _schedule_priority_preemption(
831
+ self,
832
+ budget: SchedulingBudget,
833
+ ) -> int:
834
+ """Sorts waiting and running queue. Also, force preempt requests
835
+ from the running queue if their priority is lower.
836
+ Priority-based preemption is used with the priority policy.
837
+ Args:
838
+ budget: The scheduling budget. The argument is in-place updated
839
+ when any requests are scheduled.
840
+ Returns:
841
+ A count of priority-based preemptions.
842
+ """
843
+
844
+ waiting_queue = self.waiting
845
+
846
+ running_queue = deque(sorted(self.running, key=self._get_priority))
847
+
848
+ blocks_to_swap_out: List[Tuple[int, int]] = []
849
+ force_preemption_count = 0
850
+
851
+ if waiting_queue:
852
+ seq_group = waiting_queue.popleft()
853
+ num_new_seqs = seq_group.get_max_num_running_seqs()
854
+ num_new_tokens_uncached, _ = (
855
+ self._get_num_new_uncached_and_cached_tokens(
856
+ seq_group, SequenceStatus.WAITING, False, budget))
857
+
858
+ #Only preempt if priority inversion exists
859
+ while running_queue and self._get_priority(
860
+ running_queue[-1]) > self._get_priority(seq_group):
861
+ #Only preempt if waiting sequence cannot be allocated
862
+ can_allocate = self.block_manager.can_allocate(seq_group)
863
+ if (num_new_tokens_uncached > 0
864
+ and can_allocate == AllocStatus.OK
865
+ and budget.can_schedule(
866
+ num_new_tokens=num_new_tokens_uncached,
867
+ num_new_seqs=num_new_seqs,
868
+ )):
869
+ break
870
+
871
+ #Adjust budget to remove the victim sequence group
872
+ vseq_group = running_queue.pop()
873
+ num_running_tokens_uncached, _ = (
874
+ self._get_num_new_uncached_and_cached_tokens(
875
+ vseq_group, SequenceStatus.RUNNING, False, budget))
876
+ budget.subtract_num_batched_tokens(
877
+ vseq_group.request_id, num_running_tokens_uncached)
878
+ num_running_seqs = vseq_group.get_max_num_running_seqs()
879
+ budget.subtract_num_seqs(vseq_group.request_id,
880
+ num_running_seqs)
881
+
882
+ #Preempt out the victim sequence group
883
+ self._preempt(vseq_group, blocks_to_swap_out)
884
+ waiting_queue.appendleft(vseq_group)
885
+ force_preemption_count += 1
886
+ #Put the sequence back into the waiting queue
887
+ waiting_queue.appendleft(seq_group)
888
+
889
+ waiting_queue = deque(sorted(waiting_queue, key=self._get_priority))
890
+
891
+ self.waiting = waiting_queue
892
+ self.running = running_queue
893
+ return force_preemption_count
894
+
895
+ def _schedule_prefills(
896
+ self,
897
+ budget: SchedulingBudget,
898
+ curr_loras: Optional[Set[int]],
899
+ enable_chunking: bool = False,
900
+ ) -> SchedulerPrefillOutputs:
901
+ """Schedule sequence groups that are in prefill stage.
902
+
903
+ Note that the current scheduler treats PREEMPTED_FOR_RECOMPUTE
904
+ as a new prefill (that starts from beginning -> most recently generated
905
+ tokens).
906
+
907
+ It schedules waiting requests as long as it fits `budget` and
908
+ curr_loras <= max_lora from the scheduling config. The input arguments
909
+ `budget` and `curr_loras` are updated based on scheduled seq_groups.
910
+
911
+ Args:
912
+ budget: The scheduling budget. The argument is in-place updated
913
+ when any requests are scheduled.
914
+ curr_loras: Currently batched lora request ids. The argument is
915
+ in-place updated when any requests are scheduled.
916
+ enable_chunking: If True, seq group can be chunked and only a
917
+ chunked number of tokens are scheduled if
918
+ `budget.num_batched_tokens` has not enough capacity to schedule
919
+ all tokens.
920
+
921
+ Returns:
922
+ SchedulerPrefillOutputs.
923
+ """
924
+ ignored_seq_groups: List[SequenceGroup] = []
925
+ seq_groups: List[ScheduledSequenceGroup] = []
926
+
927
+ waiting_queue = self.waiting
928
+
929
+ leftover_waiting_sequences: Deque[SequenceGroup] = deque()
930
+ while self._passed_delay(time.time()) and waiting_queue:
931
+ seq_group = waiting_queue[0]
932
+
933
+ waiting_seqs = seq_group.get_seqs(status=SequenceStatus.WAITING)
934
+ assert len(waiting_seqs) == 1, (
935
+ "Waiting sequence group should have only one prompt "
936
+ "sequence.")
937
+ num_new_tokens_uncached, num_new_tokens_cached = (
938
+ self._get_num_new_uncached_and_cached_tokens(
939
+ seq_group, SequenceStatus.WAITING, enable_chunking,
940
+ budget))
941
+ num_new_tokens = num_new_tokens_uncached + num_new_tokens_cached
942
+
943
+ if not enable_chunking:
944
+ num_prompt_tokens = waiting_seqs[0].get_len()
945
+ assert num_new_tokens == num_prompt_tokens
946
+
947
+ prompt_limit = self._get_prompt_limit(seq_group)
948
+ if num_new_tokens > prompt_limit:
949
+ logger.warning(
950
+ "Input prompt (%d tokens) is too long"
951
+ " and exceeds limit of %d", num_new_tokens, prompt_limit)
952
+ for seq in waiting_seqs:
953
+ seq.status = SequenceStatus.FINISHED_IGNORED
954
+ ignored_seq_groups.append(seq_group)
955
+ waiting_queue.popleft()
956
+ continue
957
+
958
+ num_lookahead_slots: int = 0
959
+ if self.scheduler_config.is_multi_step and enable_chunking:
960
+ num_lookahead_slots = self._get_num_lookahead_slots(
961
+ True, enable_chunking)
962
+
963
+ # If the sequence group cannot be allocated, stop.
964
+ can_allocate = self.block_manager.can_allocate(
965
+ seq_group, num_lookahead_slots=num_lookahead_slots)
966
+ if can_allocate == AllocStatus.LATER:
967
+ break
968
+ elif can_allocate == AllocStatus.NEVER:
969
+ logger.warning(
970
+ "Input prompt (%d tokens) + lookahead slots (%d) is "
971
+ "too long and exceeds the capacity of block_manager",
972
+ num_new_tokens, num_lookahead_slots)
973
+ for seq in waiting_seqs:
974
+ seq.status = SequenceStatus.FINISHED_IGNORED
975
+ ignored_seq_groups.append(seq_group)
976
+ waiting_queue.popleft()
977
+ continue
978
+
979
+ lora_int_id = 0
980
+ if self.lora_enabled:
981
+ lora_int_id = seq_group.lora_int_id
982
+ assert curr_loras is not None
983
+ assert self.lora_config is not None
984
+ if (self.lora_enabled and lora_int_id > 0
985
+ and lora_int_id not in curr_loras
986
+ and len(curr_loras) >= self.lora_config.max_loras):
987
+ # We don't have a space for another LoRA, so
988
+ # we ignore this request for now.
989
+ leftover_waiting_sequences.appendleft(seq_group)
990
+ waiting_queue.popleft()
991
+ continue
992
+
993
+ if (budget.num_batched_tokens
994
+ >= self.scheduler_config.max_num_batched_tokens):
995
+ # We've reached the budget limit - since there might be
996
+ # continuous prefills in the running queue, we should break
997
+ # to avoid scheduling any new prefills.
998
+ break
999
+
1000
+ num_new_seqs = seq_group.get_max_num_running_seqs()
1001
+ if num_new_tokens_uncached == 0 or not budget.can_schedule(
1002
+ num_new_tokens=num_new_tokens_uncached,
1003
+ num_new_seqs=num_new_seqs,
1004
+ ):
1005
+ break
1006
+
1007
+ # Can schedule this request.
1008
+ if curr_loras is not None and lora_int_id > 0:
1009
+ curr_loras.add(lora_int_id)
1010
+ waiting_queue.popleft()
1011
+ self._allocate_and_set_running(seq_group)
1012
+
1013
+ if enable_chunking and self.scheduler_config.is_multi_step:
1014
+ blocks_to_copy: List[Tuple[int, int]] = []
1015
+ # init_multi_step_from_lookahead_slots happens in append_slots
1016
+ self._append_slots(seq_group, blocks_to_copy, enable_chunking)
1017
+ # This assert will trip when a copy-on-write happens. This is
1018
+ # not a concern as the very first sequence-group block
1019
+ # allocation happens above. Still, we have the assert to
1020
+ # catch any edge-cases.
1021
+ assert not blocks_to_copy
1022
+ else:
1023
+ seq_group.init_multi_step_from_lookahead_slots(
1024
+ num_lookahead_slots,
1025
+ num_scheduler_steps=self.scheduler_config.
1026
+ num_scheduler_steps,
1027
+ is_multi_step=self.scheduler_config.is_multi_step,
1028
+ enable_chunking=enable_chunking)
1029
+
1030
+ seq_groups.append(
1031
+ ScheduledSequenceGroup(seq_group=seq_group,
1032
+ token_chunk_size=num_new_tokens))
1033
+ budget.add_num_batched_tokens(
1034
+ seq_group.request_id,
1035
+ num_batched_tokens=num_new_tokens_uncached,
1036
+ num_cached_tokens=num_new_tokens_cached,
1037
+ )
1038
+ budget.add_num_seqs(seq_group.request_id, num_new_seqs)
1039
+
1040
+ # Queue requests that couldn't be scheduled.
1041
+ waiting_queue.extendleft(leftover_waiting_sequences)
1042
+ if len(seq_groups) > 0:
1043
+ self.prev_prompt = True
1044
+
1045
+ return SchedulerPrefillOutputs(
1046
+ seq_groups=seq_groups,
1047
+ ignored_seq_groups=ignored_seq_groups,
1048
+ num_lookahead_slots=self._get_num_lookahead_slots(
1049
+ is_prefill=True, enable_chunking=enable_chunking))
1050
+
1051
+ def _schedule_default(self) -> SchedulerOutputs:
1052
+ """Schedule queued requests.
1053
+
1054
+ The current policy is designed to optimize the throughput. First,
1055
+ it batches as many prefill requests as possible. And it schedules
1056
+ decodes. If there's a pressure on GPU memory, decode requests can
1057
+ be swapped or preempted.
1058
+ """
1059
+ # Include running requests to the budget.
1060
+ budget = SchedulingBudget(
1061
+ token_budget=self.scheduler_config.max_num_batched_tokens,
1062
+ max_num_seqs=self.scheduler_config.max_num_seqs,
1063
+ )
1064
+ # Make sure we include num running seqs before scheduling prefill,
1065
+ # so that we don't schedule beyond max_num_seqs for prefill.
1066
+ for seq_group in self.running:
1067
+ budget.add_num_seqs(seq_group.request_id,
1068
+ seq_group.get_max_num_running_seqs())
1069
+ curr_loras = set(
1070
+ seq_group.lora_int_id for seq_group in self.running
1071
+ if seq_group.lora_int_id > 0) if self.lora_enabled else None
1072
+
1073
+ prefills = SchedulerPrefillOutputs.create_empty()
1074
+ running_scheduled = SchedulerRunningOutputs.create_empty()
1075
+ swapped_in = SchedulerSwappedInOutputs.create_empty()
1076
+
1077
+ # If any requests are swapped, prioritized swapped requests.
1078
+ if not self.swapped:
1079
+ prefills = self._schedule_prefills(budget,
1080
+ curr_loras,
1081
+ enable_chunking=False)
1082
+
1083
+ if len(prefills.seq_groups
1084
+ ) == 0 and self.scheduler_config.policy == "priority":
1085
+ self._schedule_priority_preemption(budget)
1086
+
1087
+ # Don't schedule decodes if prefills are scheduled.
1088
+ # NOTE: If `_schedule_prefills` doesn't enable chunking, self.running
1089
+ # only contains decode requests, not chunked prefills.
1090
+ if len(prefills.seq_groups) == 0:
1091
+ running_scheduled = self._schedule_running(budget,
1092
+ curr_loras,
1093
+ enable_chunking=False)
1094
+
1095
+ # If any sequence group is preempted, do not swap in any sequence
1096
+ # group. because it means there's no slot for new running requests.
1097
+ if len(running_scheduled.preempted) + len(
1098
+ running_scheduled.swapped_out) == 0:
1099
+ swapped_in = self._schedule_swapped(budget, curr_loras)
1100
+
1101
+ assert (budget.num_batched_tokens
1102
+ <= self.scheduler_config.max_num_batched_tokens)
1103
+ assert budget.num_curr_seqs <= self.scheduler_config.max_num_seqs
1104
+
1105
+ # Update waiting requests.
1106
+ self.waiting.extendleft(running_scheduled.preempted)
1107
+ # Update new running requests.
1108
+ if len(prefills.seq_groups) > 0:
1109
+ self.running.extend([s.seq_group for s in prefills.seq_groups])
1110
+
1111
+ self.running.extend(running_scheduled.decode_seq_groups_list)
1112
+
1113
+ if len(swapped_in.decode_seq_groups) > 0:
1114
+ self.running.extend(
1115
+ [s.seq_group for s in swapped_in.decode_seq_groups])
1116
+
1117
+ # Update swapped requests.
1118
+ self.swapped.extend(running_scheduled.swapped_out)
1119
+ preempted = (len(running_scheduled.preempted) +
1120
+ len(running_scheduled.swapped_out))
1121
+
1122
+ # There should be no prefill from running queue because this policy
1123
+ # doesn't allow chunked prefills.
1124
+ assert len(running_scheduled.prefill_seq_groups) == 0
1125
+ assert len(swapped_in.prefill_seq_groups) == 0
1126
+
1127
+ # Merge lists
1128
+ num_prefill_groups = len(prefills.seq_groups)
1129
+ if num_prefill_groups > 0:
1130
+ scheduled_seq_groups = prefills.seq_groups
1131
+ scheduled_seq_groups.extend(running_scheduled.decode_seq_groups)
1132
+ else:
1133
+ scheduled_seq_groups = running_scheduled.decode_seq_groups
1134
+ scheduled_seq_groups.extend(swapped_in.decode_seq_groups)
1135
+
1136
+ blocks_to_copy = running_scheduled.blocks_to_copy
1137
+ blocks_to_copy.extend(swapped_in.blocks_to_copy)
1138
+
1139
+ ignored_seq_groups = prefills.ignored_seq_groups
1140
+ ignored_seq_groups.extend(swapped_in.infeasible_seq_groups)
1141
+
1142
+ return SchedulerOutputs(
1143
+ scheduled_seq_groups=scheduled_seq_groups,
1144
+ num_prefill_groups=num_prefill_groups,
1145
+ num_batched_tokens=budget.num_batched_tokens +
1146
+ budget.num_cached_tokens,
1147
+ blocks_to_swap_in=swapped_in.blocks_to_swap_in,
1148
+ blocks_to_swap_out=running_scheduled.blocks_to_swap_out,
1149
+ blocks_to_copy=blocks_to_copy,
1150
+ ignored_seq_groups=ignored_seq_groups,
1151
+ num_lookahead_slots=running_scheduled.num_lookahead_slots,
1152
+ running_queue_size=len(self.running),
1153
+ preempted=preempted,
1154
+ )
1155
+
1156
+ def _schedule_chunked_prefill(self) -> SchedulerOutputs:
1157
+ """Schedule queued requests.
1158
+
1159
+ Chunked prefill allows to chunk prefill requests, batch them together
1160
+ with decode requests. This policy 1. schedule as many decoding requests
1161
+ as possible. 2. schedule chunked prefill requests that are not
1162
+ finished. 3. schedule swapped request. 4. schedule new prefill
1163
+ requests.
1164
+
1165
+ The policy can sustain the high GPU utilization because it can put
1166
+ prefill and decodes requests to the same batch, while it improves
1167
+ inter token latency because decodes requests don't need to be blocked
1168
+ by prefill requests.
1169
+ """
1170
+ budget = SchedulingBudget(
1171
+ token_budget=self.scheduler_config.max_num_batched_tokens,
1172
+ max_num_seqs=self.scheduler_config.max_num_seqs,
1173
+ )
1174
+ curr_loras: Set[int] = set()
1175
+
1176
+ prefills = SchedulerPrefillOutputs.create_empty()
1177
+ swapped_in = SchedulerSwappedInOutputs.create_empty()
1178
+
1179
+ # Decoding should be always scheduled first by fcfs.
1180
+ running_scheduled = self._schedule_running(budget,
1181
+ curr_loras,
1182
+ enable_chunking=True)
1183
+
1184
+ # Schedule swapped out requests.
1185
+ # If preemption happens, it means we don't have space for swap-in.
1186
+ if len(running_scheduled.preempted) + len(
1187
+ running_scheduled.swapped_out) == 0:
1188
+ swapped_in = self._schedule_swapped(budget, curr_loras)
1189
+
1190
+ prefills = self._schedule_prefills(budget,
1191
+ curr_loras,
1192
+ enable_chunking=True)
1193
+
1194
+ assert (budget.num_batched_tokens
1195
+ <= self.scheduler_config.max_num_batched_tokens)
1196
+ assert budget.num_curr_seqs <= self.scheduler_config.max_num_seqs
1197
+
1198
+ # Update waiting requests.
1199
+ self.waiting.extendleft(running_scheduled.preempted)
1200
+
1201
+ # Update new running requests.
1202
+ # By default, vLLM scheduler prioritizes prefills.
1203
+ # Once chunked prefill is enabled,
1204
+ # the policy is changed to prioritize decode requests.
1205
+ self.running.extend(
1206
+ [s.seq_group for s in swapped_in.decode_seq_groups])
1207
+ self.running.extend(
1208
+ [s.seq_group for s in swapped_in.prefill_seq_groups])
1209
+ self.running.extend(
1210
+ [s.seq_group for s in running_scheduled.decode_seq_groups])
1211
+ self.running.extend(
1212
+ [s.seq_group for s in running_scheduled.prefill_seq_groups])
1213
+ self.running.extend([s.seq_group for s in prefills.seq_groups])
1214
+
1215
+ # Update swapped requests.
1216
+ self.swapped.extend(running_scheduled.swapped_out)
1217
+ # Put prefills first due to Attention backend ordering assumption.
1218
+ scheduled_seq_groups = (prefills.seq_groups +
1219
+ running_scheduled.prefill_seq_groups +
1220
+ swapped_in.prefill_seq_groups +
1221
+ running_scheduled.decode_seq_groups +
1222
+ swapped_in.decode_seq_groups)
1223
+ num_prefill_groups = (len(prefills.seq_groups) +
1224
+ len(swapped_in.prefill_seq_groups) +
1225
+ len(running_scheduled.prefill_seq_groups))
1226
+ # If all prompts, then we set num_lookahead_slots to 0
1227
+ # this allows us to go through the `no_spec` path in
1228
+ # `spec_decode_worker.py`
1229
+ all_prefills = (len(scheduled_seq_groups) == num_prefill_groups)
1230
+ num_lookahead_slots = (0 if
1231
+ (all_prefills
1232
+ and not self.scheduler_config.is_multi_step)
1233
+ else running_scheduled.num_lookahead_slots)
1234
+ return SchedulerOutputs(
1235
+ scheduled_seq_groups=scheduled_seq_groups,
1236
+ num_prefill_groups=num_prefill_groups,
1237
+ num_batched_tokens=budget.num_batched_tokens +
1238
+ budget.num_cached_tokens,
1239
+ blocks_to_swap_in=swapped_in.blocks_to_swap_in,
1240
+ blocks_to_swap_out=running_scheduled.blocks_to_swap_out,
1241
+ blocks_to_copy=running_scheduled.blocks_to_copy +
1242
+ swapped_in.blocks_to_copy,
1243
+ ignored_seq_groups=prefills.ignored_seq_groups +
1244
+ swapped_in.infeasible_seq_groups,
1245
+ num_lookahead_slots=num_lookahead_slots,
1246
+ running_queue_size=len(self.running),
1247
+ preempted=(len(running_scheduled.preempted) +
1248
+ len(running_scheduled.swapped_out)),
1249
+ )
1250
+
1251
+ def _schedule(self) -> SchedulerOutputs:
1252
+ """Schedule queued requests."""
1253
+ if self.scheduler_config.chunked_prefill_enabled:
1254
+ return self._schedule_chunked_prefill()
1255
+ else:
1256
+ return self._schedule_default()
1257
+
1258
+ def _can_append_slots(self, seq_group: SequenceGroup,
1259
+ enable_chunking: bool) -> bool:
1260
+ """Determine whether or not we have enough space in the KV cache to
1261
+ continue generation of the sequence group.
1262
+ """
1263
+ # It is True only for testing case to trigger artificial preemption.
1264
+ if (self.enable_artificial_preemption
1265
+ and random.uniform(0, 1) < ARTIFICIAL_PREEMPTION_PROB
1266
+ and self.artificial_preempt_cnt > 0):
1267
+ self.artificial_preempt_cnt -= 1
1268
+ return False
1269
+
1270
+ is_prefill = seq_group.is_prefill()
1271
+ num_lookahead_slots = self._get_num_lookahead_slots(
1272
+ is_prefill, enable_chunking)
1273
+
1274
+ if is_prefill and num_lookahead_slots > 0:
1275
+ # Appending prefill slots only happens multi-step and
1276
+ # chunked-prefill are enabled together.
1277
+ assert self.scheduler_config.is_multi_step and enable_chunking
1278
+
1279
+ return self.block_manager.can_append_slots(
1280
+ seq_group=seq_group, num_lookahead_slots=num_lookahead_slots)
1281
+
1282
+ def _allow_async_output_proc(self, seq_group: SequenceGroup) -> bool:
1283
+ # async_output_proc is allowed only when we have a single sequence
1284
+ # in the sequence group
1285
+ no_single_seq = seq_group.sampling_params is None or (
1286
+ seq_group.sampling_params.n == 1)
1287
+ return no_single_seq
1288
+
1289
+ def schedule(
1290
+ self
1291
+ ) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs, bool]:
1292
+ # Schedule sequence groups.
1293
+ # This function call changes the internal states of the scheduler
1294
+ # such as self.running, self.swapped, and self.waiting.
1295
+ scheduler_start_time = time.perf_counter()
1296
+
1297
+ scheduler_outputs: SchedulerOutputs = self._schedule()
1298
+ now = time.time()
1299
+
1300
+ if not self.cache_config.enable_prefix_caching:
1301
+ common_computed_block_nums = []
1302
+
1303
+ allow_async_output_proc: bool = self.use_async_output_proc
1304
+
1305
+ # Create input data structures.
1306
+ seq_group_metadata_list: List[SequenceGroupMetadata] = []
1307
+ for i, scheduled_seq_group in enumerate(
1308
+ scheduler_outputs.scheduled_seq_groups):
1309
+ seq_group = scheduled_seq_group.seq_group
1310
+ token_chunk_size = scheduled_seq_group.token_chunk_size
1311
+ seq_group.maybe_set_first_scheduled_time(now)
1312
+
1313
+ seq_group_metadata = self._seq_group_metadata_cache[
1314
+ self.cache_id].get_object()
1315
+ seq_group_metadata.seq_data.clear()
1316
+ seq_group_metadata.block_tables.clear()
1317
+
1318
+ # seq_id -> SequenceData
1319
+ seq_data: Dict[int, SequenceData] = {}
1320
+ # seq_id -> physical block numbers
1321
+ block_tables: Dict[int, List[int]] = {}
1322
+
1323
+ if seq_group.is_encoder_decoder():
1324
+ # Encoder associated with SequenceGroup
1325
+ encoder_seq = seq_group.get_encoder_seq()
1326
+ assert encoder_seq is not None
1327
+ encoder_seq_data = encoder_seq.data
1328
+ # Block table for cross-attention
1329
+ # Also managed at SequenceGroup level
1330
+ cross_block_table = self.block_manager.get_cross_block_table(
1331
+ seq_group)
1332
+ else:
1333
+ encoder_seq_data = None
1334
+ cross_block_table = None
1335
+
1336
+ for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
1337
+ seq_id = seq.seq_id
1338
+ seq_data[seq_id] = seq.data
1339
+ block_tables[seq_id] = self.block_manager.get_block_table(seq)
1340
+ self.block_manager.access_all_blocks_in_seq(seq, now)
1341
+
1342
+ if self.cache_config.enable_prefix_caching:
1343
+ common_computed_block_nums = (
1344
+ self.block_manager.get_common_computed_block_ids(
1345
+ seq_group.get_seqs(status=SequenceStatus.RUNNING)))
1346
+
1347
+ do_sample = True
1348
+ is_prompt = seq_group.is_prefill()
1349
+ # We should send the metadata to workers when the first prefill
1350
+ # is sent. Subsequent requests could be chunked prefill or decode.
1351
+ is_first_prefill = False
1352
+ if is_prompt:
1353
+ seqs = seq_group.get_seqs()
1354
+ # Prefill has only 1 sequence.
1355
+ assert len(seqs) == 1
1356
+ num_computed_tokens = seqs[0].data.get_num_computed_tokens()
1357
+ is_first_prefill = num_computed_tokens == 0
1358
+ # In the next iteration, all prompt tokens are not computed.
1359
+ # It means the prefill is chunked, and we don't need sampling.
1360
+ # NOTE: We use get_len instead of get_prompt_len because when
1361
+ # a sequence is preempted, prefill includes previous generated
1362
+ # output tokens.
1363
+ if (token_chunk_size + num_computed_tokens
1364
+ < seqs[0].data.get_len()):
1365
+ do_sample = False
1366
+
1367
+ # It assumes the scheduled_seq_groups is ordered by
1368
+ # prefill < decoding.
1369
+ if is_first_prefill or not self.scheduler_config.send_delta_data:
1370
+ seq_group_metadata = SequenceGroupMetadata(
1371
+ request_id=seq_group.request_id,
1372
+ is_prompt=is_prompt,
1373
+ seq_data=seq_data,
1374
+ sampling_params=seq_group.sampling_params,
1375
+ block_tables=block_tables,
1376
+ do_sample=do_sample,
1377
+ pooling_params=seq_group.pooling_params,
1378
+ token_chunk_size=token_chunk_size,
1379
+ lora_request=seq_group.lora_request,
1380
+ computed_block_nums=common_computed_block_nums,
1381
+ encoder_seq_data=encoder_seq_data,
1382
+ cross_block_table=cross_block_table,
1383
+ state=seq_group.state,
1384
+ token_type_ids=seq_group.token_type_ids,
1385
+ # `multi_modal_data` will only be present for the 1st comm
1386
+ # between engine and worker.
1387
+ # the subsequent comms can still use delta, but
1388
+ # `multi_modal_data` will be None.
1389
+ multi_modal_data=seq_group.multi_modal_data
1390
+ if scheduler_outputs.num_prefill_groups > 0 else None,
1391
+ multi_modal_placeholders=seq_group.multi_modal_placeholders
1392
+ if scheduler_outputs.num_prefill_groups > 0 else None,
1393
+ mm_processor_kwargs=seq_group.mm_processor_kwargs,
1394
+ prompt_adapter_request=seq_group.prompt_adapter_request,
1395
+ )
1396
+ else:
1397
+ # When SPMD mode is enabled, we only send delta data except for
1398
+ # the first request to reduce serialization cost.
1399
+ seq_data_delta = {}
1400
+ for id, data in seq_data.items():
1401
+ seq_data_delta[id] = data.get_delta_and_reset()
1402
+ seq_group_metadata = SequenceGroupMetadataDelta(
1403
+ seq_data_delta,
1404
+ seq_group.request_id,
1405
+ block_tables,
1406
+ is_prompt,
1407
+ do_sample=do_sample,
1408
+ token_chunk_size=token_chunk_size,
1409
+ computed_block_nums=common_computed_block_nums,
1410
+ )
1411
+ seq_group_metadata_list.append(seq_group_metadata)
1412
+
1413
+ if allow_async_output_proc:
1414
+ allow_async_output_proc = self._allow_async_output_proc(
1415
+ seq_group)
1416
+
1417
+ # Now that the batch has been created, we can assume all blocks in the
1418
+ # batch will have been computed before the next scheduling invocation.
1419
+ # This is because the engine assumes that a failure in model execution
1420
+ # will crash the vLLM instance / will not retry.
1421
+ for scheduled_seq_group in scheduler_outputs.scheduled_seq_groups:
1422
+ self.block_manager.mark_blocks_as_computed(
1423
+ scheduled_seq_group.seq_group,
1424
+ scheduled_seq_group.token_chunk_size)
1425
+
1426
+ self._seq_group_metadata_cache[self.next_cache_id].reset()
1427
+
1428
+ scheduler_time = time.perf_counter() - scheduler_start_time
1429
+ # Add this to scheduler time to all the sequences that are currently
1430
+ # running. This will help estimate if the scheduler is a significant
1431
+ # component in the e2e latency.
1432
+ for seq_group in self.running:
1433
+ if seq_group is not None and seq_group.metrics is not None:
1434
+ if seq_group.metrics.scheduler_time is not None:
1435
+ seq_group.metrics.scheduler_time += scheduler_time
1436
+ else:
1437
+ seq_group.metrics.scheduler_time = scheduler_time
1438
+
1439
+ # Move to next cache (if exists)
1440
+ self.cache_id = self.next_cache_id
1441
+
1442
+ # Return results
1443
+ return (seq_group_metadata_list, scheduler_outputs,
1444
+ allow_async_output_proc)
1445
+
1446
+ def fork_seq(self, parent_seq: Sequence, child_seq: Sequence) -> None:
1447
+ self.block_manager.fork(parent_seq, child_seq)
1448
+
1449
+ def free_seq(self, seq: Sequence) -> None:
1450
+ """Free a sequence from a block table."""
1451
+ self.block_manager.free(seq)
1452
+
1453
+ def _free_finished_seqs(self, seq_group: SequenceGroup) -> None:
1454
+ """Free finished seqs in a sequence group."""
1455
+ for seq in seq_group.get_seqs():
1456
+ if seq.is_finished():
1457
+ self.free_seq(seq)
1458
+
1459
+ def _free_finished_seq_group(self, seq_group: SequenceGroup) -> None:
1460
+ if seq_group.is_finished():
1461
+ # Free cross-attention block table, if it exists
1462
+ self._free_seq_group_cross_attn_blocks(seq_group)
1463
+
1464
+ # Add the finished requests to the finished requests list.
1465
+ # This list will be used to update the Mamba cache in the
1466
+ # next step.
1467
+ self._finished_requests_ids.append(seq_group.request_id)
1468
+
1469
+ # Free finished seqs
1470
+ self._free_finished_seqs(seq_group)
1471
+
1472
+ def free_finished_seq_groups(self) -> None:
1473
+ remaining: Deque[SequenceGroup] = deque()
1474
+ for seq_group in self.running:
1475
+ self._free_finished_seq_group(seq_group)
1476
+ if not seq_group.is_finished():
1477
+ remaining.append(seq_group)
1478
+
1479
+ self.running = remaining
1480
+
1481
+ # Handle async stopped sequence groups
1482
+ # (ones that reached max model len)
1483
+ if self._async_stopped:
1484
+ for seq_group in self._async_stopped:
1485
+ self._free_seq_group_cross_attn_blocks(seq_group)
1486
+ self._finished_requests_ids.append(seq_group.request_id)
1487
+
1488
+ # Free finished seqs
1489
+ self._free_finished_seqs(seq_group)
1490
+
1491
+ self._async_stopped.clear()
1492
+
1493
+ def _allocate_and_set_running(self, seq_group: SequenceGroup) -> None:
1494
+ self.block_manager.allocate(seq_group)
1495
+ for seq in seq_group.get_seqs(status=SequenceStatus.WAITING):
1496
+ seq.status = SequenceStatus.RUNNING
1497
+
1498
+ def _append_slots(self,
1499
+ seq_group: SequenceGroup,
1500
+ blocks_to_copy: List[Tuple[int, int]],
1501
+ enable_chunking: bool = False) -> None:
1502
+ """Appends new slots to the sequences in the given sequence group.
1503
+
1504
+ Args:
1505
+ seq_group (SequenceGroup): The sequence group containing the
1506
+ sequences to append slots to.
1507
+ blocks_to_copy (List[Tuple[int, int]]): A list of tuple of two
1508
+ ints, the first int is the source block index, and the second
1509
+ int is the destination block index. This list is updated with
1510
+ the new source and destination block indices for the appended
1511
+ slots.
1512
+ enable_chunking (bool): True if chunked prefill is enabled.
1513
+ """
1514
+ is_prefill: bool = seq_group.is_prefill()
1515
+ num_lookahead_slots: int = self._get_num_lookahead_slots(
1516
+ is_prefill, enable_chunking)
1517
+
1518
+ seq_group.init_multi_step_from_lookahead_slots(
1519
+ num_lookahead_slots,
1520
+ num_scheduler_steps=self.scheduler_config.num_scheduler_steps,
1521
+ is_multi_step=self.scheduler_config.is_multi_step,
1522
+ enable_chunking=enable_chunking)
1523
+
1524
+ seq_status: Optional[SequenceStatus] = SequenceStatus.RUNNING
1525
+ if self.scheduler_config.is_multi_step and enable_chunking:
1526
+ # In multi-step chunked-prefill any sequence type can have
1527
+ # slots appended.
1528
+ seq_status = None
1529
+
1530
+ for seq in seq_group.get_seqs(status=seq_status):
1531
+ cows = self.block_manager.append_slots(seq, num_lookahead_slots)
1532
+ if len(cows) > 0:
1533
+ blocks_to_copy.extend(cows)
1534
+
1535
+ def _preempt(self, seq_group: SequenceGroup,
1536
+ blocks_to_swap_out: List[Tuple[int, int]]) -> PreemptionMode:
1537
+ # If preemption mode is not specified, we determine the mode as follows:
1538
+ # We use recomputation by default since it incurs lower overhead than
1539
+ # swapping. However, when the sequence group has multiple sequences
1540
+ # (e.g., beam search), recomputation is not currently supported. In
1541
+ # such a case, we use swapping instead.
1542
+ # FIXME(woosuk): This makes our scheduling policy a bit bizarre.
1543
+ # As swapped sequences are prioritized over waiting sequences,
1544
+ # sequence groups with multiple sequences are implicitly prioritized
1545
+ # over sequence groups with a single sequence.
1546
+ # TODO(woosuk): Support recomputation for sequence groups with multiple
1547
+ # sequences. This may require a more sophisticated CUDA kernel.
1548
+ if self.user_specified_preemption_mode is None:
1549
+ if seq_group.get_max_num_running_seqs() == 1:
1550
+ preemption_mode = PreemptionMode.RECOMPUTE
1551
+ else:
1552
+ preemption_mode = PreemptionMode.SWAP
1553
+
1554
+ elif self.user_specified_preemption_mode == "swap":
1555
+ preemption_mode = PreemptionMode.SWAP
1556
+ else:
1557
+ preemption_mode = PreemptionMode.RECOMPUTE
1558
+
1559
+ if self.num_cumulative_preemption % 50 == 0:
1560
+ logger.warning(
1561
+ "Sequence group %s is preempted by %s mode because there is "
1562
+ "not enough KV cache space. This can affect the end-to-end "
1563
+ "performance. Increase gpu_memory_utilization or "
1564
+ "tensor_parallel_size to provide more KV cache memory. "
1565
+ "total_num_cumulative_preemption=%d", seq_group.request_id,
1566
+ preemption_mode, self.num_cumulative_preemption + 1)
1567
+ self.num_cumulative_preemption += 1
1568
+
1569
+ if preemption_mode == PreemptionMode.RECOMPUTE:
1570
+ self._preempt_by_recompute(seq_group)
1571
+ elif preemption_mode == PreemptionMode.SWAP:
1572
+ self._preempt_by_swap(seq_group, blocks_to_swap_out)
1573
+ else:
1574
+ raise AssertionError("Invalid preemption mode.")
1575
+ return preemption_mode
1576
+
1577
+ def _preempt_by_recompute(
1578
+ self,
1579
+ seq_group: SequenceGroup,
1580
+ ) -> None:
1581
+ seqs = seq_group.get_seqs(status=SequenceStatus.RUNNING)
1582
+ assert len(seqs) == 1
1583
+ for seq in seqs:
1584
+ seq.status = SequenceStatus.WAITING
1585
+ self.free_seq(seq)
1586
+ seq.reset_state_for_recompute()
1587
+ self._free_seq_group_cross_attn_blocks(seq_group)
1588
+
1589
+ def _preempt_by_swap(
1590
+ self,
1591
+ seq_group: SequenceGroup,
1592
+ blocks_to_swap_out: List[Tuple[int, int]],
1593
+ ) -> None:
1594
+ self._swap_out(seq_group, blocks_to_swap_out)
1595
+
1596
+ def _swap_in(
1597
+ self,
1598
+ seq_group: SequenceGroup,
1599
+ blocks_to_swap_in: List[Tuple[int, int]],
1600
+ ) -> None:
1601
+ mapping = self.block_manager.swap_in(seq_group)
1602
+ blocks_to_swap_in.extend(mapping)
1603
+ for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED):
1604
+ seq.status = SequenceStatus.RUNNING
1605
+
1606
+ def _swap_out(
1607
+ self,
1608
+ seq_group: SequenceGroup,
1609
+ blocks_to_swap_out: List[Tuple[int, int]],
1610
+ ) -> None:
1611
+ if not self.block_manager.can_swap_out(seq_group):
1612
+ # FIXME(woosuk): Abort the sequence group instead of aborting the
1613
+ # entire engine.
1614
+ raise RuntimeError(
1615
+ "Aborted due to the lack of CPU swap space. Please increase "
1616
+ "the swap space to avoid this error.")
1617
+ mapping = self.block_manager.swap_out(seq_group)
1618
+ blocks_to_swap_out.extend(mapping)
1619
+ for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
1620
+ seq.status = SequenceStatus.SWAPPED
1621
+
1622
+ def _passed_delay(self, now: float) -> bool:
1623
+ if self.prev_prompt:
1624
+ self.last_prompt_latency = now - self.prev_time
1625
+ self.prev_time, self.prev_prompt = now, False
1626
+ # Delay scheduling prompts to let waiting queue fill up
1627
+ if self.scheduler_config.delay_factor > 0 and self.waiting:
1628
+ earliest_arrival_time = min(
1629
+ [e.metrics.arrival_time for e in self.waiting])
1630
+ passed_delay = ((now - earliest_arrival_time)
1631
+ > (self.scheduler_config.delay_factor *
1632
+ self.last_prompt_latency) or not self.running)
1633
+ else:
1634
+ passed_delay = True
1635
+ return passed_delay
1636
+
1637
+ def _get_num_lookahead_slots(self, is_prefill: bool,
1638
+ enable_chunking: bool) -> int:
1639
+ """The number of slots to allocate per sequence per step, beyond known
1640
+ token ids. Speculative decoding uses these slots to store KV activations
1641
+ of tokens which may or may not be accepted.
1642
+
1643
+ Speculative decoding does not yet support prefill, so we do not perform
1644
+ lookahead allocation for prefill.
1645
+
1646
+ When chunking is enabled with multi-step, we allocate lookahead slots
1647
+ for the prefills for when the prefills turn into decodes in the first
1648
+ step.
1649
+ """
1650
+ if is_prefill:
1651
+ if self.scheduler_config.is_multi_step and enable_chunking:
1652
+ # num_lookahead_slots was introduced in the context of decodes,
1653
+ # in Speculative Decoding.
1654
+ # When the num_scheduler_steps is 8, say, then the
1655
+ # num_lookahead_slots is 7. Meaning, we are doing a 1-step of
1656
+ # decode anyways and we wish to do 7 more.
1657
+ #
1658
+ # "lookaheads" for prefills, is introduced in support for
1659
+ # Chunked-Prefill in Multi-Step.
1660
+ return self.scheduler_config.num_lookahead_slots + 1
1661
+ else:
1662
+ return 0
1663
+
1664
+ return self.scheduler_config.num_lookahead_slots
1665
+
1666
+ def _get_num_new_uncached_and_cached_tokens(
1667
+ self,
1668
+ seq_group: SequenceGroup,
1669
+ status: SequenceStatus,
1670
+ enable_chunking: bool,
1671
+ budget: SchedulingBudget,
1672
+ ) -> Tuple[int, int]:
1673
+ """
1674
+ Returns the number of new uncached and cached tokens to schedule for a
1675
+ given sequence group that's in a given `status`.
1676
+
1677
+ The API could chunk the number of tokens to compute based on `budget`
1678
+ if `enable_chunking` is True. If a sequence group has multiple
1679
+ sequences (e.g., running beam search), it means it is in decoding
1680
+ phase, so chunking doesn't happen.
1681
+
1682
+ Returns (0, 0) if the new token cannot be computed due to token budget.
1683
+
1684
+ The cached tokens's blocks are already computed, and the attention
1685
+ backend will reuse the cached blocks rather than recomputing them. So
1686
+ the scheduler could schedule these cached tokens "for free".
1687
+
1688
+ Args:
1689
+ seq_group: The sequence group to get the number of new tokens to
1690
+ schedule.
1691
+ status: The status of the sequences to get the number of new tokens
1692
+ to schedule.
1693
+ enable_chunking: Whether to chunk the number of tokens to compute.
1694
+ budget: The budget to chunk the number of tokens to compute.
1695
+
1696
+
1697
+ Returns:
1698
+ A tuple of two ints. The first int is the number of new uncached
1699
+ tokens to schedule. The second int is the number of cached tokens.
1700
+ If no more new tokens can be scheduled, returns (0, 0).
1701
+ """
1702
+ num_cached_new_tokens = 0
1703
+ num_uncached_new_tokens = 0
1704
+
1705
+ seqs = seq_group.get_seqs(status=status)
1706
+ # Compute the number of new uncached and cached tokens for
1707
+ # each sequence.
1708
+ for seq in seqs:
1709
+ if not seq.is_prefill():
1710
+ # Decode sequences should always just have 1 uncached token
1711
+ # TODO(rickyx): Actually is this still correct for multi-step?
1712
+ num_uncached_new_tokens += 1
1713
+ continue
1714
+
1715
+ num_computed_tokens_seq = seq.get_num_computed_tokens()
1716
+ all_num_new_tokens_seq = seq.get_len() - num_computed_tokens_seq
1717
+ if not self.cache_config.enable_prefix_caching:
1718
+ # If prefix caching is not enabled, all new tokens are uncached.
1719
+ num_uncached_new_tokens += all_num_new_tokens_seq
1720
+ continue
1721
+
1722
+ # NOTE: the cache token might be currently in a block that's in an
1723
+ # evictor meaning that it's not yet allocated. However, we don't
1724
+ # exclude such tokens in the cache count because it will be
1725
+ # guaranteed to be allocated later if the sequence can be allocated.
1726
+ num_cached_tokens_seq = self.block_manager.get_num_cached_tokens(
1727
+ seq)
1728
+
1729
+ # Sanity check.
1730
+ if num_cached_tokens_seq < num_computed_tokens_seq:
1731
+ # This should only happen with chunked prefill, and
1732
+ # the seq is still in prefill. The `num_cached_tokens_seq`
1733
+ # is the value we calculated on scheduling the first prefill.
1734
+ # For subsequent continuous prefill steps, we cached the
1735
+ # number of cache tokens for the sequence so the cached token
1736
+ # count could be less than the number of computed tokens.
1737
+ # See comments on `ComputedBlocksTracker` for more details.
1738
+ assert (
1739
+ seq.is_prefill() and seq.status == SequenceStatus.RUNNING
1740
+ and self.scheduler_config.chunked_prefill_enabled
1741
+ ), ("Number of cached tokens should not be less than the "
1742
+ "number of computed tokens for a sequence that's still "
1743
+ f"in prefill. But there are {num_cached_tokens_seq} cached "
1744
+ f"tokens and {num_computed_tokens_seq} computed tokens "
1745
+ f"for sequence {seq.seq_id}.")
1746
+
1747
+ num_cached_new_tokens_seq = max(
1748
+ 0, num_cached_tokens_seq - num_computed_tokens_seq)
1749
+ num_uncached_new_tokens_seq = (all_num_new_tokens_seq -
1750
+ num_cached_new_tokens_seq)
1751
+
1752
+ num_uncached_new_tokens += num_uncached_new_tokens_seq
1753
+ num_cached_new_tokens += num_cached_new_tokens_seq
1754
+
1755
+ if num_uncached_new_tokens == 0 and num_cached_new_tokens > 0:
1756
+ # For a fully cached hit sequence, we actually need to recompute the
1757
+ # last token. So we need at least 1 uncached token to schedule.
1758
+ # See ModelRunner._compute_for_prefix_cache_hit for more details.
1759
+ num_uncached_new_tokens = 1
1760
+ num_cached_new_tokens -= 1
1761
+
1762
+ if enable_chunking and len(seqs) == 1:
1763
+ # Chunk if a running request cannot fit in the given budget.
1764
+ # If number of seq > 1, it means it is doing beam search
1765
+ # in a decode phase. Do not chunk.
1766
+ num_uncached_new_tokens = self._chunk_new_tokens_to_schedule(
1767
+ self.scheduler_config,
1768
+ self.cache_config,
1769
+ budget,
1770
+ self._get_prompt_limit(seq_group),
1771
+ num_uncached_new_tokens,
1772
+ )
1773
+
1774
+ return num_uncached_new_tokens, num_cached_new_tokens
1775
+
1776
+ @staticmethod
1777
+ def _chunk_new_tokens_to_schedule(
1778
+ scheduler_config: SchedulerConfig,
1779
+ cache_config: CacheConfig,
1780
+ budget: SchedulingBudget,
1781
+ prompt_limit: int,
1782
+ num_new_tokens: int,
1783
+ ) -> int:
1784
+ """
1785
+ Chunks the number of new tokens to schedule based on the budget when
1786
+ chunked prefill is enabled.
1787
+
1788
+ Args:
1789
+ scheduler_config: The scheduler config.
1790
+ cache_config: The cache config.
1791
+ budget: The budget to chunk the number of tokens to compute.
1792
+ prompt_limit: The maximum number of tokens allowed in a prompt.
1793
+ num_new_tokens: The number of new tokens to schedule.
1794
+
1795
+ Returns:
1796
+ The number of new tokens to schedule after chunking.
1797
+ """
1798
+ remaining_token_budget = budget.remaining_token_budget()
1799
+ if scheduler_config.is_multi_step:
1800
+ # The current multi-step + chunked prefill capability does
1801
+ # not actually support chunking prompts.
1802
+ #
1803
+ # Therefore, `num_new_tokens` is computed in the same fashion
1804
+ # for both multi-step+chunked-prefill &
1805
+ # multi-step+chunked-prefill+APC
1806
+ #
1807
+ # Prompts with more tokens than the current remaining budget
1808
+ # are postponed to future scheduler steps
1809
+ if num_new_tokens > prompt_limit:
1810
+ # If the seq_group is in prompt-stage, pass the
1811
+ # num_new_tokens as-is so the caller can ignore
1812
+ # the sequence.
1813
+ return num_new_tokens
1814
+
1815
+ return (0 if num_new_tokens > remaining_token_budget else
1816
+ num_new_tokens)
1817
+
1818
+ if cache_config.enable_prefix_caching:
1819
+ # Adjust the remaining token budget to be divisible by the block
1820
+ # size when prefix caching is enabled.
1821
+
1822
+ # When prefix caching is enabled, we always allocate
1823
+ # the number of new tokens that is dividable by the block
1824
+ # size to avoid partial block matching.
1825
+ block_size = cache_config.block_size
1826
+ remainder = budget.token_budget % block_size
1827
+ if remainder != 0:
1828
+ raise ValueError("When enabling chunked prefill and "
1829
+ "prefix caching, max_num_batched_tokens "
1830
+ "(chunk size) must be dividable by "
1831
+ "block size, but got chunk_size "
1832
+ f"({budget.token_budget}) % block_size "
1833
+ f"({block_size}) = {remainder}")
1834
+ # Round down to block size.
1835
+ remaining_token_budget = (remaining_token_budget // block_size *
1836
+ block_size)
1837
+
1838
+ num_new_tokens = min(num_new_tokens, remaining_token_budget)
1839
+
1840
+ return num_new_tokens
.venv/lib/python3.11/site-packages/vllm/device_allocator/__pycache__/cumem.cpython-311.pyc ADDED
Binary file (12 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/distributed/device_communicators/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (210 Bytes). View file
 
.venv/lib/python3.11/site-packages/vllm/distributed/device_communicators/__pycache__/cuda_wrapper.cpython-311.pyc ADDED
Binary file (9.62 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/distributed/device_communicators/__pycache__/custom_all_reduce_utils.cpython-311.pyc ADDED
Binary file (12.7 kB). View file