Add files using upload-large-folder tool
Browse files. This view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +1 -0
- .venv/lib/python3.11/site-packages/msgpack/_cmsgpack.cpython-311-x86_64-linux-gnu.so +3 -0
- .venv/lib/python3.11/site-packages/vllm/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/__pycache__/_custom_ops.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/__pycache__/_ipex_ops.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/__pycache__/_version.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/__pycache__/beam_search.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/__pycache__/connections.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/__pycache__/envs.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/__pycache__/forward_context.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/__pycache__/logger.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/__pycache__/logits_process.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/__pycache__/outputs.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/__pycache__/pooling_params.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/__pycache__/sampling_params.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/__pycache__/scalar_type.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/__pycache__/scripts.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/__pycache__/sequence.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/__pycache__/tracing.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/__pycache__/version.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/core/__init__.py +0 -0
- .venv/lib/python3.11/site-packages/vllm/core/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/core/__pycache__/block_manager.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/core/__pycache__/evictor.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/core/__pycache__/interfaces.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/core/__pycache__/placeholder_block_space_manager.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/core/__pycache__/scheduler.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/core/block/__init__.py +0 -0
- .venv/lib/python3.11/site-packages/vllm/core/block/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/core/block/__pycache__/block_table.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/core/block/__pycache__/common.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/core/block/__pycache__/cpu_gpu_block_allocator.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/core/block/__pycache__/interfaces.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/core/block/__pycache__/naive_block.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/core/block/__pycache__/prefix_caching_block.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/core/block/__pycache__/utils.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/core/block/block_table.py +398 -0
- .venv/lib/python3.11/site-packages/vllm/core/block/common.py +370 -0
- .venv/lib/python3.11/site-packages/vllm/core/block/cpu_gpu_block_allocator.py +438 -0
- .venv/lib/python3.11/site-packages/vllm/core/block/interfaces.py +318 -0
- .venv/lib/python3.11/site-packages/vllm/core/block/naive_block.py +465 -0
- .venv/lib/python3.11/site-packages/vllm/core/block/prefix_caching_block.py +1134 -0
- .venv/lib/python3.11/site-packages/vllm/core/block/utils.py +27 -0
- .venv/lib/python3.11/site-packages/vllm/core/interfaces.py +134 -0
- .venv/lib/python3.11/site-packages/vllm/core/placeholder_block_space_manager.py +99 -0
- .venv/lib/python3.11/site-packages/vllm/core/scheduler.py +1840 -0
- .venv/lib/python3.11/site-packages/vllm/device_allocator/__pycache__/cumem.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/distributed/device_communicators/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/distributed/device_communicators/__pycache__/cuda_wrapper.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/vllm/distributed/device_communicators/__pycache__/custom_all_reduce_utils.cpython-311.pyc +0 -0
.gitattributes
CHANGED
|
@@ -200,3 +200,4 @@ tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/_
|
|
| 200 |
.venv/lib/python3.11/site-packages/google/protobuf/__pycache__/descriptor_pb2.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
|
| 201 |
.venv/lib/python3.11/site-packages/jinja2/__pycache__/compiler.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
|
| 202 |
.venv/lib/python3.11/site-packages/grpc/__pycache__/_channel.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 200 |
.venv/lib/python3.11/site-packages/google/protobuf/__pycache__/descriptor_pb2.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
|
| 201 |
.venv/lib/python3.11/site-packages/jinja2/__pycache__/compiler.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
|
| 202 |
.venv/lib/python3.11/site-packages/grpc/__pycache__/_channel.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
|
| 203 |
+
.venv/lib/python3.11/site-packages/msgpack/_cmsgpack.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
|
.venv/lib/python3.11/site-packages/msgpack/_cmsgpack.cpython-311-x86_64-linux-gnu.so
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8da100e7b8957b1fbf02ef3114676091bdd6d861169f948bbbeaf0fceade5992
|
| 3 |
+
size 1296528
|
.venv/lib/python3.11/site-packages/vllm/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (1.97 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/__pycache__/_custom_ops.cpython-311.pyc
ADDED
|
Binary file (57.4 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/__pycache__/_ipex_ops.cpython-311.pyc
ADDED
|
Binary file (11.4 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/__pycache__/_version.cpython-311.pyc
ADDED
|
Binary file (635 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/__pycache__/beam_search.cpython-311.pyc
ADDED
|
Binary file (3.98 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/__pycache__/connections.cpython-311.pyc
ADDED
|
Binary file (9.88 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/__pycache__/envs.cpython-311.pyc
ADDED
|
Binary file (27.6 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/__pycache__/forward_context.cpython-311.pyc
ADDED
|
Binary file (5.77 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/__pycache__/logger.cpython-311.pyc
ADDED
|
Binary file (9.24 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/__pycache__/logits_process.cpython-311.pyc
ADDED
|
Binary file (5.47 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/__pycache__/outputs.cpython-311.pyc
ADDED
|
Binary file (24.4 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/__pycache__/pooling_params.cpython-311.pyc
ADDED
|
Binary file (1.42 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/__pycache__/sampling_params.cpython-311.pyc
ADDED
|
Binary file (26.9 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/__pycache__/scalar_type.cpython-311.pyc
ADDED
|
Binary file (14.4 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/__pycache__/scripts.cpython-311.pyc
ADDED
|
Binary file (8.83 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/__pycache__/sequence.cpython-311.pyc
ADDED
|
Binary file (77.6 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/__pycache__/tracing.cpython-311.pyc
ADDED
|
Binary file (6.92 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/__pycache__/version.cpython-311.pyc
ADDED
|
Binary file (622 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/core/__init__.py
ADDED
|
File without changes
|
.venv/lib/python3.11/site-packages/vllm/core/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (182 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/core/__pycache__/block_manager.cpython-311.pyc
ADDED
|
Binary file (24.7 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/core/__pycache__/evictor.cpython-311.pyc
ADDED
|
Binary file (8.26 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/core/__pycache__/interfaces.cpython-311.pyc
ADDED
|
Binary file (6.89 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/core/__pycache__/placeholder_block_space_manager.cpython-311.pyc
ADDED
|
Binary file (5.79 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/core/__pycache__/scheduler.cpython-311.pyc
ADDED
|
Binary file (73.9 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/core/block/__init__.py
ADDED
|
File without changes
|
.venv/lib/python3.11/site-packages/vllm/core/block/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (188 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/core/block/__pycache__/block_table.cpython-311.pyc
ADDED
|
Binary file (18.8 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/core/block/__pycache__/common.cpython-311.pyc
ADDED
|
Binary file (19.2 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/core/block/__pycache__/cpu_gpu_block_allocator.cpython-311.pyc
ADDED
|
Binary file (22.8 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/core/block/__pycache__/interfaces.cpython-311.pyc
ADDED
|
Binary file (16.3 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/core/block/__pycache__/naive_block.cpython-311.pyc
ADDED
|
Binary file (22.5 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/core/block/__pycache__/prefix_caching_block.cpython-311.pyc
ADDED
|
Binary file (47.4 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/core/block/__pycache__/utils.cpython-311.pyc
ADDED
|
Binary file (1.33 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/core/block/block_table.py
ADDED
|
@@ -0,0 +1,398 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SPDX-License-Identifier: Apache-2.0
|
| 2 |
+
|
| 3 |
+
import math
|
| 4 |
+
from typing import List, Optional
|
| 5 |
+
|
| 6 |
+
from vllm.core.block.common import BlockList
|
| 7 |
+
from vllm.core.block.interfaces import Block, DeviceAwareBlockAllocator
|
| 8 |
+
from vllm.utils import Device, cdiv, chunk_list
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class BlockTable:
    """A class to manage blocks for a specific sequence.

    The BlockTable maps a sequence of tokens to a list of blocks, where each
    block represents a contiguous memory allocation for a portion of the
    sequence. The blocks are managed by a DeviceAwareBlockAllocator, which is
    responsible for allocating and freeing memory for the blocks.

    Args:
        block_size (int): The maximum number of tokens that can be stored in a
            single block.
        block_allocator (DeviceAwareBlockAllocator): The block allocator used to
            manage memory for the blocks.
        _blocks (Optional[List[Block]], optional): An optional list of existing
            blocks to initialize the BlockTable with. If not provided, an empty
            BlockTable is created.
        max_block_sliding_window (Optional[int], optional): The number of
            blocks to keep around for each sequence. If None, all blocks
            are kept (eg., when sliding window is not used).
            It should at least fit the sliding window size of the model.

    Attributes:
        _block_size (int): The maximum number of tokens that can be stored in a
            single block.
        _allocator (DeviceAwareBlockAllocator): The block allocator used to
            manage memory for the blocks.
        _blocks (Optional[List[Block]]): The list of blocks managed by this
            BlockTable.
        _num_full_slots (int): The number of tokens currently stored in the
            blocks.
    """

    def __init__(
        self,
        block_size: int,
        block_allocator: DeviceAwareBlockAllocator,
        _blocks: Optional[List[Block]] = None,
        max_block_sliding_window: Optional[int] = None,
    ) -> None:
        self._block_size = block_size
        self._allocator = block_allocator
        if _blocks is None:
            _blocks = []
        self._blocks: BlockList = BlockList(_blocks)

        self._max_block_sliding_window = max_block_sliding_window
        # When pre-existing blocks are passed in, the slot count must reflect
        # the tokens they already hold (O(num_blocks) walk, done once here).
        self._num_full_slots = self._get_num_token_ids()

    @staticmethod
    def get_num_required_blocks(token_ids: List[int],
                                block_size: int,
                                num_lookahead_slots: int = 0) -> int:
        """Calculates the minimum number of blocks required to store a given
        sequence of token IDs along with any look-ahead slots that may be
        required (like in multi-step + chunked-prefill).

        This assumes worst-case scenario, where every block requires a new
        allocation (e.g. ignoring prefix caching).

        Args:
            token_ids (List[int]): The sequence of token IDs to be stored.
            block_size (int): The maximum number of tokens that can be stored in
                a single block.
            num_lookahead_slots (int): look-ahead slots that the sequence may
                require.

        Returns:
            int: The minimum number of blocks required to store the given
                sequence of token IDs along with any required look-ahead slots.
        """
        return cdiv(len(token_ids) + num_lookahead_slots, block_size)

    def allocate(self,
                 token_ids: List[int],
                 device: Device = Device.GPU,
                 extra_hash: Optional[int] = None) -> None:
        """Allocates memory blocks for storing the given sequence of token IDs.

        This method allocates the required number of blocks to store the given
        sequence of token IDs.

        Args:
            token_ids (List[int]): The sequence of token IDs to be stored.
            device (Device, optional): The device on which the blocks should be
                allocated. Defaults to Device.GPU.
            extra_hash (Optional[int]): The hash value of additional
                factors, such as adapters, that influence the block hash
                in the prefix caching block.
        """
        # allocate() may only be called once, on an empty table; incremental
        # growth goes through append_token_ids() instead.
        assert not self._is_allocated
        assert token_ids
        blocks = self._allocate_blocks_for_token_ids(prev_block=None,
                                                     token_ids=token_ids,
                                                     device=device,
                                                     extra_hash=extra_hash)
        self.update(blocks)
        self._num_full_slots = len(token_ids)

    def update(self, blocks: List[Block]) -> None:
        """Resets the table to the newly provided blocks
        (with their corresponding block ids)
        """
        self._blocks.update(blocks)

    def append_token_ids(self,
                         token_ids: List[int],
                         num_lookahead_slots: int = 0,
                         num_computed_slots: Optional[int] = None,
                         extra_hash: Optional[int] = None) -> None:
        """Appends a sequence of token IDs to the existing blocks in the
        BlockTable.

        This method appends the given sequence of token IDs to the existing
        blocks in the BlockTable. If there is not enough space in the existing
        blocks, new blocks are allocated using the `ensure_num_empty_slots`
        method to accommodate the additional tokens.

        The token IDs are divided into chunks of size `block_size` (except for
        the first chunk, which may be smaller), and each chunk is appended to a
        separate block.

        Args:
            token_ids (List[int]): The sequence of token IDs to be appended.
            num_computed_slots (Optional[int]): The number of KV cache slots
                that are already filled (computed).
                When sliding window is enabled, this is used to compute how many
                blocks to drop at the front of the sequence.
                Without sliding window, None can be passed.
                Without chunked prefill, it should be the same as
                _num_full_slots.
            extra_hash (Optional[int]): The hash value of additional
                factors such as adapters that influence the block, apart
                from the token_ids.
        """
        assert self._is_allocated, "no blocks have been allocated"
        assert len(self._blocks) > 0

        # Drop blocks that are no longer needed due to sliding window
        if self._max_block_sliding_window is not None:
            # Dropped positions are replaced by the shared null block so that
            # the table keeps its length and indices stay valid.
            null_block = self._allocator.allocate_or_get_null_block()
            assert num_computed_slots is not None
            end_block_idx = (num_computed_slots //
                             self._block_size) - self._max_block_sliding_window
            for idx in range(0, end_block_idx):
                b = self._blocks[idx]
                if b is not null_block:
                    self._allocator.free(b)
                    self._blocks[idx] = null_block

        # Ensure there are enough empty slots for the new tokens plus
        # lookahead slots
        self.ensure_num_empty_slots(num_empty_slots=len(token_ids) +
                                    num_lookahead_slots,
                                    extra_hash=extra_hash)

        # Update the blocks with the new tokens
        first_block_idx = self._num_full_slots // self._block_size
        token_blocks = self._chunk_token_blocks_for_append(token_ids)

        for i, token_block in enumerate(token_blocks):
            self._blocks.append_token_ids(first_block_idx + i, token_block)

        self._num_full_slots += len(token_ids)

    def ensure_num_empty_slots(self,
                               num_empty_slots: int,
                               extra_hash: Optional[int] = None) -> None:
        """Ensures that the BlockTable has at least the specified number of
        empty slots available.

        This method checks if the BlockTable has enough empty slots (i.e.,
        available space) to accommodate the requested number of tokens. If not,
        it allocates additional blocks on the GPU to ensure that the required
        number of empty slots is available.

        Args:
            num_empty_slots (int): The minimum number of empty slots required.
            extra_hash (Optional[int]): The hash value of additional
                factors such as adapters that influence the block, apart
                from the token_ids.
        """
        # Currently the block table only supports
        # appending tokens to GPU blocks.
        device = Device.GPU
        assert self._is_allocated

        if self._num_empty_slots >= num_empty_slots:
            return

        slots_to_allocate = num_empty_slots - self._num_empty_slots
        blocks_to_allocate = cdiv(slots_to_allocate, self._block_size)

        for _ in range(blocks_to_allocate):
            assert len(self._blocks) > 0
            # New mutable blocks are chained to the current tail block.
            self._blocks.append(
                self._allocator.allocate_mutable_block(
                    prev_block=self._blocks[-1],
                    device=device,
                    extra_hash=extra_hash))

    def fork(self) -> "BlockTable":
        """Creates a new BlockTable instance with a copy of the blocks from the
        current instance.

        This method creates a new BlockTable instance with the same block size,
        block allocator, and a copy of the blocks from the current instance. The
        new BlockTable has its own independent set of blocks, but shares the
        same underlying memory allocation with the original BlockTable.

        Returns:
            BlockTable: A new BlockTable instance with a copy of the blocks from
                the current instance.
        """
        assert self._is_allocated
        assert len(self._blocks) > 0
        # The allocator forks the whole chain starting from the tail block.
        forked_blocks = self._allocator.fork(self._blocks[-1])
        return BlockTable(
            block_size=self._block_size,
            block_allocator=self._allocator,
            _blocks=forked_blocks,
            max_block_sliding_window=self._max_block_sliding_window,
        )

    def free(self) -> None:
        """Frees the memory occupied by the blocks in the BlockTable.

        This method iterates over all the blocks in the `_blocks` list and calls
        the `free` method of the `_allocator` object to release the memory
        occupied by each block. After freeing all the blocks, the `_blocks` list
        is set to `None`.
        """
        for block in self.blocks:
            self._allocator.free(block)
        self._blocks.reset()

    @property
    def physical_block_ids(self) -> List[int]:
        """Returns a list of physical block indices for the blocks in the
        BlockTable.

        This property returns a list of integers, where each integer represents
        the physical block index of a corresponding block in the `_blocks` list.
        The physical block index is a unique identifier for the memory location
        occupied by the block.

        Returns:
            List[int]: A list of physical block indices for the blocks in the
                BlockTable.
        """
        return self._blocks.ids()

    def get_unseen_token_ids(self, sequence_token_ids: List[int]) -> List[int]:
        """Get the number of "unseen" tokens in the sequence.

        Unseen tokens are tokens in the sequence corresponding to this block
        table, but are not yet appended to this block table.

        Args:
            sequence_token_ids (List[int]): The list of token ids in the
                sequence.

        Returns:
            List[int]: The postfix of sequence_token_ids that has not yet been
                appended to the block table.
        """

        # Since the block table is append-only, the unseen token ids are the
        # ones after the appended ones.
        return sequence_token_ids[self.num_full_slots:]

    def _allocate_blocks_for_token_ids(
            self,
            prev_block: Optional[Block],
            token_ids: List[int],
            device: Device,
            extra_hash: Optional[int] = None) -> List[Block]:
        # Full chunks become immutable (hashable/cacheable) blocks; at most one
        # partial tail chunk becomes a mutable block.
        blocks: List[Block] = []

        block_token_ids = []
        tail_token_ids = []
        for cur_token_ids in chunk_list(token_ids, self._block_size):
            if len(cur_token_ids) == self._block_size:
                block_token_ids.append(cur_token_ids)
            else:
                tail_token_ids.append(cur_token_ids)

        if block_token_ids:
            blocks.extend(
                self._allocator.allocate_immutable_blocks(
                    prev_block,
                    block_token_ids=block_token_ids,
                    device=device,
                    extra_hash=extra_hash))
            prev_block = blocks[-1]

        if tail_token_ids:
            # Only the final chunk can be partial, so there is at most one.
            assert len(tail_token_ids) == 1
            cur_token_ids = tail_token_ids[0]

            block = self._allocator.allocate_mutable_block(
                prev_block=prev_block, device=device, extra_hash=extra_hash)
            block.append_token_ids(cur_token_ids)

            blocks.append(block)

        return blocks

    def _get_all_token_ids(self) -> List[int]:
        # NOTE: This function is O(seq_len); use sparingly.
        token_ids: List[int] = []

        if not self._is_allocated:
            return token_ids

        for block in self.blocks:
            token_ids.extend(block.token_ids)

        return token_ids

    def _get_num_token_ids(self) -> int:
        # O(num_blocks) sum of per-block token counts.
        res = 0
        for block in self.blocks:
            res += len(block.token_ids)

        return res

    @property
    def _is_allocated(self) -> bool:
        return len(self._blocks) > 0

    @property
    def blocks(self) -> List[Block]:
        return self._blocks.list()

    @property
    def _num_empty_slots(self) -> int:
        assert self._is_allocated
        return len(self._blocks) * self._block_size - self._num_full_slots

    @property
    def num_full_slots(self) -> int:
        """Returns the total number of tokens currently stored in the
        BlockTable.

        Returns:
            int: The total number of tokens currently stored in the BlockTable.
        """
        return self._num_full_slots

    def get_num_blocks_touched_by_append_slots(
            self, token_ids: List[int], num_lookahead_slots: int) -> int:
        """Determine how many blocks will be "touched" by appending the token
        ids.

        This is required for the scheduler to determine whether a sequence can
        continue generation, or if it must be preempted.
        """
        # Math below is equivalent to:
        # all_token_ids = token_ids + [-1] * num_lookahead_slots
        # token_blocks = self._chunk_token_blocks_for_append(all_token_ids)
        # return len(token_blocks)

        num_token_ids = len(token_ids) + num_lookahead_slots
        first_chunk_size = self._block_size - (self._num_full_slots %
                                               self._block_size)
        num_token_blocks = (1 + math.ceil(
            (num_token_ids - first_chunk_size) / self._block_size))
        return num_token_blocks

    def _chunk_token_blocks_for_append(
            self, token_ids: List[int]) -> List[List[int]]:
        """Split the token ids into block-sized chunks so they can be easily
        appended to blocks. The first such "token block" may have less token ids
        than the block size, since the last allocated block may be partially
        full.

        If no token ids are provided, then no chunks are returned.
        """

        if not token_ids:
            return []

        # The first chunk only fills the free space left in the current tail
        # block; subsequent chunks are full block_size chunks.
        first_chunk_size = self._block_size - (self._num_full_slots %
                                               self._block_size)
        token_blocks = [token_ids[:first_chunk_size]]
        token_blocks.extend(
            chunk_list(token_ids[first_chunk_size:], self._block_size))
        return token_blocks
|
.venv/lib/python3.11/site-packages/vllm/core/block/common.py
ADDED
|
@@ -0,0 +1,370 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SPDX-License-Identifier: Apache-2.0
|
| 2 |
+
|
| 3 |
+
from collections import deque
|
| 4 |
+
from dataclasses import dataclass
|
| 5 |
+
from typing import Deque, Dict, Iterable, List, Optional, Protocol, Tuple
|
| 6 |
+
|
| 7 |
+
from vllm.core.block.interfaces import Block, BlockAllocator
|
| 8 |
+
|
| 9 |
+
BlockId = int
|
| 10 |
+
RefCount = int
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class RefCounterProtocol(Protocol):
|
| 14 |
+
|
| 15 |
+
def incr(self, block_id: BlockId) -> RefCount:
|
| 16 |
+
raise NotImplementedError
|
| 17 |
+
|
| 18 |
+
def decr(self, block_id: BlockId) -> RefCount:
|
| 19 |
+
raise NotImplementedError
|
| 20 |
+
|
| 21 |
+
def get(self, block_id: BlockId) -> RefCount:
|
| 22 |
+
raise NotImplementedError
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
class RefCounter(RefCounterProtocol):
|
| 26 |
+
"""A class for managing reference counts for a set of block indices.
|
| 27 |
+
|
| 28 |
+
The RefCounter class maintains a dictionary that maps block indices to their
|
| 29 |
+
corresponding reference counts. It provides methods to increment, decrement,
|
| 30 |
+
and retrieve the reference count for a given block index.
|
| 31 |
+
|
| 32 |
+
Args:
|
| 33 |
+
all_block_indices (Iterable[BlockId]): An iterable of block indices
|
| 34 |
+
to initialize the reference counter with.
|
| 35 |
+
"""
|
| 36 |
+
|
| 37 |
+
def __init__(self, all_block_indices: Iterable[BlockId]):
|
| 38 |
+
deduped = set(all_block_indices)
|
| 39 |
+
self._refcounts: Dict[BlockId, RefCount] = {
|
| 40 |
+
index: 0
|
| 41 |
+
for index in deduped
|
| 42 |
+
}
|
| 43 |
+
|
| 44 |
+
def incr(self, block_id: BlockId) -> RefCount:
|
| 45 |
+
assert block_id in self._refcounts
|
| 46 |
+
pre_incr_refcount = self._refcounts[block_id]
|
| 47 |
+
|
| 48 |
+
assert pre_incr_refcount >= 0
|
| 49 |
+
|
| 50 |
+
post_incr_refcount = pre_incr_refcount + 1
|
| 51 |
+
self._refcounts[block_id] = post_incr_refcount
|
| 52 |
+
return post_incr_refcount
|
| 53 |
+
|
| 54 |
+
def decr(self, block_id: BlockId) -> RefCount:
|
| 55 |
+
assert block_id in self._refcounts
|
| 56 |
+
refcount = self._refcounts[block_id]
|
| 57 |
+
|
| 58 |
+
assert refcount > 0
|
| 59 |
+
refcount -= 1
|
| 60 |
+
|
| 61 |
+
self._refcounts[block_id] = refcount
|
| 62 |
+
|
| 63 |
+
return refcount
|
| 64 |
+
|
| 65 |
+
def get(self, block_id: BlockId) -> RefCount:
|
| 66 |
+
assert block_id in self._refcounts
|
| 67 |
+
return self._refcounts[block_id]
|
| 68 |
+
|
| 69 |
+
def as_readonly(self) -> "ReadOnlyRefCounter":
|
| 70 |
+
return ReadOnlyRefCounter(self)
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
class ReadOnlyRefCounter(RefCounterProtocol):
|
| 74 |
+
"""A read-only view of the RefCounter class.
|
| 75 |
+
|
| 76 |
+
The ReadOnlyRefCounter class provides a read-only interface to access the
|
| 77 |
+
reference counts maintained by a RefCounter instance. It does not allow
|
| 78 |
+
modifications to the reference counts.
|
| 79 |
+
|
| 80 |
+
Args:
|
| 81 |
+
refcounter (RefCounter): The RefCounter instance to create a read-only
|
| 82 |
+
view for.
|
| 83 |
+
"""
|
| 84 |
+
|
| 85 |
+
def __init__(self, refcounter: RefCounter):
|
| 86 |
+
self._refcounter = refcounter
|
| 87 |
+
|
| 88 |
+
def incr(self, block_id: BlockId) -> RefCount:
|
| 89 |
+
raise ValueError("Incr not allowed")
|
| 90 |
+
|
| 91 |
+
def decr(self, block_id: BlockId) -> RefCount:
|
| 92 |
+
raise ValueError("Decr not allowed")
|
| 93 |
+
|
| 94 |
+
def get(self, block_id: BlockId) -> RefCount:
|
| 95 |
+
return self._refcounter.get(block_id)
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
class CopyOnWriteTracker:
|
| 99 |
+
"""A class for tracking and managing copy-on-write operations for blocks.
|
| 100 |
+
|
| 101 |
+
The CopyOnWriteTracker class maintains a mapping of source block indices to
|
| 102 |
+
their corresponding copy-on-write destination block indices. It works in
|
| 103 |
+
conjunction with a RefCounter.
|
| 104 |
+
|
| 105 |
+
Args:
|
| 106 |
+
refcounter (RefCounter): The reference counter used to track block
|
| 107 |
+
reference counts.
|
| 108 |
+
"""
|
| 109 |
+
|
| 110 |
+
def __init__(self, refcounter: RefCounterProtocol):
|
| 111 |
+
self._copy_on_writes: List[Tuple[BlockId, BlockId]] = []
|
| 112 |
+
self._refcounter = refcounter
|
| 113 |
+
|
| 114 |
+
def is_appendable(self, block: Block) -> bool:
|
| 115 |
+
"""Checks if the block is shared or not. If shared, then it cannot
|
| 116 |
+
be appended and needs to be duplicated via copy-on-write
|
| 117 |
+
"""
|
| 118 |
+
block_id = block.block_id
|
| 119 |
+
if block_id is None:
|
| 120 |
+
return True
|
| 121 |
+
|
| 122 |
+
refcount = self._refcounter.get(block_id)
|
| 123 |
+
return refcount <= 1
|
| 124 |
+
|
| 125 |
+
def record_cow(self, src_block_id: Optional[BlockId],
|
| 126 |
+
trg_block_id: Optional[BlockId]) -> None:
|
| 127 |
+
"""Records a copy-on-write operation from source to target block id
|
| 128 |
+
Args:
|
| 129 |
+
src_block_id (BlockId): The source block id from which to copy
|
| 130 |
+
the data
|
| 131 |
+
trg_block_id (BlockId): The target block id to which the data
|
| 132 |
+
is copied
|
| 133 |
+
"""
|
| 134 |
+
assert src_block_id is not None
|
| 135 |
+
assert trg_block_id is not None
|
| 136 |
+
self._copy_on_writes.append((src_block_id, trg_block_id))
|
| 137 |
+
|
| 138 |
+
def clear_cows(self) -> List[Tuple[BlockId, BlockId]]:
|
| 139 |
+
"""Clears the copy-on-write tracking information and returns the current
|
| 140 |
+
state.
|
| 141 |
+
|
| 142 |
+
This method returns a list mapping source block indices to
|
| 143 |
+
destination block indices for the current copy-on-write operations.
|
| 144 |
+
It then clears the internal tracking information.
|
| 145 |
+
|
| 146 |
+
Returns:
|
| 147 |
+
List[Tuple[BlockId, BlockId]]: A list mapping source
|
| 148 |
+
block indices to destination block indices for the
|
| 149 |
+
current copy-on-write operations.
|
| 150 |
+
"""
|
| 151 |
+
cows = self._copy_on_writes
|
| 152 |
+
self._copy_on_writes = []
|
| 153 |
+
return cows
|
| 154 |
+
|
| 155 |
+
|
| 156 |
+
class BlockPool:
|
| 157 |
+
"""Used to pre-allocate block objects, in order to avoid excessive python
|
| 158 |
+
object allocations/deallocations.
|
| 159 |
+
The pool starts from "pool_size" objects and will increase to more objects
|
| 160 |
+
if necessary
|
| 161 |
+
|
| 162 |
+
Note that multiple block objects may point to the same physical block id,
|
| 163 |
+
which is why this pool is needed, so that it will be easier to support
|
| 164 |
+
prefix caching and more complicated sharing of physical blocks.
|
| 165 |
+
"""
|
| 166 |
+
|
| 167 |
+
def __init__(self, block_size: int, create_block: Block.Factory,
|
| 168 |
+
allocator: BlockAllocator, pool_size: int):
|
| 169 |
+
self._block_size = block_size
|
| 170 |
+
self._create_block = create_block
|
| 171 |
+
self._allocator = allocator
|
| 172 |
+
self._pool_size = pool_size
|
| 173 |
+
assert self._pool_size >= 0
|
| 174 |
+
|
| 175 |
+
self._free_ids: Deque[int] = deque(range(self._pool_size))
|
| 176 |
+
self._pool = []
|
| 177 |
+
for i in range(self._pool_size):
|
| 178 |
+
self._pool.append(
|
| 179 |
+
self._create_block(prev_block=None,
|
| 180 |
+
token_ids=[],
|
| 181 |
+
block_size=self._block_size,
|
| 182 |
+
allocator=self._allocator,
|
| 183 |
+
block_id=None,
|
| 184 |
+
extra_hash=None))
|
| 185 |
+
|
| 186 |
+
def increase_pool(self):
|
| 187 |
+
"""Doubles the internal pool size
|
| 188 |
+
"""
|
| 189 |
+
cur_pool_size = self._pool_size
|
| 190 |
+
new_pool_size = cur_pool_size * 2
|
| 191 |
+
self._pool_size = new_pool_size
|
| 192 |
+
|
| 193 |
+
self._free_ids += deque(range(cur_pool_size, new_pool_size))
|
| 194 |
+
|
| 195 |
+
for i in range(cur_pool_size, new_pool_size):
|
| 196 |
+
self._pool.append(
|
| 197 |
+
self._create_block(prev_block=None,
|
| 198 |
+
token_ids=[],
|
| 199 |
+
block_size=self._block_size,
|
| 200 |
+
allocator=self._allocator,
|
| 201 |
+
block_id=None,
|
| 202 |
+
extra_hash=None))
|
| 203 |
+
|
| 204 |
+
def init_block(self,
|
| 205 |
+
prev_block: Optional[Block],
|
| 206 |
+
token_ids: List[int],
|
| 207 |
+
block_size: int,
|
| 208 |
+
physical_block_id: Optional[int],
|
| 209 |
+
extra_hash: Optional[int] = None) -> Block:
|
| 210 |
+
if len(self._free_ids) == 0:
|
| 211 |
+
self.increase_pool()
|
| 212 |
+
assert len(self._free_ids) > 0
|
| 213 |
+
|
| 214 |
+
pool_id = self._free_ids.popleft()
|
| 215 |
+
|
| 216 |
+
block = self._pool[pool_id]
|
| 217 |
+
block.__init__( # type: ignore[misc]
|
| 218 |
+
prev_block=prev_block,
|
| 219 |
+
token_ids=token_ids,
|
| 220 |
+
block_size=block_size,
|
| 221 |
+
allocator=block._allocator, # type: ignore[attr-defined]
|
| 222 |
+
block_id=physical_block_id,
|
| 223 |
+
extra_hash=extra_hash)
|
| 224 |
+
block.pool_id = pool_id # type: ignore[attr-defined]
|
| 225 |
+
return block
|
| 226 |
+
|
| 227 |
+
def free_block(self, block: Block) -> None:
|
| 228 |
+
self._free_ids.appendleft(block.pool_id) # type: ignore[attr-defined]
|
| 229 |
+
|
| 230 |
+
|
| 231 |
+
class BlockList:
|
| 232 |
+
"""This class is an optimization to allow fast-access to physical
|
| 233 |
+
block ids. It maintains a block id list that is updated with the
|
| 234 |
+
block list and this avoids the need to reconstruct the block id
|
| 235 |
+
list on every iteration of the block manager
|
| 236 |
+
"""
|
| 237 |
+
|
| 238 |
+
def __init__(self, blocks: List[Block]):
|
| 239 |
+
self._blocks: List[Block] = []
|
| 240 |
+
self._block_ids: List[int] = []
|
| 241 |
+
|
| 242 |
+
self.update(blocks)
|
| 243 |
+
|
| 244 |
+
def _add_block_id(self, block_id: Optional[BlockId]) -> None:
|
| 245 |
+
assert block_id is not None
|
| 246 |
+
self._block_ids.append(block_id)
|
| 247 |
+
|
| 248 |
+
def _update_block_id(self, block_index: int,
|
| 249 |
+
new_block_id: Optional[BlockId]) -> None:
|
| 250 |
+
assert new_block_id is not None
|
| 251 |
+
self._block_ids[block_index] = new_block_id
|
| 252 |
+
|
| 253 |
+
def update(self, blocks: List[Block]):
|
| 254 |
+
self._blocks = blocks
|
| 255 |
+
|
| 256 |
+
# Cache block ids for fast query
|
| 257 |
+
self._block_ids = []
|
| 258 |
+
for block in self._blocks:
|
| 259 |
+
self._add_block_id(block.block_id)
|
| 260 |
+
|
| 261 |
+
def append_token_ids(self, block_index: int, token_ids: List[int]) -> None:
|
| 262 |
+
block = self._blocks[block_index]
|
| 263 |
+
prev_block_id = block.block_id
|
| 264 |
+
|
| 265 |
+
block.append_token_ids(token_ids)
|
| 266 |
+
|
| 267 |
+
# CoW or promotion may update the internal block_id
|
| 268 |
+
if prev_block_id != block.block_id:
|
| 269 |
+
self._update_block_id(block_index, block.block_id)
|
| 270 |
+
|
| 271 |
+
def append(self, new_block: Block):
|
| 272 |
+
self._blocks.append(new_block)
|
| 273 |
+
self._add_block_id(new_block.block_id)
|
| 274 |
+
|
| 275 |
+
def __len__(self) -> int:
|
| 276 |
+
return len(self._blocks)
|
| 277 |
+
|
| 278 |
+
def __getitem__(self, block_index: int) -> Block:
|
| 279 |
+
return self._blocks[block_index]
|
| 280 |
+
|
| 281 |
+
def __setitem__(self, block_index: int, new_block: Block) -> None:
|
| 282 |
+
self._blocks[block_index] = new_block
|
| 283 |
+
self._update_block_id(block_index, new_block.block_id)
|
| 284 |
+
|
| 285 |
+
def reset(self):
|
| 286 |
+
self._blocks = []
|
| 287 |
+
self._block_ids = []
|
| 288 |
+
|
| 289 |
+
def list(self) -> List[Block]:
|
| 290 |
+
return self._blocks
|
| 291 |
+
|
| 292 |
+
def ids(self) -> List[int]:
|
| 293 |
+
return self._block_ids
|
| 294 |
+
|
| 295 |
+
|
| 296 |
+
@dataclass
|
| 297 |
+
class CacheMetricData:
|
| 298 |
+
"""A utility dataclass to maintain cache metric.
|
| 299 |
+
To avoid overflow, we maintain the hit rate in block granularity, so that
|
| 300 |
+
we can maintain a single hit rate for n_completed_block x block_size,
|
| 301 |
+
and calculate the real time hit rate by the following:
|
| 302 |
+
BS = The number of queries per block.
|
| 303 |
+
nB = The number of completed blocks.
|
| 304 |
+
HR = hit rate of (nB x BS) queries.
|
| 305 |
+
Q = current number of queries (< BS).
|
| 306 |
+
H = current number of hits (< BS).
|
| 307 |
+
hit rate = ((HR x nB) + (H / Q) x (Q / BS)) / (nB + Q / BS)
|
| 308 |
+
"""
|
| 309 |
+
num_completed_blocks: int = 0
|
| 310 |
+
completed_block_cache_hit_rate: float = 0.0
|
| 311 |
+
num_incompleted_block_queries: int = 0
|
| 312 |
+
num_incompleted_block_hit: int = 0
|
| 313 |
+
block_size: int = 1000
|
| 314 |
+
|
| 315 |
+
def query(self, hit: bool):
|
| 316 |
+
self.num_incompleted_block_queries += 1
|
| 317 |
+
self.num_incompleted_block_hit += 1 if hit else 0
|
| 318 |
+
|
| 319 |
+
# When a block is completed, update the cache hit rate
|
| 320 |
+
# and reset the incomplete numbers.
|
| 321 |
+
if self.num_incompleted_block_queries == self.block_size:
|
| 322 |
+
hit_rate = (self.num_incompleted_block_hit /
|
| 323 |
+
self.num_incompleted_block_queries)
|
| 324 |
+
self.completed_block_cache_hit_rate = (
|
| 325 |
+
self.completed_block_cache_hit_rate * self.num_completed_blocks
|
| 326 |
+
+ hit_rate) / (self.num_completed_blocks + 1)
|
| 327 |
+
self.num_incompleted_block_queries = 0
|
| 328 |
+
self.num_incompleted_block_hit = 0
|
| 329 |
+
self.num_completed_blocks += 1
|
| 330 |
+
|
| 331 |
+
def get_hit_rate(self):
|
| 332 |
+
incomplete_ratio = self.num_incompleted_block_queries / self.block_size
|
| 333 |
+
total_blocks = self.num_completed_blocks + incomplete_ratio
|
| 334 |
+
if total_blocks == 0:
|
| 335 |
+
return 0.0
|
| 336 |
+
|
| 337 |
+
completed_block_hit, incompleted_block_hit = 0.0, 0.0
|
| 338 |
+
if self.num_completed_blocks > 0:
|
| 339 |
+
completed_block_hit = (self.completed_block_cache_hit_rate *
|
| 340 |
+
self.num_completed_blocks)
|
| 341 |
+
if self.num_incompleted_block_queries > 0:
|
| 342 |
+
incompleted_hit_rate = (self.num_incompleted_block_hit /
|
| 343 |
+
self.num_incompleted_block_queries)
|
| 344 |
+
incompleted_block_hit = (incompleted_hit_rate * incomplete_ratio)
|
| 345 |
+
return (completed_block_hit + incompleted_block_hit) / total_blocks
|
| 346 |
+
|
| 347 |
+
|
| 348 |
+
def get_all_blocks_recursively(last_block: Block) -> List[Block]:
|
| 349 |
+
"""Retrieves all the blocks in a sequence starting from the last block.
|
| 350 |
+
|
| 351 |
+
This function recursively traverses the sequence of blocks in reverse order,
|
| 352 |
+
starting from the given last block, and returns a list of all the blocks in
|
| 353 |
+
the sequence.
|
| 354 |
+
|
| 355 |
+
Args:
|
| 356 |
+
last_block (Block): The last block in the sequence.
|
| 357 |
+
|
| 358 |
+
Returns:
|
| 359 |
+
List[Block]: A list of all the blocks in the sequence, in the order they
|
| 360 |
+
appear.
|
| 361 |
+
"""
|
| 362 |
+
|
| 363 |
+
def recurse(block: Block, lst: List[Block]) -> None:
|
| 364 |
+
if block.prev_block is not None:
|
| 365 |
+
recurse(block.prev_block, lst)
|
| 366 |
+
lst.append(block)
|
| 367 |
+
|
| 368 |
+
all_blocks: List[Block] = []
|
| 369 |
+
recurse(last_block, all_blocks)
|
| 370 |
+
return all_blocks
|
.venv/lib/python3.11/site-packages/vllm/core/block/cpu_gpu_block_allocator.py
ADDED
|
@@ -0,0 +1,438 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SPDX-License-Identifier: Apache-2.0
|
| 2 |
+
|
| 3 |
+
from typing import Dict, FrozenSet, List, Optional, Tuple
|
| 4 |
+
|
| 5 |
+
from vllm.core.block.interfaces import (Block, BlockAllocator, BlockId,
|
| 6 |
+
DeviceAwareBlockAllocator)
|
| 7 |
+
from vllm.core.block.naive_block import NaiveBlock, NaiveBlockAllocator
|
| 8 |
+
from vllm.core.block.prefix_caching_block import PrefixCachingBlockAllocator
|
| 9 |
+
from vllm.platforms import current_platform
|
| 10 |
+
from vllm.utils import Device
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class CpuGpuBlockAllocator(DeviceAwareBlockAllocator):
|
| 14 |
+
"""A block allocator that can allocate blocks on both CPU and GPU memory.
|
| 15 |
+
|
| 16 |
+
This class implements the `DeviceAwareBlockAllocator` interface and provides
|
| 17 |
+
functionality for allocating and managing blocks of memory on both CPU and
|
| 18 |
+
GPU devices.
|
| 19 |
+
|
| 20 |
+
The `CpuGpuBlockAllocator` maintains separate memory pools for CPU and GPU
|
| 21 |
+
blocks, and allows for allocation, deallocation, forking, and swapping of
|
| 22 |
+
blocks across these memory pools.
|
| 23 |
+
"""
|
| 24 |
+
|
| 25 |
+
@staticmethod
|
| 26 |
+
def create(
|
| 27 |
+
allocator_type: str,
|
| 28 |
+
num_gpu_blocks: int,
|
| 29 |
+
num_cpu_blocks: int,
|
| 30 |
+
block_size: int,
|
| 31 |
+
) -> DeviceAwareBlockAllocator:
|
| 32 |
+
"""Creates a CpuGpuBlockAllocator instance with the specified
|
| 33 |
+
configuration.
|
| 34 |
+
|
| 35 |
+
This static method creates and returns a CpuGpuBlockAllocator instance
|
| 36 |
+
based on the provided parameters. It initializes the CPU and GPU block
|
| 37 |
+
allocators with the specified number of blocks, block size, and
|
| 38 |
+
allocator type.
|
| 39 |
+
|
| 40 |
+
Args:
|
| 41 |
+
allocator_type (str): The type of block allocator to use for CPU
|
| 42 |
+
and GPU blocks. Currently supported values are "naive" and
|
| 43 |
+
"prefix_caching".
|
| 44 |
+
num_gpu_blocks (int): The number of blocks to allocate for GPU
|
| 45 |
+
memory.
|
| 46 |
+
num_cpu_blocks (int): The number of blocks to allocate for CPU
|
| 47 |
+
memory.
|
| 48 |
+
block_size (int): The size of each block in number of tokens.
|
| 49 |
+
|
| 50 |
+
Returns:
|
| 51 |
+
DeviceAwareBlockAllocator: A CpuGpuBlockAllocator instance with the
|
| 52 |
+
specified configuration.
|
| 53 |
+
|
| 54 |
+
Notes:
|
| 55 |
+
- The block IDs are assigned contiguously, with GPU block IDs coming
|
| 56 |
+
before CPU block IDs.
|
| 57 |
+
"""
|
| 58 |
+
# For HPU, block id 0 is used only for padding
|
| 59 |
+
reserved_blocks = 1 if current_platform.is_hpu() else 0
|
| 60 |
+
block_ids = list(
|
| 61 |
+
range(reserved_blocks, num_gpu_blocks + num_cpu_blocks))
|
| 62 |
+
num_gpu_blocks -= reserved_blocks
|
| 63 |
+
gpu_block_ids = block_ids[:num_gpu_blocks]
|
| 64 |
+
cpu_block_ids = block_ids[num_gpu_blocks:]
|
| 65 |
+
|
| 66 |
+
if allocator_type == "naive":
|
| 67 |
+
gpu_allocator: BlockAllocator = NaiveBlockAllocator(
|
| 68 |
+
create_block=NaiveBlock, # type: ignore
|
| 69 |
+
num_blocks=num_gpu_blocks,
|
| 70 |
+
block_size=block_size,
|
| 71 |
+
block_ids=gpu_block_ids,
|
| 72 |
+
)
|
| 73 |
+
|
| 74 |
+
cpu_allocator: BlockAllocator = NaiveBlockAllocator(
|
| 75 |
+
create_block=NaiveBlock, # type: ignore
|
| 76 |
+
num_blocks=num_cpu_blocks,
|
| 77 |
+
block_size=block_size,
|
| 78 |
+
block_ids=cpu_block_ids,
|
| 79 |
+
)
|
| 80 |
+
elif allocator_type == "prefix_caching":
|
| 81 |
+
gpu_allocator = PrefixCachingBlockAllocator(
|
| 82 |
+
num_blocks=num_gpu_blocks,
|
| 83 |
+
block_size=block_size,
|
| 84 |
+
block_ids=gpu_block_ids,
|
| 85 |
+
)
|
| 86 |
+
|
| 87 |
+
cpu_allocator = PrefixCachingBlockAllocator(
|
| 88 |
+
num_blocks=num_cpu_blocks,
|
| 89 |
+
block_size=block_size,
|
| 90 |
+
block_ids=cpu_block_ids,
|
| 91 |
+
)
|
| 92 |
+
else:
|
| 93 |
+
raise ValueError(f"Unknown allocator type {allocator_type=}")
|
| 94 |
+
|
| 95 |
+
return CpuGpuBlockAllocator(
|
| 96 |
+
cpu_block_allocator=cpu_allocator,
|
| 97 |
+
gpu_block_allocator=gpu_allocator,
|
| 98 |
+
)
|
| 99 |
+
|
| 100 |
+
def __init__(self, cpu_block_allocator: BlockAllocator,
|
| 101 |
+
gpu_block_allocator: BlockAllocator):
|
| 102 |
+
assert not (
|
| 103 |
+
cpu_block_allocator.all_block_ids
|
| 104 |
+
& gpu_block_allocator.all_block_ids
|
| 105 |
+
), "cpu and gpu block allocators can't have intersection of block ids"
|
| 106 |
+
|
| 107 |
+
self._allocators = {
|
| 108 |
+
Device.CPU: cpu_block_allocator,
|
| 109 |
+
Device.GPU: gpu_block_allocator,
|
| 110 |
+
}
|
| 111 |
+
|
| 112 |
+
self._swap_mapping: Dict[int, int] = {}
|
| 113 |
+
self._null_block: Optional[Block] = None
|
| 114 |
+
|
| 115 |
+
self._block_ids_to_allocator: Dict[int, BlockAllocator] = {}
|
| 116 |
+
for _, allocator in self._allocators.items():
|
| 117 |
+
for block_id in allocator.all_block_ids:
|
| 118 |
+
self._block_ids_to_allocator[block_id] = allocator
|
| 119 |
+
|
| 120 |
+
def allocate_or_get_null_block(self) -> Block:
|
| 121 |
+
if self._null_block is None:
|
| 122 |
+
self._null_block = NullBlock(
|
| 123 |
+
self.allocate_mutable_block(None, Device.GPU))
|
| 124 |
+
return self._null_block
|
| 125 |
+
|
| 126 |
+
def allocate_mutable_block(self,
|
| 127 |
+
prev_block: Optional[Block],
|
| 128 |
+
device: Device,
|
| 129 |
+
extra_hash: Optional[int] = None) -> Block:
|
| 130 |
+
"""Allocates a new mutable block on the specified device.
|
| 131 |
+
|
| 132 |
+
Args:
|
| 133 |
+
prev_block (Optional[Block]): The previous block to in the sequence.
|
| 134 |
+
Used for prefix hashing.
|
| 135 |
+
device (Device): The device on which to allocate the new block.
|
| 136 |
+
extra_hash (Optional[int]): The hash value of additional
|
| 137 |
+
factors, such as adapters, that influence the block hash
|
| 138 |
+
in the prefix caching block.
|
| 139 |
+
|
| 140 |
+
Returns:
|
| 141 |
+
Block: The newly allocated mutable block.
|
| 142 |
+
"""
|
| 143 |
+
return self._allocators[device].allocate_mutable_block(
|
| 144 |
+
prev_block, extra_hash=extra_hash)
|
| 145 |
+
|
| 146 |
+
def allocate_immutable_blocks(
|
| 147 |
+
self,
|
| 148 |
+
prev_block: Optional[Block],
|
| 149 |
+
block_token_ids: List[List[int]],
|
| 150 |
+
device: Device,
|
| 151 |
+
extra_hash: Optional[int] = None) -> List[Block]:
|
| 152 |
+
"""Allocates a new group of immutable blocks with the provided block
|
| 153 |
+
token IDs on the specified device.
|
| 154 |
+
|
| 155 |
+
Args:
|
| 156 |
+
prev_block (Optional[Block]): The previous block in the sequence.
|
| 157 |
+
Used for prefix hashing.
|
| 158 |
+
block_token_ids (List[int]): The list of block token IDs to be
|
| 159 |
+
stored in the new blocks.
|
| 160 |
+
device (Device): The device on which to allocate the new block.
|
| 161 |
+
extra_hash (Optional[int]): The hash value of additional
|
| 162 |
+
factors, such as adapters, that influence the block hash
|
| 163 |
+
in the prefix caching block.
|
| 164 |
+
|
| 165 |
+
Returns:
|
| 166 |
+
List[Block]: The newly allocated list of immutable blocks
|
| 167 |
+
containing the provided block token IDs.
|
| 168 |
+
"""
|
| 169 |
+
return self._allocators[device].allocate_immutable_blocks(
|
| 170 |
+
prev_block, block_token_ids, extra_hash=extra_hash)
|
| 171 |
+
|
| 172 |
+
def allocate_immutable_block(self,
|
| 173 |
+
prev_block: Optional[Block],
|
| 174 |
+
token_ids: List[int],
|
| 175 |
+
device: Device,
|
| 176 |
+
extra_hash: Optional[int] = None) -> Block:
|
| 177 |
+
"""Allocates a new immutable block with the provided token IDs on the
|
| 178 |
+
specified device.
|
| 179 |
+
|
| 180 |
+
Args:
|
| 181 |
+
prev_block (Optional[Block]): The previous block in the sequence.
|
| 182 |
+
Used for prefix hashing.
|
| 183 |
+
token_ids (List[int]): The list of token IDs to be stored in the new
|
| 184 |
+
block.
|
| 185 |
+
device (Device): The device on which to allocate the new block.
|
| 186 |
+
extra_hash (Optional[int]): The hash value of additional
|
| 187 |
+
factors, such as adapters, that influence the block hash
|
| 188 |
+
in the prefix caching block.
|
| 189 |
+
|
| 190 |
+
Returns:
|
| 191 |
+
Block: The newly allocated immutable block containing the provided
|
| 192 |
+
token IDs.
|
| 193 |
+
"""
|
| 194 |
+
return self._allocators[device].allocate_immutable_block(
|
| 195 |
+
prev_block, token_ids, extra_hash=extra_hash)
|
| 196 |
+
|
| 197 |
+
def free(self, block: Block) -> None:
|
| 198 |
+
"""Frees the memory occupied by the given block.
|
| 199 |
+
|
| 200 |
+
Args:
|
| 201 |
+
block (Block): The block to be freed.
|
| 202 |
+
"""
|
| 203 |
+
# Null block should never be freed
|
| 204 |
+
if isinstance(block, NullBlock):
|
| 205 |
+
return
|
| 206 |
+
block_id = block.block_id
|
| 207 |
+
assert block_id is not None
|
| 208 |
+
allocator = self._block_ids_to_allocator[block_id]
|
| 209 |
+
allocator.free(block)
|
| 210 |
+
|
| 211 |
+
def fork(self, last_block: Block) -> List[Block]:
|
| 212 |
+
"""Creates a new sequence of blocks that shares the same underlying
|
| 213 |
+
memory as the original sequence.
|
| 214 |
+
|
| 215 |
+
Args:
|
| 216 |
+
last_block (Block): The last block in the original sequence.
|
| 217 |
+
|
| 218 |
+
Returns:
|
| 219 |
+
List[Block]: A new list of blocks that shares the same memory as the
|
| 220 |
+
original sequence.
|
| 221 |
+
"""
|
| 222 |
+
# do not attempt to fork the null block
|
| 223 |
+
assert not isinstance(last_block, NullBlock)
|
| 224 |
+
block_id = last_block.block_id
|
| 225 |
+
assert block_id is not None
|
| 226 |
+
allocator = self._block_ids_to_allocator[block_id]
|
| 227 |
+
return allocator.fork(last_block)
|
| 228 |
+
|
| 229 |
+
def get_num_free_blocks(self, device: Device) -> int:
|
| 230 |
+
"""Returns the number of free blocks available on the specified device.
|
| 231 |
+
|
| 232 |
+
Args:
|
| 233 |
+
device (Device): The device for which to query the number of free
|
| 234 |
+
blocks. AssertionError is raised if None is passed.
|
| 235 |
+
|
| 236 |
+
Returns:
|
| 237 |
+
int: The number of free blocks available on the specified device.
|
| 238 |
+
"""
|
| 239 |
+
return self._allocators[device].get_num_free_blocks()
|
| 240 |
+
|
| 241 |
+
def get_num_total_blocks(self, device: Device) -> int:
|
| 242 |
+
return self._allocators[device].get_num_total_blocks()
|
| 243 |
+
|
| 244 |
+
def get_physical_block_id(self, device: Device, absolute_id: int) -> int:
|
| 245 |
+
"""Returns the zero-offset block id on certain device given the
|
| 246 |
+
absolute block id.
|
| 247 |
+
|
| 248 |
+
Args:
|
| 249 |
+
device (Device): The device for which to query relative block id.
|
| 250 |
+
absolute_id (int): The absolute block id for the block in
|
| 251 |
+
whole allocator.
|
| 252 |
+
|
| 253 |
+
Returns:
|
| 254 |
+
int: The zero-offset block id on certain device.
|
| 255 |
+
"""
|
| 256 |
+
return self._allocators[device].get_physical_block_id(absolute_id)
|
| 257 |
+
|
| 258 |
+
def swap(self, blocks: List[Block], src_device: Device,
|
| 259 |
+
dst_device: Device) -> Dict[int, int]:
|
| 260 |
+
"""Execute the swap for the given blocks from source_device
|
| 261 |
+
on to dest_device, save the current swap mapping and append
|
| 262 |
+
them to the accumulated `self._swap_mapping` for each
|
| 263 |
+
scheduling move.
|
| 264 |
+
|
| 265 |
+
Args:
|
| 266 |
+
blocks: List of blocks to be swapped.
|
| 267 |
+
src_device (Device): Device to swap the 'blocks' from.
|
| 268 |
+
dst_device (Device): Device to swap the 'blocks' to.
|
| 269 |
+
|
| 270 |
+
Returns:
|
| 271 |
+
Dict[int, int]: Swap mapping from source_device
|
| 272 |
+
on to dest_device.
|
| 273 |
+
"""
|
| 274 |
+
src_block_ids = [block.block_id for block in blocks]
|
| 275 |
+
self._allocators[src_device].swap_out(blocks)
|
| 276 |
+
self._allocators[dst_device].swap_in(blocks)
|
| 277 |
+
dst_block_ids = [block.block_id for block in blocks]
|
| 278 |
+
|
| 279 |
+
current_swap_mapping: Dict[int, int] = {}
|
| 280 |
+
for src_block_id, dst_block_id in zip(src_block_ids, dst_block_ids):
|
| 281 |
+
if src_block_id is not None and dst_block_id is not None:
|
| 282 |
+
self._swap_mapping[src_block_id] = dst_block_id
|
| 283 |
+
current_swap_mapping[src_block_id] = dst_block_id
|
| 284 |
+
return current_swap_mapping
|
| 285 |
+
|
| 286 |
+
def get_num_full_blocks_touched(self, blocks: List[Block],
|
| 287 |
+
device: Device) -> int:
|
| 288 |
+
"""Returns the number of full blocks that will be touched by
|
| 289 |
+
swapping in/out the given blocks on to the 'device'.
|
| 290 |
+
|
| 291 |
+
Args:
|
| 292 |
+
blocks: List of blocks to be swapped.
|
| 293 |
+
device (Device): Device to swap the 'blocks' on.
|
| 294 |
+
|
| 295 |
+
Returns:
|
| 296 |
+
int: the number of full blocks that will be touched by
|
| 297 |
+
swapping in/out the given blocks on to the 'device'.
|
| 298 |
+
Non full blocks are ignored when deciding the number
|
| 299 |
+
of blocks to touch.
|
| 300 |
+
"""
|
| 301 |
+
return self._allocators[device].get_num_full_blocks_touched(blocks)
|
| 302 |
+
|
| 303 |
+
def clear_copy_on_writes(self) -> List[Tuple[int, int]]:
|
| 304 |
+
"""Clears the copy-on-write (CoW) state and returns the mapping of
|
| 305 |
+
source to destination block IDs.
|
| 306 |
+
|
| 307 |
+
Returns:
|
| 308 |
+
List[Tuple[int, int]]: A list mapping source block IDs to
|
| 309 |
+
destination block IDs.
|
| 310 |
+
"""
|
| 311 |
+
# CoW only supported on GPU
|
| 312 |
+
device = Device.GPU
|
| 313 |
+
return self._allocators[device].clear_copy_on_writes()
|
| 314 |
+
|
| 315 |
+
def mark_blocks_as_accessed(self, block_ids: List[int],
|
| 316 |
+
now: float) -> None:
|
| 317 |
+
"""Mark blocks as accessed, only use for prefix caching."""
|
| 318 |
+
# Prefix caching only supported on GPU.
|
| 319 |
+
device = Device.GPU
|
| 320 |
+
return self._allocators[device].mark_blocks_as_accessed(block_ids, now)
|
| 321 |
+
|
| 322 |
+
def mark_blocks_as_computed(self, block_ids: List[int]) -> None:
|
| 323 |
+
"""Mark blocks as accessed, only use for prefix caching."""
|
| 324 |
+
# Prefix caching only supported on GPU.
|
| 325 |
+
device = Device.GPU
|
| 326 |
+
return self._allocators[device].mark_blocks_as_computed(block_ids)
|
| 327 |
+
|
| 328 |
+
def get_common_computed_block_ids(
|
| 329 |
+
self, computed_seq_block_ids: List[List[int]]) -> List[int]:
|
| 330 |
+
# Prefix caching only supported on GPU.
|
| 331 |
+
device = Device.GPU
|
| 332 |
+
return self._allocators[device].get_common_computed_block_ids(
|
| 333 |
+
computed_seq_block_ids)
|
| 334 |
+
|
| 335 |
+
@property
|
| 336 |
+
def all_block_ids(self) -> FrozenSet[int]:
|
| 337 |
+
return frozenset(self._block_ids_to_allocator.keys())
|
| 338 |
+
|
| 339 |
+
def get_prefix_cache_hit_rate(self, device: Device) -> float:
|
| 340 |
+
"""Prefix cache hit rate. -1 means not supported or disabled."""
|
| 341 |
+
assert device in self._allocators
|
| 342 |
+
return self._allocators[device].get_prefix_cache_hit_rate()
|
| 343 |
+
|
| 344 |
+
def reset_prefix_cache(self) -> bool:
|
| 345 |
+
"""Reset prefix cache for all devices."""
|
| 346 |
+
success = True
|
| 347 |
+
for allocator in self._allocators.values():
|
| 348 |
+
success = success and allocator.reset_prefix_cache()
|
| 349 |
+
return success
|
| 350 |
+
|
| 351 |
+
def get_and_reset_swaps(self) -> List[Tuple[int, int]]:
|
| 352 |
+
"""Returns and clears the mapping of source to destination block IDs.
|
| 353 |
+
Will be called after every swapping operations for now, and after every
|
| 354 |
+
schedule when BlockManagerV2 become default. Currently not useful.
|
| 355 |
+
|
| 356 |
+
Returns:
|
| 357 |
+
List[Tuple[int, int]]: A mapping of source to destination block IDs.
|
| 358 |
+
"""
|
| 359 |
+
mapping = self._swap_mapping.copy()
|
| 360 |
+
self._swap_mapping.clear()
|
| 361 |
+
return list(mapping.items())
|
| 362 |
+
|
| 363 |
+
def find_cached_blocks_prefix(
|
| 364 |
+
self,
|
| 365 |
+
block_hashes: List[int],
|
| 366 |
+
device: Device = Device.GPU,
|
| 367 |
+
) -> List[int]:
|
| 368 |
+
return self._allocators[device].find_cached_blocks_prefix(block_hashes)
|
| 369 |
+
|
| 370 |
+
|
| 371 |
+
class NullBlock(Block):
|
| 372 |
+
"""
|
| 373 |
+
Null blocks are used as a placeholders for KV cache blocks that have
|
| 374 |
+
been dropped due to sliding window.
|
| 375 |
+
This implementation just wraps an ordinary block and prevents it from
|
| 376 |
+
being modified. It also allows for testing if a block is NullBlock
|
| 377 |
+
via isinstance().
|
| 378 |
+
"""
|
| 379 |
+
|
| 380 |
+
def __init__(self, proxy: Block):
|
| 381 |
+
super().__init__()
|
| 382 |
+
self._proxy = proxy
|
| 383 |
+
|
| 384 |
+
def append_token_ids(self, token_ids: List[BlockId]):
|
| 385 |
+
raise ValueError("null block should not be modified")
|
| 386 |
+
|
| 387 |
+
@property
|
| 388 |
+
def block_id(self):
|
| 389 |
+
return self._proxy.block_id
|
| 390 |
+
|
| 391 |
+
@block_id.setter
|
| 392 |
+
def block_id(self, value: Optional[BlockId]):
|
| 393 |
+
raise ValueError("null block should not be modified")
|
| 394 |
+
|
| 395 |
+
@property
|
| 396 |
+
def token_ids(self) -> List[BlockId]:
|
| 397 |
+
return self._proxy.token_ids
|
| 398 |
+
|
| 399 |
+
@property
|
| 400 |
+
def num_tokens_total(self) -> int:
|
| 401 |
+
raise NotImplementedError(
|
| 402 |
+
"num_tokens_total is not used for null block")
|
| 403 |
+
|
| 404 |
+
@property
|
| 405 |
+
def num_empty_slots(self) -> BlockId:
|
| 406 |
+
return self._proxy.num_empty_slots
|
| 407 |
+
|
| 408 |
+
@property
|
| 409 |
+
def is_full(self):
|
| 410 |
+
return self._proxy.is_full
|
| 411 |
+
|
| 412 |
+
@property
|
| 413 |
+
def prev_block(self):
|
| 414 |
+
return self._proxy.prev_block
|
| 415 |
+
|
| 416 |
+
@property
|
| 417 |
+
def extra_hash(self):
|
| 418 |
+
return None
|
| 419 |
+
|
| 420 |
+
@property
|
| 421 |
+
def computed(self):
|
| 422 |
+
return self._proxy.computed
|
| 423 |
+
|
| 424 |
+
@computed.setter
|
| 425 |
+
def computed(self, value):
|
| 426 |
+
self._proxy.computed = value
|
| 427 |
+
|
| 428 |
+
@property
|
| 429 |
+
def last_accessed(self) -> float:
|
| 430 |
+
return self._proxy.last_accessed
|
| 431 |
+
|
| 432 |
+
@last_accessed.setter
|
| 433 |
+
def last_accessed(self, last_accessed_ts: float):
|
| 434 |
+
self._proxy.last_accessed = last_accessed_ts
|
| 435 |
+
|
| 436 |
+
@property
|
| 437 |
+
def content_hash(self):
|
| 438 |
+
return self._proxy.content_hash
|
.venv/lib/python3.11/site-packages/vllm/core/block/interfaces.py
ADDED
|
@@ -0,0 +1,318 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SPDX-License-Identifier: Apache-2.0
|
| 2 |
+
|
| 3 |
+
from abc import ABC, abstractmethod
|
| 4 |
+
from typing import Dict, FrozenSet, List, Optional, Protocol, Tuple
|
| 5 |
+
|
| 6 |
+
from vllm.utils import Device
|
| 7 |
+
|
| 8 |
+
BlockId = int
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class Block(ABC):
|
| 12 |
+
|
| 13 |
+
@abstractmethod
|
| 14 |
+
def append_token_ids(self, token_ids: List[int]) -> None:
|
| 15 |
+
pass
|
| 16 |
+
|
| 17 |
+
@property
|
| 18 |
+
@abstractmethod
|
| 19 |
+
def block_id(self) -> Optional[int]:
|
| 20 |
+
pass
|
| 21 |
+
|
| 22 |
+
@block_id.setter
|
| 23 |
+
@abstractmethod
|
| 24 |
+
def block_id(self, value: Optional[int]) -> None:
|
| 25 |
+
"""NOTE: Do not use this API outside Block."""
|
| 26 |
+
self._block_id = value
|
| 27 |
+
|
| 28 |
+
@property
|
| 29 |
+
@abstractmethod
|
| 30 |
+
def token_ids(self) -> List[int]:
|
| 31 |
+
pass
|
| 32 |
+
|
| 33 |
+
@property
|
| 34 |
+
@abstractmethod
|
| 35 |
+
def num_tokens_total(self) -> int:
|
| 36 |
+
"""The number of tokens till the current block (inclusive)
|
| 37 |
+
"""
|
| 38 |
+
pass
|
| 39 |
+
|
| 40 |
+
@property
|
| 41 |
+
@abstractmethod
|
| 42 |
+
def num_empty_slots(self) -> int:
|
| 43 |
+
pass
|
| 44 |
+
|
| 45 |
+
@property
|
| 46 |
+
@abstractmethod
|
| 47 |
+
def is_full(self) -> bool:
|
| 48 |
+
pass
|
| 49 |
+
|
| 50 |
+
@property
|
| 51 |
+
@abstractmethod
|
| 52 |
+
def prev_block(self) -> Optional["Block"]:
|
| 53 |
+
pass
|
| 54 |
+
|
| 55 |
+
@property
|
| 56 |
+
@abstractmethod
|
| 57 |
+
def extra_hash(self) -> Optional[int]:
|
| 58 |
+
return None
|
| 59 |
+
|
| 60 |
+
@property
|
| 61 |
+
@abstractmethod
|
| 62 |
+
def computed(self) -> bool:
|
| 63 |
+
raise NotImplementedError
|
| 64 |
+
|
| 65 |
+
@computed.setter
|
| 66 |
+
@abstractmethod
|
| 67 |
+
def computed(self, value) -> bool:
|
| 68 |
+
"""Should be only used by PrefixCacingAllocator"""
|
| 69 |
+
raise NotImplementedError
|
| 70 |
+
|
| 71 |
+
@property
|
| 72 |
+
@abstractmethod
|
| 73 |
+
def last_accessed(self) -> float:
|
| 74 |
+
raise NotImplementedError
|
| 75 |
+
|
| 76 |
+
@last_accessed.setter
|
| 77 |
+
@abstractmethod
|
| 78 |
+
def last_accessed(self, last_accessed_ts: float):
|
| 79 |
+
raise NotImplementedError
|
| 80 |
+
|
| 81 |
+
class Factory(Protocol):
|
| 82 |
+
|
| 83 |
+
@abstractmethod
|
| 84 |
+
def __call__(
|
| 85 |
+
self,
|
| 86 |
+
prev_block: Optional["Block"],
|
| 87 |
+
token_ids: List[int],
|
| 88 |
+
block_size: int,
|
| 89 |
+
allocator: "BlockAllocator",
|
| 90 |
+
block_id: Optional[int] = None,
|
| 91 |
+
computed: bool = False,
|
| 92 |
+
extra_hash: Optional[int] = None,
|
| 93 |
+
) -> "Block":
|
| 94 |
+
pass
|
| 95 |
+
|
| 96 |
+
@property
|
| 97 |
+
@abstractmethod
|
| 98 |
+
def content_hash(self) -> Optional[int]:
|
| 99 |
+
"""Return the content-based hash of the current block, or None if it is
|
| 100 |
+
not yet defined or not supported.
|
| 101 |
+
|
| 102 |
+
For the content-based hash to be defined, the current block must be
|
| 103 |
+
full.
|
| 104 |
+
"""
|
| 105 |
+
return None
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
class BlockAllocator(ABC):
|
| 109 |
+
|
| 110 |
+
@abstractmethod
|
| 111 |
+
def allocate_mutable_block(self, prev_block: Optional[Block],
|
| 112 |
+
extra_hash: Optional[int]) -> Block:
|
| 113 |
+
pass
|
| 114 |
+
|
| 115 |
+
@abstractmethod
|
| 116 |
+
def allocate_immutable_block(self, prev_block: Optional[Block],
|
| 117 |
+
token_ids: List[int],
|
| 118 |
+
extra_hash: Optional[int]) -> Block:
|
| 119 |
+
pass
|
| 120 |
+
|
| 121 |
+
@abstractmethod
|
| 122 |
+
def allocate_immutable_blocks(self, prev_block: Optional[Block],
|
| 123 |
+
block_token_ids: List[List[int]],
|
| 124 |
+
extra_hash: Optional[int]) -> List[Block]:
|
| 125 |
+
pass
|
| 126 |
+
|
| 127 |
+
@abstractmethod
|
| 128 |
+
def free(self, block: Block) -> None:
|
| 129 |
+
pass
|
| 130 |
+
|
| 131 |
+
@abstractmethod
|
| 132 |
+
def fork(self, last_block: Block) -> List[Block]:
|
| 133 |
+
pass
|
| 134 |
+
|
| 135 |
+
@abstractmethod
|
| 136 |
+
def get_num_total_blocks(self) -> int:
|
| 137 |
+
pass
|
| 138 |
+
|
| 139 |
+
@abstractmethod
|
| 140 |
+
def get_num_free_blocks(self) -> int:
|
| 141 |
+
pass
|
| 142 |
+
|
| 143 |
+
@abstractmethod
|
| 144 |
+
def get_physical_block_id(self, absolute_id: int) -> int:
|
| 145 |
+
pass
|
| 146 |
+
|
| 147 |
+
@abstractmethod
|
| 148 |
+
def swap_out(self, blocks: List[Block]) -> None:
|
| 149 |
+
pass
|
| 150 |
+
|
| 151 |
+
@abstractmethod
|
| 152 |
+
def swap_in(self, blocks: List[Block]) -> None:
|
| 153 |
+
pass
|
| 154 |
+
|
| 155 |
+
@property
|
| 156 |
+
@abstractmethod
|
| 157 |
+
def all_block_ids(self) -> FrozenSet[int]:
|
| 158 |
+
pass
|
| 159 |
+
|
| 160 |
+
@abstractmethod
|
| 161 |
+
def clear_copy_on_writes(self) -> List[Tuple[int, int]]:
|
| 162 |
+
pass
|
| 163 |
+
|
| 164 |
+
@abstractmethod
|
| 165 |
+
def mark_blocks_as_accessed(self, block_ids: List[int],
|
| 166 |
+
now: float) -> None:
|
| 167 |
+
pass
|
| 168 |
+
|
| 169 |
+
@abstractmethod
|
| 170 |
+
def mark_blocks_as_computed(self, block_ids: List[int]) -> None:
|
| 171 |
+
pass
|
| 172 |
+
|
| 173 |
+
@abstractmethod
|
| 174 |
+
def get_common_computed_block_ids(
|
| 175 |
+
self, computed_seq_block_ids: List[List[int]]) -> List[int]:
|
| 176 |
+
pass
|
| 177 |
+
|
| 178 |
+
@abstractmethod
|
| 179 |
+
def cow_block_if_not_appendable(self, block: Block) -> BlockId:
|
| 180 |
+
"""NOTE: This should not be used besides Block"""
|
| 181 |
+
pass
|
| 182 |
+
|
| 183 |
+
@abstractmethod
|
| 184 |
+
def promote_to_immutable_block(self, block: Block) -> BlockId:
|
| 185 |
+
"""NOTE: This should not be used besides Block"""
|
| 186 |
+
pass
|
| 187 |
+
|
| 188 |
+
@abstractmethod
|
| 189 |
+
def get_num_full_blocks_touched(self, blocks: List[Block]) -> int:
|
| 190 |
+
pass
|
| 191 |
+
|
| 192 |
+
@abstractmethod
|
| 193 |
+
def get_prefix_cache_hit_rate(self) -> float:
|
| 194 |
+
"""Prefix cache hit rate. -1 means not supported or disabled."""
|
| 195 |
+
pass
|
| 196 |
+
|
| 197 |
+
@abstractmethod
|
| 198 |
+
def reset_prefix_cache(self) -> bool:
|
| 199 |
+
"""Reset prefix cache."""
|
| 200 |
+
pass
|
| 201 |
+
|
| 202 |
+
class NoFreeBlocksError(ValueError):
|
| 203 |
+
pass
|
| 204 |
+
|
| 205 |
+
@abstractmethod
|
| 206 |
+
def find_cached_blocks_prefix(
|
| 207 |
+
self,
|
| 208 |
+
block_hashes: List[int],
|
| 209 |
+
) -> List[int]:
|
| 210 |
+
pass
|
| 211 |
+
|
| 212 |
+
|
| 213 |
+
class DeviceAwareBlockAllocator(ABC):
|
| 214 |
+
|
| 215 |
+
@abstractmethod
|
| 216 |
+
def allocate_mutable_block(self,
|
| 217 |
+
prev_block: Optional[Block],
|
| 218 |
+
device: Device,
|
| 219 |
+
extra_hash: Optional[int] = None) -> Block:
|
| 220 |
+
pass
|
| 221 |
+
|
| 222 |
+
@abstractmethod
|
| 223 |
+
def allocate_immutable_block(self,
|
| 224 |
+
prev_block: Optional[Block],
|
| 225 |
+
token_ids: List[int],
|
| 226 |
+
device: Device,
|
| 227 |
+
extra_hash: Optional[int] = None) -> Block:
|
| 228 |
+
pass
|
| 229 |
+
|
| 230 |
+
@abstractmethod
|
| 231 |
+
def allocate_immutable_blocks(
|
| 232 |
+
self,
|
| 233 |
+
prev_block: Optional[Block],
|
| 234 |
+
block_token_ids: List[List[int]],
|
| 235 |
+
device: Device,
|
| 236 |
+
extra_hash: Optional[int] = None,
|
| 237 |
+
) -> List[Block]:
|
| 238 |
+
pass
|
| 239 |
+
|
| 240 |
+
@abstractmethod
|
| 241 |
+
def get_num_free_blocks(self, device: Device) -> int:
|
| 242 |
+
pass
|
| 243 |
+
|
| 244 |
+
@abstractmethod
|
| 245 |
+
def get_num_total_blocks(self, device: Device) -> int:
|
| 246 |
+
pass
|
| 247 |
+
|
| 248 |
+
@abstractmethod
|
| 249 |
+
def free(self, block: Block) -> None:
|
| 250 |
+
pass
|
| 251 |
+
|
| 252 |
+
@abstractmethod
|
| 253 |
+
def fork(self, last_block: Block) -> List[Block]:
|
| 254 |
+
pass
|
| 255 |
+
|
| 256 |
+
@property
|
| 257 |
+
@abstractmethod
|
| 258 |
+
def all_block_ids(self) -> FrozenSet[int]:
|
| 259 |
+
pass
|
| 260 |
+
|
| 261 |
+
@abstractmethod
|
| 262 |
+
def clear_copy_on_writes(self) -> List[Tuple[int, int]]:
|
| 263 |
+
pass
|
| 264 |
+
|
| 265 |
+
@abstractmethod
|
| 266 |
+
def mark_blocks_as_accessed(self, block_ids: List[int],
|
| 267 |
+
now: float) -> None:
|
| 268 |
+
pass
|
| 269 |
+
|
| 270 |
+
@abstractmethod
|
| 271 |
+
def mark_blocks_as_computed(self, block_ids: List[int]) -> None:
|
| 272 |
+
pass
|
| 273 |
+
|
| 274 |
+
@abstractmethod
|
| 275 |
+
def get_common_computed_block_ids(
|
| 276 |
+
self, computed_seq_block_ids: List[List[int]]) -> List[int]:
|
| 277 |
+
pass
|
| 278 |
+
|
| 279 |
+
@abstractmethod
|
| 280 |
+
def get_num_full_blocks_touched(self, blocks: List[Block],
|
| 281 |
+
device: Device) -> int:
|
| 282 |
+
pass
|
| 283 |
+
|
| 284 |
+
@abstractmethod
|
| 285 |
+
def swap(self, blocks: List[Block], src_device: Device,
|
| 286 |
+
dst_device: Device) -> Dict[int, int]:
|
| 287 |
+
pass
|
| 288 |
+
|
| 289 |
+
@abstractmethod
|
| 290 |
+
def get_physical_block_id(self, device: Device, absolute_id: int) -> int:
|
| 291 |
+
pass
|
| 292 |
+
|
| 293 |
+
@abstractmethod
|
| 294 |
+
def allocate_or_get_null_block(self) -> Block:
|
| 295 |
+
"""
|
| 296 |
+
Null blocks are used as a placeholders for KV cache blocks that have
|
| 297 |
+
been dropped due to sliding window.
|
| 298 |
+
There is at most one null block per allocator.
|
| 299 |
+
"""
|
| 300 |
+
pass
|
| 301 |
+
|
| 302 |
+
@abstractmethod
|
| 303 |
+
def get_prefix_cache_hit_rate(self, device: Device) -> float:
|
| 304 |
+
"""Prefix cache hit rate. -1 means not supported or disabled."""
|
| 305 |
+
pass
|
| 306 |
+
|
| 307 |
+
@abstractmethod
|
| 308 |
+
def reset_prefix_cache(self) -> bool:
|
| 309 |
+
"""Reset prefix cache."""
|
| 310 |
+
pass
|
| 311 |
+
|
| 312 |
+
@abstractmethod
|
| 313 |
+
def find_cached_blocks_prefix(
|
| 314 |
+
self,
|
| 315 |
+
block_hashes: List[int],
|
| 316 |
+
device: Device = Device.GPU,
|
| 317 |
+
) -> List[int]:
|
| 318 |
+
pass
|
.venv/lib/python3.11/site-packages/vllm/core/block/naive_block.py
ADDED
|
@@ -0,0 +1,465 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SPDX-License-Identifier: Apache-2.0
|
| 2 |
+
|
| 3 |
+
from collections import deque
|
| 4 |
+
from typing import Deque, FrozenSet, Iterable, List, Optional, Tuple, Union
|
| 5 |
+
|
| 6 |
+
from vllm.core.block.common import (BlockPool, CopyOnWriteTracker, RefCounter,
|
| 7 |
+
get_all_blocks_recursively)
|
| 8 |
+
from vllm.core.block.interfaces import Block, BlockAllocator, BlockId, Device
|
| 9 |
+
|
| 10 |
+
Refcount = int
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class NaiveBlockAllocator(BlockAllocator):
|
| 14 |
+
"""A simple block allocator that manages blocks of memory without prefix
|
| 15 |
+
caching.
|
| 16 |
+
|
| 17 |
+
Args:
|
| 18 |
+
create_block (Block.Factory): A factory function for creating new
|
| 19 |
+
blocks. This is used when a NaiveBlockAllocator is composed within
|
| 20 |
+
a prefix caching allocator -- the naive block allocator must
|
| 21 |
+
construct prefix caching blocks (but shouldn't know anything else
|
| 22 |
+
about them).
|
| 23 |
+
num_blocks (int): The total number of blocks to manage.
|
| 24 |
+
block_size (int): The size of each block in tokens.
|
| 25 |
+
block_ids (Optional[Iterable[int]], optional): An optional iterable of
|
| 26 |
+
block IDs. If not provided, block IDs will be assigned sequentially
|
| 27 |
+
from 0 to num_blocks - 1.
|
| 28 |
+
"""
|
| 29 |
+
|
| 30 |
+
def __init__(
|
| 31 |
+
self,
|
| 32 |
+
create_block: Block.Factory,
|
| 33 |
+
num_blocks: int,
|
| 34 |
+
block_size: int,
|
| 35 |
+
block_ids: Optional[Iterable[int]] = None,
|
| 36 |
+
block_pool: Optional[BlockPool] = None,
|
| 37 |
+
):
|
| 38 |
+
if block_ids is None:
|
| 39 |
+
block_ids = range(num_blocks)
|
| 40 |
+
|
| 41 |
+
self._free_block_indices: Deque[BlockId] = deque(block_ids)
|
| 42 |
+
self._all_block_indices = frozenset(block_ids)
|
| 43 |
+
assert len(self._all_block_indices) == num_blocks
|
| 44 |
+
|
| 45 |
+
self._refcounter = RefCounter(
|
| 46 |
+
all_block_indices=self._free_block_indices)
|
| 47 |
+
self._block_size = block_size
|
| 48 |
+
|
| 49 |
+
self._cow_tracker = CopyOnWriteTracker(
|
| 50 |
+
refcounter=self._refcounter.as_readonly())
|
| 51 |
+
|
| 52 |
+
if block_pool is None:
|
| 53 |
+
extra_factor = 4
|
| 54 |
+
# Pre-allocate "num_blocks * extra_factor" block objects.
|
| 55 |
+
# The "* extra_factor" is a buffer to allow more block objects
|
| 56 |
+
# than physical blocks
|
| 57 |
+
self._block_pool = BlockPool(self._block_size, create_block, self,
|
| 58 |
+
num_blocks * extra_factor)
|
| 59 |
+
else:
|
| 60 |
+
# In this case, the block pool is provided by the caller,
|
| 61 |
+
# which means that there is most likely a need to share
|
| 62 |
+
# a block pool between allocators
|
| 63 |
+
self._block_pool = block_pool
|
| 64 |
+
|
| 65 |
+
def allocate_immutable_block(self,
|
| 66 |
+
prev_block: Optional[Block],
|
| 67 |
+
token_ids: List[int],
|
| 68 |
+
extra_hash: Optional[int] = None,
|
| 69 |
+
device: Optional[Device] = None) -> Block:
|
| 70 |
+
"""Allocates a new immutable block with the given token IDs, linked to
|
| 71 |
+
the previous block.
|
| 72 |
+
|
| 73 |
+
Args:
|
| 74 |
+
prev_block (Optional[Block]): The previous block in the sequence. If
|
| 75 |
+
None, then the block to be allocated is the first block in the
|
| 76 |
+
sequence.
|
| 77 |
+
token_ids (List[int]): The token IDs to be stored in the new block.
|
| 78 |
+
|
| 79 |
+
Returns:
|
| 80 |
+
Block: The newly allocated immutable block.
|
| 81 |
+
"""
|
| 82 |
+
assert device is None
|
| 83 |
+
block = self.allocate_mutable_block(prev_block=prev_block)
|
| 84 |
+
block.append_token_ids(token_ids)
|
| 85 |
+
return block
|
| 86 |
+
|
| 87 |
+
def allocate_immutable_blocks(
|
| 88 |
+
self,
|
| 89 |
+
prev_block: Optional[Block],
|
| 90 |
+
block_token_ids: List[List[int]],
|
| 91 |
+
extra_hash: Optional[int] = None,
|
| 92 |
+
device: Optional[Device] = None) -> List[Block]:
|
| 93 |
+
assert device is None
|
| 94 |
+
num_blocks = len(block_token_ids)
|
| 95 |
+
|
| 96 |
+
block_ids = []
|
| 97 |
+
for i in range(num_blocks):
|
| 98 |
+
block_ids.append(self._allocate_block_id())
|
| 99 |
+
|
| 100 |
+
blocks = []
|
| 101 |
+
for i in range(num_blocks):
|
| 102 |
+
prev_block = self._block_pool.init_block(
|
| 103 |
+
prev_block=prev_block,
|
| 104 |
+
token_ids=block_token_ids[i],
|
| 105 |
+
block_size=self._block_size,
|
| 106 |
+
physical_block_id=block_ids[i])
|
| 107 |
+
blocks.append(prev_block)
|
| 108 |
+
|
| 109 |
+
return blocks
|
| 110 |
+
|
| 111 |
+
def allocate_mutable_block(self,
|
| 112 |
+
prev_block: Optional[Block],
|
| 113 |
+
extra_hash: Optional[int] = None,
|
| 114 |
+
device: Optional[Device] = None) -> Block:
|
| 115 |
+
"""Allocates a new mutable block, linked to the previous block.
|
| 116 |
+
|
| 117 |
+
Args:
|
| 118 |
+
prev_block (Optional[Block]): The previous block in the sequence. If
|
| 119 |
+
None, then the block to be allocated is the first block in the
|
| 120 |
+
sequence.
|
| 121 |
+
|
| 122 |
+
Returns:
|
| 123 |
+
Block: The newly allocated mutable block.
|
| 124 |
+
"""
|
| 125 |
+
assert device is None
|
| 126 |
+
block_id = self._allocate_block_id()
|
| 127 |
+
block = self._block_pool.init_block(prev_block=prev_block,
|
| 128 |
+
token_ids=[],
|
| 129 |
+
block_size=self._block_size,
|
| 130 |
+
physical_block_id=block_id)
|
| 131 |
+
return block
|
| 132 |
+
|
| 133 |
+
def _allocate_block_id(self) -> BlockId:
|
| 134 |
+
if not self._free_block_indices:
|
| 135 |
+
raise BlockAllocator.NoFreeBlocksError()
|
| 136 |
+
|
| 137 |
+
block_id = self._free_block_indices.popleft()
|
| 138 |
+
self._refcounter.incr(block_id)
|
| 139 |
+
return block_id
|
| 140 |
+
|
| 141 |
+
def _free_block_id(self, block: Union[Block, BlockId]) -> None:
|
| 142 |
+
if isinstance(block, Block):
|
| 143 |
+
block_id = block.block_id
|
| 144 |
+
block.block_id = None
|
| 145 |
+
else:
|
| 146 |
+
block_id = block
|
| 147 |
+
assert block_id is not None
|
| 148 |
+
|
| 149 |
+
refcount = self._refcounter.decr(block_id)
|
| 150 |
+
if refcount == 0:
|
| 151 |
+
self._free_block_indices.appendleft(block_id)
|
| 152 |
+
|
| 153 |
+
def free(self, block: Block, keep_block_object: bool = False) -> None:
|
| 154 |
+
# Release the physical block id
|
| 155 |
+
self._free_block_id(block)
|
| 156 |
+
|
| 157 |
+
# Release the block object
|
| 158 |
+
if not keep_block_object:
|
| 159 |
+
self._block_pool.free_block(block)
|
| 160 |
+
|
| 161 |
+
def free_block_id(self, block_id: BlockId) -> None:
|
| 162 |
+
self._free_block_id(block_id)
|
| 163 |
+
|
| 164 |
+
def fork(self, last_block: Block) -> List[Block]:
|
| 165 |
+
"""Creates a new sequence of blocks that shares the same underlying
|
| 166 |
+
memory as the original sequence.
|
| 167 |
+
|
| 168 |
+
Args:
|
| 169 |
+
last_block (Block): The last block in the original sequence.
|
| 170 |
+
|
| 171 |
+
Returns:
|
| 172 |
+
List[Block]: The new sequence of blocks that shares the same memory
|
| 173 |
+
as the original sequence.
|
| 174 |
+
"""
|
| 175 |
+
source_blocks = get_all_blocks_recursively(last_block)
|
| 176 |
+
|
| 177 |
+
forked_blocks: List[Block] = []
|
| 178 |
+
prev_block = None
|
| 179 |
+
for block in source_blocks:
|
| 180 |
+
|
| 181 |
+
# Increment refcount for each block.
|
| 182 |
+
assert block.block_id is not None
|
| 183 |
+
refcount = self._refcounter.incr(block.block_id)
|
| 184 |
+
assert refcount != 1, "can't fork free'd block"
|
| 185 |
+
|
| 186 |
+
forked_block = self._block_pool.init_block(
|
| 187 |
+
prev_block=prev_block,
|
| 188 |
+
token_ids=block.token_ids,
|
| 189 |
+
block_size=self._block_size,
|
| 190 |
+
physical_block_id=block.block_id)
|
| 191 |
+
|
| 192 |
+
forked_blocks.append(forked_block)
|
| 193 |
+
prev_block = forked_blocks[-1]
|
| 194 |
+
|
| 195 |
+
return forked_blocks
|
| 196 |
+
|
| 197 |
+
def get_num_free_blocks(self) -> int:
|
| 198 |
+
return len(self._free_block_indices)
|
| 199 |
+
|
| 200 |
+
def get_num_total_blocks(self) -> int:
|
| 201 |
+
return len(self._all_block_indices)
|
| 202 |
+
|
| 203 |
+
def get_physical_block_id(self, absolute_id: int) -> int:
|
| 204 |
+
"""Returns the zero-offset block id on certain block allocator
|
| 205 |
+
given the absolute block id.
|
| 206 |
+
|
| 207 |
+
Args:
|
| 208 |
+
absolute_id (int): The absolute block id for the block
|
| 209 |
+
in whole allocator.
|
| 210 |
+
|
| 211 |
+
Returns:
|
| 212 |
+
int: The zero-offset block id on certain device.
|
| 213 |
+
"""
|
| 214 |
+
return sorted(self._all_block_indices).index(absolute_id)
|
| 215 |
+
|
| 216 |
+
@property
|
| 217 |
+
def refcounter(self):
|
| 218 |
+
return self._refcounter
|
| 219 |
+
|
| 220 |
+
@property
|
| 221 |
+
def all_block_ids(self) -> FrozenSet[int]:
|
| 222 |
+
return self._all_block_indices
|
| 223 |
+
|
| 224 |
+
def cow_block_if_not_appendable(self, block: Block) -> BlockId:
|
| 225 |
+
"""Performs a copy-on-write operation on the given block if it is not
|
| 226 |
+
appendable.
|
| 227 |
+
|
| 228 |
+
Args:
|
| 229 |
+
block (Block): The block to check for copy-on-write.
|
| 230 |
+
|
| 231 |
+
Returns:
|
| 232 |
+
BlockId: The block index of the new block if a copy-on-write
|
| 233 |
+
operation was performed, or the original block index if
|
| 234 |
+
no copy-on-write was necessary.
|
| 235 |
+
"""
|
| 236 |
+
src_block_id = block.block_id
|
| 237 |
+
assert src_block_id is not None
|
| 238 |
+
|
| 239 |
+
if self._cow_tracker.is_appendable(block):
|
| 240 |
+
return src_block_id
|
| 241 |
+
|
| 242 |
+
self._free_block_id(block)
|
| 243 |
+
trg_block_id = self._allocate_block_id()
|
| 244 |
+
|
| 245 |
+
self._cow_tracker.record_cow(src_block_id, trg_block_id)
|
| 246 |
+
|
| 247 |
+
return trg_block_id
|
| 248 |
+
|
| 249 |
+
def clear_copy_on_writes(self) -> List[Tuple[BlockId, BlockId]]:
|
| 250 |
+
"""Returns the copy-on-write source->destination mapping and clears it.
|
| 251 |
+
|
| 252 |
+
Returns:
|
| 253 |
+
List[Tuple[BlockId, BlockId]]: A list mapping source
|
| 254 |
+
block indices to destination block indices.
|
| 255 |
+
"""
|
| 256 |
+
return self._cow_tracker.clear_cows()
|
| 257 |
+
|
| 258 |
+
def mark_blocks_as_accessed(self, block_ids: List[int],
|
| 259 |
+
now: float) -> None:
|
| 260 |
+
"""Mark blocks as accessed, used in prefix caching.
|
| 261 |
+
|
| 262 |
+
Since the naive allocator does not implement prefix caching, we do
|
| 263 |
+
nothing.
|
| 264 |
+
"""
|
| 265 |
+
pass
|
| 266 |
+
|
| 267 |
+
def mark_blocks_as_computed(self, block_ids: List[int]) -> None:
|
| 268 |
+
"""Mark blocks as computed, used in prefix caching.
|
| 269 |
+
|
| 270 |
+
Since the naive allocator does not implement prefix caching, we do
|
| 271 |
+
nothing.
|
| 272 |
+
"""
|
| 273 |
+
pass
|
| 274 |
+
|
| 275 |
+
def get_common_computed_block_ids(
|
| 276 |
+
self, computed_seq_block_ids: List[List[int]]) -> List[int]:
|
| 277 |
+
"""Determine blocks that can be skipped in prefill.
|
| 278 |
+
|
| 279 |
+
Since the naive allocator does not support prefix caching, always return
|
| 280 |
+
an empty list.
|
| 281 |
+
"""
|
| 282 |
+
return []
|
| 283 |
+
|
| 284 |
+
def promote_to_immutable_block(self, block: Block) -> BlockId:
|
| 285 |
+
raise NotImplementedError("There is no promotion for naive blocks")
|
| 286 |
+
|
| 287 |
+
def get_num_full_blocks_touched(self, blocks: List[Block]) -> int:
|
| 288 |
+
"""Returns the number of full blocks that will be touched by
|
| 289 |
+
swapping in/out.
|
| 290 |
+
|
| 291 |
+
Args:
|
| 292 |
+
blocks: List of blocks to be swapped.
|
| 293 |
+
Returns:
|
| 294 |
+
int: the number of full blocks that will be touched by
|
| 295 |
+
swapping in/out the given blocks. Non full blocks are ignored
|
| 296 |
+
when deciding the number of blocks to touch.
|
| 297 |
+
"""
|
| 298 |
+
# NOTE: for naive block, we use set to eliminate common blocks among
|
| 299 |
+
# seqs, also we compare the empty slots in the mutable blocks with
|
| 300 |
+
# lookahead slots to get the number of unique new block that are
|
| 301 |
+
# needed.
|
| 302 |
+
old_block_set = set()
|
| 303 |
+
for block in blocks:
|
| 304 |
+
if block.is_full:
|
| 305 |
+
old_block_set.add(block)
|
| 306 |
+
return len(old_block_set)
|
| 307 |
+
|
| 308 |
+
def swap_out(self, blocks: List[Block]) -> None:
|
| 309 |
+
for block in blocks:
|
| 310 |
+
self._free_block_id(block)
|
| 311 |
+
|
| 312 |
+
def swap_in(self, blocks: List[Block]) -> None:
|
| 313 |
+
for block in blocks:
|
| 314 |
+
# Here we allocate either immutable or mutable block and then
|
| 315 |
+
# extract its block_id. Note that the block object is released
|
| 316 |
+
# and the block_id is assigned to "block" to allow reusing the
|
| 317 |
+
# existing "block" object
|
| 318 |
+
if block.is_full:
|
| 319 |
+
tmp_block = self.allocate_immutable_block(
|
| 320 |
+
prev_block=block.prev_block, token_ids=block.token_ids)
|
| 321 |
+
else:
|
| 322 |
+
tmp_block = self.allocate_mutable_block(
|
| 323 |
+
prev_block=block.prev_block)
|
| 324 |
+
tmp_block.append_token_ids(block.token_ids)
|
| 325 |
+
|
| 326 |
+
block_id = tmp_block.block_id
|
| 327 |
+
tmp_block.block_id = None
|
| 328 |
+
self._block_pool.free_block(tmp_block)
|
| 329 |
+
|
| 330 |
+
block.block_id = block_id # Assign block_id
|
| 331 |
+
|
| 332 |
+
def get_prefix_cache_hit_rate(self) -> float:
|
| 333 |
+
return -1
|
| 334 |
+
|
| 335 |
+
def reset_prefix_cache(self) -> bool:
|
| 336 |
+
"""No prefix cache for naive block allocator."""
|
| 337 |
+
return True
|
| 338 |
+
|
| 339 |
+
def find_cached_blocks_prefix(self, block_hashes: List[int]) -> List[int]:
|
| 340 |
+
# Not applicable for naive block allocator.
|
| 341 |
+
return []
|
| 342 |
+
|
| 343 |
+
|
| 344 |
+
class NaiveBlock(Block):
    """An implementation of the Block class that does not support prefix
    caching.

    The NaiveBlock class represents a block of token IDs with a fixed size. It
    provides methods for appending token IDs to the block and manages copy-on
    -write operations when necessary.

    Args:
        prev_block (Block): The previous block in the sequence.
        token_ids (List[int]): The initial token IDs to be stored in the block.
        block_size (int): The maximum number of token IDs that can be stored in
            the block.
        allocator (BlockAllocator): The block allocator associated with this
            block.
        block_id (Optional[int], optional): The physical block index
            of this block. Defaults to None, which means no allocation has been
            made.
        _cow_target (Optional[Block], optional): The copy-on-write target block.
            If not provided, it defaults to self.
        extra_hash (Optional[int], optional): Accepted for interface
            compatibility; not used by the naive block.
    """

    def __init__(self,
                 prev_block: Optional[Block],
                 token_ids: List[int],
                 block_size: int,
                 allocator: BlockAllocator,
                 block_id: Optional[int] = None,
                 _cow_target: Optional[Block] = None,
                 extra_hash: Optional[int] = None):
        self._token_ids: List[int] = []
        self._block_size = block_size
        self._prev_block = prev_block
        self._block_id = block_id
        self._allocator = allocator
        # Copy-on-write is resolved against this target; by default the
        # block itself, but a wrapping block may redirect it.
        self._cow_target = _cow_target if _cow_target is not None else self

        # Initial tokens never trigger copy-on-write.
        self._append_token_ids_no_cow(token_ids)

    def append_token_ids(self, token_ids: List[int]) -> None:
        """Appends the given token IDs to the block and performs a
        copy-on-write if necessary.

        Args:
            token_ids (Optional[List[int]]): The token IDs to be appended
                to the block.
        """
        self._append_token_ids_no_cow(token_ids)

        # Only an allocated block can require copy-on-write; the allocator
        # may hand back a new physical id for this block.
        if self._block_id is not None:
            self._block_id = (self._allocator.cow_block_if_not_appendable(
                self._cow_target))

    def _append_token_ids_no_cow(self, token_ids: List[int]) -> None:
        """Appends the given token IDs to the block

        Args:
            token_ids (List[int]): The token IDs to be appended to the block.
        """
        if len(token_ids) == 0:
            return

        # Caller is responsible for never overflowing the block.
        assert len(token_ids) <= self.num_empty_slots

        self._token_ids.extend(token_ids)

    @property
    def computed(self) -> bool:
        # Not meaningful without prefix caching.
        raise NotImplementedError

    @computed.setter
    def computed(self, value) -> None:
        raise NotImplementedError

    @property
    def last_accessed(self) -> float:
        # Not meaningful without prefix caching / eviction.
        raise NotImplementedError

    @last_accessed.setter
    def last_accessed(self, last_accessed_ts: float):
        raise NotImplementedError

    @property
    def block_id(self) -> Optional[int]:
        """Physical block index, or None when unallocated."""
        return self._block_id

    @block_id.setter
    def block_id(self, value: Optional[int]) -> None:
        self._block_id = value

    @property
    def is_full(self) -> bool:
        """True when no empty token slots remain."""
        return self.num_empty_slots == 0

    @property
    def num_empty_slots(self) -> int:
        """Number of token slots still available in this block."""
        return self._block_size - len(self.token_ids)

    @property
    def token_ids(self) -> List[int]:
        """Token IDs currently stored in this block."""
        return self._token_ids

    @property
    def num_tokens_total(self) -> int:
        raise NotImplementedError(
            "num_tokens_total is not used for naive block")

    @property
    def block_size(self) -> int:
        """Maximum number of tokens this block can hold."""
        return self._block_size

    @property
    def prev_block(self) -> Optional["Block"]:
        """The previous block in the sequence, if any."""
        return self._prev_block

    @property
    def extra_hash(self):
        # Naive blocks carry no extra hash.
        return None

    @property
    def content_hash(self) -> Optional[int]:
        # Naive blocks are never content-addressed.
        return None
|
.venv/lib/python3.11/site-packages/vllm/core/block/prefix_caching_block.py
ADDED
|
@@ -0,0 +1,1134 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SPDX-License-Identifier: Apache-2.0
|
| 2 |
+
"""Token blocks."""
|
| 3 |
+
import sys
|
| 4 |
+
from bisect import bisect_left
|
| 5 |
+
from os.path import commonprefix
|
| 6 |
+
from typing import (Callable, Dict, FrozenSet, Iterable, List, Optional, Set,
|
| 7 |
+
Tuple)
|
| 8 |
+
|
| 9 |
+
from vllm.core.block.common import (CacheMetricData, CopyOnWriteTracker,
|
| 10 |
+
get_all_blocks_recursively)
|
| 11 |
+
from vllm.core.block.interfaces import (Block, BlockAllocator, BlockId, Device,
|
| 12 |
+
DeviceAwareBlockAllocator)
|
| 13 |
+
from vllm.core.block.naive_block import (BlockPool, NaiveBlock,
|
| 14 |
+
NaiveBlockAllocator)
|
| 15 |
+
from vllm.core.evictor import EvictionPolicy, Evictor, make_evictor
|
| 16 |
+
from vllm.logger import init_logger
|
| 17 |
+
from vllm.sequence import Sequence
|
| 18 |
+
|
| 19 |
+
PrefixHash = int
|
| 20 |
+
|
| 21 |
+
# By default, we init our block access time as _DEFAULT_LAST_ACCESSED_TIME
|
| 22 |
+
# so that if we find one block is still hold _DEFAULT_LAST_ACCESSED_TIME,
|
| 23 |
+
# then we know this block hasn't been accessed yet.
|
| 24 |
+
_DEFAULT_LAST_ACCESSED_TIME = -1
|
| 25 |
+
|
| 26 |
+
logger = init_logger(__name__)
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
class BlockTracker:
    """Per-physical-block bookkeeping used by the prefix caching allocator.

    Tracks whether a block id is currently in active use, when it was last
    accessed, and whether its contents have been marked as computed.
    """
    __slots__ = ("active", "last_accessed", "computed")

    def __init__(self):
        self.active: bool = False
        self.reset()

    def reset(self):
        """Clear the access-time and computed markers."""
        self.last_accessed: float = _DEFAULT_LAST_ACCESSED_TIME
        self.computed: bool = False

    def enable(self):
        """Mark the block as in use; it must currently be inactive."""
        assert not self.active
        self.active = True
        self.reset()

    def disable(self):
        """Mark the block as no longer in use; it must currently be active."""
        assert self.active
        self.active = False
        self.reset()
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
class PrefixCachingBlockAllocator(BlockAllocator):
|
| 54 |
+
"""A block allocator that implements prefix caching.
|
| 55 |
+
|
| 56 |
+
The PrefixCachingBlockAllocator maintains a cache of blocks based on their
|
| 57 |
+
content hash. It reuses blocks with the same content hash to avoid redundant
|
| 58 |
+
memory allocation. The allocator also supports copy-on-write operations.
|
| 59 |
+
|
| 60 |
+
Args:
|
| 61 |
+
num_blocks (int): The total number of blocks to manage.
|
| 62 |
+
block_size (int): The size of each block in tokens.
|
| 63 |
+
block_ids(Optional[Iterable[int]], optional): An optional iterable of
|
| 64 |
+
block IDs. If not provided, block IDs will be assigned sequentially
|
| 65 |
+
from 0 to num_blocks - 1.
|
| 66 |
+
"""
|
| 67 |
+
|
| 68 |
+
# Note that we use 'None' as a string here instead of None because
|
| 69 |
+
# as of Python 3.12, hash(None) returns a constant predictable value.
|
| 70 |
+
# This could possibly make it easier to find and exploit hash
|
| 71 |
+
# collisions. 'None' as a string will be hashed differently per process,
|
| 72 |
+
# but consistently within the same process. This is the same as the
|
| 73 |
+
# behavior of None prior to Python 3.12.
|
| 74 |
+
_none_hash: int = hash('None')
|
| 75 |
+
|
| 76 |
+
# Implements Block.Factory.
|
| 77 |
+
def __init__(
|
| 78 |
+
self,
|
| 79 |
+
num_blocks: int,
|
| 80 |
+
block_size: int,
|
| 81 |
+
block_ids: Optional[Iterable[int]] = None,
|
| 82 |
+
eviction_policy: EvictionPolicy = EvictionPolicy.LRU,
|
| 83 |
+
):
|
| 84 |
+
if block_ids is None:
|
| 85 |
+
block_ids = range(num_blocks)
|
| 86 |
+
|
| 87 |
+
self._block_size = block_size
|
| 88 |
+
|
| 89 |
+
# A mapping of prefix hash to block index. All blocks which have a
|
| 90 |
+
# prefix hash will be in this dict, even if they have refcount 0.
|
| 91 |
+
self._cached_blocks: Dict[PrefixHash, BlockId] = {}
|
| 92 |
+
|
| 93 |
+
# A list of immutable block IDs that have been touched by scheduler
|
| 94 |
+
# and should be marked as computed after an entire batch of sequences
|
| 95 |
+
# are scheduled.
|
| 96 |
+
self._touched_blocks: Set[BlockId] = set()
|
| 97 |
+
|
| 98 |
+
# Used to track status of each physical block id
|
| 99 |
+
self._block_tracker: Dict[BlockId, BlockTracker] = {}
|
| 100 |
+
for block_id in block_ids:
|
| 101 |
+
self._block_tracker[block_id] = BlockTracker()
|
| 102 |
+
|
| 103 |
+
# Pre-allocate "num_blocks * extra_factor" block objects.
|
| 104 |
+
# The "* extra_factor" is a buffer to allow more block objects
|
| 105 |
+
# than physical blocks
|
| 106 |
+
extra_factor = 4
|
| 107 |
+
self._block_pool = BlockPool(self._block_size, self._create_block,
|
| 108 |
+
self, num_blocks * extra_factor)
|
| 109 |
+
|
| 110 |
+
# An allocator for blocks that do not have prefix hashes.
|
| 111 |
+
self._hashless_allocator = NaiveBlockAllocator(
|
| 112 |
+
create_block=self._create_block, # type: ignore
|
| 113 |
+
num_blocks=num_blocks,
|
| 114 |
+
block_size=block_size,
|
| 115 |
+
block_ids=block_ids,
|
| 116 |
+
block_pool=self._block_pool, # Share block pool here
|
| 117 |
+
)
|
| 118 |
+
|
| 119 |
+
# Evitor used to maintain how we want to handle those computed blocks
|
| 120 |
+
# if we find memory pressure is high.
|
| 121 |
+
self.eviction_policy = eviction_policy
|
| 122 |
+
self.evictor: Evictor = make_evictor(self.eviction_policy)
|
| 123 |
+
|
| 124 |
+
# We share the refcounter between allocators. This allows us to promote
|
| 125 |
+
# blocks originally allocated in the hashless allocator to immutable
|
| 126 |
+
# blocks.
|
| 127 |
+
self._refcounter = self._hashless_allocator.refcounter
|
| 128 |
+
|
| 129 |
+
self._cow_tracker = CopyOnWriteTracker(
|
| 130 |
+
refcounter=self._refcounter.as_readonly())
|
| 131 |
+
|
| 132 |
+
self.metric_data = CacheMetricData()
|
| 133 |
+
|
| 134 |
+
def _create_block(
|
| 135 |
+
self,
|
| 136 |
+
prev_block: Optional[Block],
|
| 137 |
+
token_ids: List[int],
|
| 138 |
+
block_size: int,
|
| 139 |
+
allocator: BlockAllocator,
|
| 140 |
+
block_id: Optional[int] = None,
|
| 141 |
+
computed: bool = False,
|
| 142 |
+
extra_hash: Optional[int] = None,
|
| 143 |
+
) -> Block:
|
| 144 |
+
# Bind block to self.
|
| 145 |
+
allocator = self
|
| 146 |
+
|
| 147 |
+
return PrefixCachingBlock(
|
| 148 |
+
prev_block=prev_block,
|
| 149 |
+
token_ids=token_ids,
|
| 150 |
+
block_size=block_size,
|
| 151 |
+
block_id=block_id,
|
| 152 |
+
allocator=allocator,
|
| 153 |
+
computed=computed,
|
| 154 |
+
extra_hash=extra_hash,
|
| 155 |
+
)
|
| 156 |
+
|
| 157 |
+
    def allocate_immutable_block(self,
                                 prev_block: Optional[Block],
                                 token_ids: List[int],
                                 extra_hash: Optional[int] = None,
                                 device: Optional[Device] = None) -> Block:
        """Allocates an immutable block with the given token IDs, reusing cached
        blocks if possible.

        Args:
            prev_block (Optional[Block]): The previous block in the sequence.
            token_ids (List[int]): The token IDs to be stored in the block.
            extra_hash (Optional[int]): Extra value folded into the block's
                content hash (semantics defined by the caller).
            device (Optional[Device]): Must be None for this allocator.

        Returns:
            Block: The allocated immutable block.
        """
        assert device is None
        assert_prefix_caching_block_or_none(prev_block)

        # First, try to create a block that points to cached data
        block = self._block_pool.init_block(prev_block=prev_block,
                                            token_ids=token_ids,
                                            block_size=self._block_size,
                                            physical_block_id=None,
                                            extra_hash=extra_hash)
        assert block.content_hash is not None

        cached_block_id = self._cached_blocks.get(block.content_hash, None)
        if cached_block_id is not None:
            # Cache hit: point the new block object at the cached physical
            # block and bump its refcount.
            self.metric_data.query(hit=True)
            block.block_id = cached_block_id
            self._incr_refcount_cached_block(block)
            return block
        # Cache miss: the probe block object is returned to the pool.
        self.metric_data.query(hit=False)
        self._block_pool.free_block(block)

        # No cached block => Allocate a new block
        block = self.allocate_mutable_block(prev_block, extra_hash=extra_hash)
        block.append_token_ids(token_ids)
        return block
|
| 196 |
+
|
| 197 |
+
def allocate_immutable_blocks(
|
| 198 |
+
self,
|
| 199 |
+
prev_block: Optional[Block],
|
| 200 |
+
block_token_ids: List[List[int]],
|
| 201 |
+
extra_hash: Optional[int] = None,
|
| 202 |
+
device: Optional[Device] = None) -> List[Block]:
|
| 203 |
+
blocks = []
|
| 204 |
+
for token_ids in block_token_ids:
|
| 205 |
+
prev_block = self.allocate_immutable_block(prev_block=prev_block,
|
| 206 |
+
token_ids=token_ids,
|
| 207 |
+
device=device,
|
| 208 |
+
extra_hash=extra_hash)
|
| 209 |
+
blocks.append(prev_block)
|
| 210 |
+
return blocks
|
| 211 |
+
|
| 212 |
+
    def allocate_mutable_block(self,
                               prev_block: Optional[Block],
                               extra_hash: Optional[int] = None,
                               device: Optional[Device] = None) -> Block:
        """Allocates a mutable block. If there are no free blocks, this will
        evict unused cached blocks.

        Args:
            prev_block (Block): The previous block in the sequence.
                None is not allowed unlike it is super class.
            extra_hash (Optional[int]): Extra hash passed through to the new
                block.
            device (Optional[Device]): Must be None for this allocator.

        Returns:
            Block: The allocated mutable block.
        """
        assert device is None
        assert_prefix_caching_block_or_none(prev_block)

        # May raise BlockAllocator.NoFreeBlocksError when nothing is free
        # and nothing is evictable.
        block_id = self._allocate_block_id()
        block = self._block_pool.init_block(prev_block=prev_block,
                                            token_ids=[],
                                            block_size=self._block_size,
                                            physical_block_id=block_id,
                                            extra_hash=extra_hash)
        # A freshly allocated mutable block must not look cached/computed.
        assert not block.computed
        assert block.content_hash is None
        return block
|
| 238 |
+
|
| 239 |
+
    def _incr_refcount_cached_block(self, block: Block) -> None:
        """Increment the refcount of the cached physical block that ``block``
        points at, reviving it from the evictor if it had been parked there."""
        # Set this block to be "computed" since it is pointing to a
        # cached block id (which was already computed)
        block.computed = True

        block_id = block.block_id
        assert block_id is not None

        refcount = self._refcounter.incr(block_id)
        if refcount == 1:
            # Refcount went 0 -> 1: the block was unused and may have been
            # sitting in the evictor awaiting reuse.
            # In case a cached block was evicted, restore its tracking
            if block_id in self.evictor:
                self.evictor.remove(block_id)

            self._track_block_id(block_id, computed=True)
|
| 254 |
+
|
| 255 |
+
    def _decr_refcount_cached_block(self, block: Block) -> None:
        """Decrement the refcount of an immutable/cached block; when it drops
        to zero, hand the physical block to the evictor for possible reuse."""
        # Ensure this is immutable/cached block
        assert block.content_hash is not None

        block_id = block.block_id
        assert block_id is not None

        refcount = self._refcounter.decr(block_id)
        if refcount > 0:
            # Still shared by other sequences; only detach this object.
            block.block_id = None
            return
        else:
            assert refcount == 0

        # No longer used
        assert block.content_hash in self._cached_blocks

        # Add the cached block to the evictor
        # (This keeps the cached block around so it can be reused)
        self.evictor.add(block_id, block.content_hash, block.num_tokens_total,
                         self._block_tracker[block_id].last_accessed)

        # Stop tracking the block
        self._untrack_block_id(block_id)

        block.block_id = None
|
| 281 |
+
|
| 282 |
+
    def _decr_refcount_hashless_block(self, block: Block) -> None:
        """Decrement the refcount of a mutable/hashless block via the
        hashless allocator, keeping the Block object alive for the caller."""
        block_id = block.block_id
        assert block_id is not None

        # We may have a fork case where block is shared,
        # in which case, we cannot remove it from tracking
        refcount = self._refcounter.get(block_id)
        if refcount == 1:
            self._untrack_block_id(block_id)

        # Decrement refcount of the block_id, but do not free the block object
        # itself (will be handled by the caller)
        self._hashless_allocator.free(block, keep_block_object=True)
|
| 295 |
+
|
| 296 |
+
def _allocate_block_id(self) -> BlockId:
|
| 297 |
+
"""First tries to allocate a block id from the hashless allocator,
|
| 298 |
+
and if there are no blocks, then tries to evict an unused cached block.
|
| 299 |
+
"""
|
| 300 |
+
hashless_block_id = self._maybe_allocate_hashless_block_id()
|
| 301 |
+
if hashless_block_id is not None:
|
| 302 |
+
return hashless_block_id
|
| 303 |
+
|
| 304 |
+
evicted_block_id = self._maybe_allocate_evicted_block_id()
|
| 305 |
+
if evicted_block_id is not None:
|
| 306 |
+
return evicted_block_id
|
| 307 |
+
|
| 308 |
+
# No block available in hashless allocator, nor in unused cache blocks.
|
| 309 |
+
raise BlockAllocator.NoFreeBlocksError()
|
| 310 |
+
|
| 311 |
+
    def _maybe_allocate_hashless_block_id(self) -> Optional[BlockId]:
        """Try to take a free block id from the hashless allocator; returns
        None when it has no free blocks."""
        try:
            # Allocate mutable block and extract its block_id
            block = self._hashless_allocator.allocate_mutable_block(
                prev_block=None)
            block_id = block.block_id
            # Only the id is needed; return the Block object to the pool.
            self._block_pool.free_block(block)

            self._track_block_id(block_id, computed=False)
            return block_id
        except BlockAllocator.NoFreeBlocksError:
            return None
|
| 323 |
+
|
| 324 |
+
    def _maybe_allocate_evicted_block_id(self) -> Optional[BlockId]:
        """Try to reclaim a block id by evicting an unused cached block;
        returns None when nothing is evictable."""
        if self.evictor.num_blocks == 0:
            return None

        # Here we get an evicted block, which is only added
        # into evictor if its ref counter is 0
        # and since its content would be changed, we need
        # to remove it from _cached_blocks's tracking list
        block_id, content_hash_to_evict = self.evictor.evict()

        # Sanity checks
        assert content_hash_to_evict in self._cached_blocks
        _block_id = self._cached_blocks[content_hash_to_evict]
        assert self._refcounter.get(_block_id) == 0
        assert _block_id == block_id

        self._cached_blocks.pop(content_hash_to_evict)

        # The reclaimed id starts life as an uncomputed, referenced block.
        self._refcounter.incr(block_id)
        self._track_block_id(block_id, computed=False)

        return block_id
|
| 346 |
+
|
| 347 |
+
def _free_block_id(self, block: Block) -> None:
|
| 348 |
+
"""Decrements the refcount of the block. The block may be in two
|
| 349 |
+
possible states: (1) immutable/cached or (2) mutable/hashless.
|
| 350 |
+
In the first case, the refcount is decremented directly and the block
|
| 351 |
+
may be possibly added to the evictor. In other case, hashless
|
| 352 |
+
allocator free(..) with keep_block_object=True is called to only free
|
| 353 |
+
the block id (since the block object may be reused by the caller)
|
| 354 |
+
"""
|
| 355 |
+
block_id = block.block_id
|
| 356 |
+
assert block_id is not None, "Freeing unallocated block is undefined"
|
| 357 |
+
|
| 358 |
+
if block.content_hash is not None:
|
| 359 |
+
# Immutable: This type of block is always cached, and we want to
|
| 360 |
+
# keep it in the evictor for future reuse
|
| 361 |
+
self._decr_refcount_cached_block(block)
|
| 362 |
+
else:
|
| 363 |
+
# Mutable: This type of block is not cached, so we release it
|
| 364 |
+
# directly to the hashless allocator
|
| 365 |
+
self._decr_refcount_hashless_block(block)
|
| 366 |
+
|
| 367 |
+
assert block.block_id is None
|
| 368 |
+
|
| 369 |
+
def free(self, block: Block, keep_block_object: bool = False) -> None:
|
| 370 |
+
"""Release the block (look at free_block_id(..) docs)
|
| 371 |
+
"""
|
| 372 |
+
# Release the physical block index
|
| 373 |
+
self._free_block_id(block)
|
| 374 |
+
|
| 375 |
+
# Release the block object to the pool
|
| 376 |
+
if not keep_block_object:
|
| 377 |
+
self._block_pool.free_block(block)
|
| 378 |
+
|
| 379 |
+
def fork(self, last_block: Block) -> List[Block]:
|
| 380 |
+
"""Creates a new sequence of blocks that shares the same underlying
|
| 381 |
+
memory as the original sequence.
|
| 382 |
+
|
| 383 |
+
Args:
|
| 384 |
+
last_block (Block): The last block in the original sequence.
|
| 385 |
+
|
| 386 |
+
Returns:
|
| 387 |
+
List[Block]: The new sequence of blocks that shares the same memory
|
| 388 |
+
as the original sequence.
|
| 389 |
+
"""
|
| 390 |
+
source_blocks = get_all_blocks_recursively(last_block)
|
| 391 |
+
|
| 392 |
+
forked_blocks: List[Block] = []
|
| 393 |
+
prev_block = None
|
| 394 |
+
for block in source_blocks:
|
| 395 |
+
block_id = block.block_id
|
| 396 |
+
assert block_id is not None
|
| 397 |
+
|
| 398 |
+
refcount = self._refcounter.incr(block_id)
|
| 399 |
+
assert refcount != 1, "can't fork free'd block_id = {}".format(
|
| 400 |
+
block_id)
|
| 401 |
+
|
| 402 |
+
forked_block = self._block_pool.init_block(
|
| 403 |
+
prev_block=prev_block,
|
| 404 |
+
token_ids=block.token_ids,
|
| 405 |
+
block_size=self._block_size,
|
| 406 |
+
physical_block_id=block_id,
|
| 407 |
+
extra_hash=block.extra_hash)
|
| 408 |
+
|
| 409 |
+
forked_blocks.append(forked_block)
|
| 410 |
+
prev_block = forked_blocks[-1]
|
| 411 |
+
|
| 412 |
+
return forked_blocks
|
| 413 |
+
|
| 414 |
+
def get_num_free_blocks(self, device: Optional[Device] = None) -> int:
|
| 415 |
+
assert device is None
|
| 416 |
+
# The number of free blocks is the number of hashless free blocks
|
| 417 |
+
# plus the number of blocks evictor could free from its list.
|
| 418 |
+
return self._hashless_allocator.get_num_free_blocks(
|
| 419 |
+
) + self.evictor.num_blocks
|
| 420 |
+
|
| 421 |
+
    def get_num_total_blocks(self) -> int:
        """Total number of physical blocks, delegated to the shared
        hashless allocator."""
        return self._hashless_allocator.get_num_total_blocks()
|
| 423 |
+
|
| 424 |
+
    def get_physical_block_id(self, absolute_id: int) -> int:
        """Returns the zero-offset block id on certain block allocator
        given the absolute block id.

        Args:
            absolute_id (int): The absolute block id for the block
                in whole allocator.

        Returns:
            int: The zero-offset block id on certain device.
        """
        # NOTE(review): sorts all block ids on every call — O(n log n) per
        # lookup. Acceptable only if this is called infrequently.
        return sorted(self.all_block_ids).index(absolute_id)
|
| 436 |
+
|
| 437 |
+
    @property
    def all_block_ids(self) -> FrozenSet[int]:
        """All physical block ids managed by this allocator (delegated to
        the shared hashless allocator)."""
        return self._hashless_allocator.all_block_ids
|
| 440 |
+
|
| 441 |
+
    def get_prefix_cache_hit_rate(self) -> float:
        """Return the prefix cache hit rate recorded by the metric data."""
        return self.metric_data.get_hit_rate()
|
| 443 |
+
|
| 444 |
+
    def reset_prefix_cache(self) -> bool:
        """Reset prefix cache. This function may be used in RLHF
        flows to invalidate prefix caching after the weights are updated,
        or used for resetting prefix caching status for benchmarking.

        Returns:
            bool: True if the prefix cache is successfully reset,
            False otherwise.
        """
        num_used_blocks = (self.get_num_total_blocks() -
                           self.get_num_free_blocks())
        if num_used_blocks > 0:
            # Cannot safely reset while any block is still referenced.
            logger.warning(
                "Failed to reset prefix cache because some "
                "blocks (%d) are not freed yet", num_used_blocks)
            return False

        # Free all blocks in the evictor.
        while (block_id :=
               self._maybe_allocate_evicted_block_id()) is not None:
            self._hashless_allocator.free_block_id(block_id)

        # Should not have any cached blocks because all blocks are evicted.
        assert not self._cached_blocks

        # Reset the evictor.
        self.evictor = make_evictor(self.eviction_policy)

        # Reset the block tracker.
        for block_id in self._block_tracker:
            self._block_tracker[block_id] = BlockTracker()

        # Reset the metrics.
        self.metric_data = CacheMetricData()

        logger.info("Successfully reset prefix cache")
        return True
|
| 481 |
+
|
| 482 |
+
def is_block_cached(self, block: Block) -> bool:
|
| 483 |
+
assert block.content_hash is not None
|
| 484 |
+
return block.content_hash in self._cached_blocks
|
| 485 |
+
|
| 486 |
+
    def promote_to_immutable_block(self, block: Block) -> BlockId:
        """Once a mutable block is full, it can be promoted to an immutable
        block. This means that its content can be referenced by future blocks
        having the same prefix.

        Note that if we already have a cached block with the same content, we
        will replace the newly-promoted block's mapping with the existing cached
        block id.

        Args:
            block: The mutable block to be promoted.

        Returns:
            BlockId: Either the original block index, or the block index of
                the previously cached block matching the same content.
        """
        # Ensure block can be promoted: it must be full (has a content hash),
        # physically allocated, and still referenced by the caller.
        assert block.content_hash is not None
        assert block.block_id is not None
        assert self._refcounter.get(block.block_id) > 0

        if block.content_hash not in self._cached_blocks:
            # No cached content hash => Set this block as cached.
            # Note that this block cannot be marked as computed yet
            # because other sequences in the same batch cannot reuse
            # this block.
            self._cached_blocks[block.content_hash] = block.block_id
            # Mark this block as touched so that it can be marked as
            # computed after the entire batch of sequences are scheduled.
            self._touched_blocks.add(block.block_id)
            return block.block_id

        # Reuse the cached content hash: drop our ref on the duplicate
        # hashless block and point this block at the cached physical id.
        self._decr_refcount_hashless_block(block)
        block.block_id = self._cached_blocks[block.content_hash]

        # Increment refcount of the cached block and (possibly) restore
        # it from the evictor.
        # Note that in this case, the block is marked as computed
        self._incr_refcount_cached_block(block)

        return block.block_id
|
| 528 |
+
|
| 529 |
+
    def cow_block_if_not_appendable(self, block: Block) -> BlockId:
        """Performs a copy-on-write operation on the given block if it is not
        appendable.

        Args:
            block (Block): The block to check for copy-on-write.

        Returns:
            BlockId: The block index of the new block if a copy-on-write
                operation was performed, or the original block index if
                no copy-on-write was necessary.
        """
        src_block_id = block.block_id
        assert src_block_id is not None

        # NOTE(review): is_appendable presumably checks whether the block is
        # exclusively owned (refcount) — confirm against CopyOnWriteTracker.
        if self._cow_tracker.is_appendable(block):
            return src_block_id

        # Not appendable: release our reference on the source first, then
        # allocate a fresh destination block for the copy.
        self._free_block_id(block)
        trg_block_id = self._allocate_block_id()

        # Record the (src, dst) pair; consumers drain it later via
        # clear_copy_on_writes() to perform the actual data copies.
        self._cow_tracker.record_cow(src_block_id, trg_block_id)

        return trg_block_id
|
| 553 |
+
|
| 554 |
+
def clear_copy_on_writes(self) -> List[Tuple[BlockId, BlockId]]:
|
| 555 |
+
"""Returns the copy-on-write source->destination mapping and clears it.
|
| 556 |
+
|
| 557 |
+
Returns:
|
| 558 |
+
List[Tuple[BlockId, BlockId]]: A list mapping source
|
| 559 |
+
block indices to destination block indices.
|
| 560 |
+
"""
|
| 561 |
+
return self._cow_tracker.clear_cows()
|
| 562 |
+
|
| 563 |
+
def mark_blocks_as_accessed(self, block_ids: List[int],
|
| 564 |
+
now: float) -> None:
|
| 565 |
+
"""Mark blocks as accessed, used in prefix caching.
|
| 566 |
+
|
| 567 |
+
If the block is added into evictor, we need to update corresponding
|
| 568 |
+
info in evictor's metadata.
|
| 569 |
+
"""
|
| 570 |
+
|
| 571 |
+
for block_id in block_ids:
|
| 572 |
+
if self._block_tracker[block_id].active:
|
| 573 |
+
self._block_tracker[block_id].last_accessed = now
|
| 574 |
+
elif block_id in self.evictor:
|
| 575 |
+
self.evictor.update(block_id, now)
|
| 576 |
+
else:
|
| 577 |
+
raise ValueError(
|
| 578 |
+
"Mark block as accessed which is not belonged to GPU")
|
| 579 |
+
|
| 580 |
+
def mark_blocks_as_computed(self, block_ids: List[int]) -> None:
|
| 581 |
+
# Mark all touched blocks as computed.
|
| 582 |
+
for block_id in self._touched_blocks:
|
| 583 |
+
self._block_tracker[block_id].computed = True
|
| 584 |
+
self._touched_blocks.clear()
|
| 585 |
+
|
| 586 |
+
def _track_block_id(self, block_id: Optional[BlockId],
|
| 587 |
+
computed: bool) -> None:
|
| 588 |
+
assert block_id is not None
|
| 589 |
+
self._block_tracker[block_id].enable()
|
| 590 |
+
self._block_tracker[block_id].computed = computed
|
| 591 |
+
|
| 592 |
+
def _untrack_block_id(self, block_id: Optional[BlockId]) -> None:
|
| 593 |
+
assert block_id is not None
|
| 594 |
+
self._block_tracker[block_id].disable()
|
| 595 |
+
|
| 596 |
+
def block_is_computed(self, block_id: int) -> bool:
|
| 597 |
+
if self._block_tracker[block_id].active:
|
| 598 |
+
return self._block_tracker[block_id].computed
|
| 599 |
+
else:
|
| 600 |
+
return block_id in self.evictor
|
| 601 |
+
|
| 602 |
+
def get_common_computed_block_ids(
|
| 603 |
+
self, computed_seq_block_ids: List[List[int]]) -> List[int]:
|
| 604 |
+
"""Return the block ids that are common for a given sequence group.
|
| 605 |
+
|
| 606 |
+
Only those blocks that are immutable and already be marked
|
| 607 |
+
compyted would be taken consideration.
|
| 608 |
+
"""
|
| 609 |
+
|
| 610 |
+
# NOTE We exclude the last block to avoid the case where the entire
|
| 611 |
+
# prompt is cached. This would cause erroneous behavior in model
|
| 612 |
+
# runner.
|
| 613 |
+
|
| 614 |
+
# It returns a list of int although type annotation says list of string.
|
| 615 |
+
if len(computed_seq_block_ids) == 1:
|
| 616 |
+
return computed_seq_block_ids[0]
|
| 617 |
+
|
| 618 |
+
return commonprefix([
|
| 619 |
+
ids for ids in computed_seq_block_ids # type: ignore
|
| 620 |
+
if ids
|
| 621 |
+
])
|
| 622 |
+
|
| 623 |
+
def get_num_full_blocks_touched(self, blocks: List[Block]) -> int:
|
| 624 |
+
"""Returns the number of full blocks that will be touched by
|
| 625 |
+
swapping in/out.
|
| 626 |
+
|
| 627 |
+
Args:
|
| 628 |
+
blocks: List of blocks to be swapped.
|
| 629 |
+
Returns:
|
| 630 |
+
int: the number of full blocks that will be touched by
|
| 631 |
+
swapping in/out the given blocks. Non full blocks are ignored
|
| 632 |
+
when deciding the number of blocks to touch.
|
| 633 |
+
"""
|
| 634 |
+
num_touched_blocks: int = 0
|
| 635 |
+
for block in blocks:
|
| 636 |
+
# If the block has a match in the cache and the cached
|
| 637 |
+
# block is not referenced, then we still count it as a
|
| 638 |
+
# touched block
|
| 639 |
+
if block.is_full and (not self.is_block_cached(block) or \
|
| 640 |
+
(block.content_hash is not None and \
|
| 641 |
+
self._cached_blocks[block.content_hash] in \
|
| 642 |
+
self.evictor)):
|
| 643 |
+
num_touched_blocks += 1
|
| 644 |
+
return num_touched_blocks
|
| 645 |
+
|
| 646 |
+
def swap_out(self, blocks: List[Block]) -> None:
|
| 647 |
+
"""Execute the swap out actions. Basically just free the
|
| 648 |
+
given blocks.
|
| 649 |
+
|
| 650 |
+
Args:
|
| 651 |
+
blocks: List of blocks to be swapped out.
|
| 652 |
+
"""
|
| 653 |
+
for block in blocks:
|
| 654 |
+
self._free_block_id(block)
|
| 655 |
+
|
| 656 |
+
    def swap_in(self, blocks: List[Block]) -> None:
        """Execute the swap in actions. Change the block id from
        old allocator to current allocator for each block to finish
        the block table update.

        Args:
            blocks: List of blocks to be swapped in.
        """
        for block in blocks:
            # Here we allocate either immutable or mutable block and then
            # extract its block_id. Note that the block object is released
            # and the block_id is assigned to "block" to allow reusing the
            # existing "block" object
            if block.is_full:
                # Full blocks can be allocated as immutable directly, which
                # may hit the prefix cache.
                tmp_block = self.allocate_immutable_block(
                    prev_block=block.prev_block,
                    token_ids=block.token_ids,
                    extra_hash=block.extra_hash)
            else:
                # Partial blocks stay mutable; copy the tokens over.
                tmp_block = self.allocate_mutable_block(
                    prev_block=block.prev_block, extra_hash=block.extra_hash)
                tmp_block.append_token_ids(block.token_ids)

            block_id = tmp_block.block_id
            # Return the temporary wrapper object to the pool; only its
            # physical id survives.
            self._block_pool.free_block(tmp_block)

            block.block_id = block_id  # Assign block_id
|
| 683 |
+
|
| 684 |
+
    def find_cached_blocks_prefix(self, block_hashes: List[int]) -> List[int]:
        """
        Given a list of block hashes, return the prefix of the block hashes that
        are all cached.

        Since a block's block hash includes the hashes of all previous blocks,
        and we only allocate/deallocate blocks in the entire sequence, so if a
        block is cached, then all previous blocks are also cached. With this
        property, we can use binary search to find the prefix of cached blocks.

        Args:
            block_hashes (List[int]): The list of block hashes.

        Returns:
            List[int]: The prefix of the `block_hashes` that are cached.
        """

        def _block_is_cached(block_hash: PrefixHash) -> bool:
            if block_hash not in self._cached_blocks:
                return False

            cached_block_id = self._cached_blocks[block_hash]
            # We only consider the blocks that are marked as computed.
            return self.block_is_computed(cached_block_id)

        def _bisect_left(a, x, key: Callable[[PrefixHash], bool]) -> int:

            # python < 3.10 doesn't have the key argument to bisect_left.
            # NOTE: this fallback applies `key` to every element, which is
            # O(n) rather than O(log n).
            if sys.version_info < (3, 10):
                a = [key(e) for e in a]
                return bisect_left(a, x)
            else:
                return bisect_left(a, x, key=key)

        # Look for the first block that's not cached, and returns the prefix
        # i.e. blocks that are cached.
        # (key maps cached -> False, uncached -> True, so bisecting for True
        # finds the cached/uncached boundary.)
        idx = _bisect_left(block_hashes,
                           True,
                           key=lambda x: not _block_is_cached(x))
        return block_hashes[:idx]
|
| 724 |
+
|
| 725 |
+
|
| 726 |
+
class PrefixCachingBlock(Block):
    """A block implementation that supports prefix caching.

    The PrefixCachingBlock class represents a block of token IDs with prefix
    caching capabilities. It wraps a NaiveBlock internally and provides
    additional functionality for content hashing and promoting immutable blocks
    with the prefix caching allocator.

    Args:
        prev_block (Optional[PrefixCachingBlock]): The previous block in the
            sequence.
        token_ids (List[int]): The initial token IDs to be stored in the block.
        block_size (int): The maximum number of token IDs that can be stored in
            the block.
        allocator (BlockAllocator): The prefix
            caching block allocator associated with this block.
        block_id (Optional[int], optional): The physical block index
            of this block. Defaults to None.
        computed (bool): Whether the block starts out marked as computed.
            Defaults to False.
        extra_hash (Optional[int]): The hash value of additional factors
            such as adapters that influence the block, apart from the token_ids.
    """

    # Note that we use 'None' as a string here instead of None because
    # as of Python 3.12, hash(None) returns a constant predictable value.
    # This could possibly make it easier to find and exploit hash
    # collisions. 'None' as a string will be hashed differently per process,
    # but consistently within the same process. This is the same as the
    # behavior of None prior to Python 3.12.
    _none_hash: int = hash('None')

    def __init__(
        self,
        prev_block: Optional[Block],
        token_ids: List[int],
        block_size: int,
        allocator: BlockAllocator,
        block_id: Optional[int] = None,
        computed: bool = False,
        extra_hash: Optional[int] = None,
    ):
        assert isinstance(allocator, PrefixCachingBlockAllocator), (
            "Currently this class is only tested with "
            "PrefixCachingBlockAllocator. Got instead allocator = {}".format(
                allocator))
        assert_prefix_caching_block_or_none(prev_block)

        self._prev_block = prev_block
        # Lazily computed content hash; stays None until the block is full.
        self._cached_content_hash: Optional[int] = None
        # Token count up to and including this block, kept incrementally.
        self._cached_num_tokens_total: int = 0
        self._allocator = allocator
        self._last_accessed: float = _DEFAULT_LAST_ACCESSED_TIME
        self._computed = computed
        self._extra_hash = extra_hash

        # On the first time, we create the block object, and next we only
        # reinitialize it. (Block objects are pooled and reused, so __init__
        # may run more than once on the same instance.)
        if hasattr(self, "_block"):
            self._block.__init__(  # type: ignore[has-type]
                prev_block=prev_block,
                token_ids=token_ids,
                block_size=block_size,
                block_id=block_id,
                allocator=self._allocator)
        else:
            self._block = NaiveBlock(prev_block=prev_block,
                                     token_ids=token_ids,
                                     block_size=block_size,
                                     block_id=block_id,
                                     allocator=self._allocator)

        self._update_num_tokens_total()

    def _update_num_tokens_total(self):
        """Incrementally computes the number of tokens that there is
        till the current block (included)
        """
        res = 0

        # Add all previous blocks
        if self._prev_block is not None:
            res += self._prev_block.num_tokens_total

        # Add current block
        res += len(self.token_ids)

        self._cached_num_tokens_total = res

    @property
    def computed(self) -> bool:
        # Whether this block's contents have been marked as computed.
        return self._computed

    @computed.setter
    def computed(self, value) -> None:
        self._computed = value

    @property
    def last_accessed(self) -> float:
        return self._last_accessed

    @last_accessed.setter
    def last_accessed(self, last_accessed_ts: float):
        self._last_accessed = last_accessed_ts

    def append_token_ids(self, token_ids: List[int]) -> None:
        """Appends the given token IDs to the block and registers the block as
        immutable if the block becomes full.

        Args:
            token_ids (List[int]): The token IDs to be appended to the block.
        """
        # Ensure this is mutable block (not promoted)
        assert self.content_hash is None
        assert not self.computed

        if len(token_ids) == 0:
            return

        # Ensure there are input tokens
        assert token_ids, "Got token_ids = {}".format(token_ids)

        # Naive block handles CoW.
        self._block.append_token_ids(token_ids)
        self._update_num_tokens_total()

        # If the content hash is present, then the block can be made immutable.
        # Register ourselves with the allocator, potentially replacing the
        # physical block index.
        if self.content_hash is not None:
            self.block_id = self._allocator.promote_to_immutable_block(self)

    @property
    def block_id(self) -> Optional[int]:
        return self._block.block_id

    @block_id.setter
    def block_id(self, value) -> None:
        self._block.block_id = value

    @property
    def is_full(self) -> bool:
        return self._block.is_full

    @property
    def num_empty_slots(self) -> int:
        return self._block.num_empty_slots

    @property
    def num_tokens_total(self) -> int:
        # Tokens in this block plus all previous blocks in the chain.
        return self._cached_num_tokens_total

    @property
    def block_size(self) -> int:
        return self._block.block_size

    @property
    def token_ids(self) -> List[int]:
        return self._block.token_ids

    @property
    def prev_block(self) -> Optional[Block]:
        return self._prev_block

    @property
    def extra_hash(self) -> Optional[int]:
        return self._extra_hash

    @property
    def content_hash(self) -> Optional[int]:
        """Return the content-based hash of the current block, or None if it is
        not yet defined.

        For the content-based hash to be defined, the current block must be
        full.
        """
        # If the hash is already computed, return it.
        if self._cached_content_hash is not None:
            return self._cached_content_hash

        # We cannot compute a hash for the current block because it is not full.
        if not self.is_full:
            return None

        is_first_block = self._prev_block is None
        prev_block_hash = (
            self._none_hash if is_first_block else
            self._prev_block.content_hash  # type: ignore
        )

        # Previous block exists but does not yet have a hash.
        # Return no hash in this case.
        if prev_block_hash == self._none_hash and not is_first_block:
            return None

        self._cached_content_hash = PrefixCachingBlock.hash_block_tokens(
            is_first_block,
            prev_block_hash,
            cur_block_token_ids=self.token_ids,
            extra_hash=self._extra_hash)
        return self._cached_content_hash

    @classmethod
    def hash_block_tokens(cls,
                          is_first_block: bool,
                          prev_block_hash: Optional[int],
                          cur_block_token_ids: List[int],
                          extra_hash: Optional[int] = None) -> int:
        """Computes a hash value corresponding to the contents of a block and
        the contents of the preceding block(s). The hash value is used for
        prefix caching.

        Parameters:
        - is_first_block (bool): A flag indicating if the block is the first in
            the sequence.
        - prev_block_hash (Optional[int]): The hash of the previous block. None
            if this is the first block.
        - cur_block_token_ids (List[int]): A list of token ids in the current
            block. The current block is assumed to be full.
        - extra_hash (Optional[int]): The hash value of additional factors
            such as adapters that influence the block, apart from the token_ids.

        Returns:
        - int: The computed hash value for the block.
        """
        if is_first_block and prev_block_hash is None:
            prev_block_hash = cls._none_hash
        return hash((is_first_block, prev_block_hash, *cur_block_token_ids,
                     extra_hash))
|
| 953 |
+
|
| 954 |
+
|
| 955 |
+
class ComputedBlocksTracker:
    """
    Tracks the computed blocks for each sequence.

    Internally, it maintains a map from sequence id to the list of block hashes
    for the sequence. We cache the hashes of the full blocks for each sequence,
    and make sure the hash is calculated in the same way as the allocator.
    When a sequence is being decoded, we also update the sequence's hash
    accordingly and incrementally.

    From the sequence hash, with prefix caching enabled, we could also calculate
    the number of cached tokens for the sequence by looking up the number of
    cached block hashes in the allocator.
    """

    # Note that we use 'None' as a string here instead of None because
    # as of Python 3.12, hash(None) returns a constant predictable value.
    # This could possibly make it easier to find and exploit hash
    # collisions. 'None' as a string will be hashed differently per process,
    # but consistently within the same process. This is the same as the
    # behavior of None prior to Python 3.12.
    _none_hash: int = hash('None')

    def __init__(
        self,
        allocator: DeviceAwareBlockAllocator,
        block_size: int,
        enable_caching: bool,
    ):
        self._allocator = allocator
        self._block_size = block_size
        self._enable_caching = enable_caching

        # A map from seq_id to the list of block hashes for the
        # sequence. This is so that we don't have to recompute the block hashes
        # for the sequence when we need to check if the sequence is cached.
        # Note a block that's not full will not have its hash calculated and
        # recorded.
        self._seq_id_to_blocks_hashes: Dict[int, List[int]] = {}

        # A map from seq_id to the number of tokens that are cached for the
        # sequence.
        # We need this so that a sequence in continuous prefill doesn't
        # accidentally see its cached token count change. See comments in
        # `get_num_cached_tokens` for more details.
        self._seq_id_to_num_tokens_computed: Dict[int, int] = {}

    def _update_seq_hashes(self, seq: Sequence) -> None:
        """Incrementally update the sequence's block hashes and record them."""
        assert self._enable_caching

        block_hashes_recorded = self._seq_id_to_blocks_hashes.get(
            seq.seq_id, [])
        cur_num_blocks_recorded = len(block_hashes_recorded)
        token_ids = seq.get_token_ids()
        assert len(token_ids) >= cur_num_blocks_recorded * self._block_size, (
            f"The sequence has {len(token_ids)} tokens, but"
            f" already recorded {cur_num_blocks_recorded} blocks. "
            "This should not happen since we assume blocks are "
            "only appended other than recomputation. When the sequence is "
            "recomputed, we should have removed the info of the old blocks.")
        # Update the computed block hashes for the sequence. Since only full
        # blocks are considered as "computed", we take floor here.
        num_computed_blocks = len(token_ids) // self._block_size

        # We need to know the hash of the previous block to compute the hash of
        # the current block so that blocks could be uniquely identified across
        # sequences of prefixes.
        prev_block_hash = (self._none_hash if cur_num_blocks_recorded == 0 else
                           block_hashes_recorded[-1])
        # Only update the computed block hashes for the new blocks
        for i in range(cur_num_blocks_recorded, num_computed_blocks):
            assert len(token_ids) >= (i + 1) * self._block_size
            block_token_ids = token_ids[i * self._block_size:(i + 1) *
                                        self._block_size]

            # NOTE: If there are any factors affecting the block besides
            # token_ids, they should be added as input to extra_hash.
            extra_hash = seq.extra_hash()

            # This has to be kept in sync with the allocator's hash
            # calculation.
            block_hash = PrefixCachingBlock.hash_block_tokens(
                is_first_block=prev_block_hash == self._none_hash,
                prev_block_hash=prev_block_hash,
                cur_block_token_ids=block_token_ids,
                extra_hash=extra_hash,
            )
            block_hashes_recorded.append(block_hash)
            prev_block_hash = block_hash

        self._seq_id_to_blocks_hashes[seq.seq_id] = block_hashes_recorded

    def get_num_cached_tokens(self, seq: Sequence) -> int:
        """Return the number of tokens of `seq` whose blocks are already
        cached in the allocator. Returns 0 when prefix caching is disabled."""
        if not self._enable_caching:
            return 0

        # We always try to update the sequence hashes on the fly.
        # This is to ensure that we don't miss any cached tokens for the
        # sequence during decode.
        # This routine should only update hash for any new blocks too.
        self._update_seq_hashes(seq)

        num_computed_tokens_prev = self._seq_id_to_num_tokens_computed.get(
            seq.seq_id, None)

        # TODO(rickyx): This hack could be removed once we mark blocks as
        # computed correctly with chunked prefills.
        if num_computed_tokens_prev is not None and seq.is_prefill():
            # For a sequence that is still in prefill, we don't
            # recompute the number of cached tokens.
            # This also handles correctly chunked prefill since currently
            # we mark blocks as computed even if the sequence is still partially
            # prefilled. So a continuously prefilled sequence should not
            # see its cached token count change while running.
            return num_computed_tokens_prev

        block_hashes = self._seq_id_to_blocks_hashes[seq.seq_id]

        # This is O(logN), where N is the number of blocks.
        num_cached_blocks = len(
            self._allocator.find_cached_blocks_prefix(block_hashes))
        num_cached_tokens = num_cached_blocks * self._block_size
        self._seq_id_to_num_tokens_computed[seq.seq_id] = num_cached_tokens
        return num_cached_tokens

    def remove_seq(self, seq_id: int) -> None:
        """Stop tracking the sequence."""
        if not self._enable_caching:
            return
        assert seq_id in self._seq_id_to_blocks_hashes
        del self._seq_id_to_blocks_hashes[seq_id]

        assert seq_id in self._seq_id_to_num_tokens_computed
        del self._seq_id_to_num_tokens_computed[seq_id]
|
| 1090 |
+
|
| 1091 |
+
|
| 1092 |
+
class LastAccessBlocksTracker:
    """Tracks, per sequence, the timestamp of its most recent access so that
    the allocator's per-block last-access times can be refreshed efficiently
    in one batch.
    """

    def __init__(self, allocator):
        self._allocator = allocator
        # seq_id -> last access timestamp (None until first recorded access).
        self._seq_last_access: Dict[int, Optional[float]] = {}

    def add_seq(self, seq_id: int) -> None:
        """Begin tracking ``seq_id``; it must not be tracked already."""
        assert seq_id not in self._seq_last_access
        self._seq_last_access[seq_id] = None

    def remove_seq(self, seq_id: int) -> None:
        """Stop tracking ``seq_id``; it must currently be tracked."""
        assert seq_id in self._seq_last_access
        del self._seq_last_access[seq_id]

    def update_last_access(self, seq_id: int, time: float) -> None:
        """Record ``time`` as the sequence's most recent access."""
        assert seq_id in self._seq_last_access
        self._seq_last_access[seq_id] = time

    def update_seq_blocks_last_access(self, seq_id: int,
                                      block_ids: List[int]) -> None:
        """Propagate the sequence's recorded access time to its blocks.

        A no-op when no access time has been recorded yet.
        """
        assert seq_id in self._seq_last_access

        last_ts = self._seq_last_access[seq_id]
        if last_ts is not None:
            self._allocator.mark_blocks_as_accessed(block_ids, last_ts)
|
| 1128 |
+
|
| 1129 |
+
|
| 1130 |
+
def assert_prefix_caching_block_or_none(block: Optional[Block]):
    """Assert that ``block`` is either None or a PrefixCachingBlock."""
    if block is not None:
        assert isinstance(
            block, PrefixCachingBlock), "Got block = {}".format(block)
|
.venv/lib/python3.11/site-packages/vllm/core/block/utils.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SPDX-License-Identifier: Apache-2.0
|
| 2 |
+
"""Block manager utils."""
|
| 3 |
+
from vllm.sequence import SequenceGroup
|
| 4 |
+
from vllm.utils import (STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE,
|
| 5 |
+
STR_NOT_IMPL_ENC_DEC_SWA)
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
def check_no_caching_or_swa_for_blockmgr_encdec(
        block_mgr, seq_group: SequenceGroup) -> None:
    '''
    Enforce that prefix caching & sliding-window attention (SWA)
    are currently unsupported *specifically* for encoder/decoder models.

    Raises NotImplementedError if unsupported scenario is detected.

    Arguments:

    * block_mgr: BlockSpaceManager instance
    * seq_group: SequenceGroup passed to block_mgr
    '''

    if not seq_group.is_encoder_decoder():
        # Decoder-only groups are unrestricted here.
        return

    if block_mgr.max_block_sliding_window is not None:
        raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_SWA)

    if block_mgr.enable_caching:
        raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE)
|
.venv/lib/python3.11/site-packages/vllm/core/interfaces.py
ADDED
|
@@ -0,0 +1,134 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SPDX-License-Identifier: Apache-2.0
|
| 2 |
+
|
| 3 |
+
import enum
|
| 4 |
+
from abc import ABC, abstractmethod
|
| 5 |
+
from typing import List
|
| 6 |
+
from typing import Sequence as GenericSequence
|
| 7 |
+
from typing import Tuple
|
| 8 |
+
|
| 9 |
+
from vllm.sequence import Sequence, SequenceGroup
|
| 10 |
+
from vllm.utils import Device
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class AllocStatus(enum.Enum):
    """Outcome of BlockSpaceManager.can_allocate.

    * OK: the seq_group can be allocated right now.
    * LATER: the seq_group cannot be allocated yet, but the allocator's
      capacity is larger than what the seq_group requires, so retry later.
    * NEVER: the seq_group is too large to ever be allocated in GPU.
    """
    OK = enum.auto()
    LATER = enum.auto()
    NEVER = enum.auto()
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
class BlockSpaceManager(ABC):
|
| 28 |
+
|
| 29 |
+
@staticmethod
|
| 30 |
+
def get_block_space_manager_class(version: str):
|
| 31 |
+
version = version.lower()
|
| 32 |
+
|
| 33 |
+
if version == "selfattn":
|
| 34 |
+
from vllm.core.block_manager import SelfAttnBlockSpaceManager
|
| 35 |
+
return SelfAttnBlockSpaceManager
|
| 36 |
+
|
| 37 |
+
if version == "placeholder":
|
| 38 |
+
from vllm.core.placeholder_block_space_manager import (
|
| 39 |
+
PlaceholderBlockSpaceManager)
|
| 40 |
+
return PlaceholderBlockSpaceManager
|
| 41 |
+
|
| 42 |
+
raise ValueError(f"Unknown version {version=}")
|
| 43 |
+
|
| 44 |
+
@abstractmethod
|
| 45 |
+
def can_allocate(self,
|
| 46 |
+
seq_group: SequenceGroup,
|
| 47 |
+
num_lookahead_slots: int = 0) -> AllocStatus:
|
| 48 |
+
pass
|
| 49 |
+
|
| 50 |
+
@abstractmethod
|
| 51 |
+
def allocate(self, seq_group: SequenceGroup) -> None:
|
| 52 |
+
pass
|
| 53 |
+
|
| 54 |
+
@abstractmethod
|
| 55 |
+
def can_append_slots(self, seq_group: SequenceGroup,
|
| 56 |
+
num_lookahead_slots: int) -> bool:
|
| 57 |
+
pass
|
| 58 |
+
|
| 59 |
+
@abstractmethod
|
| 60 |
+
def append_slots(
|
| 61 |
+
self,
|
| 62 |
+
seq: Sequence,
|
| 63 |
+
num_lookahead_slots: int,
|
| 64 |
+
) -> List[Tuple[int, int]]:
|
| 65 |
+
pass
|
| 66 |
+
|
| 67 |
+
@abstractmethod
|
| 68 |
+
def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None:
|
| 69 |
+
pass
|
| 70 |
+
|
| 71 |
+
@abstractmethod
|
| 72 |
+
def can_swap_in(self, seq_group: SequenceGroup,
|
| 73 |
+
num_lookahead_slots: int) -> AllocStatus:
|
| 74 |
+
pass
|
| 75 |
+
|
| 76 |
+
@abstractmethod
|
| 77 |
+
def swap_in(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]:
|
| 78 |
+
pass
|
| 79 |
+
|
| 80 |
+
@abstractmethod
|
| 81 |
+
def can_swap_out(self, seq_group: SequenceGroup) -> bool:
|
| 82 |
+
pass
|
| 83 |
+
|
| 84 |
+
@abstractmethod
|
| 85 |
+
def swap_out(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]:
|
| 86 |
+
pass
|
| 87 |
+
|
| 88 |
+
@abstractmethod
|
| 89 |
+
def free(self, seq: Sequence) -> None:
|
| 90 |
+
pass
|
| 91 |
+
|
| 92 |
+
@abstractmethod
|
| 93 |
+
def get_block_table(self, seq: Sequence) -> List[int]:
|
| 94 |
+
pass
|
| 95 |
+
|
| 96 |
+
@abstractmethod
|
| 97 |
+
def get_num_free_gpu_blocks(self) -> int:
|
| 98 |
+
pass
|
| 99 |
+
|
| 100 |
+
@abstractmethod
|
| 101 |
+
def get_num_free_cpu_blocks(self) -> int:
|
| 102 |
+
pass
|
| 103 |
+
|
| 104 |
+
@abstractmethod
|
| 105 |
+
def access_all_blocks_in_seq(
|
| 106 |
+
self,
|
| 107 |
+
seq: Sequence,
|
| 108 |
+
access_time: float,
|
| 109 |
+
) -> None:
|
| 110 |
+
pass
|
| 111 |
+
|
| 112 |
+
@abstractmethod
|
| 113 |
+
def get_common_computed_block_ids(
|
| 114 |
+
self, seqs: List[Sequence]) -> GenericSequence[int]:
|
| 115 |
+
pass
|
| 116 |
+
|
| 117 |
+
@abstractmethod
|
| 118 |
+
def mark_blocks_as_computed(self, seq_group: SequenceGroup,
|
| 119 |
+
token_chunk_size: int):
|
| 120 |
+
pass
|
| 121 |
+
|
| 122 |
+
@abstractmethod
|
| 123 |
+
def get_prefix_cache_hit_rate(self, device: Device) -> float:
|
| 124 |
+
"""Prefix cache hit rate. -1 means not supported or disabled."""
|
| 125 |
+
pass
|
| 126 |
+
|
| 127 |
+
@abstractmethod
|
| 128 |
+
def reset_prefix_cache(self) -> bool:
|
| 129 |
+
"""Reset prefix cache for all devices."""
|
| 130 |
+
pass
|
| 131 |
+
|
| 132 |
+
@abstractmethod
|
| 133 |
+
def get_num_cached_tokens(self, seq: Sequence) -> int:
|
| 134 |
+
pass
|
.venv/lib/python3.11/site-packages/vllm/core/placeholder_block_space_manager.py
ADDED
|
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SPDX-License-Identifier: Apache-2.0
|
| 2 |
+
|
| 3 |
+
from typing import List, Tuple
|
| 4 |
+
|
| 5 |
+
from vllm.core.interfaces import AllocStatus, BlockSpaceManager
|
| 6 |
+
from vllm.sequence import Sequence, SequenceGroup
|
| 7 |
+
from vllm.utils import Device
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class PlaceholderBlockSpaceManager(BlockSpaceManager):
|
| 11 |
+
"""A version of BlockSpaceManager for use in environments
|
| 12 |
+
where block management is not required.
|
| 13 |
+
For example: pooling models or attention-free models like Mamba.
|
| 14 |
+
|
| 15 |
+
This class provides the same interface as BlockSpaceManager, but its
|
| 16 |
+
methods perform no actions or return simple values like True in specific
|
| 17 |
+
actions. It's designed to be used in scenarios where the overhead of
|
| 18 |
+
block management is unnecessary, such as in an embedding environment.
|
| 19 |
+
"""
|
| 20 |
+
|
| 21 |
+
def __init__(
|
| 22 |
+
self,
|
| 23 |
+
**kwargs,
|
| 24 |
+
) -> None:
|
| 25 |
+
pass
|
| 26 |
+
|
| 27 |
+
def can_allocate(self,
|
| 28 |
+
seq_group: SequenceGroup,
|
| 29 |
+
num_lookahead_slots: int = 0) -> AllocStatus:
|
| 30 |
+
# Always return OK for dummy purposes
|
| 31 |
+
return AllocStatus.OK
|
| 32 |
+
|
| 33 |
+
def allocate(self, seq_group: SequenceGroup) -> None:
|
| 34 |
+
# No actual allocation logic needed
|
| 35 |
+
pass
|
| 36 |
+
|
| 37 |
+
def can_append_slots(self, seq_group: SequenceGroup,
|
| 38 |
+
num_lookahead_slots: int) -> bool:
|
| 39 |
+
return True
|
| 40 |
+
|
| 41 |
+
def append_slots(
|
| 42 |
+
self,
|
| 43 |
+
seq: Sequence,
|
| 44 |
+
num_lookahead_slots: int,
|
| 45 |
+
) -> List[Tuple[int, int]]:
|
| 46 |
+
return []
|
| 47 |
+
|
| 48 |
+
def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None:
|
| 49 |
+
pass
|
| 50 |
+
|
| 51 |
+
def can_swap_in(self, seq_group: SequenceGroup,
|
| 52 |
+
num_lookahead_slots: int) -> AllocStatus:
|
| 53 |
+
return AllocStatus.OK
|
| 54 |
+
|
| 55 |
+
def swap_in(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]:
|
| 56 |
+
return None # type: ignore
|
| 57 |
+
|
| 58 |
+
def can_swap_out(self, seq_group: SequenceGroup) -> bool:
|
| 59 |
+
return True
|
| 60 |
+
|
| 61 |
+
def swap_out(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]:
|
| 62 |
+
return None # type: ignore
|
| 63 |
+
|
| 64 |
+
def free(self, seq: Sequence) -> None:
|
| 65 |
+
# No operation on free
|
| 66 |
+
return
|
| 67 |
+
|
| 68 |
+
def get_block_table(self, seq: Sequence) -> List[int]:
|
| 69 |
+
return None # type: ignore
|
| 70 |
+
|
| 71 |
+
def get_num_free_gpu_blocks(self) -> int:
|
| 72 |
+
return 1
|
| 73 |
+
|
| 74 |
+
def get_num_free_cpu_blocks(self) -> int:
|
| 75 |
+
return 1
|
| 76 |
+
|
| 77 |
+
def access_all_blocks_in_seq(
|
| 78 |
+
self,
|
| 79 |
+
seq: Sequence,
|
| 80 |
+
access_time: float,
|
| 81 |
+
) -> None:
|
| 82 |
+
pass
|
| 83 |
+
|
| 84 |
+
def get_common_computed_block_ids(self,
|
| 85 |
+
seq_group: List[Sequence]) -> List[int]:
|
| 86 |
+
return []
|
| 87 |
+
|
| 88 |
+
def mark_blocks_as_computed(self, seq_group: SequenceGroup,
|
| 89 |
+
token_chunk_size: int):
|
| 90 |
+
pass
|
| 91 |
+
|
| 92 |
+
def get_prefix_cache_hit_rate(self, device: Device) -> float:
|
| 93 |
+
return -1
|
| 94 |
+
|
| 95 |
+
def reset_prefix_cache(self) -> bool:
|
| 96 |
+
return True
|
| 97 |
+
|
| 98 |
+
def get_num_cached_tokens(self, seq: Sequence) -> int:
|
| 99 |
+
return 0
|
.venv/lib/python3.11/site-packages/vllm/core/scheduler.py
ADDED
|
@@ -0,0 +1,1840 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SPDX-License-Identifier: Apache-2.0
|
| 2 |
+
|
| 3 |
+
import enum
|
| 4 |
+
import os
|
| 5 |
+
import random
|
| 6 |
+
import time
|
| 7 |
+
from collections import deque
|
| 8 |
+
from dataclasses import dataclass, field
|
| 9 |
+
from typing import Callable, Deque, Dict, Iterable, List, Optional
|
| 10 |
+
from typing import Sequence as GenericSequence
|
| 11 |
+
from typing import Set, Tuple, Union
|
| 12 |
+
|
| 13 |
+
from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig
|
| 14 |
+
from vllm.core.interfaces import AllocStatus, BlockSpaceManager
|
| 15 |
+
from vllm.logger import init_logger
|
| 16 |
+
from vllm.lora.request import LoRARequest
|
| 17 |
+
from vllm.prompt_adapter.request import PromptAdapterRequest
|
| 18 |
+
from vllm.sequence import (Sequence, SequenceData, SequenceGroup,
|
| 19 |
+
SequenceGroupMetadata, SequenceGroupMetadataDelta,
|
| 20 |
+
SequenceStatus)
|
| 21 |
+
from vllm.utils import Device, PyObjectCache
|
| 22 |
+
|
| 23 |
+
logger = init_logger(__name__)
|
| 24 |
+
|
| 25 |
+
# Test-only. If configured, decode is preempted with
|
| 26 |
+
# ARTIFICIAL_PREEMPTION_PROB% probability.
|
| 27 |
+
ENABLE_ARTIFICIAL_PREEMPT = bool(
|
| 28 |
+
os.getenv("VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT", False)) # noqa
|
| 29 |
+
ARTIFICIAL_PREEMPTION_PROB = 0.5
|
| 30 |
+
ARTIFICIAL_PREEMPTION_MAX_CNT = 500
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
class PreemptionMode(enum.Enum):
|
| 34 |
+
"""Preemption modes.
|
| 35 |
+
|
| 36 |
+
1. Swapping: Swap out the blocks of the preempted sequences to CPU memory
|
| 37 |
+
and swap them back in when the sequences are resumed.
|
| 38 |
+
2. Recomputation: Discard the blocks of the preempted sequences and
|
| 39 |
+
recompute them when the sequences are resumed, treating the sequences as
|
| 40 |
+
new prompts.
|
| 41 |
+
"""
|
| 42 |
+
SWAP = enum.auto()
|
| 43 |
+
RECOMPUTE = enum.auto()
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
@dataclass
|
| 47 |
+
class SchedulingBudget:
|
| 48 |
+
"""The available slots for scheduling.
|
| 49 |
+
|
| 50 |
+
TODO(sang): Right now, the budget is request_id-aware meaning it can ignore
|
| 51 |
+
budget update from the same request_id. It is because in normal scheduling
|
| 52 |
+
path, we update RUNNING num_seqs ahead of time, meaning it could be
|
| 53 |
+
updated more than once when scheduling RUNNING requests. Since this won't
|
| 54 |
+
happen if we only have chunked prefill scheduling, we can remove this
|
| 55 |
+
feature from the API when chunked prefill is enabled by default.
|
| 56 |
+
"""
|
| 57 |
+
token_budget: int
|
| 58 |
+
max_num_seqs: int
|
| 59 |
+
_request_ids_num_batched_tokens: Set[str] = field(default_factory=set)
|
| 60 |
+
_request_ids_num_curr_seqs: Set[str] = field(default_factory=set)
|
| 61 |
+
# Number of cached tokens in the batch.
|
| 62 |
+
_num_cached_tokens: int = 0
|
| 63 |
+
# Number of actual non-cached tokens in the batch.
|
| 64 |
+
_num_batched_tokens: int = 0
|
| 65 |
+
_num_curr_seqs: int = 0
|
| 66 |
+
|
| 67 |
+
def can_schedule(self, *, num_new_tokens: int, num_new_seqs: int):
|
| 68 |
+
# We allow num_new_tokens to be 0 when the entire sequence has
|
| 69 |
+
# been cached.
|
| 70 |
+
assert num_new_tokens >= 0
|
| 71 |
+
assert num_new_seqs != 0
|
| 72 |
+
return (self.num_batched_tokens + num_new_tokens <= self.token_budget
|
| 73 |
+
and self.num_curr_seqs + num_new_seqs <= self.max_num_seqs)
|
| 74 |
+
|
| 75 |
+
def remaining_token_budget(self):
|
| 76 |
+
return self.token_budget - self.num_batched_tokens
|
| 77 |
+
|
| 78 |
+
def add_num_batched_tokens(self,
|
| 79 |
+
req_id: str,
|
| 80 |
+
num_batched_tokens: int,
|
| 81 |
+
num_cached_tokens: int = 0):
|
| 82 |
+
if req_id in self._request_ids_num_batched_tokens:
|
| 83 |
+
return
|
| 84 |
+
assert num_cached_tokens >= 0
|
| 85 |
+
assert num_batched_tokens >= 0
|
| 86 |
+
|
| 87 |
+
self._request_ids_num_batched_tokens.add(req_id)
|
| 88 |
+
self._num_batched_tokens += num_batched_tokens
|
| 89 |
+
self._num_cached_tokens += num_cached_tokens
|
| 90 |
+
|
| 91 |
+
def subtract_num_batched_tokens(self, req_id: str,
|
| 92 |
+
num_batched_tokens: int):
|
| 93 |
+
if req_id in self._request_ids_num_batched_tokens:
|
| 94 |
+
self._request_ids_num_batched_tokens.remove(req_id)
|
| 95 |
+
self._num_batched_tokens -= num_batched_tokens
|
| 96 |
+
|
| 97 |
+
def add_num_seqs(self, req_id: str, num_curr_seqs: int):
|
| 98 |
+
if req_id in self._request_ids_num_curr_seqs:
|
| 99 |
+
return
|
| 100 |
+
|
| 101 |
+
self._request_ids_num_curr_seqs.add(req_id)
|
| 102 |
+
self._num_curr_seqs += num_curr_seqs
|
| 103 |
+
|
| 104 |
+
def subtract_num_seqs(self, req_id: str, num_curr_seqs: int):
|
| 105 |
+
if req_id in self._request_ids_num_curr_seqs:
|
| 106 |
+
self._request_ids_num_curr_seqs.remove(req_id)
|
| 107 |
+
self._num_curr_seqs -= num_curr_seqs
|
| 108 |
+
|
| 109 |
+
@property
|
| 110 |
+
def num_batched_tokens(self):
|
| 111 |
+
return self._num_batched_tokens
|
| 112 |
+
|
| 113 |
+
@property
|
| 114 |
+
def num_curr_seqs(self):
|
| 115 |
+
return self._num_curr_seqs
|
| 116 |
+
|
| 117 |
+
@property
|
| 118 |
+
def num_cached_tokens(self):
|
| 119 |
+
return self._num_cached_tokens
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
@dataclass
|
| 123 |
+
class ScheduledSequenceGroup:
|
| 124 |
+
# A sequence group that's scheduled.
|
| 125 |
+
seq_group: SequenceGroup
|
| 126 |
+
# The total chunk size (number of tokens) to process for next iteration.
|
| 127 |
+
# 1 for decoding. Same as prompt tokens for prefill, but if prefill is
|
| 128 |
+
# chunked, it can be smaller than that.
|
| 129 |
+
token_chunk_size: int
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
@dataclass
|
| 133 |
+
class SchedulerOutputs:
|
| 134 |
+
"""The scheduling decision made from a scheduler."""
|
| 135 |
+
# Scheduled sequence groups.
|
| 136 |
+
scheduled_seq_groups: GenericSequence[ScheduledSequenceGroup]
|
| 137 |
+
# Number of prefill groups scheduled.
|
| 138 |
+
num_prefill_groups: int
|
| 139 |
+
# Total number of batched tokens.
|
| 140 |
+
num_batched_tokens: int
|
| 141 |
+
# Blocks to swap in. List of CPU -> GPU block number.
|
| 142 |
+
blocks_to_swap_in: List[Tuple[int, int]]
|
| 143 |
+
# Blocks to swap out. List of GPU -> CPU block number.
|
| 144 |
+
blocks_to_swap_out: List[Tuple[int, int]]
|
| 145 |
+
# Blocks to copy. Source to dest block.
|
| 146 |
+
blocks_to_copy: List[Tuple[int, int]]
|
| 147 |
+
# Sequence groups that are going to be ignored.
|
| 148 |
+
ignored_seq_groups: List[SequenceGroup]
|
| 149 |
+
# The number of slots for lookahead decoding.
|
| 150 |
+
num_lookahead_slots: int
|
| 151 |
+
# The number of requests in the running queue
|
| 152 |
+
running_queue_size: int
|
| 153 |
+
preempted: int
|
| 154 |
+
|
| 155 |
+
def __post_init__(self):
|
| 156 |
+
# Swap in and swap out should never happen at the same time.
|
| 157 |
+
assert not (self.blocks_to_swap_in and self.blocks_to_swap_out)
|
| 158 |
+
|
| 159 |
+
self.num_loras: int = len(self.lora_requests)
|
| 160 |
+
if self.num_loras > 0:
|
| 161 |
+
self._sort_by_lora_ids()
|
| 162 |
+
|
| 163 |
+
self.num_prompt_adapters: int = len(self.prompt_adapter_requests)
|
| 164 |
+
|
| 165 |
+
def is_empty(self) -> bool:
|
| 166 |
+
# NOTE: We do not consider the ignored sequence groups.
|
| 167 |
+
return (not self.scheduled_seq_groups and not self.blocks_to_swap_in
|
| 168 |
+
and not self.blocks_to_swap_out and not self.blocks_to_copy)
|
| 169 |
+
|
| 170 |
+
def _sort_by_lora_ids(self):
|
| 171 |
+
assert 0 <= self.num_prefill_groups <= len(self.scheduled_seq_groups)
|
| 172 |
+
|
| 173 |
+
def key_fn(group: ScheduledSequenceGroup):
|
| 174 |
+
key = (group.seq_group.lora_int_id, group.seq_group.request_id)
|
| 175 |
+
if 0 < self.num_prefill_groups < len(self.scheduled_seq_groups):
|
| 176 |
+
# Sort sequence groups so that all prefills come before all
|
| 177 |
+
# decodes as required by chunked prefill.
|
| 178 |
+
return (not group.seq_group.is_prefill(), *key)
|
| 179 |
+
return key
|
| 180 |
+
|
| 181 |
+
self.scheduled_seq_groups = sorted(self.scheduled_seq_groups,
|
| 182 |
+
key=key_fn)
|
| 183 |
+
|
| 184 |
+
@property
|
| 185 |
+
def lora_requests(self) -> Set[LoRARequest]:
|
| 186 |
+
return {
|
| 187 |
+
g.seq_group.lora_request
|
| 188 |
+
for g in self.scheduled_seq_groups
|
| 189 |
+
if g.seq_group.lora_request is not None
|
| 190 |
+
}
|
| 191 |
+
|
| 192 |
+
@property
|
| 193 |
+
def prompt_adapter_requests(self) -> Set[PromptAdapterRequest]:
|
| 194 |
+
return {
|
| 195 |
+
g.seq_group.prompt_adapter_request
|
| 196 |
+
for g in self.scheduled_seq_groups
|
| 197 |
+
if g.seq_group.prompt_adapter_request is not None
|
| 198 |
+
}
|
| 199 |
+
|
| 200 |
+
|
| 201 |
+
@dataclass
|
| 202 |
+
class SchedulerRunningOutputs:
|
| 203 |
+
"""The requests that are scheduled from a running queue.
|
| 204 |
+
|
| 205 |
+
Could contain prefill (prefill that's chunked) or decodes. If there's not
|
| 206 |
+
enough memory, it can be preempted (for recompute) or swapped out.
|
| 207 |
+
"""
|
| 208 |
+
# Selected sequences that are running and in a decoding phase.
|
| 209 |
+
decode_seq_groups: List[ScheduledSequenceGroup]
|
| 210 |
+
# Selected sequences that are running and in a prefill phase.
|
| 211 |
+
# I.e., it means the prefill has been chunked.
|
| 212 |
+
prefill_seq_groups: List[ScheduledSequenceGroup]
|
| 213 |
+
# The preempted sequences.
|
| 214 |
+
preempted: List[SequenceGroup]
|
| 215 |
+
# Sequences that are swapped out.
|
| 216 |
+
swapped_out: List[SequenceGroup]
|
| 217 |
+
# The blocks to swap out.
|
| 218 |
+
blocks_to_swap_out: List[Tuple[int, int]]
|
| 219 |
+
# The blocks to copy.
|
| 220 |
+
blocks_to_copy: List[Tuple[int, int]]
|
| 221 |
+
# The number of slots for lookahead decoding.
|
| 222 |
+
num_lookahead_slots: int
|
| 223 |
+
|
| 224 |
+
# Optimization for fast-access to seq_group lists
|
| 225 |
+
decode_seq_groups_list: List[SequenceGroup]
|
| 226 |
+
prefill_seq_groups_list: List[SequenceGroup]
|
| 227 |
+
|
| 228 |
+
@classmethod
|
| 229 |
+
def create_empty(cls) -> "SchedulerRunningOutputs":
|
| 230 |
+
return SchedulerRunningOutputs(
|
| 231 |
+
decode_seq_groups=[],
|
| 232 |
+
prefill_seq_groups=[],
|
| 233 |
+
preempted=[],
|
| 234 |
+
swapped_out=[],
|
| 235 |
+
blocks_to_swap_out=[],
|
| 236 |
+
blocks_to_copy=[],
|
| 237 |
+
num_lookahead_slots=0,
|
| 238 |
+
decode_seq_groups_list=[],
|
| 239 |
+
prefill_seq_groups_list=[],
|
| 240 |
+
)
|
| 241 |
+
|
| 242 |
+
|
| 243 |
+
@dataclass
|
| 244 |
+
class SchedulerSwappedInOutputs:
|
| 245 |
+
"""The requests that are scheduled from a swap queue.
|
| 246 |
+
|
| 247 |
+
Could contain prefill (prefill that's chunked) or decodes.
|
| 248 |
+
"""
|
| 249 |
+
# Selected sequences that are going to be swapped in and is in a
|
| 250 |
+
# decoding phase.
|
| 251 |
+
decode_seq_groups: List[ScheduledSequenceGroup]
|
| 252 |
+
# Selected sequences that are going to be swapped in and in a prefill
|
| 253 |
+
# phase. I.e., it means the prefill has been chunked.
|
| 254 |
+
prefill_seq_groups: List[ScheduledSequenceGroup]
|
| 255 |
+
# The blocks to swap in.
|
| 256 |
+
blocks_to_swap_in: List[Tuple[int, int]]
|
| 257 |
+
# The blocks to copy.
|
| 258 |
+
blocks_to_copy: List[Tuple[int, int]]
|
| 259 |
+
# The number of slots for lookahead decoding.
|
| 260 |
+
num_lookahead_slots: int
|
| 261 |
+
# Infeasible sequence groups.
|
| 262 |
+
infeasible_seq_groups: List[SequenceGroup]
|
| 263 |
+
|
| 264 |
+
@classmethod
|
| 265 |
+
def create_empty(cls) -> "SchedulerSwappedInOutputs":
|
| 266 |
+
return SchedulerSwappedInOutputs(
|
| 267 |
+
decode_seq_groups=[],
|
| 268 |
+
prefill_seq_groups=[],
|
| 269 |
+
blocks_to_swap_in=[],
|
| 270 |
+
blocks_to_copy=[],
|
| 271 |
+
num_lookahead_slots=0,
|
| 272 |
+
infeasible_seq_groups=[],
|
| 273 |
+
)
|
| 274 |
+
|
| 275 |
+
|
| 276 |
+
@dataclass
|
| 277 |
+
class SchedulerPrefillOutputs:
|
| 278 |
+
"""The requests that are scheduled from a waiting queue.
|
| 279 |
+
|
| 280 |
+
Could contain a fresh prefill requests or preempted requests that need
|
| 281 |
+
to be recomputed from scratch.
|
| 282 |
+
"""
|
| 283 |
+
# Selected sequences for prefill.
|
| 284 |
+
seq_groups: List[ScheduledSequenceGroup]
|
| 285 |
+
# Ignored sequence groups.
|
| 286 |
+
ignored_seq_groups: List[SequenceGroup]
|
| 287 |
+
num_lookahead_slots: int
|
| 288 |
+
|
| 289 |
+
@classmethod
|
| 290 |
+
def create_empty(cls) -> "SchedulerPrefillOutputs":
|
| 291 |
+
return SchedulerPrefillOutputs(
|
| 292 |
+
seq_groups=[],
|
| 293 |
+
ignored_seq_groups=[],
|
| 294 |
+
num_lookahead_slots=0,
|
| 295 |
+
)
|
| 296 |
+
|
| 297 |
+
|
| 298 |
+
def seq_group_metadata_builder():
|
| 299 |
+
return SequenceGroupMetadata(request_id="",
|
| 300 |
+
is_prompt=False,
|
| 301 |
+
seq_data={},
|
| 302 |
+
sampling_params=None,
|
| 303 |
+
block_tables={})
|
| 304 |
+
|
| 305 |
+
|
| 306 |
+
def scheduler_running_outputs_builder():
|
| 307 |
+
return SchedulerRunningOutputs(decode_seq_groups=[],
|
| 308 |
+
prefill_seq_groups=[],
|
| 309 |
+
preempted=[],
|
| 310 |
+
swapped_out=[],
|
| 311 |
+
blocks_to_swap_out=[],
|
| 312 |
+
blocks_to_copy=[],
|
| 313 |
+
num_lookahead_slots=0,
|
| 314 |
+
prefill_seq_groups_list=[],
|
| 315 |
+
decode_seq_groups_list=[])
|
| 316 |
+
|
| 317 |
+
|
| 318 |
+
def scheduled_seq_group_builder():
|
| 319 |
+
return ScheduledSequenceGroup(SequenceGroup.__new__(SequenceGroup),
|
| 320 |
+
token_chunk_size=0)
|
| 321 |
+
# return ScheduledSequenceGroup(seq_group=None, token_chunk_size=0)
|
| 322 |
+
|
| 323 |
+
|
| 324 |
+
class Scheduler:
|
| 325 |
+
|
| 326 |
+
def __init__(
    self,
    scheduler_config: SchedulerConfig,
    cache_config: CacheConfig,
    lora_config: Optional[LoRAConfig],
    pipeline_parallel_size: int = 1,
    output_proc_callback: Optional[Callable] = None,
) -> None:
    """Initialize scheduler state: block manager, request queues, and
    reusable per-step object caches.

    Args:
        scheduler_config: Scheduling policy/limits configuration.
        cache_config: KV-cache configuration (block counts, block size,
            sliding window, prefix caching).
        lora_config: LoRA configuration, or None when LoRA is disabled.
        pipeline_parallel_size: Number of pipeline stages; GPU/CPU block
            counts are divided evenly across stages.
        output_proc_callback: Callback for async output processing; its
            presence enables double-buffered object caches (see below).
    """
    self.scheduler_config = scheduler_config
    self.cache_config = cache_config
    # Note for LoRA scheduling: the current policy is extremely
    # simple and NOT fair. It can lead to starvation of some
    # LoRAs. This should be improved in the future.
    self.lora_config = lora_config

    # Pooling runners and attention-free models need no real KV-cache
    # bookkeeping, so a placeholder block manager is used instead.
    version = "selfattn"
    if (self.scheduler_config.runner_type == "pooling"
            or self.cache_config.is_attention_free):
        version = "placeholder"

    BlockSpaceManagerImpl = BlockSpaceManager.get_block_space_manager_class(
        version)

    # Each pipeline stage gets an equal share of the cache blocks.
    num_gpu_blocks = cache_config.num_gpu_blocks
    if num_gpu_blocks:
        num_gpu_blocks //= pipeline_parallel_size

    num_cpu_blocks = cache_config.num_cpu_blocks
    if num_cpu_blocks:
        num_cpu_blocks //= pipeline_parallel_size

    # Create the block space manager.
    self.block_manager = BlockSpaceManagerImpl(
        block_size=self.cache_config.block_size,
        num_gpu_blocks=num_gpu_blocks,
        num_cpu_blocks=num_cpu_blocks,
        sliding_window=self.cache_config.sliding_window,
        enable_caching=self.cache_config.enable_prefix_caching)

    # Sequence groups in the WAITING state.
    # Contain new prefill or preempted requests.
    self.waiting: Deque[SequenceGroup] = deque()
    # Sequence groups in the RUNNING state.
    # Contain decode requests.
    self.running: Deque[SequenceGroup] = deque()
    # Sequence groups in the SWAPPED state.
    # Contain decode requests that are swapped out.
    self.swapped: Deque[SequenceGroup] = deque()
    # Sequence groups finished requests ids since last step iteration.
    # It lets the model know that any state associated with these requests
    # can and must be released after the current step.
    # This is used to evict the finished requests from the Mamba cache.
    self._finished_requests_ids: List[str] = list()
    # Time at previous scheduling step
    self.prev_time = 0.0
    # Did we schedule a prompt at previous step?
    self.prev_prompt = False
    # Latency of the last prompt step
    self.last_prompt_latency = 0.0
    # preemption mode, RECOMPUTE or SWAP
    self.user_specified_preemption_mode = scheduler_config.preemption_mode

    # The following field is test-only. It is used to inject artificial
    # preemption.
    self.enable_artificial_preemption = ENABLE_ARTIFICIAL_PREEMPT
    self.artificial_preempt_cnt = (ARTIFICIAL_PREEMPTION_MAX_CNT
                                   if self.enable_artificial_preemption
                                   else 0)
    self.num_cumulative_preemption: int = 0

    # Used to cache python objects
    self._seq_group_metadata_cache: List[PyObjectCache] = []
    self._scheduler_running_outputs_cache: List[PyObjectCache] = []
    self._scheduled_seq_group_cache: List[PyObjectCache] = []

    # For async output processing, we need to swap cache buffers between
    # iterations. I.e. since the output processing is lagged one step,
    # we cannot reuse the cached objects immediately when the schedule()
    # is called again, but only when schedule() is called the second time.
    self.output_proc_callback = output_proc_callback
    self.use_async_output_proc = self.output_proc_callback is not None
    self.num_cache_iters = 2 if self.use_async_output_proc else 1

    # One cache per iteration slot: single-buffered normally,
    # double-buffered with async output processing.
    self.cache_id = 0
    for i in range(self.num_cache_iters):
        self._seq_group_metadata_cache.append(
            PyObjectCache(seq_group_metadata_builder))
        self._scheduler_running_outputs_cache.append(
            PyObjectCache(scheduler_running_outputs_builder))
        self._scheduled_seq_group_cache.append(
            PyObjectCache(scheduled_seq_group_builder))

    # For async postprocessor, the extra decode run cannot be done
    # when the request reaches max_model_len. In this case, the request
    # will be stopped during schedule() call and added to this stop list
    # for processing and deallocation by the free_finished_seq_groups()
    self._async_stopped: List[SequenceGroup] = []
|
| 423 |
+
|
| 424 |
+
@property
def next_cache_id(self):
    """Index of the object-cache buffer used by the *next* schedule() call.

    Cycles through the available buffers (1 normally, 2 when async output
    processing double-buffers the caches).
    """
    successor = self.cache_id + 1
    return successor % self.num_cache_iters
|
| 427 |
+
|
| 428 |
+
@property
def lora_enabled(self) -> bool:
    """Whether LoRA scheduling is active (a truthy LoRA config was given)."""
    has_lora_config = bool(self.lora_config)
    return has_lora_config
|
| 431 |
+
|
| 432 |
+
@property
def num_decoding_tokens_per_seq(self) -> int:
    """The number of new tokens generated per sequence per decode step.

    Fixed at one token per step for this scheduler.
    """
    return 1
|
| 436 |
+
|
| 437 |
+
def add_seq_group(self, seq_group: SequenceGroup) -> None:
    """Enqueue a new request onto the WAITING queue for future scheduling."""
    self.waiting.append(seq_group)
|
| 440 |
+
|
| 441 |
+
def _add_seq_group_to_running(self, seq_group: SequenceGroup) -> None:
    """Place a sequence group directly on the RUNNING queue.

    Test-only helper; production requests enter via add_seq_group().
    """
    self.running.append(seq_group)
|
| 445 |
+
|
| 446 |
+
def _add_seq_group_to_swapped(self, seq_group: SequenceGroup) -> None:
    """Place a sequence group directly on the SWAPPED queue.

    Test-only helper; production requests enter via add_seq_group().
    """
    self.swapped.append(seq_group)
|
| 450 |
+
|
| 451 |
+
def abort_seq_group(self, request_id: Union[str, Iterable[str]]) -> None:
    """Aborts a sequence group with the given ID.

    Check if the sequence group with the given ID
    is present in any of the state queue.
    If present, remove the sequence group from the state queue.
    Also, if any of the sequences in the sequence group is not finished,
    free the sequence with status `FINISHED_ABORTED`.
    Otherwise, do nothing.

    Args:
        request_id: The ID(s) of the sequence group to abort.
    """
    if isinstance(request_id, str):
        request_id = (request_id, )
    request_ids = set(request_id)
    for state_queue in [self.waiting, self.running, self.swapped]:
        # Collect matches first; removing from a deque while iterating
        # it would invalidate the iteration.
        aborted_groups: List[SequenceGroup] = []
        for seq_group in state_queue:
            if not request_ids:
                # Using 'break' here may add two extra iterations,
                # but is acceptable to reduce complexity.
                break
            if seq_group.request_id in request_ids:
                # Appending aborted group into pending list.
                aborted_groups.append(seq_group)
                request_ids.remove(seq_group.request_id)
        for aborted_group in aborted_groups:
            # Remove the sequence group from the state queue.
            state_queue.remove(aborted_group)
            # Remove the aborted request from the Mamba cache.
            self._finished_requests_ids.append(aborted_group.request_id)
            # Free any sequence that has not already finished.
            for seq in aborted_group.get_seqs():
                if seq.is_finished():
                    continue
                seq.status = SequenceStatus.FINISHED_ABORTED
                self.free_seq(seq)

            self._free_seq_group_cross_attn_blocks(aborted_group)
|
| 490 |
+
|
| 491 |
+
def _free_seq_group_cross_attn_blocks(
    self,
    seq_group: SequenceGroup,
) -> None:
    """Release the cross-attention block table held by *seq_group*.

    Only encoder-decoder models allocate cross-attention blocks, so this
    is a no-op for decoder-only models.
    """
    if not seq_group.is_encoder_decoder():
        return
    self.block_manager.free_cross(seq_group)
|
| 501 |
+
|
| 502 |
+
def has_unfinished_seqs(self) -> bool:
    """True while any request remains in the waiting, running, or
    swapped queues."""
    return bool(self.waiting) or bool(self.running) or bool(self.swapped)
|
| 505 |
+
|
| 506 |
+
def get_prefix_cache_hit_rate(self, device: Device) -> float:
    """Report the prefix-cache hit rate for *device*, as tracked by the
    block manager."""
    hit_rate = self.block_manager.get_prefix_cache_hit_rate(device)
    return hit_rate
|
| 508 |
+
|
| 509 |
+
def reset_prefix_cache(self) -> bool:
    """Clear the block manager's prefix cache; returns its success flag."""
    did_reset = self.block_manager.reset_prefix_cache()
    return did_reset
|
| 511 |
+
|
| 512 |
+
def get_num_unfinished_seq_groups(self) -> int:
    """Total count of requests across waiting, running, and swapped."""
    queues = (self.waiting, self.running, self.swapped)
    return sum(len(queue) for queue in queues)
|
| 514 |
+
|
| 515 |
+
def get_and_reset_finished_requests_ids(self) -> List[str]:
    """Flushes the list of request ids of previously finished seq_groups."""
    # Hand the accumulated list to the caller and start a fresh one, so
    # each id is reported exactly once.
    flushed_ids = self._finished_requests_ids
    self._finished_requests_ids = []
    return flushed_ids
|
| 520 |
+
|
| 521 |
+
def _schedule_running(
    self,
    budget: SchedulingBudget,
    curr_loras: Optional[Set[int]],
    enable_chunking: bool = False,
) -> SchedulerRunningOutputs:
    """Schedule sequence groups that are running.

    Running queue should include decode and chunked prefill requests.

    Args:
        budget: The scheduling budget. The argument is in-place updated
            when any decodes are preempted.
        curr_loras: Currently batched lora request ids. The argument is
            in-place updated when any decodes are preempted.
        enable_chunking: If True, seq group can be chunked and only a
            chunked number of tokens are scheduled if
            `budget.num_batched_tokens` has not enough capacity to schedule
            all tokens.

    Returns:
        SchedulerRunningOutputs.
    """
    # Reuse a cached output object instead of allocating a fresh one
    # each step; all of its containers must be cleared before use.
    ret: SchedulerRunningOutputs = \
        self._scheduler_running_outputs_cache[self.cache_id].get_object()
    ret.blocks_to_swap_out.clear()
    ret.blocks_to_copy.clear()
    ret.decode_seq_groups.clear()
    ret.prefill_seq_groups.clear()
    ret.preempted.clear()
    ret.swapped_out.clear()

    ret.num_lookahead_slots = self._get_num_lookahead_slots(
        is_prefill=False, enable_chunking=enable_chunking)

    ret.decode_seq_groups_list.clear()
    ret.prefill_seq_groups_list.clear()

    # Blocks that need to be swapped or copied before model execution.
    blocks_to_swap_out: List[Tuple[int, int]] = ret.blocks_to_swap_out
    blocks_to_copy: List[Tuple[int, int]] = ret.blocks_to_copy

    decode_seq_groups: List[ScheduledSequenceGroup] = ret.decode_seq_groups
    prefill_seq_groups: List[
        ScheduledSequenceGroup] = ret.prefill_seq_groups
    preempted: List[SequenceGroup] = ret.preempted
    swapped_out: List[SequenceGroup] = ret.swapped_out

    running_queue = self.running
    assert len(self._async_stopped) == 0
    while running_queue:
        seq_group = running_queue[0]
        # We discard the cached tokens info here because we don't need it
        # for running sequence:
        # 1. If a sequence is running with chunked prefill, the cached
        #    tokens info was already used for the first prefill.
        # 2. If a sequence is running with non-chunked prefill, then
        #    there it's a decoding sequence, and the cached tokens info is
        #    irrelevant.
        num_uncached_new_tokens, _ = (
            self._get_num_new_uncached_and_cached_tokens(
                seq_group, SequenceStatus.RUNNING, enable_chunking,
                budget))

        num_running_tokens = num_uncached_new_tokens
        if num_running_tokens == 0:
            # No budget => Stop
            break

        running_queue.popleft()

        # With async postprocessor, an extra decode run is done
        # to process the final tokens. The check below avoids this extra
        # decode run when the model max len is reached, in order to avoid
        # a memory overflow.
        if self.use_async_output_proc and seq_group.seqs[0].get_len(
        ) > self.scheduler_config.max_model_len:
            self._async_stopped.append(seq_group)
            continue

        # NOTE(woosuk): Preemption happens only when there is no available
        # slot to keep all the sequence groups in the RUNNING state.
        # NOTE: this inner loop pairs with the `else` clause below —
        # the else runs only when the loop exits without `break`,
        # i.e. when slots could be appended for seq_group.
        while not self._can_append_slots(seq_group, enable_chunking):
            budget.subtract_num_batched_tokens(seq_group.request_id,
                                               num_running_tokens)
            num_running_seqs = seq_group.get_max_num_running_seqs()
            budget.subtract_num_seqs(seq_group.request_id,
                                     num_running_seqs)

            if (curr_loras is not None and seq_group.lora_int_id > 0
                    and seq_group.lora_int_id in curr_loras):
                curr_loras.remove(seq_group.lora_int_id)

            # Determine victim sequence
            cont_loop = True
            if running_queue:
                # Preempt the lowest-priority sequence group.
                victim_seq_group = running_queue.pop()
            else:
                # No other sequence group can be preempted.
                # Preempt the current sequence group.
                # Note: This is also where we stop this loop
                # (since there is nothing else to preempt)
                victim_seq_group = seq_group
                cont_loop = False

            # With async postprocessor, before preempting a sequence
            # we need to ensure it has no pending async postprocessor
            do_preempt = True
            if self.use_async_output_proc:
                assert self.output_proc_callback is not None
                self.output_proc_callback(
                    request_id=victim_seq_group.request_id)

                # It may be that the async pending "victim_seq_group"
                # becomes finished, in which case we simply free it.
                if victim_seq_group.is_finished():
                    self._free_finished_seq_group(victim_seq_group)
                    do_preempt = False

            # Do preemption
            if do_preempt:
                preempted_mode = self._preempt(victim_seq_group,
                                               blocks_to_swap_out)
                if preempted_mode == PreemptionMode.RECOMPUTE:
                    preempted.append(victim_seq_group)
                else:
                    swapped_out.append(victim_seq_group)

            if not cont_loop:
                break
        else:
            self._append_slots(seq_group, blocks_to_copy, enable_chunking)
            is_prefill = seq_group.is_prefill()

            # Reuse a cached ScheduledSequenceGroup object as well.
            scheduled_seq_group: ScheduledSequenceGroup = \
                self._scheduled_seq_group_cache[self.cache_id].get_object()
            scheduled_seq_group.seq_group = seq_group
            if is_prefill:
                scheduled_seq_group.token_chunk_size = num_running_tokens
                prefill_seq_groups.append(scheduled_seq_group)
                ret.prefill_seq_groups_list.append(seq_group)
            else:
                scheduled_seq_group.token_chunk_size = 1
                decode_seq_groups.append(scheduled_seq_group)
                ret.decode_seq_groups_list.append(seq_group)

            budget.add_num_batched_tokens(seq_group.request_id,
                                          num_running_tokens)
            # OPTIMIZATION: Note that get_max_num_running_seqs is
            # expensive. For the default scheduling chase where
            # enable_chunking is False, num_seqs are updated before running
            # this method, so we don't have to update it again here.
            if enable_chunking:
                num_running_seqs = seq_group.get_max_num_running_seqs()
                budget.add_num_seqs(seq_group.request_id, num_running_seqs)
            if curr_loras is not None and seq_group.lora_int_id > 0:
                curr_loras.add(seq_group.lora_int_id)

    # Reset the *other* buffer's caches: its objects were handed out one
    # step ago and are safe to recycle now (see __init__ comment on
    # async output processing).
    self._scheduler_running_outputs_cache[self.next_cache_id].reset()
    self._scheduled_seq_group_cache[self.next_cache_id].reset()

    return ret
|
| 684 |
+
|
| 685 |
+
def _schedule_swapped(
    self,
    budget: SchedulingBudget,
    curr_loras: Optional[Set[int]],
    enable_chunking: bool = False,
) -> SchedulerSwappedInOutputs:
    """Schedule sequence groups that are swapped out.

    It schedules swapped requests as long as it fits `budget` and
    curr_loras <= max_lora from the scheduling config. The input arguments
    `budget` and `curr_loras` are updated based on scheduled seq_groups.

    Args:
        budget: The scheduling budget. The argument is in-place updated
            when any requests are swapped in.
        curr_loras: Currently batched lora request ids. The argument is
            in-place updated when any requests are swapped in.
        enable_chunking: If True, seq group can be chunked and only a
            chunked number of tokens are scheduled if
            `budget.num_batched_tokens` has not enough capacity to schedule
            all tokens.

    Returns:
        SchedulerSwappedInOutputs.
    """
    # Blocks that need to be swapped or copied before model execution.
    blocks_to_swap_in: List[Tuple[int, int]] = []
    blocks_to_copy: List[Tuple[int, int]] = []
    decode_seq_groups: List[ScheduledSequenceGroup] = []
    prefill_seq_groups: List[ScheduledSequenceGroup] = []
    infeasible_seq_groups: List[SequenceGroup] = []

    swapped_queue = self.swapped

    # Groups skipped only because of the LoRA slot limit; they are put
    # back at the front of the queue at the end.
    leftover_swapped: Deque[SequenceGroup] = deque()
    while swapped_queue:
        seq_group = swapped_queue[0]

        # If the sequence group cannot be swapped in, stop.
        is_prefill = seq_group.is_prefill()
        alloc_status = self.block_manager.can_swap_in(
            seq_group,
            self._get_num_lookahead_slots(is_prefill, enable_chunking))
        if alloc_status == AllocStatus.LATER:
            break
        elif alloc_status == AllocStatus.NEVER:
            # The request can never fit: fail it permanently rather
            # than retrying forever.
            logger.warning(
                "Failing the request %s because there's not enough kv "
                "cache blocks to run the entire sequence.",
                seq_group.request_id)
            for seq in seq_group.get_seqs():
                seq.status = SequenceStatus.FINISHED_IGNORED
            infeasible_seq_groups.append(seq_group)
            swapped_queue.popleft()
            continue

        lora_int_id = 0
        if self.lora_enabled:
            lora_int_id = seq_group.lora_int_id
            assert curr_loras is not None
            assert self.lora_config is not None
            if (lora_int_id > 0 and (lora_int_id not in curr_loras)
                    and len(curr_loras) >= self.lora_config.max_loras):
                # We don't have a space for another LoRA, so
                # we ignore this request for now.
                leftover_swapped.appendleft(seq_group)
                swapped_queue.popleft()
                continue

        # The total number of sequences in the RUNNING state should not
        # exceed the maximum number of sequences.
        num_new_seqs = seq_group.get_max_num_running_seqs()
        num_new_tokens_uncached, num_new_tokens_cached = (
            self._get_num_new_uncached_and_cached_tokens(
                seq_group, SequenceStatus.SWAPPED, enable_chunking,
                budget))

        if num_new_tokens_uncached == 0 or not budget.can_schedule(
                num_new_tokens=num_new_tokens_uncached,
                num_new_seqs=num_new_seqs,
        ):
            break

        if lora_int_id > 0 and curr_loras is not None:
            curr_loras.add(lora_int_id)
        swapped_queue.popleft()
        self._swap_in(seq_group, blocks_to_swap_in)
        self._append_slots(seq_group, blocks_to_copy, enable_chunking)
        is_prefill = seq_group.is_prefill()
        if is_prefill:
            prefill_seq_groups.append(
                ScheduledSequenceGroup(
                    seq_group,
                    token_chunk_size=num_new_tokens_uncached +
                    num_new_tokens_cached,
                ))
        else:
            decode_seq_groups.append(
                ScheduledSequenceGroup(seq_group, token_chunk_size=1))
        budget.add_num_batched_tokens(
            seq_group.request_id,
            num_batched_tokens=num_new_tokens_uncached,
            num_cached_tokens=num_new_tokens_cached,
        )
        budget.add_num_seqs(seq_group.request_id, num_new_seqs)

    # Re-queue LoRA-deferred groups at the front, preserving order.
    swapped_queue.extendleft(leftover_swapped)

    return SchedulerSwappedInOutputs(
        decode_seq_groups=decode_seq_groups,
        prefill_seq_groups=prefill_seq_groups,
        blocks_to_swap_in=blocks_to_swap_in,
        blocks_to_copy=blocks_to_copy,
        num_lookahead_slots=self._get_num_lookahead_slots(
            is_prefill=False, enable_chunking=enable_chunking),
        infeasible_seq_groups=infeasible_seq_groups,
    )
|
| 802 |
+
|
| 803 |
+
def _get_prompt_limit(self, seq_group: SequenceGroup) -> int:
    """Return the maximum prompt length permitted for *seq_group*.

    With chunked prefill (and no multi-step), a prompt may span multiple
    batches, so only max_model_len bounds it; otherwise the prompt must
    also fit within a single batch's token budget.
    """
    if (self.scheduler_config.chunked_prefill_enabled
            and not self.scheduler_config.is_multi_step):
        prompt_limit = self.scheduler_config.max_model_len
    else:
        prompt_limit = min(self.scheduler_config.max_model_len,
                           self.scheduler_config.max_num_batched_tokens)

    # Model is fine tuned with long context. Return the fine tuned max_len.
    lora_request = seq_group.lora_request
    if lora_request and lora_request.long_lora_max_len:
        assert prompt_limit <= lora_request.long_lora_max_len
        return lora_request.long_lora_max_len
    return prompt_limit
|
| 818 |
+
|
| 819 |
+
def _get_priority(self,
                  seq_group: SequenceGroup) -> Tuple[Optional[int], float]:
    """Sort key for a sequence group.

    User-defined priority takes precedence; arrival time breaks ties.

    Args:
        seq_group: The sequence group input.
    Returns:
        The (priority, arrival_time) tuple for the sequence group.
    """
    return (seq_group.priority, seq_group.arrival_time)
|
| 829 |
+
|
| 830 |
+
def _schedule_priority_preemption(
    self,
    budget: SchedulingBudget,
) -> int:
    """Sorts waiting and running queue. Also, force preempt requests
    from the running queue if their priority is lower.
    Priority-based preemption is used with the priority policy.
    Args:
        budget: The scheduling budget. The argument is in-place updated
            when any requests are scheduled.
    Returns:
        A count of priority-based preemptions.
    """

    waiting_queue = self.waiting

    # Sort running by (priority, arrival_time) so the lowest-priority
    # group sits at the right end and is popped first.
    running_queue = deque(sorted(self.running, key=self._get_priority))

    blocks_to_swap_out: List[Tuple[int, int]] = []
    force_preemption_count = 0

    if waiting_queue:
        # Only the highest-priority waiting group is considered here.
        seq_group = waiting_queue.popleft()
        num_new_seqs = seq_group.get_max_num_running_seqs()
        num_new_tokens_uncached, _ = (
            self._get_num_new_uncached_and_cached_tokens(
                seq_group, SequenceStatus.WAITING, False, budget))

        #Only preempt if priority inversion exists
        while running_queue and self._get_priority(
                running_queue[-1]) > self._get_priority(seq_group):
            #Only preempt if waiting sequence cannot be allocated
            can_allocate = self.block_manager.can_allocate(seq_group)
            if (num_new_tokens_uncached > 0
                    and can_allocate == AllocStatus.OK
                    and budget.can_schedule(
                        num_new_tokens=num_new_tokens_uncached,
                        num_new_seqs=num_new_seqs,
                    )):
                break

            #Adjust budget to remove the victim sequence group
            vseq_group = running_queue.pop()
            num_running_tokens_uncached, _ = (
                self._get_num_new_uncached_and_cached_tokens(
                    vseq_group, SequenceStatus.RUNNING, False, budget))
            budget.subtract_num_batched_tokens(
                vseq_group.request_id, num_running_tokens_uncached)
            num_running_seqs = vseq_group.get_max_num_running_seqs()
            budget.subtract_num_seqs(vseq_group.request_id,
                                     num_running_seqs)

            #Preempt out the victim sequence group
            self._preempt(vseq_group, blocks_to_swap_out)
            waiting_queue.appendleft(vseq_group)
            force_preemption_count += 1
        #Put the sequence back into the waiting queue
        waiting_queue.appendleft(seq_group)

    # Re-sort waiting so preempted victims land in priority order.
    waiting_queue = deque(sorted(waiting_queue, key=self._get_priority))

    self.waiting = waiting_queue
    self.running = running_queue
    return force_preemption_count
|
| 894 |
+
|
| 895 |
+
def _schedule_prefills(
    self,
    budget: SchedulingBudget,
    curr_loras: Optional[Set[int]],
    enable_chunking: bool = False,
) -> SchedulerPrefillOutputs:
    """Schedule sequence groups that are in prefill stage.

    Note that the current scheduler treats PREEMPTED_FOR_RECOMPUTE
    as a new prefill (that starts from beginning -> most recently generated
    tokens).

    It schedules waiting requests as long as it fits `budget` and
    curr_loras <= max_lora from the scheduling config. The input arguments
    `budget` and `curr_loras` are updated based on scheduled seq_groups.

    Args:
        budget: The scheduling budget. The argument is in-place updated
            when any requests are scheduled.
        curr_loras: Currently batched lora request ids. The argument is
            in-place updated when any requests are scheduled.
        enable_chunking: If True, seq group can be chunked and only a
            chunked number of tokens are scheduled if
            `budget.num_batched_tokens` has not enough capacity to schedule
            all tokens.

    Returns:
        SchedulerPrefillOutputs.
    """
    ignored_seq_groups: List[SequenceGroup] = []
    seq_groups: List[ScheduledSequenceGroup] = []

    waiting_queue = self.waiting

    # Groups skipped only because of the LoRA slot limit; re-queued at
    # the front when the loop ends.
    leftover_waiting_sequences: Deque[SequenceGroup] = deque()
    while self._passed_delay(time.time()) and waiting_queue:
        seq_group = waiting_queue[0]

        waiting_seqs = seq_group.get_seqs(status=SequenceStatus.WAITING)
        assert len(waiting_seqs) == 1, (
            "Waiting sequence group should have only one prompt "
            "sequence.")
        num_new_tokens_uncached, num_new_tokens_cached = (
            self._get_num_new_uncached_and_cached_tokens(
                seq_group, SequenceStatus.WAITING, enable_chunking,
                budget))
        num_new_tokens = num_new_tokens_uncached + num_new_tokens_cached

        if not enable_chunking:
            # Without chunking the entire prompt must be scheduled at once.
            num_prompt_tokens = waiting_seqs[0].get_len()
            assert num_new_tokens == num_prompt_tokens

        prompt_limit = self._get_prompt_limit(seq_group)
        if num_new_tokens > prompt_limit:
            # Over-long prompt: ignore the request permanently.
            logger.warning(
                "Input prompt (%d tokens) is too long"
                " and exceeds limit of %d", num_new_tokens, prompt_limit)
            for seq in waiting_seqs:
                seq.status = SequenceStatus.FINISHED_IGNORED
            ignored_seq_groups.append(seq_group)
            waiting_queue.popleft()
            continue

        num_lookahead_slots: int = 0
        if self.scheduler_config.is_multi_step and enable_chunking:
            num_lookahead_slots = self._get_num_lookahead_slots(
                True, enable_chunking)

        # If the sequence group cannot be allocated, stop.
        can_allocate = self.block_manager.can_allocate(
            seq_group, num_lookahead_slots=num_lookahead_slots)
        if can_allocate == AllocStatus.LATER:
            break
        elif can_allocate == AllocStatus.NEVER:
            logger.warning(
                "Input prompt (%d tokens) + lookahead slots (%d) is "
                "too long and exceeds the capacity of block_manager",
                num_new_tokens, num_lookahead_slots)
            for seq in waiting_seqs:
                seq.status = SequenceStatus.FINISHED_IGNORED
            ignored_seq_groups.append(seq_group)
            waiting_queue.popleft()
            continue

        lora_int_id = 0
        if self.lora_enabled:
            lora_int_id = seq_group.lora_int_id
            assert curr_loras is not None
            assert self.lora_config is not None
            if (self.lora_enabled and lora_int_id > 0
                    and lora_int_id not in curr_loras
                    and len(curr_loras) >= self.lora_config.max_loras):
                # We don't have a space for another LoRA, so
                # we ignore this request for now.
                leftover_waiting_sequences.appendleft(seq_group)
                waiting_queue.popleft()
                continue

        if (budget.num_batched_tokens
                >= self.scheduler_config.max_num_batched_tokens):
            # We've reached the budget limit - since there might be
            # continuous prefills in the running queue, we should break
            # to avoid scheduling any new prefills.
            break

        num_new_seqs = seq_group.get_max_num_running_seqs()
        if num_new_tokens_uncached == 0 or not budget.can_schedule(
                num_new_tokens=num_new_tokens_uncached,
                num_new_seqs=num_new_seqs,
        ):
            break

        # Can schedule this request.
        if curr_loras is not None and lora_int_id > 0:
            curr_loras.add(lora_int_id)
        waiting_queue.popleft()
        self._allocate_and_set_running(seq_group)

        if enable_chunking and self.scheduler_config.is_multi_step:
            blocks_to_copy: List[Tuple[int, int]] = []
            # init_multi_step_from_lookahead_slots happens in append_slots
            self._append_slots(seq_group, blocks_to_copy, enable_chunking)
            # This assert will trip when a copy-on-write happens. This is
            # not a concern as the very first sequence-group block
            # allocation happens above. Still, we have the assert to
            # catch any edge-cases.
            assert not blocks_to_copy
        else:
            seq_group.init_multi_step_from_lookahead_slots(
                num_lookahead_slots,
                num_scheduler_steps=self.scheduler_config.
                num_scheduler_steps,
                is_multi_step=self.scheduler_config.is_multi_step,
                enable_chunking=enable_chunking)

        seq_groups.append(
            ScheduledSequenceGroup(seq_group=seq_group,
                                   token_chunk_size=num_new_tokens))
        budget.add_num_batched_tokens(
            seq_group.request_id,
            num_batched_tokens=num_new_tokens_uncached,
            num_cached_tokens=num_new_tokens_cached,
        )
        budget.add_num_seqs(seq_group.request_id, num_new_seqs)

    # Queue requests that couldn't be scheduled.
    waiting_queue.extendleft(leftover_waiting_sequences)
    if len(seq_groups) > 0:
        self.prev_prompt = True

    return SchedulerPrefillOutputs(
        seq_groups=seq_groups,
        ignored_seq_groups=ignored_seq_groups,
        num_lookahead_slots=self._get_num_lookahead_slots(
            is_prefill=True, enable_chunking=enable_chunking))
|
| 1050 |
+
|
| 1051 |
+
    def _schedule_default(self) -> SchedulerOutputs:
        """Schedule queued requests.

        The current policy is designed to optimize the throughput. First,
        it batches as many prefill requests as possible. And it schedules
        decodes. If there's a pressure on GPU memory, decode requests can
        be swapped or preempted.

        Returns:
            SchedulerOutputs describing the batch chosen for this step.
            Mutates self.running / self.swapped / self.waiting as a side
            effect.
        """
        # Include running requests to the budget.
        budget = SchedulingBudget(
            token_budget=self.scheduler_config.max_num_batched_tokens,
            max_num_seqs=self.scheduler_config.max_num_seqs,
        )
        # Make sure we include num running seqs before scheduling prefill,
        # so that we don't schedule beyond max_num_seqs for prefill.
        for seq_group in self.running:
            budget.add_num_seqs(seq_group.request_id,
                                seq_group.get_max_num_running_seqs())
        # LoRA adapters already active in the running queue; None when LoRA
        # is disabled.
        curr_loras = set(
            seq_group.lora_int_id for seq_group in self.running
            if seq_group.lora_int_id > 0) if self.lora_enabled else None

        prefills = SchedulerPrefillOutputs.create_empty()
        running_scheduled = SchedulerRunningOutputs.create_empty()
        swapped_in = SchedulerSwappedInOutputs.create_empty()

        # If any requests are swapped, prioritized swapped requests.
        if not self.swapped:
            prefills = self._schedule_prefills(budget,
                                               curr_loras,
                                               enable_chunking=False)

        if len(prefills.seq_groups
               ) == 0 and self.scheduler_config.policy == "priority":
            self._schedule_priority_preemption(budget)

        # Don't schedule decodes if prefills are scheduled.
        # NOTE: If `_schedule_prefills` doesn't enable chunking, self.running
        # only contains decode requests, not chunked prefills.
        if len(prefills.seq_groups) == 0:
            running_scheduled = self._schedule_running(budget,
                                                       curr_loras,
                                                       enable_chunking=False)

            # If any sequence group is preempted, do not swap in any sequence
            # group. because it means there's no slot for new running requests.
            if len(running_scheduled.preempted) + len(
                    running_scheduled.swapped_out) == 0:
                swapped_in = self._schedule_swapped(budget, curr_loras)

        # Sanity check: scheduling must never exceed the configured budget.
        assert (budget.num_batched_tokens
                <= self.scheduler_config.max_num_batched_tokens)
        assert budget.num_curr_seqs <= self.scheduler_config.max_num_seqs

        # Update waiting requests.
        self.waiting.extendleft(running_scheduled.preempted)
        # Update new running requests.
        if len(prefills.seq_groups) > 0:
            self.running.extend([s.seq_group for s in prefills.seq_groups])

        self.running.extend(running_scheduled.decode_seq_groups_list)

        if len(swapped_in.decode_seq_groups) > 0:
            self.running.extend(
                [s.seq_group for s in swapped_in.decode_seq_groups])

        # Update swapped requests.
        self.swapped.extend(running_scheduled.swapped_out)
        preempted = (len(running_scheduled.preempted) +
                     len(running_scheduled.swapped_out))

        # There should be no prefill from running queue because this policy
        # doesn't allow chunked prefills.
        assert len(running_scheduled.prefill_seq_groups) == 0
        assert len(swapped_in.prefill_seq_groups) == 0

        # Merge lists
        num_prefill_groups = len(prefills.seq_groups)
        if num_prefill_groups > 0:
            scheduled_seq_groups = prefills.seq_groups
            scheduled_seq_groups.extend(running_scheduled.decode_seq_groups)
        else:
            scheduled_seq_groups = running_scheduled.decode_seq_groups
        scheduled_seq_groups.extend(swapped_in.decode_seq_groups)

        blocks_to_copy = running_scheduled.blocks_to_copy
        blocks_to_copy.extend(swapped_in.blocks_to_copy)

        ignored_seq_groups = prefills.ignored_seq_groups
        ignored_seq_groups.extend(swapped_in.infeasible_seq_groups)

        return SchedulerOutputs(
            scheduled_seq_groups=scheduled_seq_groups,
            num_prefill_groups=num_prefill_groups,
            num_batched_tokens=budget.num_batched_tokens +
            budget.num_cached_tokens,
            blocks_to_swap_in=swapped_in.blocks_to_swap_in,
            blocks_to_swap_out=running_scheduled.blocks_to_swap_out,
            blocks_to_copy=blocks_to_copy,
            ignored_seq_groups=ignored_seq_groups,
            num_lookahead_slots=running_scheduled.num_lookahead_slots,
            running_queue_size=len(self.running),
            preempted=preempted,
        )
    def _schedule_chunked_prefill(self) -> SchedulerOutputs:
        """Schedule queued requests.

        Chunked prefill allows to chunk prefill requests, batch them together
        with decode requests. This policy 1. schedule as many decoding requests
        as possible. 2. schedule chunked prefill requests that are not
        finished. 3. schedule swapped request. 4. schedule new prefill
        requests.

        The policy can sustain the high GPU utilization because it can put
        prefill and decodes requests to the same batch, while it improves
        inter token latency because decodes requests don't need to be blocked
        by prefill requests.
        """
        budget = SchedulingBudget(
            token_budget=self.scheduler_config.max_num_batched_tokens,
            max_num_seqs=self.scheduler_config.max_num_seqs,
        )
        curr_loras: Set[int] = set()

        prefills = SchedulerPrefillOutputs.create_empty()
        swapped_in = SchedulerSwappedInOutputs.create_empty()

        # Decoding should be always scheduled first by fcfs.
        running_scheduled = self._schedule_running(budget,
                                                   curr_loras,
                                                   enable_chunking=True)

        # Schedule swapped out requests.
        # If preemption happens, it means we don't have space for swap-in.
        if len(running_scheduled.preempted) + len(
                running_scheduled.swapped_out) == 0:
            swapped_in = self._schedule_swapped(budget, curr_loras)

        # New prefills consume whatever budget is left after decodes/swap-ins.
        prefills = self._schedule_prefills(budget,
                                           curr_loras,
                                           enable_chunking=True)

        # Sanity check: scheduling must never exceed the configured budget.
        assert (budget.num_batched_tokens
                <= self.scheduler_config.max_num_batched_tokens)
        assert budget.num_curr_seqs <= self.scheduler_config.max_num_seqs

        # Update waiting requests.
        self.waiting.extendleft(running_scheduled.preempted)

        # Update new running requests.
        # By default, vLLM scheduler prioritizes prefills.
        # Once chunked prefill is enabled,
        # the policy is changed to prioritize decode requests.
        self.running.extend(
            [s.seq_group for s in swapped_in.decode_seq_groups])
        self.running.extend(
            [s.seq_group for s in swapped_in.prefill_seq_groups])
        self.running.extend(
            [s.seq_group for s in running_scheduled.decode_seq_groups])
        self.running.extend(
            [s.seq_group for s in running_scheduled.prefill_seq_groups])
        self.running.extend([s.seq_group for s in prefills.seq_groups])

        # Update swapped requests.
        self.swapped.extend(running_scheduled.swapped_out)
        # Put prefills first due to Attention backend ordering assumption.
        scheduled_seq_groups = (prefills.seq_groups +
                                running_scheduled.prefill_seq_groups +
                                swapped_in.prefill_seq_groups +
                                running_scheduled.decode_seq_groups +
                                swapped_in.decode_seq_groups)
        num_prefill_groups = (len(prefills.seq_groups) +
                              len(swapped_in.prefill_seq_groups) +
                              len(running_scheduled.prefill_seq_groups))
        # If all prompts, then we set num_lookahead_slots to 0
        # this allows us to go through the `no_spec` path in
        # `spec_decode_worker.py`
        all_prefills = (len(scheduled_seq_groups) == num_prefill_groups)
        num_lookahead_slots = (0 if
                               (all_prefills
                                and not self.scheduler_config.is_multi_step)
                               else running_scheduled.num_lookahead_slots)
        return SchedulerOutputs(
            scheduled_seq_groups=scheduled_seq_groups,
            num_prefill_groups=num_prefill_groups,
            num_batched_tokens=budget.num_batched_tokens +
            budget.num_cached_tokens,
            blocks_to_swap_in=swapped_in.blocks_to_swap_in,
            blocks_to_swap_out=running_scheduled.blocks_to_swap_out,
            blocks_to_copy=running_scheduled.blocks_to_copy +
            swapped_in.blocks_to_copy,
            ignored_seq_groups=prefills.ignored_seq_groups +
            swapped_in.infeasible_seq_groups,
            num_lookahead_slots=num_lookahead_slots,
            running_queue_size=len(self.running),
            preempted=(len(running_scheduled.preempted) +
                       len(running_scheduled.swapped_out)),
        )
def _schedule(self) -> SchedulerOutputs:
|
| 1252 |
+
"""Schedule queued requests."""
|
| 1253 |
+
if self.scheduler_config.chunked_prefill_enabled:
|
| 1254 |
+
return self._schedule_chunked_prefill()
|
| 1255 |
+
else:
|
| 1256 |
+
return self._schedule_default()
|
| 1257 |
+
|
| 1258 |
+
def _can_append_slots(self, seq_group: SequenceGroup,
|
| 1259 |
+
enable_chunking: bool) -> bool:
|
| 1260 |
+
"""Determine whether or not we have enough space in the KV cache to
|
| 1261 |
+
continue generation of the sequence group.
|
| 1262 |
+
"""
|
| 1263 |
+
# It is True only for testing case to trigger artificial preemption.
|
| 1264 |
+
if (self.enable_artificial_preemption
|
| 1265 |
+
and random.uniform(0, 1) < ARTIFICIAL_PREEMPTION_PROB
|
| 1266 |
+
and self.artificial_preempt_cnt > 0):
|
| 1267 |
+
self.artificial_preempt_cnt -= 1
|
| 1268 |
+
return False
|
| 1269 |
+
|
| 1270 |
+
is_prefill = seq_group.is_prefill()
|
| 1271 |
+
num_lookahead_slots = self._get_num_lookahead_slots(
|
| 1272 |
+
is_prefill, enable_chunking)
|
| 1273 |
+
|
| 1274 |
+
if is_prefill and num_lookahead_slots > 0:
|
| 1275 |
+
# Appending prefill slots only happens multi-step and
|
| 1276 |
+
# chunked-prefill are enabled together.
|
| 1277 |
+
assert self.scheduler_config.is_multi_step and enable_chunking
|
| 1278 |
+
|
| 1279 |
+
return self.block_manager.can_append_slots(
|
| 1280 |
+
seq_group=seq_group, num_lookahead_slots=num_lookahead_slots)
|
| 1281 |
+
|
| 1282 |
+
def _allow_async_output_proc(self, seq_group: SequenceGroup) -> bool:
|
| 1283 |
+
# async_output_proc is allowed only when we have a single sequence
|
| 1284 |
+
# in the sequence group
|
| 1285 |
+
no_single_seq = seq_group.sampling_params is None or (
|
| 1286 |
+
seq_group.sampling_params.n == 1)
|
| 1287 |
+
return no_single_seq
|
| 1288 |
+
|
| 1289 |
+
    def schedule(
            self
    ) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs, bool]:
        """Run one scheduling step and build per-group worker metadata.

        Returns:
            A tuple of (seq_group_metadata_list, scheduler_outputs,
            allow_async_output_proc).
        """
        # Schedule sequence groups.
        # This function call changes the internal states of the scheduler
        # such as self.running, self.swapped, and self.waiting.
        scheduler_start_time = time.perf_counter()

        scheduler_outputs: SchedulerOutputs = self._schedule()
        now = time.time()

        if not self.cache_config.enable_prefix_caching:
            common_computed_block_nums = []

        allow_async_output_proc: bool = self.use_async_output_proc

        # Create input data structures.
        seq_group_metadata_list: List[SequenceGroupMetadata] = []
        for i, scheduled_seq_group in enumerate(
                scheduler_outputs.scheduled_seq_groups):
            seq_group = scheduled_seq_group.seq_group
            token_chunk_size = scheduled_seq_group.token_chunk_size
            seq_group.maybe_set_first_scheduled_time(now)

            # Reuse a pooled metadata object rather than allocating per step.
            seq_group_metadata = self._seq_group_metadata_cache[
                self.cache_id].get_object()
            seq_group_metadata.seq_data.clear()
            seq_group_metadata.block_tables.clear()

            # seq_id -> SequenceData
            seq_data: Dict[int, SequenceData] = {}
            # seq_id -> physical block numbers
            block_tables: Dict[int, List[int]] = {}

            if seq_group.is_encoder_decoder():
                # Encoder associated with SequenceGroup
                encoder_seq = seq_group.get_encoder_seq()
                assert encoder_seq is not None
                encoder_seq_data = encoder_seq.data
                # Block table for cross-attention
                # Also managed at SequenceGroup level
                cross_block_table = self.block_manager.get_cross_block_table(
                    seq_group)
            else:
                encoder_seq_data = None
                cross_block_table = None

            for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
                seq_id = seq.seq_id
                seq_data[seq_id] = seq.data
                block_tables[seq_id] = self.block_manager.get_block_table(seq)
                self.block_manager.access_all_blocks_in_seq(seq, now)

            if self.cache_config.enable_prefix_caching:
                common_computed_block_nums = (
                    self.block_manager.get_common_computed_block_ids(
                        seq_group.get_seqs(status=SequenceStatus.RUNNING)))

            do_sample = True
            is_prompt = seq_group.is_prefill()
            # We should send the metadata to workers when the first prefill
            # is sent. Subsequent requests could be chunked prefill or decode.
            is_first_prefill = False
            if is_prompt:
                seqs = seq_group.get_seqs()
                # Prefill has only 1 sequence.
                assert len(seqs) == 1
                num_computed_tokens = seqs[0].data.get_num_computed_tokens()
                is_first_prefill = num_computed_tokens == 0
                # In the next iteration, all prompt tokens are not computed.
                # It means the prefill is chunked, and we don't need sampling.
                # NOTE: We use get_len instead of get_prompt_len because when
                # a sequence is preempted, prefill includes previous generated
                # output tokens.
                if (token_chunk_size + num_computed_tokens
                        < seqs[0].data.get_len()):
                    do_sample = False

            # It assumes the scheduled_seq_groups is ordered by
            # prefill < decoding.
            if is_first_prefill or not self.scheduler_config.send_delta_data:
                seq_group_metadata = SequenceGroupMetadata(
                    request_id=seq_group.request_id,
                    is_prompt=is_prompt,
                    seq_data=seq_data,
                    sampling_params=seq_group.sampling_params,
                    block_tables=block_tables,
                    do_sample=do_sample,
                    pooling_params=seq_group.pooling_params,
                    token_chunk_size=token_chunk_size,
                    lora_request=seq_group.lora_request,
                    computed_block_nums=common_computed_block_nums,
                    encoder_seq_data=encoder_seq_data,
                    cross_block_table=cross_block_table,
                    state=seq_group.state,
                    token_type_ids=seq_group.token_type_ids,
                    # `multi_modal_data` will only be present for the 1st comm
                    # between engine and worker.
                    # the subsequent comms can still use delta, but
                    # `multi_modal_data` will be None.
                    multi_modal_data=seq_group.multi_modal_data
                    if scheduler_outputs.num_prefill_groups > 0 else None,
                    multi_modal_placeholders=seq_group.multi_modal_placeholders
                    if scheduler_outputs.num_prefill_groups > 0 else None,
                    mm_processor_kwargs=seq_group.mm_processor_kwargs,
                    prompt_adapter_request=seq_group.prompt_adapter_request,
                )
            else:
                # When SPMD mode is enabled, we only send delta data except for
                # the first request to reduce serialization cost.
                seq_data_delta = {}
                for id, data in seq_data.items():
                    seq_data_delta[id] = data.get_delta_and_reset()
                seq_group_metadata = SequenceGroupMetadataDelta(
                    seq_data_delta,
                    seq_group.request_id,
                    block_tables,
                    is_prompt,
                    do_sample=do_sample,
                    token_chunk_size=token_chunk_size,
                    computed_block_nums=common_computed_block_nums,
                )
            seq_group_metadata_list.append(seq_group_metadata)

            if allow_async_output_proc:
                allow_async_output_proc = self._allow_async_output_proc(
                    seq_group)

        # Now that the batch has been created, we can assume all blocks in the
        # batch will have been computed before the next scheduling invocation.
        # This is because the engine assumes that a failure in model execution
        # will crash the vLLM instance / will not retry.
        for scheduled_seq_group in scheduler_outputs.scheduled_seq_groups:
            self.block_manager.mark_blocks_as_computed(
                scheduled_seq_group.seq_group,
                scheduled_seq_group.token_chunk_size)

        self._seq_group_metadata_cache[self.next_cache_id].reset()

        scheduler_time = time.perf_counter() - scheduler_start_time
        # Add this to scheduler time to all the sequences that are currently
        # running. This will help estimate if the scheduler is a significant
        # component in the e2e latency.
        for seq_group in self.running:
            if seq_group is not None and seq_group.metrics is not None:
                if seq_group.metrics.scheduler_time is not None:
                    seq_group.metrics.scheduler_time += scheduler_time
                else:
                    seq_group.metrics.scheduler_time = scheduler_time

        # Move to next cache (if exists)
        self.cache_id = self.next_cache_id

        # Return results
        return (seq_group_metadata_list, scheduler_outputs,
                allow_async_output_proc)
def fork_seq(self, parent_seq: Sequence, child_seq: Sequence) -> None:
|
| 1447 |
+
self.block_manager.fork(parent_seq, child_seq)
|
| 1448 |
+
|
| 1449 |
+
def free_seq(self, seq: Sequence) -> None:
|
| 1450 |
+
"""Free a sequence from a block table."""
|
| 1451 |
+
self.block_manager.free(seq)
|
| 1452 |
+
|
| 1453 |
+
def _free_finished_seqs(self, seq_group: SequenceGroup) -> None:
|
| 1454 |
+
"""Free finished seqs in a sequence group."""
|
| 1455 |
+
for seq in seq_group.get_seqs():
|
| 1456 |
+
if seq.is_finished():
|
| 1457 |
+
self.free_seq(seq)
|
| 1458 |
+
|
| 1459 |
+
def _free_finished_seq_group(self, seq_group: SequenceGroup) -> None:
|
| 1460 |
+
if seq_group.is_finished():
|
| 1461 |
+
# Free cross-attention block table, if it exists
|
| 1462 |
+
self._free_seq_group_cross_attn_blocks(seq_group)
|
| 1463 |
+
|
| 1464 |
+
# Add the finished requests to the finished requests list.
|
| 1465 |
+
# This list will be used to update the Mamba cache in the
|
| 1466 |
+
# next step.
|
| 1467 |
+
self._finished_requests_ids.append(seq_group.request_id)
|
| 1468 |
+
|
| 1469 |
+
# Free finished seqs
|
| 1470 |
+
self._free_finished_seqs(seq_group)
|
| 1471 |
+
|
| 1472 |
+
def free_finished_seq_groups(self) -> None:
|
| 1473 |
+
remaining: Deque[SequenceGroup] = deque()
|
| 1474 |
+
for seq_group in self.running:
|
| 1475 |
+
self._free_finished_seq_group(seq_group)
|
| 1476 |
+
if not seq_group.is_finished():
|
| 1477 |
+
remaining.append(seq_group)
|
| 1478 |
+
|
| 1479 |
+
self.running = remaining
|
| 1480 |
+
|
| 1481 |
+
# Handle async stopped sequence groups
|
| 1482 |
+
# (ones that reached max model len)
|
| 1483 |
+
if self._async_stopped:
|
| 1484 |
+
for seq_group in self._async_stopped:
|
| 1485 |
+
self._free_seq_group_cross_attn_blocks(seq_group)
|
| 1486 |
+
self._finished_requests_ids.append(seq_group.request_id)
|
| 1487 |
+
|
| 1488 |
+
# Free finished seqs
|
| 1489 |
+
self._free_finished_seqs(seq_group)
|
| 1490 |
+
|
| 1491 |
+
self._async_stopped.clear()
|
| 1492 |
+
|
| 1493 |
+
def _allocate_and_set_running(self, seq_group: SequenceGroup) -> None:
|
| 1494 |
+
self.block_manager.allocate(seq_group)
|
| 1495 |
+
for seq in seq_group.get_seqs(status=SequenceStatus.WAITING):
|
| 1496 |
+
seq.status = SequenceStatus.RUNNING
|
| 1497 |
+
|
| 1498 |
+
    def _append_slots(self,
                      seq_group: SequenceGroup,
                      blocks_to_copy: List[Tuple[int, int]],
                      enable_chunking: bool = False) -> None:
        """Appends new slots to the sequences in the given sequence group.

        Args:
            seq_group (SequenceGroup): The sequence group containing the
                sequences to append slots to.
            blocks_to_copy (List[Tuple[int, int]]): A list of tuple of two
                ints, the first int is the source block index, and the second
                int is the destination block index. This list is updated with
                the new source and destination block indices for the appended
                slots.
            enable_chunking (bool): True if chunked prefill is enabled.
        """
        is_prefill: bool = seq_group.is_prefill()
        num_lookahead_slots: int = self._get_num_lookahead_slots(
            is_prefill, enable_chunking)

        seq_group.init_multi_step_from_lookahead_slots(
            num_lookahead_slots,
            num_scheduler_steps=self.scheduler_config.num_scheduler_steps,
            is_multi_step=self.scheduler_config.is_multi_step,
            enable_chunking=enable_chunking)

        seq_status: Optional[SequenceStatus] = SequenceStatus.RUNNING
        if self.scheduler_config.is_multi_step and enable_chunking:
            # In multi-step chunked-prefill any sequence type can have
            # slots appended.
            seq_status = None

        for seq in seq_group.get_seqs(status=seq_status):
            # `cows` are copy-on-write (src, dst) block pairs reported by the
            # block manager; they are surfaced via `blocks_to_copy`.
            cows = self.block_manager.append_slots(seq, num_lookahead_slots)
            if len(cows) > 0:
                blocks_to_copy.extend(cows)
    def _preempt(self, seq_group: SequenceGroup,
                 blocks_to_swap_out: List[Tuple[int, int]]) -> PreemptionMode:
        """Preempt a running sequence group, returning the mode used.

        Side effects: updates `blocks_to_swap_out` when swapping, and
        increments the cumulative preemption counter.
        """
        # If preemption mode is not specified, we determine the mode as follows:
        # We use recomputation by default since it incurs lower overhead than
        # swapping. However, when the sequence group has multiple sequences
        # (e.g., beam search), recomputation is not currently supported. In
        # such a case, we use swapping instead.
        # FIXME(woosuk): This makes our scheduling policy a bit bizarre.
        # As swapped sequences are prioritized over waiting sequences,
        # sequence groups with multiple sequences are implicitly prioritized
        # over sequence groups with a single sequence.
        # TODO(woosuk): Support recomputation for sequence groups with multiple
        # sequences. This may require a more sophisticated CUDA kernel.
        if self.user_specified_preemption_mode is None:
            if seq_group.get_max_num_running_seqs() == 1:
                preemption_mode = PreemptionMode.RECOMPUTE
            else:
                preemption_mode = PreemptionMode.SWAP

        elif self.user_specified_preemption_mode == "swap":
            preemption_mode = PreemptionMode.SWAP
        else:
            preemption_mode = PreemptionMode.RECOMPUTE

        # Warn only on every 50th preemption to avoid flooding the log.
        if self.num_cumulative_preemption % 50 == 0:
            logger.warning(
                "Sequence group %s is preempted by %s mode because there is "
                "not enough KV cache space. This can affect the end-to-end "
                "performance. Increase gpu_memory_utilization or "
                "tensor_parallel_size to provide more KV cache memory. "
                "total_num_cumulative_preemption=%d", seq_group.request_id,
                preemption_mode, self.num_cumulative_preemption + 1)
        self.num_cumulative_preemption += 1

        if preemption_mode == PreemptionMode.RECOMPUTE:
            self._preempt_by_recompute(seq_group)
        elif preemption_mode == PreemptionMode.SWAP:
            self._preempt_by_swap(seq_group, blocks_to_swap_out)
        else:
            raise AssertionError("Invalid preemption mode.")
        return preemption_mode
def _preempt_by_recompute(
|
| 1578 |
+
self,
|
| 1579 |
+
seq_group: SequenceGroup,
|
| 1580 |
+
) -> None:
|
| 1581 |
+
seqs = seq_group.get_seqs(status=SequenceStatus.RUNNING)
|
| 1582 |
+
assert len(seqs) == 1
|
| 1583 |
+
for seq in seqs:
|
| 1584 |
+
seq.status = SequenceStatus.WAITING
|
| 1585 |
+
self.free_seq(seq)
|
| 1586 |
+
seq.reset_state_for_recompute()
|
| 1587 |
+
self._free_seq_group_cross_attn_blocks(seq_group)
|
| 1588 |
+
|
| 1589 |
+
def _preempt_by_swap(
|
| 1590 |
+
self,
|
| 1591 |
+
seq_group: SequenceGroup,
|
| 1592 |
+
blocks_to_swap_out: List[Tuple[int, int]],
|
| 1593 |
+
) -> None:
|
| 1594 |
+
self._swap_out(seq_group, blocks_to_swap_out)
|
| 1595 |
+
|
| 1596 |
+
def _swap_in(
|
| 1597 |
+
self,
|
| 1598 |
+
seq_group: SequenceGroup,
|
| 1599 |
+
blocks_to_swap_in: List[Tuple[int, int]],
|
| 1600 |
+
) -> None:
|
| 1601 |
+
mapping = self.block_manager.swap_in(seq_group)
|
| 1602 |
+
blocks_to_swap_in.extend(mapping)
|
| 1603 |
+
for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED):
|
| 1604 |
+
seq.status = SequenceStatus.RUNNING
|
| 1605 |
+
|
| 1606 |
+
def _swap_out(
|
| 1607 |
+
self,
|
| 1608 |
+
seq_group: SequenceGroup,
|
| 1609 |
+
blocks_to_swap_out: List[Tuple[int, int]],
|
| 1610 |
+
) -> None:
|
| 1611 |
+
if not self.block_manager.can_swap_out(seq_group):
|
| 1612 |
+
# FIXME(woosuk): Abort the sequence group instead of aborting the
|
| 1613 |
+
# entire engine.
|
| 1614 |
+
raise RuntimeError(
|
| 1615 |
+
"Aborted due to the lack of CPU swap space. Please increase "
|
| 1616 |
+
"the swap space to avoid this error.")
|
| 1617 |
+
mapping = self.block_manager.swap_out(seq_group)
|
| 1618 |
+
blocks_to_swap_out.extend(mapping)
|
| 1619 |
+
for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
|
| 1620 |
+
seq.status = SequenceStatus.SWAPPED
|
| 1621 |
+
|
| 1622 |
+
def _passed_delay(self, now: float) -> bool:
|
| 1623 |
+
if self.prev_prompt:
|
| 1624 |
+
self.last_prompt_latency = now - self.prev_time
|
| 1625 |
+
self.prev_time, self.prev_prompt = now, False
|
| 1626 |
+
# Delay scheduling prompts to let waiting queue fill up
|
| 1627 |
+
if self.scheduler_config.delay_factor > 0 and self.waiting:
|
| 1628 |
+
earliest_arrival_time = min(
|
| 1629 |
+
[e.metrics.arrival_time for e in self.waiting])
|
| 1630 |
+
passed_delay = ((now - earliest_arrival_time)
|
| 1631 |
+
> (self.scheduler_config.delay_factor *
|
| 1632 |
+
self.last_prompt_latency) or not self.running)
|
| 1633 |
+
else:
|
| 1634 |
+
passed_delay = True
|
| 1635 |
+
return passed_delay
|
| 1636 |
+
|
| 1637 |
+
def _get_num_lookahead_slots(self, is_prefill: bool,
|
| 1638 |
+
enable_chunking: bool) -> int:
|
| 1639 |
+
"""The number of slots to allocate per sequence per step, beyond known
|
| 1640 |
+
token ids. Speculative decoding uses these slots to store KV activations
|
| 1641 |
+
of tokens which may or may not be accepted.
|
| 1642 |
+
|
| 1643 |
+
Speculative decoding does not yet support prefill, so we do not perform
|
| 1644 |
+
lookahead allocation for prefill.
|
| 1645 |
+
|
| 1646 |
+
When chunking is enabled with multi-step, we allocate lookahead slots
|
| 1647 |
+
for the prefills for when the prefills turn into decodes in the first
|
| 1648 |
+
step.
|
| 1649 |
+
"""
|
| 1650 |
+
if is_prefill:
|
| 1651 |
+
if self.scheduler_config.is_multi_step and enable_chunking:
|
| 1652 |
+
# num_lookahead_slots was introduced in the context of decodes,
|
| 1653 |
+
# in Speculative Decoding.
|
| 1654 |
+
# When the num_scheduler_steps is 8, say, then the
|
| 1655 |
+
# num_lookahead_slots is 7. Meaning, we are doing a 1-step of
|
| 1656 |
+
# decode anyways and we wish to do 7 more.
|
| 1657 |
+
#
|
| 1658 |
+
# "lookaheads" for prefills, is introduced in support for
|
| 1659 |
+
# Chunked-Prefill in Multi-Step.
|
| 1660 |
+
return self.scheduler_config.num_lookahead_slots + 1
|
| 1661 |
+
else:
|
| 1662 |
+
return 0
|
| 1663 |
+
|
| 1664 |
+
return self.scheduler_config.num_lookahead_slots
|
| 1665 |
+
|
| 1666 |
+
def _get_num_new_uncached_and_cached_tokens(
    self,
    seq_group: SequenceGroup,
    status: SequenceStatus,
    enable_chunking: bool,
    budget: SchedulingBudget,
) -> Tuple[int, int]:
    """Compute how many new uncached and cached tokens to schedule for the
    sequences of `seq_group` that are in `status`.

    Cached tokens live in blocks that are already computed; the attention
    backend reuses those blocks instead of recomputing them, so the
    scheduler can schedule them "for free". When `enable_chunking` is True
    the uncached count may be chunked down to fit `budget`; a group with
    multiple sequences (e.g. beam search) is in decode phase, so no
    chunking is applied there.

    Args:
        seq_group: The sequence group to get the number of new tokens to
            schedule.
        status: The status of the sequences to get the number of new
            tokens to schedule.
        enable_chunking: Whether to chunk the number of tokens to compute.
        budget: The budget to chunk the number of tokens to compute.

    Returns:
        Tuple of (num new uncached tokens, num cached tokens). Returns
        (0, 0) if no more new tokens can be scheduled under the budget.
    """
    total_uncached = 0
    total_cached = 0

    target_seqs = seq_group.get_seqs(status=status)
    # Accumulate the uncached/cached split over every matching sequence.
    for seq in target_seqs:
        if not seq.is_prefill():
            # A decoding sequence always contributes exactly one uncached
            # token.
            # TODO(rickyx): Actually is this still correct for multi-step?
            total_uncached += 1
            continue

        num_computed = seq.get_num_computed_tokens()
        num_new_total = seq.get_len() - num_computed

        if not self.cache_config.enable_prefix_caching:
            # Without prefix caching, nothing can be reused: every new
            # token is uncached.
            total_uncached += num_new_total
            continue

        # NOTE: a cached token might sit in a block currently held by an
        # evictor, i.e. not yet allocated. Such tokens still count as
        # cached here because they are guaranteed to be allocated later
        # if the sequence can be allocated at all.
        num_cached = self.block_manager.get_num_cached_tokens(seq)

        # Sanity check: fewer cached than computed tokens should only be
        # possible with chunked prefill while the seq is still in prefill.
        # `num_cached` reflects the value computed when the first prefill
        # was scheduled; for subsequent continuous prefill steps the
        # cached-token count for the sequence is memoized, so it can lag
        # behind the computed-token count. See comments on
        # `ComputedBlocksTracker` for details.
        if num_cached < num_computed:
            assert (
                seq.is_prefill() and seq.status == SequenceStatus.RUNNING
                and self.scheduler_config.chunked_prefill_enabled
            ), ("Number of cached tokens should not be less than the "
                "number of computed tokens for a sequence that's still "
                f"in prefill. But there are {num_cached} cached "
                f"tokens and {num_computed} computed tokens "
                f"for sequence {seq.seq_id}.")

        new_cached = max(0, num_cached - num_computed)
        total_cached += new_cached
        total_uncached += num_new_total - new_cached

    if total_uncached == 0 and total_cached > 0:
        # A fully cached hit still needs its last token recomputed, so at
        # least one uncached token must be scheduled. See
        # ModelRunner._compute_for_prefix_cache_hit for details.
        total_uncached = 1
        total_cached -= 1

    if enable_chunking and len(target_seqs) == 1:
        # A single running sequence that cannot fit the budget gets
        # chunked. More than one sequence means beam search in decode
        # phase, which is never chunked.
        total_uncached = self._chunk_new_tokens_to_schedule(
            self.scheduler_config,
            self.cache_config,
            budget,
            self._get_prompt_limit(seq_group),
            total_uncached,
        )

    return total_uncached, total_cached
|
| 1775 |
+
|
| 1776 |
+
@staticmethod
def _chunk_new_tokens_to_schedule(
    scheduler_config: SchedulerConfig,
    cache_config: CacheConfig,
    budget: SchedulingBudget,
    prompt_limit: int,
    num_new_tokens: int,
) -> int:
    """Chunk the number of new tokens to schedule against the budget when
    chunked prefill is enabled.

    Args:
        scheduler_config: The scheduler config.
        cache_config: The cache config.
        budget: The budget to chunk the number of tokens to compute.
        prompt_limit: The maximum number of tokens allowed in a prompt.
        num_new_tokens: The number of new tokens to schedule.

    Returns:
        The number of new tokens to schedule after chunking.

    Raises:
        ValueError: If prefix caching is enabled and the token budget
            (chunk size) is not a multiple of the block size.
    """
    budget_left = budget.remaining_token_budget()

    if scheduler_config.is_multi_step:
        # Multi-step + chunked prefill cannot actually chunk prompts, so
        # `num_new_tokens` is computed the same way with or without APC.
        # Prompts exceeding the remaining budget are postponed to future
        # scheduler steps.
        if num_new_tokens > prompt_limit:
            # Prompt-stage group over the limit: hand back the value as-is
            # so the caller can ignore the sequence.
            return num_new_tokens
        # Either the whole prompt fits this step, or none of it does.
        return 0 if num_new_tokens > budget_left else num_new_tokens

    if cache_config.enable_prefix_caching:
        # With prefix caching, always allocate a number of new tokens that
        # is divisible by the block size, to avoid partial block matching.
        block_size = cache_config.block_size
        remainder = budget.token_budget % block_size
        if remainder != 0:
            raise ValueError("When enabling chunked prefill and "
                             "prefix caching, max_num_batched_tokens "
                             "(chunk size) must be dividable by "
                             "block size, but got chunk_size "
                             f"({budget.token_budget}) % block_size "
                             f"({block_size}) = {remainder}")
        # Round the remaining budget down to a block-size multiple.
        budget_left -= budget_left % block_size

    return min(num_new_tokens, budget_left)
|
.venv/lib/python3.11/site-packages/vllm/device_allocator/__pycache__/cumem.cpython-311.pyc
ADDED
|
Binary file (12 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/distributed/device_communicators/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (210 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/distributed/device_communicators/__pycache__/cuda_wrapper.cpython-311.pyc
ADDED
|
Binary file (9.62 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/vllm/distributed/device_communicators/__pycache__/custom_all_reduce_utils.cpython-311.pyc
ADDED
|
Binary file (12.7 kB). View file
|
|
|