koichi12 commited on
Commit
c27e68a
·
verified ·
1 Parent(s): db5dd97

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because the commit contains too many changes. See the raw diff for the complete list.
Files changed (50) hide show
  1. .gitattributes +1 -0
  2. .venv/lib/python3.11/site-packages/msgpack/_cmsgpack.cpython-311-x86_64-linux-gnu.so +3 -0
  3. .venv/lib/python3.11/site-packages/vllm/__pycache__/__init__.cpython-311.pyc +0 -0
  4. .venv/lib/python3.11/site-packages/vllm/__pycache__/_custom_ops.cpython-311.pyc +0 -0
  5. .venv/lib/python3.11/site-packages/vllm/__pycache__/_ipex_ops.cpython-311.pyc +0 -0
  6. .venv/lib/python3.11/site-packages/vllm/__pycache__/_version.cpython-311.pyc +0 -0
  7. .venv/lib/python3.11/site-packages/vllm/__pycache__/beam_search.cpython-311.pyc +0 -0
  8. .venv/lib/python3.11/site-packages/vllm/__pycache__/connections.cpython-311.pyc +0 -0
  9. .venv/lib/python3.11/site-packages/vllm/__pycache__/envs.cpython-311.pyc +0 -0
  10. .venv/lib/python3.11/site-packages/vllm/__pycache__/forward_context.cpython-311.pyc +0 -0
  11. .venv/lib/python3.11/site-packages/vllm/__pycache__/logger.cpython-311.pyc +0 -0
  12. .venv/lib/python3.11/site-packages/vllm/__pycache__/logits_process.cpython-311.pyc +0 -0
  13. .venv/lib/python3.11/site-packages/vllm/__pycache__/outputs.cpython-311.pyc +0 -0
  14. .venv/lib/python3.11/site-packages/vllm/__pycache__/pooling_params.cpython-311.pyc +0 -0
  15. .venv/lib/python3.11/site-packages/vllm/__pycache__/sampling_params.cpython-311.pyc +0 -0
  16. .venv/lib/python3.11/site-packages/vllm/__pycache__/scalar_type.cpython-311.pyc +0 -0
  17. .venv/lib/python3.11/site-packages/vllm/__pycache__/scripts.cpython-311.pyc +0 -0
  18. .venv/lib/python3.11/site-packages/vllm/__pycache__/sequence.cpython-311.pyc +0 -0
  19. .venv/lib/python3.11/site-packages/vllm/__pycache__/tracing.cpython-311.pyc +0 -0
  20. .venv/lib/python3.11/site-packages/vllm/__pycache__/version.cpython-311.pyc +0 -0
  21. .venv/lib/python3.11/site-packages/vllm/core/__init__.py +0 -0
  22. .venv/lib/python3.11/site-packages/vllm/core/__pycache__/__init__.cpython-311.pyc +0 -0
  23. .venv/lib/python3.11/site-packages/vllm/core/__pycache__/block_manager.cpython-311.pyc +0 -0
  24. .venv/lib/python3.11/site-packages/vllm/core/__pycache__/evictor.cpython-311.pyc +0 -0
  25. .venv/lib/python3.11/site-packages/vllm/core/__pycache__/interfaces.cpython-311.pyc +0 -0
  26. .venv/lib/python3.11/site-packages/vllm/core/__pycache__/placeholder_block_space_manager.cpython-311.pyc +0 -0
  27. .venv/lib/python3.11/site-packages/vllm/core/__pycache__/scheduler.cpython-311.pyc +0 -0
  28. .venv/lib/python3.11/site-packages/vllm/core/block/__init__.py +0 -0
  29. .venv/lib/python3.11/site-packages/vllm/core/block/__pycache__/__init__.cpython-311.pyc +0 -0
  30. .venv/lib/python3.11/site-packages/vllm/core/block/__pycache__/block_table.cpython-311.pyc +0 -0
  31. .venv/lib/python3.11/site-packages/vllm/core/block/__pycache__/common.cpython-311.pyc +0 -0
  32. .venv/lib/python3.11/site-packages/vllm/core/block/__pycache__/cpu_gpu_block_allocator.cpython-311.pyc +0 -0
  33. .venv/lib/python3.11/site-packages/vllm/core/block/__pycache__/interfaces.cpython-311.pyc +0 -0
  34. .venv/lib/python3.11/site-packages/vllm/core/block/__pycache__/naive_block.cpython-311.pyc +0 -0
  35. .venv/lib/python3.11/site-packages/vllm/core/block/__pycache__/prefix_caching_block.cpython-311.pyc +0 -0
  36. .venv/lib/python3.11/site-packages/vllm/core/block/__pycache__/utils.cpython-311.pyc +0 -0
  37. .venv/lib/python3.11/site-packages/vllm/core/block/block_table.py +398 -0
  38. .venv/lib/python3.11/site-packages/vllm/core/block/common.py +370 -0
  39. .venv/lib/python3.11/site-packages/vllm/core/block/cpu_gpu_block_allocator.py +438 -0
  40. .venv/lib/python3.11/site-packages/vllm/core/block/interfaces.py +318 -0
  41. .venv/lib/python3.11/site-packages/vllm/core/block/naive_block.py +465 -0
  42. .venv/lib/python3.11/site-packages/vllm/core/block/prefix_caching_block.py +1134 -0
  43. .venv/lib/python3.11/site-packages/vllm/core/block/utils.py +27 -0
  44. .venv/lib/python3.11/site-packages/vllm/core/interfaces.py +134 -0
  45. .venv/lib/python3.11/site-packages/vllm/core/placeholder_block_space_manager.py +99 -0
  46. .venv/lib/python3.11/site-packages/vllm/core/scheduler.py +1840 -0
  47. .venv/lib/python3.11/site-packages/vllm/device_allocator/__pycache__/cumem.cpython-311.pyc +0 -0
  48. .venv/lib/python3.11/site-packages/vllm/distributed/device_communicators/__pycache__/__init__.cpython-311.pyc +0 -0
  49. .venv/lib/python3.11/site-packages/vllm/distributed/device_communicators/__pycache__/cuda_wrapper.cpython-311.pyc +0 -0
  50. .venv/lib/python3.11/site-packages/vllm/distributed/device_communicators/__pycache__/custom_all_reduce_utils.cpython-311.pyc +0 -0
.gitattributes CHANGED
@@ -200,3 +200,4 @@ tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/_
200
  .venv/lib/python3.11/site-packages/google/protobuf/__pycache__/descriptor_pb2.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
201
  .venv/lib/python3.11/site-packages/jinja2/__pycache__/compiler.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
202
  .venv/lib/python3.11/site-packages/grpc/__pycache__/_channel.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
 
 
200
  .venv/lib/python3.11/site-packages/google/protobuf/__pycache__/descriptor_pb2.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
201
  .venv/lib/python3.11/site-packages/jinja2/__pycache__/compiler.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
202
  .venv/lib/python3.11/site-packages/grpc/__pycache__/_channel.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
203
+ .venv/lib/python3.11/site-packages/msgpack/_cmsgpack.cpython-311-x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
.venv/lib/python3.11/site-packages/msgpack/_cmsgpack.cpython-311-x86_64-linux-gnu.so ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8da100e7b8957b1fbf02ef3114676091bdd6d861169f948bbbeaf0fceade5992
3
+ size 1296528
.venv/lib/python3.11/site-packages/vllm/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (1.97 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/__pycache__/_custom_ops.cpython-311.pyc ADDED
Binary file (57.4 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/__pycache__/_ipex_ops.cpython-311.pyc ADDED
Binary file (11.4 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/__pycache__/_version.cpython-311.pyc ADDED
Binary file (635 Bytes). View file
 
.venv/lib/python3.11/site-packages/vllm/__pycache__/beam_search.cpython-311.pyc ADDED
Binary file (3.98 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/__pycache__/connections.cpython-311.pyc ADDED
Binary file (9.88 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/__pycache__/envs.cpython-311.pyc ADDED
Binary file (27.6 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/__pycache__/forward_context.cpython-311.pyc ADDED
Binary file (5.77 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/__pycache__/logger.cpython-311.pyc ADDED
Binary file (9.24 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/__pycache__/logits_process.cpython-311.pyc ADDED
Binary file (5.47 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/__pycache__/outputs.cpython-311.pyc ADDED
Binary file (24.4 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/__pycache__/pooling_params.cpython-311.pyc ADDED
Binary file (1.42 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/__pycache__/sampling_params.cpython-311.pyc ADDED
Binary file (26.9 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/__pycache__/scalar_type.cpython-311.pyc ADDED
Binary file (14.4 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/__pycache__/scripts.cpython-311.pyc ADDED
Binary file (8.83 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/__pycache__/sequence.cpython-311.pyc ADDED
Binary file (77.6 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/__pycache__/tracing.cpython-311.pyc ADDED
Binary file (6.92 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/__pycache__/version.cpython-311.pyc ADDED
Binary file (622 Bytes). View file
 
.venv/lib/python3.11/site-packages/vllm/core/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/vllm/core/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (182 Bytes). View file
 
.venv/lib/python3.11/site-packages/vllm/core/__pycache__/block_manager.cpython-311.pyc ADDED
Binary file (24.7 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/core/__pycache__/evictor.cpython-311.pyc ADDED
Binary file (8.26 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/core/__pycache__/interfaces.cpython-311.pyc ADDED
Binary file (6.89 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/core/__pycache__/placeholder_block_space_manager.cpython-311.pyc ADDED
Binary file (5.79 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/core/__pycache__/scheduler.cpython-311.pyc ADDED
Binary file (73.9 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/core/block/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/vllm/core/block/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (188 Bytes). View file
 
.venv/lib/python3.11/site-packages/vllm/core/block/__pycache__/block_table.cpython-311.pyc ADDED
Binary file (18.8 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/core/block/__pycache__/common.cpython-311.pyc ADDED
Binary file (19.2 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/core/block/__pycache__/cpu_gpu_block_allocator.cpython-311.pyc ADDED
Binary file (22.8 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/core/block/__pycache__/interfaces.cpython-311.pyc ADDED
Binary file (16.3 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/core/block/__pycache__/naive_block.cpython-311.pyc ADDED
Binary file (22.5 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/core/block/__pycache__/prefix_caching_block.cpython-311.pyc ADDED
Binary file (47.4 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/core/block/__pycache__/utils.cpython-311.pyc ADDED
Binary file (1.33 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/core/block/block_table.py ADDED
@@ -0,0 +1,398 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+
3
+ import math
4
+ from typing import List, Optional
5
+
6
+ from vllm.core.block.common import BlockList
7
+ from vllm.core.block.interfaces import Block, DeviceAwareBlockAllocator
8
+ from vllm.utils import Device, cdiv, chunk_list
9
+
10
+
11
class BlockTable:
    """A class to manage blocks for a specific sequence.

    The BlockTable maps a sequence of tokens to a list of blocks, where each
    block represents a contiguous memory allocation for a portion of the
    sequence. The blocks are managed by a DeviceAwareBlockAllocator, which is
    responsible for allocating and freeing memory for the blocks.

    Args:
        block_size (int): The maximum number of tokens that can be stored in a
            single block.
        block_allocator (DeviceAwareBlockAllocator): The block allocator used to
            manage memory for the blocks.
        _blocks (Optional[List[Block]], optional): An optional list of existing
            blocks to initialize the BlockTable with. If not provided, an empty
            BlockTable is created.
        max_block_sliding_window (Optional[int], optional): The number of
            blocks to keep around for each sequence. If None, all blocks
            are kept (e.g., when sliding window is not used).
            It should at least fit the sliding window size of the model.

    Attributes:
        _block_size (int): The maximum number of tokens that can be stored in a
            single block.
        _allocator (DeviceAwareBlockAllocator): The block allocator used to
            manage memory for the blocks.
        _blocks (Optional[List[Block]]): The list of blocks managed by this
            BlockTable.
        _num_full_slots (int): The number of tokens currently stored in the
            blocks.
    """

    def __init__(
        self,
        block_size: int,
        block_allocator: DeviceAwareBlockAllocator,
        _blocks: Optional[List[Block]] = None,
        max_block_sliding_window: Optional[int] = None,
    ):
        self._block_size = block_size
        self._allocator = block_allocator
        if _blocks is None:
            _blocks = []
        self._blocks: BlockList = BlockList(_blocks)

        self._max_block_sliding_window = max_block_sliding_window
        # When pre-existing blocks are supplied, the slot count is derived
        # from the tokens they already hold.
        self._num_full_slots = self._get_num_token_ids()

    @staticmethod
    def get_num_required_blocks(token_ids: List[int],
                                block_size: int,
                                num_lookahead_slots: int = 0) -> int:
        """Calculates the minimum number of blocks required to store a given
        sequence of token IDs along with any look-ahead slots that may be
        required (like in multi-step + chunked-prefill).

        This assumes worst-case scenario, where every block requires a new
        allocation (e.g. ignoring prefix caching).

        Args:
            token_ids (List[int]): The sequence of token IDs to be stored.
            block_size (int): The maximum number of tokens that can be stored in
                a single block.
            num_lookahead_slots (int): look-ahead slots that the sequence may
                require.

        Returns:
            int: The minimum number of blocks required to store the given
                sequence of token IDs along with any required look-ahead slots.
        """
        return cdiv(len(token_ids) + num_lookahead_slots, block_size)

    def allocate(self,
                 token_ids: List[int],
                 device: Device = Device.GPU,
                 extra_hash: Optional[int] = None) -> None:
        """Allocates memory blocks for storing the given sequence of token IDs.

        This method allocates the required number of blocks to store the given
        sequence of token IDs.

        Args:
            token_ids (List[int]): The sequence of token IDs to be stored.
            device (Device, optional): The device on which the blocks should be
                allocated. Defaults to Device.GPU.
            extra_hash (Optional[int]): The hash value of additional
                factors, such as adapters, that influence the block hash
                in the prefix caching block.
        """
        assert not self._is_allocated
        assert token_ids
        blocks = self._allocate_blocks_for_token_ids(prev_block=None,
                                                     token_ids=token_ids,
                                                     device=device,
                                                     extra_hash=extra_hash)
        self.update(blocks)
        self._num_full_slots = len(token_ids)

    def update(self, blocks: List[Block]) -> None:
        """Resets the table to the newly provided blocks
        (with their corresponding block ids)
        """
        self._blocks.update(blocks)

    def append_token_ids(self,
                         token_ids: List[int],
                         num_lookahead_slots: int = 0,
                         num_computed_slots: Optional[int] = None,
                         extra_hash: Optional[int] = None) -> None:
        """Appends a sequence of token IDs to the existing blocks in the
        BlockTable.

        This method appends the given sequence of token IDs to the existing
        blocks in the BlockTable. If there is not enough space in the existing
        blocks, new blocks are allocated using the `ensure_num_empty_slots`
        method to accommodate the additional tokens.

        The token IDs are divided into chunks of size `block_size` (except for
        the first chunk, which may be smaller), and each chunk is appended to a
        separate block.

        Args:
            token_ids (List[int]): The sequence of token IDs to be appended.
            num_computed_slots (Optional[int]): The number of KV cache slots
                that are already filled (computed).
                When sliding window is enabled, this is used to compute how many
                blocks to drop at the front of the sequence.
                Without sliding window, None can be passed.
                Without chunked prefill, it should be the same as
                _num_full_slots.
            extra_hash (Optional[int]): The hash value of additional
                factors such as adapters that influence the block, apart
                from the token_ids.
        """
        assert self._is_allocated, "no blocks have been allocated"
        assert len(self._blocks) > 0

        # Drop blocks that are no longer needed due to sliding window
        if self._max_block_sliding_window is not None:
            null_block = self._allocator.allocate_or_get_null_block()
            assert num_computed_slots is not None
            # Blocks entirely below the window are released and replaced by
            # the shared null block placeholder.
            end_block_idx = (num_computed_slots //
                             self._block_size) - self._max_block_sliding_window
            for idx in range(0, end_block_idx):
                b = self._blocks[idx]
                if b is not null_block:
                    self._allocator.free(b)
                self._blocks[idx] = null_block

        # Ensure there are enough empty slots for the new tokens plus
        # lookahead slots
        self.ensure_num_empty_slots(num_empty_slots=len(token_ids) +
                                    num_lookahead_slots,
                                    extra_hash=extra_hash)

        # Update the blocks with the new tokens
        first_block_idx = self._num_full_slots // self._block_size
        token_blocks = self._chunk_token_blocks_for_append(token_ids)

        for i, token_block in enumerate(token_blocks):
            self._blocks.append_token_ids(first_block_idx + i, token_block)

        self._num_full_slots += len(token_ids)

    def ensure_num_empty_slots(self,
                               num_empty_slots: int,
                               extra_hash: Optional[int] = None) -> None:
        """Ensures that the BlockTable has at least the specified number of
        empty slots available.

        This method checks if the BlockTable has enough empty slots (i.e.,
        available space) to accommodate the requested number of tokens. If not,
        it allocates additional blocks on the GPU to ensure that the required
        number of empty slots is available.

        Args:
            num_empty_slots (int): The minimum number of empty slots required.
            extra_hash (Optional[int]): The hash value of additional
                factors such as adapters that influence the block, apart
                from the token_ids.
        """
        # Currently the block table only supports
        # appending tokens to GPU blocks.
        device = Device.GPU
        assert self._is_allocated

        if self._num_empty_slots >= num_empty_slots:
            return

        slots_to_allocate = num_empty_slots - self._num_empty_slots
        blocks_to_allocate = cdiv(slots_to_allocate, self._block_size)

        for _ in range(blocks_to_allocate):
            assert len(self._blocks) > 0
            self._blocks.append(
                self._allocator.allocate_mutable_block(
                    prev_block=self._blocks[-1],
                    device=device,
                    extra_hash=extra_hash))

    def fork(self) -> "BlockTable":
        """Creates a new BlockTable instance with a copy of the blocks from the
        current instance.

        This method creates a new BlockTable instance with the same block size,
        block allocator, and a copy of the blocks from the current instance. The
        new BlockTable has its own independent set of blocks, but shares the
        same underlying memory allocation with the original BlockTable.

        Returns:
            BlockTable: A new BlockTable instance with a copy of the blocks from
                the current instance.
        """
        assert self._is_allocated
        assert len(self._blocks) > 0
        forked_blocks = self._allocator.fork(self._blocks[-1])
        return BlockTable(
            block_size=self._block_size,
            block_allocator=self._allocator,
            _blocks=forked_blocks,
            max_block_sliding_window=self._max_block_sliding_window,
        )

    def free(self) -> None:
        """Frees the memory occupied by the blocks in the BlockTable.

        This method iterates over all the blocks in the `_blocks` list and calls
        the `free` method of the `_allocator` object to release the memory
        occupied by each block. After freeing all the blocks, the `_blocks` list
        is set to `None`.
        """
        for block in self.blocks:
            self._allocator.free(block)
        self._blocks.reset()

    @property
    def physical_block_ids(self) -> List[int]:
        """Returns a list of physical block indices for the blocks in the
        BlockTable.

        This property returns a list of integers, where each integer represents
        the physical block index of a corresponding block in the `_blocks` list.
        The physical block index is a unique identifier for the memory location
        occupied by the block.

        Returns:
            List[int]: A list of physical block indices for the blocks in the
                BlockTable.
        """
        return self._blocks.ids()

    def get_unseen_token_ids(self, sequence_token_ids: List[int]) -> List[int]:
        """Get the number of "unseen" tokens in the sequence.

        Unseen tokens are tokens in the sequence corresponding to this block
        table, but are not yet appended to this block table.

        Args:
            sequence_token_ids (List[int]): The list of token ids in the
                sequence.

        Returns:
            List[int]: The postfix of sequence_token_ids that has not yet been
                appended to the block table.
        """

        # Since the block table is append-only, the unseen token ids are the
        # ones after the appended ones.
        return sequence_token_ids[self.num_full_slots:]

    def _allocate_blocks_for_token_ids(
            self,
            prev_block: Optional[Block],
            token_ids: List[int],
            device: Device,
            extra_hash: Optional[int] = None) -> List[Block]:
        # Full chunks become immutable (hashable) blocks; at most one
        # partial tail chunk goes into a mutable block.
        blocks: List[Block] = []

        block_token_ids = []
        tail_token_ids = []
        for cur_token_ids in chunk_list(token_ids, self._block_size):
            if len(cur_token_ids) == self._block_size:
                block_token_ids.append(cur_token_ids)
            else:
                tail_token_ids.append(cur_token_ids)

        if block_token_ids:
            blocks.extend(
                self._allocator.allocate_immutable_blocks(
                    prev_block,
                    block_token_ids=block_token_ids,
                    device=device,
                    extra_hash=extra_hash))
            prev_block = blocks[-1]

        if tail_token_ids:
            # chunk_list yields at most one undersized (final) chunk.
            assert len(tail_token_ids) == 1
            cur_token_ids = tail_token_ids[0]

            block = self._allocator.allocate_mutable_block(
                prev_block=prev_block, device=device, extra_hash=extra_hash)
            block.append_token_ids(cur_token_ids)

            blocks.append(block)

        return blocks

    def _get_all_token_ids(self) -> List[int]:
        # NOTE: This function is O(seq_len); use sparingly.
        token_ids: List[int] = []

        if not self._is_allocated:
            return token_ids

        for block in self.blocks:
            token_ids.extend(block.token_ids)

        return token_ids

    def _get_num_token_ids(self) -> int:
        res = 0
        for block in self.blocks:
            res += len(block.token_ids)

        return res

    @property
    def _is_allocated(self) -> bool:
        return len(self._blocks) > 0

    @property
    def blocks(self) -> List[Block]:
        return self._blocks.list()

    @property
    def _num_empty_slots(self) -> int:
        assert self._is_allocated
        return len(self._blocks) * self._block_size - self._num_full_slots

    @property
    def num_full_slots(self) -> int:
        """Returns the total number of tokens currently stored in the
        BlockTable.

        Returns:
            int: The total number of tokens currently stored in the BlockTable.
        """
        return self._num_full_slots

    def get_num_blocks_touched_by_append_slots(
            self, token_ids: List[int], num_lookahead_slots: int) -> int:
        """Determine how many blocks will be "touched" by appending the token
        ids.

        This is required for the scheduler to determine whether a sequence can
        continue generation, or if it must be preempted.
        """
        # Math below is equivalent to:
        # all_token_ids = token_ids + [-1] * num_lookahead_slots
        # token_blocks = self._chunk_token_blocks_for_append(all_token_ids)
        # return len(token_blocks)

        num_token_ids = len(token_ids) + num_lookahead_slots
        first_chunk_size = self._block_size - (self._num_full_slots %
                                               self._block_size)
        num_token_blocks = (1 + math.ceil(
            (num_token_ids - first_chunk_size) / self._block_size))
        return num_token_blocks

    def _chunk_token_blocks_for_append(
            self, token_ids: List[int]) -> List[List[int]]:
        """Split the token ids into block-sized chunks so they can be easily
        appended to blocks. The first such "token block" may have less token ids
        than the block size, since the last allocated block may be partially
        full.

        If no token ids are provided, then no chunks are returned.
        """

        if not token_ids:
            return []

        first_chunk_size = self._block_size - (self._num_full_slots %
                                               self._block_size)
        token_blocks = [token_ids[:first_chunk_size]]
        token_blocks.extend(
            chunk_list(token_ids[first_chunk_size:], self._block_size))
        return token_blocks
.venv/lib/python3.11/site-packages/vllm/core/block/common.py ADDED
@@ -0,0 +1,370 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+
3
+ from collections import deque
4
+ from dataclasses import dataclass
5
+ from typing import Deque, Dict, Iterable, List, Optional, Protocol, Tuple
6
+
7
+ from vllm.core.block.interfaces import Block, BlockAllocator
8
+
9
+ BlockId = int
10
+ RefCount = int
11
+
12
+
class RefCounterProtocol(Protocol):
    """Structural interface for reference-count bookkeeping on block ids.

    Concrete implementations must provide increment, decrement, and read
    operations keyed by block id.
    """

    def incr(self, block_id: BlockId) -> RefCount:
        """Increment the reference count of *block_id* and return it."""
        raise NotImplementedError

    def decr(self, block_id: BlockId) -> RefCount:
        """Decrement the reference count of *block_id* and return it."""
        raise NotImplementedError

    def get(self, block_id: BlockId) -> RefCount:
        """Return the current reference count of *block_id*."""
        raise NotImplementedError
23
+
24
+
25
+ class RefCounter(RefCounterProtocol):
26
+ """A class for managing reference counts for a set of block indices.
27
+
28
+ The RefCounter class maintains a dictionary that maps block indices to their
29
+ corresponding reference counts. It provides methods to increment, decrement,
30
+ and retrieve the reference count for a given block index.
31
+
32
+ Args:
33
+ all_block_indices (Iterable[BlockId]): An iterable of block indices
34
+ to initialize the reference counter with.
35
+ """
36
+
37
+ def __init__(self, all_block_indices: Iterable[BlockId]):
38
+ deduped = set(all_block_indices)
39
+ self._refcounts: Dict[BlockId, RefCount] = {
40
+ index: 0
41
+ for index in deduped
42
+ }
43
+
44
+ def incr(self, block_id: BlockId) -> RefCount:
45
+ assert block_id in self._refcounts
46
+ pre_incr_refcount = self._refcounts[block_id]
47
+
48
+ assert pre_incr_refcount >= 0
49
+
50
+ post_incr_refcount = pre_incr_refcount + 1
51
+ self._refcounts[block_id] = post_incr_refcount
52
+ return post_incr_refcount
53
+
54
+ def decr(self, block_id: BlockId) -> RefCount:
55
+ assert block_id in self._refcounts
56
+ refcount = self._refcounts[block_id]
57
+
58
+ assert refcount > 0
59
+ refcount -= 1
60
+
61
+ self._refcounts[block_id] = refcount
62
+
63
+ return refcount
64
+
65
+ def get(self, block_id: BlockId) -> RefCount:
66
+ assert block_id in self._refcounts
67
+ return self._refcounts[block_id]
68
+
69
+ def as_readonly(self) -> "ReadOnlyRefCounter":
70
+ return ReadOnlyRefCounter(self)
71
+
72
+
73
+ class ReadOnlyRefCounter(RefCounterProtocol):
74
+ """A read-only view of the RefCounter class.
75
+
76
+ The ReadOnlyRefCounter class provides a read-only interface to access the
77
+ reference counts maintained by a RefCounter instance. It does not allow
78
+ modifications to the reference counts.
79
+
80
+ Args:
81
+ refcounter (RefCounter): The RefCounter instance to create a read-only
82
+ view for.
83
+ """
84
+
85
+ def __init__(self, refcounter: RefCounter):
86
+ self._refcounter = refcounter
87
+
88
+ def incr(self, block_id: BlockId) -> RefCount:
89
+ raise ValueError("Incr not allowed")
90
+
91
+ def decr(self, block_id: BlockId) -> RefCount:
92
+ raise ValueError("Decr not allowed")
93
+
94
+ def get(self, block_id: BlockId) -> RefCount:
95
+ return self._refcounter.get(block_id)
96
+
97
+
98
+ class CopyOnWriteTracker:
99
+ """A class for tracking and managing copy-on-write operations for blocks.
100
+
101
+ The CopyOnWriteTracker class maintains a mapping of source block indices to
102
+ their corresponding copy-on-write destination block indices. It works in
103
+ conjunction with a RefCounter.
104
+
105
+ Args:
106
+ refcounter (RefCounter): The reference counter used to track block
107
+ reference counts.
108
+ """
109
+
110
+ def __init__(self, refcounter: RefCounterProtocol):
111
+ self._copy_on_writes: List[Tuple[BlockId, BlockId]] = []
112
+ self._refcounter = refcounter
113
+
114
+ def is_appendable(self, block: Block) -> bool:
115
+ """Checks if the block is shared or not. If shared, then it cannot
116
+ be appended and needs to be duplicated via copy-on-write
117
+ """
118
+ block_id = block.block_id
119
+ if block_id is None:
120
+ return True
121
+
122
+ refcount = self._refcounter.get(block_id)
123
+ return refcount <= 1
124
+
125
+ def record_cow(self, src_block_id: Optional[BlockId],
126
+ trg_block_id: Optional[BlockId]) -> None:
127
+ """Records a copy-on-write operation from source to target block id
128
+ Args:
129
+ src_block_id (BlockId): The source block id from which to copy
130
+ the data
131
+ trg_block_id (BlockId): The target block id to which the data
132
+ is copied
133
+ """
134
+ assert src_block_id is not None
135
+ assert trg_block_id is not None
136
+ self._copy_on_writes.append((src_block_id, trg_block_id))
137
+
138
+ def clear_cows(self) -> List[Tuple[BlockId, BlockId]]:
139
+ """Clears the copy-on-write tracking information and returns the current
140
+ state.
141
+
142
+ This method returns a list mapping source block indices to
143
+ destination block indices for the current copy-on-write operations.
144
+ It then clears the internal tracking information.
145
+
146
+ Returns:
147
+ List[Tuple[BlockId, BlockId]]: A list mapping source
148
+ block indices to destination block indices for the
149
+ current copy-on-write operations.
150
+ """
151
+ cows = self._copy_on_writes
152
+ self._copy_on_writes = []
153
+ return cows
154
+
155
+
156
class BlockPool:
    """Used to pre-allocate block objects, in order to avoid excessive python
    object allocations/deallocations.
    The pool starts from "pool_size" objects and will increase to more objects
    if necessary

    Note that multiple block objects may point to the same physical block id,
    which is why this pool is needed, so that it will be easier to support
    prefix caching and more complicated sharing of physical blocks.
    """

    def __init__(self, block_size: int, create_block: "Block.Factory",
                 allocator: "BlockAllocator", pool_size: int):
        self._block_size = block_size
        self._create_block = create_block
        self._allocator = allocator
        self._pool_size = pool_size
        assert self._pool_size >= 0

        # Pool slot ids that are currently unused; popped/pushed at the left.
        self._free_ids: Deque[int] = deque(range(self._pool_size))
        # Pooled blocks start detached (no id, no tokens) and are
        # re-initialized in place by init_block().
        self._pool: List["Block"] = [
            self._new_detached_block() for _ in range(self._pool_size)
        ]

    def _new_detached_block(self) -> "Block":
        """Create one empty, unattached block object for the pool."""
        return self._create_block(prev_block=None,
                                  token_ids=[],
                                  block_size=self._block_size,
                                  allocator=self._allocator,
                                  block_id=None,
                                  extra_hash=None)

    def increase_pool(self):
        """Doubles the internal pool size.

        A pool created with pool_size == 0 grows to 1: doubling zero would
        otherwise add no capacity and init_block() could never make progress.
        """
        cur_pool_size = self._pool_size
        # max(..., 1) fixes the degenerate pool_size == 0 case allowed by
        # the constructor's `>= 0` assertion.
        new_pool_size = max(cur_pool_size * 2, 1)
        self._pool_size = new_pool_size

        self._free_ids += deque(range(cur_pool_size, new_pool_size))

        self._pool.extend(self._new_detached_block()
                          for _ in range(cur_pool_size, new_pool_size))

    def init_block(self,
                   prev_block: Optional["Block"],
                   token_ids: List[int],
                   block_size: int,
                   physical_block_id: Optional[int],
                   extra_hash: Optional[int] = None) -> "Block":
        """Take a pooled block object and re-initialize it in place.

        Args:
            prev_block: The previous block in the sequence, if any.
            token_ids: Token ids to store in the block.
            block_size: Capacity of the block in tokens.
            physical_block_id: Physical block id to attach, or None.
            extra_hash: Optional extra hash factored into the block hash.

        Returns:
            Block: The re-initialized pooled block. Its `pool_id` attribute
            records the pool slot so free_block() can recycle it.
        """
        if len(self._free_ids) == 0:
            self.increase_pool()
            assert len(self._free_ids) > 0

        pool_id = self._free_ids.popleft()

        block = self._pool[pool_id]
        # Re-run __init__ on the existing object instead of allocating a
        # new one; avoiding that allocation is the point of the pool.
        block.__init__(  # type: ignore[misc]
            prev_block=prev_block,
            token_ids=token_ids,
            block_size=block_size,
            allocator=block._allocator,  # type: ignore[attr-defined]
            block_id=physical_block_id,
            extra_hash=extra_hash)
        block.pool_id = pool_id  # type: ignore[attr-defined]
        return block

    def free_block(self, block: "Block") -> None:
        """Return the block's pool slot so the object can be reused."""
        self._free_ids.appendleft(block.pool_id)  # type: ignore[attr-defined]
229
+
230
+
231
class BlockList:
    """This class is an optimization to allow fast-access to physical
    block ids. It maintains a block id list that is updated with the
    block list and this avoids the need to reconstruct the block id
    list on every iteration of the block manager
    """

    def __init__(self, blocks: List["Block"]):
        self._blocks: List["Block"] = []
        self._block_ids: List[int] = []

        self.update(blocks)

    def _add_block_id(self, block_id: Optional["BlockId"]) -> None:
        # Every tracked block must be backed by a physical id.
        assert block_id is not None
        self._block_ids.append(block_id)

    def _update_block_id(self, block_index: int,
                         new_block_id: Optional["BlockId"]) -> None:
        assert new_block_id is not None
        self._block_ids[block_index] = new_block_id

    def update(self, blocks: List["Block"]):
        """Replace the tracked blocks and rebuild the cached id list."""
        self._blocks = blocks

        # Cache block ids for fast query
        self._block_ids = []
        for tracked in blocks:
            self._add_block_id(tracked.block_id)

    def append_token_ids(self, block_index: int, token_ids: List[int]) -> None:
        """Append tokens to the block at `block_index`, keeping ids in sync."""
        target = self._blocks[block_index]
        id_before = target.block_id

        target.append_token_ids(token_ids)

        # CoW or promotion may update the internal block_id
        id_after = target.block_id
        if id_before != id_after:
            self._update_block_id(block_index, id_after)

    def append(self, new_block: "Block"):
        """Track one more block at the end of the list."""
        self._blocks.append(new_block)
        self._add_block_id(new_block.block_id)

    def __len__(self) -> int:
        return len(self._blocks)

    def __getitem__(self, block_index: int) -> "Block":
        return self._blocks[block_index]

    def __setitem__(self, block_index: int, new_block: "Block") -> None:
        self._blocks[block_index] = new_block
        self._update_block_id(block_index, new_block.block_id)

    def reset(self):
        """Forget every tracked block and cached id."""
        self._blocks = []
        self._block_ids = []

    def list(self) -> List["Block"]:
        return self._blocks

    def ids(self) -> List[int]:
        return self._block_ids
294
+
295
+
296
@dataclass
class CacheMetricData:
    """A utility dataclass to maintain cache metric.
    To avoid overflow, we maintain the hit rate in block granularity, so that
    we can maintain a single hit rate for n_completed_block x block_size,
    and calculate the real time hit rate by the following:
    BS = The number of queries per block.
    nB = The number of completed blocks.
    HR = hit rate of (nB x BS) queries.
    Q = current number of queries (< BS).
    H = current number of hits (< BS).
    hit rate = ((HR x nB) + (H / Q) x (Q / BS)) / (nB + Q / BS)
    """
    num_completed_blocks: int = 0
    completed_block_cache_hit_rate: float = 0.0
    num_incompleted_block_queries: int = 0
    num_incompleted_block_hit: int = 0
    block_size: int = 1000

    def query(self, hit: bool):
        """Record one cache query; fold stats when a block completes."""
        self.num_incompleted_block_queries += 1
        if hit:
            self.num_incompleted_block_hit += 1

        if self.num_incompleted_block_queries != self.block_size:
            return
        # The in-progress block is complete: merge its hit rate into the
        # running per-block average and restart the partial counters.
        block_hit_rate = (self.num_incompleted_block_hit /
                          self.num_incompleted_block_queries)
        n_done = self.num_completed_blocks
        self.completed_block_cache_hit_rate = (
            self.completed_block_cache_hit_rate * n_done +
            block_hit_rate) / (n_done + 1)
        self.num_incompleted_block_queries = 0
        self.num_incompleted_block_hit = 0
        self.num_completed_blocks = n_done + 1

    def get_hit_rate(self):
        """Return the overall hit rate (0.0 when nothing was queried)."""
        partial_weight = self.num_incompleted_block_queries / self.block_size
        total_weight = self.num_completed_blocks + partial_weight
        if total_weight == 0:
            return 0.0

        completed_contrib, partial_contrib = 0.0, 0.0
        if self.num_completed_blocks > 0:
            completed_contrib = (self.completed_block_cache_hit_rate *
                                 self.num_completed_blocks)
        if self.num_incompleted_block_queries > 0:
            partial_rate = (self.num_incompleted_block_hit /
                            self.num_incompleted_block_queries)
            partial_contrib = partial_rate * partial_weight
        return (completed_contrib + partial_contrib) / total_weight
346
+
347
+
348
def get_all_blocks_recursively(last_block: "Block") -> List["Block"]:
    """Retrieves all the blocks in a sequence starting from the last block.

    Walks the `prev_block` chain iteratively (the previous recursive
    implementation could exceed Python's recursion limit on very long
    sequences) and returns all blocks in the order they appear, oldest
    first and `last_block` last.

    Args:
        last_block (Block): The last block in the sequence.

    Returns:
        List[Block]: A list of all the blocks in the sequence, in the order
            they appear.
    """
    all_blocks: List["Block"] = []
    block: Optional["Block"] = last_block
    while block is not None:
        all_blocks.append(block)
        block = block.prev_block
    # Collected last-to-first; callers expect forward order.
    all_blocks.reverse()
    return all_blocks
.venv/lib/python3.11/site-packages/vllm/core/block/cpu_gpu_block_allocator.py ADDED
@@ -0,0 +1,438 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+
3
+ from typing import Dict, FrozenSet, List, Optional, Tuple
4
+
5
+ from vllm.core.block.interfaces import (Block, BlockAllocator, BlockId,
6
+ DeviceAwareBlockAllocator)
7
+ from vllm.core.block.naive_block import NaiveBlock, NaiveBlockAllocator
8
+ from vllm.core.block.prefix_caching_block import PrefixCachingBlockAllocator
9
+ from vllm.platforms import current_platform
10
+ from vllm.utils import Device
11
+
12
+
13
class CpuGpuBlockAllocator(DeviceAwareBlockAllocator):
    """A block allocator that can allocate blocks on both CPU and GPU memory.

    This class implements the `DeviceAwareBlockAllocator` interface and provides
    functionality for allocating and managing blocks of memory on both CPU and
    GPU devices.

    The `CpuGpuBlockAllocator` maintains separate memory pools for CPU and GPU
    blocks, and allows for allocation, deallocation, forking, and swapping of
    blocks across these memory pools.
    """

    @staticmethod
    def create(
        allocator_type: str,
        num_gpu_blocks: int,
        num_cpu_blocks: int,
        block_size: int,
    ) -> DeviceAwareBlockAllocator:
        """Creates a CpuGpuBlockAllocator instance with the specified
        configuration.

        This static method creates and returns a CpuGpuBlockAllocator instance
        based on the provided parameters. It initializes the CPU and GPU block
        allocators with the specified number of blocks, block size, and
        allocator type.

        Args:
            allocator_type (str): The type of block allocator to use for CPU
                and GPU blocks. Currently supported values are "naive" and
                "prefix_caching".
            num_gpu_blocks (int): The number of blocks to allocate for GPU
                memory.
            num_cpu_blocks (int): The number of blocks to allocate for CPU
                memory.
            block_size (int): The size of each block in number of tokens.

        Returns:
            DeviceAwareBlockAllocator: A CpuGpuBlockAllocator instance with the
                specified configuration.

        Notes:
            - The block IDs are assigned contiguously, with GPU block IDs coming
              before CPU block IDs.
        """
        # For HPU, block id 0 is used only for padding
        reserved_blocks = 1 if current_platform.is_hpu() else 0
        block_ids = list(
            range(reserved_blocks, num_gpu_blocks + num_cpu_blocks))
        num_gpu_blocks -= reserved_blocks
        gpu_block_ids = block_ids[:num_gpu_blocks]
        cpu_block_ids = block_ids[num_gpu_blocks:]

        if allocator_type == "naive":
            gpu_allocator: BlockAllocator = NaiveBlockAllocator(
                create_block=NaiveBlock,  # type: ignore
                num_blocks=num_gpu_blocks,
                block_size=block_size,
                block_ids=gpu_block_ids,
            )

            cpu_allocator: BlockAllocator = NaiveBlockAllocator(
                create_block=NaiveBlock,  # type: ignore
                num_blocks=num_cpu_blocks,
                block_size=block_size,
                block_ids=cpu_block_ids,
            )
        elif allocator_type == "prefix_caching":
            gpu_allocator = PrefixCachingBlockAllocator(
                num_blocks=num_gpu_blocks,
                block_size=block_size,
                block_ids=gpu_block_ids,
            )

            cpu_allocator = PrefixCachingBlockAllocator(
                num_blocks=num_cpu_blocks,
                block_size=block_size,
                block_ids=cpu_block_ids,
            )
        else:
            raise ValueError(f"Unknown allocator type {allocator_type=}")

        return CpuGpuBlockAllocator(
            cpu_block_allocator=cpu_allocator,
            gpu_block_allocator=gpu_allocator,
        )

    def __init__(self, cpu_block_allocator: BlockAllocator,
                 gpu_block_allocator: BlockAllocator):
        assert not (
            cpu_block_allocator.all_block_ids
            & gpu_block_allocator.all_block_ids
        ), "cpu and gpu block allocators can't have intersection of block ids"

        self._allocators = {
            Device.CPU: cpu_block_allocator,
            Device.GPU: gpu_block_allocator,
        }

        # Accumulated swap mapping across moves; drained by
        # get_and_reset_swaps().
        self._swap_mapping: Dict[int, int] = {}
        self._null_block: Optional[Block] = None

        # Reverse index: physical block id -> owning allocator.
        self._block_ids_to_allocator: Dict[int, BlockAllocator] = {}
        for allocator in self._allocators.values():
            for block_id in allocator.all_block_ids:
                self._block_ids_to_allocator[block_id] = allocator

    def allocate_or_get_null_block(self) -> Block:
        """Return the single shared null block, allocating it lazily."""
        if self._null_block is None:
            self._null_block = NullBlock(
                self.allocate_mutable_block(None, Device.GPU))
        return self._null_block

    def allocate_mutable_block(self,
                               prev_block: Optional[Block],
                               device: Device,
                               extra_hash: Optional[int] = None) -> Block:
        """Allocates a new mutable block on the specified device.

        Args:
            prev_block (Optional[Block]): The previous block to in the sequence.
                Used for prefix hashing.
            device (Device): The device on which to allocate the new block.
            extra_hash (Optional[int]): The hash value of additional
                factors, such as adapters, that influence the block hash
                in the prefix caching block.

        Returns:
            Block: The newly allocated mutable block.
        """
        return self._allocators[device].allocate_mutable_block(
            prev_block, extra_hash=extra_hash)

    def allocate_immutable_blocks(
            self,
            prev_block: Optional[Block],
            block_token_ids: List[List[int]],
            device: Device,
            extra_hash: Optional[int] = None) -> List[Block]:
        """Allocates a new group of immutable blocks with the provided block
        token IDs on the specified device.

        Args:
            prev_block (Optional[Block]): The previous block in the sequence.
                Used for prefix hashing.
            block_token_ids (List[int]): The list of block token IDs to be
                stored in the new blocks.
            device (Device): The device on which to allocate the new block.
            extra_hash (Optional[int]): The hash value of additional
                factors, such as adapters, that influence the block hash
                in the prefix caching block.

        Returns:
            List[Block]: The newly allocated list of immutable blocks
                containing the provided block token IDs.
        """
        return self._allocators[device].allocate_immutable_blocks(
            prev_block, block_token_ids, extra_hash=extra_hash)

    def allocate_immutable_block(self,
                                 prev_block: Optional[Block],
                                 token_ids: List[int],
                                 device: Device,
                                 extra_hash: Optional[int] = None) -> Block:
        """Allocates a new immutable block with the provided token IDs on the
        specified device.

        Args:
            prev_block (Optional[Block]): The previous block in the sequence.
                Used for prefix hashing.
            token_ids (List[int]): The list of token IDs to be stored in the new
                block.
            device (Device): The device on which to allocate the new block.
            extra_hash (Optional[int]): The hash value of additional
                factors, such as adapters, that influence the block hash
                in the prefix caching block.

        Returns:
            Block: The newly allocated immutable block containing the provided
                token IDs.
        """
        return self._allocators[device].allocate_immutable_block(
            prev_block, token_ids, extra_hash=extra_hash)

    def free(self, block: Block) -> None:
        """Frees the memory occupied by the given block.

        Args:
            block (Block): The block to be freed.
        """
        # Null block should never be freed
        if isinstance(block, NullBlock):
            return
        block_id = block.block_id
        assert block_id is not None
        allocator = self._block_ids_to_allocator[block_id]
        allocator.free(block)

    def fork(self, last_block: Block) -> List[Block]:
        """Creates a new sequence of blocks that shares the same underlying
        memory as the original sequence.

        Args:
            last_block (Block): The last block in the original sequence.

        Returns:
            List[Block]: A new list of blocks that shares the same memory as the
                original sequence.
        """
        # do not attempt to fork the null block
        assert not isinstance(last_block, NullBlock)
        block_id = last_block.block_id
        assert block_id is not None
        allocator = self._block_ids_to_allocator[block_id]
        return allocator.fork(last_block)

    def get_num_free_blocks(self, device: Device) -> int:
        """Returns the number of free blocks available on the specified device.

        Args:
            device (Device): The device for which to query the number of free
                blocks. AssertionError is raised if None is passed.

        Returns:
            int: The number of free blocks available on the specified device.
        """
        return self._allocators[device].get_num_free_blocks()

    def get_num_total_blocks(self, device: Device) -> int:
        """Return the total block count managed for the given device."""
        return self._allocators[device].get_num_total_blocks()

    def get_physical_block_id(self, device: Device, absolute_id: int) -> int:
        """Returns the zero-offset block id on certain device given the
        absolute block id.

        Args:
            device (Device): The device for which to query relative block id.
            absolute_id (int): The absolute block id for the block in
                whole allocator.

        Returns:
            int: The zero-offset block id on certain device.
        """
        return self._allocators[device].get_physical_block_id(absolute_id)

    def swap(self, blocks: List[Block], src_device: Device,
             dst_device: Device) -> Dict[int, int]:
        """Execute the swap for the given blocks from source_device
        on to dest_device, save the current swap mapping and append
        them to the accumulated `self._swap_mapping` for each
        scheduling move.

        Args:
            blocks: List of blocks to be swapped.
            src_device (Device): Device to swap the 'blocks' from.
            dst_device (Device): Device to swap the 'blocks' to.

        Returns:
            Dict[int, int]: Swap mapping from source_device
                on to dest_device.
        """
        # Capture ids before the swap mutates them, then again after.
        src_block_ids = [block.block_id for block in blocks]
        self._allocators[src_device].swap_out(blocks)
        self._allocators[dst_device].swap_in(blocks)
        dst_block_ids = [block.block_id for block in blocks]

        current_swap_mapping: Dict[int, int] = {}
        for src_block_id, dst_block_id in zip(src_block_ids, dst_block_ids):
            if src_block_id is not None and dst_block_id is not None:
                self._swap_mapping[src_block_id] = dst_block_id
                current_swap_mapping[src_block_id] = dst_block_id
        return current_swap_mapping

    def get_num_full_blocks_touched(self, blocks: List[Block],
                                    device: Device) -> int:
        """Returns the number of full blocks that will be touched by
        swapping in/out the given blocks on to the 'device'.

        Args:
            blocks: List of blocks to be swapped.
            device (Device): Device to swap the 'blocks' on.

        Returns:
            int: the number of full blocks that will be touched by
                swapping in/out the given blocks on to the 'device'.
                Non full blocks are ignored when deciding the number
                of blocks to touch.
        """
        return self._allocators[device].get_num_full_blocks_touched(blocks)

    def clear_copy_on_writes(self) -> List[Tuple[int, int]]:
        """Clears the copy-on-write (CoW) state and returns the mapping of
        source to destination block IDs.

        Returns:
            List[Tuple[int, int]]: A list mapping source block IDs to
                destination block IDs.
        """
        # CoW only supported on GPU
        device = Device.GPU
        return self._allocators[device].clear_copy_on_writes()

    def mark_blocks_as_accessed(self, block_ids: List[int],
                                now: float) -> None:
        """Mark blocks as accessed, only use for prefix caching."""
        # Prefix caching only supported on GPU.
        device = Device.GPU
        return self._allocators[device].mark_blocks_as_accessed(block_ids, now)

    def mark_blocks_as_computed(self, block_ids: List[int]) -> None:
        """Mark blocks as computed, only use for prefix caching."""
        # Prefix caching only supported on GPU.
        device = Device.GPU
        return self._allocators[device].mark_blocks_as_computed(block_ids)

    def get_common_computed_block_ids(
            self, computed_seq_block_ids: List[List[int]]) -> List[int]:
        """Delegate to the GPU allocator (prefix caching is GPU-only)."""
        # Prefix caching only supported on GPU.
        device = Device.GPU
        return self._allocators[device].get_common_computed_block_ids(
            computed_seq_block_ids)

    @property
    def all_block_ids(self) -> FrozenSet[int]:
        """Union of the block ids owned by the CPU and GPU allocators."""
        return frozenset(self._block_ids_to_allocator.keys())

    def get_prefix_cache_hit_rate(self, device: Device) -> float:
        """Prefix cache hit rate. -1 means not supported or disabled."""
        assert device in self._allocators
        return self._allocators[device].get_prefix_cache_hit_rate()

    def reset_prefix_cache(self) -> bool:
        """Reset prefix cache for all devices."""
        # Evaluate every allocator eagerly: combining with a short-circuit
        # `and` would skip resetting the remaining devices after the first
        # failure.
        results = [
            allocator.reset_prefix_cache()
            for allocator in self._allocators.values()
        ]
        return all(results)

    def get_and_reset_swaps(self) -> List[Tuple[int, int]]:
        """Returns and clears the mapping of source to destination block IDs.
        Will be called after every swapping operations for now, and after every
        schedule when BlockManagerV2 become default. Currently not useful.

        Returns:
            List[Tuple[int, int]]: A mapping of source to destination block IDs.
        """
        mapping = self._swap_mapping.copy()
        self._swap_mapping.clear()
        return list(mapping.items())

    def find_cached_blocks_prefix(
        self,
        block_hashes: List[int],
        device: Device = Device.GPU,
    ) -> List[int]:
        """Delegate the cached-prefix lookup to the device's allocator."""
        return self._allocators[device].find_cached_blocks_prefix(block_hashes)
369
+
370
+
371
class NullBlock(Block):
    """
    Null blocks are used as a placeholders for KV cache blocks that have
    been dropped due to sliding window.
    This implementation just wraps an ordinary block and prevents it from
    being modified. It also allows for testing if a block is NullBlock
    via isinstance().
    """

    def __init__(self, proxy: Block):
        super().__init__()
        # The real allocated block this placeholder wraps; all read-only
        # queries below delegate to it.
        self._proxy = proxy

    def append_token_ids(self, token_ids: List[BlockId]):
        # The null block is a shared placeholder, so writes are forbidden.
        raise ValueError("null block should not be modified")

    @property
    def block_id(self):
        # Delegated to the wrapped block.
        return self._proxy.block_id

    @block_id.setter
    def block_id(self, value: Optional[BlockId]):
        # Reassigning the underlying physical block is forbidden.
        raise ValueError("null block should not be modified")

    @property
    def token_ids(self) -> List[BlockId]:
        return self._proxy.token_ids

    @property
    def num_tokens_total(self) -> int:
        # Deliberately unsupported for the placeholder.
        raise NotImplementedError(
            "num_tokens_total is not used for null block")

    @property
    def num_empty_slots(self) -> BlockId:
        return self._proxy.num_empty_slots

    @property
    def is_full(self):
        return self._proxy.is_full

    @property
    def prev_block(self):
        return self._proxy.prev_block

    @property
    def extra_hash(self):
        # The null block never reports an extra hash.
        return None

    @property
    def computed(self):
        return self._proxy.computed

    @computed.setter
    def computed(self, value):
        # Unlike block_id, the computed flag may be updated via the proxy.
        self._proxy.computed = value

    @property
    def last_accessed(self) -> float:
        return self._proxy.last_accessed

    @last_accessed.setter
    def last_accessed(self, last_accessed_ts: float):
        self._proxy.last_accessed = last_accessed_ts

    @property
    def content_hash(self):
        return self._proxy.content_hash
.venv/lib/python3.11/site-packages/vllm/core/block/interfaces.py ADDED
@@ -0,0 +1,318 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+
3
+ from abc import ABC, abstractmethod
4
+ from typing import Dict, FrozenSet, List, Optional, Protocol, Tuple
5
+
6
+ from vllm.utils import Device
7
+
8
+ BlockId = int
9
+
10
+
11
class Block(ABC):
    """Abstract interface for a single block of token ids.

    Concrete implementations store token ids, link to the previous block in
    the sequence, and may carry a physical ``block_id`` assigned by a
    ``BlockAllocator``.
    """

    @abstractmethod
    def append_token_ids(self, token_ids: List[int]) -> None:
        """Append the given token ids to this block."""
        pass

    @property
    @abstractmethod
    def block_id(self) -> Optional[int]:
        """The physical block id backing this block, or None."""
        pass

    @block_id.setter
    @abstractmethod
    def block_id(self, value: Optional[int]) -> None:
        """NOTE: Do not use this API outside Block."""
        self._block_id = value

    @property
    @abstractmethod
    def token_ids(self) -> List[int]:
        """The token ids currently stored in this block."""
        pass

    @property
    @abstractmethod
    def num_tokens_total(self) -> int:
        """The number of tokens till the current block (inclusive)
        """
        pass

    @property
    @abstractmethod
    def num_empty_slots(self) -> int:
        """The number of token slots still free in this block."""
        pass

    @property
    @abstractmethod
    def is_full(self) -> bool:
        """Whether the block has no empty slots left."""
        pass

    @property
    @abstractmethod
    def prev_block(self) -> Optional["Block"]:
        """The previous block in the sequence, or None."""
        pass

    @property
    @abstractmethod
    def extra_hash(self) -> Optional[int]:
        """Hash of additional factors influencing the block hash, or None."""
        return None

    @property
    @abstractmethod
    def computed(self) -> bool:
        """Whether this block has been marked as computed."""
        raise NotImplementedError

    @computed.setter
    @abstractmethod
    def computed(self, value) -> bool:
        """Should be only used by PrefixCachingAllocator"""
        raise NotImplementedError

    @property
    @abstractmethod
    def last_accessed(self) -> float:
        """Timestamp of the most recent access to this block."""
        raise NotImplementedError

    @last_accessed.setter
    @abstractmethod
    def last_accessed(self, last_accessed_ts: float):
        raise NotImplementedError

    class Factory(Protocol):
        """Callable protocol for constructing ``Block`` instances."""

        @abstractmethod
        def __call__(
            self,
            prev_block: Optional["Block"],
            token_ids: List[int],
            block_size: int,
            allocator: "BlockAllocator",
            block_id: Optional[int] = None,
            computed: bool = False,
            extra_hash: Optional[int] = None,
        ) -> "Block":
            pass

    @property
    @abstractmethod
    def content_hash(self) -> Optional[int]:
        """Return the content-based hash of the current block, or None if it is
        not yet defined or not supported.

        For the content-based hash to be defined, the current block must be
        full.
        """
        return None
106
+
107
+
108
class BlockAllocator(ABC):
    """Abstract interface for allocating, freeing, forking and swapping
    blocks drawn from a single pool of physical block ids.
    """

    @abstractmethod
    def allocate_mutable_block(self, prev_block: Optional[Block],
                               extra_hash: Optional[int]) -> Block:
        """Allocate a block whose token ids can still be appended to."""
        pass

    @abstractmethod
    def allocate_immutable_block(self, prev_block: Optional[Block],
                                 token_ids: List[int],
                                 extra_hash: Optional[int]) -> Block:
        """Allocate a block pre-populated with the given token ids."""
        pass

    @abstractmethod
    def allocate_immutable_blocks(self, prev_block: Optional[Block],
                                  block_token_ids: List[List[int]],
                                  extra_hash: Optional[int]) -> List[Block]:
        """Allocate one immutable block per entry of block_token_ids."""
        pass

    @abstractmethod
    def free(self, block: Block) -> None:
        """Release the given block back to the allocator."""
        pass

    @abstractmethod
    def fork(self, last_block: Block) -> List[Block]:
        """Create a new block sequence sharing memory with last_block's
        sequence."""
        pass

    @abstractmethod
    def get_num_total_blocks(self) -> int:
        """Total number of blocks managed by this allocator."""
        pass

    @abstractmethod
    def get_num_free_blocks(self) -> int:
        """Number of blocks currently available for allocation."""
        pass

    @abstractmethod
    def get_physical_block_id(self, absolute_id: int) -> int:
        """Translate an absolute block id to this allocator's zero-offset
        id."""
        pass

    @abstractmethod
    def swap_out(self, blocks: List[Block]) -> None:
        """Handle the given blocks being swapped out of this allocator."""
        pass

    @abstractmethod
    def swap_in(self, blocks: List[Block]) -> None:
        """Handle the given blocks being swapped in to this allocator."""
        pass

    @property
    @abstractmethod
    def all_block_ids(self) -> FrozenSet[int]:
        """The full set of physical block ids owned by this allocator."""
        pass

    @abstractmethod
    def clear_copy_on_writes(self) -> List[Tuple[int, int]]:
        """Return and clear the pending (src, dst) copy-on-write pairs."""
        pass

    @abstractmethod
    def mark_blocks_as_accessed(self, block_ids: List[int],
                                now: float) -> None:
        """Record an access time for the given blocks (prefix caching)."""
        pass

    @abstractmethod
    def mark_blocks_as_computed(self, block_ids: List[int]) -> None:
        """Mark the given blocks as computed (prefix caching)."""
        pass

    @abstractmethod
    def get_common_computed_block_ids(
            self, computed_seq_block_ids: List[List[int]]) -> List[int]:
        """Return the computed block ids shared by all given sequences."""
        pass

    @abstractmethod
    def cow_block_if_not_appendable(self, block: Block) -> BlockId:
        """NOTE: This should not be used besides Block"""
        pass

    @abstractmethod
    def promote_to_immutable_block(self, block: Block) -> BlockId:
        """NOTE: This should not be used besides Block"""
        pass

    @abstractmethod
    def get_num_full_blocks_touched(self, blocks: List[Block]) -> int:
        """Number of full blocks touched when swapping the given blocks."""
        pass

    @abstractmethod
    def get_prefix_cache_hit_rate(self) -> float:
        """Prefix cache hit rate. -1 means not supported or disabled."""
        pass

    @abstractmethod
    def reset_prefix_cache(self) -> bool:
        """Reset prefix cache."""
        pass

    class NoFreeBlocksError(ValueError):
        """Raised when an allocation is requested but no block is free."""
        pass

    @abstractmethod
    def find_cached_blocks_prefix(
        self,
        block_hashes: List[int],
    ) -> List[int]:
        """Return the prefix of block_hashes whose blocks are cached."""
        pass
211
+
212
+
213
class DeviceAwareBlockAllocator(ABC):
    """Abstract interface for a block allocator spanning multiple devices.

    Most methods mirror ``BlockAllocator`` but take an explicit ``Device``
    selecting which per-device pool to operate on.
    """

    @abstractmethod
    def allocate_mutable_block(self,
                               prev_block: Optional[Block],
                               device: Device,
                               extra_hash: Optional[int] = None) -> Block:
        """Allocate an appendable block on the given device."""
        pass

    @abstractmethod
    def allocate_immutable_block(self,
                                 prev_block: Optional[Block],
                                 token_ids: List[int],
                                 device: Device,
                                 extra_hash: Optional[int] = None) -> Block:
        """Allocate a block pre-populated with token_ids on the device."""
        pass

    @abstractmethod
    def allocate_immutable_blocks(
        self,
        prev_block: Optional[Block],
        block_token_ids: List[List[int]],
        device: Device,
        extra_hash: Optional[int] = None,
    ) -> List[Block]:
        """Allocate one immutable block per entry of block_token_ids."""
        pass

    @abstractmethod
    def get_num_free_blocks(self, device: Device) -> int:
        """Number of free blocks available on the given device."""
        pass

    @abstractmethod
    def get_num_total_blocks(self, device: Device) -> int:
        """Total number of blocks managed for the given device."""
        pass

    @abstractmethod
    def free(self, block: Block) -> None:
        """Release the given block back to its owning allocator."""
        pass

    @abstractmethod
    def fork(self, last_block: Block) -> List[Block]:
        """Create a new block sequence sharing memory with last_block's
        sequence."""
        pass

    @property
    @abstractmethod
    def all_block_ids(self) -> FrozenSet[int]:
        """The set of physical block ids across all devices."""
        pass

    @abstractmethod
    def clear_copy_on_writes(self) -> List[Tuple[int, int]]:
        """Return and clear the pending (src, dst) copy-on-write pairs."""
        pass

    @abstractmethod
    def mark_blocks_as_accessed(self, block_ids: List[int],
                                now: float) -> None:
        """Record an access time for the given blocks (prefix caching)."""
        pass

    @abstractmethod
    def mark_blocks_as_computed(self, block_ids: List[int]) -> None:
        """Mark the given blocks as computed (prefix caching)."""
        pass

    @abstractmethod
    def get_common_computed_block_ids(
            self, computed_seq_block_ids: List[List[int]]) -> List[int]:
        """Return the computed block ids shared by all given sequences."""
        pass

    @abstractmethod
    def get_num_full_blocks_touched(self, blocks: List[Block],
                                    device: Device) -> int:
        """Number of full blocks touched when swapping blocks on device."""
        pass

    @abstractmethod
    def swap(self, blocks: List[Block], src_device: Device,
             dst_device: Device) -> Dict[int, int]:
        """Swap blocks from src_device to dst_device; return the id map."""
        pass

    @abstractmethod
    def get_physical_block_id(self, device: Device, absolute_id: int) -> int:
        """Translate an absolute block id to the device's zero-offset id."""
        pass

    @abstractmethod
    def allocate_or_get_null_block(self) -> Block:
        """
        Null blocks are used as a placeholders for KV cache blocks that have
        been dropped due to sliding window.
        There is at most one null block per allocator.
        """
        pass

    @abstractmethod
    def get_prefix_cache_hit_rate(self, device: Device) -> float:
        """Prefix cache hit rate. -1 means not supported or disabled."""
        pass

    @abstractmethod
    def reset_prefix_cache(self) -> bool:
        """Reset prefix cache."""
        pass

    @abstractmethod
    def find_cached_blocks_prefix(
        self,
        block_hashes: List[int],
        device: Device = Device.GPU,
    ) -> List[int]:
        """Return the prefix of block_hashes cached on the given device."""
        pass
.venv/lib/python3.11/site-packages/vllm/core/block/naive_block.py ADDED
@@ -0,0 +1,465 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+
3
+ from collections import deque
4
+ from typing import Deque, FrozenSet, Iterable, List, Optional, Tuple, Union
5
+
6
+ from vllm.core.block.common import (BlockPool, CopyOnWriteTracker, RefCounter,
7
+ get_all_blocks_recursively)
8
+ from vllm.core.block.interfaces import Block, BlockAllocator, BlockId, Device
9
+
10
+ Refcount = int
11
+
12
+
13
+ class NaiveBlockAllocator(BlockAllocator):
14
+ """A simple block allocator that manages blocks of memory without prefix
15
+ caching.
16
+
17
+ Args:
18
+ create_block (Block.Factory): A factory function for creating new
19
+ blocks. This is used when a NaiveBlockAllocator is composed within
20
+ a prefix caching allocator -- the naive block allocator must
21
+ construct prefix caching blocks (but shouldn't know anything else
22
+ about them).
23
+ num_blocks (int): The total number of blocks to manage.
24
+ block_size (int): The size of each block in tokens.
25
+ block_ids (Optional[Iterable[int]], optional): An optional iterable of
26
+ block IDs. If not provided, block IDs will be assigned sequentially
27
+ from 0 to num_blocks - 1.
28
+ """
29
+
30
+ def __init__(
31
+ self,
32
+ create_block: Block.Factory,
33
+ num_blocks: int,
34
+ block_size: int,
35
+ block_ids: Optional[Iterable[int]] = None,
36
+ block_pool: Optional[BlockPool] = None,
37
+ ):
38
+ if block_ids is None:
39
+ block_ids = range(num_blocks)
40
+
41
+ self._free_block_indices: Deque[BlockId] = deque(block_ids)
42
+ self._all_block_indices = frozenset(block_ids)
43
+ assert len(self._all_block_indices) == num_blocks
44
+
45
+ self._refcounter = RefCounter(
46
+ all_block_indices=self._free_block_indices)
47
+ self._block_size = block_size
48
+
49
+ self._cow_tracker = CopyOnWriteTracker(
50
+ refcounter=self._refcounter.as_readonly())
51
+
52
+ if block_pool is None:
53
+ extra_factor = 4
54
+ # Pre-allocate "num_blocks * extra_factor" block objects.
55
+ # The "* extra_factor" is a buffer to allow more block objects
56
+ # than physical blocks
57
+ self._block_pool = BlockPool(self._block_size, create_block, self,
58
+ num_blocks * extra_factor)
59
+ else:
60
+ # In this case, the block pool is provided by the caller,
61
+ # which means that there is most likely a need to share
62
+ # a block pool between allocators
63
+ self._block_pool = block_pool
64
+
65
+ def allocate_immutable_block(self,
66
+ prev_block: Optional[Block],
67
+ token_ids: List[int],
68
+ extra_hash: Optional[int] = None,
69
+ device: Optional[Device] = None) -> Block:
70
+ """Allocates a new immutable block with the given token IDs, linked to
71
+ the previous block.
72
+
73
+ Args:
74
+ prev_block (Optional[Block]): The previous block in the sequence. If
75
+ None, then the block to be allocated is the first block in the
76
+ sequence.
77
+ token_ids (List[int]): The token IDs to be stored in the new block.
78
+
79
+ Returns:
80
+ Block: The newly allocated immutable block.
81
+ """
82
+ assert device is None
83
+ block = self.allocate_mutable_block(prev_block=prev_block)
84
+ block.append_token_ids(token_ids)
85
+ return block
86
+
87
+ def allocate_immutable_blocks(
88
+ self,
89
+ prev_block: Optional[Block],
90
+ block_token_ids: List[List[int]],
91
+ extra_hash: Optional[int] = None,
92
+ device: Optional[Device] = None) -> List[Block]:
93
+ assert device is None
94
+ num_blocks = len(block_token_ids)
95
+
96
+ block_ids = []
97
+ for i in range(num_blocks):
98
+ block_ids.append(self._allocate_block_id())
99
+
100
+ blocks = []
101
+ for i in range(num_blocks):
102
+ prev_block = self._block_pool.init_block(
103
+ prev_block=prev_block,
104
+ token_ids=block_token_ids[i],
105
+ block_size=self._block_size,
106
+ physical_block_id=block_ids[i])
107
+ blocks.append(prev_block)
108
+
109
+ return blocks
110
+
111
+ def allocate_mutable_block(self,
112
+ prev_block: Optional[Block],
113
+ extra_hash: Optional[int] = None,
114
+ device: Optional[Device] = None) -> Block:
115
+ """Allocates a new mutable block, linked to the previous block.
116
+
117
+ Args:
118
+ prev_block (Optional[Block]): The previous block in the sequence. If
119
+ None, then the block to be allocated is the first block in the
120
+ sequence.
121
+
122
+ Returns:
123
+ Block: The newly allocated mutable block.
124
+ """
125
+ assert device is None
126
+ block_id = self._allocate_block_id()
127
+ block = self._block_pool.init_block(prev_block=prev_block,
128
+ token_ids=[],
129
+ block_size=self._block_size,
130
+ physical_block_id=block_id)
131
+ return block
132
+
133
+ def _allocate_block_id(self) -> BlockId:
134
+ if not self._free_block_indices:
135
+ raise BlockAllocator.NoFreeBlocksError()
136
+
137
+ block_id = self._free_block_indices.popleft()
138
+ self._refcounter.incr(block_id)
139
+ return block_id
140
+
141
+ def _free_block_id(self, block: Union[Block, BlockId]) -> None:
142
+ if isinstance(block, Block):
143
+ block_id = block.block_id
144
+ block.block_id = None
145
+ else:
146
+ block_id = block
147
+ assert block_id is not None
148
+
149
+ refcount = self._refcounter.decr(block_id)
150
+ if refcount == 0:
151
+ self._free_block_indices.appendleft(block_id)
152
+
153
+ def free(self, block: Block, keep_block_object: bool = False) -> None:
154
+ # Release the physical block id
155
+ self._free_block_id(block)
156
+
157
+ # Release the block object
158
+ if not keep_block_object:
159
+ self._block_pool.free_block(block)
160
+
161
+ def free_block_id(self, block_id: BlockId) -> None:
162
+ self._free_block_id(block_id)
163
+
164
+ def fork(self, last_block: Block) -> List[Block]:
165
+ """Creates a new sequence of blocks that shares the same underlying
166
+ memory as the original sequence.
167
+
168
+ Args:
169
+ last_block (Block): The last block in the original sequence.
170
+
171
+ Returns:
172
+ List[Block]: The new sequence of blocks that shares the same memory
173
+ as the original sequence.
174
+ """
175
+ source_blocks = get_all_blocks_recursively(last_block)
176
+
177
+ forked_blocks: List[Block] = []
178
+ prev_block = None
179
+ for block in source_blocks:
180
+
181
+ # Increment refcount for each block.
182
+ assert block.block_id is not None
183
+ refcount = self._refcounter.incr(block.block_id)
184
+ assert refcount != 1, "can't fork free'd block"
185
+
186
+ forked_block = self._block_pool.init_block(
187
+ prev_block=prev_block,
188
+ token_ids=block.token_ids,
189
+ block_size=self._block_size,
190
+ physical_block_id=block.block_id)
191
+
192
+ forked_blocks.append(forked_block)
193
+ prev_block = forked_blocks[-1]
194
+
195
+ return forked_blocks
196
+
197
+ def get_num_free_blocks(self) -> int:
198
+ return len(self._free_block_indices)
199
+
200
+ def get_num_total_blocks(self) -> int:
201
+ return len(self._all_block_indices)
202
+
203
+ def get_physical_block_id(self, absolute_id: int) -> int:
204
+ """Returns the zero-offset block id on certain block allocator
205
+ given the absolute block id.
206
+
207
+ Args:
208
+ absolute_id (int): The absolute block id for the block
209
+ in whole allocator.
210
+
211
+ Returns:
212
+ int: The zero-offset block id on certain device.
213
+ """
214
+ return sorted(self._all_block_indices).index(absolute_id)
215
+
216
+ @property
217
+ def refcounter(self):
218
+ return self._refcounter
219
+
220
+ @property
221
+ def all_block_ids(self) -> FrozenSet[int]:
222
+ return self._all_block_indices
223
+
224
+ def cow_block_if_not_appendable(self, block: Block) -> BlockId:
225
+ """Performs a copy-on-write operation on the given block if it is not
226
+ appendable.
227
+
228
+ Args:
229
+ block (Block): The block to check for copy-on-write.
230
+
231
+ Returns:
232
+ BlockId: The block index of the new block if a copy-on-write
233
+ operation was performed, or the original block index if
234
+ no copy-on-write was necessary.
235
+ """
236
+ src_block_id = block.block_id
237
+ assert src_block_id is not None
238
+
239
+ if self._cow_tracker.is_appendable(block):
240
+ return src_block_id
241
+
242
+ self._free_block_id(block)
243
+ trg_block_id = self._allocate_block_id()
244
+
245
+ self._cow_tracker.record_cow(src_block_id, trg_block_id)
246
+
247
+ return trg_block_id
248
+
249
+ def clear_copy_on_writes(self) -> List[Tuple[BlockId, BlockId]]:
250
+ """Returns the copy-on-write source->destination mapping and clears it.
251
+
252
+ Returns:
253
+ List[Tuple[BlockId, BlockId]]: A list mapping source
254
+ block indices to destination block indices.
255
+ """
256
+ return self._cow_tracker.clear_cows()
257
+
258
+ def mark_blocks_as_accessed(self, block_ids: List[int],
259
+ now: float) -> None:
260
+ """Mark blocks as accessed, used in prefix caching.
261
+
262
+ Since the naive allocator does not implement prefix caching, we do
263
+ nothing.
264
+ """
265
+ pass
266
+
267
+ def mark_blocks_as_computed(self, block_ids: List[int]) -> None:
268
+ """Mark blocks as computed, used in prefix caching.
269
+
270
+ Since the naive allocator does not implement prefix caching, we do
271
+ nothing.
272
+ """
273
+ pass
274
+
275
+ def get_common_computed_block_ids(
276
+ self, computed_seq_block_ids: List[List[int]]) -> List[int]:
277
+ """Determine blocks that can be skipped in prefill.
278
+
279
+ Since the naive allocator does not support prefix caching, always return
280
+ an empty list.
281
+ """
282
+ return []
283
+
284
+ def promote_to_immutable_block(self, block: Block) -> BlockId:
285
+ raise NotImplementedError("There is no promotion for naive blocks")
286
+
287
+ def get_num_full_blocks_touched(self, blocks: List[Block]) -> int:
288
+ """Returns the number of full blocks that will be touched by
289
+ swapping in/out.
290
+
291
+ Args:
292
+ blocks: List of blocks to be swapped.
293
+ Returns:
294
+ int: the number of full blocks that will be touched by
295
+ swapping in/out the given blocks. Non full blocks are ignored
296
+ when deciding the number of blocks to touch.
297
+ """
298
+ # NOTE: for naive block, we use set to eliminate common blocks among
299
+ # seqs, also we compare the empty slots in the mutable blocks with
300
+ # lookahead slots to get the number of unique new block that are
301
+ # needed.
302
+ old_block_set = set()
303
+ for block in blocks:
304
+ if block.is_full:
305
+ old_block_set.add(block)
306
+ return len(old_block_set)
307
+
308
+ def swap_out(self, blocks: List[Block]) -> None:
309
+ for block in blocks:
310
+ self._free_block_id(block)
311
+
312
+ def swap_in(self, blocks: List[Block]) -> None:
313
+ for block in blocks:
314
+ # Here we allocate either immutable or mutable block and then
315
+ # extract its block_id. Note that the block object is released
316
+ # and the block_id is assigned to "block" to allow reusing the
317
+ # existing "block" object
318
+ if block.is_full:
319
+ tmp_block = self.allocate_immutable_block(
320
+ prev_block=block.prev_block, token_ids=block.token_ids)
321
+ else:
322
+ tmp_block = self.allocate_mutable_block(
323
+ prev_block=block.prev_block)
324
+ tmp_block.append_token_ids(block.token_ids)
325
+
326
+ block_id = tmp_block.block_id
327
+ tmp_block.block_id = None
328
+ self._block_pool.free_block(tmp_block)
329
+
330
+ block.block_id = block_id # Assign block_id
331
+
332
+ def get_prefix_cache_hit_rate(self) -> float:
333
+ return -1
334
+
335
+ def reset_prefix_cache(self) -> bool:
336
+ """No prefix cache for naive block allocator."""
337
+ return True
338
+
339
+ def find_cached_blocks_prefix(self, block_hashes: List[int]) -> List[int]:
340
+ # Not applicable for naive block allocator.
341
+ return []
342
+
343
+
344
+ class NaiveBlock(Block):
345
+ """An implementation of the Block class that does not support prefix
346
+ caching.
347
+
348
+ The NaiveBlock class represents a block of token IDs with a fixed size. It
349
+ provides methods for appending token IDs to the block and manages copy-on
350
+ -write operations when necessary.
351
+
352
+ Args:
353
+ prev_block (Block): The previous block in the sequence.
354
+ token_ids (List[int]): The initial token IDs to be stored in the block.
355
+ block_size (int): The maximum number of token IDs that can be stored in
356
+ the block.
357
+ allocator (BlockAllocator): The block allocator associated with this
358
+ block.
359
+ block_id (Optional[int], optional): The physical block index
360
+ of this block. Defaults to None, which means no allocation has been
361
+ made.
362
+ _cow_target (Optional[Block], optional): The copy-on-write target block.
363
+ If not provided, it defaults to self.
364
+ """
365
+
366
+ def __init__(self,
367
+ prev_block: Optional[Block],
368
+ token_ids: List[int],
369
+ block_size: int,
370
+ allocator: BlockAllocator,
371
+ block_id: Optional[int] = None,
372
+ _cow_target: Optional[Block] = None,
373
+ extra_hash: Optional[int] = None):
374
+ self._token_ids: List[int] = []
375
+ self._block_size = block_size
376
+ self._prev_block = prev_block
377
+ self._block_id = block_id
378
+ self._allocator = allocator
379
+ self._cow_target = _cow_target if _cow_target is not None else self
380
+
381
+ self._append_token_ids_no_cow(token_ids)
382
+
383
+ def append_token_ids(self, token_ids: List[int]) -> None:
384
+ """Appends the given token IDs to the block and performs a
385
+ copy-on-write if necessary.
386
+
387
+ Args:
388
+ token_ids (Optional[List[int]]): The token IDs to be appended
389
+ to the block.
390
+ """
391
+ self._append_token_ids_no_cow(token_ids)
392
+
393
+ if self._block_id is not None:
394
+ self._block_id = (self._allocator.cow_block_if_not_appendable(
395
+ self._cow_target))
396
+
397
+ def _append_token_ids_no_cow(self, token_ids: List[int]) -> None:
398
+ """Appends the given token IDs to the block
399
+
400
+ Args:
401
+ token_ids (List[int]): The token IDs to be appended to the block.
402
+ """
403
+ if len(token_ids) == 0:
404
+ return
405
+
406
+ assert len(token_ids) <= self.num_empty_slots
407
+
408
+ self._token_ids.extend(token_ids)
409
+
410
+ @property
411
+ def computed(self) -> bool:
412
+ raise NotImplementedError
413
+
414
+ @computed.setter
415
+ def computed(self, value) -> None:
416
+ raise NotImplementedError
417
+
418
+ @property
419
+ def last_accessed(self) -> float:
420
+ raise NotImplementedError
421
+
422
+ @last_accessed.setter
423
+ def last_accessed(self, last_accessed_ts: float):
424
+ raise NotImplementedError
425
+
426
+ @property
427
+ def block_id(self) -> Optional[int]:
428
+ return self._block_id
429
+
430
+ @block_id.setter
431
+ def block_id(self, value: Optional[int]) -> None:
432
+ self._block_id = value
433
+
434
+ @property
435
+ def is_full(self) -> bool:
436
+ return self.num_empty_slots == 0
437
+
438
+ @property
439
+ def num_empty_slots(self) -> int:
440
+ return self._block_size - len(self.token_ids)
441
+
442
+ @property
443
+ def token_ids(self) -> List[int]:
444
+ return self._token_ids
445
+
446
+ @property
447
+ def num_tokens_total(self) -> int:
448
+ raise NotImplementedError(
449
+ "num_tokens_total is not used for naive block")
450
+
451
+ @property
452
+ def block_size(self) -> int:
453
+ return self._block_size
454
+
455
+ @property
456
+ def prev_block(self) -> Optional["Block"]:
457
+ return self._prev_block
458
+
459
+ @property
460
+ def extra_hash(self):
461
+ return None
462
+
463
+ @property
464
+ def content_hash(self) -> Optional[int]:
465
+ return None
.venv/lib/python3.11/site-packages/vllm/core/block/prefix_caching_block.py ADDED
@@ -0,0 +1,1134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ """Token blocks."""
3
+ import sys
4
+ from bisect import bisect_left
5
+ from os.path import commonprefix
6
+ from typing import (Callable, Dict, FrozenSet, Iterable, List, Optional, Set,
7
+ Tuple)
8
+
9
+ from vllm.core.block.common import (CacheMetricData, CopyOnWriteTracker,
10
+ get_all_blocks_recursively)
11
+ from vllm.core.block.interfaces import (Block, BlockAllocator, BlockId, Device,
12
+ DeviceAwareBlockAllocator)
13
+ from vllm.core.block.naive_block import (BlockPool, NaiveBlock,
14
+ NaiveBlockAllocator)
15
+ from vllm.core.evictor import EvictionPolicy, Evictor, make_evictor
16
+ from vllm.logger import init_logger
17
+ from vllm.sequence import Sequence
18
+
19
+ PrefixHash = int
20
+
21
+ # By default, we init our block access time as _DEFAULT_LAST_ACCESSED_TIME
22
+ # so that if we find one block is still hold _DEFAULT_LAST_ACCESSED_TIME,
23
+ # then we know this block hasn't been accessed yet.
24
+ _DEFAULT_LAST_ACCESSED_TIME = -1
25
+
26
+ logger = init_logger(__name__)
27
+
28
+
29
+ class BlockTracker:
30
+ """Used to track the status of a block inside the prefix caching allocator
31
+ """
32
+ __slots__ = ("active", "last_accessed", "computed")
33
+
34
+ def reset(self):
35
+ self.last_accessed: float = _DEFAULT_LAST_ACCESSED_TIME
36
+ self.computed: bool = False
37
+
38
+ def __init__(self):
39
+ self.active: bool = False
40
+ self.reset()
41
+
42
+ def enable(self):
43
+ assert not self.active
44
+ self.active = True
45
+ self.reset()
46
+
47
+ def disable(self):
48
+ assert self.active
49
+ self.active = False
50
+ self.reset()
51
+
52
+
53
+ class PrefixCachingBlockAllocator(BlockAllocator):
54
+ """A block allocator that implements prefix caching.
55
+
56
+ The PrefixCachingBlockAllocator maintains a cache of blocks based on their
57
+ content hash. It reuses blocks with the same content hash to avoid redundant
58
+ memory allocation. The allocator also supports copy-on-write operations.
59
+
60
+ Args:
61
+ num_blocks (int): The total number of blocks to manage.
62
+ block_size (int): The size of each block in tokens.
63
+ block_ids(Optional[Iterable[int]], optional): An optional iterable of
64
+ block IDs. If not provided, block IDs will be assigned sequentially
65
+ from 0 to num_blocks - 1.
66
+ """
67
+
68
+ # Note that we use 'None' as a string here instead of None because
69
+ # as of Python 3.12, hash(None) returns a constant predictable value.
70
+ # This could possibly make it easier to find and exploit hash
71
+ # collisions. 'None' as a string will be hashed differently per process,
72
+ # but consistently within the same process. This is the same as the
73
+ # behavior of None prior to Python 3.12.
74
+ _none_hash: int = hash('None')
75
+
76
+ # Implements Block.Factory.
77
+ def __init__(
78
+ self,
79
+ num_blocks: int,
80
+ block_size: int,
81
+ block_ids: Optional[Iterable[int]] = None,
82
+ eviction_policy: EvictionPolicy = EvictionPolicy.LRU,
83
+ ):
84
+ if block_ids is None:
85
+ block_ids = range(num_blocks)
86
+
87
+ self._block_size = block_size
88
+
89
+ # A mapping of prefix hash to block index. All blocks which have a
90
+ # prefix hash will be in this dict, even if they have refcount 0.
91
+ self._cached_blocks: Dict[PrefixHash, BlockId] = {}
92
+
93
+ # A list of immutable block IDs that have been touched by scheduler
94
+ # and should be marked as computed after an entire batch of sequences
95
+ # are scheduled.
96
+ self._touched_blocks: Set[BlockId] = set()
97
+
98
+ # Used to track status of each physical block id
99
+ self._block_tracker: Dict[BlockId, BlockTracker] = {}
100
+ for block_id in block_ids:
101
+ self._block_tracker[block_id] = BlockTracker()
102
+
103
+ # Pre-allocate "num_blocks * extra_factor" block objects.
104
+ # The "* extra_factor" is a buffer to allow more block objects
105
+ # than physical blocks
106
+ extra_factor = 4
107
+ self._block_pool = BlockPool(self._block_size, self._create_block,
108
+ self, num_blocks * extra_factor)
109
+
110
+ # An allocator for blocks that do not have prefix hashes.
111
+ self._hashless_allocator = NaiveBlockAllocator(
112
+ create_block=self._create_block, # type: ignore
113
+ num_blocks=num_blocks,
114
+ block_size=block_size,
115
+ block_ids=block_ids,
116
+ block_pool=self._block_pool, # Share block pool here
117
+ )
118
+
119
+ # Evitor used to maintain how we want to handle those computed blocks
120
+ # if we find memory pressure is high.
121
+ self.eviction_policy = eviction_policy
122
+ self.evictor: Evictor = make_evictor(self.eviction_policy)
123
+
124
+ # We share the refcounter between allocators. This allows us to promote
125
+ # blocks originally allocated in the hashless allocator to immutable
126
+ # blocks.
127
+ self._refcounter = self._hashless_allocator.refcounter
128
+
129
+ self._cow_tracker = CopyOnWriteTracker(
130
+ refcounter=self._refcounter.as_readonly())
131
+
132
+ self.metric_data = CacheMetricData()
133
+
134
+ def _create_block(
135
+ self,
136
+ prev_block: Optional[Block],
137
+ token_ids: List[int],
138
+ block_size: int,
139
+ allocator: BlockAllocator,
140
+ block_id: Optional[int] = None,
141
+ computed: bool = False,
142
+ extra_hash: Optional[int] = None,
143
+ ) -> Block:
144
+ # Bind block to self.
145
+ allocator = self
146
+
147
+ return PrefixCachingBlock(
148
+ prev_block=prev_block,
149
+ token_ids=token_ids,
150
+ block_size=block_size,
151
+ block_id=block_id,
152
+ allocator=allocator,
153
+ computed=computed,
154
+ extra_hash=extra_hash,
155
+ )
156
+
157
+ def allocate_immutable_block(self,
158
+ prev_block: Optional[Block],
159
+ token_ids: List[int],
160
+ extra_hash: Optional[int] = None,
161
+ device: Optional[Device] = None) -> Block:
162
+ """Allocates an immutable block with the given token IDs, reusing cached
163
+ blocks if possible.
164
+
165
+ Args:
166
+ prev_block (Optional[Block]): The previous block in the sequence.
167
+ token_ids (List[int]): The token IDs to be stored in the block.
168
+
169
+ Returns:
170
+ Block: The allocated immutable block.
171
+ """
172
+ assert device is None
173
+ assert_prefix_caching_block_or_none(prev_block)
174
+
175
+ # First, try to create a block that points to cached data
176
+ block = self._block_pool.init_block(prev_block=prev_block,
177
+ token_ids=token_ids,
178
+ block_size=self._block_size,
179
+ physical_block_id=None,
180
+ extra_hash=extra_hash)
181
+ assert block.content_hash is not None
182
+
183
+ cached_block_id = self._cached_blocks.get(block.content_hash, None)
184
+ if cached_block_id is not None:
185
+ self.metric_data.query(hit=True)
186
+ block.block_id = cached_block_id
187
+ self._incr_refcount_cached_block(block)
188
+ return block
189
+ self.metric_data.query(hit=False)
190
+ self._block_pool.free_block(block)
191
+
192
+ # No cached block => Allocate a new block
193
+ block = self.allocate_mutable_block(prev_block, extra_hash=extra_hash)
194
+ block.append_token_ids(token_ids)
195
+ return block
196
+
197
+ def allocate_immutable_blocks(
198
+ self,
199
+ prev_block: Optional[Block],
200
+ block_token_ids: List[List[int]],
201
+ extra_hash: Optional[int] = None,
202
+ device: Optional[Device] = None) -> List[Block]:
203
+ blocks = []
204
+ for token_ids in block_token_ids:
205
+ prev_block = self.allocate_immutable_block(prev_block=prev_block,
206
+ token_ids=token_ids,
207
+ device=device,
208
+ extra_hash=extra_hash)
209
+ blocks.append(prev_block)
210
+ return blocks
211
+
212
+ def allocate_mutable_block(self,
213
+ prev_block: Optional[Block],
214
+ extra_hash: Optional[int] = None,
215
+ device: Optional[Device] = None) -> Block:
216
+ """Allocates a mutable block. If there are no free blocks, this will
217
+ evict unused cached blocks.
218
+
219
+ Args:
220
+ prev_block (Block): The previous block in the sequence.
221
+ None is not allowed unlike it is super class.
222
+
223
+ Returns:
224
+ Block: The allocated mutable block.
225
+ """
226
+ assert device is None
227
+ assert_prefix_caching_block_or_none(prev_block)
228
+
229
+ block_id = self._allocate_block_id()
230
+ block = self._block_pool.init_block(prev_block=prev_block,
231
+ token_ids=[],
232
+ block_size=self._block_size,
233
+ physical_block_id=block_id,
234
+ extra_hash=extra_hash)
235
+ assert not block.computed
236
+ assert block.content_hash is None
237
+ return block
238
+
239
+ def _incr_refcount_cached_block(self, block: Block) -> None:
240
+ # Set this block to be "computed" since it is pointing to a
241
+ # cached block id (which was already computed)
242
+ block.computed = True
243
+
244
+ block_id = block.block_id
245
+ assert block_id is not None
246
+
247
+ refcount = self._refcounter.incr(block_id)
248
+ if refcount == 1:
249
+ # In case a cached block was evicted, restore its tracking
250
+ if block_id in self.evictor:
251
+ self.evictor.remove(block_id)
252
+
253
+ self._track_block_id(block_id, computed=True)
254
+
255
+ def _decr_refcount_cached_block(self, block: Block) -> None:
256
+ # Ensure this is immutable/cached block
257
+ assert block.content_hash is not None
258
+
259
+ block_id = block.block_id
260
+ assert block_id is not None
261
+
262
+ refcount = self._refcounter.decr(block_id)
263
+ if refcount > 0:
264
+ block.block_id = None
265
+ return
266
+ else:
267
+ assert refcount == 0
268
+
269
+ # No longer used
270
+ assert block.content_hash in self._cached_blocks
271
+
272
+ # Add the cached block to the evictor
273
+ # (This keeps the cached block around so it can be reused)
274
+ self.evictor.add(block_id, block.content_hash, block.num_tokens_total,
275
+ self._block_tracker[block_id].last_accessed)
276
+
277
+ # Stop tracking the block
278
+ self._untrack_block_id(block_id)
279
+
280
+ block.block_id = None
281
+
282
+ def _decr_refcount_hashless_block(self, block: Block) -> None:
283
+ block_id = block.block_id
284
+ assert block_id is not None
285
+
286
+ # We may have a fork case where block is shared,
287
+ # in which case, we cannot remove it from tracking
288
+ refcount = self._refcounter.get(block_id)
289
+ if refcount == 1:
290
+ self._untrack_block_id(block_id)
291
+
292
+ # Decrement refcount of the block_id, but do not free the block object
293
+ # itself (will be handled by the caller)
294
+ self._hashless_allocator.free(block, keep_block_object=True)
295
+
296
+ def _allocate_block_id(self) -> BlockId:
297
+ """First tries to allocate a block id from the hashless allocator,
298
+ and if there are no blocks, then tries to evict an unused cached block.
299
+ """
300
+ hashless_block_id = self._maybe_allocate_hashless_block_id()
301
+ if hashless_block_id is not None:
302
+ return hashless_block_id
303
+
304
+ evicted_block_id = self._maybe_allocate_evicted_block_id()
305
+ if evicted_block_id is not None:
306
+ return evicted_block_id
307
+
308
+ # No block available in hashless allocator, nor in unused cache blocks.
309
+ raise BlockAllocator.NoFreeBlocksError()
310
+
311
+ def _maybe_allocate_hashless_block_id(self) -> Optional[BlockId]:
312
+ try:
313
+ # Allocate mutable block and extract its block_id
314
+ block = self._hashless_allocator.allocate_mutable_block(
315
+ prev_block=None)
316
+ block_id = block.block_id
317
+ self._block_pool.free_block(block)
318
+
319
+ self._track_block_id(block_id, computed=False)
320
+ return block_id
321
+ except BlockAllocator.NoFreeBlocksError:
322
+ return None
323
+
324
+ def _maybe_allocate_evicted_block_id(self) -> Optional[BlockId]:
325
+ if self.evictor.num_blocks == 0:
326
+ return None
327
+
328
+ # Here we get an evicted block, which is only added
329
+ # into evictor if its ref counter is 0
330
+ # and since its content would be changed, we need
331
+ # to remove it from _cached_blocks's tracking list
332
+ block_id, content_hash_to_evict = self.evictor.evict()
333
+
334
+ # Sanity checks
335
+ assert content_hash_to_evict in self._cached_blocks
336
+ _block_id = self._cached_blocks[content_hash_to_evict]
337
+ assert self._refcounter.get(_block_id) == 0
338
+ assert _block_id == block_id
339
+
340
+ self._cached_blocks.pop(content_hash_to_evict)
341
+
342
+ self._refcounter.incr(block_id)
343
+ self._track_block_id(block_id, computed=False)
344
+
345
+ return block_id
346
+
347
+ def _free_block_id(self, block: Block) -> None:
348
+ """Decrements the refcount of the block. The block may be in two
349
+ possible states: (1) immutable/cached or (2) mutable/hashless.
350
+ In the first case, the refcount is decremented directly and the block
351
+ may be possibly added to the evictor. In other case, hashless
352
+ allocator free(..) with keep_block_object=True is called to only free
353
+ the block id (since the block object may be reused by the caller)
354
+ """
355
+ block_id = block.block_id
356
+ assert block_id is not None, "Freeing unallocated block is undefined"
357
+
358
+ if block.content_hash is not None:
359
+ # Immutable: This type of block is always cached, and we want to
360
+ # keep it in the evictor for future reuse
361
+ self._decr_refcount_cached_block(block)
362
+ else:
363
+ # Mutable: This type of block is not cached, so we release it
364
+ # directly to the hashless allocator
365
+ self._decr_refcount_hashless_block(block)
366
+
367
+ assert block.block_id is None
368
+
369
+ def free(self, block: Block, keep_block_object: bool = False) -> None:
370
+ """Release the block (look at free_block_id(..) docs)
371
+ """
372
+ # Release the physical block index
373
+ self._free_block_id(block)
374
+
375
+ # Release the block object to the pool
376
+ if not keep_block_object:
377
+ self._block_pool.free_block(block)
378
+
379
    def fork(self, last_block: Block) -> List[Block]:
        """Creates a new sequence of blocks that shares the same underlying
        memory as the original sequence.

        Args:
            last_block (Block): The last block in the original sequence.

        Returns:
            List[Block]: The new sequence of blocks that shares the same memory
                as the original sequence.
        """
        source_blocks = get_all_blocks_recursively(last_block)

        forked_blocks: List[Block] = []
        prev_block = None
        for block in source_blocks:
            block_id = block.block_id
            assert block_id is not None

            # Each forked block shares the physical block, so bump the
            # refcount; a refcount of 1 after incr means the source was
            # already freed, which is a caller error.
            refcount = self._refcounter.incr(block_id)
            assert refcount != 1, "can't fork free'd block_id = {}".format(
                block_id)

            forked_block = self._block_pool.init_block(
                prev_block=prev_block,
                token_ids=block.token_ids,
                block_size=self._block_size,
                physical_block_id=block_id,
                extra_hash=block.extra_hash)

            forked_blocks.append(forked_block)
            prev_block = forked_blocks[-1]

        return forked_blocks
413
+
414
+ def get_num_free_blocks(self, device: Optional[Device] = None) -> int:
415
+ assert device is None
416
+ # The number of free blocks is the number of hashless free blocks
417
+ # plus the number of blocks evictor could free from its list.
418
+ return self._hashless_allocator.get_num_free_blocks(
419
+ ) + self.evictor.num_blocks
420
+
421
    def get_num_total_blocks(self) -> int:
        """Total number of blocks managed by this allocator (free or used)."""
        return self._hashless_allocator.get_num_total_blocks()
423
+
424
    def get_physical_block_id(self, absolute_id: int) -> int:
        """Returns the zero-offset block id on certain block allocator
        given the absolute block id.

        Args:
            absolute_id (int): The absolute block id for the block
                in whole allocator.

        Returns:
            int: The zero-offset block id on certain device.
        """
        # NOTE: sorts the full id set on every call (O(N log N)); fine for
        # the current call sites but not intended for hot paths.
        return sorted(self.all_block_ids).index(absolute_id)
436
+
437
    @property
    def all_block_ids(self) -> FrozenSet[int]:
        """All block ids managed here; delegates to the hashless allocator,
        which owns the full id space."""
        return self._hashless_allocator.all_block_ids
440
+
441
    def get_prefix_cache_hit_rate(self) -> float:
        """Prefix-cache hit rate as tracked by the allocator's metric data."""
        return self.metric_data.get_hit_rate()
443
+
444
    def reset_prefix_cache(self) -> bool:
        """Reset prefix cache. This function may be used in RLHF
        flows to invalidate prefix caching after the weights are updated,
        or used for resetting prefix caching status for benchmarking.

        Returns:
            bool: True if the prefix cache is successfully reset,
            False otherwise.
        """
        # Refuse to reset while any block is still referenced; otherwise
        # in-flight sequences would observe stale/rewritten cache state.
        num_used_blocks = (self.get_num_total_blocks() -
                           self.get_num_free_blocks())
        if num_used_blocks > 0:
            logger.warning(
                "Failed to reset prefix cache because some "
                "blocks (%d) are not freed yet", num_used_blocks)
            return False

        # Free all blocks in the evictor. Each reclaimed id is handed back
        # to the hashless allocator so it becomes plainly free again.
        while (block_id :=
               self._maybe_allocate_evicted_block_id()) is not None:
            self._hashless_allocator.free_block_id(block_id)

        # Should not have any cached blocks because all blocks are evicted.
        assert not self._cached_blocks

        # Reset the evictor.
        self.evictor = make_evictor(self.eviction_policy)

        # Reset the block tracker.
        for block_id in self._block_tracker:
            self._block_tracker[block_id] = BlockTracker()

        # Reset the metrics.
        self.metric_data = CacheMetricData()

        logger.info("Successfully reset prefix cache")
        return True
481
+
482
+ def is_block_cached(self, block: Block) -> bool:
483
+ assert block.content_hash is not None
484
+ return block.content_hash in self._cached_blocks
485
+
486
    def promote_to_immutable_block(self, block: Block) -> BlockId:
        """Once a mutable block is full, it can be promoted to an immutable
        block. This means that its content can be referenced by future blocks
        having the same prefix.

        Note that if we already have a cached block with the same content, we
        will replace the newly-promoted block's mapping with the existing cached
        block id.

        Args:
            block: The mutable block to be promoted.

        Returns:
            BlockId: Either the original block index, or the block index of
                the previously cached block matching the same content.
        """
        # Ensure block can be promoted
        assert block.content_hash is not None
        assert block.block_id is not None
        assert self._refcounter.get(block.block_id) > 0

        if block.content_hash not in self._cached_blocks:
            # No cached content hash => Set this block as cached.
            # Note that this block cannot be marked as computed yet
            # because other sequences in the same batch cannot reuse
            # this block.
            self._cached_blocks[block.content_hash] = block.block_id
            # Mark this block as touched so that it can be marked as
            # computed after the entire batch of sequences are scheduled.
            self._touched_blocks.add(block.block_id)
            return block.block_id

        # Reuse the cached content hash: drop our hashless id first, then
        # point the block at the cached physical block.
        self._decr_refcount_hashless_block(block)
        block.block_id = self._cached_blocks[block.content_hash]

        # Increment refcount of the cached block and (possibly) restore
        # it from the evictor.
        # Note that in this case, the block is marked as computed
        self._incr_refcount_cached_block(block)

        return block.block_id
528
+
529
    def cow_block_if_not_appendable(self, block: Block) -> BlockId:
        """Performs a copy-on-write operation on the given block if it is not
        appendable.

        Args:
            block (Block): The block to check for copy-on-write.

        Returns:
            BlockId: The block index of the new block if a copy-on-write
                operation was performed, or the original block index if
                no copy-on-write was necessary.
        """
        src_block_id = block.block_id
        assert src_block_id is not None

        if self._cow_tracker.is_appendable(block):
            return src_block_id

        # Shared block: release our reference and allocate a private copy;
        # the tracker records the src->dst pair for the actual data copy.
        self._free_block_id(block)
        trg_block_id = self._allocate_block_id()

        self._cow_tracker.record_cow(src_block_id, trg_block_id)

        return trg_block_id
553
+
554
    def clear_copy_on_writes(self) -> List[Tuple[BlockId, BlockId]]:
        """Returns the copy-on-write source->destination mapping and clears it.

        Returns:
            List[Tuple[BlockId, BlockId]]: A list mapping source
                block indices to destination block indices.
        """
        # Delegated entirely to the CoW tracker.
        return self._cow_tracker.clear_cows()
562
+
563
    def mark_blocks_as_accessed(self, block_ids: List[int],
                                now: float) -> None:
        """Mark blocks as accessed, used in prefix caching.

        If the block is added into evictor, we need to update corresponding
        info in evictor's metadata.
        """

        for block_id in block_ids:
            if self._block_tracker[block_id].active:
                # Block is live: record the timestamp on the tracker.
                self._block_tracker[block_id].last_accessed = now
            elif block_id in self.evictor:
                # Block is parked in the evictor: refresh its eviction
                # metadata instead.
                self.evictor.update(block_id, now)
            else:
                raise ValueError(
                    "Mark block as accessed which is not belonged to GPU")
579
+
580
    def mark_blocks_as_computed(self, block_ids: List[int]) -> None:
        """Mark all blocks touched since the last call as computed.

        NOTE: ``block_ids`` is intentionally ignored; the set of blocks to
        mark comes from ``self._touched_blocks`` (populated at promotion
        time), which is then cleared.
        """
        # Mark all touched blocks as computed.
        for block_id in self._touched_blocks:
            self._block_tracker[block_id].computed = True
        self._touched_blocks.clear()
585
+
586
    def _track_block_id(self, block_id: Optional[BlockId],
                        computed: bool) -> None:
        """Activate tracking for ``block_id`` and set its computed flag."""
        assert block_id is not None
        self._block_tracker[block_id].enable()
        self._block_tracker[block_id].computed = computed
591
+
592
    def _untrack_block_id(self, block_id: Optional[BlockId]) -> None:
        """Deactivate tracking for ``block_id`` (e.g. when it is freed)."""
        assert block_id is not None
        self._block_tracker[block_id].disable()
595
+
596
+ def block_is_computed(self, block_id: int) -> bool:
597
+ if self._block_tracker[block_id].active:
598
+ return self._block_tracker[block_id].computed
599
+ else:
600
+ return block_id in self.evictor
601
+
602
    def get_common_computed_block_ids(
            self, computed_seq_block_ids: List[List[int]]) -> List[int]:
        """Return the block ids that are common for a given sequence group.

        Only those blocks that are immutable and already marked
        computed are taken into consideration.
        """

        # NOTE We exclude the last block to avoid the case where the entire
        # prompt is cached. This would cause erroneous behavior in model
        # runner.

        # It returns a list of int although type annotation says list of string.
        if len(computed_seq_block_ids) == 1:
            return computed_seq_block_ids[0]

        return commonprefix([
            ids for ids in computed_seq_block_ids  # type: ignore
            if ids
        ])
622
+
623
    def get_num_full_blocks_touched(self, blocks: List[Block]) -> int:
        """Returns the number of full blocks that will be touched by
        swapping in/out.

        Args:
            blocks: List of blocks to be swapped.
        Returns:
            int: the number of full blocks that will be touched by
                swapping in/out the given blocks. Non full blocks are ignored
                when deciding the number of blocks to touch.
        """
        num_touched_blocks: int = 0
        for block in blocks:
            # If the block has a match in the cache and the cached
            # block is not referenced, then we still count it as a
            # touched block: restoring it from the evictor requires a
            # real allocation.
            if block.is_full and (not self.is_block_cached(block) or \
                (block.content_hash is not None and \
                 self._cached_blocks[block.content_hash] in \
                 self.evictor)):
                num_touched_blocks += 1
        return num_touched_blocks
645
+
646
+ def swap_out(self, blocks: List[Block]) -> None:
647
+ """Execute the swap out actions. Basically just free the
648
+ given blocks.
649
+
650
+ Args:
651
+ blocks: List of blocks to be swapped out.
652
+ """
653
+ for block in blocks:
654
+ self._free_block_id(block)
655
+
656
    def swap_in(self, blocks: List[Block]) -> None:
        """Execute the swap in actions. Change the block id from
        old allocator to current allocator for each block to finish
        the block table update.

        Args:
            blocks: List of blocks to be swapped in.
        """
        for block in blocks:
            # Here we allocate either immutable or mutable block and then
            # extract its block_id. Note that the block object is released
            # and the block_id is assigned to "block" to allow reusing the
            # existing "block" object
            if block.is_full:
                # Full blocks can be allocated immutably (hash known).
                tmp_block = self.allocate_immutable_block(
                    prev_block=block.prev_block,
                    token_ids=block.token_ids,
                    extra_hash=block.extra_hash)
            else:
                # Partial blocks stay mutable; replay the tokens into them.
                tmp_block = self.allocate_mutable_block(
                    prev_block=block.prev_block, extra_hash=block.extra_hash)
                tmp_block.append_token_ids(block.token_ids)

            block_id = tmp_block.block_id
            self._block_pool.free_block(tmp_block)

            block.block_id = block_id  # Assign block_id
684
    def find_cached_blocks_prefix(self, block_hashes: List[int]) -> List[int]:
        """
        Given a list of block hashes, return the prefix of the block hashes that
        are all cached.

        Since a block's block hash includes the hashes of all previous blocks,
        and we only allocate/deallocate blocks in the entire sequence, so if a
        block is cached, then all previous blocks are also cached. With this
        property, we can use binary search to find the prefix of cached blocks.

        Args:
            block_hashes (List[int]): The list of block hashes.

        Returns:
            List[int]: The prefix of the `block_hashes` that are cached.
        """

        def _block_is_cached(block_hash: PrefixHash) -> bool:
            if block_hash not in self._cached_blocks:
                return False

            cached_block_id = self._cached_blocks[block_hash]
            # We only consider the blocks that are marked as computed.
            return self.block_is_computed(cached_block_id)

        def _bisect_left(a, x, key: Callable[[PrefixHash], bool]) -> int:

            # python < 3.10 doesn't have the key argument for bisect_left,
            # so fall back to materializing the mapped list.
            if sys.version_info < (3, 10):
                a = [key(e) for e in a]
                return bisect_left(a, x)
            else:
                return bisect_left(a, x, key=key)

        # Look for the first block that's not cached, and returns the prefix
        # i.e. blocks that are cached.
        idx = _bisect_left(block_hashes,
                           True,
                           key=lambda x: not _block_is_cached(x))
        return block_hashes[:idx]
724
+
725
+
726
class PrefixCachingBlock(Block):
    """A block implementation that supports prefix caching.

    The PrefixCachingBlock class represents a block of token IDs with prefix
    caching capabilities. It wraps a NaiveBlock internally and provides
    additional functionality for content hashing and promoting immutable blocks
    with the prefix caching allocator.

    Args:
        prev_block (Optional[PrefixCachingBlock]): The previous block in the
            sequence.
        token_ids (List[int]): The initial token IDs to be stored in the block.
        block_size (int): The maximum number of token IDs that can be stored in
            the block.
        allocator (BlockAllocator): The prefix
            caching block allocator associated with this block.
        block_id (Optional[int], optional): The physical block index
            of this block. Defaults to None.
        extra_hash (Optional[int]): The hash value of additional factors
            such as adapters that influence the block, apart from the token_ids.
    """

    # Note that we use 'None' as a string here instead of None because
    # as of Python 3.12, hash(None) returns a constant predictable value.
    # This could possibly make it easier to find and exploit hash
    # collisions. 'None' as a string will be hashed differently per process,
    # but consistently within the same process. This is the same as the
    # behavior of None prior to Python 3.12.
    _none_hash: int = hash('None')

    def __init__(
        self,
        prev_block: Optional[Block],
        token_ids: List[int],
        block_size: int,
        allocator: BlockAllocator,
        block_id: Optional[int] = None,
        computed: bool = False,
        extra_hash: Optional[int] = None,
    ):
        assert isinstance(allocator, PrefixCachingBlockAllocator), (
            "Currently this class is only tested with "
            "PrefixCachingBlockAllocator. Got instead allocator = {}".format(
                allocator))
        assert_prefix_caching_block_or_none(prev_block)

        self._prev_block = prev_block
        self._cached_content_hash: Optional[int] = None
        self._cached_num_tokens_total: int = 0
        self._allocator = allocator
        self._last_accessed: float = _DEFAULT_LAST_ACCESSED_TIME
        self._computed = computed
        self._extra_hash = extra_hash

        # On the first time, we create the block object, and next we only
        # reinitialize it (instances are pooled and __init__ is re-invoked
        # on reuse).
        if hasattr(self, "_block"):
            self._block.__init__(  # type: ignore[has-type]
                prev_block=prev_block,
                token_ids=token_ids,
                block_size=block_size,
                block_id=block_id,
                allocator=self._allocator)
        else:
            self._block = NaiveBlock(prev_block=prev_block,
                                     token_ids=token_ids,
                                     block_size=block_size,
                                     block_id=block_id,
                                     allocator=self._allocator)

        self._update_num_tokens_total()

    def _update_num_tokens_total(self):
        """Incrementally computes the number of tokens that there is
        till the current block (included)
        """
        res = 0

        # Add all previous blocks
        if self._prev_block is not None:
            res += self._prev_block.num_tokens_total

        # Add current block
        res += len(self.token_ids)

        self._cached_num_tokens_total = res

    @property
    def computed(self) -> bool:
        return self._computed

    @computed.setter
    def computed(self, value) -> None:
        self._computed = value

    @property
    def last_accessed(self) -> float:
        return self._last_accessed

    @last_accessed.setter
    def last_accessed(self, last_accessed_ts: float):
        self._last_accessed = last_accessed_ts

    def append_token_ids(self, token_ids: List[int]) -> None:
        """Appends the given token IDs to the block and registers the block as
        immutable if the block becomes full.

        Args:
            token_ids (List[int]): The token IDs to be appended to the block.
        """
        # Ensure this is mutable block (not promoted)
        assert self.content_hash is None
        assert not self.computed

        if len(token_ids) == 0:
            return

        # Ensure there are input tokens
        assert token_ids, "Got token_ids = {}".format(token_ids)

        # Naive block handles CoW.
        self._block.append_token_ids(token_ids)
        self._update_num_tokens_total()

        # If the content hash is present, then the block can be made immutable.
        # Register ourselves with the allocator, potentially replacing the
        # physical block index.
        if self.content_hash is not None:
            self.block_id = self._allocator.promote_to_immutable_block(self)

    @property
    def block_id(self) -> Optional[int]:
        return self._block.block_id

    @block_id.setter
    def block_id(self, value) -> None:
        self._block.block_id = value

    @property
    def is_full(self) -> bool:
        return self._block.is_full

    @property
    def num_empty_slots(self) -> int:
        return self._block.num_empty_slots

    @property
    def num_tokens_total(self) -> int:
        return self._cached_num_tokens_total

    @property
    def block_size(self) -> int:
        return self._block.block_size

    @property
    def token_ids(self) -> List[int]:
        return self._block.token_ids

    @property
    def prev_block(self) -> Optional[Block]:
        return self._prev_block

    @property
    def extra_hash(self) -> Optional[int]:
        return self._extra_hash

    @property
    def content_hash(self) -> Optional[int]:
        """Return the content-based hash of the current block, or None if it is
        not yet defined.

        For the content-based hash to be defined, the current block must be
        full.
        """
        # If the hash is already computed, return it.
        if self._cached_content_hash is not None:
            return self._cached_content_hash

        # We cannot compute a hash for the current block because it is not full.
        if not self.is_full:
            return None

        is_first_block = self._prev_block is None
        prev_block_hash = (
            self._none_hash if is_first_block else
            self._prev_block.content_hash  # type: ignore
        )

        # Previous block exists but does not yet have a hash.
        # Return no hash in this case.
        if prev_block_hash == self._none_hash and not is_first_block:
            return None

        self._cached_content_hash = PrefixCachingBlock.hash_block_tokens(
            is_first_block,
            prev_block_hash,
            cur_block_token_ids=self.token_ids,
            extra_hash=self._extra_hash)
        return self._cached_content_hash

    @classmethod
    def hash_block_tokens(cls,
                          is_first_block: bool,
                          prev_block_hash: Optional[int],
                          cur_block_token_ids: List[int],
                          extra_hash: Optional[int] = None) -> int:
        """Computes a hash value corresponding to the contents of a block and
        the contents of the preceding block(s). The hash value is used for
        prefix caching.

        Parameters:
        - is_first_block (bool): A flag indicating if the block is the first in
            the sequence.
        - prev_block_hash (Optional[int]): The hash of the previous block. None
            if this is the first block.
        - cur_block_token_ids (List[int]): A list of token ids in the current
            block. The current block is assumed to be full.
        - extra_hash (Optional[int]): The hash value of additional factors
            such as adapters that influence the block, apart from the token_ids.

        Returns:
        - int: The computed hash value for the block.
        """
        if is_first_block and prev_block_hash is None:
            prev_block_hash = cls._none_hash
        return hash((is_first_block, prev_block_hash, *cur_block_token_ids,
                     extra_hash))
953
+
954
+
955
class ComputedBlocksTracker:
    """
    Tracks the computed blocks for each sequence.

    Internally, it maintains a map from sequence id to the list of block hashes
    for the sequence. We cache the hashes of the full blocks for each sequence,
    and make sure the hash is calculated in the same way as the allocator.
    When a sequence is being decoded, we also update the sequence's hash
    accordingly and incrementally.

    From the sequence hash, with prefix caching enabled, we could also calculate
    the number of cached tokens for the sequence by looking up the number of
    cached block hashes in the allocator.
    """

    # Note that we use 'None' as a string here instead of None because
    # as of Python 3.12, hash(None) returns a constant predictable value.
    # This could possibly make it easier to find and exploit hash
    # collisions. 'None' as a string will be hashed differently per process,
    # but consistently within the same process. This is the same as the
    # behavior of None prior to Python 3.12.
    _none_hash: int = hash('None')

    def __init__(
        self,
        allocator: DeviceAwareBlockAllocator,
        block_size: int,
        enable_caching: bool,
    ):
        self._allocator = allocator
        self._block_size = block_size
        self._enable_caching = enable_caching

        # A map from seq_id to the list of block hashes for the
        # sequence. This is so that we don't have to recompute the block hashes
        # for the sequence when we need to check if the sequence is cached.
        # Note a block that's not full will not have its hash calculated and
        # recorded.
        self._seq_id_to_blocks_hashes: Dict[int, List[int]] = {}

        # A map from seq_id to the number of tokens that are cached for the
        # sequence.
        # We need this so that a sequence in continuous prefill doesn't
        # accidentally see its cached token count change. See comments in
        # `get_num_cached_tokens` for more details.
        self._seq_id_to_num_tokens_computed: Dict[int, int] = {}

    def _update_seq_hashes(self, seq: Sequence) -> None:
        """Incrementally update the sequence's block hashes and record them."""
        assert self._enable_caching

        block_hashes_recorded = self._seq_id_to_blocks_hashes.get(
            seq.seq_id, [])
        cur_num_blocks_recorded = len(block_hashes_recorded)
        token_ids = seq.get_token_ids()
        assert len(token_ids) >= cur_num_blocks_recorded * self._block_size, (
            f"The sequence has {len(token_ids)} tokens, but"
            f" already recorded {cur_num_blocks_recorded} blocks. "
            "This should not happen since we assume blocks are "
            "only appended other than recomputation. When the sequence is "
            "recomputed, we should have removed the info of the old blocks.")
        # Update the computed block hashes for the sequence. Since only full
        # blocks are considered as "computed", we take floor here.
        num_computed_blocks = len(token_ids) // self._block_size

        # We need to know the hash of the previous block to compute the hash of
        # the current block so that blocks could be uniquely identified across
        # sequences of prefixes.
        prev_block_hash = (self._none_hash if cur_num_blocks_recorded == 0 else
                           block_hashes_recorded[-1])
        # Only update the computed block hashes for the new blocks
        for i in range(cur_num_blocks_recorded, num_computed_blocks):
            assert len(token_ids) >= (i + 1) * self._block_size
            block_token_ids = token_ids[i * self._block_size:(i + 1) *
                                        self._block_size]

            # NOTE: If there are any factors affecting the block besides
            # token_ids, they should be added as input to extra_hash.
            extra_hash = seq.extra_hash()

            # This has to be kept in sync with the allocator's hash
            # calculation.
            block_hash = PrefixCachingBlock.hash_block_tokens(
                is_first_block=prev_block_hash == self._none_hash,
                prev_block_hash=prev_block_hash,
                cur_block_token_ids=block_token_ids,
                extra_hash=extra_hash,
            )
            block_hashes_recorded.append(block_hash)
            prev_block_hash = block_hash

        self._seq_id_to_blocks_hashes[seq.seq_id] = block_hashes_recorded

    def get_num_cached_tokens(self, seq: Sequence) -> int:
        """Return the number of tokens of ``seq`` that hit the prefix cache.

        Returns 0 when caching is disabled. For sequences still in prefill,
        the previously computed count is returned unchanged (see note below).
        """
        if not self._enable_caching:
            return 0

        # We always try to update the sequence hashes on the fly.
        # This is to ensure that we don't miss any cached tokens for the
        # sequence during decode.
        # This routine should only update hash for any new blocks too.
        self._update_seq_hashes(seq)

        num_computed_tokens_prev = self._seq_id_to_num_tokens_computed.get(
            seq.seq_id, None)

        # TODO(rickyx): This hack could be removed once we mark blocks as
        # computed correctly with chunked prefills.
        if num_computed_tokens_prev is not None and seq.is_prefill():
            # For a sequence that is still in prefill, we don't
            # recompute the number of cached tokens.
            # This also handles correctly chunked prefill since currently
            # we mark blocks as computed even if the sequence is still partially
            # prefilled. So a continuously prefilled sequence should not
            # see its cached token count change while running.
            return num_computed_tokens_prev

        block_hashes = self._seq_id_to_blocks_hashes[seq.seq_id]

        # This is O(logN), where N is the number of blocks.
        num_cached_blocks = len(
            self._allocator.find_cached_blocks_prefix(block_hashes))
        num_cached_tokens = num_cached_blocks * self._block_size
        self._seq_id_to_num_tokens_computed[seq.seq_id] = num_cached_tokens
        return num_cached_tokens

    def remove_seq(self, seq_id: int) -> None:
        """Stop tracking the sequence."""
        if not self._enable_caching:
            return
        assert seq_id in self._seq_id_to_blocks_hashes
        del self._seq_id_to_blocks_hashes[seq_id]

        assert seq_id in self._seq_id_to_num_tokens_computed
        del self._seq_id_to_num_tokens_computed[seq_id]
1090
+
1091
+
1092
class LastAccessBlocksTracker:
    """Tracks per-sequence last-access timestamps so that the allocator's
    per-block last-access times can be refreshed in one batched call.
    """

    def __init__(self, allocator):
        self._allocator = allocator
        # seq_id -> last access timestamp (None until one is recorded).
        self._seq_last_access: Dict[int, Optional[float]] = {}

    def add_seq(self, seq_id: int) -> None:
        """Begin tracking the given sequence."""
        assert seq_id not in self._seq_last_access
        self._seq_last_access[seq_id] = None

    def remove_seq(self, seq_id: int) -> None:
        """Stop tracking the given sequence."""
        assert seq_id in self._seq_last_access
        del self._seq_last_access[seq_id]

    def update_last_access(self, seq_id: int, time: float) -> None:
        """Record ``time`` as the sequence's most recent access."""
        assert seq_id in self._seq_last_access
        self._seq_last_access[seq_id] = time

    def update_seq_blocks_last_access(self, seq_id: int,
                                      block_ids: List[int]) -> None:
        """Propagate the sequence's recorded timestamp to its blocks."""
        assert seq_id in self._seq_last_access

        recorded_ts = self._seq_last_access[seq_id]
        if recorded_ts is None:
            # No last access was recorded; nothing to propagate.
            return

        self._allocator.mark_blocks_as_accessed(block_ids, recorded_ts)
1128
+
1129
+
1130
def assert_prefix_caching_block_or_none(block: Optional[Block]):
    """Assert that ``block`` is either None or a PrefixCachingBlock."""
    if block is not None:
        assert isinstance(
            block, PrefixCachingBlock), "Got block = {}".format(block)
.venv/lib/python3.11/site-packages/vllm/core/block/utils.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+ """Block manager utils."""
3
+ from vllm.sequence import SequenceGroup
4
+ from vllm.utils import (STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE,
5
+ STR_NOT_IMPL_ENC_DEC_SWA)
6
+
7
+
8
def check_no_caching_or_swa_for_blockmgr_encdec(
        block_mgr, seq_group: SequenceGroup) -> None:
    '''
    Enforce that prefix caching & sliding-window attention (SWA)
    are currently unsupported *specifically* for encoder/decoder models.

    Raises NotImplementedError if unsupported scenario is detected.

    Arguments:

    * block_mgr: BlockSpaceManager instance
    * seq_group: SequenceGroup passed to block_mgr
    '''
    # Decoder-only models are unaffected by these restrictions.
    if not seq_group.is_encoder_decoder():
        return

    if block_mgr.max_block_sliding_window is not None:
        raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_SWA)

    if block_mgr.enable_caching:
        raise NotImplementedError(STR_NOT_IMPL_ENC_DEC_PREFIX_CACHE)
.venv/lib/python3.11/site-packages/vllm/core/interfaces.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+
3
+ import enum
4
+ from abc import ABC, abstractmethod
5
+ from typing import List
6
+ from typing import Sequence as GenericSequence
7
+ from typing import Tuple
8
+
9
+ from vllm.sequence import Sequence, SequenceGroup
10
+ from vllm.utils import Device
11
+
12
+
13
class AllocStatus(enum.Enum):
    """Result for BlockSpaceManager.can_allocate

    1. Ok: seq_group can be allocated now.
    2. Later: seq_group cannot be allocated.
       The capacity of allocator is larger than seq_group required.
    3. Never: seq_group can never be allocated.
       The seq_group is too large to be allocated in GPU.
    """
    OK = enum.auto()
    LATER = enum.auto()
    NEVER = enum.auto()
+
26
+
27
+ class BlockSpaceManager(ABC):
28
+
29
+ @staticmethod
30
+ def get_block_space_manager_class(version: str):
31
+ version = version.lower()
32
+
33
+ if version == "selfattn":
34
+ from vllm.core.block_manager import SelfAttnBlockSpaceManager
35
+ return SelfAttnBlockSpaceManager
36
+
37
+ if version == "placeholder":
38
+ from vllm.core.placeholder_block_space_manager import (
39
+ PlaceholderBlockSpaceManager)
40
+ return PlaceholderBlockSpaceManager
41
+
42
+ raise ValueError(f"Unknown version {version=}")
43
+
44
+ @abstractmethod
45
+ def can_allocate(self,
46
+ seq_group: SequenceGroup,
47
+ num_lookahead_slots: int = 0) -> AllocStatus:
48
+ pass
49
+
50
+ @abstractmethod
51
+ def allocate(self, seq_group: SequenceGroup) -> None:
52
+ pass
53
+
54
+ @abstractmethod
55
+ def can_append_slots(self, seq_group: SequenceGroup,
56
+ num_lookahead_slots: int) -> bool:
57
+ pass
58
+
59
+ @abstractmethod
60
+ def append_slots(
61
+ self,
62
+ seq: Sequence,
63
+ num_lookahead_slots: int,
64
+ ) -> List[Tuple[int, int]]:
65
+ pass
66
+
67
+ @abstractmethod
68
+ def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None:
69
+ pass
70
+
71
+ @abstractmethod
72
+ def can_swap_in(self, seq_group: SequenceGroup,
73
+ num_lookahead_slots: int) -> AllocStatus:
74
+ pass
75
+
76
+ @abstractmethod
77
+ def swap_in(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]:
78
+ pass
79
+
80
+ @abstractmethod
81
+ def can_swap_out(self, seq_group: SequenceGroup) -> bool:
82
+ pass
83
+
84
+ @abstractmethod
85
+ def swap_out(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]:
86
+ pass
87
+
88
+ @abstractmethod
89
+ def free(self, seq: Sequence) -> None:
90
+ pass
91
+
92
+ @abstractmethod
93
+ def get_block_table(self, seq: Sequence) -> List[int]:
94
+ pass
95
+
96
+ @abstractmethod
97
+ def get_num_free_gpu_blocks(self) -> int:
98
+ pass
99
+
100
+ @abstractmethod
101
+ def get_num_free_cpu_blocks(self) -> int:
102
+ pass
103
+
104
+ @abstractmethod
105
+ def access_all_blocks_in_seq(
106
+ self,
107
+ seq: Sequence,
108
+ access_time: float,
109
+ ) -> None:
110
+ pass
111
+
112
+ @abstractmethod
113
+ def get_common_computed_block_ids(
114
+ self, seqs: List[Sequence]) -> GenericSequence[int]:
115
+ pass
116
+
117
+ @abstractmethod
118
+ def mark_blocks_as_computed(self, seq_group: SequenceGroup,
119
+ token_chunk_size: int):
120
+ pass
121
+
122
+ @abstractmethod
123
+ def get_prefix_cache_hit_rate(self, device: Device) -> float:
124
+ """Prefix cache hit rate. -1 means not supported or disabled."""
125
+ pass
126
+
127
+ @abstractmethod
128
+ def reset_prefix_cache(self) -> bool:
129
+ """Reset prefix cache for all devices."""
130
+ pass
131
+
132
+ @abstractmethod
133
+ def get_num_cached_tokens(self, seq: Sequence) -> int:
134
+ pass
.venv/lib/python3.11/site-packages/vllm/core/placeholder_block_space_manager.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+
3
+ from typing import List, Tuple
4
+
5
+ from vllm.core.interfaces import AllocStatus, BlockSpaceManager
6
+ from vllm.sequence import Sequence, SequenceGroup
7
+ from vllm.utils import Device
8
+
9
+
10
class PlaceholderBlockSpaceManager(BlockSpaceManager):
    """A no-op BlockSpaceManager for runs that need no block accounting.

    For example: pooling models or attention-free models like Mamba.

    Every method either does nothing or hands back a trivial answer
    (``True``, ``AllocStatus.OK``, an empty list, ...), so the scheduler
    can keep a single code path while paying zero block-management cost,
    such as in an embedding environment.
    """

    def __init__(
        self,
        **kwargs,
    ) -> None:
        # Nothing to initialize: there is no block pool to track.
        pass

    def can_allocate(self,
                     seq_group: SequenceGroup,
                     num_lookahead_slots: int = 0) -> AllocStatus:
        # Allocation can never fail when nothing is actually allocated.
        return AllocStatus.OK

    def allocate(self, seq_group: SequenceGroup) -> None:
        # No allocation work to perform.
        pass

    def can_append_slots(self, seq_group: SequenceGroup,
                         num_lookahead_slots: int) -> bool:
        return True

    def append_slots(
        self,
        seq: Sequence,
        num_lookahead_slots: int,
    ) -> List[Tuple[int, int]]:
        # No copy-on-write block copies are ever required.
        return []

    def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None:
        pass

    def can_swap_in(self, seq_group: SequenceGroup,
                    num_lookahead_slots: int) -> AllocStatus:
        return AllocStatus.OK

    def swap_in(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]:
        # NOTE(review): returns None despite the List annotation — callers
        # are presumed never to inspect this for the placeholder manager.
        return None  # type: ignore

    def can_swap_out(self, seq_group: SequenceGroup) -> bool:
        return True

    def swap_out(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]:
        # Same deliberate None-return as swap_in.
        return None  # type: ignore

    def free(self, seq: Sequence) -> None:
        # Nothing was allocated, so nothing needs freeing.
        return

    def get_block_table(self, seq: Sequence) -> List[int]:
        # No block table exists for placeholder-managed sequences.
        return None  # type: ignore

    def get_num_free_gpu_blocks(self) -> int:
        return 1

    def get_num_free_cpu_blocks(self) -> int:
        return 1

    def access_all_blocks_in_seq(
        self,
        seq: Sequence,
        access_time: float,
    ) -> None:
        pass

    def get_common_computed_block_ids(self,
                                      seq_group: List[Sequence]) -> List[int]:
        # No prefix caching: no computed blocks are ever shared.
        return []

    def mark_blocks_as_computed(self, seq_group: SequenceGroup,
                                token_chunk_size: int):
        pass

    def get_prefix_cache_hit_rate(self, device: Device) -> float:
        # -1 signals "not supported or disabled".
        return -1

    def reset_prefix_cache(self) -> bool:
        # Trivially succeeds: there is no cache to reset.
        return True

    def get_num_cached_tokens(self, seq: Sequence) -> int:
        return 0
.venv/lib/python3.11/site-packages/vllm/core/scheduler.py ADDED
@@ -0,0 +1,1840 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-License-Identifier: Apache-2.0
2
+
3
+ import enum
4
+ import os
5
+ import random
6
+ import time
7
+ from collections import deque
8
+ from dataclasses import dataclass, field
9
+ from typing import Callable, Deque, Dict, Iterable, List, Optional
10
+ from typing import Sequence as GenericSequence
11
+ from typing import Set, Tuple, Union
12
+
13
+ from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig
14
+ from vllm.core.interfaces import AllocStatus, BlockSpaceManager
15
+ from vllm.logger import init_logger
16
+ from vllm.lora.request import LoRARequest
17
+ from vllm.prompt_adapter.request import PromptAdapterRequest
18
+ from vllm.sequence import (Sequence, SequenceData, SequenceGroup,
19
+ SequenceGroupMetadata, SequenceGroupMetadataDelta,
20
+ SequenceStatus)
21
+ from vllm.utils import Device, PyObjectCache
22
+
23
+ logger = init_logger(__name__)
24
+
25
+ # Test-only. If configured, decode is preempted with
26
+ # ARTIFICIAL_PREEMPTION_PROB% probability.
27
+ ENABLE_ARTIFICIAL_PREEMPT = bool(
28
+ os.getenv("VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT", False)) # noqa
29
+ ARTIFICIAL_PREEMPTION_PROB = 0.5
30
+ ARTIFICIAL_PREEMPTION_MAX_CNT = 500
31
+
32
+
33
class PreemptionMode(enum.Enum):
    """How a preempted sequence group's state is preserved.

    1. Swapping: Swap out the blocks of the preempted sequences to CPU memory
    and swap them back in when the sequences are resumed.
    2. Recomputation: Discard the blocks of the preempted sequences and
    recompute them when the sequences are resumed, treating the sequences as
    new prompts.
    """

    SWAP = enum.auto()
    RECOMPUTE = enum.auto()
44
+
45
+
46
@dataclass
class SchedulingBudget:
    """The available token and sequence slots for one scheduling step.

    TODO(sang): Right now, the budget is request_id-aware meaning it can ignore
    budget update from the same request_id. It is because in normal scheduling
    path, we update RUNNING num_seqs ahead of time, meaning it could be
    updated more than once when scheduling RUNNING requests. Since this won't
    happen if we only have chunked prefill scheduling, we can remove this
    feature from the API when chunked prefill is enabled by default.
    """
    token_budget: int
    max_num_seqs: int
    _request_ids_num_batched_tokens: Set[str] = field(default_factory=set)
    _request_ids_num_curr_seqs: Set[str] = field(default_factory=set)
    # Number of cached tokens in the batch.
    _num_cached_tokens: int = 0
    # Number of actual non-cached tokens in the batch.
    _num_batched_tokens: int = 0
    _num_curr_seqs: int = 0

    def can_schedule(self, *, num_new_tokens: int, num_new_seqs: int):
        """Whether both the token and the sequence budget can absorb the
        proposed additions."""
        # num_new_tokens may legitimately be 0 when the entire sequence
        # has been served from the cache.
        assert num_new_tokens >= 0
        assert num_new_seqs != 0
        fits_tokens = (self.num_batched_tokens + num_new_tokens
                       <= self.token_budget)
        fits_seqs = self.num_curr_seqs + num_new_seqs <= self.max_num_seqs
        return fits_tokens and fits_seqs

    def remaining_token_budget(self):
        """Tokens still available in this step."""
        return self.token_budget - self.num_batched_tokens

    def add_num_batched_tokens(self,
                               req_id: str,
                               num_batched_tokens: int,
                               num_cached_tokens: int = 0):
        """Charge a request's tokens once; repeated calls for the same
        request id are no-ops (see class TODO)."""
        if req_id in self._request_ids_num_batched_tokens:
            return
        assert num_cached_tokens >= 0
        assert num_batched_tokens >= 0

        self._request_ids_num_batched_tokens.add(req_id)
        self._num_batched_tokens += num_batched_tokens
        self._num_cached_tokens += num_cached_tokens

    def subtract_num_batched_tokens(self, req_id: str,
                                    num_batched_tokens: int):
        """Refund a request's token charge; no-op if it was never charged."""
        if req_id in self._request_ids_num_batched_tokens:
            self._request_ids_num_batched_tokens.discard(req_id)
            self._num_batched_tokens -= num_batched_tokens

    def add_num_seqs(self, req_id: str, num_curr_seqs: int):
        """Charge a request's sequence count once per request id."""
        if req_id in self._request_ids_num_curr_seqs:
            return
        self._request_ids_num_curr_seqs.add(req_id)
        self._num_curr_seqs += num_curr_seqs

    def subtract_num_seqs(self, req_id: str, num_curr_seqs: int):
        """Refund a request's sequence charge; no-op if never charged."""
        if req_id in self._request_ids_num_curr_seqs:
            self._request_ids_num_curr_seqs.discard(req_id)
            self._num_curr_seqs -= num_curr_seqs

    @property
    def num_batched_tokens(self):
        return self._num_batched_tokens

    @property
    def num_curr_seqs(self):
        return self._num_curr_seqs

    @property
    def num_cached_tokens(self):
        return self._num_cached_tokens
120
+
121
+
122
@dataclass
class ScheduledSequenceGroup:
    """A sequence group paired with the number of tokens to process for it
    in the next iteration."""
    # A sequence group that's scheduled.
    seq_group: SequenceGroup
    # The total chunk size (number of tokens) to process for next iteration.
    # 1 for decoding. Same as prompt tokens for prefill, but if prefill is
    # chunked, it can be smaller than that.
    token_chunk_size: int
130
+
131
+
132
@dataclass
class SchedulerOutputs:
    """The scheduling decision made from a scheduler."""
    # Scheduled sequence groups.
    scheduled_seq_groups: GenericSequence[ScheduledSequenceGroup]
    # Number of prefill groups scheduled.
    num_prefill_groups: int
    # Total number of batched tokens.
    num_batched_tokens: int
    # Blocks to swap in. List of CPU -> GPU block number.
    blocks_to_swap_in: List[Tuple[int, int]]
    # Blocks to swap out. List of GPU -> CPU block number.
    blocks_to_swap_out: List[Tuple[int, int]]
    # Blocks to copy. Source to dest block.
    blocks_to_copy: List[Tuple[int, int]]
    # Sequence groups that are going to be ignored.
    ignored_seq_groups: List[SequenceGroup]
    # The number of slots for lookahead decoding.
    num_lookahead_slots: int
    # The number of requests in the running queue
    running_queue_size: int
    # Count of sequence groups preempted during this scheduling step.
    preempted: int

    def __post_init__(self):
        # Swap in and swap out should never happen at the same time.
        assert not (self.blocks_to_swap_in and self.blocks_to_swap_out)

        # num_loras is derived from the lora_requests property below,
        # which scans scheduled_seq_groups.
        self.num_loras: int = len(self.lora_requests)
        if self.num_loras > 0:
            self._sort_by_lora_ids()

        self.num_prompt_adapters: int = len(self.prompt_adapter_requests)

    def is_empty(self) -> bool:
        # NOTE: We do not consider the ignored sequence groups.
        return (not self.scheduled_seq_groups and not self.blocks_to_swap_in
                and not self.blocks_to_swap_out and not self.blocks_to_copy)

    def _sort_by_lora_ids(self):
        # Reorders scheduled_seq_groups in place by (lora_int_id,
        # request_id), keeping prefills ahead of decodes when the batch
        # mixes both (required by chunked prefill).
        assert 0 <= self.num_prefill_groups <= len(self.scheduled_seq_groups)

        def key_fn(group: ScheduledSequenceGroup):
            key = (group.seq_group.lora_int_id, group.seq_group.request_id)
            if 0 < self.num_prefill_groups < len(self.scheduled_seq_groups):
                # Sort sequence groups so that all prefills come before all
                # decodes as required by chunked prefill.
                # False < True, so prefill groups (not is_prefill() ==
                # False) sort first.
                return (not group.seq_group.is_prefill(), *key)
            return key

        self.scheduled_seq_groups = sorted(self.scheduled_seq_groups,
                                           key=key_fn)

    @property
    def lora_requests(self) -> Set[LoRARequest]:
        # Distinct LoRA requests across all scheduled groups.
        return {
            g.seq_group.lora_request
            for g in self.scheduled_seq_groups
            if g.seq_group.lora_request is not None
        }

    @property
    def prompt_adapter_requests(self) -> Set[PromptAdapterRequest]:
        # Distinct prompt-adapter requests across all scheduled groups.
        return {
            g.seq_group.prompt_adapter_request
            for g in self.scheduled_seq_groups
            if g.seq_group.prompt_adapter_request is not None
        }
199
+
200
+
201
@dataclass
class SchedulerRunningOutputs:
    """Result of scheduling the running queue.

    Could contain prefill (prefill that's chunked) or decodes. If there's not
    enough memory, it can be preempted (for recompute) or swapped out.
    """
    # Running groups scheduled in the decoding phase.
    decode_seq_groups: List[ScheduledSequenceGroup]
    # Running groups still in a prefill phase (i.e. the prefill was chunked).
    prefill_seq_groups: List[ScheduledSequenceGroup]
    # Groups preempted for recomputation.
    preempted: List[SequenceGroup]
    # Groups swapped out to CPU memory.
    swapped_out: List[SequenceGroup]
    # GPU -> CPU block moves to perform before execution.
    blocks_to_swap_out: List[Tuple[int, int]]
    # Source -> destination block copies.
    blocks_to_copy: List[Tuple[int, int]]
    # The number of slots for lookahead decoding.
    num_lookahead_slots: int

    # Optimization for fast-access to seq_group lists
    decode_seq_groups_list: List[SequenceGroup]
    prefill_seq_groups_list: List[SequenceGroup]

    @classmethod
    def create_empty(cls) -> "SchedulerRunningOutputs":
        """Build an output object with every collection empty."""
        return cls(
            decode_seq_groups=[],
            prefill_seq_groups=[],
            preempted=[],
            swapped_out=[],
            blocks_to_swap_out=[],
            blocks_to_copy=[],
            num_lookahead_slots=0,
            decode_seq_groups_list=[],
            prefill_seq_groups_list=[],
        )
241
+
242
+
243
@dataclass
class SchedulerSwappedInOutputs:
    """Result of scheduling the swapped queue.

    Could contain prefill (prefill that's chunked) or decodes.
    """
    # Groups to swap back in that are in the decoding phase.
    decode_seq_groups: List[ScheduledSequenceGroup]
    # Groups to swap back in that are still in a (chunked) prefill phase.
    prefill_seq_groups: List[ScheduledSequenceGroup]
    # CPU -> GPU block moves to perform before execution.
    blocks_to_swap_in: List[Tuple[int, int]]
    # Source -> destination block copies.
    blocks_to_copy: List[Tuple[int, int]]
    # The number of slots for lookahead decoding.
    num_lookahead_slots: int
    # Groups that can never fit and are therefore aborted.
    infeasible_seq_groups: List[SequenceGroup]

    @classmethod
    def create_empty(cls) -> "SchedulerSwappedInOutputs":
        """Build an output object with every collection empty."""
        return cls(
            decode_seq_groups=[],
            prefill_seq_groups=[],
            blocks_to_swap_in=[],
            blocks_to_copy=[],
            num_lookahead_slots=0,
            infeasible_seq_groups=[],
        )
274
+
275
+
276
@dataclass
class SchedulerPrefillOutputs:
    """Result of scheduling the waiting queue.

    Could contain a fresh prefill requests or preempted requests that need
    to be recomputed from scratch.
    """
    # Groups selected for prefill this step.
    seq_groups: List[ScheduledSequenceGroup]
    # Groups that will be skipped (e.g. prompt too long).
    ignored_seq_groups: List[SequenceGroup]
    # The number of slots for lookahead decoding.
    num_lookahead_slots: int

    @classmethod
    def create_empty(cls) -> "SchedulerPrefillOutputs":
        """Build an output object with every collection empty."""
        return cls(
            seq_groups=[],
            ignored_seq_groups=[],
            num_lookahead_slots=0,
        )
296
+
297
+
298
def seq_group_metadata_builder():
    """Factory for blank SequenceGroupMetadata objects, used to pre-populate
    the scheduler's PyObjectCache pools."""
    return SequenceGroupMetadata(
        request_id="",
        is_prompt=False,
        seq_data={},
        sampling_params=None,
        block_tables={},
    )
304
+
305
+
306
def scheduler_running_outputs_builder():
    """Factory for empty SchedulerRunningOutputs objects, used to
    pre-populate the scheduler's PyObjectCache pools."""
    return SchedulerRunningOutputs(
        decode_seq_groups=[],
        prefill_seq_groups=[],
        preempted=[],
        swapped_out=[],
        blocks_to_swap_out=[],
        blocks_to_copy=[],
        num_lookahead_slots=0,
        prefill_seq_groups_list=[],
        decode_seq_groups_list=[],
    )
316
+
317
+
318
def scheduled_seq_group_builder():
    """Factory for placeholder ScheduledSequenceGroup objects for the
    PyObjectCache pools.

    SequenceGroup.__new__ skips __init__ on purpose: the cached object is a
    shell whose fields are overwritten before use by _schedule_running.
    """
    return ScheduledSequenceGroup(SequenceGroup.__new__(SequenceGroup),
                                  token_chunk_size=0)
322
+
323
+
324
+ class Scheduler:
325
+
326
    def __init__(
        self,
        scheduler_config: SchedulerConfig,
        cache_config: CacheConfig,
        lora_config: Optional[LoRAConfig],
        pipeline_parallel_size: int = 1,
        output_proc_callback: Optional[Callable] = None,
    ) -> None:
        """Initialize the scheduler and its block space manager.

        Args:
            scheduler_config: Scheduling policy knobs (chunking, preemption
                mode, max model length, runner type, ...).
            cache_config: KV-cache configuration; block counts here are
                divided by ``pipeline_parallel_size``.
            lora_config: LoRA configuration, or None when LoRA is disabled.
            pipeline_parallel_size: Number of pipeline stages sharing the
                cache blocks.
            output_proc_callback: When set, enables async output processing
                and double-buffered object caches.
        """
        self.scheduler_config = scheduler_config
        self.cache_config = cache_config
        # Note for LoRA scheduling: the current policy is extremely
        # simple and NOT fair. It can lead to starvation of some
        # LoRAs. This should be improved in the future.
        self.lora_config = lora_config

        # Pooling and attention-free models need no real block accounting,
        # so they get the placeholder manager.
        version = "selfattn"
        if (self.scheduler_config.runner_type == "pooling"
                or self.cache_config.is_attention_free):
            version = "placeholder"

        BlockSpaceManagerImpl = BlockSpaceManager.get_block_space_manager_class(
            version)

        # Each pipeline stage owns an equal share of the blocks.
        num_gpu_blocks = cache_config.num_gpu_blocks
        if num_gpu_blocks:
            num_gpu_blocks //= pipeline_parallel_size

        num_cpu_blocks = cache_config.num_cpu_blocks
        if num_cpu_blocks:
            num_cpu_blocks //= pipeline_parallel_size

        # Create the block space manager.
        self.block_manager = BlockSpaceManagerImpl(
            block_size=self.cache_config.block_size,
            num_gpu_blocks=num_gpu_blocks,
            num_cpu_blocks=num_cpu_blocks,
            sliding_window=self.cache_config.sliding_window,
            enable_caching=self.cache_config.enable_prefix_caching)

        # Sequence groups in the WAITING state.
        # Contain new prefill or preempted requests.
        self.waiting: Deque[SequenceGroup] = deque()
        # Sequence groups in the RUNNING state.
        # Contain decode requests.
        self.running: Deque[SequenceGroup] = deque()
        # Sequence groups in the SWAPPED state.
        # Contain decode requests that are swapped out.
        self.swapped: Deque[SequenceGroup] = deque()
        # Sequence groups finished requests ids since last step iteration.
        # It lets the model know that any state associated with these requests
        # can and must be released after the current step.
        # This is used to evict the finished requests from the Mamba cache.
        self._finished_requests_ids: List[str] = list()
        # Time at previous scheduling step
        self.prev_time = 0.0
        # Did we schedule a prompt at previous step?
        self.prev_prompt = False
        # Latency of the last prompt step
        self.last_prompt_latency = 0.0
        # preemption mode, RECOMPUTE or SWAP
        self.user_specified_preemption_mode = scheduler_config.preemption_mode

        # The following field is test-only. It is used to inject artificial
        # preemption.
        self.enable_artificial_preemption = ENABLE_ARTIFICIAL_PREEMPT
        self.artificial_preempt_cnt = (ARTIFICIAL_PREEMPTION_MAX_CNT
                                       if self.enable_artificial_preemption
                                       else 0)
        self.num_cumulative_preemption: int = 0

        # Used to cache python objects
        self._seq_group_metadata_cache: List[PyObjectCache] = []
        self._scheduler_running_outputs_cache: List[PyObjectCache] = []
        self._scheduled_seq_group_cache: List[PyObjectCache] = []

        # For async output processing, we need to swap cache buffers between
        # iterations. I.e. since the output processing is lagged one step,
        # we cannot reuse the cached objects immediately when the schedule()
        # is called again, but only when schedule() is called the second time.
        self.output_proc_callback = output_proc_callback
        self.use_async_output_proc = self.output_proc_callback is not None
        self.num_cache_iters = 2 if self.use_async_output_proc else 1

        self.cache_id = 0
        for i in range(self.num_cache_iters):
            self._seq_group_metadata_cache.append(
                PyObjectCache(seq_group_metadata_builder))
            self._scheduler_running_outputs_cache.append(
                PyObjectCache(scheduler_running_outputs_builder))
            self._scheduled_seq_group_cache.append(
                PyObjectCache(scheduled_seq_group_builder))

        # For async postprocessor, the extra decode run cannot be done
        # when the request reaches max_model_len. In this case, the request
        # will be stopped during schedule() call and added to this stop list
        # for processing and deallocation by the free_finished_seq_groups()
        self._async_stopped: List[SequenceGroup] = []
423
+
424
    @property
    def next_cache_id(self):
        """Index of the object-cache buffer the following step will use
        (alternates between the two buffers under async output proc)."""
        return (self.cache_id + 1) % self.num_cache_iters
427
+
428
    @property
    def lora_enabled(self) -> bool:
        """Whether LoRA scheduling is active (a LoRA config was supplied)."""
        return bool(self.lora_config)
431
+
432
    @property
    def num_decoding_tokens_per_seq(self) -> int:
        """The number of new tokens generated per sequence per decode step."""
        return 1
436
+
437
    def add_seq_group(self, seq_group: SequenceGroup) -> None:
        """Enqueue a new sequence group onto the WAITING queue."""
        # Add sequence groups to the waiting queue.
        self.waiting.append(seq_group)
440
+
441
    def _add_seq_group_to_running(self, seq_group: SequenceGroup) -> None:
        """Append directly to the RUNNING queue, bypassing scheduling."""
        # Add sequence groups to the running queue.
        # Only for testing purposes.
        self.running.append(seq_group)
445
+
446
    def _add_seq_group_to_swapped(self, seq_group: SequenceGroup) -> None:
        """Append directly to the SWAPPED queue, bypassing scheduling."""
        # Add sequence groups to the swapped queue.
        # Only for testing purposes.
        self.swapped.append(seq_group)
450
+
451
    def abort_seq_group(self, request_id: Union[str, Iterable[str]]) -> None:
        """Aborts a sequence group with the given ID.

        Check if the sequence group with the given ID
        is present in any of the state queue.
        If present, remove the sequence group from the state queue.
        Also, if any of the sequences in the sequence group is not finished,
        free the sequence with status `FINISHED_ABORTED`.
        Otherwise, do nothing.

        Args:
            request_id: The ID(s) of the sequence group to abort.
        """
        if isinstance(request_id, str):
            request_id = (request_id, )
        request_ids = set(request_id)
        for state_queue in [self.waiting, self.running, self.swapped]:
            # Collect matches first; removing from a deque while iterating
            # it would corrupt the iteration.
            aborted_groups: List[SequenceGroup] = []
            for seq_group in state_queue:
                if not request_ids:
                    # Using 'break' here may add two extra iterations,
                    # but is acceptable to reduce complexity.
                    break
                if seq_group.request_id in request_ids:
                    # Appending aborted group into pending list.
                    aborted_groups.append(seq_group)
                    # Each id is consumed once: the same request cannot sit
                    # in two state queues.
                    request_ids.remove(seq_group.request_id)
            for aborted_group in aborted_groups:
                # Remove the sequence group from the state queue.
                state_queue.remove(aborted_group)
                # Remove the aborted request from the Mamba cache.
                self._finished_requests_ids.append(aborted_group.request_id)
                for seq in aborted_group.get_seqs():
                    if seq.is_finished():
                        continue
                    seq.status = SequenceStatus.FINISHED_ABORTED
                    self.free_seq(seq)

                self._free_seq_group_cross_attn_blocks(aborted_group)
490
+
491
    def _free_seq_group_cross_attn_blocks(
        self,
        seq_group: SequenceGroup,
    ) -> None:
        """
        Free a sequence group from a cross-attention block table.
        Has no effect on decoder-only models.
        """
        # Only encoder-decoder models allocate cross-attention blocks.
        if seq_group.is_encoder_decoder():
            self.block_manager.free_cross(seq_group)
501
+
502
+ def has_unfinished_seqs(self) -> bool:
503
+ return len(self.waiting) != 0 or len(self.running) != 0 or len(
504
+ self.swapped) != 0
505
+
506
    def get_prefix_cache_hit_rate(self, device: Device) -> float:
        """Delegate to the block manager; -1 means unsupported/disabled."""
        return self.block_manager.get_prefix_cache_hit_rate(device)
508
+
509
    def reset_prefix_cache(self) -> bool:
        """Delegate a full prefix-cache reset to the block manager."""
        return self.block_manager.reset_prefix_cache()
511
+
512
+ def get_num_unfinished_seq_groups(self) -> int:
513
+ return len(self.waiting) + len(self.running) + len(self.swapped)
514
+
515
+ def get_and_reset_finished_requests_ids(self) -> List[str]:
516
+ """Flushes the list of request ids of previously finished seq_groups."""
517
+ finished_requests_ids = self._finished_requests_ids
518
+ self._finished_requests_ids = list()
519
+ return finished_requests_ids
520
+
521
    def _schedule_running(
        self,
        budget: SchedulingBudget,
        curr_loras: Optional[Set[int]],
        enable_chunking: bool = False,
    ) -> SchedulerRunningOutputs:
        """Schedule sequence groups that are running.

        Running queue should include decode and chunked prefill requests.

        Args:
            budget: The scheduling budget. The argument is in-place updated
                when any decodes are preempted.
            curr_loras: Currently batched lora request ids. The argument is
                in-place updated when any decodes are preempted.
            enable_chunking: If True, seq group can be chunked and only a
                chunked number of tokens are scheduled if
                `budget.num_batched_tokens` has not enough capacity to schedule
                all tokens.

        Returns:
            SchedulerRunningOutputs.
        """
        # Reuse a pooled output object instead of allocating a new one;
        # clear every collection left over from the previous step.
        ret: SchedulerRunningOutputs = \
            self._scheduler_running_outputs_cache[self.cache_id].get_object()
        ret.blocks_to_swap_out.clear()
        ret.blocks_to_copy.clear()
        ret.decode_seq_groups.clear()
        ret.prefill_seq_groups.clear()
        ret.preempted.clear()
        ret.swapped_out.clear()

        ret.num_lookahead_slots = self._get_num_lookahead_slots(
            is_prefill=False, enable_chunking=enable_chunking)

        ret.decode_seq_groups_list.clear()
        ret.prefill_seq_groups_list.clear()

        # Blocks that need to be swapped or copied before model execution.
        blocks_to_swap_out: List[Tuple[int, int]] = ret.blocks_to_swap_out
        blocks_to_copy: List[Tuple[int, int]] = ret.blocks_to_copy

        decode_seq_groups: List[ScheduledSequenceGroup] = ret.decode_seq_groups
        prefill_seq_groups: List[
            ScheduledSequenceGroup] = ret.prefill_seq_groups
        preempted: List[SequenceGroup] = ret.preempted
        swapped_out: List[SequenceGroup] = ret.swapped_out

        running_queue = self.running
        assert len(self._async_stopped) == 0
        while running_queue:
            # Peek first; only popleft once we know the group has budget.
            seq_group = running_queue[0]
            # We discard the cached tokens info here because we don't need it
            # for running sequence:
            # 1. If a sequence is running with chunked prefill, the cached
            #    tokens info was already used for the first prefill.
            # 2. If a sequence is running with non-chunked prefill, then
            #    there it's a decoding sequence, and the cached tokens info is
            #    irrelevant.
            num_uncached_new_tokens, _ = (
                self._get_num_new_uncached_and_cached_tokens(
                    seq_group, SequenceStatus.RUNNING, enable_chunking,
                    budget))

            num_running_tokens = num_uncached_new_tokens
            if num_running_tokens == 0:
                # No budget => Stop
                break

            running_queue.popleft()

            # With async postprocessor, an extra decode run is done
            # to process the final tokens. The check below avoids this extra
            # decode run when the model max len is reached, in order to avoid
            # a memory overflow.
            if self.use_async_output_proc and seq_group.seqs[0].get_len(
            ) > self.scheduler_config.max_model_len:
                self._async_stopped.append(seq_group)
                continue

            # NOTE(woosuk): Preemption happens only when there is no available
            # slot to keep all the sequence groups in the RUNNING state.
            while not self._can_append_slots(seq_group, enable_chunking):
                # Refund this group's budget before evicting someone.
                budget.subtract_num_batched_tokens(seq_group.request_id,
                                                   num_running_tokens)
                num_running_seqs = seq_group.get_max_num_running_seqs()
                budget.subtract_num_seqs(seq_group.request_id,
                                         num_running_seqs)

                if (curr_loras is not None and seq_group.lora_int_id > 0
                        and seq_group.lora_int_id in curr_loras):
                    curr_loras.remove(seq_group.lora_int_id)

                # Determine victim sequence
                cont_loop = True
                if running_queue:
                    # Preempt the lowest-priority sequence group.
                    victim_seq_group = running_queue.pop()
                else:
                    # No other sequence group can be preempted.
                    # Preempt the current sequence group.
                    # Note: This is also where we stop this loop
                    # (since there is nothing else to preempt)
                    victim_seq_group = seq_group
                    cont_loop = False

                # With async postprocessor, before preempting a sequence
                # we need to ensure it has no pending async postprocessor
                do_preempt = True
                if self.use_async_output_proc:
                    assert self.output_proc_callback is not None
                    self.output_proc_callback(
                        request_id=victim_seq_group.request_id)

                    # It may be that the async pending "victim_seq_group"
                    # becomes finished, in which case we simply free it.
                    if victim_seq_group.is_finished():
                        self._free_finished_seq_group(victim_seq_group)
                        do_preempt = False

                # Do preemption
                if do_preempt:
                    preempted_mode = self._preempt(victim_seq_group,
                                                   blocks_to_swap_out)
                    if preempted_mode == PreemptionMode.RECOMPUTE:
                        preempted.append(victim_seq_group)
                    else:
                        swapped_out.append(victim_seq_group)

                if not cont_loop:
                    break
            else:
                # while-else: runs only when the loop exited because slots
                # became available (not via break after self-preemption).
                self._append_slots(seq_group, blocks_to_copy, enable_chunking)
                is_prefill = seq_group.is_prefill()

                scheduled_seq_group: ScheduledSequenceGroup = \
                    self._scheduled_seq_group_cache[self.cache_id].get_object()
                scheduled_seq_group.seq_group = seq_group
                if is_prefill:
                    scheduled_seq_group.token_chunk_size = num_running_tokens
                    prefill_seq_groups.append(scheduled_seq_group)
                    ret.prefill_seq_groups_list.append(seq_group)
                else:
                    scheduled_seq_group.token_chunk_size = 1
                    decode_seq_groups.append(scheduled_seq_group)
                    ret.decode_seq_groups_list.append(seq_group)

                budget.add_num_batched_tokens(seq_group.request_id,
                                              num_running_tokens)
                # OPTIMIZATION: Note that get_max_num_running_seqs is
                # expensive. For the default scheduling chase where
                # enable_chunking is False, num_seqs are updated before running
                # this method, so we don't have to update it again here.
                if enable_chunking:
                    num_running_seqs = seq_group.get_max_num_running_seqs()
                    budget.add_num_seqs(seq_group.request_id, num_running_seqs)
                if curr_loras is not None and seq_group.lora_int_id > 0:
                    curr_loras.add(seq_group.lora_int_id)

        self._scheduler_running_outputs_cache[self.next_cache_id].reset()
        self._scheduled_seq_group_cache[self.next_cache_id].reset()

        return ret
684
+
685
+ def _schedule_swapped(
686
+ self,
687
+ budget: SchedulingBudget,
688
+ curr_loras: Optional[Set[int]],
689
+ enable_chunking: bool = False,
690
+ ) -> SchedulerSwappedInOutputs:
691
+ """Schedule sequence groups that are swapped out.
692
+
693
+ It schedules swapped requests as long as it fits `budget` and
694
+ curr_loras <= max_lora from the scheduling config. The input arguments
695
+ `budget` and `curr_loras` are updated based on scheduled seq_groups.
696
+
697
+ Args:
698
+ budget: The scheduling budget. The argument is in-place updated
699
+ when any requests are swapped in.
700
+ curr_loras: Currently batched lora request ids. The argument is
701
+ in-place updated when any requests are swapped in.
702
+ enable_chunking: If True, seq group can be chunked and only a
703
+ chunked number of tokens are scheduled if
704
+ `budget.num_batched_tokens` has not enough capacity to schedule
705
+ all tokens.
706
+
707
+ Returns:
708
+ SchedulerSwappedInOutputs.
709
+ """
710
+ # Blocks that need to be swapped or copied before model execution.
711
+ blocks_to_swap_in: List[Tuple[int, int]] = []
712
+ blocks_to_copy: List[Tuple[int, int]] = []
713
+ decode_seq_groups: List[ScheduledSequenceGroup] = []
714
+ prefill_seq_groups: List[ScheduledSequenceGroup] = []
715
+ infeasible_seq_groups: List[SequenceGroup] = []
716
+
717
+ swapped_queue = self.swapped
718
+
719
+ leftover_swapped: Deque[SequenceGroup] = deque()
720
+ while swapped_queue:
721
+ seq_group = swapped_queue[0]
722
+
723
+ # If the sequence group cannot be swapped in, stop.
724
+ is_prefill = seq_group.is_prefill()
725
+ alloc_status = self.block_manager.can_swap_in(
726
+ seq_group,
727
+ self._get_num_lookahead_slots(is_prefill, enable_chunking))
728
+ if alloc_status == AllocStatus.LATER:
729
+ break
730
+ elif alloc_status == AllocStatus.NEVER:
731
+ logger.warning(
732
+ "Failing the request %s because there's not enough kv "
733
+ "cache blocks to run the entire sequence.",
734
+ seq_group.request_id)
735
+ for seq in seq_group.get_seqs():
736
+ seq.status = SequenceStatus.FINISHED_IGNORED
737
+ infeasible_seq_groups.append(seq_group)
738
+ swapped_queue.popleft()
739
+ continue
740
+
741
+ lora_int_id = 0
742
+ if self.lora_enabled:
743
+ lora_int_id = seq_group.lora_int_id
744
+ assert curr_loras is not None
745
+ assert self.lora_config is not None
746
+ if (lora_int_id > 0 and (lora_int_id not in curr_loras)
747
+ and len(curr_loras) >= self.lora_config.max_loras):
748
+ # We don't have a space for another LoRA, so
749
+ # we ignore this request for now.
750
+ leftover_swapped.appendleft(seq_group)
751
+ swapped_queue.popleft()
752
+ continue
753
+
754
+ # The total number of sequences in the RUNNING state should not
755
+ # exceed the maximum number of sequences.
756
+ num_new_seqs = seq_group.get_max_num_running_seqs()
757
+ num_new_tokens_uncached, num_new_tokens_cached = (
758
+ self._get_num_new_uncached_and_cached_tokens(
759
+ seq_group, SequenceStatus.SWAPPED, enable_chunking,
760
+ budget))
761
+
762
+ if num_new_tokens_uncached == 0 or not budget.can_schedule(
763
+ num_new_tokens=num_new_tokens_uncached,
764
+ num_new_seqs=num_new_seqs,
765
+ ):
766
+ break
767
+
768
+ if lora_int_id > 0 and curr_loras is not None:
769
+ curr_loras.add(lora_int_id)
770
+ swapped_queue.popleft()
771
+ self._swap_in(seq_group, blocks_to_swap_in)
772
+ self._append_slots(seq_group, blocks_to_copy, enable_chunking)
773
+ is_prefill = seq_group.is_prefill()
774
+ if is_prefill:
775
+ prefill_seq_groups.append(
776
+ ScheduledSequenceGroup(
777
+ seq_group,
778
+ token_chunk_size=num_new_tokens_uncached +
779
+ num_new_tokens_cached,
780
+ ))
781
+ else:
782
+ decode_seq_groups.append(
783
+ ScheduledSequenceGroup(seq_group, token_chunk_size=1))
784
+ budget.add_num_batched_tokens(
785
+ seq_group.request_id,
786
+ num_batched_tokens=num_new_tokens_uncached,
787
+ num_cached_tokens=num_new_tokens_cached,
788
+ )
789
+ budget.add_num_seqs(seq_group.request_id, num_new_seqs)
790
+
791
+ swapped_queue.extendleft(leftover_swapped)
792
+
793
+ return SchedulerSwappedInOutputs(
794
+ decode_seq_groups=decode_seq_groups,
795
+ prefill_seq_groups=prefill_seq_groups,
796
+ blocks_to_swap_in=blocks_to_swap_in,
797
+ blocks_to_copy=blocks_to_copy,
798
+ num_lookahead_slots=self._get_num_lookahead_slots(
799
+ is_prefill=False, enable_chunking=enable_chunking),
800
+ infeasible_seq_groups=infeasible_seq_groups,
801
+ )
802
+
803
+ def _get_prompt_limit(self, seq_group: SequenceGroup) -> int:
804
+ if self.scheduler_config.chunked_prefill_enabled and \
805
+ not self.scheduler_config.is_multi_step:
806
+ prompt_limit = self.scheduler_config.max_model_len
807
+ else:
808
+ prompt_limit = min(self.scheduler_config.max_model_len,
809
+ self.scheduler_config.max_num_batched_tokens)
810
+
811
+ # Model is fine tuned with long context. Return the fine tuned max_len.
812
+ if (seq_group.lora_request
813
+ and seq_group.lora_request.long_lora_max_len):
814
+ assert prompt_limit <= seq_group.lora_request.long_lora_max_len
815
+ return seq_group.lora_request.long_lora_max_len
816
+ else:
817
+ return prompt_limit
818
+
819
+ def _get_priority(self,
820
+ seq_group: SequenceGroup) -> Tuple[Optional[int], float]:
821
+ """ Get the priority of the sequence group.
822
+ Highest preference to user-defined priority, followed by arrival time.
823
+ Args:
824
+ seq_group: The sequence group input.
825
+ Returns:
826
+ The priority of the sequence group.
827
+ """
828
+ return seq_group.priority, seq_group.arrival_time
829
+
830
+ def _schedule_priority_preemption(
831
+ self,
832
+ budget: SchedulingBudget,
833
+ ) -> int:
834
+ """Sorts waiting and running queue. Also, force preempt requests
835
+ from the running queue if their priority is lower.
836
+ Priority-based preemption is used with the priority policy.
837
+ Args:
838
+ budget: The scheduling budget. The argument is in-place updated
839
+ when any requests are scheduled.
840
+ Returns:
841
+ A count of priority-based preemptions.
842
+ """
843
+
844
+ waiting_queue = self.waiting
845
+
846
+ running_queue = deque(sorted(self.running, key=self._get_priority))
847
+
848
+ blocks_to_swap_out: List[Tuple[int, int]] = []
849
+ force_preemption_count = 0
850
+
851
+ if waiting_queue:
852
+ seq_group = waiting_queue.popleft()
853
+ num_new_seqs = seq_group.get_max_num_running_seqs()
854
+ num_new_tokens_uncached, _ = (
855
+ self._get_num_new_uncached_and_cached_tokens(
856
+ seq_group, SequenceStatus.WAITING, False, budget))
857
+
858
+ #Only preempt if priority inversion exists
859
+ while running_queue and self._get_priority(
860
+ running_queue[-1]) > self._get_priority(seq_group):
861
+ #Only preempt if waiting sequence cannot be allocated
862
+ can_allocate = self.block_manager.can_allocate(seq_group)
863
+ if (num_new_tokens_uncached > 0
864
+ and can_allocate == AllocStatus.OK
865
+ and budget.can_schedule(
866
+ num_new_tokens=num_new_tokens_uncached,
867
+ num_new_seqs=num_new_seqs,
868
+ )):
869
+ break
870
+
871
+ #Adjust budget to remove the victim sequence group
872
+ vseq_group = running_queue.pop()
873
+ num_running_tokens_uncached, _ = (
874
+ self._get_num_new_uncached_and_cached_tokens(
875
+ vseq_group, SequenceStatus.RUNNING, False, budget))
876
+ budget.subtract_num_batched_tokens(
877
+ vseq_group.request_id, num_running_tokens_uncached)
878
+ num_running_seqs = vseq_group.get_max_num_running_seqs()
879
+ budget.subtract_num_seqs(vseq_group.request_id,
880
+ num_running_seqs)
881
+
882
+ #Preempt out the victim sequence group
883
+ self._preempt(vseq_group, blocks_to_swap_out)
884
+ waiting_queue.appendleft(vseq_group)
885
+ force_preemption_count += 1
886
+ #Put the sequence back into the waiting queue
887
+ waiting_queue.appendleft(seq_group)
888
+
889
+ waiting_queue = deque(sorted(waiting_queue, key=self._get_priority))
890
+
891
+ self.waiting = waiting_queue
892
+ self.running = running_queue
893
+ return force_preemption_count
894
+
895
+ def _schedule_prefills(
896
+ self,
897
+ budget: SchedulingBudget,
898
+ curr_loras: Optional[Set[int]],
899
+ enable_chunking: bool = False,
900
+ ) -> SchedulerPrefillOutputs:
901
+ """Schedule sequence groups that are in prefill stage.
902
+
903
+ Note that the current scheduler treats PREEMPTED_FOR_RECOMPUTE
904
+ as a new prefill (that starts from beginning -> most recently generated
905
+ tokens).
906
+
907
+ It schedules waiting requests as long as it fits `budget` and
908
+ curr_loras <= max_lora from the scheduling config. The input arguments
909
+ `budget` and `curr_loras` are updated based on scheduled seq_groups.
910
+
911
+ Args:
912
+ budget: The scheduling budget. The argument is in-place updated
913
+ when any requests are scheduled.
914
+ curr_loras: Currently batched lora request ids. The argument is
915
+ in-place updated when any requests are scheduled.
916
+ enable_chunking: If True, seq group can be chunked and only a
917
+ chunked number of tokens are scheduled if
918
+ `budget.num_batched_tokens` has not enough capacity to schedule
919
+ all tokens.
920
+
921
+ Returns:
922
+ SchedulerPrefillOutputs.
923
+ """
924
+ ignored_seq_groups: List[SequenceGroup] = []
925
+ seq_groups: List[ScheduledSequenceGroup] = []
926
+
927
+ waiting_queue = self.waiting
928
+
929
+ leftover_waiting_sequences: Deque[SequenceGroup] = deque()
930
+ while self._passed_delay(time.time()) and waiting_queue:
931
+ seq_group = waiting_queue[0]
932
+
933
+ waiting_seqs = seq_group.get_seqs(status=SequenceStatus.WAITING)
934
+ assert len(waiting_seqs) == 1, (
935
+ "Waiting sequence group should have only one prompt "
936
+ "sequence.")
937
+ num_new_tokens_uncached, num_new_tokens_cached = (
938
+ self._get_num_new_uncached_and_cached_tokens(
939
+ seq_group, SequenceStatus.WAITING, enable_chunking,
940
+ budget))
941
+ num_new_tokens = num_new_tokens_uncached + num_new_tokens_cached
942
+
943
+ if not enable_chunking:
944
+ num_prompt_tokens = waiting_seqs[0].get_len()
945
+ assert num_new_tokens == num_prompt_tokens
946
+
947
+ prompt_limit = self._get_prompt_limit(seq_group)
948
+ if num_new_tokens > prompt_limit:
949
+ logger.warning(
950
+ "Input prompt (%d tokens) is too long"
951
+ " and exceeds limit of %d", num_new_tokens, prompt_limit)
952
+ for seq in waiting_seqs:
953
+ seq.status = SequenceStatus.FINISHED_IGNORED
954
+ ignored_seq_groups.append(seq_group)
955
+ waiting_queue.popleft()
956
+ continue
957
+
958
+ num_lookahead_slots: int = 0
959
+ if self.scheduler_config.is_multi_step and enable_chunking:
960
+ num_lookahead_slots = self._get_num_lookahead_slots(
961
+ True, enable_chunking)
962
+
963
+ # If the sequence group cannot be allocated, stop.
964
+ can_allocate = self.block_manager.can_allocate(
965
+ seq_group, num_lookahead_slots=num_lookahead_slots)
966
+ if can_allocate == AllocStatus.LATER:
967
+ break
968
+ elif can_allocate == AllocStatus.NEVER:
969
+ logger.warning(
970
+ "Input prompt (%d tokens) + lookahead slots (%d) is "
971
+ "too long and exceeds the capacity of block_manager",
972
+ num_new_tokens, num_lookahead_slots)
973
+ for seq in waiting_seqs:
974
+ seq.status = SequenceStatus.FINISHED_IGNORED
975
+ ignored_seq_groups.append(seq_group)
976
+ waiting_queue.popleft()
977
+ continue
978
+
979
+ lora_int_id = 0
980
+ if self.lora_enabled:
981
+ lora_int_id = seq_group.lora_int_id
982
+ assert curr_loras is not None
983
+ assert self.lora_config is not None
984
+ if (self.lora_enabled and lora_int_id > 0
985
+ and lora_int_id not in curr_loras
986
+ and len(curr_loras) >= self.lora_config.max_loras):
987
+ # We don't have a space for another LoRA, so
988
+ # we ignore this request for now.
989
+ leftover_waiting_sequences.appendleft(seq_group)
990
+ waiting_queue.popleft()
991
+ continue
992
+
993
+ if (budget.num_batched_tokens
994
+ >= self.scheduler_config.max_num_batched_tokens):
995
+ # We've reached the budget limit - since there might be
996
+ # continuous prefills in the running queue, we should break
997
+ # to avoid scheduling any new prefills.
998
+ break
999
+
1000
+ num_new_seqs = seq_group.get_max_num_running_seqs()
1001
+ if num_new_tokens_uncached == 0 or not budget.can_schedule(
1002
+ num_new_tokens=num_new_tokens_uncached,
1003
+ num_new_seqs=num_new_seqs,
1004
+ ):
1005
+ break
1006
+
1007
+ # Can schedule this request.
1008
+ if curr_loras is not None and lora_int_id > 0:
1009
+ curr_loras.add(lora_int_id)
1010
+ waiting_queue.popleft()
1011
+ self._allocate_and_set_running(seq_group)
1012
+
1013
+ if enable_chunking and self.scheduler_config.is_multi_step:
1014
+ blocks_to_copy: List[Tuple[int, int]] = []
1015
+ # init_multi_step_from_lookahead_slots happens in append_slots
1016
+ self._append_slots(seq_group, blocks_to_copy, enable_chunking)
1017
+ # This assert will trip when a copy-on-write happens. This is
1018
+ # not a concern as the very first sequence-group block
1019
+ # allocation happens above. Still, we have the assert to
1020
+ # catch any edge-cases.
1021
+ assert not blocks_to_copy
1022
+ else:
1023
+ seq_group.init_multi_step_from_lookahead_slots(
1024
+ num_lookahead_slots,
1025
+ num_scheduler_steps=self.scheduler_config.
1026
+ num_scheduler_steps,
1027
+ is_multi_step=self.scheduler_config.is_multi_step,
1028
+ enable_chunking=enable_chunking)
1029
+
1030
+ seq_groups.append(
1031
+ ScheduledSequenceGroup(seq_group=seq_group,
1032
+ token_chunk_size=num_new_tokens))
1033
+ budget.add_num_batched_tokens(
1034
+ seq_group.request_id,
1035
+ num_batched_tokens=num_new_tokens_uncached,
1036
+ num_cached_tokens=num_new_tokens_cached,
1037
+ )
1038
+ budget.add_num_seqs(seq_group.request_id, num_new_seqs)
1039
+
1040
+ # Queue requests that couldn't be scheduled.
1041
+ waiting_queue.extendleft(leftover_waiting_sequences)
1042
+ if len(seq_groups) > 0:
1043
+ self.prev_prompt = True
1044
+
1045
+ return SchedulerPrefillOutputs(
1046
+ seq_groups=seq_groups,
1047
+ ignored_seq_groups=ignored_seq_groups,
1048
+ num_lookahead_slots=self._get_num_lookahead_slots(
1049
+ is_prefill=True, enable_chunking=enable_chunking))
1050
+
1051
+ def _schedule_default(self) -> SchedulerOutputs:
1052
+ """Schedule queued requests.
1053
+
1054
+ The current policy is designed to optimize the throughput. First,
1055
+ it batches as many prefill requests as possible. And it schedules
1056
+ decodes. If there's a pressure on GPU memory, decode requests can
1057
+ be swapped or preempted.
1058
+ """
1059
+ # Include running requests to the budget.
1060
+ budget = SchedulingBudget(
1061
+ token_budget=self.scheduler_config.max_num_batched_tokens,
1062
+ max_num_seqs=self.scheduler_config.max_num_seqs,
1063
+ )
1064
+ # Make sure we include num running seqs before scheduling prefill,
1065
+ # so that we don't schedule beyond max_num_seqs for prefill.
1066
+ for seq_group in self.running:
1067
+ budget.add_num_seqs(seq_group.request_id,
1068
+ seq_group.get_max_num_running_seqs())
1069
+ curr_loras = set(
1070
+ seq_group.lora_int_id for seq_group in self.running
1071
+ if seq_group.lora_int_id > 0) if self.lora_enabled else None
1072
+
1073
+ prefills = SchedulerPrefillOutputs.create_empty()
1074
+ running_scheduled = SchedulerRunningOutputs.create_empty()
1075
+ swapped_in = SchedulerSwappedInOutputs.create_empty()
1076
+
1077
+ # If any requests are swapped, prioritized swapped requests.
1078
+ if not self.swapped:
1079
+ prefills = self._schedule_prefills(budget,
1080
+ curr_loras,
1081
+ enable_chunking=False)
1082
+
1083
+ if len(prefills.seq_groups
1084
+ ) == 0 and self.scheduler_config.policy == "priority":
1085
+ self._schedule_priority_preemption(budget)
1086
+
1087
+ # Don't schedule decodes if prefills are scheduled.
1088
+ # NOTE: If `_schedule_prefills` doesn't enable chunking, self.running
1089
+ # only contains decode requests, not chunked prefills.
1090
+ if len(prefills.seq_groups) == 0:
1091
+ running_scheduled = self._schedule_running(budget,
1092
+ curr_loras,
1093
+ enable_chunking=False)
1094
+
1095
+ # If any sequence group is preempted, do not swap in any sequence
1096
+ # group. because it means there's no slot for new running requests.
1097
+ if len(running_scheduled.preempted) + len(
1098
+ running_scheduled.swapped_out) == 0:
1099
+ swapped_in = self._schedule_swapped(budget, curr_loras)
1100
+
1101
+ assert (budget.num_batched_tokens
1102
+ <= self.scheduler_config.max_num_batched_tokens)
1103
+ assert budget.num_curr_seqs <= self.scheduler_config.max_num_seqs
1104
+
1105
+ # Update waiting requests.
1106
+ self.waiting.extendleft(running_scheduled.preempted)
1107
+ # Update new running requests.
1108
+ if len(prefills.seq_groups) > 0:
1109
+ self.running.extend([s.seq_group for s in prefills.seq_groups])
1110
+
1111
+ self.running.extend(running_scheduled.decode_seq_groups_list)
1112
+
1113
+ if len(swapped_in.decode_seq_groups) > 0:
1114
+ self.running.extend(
1115
+ [s.seq_group for s in swapped_in.decode_seq_groups])
1116
+
1117
+ # Update swapped requests.
1118
+ self.swapped.extend(running_scheduled.swapped_out)
1119
+ preempted = (len(running_scheduled.preempted) +
1120
+ len(running_scheduled.swapped_out))
1121
+
1122
+ # There should be no prefill from running queue because this policy
1123
+ # doesn't allow chunked prefills.
1124
+ assert len(running_scheduled.prefill_seq_groups) == 0
1125
+ assert len(swapped_in.prefill_seq_groups) == 0
1126
+
1127
+ # Merge lists
1128
+ num_prefill_groups = len(prefills.seq_groups)
1129
+ if num_prefill_groups > 0:
1130
+ scheduled_seq_groups = prefills.seq_groups
1131
+ scheduled_seq_groups.extend(running_scheduled.decode_seq_groups)
1132
+ else:
1133
+ scheduled_seq_groups = running_scheduled.decode_seq_groups
1134
+ scheduled_seq_groups.extend(swapped_in.decode_seq_groups)
1135
+
1136
+ blocks_to_copy = running_scheduled.blocks_to_copy
1137
+ blocks_to_copy.extend(swapped_in.blocks_to_copy)
1138
+
1139
+ ignored_seq_groups = prefills.ignored_seq_groups
1140
+ ignored_seq_groups.extend(swapped_in.infeasible_seq_groups)
1141
+
1142
+ return SchedulerOutputs(
1143
+ scheduled_seq_groups=scheduled_seq_groups,
1144
+ num_prefill_groups=num_prefill_groups,
1145
+ num_batched_tokens=budget.num_batched_tokens +
1146
+ budget.num_cached_tokens,
1147
+ blocks_to_swap_in=swapped_in.blocks_to_swap_in,
1148
+ blocks_to_swap_out=running_scheduled.blocks_to_swap_out,
1149
+ blocks_to_copy=blocks_to_copy,
1150
+ ignored_seq_groups=ignored_seq_groups,
1151
+ num_lookahead_slots=running_scheduled.num_lookahead_slots,
1152
+ running_queue_size=len(self.running),
1153
+ preempted=preempted,
1154
+ )
1155
+
1156
+ def _schedule_chunked_prefill(self) -> SchedulerOutputs:
1157
+ """Schedule queued requests.
1158
+
1159
+ Chunked prefill allows to chunk prefill requests, batch them together
1160
+ with decode requests. This policy 1. schedule as many decoding requests
1161
+ as possible. 2. schedule chunked prefill requests that are not
1162
+ finished. 3. schedule swapped request. 4. schedule new prefill
1163
+ requests.
1164
+
1165
+ The policy can sustain the high GPU utilization because it can put
1166
+ prefill and decodes requests to the same batch, while it improves
1167
+ inter token latency because decodes requests don't need to be blocked
1168
+ by prefill requests.
1169
+ """
1170
+ budget = SchedulingBudget(
1171
+ token_budget=self.scheduler_config.max_num_batched_tokens,
1172
+ max_num_seqs=self.scheduler_config.max_num_seqs,
1173
+ )
1174
+ curr_loras: Set[int] = set()
1175
+
1176
+ prefills = SchedulerPrefillOutputs.create_empty()
1177
+ swapped_in = SchedulerSwappedInOutputs.create_empty()
1178
+
1179
+ # Decoding should be always scheduled first by fcfs.
1180
+ running_scheduled = self._schedule_running(budget,
1181
+ curr_loras,
1182
+ enable_chunking=True)
1183
+
1184
+ # Schedule swapped out requests.
1185
+ # If preemption happens, it means we don't have space for swap-in.
1186
+ if len(running_scheduled.preempted) + len(
1187
+ running_scheduled.swapped_out) == 0:
1188
+ swapped_in = self._schedule_swapped(budget, curr_loras)
1189
+
1190
+ prefills = self._schedule_prefills(budget,
1191
+ curr_loras,
1192
+ enable_chunking=True)
1193
+
1194
+ assert (budget.num_batched_tokens
1195
+ <= self.scheduler_config.max_num_batched_tokens)
1196
+ assert budget.num_curr_seqs <= self.scheduler_config.max_num_seqs
1197
+
1198
+ # Update waiting requests.
1199
+ self.waiting.extendleft(running_scheduled.preempted)
1200
+
1201
+ # Update new running requests.
1202
+ # By default, vLLM scheduler prioritizes prefills.
1203
+ # Once chunked prefill is enabled,
1204
+ # the policy is changed to prioritize decode requests.
1205
+ self.running.extend(
1206
+ [s.seq_group for s in swapped_in.decode_seq_groups])
1207
+ self.running.extend(
1208
+ [s.seq_group for s in swapped_in.prefill_seq_groups])
1209
+ self.running.extend(
1210
+ [s.seq_group for s in running_scheduled.decode_seq_groups])
1211
+ self.running.extend(
1212
+ [s.seq_group for s in running_scheduled.prefill_seq_groups])
1213
+ self.running.extend([s.seq_group for s in prefills.seq_groups])
1214
+
1215
+ # Update swapped requests.
1216
+ self.swapped.extend(running_scheduled.swapped_out)
1217
+ # Put prefills first due to Attention backend ordering assumption.
1218
+ scheduled_seq_groups = (prefills.seq_groups +
1219
+ running_scheduled.prefill_seq_groups +
1220
+ swapped_in.prefill_seq_groups +
1221
+ running_scheduled.decode_seq_groups +
1222
+ swapped_in.decode_seq_groups)
1223
+ num_prefill_groups = (len(prefills.seq_groups) +
1224
+ len(swapped_in.prefill_seq_groups) +
1225
+ len(running_scheduled.prefill_seq_groups))
1226
+ # If all prompts, then we set num_lookahead_slots to 0
1227
+ # this allows us to go through the `no_spec` path in
1228
+ # `spec_decode_worker.py`
1229
+ all_prefills = (len(scheduled_seq_groups) == num_prefill_groups)
1230
+ num_lookahead_slots = (0 if
1231
+ (all_prefills
1232
+ and not self.scheduler_config.is_multi_step)
1233
+ else running_scheduled.num_lookahead_slots)
1234
+ return SchedulerOutputs(
1235
+ scheduled_seq_groups=scheduled_seq_groups,
1236
+ num_prefill_groups=num_prefill_groups,
1237
+ num_batched_tokens=budget.num_batched_tokens +
1238
+ budget.num_cached_tokens,
1239
+ blocks_to_swap_in=swapped_in.blocks_to_swap_in,
1240
+ blocks_to_swap_out=running_scheduled.blocks_to_swap_out,
1241
+ blocks_to_copy=running_scheduled.blocks_to_copy +
1242
+ swapped_in.blocks_to_copy,
1243
+ ignored_seq_groups=prefills.ignored_seq_groups +
1244
+ swapped_in.infeasible_seq_groups,
1245
+ num_lookahead_slots=num_lookahead_slots,
1246
+ running_queue_size=len(self.running),
1247
+ preempted=(len(running_scheduled.preempted) +
1248
+ len(running_scheduled.swapped_out)),
1249
+ )
1250
+
1251
+ def _schedule(self) -> SchedulerOutputs:
1252
+ """Schedule queued requests."""
1253
+ if self.scheduler_config.chunked_prefill_enabled:
1254
+ return self._schedule_chunked_prefill()
1255
+ else:
1256
+ return self._schedule_default()
1257
+
1258
+ def _can_append_slots(self, seq_group: SequenceGroup,
1259
+ enable_chunking: bool) -> bool:
1260
+ """Determine whether or not we have enough space in the KV cache to
1261
+ continue generation of the sequence group.
1262
+ """
1263
+ # It is True only for testing case to trigger artificial preemption.
1264
+ if (self.enable_artificial_preemption
1265
+ and random.uniform(0, 1) < ARTIFICIAL_PREEMPTION_PROB
1266
+ and self.artificial_preempt_cnt > 0):
1267
+ self.artificial_preempt_cnt -= 1
1268
+ return False
1269
+
1270
+ is_prefill = seq_group.is_prefill()
1271
+ num_lookahead_slots = self._get_num_lookahead_slots(
1272
+ is_prefill, enable_chunking)
1273
+
1274
+ if is_prefill and num_lookahead_slots > 0:
1275
+ # Appending prefill slots only happens multi-step and
1276
+ # chunked-prefill are enabled together.
1277
+ assert self.scheduler_config.is_multi_step and enable_chunking
1278
+
1279
+ return self.block_manager.can_append_slots(
1280
+ seq_group=seq_group, num_lookahead_slots=num_lookahead_slots)
1281
+
1282
+ def _allow_async_output_proc(self, seq_group: SequenceGroup) -> bool:
1283
+ # async_output_proc is allowed only when we have a single sequence
1284
+ # in the sequence group
1285
+ no_single_seq = seq_group.sampling_params is None or (
1286
+ seq_group.sampling_params.n == 1)
1287
+ return no_single_seq
1288
+
1289
+ def schedule(
1290
+ self
1291
+ ) -> Tuple[List[SequenceGroupMetadata], SchedulerOutputs, bool]:
1292
+ # Schedule sequence groups.
1293
+ # This function call changes the internal states of the scheduler
1294
+ # such as self.running, self.swapped, and self.waiting.
1295
+ scheduler_start_time = time.perf_counter()
1296
+
1297
+ scheduler_outputs: SchedulerOutputs = self._schedule()
1298
+ now = time.time()
1299
+
1300
+ if not self.cache_config.enable_prefix_caching:
1301
+ common_computed_block_nums = []
1302
+
1303
+ allow_async_output_proc: bool = self.use_async_output_proc
1304
+
1305
+ # Create input data structures.
1306
+ seq_group_metadata_list: List[SequenceGroupMetadata] = []
1307
+ for i, scheduled_seq_group in enumerate(
1308
+ scheduler_outputs.scheduled_seq_groups):
1309
+ seq_group = scheduled_seq_group.seq_group
1310
+ token_chunk_size = scheduled_seq_group.token_chunk_size
1311
+ seq_group.maybe_set_first_scheduled_time(now)
1312
+
1313
+ seq_group_metadata = self._seq_group_metadata_cache[
1314
+ self.cache_id].get_object()
1315
+ seq_group_metadata.seq_data.clear()
1316
+ seq_group_metadata.block_tables.clear()
1317
+
1318
+ # seq_id -> SequenceData
1319
+ seq_data: Dict[int, SequenceData] = {}
1320
+ # seq_id -> physical block numbers
1321
+ block_tables: Dict[int, List[int]] = {}
1322
+
1323
+ if seq_group.is_encoder_decoder():
1324
+ # Encoder associated with SequenceGroup
1325
+ encoder_seq = seq_group.get_encoder_seq()
1326
+ assert encoder_seq is not None
1327
+ encoder_seq_data = encoder_seq.data
1328
+ # Block table for cross-attention
1329
+ # Also managed at SequenceGroup level
1330
+ cross_block_table = self.block_manager.get_cross_block_table(
1331
+ seq_group)
1332
+ else:
1333
+ encoder_seq_data = None
1334
+ cross_block_table = None
1335
+
1336
+ for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
1337
+ seq_id = seq.seq_id
1338
+ seq_data[seq_id] = seq.data
1339
+ block_tables[seq_id] = self.block_manager.get_block_table(seq)
1340
+ self.block_manager.access_all_blocks_in_seq(seq, now)
1341
+
1342
+ if self.cache_config.enable_prefix_caching:
1343
+ common_computed_block_nums = (
1344
+ self.block_manager.get_common_computed_block_ids(
1345
+ seq_group.get_seqs(status=SequenceStatus.RUNNING)))
1346
+
1347
+ do_sample = True
1348
+ is_prompt = seq_group.is_prefill()
1349
+ # We should send the metadata to workers when the first prefill
1350
+ # is sent. Subsequent requests could be chunked prefill or decode.
1351
+ is_first_prefill = False
1352
+ if is_prompt:
1353
+ seqs = seq_group.get_seqs()
1354
+ # Prefill has only 1 sequence.
1355
+ assert len(seqs) == 1
1356
+ num_computed_tokens = seqs[0].data.get_num_computed_tokens()
1357
+ is_first_prefill = num_computed_tokens == 0
1358
+ # In the next iteration, all prompt tokens are not computed.
1359
+ # It means the prefill is chunked, and we don't need sampling.
1360
+ # NOTE: We use get_len instead of get_prompt_len because when
1361
+ # a sequence is preempted, prefill includes previous generated
1362
+ # output tokens.
1363
+ if (token_chunk_size + num_computed_tokens
1364
+ < seqs[0].data.get_len()):
1365
+ do_sample = False
1366
+
1367
+ # It assumes the scheduled_seq_groups is ordered by
1368
+ # prefill < decoding.
1369
+ if is_first_prefill or not self.scheduler_config.send_delta_data:
1370
+ seq_group_metadata = SequenceGroupMetadata(
1371
+ request_id=seq_group.request_id,
1372
+ is_prompt=is_prompt,
1373
+ seq_data=seq_data,
1374
+ sampling_params=seq_group.sampling_params,
1375
+ block_tables=block_tables,
1376
+ do_sample=do_sample,
1377
+ pooling_params=seq_group.pooling_params,
1378
+ token_chunk_size=token_chunk_size,
1379
+ lora_request=seq_group.lora_request,
1380
+ computed_block_nums=common_computed_block_nums,
1381
+ encoder_seq_data=encoder_seq_data,
1382
+ cross_block_table=cross_block_table,
1383
+ state=seq_group.state,
1384
+ token_type_ids=seq_group.token_type_ids,
1385
+ # `multi_modal_data` will only be present for the 1st comm
1386
+ # between engine and worker.
1387
+ # the subsequent comms can still use delta, but
1388
+ # `multi_modal_data` will be None.
1389
+ multi_modal_data=seq_group.multi_modal_data
1390
+ if scheduler_outputs.num_prefill_groups > 0 else None,
1391
+ multi_modal_placeholders=seq_group.multi_modal_placeholders
1392
+ if scheduler_outputs.num_prefill_groups > 0 else None,
1393
+ mm_processor_kwargs=seq_group.mm_processor_kwargs,
1394
+ prompt_adapter_request=seq_group.prompt_adapter_request,
1395
+ )
1396
+ else:
1397
+ # When SPMD mode is enabled, we only send delta data except for
1398
+ # the first request to reduce serialization cost.
1399
+ seq_data_delta = {}
1400
+ for id, data in seq_data.items():
1401
+ seq_data_delta[id] = data.get_delta_and_reset()
1402
+ seq_group_metadata = SequenceGroupMetadataDelta(
1403
+ seq_data_delta,
1404
+ seq_group.request_id,
1405
+ block_tables,
1406
+ is_prompt,
1407
+ do_sample=do_sample,
1408
+ token_chunk_size=token_chunk_size,
1409
+ computed_block_nums=common_computed_block_nums,
1410
+ )
1411
+ seq_group_metadata_list.append(seq_group_metadata)
1412
+
1413
+ if allow_async_output_proc:
1414
+ allow_async_output_proc = self._allow_async_output_proc(
1415
+ seq_group)
1416
+
1417
+ # Now that the batch has been created, we can assume all blocks in the
1418
+ # batch will have been computed before the next scheduling invocation.
1419
+ # This is because the engine assumes that a failure in model execution
1420
+ # will crash the vLLM instance / will not retry.
1421
+ for scheduled_seq_group in scheduler_outputs.scheduled_seq_groups:
1422
+ self.block_manager.mark_blocks_as_computed(
1423
+ scheduled_seq_group.seq_group,
1424
+ scheduled_seq_group.token_chunk_size)
1425
+
1426
+ self._seq_group_metadata_cache[self.next_cache_id].reset()
1427
+
1428
+ scheduler_time = time.perf_counter() - scheduler_start_time
1429
+ # Add this to scheduler time to all the sequences that are currently
1430
+ # running. This will help estimate if the scheduler is a significant
1431
+ # component in the e2e latency.
1432
+ for seq_group in self.running:
1433
+ if seq_group is not None and seq_group.metrics is not None:
1434
+ if seq_group.metrics.scheduler_time is not None:
1435
+ seq_group.metrics.scheduler_time += scheduler_time
1436
+ else:
1437
+ seq_group.metrics.scheduler_time = scheduler_time
1438
+
1439
+ # Move to next cache (if exists)
1440
+ self.cache_id = self.next_cache_id
1441
+
1442
+ # Return results
1443
+ return (seq_group_metadata_list, scheduler_outputs,
1444
+ allow_async_output_proc)
1445
+
1446
+ def fork_seq(self, parent_seq: Sequence, child_seq: Sequence) -> None:
1447
+ self.block_manager.fork(parent_seq, child_seq)
1448
+
1449
+ def free_seq(self, seq: Sequence) -> None:
1450
+ """Free a sequence from a block table."""
1451
+ self.block_manager.free(seq)
1452
+
1453
+ def _free_finished_seqs(self, seq_group: SequenceGroup) -> None:
1454
+ """Free finished seqs in a sequence group."""
1455
+ for seq in seq_group.get_seqs():
1456
+ if seq.is_finished():
1457
+ self.free_seq(seq)
1458
+
1459
+ def _free_finished_seq_group(self, seq_group: SequenceGroup) -> None:
1460
+ if seq_group.is_finished():
1461
+ # Free cross-attention block table, if it exists
1462
+ self._free_seq_group_cross_attn_blocks(seq_group)
1463
+
1464
+ # Add the finished requests to the finished requests list.
1465
+ # This list will be used to update the Mamba cache in the
1466
+ # next step.
1467
+ self._finished_requests_ids.append(seq_group.request_id)
1468
+
1469
+ # Free finished seqs
1470
+ self._free_finished_seqs(seq_group)
1471
+
1472
+ def free_finished_seq_groups(self) -> None:
1473
+ remaining: Deque[SequenceGroup] = deque()
1474
+ for seq_group in self.running:
1475
+ self._free_finished_seq_group(seq_group)
1476
+ if not seq_group.is_finished():
1477
+ remaining.append(seq_group)
1478
+
1479
+ self.running = remaining
1480
+
1481
+ # Handle async stopped sequence groups
1482
+ # (ones that reached max model len)
1483
+ if self._async_stopped:
1484
+ for seq_group in self._async_stopped:
1485
+ self._free_seq_group_cross_attn_blocks(seq_group)
1486
+ self._finished_requests_ids.append(seq_group.request_id)
1487
+
1488
+ # Free finished seqs
1489
+ self._free_finished_seqs(seq_group)
1490
+
1491
+ self._async_stopped.clear()
1492
+
1493
+ def _allocate_and_set_running(self, seq_group: SequenceGroup) -> None:
1494
+ self.block_manager.allocate(seq_group)
1495
+ for seq in seq_group.get_seqs(status=SequenceStatus.WAITING):
1496
+ seq.status = SequenceStatus.RUNNING
1497
+
1498
+ def _append_slots(self,
1499
+ seq_group: SequenceGroup,
1500
+ blocks_to_copy: List[Tuple[int, int]],
1501
+ enable_chunking: bool = False) -> None:
1502
+ """Appends new slots to the sequences in the given sequence group.
1503
+
1504
+ Args:
1505
+ seq_group (SequenceGroup): The sequence group containing the
1506
+ sequences to append slots to.
1507
+ blocks_to_copy (List[Tuple[int, int]]): A list of tuple of two
1508
+ ints, the first int is the source block index, and the second
1509
+ int is the destination block index. This list is updated with
1510
+ the new source and destination block indices for the appended
1511
+ slots.
1512
+ enable_chunking (bool): True if chunked prefill is enabled.
1513
+ """
1514
+ is_prefill: bool = seq_group.is_prefill()
1515
+ num_lookahead_slots: int = self._get_num_lookahead_slots(
1516
+ is_prefill, enable_chunking)
1517
+
1518
+ seq_group.init_multi_step_from_lookahead_slots(
1519
+ num_lookahead_slots,
1520
+ num_scheduler_steps=self.scheduler_config.num_scheduler_steps,
1521
+ is_multi_step=self.scheduler_config.is_multi_step,
1522
+ enable_chunking=enable_chunking)
1523
+
1524
+ seq_status: Optional[SequenceStatus] = SequenceStatus.RUNNING
1525
+ if self.scheduler_config.is_multi_step and enable_chunking:
1526
+ # In multi-step chunked-prefill any sequence type can have
1527
+ # slots appended.
1528
+ seq_status = None
1529
+
1530
+ for seq in seq_group.get_seqs(status=seq_status):
1531
+ cows = self.block_manager.append_slots(seq, num_lookahead_slots)
1532
+ if len(cows) > 0:
1533
+ blocks_to_copy.extend(cows)
1534
+
1535
+ def _preempt(self, seq_group: SequenceGroup,
1536
+ blocks_to_swap_out: List[Tuple[int, int]]) -> PreemptionMode:
1537
+ # If preemption mode is not specified, we determine the mode as follows:
1538
+ # We use recomputation by default since it incurs lower overhead than
1539
+ # swapping. However, when the sequence group has multiple sequences
1540
+ # (e.g., beam search), recomputation is not currently supported. In
1541
+ # such a case, we use swapping instead.
1542
+ # FIXME(woosuk): This makes our scheduling policy a bit bizarre.
1543
+ # As swapped sequences are prioritized over waiting sequences,
1544
+ # sequence groups with multiple sequences are implicitly prioritized
1545
+ # over sequence groups with a single sequence.
1546
+ # TODO(woosuk): Support recomputation for sequence groups with multiple
1547
+ # sequences. This may require a more sophisticated CUDA kernel.
1548
+ if self.user_specified_preemption_mode is None:
1549
+ if seq_group.get_max_num_running_seqs() == 1:
1550
+ preemption_mode = PreemptionMode.RECOMPUTE
1551
+ else:
1552
+ preemption_mode = PreemptionMode.SWAP
1553
+
1554
+ elif self.user_specified_preemption_mode == "swap":
1555
+ preemption_mode = PreemptionMode.SWAP
1556
+ else:
1557
+ preemption_mode = PreemptionMode.RECOMPUTE
1558
+
1559
+ if self.num_cumulative_preemption % 50 == 0:
1560
+ logger.warning(
1561
+ "Sequence group %s is preempted by %s mode because there is "
1562
+ "not enough KV cache space. This can affect the end-to-end "
1563
+ "performance. Increase gpu_memory_utilization or "
1564
+ "tensor_parallel_size to provide more KV cache memory. "
1565
+ "total_num_cumulative_preemption=%d", seq_group.request_id,
1566
+ preemption_mode, self.num_cumulative_preemption + 1)
1567
+ self.num_cumulative_preemption += 1
1568
+
1569
+ if preemption_mode == PreemptionMode.RECOMPUTE:
1570
+ self._preempt_by_recompute(seq_group)
1571
+ elif preemption_mode == PreemptionMode.SWAP:
1572
+ self._preempt_by_swap(seq_group, blocks_to_swap_out)
1573
+ else:
1574
+ raise AssertionError("Invalid preemption mode.")
1575
+ return preemption_mode
1576
+
1577
+ def _preempt_by_recompute(
1578
+ self,
1579
+ seq_group: SequenceGroup,
1580
+ ) -> None:
1581
+ seqs = seq_group.get_seqs(status=SequenceStatus.RUNNING)
1582
+ assert len(seqs) == 1
1583
+ for seq in seqs:
1584
+ seq.status = SequenceStatus.WAITING
1585
+ self.free_seq(seq)
1586
+ seq.reset_state_for_recompute()
1587
+ self._free_seq_group_cross_attn_blocks(seq_group)
1588
+
1589
+ def _preempt_by_swap(
1590
+ self,
1591
+ seq_group: SequenceGroup,
1592
+ blocks_to_swap_out: List[Tuple[int, int]],
1593
+ ) -> None:
1594
+ self._swap_out(seq_group, blocks_to_swap_out)
1595
+
1596
+ def _swap_in(
1597
+ self,
1598
+ seq_group: SequenceGroup,
1599
+ blocks_to_swap_in: List[Tuple[int, int]],
1600
+ ) -> None:
1601
+ mapping = self.block_manager.swap_in(seq_group)
1602
+ blocks_to_swap_in.extend(mapping)
1603
+ for seq in seq_group.get_seqs(status=SequenceStatus.SWAPPED):
1604
+ seq.status = SequenceStatus.RUNNING
1605
+
1606
+ def _swap_out(
1607
+ self,
1608
+ seq_group: SequenceGroup,
1609
+ blocks_to_swap_out: List[Tuple[int, int]],
1610
+ ) -> None:
1611
+ if not self.block_manager.can_swap_out(seq_group):
1612
+ # FIXME(woosuk): Abort the sequence group instead of aborting the
1613
+ # entire engine.
1614
+ raise RuntimeError(
1615
+ "Aborted due to the lack of CPU swap space. Please increase "
1616
+ "the swap space to avoid this error.")
1617
+ mapping = self.block_manager.swap_out(seq_group)
1618
+ blocks_to_swap_out.extend(mapping)
1619
+ for seq in seq_group.get_seqs(status=SequenceStatus.RUNNING):
1620
+ seq.status = SequenceStatus.SWAPPED
1621
+
1622
+ def _passed_delay(self, now: float) -> bool:
1623
+ if self.prev_prompt:
1624
+ self.last_prompt_latency = now - self.prev_time
1625
+ self.prev_time, self.prev_prompt = now, False
1626
+ # Delay scheduling prompts to let waiting queue fill up
1627
+ if self.scheduler_config.delay_factor > 0 and self.waiting:
1628
+ earliest_arrival_time = min(
1629
+ [e.metrics.arrival_time for e in self.waiting])
1630
+ passed_delay = ((now - earliest_arrival_time)
1631
+ > (self.scheduler_config.delay_factor *
1632
+ self.last_prompt_latency) or not self.running)
1633
+ else:
1634
+ passed_delay = True
1635
+ return passed_delay
1636
+
1637
+ def _get_num_lookahead_slots(self, is_prefill: bool,
1638
+ enable_chunking: bool) -> int:
1639
+ """The number of slots to allocate per sequence per step, beyond known
1640
+ token ids. Speculative decoding uses these slots to store KV activations
1641
+ of tokens which may or may not be accepted.
1642
+
1643
+ Speculative decoding does not yet support prefill, so we do not perform
1644
+ lookahead allocation for prefill.
1645
+
1646
+ When chunking is enabled with multi-step, we allocate lookahead slots
1647
+ for the prefills for when the prefills turn into decodes in the first
1648
+ step.
1649
+ """
1650
+ if is_prefill:
1651
+ if self.scheduler_config.is_multi_step and enable_chunking:
1652
+ # num_lookahead_slots was introduced in the context of decodes,
1653
+ # in Speculative Decoding.
1654
+ # When the num_scheduler_steps is 8, say, then the
1655
+ # num_lookahead_slots is 7. Meaning, we are doing a 1-step of
1656
+ # decode anyways and we wish to do 7 more.
1657
+ #
1658
+ # "lookaheads" for prefills, is introduced in support for
1659
+ # Chunked-Prefill in Multi-Step.
1660
+ return self.scheduler_config.num_lookahead_slots + 1
1661
+ else:
1662
+ return 0
1663
+
1664
+ return self.scheduler_config.num_lookahead_slots
1665
+
1666
+ def _get_num_new_uncached_and_cached_tokens(
1667
+ self,
1668
+ seq_group: SequenceGroup,
1669
+ status: SequenceStatus,
1670
+ enable_chunking: bool,
1671
+ budget: SchedulingBudget,
1672
+ ) -> Tuple[int, int]:
1673
+ """
1674
+ Returns the number of new uncached and cached tokens to schedule for a
1675
+ given sequence group that's in a given `status`.
1676
+
1677
+ The API could chunk the number of tokens to compute based on `budget`
1678
+ if `enable_chunking` is True. If a sequence group has multiple
1679
+ sequences (e.g., running beam search), it means it is in decoding
1680
+ phase, so chunking doesn't happen.
1681
+
1682
+ Returns (0, 0) if the new token cannot be computed due to token budget.
1683
+
1684
+ The cached tokens's blocks are already computed, and the attention
1685
+ backend will reuse the cached blocks rather than recomputing them. So
1686
+ the scheduler could schedule these cached tokens "for free".
1687
+
1688
+ Args:
1689
+ seq_group: The sequence group to get the number of new tokens to
1690
+ schedule.
1691
+ status: The status of the sequences to get the number of new tokens
1692
+ to schedule.
1693
+ enable_chunking: Whether to chunk the number of tokens to compute.
1694
+ budget: The budget to chunk the number of tokens to compute.
1695
+
1696
+
1697
+ Returns:
1698
+ A tuple of two ints. The first int is the number of new uncached
1699
+ tokens to schedule. The second int is the number of cached tokens.
1700
+ If no more new tokens can be scheduled, returns (0, 0).
1701
+ """
1702
+ num_cached_new_tokens = 0
1703
+ num_uncached_new_tokens = 0
1704
+
1705
+ seqs = seq_group.get_seqs(status=status)
1706
+ # Compute the number of new uncached and cached tokens for
1707
+ # each sequence.
1708
+ for seq in seqs:
1709
+ if not seq.is_prefill():
1710
+ # Decode sequences should always just have 1 uncached token
1711
+ # TODO(rickyx): Actually is this still correct for multi-step?
1712
+ num_uncached_new_tokens += 1
1713
+ continue
1714
+
1715
+ num_computed_tokens_seq = seq.get_num_computed_tokens()
1716
+ all_num_new_tokens_seq = seq.get_len() - num_computed_tokens_seq
1717
+ if not self.cache_config.enable_prefix_caching:
1718
+ # If prefix caching is not enabled, all new tokens are uncached.
1719
+ num_uncached_new_tokens += all_num_new_tokens_seq
1720
+ continue
1721
+
1722
+ # NOTE: the cache token might be currently in a block that's in an
1723
+ # evictor meaning that it's not yet allocated. However, we don't
1724
+ # exclude such tokens in the cache count because it will be
1725
+ # guaranteed to be allocated later if the sequence can be allocated.
1726
+ num_cached_tokens_seq = self.block_manager.get_num_cached_tokens(
1727
+ seq)
1728
+
1729
+ # Sanity check.
1730
+ if num_cached_tokens_seq < num_computed_tokens_seq:
1731
+ # This should only happen with chunked prefill, and
1732
+ # the seq is still in prefill. The `num_cached_tokens_seq`
1733
+ # is the value we calculated on scheduling the first prefill.
1734
+ # For subsequent continuous prefill steps, we cached the
1735
+ # number of cache tokens for the sequence so the cached token
1736
+ # count could be less than the number of computed tokens.
1737
+ # See comments on `ComputedBlocksTracker` for more details.
1738
+ assert (
1739
+ seq.is_prefill() and seq.status == SequenceStatus.RUNNING
1740
+ and self.scheduler_config.chunked_prefill_enabled
1741
+ ), ("Number of cached tokens should not be less than the "
1742
+ "number of computed tokens for a sequence that's still "
1743
+ f"in prefill. But there are {num_cached_tokens_seq} cached "
1744
+ f"tokens and {num_computed_tokens_seq} computed tokens "
1745
+ f"for sequence {seq.seq_id}.")
1746
+
1747
+ num_cached_new_tokens_seq = max(
1748
+ 0, num_cached_tokens_seq - num_computed_tokens_seq)
1749
+ num_uncached_new_tokens_seq = (all_num_new_tokens_seq -
1750
+ num_cached_new_tokens_seq)
1751
+
1752
+ num_uncached_new_tokens += num_uncached_new_tokens_seq
1753
+ num_cached_new_tokens += num_cached_new_tokens_seq
1754
+
1755
+ if num_uncached_new_tokens == 0 and num_cached_new_tokens > 0:
1756
+ # For a fully cached hit sequence, we actually need to recompute the
1757
+ # last token. So we need at least 1 uncached token to schedule.
1758
+ # See ModelRunner._compute_for_prefix_cache_hit for more details.
1759
+ num_uncached_new_tokens = 1
1760
+ num_cached_new_tokens -= 1
1761
+
1762
+ if enable_chunking and len(seqs) == 1:
1763
+ # Chunk if a running request cannot fit in the given budget.
1764
+ # If number of seq > 1, it means it is doing beam search
1765
+ # in a decode phase. Do not chunk.
1766
+ num_uncached_new_tokens = self._chunk_new_tokens_to_schedule(
1767
+ self.scheduler_config,
1768
+ self.cache_config,
1769
+ budget,
1770
+ self._get_prompt_limit(seq_group),
1771
+ num_uncached_new_tokens,
1772
+ )
1773
+
1774
+ return num_uncached_new_tokens, num_cached_new_tokens
1775
+
1776
+ @staticmethod
1777
+ def _chunk_new_tokens_to_schedule(
1778
+ scheduler_config: SchedulerConfig,
1779
+ cache_config: CacheConfig,
1780
+ budget: SchedulingBudget,
1781
+ prompt_limit: int,
1782
+ num_new_tokens: int,
1783
+ ) -> int:
1784
+ """
1785
+ Chunks the number of new tokens to schedule based on the budget when
1786
+ chunked prefill is enabled.
1787
+
1788
+ Args:
1789
+ scheduler_config: The scheduler config.
1790
+ cache_config: The cache config.
1791
+ budget: The budget to chunk the number of tokens to compute.
1792
+ prompt_limit: The maximum number of tokens allowed in a prompt.
1793
+ num_new_tokens: The number of new tokens to schedule.
1794
+
1795
+ Returns:
1796
+ The number of new tokens to schedule after chunking.
1797
+ """
1798
+ remaining_token_budget = budget.remaining_token_budget()
1799
+ if scheduler_config.is_multi_step:
1800
+ # The current multi-step + chunked prefill capability does
1801
+ # not actually support chunking prompts.
1802
+ #
1803
+ # Therefore, `num_new_tokens` is computed in the same fashion
1804
+ # for both multi-step+chunked-prefill &
1805
+ # multi-step+chunked-prefill+APC
1806
+ #
1807
+ # Prompts with more tokens than the current remaining budget
1808
+ # are postponed to future scheduler steps
1809
+ if num_new_tokens > prompt_limit:
1810
+ # If the seq_group is in prompt-stage, pass the
1811
+ # num_new_tokens as-is so the caller can ignore
1812
+ # the sequence.
1813
+ return num_new_tokens
1814
+
1815
+ return (0 if num_new_tokens > remaining_token_budget else
1816
+ num_new_tokens)
1817
+
1818
+ if cache_config.enable_prefix_caching:
1819
+ # Adjust the remaining token budget to be divisible by the block
1820
+ # size when prefix caching is enabled.
1821
+
1822
+ # When prefix caching is enabled, we always allocate
1823
+ # the number of new tokens that is dividable by the block
1824
+ # size to avoid partial block matching.
1825
+ block_size = cache_config.block_size
1826
+ remainder = budget.token_budget % block_size
1827
+ if remainder != 0:
1828
+ raise ValueError("When enabling chunked prefill and "
1829
+ "prefix caching, max_num_batched_tokens "
1830
+ "(chunk size) must be dividable by "
1831
+ "block size, but got chunk_size "
1832
+ f"({budget.token_budget}) % block_size "
1833
+ f"({block_size}) = {remainder}")
1834
+ # Round down to block size.
1835
+ remaining_token_budget = (remaining_token_budget // block_size *
1836
+ block_size)
1837
+
1838
+ num_new_tokens = min(num_new_tokens, remaining_token_budget)
1839
+
1840
+ return num_new_tokens
.venv/lib/python3.11/site-packages/vllm/device_allocator/__pycache__/cumem.cpython-311.pyc ADDED
Binary file (12 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/distributed/device_communicators/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (210 Bytes). View file
 
.venv/lib/python3.11/site-packages/vllm/distributed/device_communicators/__pycache__/cuda_wrapper.cpython-311.pyc ADDED
Binary file (9.62 kB). View file
 
.venv/lib/python3.11/site-packages/vllm/distributed/device_communicators/__pycache__/custom_all_reduce_utils.cpython-311.pyc ADDED
Binary file (12.7 kB). View file