add prefill chunking

Browse files

Files changed (3) hide show

chunk_utils.py +137 -0
processing_qwen3_vl.py +24 -2
video_preprocessor_config.json +1 -0

chunk_utils.py ADDED Viewed

	@@ -0,0 +1,137 @@

+from collections import deque
+import torch
+import numpy as np
+def _visual_token_cums(
+    sequence_idx: int,
+    input_ids: torch.Tensor | np.ndarray,
+    image_token_id: int,
+    video_token_id: int,
+    merge_size: int,
+    focus_size: int,
+    image_grid_thw: torch.Tensor | np.ndarray | None,
+    video_grid_thw: torch.Tensor | np.ndarray | None,
+    **kwargs,
+) -> list[int]:
+    cums: deque[int] = deque()
+    video_idx = 0
+    frame_idx = 0
+    image_idx = 0
+    token_idx = 0
+    in_video = False
+    cum = 0
+    sequence = input_ids[sequence_idx].tolist()
+    while token_idx < len(sequence):
+        token = sequence[token_idx]
+        if token == image_token_id:
+            assert image_grid_thw is not None, "image_grid_thw must be provided when image_token_id is used"
+            _, h, w = image_grid_thw[image_idx].tolist()
+            num_tokens = h * w // (merge_size ** 2)
+            cums.append(num_tokens)
+            token_idx += num_tokens
+            image_idx += 1
+        elif token == video_token_id:
+            assert video_grid_thw is not None, "video_grid_thw must be provided when video_token_id is used"
+            t, h, w = video_grid_thw[video_idx].tolist()
+            assert t % focus_size == 0, f"Number of frames {t} must be divisible by focus_size {focus_size}"
+            num_tokens = h * w // (merge_size ** 2)
+            cum += num_tokens
+            if (frame_idx + 1) % focus_size == 0:
+                cums.append(cum)
+                cum = 0
+                in_video = False
+            else:
+                in_video = True
+            frame_idx += 1
+            if frame_idx == t:
+                video_idx += 1
+                frame_idx = 0
+            token_idx += num_tokens
+        else:
+            if not in_video:
+                cums.append(1)
+            else:
+                cum += 1
+            token_idx += 1
+    return list(cums)
def visual_token_cums(
    input_ids: torch.Tensor | np.ndarray,
    image_token_id: int,
    video_token_id: int,
    merge_size: int,
    focus_size: int,
    image_grid_thw: torch.Tensor | np.ndarray | None,
    video_grid_thw: torch.Tensor | np.ndarray | None,
    **kwargs,
) -> list[list[int]]:
    """Compute the chunking-unit lengths for every sequence of a batch.

    Calls ``_visual_token_cums`` once per row of ``input_ids`` (rows are
    counted via ``input_ids.shape[0]``) with the same token ids, sizes and
    grids, and collects the per-row results in row order.

    Extra ``**kwargs`` are accepted but not forwarded to the per-row helper.

    Returns:
        One list of unit lengths per row of ``input_ids``.
    """
    # Arguments shared by every per-row call.
    shared = {
        "input_ids": input_ids,
        "image_token_id": image_token_id,
        "video_token_id": video_token_id,
        "merge_size": merge_size,
        "focus_size": focus_size,
        "image_grid_thw": image_grid_thw,
        "video_grid_thw": video_grid_thw,
    }
    per_sequence: list[list[int]] = []
    batch_size = input_ids.shape[0]
    for row in range(batch_size):
        per_sequence.append(_visual_token_cums(sequence_idx=row, **shared))
    return per_sequence
def chunk_tokens(
    max_chunk_size: int,
    input_ids: torch.Tensor | np.ndarray,
    image_token_id: int,
    video_token_id: int,
    merge_size: int,
    focus_size: int,
    image_grid_thw: torch.Tensor | np.ndarray | None,
    video_grid_thw: torch.Tensor | np.ndarray | None,
    **kwargs,
) -> list[list[tuple[int, int]]]:
    """Partition every sequence of a batch into ``(start, end)`` token ranges.

    The unit lengths returned by ``visual_token_cums`` are packed greedily,
    in order, into chunks whose total length does not exceed
    ``max_chunk_size``. Units are never split, so a single unit longer than
    ``max_chunk_size`` becomes a chunk of its own that exceeds the limit.

    All sequences are then padded with empty ``(end, end)`` ranges so that
    every sequence has the same number of chunks.

    Args:
        max_chunk_size: Target upper bound on the number of tokens per chunk.
        input_ids: 2-D batch of token ids.
        image_token_id: Id of the image placeholder token.
        video_token_id: Id of the video placeholder token.
        merge_size: Spatial merge factor of the visual tokens.
        focus_size: Number of video frames kept together in one unit.
        image_grid_thw: Per-image ``(t, h, w)`` rows, or ``None``.
        video_grid_thw: Per-video ``(t, h, w)`` rows, or ``None``.
        **kwargs: Forwarded to ``visual_token_cums``.

    Returns:
        For each sequence, a list of half-open ``(start, end)`` ranges; all
        inner lists have equal length. An empty batch yields ``[]``.
    """
    cums = visual_token_cums(
        input_ids=input_ids,
        image_token_id=image_token_id,
        video_token_id=video_token_id,
        merge_size=merge_size,
        focus_size=focus_size,
        image_grid_thw=image_grid_thw,
        video_grid_thw=video_grid_thw,
        **kwargs,
    )

    chunked_cums: list[list[tuple[int, int]]] = []
    for sequence_cums in cums:
        chunks: list[tuple[int, int]] = []
        current_chunk_start = 0
        current_chunk_size = 0
        for cum in sequence_cums:
            # Close the running chunk only if it holds something; otherwise an
            # oversized unit would emit a spurious empty (start, start) chunk.
            if current_chunk_size > 0 and current_chunk_size + cum > max_chunk_size:
                chunks.append((current_chunk_start, current_chunk_start + current_chunk_size))
                current_chunk_start += current_chunk_size
                current_chunk_size = 0
            current_chunk_size += cum
        if current_chunk_size > 0:
            chunks.append((current_chunk_start, current_chunk_start + current_chunk_size))
        chunked_cums.append(chunks)

    # default=0 keeps an empty batch from raising in max().
    num_chunks = max((len(chunks) for chunks in chunked_cums), default=0)
    for chunks in chunked_cums:
        # A sequence with no tokens has no chunks; pad it from position 0.
        tail = chunks[-1][1] if chunks else 0
        chunks.extend([(tail, tail)] * (num_chunks - len(chunks)))
    return chunked_cums

processing_qwen3_vl.py CHANGED Viewed

@@ -1,7 +1,6 @@
 from typing import Optional, Union
 import numpy as np
 from transformers.feature_extraction_utils import BatchFeature
 from transformers.image_utils import ImageInput
 from transformers.processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack, VideosKwargs
@@ -9,12 +8,15 @@ from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
 from transformers.utils import logging
 from transformers.video_utils import VideoInput
 logger = logging.get_logger(__name__)
 class Qwen3VLVideosProcessorKwargs(VideosKwargs, total=False):
     focus_size: Optional[int]
 class Qwen3VLImagesKwargs(ImagesKwargs):
@@ -225,7 +227,27 @@ class ZFQwen3VLProcessor(ProcessorMixin):
             mm_token_type_ids[array_ids == self.image_token_id] = 1
             text_inputs["mm_token_type_ids"] = mm_token_type_ids.tolist()
-        return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs}, tensor_type=return_tensors)
     def _get_num_multimodal_tokens(self, image_sizes=None, video_sizes=None, **kwargs):
         """

 from typing import Optional, Union
 import numpy as np
 from transformers.feature_extraction_utils import BatchFeature
 from transformers.image_utils import ImageInput
 from transformers.processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack, VideosKwargs
 from transformers.utils import logging
 from transformers.video_utils import VideoInput
+from .chunk_utils import chunk_tokens
 logger = logging.get_logger(__name__)
 class Qwen3VLVideosProcessorKwargs(VideosKwargs, total=False):
     focus_size: Optional[int]
+    max_chunk_size: Optional[int]
 class Qwen3VLImagesKwargs(ImagesKwargs):
             mm_token_type_ids[array_ids == self.image_token_id] = 1
             text_inputs["mm_token_type_ids"] = mm_token_type_ids.tolist()
+        chunks = chunk_tokens(
+            max_chunk_size=self.video_processor.max_chunk_size, # type: ignore
+            input_ids=np.array(text_inputs["input_ids"]),
+            image_token_id=self.image_token_id,
+            video_token_id=self.video_token_id,
+            merge_size=self.image_processor.merge_size, # type: ignore
+            focus_size=self.video_processor.focus_size, # type: ignore
+            image_grid_thw=image_grid_thw,
+            video_grid_thw=video_grid_thw,
+        )
+        image_token_mask = (text_inputs["input_ids"] == self.image_token_id)
+        video_token_mask = (text_inputs["input_ids"] == self.video_token_id)
+        return BatchFeature(data={
+            **text_inputs,
+            **image_inputs,
+            **videos_inputs,
+            "token_chunks": chunks,
+            "image_token_mask": image_token_mask,
+            "video_token_mask": video_token_mask,
+        }, tensor_type=return_tensors)
     def _get_num_multimodal_tokens(self, image_sizes=None, video_sizes=None, **kwargs):
         """

video_preprocessor_config.json CHANGED Viewed

@@ -26,6 +26,7 @@
     0.5
   ],
   "input_data_format": null,
   "max_frames": 2048,
   "merge_size": 2,
   "min_frames": 4,

     0.5
   ],
   "input_data_format": null,
+  "max_chunk_size": 4096,
   "max_frames": 2048,
   "merge_size": 2,
   "min_frames": 4,