add image and video chunk to each token chunk in the processor
Browse files- chunk_utils.py +65 -14
- processing_qwen3_vl.py +43 -5
chunk_utils.py
CHANGED
|
@@ -1,8 +1,17 @@
|
|
| 1 |
from collections import deque
|
|
|
|
| 2 |
|
| 3 |
import torch
|
| 4 |
import numpy as np
|
| 5 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
def _visual_token_cums(
|
| 7 |
sequence_idx: int,
|
| 8 |
input_ids: torch.Tensor | np.ndarray,
|
|
@@ -13,8 +22,8 @@ def _visual_token_cums(
|
|
| 13 |
image_grid_thw: torch.Tensor | np.ndarray | None,
|
| 14 |
video_grid_thw: torch.Tensor | np.ndarray | None,
|
| 15 |
**kwargs,
|
| 16 |
-
) -> list[
|
| 17 |
-
cums: deque[
|
| 18 |
|
| 19 |
video_idx = 0
|
| 20 |
frame_idx = 0
|
|
@@ -30,7 +39,12 @@ def _visual_token_cums(
|
|
| 30 |
assert image_grid_thw is not None, "image_grid_thw must be provided when image_token_id is used"
|
| 31 |
_, h, w = image_grid_thw[image_idx].tolist()
|
| 32 |
num_tokens = h * w // (merge_size ** 2)
|
| 33 |
-
cums.append(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
token_idx += num_tokens
|
| 35 |
image_idx += 1
|
| 36 |
elif token == video_token_id:
|
|
@@ -41,7 +55,11 @@ def _visual_token_cums(
|
|
| 41 |
cum += num_tokens
|
| 42 |
|
| 43 |
if (frame_idx + 1) % focus_size == 0:
|
| 44 |
-
cums.append(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
cum = 0
|
| 46 |
in_video = False
|
| 47 |
else:
|
|
@@ -56,13 +74,14 @@ def _visual_token_cums(
|
|
| 56 |
|
| 57 |
else:
|
| 58 |
if not in_video:
|
| 59 |
-
cums.append(
|
| 60 |
else:
|
| 61 |
cum += 1
|
| 62 |
token_idx += 1
|
| 63 |
|
| 64 |
return list(cums)
|
| 65 |
|
|
|
|
| 66 |
def visual_token_cums(
|
| 67 |
input_ids: torch.Tensor | np.ndarray,
|
| 68 |
image_token_id: int,
|
|
@@ -72,7 +91,7 @@ def visual_token_cums(
|
|
| 72 |
image_grid_thw: torch.Tensor | np.ndarray | None,
|
| 73 |
video_grid_thw: torch.Tensor | np.ndarray | None,
|
| 74 |
**kwargs,
|
| 75 |
-
) -> list[list[
|
| 76 |
return [
|
| 77 |
_visual_token_cums(
|
| 78 |
sequence_idx=i,
|
|
@@ -87,6 +106,15 @@ def visual_token_cums(
|
|
| 87 |
for i in range(input_ids.shape[0])
|
| 88 |
]
|
| 89 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
def chunk_tokens(
|
| 91 |
max_chunk_size: int,
|
| 92 |
input_ids: torch.Tensor | np.ndarray,
|
|
@@ -97,7 +125,7 @@ def chunk_tokens(
|
|
| 97 |
image_grid_thw: torch.Tensor | np.ndarray | None,
|
| 98 |
video_grid_thw: torch.Tensor | np.ndarray | None,
|
| 99 |
**kwargs,
|
| 100 |
-
) -> list[list[
|
| 101 |
cums = visual_token_cums(
|
| 102 |
input_ids=input_ids,
|
| 103 |
image_token_id=image_token_id,
|
|
@@ -109,29 +137,52 @@ def chunk_tokens(
|
|
| 109 |
**kwargs,
|
| 110 |
)
|
| 111 |
|
| 112 |
-
chunked_cums: list[list[
|
| 113 |
|
| 114 |
for sequence_cums in cums:
|
| 115 |
-
chunks = []
|
| 116 |
current_chunk_start = 0
|
| 117 |
current_chunk_size = 0
|
|
|
|
|
|
|
| 118 |
|
| 119 |
for cum in sequence_cums:
|
| 120 |
-
if
|
| 121 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 122 |
current_chunk_start += current_chunk_size
|
| 123 |
current_chunk_size = 0
|
|
|
|
|
|
|
| 124 |
|
| 125 |
-
current_chunk_size += cum
|
| 126 |
|
| 127 |
if current_chunk_size > 0:
|
| 128 |
-
chunks.append((
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 129 |
|
| 130 |
chunked_cums.append(chunks)
|
| 131 |
|
| 132 |
num_chunks = max(len(chunks) for chunks in chunked_cums)
|
| 133 |
for chunks in chunked_cums:
|
| 134 |
while len(chunks) < num_chunks:
|
| 135 |
-
chunks.append((
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 136 |
|
| 137 |
return chunked_cums
|
|
|
|
| 1 |
from collections import deque
|
| 2 |
+
from dataclasses import dataclass
|
| 3 |
|
| 4 |
import torch
|
| 5 |
import numpy as np
|
| 6 |
|
| 7 |
+
|
| 8 |
+
@dataclass
class ChunkCum:
    """Cumulative visual-token count for one contiguous span of a sequence,
    plus the grid shape of the image or video that produced it (if any).

    The producers in `_visual_token_cums` set at most one of
    `image_grid_thw` / `video_grid_thw`; both stay None for a text-only span.
    """

    # Number of (merged) tokens accumulated in this span.
    cum: int
    # (t, h, w) grid of the image behind these tokens, or None for non-image spans.
    image_grid_thw: tuple[int, int, int] | None = None
    # (t, h, w) grid of the video frames behind these tokens, or None for non-video spans.
    video_grid_thw: tuple[int, int, int] | None = None
|
| 13 |
+
|
| 14 |
+
|
| 15 |
def _visual_token_cums(
|
| 16 |
sequence_idx: int,
|
| 17 |
input_ids: torch.Tensor | np.ndarray,
|
|
|
|
| 22 |
image_grid_thw: torch.Tensor | np.ndarray | None,
|
| 23 |
video_grid_thw: torch.Tensor | np.ndarray | None,
|
| 24 |
**kwargs,
|
| 25 |
+
) -> list[ChunkCum]:
|
| 26 |
+
cums: deque[ChunkCum] = deque()
|
| 27 |
|
| 28 |
video_idx = 0
|
| 29 |
frame_idx = 0
|
|
|
|
| 39 |
assert image_grid_thw is not None, "image_grid_thw must be provided when image_token_id is used"
|
| 40 |
_, h, w = image_grid_thw[image_idx].tolist()
|
| 41 |
num_tokens = h * w // (merge_size ** 2)
|
| 42 |
+
cums.append(ChunkCum(
|
| 43 |
+
cum=num_tokens,
|
| 44 |
+
image_grid_thw=(1, h, w),
|
| 45 |
+
video_grid_thw=None
|
| 46 |
+
)
|
| 47 |
+
)
|
| 48 |
token_idx += num_tokens
|
| 49 |
image_idx += 1
|
| 50 |
elif token == video_token_id:
|
|
|
|
| 55 |
cum += num_tokens
|
| 56 |
|
| 57 |
if (frame_idx + 1) % focus_size == 0:
|
| 58 |
+
cums.append(ChunkCum(
|
| 59 |
+
cum=cum,
|
| 60 |
+
image_grid_thw=None,
|
| 61 |
+
video_grid_thw=(focus_size, h, w),
|
| 62 |
+
))
|
| 63 |
cum = 0
|
| 64 |
in_video = False
|
| 65 |
else:
|
|
|
|
| 74 |
|
| 75 |
else:
|
| 76 |
if not in_video:
|
| 77 |
+
cums.append(ChunkCum(cum=cum, image_grid_thw=None, video_grid_thw=None))
|
| 78 |
else:
|
| 79 |
cum += 1
|
| 80 |
token_idx += 1
|
| 81 |
|
| 82 |
return list(cums)
|
| 83 |
|
| 84 |
+
|
| 85 |
def visual_token_cums(
|
| 86 |
input_ids: torch.Tensor | np.ndarray,
|
| 87 |
image_token_id: int,
|
|
|
|
| 91 |
image_grid_thw: torch.Tensor | np.ndarray | None,
|
| 92 |
video_grid_thw: torch.Tensor | np.ndarray | None,
|
| 93 |
**kwargs,
|
| 94 |
+
) -> list[list[ChunkCum]]:
|
| 95 |
return [
|
| 96 |
_visual_token_cums(
|
| 97 |
sequence_idx=i,
|
|
|
|
| 106 |
for i in range(input_ids.shape[0])
|
| 107 |
]
|
| 108 |
|
| 109 |
+
|
| 110 |
+
@dataclass
class Chunk:
    """A half-open token range [start, end) of one sequence, together with
    the grid shapes of the images/videos whose tokens fall inside the range.

    Padding chunks (appended to equalize chunk counts across sequences) have
    start == end and empty grid lists.
    """

    # Index of the first token in the chunk.
    start: int
    # Index one past the last token in the chunk (end - start == chunk size).
    end: int
    # (t, h, w) grids of the images covered by this chunk.
    image_grid_thws: list[tuple[int, int, int]]
    # (t, h, w) grids of the videos covered by this chunk.
    video_grid_thws: list[tuple[int, int, int]]
|
| 116 |
+
|
| 117 |
+
|
| 118 |
def chunk_tokens(
|
| 119 |
max_chunk_size: int,
|
| 120 |
input_ids: torch.Tensor | np.ndarray,
|
|
|
|
| 125 |
image_grid_thw: torch.Tensor | np.ndarray | None,
|
| 126 |
video_grid_thw: torch.Tensor | np.ndarray | None,
|
| 127 |
**kwargs,
|
| 128 |
+
) -> list[list[Chunk]]:
|
| 129 |
cums = visual_token_cums(
|
| 130 |
input_ids=input_ids,
|
| 131 |
image_token_id=image_token_id,
|
|
|
|
| 137 |
**kwargs,
|
| 138 |
)
|
| 139 |
|
| 140 |
+
chunked_cums: list[list[Chunk]] = []
|
| 141 |
|
| 142 |
for sequence_cums in cums:
|
| 143 |
+
chunks: list[Chunk] = []
|
| 144 |
current_chunk_start = 0
|
| 145 |
current_chunk_size = 0
|
| 146 |
+
current_image_grid_thws: list[tuple[int, int, int]] = []
|
| 147 |
+
current_video_grid_thws: list[tuple[int, int, int]] = []
|
| 148 |
|
| 149 |
for cum in sequence_cums:
|
| 150 |
+
if cum.image_grid_thw is not None:
|
| 151 |
+
current_image_grid_thws.append(cum.image_grid_thw)
|
| 152 |
+
if cum.video_grid_thw is not None:
|
| 153 |
+
current_video_grid_thws.append(cum.video_grid_thw)
|
| 154 |
+
if current_chunk_size + cum.cum > max_chunk_size:
|
| 155 |
+
chunks.append(Chunk(
|
| 156 |
+
start=current_chunk_start,
|
| 157 |
+
end=current_chunk_start + current_chunk_size,
|
| 158 |
+
image_grid_thws=current_image_grid_thws,
|
| 159 |
+
video_grid_thws=current_video_grid_thws
|
| 160 |
+
))
|
| 161 |
current_chunk_start += current_chunk_size
|
| 162 |
current_chunk_size = 0
|
| 163 |
+
current_image_grid_thws = []
|
| 164 |
+
current_video_grid_thws = []
|
| 165 |
|
| 166 |
+
current_chunk_size += cum.cum
|
| 167 |
|
| 168 |
if current_chunk_size > 0:
|
| 169 |
+
chunks.append(Chunk(
|
| 170 |
+
start=current_chunk_start,
|
| 171 |
+
end=current_chunk_start + current_chunk_size,
|
| 172 |
+
image_grid_thws=current_image_grid_thws,
|
| 173 |
+
video_grid_thws=current_video_grid_thws,
|
| 174 |
+
))
|
| 175 |
|
| 176 |
chunked_cums.append(chunks)
|
| 177 |
|
| 178 |
num_chunks = max(len(chunks) for chunks in chunked_cums)
|
| 179 |
for chunks in chunked_cums:
|
| 180 |
while len(chunks) < num_chunks:
|
| 181 |
+
chunks.append(Chunk(
|
| 182 |
+
start=chunks[-1].end,
|
| 183 |
+
end=chunks[-1].end,
|
| 184 |
+
image_grid_thws=[],
|
| 185 |
+
video_grid_thws=[],
|
| 186 |
+
))
|
| 187 |
|
| 188 |
return chunked_cums
|
processing_qwen3_vl.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
-
from typing import Optional, Union
|
| 2 |
|
|
|
|
| 3 |
import numpy as np
|
| 4 |
-
from transformers.feature_extraction_utils import BatchFeature
|
| 5 |
from transformers.image_utils import ImageInput
|
| 6 |
from transformers.processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack, VideosKwargs
|
| 7 |
from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
|
|
@@ -14,6 +14,44 @@ from .chunk_utils import chunk_tokens
|
|
| 14 |
logger = logging.get_logger(__name__)
|
| 15 |
|
| 16 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
class Qwen3VLVideosProcessorKwargs(VideosKwargs, total=False):
|
| 18 |
focus_size: Optional[int]
|
| 19 |
max_chunk_size: Optional[int]
|
|
@@ -99,7 +137,7 @@ class ZFQwen3VLProcessor(ProcessorMixin):
|
|
| 99 |
text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None, # type: ignore
|
| 100 |
videos: VideoInput = None, # type: ignore
|
| 101 |
**kwargs: Unpack[Qwen3VLProcessorKwargs],
|
| 102 |
-
) ->
|
| 103 |
"""
|
| 104 |
Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
|
| 105 |
and `kwargs` arguments to Qwen2TokenizerFast's [`~Qwen2TokenizerFast.__call__`] if `text` is not `None` to encode
|
|
@@ -125,7 +163,7 @@ class ZFQwen3VLProcessor(ProcessorMixin):
|
|
| 125 |
- `'jax'`: Return JAX `jnp.ndarray` objects.
|
| 126 |
|
| 127 |
Returns:
|
| 128 |
-
[`
|
| 129 |
|
| 130 |
- **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
|
| 131 |
- **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
|
|
@@ -246,7 +284,7 @@ class ZFQwen3VLProcessor(ProcessorMixin):
|
|
| 246 |
image_token_mask = image_token_mask * array_attention_mask
|
| 247 |
video_token_mask = video_token_mask * array_attention_mask
|
| 248 |
|
| 249 |
-
return
|
| 250 |
**text_inputs,
|
| 251 |
**image_inputs,
|
| 252 |
**videos_inputs,
|
|
|
|
| 1 |
+
from typing import Any, Optional, Union
|
| 2 |
|
| 3 |
+
import torch
|
| 4 |
import numpy as np
|
|
|
|
| 5 |
from transformers.image_utils import ImageInput
|
| 6 |
from transformers.processing_utils import ImagesKwargs, MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack, VideosKwargs
|
| 7 |
from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
|
|
|
|
| 14 |
logger = logging.get_logger(__name__)
|
| 15 |
|
| 16 |
|
| 17 |
+
class MMFeature(dict):
|
| 18 |
+
def __init__(self, data, tensor_type: str | None = None):
|
| 19 |
+
super().__init__(data)
|
| 20 |
+
self.tensor_type = tensor_type
|
| 21 |
+
self.convert_to_tensor()
|
| 22 |
+
|
| 23 |
+
def convert_to_tensor(self) -> "MMFeature":
|
| 24 |
+
if self.tensor_type is None:
|
| 25 |
+
return self
|
| 26 |
+
|
| 27 |
+
match self.tensor_type:
|
| 28 |
+
case "pt":
|
| 29 |
+
for k, v in self.items():
|
| 30 |
+
if not isinstance(v, torch.Tensor):
|
| 31 |
+
try:
|
| 32 |
+
self[k] = torch.tensor(v)
|
| 33 |
+
except Exception:
|
| 34 |
+
pass
|
| 35 |
+
case "np":
|
| 36 |
+
for k, v in self.items():
|
| 37 |
+
if not isinstance(v, np.ndarray):
|
| 38 |
+
try:
|
| 39 |
+
self[k] = np.array(v)
|
| 40 |
+
except Exception:
|
| 41 |
+
pass
|
| 42 |
+
case _:
|
| 43 |
+
raise ValueError(f"Unsupported tensor type: {self.tensor_type}")
|
| 44 |
+
|
| 45 |
+
return self
|
| 46 |
+
|
| 47 |
+
def to(self, target: Any) -> "MMFeature":
|
| 48 |
+
for k, v in self.items():
|
| 49 |
+
if isinstance(v, torch.Tensor):
|
| 50 |
+
self[k] = v.to(target)
|
| 51 |
+
|
| 52 |
+
return self
|
| 53 |
+
|
| 54 |
+
|
| 55 |
class Qwen3VLVideosProcessorKwargs(VideosKwargs, total=False):
|
| 56 |
focus_size: Optional[int]
|
| 57 |
max_chunk_size: Optional[int]
|
|
|
|
| 137 |
text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None, # type: ignore
|
| 138 |
videos: VideoInput = None, # type: ignore
|
| 139 |
**kwargs: Unpack[Qwen3VLProcessorKwargs],
|
| 140 |
+
) -> MMFeature:
|
| 141 |
"""
|
| 142 |
Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
|
| 143 |
and `kwargs` arguments to Qwen2TokenizerFast's [`~Qwen2TokenizerFast.__call__`] if `text` is not `None` to encode
|
|
|
|
| 163 |
- `'jax'`: Return JAX `jnp.ndarray` objects.
|
| 164 |
|
| 165 |
Returns:
|
| 166 |
+
[`MMFeature`]: A [`MMFeature`] with the following fields:
|
| 167 |
|
| 168 |
- **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
|
| 169 |
- **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
|
|
|
|
| 284 |
image_token_mask = image_token_mask * array_attention_mask
|
| 285 |
video_token_mask = video_token_mask * array_attention_mask
|
| 286 |
|
| 287 |
+
return MMFeature(data={
|
| 288 |
**text_inputs,
|
| 289 |
**image_inputs,
|
| 290 |
**videos_inputs,
|