TYTTYTTYT committed on
Commit
496d1ea
·
verified ·
1 Parent(s): e7818b4

fix chunking error

Browse files
Files changed (1) hide show
  1. processing_qwen3_vl.py +11 -4
processing_qwen3_vl.py CHANGED
@@ -220,16 +220,17 @@ class ZFQwen3VLProcessor(ProcessorMixin):
220
  return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", None)
221
  text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"]) # type: ignore
222
  self._check_special_mm_tokens(text, text_inputs, modalities=["image", "video"]) # type: ignore
 
 
223
 
224
  if return_mm_token_type_ids:
225
- array_ids = np.array(text_inputs["input_ids"])
226
  mm_token_type_ids = np.zeros_like(text_inputs["input_ids"])
227
  mm_token_type_ids[array_ids == self.image_token_id] = 1
228
  text_inputs["mm_token_type_ids"] = mm_token_type_ids.tolist()
229
 
230
  chunks = chunk_tokens(
231
  max_chunk_size=self.video_processor.max_chunk_size, # type: ignore
232
- input_ids=np.array(text_inputs["input_ids"]),
233
  image_token_id=self.image_token_id,
234
  video_token_id=self.video_token_id,
235
  merge_size=self.image_processor.merge_size, # type: ignore
@@ -237,14 +238,20 @@ class ZFQwen3VLProcessor(ProcessorMixin):
237
  image_grid_thw=image_grid_thw,
238
  video_grid_thw=video_grid_thw,
239
  )
240
- image_token_mask = (text_inputs["input_ids"] == self.image_token_id)
241
- video_token_mask = (text_inputs["input_ids"] == self.video_token_id)
 
 
 
 
 
242
 
243
  return BatchFeature(data={
244
  **text_inputs,
245
  **image_inputs,
246
  **videos_inputs,
247
  "token_chunks": chunks,
 
248
  "image_token_mask": image_token_mask,
249
  "video_token_mask": video_token_mask,
250
  }, tensor_type=return_tensors)
 
220
  return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", None)
221
  text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"]) # type: ignore
222
  self._check_special_mm_tokens(text, text_inputs, modalities=["image", "video"]) # type: ignore
223
+ array_ids = np.array(text_inputs["input_ids"])
224
+ array_attention_mask = np.array(text_inputs["attention_mask"]) if "attention_mask" in text_inputs else None
225
 
226
  if return_mm_token_type_ids:
 
227
  mm_token_type_ids = np.zeros_like(text_inputs["input_ids"])
228
  mm_token_type_ids[array_ids == self.image_token_id] = 1
229
  text_inputs["mm_token_type_ids"] = mm_token_type_ids.tolist()
230
 
231
  chunks = chunk_tokens(
232
  max_chunk_size=self.video_processor.max_chunk_size, # type: ignore
233
+ input_ids=array_ids,
234
  image_token_id=self.image_token_id,
235
  video_token_id=self.video_token_id,
236
  merge_size=self.image_processor.merge_size, # type: ignore
 
238
  image_grid_thw=image_grid_thw,
239
  video_grid_thw=video_grid_thw,
240
  )
241
+ image_token_mask = (array_ids == self.image_token_id).astype(int)
242
+ video_token_mask = (array_ids == self.video_token_id).astype(int)
243
+ text_token_mask = np.ones_like(image_token_mask) - image_token_mask - video_token_mask
244
+ if array_attention_mask is not None:
245
+ text_token_mask = text_token_mask * array_attention_mask
246
+ image_token_mask = image_token_mask * array_attention_mask
247
+ video_token_mask = video_token_mask * array_attention_mask
248
 
249
  return BatchFeature(data={
250
  **text_inputs,
251
  **image_inputs,
252
  **videos_inputs,
253
  "token_chunks": chunks,
254
+ "text_token_mask": text_token_mask,
255
  "image_token_mask": image_token_mask,
256
  "video_token_mask": video_token_mask,
257
  }, tensor_type=return_tensors)