Fix chunking error: pass `input_ids` to `chunk_tokens` and properly define the image/video/text token masks (masked by the attention mask when present)
Browse files- processing_qwen3_vl.py +11 -4
processing_qwen3_vl.py
CHANGED
|
@@ -220,16 +220,17 @@ class ZFQwen3VLProcessor(ProcessorMixin):
|
|
| 220 |
return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", None)
|
| 221 |
text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"]) # type: ignore
|
| 222 |
self._check_special_mm_tokens(text, text_inputs, modalities=["image", "video"]) # type: ignore
|
|
|
|
|
|
|
| 223 |
|
| 224 |
if return_mm_token_type_ids:
|
| 225 |
-
array_ids = np.array(text_inputs["input_ids"])
|
| 226 |
mm_token_type_ids = np.zeros_like(text_inputs["input_ids"])
|
| 227 |
mm_token_type_ids[array_ids == self.image_token_id] = 1
|
| 228 |
text_inputs["mm_token_type_ids"] = mm_token_type_ids.tolist()
|
| 229 |
|
| 230 |
chunks = chunk_tokens(
|
| 231 |
max_chunk_size=self.video_processor.max_chunk_size, # type: ignore
|
| 232 |
-
input_ids=
|
| 233 |
image_token_id=self.image_token_id,
|
| 234 |
video_token_id=self.video_token_id,
|
| 235 |
merge_size=self.image_processor.merge_size, # type: ignore
|
|
@@ -237,14 +238,20 @@ class ZFQwen3VLProcessor(ProcessorMixin):
|
|
| 237 |
image_grid_thw=image_grid_thw,
|
| 238 |
video_grid_thw=video_grid_thw,
|
| 239 |
)
|
| 240 |
-
image_token_mask = (
|
| 241 |
-
video_token_mask = (
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 242 |
|
| 243 |
return BatchFeature(data={
|
| 244 |
**text_inputs,
|
| 245 |
**image_inputs,
|
| 246 |
**videos_inputs,
|
| 247 |
"token_chunks": chunks,
|
|
|
|
| 248 |
"image_token_mask": image_token_mask,
|
| 249 |
"video_token_mask": video_token_mask,
|
| 250 |
}, tensor_type=return_tensors)
|
|
|
|
| 220 |
return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", None)
|
| 221 |
text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"]) # type: ignore
|
| 222 |
self._check_special_mm_tokens(text, text_inputs, modalities=["image", "video"]) # type: ignore
|
| 223 |
+
array_ids = np.array(text_inputs["input_ids"])
|
| 224 |
+
array_attention_mask = np.array(text_inputs["attention_mask"]) if "attention_mask" in text_inputs else None
|
| 225 |
|
| 226 |
if return_mm_token_type_ids:
|
|
|
|
| 227 |
mm_token_type_ids = np.zeros_like(text_inputs["input_ids"])
|
| 228 |
mm_token_type_ids[array_ids == self.image_token_id] = 1
|
| 229 |
text_inputs["mm_token_type_ids"] = mm_token_type_ids.tolist()
|
| 230 |
|
| 231 |
chunks = chunk_tokens(
|
| 232 |
max_chunk_size=self.video_processor.max_chunk_size, # type: ignore
|
| 233 |
+
input_ids=array_ids,
|
| 234 |
image_token_id=self.image_token_id,
|
| 235 |
video_token_id=self.video_token_id,
|
| 236 |
merge_size=self.image_processor.merge_size, # type: ignore
|
|
|
|
| 238 |
image_grid_thw=image_grid_thw,
|
| 239 |
video_grid_thw=video_grid_thw,
|
| 240 |
)
|
| 241 |
+
image_token_mask = (array_ids == self.image_token_id).astype(int)
|
| 242 |
+
video_token_mask = (array_ids == self.video_token_id).astype(int)
|
| 243 |
+
text_token_mask = np.ones_like(image_token_mask) - image_token_mask - video_token_mask
|
| 244 |
+
if array_attention_mask is not None:
|
| 245 |
+
text_token_mask = text_token_mask * array_attention_mask
|
| 246 |
+
image_token_mask = image_token_mask * array_attention_mask
|
| 247 |
+
video_token_mask = video_token_mask * array_attention_mask
|
| 248 |
|
| 249 |
return BatchFeature(data={
|
| 250 |
**text_inputs,
|
| 251 |
**image_inputs,
|
| 252 |
**videos_inputs,
|
| 253 |
"token_chunks": chunks,
|
| 254 |
+
"text_token_mask": text_token_mask,
|
| 255 |
"image_token_mask": image_token_mask,
|
| 256 |
"video_token_mask": video_token_mask,
|
| 257 |
}, tensor_type=return_tensors)
|