TYTTYTTYT committed on
Commit
496d1ea
·
verified ·
1 Parent(s): e7818b4

fix chunking error

Browse files
Files changed (1) hide show
  1. processing_qwen3_vl.py +11 -4
processing_qwen3_vl.py CHANGED
@@ -220,16 +220,17 @@ class ZFQwen3VLProcessor(ProcessorMixin):
220
  return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", None)
221
  text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"]) # type: ignore
222
  self._check_special_mm_tokens(text, text_inputs, modalities=["image", "video"]) # type: ignore
 
 
223
 
224
  if return_mm_token_type_ids:
225
- array_ids = np.array(text_inputs["input_ids"])
226
  mm_token_type_ids = np.zeros_like(text_inputs["input_ids"])
227
  mm_token_type_ids[array_ids == self.image_token_id] = 1
228
  text_inputs["mm_token_type_ids"] = mm_token_type_ids.tolist()
229
 
230
  chunks = chunk_tokens(
231
  max_chunk_size=self.video_processor.max_chunk_size, # type: ignore
232
- input_ids=np.array(text_inputs["input_ids"]),
233
  image_token_id=self.image_token_id,
234
  video_token_id=self.video_token_id,
235
  merge_size=self.image_processor.merge_size, # type: ignore
@@ -237,14 +238,20 @@ class ZFQwen3VLProcessor(ProcessorMixin):
237
  image_grid_thw=image_grid_thw,
238
  video_grid_thw=video_grid_thw,
239
  )
240
- image_token_mask = (text_inputs["input_ids"] == self.image_token_id)
241
- video_token_mask = (text_inputs["input_ids"] == self.video_token_id)
 
 
 
 
 
242
 
243
  return BatchFeature(data={
244
  **text_inputs,
245
  **image_inputs,
246
  **videos_inputs,
247
  "token_chunks": chunks,
 
248
  "image_token_mask": image_token_mask,
249
  "video_token_mask": video_token_mask,
250
  }, tensor_type=return_tensors)
 
220
  return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", None)
221
  text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"]) # type: ignore
222
  self._check_special_mm_tokens(text, text_inputs, modalities=["image", "video"]) # type: ignore
223
+ array_ids = np.array(text_inputs["input_ids"])
224
+ array_attention_mask = np.array(text_inputs["attention_mask"]) if "attention_mask" in text_inputs else None
225
 
226
  if return_mm_token_type_ids:
 
227
  mm_token_type_ids = np.zeros_like(text_inputs["input_ids"])
228
  mm_token_type_ids[array_ids == self.image_token_id] = 1
229
  text_inputs["mm_token_type_ids"] = mm_token_type_ids.tolist()
230
 
231
  chunks = chunk_tokens(
232
  max_chunk_size=self.video_processor.max_chunk_size, # type: ignore
233
+ input_ids=array_ids,
234
  image_token_id=self.image_token_id,
235
  video_token_id=self.video_token_id,
236
  merge_size=self.image_processor.merge_size, # type: ignore
 
238
  image_grid_thw=image_grid_thw,
239
  video_grid_thw=video_grid_thw,
240
  )
241
+ image_token_mask = (array_ids == self.image_token_id).astype(int)
242
+ video_token_mask = (array_ids == self.video_token_id).astype(int)
243
+ text_token_mask = np.ones_like(image_token_mask) - image_token_mask - video_token_mask
244
+ if array_attention_mask is not None:
245
+ text_token_mask = text_token_mask * array_attention_mask
246
+ image_token_mask = image_token_mask * array_attention_mask
247
+ video_token_mask = video_token_mask * array_attention_mask
248
 
249
  return BatchFeature(data={
250
  **text_inputs,
251
  **image_inputs,
252
  **videos_inputs,
253
  "token_chunks": chunks,
254
+ "text_token_mask": text_token_mask,
255
  "image_token_mask": image_token_mask,
256
  "video_token_mask": video_token_mask,
257
  }, tensor_type=return_tensors)