sentseven/embeddings / processing_llava_eurobert.py
sentseven's picture
download
raw
7.35 kB
"""Custom processor for jina-embeddings-v5-omni-nano.
Keeps Qwen2VL image/video preprocessing (pixel_values, pixel_values_videos,
image_grid_thw, video_grid_thw) and folds both media placeholders into nano's
single `<image>` tokenizer token in the final tokenized output.
Mixed image+video inputs use distinct intermediate markers per modality so
the image- and video-expansion passes don't collide on a shared `<image>`
token — which is the root cause of the upstream Qwen2VLProcessor crash that
walks `while self.image_token in text[i]` and IndexErrors into image_grid_thw
when video placeholders are still in the text.
Two prompt conventions are recognised and disambiguated before expansion:
1. Proper Qwen placeholders — `<|image_pad|>` / `<|video_pad|>` (optionally
wrapped in `<|vision_start|>`/`<|vision_end|>`). The pre-replace pass
maps each to its own modality marker.
2. Bare `<image>` literals (the legacy convention emitted by `custom_st.py`
when chat templates collapse `image_token` and `video_token` to the
same string). Remaining bare `<image>` literals after pass 1 are
assigned to modality markers in order: as many as are still required
by `images` first, then `videos`. Anything beyond the matched count
is left as a literal `<image>` token (preserving the old single-modality
fallback).
After per-modality expansion both markers collapse to the real `<image>`
token before the tokenizer runs, so input_ids carry exactly the right
number of `<image>` ids in the right positions for masked_scatter to fill
with concatenated image+video features.
"""
import numpy as np
from transformers.feature_extraction_utils import BatchFeature
from transformers.models.qwen2_vl.processing_qwen2_vl import (
Qwen2VLProcessor,
Qwen2VLProcessorKwargs,
)
class LlavaEuroBertProcessor(Qwen2VLProcessor):
_IMG_MARKER = "<__JINA_IMG_PAD__>"
_VID_MARKER = "<__JINA_VID_PAD__>"
def __init__(
self,
image_processor=None,
tokenizer=None,
video_processor=None,
chat_template=None,
**kwargs,
):
super().__init__(
image_processor=image_processor,
tokenizer=tokenizer,
video_processor=video_processor,
chat_template=chat_template,
**kwargs,
)
self.image_token = "<image>"
self.image_token_id = tokenizer.convert_tokens_to_ids(
self.image_token
)
self.video_token = "<image>"
self.video_token_id = self.image_token_id
def __call__(
self, images=None, text=None, videos=None, **kwargs
):
output_kwargs = self._merge_kwargs(
Qwen2VLProcessorKwargs,
tokenizer_init_kwargs=self.tokenizer.init_kwargs,
**kwargs,
)
image_inputs: dict = {}
videos_inputs: dict = {}
image_grid_thw = None
video_grid_thw = None
if images is not None:
image_inputs = self.image_processor(
images=images, **output_kwargs["images_kwargs"]
)
image_grid_thw = image_inputs["image_grid_thw"]
if videos is not None:
videos_inputs = self.video_processor(
videos=videos, **output_kwargs["videos_kwargs"]
)
video_grid_thw = videos_inputs["video_grid_thw"]
if text is None:
return BatchFeature(
data={**image_inputs, **videos_inputs},
tensor_type=output_kwargs["text_kwargs"].get("return_tensors"),
)
if isinstance(text, str):
text = [text]
text = list(text)
for i in range(len(text)):
t = text[i]
t = t.replace(
"<|vision_start|><|image_pad|><|vision_end|>",
self._IMG_MARKER,
)
t = t.replace(
"<|vision_start|><|video_pad|><|vision_end|>",
self._VID_MARKER,
)
t = t.replace("<|image_pad|>", self._IMG_MARKER)
t = t.replace("<|video_pad|>", self._VID_MARKER)
t = t.replace("<|vision_start|>", "")
t = t.replace("<|vision_end|>", "")
text[i] = t
n_images = len(images) if images is not None else 0
n_videos = len(videos) if videos is not None else 0
img_markers_in_text = sum(t.count(self._IMG_MARKER) for t in text)
vid_markers_in_text = sum(t.count(self._VID_MARKER) for t in text)
images_to_match = max(0, n_images - img_markers_in_text)
videos_to_match = max(0, n_videos - vid_markers_in_text)
if images_to_match or videos_to_match:
for i in range(len(text)):
if self.image_token not in text[i]:
continue
parts = text[i].split(self.image_token)
rebuilt = [parts[0]]
for p in parts[1:]:
if images_to_match > 0:
rebuilt.append(self._IMG_MARKER)
images_to_match -= 1
elif videos_to_match > 0:
rebuilt.append(self._VID_MARKER)
videos_to_match -= 1
else:
rebuilt.append(self.image_token)
rebuilt.append(p)
text[i] = "".join(rebuilt)
if images is not None and image_grid_thw is not None:
merge_length = self.image_processor.merge_size ** 2
index = 0
for i in range(len(text)):
while self._IMG_MARKER in text[i]:
n = int(image_grid_thw[index].prod()) // merge_length
text[i] = text[i].replace(
self._IMG_MARKER, self.image_token * n, 1
)
index += 1
if videos is not None and video_grid_thw is not None:
merge_length = self.video_processor.merge_size ** 2
index = 0
for i in range(len(text)):
while self._VID_MARKER in text[i]:
n = int(video_grid_thw[index].prod()) // merge_length
text[i] = text[i].replace(
self._VID_MARKER, self.video_token * n, 1
)
index += 1
return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
return_mm_token_type_ids = output_kwargs["text_kwargs"].pop(
"return_mm_token_type_ids", False
)
text_inputs = self.tokenizer(
text, **output_kwargs["text_kwargs"], return_tensors=None
)
self._check_special_mm_tokens(
text, text_inputs, modalities=["image", "video"]
)
if return_mm_token_type_ids:
array_ids = np.array(text_inputs["input_ids"])
mm_token_type_ids = np.zeros_like(text_inputs["input_ids"])
mm_token_type_ids[array_ids == self.image_token_id] = 1
mm_token_type_ids[array_ids == self.video_token_id] = 2
text_inputs["mm_token_type_ids"] = mm_token_type_ids.tolist()
return BatchFeature(
data={**text_inputs, **image_inputs, **videos_inputs},
tensor_type=return_tensors,
)

Xet Storage Details

Size:
7.35 kB
·
Xet hash:
8f620b02951438df158b47d4ae08a260a0040583827946f6ed46e6246432db22

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.