PanoVLM-500M / processing_panovlm.py
sd24's picture
Upload folder using huggingface_hub
a4f7250 verified
Raw
History Blame Contribute Delete
3.57 kB
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
"""Standalone HF processor for PanoVLM: image processor + tokenizer.
Expands each ``<|image|>`` placeholder into ``image_seq_length`` repeated image
tokens wrapped by ``<|begin_of_image|>``/``<|end_of_image|>``, then tokenizes.
``image_seq_length`` is fixed for a given image resolution and is stored in the
processor config at upload time.
"""
from __future__ import annotations
from transformers.feature_extraction_utils import BatchFeature
from transformers.processing_utils import ProcessorMixin
class PanoVLMProcessor(ProcessorMixin):
    """Bundle an image processor and a tokenizer behind a single call.

    Every ``<|image|>`` placeholder found in the input text is rewritten as
    ``<|begin_of_image|>`` + ``image_seq_length`` copies of ``<|image|>`` +
    ``<|end_of_image|>`` before the text is handed to the tokenizer.
    """

    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "AutoImageProcessor"
    tokenizer_class = "AutoTokenizer"

    # PanoVLM placeholder tokens are fixed here as class-level constants
    # (not configurable / not serialized).
    image_token = "<|image|>"
    boi_token = "<|begin_of_image|>"
    eoi_token = "<|end_of_image|>"

    def __init__(
        self,
        image_processor=None,
        tokenizer=None,
        image_seq_length=None,
        chat_template=None,
        **kwargs,
    ):
        """Store ``image_seq_length`` and initialise the base processor.

        Any additional ``**kwargs`` are accepted but not used here.
        """
        # Same ordering as the LlavaProcessor pattern: the extra scalar is
        # assigned first, because ProcessorMixin.__init__ in transformers 5.x
        # only takes the modality arguments (image_processor, tokenizer) and
        # chat_template, so image_seq_length has to be set outside of it.
        self.image_seq_length = image_seq_length
        super().__init__(image_processor, tokenizer, chat_template=chat_template)

    def _expand_image_tokens(self, text: str) -> str:
        """Return ``text`` with every ``<|image|>`` swapped for BOI + N image tokens + EOI.

        Raises:
            ValueError: if ``image_seq_length`` has not been set on this processor.
        """
        repeat_count = self.image_seq_length
        if repeat_count is None:
            raise ValueError(
                "image_seq_length is not set on this processor. Pass "
                "image_seq_length=... when constructing, or load a processor saved "
                "with it set."
            )

        repeated_tokens = self.image_token * repeat_count
        expanded = f"{self.boi_token}{repeated_tokens}{self.eoi_token}"
        return text.replace(self.image_token, expanded)

    def __call__(self, text=None, images=None, return_tensors=None, **kwargs):
        """Expand image placeholders, tokenize, and optionally process images.

        Args:
            text: A single prompt string or a sequence of prompt strings. Required.
            images: Optional input forwarded to the image processor; when ``None``
                the image processor is not called.
            return_tensors: Forwarded to the image processor, the tokenizer and the
                returned ``BatchFeature``.
            **kwargs: Forwarded to the tokenizer only.

        Returns:
            A ``BatchFeature`` holding the image processor outputs (if any) updated
            with the tokenizer outputs.

        Raises:
            ValueError: if ``text`` is ``None`` or ``image_seq_length`` is unset.
        """
        if text is None:
            raise ValueError("PanoVLMProcessor requires `text`.")

        raw_prompts = [text] if isinstance(text, str) else text

        # Trailing whitespace is stripped before tokenizing. Chat templates end
        # with a trailing space (e.g. "ASSISTANT: ") which SentencePiece would
        # tokenize as a standalone ▁ token. That token never occurs at this
        # position in training data (there the space is absorbed into the first
        # generated token), so leaving it in breaks the first-token distribution.
        # Mirrors the explicit .rstrip() in
        # SFTMultiModalProcessor.prepare_for_generation.
        prompts = [
            self._expand_image_tokens(prompt).rstrip() for prompt in raw_prompts
        ]

        features = {}
        if images is not None:
            features.update(
                self.image_processor(images, return_tensors=return_tensors)
            )

        # Tokenizer outputs are merged last, after any image features.
        features.update(
            self.tokenizer(prompts, return_tensors=return_tensors, **kwargs)
        )
        return BatchFeature(data=features, tensor_type=return_tensors)

    def batch_decode(self, *args, **kwargs):
        """Delegate to ``self.tokenizer.batch_decode`` with the same arguments."""
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """Delegate to ``self.tokenizer.decode`` with the same arguments."""
        return self.tokenizer.decode(*args, **kwargs)