# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

"""Standalone HF processor for PanoVLM: image processor + tokenizer.

Expands each ``<|image|>`` placeholder into ``image_seq_length`` repeated image
tokens wrapped by ``<|begin_of_image|>``/``<|end_of_image|>``, then tokenizes.
``image_seq_length`` is fixed for a given image resolution and is stored in the
processor config at upload time.
"""

from __future__ import annotations

from transformers.feature_extraction_utils import BatchFeature
from transformers.processing_utils import ProcessorMixin


class PanoVLMProcessor(ProcessorMixin):
    """Pair an image processor with a tokenizer for PanoVLM prompts.

    Calling the processor expands every ``<|image|>`` placeholder in the text,
    strips trailing whitespace, tokenizes the result and (when images are
    given) merges the image processor's outputs into one ``BatchFeature``.
    """

    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "AutoImageProcessor"
    tokenizer_class = "AutoTokenizer"

    # PanoVLM image placeholder tokens. These are hardcoded class attributes:
    # they are neither configurable nor serialized.
    image_token = "<|image|>"
    boi_token = "<|begin_of_image|>"
    eoi_token = "<|end_of_image|>"

    def __init__(
        self,
        image_processor=None,
        tokenizer=None,
        image_seq_length=None,
        chat_template=None,
        **kwargs,
    ):
        """Store ``image_seq_length`` and hand the components to the base class.

        Args:
            image_processor: Image processor component.
            tokenizer: Tokenizer component.
            image_seq_length: Number of image tokens each ``<|image|>``
                placeholder expands to. May be ``None``; expansion then
                raises ``ValueError``.
            chat_template: Forwarded to ``ProcessorMixin.__init__``.
            **kwargs: Accepted but not used by this constructor.
        """
        # The scalar has to be assigned before super().__init__ (same pattern
        # as LlavaProcessor): ProcessorMixin.__init__ in transformers 5.x only
        # takes the modality arguments (image_processor, tokenizer) and
        # chat_template, so extra scalars cannot be passed through it.
        self.image_seq_length = image_seq_length
        super().__init__(image_processor, tokenizer, chat_template=chat_template)

    def _expand_image_tokens(self, text: str) -> str:
        """Return ``text`` with each ``<|image|>`` swapped for BOI + N image tokens + EOI.

        Raises:
            ValueError: If ``image_seq_length`` is ``None``.
        """
        seq_len = self.image_seq_length
        if seq_len is None:
            raise ValueError(
                "image_seq_length is not set on this processor. Pass "
                "image_seq_length=... when constructing, or load a processor saved "
                "with it set."
            )

        # One placeholder becomes: begin marker, seq_len image tokens, end marker.
        expanded = "".join(
            (self.boi_token, self.image_token * seq_len, self.eoi_token)
        )
        return text.replace(self.image_token, expanded)

    def __call__(self, text=None, images=None, return_tensors=None, **kwargs):
        """Expand image placeholders, tokenize, and optionally process images.

        Args:
            text: A single prompt string or a sequence of prompt strings.
            images: Optional images passed to the image processor as-is.
            return_tensors: Forwarded to the image processor, the tokenizer
                and the returned ``BatchFeature``.
            **kwargs: Forwarded to the tokenizer only.

        Returns:
            A ``BatchFeature`` holding the image processor outputs (if
            ``images`` was given) updated with the tokenizer outputs.

        Raises:
            ValueError: If ``text`` is ``None``, or if ``image_seq_length``
                is not set.
        """
        if text is None:
            raise ValueError("PanoVLMProcessor requires `text`.")

        prompts = [text] if isinstance(text, str) else text

        # Trailing whitespace is stripped before tokenizing. Chat templates end
        # with a trailing space (e.g. "ASSISTANT: ") which SentencePiece turns
        # into a standalone ▁ token. That token never occurs at this position
        # in training data (there the space is absorbed into the first
        # generated token), so leaving it in breaks the first-token
        # distribution. Same as the explicit .rstrip() in
        # SFTMultiModalProcessor.prepare_for_generation.
        prompts = [self._expand_image_tokens(prompt).rstrip() for prompt in prompts]

        features = {}
        # Image features go in first; tokenizer outputs are merged afterwards.
        if images is not None:
            features.update(
                self.image_processor(images, return_tensors=return_tensors)
            )
        features.update(
            self.tokenizer(prompts, return_tensors=return_tensors, **kwargs)
        )
        return BatchFeature(data=features, tensor_type=return_tensors)

    def batch_decode(self, *args, **kwargs):
        """Delegate to the tokenizer's ``batch_decode`` with the same arguments."""
        tokenizer = self.tokenizer
        return tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """Delegate to the tokenizer's ``decode`` with the same arguments."""
        tokenizer = self.tokenizer
        return tokenizer.decode(*args, **kwargs)