# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

"""Standalone HF processor for PanoVLM: image processor + tokenizer.

Expands each ``<|image|>`` placeholder into ``image_seq_length`` repeated image
tokens wrapped by ``<|begin_of_image|>``/``<|end_of_image|>``, then tokenizes.
``image_seq_length`` is fixed for a given image resolution and is stored in the
processor config at upload time.
"""

from __future__ import annotations

from transformers.feature_extraction_utils import BatchFeature
from transformers.processing_utils import ProcessorMixin


class PanoVLMProcessor(ProcessorMixin):
    """Processor that pairs an image processor with a tokenizer for PanoVLM.

    Every ``<|image|>`` placeholder in the input text is expanded to
    ``<|begin_of_image|>`` + ``image_seq_length`` copies of ``<|image|>`` +
    ``<|end_of_image|>`` before the text is handed to the tokenizer.  Images,
    when given, are passed to the image processor and its outputs are merged
    with the tokenizer outputs into a single ``BatchFeature``.
    """

    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "AutoImageProcessor"
    tokenizer_class = "AutoTokenizer"

    # Hardcoded PanoVLM image placeholder tokens (not configurable / not serialized).
    image_token = "<|image|>"
    boi_token = "<|begin_of_image|>"
    eoi_token = "<|end_of_image|>"

    def __init__(
        self,
        image_processor=None,
        tokenizer=None,
        image_seq_length: int | None = None,
        chat_template=None,
        **kwargs,
    ):
        """Store ``image_seq_length`` and initialise the ``ProcessorMixin`` base.

        Args:
            image_processor: Object called on ``images`` in ``__call__``.
            tokenizer: Object called on the expanded text in ``__call__``.
            image_seq_length: Number of ``<|image|>`` tokens each placeholder
                expands to.  May be ``None``; expansion then fails as soon as
                a text actually contains a placeholder.
            chat_template: Forwarded unchanged to ``ProcessorMixin.__init__``.
            **kwargs: Accepted and ignored by this constructor.
        """
        # Set extra scalar attrs before super().__init__ (mirrors LlavaProcessor pattern).
        # ProcessorMixin.__init__ in transformers 5.x only accepts modality args
        # (image_processor, tokenizer) plus chat_template — extra scalars like
        # image_seq_length must be stored before the super() call.
        self.image_seq_length = image_seq_length
        super().__init__(image_processor, tokenizer, chat_template=chat_template)

    def _expand_image_tokens(self, text: str) -> str:
        """Replace each ``<|image|>`` with BOI + N image tokens + EOI.

        Text without any ``<|image|>`` placeholder is returned unchanged, so
        text-only prompts do not require ``image_seq_length`` to be set.

        Raises:
            ValueError: If ``text`` contains a placeholder but
                ``image_seq_length`` is ``None``.
        """
        # Nothing to expand: do not demand image_seq_length for text-only input.
        if self.image_token not in text:
            return text
        if self.image_seq_length is None:
            raise ValueError(
                "image_seq_length is not set on this processor. Pass "
                "image_seq_length=... when constructing, or load a processor saved "
                "with it set."
            )
        block = (
            self.boi_token + (self.image_token * self.image_seq_length) + self.eoi_token
        )
        # str.replace scans the original string once, so the image tokens inside
        # ``block`` are not themselves re-expanded.
        return text.replace(self.image_token, block)

    def __call__(
        self, text=None, images=None, return_tensors=None, **kwargs
    ) -> BatchFeature:
        """Expand image placeholders, tokenize, and optionally process images.

        Args:
            text: A single string or an iterable of strings.  Required.
            images: Optional input for the image processor; skipped when ``None``.
            return_tensors: Forwarded to the image processor, the tokenizer and
                the returned ``BatchFeature``.
            **kwargs: Forwarded to the tokenizer only.

        Returns:
            A ``BatchFeature`` holding the image processor outputs (if any)
            updated with the tokenizer outputs.

        Raises:
            ValueError: If ``text`` is ``None``, or if a text contains an image
                placeholder while ``image_seq_length`` is unset.
        """
        if text is None:
            raise ValueError("PanoVLMProcessor requires `text`.")
        if isinstance(text, str):
            text = [text]
        # rstrip before tokenizing: chat templates end with a trailing space (e.g.
        # "ASSISTANT: ") that SentencePiece tokenizes as a standalone ▁ token.  That
        # token never appears at this position in training data (the space is absorbed
        # into the first generated token there), so it breaks the first-token distribution.
        # Mirrors the explicit .rstrip() in SFTMultiModalProcessor.prepare_for_generation.
        text = [self._expand_image_tokens(t).rstrip() for t in text]

        data = {}
        if images is not None:
            image_inputs = self.image_processor(images, return_tensors=return_tensors)
            data.update(image_inputs)
        # Tokenizer outputs are merged last, so they win on any key collision.
        text_inputs = self.tokenizer(text, return_tensors=return_tensors, **kwargs)
        data.update(text_inputs)
        return BatchFeature(data=data, tensor_type=return_tensors)

    def batch_decode(self, *args, **kwargs):
        """Forward all arguments to ``self.tokenizer.batch_decode``."""
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """Forward all arguments to ``self.tokenizer.decode``."""
        return self.tokenizer.decode(*args, **kwargs)