|
|
""" |
|
|
EO1Vision processor for `eo_pi_internvl`.

This is the InternVL-backbone EO1 processor with a Pi05-style action prompt:
- We keep a *single* `<|action_pad|>` as a placeholder suffix token in text prompts.
- The action expert consumes *continuous* action tokens (length=`action_chunk_size`) internally, so we do not need to
  repeat `<|action_pad|>` by chunk size in the text (this also keeps AR loss extensible).
|
|
""" |
|
|
|
|
|
from __future__ import annotations |
|
|
|
|
|
from transformers.feature_extraction_utils import BatchFeature |
|
|
from transformers.image_utils import ImageInput |
|
|
from transformers.processing_utils import Unpack |
|
|
from transformers.tokenization_utils_base import PreTokenizedInput, TextInput |
|
|
from transformers.video_utils import VideoInput |
|
|
|
|
|
from eo_internvl.model.processing_eo1_internvl import ( |
|
|
DEFAULT_ACTION_TOKEN, |
|
|
EO1VisionProcessor as _BaseEO1VisionProcessor, |
|
|
EO1VisionProcessorKwargs, |
|
|
RobotInput, |
|
|
) |
|
|
|
|
|
|
|
|
class EO1VisionProcessor(_BaseEO1VisionProcessor):
    """EO1 processor variant that emits a single action placeholder token.

    Specializes the base processor imported from
    ``eo_internvl.model.processing_eo1_internvl`` in two places:

    * ``expand_action_prompt`` always returns one ``DEFAULT_ACTION_TOKEN``.
    * ``__call__`` pins ``text_kwargs["noise_token_num"]`` to ``1`` before
      delegating to the base implementation.
    """

    def expand_action_prompt(self, chunk_size: int) -> str:
        """Return the action prompt text.

        Args:
            chunk_size: Ignored by this implementation; the returned prompt
                does not depend on it. (Presumably kept so the signature
                matches the base class -- confirm against the base.)

        Returns:
            ``DEFAULT_ACTION_TOKEN``, exactly once.
        """
        return DEFAULT_ACTION_TOKEN

    def __call__(
        self,
        images: ImageInput = None,
        text: TextInput | PreTokenizedInput | list[TextInput] | list[PreTokenizedInput] = None,
        videos: VideoInput = None,
        states: RobotInput = None,
        actions: RobotInput = None,
        **kwargs: Unpack[EO1VisionProcessorKwargs],
    ) -> BatchFeature:
        """Run the base processor with ``noise_token_num`` forced to ``1``.

        All positional inputs are forwarded unchanged. Any ``text_kwargs``
        supplied by the caller is copied, its ``noise_token_num`` entry is
        overwritten with ``1`` (a caller-provided value is not honoured), and
        the copy is passed on in place of the original.

        Returns:
            Whatever the base class ``__call__`` returns for these inputs.
        """
        # Work on a copy so a caller-owned ``text_kwargs`` mapping is never mutated.
        pinned_text_kwargs = dict(kwargs.get("text_kwargs") or {})
        # NOTE(review): presumably 1 matches the single action placeholder
        # returned by ``expand_action_prompt`` -- confirm against the base class.
        pinned_text_kwargs.update(noise_token_num=1)
        kwargs["text_kwargs"] = pinned_text_kwargs

        return super().__call__(
            images=images,
            text=text,
            videos=videos,
            states=states,
            actions=actions,
            **kwargs,
        )
|
|
|
|
|
|
|
|
# Import-time side effect: register this processor subclass for auto-class
# loading. NOTE(review): `register_for_auto_class` is presumably inherited from
# transformers' ProcessorMixin via the base class -- confirm.
EO1VisionProcessor.register_for_auto_class()
|
|
|