0118 / checkpoint-250000 /processing_eo1_internvl.py

Upload folder using huggingface_hub

be1e979 verified 3 months ago

1.97 kB

	"""
	EO1Vision processor for `eo_pi_internvl`.

	This is the InternVL-backbone EO1 processor with a Pi05-style action prompt:
	- We keep a single `<\|action_pad\|>` as a placeholder suffix token in text prompts.
	- The action expert consumes continuous action tokens (length=`action_chunk_size`) internally, so we do not need to
	repeat `<\|action_pad\|>` by chunk size in the text (this also keeps AR loss extensible).
	"""

	from __future__ import annotations

	from transformers.feature_extraction_utils import BatchFeature
	from transformers.image_utils import ImageInput
	from transformers.processing_utils import Unpack
	from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
	from transformers.video_utils import VideoInput

	from eo_internvl.model.processing_eo1_internvl import (
	DEFAULT_ACTION_TOKEN,
	EO1VisionProcessor as _BaseEO1VisionProcessor,
	EO1VisionProcessorKwargs,
	RobotInput,
	)


	class EO1VisionProcessor(_BaseEO1VisionProcessor):
	def expand_action_prompt(self, chunk_size: int) -> str:
	# Pi05-style: keep a single placeholder token in text; the model builds the full continuous action block.
	return DEFAULT_ACTION_TOKEN

	def __call__(
	self,
	images: ImageInput = None,
	text: TextInput \| PreTokenizedInput \| list[TextInput] \| list[PreTokenizedInput] = None,
	videos: VideoInput = None,
	states: RobotInput = None,
	actions: RobotInput = None,
	**kwargs: Unpack[EO1VisionProcessorKwargs],
	) -> BatchFeature:
	# Force action-token expansion length to 1 (no-op), regardless of robot_config / caller.
	text_kwargs = kwargs.get("text_kwargs") or {}
	text_kwargs = dict(text_kwargs)
	text_kwargs["noise_token_num"] = 1
	kwargs["text_kwargs"] = text_kwargs
	return super().__call__(images=images, text=text, videos=videos, states=states, actions=actions, **kwargs)


	EO1VisionProcessor.register_for_auto_class()