# configuration_eo1_internvl.py — snapshot from Hugging Face Hub repo
# 0118/checkpoint-200000 (uploaded by jasonzhango via huggingface_hub,
# commit b5b48a7, verified).
# Copyright 2026 EO-Robotics Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
from transformers.configuration_utils import PretrainedConfig
class EO1InternVLPiFlowMatchingConfig(PretrainedConfig):
    """Configuration for the EO1 flow-matching model: an InternVL
    vision-language backbone paired with a Pi05-style action expert.

    Mirrors the key design points of `openpi.models.pi0` with `pi05=True`:
      * the prefix goes through the standard *causal* LM forward pass
        (flash-attention friendly) to build a per-layer KV cache;
      * action tokens attend bidirectionally among themselves and to the
        cached prefix KV;
      * the flow-matching timestep is injected into the action expert via
        AdaRMSNorm rather than concatenated into the embeddings;
      * the continuous state token in the suffix is disabled (encode state
        in text if it is needed).
    """

    model_type = "eo1_internvl_pi"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        backbone_name_or_path: str | None = None,
        # Flow matching
        action_chunk_size: int = 16,
        max_action_dim: int = 32,
        num_denoise_steps: int = 10,
        # Tokens
        action_token_id: int | None = None,
        action_pass_id: int | None = None,
        img_context_token_id: int | None = None,
        ignore_index: int = -100,
        # Expert init
        expert_init_from_backbone: bool = False,
        # Expert architecture (Pi05-style: smaller action expert than VLM)
        expert_num_hidden_layers: int | None = 18,
        expert_hidden_size: int | None = 1024,
        expert_intermediate_size: int | None = 3072,
        expert_num_attention_heads: int | None = 16,
        expert_layer_mapping: str = "last",
        **kwargs,
    ):
        # Coerce an optional value to int, letting None pass through.
        def _as_int_or_none(value):
            return None if value is None else int(value)

        self.backbone_name_or_path = backbone_name_or_path

        # Flow-matching hyperparameters (always coerced to int).
        self.action_chunk_size = int(action_chunk_size)
        self.max_action_dim = int(max_action_dim)
        self.num_denoise_steps = int(num_denoise_steps)

        # Special-token ids; stored as given (may be None).
        self.action_token_id = action_token_id
        self.action_pass_id = action_pass_id
        self.img_context_token_id = img_context_token_id
        self.ignore_index = int(ignore_index)

        # Action-expert settings; architecture fields are optional ints.
        self.expert_init_from_backbone = bool(expert_init_from_backbone)
        self.expert_num_hidden_layers = _as_int_or_none(expert_num_hidden_layers)
        self.expert_hidden_size = _as_int_or_none(expert_hidden_size)
        self.expert_intermediate_size = _as_int_or_none(expert_intermediate_size)
        self.expert_num_attention_heads = _as_int_or_none(expert_num_attention_heads)
        self.expert_layer_mapping = str(expert_layer_mapping)

        # Let PretrainedConfig consume the remaining kwargs last,
        # preserving the original initialization order.
        super().__init__(**kwargs)
# Register this config class for Hugging Face auto-class resolution so it can
# be loaded via AutoConfig with trust_remote_code from the Hub repo.
EO1InternVLPiFlowMatchingConfig.register_for_auto_class()