# configuration_eo1_internvl.py — snapshot from Hugging Face Hub repo
# 0118/checkpoint-200000 (uploaded by jasonzhango via huggingface_hub,
# commit b5b48a7, verified).
# Copyright 2026 EO-Robotics Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import annotations
from transformers.configuration_utils import PretrainedConfig
class EO1InternVLPiFlowMatchingConfig(PretrainedConfig):
    """Configuration for the EO1 flow-matching model: an InternVL
    vision-language backbone paired with a Pi05-style action expert.

    Mirrors the key design points of `openpi.models.pi0` with `pi05=True`:
      * the prefix goes through the standard *causal* LM forward pass
        (flash-attention friendly) to build a per-layer KV cache;
      * action tokens attend bidirectionally among themselves and to the
        cached prefix KV;
      * the flow-matching timestep is injected into the action expert via
        AdaRMSNorm rather than concatenated into the embeddings;
      * the continuous state token in the suffix is disabled (encode state
        in text if it is needed).
    """

    model_type = "eo1_internvl_pi"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        backbone_name_or_path: str | None = None,
        # Flow matching
        action_chunk_size: int = 16,
        max_action_dim: int = 32,
        num_denoise_steps: int = 10,
        # Tokens
        action_token_id: int | None = None,
        action_pass_id: int | None = None,
        img_context_token_id: int | None = None,
        ignore_index: int = -100,
        # Expert init
        expert_init_from_backbone: bool = False,
        # Expert architecture (Pi05-style: smaller action expert than VLM)
        expert_num_hidden_layers: int | None = 18,
        expert_hidden_size: int | None = 1024,
        expert_intermediate_size: int | None = 3072,
        expert_num_attention_heads: int | None = 16,
        expert_layer_mapping: str = "last",
        **kwargs,
    ):
        # Coerce an optional value to int, letting None pass through.
        def _as_int_or_none(value):
            return None if value is None else int(value)

        self.backbone_name_or_path = backbone_name_or_path

        # Flow-matching hyperparameters (always coerced to int).
        self.action_chunk_size = int(action_chunk_size)
        self.max_action_dim = int(max_action_dim)
        self.num_denoise_steps = int(num_denoise_steps)

        # Special-token ids; stored as given (may be None).
        self.action_token_id = action_token_id
        self.action_pass_id = action_pass_id
        self.img_context_token_id = img_context_token_id
        self.ignore_index = int(ignore_index)

        # Action-expert settings; architecture fields are optional ints.
        self.expert_init_from_backbone = bool(expert_init_from_backbone)
        self.expert_num_hidden_layers = _as_int_or_none(expert_num_hidden_layers)
        self.expert_hidden_size = _as_int_or_none(expert_hidden_size)
        self.expert_intermediate_size = _as_int_or_none(expert_intermediate_size)
        self.expert_num_attention_heads = _as_int_or_none(expert_num_attention_heads)
        self.expert_layer_mapping = str(expert_layer_mapping)

        # Let PretrainedConfig consume the remaining kwargs last,
        # preserving the original initialization order.
        super().__init__(**kwargs)
# Register this config class for Hugging Face auto-class resolution so it can
# be loaded via AutoConfig with trust_remote_code from the Hub repo.
EO1InternVLPiFlowMatchingConfig.register_for_auto_class()