| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| from __future__ import annotations |
|
|
| from transformers.configuration_utils import PretrainedConfig |
|
|
|
|
class EO1InternVLPiFlowMatchingConfig(PretrainedConfig):
    """
    EO1 Flow-Matching wrapper for InternVL backbone + Pi05-style action expert.

    Pi05 key properties (mirrors `openpi.models.pi0` with `pi05=True`):
    - Prefix uses standard *causal* LM forward (flash-attn friendly) to build a per-layer KV cache.
    - Action block is bidirectional within itself and can attend to the cached prefix KV.
    - Flow-matching timestep is injected via AdaRMSNorm in the action expert (not concatenated into embeddings).
    - Continuous state token in suffix is *disabled* (state should be encoded in text if needed).
    """

    model_type = "eo1_internvl_pi"
    # Transient runtime state that should not travel with inference outputs.
    keys_to_ignore_at_inference = ["past_key_values"]

    @staticmethod
    def _opt_int(value: int | None) -> int | None:
        """Coerce *value* to ``int``, passing ``None`` through unchanged."""
        return None if value is None else int(value)

    def __init__(
        self,
        backbone_name_or_path: str | None = None,
        action_chunk_size: int = 16,
        max_action_dim: int = 32,
        num_denoise_steps: int = 10,
        action_token_id: int | None = None,
        action_pass_id: int | None = None,
        img_context_token_id: int | None = None,
        ignore_index: int = -100,
        expert_init_from_backbone: bool = False,
        expert_num_hidden_layers: int | None = 18,
        expert_hidden_size: int | None = 1024,
        expert_intermediate_size: int | None = 3072,
        expert_num_attention_heads: int | None = 16,
        expert_layer_mapping: str = "last",
        **kwargs,
    ):
        """
        Args:
            backbone_name_or_path: HF hub id or local path of the InternVL
                backbone to wrap, or ``None`` if resolved elsewhere.
            action_chunk_size: Length of the predicted action chunk (number of
                action steps per forward pass).
            max_action_dim: Upper bound on the per-step action dimensionality
                (actions are presumably padded to this width — confirm in model).
            num_denoise_steps: Number of flow-matching integration steps used
                at inference time.
            action_token_id: Vocabulary id marking action positions, if any.
            action_pass_id: Vocabulary id for action pass-through tokens, if any.
            img_context_token_id: Vocabulary id of the image-context placeholder
                token used by InternVL-style prompting.
            ignore_index: Label value excluded from the LM loss (HF convention
                is -100).
            expert_init_from_backbone: If True, initialize the action expert
                from backbone weights instead of from scratch.
            expert_num_hidden_layers: Layer count of the action expert, or
                ``None`` to defer to a derived default.
            expert_hidden_size: Hidden width of the action expert, or ``None``.
            expert_intermediate_size: MLP width of the action expert, or ``None``.
            expert_num_attention_heads: Attention head count of the action
                expert, or ``None``.
            expert_layer_mapping: Strategy name for mapping expert layers onto
                backbone layers (e.g. ``"last"``).
            **kwargs: Forwarded to :class:`PretrainedConfig`.
        """
        self.backbone_name_or_path = backbone_name_or_path

        # Action head geometry / sampling — coerced to int so JSON round-trips
        # and string-typed CLI overrides cannot leave floats/strs behind.
        self.action_chunk_size = int(action_chunk_size)
        self.max_action_dim = int(max_action_dim)
        self.num_denoise_steps = int(num_denoise_steps)

        # Special token ids stay as given (None means "unset / resolve later").
        self.action_token_id = action_token_id
        self.action_pass_id = action_pass_id
        self.img_context_token_id = img_context_token_id
        self.ignore_index = int(ignore_index)

        # Action-expert architecture; None values mean "use a derived default".
        self.expert_init_from_backbone = bool(expert_init_from_backbone)
        self.expert_num_hidden_layers = self._opt_int(expert_num_hidden_layers)
        self.expert_hidden_size = self._opt_int(expert_hidden_size)
        self.expert_intermediate_size = self._opt_int(expert_intermediate_size)
        self.expert_num_attention_heads = self._opt_int(expert_num_attention_heads)
        self.expert_layer_mapping = str(expert_layer_mapping)

        super().__init__(**kwargs)
|
|
|
|
# Register the config with transformers' Auto* machinery so checkpoints
# carrying this custom class can be loaded (e.g. via AutoConfig with
# trust_remote_code=True). NOTE(review): semantics come from the
# PretrainedConfig base class — confirm against the transformers version in use.
EO1InternVLPiFlowMatchingConfig.register_for_auto_class()
|
|