File size: 2,572 Bytes
b7b76ba |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 |
# Copyright 2025 EO-Robotics Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from transformers.configuration_utils import PretrainedConfig
from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import (
Qwen2_5_VLTextConfig,
Qwen2_5_VLVisionConfig,
)
class EO1VisionFlowMatchingConfig(PretrainedConfig):
    """Configuration for the ``"eo1"`` model type.

    Bundles a Qwen2.5-VL vision sub-config and text sub-config (see
    ``sub_configs``) together with special-token ids and action-related
    hyperparameters. All extra keyword arguments are forwarded to
    ``PretrainedConfig.__init__``.

    Args:
        text_config: ``None``, a ``dict`` of keyword arguments for
            ``Qwen2_5_VLTextConfig``, or an already-built config object.
            When ``None``, the text config is built from ``**kwargs``.
        vision_config: ``None``, a ``dict`` of keyword arguments for
            ``Qwen2_5_VLVisionConfig``, or an already-built config object.
            When ``None``, a vision config is built with
            ``hidden_size=1280``, ``out_hidden_size=2048`` and
            ``tokens_per_second=2``.
        image_token_id: Stored as ``self.image_token_id`` (presumably the
            vocabulary id of the image placeholder token).
        video_token_id: Stored as ``self.video_token_id`` (presumably the
            vocabulary id of the video placeholder token).
        action_chunk_size: Stored as ``self.action_chunk_size``
            (presumably the number of action steps per predicted chunk).
        max_action_dim: Stored as ``self.max_action_dim`` (presumably the
            padded action vector width).
        num_denoise_steps: Stored as ``self.num_denoise_steps``
            (presumably the number of flow-matching integration steps).
        action_act: Stored as ``self.action_act``; its interpretation is
            left to the model code.
        num_action_layers: Stored as ``self.num_action_layers``.
        state_token_id: Stored as ``self.state_token_id``.
        action_token_id: Stored as ``self.action_token_id``.
        action_pass_id: Stored as ``self.action_pass_id``.
        **kwargs: Forwarded to ``PretrainedConfig.__init__`` (and used to
            build the text config when ``text_config`` is ``None``).
    """

    model_type = "eo1"
    sub_configs = {"vision_config": Qwen2_5_VLVisionConfig, "text_config": Qwen2_5_VLTextConfig}
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        text_config=None,
        vision_config=None,
        image_token_id=151655,
        video_token_id=151656,
        action_chunk_size=50,
        max_action_dim=32,
        num_denoise_steps=10,
        action_act="linear",
        num_action_layers=2,
        state_token_id=151669,
        action_token_id=151666,
        action_pass_id=151672,
        **kwargs,
    ):
        if isinstance(vision_config, dict):
            self.vision_config = self.sub_configs["vision_config"](**vision_config)
        elif vision_config is None:
            self.vision_config = self.sub_configs["vision_config"](
                hidden_size=1280,
                out_hidden_size=2048,
                tokens_per_second=2,
            )
        else:
            # An already-constructed config object: keep it as-is instead of
            # silently leaving the attribute unset.
            self.vision_config = vision_config

        if isinstance(text_config, dict):
            self.text_config = self.sub_configs["text_config"](**text_config)
        elif text_config is None:
            # No explicit text config: build it from the flat kwargs.
            self.text_config = self.sub_configs["text_config"](**kwargs)
        else:
            # An already-constructed config object: keep it as-is.
            self.text_config = text_config

        self.image_token_id = image_token_id
        self.video_token_id = video_token_id
        self.state_token_id = state_token_id
        self.action_token_id = action_token_id
        self.action_pass_id = action_pass_id

        self.action_chunk_size = action_chunk_size
        self.max_action_dim = max_action_dim
        self.num_denoise_steps = num_denoise_steps
        self.action_act = action_act
        self.num_action_layers = num_action_layers

        # Base-class initialisation runs last, after the sub-configs and
        # custom fields are in place (same ordering as the original code).
        super().__init__(**kwargs)
# Import-time side effect: register this custom config class with the
# transformers auto-class machinery.
# NOTE(review): with no argument this presumably targets `AutoConfig`
# (the documented default) — confirm against the installed transformers version.
EO1VisionFlowMatchingConfig.register_for_auto_class()
|