LEE181204
/

BadSpa_object_fir

+# coding=utf-8
+# Copyright 2024 Microsoft Research & University of Wisconsin-Madison and the HuggingFace Inc. team. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import warnings
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+from transformers import CONFIG_MAPPING, AutoConfig
+logger = logging.get_logger(__name__)
+class SpatialVLAConfig(PretrainedConfig):
+    model_type = "spatialvla"
+    sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig, "vision_zoe_config": AutoConfig}
+    def __init__(
+        self,
+        vision_config=None,
+        text_config=None,
+        ignore_index=-100,
+        image_token_index=256000,
+        vocab_size=257152,
+        projection_dim=2048,
+        hidden_size=2048,
+        vision_zoe_config=None,
+        action_token_begin_idx=None,
+        spatial_token_num=259,
+        use_spatial_token=False,
+        ego3d_patch_reso=4,
+        n_freqs=8,
+        use_vision_zoe=True,
+        **kwargs,
+    ):
+        self._ignore_index = ignore_index
+        self.image_token_index = image_token_index
+        self._vocab_size = vocab_size
+        self.projection_dim = projection_dim
+        self.hidden_size = hidden_size
+        self.vision_config = vision_config
+        self.is_encoder_decoder = False
+        if isinstance(self.vision_config, dict):
+            vision_config["model_type"] = (
+                vision_config["model_type"] if "model_type" in vision_config else "siglip_vision_model"
+            )
+            self.vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config)
+        elif vision_config is None:
+            self.vision_config = CONFIG_MAPPING["siglip_vision_model"](
+                intermediate_size=4096,
+                hidden_size=1152,
+                patch_size=14,
+                image_size=224,
+                num_hidden_layers=27,
+                num_attention_heads=16,
+                vocab_size=257152,
+                vision_use_head=False,
+            )
+        self.text_config = text_config
+        if isinstance(self.text_config, dict):
+            text_config["model_type"] = text_config["model_type"] if "model_type" in text_config else "gemma2"
+            self.text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
+        elif text_config is None:
+            self.text_config = CONFIG_MAPPING["gemma2"](
+                hidden_size=2048,
+                num_hidden_layers=18,
+                intermediate_size=16384,
+                num_attention_heads=8,
+                num_key_value_heads=1,
+                is_encoder_decoder=False,
+                vocab_size=vocab_size,
+            )
+        self.text_config.num_image_tokens = (self.vision_config.image_size // self.vision_config.patch_size) ** 2
+        self.vision_config.projection_dim = projection_dim
+        # vision zoe config
+        self.vision_zoe_config = vision_zoe_config
+        if isinstance(self.vision_zoe_config, dict):
+            vision_zoe_config["model_type"] = vision_zoe_config["model_type"] if "model_type" in vision_zoe_config else "zoedepth"
+            self.vision_zoe_config = CONFIG_MAPPING[vision_zoe_config["model_type"]](**vision_zoe_config)
+        else:
+            pass
+        # additional attributes
+        self.action_token_begin_idx = action_token_begin_idx
+        self.spatial_token_num = spatial_token_num
+        self.use_spatial_token = use_spatial_token
+        self.ego3d_patch_reso = ego3d_patch_reso
+        self.n_freqs = n_freqs
+        self.use_vision_zoe = use_vision_zoe
+        super().__init__(**kwargs)
+    @property
+    def ignore_index(self):
+        warnings.warn(
+            "The `ignore_index` attribute is deprecated and will be removed in v4.47.",
+            FutureWarning,
+        )
+        return self._ignore_index
+    @ignore_index.setter
+    def ignore_index(self, value):
+        self._ignore_index = value
+    def to_dict(self):
+        output = super().to_dict()
+        output.pop("_ignore_index", None)
+        return output