Update code for transformers 5.5.4
#2
by sjzhou - opened
- configuration_moss_vl.py +28 -4
configuration_moss_vl.py
CHANGED
@@ -69,6 +69,8 @@ class MossVLTextConfig(PretrainedConfig):
 
     model_type = "moss_vl_text"
     base_config_key = "text_config"
+    default_theta = 5000000.0
+    ignore_keys_at_rope_validation = {"mrope_section", "mrope_interleaved"}
 
     def __init__(
         self,
@@ -86,9 +88,11 @@ class MossVLTextConfig(PretrainedConfig):
         use_cache=True,
         tie_word_embeddings=False,
         rope_theta=5000000.0,
+        rope_parameters=None,
         rope_scaling=None,
         attention_bias=False,
         attention_dropout=0.0,
+        pad_token_id=None,
         # Cross attention specific
         cross_attention_layers=None,  # List of layer indices to insert cross attention
         **kwargs,
@@ -112,11 +116,31 @@ class MossVLTextConfig(PretrainedConfig):
         self.rms_norm_eps = rms_norm_eps
         self.use_cache = use_cache
         self.rope_theta = rope_theta
-
+        if rope_parameters is None:
+            if rope_scaling is not None:
+                rope_parameters = dict(rope_scaling)
+            else:
+                rope_parameters = {"rope_type": "default"}
+        else:
+            rope_parameters = dict(rope_parameters)
+
+        if "type" in rope_parameters and "rope_type" not in rope_parameters:
+            rope_parameters["rope_type"] = rope_parameters.pop("type")
+        rope_parameters.setdefault("rope_type", "default")
+        rope_parameters.setdefault("rope_theta", rope_theta)
+
+        self.rope_parameters = rope_parameters
+        self.rope_scaling = rope_scaling if rope_scaling is not None else dict(rope_parameters)
         self.attention_bias = attention_bias
         self.attention_dropout = attention_dropout
-
-
+        self.pad_token_id = pad_token_id
+
+        if hasattr(self, "standardize_rope_params"):
+            self.standardize_rope_params()
+            if hasattr(self, "validate_rope"):
+                self.validate_rope()
+        else:
+            rope_config_validation(self, ignore_keys=self.ignore_keys_at_rope_validation)
         self.cross_attention_layers = cross_attention_layers or [2, 6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46]
         super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
 
@@ -161,4 +185,4 @@ class MossVLConfig(PretrainedConfig):
         super().__init__(**kwargs, tie_word_embeddings=tie_word_embeddings)
 
 
-__all__ = ["MossVLConfig", "MossVLTextConfig"]
+__all__ = ["MossVLConfig", "MossVLTextConfig"]
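For reviewers, the rope handling added above can be read as one small normalization step: accept either the new rope_parameters dict or the legacy rope_scaling dict, copy it, rename the old "type" key to "rope_type", and fill in defaults for "rope_type" and "rope_theta". The sketch below only illustrates that logic; normalize_rope_parameters is a hypothetical helper name and is not part of configuration_moss_vl.py.

# Standalone sketch of the normalization performed in MossVLTextConfig.__init__
# (illustrative only; this helper does not exist in the repository).
def normalize_rope_parameters(rope_parameters=None, rope_scaling=None, rope_theta=5000000.0):
    if rope_parameters is None:
        # Fall back to the legacy rope_scaling dict when only that is given.
        rope_parameters = dict(rope_scaling) if rope_scaling is not None else {"rope_type": "default"}
    else:
        rope_parameters = dict(rope_parameters)

    # Older configs may store the scaling kind under "type" instead of "rope_type".
    if "type" in rope_parameters and "rope_type" not in rope_parameters:
        rope_parameters["rope_type"] = rope_parameters.pop("type")
    rope_parameters.setdefault("rope_type", "default")
    rope_parameters.setdefault("rope_theta", rope_theta)
    return rope_parameters

# A legacy-style rope_scaling dict is carried over and standardized:
print(normalize_rope_parameters(rope_scaling={"type": "linear", "factor": 2.0}))
# {'factor': 2.0, 'rope_type': 'linear', 'rope_theta': 5000000.0}

The hasattr checks at the end of __init__ cover the same compatibility concern at validation time: when the installed transformers exposes standardize_rope_params and validate_rope on the config, those are used; otherwise the change falls back to rope_config_validation with the keys in ignore_keys_at_rope_validation excluded.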