Update code for transformers 5.5.4
#2
by sjzhou - opened
- configuration_moss_vl.py +28 -4
configuration_moss_vl.py
CHANGED
@@ -69,6 +69,8 @@ class MossVLTextConfig(PretrainedConfig):
 
     model_type = "moss_vl_text"
     base_config_key = "text_config"
+    default_theta = 5000000.0
+    ignore_keys_at_rope_validation = {"mrope_section", "mrope_interleaved"}
 
     def __init__(
         self,
@@ -86,9 +88,11 @@ class MossVLTextConfig(PretrainedConfig):
         use_cache=True,
         tie_word_embeddings=False,
         rope_theta=5000000.0,
+        rope_parameters=None,
         rope_scaling=None,
         attention_bias=False,
         attention_dropout=0.0,
+        pad_token_id=None,
         # Cross attention specific
         cross_attention_layers=None,  # List of layer indices to insert cross attention
         **kwargs,
@@ -112,11 +116,31 @@ class MossVLTextConfig(PretrainedConfig):
         self.rms_norm_eps = rms_norm_eps
         self.use_cache = use_cache
         self.rope_theta = rope_theta
-
+        if rope_parameters is None:
+            if rope_scaling is not None:
+                rope_parameters = dict(rope_scaling)
+            else:
+                rope_parameters = {"rope_type": "default"}
+        else:
+            rope_parameters = dict(rope_parameters)
+
+        if "type" in rope_parameters and "rope_type" not in rope_parameters:
+            rope_parameters["rope_type"] = rope_parameters.pop("type")
+        rope_parameters.setdefault("rope_type", "default")
+        rope_parameters.setdefault("rope_theta", rope_theta)
+
+        self.rope_parameters = rope_parameters
+        self.rope_scaling = rope_scaling if rope_scaling is not None else dict(rope_parameters)
         self.attention_bias = attention_bias
         self.attention_dropout = attention_dropout
-
-
+        self.pad_token_id = pad_token_id
+
+        if hasattr(self, "standardize_rope_params"):
+            self.standardize_rope_params()
+            if hasattr(self, "validate_rope"):
+                self.validate_rope()
+        else:
+            rope_config_validation(self, ignore_keys=self.ignore_keys_at_rope_validation)
         self.cross_attention_layers = cross_attention_layers or [2, 6, 10, 14, 18, 22, 26, 30, 34, 38, 42, 46]
         super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
 
@@ -161,4 +185,4 @@ class MossVLConfig(PretrainedConfig):
         super().__init__(**kwargs, tie_word_embeddings=tie_word_embeddings)
 
 
-__all__ = ["MossVLConfig", "MossVLTextConfig"]
+__all__ = ["MossVLConfig", "MossVLTextConfig"]
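For reviewers, the rope handling added above can be read as one small normalization step: accept either the new rope_parameters dict or the legacy rope_scaling dict, copy it, rename the old "type" key to "rope_type", and fill in defaults for "rope_type" and "rope_theta". The sketch below only illustrates that logic; normalize_rope_parameters is a hypothetical helper name and is not part of configuration_moss_vl.py.

# Standalone sketch of the normalization performed in MossVLTextConfig.__init__
# (illustrative only; this helper does not exist in the repository).
def normalize_rope_parameters(rope_parameters=None, rope_scaling=None, rope_theta=5000000.0):
    if rope_parameters is None:
        # Fall back to the legacy rope_scaling dict when only that is given.
        rope_parameters = dict(rope_scaling) if rope_scaling is not None else {"rope_type": "default"}
    else:
        rope_parameters = dict(rope_parameters)

    # Older configs may store the scaling kind under "type" instead of "rope_type".
    if "type" in rope_parameters and "rope_type" not in rope_parameters:
        rope_parameters["rope_type"] = rope_parameters.pop("type")
    rope_parameters.setdefault("rope_type", "default")
    rope_parameters.setdefault("rope_theta", rope_theta)
    return rope_parameters

# A legacy-style rope_scaling dict is carried over and standardized:
print(normalize_rope_parameters(rope_scaling={"type": "linear", "factor": 2.0}))
# {'factor': 2.0, 'rope_type': 'linear', 'rope_theta': 5000000.0}

The hasattr checks at the end of __init__ cover the same compatibility concern at validation time: when the installed transformers exposes standardize_rope_params and validate_rope on the config, those are used; otherwise the change falls back to rope_config_validation with the keys in ignore_keys_at_rope_validation excluded.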