MeowFET committed on
Commit ·
ac14fde
1
Parent(s): 3227ceb
feat: update model names, make vocab_size divisible by 128
Browse files- config.json +7 -7
- configuration_minicpm.py → configuration_cauchy.py +16 -16
- modeling_minicpm.py → modeling_cauchy.py +69 -69
- pytorch_model.bin +2 -2
config.json
CHANGED
|
@@ -3,16 +3,16 @@
|
|
| 3 |
"_ori_bos_token_id": 1,
|
| 4 |
"_ori_eos_token_id": 2,
|
| 5 |
"architectures": [
|
| 6 |
-
"
|
| 7 |
],
|
| 8 |
"attention_bias": false,
|
| 9 |
"attention_dropout": 0.0,
|
| 10 |
"auto_map": {
|
| 11 |
-
"AutoConfig": "
|
| 12 |
-
"AutoModel": "
|
| 13 |
-
"AutoModelForCausalLM": "
|
| 14 |
-
"AutoModelForSeq2SeqLM": "
|
| 15 |
-
"AutoModelForSequenceClassification": "
|
| 16 |
},
|
| 17 |
"bos_token_id": 151643,
|
| 18 |
"dim_model_base": 256,
|
|
@@ -37,5 +37,5 @@
|
|
| 37 |
"torch_dtype": "bfloat16",
|
| 38 |
"transformers_version": "4.43.3",
|
| 39 |
"use_cache": true,
|
| 40 |
-
"vocab_size":
|
| 41 |
}
|
|
|
|
| 3 |
"_ori_bos_token_id": 1,
|
| 4 |
"_ori_eos_token_id": 2,
|
| 5 |
"architectures": [
|
| 6 |
+
"CauchyForCausalLM"
|
| 7 |
],
|
| 8 |
"attention_bias": false,
|
| 9 |
"attention_dropout": 0.0,
|
| 10 |
"auto_map": {
|
| 11 |
+
"AutoConfig": "configuration_cauchy.CauchyConfig",
|
| 12 |
+
"AutoModel": "modeling_cauchy.CauchyModel",
|
| 13 |
+
"AutoModelForCausalLM": "modeling_cauchy.CauchyForCausalLM",
|
| 14 |
+
"AutoModelForSeq2SeqLM": "modeling_cauchy.CauchyForCausalLM",
|
| 15 |
+
"AutoModelForSequenceClassification": "modeling_cauchy.CauchyForSequenceClassification"
|
| 16 |
},
|
| 17 |
"bos_token_id": 151643,
|
| 18 |
"dim_model_base": 256,
|
|
|
|
| 37 |
"torch_dtype": "bfloat16",
|
| 38 |
"transformers_version": "4.43.3",
|
| 39 |
"use_cache": true,
|
| 40 |
+
"vocab_size": 151680
|
| 41 |
}
|
configuration_minicpm.py → configuration_cauchy.py
RENAMED
|
@@ -17,7 +17,7 @@
|
|
| 17 |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 18 |
# See the License for the specific language governing permissions and
|
| 19 |
# limitations under the License.
|
| 20 |
-
"""
|
| 21 |
|
| 22 |
from transformers.configuration_utils import PretrainedConfig
|
| 23 |
from transformers.utils import logging
|
|
@@ -25,14 +25,14 @@ from transformers.utils import logging
|
|
| 25 |
|
| 26 |
logger = logging.get_logger(__name__)
|
| 27 |
|
| 28 |
-
|
| 29 |
|
| 30 |
|
| 31 |
-
class
|
| 32 |
r"""
|
| 33 |
-
This is the configuration class to store the configuration of a [`
|
| 34 |
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
|
| 35 |
-
defaults will yield a similar configuration to that of the
|
| 36 |
|
| 37 |
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
|
| 38 |
documentation from [`PretrainedConfig`] for more information.
|
|
@@ -40,8 +40,8 @@ class MiniCPMConfig(PretrainedConfig):
|
|
| 40 |
|
| 41 |
Args:
|
| 42 |
vocab_size (`int`, *optional*, defaults to 32000):
|
| 43 |
-
Vocabulary size of the
|
| 44 |
-
`inputs_ids` passed when calling [`
|
| 45 |
hidden_size (`int`, *optional*, defaults to 4096):
|
| 46 |
Dimension of the hidden representations.
|
| 47 |
intermediate_size (`int`, *optional*, defaults to 11008):
|
|
@@ -61,8 +61,8 @@ class MiniCPMConfig(PretrainedConfig):
|
|
| 61 |
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
| 62 |
The non-linear activation function (function or string) in the decoder.
|
| 63 |
max_position_embeddings (`int`, *optional*, defaults to 2048):
|
| 64 |
-
The maximum sequence length that this model might ever be used with.
|
| 65 |
-
|
| 66 |
initializer_range (`float`, *optional*, defaults to 0.02):
|
| 67 |
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
| 68 |
rms_norm_eps (`float`, *optional*, defaults to 1e-06):
|
|
@@ -91,7 +91,7 @@ class MiniCPMConfig(PretrainedConfig):
|
|
| 91 |
`{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
|
| 92 |
`max_position_embeddings` to the expected new maximum. See the following thread for more information on how
|
| 93 |
these scaling strategies behave:
|
| 94 |
-
https://www.reddit.com/r/
|
| 95 |
experimental feature, subject to breaking API changes in future versions.
|
| 96 |
attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
|
| 97 |
Whether to use a bias in the query, key, value and output projection layers during self-attention.
|
|
@@ -99,19 +99,19 @@ class MiniCPMConfig(PretrainedConfig):
|
|
| 99 |
The dropout ratio for the attention probabilities.
|
| 100 |
|
| 101 |
```python
|
| 102 |
-
>>> from transformers import
|
| 103 |
|
| 104 |
-
>>> # Initializing a
|
| 105 |
-
>>> configuration =
|
| 106 |
|
| 107 |
-
>>> # Initializing a model from the
|
| 108 |
-
>>> model =
|
| 109 |
|
| 110 |
>>> # Accessing the model configuration
|
| 111 |
>>> configuration = model.config
|
| 112 |
```"""
|
| 113 |
|
| 114 |
-
model_type = "
|
| 115 |
keys_to_ignore_at_inference = ["past_key_values"]
|
| 116 |
|
| 117 |
def __init__(
|
|
|
|
| 17 |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 18 |
# See the License for the specific language governing permissions and
|
| 19 |
# limitations under the License.
|
| 20 |
+
""" Cauchy model configuration"""
|
| 21 |
|
| 22 |
from transformers.configuration_utils import PretrainedConfig
|
| 23 |
from transformers.utils import logging
|
|
|
|
| 25 |
|
| 26 |
logger = logging.get_logger(__name__)
|
| 27 |
|
| 28 |
+
CAUCHY_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
|
| 29 |
|
| 30 |
|
| 31 |
+
class CauchyConfig(PretrainedConfig):
|
| 32 |
r"""
|
| 33 |
+
This is the configuration class to store the configuration of a [`CauchyModel`]. It is used to instantiate a Cauchy
|
| 34 |
model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
|
| 35 |
+
defaults will yield a similar configuration to that of the Cauchy-7B.
|
| 36 |
|
| 37 |
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
|
| 38 |
documentation from [`PretrainedConfig`] for more information.
|
|
|
|
| 40 |
|
| 41 |
Args:
|
| 42 |
vocab_size (`int`, *optional*, defaults to 32000):
|
| 43 |
+
Vocabulary size of the Cauchy model. Defines the number of different tokens that can be represented by the
|
| 44 |
+
`inputs_ids` passed when calling [`CauchyModel`]
|
| 45 |
hidden_size (`int`, *optional*, defaults to 4096):
|
| 46 |
Dimension of the hidden representations.
|
| 47 |
intermediate_size (`int`, *optional*, defaults to 11008):
|
|
|
|
| 61 |
hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
|
| 62 |
The non-linear activation function (function or string) in the decoder.
|
| 63 |
max_position_embeddings (`int`, *optional*, defaults to 2048):
|
| 64 |
+
The maximum sequence length that this model might ever be used with. Cauchy 1 supports up to 2048 tokens,
|
| 65 |
+
Cauchy 2 up to 4096, CodeCauchy up to 16384.
|
| 66 |
initializer_range (`float`, *optional*, defaults to 0.02):
|
| 67 |
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
| 68 |
rms_norm_eps (`float`, *optional*, defaults to 1e-06):
|
|
|
|
| 91 |
`{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
|
| 92 |
`max_position_embeddings` to the expected new maximum. See the following thread for more information on how
|
| 93 |
these scaling strategies behave:
|
| 94 |
+
https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
|
| 95 |
experimental feature, subject to breaking API changes in future versions.
|
| 96 |
attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
|
| 97 |
Whether to use a bias in the query, key, value and output projection layers during self-attention.
|
|
|
|
| 99 |
The dropout ratio for the attention probabilities.
|
| 100 |
|
| 101 |
```python
|
| 102 |
+
>>> from transformers import CauchyModel, CauchyConfig
|
| 103 |
|
| 104 |
+
>>> # Initializing a Cauchy cauchy-7b style configuration
|
| 105 |
+
>>> configuration = CauchyConfig()
|
| 106 |
|
| 107 |
+
>>> # Initializing a model from the cauchy-7b style configuration
|
| 108 |
+
>>> model = CauchyModel(configuration)
|
| 109 |
|
| 110 |
>>> # Accessing the model configuration
|
| 111 |
>>> configuration = model.config
|
| 112 |
```"""
|
| 113 |
|
| 114 |
+
model_type = "cauchy"
|
| 115 |
keys_to_ignore_at_inference = ["past_key_values"]
|
| 116 |
|
| 117 |
def __init__(
|
modeling_minicpm.py → modeling_cauchy.py
RENAMED
|
@@ -17,7 +17,7 @@
|
|
| 17 |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 18 |
# See the License for the specific language governing permissions and
|
| 19 |
# limitations under the License.
|
| 20 |
-
""" PyTorch
|
| 21 |
import math
|
| 22 |
import warnings
|
| 23 |
from typing import List, Optional, Tuple, Union, Dict
|
|
@@ -48,7 +48,7 @@ from transformers.utils import (
|
|
| 48 |
replace_return_docstrings,
|
| 49 |
)
|
| 50 |
from transformers.utils.import_utils import is_torch_fx_available
|
| 51 |
-
from .
|
| 52 |
import re
|
| 53 |
|
| 54 |
try:
|
|
@@ -69,7 +69,7 @@ if is_torch_fx_available():
|
|
| 69 |
|
| 70 |
logger = logging.get_logger(__name__)
|
| 71 |
|
| 72 |
-
_CONFIG_FOR_DOC = "
|
| 73 |
|
| 74 |
|
| 75 |
def _get_unpad_data(attention_mask):
|
|
@@ -86,7 +86,7 @@ def _get_unpad_data(attention_mask):
|
|
| 86 |
|
| 87 |
def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
|
| 88 |
warnings.warn(
|
| 89 |
-
"Calling `transformers.models.
|
| 90 |
)
|
| 91 |
return _prepare_4d_attention_mask(mask=mask, dtype=dtype, tgt_len=tgt_len)
|
| 92 |
|
|
@@ -95,7 +95,7 @@ def _make_causal_mask(
|
|
| 95 |
input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
|
| 96 |
):
|
| 97 |
warnings.warn(
|
| 98 |
-
"Calling `transformers.models.
|
| 99 |
)
|
| 100 |
return AttentionMaskConverter._make_causal_mask(
|
| 101 |
input_ids_shape=input_ids_shape, dtype=dtype, device=device, past_key_values_length=past_key_values_length
|
|
@@ -110,10 +110,10 @@ def rms_layernorm(hidden: torch.Tensor, weight: torch.Tensor, eps: float):
|
|
| 110 |
return hidden * weight
|
| 111 |
|
| 112 |
|
| 113 |
-
class
|
| 114 |
def __init__(self, hidden_size, eps=1e-6):
|
| 115 |
"""
|
| 116 |
-
|
| 117 |
"""
|
| 118 |
super().__init__()
|
| 119 |
self.weight = nn.Parameter(torch.ones(hidden_size))
|
|
@@ -123,10 +123,10 @@ class MiniCPMRMSNorm(nn.Module):
|
|
| 123 |
return rms_layernorm(hidden_states, self.weight, self.variance_epsilon)
|
| 124 |
|
| 125 |
|
| 126 |
-
ALL_LAYERNORM_LAYERS.append(
|
| 127 |
|
| 128 |
|
| 129 |
-
class
|
| 130 |
def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
|
| 131 |
super().__init__()
|
| 132 |
|
|
@@ -163,8 +163,8 @@ class MiniCPMRotaryEmbedding(nn.Module):
|
|
| 163 |
)
|
| 164 |
|
| 165 |
|
| 166 |
-
class
|
| 167 |
-
"""
|
| 168 |
|
| 169 |
def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
|
| 170 |
self.scaling_factor = scaling_factor
|
|
@@ -182,8 +182,8 @@ class MiniCPMLinearScalingRotaryEmbedding(MiniCPMRotaryEmbedding):
|
|
| 182 |
self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
|
| 183 |
|
| 184 |
|
| 185 |
-
class
|
| 186 |
-
"""
|
| 187 |
|
| 188 |
def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
|
| 189 |
self.scaling_factor = scaling_factor
|
|
@@ -250,7 +250,7 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
|
|
| 250 |
k_embed = (k_fp32 * cos) + (rotate_half(k_fp32) * sin)
|
| 251 |
return q_embed.to(dtype=orig_dtype), k_embed.to(dtype=orig_dtype)
|
| 252 |
|
| 253 |
-
class
|
| 254 |
def __init__(self, config):
|
| 255 |
super().__init__()
|
| 256 |
self.config = config
|
|
@@ -297,10 +297,10 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
|
|
| 297 |
|
| 298 |
|
| 299 |
|
| 300 |
-
class
|
| 301 |
"""Multi-headed attention from 'Attention Is All You Need' paper"""
|
| 302 |
|
| 303 |
-
def __init__(self, config:
|
| 304 |
super().__init__()
|
| 305 |
self.config = config
|
| 306 |
self.layer_idx = layer_idx
|
|
@@ -335,12 +335,12 @@ class MiniCPMAttention(nn.Module):
|
|
| 335 |
self._init_rope()
|
| 336 |
|
| 337 |
if self.qk_norm:
|
| 338 |
-
self.q_norm =
|
| 339 |
-
self.k_norm =
|
| 340 |
|
| 341 |
def _init_rope(self):
|
| 342 |
if self.config.rope_scaling is None:
|
| 343 |
-
self.rotary_emb =
|
| 344 |
self.head_dim,
|
| 345 |
max_position_embeddings=self.max_position_embeddings,
|
| 346 |
base=self.rope_theta,
|
|
@@ -349,14 +349,14 @@ class MiniCPMAttention(nn.Module):
|
|
| 349 |
scaling_type = self.config.rope_scaling["type"]
|
| 350 |
scaling_factor = self.config.rope_scaling["factor"]
|
| 351 |
if scaling_type == "linear":
|
| 352 |
-
self.rotary_emb =
|
| 353 |
self.head_dim,
|
| 354 |
max_position_embeddings=self.max_position_embeddings,
|
| 355 |
scaling_factor=scaling_factor,
|
| 356 |
base=self.rope_theta,
|
| 357 |
)
|
| 358 |
elif scaling_type == "dynamic":
|
| 359 |
-
self.rotary_emb =
|
| 360 |
self.head_dim,
|
| 361 |
max_position_embeddings=self.max_position_embeddings,
|
| 362 |
scaling_factor=scaling_factor,
|
|
@@ -477,9 +477,9 @@ class MiniCPMAttention(nn.Module):
|
|
| 477 |
return attn_output, attn_weights, past_key_value
|
| 478 |
|
| 479 |
|
| 480 |
-
class
|
| 481 |
"""
|
| 482 |
-
|
| 483 |
untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
|
| 484 |
flash attention and deal with padding tokens in case the input contains any of them.
|
| 485 |
"""
|
|
@@ -502,7 +502,7 @@ class MiniCPMFlashAttention2(MiniCPMAttention):
|
|
| 502 |
use_cache: bool = False,
|
| 503 |
**kwargs,
|
| 504 |
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
|
| 505 |
-
#
|
| 506 |
if "padding_mask" in kwargs:
|
| 507 |
warnings.warn(
|
| 508 |
"Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
|
|
@@ -552,7 +552,7 @@ class MiniCPMFlashAttention2(MiniCPMAttention):
|
|
| 552 |
# therefore the input hidden states gets silently casted in float32. Hence, we need
|
| 553 |
# cast them back in the correct dtype just to be sure everything works as expected.
|
| 554 |
# This might slowdown training & inference so it is recommended to not cast the LayerNorms
|
| 555 |
-
# in fp32. (
|
| 556 |
|
| 557 |
input_dtype = query_states.dtype
|
| 558 |
if input_dtype == torch.float32:
|
|
@@ -609,7 +609,7 @@ class MiniCPMFlashAttention2(MiniCPMAttention):
|
|
| 609 |
if not self._flash_attn_uses_top_left_mask:
|
| 610 |
causal = self.is_causal
|
| 611 |
else:
|
| 612 |
-
# TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in
|
| 613 |
causal = self.is_causal and query_length != 1
|
| 614 |
# Contains at least one padding token in the sequence
|
| 615 |
if attention_mask is not None:
|
|
@@ -680,14 +680,14 @@ class MiniCPMFlashAttention2(MiniCPMAttention):
|
|
| 680 |
)
|
| 681 |
|
| 682 |
|
| 683 |
-
class
|
| 684 |
"""
|
| 685 |
-
|
| 686 |
-
`
|
| 687 |
SDPA API.
|
| 688 |
"""
|
| 689 |
|
| 690 |
-
# Adapted from
|
| 691 |
def forward(
|
| 692 |
self,
|
| 693 |
hidden_states: torch.Tensor,
|
|
@@ -700,7 +700,7 @@ class MiniCPMSdpaAttention(MiniCPMAttention):
|
|
| 700 |
if output_attentions:
|
| 701 |
# TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
|
| 702 |
logger.warning_once(
|
| 703 |
-
"
|
| 704 |
'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
|
| 705 |
)
|
| 706 |
return super().forward(
|
|
@@ -771,22 +771,22 @@ class MiniCPMSdpaAttention(MiniCPMAttention):
|
|
| 771 |
return attn_output, None, past_key_value
|
| 772 |
|
| 773 |
|
| 774 |
-
|
| 775 |
-
"eager":
|
| 776 |
-
"flash_attention_2":
|
| 777 |
-
"sdpa":
|
| 778 |
}
|
| 779 |
|
| 780 |
|
| 781 |
-
class
|
| 782 |
-
def __init__(self, config:
|
| 783 |
super().__init__()
|
| 784 |
self.hidden_size = config.hidden_size
|
| 785 |
-
self.self_attn =
|
| 786 |
|
| 787 |
-
self.mlp =
|
| 788 |
-
self.input_layernorm =
|
| 789 |
-
self.post_attention_layernorm =
|
| 790 |
|
| 791 |
self.scale_depth = config.scale_depth
|
| 792 |
self.num_hidden_layers = config.num_hidden_layers
|
|
@@ -853,7 +853,7 @@ class MiniCPMDecoderLayer(nn.Module):
|
|
| 853 |
return outputs
|
| 854 |
|
| 855 |
|
| 856 |
-
|
| 857 |
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
|
| 858 |
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
|
| 859 |
etc.)
|
|
@@ -863,7 +863,7 @@ MINICPM_START_DOCSTRING = r"""
|
|
| 863 |
and behavior.
|
| 864 |
|
| 865 |
Parameters:
|
| 866 |
-
config ([`
|
| 867 |
Model configuration class with all the parameters of the model. Initializing with a config file does not
|
| 868 |
load the weights associated with the model, only the configuration. Check out the
|
| 869 |
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
|
|
@@ -871,14 +871,14 @@ MINICPM_START_DOCSTRING = r"""
|
|
| 871 |
|
| 872 |
|
| 873 |
@add_start_docstrings(
|
| 874 |
-
"The bare
|
| 875 |
-
|
| 876 |
)
|
| 877 |
-
class
|
| 878 |
-
config_class =
|
| 879 |
base_model_prefix = "model"
|
| 880 |
supports_gradient_checkpointing = True
|
| 881 |
-
_no_split_modules = ["
|
| 882 |
_skip_keys_device_placement = "past_key_values"
|
| 883 |
_supports_flash_attn_2 = True
|
| 884 |
_supports_sdpa = True
|
|
@@ -896,7 +896,7 @@ class MiniCPMPreTrainedModel(PreTrainedModel):
|
|
| 896 |
module.weight.data[module.padding_idx].zero_()
|
| 897 |
|
| 898 |
|
| 899 |
-
|
| 900 |
Args:
|
| 901 |
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
|
| 902 |
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
|
|
@@ -967,30 +967,30 @@ MINICPM_INPUTS_DOCSTRING = r"""
|
|
| 967 |
|
| 968 |
|
| 969 |
@add_start_docstrings(
|
| 970 |
-
"The bare
|
| 971 |
-
|
| 972 |
)
|
| 973 |
-
class
|
| 974 |
"""
|
| 975 |
-
Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`
|
| 976 |
|
| 977 |
Args:
|
| 978 |
-
config:
|
| 979 |
"""
|
| 980 |
|
| 981 |
-
def __init__(self, config:
|
| 982 |
super().__init__(config)
|
| 983 |
self.padding_idx = config.pad_token_id
|
| 984 |
self.vocab_size = config.vocab_size
|
| 985 |
|
| 986 |
self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
|
| 987 |
self.layers = nn.ModuleList(
|
| 988 |
-
[
|
| 989 |
)
|
| 990 |
self._use_sdpa = config._attn_implementation == "sdpa"
|
| 991 |
self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
|
| 992 |
|
| 993 |
-
self.norm =
|
| 994 |
|
| 995 |
self.gradient_checkpointing = False
|
| 996 |
# Initialize weights and apply final processing
|
|
@@ -1002,7 +1002,7 @@ class MiniCPMModel(MiniCPMPreTrainedModel):
|
|
| 1002 |
def set_input_embeddings(self, value):
|
| 1003 |
self.embed_tokens = value
|
| 1004 |
|
| 1005 |
-
@add_start_docstrings_to_model_forward(
|
| 1006 |
def forward(
|
| 1007 |
self,
|
| 1008 |
input_ids: torch.LongTensor = None,
|
|
@@ -1135,12 +1135,12 @@ class MiniCPMModel(MiniCPMPreTrainedModel):
|
|
| 1135 |
)
|
| 1136 |
|
| 1137 |
|
| 1138 |
-
class
|
| 1139 |
_tied_weights_keys = ["lm_head.weight"]
|
| 1140 |
|
| 1141 |
def __init__(self, config):
|
| 1142 |
super().__init__(config)
|
| 1143 |
-
self.model =
|
| 1144 |
self.vocab_size = config.vocab_size
|
| 1145 |
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
|
| 1146 |
|
|
@@ -1165,7 +1165,7 @@ class MiniCPMForCausalLM(MiniCPMPreTrainedModel):
|
|
| 1165 |
def get_decoder(self):
|
| 1166 |
return self.model
|
| 1167 |
|
| 1168 |
-
@add_start_docstrings_to_model_forward(
|
| 1169 |
@replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
|
| 1170 |
def forward(
|
| 1171 |
self,
|
|
@@ -1192,9 +1192,9 @@ class MiniCPMForCausalLM(MiniCPMPreTrainedModel):
|
|
| 1192 |
Example:
|
| 1193 |
|
| 1194 |
```python
|
| 1195 |
-
>>> from transformers import AutoTokenizer,
|
| 1196 |
|
| 1197 |
-
>>> model =
|
| 1198 |
>>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
|
| 1199 |
|
| 1200 |
>>> prompt = "Hey, are you conscious? Can you talk to me?"
|
|
@@ -1354,9 +1354,9 @@ class MiniCPMForCausalLM(MiniCPMPreTrainedModel):
|
|
| 1354 |
|
| 1355 |
@add_start_docstrings(
|
| 1356 |
"""
|
| 1357 |
-
The
|
| 1358 |
|
| 1359 |
-
[`
|
| 1360 |
(e.g. GPT-2) do.
|
| 1361 |
|
| 1362 |
Since it does classification on the last token, it requires to know the position of the last token. If a
|
|
@@ -1365,13 +1365,13 @@ class MiniCPMForCausalLM(MiniCPMPreTrainedModel):
|
|
| 1365 |
padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
|
| 1366 |
each row of the batch).
|
| 1367 |
""",
|
| 1368 |
-
|
| 1369 |
)
|
| 1370 |
-
class
|
| 1371 |
def __init__(self, config):
|
| 1372 |
super().__init__(config)
|
| 1373 |
self.num_labels = config.num_labels
|
| 1374 |
-
self.model =
|
| 1375 |
self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
|
| 1376 |
|
| 1377 |
# Initialize weights and apply final processing
|
|
@@ -1383,7 +1383,7 @@ class MiniCPMForSequenceClassification(MiniCPMPreTrainedModel):
|
|
| 1383 |
def set_input_embeddings(self, value):
|
| 1384 |
self.model.embed_tokens = value
|
| 1385 |
|
| 1386 |
-
@add_start_docstrings_to_model_forward(
|
| 1387 |
def forward(
|
| 1388 |
self,
|
| 1389 |
input_ids: torch.LongTensor = None,
|
|
|
|
| 17 |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 18 |
# See the License for the specific language governing permissions and
|
| 19 |
# limitations under the License.
|
| 20 |
+
""" PyTorch Cauchy model."""
|
| 21 |
import math
|
| 22 |
import warnings
|
| 23 |
from typing import List, Optional, Tuple, Union, Dict
|
|
|
|
| 48 |
replace_return_docstrings,
|
| 49 |
)
|
| 50 |
from transformers.utils.import_utils import is_torch_fx_available
|
| 51 |
+
from .configuration_cauchy import CauchyConfig
|
| 52 |
import re
|
| 53 |
|
| 54 |
try:
|
|
|
|
| 69 |
|
| 70 |
logger = logging.get_logger(__name__)
|
| 71 |
|
| 72 |
+
_CONFIG_FOR_DOC = "CauchyConfig"
|
| 73 |
|
| 74 |
|
| 75 |
def _get_unpad_data(attention_mask):
|
|
|
|
| 86 |
|
| 87 |
def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
|
| 88 |
warnings.warn(
|
| 89 |
+
"Calling `transformers.models.cauchy.modeling_cauchy._prepare_4d_attention_mask` is deprecated and will be removed in v4.37. Use `transformers.modeling_attn_mask_utils._prepare_4d_attention_mask"
|
| 90 |
)
|
| 91 |
return _prepare_4d_attention_mask(mask=mask, dtype=dtype, tgt_len=tgt_len)
|
| 92 |
|
|
|
|
| 95 |
input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0
|
| 96 |
):
|
| 97 |
warnings.warn(
|
| 98 |
+
"Calling `transformers.models.cauchy.modeling_cauchy._make_causal_mask` is deprecated and will be removed in v4.37. Use `transformers.models.cauchy.modeling_cauchy.AttentionMaskConverter._make_causal_mask"
|
| 99 |
)
|
| 100 |
return AttentionMaskConverter._make_causal_mask(
|
| 101 |
input_ids_shape=input_ids_shape, dtype=dtype, device=device, past_key_values_length=past_key_values_length
|
|
|
|
| 110 |
return hidden * weight
|
| 111 |
|
| 112 |
|
| 113 |
+
class CauchyRMSNorm(nn.Module):
|
| 114 |
def __init__(self, hidden_size, eps=1e-6):
|
| 115 |
"""
|
| 116 |
+
CauchyRMSNorm is equivalent to T5LayerNorm
|
| 117 |
"""
|
| 118 |
super().__init__()
|
| 119 |
self.weight = nn.Parameter(torch.ones(hidden_size))
|
|
|
|
| 123 |
return rms_layernorm(hidden_states, self.weight, self.variance_epsilon)
|
| 124 |
|
| 125 |
|
| 126 |
+
ALL_LAYERNORM_LAYERS.append(CauchyRMSNorm)
|
| 127 |
|
| 128 |
|
| 129 |
+
class CauchyRotaryEmbedding(nn.Module):
|
| 130 |
def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
|
| 131 |
super().__init__()
|
| 132 |
|
|
|
|
| 163 |
)
|
| 164 |
|
| 165 |
|
| 166 |
+
class CauchyLinearScalingRotaryEmbedding(CauchyRotaryEmbedding):
|
| 167 |
+
"""CauchyRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
|
| 168 |
|
| 169 |
def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
|
| 170 |
self.scaling_factor = scaling_factor
|
|
|
|
| 182 |
self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
|
| 183 |
|
| 184 |
|
| 185 |
+
class CauchyDynamicNTKScalingRotaryEmbedding(CauchyRotaryEmbedding):
|
| 186 |
+
"""CauchyRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""
|
| 187 |
|
| 188 |
def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
|
| 189 |
self.scaling_factor = scaling_factor
|
|
|
|
| 250 |
k_embed = (k_fp32 * cos) + (rotate_half(k_fp32) * sin)
|
| 251 |
return q_embed.to(dtype=orig_dtype), k_embed.to(dtype=orig_dtype)
|
| 252 |
|
| 253 |
+
class CauchyMLP(nn.Module):
|
| 254 |
def __init__(self, config):
|
| 255 |
super().__init__()
|
| 256 |
self.config = config
|
|
|
|
| 297 |
|
| 298 |
|
| 299 |
|
| 300 |
+
class CauchyAttention(nn.Module):
|
| 301 |
"""Multi-headed attention from 'Attention Is All You Need' paper"""
|
| 302 |
|
| 303 |
+
def __init__(self, config: CauchyConfig, layer_idx: Optional[int] = None):
|
| 304 |
super().__init__()
|
| 305 |
self.config = config
|
| 306 |
self.layer_idx = layer_idx
|
|
|
|
| 335 |
self._init_rope()
|
| 336 |
|
| 337 |
if self.qk_norm:
|
| 338 |
+
self.q_norm = CauchyRMSNorm(self.head_dim, eps=config.rms_norm_eps)
|
| 339 |
+
self.k_norm = CauchyRMSNorm(self.head_dim, eps=config.rms_norm_eps)
|
| 340 |
|
| 341 |
def _init_rope(self):
|
| 342 |
if self.config.rope_scaling is None:
|
| 343 |
+
self.rotary_emb = CauchyRotaryEmbedding(
|
| 344 |
self.head_dim,
|
| 345 |
max_position_embeddings=self.max_position_embeddings,
|
| 346 |
base=self.rope_theta,
|
|
|
|
| 349 |
scaling_type = self.config.rope_scaling["type"]
|
| 350 |
scaling_factor = self.config.rope_scaling["factor"]
|
| 351 |
if scaling_type == "linear":
|
| 352 |
+
self.rotary_emb = CauchyLinearScalingRotaryEmbedding(
|
| 353 |
self.head_dim,
|
| 354 |
max_position_embeddings=self.max_position_embeddings,
|
| 355 |
scaling_factor=scaling_factor,
|
| 356 |
base=self.rope_theta,
|
| 357 |
)
|
| 358 |
elif scaling_type == "dynamic":
|
| 359 |
+
self.rotary_emb = CauchyDynamicNTKScalingRotaryEmbedding(
|
| 360 |
self.head_dim,
|
| 361 |
max_position_embeddings=self.max_position_embeddings,
|
| 362 |
scaling_factor=scaling_factor,
|
|
|
|
| 477 |
return attn_output, attn_weights, past_key_value
|
| 478 |
|
| 479 |
|
| 480 |
+
class CauchyFlashAttention2(CauchyAttention):
|
| 481 |
"""
|
| 482 |
+
Cauchy flash attention module. This module inherits from `CauchyAttention` as the weights of the module stay
|
| 483 |
untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
|
| 484 |
flash attention and deal with padding tokens in case the input contains any of them.
|
| 485 |
"""
|
|
|
|
| 502 |
use_cache: bool = False,
|
| 503 |
**kwargs,
|
| 504 |
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
|
| 505 |
+
# CauchyFlashAttention2 attention does not support output_attentions
|
| 506 |
if "padding_mask" in kwargs:
|
| 507 |
warnings.warn(
|
| 508 |
"Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
|
|
|
|
| 552 |
# therefore the input hidden states gets silently casted in float32. Hence, we need
|
| 553 |
# cast them back in the correct dtype just to be sure everything works as expected.
|
| 554 |
# This might slowdown training & inference so it is recommended to not cast the LayerNorms
|
| 555 |
+
# in fp32. (CauchyRMSNorm handles it correctly)
|
| 556 |
|
| 557 |
input_dtype = query_states.dtype
|
| 558 |
if input_dtype == torch.float32:
|
|
|
|
| 609 |
if not self._flash_attn_uses_top_left_mask:
|
| 610 |
causal = self.is_causal
|
| 611 |
else:
|
| 612 |
+
# TODO: Remove the `query_length != 1` check once Flash Attention for ROCm is bumped to 2.1. For details, please see the comment in CauchyFlashAttention2 __init__.
|
| 613 |
causal = self.is_causal and query_length != 1
|
| 614 |
# Contains at least one padding token in the sequence
|
| 615 |
if attention_mask is not None:
|
|
|
|
| 680 |
)
|
| 681 |
|
| 682 |
|
| 683 |
+
class CauchySdpaAttention(CauchyAttention):
|
| 684 |
"""
|
| 685 |
+
Cauchy attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
|
| 686 |
+
`CauchyAttention` as the weights of the module stay untouched. The only changes are on the forward pass to adapt to
|
| 687 |
SDPA API.
|
| 688 |
"""
|
| 689 |
|
| 690 |
+
# Adapted from CauchyAttention.forward
|
| 691 |
def forward(
|
| 692 |
self,
|
| 693 |
hidden_states: torch.Tensor,
|
|
|
|
| 700 |
if output_attentions:
|
| 701 |
# TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
|
| 702 |
logger.warning_once(
|
| 703 |
+
"CauchyModel is using CauchySdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
|
| 704 |
'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
|
| 705 |
)
|
| 706 |
return super().forward(
|
|
|
|
| 771 |
return attn_output, None, past_key_value
|
| 772 |
|
| 773 |
|
| 774 |
+
CAUCHY_ATTENTION_CLASSES = {
|
| 775 |
+
"eager": CauchyAttention,
|
| 776 |
+
"flash_attention_2": CauchyFlashAttention2,
|
| 777 |
+
"sdpa": CauchySdpaAttention,
|
| 778 |
}
|
| 779 |
|
| 780 |
|
| 781 |
+
class CauchyDecoderLayer(nn.Module):
|
| 782 |
+
def __init__(self, config: CauchyConfig, layer_idx: int):
|
| 783 |
super().__init__()
|
| 784 |
self.hidden_size = config.hidden_size
|
| 785 |
+
self.self_attn = CAUCHY_ATTENTION_CLASSES[config._attn_implementation](config=config, layer_idx=layer_idx)
|
| 786 |
|
| 787 |
+
self.mlp = CauchyMLP(config)
|
| 788 |
+
self.input_layernorm = CauchyRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
|
| 789 |
+
self.post_attention_layernorm = CauchyRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
|
| 790 |
|
| 791 |
self.scale_depth = config.scale_depth
|
| 792 |
self.num_hidden_layers = config.num_hidden_layers
|
|
|
|
| 853 |
return outputs
|
| 854 |
|
| 855 |
|
| 856 |
+
CAUCHY_START_DOCSTRING = r"""
|
| 857 |
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
|
| 858 |
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
|
| 859 |
etc.)
|
|
|
|
| 863 |
and behavior.
|
| 864 |
|
| 865 |
Parameters:
|
| 866 |
+
config ([`CauchyConfig`]):
|
| 867 |
Model configuration class with all the parameters of the model. Initializing with a config file does not
|
| 868 |
load the weights associated with the model, only the configuration. Check out the
|
| 869 |
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
|
|
|
|
| 871 |
|
| 872 |
|
| 873 |
@add_start_docstrings(
|
| 874 |
+
"The bare Cauchy Model outputting raw hidden-states without any specific head on top.",
|
| 875 |
+
CAUCHY_START_DOCSTRING,
|
| 876 |
)
|
| 877 |
+
class CauchyPreTrainedModel(PreTrainedModel):
|
| 878 |
+
config_class = CauchyConfig
|
| 879 |
base_model_prefix = "model"
|
| 880 |
supports_gradient_checkpointing = True
|
| 881 |
+
_no_split_modules = ["CauchyDecoderLayer"]
|
| 882 |
_skip_keys_device_placement = "past_key_values"
|
| 883 |
_supports_flash_attn_2 = True
|
| 884 |
_supports_sdpa = True
|
|
|
|
| 896 |
module.weight.data[module.padding_idx].zero_()
|
| 897 |
|
| 898 |
|
| 899 |
+
CAUCHY_INPUTS_DOCSTRING = r"""
|
| 900 |
Args:
|
| 901 |
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
|
| 902 |
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
|
|
|
|
| 967 |
|
| 968 |
|
| 969 |
@add_start_docstrings(
|
| 970 |
+
"The bare Cauchy Model outputting raw hidden-states without any specific head on top.",
|
| 971 |
+
CAUCHY_START_DOCSTRING,
|
| 972 |
)
|
| 973 |
+
class CauchyModel(CauchyPreTrainedModel):
|
| 974 |
"""
|
| 975 |
+
Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`CauchyDecoderLayer`]
|
| 976 |
|
| 977 |
Args:
|
| 978 |
+
config: CauchyConfig
|
| 979 |
"""
|
| 980 |
|
| 981 |
+
def __init__(self, config: CauchyConfig):
|
| 982 |
super().__init__(config)
|
| 983 |
self.padding_idx = config.pad_token_id
|
| 984 |
self.vocab_size = config.vocab_size
|
| 985 |
|
| 986 |
self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
|
| 987 |
self.layers = nn.ModuleList(
|
| 988 |
+
[CauchyDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
|
| 989 |
)
|
| 990 |
self._use_sdpa = config._attn_implementation == "sdpa"
|
| 991 |
self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
|
| 992 |
|
| 993 |
+
self.norm = CauchyRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
|
| 994 |
|
| 995 |
self.gradient_checkpointing = False
|
| 996 |
# Initialize weights and apply final processing
|
|
|
|
| 1002 |
def set_input_embeddings(self, value):
|
| 1003 |
self.embed_tokens = value
|
| 1004 |
|
| 1005 |
+
@add_start_docstrings_to_model_forward(CAUCHY_INPUTS_DOCSTRING)
|
| 1006 |
def forward(
|
| 1007 |
self,
|
| 1008 |
input_ids: torch.LongTensor = None,
|
|
|
|
| 1135 |
)
|
| 1136 |
|
| 1137 |
|
| 1138 |
+
class CauchyForCausalLM(CauchyPreTrainedModel):
|
| 1139 |
_tied_weights_keys = ["lm_head.weight"]
|
| 1140 |
|
| 1141 |
def __init__(self, config):
|
| 1142 |
super().__init__(config)
|
| 1143 |
+
self.model = CauchyModel(config)
|
| 1144 |
self.vocab_size = config.vocab_size
|
| 1145 |
self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
|
| 1146 |
|
|
|
|
| 1165 |
def get_decoder(self):
|
| 1166 |
return self.model
|
| 1167 |
|
| 1168 |
+
@add_start_docstrings_to_model_forward(CAUCHY_INPUTS_DOCSTRING)
|
| 1169 |
@replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
|
| 1170 |
def forward(
|
| 1171 |
self,
|
|
|
|
| 1192 |
Example:
|
| 1193 |
|
| 1194 |
```python
|
| 1195 |
+
>>> from transformers import AutoTokenizer, CauchyForCausalLM
|
| 1196 |
|
| 1197 |
+
>>> model = CauchyForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
|
| 1198 |
>>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
|
| 1199 |
|
| 1200 |
>>> prompt = "Hey, are you conscious? Can you talk to me?"
|
|
|
|
| 1354 |
|
| 1355 |
@add_start_docstrings(
|
| 1356 |
"""
|
| 1357 |
+
The Cauchy Model transformer with a sequence classification head on top (linear layer).
|
| 1358 |
|
| 1359 |
+
[`CauchyForSequenceClassification`] uses the last token in order to do the classification, as other causal models
|
| 1360 |
(e.g. GPT-2) do.
|
| 1361 |
|
| 1362 |
Since it does classification on the last token, it needs to know the position of the last token. If a
|
|
|
|
| 1365 |
padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
|
| 1366 |
each row of the batch).
|
| 1367 |
""",
|
| 1368 |
+
CAUCHY_START_DOCSTRING,
|
| 1369 |
)
|
| 1370 |
+
class CauchyForSequenceClassification(CauchyPreTrainedModel):
|
| 1371 |
def __init__(self, config):
|
| 1372 |
super().__init__(config)
|
| 1373 |
self.num_labels = config.num_labels
|
| 1374 |
+
self.model = CauchyModel(config)
|
| 1375 |
self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
|
| 1376 |
|
| 1377 |
# Initialize weights and apply final processing
|
|
|
|
| 1383 |
def set_input_embeddings(self, value):
|
| 1384 |
self.model.embed_tokens = value
|
| 1385 |
|
| 1386 |
+
@add_start_docstrings_to_model_forward(CAUCHY_INPUTS_DOCSTRING)
|
| 1387 |
def forward(
|
| 1388 |
self,
|
| 1389 |
input_ids: torch.LongTensor = None,
|
pytorch_model.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c23ecc5e0665c45154097ff165e98e769e9be180e7bec074871838ebe2a415e0
|
| 3 |
+
size 6220791502
|