Sławomir Dadas committed on
Commit
4f562fd
·
1 Parent(s): 9bc6d2e

Transformers v5 compatibility fixes

Browse files
config.json CHANGED
@@ -4,6 +4,7 @@
4
  ],
5
  "attention_dropout": 0.0,
6
  "auto_map": {
 
7
  "AutoModel": "modeling_qwen.Qwen2Model",
8
  "AutoModelForCausalLM": "modeling_qwen.Qwen2ForCausalLM",
9
  "AutoModelForSequenceClassification": "modeling_qwen.Qwen2ForSequenceClassification"
 
4
  ],
5
  "attention_dropout": 0.0,
6
  "auto_map": {
7
+ "AutoConfig": "configuration_qwen.Qwen2Config",
8
  "AutoModel": "modeling_qwen.Qwen2Model",
9
  "AutoModelForCausalLM": "modeling_qwen.Qwen2ForCausalLM",
10
  "AutoModelForSequenceClassification": "modeling_qwen.Qwen2ForSequenceClassification"
configuration_qwen.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# coding=utf-8
# Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Qwen2 model configuration"""

from transformers import PretrainedConfig


class Qwen2Config(PretrainedConfig):
    """Configuration for Qwen2 models.

    Holds the hyper-parameters required to build the Qwen2 architecture:
    vocabulary/layer sizes, attention layout, RoPE settings, and the optional
    sliding-window attention schedule. Token ids and ``tie_word_embeddings``
    are stored as plain attributes before delegating remaining kwargs to
    ``PretrainedConfig`` (Transformers v5 style).
    """

    model_type = "qwen2"
    keys_to_ignore_at_inference = ["past_key_values"]

    # Default tensor parallel plan for base model `Qwen2`
    base_model_tp_plan = {
        "layers.*.self_attn.q_proj": "colwise",
        "layers.*.self_attn.k_proj": "colwise",
        "layers.*.self_attn.v_proj": "colwise",
        "layers.*.self_attn.o_proj": "rowwise",
        "layers.*.mlp.gate_proj": "colwise",
        "layers.*.mlp.up_proj": "colwise",
        "layers.*.mlp.down_proj": "rowwise",
    }
    # Pipeline parallel plan: stage name -> (input tensor names, output tensor names).
    base_model_pp_plan = {
        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
        "norm": (["hidden_states"], ["hidden_states"]),
    }

    def __init__(
        self,
        vocab_size=151936,
        hidden_size=4096,
        intermediate_size=22016,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=32,
        hidden_act="silu",
        max_position_embeddings=32768,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        use_sliding_window=False,
        sliding_window=4096,
        max_window_layers=28,
        layer_types=None,
        attention_dropout=0.0,
        pad_token_id: int | None = None,
        bos_token_id: int | None = None,
        eos_token_id: int | None = None,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.use_sliding_window = use_sliding_window
        # The window size is only meaningful when sliding-window attention is on.
        self.sliding_window = sliding_window if self.use_sliding_window else None
        self.max_window_layers = max_window_layers

        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.attention_dropout = attention_dropout
        # Validate the correctness of rotary position embeddings parameters
        # BC: older configs used a 'type' field; mirror it into 'rope_type'.
        if self.rope_scaling is not None and "type" in self.rope_scaling:
            self.rope_scaling["rope_type"] = self.rope_scaling["type"]

        self.layer_types = layer_types
        if self.layer_types is None:
            # Layers at or past `max_window_layers` use sliding attention when
            # a window is configured; all earlier layers use full attention.
            self.layer_types = [
                "sliding_attention"
                if self.sliding_window is not None and i >= self.max_window_layers
                else "full_attention"
                for i in range(self.num_hidden_layers)
            ]

        self.pad_token_id = pad_token_id
        self.bos_token_id = bos_token_id
        self.eos_token_id = eos_token_id
        self.tie_word_embeddings = tie_word_embeddings
        super().__init__(**kwargs)


__all__ = ["Qwen2Config"]
modeling_qwen.py CHANGED
@@ -18,12 +18,12 @@
18
  # See the License for the specific language governing permissions and
19
  # limitations under the License.
20
  """ PyTorch Qwen2 model."""
21
- from transformers import Qwen2Config
 
22
  import inspect
23
  import math
24
- import os
25
  import warnings
26
- from typing import List, Optional, Tuple, Union
27
 
28
  import torch
29
  import torch.nn.functional as F
@@ -44,8 +44,6 @@ from transformers.utils import (
44
  logging,
45
  replace_return_docstrings,
46
  )
47
-
48
-
49
  if is_flash_attn_2_available():
50
  from flash_attn import flash_attn_func, flash_attn_varlen_func
51
  from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa
@@ -65,6 +63,19 @@ QWEN2_PRETRAINED_MODEL_ARCHIVE_LIST = [
65
  ]
66
 
67
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
  # Copied from transformers.models.llama.modeling_llama._get_unpad_data
69
  def _get_unpad_data(attention_mask):
70
  seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
@@ -96,41 +107,68 @@ class Qwen2RMSNorm(nn.Module):
96
  return self.weight * hidden_states.to(input_dtype)
97
 
98
 
99
- # Copied from transformers.models.mistral.modeling_mistral.MistralRotaryEmbedding with Mistral->Qwen2
100
  class Qwen2RotaryEmbedding(nn.Module):
101
- def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
 
 
102
  super().__init__()
 
 
103
 
104
- self.dim = dim
105
- self.max_position_embeddings = max_position_embeddings
106
- self.base = base
107
- inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim))
108
- self.register_buffer("inv_freq", inv_freq, persistent=False)
109
 
110
- # Build here to make `torch.jit.trace` work.
111
- self._set_cos_sin_cache(
112
- seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
113
- )
 
114
 
115
- def _set_cos_sin_cache(self, seq_len, device, dtype):
116
- self.max_seq_len_cached = seq_len
117
- t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq)
118
 
119
- freqs = torch.outer(t, self.inv_freq)
120
- # Different from paper, but it uses a different permutation in order to obtain the same calculation
121
- emb = torch.cat((freqs, freqs), dim=-1)
122
- self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
123
- self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
124
 
125
- def forward(self, x, seq_len=None):
126
- # x: [bs, num_attention_heads, seq_len, head_size]
127
- if seq_len > self.max_seq_len_cached:
128
- self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
129
 
130
- return (
131
- self.cos_cached[:seq_len].to(dtype=x.dtype),
132
- self.sin_cached[:seq_len].to(dtype=x.dtype),
133
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
 
135
 
136
  # Copied from transformers.models.llama.modeling_llama.rotate_half
@@ -163,8 +201,8 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
163
  Returns:
164
  `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
165
  """
166
- cos = cos[position_ids].unsqueeze(unsqueeze_dim)
167
- sin = sin[position_ids].unsqueeze(unsqueeze_dim)
168
  q_embed = (q * cos) + (rotate_half(q) * sin)
169
  k_embed = (k * cos) + (rotate_half(k) * sin)
170
  return q_embed, k_embed
@@ -235,12 +273,7 @@ class Qwen2Attention(nn.Module):
235
  self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
236
  self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
237
  self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
238
-
239
- self.rotary_emb = Qwen2RotaryEmbedding(
240
- self.head_dim,
241
- max_position_embeddings=self.max_position_embeddings,
242
- base=self.rope_theta,
243
- )
244
 
245
  def forward(
246
  self,
@@ -277,7 +310,7 @@ class Qwen2Attention(nn.Module):
277
  # kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
278
  past_len = past_key_value.get_seq_length(self.layer_idx) if past_key_value is not None else 0
279
  kv_seq_len += past_len
280
- cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
281
  query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
282
 
283
  if past_key_value is not None:
@@ -385,8 +418,7 @@ class Qwen2FlashAttention2(Qwen2Attention):
385
  kv_seq_len += past_len
386
 
387
  # Because the input can be padded, the absolute sequence length depends on the max position id.
388
- rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1
389
- cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len)
390
 
391
  query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
392
 
@@ -683,7 +715,7 @@ class Qwen2SdpaAttention(Qwen2Attention):
683
  # kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
684
  past_len = past_key_value.get_seq_length(self.layer_idx) if past_key_value is not None else 0
685
  kv_seq_len += past_len
686
- cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
687
 
688
  query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
689
 
@@ -842,17 +874,6 @@ class Qwen2PreTrainedModel(PreTrainedModel):
842
  _supports_sdpa = True
843
  _supports_cache_class = True
844
 
845
- def _init_weights(self, module):
846
- std = self.config.initializer_range
847
- if isinstance(module, nn.Linear):
848
- module.weight.data.normal_(mean=0.0, std=std)
849
- if module.bias is not None:
850
- module.bias.data.zero_()
851
- elif isinstance(module, nn.Embedding):
852
- module.weight.data.normal_(mean=0.0, std=std)
853
- if module.padding_idx is not None:
854
- module.weight.data[module.padding_idx].zero_()
855
-
856
 
857
  QWEN2_INPUTS_DOCSTRING = r"""
858
  Args:
 
18
  # See the License for the specific language governing permissions and
19
  # limitations under the License.
20
  """ PyTorch Qwen2 model."""
21
+ from contextlib import nullcontext
22
+ from transformers import Qwen2Config, ROPE_INIT_FUNCTIONS
23
  import inspect
24
  import math
 
25
  import warnings
26
+ from typing import List, Optional, Tuple, Union, Callable
27
 
28
  import torch
29
  import torch.nn.functional as F
 
44
  logging,
45
  replace_return_docstrings,
46
  )
 
 
47
  if is_flash_attn_2_available():
48
  from flash_attn import flash_attn_func, flash_attn_varlen_func
49
  from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa
 
63
  ]
64
 
65
 
66
+ def maybe_autocast(
67
+ device_type: str,
68
+ dtype: Optional["_dtype"] = None,
69
+ enabled: bool = True,
70
+ cache_enabled: bool | None = None,
71
+ ):
72
+ if torch.is_autocast_enabled(device_type) or enabled:
73
+ return torch.autocast(device_type, dtype=dtype, enabled=enabled, cache_enabled=cache_enabled)
74
+ else:
75
+ return nullcontext()
76
+
77
+
78
+
79
  # Copied from transformers.models.llama.modeling_llama._get_unpad_data
80
  def _get_unpad_data(attention_mask):
81
  seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
 
107
  return self.weight * hidden_states.to(input_dtype)
108
 
109
 
 
110
class Qwen2RotaryEmbedding(nn.Module):
    """Rotary position embedding (RoPE) for Qwen2.

    Inverse frequencies are computed once at construction time; ``forward``
    turns a batch of position ids into cos/sin tables for those positions.
    """

    inv_freq: torch.Tensor  # fix linting for `register_buffer`

    def __init__(self, config: Qwen2Config, device=None):
        super().__init__()
        self.max_seq_len_cached = config.max_position_embeddings
        self.original_max_seq_len = config.max_position_embeddings

        self.config = config

        # Only the "default" RoPE variant is implemented locally; any other
        # variant is resolved through transformers' ROPE_INIT_FUNCTIONS table.
        self.rope_type = "default"
        rope_init_fn: Callable = self.compute_default_rope_parameters
        if self.rope_type != "default":
            rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
        inv_freq, self.attention_scaling = rope_init_fn(self.config, device)

        self.register_buffer("inv_freq", inv_freq, persistent=False)
        # Keep a pristine copy so scaling variants can restore the base table.
        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)

    @staticmethod
    def compute_default_rope_parameters(
        config: Qwen2Config | None = None,
        device: Optional["torch.device"] = None,
        seq_len: int | None = None,
    ) -> tuple["torch.Tensor", float]:
        """
        Computes the inverse frequencies according to the original RoPE implementation
        Args:
            config ([`~transformers.PreTrainedConfig`]):
                The model configuration.
            device (`torch.device`):
                The device to use for initialization of the inverse frequencies.
            seq_len (`int`, *optional*):
                The current sequence length. Unused for this type of RoPE.
        Returns:
            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
        """
        theta = config.rope_theta
        # Head dimension: explicit `head_dim` if the config defines one,
        # otherwise derived from hidden size and head count.
        rotary_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads

        scaling_factor = 1.0  # Unused in this type of RoPE

        exponents = torch.arange(0, rotary_dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float)
        inv_freq = 1.0 / (theta ** (exponents / rotary_dim))
        return inv_freq, scaling_factor

    @torch.no_grad()
    def forward(self, x, position_ids):
        batch = position_ids.shape[0]
        freq_cols = self.inv_freq[None, :, None].float().expand(batch, -1, 1).to(x.device)
        pos_rows = position_ids[:, None, :].float()

        dev = x.device.type
        device_type = dev if isinstance(dev, str) and dev != "mps" else "cpu"
        # Force float32 for the trig math regardless of any ambient autocast.
        with maybe_autocast(device_type=device_type, enabled=False):
            angles = (freq_cols.float() @ pos_rows.float()).transpose(1, 2)
            table = torch.cat((angles, angles), dim=-1)
            cos = table.cos() * self.attention_scaling
            sin = table.sin() * self.attention_scaling

        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
172
 
173
 
174
  # Copied from transformers.models.llama.modeling_llama.rotate_half
 
201
  Returns:
202
  `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
203
  """
204
+ cos = cos.unsqueeze(unsqueeze_dim)
205
+ sin = sin.unsqueeze(unsqueeze_dim)
206
  q_embed = (q * cos) + (rotate_half(q) * sin)
207
  k_embed = (k * cos) + (rotate_half(k) * sin)
208
  return q_embed, k_embed
 
273
  self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
274
  self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=True)
275
  self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
276
+ self.rotary_emb = Qwen2RotaryEmbedding(self.config)
 
 
 
 
 
277
 
278
  def forward(
279
  self,
 
310
  # kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
311
  past_len = past_key_value.get_seq_length(self.layer_idx) if past_key_value is not None else 0
312
  kv_seq_len += past_len
313
+ cos, sin = self.rotary_emb(value_states, position_ids)
314
  query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
315
 
316
  if past_key_value is not None:
 
418
  kv_seq_len += past_len
419
 
420
  # Because the input can be padded, the absolute sequence length depends on the max position id.
421
+ cos, sin = self.rotary_emb(value_states, position_ids)
 
422
 
423
  query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
424
 
 
715
  # kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
716
  past_len = past_key_value.get_seq_length(self.layer_idx) if past_key_value is not None else 0
717
  kv_seq_len += past_len
718
+ cos, sin = self.rotary_emb(value_states, position_ids)
719
 
720
  query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
721
 
 
874
  _supports_sdpa = True
875
  _supports_cache_class = True
876
 
 
 
 
 
 
 
 
 
 
 
 
877
 
878
  QWEN2_INPUTS_DOCSTRING = r"""
879
  Args:
tokenization_qwen.py CHANGED
@@ -1,8 +1,23 @@
1
-
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  from typing import List, Optional
3
- from transformers.models.qwen2.tokenization_qwen2 import Qwen2Tokenizer as OriginalQwen2Tokenizer
4
- from transformers.models.qwen2.tokenization_qwen2_fast import Qwen2TokenizerFast as OriginalQwen2TokenizerFast
5
- from tokenizers import processors
 
 
6
 
7
  VOCAB_FILES_NAMES = {
8
  "vocab_file": "vocab.json",
@@ -10,258 +25,265 @@ VOCAB_FILES_NAMES = {
10
  "tokenizer_file": "tokenizer.json",
11
  }
12
 
13
- class Qwen2Tokenizer(OriginalQwen2Tokenizer):
14
- """
15
- Construct a Qwen2 tokenizer. Based on byte-level Byte-Pair-Encoding.
16
-
17
- Same with GPT2Tokenizer, this tokenizer has been trained to treat spaces like parts of the tokens so a word will
18
- be encoded differently whether it is at the beginning of the sentence (without space) or not:
19
-
20
- ```python
21
- >>> from transformers import Qwen2Tokenizer
22
-
23
- >>> tokenizer = Qwen2Tokenizer.from_pretrained("Qwen/Qwen-tokenizer")
24
- >>> tokenizer("Hello world")["input_ids"]
25
- [9707, 1879]
26
-
27
- >>> tokenizer(" Hello world")["input_ids"]
28
- [21927, 1879]
29
- ```
30
- This is expected.
31
-
32
- You should not use GPT2Tokenizer instead, because of the different pretokenization rules.
33
-
34
- This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
35
- this superclass for more information regarding those methods.
36
-
37
- Args:
38
- vocab_file (`str`):
39
- Path to the vocabulary file.
40
- merges_file (`str`):
41
- Path to the merges file.
42
- errors (`str`, *optional*, defaults to `"replace"`):
43
- Paradigm to follow when decoding bytes to UTF-8. See
44
- [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
45
- unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
46
- The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
47
- token instead.
48
- bos_token (`str`, *optional*):
49
- The beginning of sequence token. Not applicable for this tokenizer.
50
- eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
51
- The end of sequence token.
52
- pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
53
- The token used for padding, for example when batching sequences of different lengths.
54
- clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
55
- Whether or not the model should cleanup the spaces that were added when splitting the input text during the
56
- tokenization process. Not applicable to this tokenizer, since tokenization does not add spaces.
57
- split_special_tokens (`bool`, *optional*, defaults to `False`):
58
- Whether or not the special tokens should be split during the tokenization process. The default behavior is
59
- to not split special tokens. This means that if `<|endoftext|>` is the `eos_token`, then `tokenizer.tokenize("<|endoftext|>") =
60
- ['<|endoftext|>`]. Otherwise, if `split_special_tokens=True`, then `tokenizer.tokenize("<|endoftext|>")` will be give `['<',
61
- '|', 'endo', 'ft', 'ext', '|', '>']`. This argument is only supported for `slow` tokenizers for the moment.
62
- add_eos_token (`bool`, *optional*, defaults to `False`):
63
- Whether or not to add an `eos_token` at the end of sequences.
64
- """
65
-
66
- def __init__(
67
- self,
68
- vocab_file,
69
- merges_file,
70
- errors="replace",
71
- unk_token="<|endoftext|>",
72
- bos_token=None,
73
- eos_token="<|endoftext|>",
74
- pad_token="<|endoftext|>",
75
- clean_up_tokenization_spaces=False,
76
- split_special_tokens=False,
77
- add_eos_token=False,
78
- **kwargs,
79
- ):
80
- # The add_eos_token code was inspired by the LlamaTokenizer
81
- self.add_eos_token = add_eos_token
82
-
83
- super().__init__(
84
- vocab_file=vocab_file,
85
- merges_file=merges_file,
86
- errors=errors,
87
- unk_token=unk_token,
88
- bos_token=bos_token,
89
- eos_token=eos_token,
90
- pad_token=pad_token,
91
- clean_up_tokenization_spaces=clean_up_tokenization_spaces,
92
- split_special_tokens=split_special_tokens,
93
- add_eos_token=add_eos_token,
94
  **kwargs,
95
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
 
97
- def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
98
- eos_token_id = [self.eos_token_id] if self.add_eos_token else []
 
99
 
100
- output = token_ids_0 + eos_token_id
 
 
101
 
102
- if token_ids_1 is not None:
103
- output = output + token_ids_1 + eos_token_id
 
 
 
104
 
105
- return output
 
106
 
107
- def get_special_tokens_mask(
108
- self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
109
- ) -> List[int]:
 
 
 
 
 
 
 
 
110
  """
111
- Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
112
- special tokens using the tokenizer `prepare_for_model` method.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113
 
114
  Args:
115
- token_ids_0 (`List[int]`):
116
- List of IDs.
117
- token_ids_1 (`List[int]`, *optional*):
118
- Optional second list of IDs for sequence pairs.
119
- already_has_special_tokens (`bool`, *optional*, defaults to `False`):
120
- Whether or not the token list is already formatted with special tokens for the model.
121
-
122
- Returns:
123
- `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
124
  """
125
- if already_has_special_tokens:
126
- return super().get_special_tokens_mask(
127
- token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
128
  )
129
 
130
- eos_token_id = [1] if self.add_eos_token else []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
 
132
- if token_ids_1 is None:
133
- return ([0] * len(token_ids_0)) + eos_token_id
134
- return (
135
- ([0] * len(token_ids_0))
136
- + eos_token_id
137
- + ([0] * len(token_ids_1))
138
- + eos_token_id
139
- )
140
 
141
- def create_token_type_ids_from_sequences(
142
- self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
143
- ) -> List[int]:
144
- """
145
- Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT
146
- sequence pair mask has the following format:
147
 
148
- ```
149
- 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
150
- | first sequence | second sequence |
151
- ```
152
 
153
- if token_ids_1 is None, only returns the first portion of the mask (0s).
 
 
 
 
154
 
155
- Args:
156
- token_ids_0 (`List[int]`):
157
- List of ids.
158
- token_ids_1 (`List[int]`, *optional*):
159
- Optional second list of IDs for sequence pairs.
160
 
161
- Returns:
162
- `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
163
- """
164
- eos_token_id = [self.eos_token_id] if self.add_eos_token else []
165
-
166
- output = [0] * len(token_ids_0 + eos_token_id)
167
-
168
- if token_ids_1 is not None:
169
- output += [1] * len(token_ids_1 + eos_token_id)
170
-
171
- return output
172
-
173
- class Qwen2TokenizerFast(OriginalQwen2TokenizerFast):
174
- """
175
- Construct a "fast" Qwen2 tokenizer (backed by HuggingFace's *tokenizers* library). Based on byte-level
176
- Byte-Pair-Encoding.
177
-
178
- Same with GPT2Tokenizer, this tokenizer has been trained to treat spaces like parts of the tokens so a word will
179
- be encoded differently whether it is at the beginning of the sentence (without space) or not:
180
-
181
- ```python
182
- >>> from transformers import Qwen2TokenizerFast
183
-
184
- >>> tokenizer = Qwen2TokenizerFast.from_pretrained("Qwen/Qwen-tokenizer")
185
- >>> tokenizer("Hello world")["input_ids"]
186
- [9707, 1879]
187
-
188
- >>> tokenizer(" Hello world")["input_ids"]
189
- [21927, 1879]
190
- ```
191
- This is expected.
192
-
193
- This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
194
- refer to this superclass for more information regarding those methods.
195
-
196
- Args:
197
- vocab_file (`str`, *optional*):
198
- Path to the vocabulary file.
199
- merges_file (`str`, *optional*):
200
- Path to the merges file.
201
- tokenizer_file (`str`, *optional*):
202
- Path to [tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension) that
203
- contains everything needed to load the tokenizer.
204
- unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
205
- The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
206
- token instead. Not applicable to this tokenizer.
207
- bos_token (`str`, *optional*):
208
- The beginning of sequence token. Not applicable for this tokenizer.
209
- eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
210
- The end of sequence token.
211
- pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
212
- The token used for padding, for example when batching sequences of different lengths.
213
- add_eos_token (`bool`, *optional*, defaults to `False`):
214
- Whether or not to add an `eos_token` at the end of sequences.
215
- """
216
-
217
- slow_tokenizer_class = Qwen2Tokenizer
218
- padding_side = "left"
219
-
220
- def __init__(
221
- self,
222
- vocab_file=None,
223
- merges_file=None,
224
- tokenizer_file=None,
225
- unk_token="<|endoftext|>",
226
- bos_token=None,
227
- eos_token="<|endoftext|>",
228
- pad_token="<|endoftext|>",
229
- add_eos_token=False,
230
- **kwargs,
231
- ):
232
- super().__init__(
233
- vocab_file=vocab_file,
234
- merges_file=merges_file,
235
- tokenizer_file=tokenizer_file,
236
- unk_token=unk_token,
237
- bos_token=bos_token,
238
- eos_token=eos_token,
239
- pad_token=pad_token,
240
- **kwargs,
241
- )
242
 
243
- self._add_eos_token = add_eos_token
244
- self.update_post_processor()
245
 
246
- def update_post_processor(self):
247
- """
248
- Updates the underlying post processor with the current `eos_token`.
249
- """
250
- eos = self.eos_token
251
- eos_token_id = self.eos_token_id
252
- if eos is None and self.add_eos_token:
253
- raise ValueError("add_eos_token = True but eos_token = None")
254
-
255
- single = f"$A:0{(' '+eos+':0') if self.add_eos_token else ''}"
256
- pair = f"{single} $B:1{(' '+eos+':1') if self.add_eos_token else ''}"
257
-
258
- special_tokens = []
259
- if self.add_eos_token:
260
- special_tokens.append((eos, eos_token_id))
261
- self._tokenizer.post_processor = processors.TemplateProcessing(
262
- single=single, pair=pair, special_tokens=special_tokens
263
- )
264
-
265
- @property
266
- def add_eos_token(self):
267
- return self._add_eos_token
 
1
+ # Copyright 2024 The Qwen team, Alibaba Group and The HuggingFace Inc. team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ """Tokenization classes for Qwen2."""
15
  from typing import List, Optional
16
+
17
+ from tokenizers import AddedToken, Regex, Tokenizer, decoders, normalizers, pre_tokenizers
18
+ from tokenizers.models import BPE
19
+ from tokenizers.processors import TemplateProcessing
20
+
21
 
22
  VOCAB_FILES_NAMES = {
23
  "vocab_file": "vocab.json",
 
25
  "tokenizer_file": "tokenizer.json",
26
  }
27
 
28
+ MAX_MODEL_INPUT_SIZES = {"qwen/qwen-tokenizer": 32768}
29
+
30
+ PRETOKENIZE_REGEX = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
31
+
32
+ from packaging.version import Version
33
+ import transformers
34
+
35
+ if Version(transformers.__version__) >= Version("5.0.0"):
36
+ from transformers import TokenizersBackend
37
+
38
+ class Qwen2Tokenizer(TokenizersBackend):
39
+ vocab_files_names = VOCAB_FILES_NAMES
40
+ model_input_names = ["input_ids", "attention_mask"]
41
+ model = BPE
42
+
43
+ def __init__(
44
+ self,
45
+ vocab: str | dict[str, int] | None = None,
46
+ merges: str | list[str] | None = None,
47
+ unk_token: str = "<|endoftext|>",
48
+ bos_token=None,
49
+ eos_token: str = "<|endoftext|>",
50
+ pad_token: str = "<|endoftext|>",
51
+ add_prefix_space=None,
52
+ add_eos_token=True,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  **kwargs,
54
+ ):
55
+ self.add_prefix_space = add_prefix_space if add_prefix_space is not None else False
56
+ self._vocab = (
57
+ vocab
58
+ if vocab is not None
59
+ else {
60
+ "<|endoftext|>": 0,
61
+ }
62
+ )
63
+ self._merges = merges or []
64
+ self._tokenizer = Tokenizer(
65
+ BPE(
66
+ vocab=self._vocab,
67
+ merges=self._merges,
68
+ dropout=None,
69
+ unk_token=None,
70
+ continuing_subword_prefix="",
71
+ end_of_word_suffix="",
72
+ fuse_unk=False,
73
+ byte_fallback=False,
74
+ )
75
+ )
76
+ self._tokenizer.decoder = decoders.ByteLevel()
77
+ self._tokenizer.normalizer = normalizers.NFC()
78
+ self._tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
79
+ [
80
+ pre_tokenizers.Split(
81
+ Regex(PRETOKENIZE_REGEX),
82
+ behavior="isolated",
83
+ invert=False,
84
+ ),
85
+ pre_tokenizers.ByteLevel(
86
+ add_prefix_space=self.add_prefix_space,
87
+ use_regex=False,
88
+ ),
89
+ ]
90
+ )
91
+
92
+ super().__init__(
93
+ unk_token=unk_token,
94
+ bos_token=bos_token,
95
+ eos_token=eos_token,
96
+ pad_token=pad_token,
97
+ add_prefix_space=add_prefix_space,
98
+ **kwargs,
99
+ )
100
 
101
+ self.add_tokens([AddedToken(token, special=True) for token in self.all_special_tokens])
102
+ self._add_eos_token = add_eos_token
103
+ self.update_post_processor()
104
 
105
+ @property
106
+ def add_eos_token(self):
107
+ return self._add_eos_token
108
 
109
+ def update_post_processor(self):
110
+ eos = self.eos_token
111
+ eos_token_id = self.eos_token_id
112
+ if eos is None and self.add_eos_token:
113
+ raise ValueError("add_eos_token = True but eos_token = None")
114
 
115
+ single = f"$A:0{(' ' + eos + ':0') if self.add_eos_token else ''}"
116
+ pair = f"{single} $B:1{(' ' + eos + ':1') if self.add_eos_token else ''}"
117
 
118
+ special_tokens = []
119
+ if self.add_eos_token:
120
+ special_tokens.append((eos, eos_token_id))
121
+ self._tokenizer.post_processor = TemplateProcessing(
122
+ single=single, pair=pair, special_tokens=special_tokens
123
+ )
124
+
125
+ else:
126
+ from transformers.models.qwen2.tokenization_qwen2 import Qwen2Tokenizer as OriginalQwen2Tokenizer
127
+
128
+ class Qwen2Tokenizer(OriginalQwen2Tokenizer):
129
  """
130
+ Construct a Qwen2 tokenizer. Based on byte-level Byte-Pair-Encoding.
131
+
132
+ Same with GPT2Tokenizer, this tokenizer has been trained to treat spaces like parts of the tokens so a word will
133
+ be encoded differently whether it is at the beginning of the sentence (without space) or not:
134
+
135
+ ```python
136
+ >>> from transformers import Qwen2Tokenizer
137
+
138
+ >>> tokenizer = Qwen2Tokenizer.from_pretrained("Qwen/Qwen-tokenizer")
139
+ >>> tokenizer("Hello world")["input_ids"]
140
+ [9707, 1879]
141
+
142
+ >>> tokenizer(" Hello world")["input_ids"]
143
+ [21927, 1879]
144
+ ```
145
+ This is expected.
146
+
147
+ You should not use GPT2Tokenizer instead, because of the different pretokenization rules.
148
+
149
+ This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
150
+ this superclass for more information regarding those methods.
151
 
152
  Args:
153
+ vocab_file (`str`):
154
+ Path to the vocabulary file.
155
+ merges_file (`str`):
156
+ Path to the merges file.
157
+ errors (`str`, *optional*, defaults to `"replace"`):
158
+ Paradigm to follow when decoding bytes to UTF-8. See
159
+ [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
160
+ unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
161
+ The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
162
+ token instead.
163
+ bos_token (`str`, *optional*):
164
+ The beginning of sequence token. Not applicable for this tokenizer.
165
+ eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
166
+ The end of sequence token.
167
+ pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
168
+ The token used for padding, for example when batching sequences of different lengths.
169
+ clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
170
+ Whether or not the model should cleanup the spaces that were added when splitting the input text during the
171
+ tokenization process. Not applicable to this tokenizer, since tokenization does not add spaces.
172
+ split_special_tokens (`bool`, *optional*, defaults to `False`):
173
+ Whether or not the special tokens should be split during the tokenization process. The default behavior is
174
+ to not split special tokens. This means that if `<|endoftext|>` is the `eos_token`, then `tokenizer.tokenize("<|endoftext|>") =
175
+ ['<|endoftext|>`]. Otherwise, if `split_special_tokens=True`, then `tokenizer.tokenize("<|endoftext|>")` will be give `['<',
176
+ '|', 'endo', 'ft', 'ext', '|', '>']`. This argument is only supported for `slow` tokenizers for the moment.
177
+ add_eos_token (`bool`, *optional*, defaults to `False`):
178
+ Whether or not to add an `eos_token` at the end of sequences.
179
  """
180
+
181
+ def __init__(
182
+ self,
183
+ vocab_file,
184
+ merges_file,
185
+ errors="replace",
186
+ unk_token="<|endoftext|>",
187
+ bos_token=None,
188
+ eos_token="<|endoftext|>",
189
+ pad_token="<|endoftext|>",
190
+ clean_up_tokenization_spaces=False,
191
+ split_special_tokens=False,
192
+ add_eos_token=False,
193
+ **kwargs,
194
+ ):
195
+ # The add_eos_token code was inspired by the LlamaTokenizer
196
+ self.add_eos_token = add_eos_token
197
+
198
+ super().__init__(
199
+ vocab_file=vocab_file,
200
+ merges_file=merges_file,
201
+ errors=errors,
202
+ unk_token=unk_token,
203
+ bos_token=bos_token,
204
+ eos_token=eos_token,
205
+ pad_token=pad_token,
206
+ clean_up_tokenization_spaces=clean_up_tokenization_spaces,
207
+ split_special_tokens=split_special_tokens,
208
+ add_eos_token=add_eos_token,
209
+ **kwargs,
210
  )
211
 
212
+ def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
213
+ eos_token_id = [self.eos_token_id] if self.add_eos_token else []
214
+
215
+ output = token_ids_0 + eos_token_id
216
+
217
+ if token_ids_1 is not None:
218
+ output = output + token_ids_1 + eos_token_id
219
+
220
+ return output
221
+
222
+ def get_special_tokens_mask(
223
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None,
224
+ already_has_special_tokens: bool = False
225
+ ) -> List[int]:
226
+ """
227
+ Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
228
+ special tokens using the tokenizer `prepare_for_model` method.
229
+
230
+ Args:
231
+ token_ids_0 (`List[int]`):
232
+ List of IDs.
233
+ token_ids_1 (`List[int]`, *optional*):
234
+ Optional second list of IDs for sequence pairs.
235
+ already_has_special_tokens (`bool`, *optional*, defaults to `False`):
236
+ Whether or not the token list is already formatted with special tokens for the model.
237
+
238
+ Returns:
239
+ `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
240
+ """
241
+ if already_has_special_tokens:
242
+ return super().get_special_tokens_mask(
243
+ token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
244
+ )
245
+
246
+ eos_token_id = [1] if self.add_eos_token else []
247
+
248
+ if token_ids_1 is None:
249
+ return ([0] * len(token_ids_0)) + eos_token_id
250
+ return (
251
+ ([0] * len(token_ids_0))
252
+ + eos_token_id
253
+ + ([0] * len(token_ids_1))
254
+ + eos_token_id
255
+ )
256
 
257
+ def create_token_type_ids_from_sequences(
258
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
259
+ ) -> List[int]:
260
+ """
261
+ Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT
262
+ sequence pair mask has the following format:
 
 
263
 
264
+ ```
265
+ 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
266
+ | first sequence | second sequence |
267
+ ```
 
 
268
 
269
+ if token_ids_1 is None, only returns the first portion of the mask (0s).
 
 
 
270
 
271
+ Args:
272
+ token_ids_0 (`List[int]`):
273
+ List of ids.
274
+ token_ids_1 (`List[int]`, *optional*):
275
+ Optional second list of IDs for sequence pairs.
276
 
277
+ Returns:
278
+ `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
279
+ """
280
+ eos_token_id = [self.eos_token_id] if self.add_eos_token else []
 
281
 
282
+ output = [0] * len(token_ids_0 + eos_token_id)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
283
 
284
+ if token_ids_1 is not None:
285
+ output += [1] * len(token_ids_1 + eos_token_id)
286
 
287
+ return output
288
+
289
# Explicit public API of this module.
__all__ = ["Qwen2Tokenizer"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
tokenizer_config.json CHANGED
@@ -32,7 +32,7 @@
32
  "<|im_end|>"
33
  ],
34
  "auto_map": {
35
- "AutoTokenizer": ["tokenization_qwen.Qwen2Tokenizer", "tokenization_qwen.Qwen2TokenizerFast"]
36
  },
37
  "bos_token": null,
38
  "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
@@ -43,5 +43,6 @@
43
  "pad_token": "<|endoftext|>",
44
  "split_special_tokens": false,
45
  "tokenizer_class": "Qwen2Tokenizer",
46
- "unk_token": null
 
47
  }
 
32
  "<|im_end|>"
33
  ],
34
  "auto_map": {
35
+ "AutoTokenizer": ["tokenization_qwen.Qwen2Tokenizer", "tokenization_qwen.Qwen2Tokenizer"]
36
  },
37
  "bos_token": null,
38
  "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
 
43
  "pad_token": "<|endoftext|>",
44
  "split_special_tokens": false,
45
  "tokenizer_class": "Qwen2Tokenizer",
46
+ "unk_token": null,
47
+ "padding_side": "left"
48
  }