exdysa committed
Commit 4f35d33 · verified · Parent: fe1b0c6

Delete tokenization_qwen2_fast.py

Files changed (1)
  1. tokenization_qwen2_fast.py +0 -131
tokenization_qwen2_fast.py DELETED
@@ -1,131 +0,0 @@
- # coding=utf-8
- # Copyright 2024 The Qwen team, Alibaba Group and The HuggingFace Inc. team. All rights reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- #     http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- """Tokenization classes for Qwen2."""
-
- from typing import Optional, Tuple
-
- from transformers.tokenization_utils import AddedToken
- from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
- from transformers.utils import logging
- from .tokenization_qwen2 import Qwen2Tokenizer
-
-
- logger = logging.get_logger(__name__)
-
- VOCAB_FILES_NAMES = {
-     "vocab_file": "vocab.json",
-     "merges_file": "merges.txt",
-     "tokenizer_file": "tokenizer.json",
- }
-
-
- MAX_MODEL_INPUT_SIZES = {"qwen/qwen-tokenizer": 32768}
-
-
- class Qwen2TokenizerFast(PreTrainedTokenizerFast):
-     """
-     Construct a "fast" Qwen2 tokenizer (backed by HuggingFace's *tokenizers* library). Based on byte-level
-     Byte-Pair-Encoding.
-     Same with GPT2Tokenizer, this tokenizer has been trained to treat spaces like parts of the tokens so a word will
-     be encoded differently whether it is at the beginning of the sentence (without space) or not:
-     ```python
-     >>> from transformers import Qwen2TokenizerFast
-     >>> tokenizer = Qwen2TokenizerFast.from_pretrained("Qwen/Qwen-tokenizer")
-     >>> tokenizer("Hello world")["input_ids"]
-     [9707, 1879]
-     >>> tokenizer(" Hello world")["input_ids"]
-     [21927, 1879]
-     ```
-     This is expected.
-     This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
-     refer to this superclass for more information regarding those methods.
-     Args:
-         vocab_file (`str`, *optional*):
-             Path to the vocabulary file.
-         merges_file (`str`, *optional*):
-             Path to the merges file.
-         tokenizer_file (`str`, *optional*):
-             Path to [tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension) that
-             contains everything needed to load the tokenizer.
-         unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
-             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
-             token instead. Not applicable to this tokenizer.
-         bos_token (`str`, *optional*):
-             The beginning of sequence token. Not applicable for this tokenizer.
-         eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
-             The end of sequence token.
-         pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
-             The token used for padding, for example when batching sequences of different lengths.
-     """
-
-     vocab_files_names = VOCAB_FILES_NAMES
-     model_input_names = ["input_ids", "attention_mask"]
-     slow_tokenizer_class = Qwen2Tokenizer
-
-     def __init__(
-         self,
-         vocab_file=None,
-         merges_file=None,
-         tokenizer_file=None,
-         unk_token="<|endoftext|>",
-         bos_token=None,
-         eos_token="<|endoftext|>",
-         pad_token="<|endoftext|>",
-         **kwargs,
-     ):
-         # We need to at least pass vocab_file and merges_file to base class
-         # in case a slow tokenizer needs to be initialized; other can be
-         # configured through files.
-         # following GPT2TokenizerFast, also adding unk_token, bos_token, and eos_token
-
-         bos_token = (
-             AddedToken(bos_token, lstrip=False, rstrip=False, special=True, normalized=False)
-             if isinstance(bos_token, str)
-             else bos_token
-         )
-         eos_token = (
-             AddedToken(eos_token, lstrip=False, rstrip=False, special=True, normalized=False)
-             if isinstance(eos_token, str)
-             else eos_token
-         )
-         unk_token = (
-             AddedToken(unk_token, lstrip=False, rstrip=False, special=True, normalized=False)
-             if isinstance(unk_token, str)
-             else unk_token
-         )
-         pad_token = (
-             AddedToken(pad_token, lstrip=False, rstrip=False, special=True, normalized=False)
-             if isinstance(pad_token, str)
-             else pad_token
-         )
-
-         super().__init__(
-             vocab_file=vocab_file,
-             merges_file=merges_file,
-             tokenizer_file=tokenizer_file,
-             unk_token=unk_token,
-             bos_token=bos_token,
-             eos_token=eos_token,
-             pad_token=pad_token,
-             **kwargs,
-         )
-
-     # Copied from transformers.models.gpt2.tokenization_gpt2_fast.GPT2TokenizerFast.save_vocabulary
-     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
-         files = self._tokenizer.model.save(save_directory, name=filename_prefix)
-         return tuple(files)
-
-
- __all__ = ["Qwen2TokenizerFast"]
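
For context, the deleted class mirrors the `Qwen2TokenizerFast` that ships with transformers itself, which is presumably why the repository can drop its local copy. A minimal sketch of the equivalent stock usage, assuming transformers >= 4.37 (the first release with Qwen2 support) and the public Qwen/Qwen-tokenizer checkpoint; the expected ids come from the deleted docstring:

```python
# Minimal sketch, not part of this commit. Assumes transformers >= 4.37,
# which bundles Qwen2TokenizerFast, making the deleted custom copy redundant.
from transformers import Qwen2TokenizerFast

tokenizer = Qwen2TokenizerFast.from_pretrained("Qwen/Qwen-tokenizer")

# Byte-level BPE treats a leading space as part of the token, so the same
# word encodes differently at the start of a string than after a space
# (expected ids taken from the deleted docstring):
print(tokenizer("Hello world")["input_ids"])   # [9707, 1879]
print(tokenizer(" Hello world")["input_ids"])  # [21927, 1879]

# As in the deleted __init__, eos and pad default to "<|endoftext|>".
print(tokenizer.eos_token, tokenizer.pad_token)
```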