exdysa committed on
Commit 111b8ca · verified · 1 Parent(s): 812e485

Delete tokenization_qwen2_fast.py

Files changed (1)
  1. tokenization_qwen2_fast.py +0 -137
tokenization_qwen2_fast.py DELETED
@@ -1,137 +0,0 @@
- # coding=utf-8
- # Copyright 2024 The Qwen team, Alibaba Group and The HuggingFace Inc. team. All rights reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- #     http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- """Tokenization classes for Qwen2."""
-
- from typing import Optional, Tuple
-
- from transformers.tokenization_utils import AddedToken
- from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
- from transformers.utils import logging
- from .tokenization_qwen2 import Qwen2Tokenizer
-
-
- logger = logging.get_logger(__name__)
-
- VOCAB_FILES_NAMES = {
-     "vocab_file": "vocab.json",
-     "merges_file": "merges.txt",
-     "tokenizer_file": "tokenizer.json",
- }
-
-
- MAX_MODEL_INPUT_SIZES = {"qwen/qwen-tokenizer": 32768}
-
-
- class Qwen2TokenizerFast(PreTrainedTokenizerFast):
-     """
-     Construct a "fast" Qwen2 tokenizer (backed by HuggingFace's *tokenizers* library). Based on byte-level
-     Byte-Pair-Encoding.
-
-     As with GPT2Tokenizer, this tokenizer has been trained to treat spaces like parts of the tokens, so a word will
-     be encoded differently depending on whether it is at the beginning of the sentence (without space) or not:
-
-     ```python
-     >>> from transformers import Qwen2TokenizerFast
-
-     >>> tokenizer = Qwen2TokenizerFast.from_pretrained("Qwen/Qwen-tokenizer")
-     >>> tokenizer("Hello world")["input_ids"]
-     [9707, 1879]
-
-     >>> tokenizer(" Hello world")["input_ids"]
-     [21927, 1879]
-     ```
-     This is expected.
-
-     This tokenizer inherits from [`PreTrainedTokenizerFast`], which contains most of the main methods. Users should
-     refer to this superclass for more information regarding those methods.
-
-     Args:
-         vocab_file (`str`, *optional*):
-             Path to the vocabulary file.
-         merges_file (`str`, *optional*):
-             Path to the merges file.
-         tokenizer_file (`str`, *optional*):
-             Path to a [tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension)
-             that contains everything needed to load the tokenizer.
-         unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
-             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be
-             this token instead. Not applicable to this tokenizer.
-         bos_token (`str`, *optional*):
-             The beginning of sequence token. Not applicable to this tokenizer.
-         eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
-             The end of sequence token.
-         pad_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
-             The token used for padding, for example when batching sequences of different lengths.
-     """
-
-     vocab_files_names = VOCAB_FILES_NAMES
-     model_input_names = ["input_ids", "attention_mask"]
-     slow_tokenizer_class = Qwen2Tokenizer
-
-     def __init__(
-         self,
-         vocab_file=None,
-         merges_file=None,
-         tokenizer_file=None,
-         unk_token="<|endoftext|>",
-         bos_token=None,
-         eos_token="<|endoftext|>",
-         pad_token="<|endoftext|>",
-         **kwargs,
-     ):
-         # We need to pass at least vocab_file and merges_file to the base class
-         # in case a slow tokenizer needs to be initialized; the others can be
-         # configured through files. Following GPT2TokenizerFast, the special
-         # tokens below are wrapped as AddedToken instances.
-
-         bos_token = (
-             AddedToken(bos_token, lstrip=False, rstrip=False, special=True, normalized=False)
-             if isinstance(bos_token, str)
-             else bos_token
-         )
-         eos_token = (
-             AddedToken(eos_token, lstrip=False, rstrip=False, special=True, normalized=False)
-             if isinstance(eos_token, str)
-             else eos_token
-         )
-         unk_token = (
-             AddedToken(unk_token, lstrip=False, rstrip=False, special=True, normalized=False)
-             if isinstance(unk_token, str)
-             else unk_token
-         )
-         pad_token = (
-             AddedToken(pad_token, lstrip=False, rstrip=False, special=True, normalized=False)
-             if isinstance(pad_token, str)
-             else pad_token
-         )
-
-         super().__init__(
-             vocab_file=vocab_file,
-             merges_file=merges_file,
-             tokenizer_file=tokenizer_file,
-             unk_token=unk_token,
-             bos_token=bos_token,
-             eos_token=eos_token,
-             pad_token=pad_token,
-             **kwargs,
-         )
-
-     # Copied from transformers.models.gpt2.tokenization_gpt2_fast.GPT2TokenizerFast.save_vocabulary
-     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
-         files = self._tokenizer.model.save(save_directory, name=filename_prefix)
-         return tuple(files)
-
-
- __all__ = ["Qwen2TokenizerFast"]
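
With this custom module deleted, loading presumably falls back to the `Qwen2TokenizerFast` implementation bundled with `transformers` itself (v4.37 and later), which `AutoTokenizer` resolves from the repo's `tokenizer.json`. A minimal sketch of that post-deletion loading path, assuming a hypothetical local checkout at `./qwen2-model` (not a real repo id):

```python
# Minimal sketch, not the repo's confirmed loading path: after deleting the
# custom tokenization_qwen2_fast.py, AutoTokenizer resolves Qwen2TokenizerFast
# from the transformers package (v4.37+) using the bundled tokenizer.json.
# "./qwen2-model" is a hypothetical local path.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("./qwen2-model")

# Byte-level BPE is whitespace-sensitive, matching the deleted docstring's example:
print(tokenizer("Hello world")["input_ids"])   # [9707, 1879]
print(tokenizer(" Hello world")["input_ids"])  # [21927, 1879]
```

If the `auto_map` entry pointing at this file was also removed from `tokenizer_config.json`, loading no longer requires `trust_remote_code=True`.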