tranviethuy01 commited on
Commit
fe32f46
·
verified ·
1 Parent(s): 0e78128

Clone model from vinai/bartpho-syllable

Browse files
.gitattributes CHANGED
@@ -1,35 +1,29 @@
1
  *.7z filter=lfs diff=lfs merge=lfs -text
2
  *.arrow filter=lfs diff=lfs merge=lfs -text
3
  *.bin filter=lfs diff=lfs merge=lfs -text
 
4
  *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
  *.ftz filter=lfs diff=lfs merge=lfs -text
7
  *.gz filter=lfs diff=lfs merge=lfs -text
8
  *.h5 filter=lfs diff=lfs merge=lfs -text
9
  *.joblib filter=lfs diff=lfs merge=lfs -text
10
  *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
  *.model filter=lfs diff=lfs merge=lfs -text
13
  *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
  *.onnx filter=lfs diff=lfs merge=lfs -text
17
  *.ot filter=lfs diff=lfs merge=lfs -text
18
  *.parquet filter=lfs diff=lfs merge=lfs -text
19
  *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
  *.pt filter=lfs diff=lfs merge=lfs -text
23
  *.pth filter=lfs diff=lfs merge=lfs -text
24
  *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
  *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
  *.tflite filter=lfs diff=lfs merge=lfs -text
30
  *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
  *.xz filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
1
  *.7z filter=lfs diff=lfs merge=lfs -text
2
  *.arrow filter=lfs diff=lfs merge=lfs -text
3
  *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bin.* filter=lfs diff=lfs merge=lfs -text
5
  *.bz2 filter=lfs diff=lfs merge=lfs -text
 
6
  *.ftz filter=lfs diff=lfs merge=lfs -text
7
  *.gz filter=lfs diff=lfs merge=lfs -text
8
  *.h5 filter=lfs diff=lfs merge=lfs -text
9
  *.joblib filter=lfs diff=lfs merge=lfs -text
10
  *.lfs.* filter=lfs diff=lfs merge=lfs -text
 
11
  *.model filter=lfs diff=lfs merge=lfs -text
12
  *.msgpack filter=lfs diff=lfs merge=lfs -text
 
 
13
  *.onnx filter=lfs diff=lfs merge=lfs -text
14
  *.ot filter=lfs diff=lfs merge=lfs -text
15
  *.parquet filter=lfs diff=lfs merge=lfs -text
16
  *.pb filter=lfs diff=lfs merge=lfs -text
 
 
17
  *.pt filter=lfs diff=lfs merge=lfs -text
18
  *.pth filter=lfs diff=lfs merge=lfs -text
19
  *.rar filter=lfs diff=lfs merge=lfs -text
20
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 
21
  *.tar.* filter=lfs diff=lfs merge=lfs -text
 
22
  *.tflite filter=lfs diff=lfs merge=lfs -text
23
  *.tgz filter=lfs diff=lfs merge=lfs -text
 
24
  *.xz filter=lfs diff=lfs merge=lfs -text
25
  *.zip filter=lfs diff=lfs merge=lfs -text
26
+ *.zstandard filter=lfs diff=lfs merge=lfs -text
27
  *tfevents* filter=lfs diff=lfs merge=lfs -text
28
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
29
+ model.safetensors filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,3 +1,21 @@
1
- ---
2
- license: mit
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: mit
3
+ ---
4
+ # <a name="introduction"></a> BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese
5
+
6
+
7
+ Two BARTpho versions `BARTpho-syllable` and `BARTpho-word` are the first public large-scale monolingual sequence-to-sequence models pre-trained for Vietnamese. BARTpho uses the "large" architecture and pre-training scheme of the sequence-to-sequence denoising model [BART](https://github.com/pytorch/fairseq/tree/main/examples/bart), thus especially suitable for generative NLP tasks. Experiments on a downstream task of Vietnamese text summarization show that in both automatic and human evaluations, BARTpho outperforms the strong baseline [mBART](https://github.com/pytorch/fairseq/tree/main/examples/mbart) and improves the state-of-the-art.
8
+
9
+ The general architecture and experimental results of BARTpho can be found in our [paper](https://arxiv.org/abs/2109.09701):
10
+
11
+ @article{bartpho,
12
+ title = {{BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese}},
13
+ author = {Nguyen Luong Tran and Duong Minh Le and Dat Quoc Nguyen},
14
+ journal = {arXiv preprint},
15
+ volume = {arXiv:2109.09701},
16
+ year = {2021}
17
+ }
18
+
19
+ **Please CITE** our paper when BARTpho is used to help produce published results or incorporated into other software.
20
+
21
+ For further information or requests, please go to [BARTpho's homepage](https://github.com/VinAIResearch/BARTpho)!
__init__.py ADDED
File without changes
config.json ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_dropout": 0.0,
3
+ "activation_function": "gelu",
4
+ "architectures": [
5
+ "MBartModel"
6
+ ],
7
+ "attention_dropout": 0.0,
8
+ "bos_token_id": 0,
9
+ "classifier_dropout": 0.0,
10
+ "d_model": 1024,
11
+ "decoder_attention_heads": 16,
12
+ "decoder_ffn_dim": 4096,
13
+ "decoder_layerdrop": 0.0,
14
+ "decoder_layers": 12,
15
+ "decoder_start_token_id": 2,
16
+ "dropout": 0.1,
17
+ "encoder_attention_heads": 16,
18
+ "encoder_ffn_dim": 4096,
19
+ "encoder_layerdrop": 0.0,
20
+ "encoder_layers": 12,
21
+ "eos_token_id": 2,
22
+ "forced_eos_token_id": 2,
23
+ "gradient_checkpointing": false,
24
+ "init_std": 0.02,
25
+ "is_encoder_decoder": true,
26
+ "max_position_embeddings": 1024,
27
+ "model_type": "mbart",
28
+ "num_hidden_layers": 12,
29
+ "pad_token_id": 1,
30
+ "scale_embedding": false,
31
+ "torch_dtype": "float32",
32
+ "transformers_version": "4.10.2",
33
+ "use_cache": true,
34
+ "vocab_size": 40030,
35
+ "tokenizer_class": "BartphoTokenizer"
36
+ }
dict.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2c3ca2ef8f5b24e0f3c71a8736cf207ac9bec18eeae5df39bac9c339721f96f4
3
+ size 1583480280
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8eb59e162b7c19001935efa03e0685ebb53b40c8d951286cff76138a87b6ac5e
3
+ size 1583451917
runs/Apr16_16-20-05_gb2t-WS-C621E-SAGE-Series/events.out.tfevents.1713259206.gb2t-WS-C621E-SAGE-Series.60008.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:429e3a33a41c913b63052cf10c3f6fec444f0f47c2d7e42274f41728740cd43a
3
+ size 5075
runs/Apr16_16-23-23_gb2t-WS-C621E-SAGE-Series/events.out.tfevents.1713259405.gb2t-WS-C621E-SAGE-Series.60748.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:447354c2aded4f2703d77adb4dfe5dc8dc27b9808f094b1c83fcfaf14ec50e6f
3
+ size 10518
sentencepiece.bpe.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865
3
+ size 5069051
special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "cls_token": "<s>",
4
+ "eos_token": "</s>",
5
+ "mask_token": {
6
+ "content": "<mask>",
7
+ "lstrip": true,
8
+ "normalized": true,
9
+ "rstrip": false,
10
+ "single_word": false
11
+ },
12
+ "pad_token": "<pad>",
13
+ "sep_token": "</s>",
14
+ "unk_token": "<unk>"
15
+ }
tf_model.h5 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1cc15aeacb446d0f6f02c2da354552a4bde7eec757c61d503e0754fa482209bc
3
+ size 1583820760
tokenization_bartpho.py ADDED
@@ -0,0 +1,329 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2021 VinAI Research and the HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License
15
+ """ Tokenization classes for BARTpho-syllable model."""
16
+
17
+
18
+ import os
19
+ from shutil import copyfile
20
+ from typing import Any, Dict, List, Optional, Tuple
21
+
22
+ import sentencepiece as spm
23
+
24
+ from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
25
+ from transformers.utils import logging
26
+
27
+
28
+ logger = logging.get_logger(__name__)
29
+
30
# SentencePiece's meta-symbol (U+2581) that marks a word boundary in tokens.
SPIECE_UNDERLINE = "▁"

# File names expected inside a saved tokenizer directory.
VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model", "monolingual_vocab_file": "dict.txt"}

# Download locations of the pretrained vocabulary files on the Hugging Face Hub.
_REPO_URL = "https://huggingface.co/vinai/bartpho-syllable/resolve/main"
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "vinai/bartpho-syllable": f"{_REPO_URL}/sentencepiece.bpe.model",
    },
    "monolingual_vocab_file": {
        "vinai/bartpho-syllable": f"{_REPO_URL}/dict.txt",
    },
}

# Maximum sequence length supported by the pretrained positional embeddings.
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"vinai/bartpho-syllable": 1024}
44
+
45
+
46
class BartphoTokenizer(PreTrainedTokenizer):
    """
    Adapted from [`XLMRobertaTokenizer`]. Based on [SentencePiece](https://github.com/google/sentencepiece).

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file. This vocabulary is the pre-trained SentencePiece model available from the
            multilingual XLM-RoBERTa, also used in mBART, consisting of 250K types.
        monolingual_vocab_file (`str`):
            Path to the monolingual vocabulary file. This monolingual vocabulary consists of Vietnamese-specialized
            types extracted from the multilingual vocabulary vocab_file of 250K types.
        bos_token (`str`, *optional*, defaults to `"<s>"`):
            The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier
            token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the beginning of
            sequence. The token used is the `cls_token`.

            </Tip>

        eos_token (`str`, *optional*, defaults to `"</s>"`):
            The end of sequence token.

            <Tip>

            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
            The token used is the `sep_token`.

            </Tip>

        sep_token (`str`, *optional*, defaults to `"</s>"`):
            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
            sequence classification or for a text and a question for question answering. It is also used as the last
            token of a sequence built with special tokens.
        cls_token (`str`, *optional*, defaults to `"<s>"`):
            The classifier token which is used when doing sequence classification (classification of the whole sequence
            instead of per-token classification). It is the first token of the sequence when built with special tokens.
        unk_token (`str`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        pad_token (`str`, *optional*, defaults to `"<pad>"`):
            The token used for padding, for example when batching sequences of different lengths.
        mask_token (`str`, *optional*, defaults to `"<mask>"`):
            The token used for masking values. This is the token used when training this model with masked language
            modeling. This is the token which the model will try to predict.
        sp_model_kwargs (`dict`, *optional*):
            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
            to set:

            - `enable_sampling`: Enable subword regularization.
            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.

              - `nbest_size = {0,1}`: No sampling is performed.
              - `nbest_size > 1`: samples from the nbest_size results.
              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
                using forward-filtering-and-backward-sampling algorithm.

            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
              BPE-dropout.

    Attributes:
        sp_model (`SentencePieceProcessor`):
            The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
    """

    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        monolingual_vocab_file,
        bos_token="<s>",
        eos_token="</s>",
        sep_token="</s>",
        cls_token="<s>",
        unk_token="<unk>",
        pad_token="<pad>",
        mask_token="<mask>",
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        **kwargs
    ) -> None:
        # Mask token behaves like a normal word, i.e. includes the space before it.
        mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token

        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs

        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            sep_token=sep_token,
            cls_token=cls_token,
            pad_token=pad_token,
            mask_token=mask_token,
            sp_model_kwargs=self.sp_model_kwargs,
            **kwargs,
        )

        self.vocab_file = vocab_file
        self.monolingual_vocab_file = monolingual_vocab_file
        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.Load(str(vocab_file))

        # Load the reduced (Vietnamese-specialized) vocab.

        # Keep order of special tokens for backward compatibility.
        self.fairseq_tokens_to_ids = {}
        cnt = 0
        for token in [bos_token, pad_token, eos_token, unk_token, sep_token, cls_token]:
            if str(token) not in self.fairseq_tokens_to_ids:
                self.fairseq_tokens_to_ids[str(token)] = cnt
                cnt += 1
        with open(monolingual_vocab_file, "r", encoding="utf-8") as f:
            for line in f:
                fields = line.strip().split()
                if not fields:
                    # Guard against blank lines in dict.txt: the previous
                    # `line.strip().split()[0]` raised IndexError on them.
                    continue
                # First whitespace-separated field is the token; any remaining
                # fields (e.g. fairseq frequency counts) are ignored.
                self.fairseq_tokens_to_ids[fields[0]] = len(self.fairseq_tokens_to_ids)
        if str(mask_token) not in self.fairseq_tokens_to_ids:
            self.fairseq_tokens_to_ids[str(mask_token)] = len(self.fairseq_tokens_to_ids)

        # Inverse mapping, used by _convert_id_to_token and vocab_size.
        self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}

    def __getstate__(self):
        # The SentencePieceProcessor object itself is not picklable; serialize
        # its model proto instead and rebuild it in __setstate__.
        state = self.__dict__.copy()
        state["sp_model"] = None
        state["sp_model_proto"] = self.sp_model.serialized_model_proto()
        return state

    def __setstate__(self, d):
        self.__dict__ = d

        # for backward compatibility with pickles created before sp_model_kwargs existed
        if not hasattr(self, "sp_model_kwargs"):
            self.sp_model_kwargs = {}

        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
        self.sp_model.LoadFromSerializedProto(self.sp_model_proto)

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. A BARTpho sequence has the following format:

        - single sequence: `<s> X </s>`
        - pair of sequences: `<s> A </s></s> B </s>`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """

        if token_ids_1 is None:
            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
        cls = [self.cls_token_id]
        sep = [self.sep_token_id]
        return cls + token_ids_0 + sep + sep + token_ids_1 + sep

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        """

        if already_has_special_tokens:
            return super().get_special_tokens_mask(
                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
            )

        if token_ids_1 is None:
            return [1] + ([0] * len(token_ids_0)) + [1]
        return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Create a mask from the two sequences passed to be used in a sequence-pair classification task. BARTpho does not
        make use of token type ids, therefore a list of zeros is returned.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of zeros.

        """

        sep = [self.sep_token_id]
        cls = [self.cls_token_id]

        if token_ids_1 is None:
            return len(cls + token_ids_0 + sep) * [0]
        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]

    @property
    def vocab_size(self):
        # Size of the reduced (monolingual + special tokens) vocabulary, not
        # of the underlying 250K-type SentencePiece model.
        return len(self.fairseq_ids_to_tokens)

    def get_vocab(self):
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        vocab.update(self.added_tokens_encoder)
        return vocab

    def _tokenize(self, text: str) -> List[str]:
        return self.sp_model.encode(text, out_type=str)

    def _convert_token_to_id(self, token):
        """Converts a token (str) in an id using the vocab."""
        if token in self.fairseq_tokens_to_ids:
            return self.fairseq_tokens_to_ids[token]
        else:
            # Pieces produced by the SentencePiece model that are absent from
            # the reduced monolingual vocab map to <unk>.
            return self.unk_token_id

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        return self.fairseq_ids_to_tokens[index]

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (strings for sub-words) in a single string."""
        out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
        return out_string

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Save the SentencePiece model and the monolingual vocab to `save_directory`.

        Returns the paths of the two written files, or `None` if
        `save_directory` is not a directory.
        """
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )
        out_monolingual_vocab_file = os.path.join(
            save_directory,
            (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["monolingual_vocab_file"],
        )

        # Copy the original files when they exist; otherwise re-serialize from
        # the in-memory state (e.g. when the tokenizer was unpickled).
        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
            copyfile(self.vocab_file, out_vocab_file)
        elif not os.path.isfile(self.vocab_file):
            with open(out_vocab_file, "wb") as fi:
                content_spiece_model = self.sp_model.serialized_model_proto()
                fi.write(content_spiece_model)

        if os.path.abspath(self.monolingual_vocab_file) != os.path.abspath(
            out_monolingual_vocab_file
        ) and os.path.isfile(self.monolingual_vocab_file):
            copyfile(self.monolingual_vocab_file, out_monolingual_vocab_file)
        elif not os.path.isfile(self.monolingual_vocab_file):
            with open(out_monolingual_vocab_file, "w", encoding="utf-8") as fp:
                for token in self.fairseq_tokens_to_ids:
                    if token not in self.all_special_tokens:
                        fp.write(f"{str(token)} \n")

        return out_vocab_file, out_monolingual_vocab_file
tokenization_bartpho_fast.py ADDED
@@ -0,0 +1,334 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2021 VinAI Research and the HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License
15
+ """ Tokenization classes for BARTpho-syllable model."""
16
+
17
+ import os
18
+ from collections import defaultdict
19
+ from shutil import copyfile
20
+ from typing import Any, Dict, List, Optional, Tuple, Union
21
+
22
+ from transformers.tokenization_utils import AddedToken
23
+ from transformers.tokenization_utils_base import EncodingFast
24
+ from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
25
+ from transformers.utils import is_sentencepiece_available, logging
26
+
27
+
28
+ if is_sentencepiece_available():
29
+ from .tokenization_bartpho import BartphoTokenizer
30
+ else:
31
+ BartphoTokenizer = None
32
+
33
+
34
+ logger = logging.get_logger(__name__)
35
+
36
# Resource file names expected inside a saved fast-tokenizer directory.
VOCAB_FILES_NAMES = {
    "vocab_file": "sentencepiece.bpe.model",
    "monolingual_vocab_file": "dict.txt",
    "tokenizer_file": "tokenizer.json",
}

# Download locations of each pretrained resource file on the Hugging Face Hub.
_BARTPHO_REPO = "https://huggingface.co/vinai/bartpho-syllable/resolve/main"
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "vinai/bartpho-syllable": f"{_BARTPHO_REPO}/sentencepiece.bpe.model",
    },
    "monolingual_vocab_file": {
        "vinai/bartpho-syllable": f"{_BARTPHO_REPO}/dict.txt",
    },
    "tokenizer_file": {
        "vinai/bartpho-syllable": f"{_BARTPHO_REPO}/tokenizer.json",
    },
}

# Maximum sequence length supported by the pretrained positional embeddings.
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"vinai/bartpho-syllable": 1024}
55
+
56
+
57
+ class BartphoTokenizerFast(PreTrainedTokenizerFast):
58
+ """
59
+ Construct a "fast" BARTpho tokenizer (backed by HuggingFace's *tokenizers* library). Adapted from
60
+ [`XLMRobertaTokenizerFast`]. Based on [SentencePiece](https://github.com/google/sentencepiece).
61
+
62
+ This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
63
+ refer to this superclass for more information regarding those methods.
64
+
65
+ Args:
66
+ vocab_file (`str`):
67
+ Path to the vocabulary file.
68
+ bos_token (`str`, *optional*, defaults to `"<s>"`):
69
+ The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
70
+
71
+ <Tip>
72
+
73
+ When building a sequence using special tokens, this is not the token that is used for the beginning of
74
+ sequence. The token used is the `cls_token`.
75
+
76
+ </Tip>
77
+
78
+ eos_token (`str`, *optional*, defaults to `"</s>"`):
79
+ The end of sequence token.
80
+
81
+ <Tip>
82
+
83
+ When building a sequence using special tokens, this is not the token that is used for the end of sequence.
84
+ The token used is the `sep_token`.
85
+
86
+ </Tip>
87
+
88
+ sep_token (`str`, *optional*, defaults to `"</s>"`):
89
+ The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
90
+ sequence classification or for a text and a question for question answering. It is also used as the last
91
+ token of a sequence built with special tokens.
92
+ cls_token (`str`, *optional*, defaults to `"<s>"`):
93
+ The classifier token which is used when doing sequence classification (classification of the whole sequence
94
+ instead of per-token classification). It is the first token of the sequence when built with special tokens.
95
+ unk_token (`str`, *optional*, defaults to `"<unk>"`):
96
+ The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
97
+ token instead.
98
+ pad_token (`str`, *optional*, defaults to `"<pad>"`):
99
+ The token used for padding, for example when batching sequences of different lengths.
100
+ mask_token (`str`, *optional*, defaults to `"<mask>"`):
101
+ The token used for masking values. This is the token used when training this model with masked language
102
+ modeling. This is the token which the model will try to predict.
103
+ additional_special_tokens (`List[str]`, *optional*, defaults to `["<s>NOTUSED", "</s>NOTUSED"]`):
104
+ Additional special tokens used by the tokenizer.
105
+ """
106
+
107
+ vocab_files_names = VOCAB_FILES_NAMES
108
+ pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
109
+ max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
110
+ model_input_names = ["input_ids", "attention_mask"]
111
+ slow_tokenizer_class = BartphoTokenizer
112
+
113
+ def __init__(
114
+ self,
115
+ vocab_file=None,
116
+ monolingual_vocab_file=None,
117
+ tokenizer_file=None,
118
+ bos_token="<s>",
119
+ eos_token="</s>",
120
+ sep_token="</s>",
121
+ cls_token="<s>",
122
+ unk_token="<unk>",
123
+ pad_token="<pad>",
124
+ mask_token="<mask>",
125
+ **kwargs
126
+ ):
127
+ # Mask token behave like a normal word, i.e. include the space before it
128
+ mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
129
+
130
+ super().__init__(
131
+ vocab_file,
132
+ monolingual_vocab_file,
133
+ tokenizer_file=tokenizer_file,
134
+ bos_token=bos_token,
135
+ eos_token=eos_token,
136
+ sep_token=sep_token,
137
+ cls_token=cls_token,
138
+ unk_token=unk_token,
139
+ pad_token=pad_token,
140
+ mask_token=mask_token,
141
+ **kwargs,
142
+ )
143
+
144
+ self.vocab_file = vocab_file
145
+ self.monolingual_vocab_file = monolingual_vocab_file
146
+ self.can_save_slow_tokenizer = False if not self.vocab_file else True
147
+
148
+ def get_added_vocab_hacking(self):
149
+ """
150
+ Returns the added tokens in the vocabulary as a dictionary of token to index.
151
+
152
+ Returns:
153
+ `Dict[str, int], Dict[int, int]`: The added tokens, and their original and new ids
154
+ """
155
+ base_vocab_size = self._tokenizer.get_vocab_size(with_added_tokens=False)
156
+ full_vocab_size = self._tokenizer.get_vocab_size(with_added_tokens=True)
157
+ if full_vocab_size == base_vocab_size:
158
+ return {}, {}
159
+
160
+ # Tokens in added_vocab should have ids that are equal to or larger than the size of base_vocab
161
+ added_vocab = dict(
162
+ (self._tokenizer.id_to_token(index), index + 1 - base_vocab_size + self.mask_token_id)
163
+ for index in range(base_vocab_size, full_vocab_size)
164
+ )
165
+
166
+ id_mapping = dict((index, self._tokenizer.token_to_id(tok)) for tok, index in added_vocab.items())
167
+
168
+ return added_vocab, id_mapping
169
+
170
+ def _decode(
171
+ self,
172
+ token_ids: Union[int, List[int]],
173
+ skip_special_tokens: bool = False,
174
+ clean_up_tokenization_spaces: bool = True,
175
+ **kwargs
176
+ ) -> str:
177
+ self._decode_use_source_tokenizer = kwargs.pop("use_source_tokenizer", False)
178
+
179
+ if isinstance(token_ids, int):
180
+ token_ids = [token_ids]
181
+
182
+ # Mapping ids into their original values
183
+ _, id_mapping = self.get_added_vocab_hacking()
184
+ if len(id_mapping) > 0:
185
+ token_ids = [id_mapping[id] if id in id_mapping else id for id in token_ids]
186
+
187
+ text = self._tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens)
188
+
189
+ if clean_up_tokenization_spaces:
190
+ clean_text = self.clean_up_tokenization(text)
191
+ return clean_text
192
+ else:
193
+ return text
194
+
195
def _convert_encoding(
    self,
    encoding: EncodingFast,
    return_token_type_ids: Optional[bool] = None,
    return_attention_mask: Optional[bool] = None,
    return_overflowing_tokens: bool = False,
    return_special_tokens_mask: bool = False,
    return_offsets_mapping: bool = False,
    return_length: bool = False,
    verbose: bool = True,
) -> Tuple[Dict[str, Any], List[EncodingFast]]:
    """
    Convert the low-level HuggingFace tokenizer output into a python dict plus the
    list of encodings, taking care of building a batch from overflowing tokens.

    Overflowing tokens become additional examples (like batches), so each dict
    value is a list (overflows) of lists (tokens).

    Output shape: (overflows, sequence length)
    """
    if return_token_type_ids is None:
        return_token_type_ids = "token_type_ids" in self.model_input_names
    if return_attention_mask is None:
        return_attention_mask = "attention_mask" in self.model_input_names

    # Expand overflowing pieces into extra rows when requested.
    encodings = [encoding]
    if return_overflowing_tokens and encoding.overflowing is not None:
        encodings = encodings + encoding.overflowing

    encoding_dict = defaultdict(list)
    added_vocab, _ = self.get_added_vocab_hacking()
    for enc in encodings:
        # Reassign ids of tokens due to the hacking strategy: ids above the
        # mask id are looked up in the added vocab; misses map to <unk>.
        remapped_ids = []
        for raw_id, token in zip(enc.ids, enc.tokens):
            if raw_id <= self.mask_token_id:
                remapped_ids.append(raw_id)
            else:
                remapped_ids.append(added_vocab.get(token.strip(), self.unk_token_id))

        encoding_dict["input_ids"].append(remapped_ids)

        if return_token_type_ids:
            encoding_dict["token_type_ids"].append(enc.type_ids)
        if return_attention_mask:
            encoding_dict["attention_mask"].append(enc.attention_mask)
        if return_special_tokens_mask:
            encoding_dict["special_tokens_mask"].append(enc.special_tokens_mask)
        if return_offsets_mapping:
            encoding_dict["offset_mapping"].append(enc.offsets)
        if return_length:
            # Length of the remapped sequence, not the raw encoding.
            encoding_dict["length"].append(len(remapped_ids))

    return encoding_dict, encodings
255
+
256
def build_inputs_with_special_tokens(
    self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
    """
    Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
    adding special tokens. A BARTpho sequence has the following format:

    - single sequence: `<s> X </s>`
    - pair of sequences: `<s> A </s></s> B </s>`

    Args:
        token_ids_0 (`List[int]`):
            List of IDs to which the special tokens will be added.
        token_ids_1 (`List[int]`, *optional*):
            Optional second list of IDs for sequence pairs.

    Returns:
        `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
    """
    bos = [self.cls_token_id]
    eos = [self.sep_token_id]

    if token_ids_1 is None:
        return bos + token_ids_0 + eos
    # Pair: the two segments are separated by a double </s>.
    return bos + token_ids_0 + eos + eos + token_ids_1 + eos
281
+
282
def create_token_type_ids_from_sequences(
    self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
    """
    Create a mask from the two sequences passed to be used in a sequence-pair classification task. BARTpho does not
    make use of token type ids, therefore a list of zeros is returned.

    Args:
        token_ids_0 (`List[int]`):
            List of IDs.
        token_ids_1 (`List[int]`, *optional*):
            Optional second list of IDs for sequence pairs.

    Returns:
        `List[int]`: List of zeros.
    """
    prefix = [self.cls_token_id]
    suffix = [self.sep_token_id]

    # Mirror the layout produced by build_inputs_with_special_tokens so the
    # mask has the right length, then zero it out.
    if token_ids_1 is None:
        full_sequence = prefix + token_ids_0 + suffix
    else:
        full_sequence = prefix + token_ids_0 + suffix + suffix + token_ids_1 + suffix
    return [0] * len(full_sequence)
306
+
307
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
    """
    Copy the sentencepiece vocabulary file and the monolingual vocabulary file
    into `save_directory`, optionally prefixing the file names.

    Returns the pair of output file paths, or None (after logging an error)
    when `save_directory` is not a directory.
    """
    if not self.can_save_slow_tokenizer:
        raise ValueError(
            "Your fast tokenizer does not have the necessary information to save the vocabulary for a "
            "slow tokenizer."
        )

    if not os.path.isdir(save_directory):
        logger.error(f"Vocabulary path ({save_directory}) should be a directory.")
        return

    name_prefix = filename_prefix + "-" if filename_prefix else ""
    out_vocab_file = os.path.join(save_directory, name_prefix + VOCAB_FILES_NAMES["vocab_file"])
    out_monolingual_vocab_file = os.path.join(
        save_directory, name_prefix + VOCAB_FILES_NAMES["monolingual_vocab_file"]
    )

    # Copy each vocab file unless the destination is already the source.
    for source_path, target_path in (
        (self.vocab_file, out_vocab_file),
        (self.monolingual_vocab_file, out_monolingual_vocab_file),
    ):
        if os.path.abspath(source_path) != os.path.abspath(target_path):
            copyfile(source_path, target_path)

    return (out_vocab_file, out_monolingual_vocab_file)
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<s>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<pad>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "</s>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "<unk>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "40029": {
36
+ "content": "<mask>",
37
+ "lstrip": true,
38
+ "normalized": true,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "bos_token": "<s>",
45
+ "clean_up_tokenization_spaces": true,
46
+ "cls_token": "<s>",
47
+ "eos_token": "</s>",
48
+ "mask_token": "<mask>",
49
+ "model_max_length": 1024,
50
+ "pad_token": "<pad>",
51
+ "sep_token": "</s>",
52
+ "sp_model_kwargs": {},
53
+ "tokenizer_class": "BartphoTokenizer",
54
+ "unk_token": "<unk>"
55
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9d185703bf08499ca9c2f9c3612d9f7ec11b7666a7cdb13773aac6c1b1e3d400
3
+ size 5048