dhigurashi committed on
Commit ·
b0f6037
1
Parent(s): 14a911c
support transformers==4.34.0
Browse files - tokenization_plamo.py +6 -14
tokenization_plamo.py
CHANGED
|
@@ -5,7 +5,6 @@ from shutil import copyfile
|
|
| 5 |
from typing import Any, Dict, List, Optional, Tuple
|
| 6 |
|
| 7 |
import sentencepiece as spm
|
| 8 |
-
import transformers
|
| 9 |
from transformers.tokenization_utils import PreTrainedTokenizer
|
| 10 |
from transformers.utils import logging
|
| 11 |
|
|
@@ -35,6 +34,12 @@ class PlamoTokenizer(PreTrainedTokenizer): # type: ignore
|
|
| 35 |
kwargs["add_bos_token"] = False
|
| 36 |
if "add_eos_token" not in kwargs:
|
| 37 |
kwargs["add_eos_token"] = False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
|
| 39 |
super().__init__(
|
| 40 |
vocab_file=vocab_file,
|
|
@@ -50,15 +55,6 @@ class PlamoTokenizer(PreTrainedTokenizer): # type: ignore
|
|
| 50 |
**kwargs,
|
| 51 |
)
|
| 52 |
|
| 53 |
-
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
|
| 54 |
-
self.vocab_file = vocab_file
|
| 55 |
-
self.add_bos_token = kwargs["add_bos_token"]
|
| 56 |
-
self.add_eos_token = kwargs["add_eos_token"]
|
| 57 |
-
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
|
| 58 |
-
self.sp_model.Load(vocab_file)
|
| 59 |
-
|
| 60 |
-
self.add_tokens(self.all_special_tokens_extended, special_tokens=True)
|
| 61 |
-
|
| 62 |
# the functions below are copied from hf transformers LlamaTokenizer's implementation to fix the behaviour of the tokenizer
|
| 63 |
# https://github.com/huggingface/transformers/blob/v4.30.2/src/transformers/models/llama/tokenization_llama.py
|
| 64 |
|
|
@@ -155,7 +151,3 @@ class PlamoTokenizer(PreTrainedTokenizer): # type: ignore
|
|
| 155 |
fi.write(content_spiece_model)
|
| 156 |
|
| 157 |
return (out_vocab_file,)
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
class PlamoConfig(transformers.LlamaConfig): # type: ignore
|
| 161 |
-
model_type = "plamo"
|
|
|
|
| 5 |
from typing import Any, Dict, List, Optional, Tuple
|
| 6 |
|
| 7 |
import sentencepiece as spm
|
|
|
|
| 8 |
from transformers.tokenization_utils import PreTrainedTokenizer
|
| 9 |
from transformers.utils import logging
|
| 10 |
|
|
|
|
| 34 |
kwargs["add_bos_token"] = False
|
| 35 |
if "add_eos_token" not in kwargs:
|
| 36 |
kwargs["add_eos_token"] = False
|
| 37 |
+
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
|
| 38 |
+
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
|
| 39 |
+
self.sp_model.Load(vocab_file)
|
| 40 |
+
self.vocab_file = vocab_file
|
| 41 |
+
self.add_bos_token = kwargs["add_bos_token"]
|
| 42 |
+
self.add_eos_token = kwargs["add_eos_token"]
|
| 43 |
|
| 44 |
super().__init__(
|
| 45 |
vocab_file=vocab_file,
|
|
|
|
| 55 |
**kwargs,
|
| 56 |
)
|
| 57 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
# the functions below are copied from hf transformers LlamaTokenizer's implementation to fix the behaviour of the tokenizer
|
| 59 |
# https://github.com/huggingface/transformers/blob/v4.30.2/src/transformers/models/llama/tokenization_llama.py
|
| 60 |
|
|
|
|
| 151 |
fi.write(content_spiece_model)
|
| 152 |
|
| 153 |
return (out_vocab_file,)
|
|
|
|
|
|
|
|
|
|
|
|