Update README.md
Browse files
README.md
CHANGED
|
@@ -35,7 +35,24 @@ import logging
|
|
| 35 |
from typing import List, Tuple
|
| 36 |
from transformers import AutoConfig
|
| 37 |
from transformers.models.mt5.modeling_mt5 import MT5ForConditionalGeneration
|
| 38 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
|
| 40 |
def load_model(model_path):
|
| 41 |
config = AutoConfig.from_pretrained(model_path)
|
|
|
|
| 35 |
from typing import List, Tuple
|
| 36 |
from transformers import AutoConfig
|
| 37 |
from transformers.models.mt5.modeling_mt5 import MT5ForConditionalGeneration
|
| 38 |
+
|
| 39 |
+
import jieba
|
| 40 |
+
from functools import partial
|
| 41 |
+
from transformers import BertTokenizer
|
| 42 |
+
|
| 43 |
+
class T5PegasusTokenizer(BertTokenizer):
    """BertTokenizer variant used by T5-PEGASUS for Chinese text.

    Input is first segmented into words with jieba; a word that exists in
    the tokenizer's vocabulary is kept as a single token, and any word not
    in the vocabulary falls back to BERT's standard WordPiece tokenization.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # HMM=False: pure dictionary-based segmentation — deterministic and
        # faster; new-word discovery via HMM is not wanted here.
        self.pre_tokenizer = partial(jieba.cut, HMM=False)

    def _tokenize(self, text, *args, **kwargs):
        """Tokenize ``text`` word-by-word via jieba, with WordPiece fallback.

        Extra positional/keyword arguments are accepted only for interface
        compatibility with ``BertTokenizer._tokenize`` and are ignored.

        Returns:
            list[str]: the resulting token strings.
        """
        split_tokens = []
        # Use a distinct loop name — the original shadowed the `text` parameter.
        for word in self.pre_tokenizer(text):
            if word in self.vocab:
                # Whole word is in the vocab: emit it as one token.
                split_tokens.append(word)
            else:
                # Unknown word: defer to BERT's WordPiece sub-tokenization.
                split_tokens.extend(super()._tokenize(word))
        return split_tokens
|
| 56 |
|
| 57 |
def load_model(model_path):
|
| 58 |
config = AutoConfig.from_pretrained(model_path)
|