Update README.md, tokenization_deberta_v2_jumanpp.py, tokenization_deberta_v2_jumanpp_fast.py
README.md
CHANGED

````diff
@@ -29,8 +29,8 @@ You can use this model for masked language modeling as follows:
 
 ```python
 from transformers import AutoTokenizer, AutoModelForMaskedLM
-tokenizer = AutoTokenizer.from_pretrained('ku-nlp/deberta-v2-base-japanese', trust_remote_code=True)
-model = AutoModelForMaskedLM.from_pretrained('ku-nlp/deberta-v2-base-japanese')
+tokenizer = AutoTokenizer.from_pretrained('ku-nlp/deberta-v2-base-japanese-with-auto-jumanpp', trust_remote_code=True)
+model = AutoModelForMaskedLM.from_pretrained('ku-nlp/deberta-v2-base-japanese-with-auto-jumanpp')
 
 sentence = '京都大学で自然言語処理を[MASK]する。'
 encoding = tokenizer(sentence, return_tensors='pt')
@@ -41,9 +41,8 @@ You can also fine-tune this model on downstream tasks.
 
 ## Tokenization
 
-
-
-UPDATE: The input text is internally segmented by [Juman++](https://github.com/ku-nlp/jumanpp) within `DebertaV2JumanppTokenizer(Fast)`, so there's no need to segment it in advance. To use `DebertaV2JumanppTokenizer(Fast)`, you need to install [Juman++ 2.0.0-rc3](https://github.com/ku-nlp/jumanpp/releases/tag/v2.0.0-rc3) and [rhoknp](https://github.com/ku-nlp/rhoknp).
+The input text is internally segmented by [Juman++](https://github.com/ku-nlp/jumanpp) within `DebertaV2JumanppTokenizer` or `DebertaV2JumanppTokenizerFast`, so there's no need to segment it in advance.
+To use `DebertaV2JumanppTokenizer` or `DebertaV2JumanppTokenizerFast`, you need to install [Juman++ 2.0.0-rc3](https://github.com/ku-nlp/jumanpp/releases/tag/v2.0.0-rc3) and [rhoknp](https://github.com/ku-nlp/rhoknp).
 
 ## Training data
 
````
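Taken together, the README change means raw Japanese text can be passed straight to the tokenizer under the new repository id. Below is a minimal sketch of running the masked-LM snippet end to end, assuming Juman++ 2.0.0-rc3 and rhoknp are installed; the top-5 decoding at the end is illustrative and not part of the README.

```python
# Sketch: masked-token prediction with the repository id from the diff.
# Assumes Juman++ 2.0.0-rc3 is on PATH and rhoknp is installed, so the raw
# sentence needs no pre-segmentation.
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM

tokenizer = AutoTokenizer.from_pretrained('ku-nlp/deberta-v2-base-japanese-with-auto-jumanpp', trust_remote_code=True)
model = AutoModelForMaskedLM.from_pretrained('ku-nlp/deberta-v2-base-japanese-with-auto-jumanpp')

sentence = '京都大学で自然言語処理を[MASK]する。'
encoding = tokenizer(sentence, return_tensors='pt')

with torch.no_grad():
    logits = model(**encoding).logits

# Illustrative decoding: read out the top-5 candidates for the [MASK] position.
mask_positions = (encoding['input_ids'][0] == tokenizer.mask_token_id).nonzero(as_tuple=True)[0]
top_ids = logits[0, mask_positions[0]].topk(5).indices.tolist()
print(tokenizer.convert_ids_to_tokens(top_ids))
```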
tokenization_deberta_v2_jumanpp.py
CHANGED

````diff
@@ -24,7 +24,7 @@ class JumanppTokenizer:
                 "You need to install rhoknp to use JumanppPreTokenizer. "
                 "See https://github.com/ku-nlp/rhoknp for installation."
             )
-        self.
+        self.jumanpp = rhoknp.Jumanpp()
 
     def tokenize(self, text: str) -> str:
-        return " ".join([morpheme.surf for morpheme in self.
+        return " ".join([morpheme.surf for morpheme in self.jumanpp.apply_to_sentence(text).morphemes])
````
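For context, the rewritten `tokenize` method amounts to segmenting the sentence with Juman++ and joining the morpheme surface forms with spaces. A standalone sketch of that behaviour, assuming Juman++ 2.0.0-rc3 is on the PATH and rhoknp is installed; the sample sentence and printed segmentation are illustrative.

```python
# Sketch of what JumanppTokenizer.tokenize now computes, using rhoknp directly.
# Assumes Juman++ 2.0.0-rc3 is installed and on PATH.
import rhoknp

jumanpp = rhoknp.Jumanpp()
sentence = jumanpp.apply_to_sentence('京都大学で自然言語処理をする。')

# Join the surface form of each morpheme with spaces, as the updated method does.
segmented = " ".join(morpheme.surf for morpheme in sentence.morphemes)
print(segmented)  # e.g. '京都 大学 で 自然 言語 処理 を する 。'
```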
tokenization_deberta_v2_jumanpp_fast.py
CHANGED

````diff
@@ -1,4 +1,5 @@
 import copy
+from typing import List
 
 from tokenizers import NormalizedString, PreTokenizedString, normalizers, pre_tokenizers
 from transformers import DebertaV2TokenizerFast
@@ -54,11 +55,11 @@ class JumanppPreTokenizer:
                 "You need to install rhoknp to use JumanppPreTokenizer. "
                 "See https://github.com/ku-nlp/rhoknp for installation."
             )
-        self.
+        self.jumanpp = rhoknp.Jumanpp()
 
     def pre_tokenize(self, pretok: PreTokenizedString):
         pretok.split(self.jumanpp_split)
 
-    def jumanpp_split(self, i: int, normalized_string: NormalizedString) ->
-        offsets = [morpheme.span for morpheme in self.
+    def jumanpp_split(self, i: int, normalized_string: NormalizedString) -> List[NormalizedString]:
+        offsets = [morpheme.span for morpheme in self.jumanpp.apply_to_sentence(str(normalized_string)).morphemes]
         return [normalized_string[offset[0]:offset[1]] for offset in offsets]
````
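For reference, the fast tokenizer's `JumanppPreTokenizer` follows the custom pre-tokenizer protocol of the tokenizers library: `pre_tokenize` delegates to `jumanpp_split`, which slices the input at the character spans of the Juman++ morphemes. A self-contained sketch mirroring the class in the diff, assuming rhoknp and Juman++ 2.0.0-rc3 are installed; the `PreTokenizer.custom` wiring at the end uses the generic tokenizers API and is not copied from this repository.

```python
# Sketch of the custom pre-tokenizer pattern used by the fast tokenizer.
# Assumes rhoknp and Juman++ 2.0.0-rc3 are installed.
from typing import List

import rhoknp
from tokenizers import NormalizedString, PreTokenizedString, pre_tokenizers


class JumanppPreTokenizer:
    """Splits text into Juman++ morphemes (mirrors the class in the diff)."""

    def __init__(self):
        self.jumanpp = rhoknp.Jumanpp()

    def pre_tokenize(self, pretok: PreTokenizedString):
        # tokenizers calls this with the whole input; delegate the splitting.
        pretok.split(self.jumanpp_split)

    def jumanpp_split(self, i: int, normalized_string: NormalizedString) -> List[NormalizedString]:
        # Slice the normalized string at each morpheme's character span.
        offsets = [morpheme.span for morpheme in self.jumanpp.apply_to_sentence(str(normalized_string)).morphemes]
        return [normalized_string[offset[0]:offset[1]] for offset in offsets]


# Wrap it in the generic adapter so a fast tokenizer's backend can use it, e.g.:
custom_pre_tokenizer = pre_tokenizers.PreTokenizer.custom(JumanppPreTokenizer())
# fast_tokenizer._tokenizer.pre_tokenizer = custom_pre_tokenizer  # hypothetical wiring
```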