pasha committed on
Commit ·
c7a5318
1
Parent(s): d8fe7bb
Digits wrapper added
Browse files- tokenizer.json +0 -0
- tokenizer.py +5 -3
tokenizer.json
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
tokenizer.py
CHANGED
|
@@ -47,11 +47,11 @@ class RuMorphemePreTokenizer:
|
|
| 47 |
word = str(normalized_string)
|
| 48 |
|
| 49 |
# If word is just spaces, return as is
|
| 50 |
-
if word.isspace():
|
| 51 |
return [normalized_string]
|
| 52 |
|
| 53 |
-
# Ignore special characters (non-alphabetical
|
| 54 |
-
if not any(c.isalpha()
|
| 55 |
return [normalized_string]
|
| 56 |
|
| 57 |
# Make predictions and return morphemes
|
|
@@ -100,6 +100,7 @@ class RuMorphemeTokenizerFast(PreTrainedTokenizerFast):
|
|
| 100 |
# Custom pre-tokenizer
|
| 101 |
self.backend_tokenizer.pre_tokenizer = pre_tokenizers.Sequence([
|
| 102 |
pre_tokenizers.Punctuation(),
|
|
|
|
| 103 |
pre_tokenizers.PreTokenizer.custom(RuMorphemePreTokenizer(self.model_name))
|
| 104 |
])
|
| 105 |
# Custom decoder
|
|
@@ -150,6 +151,7 @@ class RuMorphemeTokenizerFast(PreTrainedTokenizerFast):
|
|
| 150 |
# Custom pre-tokenizer
|
| 151 |
tokenizer.backend_tokenizer.pre_tokenizer = pre_tokenizers.Sequence([
|
| 152 |
pre_tokenizers.Punctuation(),
|
|
|
|
| 153 |
pre_tokenizers.PreTokenizer.custom(RuMorphemePreTokenizer(model_name))
|
| 154 |
])
|
| 155 |
|
|
|
|
| 47 |
word = str(normalized_string)
|
| 48 |
|
| 49 |
# If word is just spaces, return as is
|
| 50 |
+
if word.isspace() or word.isdigit():
|
| 51 |
return [normalized_string]
|
| 52 |
|
| 53 |
+
# Ignore special characters (non-alphabetical)
|
| 54 |
+
if not any(c.isalpha() for c in word):
|
| 55 |
return [normalized_string]
|
| 56 |
|
| 57 |
# Make predictions and return morphemes
|
|
|
|
| 100 |
# Custom pre-tokenizer
|
| 101 |
self.backend_tokenizer.pre_tokenizer = pre_tokenizers.Sequence([
|
| 102 |
pre_tokenizers.Punctuation(),
|
| 103 |
+
pre_tokenizers.Digits(individual_digits=True),
|
| 104 |
pre_tokenizers.PreTokenizer.custom(RuMorphemePreTokenizer(self.model_name))
|
| 105 |
])
|
| 106 |
# Custom decoder
|
|
|
|
| 151 |
# Custom pre-tokenizer
|
| 152 |
tokenizer.backend_tokenizer.pre_tokenizer = pre_tokenizers.Sequence([
|
| 153 |
pre_tokenizers.Punctuation(),
|
| 154 |
+
pre_tokenizers.Digits(individual_digits=True),
|
| 155 |
pre_tokenizers.PreTokenizer.custom(RuMorphemePreTokenizer(model_name))
|
| 156 |
])
|
| 157 |
|