File size: 6,074 Bytes
77f3f47 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 | from typing import List, Tuple
from transformers import PreTrainedTokenizerFast
import re
import fast_disambig
# Precompiled patterns consumed by normalize_arabic(); compiled once at import.
# U+0640 ARABIC TATWEEL (the elongation/kashida character).
_TATWEEL_RE = re.compile(r"\u0640")
# Alif variants (with madda, hamza above, hamza below, wasla).
_ALIF_RE = re.compile(r"[آأإٱ]")
# Alif maqsura.
_ALIF_MAK_RE = re.compile(r"ى")
# Teh marbuta.
_TEH_MARB_RE = re.compile(r"ة")
# Invisible format characters: U+200B-U+200D, U+200E, U+200F and U+FEFF.
_ZERO_WIDTH_RE = re.compile(r"[\u200B-\u200D\u200E\u200F\uFEFF]")
# Characters that separate_diacritics() treats as diacritics and moves out of
# the base word. Membership is tested one character at a time.
# NOTE(review): the per-line descriptions below were read off the literals;
# verify against the Unicode Arabic block chart before relying on them.
ARABIC_DIACRITICS = {
# Tanwin: fathatan, dammatan, kasratan.
"ً", "ٌ", "ٍ",
# Short vowels: fatha, damma, kasra.
"َ", "ُ", "ِ",
# Shadda and sukun.
"ّ", "ْ",
# Further combining marks, apparently U+0657-U+065F.
"ٗ", "٘", "ٙ", "ٚ", "ٛ", "ٜ", "ٝ", "ٞ", "ٟ",
# Combining signs, apparently U+0610-U+061A.
"ؐ", "ؑ", "ؒ", "ؓ", "ؔ", "ؕ", "ؖ", "ؗ", "ؘ", "ؙ", "ؚ",
# Quranic annotation marks, apparently within U+06D6-U+06E8.
"ۖ", "ۗ", "ۘ", "ۙ", "ۚ", "ۛ", "ۜ", "۟", "۠", "ۡ", "ۢ", "ۣ", "ۤ", "ۧ", "ۨ",
# Quranic annotation marks, apparently U+06EA-U+06ED.
"۪", "۫", "۬", "ۭ",
}
def separate_diacritics(text):
    """Split each diacritized word into a base word plus a diacritic track.

    The text is cut on whitespace runs and on the literal ``[+]`` marker; those
    separators, and any word without a character from ``ARABIC_DIACRITICS``,
    are copied through unchanged. Every other word becomes
    ``<base letters> <diacritic track>``, where the track holds one entry per
    base letter: that letter's diacritics, or ``◌`` when it carries none.

    Args:
        text: The string to transform.

    Returns:
        The transformed string, with all separators preserved in place.
    """
    pieces = []
    for chunk in re.split(r'(\s+|\[\+\])', text):
        if not chunk:
            # re.split yields empty strings around adjacent separators.
            continue
        is_separator = chunk.isspace() or chunk == '[+]'
        if is_separator or all(ch not in ARABIC_DIACRITICS for ch in chunk):
            pieces.append(chunk)
            continue

        letters = []
        marks = []  # marks[i] collects the diacritics that follow letters[i]
        for ch in chunk:
            if ch not in ARABIC_DIACRITICS:
                letters.append(ch)
                marks.append("")
            elif marks:
                marks[-1] += ch
            else:
                # A diacritic with no preceding letter gets a blank base slot.
                letters.append(" ")
                marks.append(ch)

        track = "".join(entry or "◌" for entry in marks)
        pieces.append("".join(letters) + " " + track)
    return "".join(pieces)
def normalize_arabic(text):
    """Return *text* with Arabic orthographic variants folded together.

    Tatweel and zero-width characters are deleted; alif variants, alif
    maqsura and teh marbuta are each replaced by a single canonical letter.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string.
    """
    # Applied in this order: the two deletions first, then the letter folds.
    substitutions = (
        (_TATWEEL_RE, ""),
        (_ZERO_WIDTH_RE, ""),
        (_ALIF_RE, "ا"),
        (_ALIF_MAK_RE, "ي"),
        (_TEH_MARB_RE, "ه"),
    )
    for pattern, replacement in substitutions:
        text = pattern.sub(replacement, text)
    return text
class ArabicMorphTokenizer(PreTrainedTokenizerFast):
    """Tokenizer that preprocesses Arabic text before delegating to the parent.

    ``__call__``, ``encode``, ``encode_plus`` and ``batch_encode_plus`` run
    string inputs through optional stemming, ``normalize_arabic`` and
    ``separate_diacritics`` and then call the parent implementation. Each of
    them accepts an extra ``apply_stemming`` keyword that overrides the
    instance default for that one call; it is removed from ``kwargs`` before
    the parent is called.

    While one of these entry points is running, ``self._processing`` is set.
    Any of the four entry points invoked in that window skips preprocessing
    and goes straight to the parent, so text is never preprocessed twice.
    NOTE(review): presumably this exists because the parent routes some of
    these methods through each other; confirm against the transformers
    version in use. The flag is a plain instance attribute and is not
    synchronized across threads.
    """

    slow_tokenizer_class = None

    def __init__(self, tokenizer_file=None, apply_stemming=True, **kwargs):
        """Initialize the parent tokenizer and record the stemming default.

        Args:
            tokenizer_file: Forwarded to the parent constructor.
            apply_stemming: Default for whether inputs are stemmed. When
                truthy the stemmer is created immediately; otherwise it is
                created on first use.
            **kwargs: Forwarded to the parent constructor.
        """
        super().__init__(tokenizer_file=tokenizer_file, **kwargs)
        self.apply_stemming = apply_stemming
        if self.apply_stemming:
            self.stemmer = fast_disambig.camel.Stemmer()

    def _get_stemmer(self):
        """Return the stemmer, creating it if the constructor did not.

        Lets a per-call ``apply_stemming=True`` work on an instance that was
        built with ``apply_stemming=False`` instead of raising AttributeError.
        """
        stemmer = getattr(self, "stemmer", None)
        if stemmer is None:
            stemmer = fast_disambig.camel.Stemmer()
            self.stemmer = stemmer
        return stemmer

    def _preprocess_one(self, s, do_stem):
        """Preprocess one string, or recursively every element of a list/tuple.

        Args:
            s: A string, or a (possibly nested) list/tuple of strings.
            do_stem: Whether to stem before normalizing.

        Returns:
            The processed string, or a list of processed elements.
        """
        if isinstance(s, (list, tuple)):
            return [self._preprocess_one(x, do_stem) for x in s]
        if do_stem:
            s = self._get_stemmer().stem(s, preserve_diacritics=True)
        s = normalize_arabic(s)
        s = separate_diacritics(s)
        return s

    def _preprocess_pair(self, text, text_pair, do_stem):
        """Preprocess ``text`` and ``text_pair`` independently.

        Each argument may be a string or a list/tuple. Only ``str`` values
        (top-level, or direct elements of a list/tuple) are transformed;
        anything else, including ``None``, is passed through unchanged.

        Returns:
            A ``(text, text_pair)`` tuple.
        """
        def maybe(s):
            return self._preprocess_one(s, do_stem) if isinstance(s, str) else s

        def convert(value):
            if isinstance(value, (list, tuple)):
                return [maybe(x) for x in value]
            return maybe(value)

        return convert(text), convert(text_pair)

    def _preprocess_batch(self, data, do_stem):
        """Preprocess the input of ``batch_encode_plus``.

        A list/tuple item of length 2 is treated as a ``(text, text_pair)``
        pair; every other item goes through ``_preprocess_one``. Input that
        is not a list/tuple is returned unchanged.
        """
        if not isinstance(data, (list, tuple)):
            return data
        return [
            self._preprocess_pair(item[0], item[1], do_stem)
            if isinstance(item, (list, tuple)) and len(item) == 2
            else self._preprocess_one(item, do_stem)
            for item in data
        ]

    def _pop_flag(self, kwargs):
        """Remove ``apply_stemming`` from *kwargs* and resolve its value.

        Returns the instance default when the key is absent or ``None``.
        Mutates *kwargs* so the key is not forwarded to the parent.
        """
        v = kwargs.pop("apply_stemming", None)
        return self.apply_stemming if v is None else bool(v)

    def __call__(self, text=None, text_pair=None, *args, **kwargs):
        """Preprocess ``text``/``text_pair`` and call the parent ``__call__``."""
        do_stem = self._pop_flag(kwargs)
        # text/text_pair are forwarded positionally: combining ``text=...``
        # with a non-empty ``*args`` would bind the first parameter twice and
        # raise TypeError.
        if getattr(self, "_processing", False):
            return super().__call__(text, text_pair, *args, **kwargs)
        self._processing = True
        try:
            text, text_pair = self._preprocess_pair(text, text_pair, do_stem)
            return super().__call__(text, text_pair, *args, **kwargs)
        finally:
            self._processing = False

    def encode(self, text, text_pair=None, *args, **kwargs):
        """Preprocess ``text``/``text_pair`` and call the parent ``encode``."""
        do_stem = self._pop_flag(kwargs)
        if getattr(self, "_processing", False):
            return super().encode(text, text_pair, *args, **kwargs)
        self._processing = True
        try:
            text, text_pair = self._preprocess_pair(text, text_pair, do_stem)
            return super().encode(text, text_pair, *args, **kwargs)
        finally:
            self._processing = False

    def encode_plus(self, text=None, text_pair=None, *args, **kwargs):
        """Preprocess ``text``/``text_pair`` and call the parent ``encode_plus``."""
        do_stem = self._pop_flag(kwargs)
        # Forwarded positionally for the same reason as in __call__.
        if getattr(self, "_processing", False):
            return super().encode_plus(text, text_pair, *args, **kwargs)
        self._processing = True
        try:
            text, text_pair = self._preprocess_pair(text, text_pair, do_stem)
            return super().encode_plus(text, text_pair, *args, **kwargs)
        finally:
            self._processing = False

    def batch_encode_plus(self, batch_text_or_text_pairs=None, *args, **kwargs):
        """Preprocess every batch item and call the parent ``batch_encode_plus``."""
        do_stem = self._pop_flag(kwargs)
        # Forwarded positionally for the same reason as in __call__.
        if getattr(self, "_processing", False):
            return super().batch_encode_plus(batch_text_or_text_pairs, *args, **kwargs)
        self._processing = True
        try:
            batch = self._preprocess_batch(batch_text_or_text_pairs, do_stem)
            return super().batch_encode_plus(batch, *args, **kwargs)
        finally:
            self._processing = False

    def preprocess(self, text, apply_stemming=None):
        """Run the preprocessing pipeline without tokenizing.

        Args:
            text: A string, or a (possibly nested) list/tuple of strings.
            apply_stemming: ``True``/``False`` to force stemming on or off;
                ``None`` (the default) uses the instance's ``apply_stemming``.

        Returns:
            The processed string, or a list of processed elements.
        """
        flag = self.apply_stemming if apply_stemming is None else bool(apply_stemming)
        return self._preprocess_one(text, flag)
|