Release v1.1: PMI Phrase Merging & Smart Morphology
- exceptions.txt +71 -0
- tokenization_df_arc.py +16 -23
- tokenizer_config.json +6 -7
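The "PMI Phrase Merging" in the release title refers to pointwise mutual information, a co-occurrence score commonly used to decide which adjacent words should merge into a single token. A minimal sketch of how a phrase table such as phrase_vocab.json (referenced in the config below) is typically mined; the function name and thresholds are illustrative, not the repo's actual pipeline:

import math
from collections import Counter

def mine_pmi_phrases(corpus, min_count=5, threshold=3.0):
    """Score adjacent word pairs with pointwise mutual information,
    PMI(x, y) = log( p(x, y) / (p(x) * p(y)) ),
    and keep pairs above `threshold` as merge candidates."""
    unigrams, bigrams = Counter(), Counter()
    for sentence in corpus:
        words = sentence.split()
        unigrams.update(words)
        bigrams.update(zip(words, words[1:]))
    n_uni = sum(unigrams.values())
    n_bi = sum(bigrams.values()) or 1
    phrases = {}
    for (x, y), count in bigrams.items():
        if count < min_count:
            continue
        pmi = math.log((count / n_bi) /
                       ((unigrams[x] / n_uni) * (unigrams[y] / n_uni)))
        if pmi >= threshold:
            phrases[f"{x} {y}"] = pmi
    return phrases

# High-PMI pairs such as "بيل جيتس" (Bill Gates) co-occur far more often
# than chance predicts, so they land in the phrase vocabulary and are
# merged into one token at encode time.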
exceptions.txt
ADDED

@@ -0,0 +1,71 @@
+الله
+محمد
+عبدالله
+عبدالرحمن
+مكة
+بغداد
+دمشق
+القاهرة
+بيروت
+عمان
+الرياض
+جدة
+الكويت
+دبي
+أبوظبي
+المنامة
+الدوحة
+مسقط
+ليبيا
+تونس
+الجزائر
+المغرب
+فلسطين
+الأردن
+لبنان
+سوريا
+العراق
+مصر
+السودان
+اليمن
+أمريكا
+أوروبا
+آسيا
+أفريقيا
+ترامب
+بايدن
+جوجل
+فيسبوك
+أمازون
+مايكروسوفت
+أبل
+سامسونج
+سوني
+هواوي
+مرسيدس
+بي إم دبليو
+تويوتا
+هوندا
+فورد
+شيفروليه
+تسلا
+ناسا
+إيلون ماسك
+مارك زوكربيرج
+بيل جيتس
+ستيف جوبز
+ألبرت أينشتاين
+إسحاق نيوتن
+داروين
+بيتهوفن
+موتزارت
+شكسبير
+دوستويفسكي
+تولستوي
+نجيب محفوظ
+طه حسين
+العقاد
+المنفلوطي
+جبران خليل جبران
+محمود درويش
+نزار قباني
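The entries mix single words (الرياض "Riyadh", تسلا "Tesla") with multi-word names (نجيب محفوظ "Naguib Mahfouz"), which suggests the file feeds both the morphology exceptions and the phrase merger. A minimal sketch of how a protected-words file like this is typically consumed; load_exceptions and the toy segment below are hypothetical stand-ins, not the repo's actual MorphologicalPreTokenizer:

def load_exceptions(path: str) -> set:
    """One protected surface form per line; skip blank lines."""
    with open(path, encoding="utf-8") as f:
        return {line.strip() for line in f if line.strip()}

def segment(word: str, exceptions: set) -> str:
    """Toy stand-in: protected words pass through unsplit."""
    if word in exceptions:
        return word                      # proper noun: keep whole
    if word.startswith("ال") and len(word) > 4:
        return "ال_" + word[2:]          # toy definite-article split
    return word

# >>> ex = load_exceptions("exceptions.txt")
# >>> segment("الرياض", ex)   # 'الرياض'   (protected city name)
# >>> segment("الكتاب", ex)   # 'ال_كتاب'  (ordinary noun gets split)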
tokenization_df_arc.py
CHANGED

@@ -163,14 +163,13 @@ class DFArcTokenizer(PreTrainedTokenizerFast):
 
     def __init__(
         self,
-        vocab_file=None,
-        tokenizer_file=None,
-        phrases_file=None,
-        normalization_config=None,
-        min_stem_length=2,
+        vocab_file: Optional[str] = None,
+        tokenizer_file: Optional[str] = None,
+        phrases_file: Optional[str] = None,
+        normalization_config: Optional[Dict[str, bool]] = None,
+        min_stem_length: int = 2,
         **kwargs
     ):
-        # Initialize helpers
         self.normalizer_helper = ArabicNormalizer(**(normalization_config or {}))
         self.morph_helper = MorphologicalPreTokenizer(min_stem_length=min_stem_length)
         self.phrase_helper = PhraseMerger(phrases_file=phrases_file)

@@ -181,10 +180,10 @@ class DFArcTokenizer(PreTrainedTokenizerFast):
             **kwargs
         )
 
-    def _batch_encode_plus(self, batch_text_or_text_pairs, *args, **kwargs):
-
-
-
+    def _batch_encode_plus(self, batch_text_or_text_pairs: Union[str, List[str], List[Tuple[str, str]]], *args, **kwargs):
+        def preprocess(text: str) -> str:
+            if not text:
+                return ""
             t = self.normalizer_helper.normalize(text)
             t = self.morph_helper.segment_text(t)
             t = self.phrase_helper.merge_phrases(t)

@@ -193,12 +192,11 @@ class DFArcTokenizer(PreTrainedTokenizerFast):
         if isinstance(batch_text_or_text_pairs, str):
            batch_text_or_text_pairs = preprocess(batch_text_or_text_pairs)
         elif isinstance(batch_text_or_text_pairs, (list, tuple)):
-            # Handle text pairs? For now assume list of strings
             processed = []
             for item in batch_text_or_text_pairs:
                 if isinstance(item, str):
                     processed.append(preprocess(item))
-                elif isinstance(item, (list, tuple)):
+                elif isinstance(item, (list, tuple)):
                     processed.append((preprocess(item[0]), preprocess(item[1])))
                 else:
                     processed.append(item)

@@ -228,19 +226,14 @@ class DFArcTokenizer(PreTrainedTokenizerFast):
         return self.convert_tokens_to_string(tokens)
 
     def convert_tokens_to_string(self, tokens: List[str]) -> str:
-
-        """
-        Converts a sequence of tokens (string) in a single string.
-        """
-        # Join with simple space (since we used Whitespace pre-tokenizer)
+        """Converts a sequence of tokens into a single string."""
         text = " ".join(tokens)
 
-        # Remove morphological
-        # We only remove
-        #
-
-
-        return text
+        # Remove internal morphological underscores (e.g., 'w_s_y' -> 'wsy')
+        # We use a regex to ensure we only remove underscores that are
+        # acting as connectors between Arabic segments, preserving snake_case.
+        arabic_range = r'[\u0600-\u06FF]'
+        return re.sub(rf'(?<={arabic_range})_|_(?={arabic_range})', '', text)
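The lookaround pattern is the subtle part of the convert_tokens_to_string hunk: a plain text.replace('_', '') would also mangle Latin identifiers like snake_case. (The 'w_s_y' in the diff's comment reads as transliteration shorthand; with literal Latin letters the underscores would in fact be preserved, as the second assertion shows.) A standalone check of the regex exactly as it appears above; the assertions are illustrative, not from the repo:

import re

arabic_range = r'[\u0600-\u06FF]'
pattern = rf'(?<={arabic_range})_|_(?={arabic_range})'

# Underscore acting as a connector between Arabic segments: removed.
assert re.sub(pattern, '', "ال_كتاب") == "الكتاب"
# Underscore between Latin characters: preserved.
assert re.sub(pattern, '', "snake_case") == "snake_case"

Note the alternation: an underscore is stripped if it touches an Arabic character on either side, so connectors are removed even when a segment boundary sits next to punctuation or Latin text.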
tokenizer_config.json
CHANGED

@@ -1,21 +1,20 @@
 {
+  "tokenizer_class": "DFArcTokenizer",
   "auto_map": {
     "AutoTokenizer": [
       "tokenization_df_arc.DFArcTokenizer",
       null
     ]
   },
-  "
-  "
-  "
+  "phrases_file": "phrase_vocab.json",
+  "vocab_file": "tokenizer.json",
+  "min_stem_length": 2,
+  "normalization_config": {
     "unify_alef": true,
     "unify_yeh": true,
     "unify_teh_marbuta": true,
     "remove_diacritics": true,
     "remove_tatweel": true,
     "remove_repeats": true
-  }
-  "min_stem_length": 2,
-  "vocab_size": 256000,
-  "model_max_length": 4096
+  }
 }
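With tokenizer_class declared and the extra __init__ keys (phrases_file, normalization_config, min_stem_length) now carried in the config, the custom class can be loaded directly from the hub. A usage sketch; "org/df-arc" is a placeholder repo id, not the actual hub path:

from transformers import AutoTokenizer

# trust_remote_code is required because tokenization_df_arc.py is
# resolved through the "auto_map" entry, not bundled with transformers.
tok = AutoTokenizer.from_pretrained("org/df-arc", trust_remote_code=True)

ids = tok("ذهب محمد إلى الرياض")["input_ids"]  # "محمد" and "الرياض" are protected forms
print(tok.decode(ids))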