Release v1.1: PMI Phrase Merging & Smart Morphology
Files changed:
- tokenization_df_arc.py +28 -12
- tokenizer_config.json +0 -1

tokenization_df_arc.py
@@ -68,9 +68,24 @@ class MorphologicalPreTokenizer:
     PREFIXES = ['و', 'ف', 'ب', 'ك', 'ل', 'ال', 'س', 'وال', 'بال', 'كال', 'لل', 'فال']
     SUFFIXES = ['ني', 'نا', 'ك', 'كم', 'ه', 'ها', 'هم', 'هن', 'ي', 'ون', 'ين', 'ان', 'ت', 'وا', 'ة']

-
+    # Common entities/words to protect from segmentation (embedded fallback)
+    DEFAULT_EXCEPTIONS = {
+        "الله", "محمد", "عبدالله", "عبدالرحمن", "مكة", "بغداد", "دمشق", "القاهرة", "بيروت", "عمان",
+        "الرياض", "جدة", "الكويت", "دبي", "أبوظبي", "المنامة", "الدوحة", "مسقط", "ليبيا", "تونس",
+        "الجزائر", "المغرب", "فلسطين", "الأردن", "لبنان", "سوريا", "العراق", "مصر", "السودان", "اليمن",
+        "أمريكا", "أوروبا", "آسيا", "أفريقيا", "ترامب", "بايدن", "جوجل", "فيسبوك", "أمازون", "مايكروسوفت",
+        "أبل", "سامسونج", "سوني", "هواوي", "مرسيدس", "بي إم دبليو", "تويوتا", "هوندا", "فورد", "شيفروليه",
+        "تسلا", "ناسا", "إيلون ماسك", "مارك زوكربيرج", "بيل جيتس", "ستيف جوبز", "ألبرت أينشتاين",
+        "إسحاق نيوتن", "داروين", "بيتهوفن", "موتزارت", "شكسبير", "دوستويفسكي", "تولستوي", "نجيب محفوظ",
+        "طه حسين", "العقاد", "المنفلوطي", "جبران خليل جبران", "محمود درويش", "نزار قباني"
+    }
+
+    def __init__(self, min_stem_length: int = 2, exceptions: Optional[List[str]] = None):
         self.min_stem_length = min_stem_length
-
+        # Merge user exceptions with defaults using frozenset for immutability and O(1) lookups
+        user_exceptions = set(exceptions) if exceptions else set()
+        self.exceptions = frozenset(self.DEFAULT_EXCEPTIONS.union(user_exceptions))
+
         self.prefixes = sorted(self.PREFIXES, key=len, reverse=True)
         self.suffixes = sorted(self.SUFFIXES, key=len, reverse=True)
         self.arabic_pattern = re.compile(r'[\u0600-\u06FF]+')
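How the new defaults interact with caller-supplied exceptions, as a minimal sketch; it assumes the module is importable as tokenization_df_arc, and the extra entry is a hypothetical user exception, not one of the shipped defaults.

# Hypothetical user entry merged on top of the embedded DEFAULT_EXCEPTIONS.
from tokenization_df_arc import MorphologicalPreTokenizer

morph = MorphologicalPreTokenizer(min_stem_length=2, exceptions=["وادي النيل"])

print("الله" in morph.exceptions)        # True: embedded default, present even without an exceptions file
print("وادي النيل" in morph.exceptions)  # True: user entries are unioned with the defaults
print(type(morph.exceptions))            # <class 'frozenset'>: immutable after construction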
@@ -108,10 +123,9 @@ class MorphologicalPreTokenizer:

     def segment_text(self, text: str) -> str:
         words = text.split()
-        segmented_words = []
-        for word in words:
-            segments = self.segment_word(word)
-            segmented_words.append('_'.join(segments))
+        segmented_words = [
+            '_'.join(self.segment_word(word)) for word in words
+        ]
         return ' '.join(segmented_words)

 class PhraseMerger:
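The refactor above changes only the shape of segment_text, not its output. A small equivalence sketch with a stand-in segmenter (segment_word itself is outside this hunk, so the fake below just splits long words in two):

# Stand-in for segment_word; any word -> list-of-segments function works here.
def fake_segment_word(word):
    return [word[:2], word[2:]] if len(word) > 3 else [word]

def loop_segment_text(text):
    # shape of the pre-v1.1 loop
    segmented_words = []
    for word in text.split():
        segments = fake_segment_word(word)
        segmented_words.append('_'.join(segments))
    return ' '.join(segmented_words)

def comprehension_segment_text(text):
    # shape of the v1.1 list comprehension
    return ' '.join('_'.join(fake_segment_word(word)) for word in text.split())

sample = "tokenizers split words into pieces"
assert loop_segment_text(sample) == comprehension_segment_text(sample)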
@@ -177,18 +191,20 @@ class DFArcTokenizer(PreTrainedTokenizerFast):
     ):
         self.normalizer_helper = ArabicNormalizer(**(normalization_config or {}))

-        # Load exceptions if
-
-        if exceptions_file:
+        # Load user-provided exceptions if file exists
+        user_exceptions = []
+        if exceptions_file and os.path.exists(exceptions_file):
             try:
                 with open(exceptions_file, 'r', encoding='utf-8') as f:
-
-            except
+                    user_exceptions = [line.strip() for line in f if line.strip()]
+            except OSError:
+                # If file read fails, we just won't have custom exceptions
+                # The MorphologicalPreTokenizer has embedded defaults now.
                 pass

         self.morph_helper = MorphologicalPreTokenizer(
             min_stem_length=min_stem_length,
-            exceptions=
+            exceptions=user_exceptions
         )
         self.phrase_helper = PhraseMerger(phrases_file=phrases_file)
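With the bundled exceptions.txt reference gone from tokenizer_config.json (see the change below), exceptions_file is now an optional, caller-supplied path. A minimal sketch of the expected file format, with made-up entries:

# One protected surface form per line, UTF-8; blank lines are skipped by the
# reader above. Both entries are illustrative, not shipped defaults.
with open("exceptions.txt", "w", encoding="utf-8") as f:
    f.write("وادي النيل\n")
    f.write("دار الأوبرا\n")

# When this path is passed as exceptions_file, the entries land in user_exceptions
# and are merged into MorphologicalPreTokenizer.DEFAULT_EXCEPTIONS; if the path is
# missing or unreadable, only the embedded defaults apply.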
tokenizer_config.json
@@ -7,7 +7,6 @@
     ]
   },
   "phrases_file": "phrase_vocab.json",
-  "exceptions_file": "exceptions.txt",
   "vocab_file": "tokenizer.json",
   "min_stem_length": 2,
   "normalization_config": {
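Because the exceptions_file key is no longer in tokenizer_config.json, no exceptions.txt has to sit next to the tokenizer files for loading to succeed. A hedged usage sketch (the repo id is a placeholder, not a published model):

from transformers import AutoTokenizer

# trust_remote_code is required because DFArcTokenizer is defined in
# tokenization_df_arc.py rather than inside the transformers library.
tok = AutoTokenizer.from_pretrained("your-org/df-arc-tokenizer", trust_remote_code=True)

# A custom exception list can still be supplied explicitly; extra kwargs are
# forwarded to DFArcTokenizer.__init__, so this reaches exceptions_file.
tok_custom = AutoTokenizer.from_pretrained(
    "your-org/df-arc-tokenizer",
    trust_remote_code=True,
    exceptions_file="exceptions.txt",
)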