Arabic
arabic
tokenizer
morphology
nlp
dialect
fr3on committed on
Commit
dfdf13b
·
verified ·
1 Parent(s): 60eb243

Release v1.1: PMI Phrase Merging & Smart Morphology

Browse files
Files changed (3) hide show
  1. exceptions.txt +71 -0
  2. tokenization_df_arc.py +16 -23
  3. tokenizer_config.json +6 -7
exceptions.txt ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ الله
2
+ محمد
3
+ عبدالله
4
+ عبدالرحمن
5
+ مكة
6
+ بغداد
7
+ دمشق
8
+ القاهرة
9
+ بيروت
10
+ عمان
11
+ الرياض
12
+ جدة
13
+ الكويت
14
+ دبي
15
+ أبوظبي
16
+ المنامة
17
+ الدوحة
18
+ مسقط
19
+ ليبيا
20
+ تونس
21
+ الجزائر
22
+ المغرب
23
+ فلسطين
24
+ الأردن
25
+ لبنان
26
+ سوريا
27
+ العراق
28
+ مصر
29
+ السودان
30
+ اليمن
31
+ أمريكا
32
+ أوروبا
33
+ آسيا
34
+ أفريقيا
35
+ ترامب
36
+ بايدن
37
+ جوجل
38
+ فيسبوك
39
+ أمازون
40
+ مايكروسوفت
41
+ أبل
42
+ سامسونج
43
+ سوني
44
+ هواوي
45
+ مرسيدس
46
+ بي إم دبليو
47
+ تويوتا
48
+ هوندا
49
+ فورد
50
+ شيفروليه
51
+ تسلا
52
+ ناسا
53
+ إيلون ماسك
54
+ مارك زوكربيرج
55
+ بيل جيتس
56
+ ستيف جوبز
57
+ ألبرت أينشتاين
58
+ إسحاق نيوتن
59
+ داروين
60
+ بيتهوفن
61
+ موتزارت
62
+ شكسبير
63
+ دوستويفسكي
64
+ تولستوي
65
+ نجيب محفوظ
66
+ طه حسين
67
+ العقاد
68
+ المنفلوطي
69
+ جبران خليل جبران
70
+ محمود درويش
71
+ نزار قباني
tokenization_df_arc.py CHANGED
@@ -163,14 +163,13 @@ class DFArcTokenizer(PreTrainedTokenizerFast):
163
 
164
  def __init__(
165
  self,
166
- vocab_file=None,
167
- tokenizer_file=None,
168
- phrases_file=None,
169
- normalization_config=None,
170
- min_stem_length=2,
171
  **kwargs
172
  ):
173
- # Initialize helpers
174
  self.normalizer_helper = ArabicNormalizer(**(normalization_config or {}))
175
  self.morph_helper = MorphologicalPreTokenizer(min_stem_length=min_stem_length)
176
  self.phrase_helper = PhraseMerger(phrases_file=phrases_file)
@@ -181,10 +180,10 @@ class DFArcTokenizer(PreTrainedTokenizerFast):
181
  **kwargs
182
  )
183
 
184
- def _batch_encode_plus(self, batch_text_or_text_pairs, *args, **kwargs):
185
- # Pre-process batch
186
- def preprocess(text):
187
- if not text: return ""
188
  t = self.normalizer_helper.normalize(text)
189
  t = self.morph_helper.segment_text(t)
190
  t = self.phrase_helper.merge_phrases(t)
@@ -193,12 +192,11 @@ class DFArcTokenizer(PreTrainedTokenizerFast):
193
  if isinstance(batch_text_or_text_pairs, str):
194
  batch_text_or_text_pairs = preprocess(batch_text_or_text_pairs)
195
  elif isinstance(batch_text_or_text_pairs, (list, tuple)):
196
- # Handle text pairs? For now assume list of strings
197
  processed = []
198
  for item in batch_text_or_text_pairs:
199
  if isinstance(item, str):
200
  processed.append(preprocess(item))
201
- elif isinstance(item, (list, tuple)): # Pairs
202
  processed.append((preprocess(item[0]), preprocess(item[1])))
203
  else:
204
  processed.append(item)
@@ -228,19 +226,14 @@ class DFArcTokenizer(PreTrainedTokenizerFast):
228
  return self.convert_tokens_to_string(tokens)
229
 
230
  def convert_tokens_to_string(self, tokens: List[str]) -> str:
231
-
232
- """
233
- Converts a sequence of tokens (string) in a single string.
234
- """
235
- # Join with simple space (since we used Whitespace pre-tokenizer)
236
  text = " ".join(tokens)
237
 
238
- # Remove morphological markers (underscores)
239
- # We only remove internal underscores.
240
- # Note: This is an approximation.
241
- text = text.replace("_", "")
242
-
243
- return text
244
 
245
 
246
 
 
163
 
164
  def __init__(
165
  self,
166
+ vocab_file: Optional[str] = None,
167
+ tokenizer_file: Optional[str] = None,
168
+ phrases_file: Optional[str] = None,
169
+ normalization_config: Optional[Dict[str, bool]] = None,
170
+ min_stem_length: int = 2,
171
  **kwargs
172
  ):
 
173
  self.normalizer_helper = ArabicNormalizer(**(normalization_config or {}))
174
  self.morph_helper = MorphologicalPreTokenizer(min_stem_length=min_stem_length)
175
  self.phrase_helper = PhraseMerger(phrases_file=phrases_file)
 
180
  **kwargs
181
  )
182
 
183
+ def _batch_encode_plus(self, batch_text_or_text_pairs: Union[str, List[str], List[Tuple[str, str]]], *args, **kwargs):
184
+ def preprocess(text: str) -> str:
185
+ if not text:
186
+ return ""
187
  t = self.normalizer_helper.normalize(text)
188
  t = self.morph_helper.segment_text(t)
189
  t = self.phrase_helper.merge_phrases(t)
 
192
  if isinstance(batch_text_or_text_pairs, str):
193
  batch_text_or_text_pairs = preprocess(batch_text_or_text_pairs)
194
  elif isinstance(batch_text_or_text_pairs, (list, tuple)):
 
195
  processed = []
196
  for item in batch_text_or_text_pairs:
197
  if isinstance(item, str):
198
  processed.append(preprocess(item))
199
+ elif isinstance(item, (list, tuple)):
200
  processed.append((preprocess(item[0]), preprocess(item[1])))
201
  else:
202
  processed.append(item)
 
226
  return self.convert_tokens_to_string(tokens)
227
 
228
  def convert_tokens_to_string(self, tokens: List[str]) -> str:
229
+ """Converts a sequence of tokens into a single string."""
 
 
 
 
230
  text = " ".join(tokens)
231
 
232
+ # Remove internal morphological underscores (e.g., 'w_s_y' -> 'wsy')
233
+ # We use a regex to ensure we only remove underscores that are
234
+ # acting as connectors between Arabic segments, preserving snake_case.
235
+ arabic_range = r'[\u0600-\u06FF]'
236
+ return re.sub(rf'(?<={arabic_range})_|_(?={arabic_range})', '', text)
 
237
 
238
 
239
 
tokenizer_config.json CHANGED
@@ -1,21 +1,20 @@
1
  {
 
2
  "auto_map": {
3
  "AutoTokenizer": [
4
  "tokenization_df_arc.DFArcTokenizer",
5
  null
6
  ]
7
  },
8
- "tokenizer_class": "DFArcTokenizer",
9
- "phrases_file": "phrases.json",
10
- "normalization": {
 
11
  "unify_alef": true,
12
  "unify_yeh": true,
13
  "unify_teh_marbuta": true,
14
  "remove_diacritics": true,
15
  "remove_tatweel": true,
16
  "remove_repeats": true
17
- },
18
- "min_stem_length": 2,
19
- "vocab_size": 256000,
20
- "model_max_length": 4096
21
  }
 
1
  {
2
+ "tokenizer_class": "DFArcTokenizer",
3
  "auto_map": {
4
  "AutoTokenizer": [
5
  "tokenization_df_arc.DFArcTokenizer",
6
  null
7
  ]
8
  },
9
+ "phrases_file": "phrase_vocab.json",
10
+ "vocab_file": "tokenizer.json",
11
+ "min_stem_length": 2,
12
+ "normalization_config": {
13
  "unify_alef": true,
14
  "unify_yeh": true,
15
  "unify_teh_marbuta": true,
16
  "remove_diacritics": true,
17
  "remove_tatweel": true,
18
  "remove_repeats": true
19
+ }
 
 
 
20
  }