Release v1.1: PMI Phrase Merging & Smart Morphology

Changed files:
- tokenization_df_arc.py +20 -2
- tokenizer_config.json +1 -0
tokenization_df_arc.py
CHANGED
|
@@ -68,8 +68,9 @@ class MorphologicalPreTokenizer:
|
|
| 68 |
PREFIXES = ['و', 'ف', 'ب', 'ك', 'ل', 'ال', 'س', 'وال', 'بال', 'فال', 'لل', 'كال']
|
| 69 |
SUFFIXES = ['ين', 'ها', 'ه', 'هم', 'ي', 'نا', 'كم', 'ون', 'ك', 'هن', 'ني', 'ان', 'ت', 'وا', 'ة']  # NOTE(review): reconstructed from mojibake (UTF-8 Arabic mis-decoded as Windows-874); item order/letters with dropped bytes are best-effort — verify against the original file
|
| 70 |
|
| 71 |
-
def __init__(self, min_stem_length: int = 2):
|
| 72 |
self.min_stem_length = min_stem_length
|
|
|
|
| 73 |
self.prefixes = sorted(self.PREFIXES, key=len, reverse=True)
|
| 74 |
self.suffixes = sorted(self.SUFFIXES, key=len, reverse=True)
|
| 75 |
self.arabic_pattern = re.compile(r'[\u0600-\u06FF]+')
|
|
@@ -77,6 +78,9 @@ class MorphologicalPreTokenizer:
|
|
| 77 |
def segment_word(self, word: str) -> List[str]:
|
| 78 |
if not word or not self.arabic_pattern.fullmatch(word):
|
| 79 |
return [word]
|
|
|
|
|
|
|
|
|
|
| 80 |
|
| 81 |
original = word
|
| 82 |
segments = []
|
|
@@ -168,10 +172,24 @@ class DFArcTokenizer(PreTrainedTokenizerFast):
|
|
| 168 |
phrases_file: Optional[str] = None,
|
| 169 |
normalization_config: Optional[Dict[str, bool]] = None,
|
| 170 |
min_stem_length: int = 2,
|
|
|
|
| 171 |
**kwargs
|
| 172 |
):
|
| 173 |
self.normalizer_helper = ArabicNormalizer(**(normalization_config or {}))
|
| 174 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 175 |
self.phrase_helper = PhraseMerger(phrases_file=phrases_file)
|
| 176 |
|
| 177 |
super().__init__(
|
|
|
|
| 68 |
PREFIXES = ['و', 'ف', 'ب', 'ك', 'ل', 'ال', 'س', 'وال', 'بال', 'فال', 'لل', 'كال']
|
| 69 |
SUFFIXES = ['ين', 'ها', 'ه', 'هم', 'ي', 'نا', 'كم', 'ون', 'ك', 'هن', 'ني', 'ان', 'ت', 'وا', 'ة']  # NOTE(review): reconstructed from mojibake (UTF-8 Arabic mis-decoded as Windows-874); item order/letters with dropped bytes are best-effort — verify against the original file
|
| 70 |
|
| 71 |
+
def __init__(self, min_stem_length: int = 2, exceptions: List[str] = None):
|
| 72 |
self.min_stem_length = min_stem_length
|
| 73 |
+
self.exceptions = set(exceptions) if exceptions else set()
|
| 74 |
self.prefixes = sorted(self.PREFIXES, key=len, reverse=True)
|
| 75 |
self.suffixes = sorted(self.SUFFIXES, key=len, reverse=True)
|
| 76 |
self.arabic_pattern = re.compile(r'[\u0600-\u06FF]+')
|
|
|
|
| 78 |
def segment_word(self, word: str) -> List[str]:
|
| 79 |
if not word or not self.arabic_pattern.fullmatch(word):
|
| 80 |
return [word]
|
| 81 |
+
|
| 82 |
+
if word in self.exceptions:
|
| 83 |
+
return [word]
|
| 84 |
|
| 85 |
original = word
|
| 86 |
segments = []
|
|
|
|
| 172 |
phrases_file: Optional[str] = None,
|
| 173 |
normalization_config: Optional[Dict[str, bool]] = None,
|
| 174 |
min_stem_length: int = 2,
|
| 175 |
+
exceptions_file: Optional[str] = None,
|
| 176 |
**kwargs
|
| 177 |
):
|
| 178 |
self.normalizer_helper = ArabicNormalizer(**(normalization_config or {}))
|
| 179 |
+
|
| 180 |
+
# Load exceptions if provided
|
| 181 |
+
exceptions = []
|
| 182 |
+
if exceptions_file:
|
| 183 |
+
try:
|
| 184 |
+
with open(exceptions_file, 'r', encoding='utf-8') as f:
|
| 185 |
+
exceptions = [line.strip() for line in f if line.strip()]
|
| 186 |
+
except FileNotFoundError:
|
| 187 |
+
pass
|
| 188 |
+
|
| 189 |
+
self.morph_helper = MorphologicalPreTokenizer(
|
| 190 |
+
min_stem_length=min_stem_length,
|
| 191 |
+
exceptions=exceptions
|
| 192 |
+
)
|
| 193 |
self.phrase_helper = PhraseMerger(phrases_file=phrases_file)
|
| 194 |
|
| 195 |
super().__init__(
|
tokenizer_config.json
CHANGED
|
@@ -7,6 +7,7 @@
|
|
| 7 |
]
|
| 8 |
},
|
| 9 |
"phrases_file": "phrase_vocab.json",
|
|
|
|
| 10 |
"vocab_file": "tokenizer.json",
|
| 11 |
"min_stem_length": 2,
|
| 12 |
"normalization_config": {
|
|
|
|
| 7 |
]
|
| 8 |
},
|
| 9 |
"phrases_file": "phrase_vocab.json",
|
| 10 |
+
"exceptions_file": "exceptions.txt",
|
| 11 |
"vocab_file": "tokenizer.json",
|
| 12 |
"min_stem_length": 2,
|
| 13 |
"normalization_config": {
|