Arabic
arabic
tokenizer
morphology
nlp
dialect
fr3on commited on
Commit
f1a695e
·
verified ·
1 Parent(s): dfdf13b

Release v1.1: PMI Phrase Merging & Smart Morphology

Browse files
Files changed (2) hide show
  1. tokenization_df_arc.py +20 -2
  2. tokenizer_config.json +1 -0
tokenization_df_arc.py CHANGED
@@ -68,8 +68,9 @@ class MorphologicalPreTokenizer:
68
  PREFIXES = ['و', 'ف', 'ب', 'ك', 'ل', 'ال', 'س', 'وال', 'بال', 'كال', 'لل', 'فال']
69
  SUFFIXES = ['ني', 'نا', 'ك', 'كم', 'ه', 'ها', 'هم', 'هن', 'ي', 'ون', 'ين', 'ان', 'ت', 'وا', 'ة']
70
 
71
- def __init__(self, min_stem_length: int = 2):
72
  self.min_stem_length = min_stem_length
 
73
  self.prefixes = sorted(self.PREFIXES, key=len, reverse=True)
74
  self.suffixes = sorted(self.SUFFIXES, key=len, reverse=True)
75
  self.arabic_pattern = re.compile(r'[\u0600-\u06FF]+')
@@ -77,6 +78,9 @@ class MorphologicalPreTokenizer:
77
  def segment_word(self, word: str) -> List[str]:
78
  if not word or not self.arabic_pattern.fullmatch(word):
79
  return [word]
 
 
 
80
 
81
  original = word
82
  segments = []
@@ -168,10 +172,24 @@ class DFArcTokenizer(PreTrainedTokenizerFast):
168
  phrases_file: Optional[str] = None,
169
  normalization_config: Optional[Dict[str, bool]] = None,
170
  min_stem_length: int = 2,
 
171
  **kwargs
172
  ):
173
  self.normalizer_helper = ArabicNormalizer(**(normalization_config or {}))
174
- self.morph_helper = MorphologicalPreTokenizer(min_stem_length=min_stem_length)
 
 
 
 
 
 
 
 
 
 
 
 
 
175
  self.phrase_helper = PhraseMerger(phrases_file=phrases_file)
176
 
177
  super().__init__(
 
68
  PREFIXES = ['و', 'ف', 'ب', 'ك', 'ل', 'ال', 'س', 'وال', 'بال', 'كال', 'لل', 'فال']
69
  SUFFIXES = ['ني', 'نا', 'ك', 'كم', 'ه', 'ها', 'هم', 'هن', 'ي', 'ون', 'ين', 'ان', 'ت', 'وا', 'ة']
70
 
71
+ def __init__(self, min_stem_length: int = 2, exceptions: List[str] = None):
72
  self.min_stem_length = min_stem_length
73
+ self.exceptions = set(exceptions) if exceptions else set()
74
  self.prefixes = sorted(self.PREFIXES, key=len, reverse=True)
75
  self.suffixes = sorted(self.SUFFIXES, key=len, reverse=True)
76
  self.arabic_pattern = re.compile(r'[\u0600-\u06FF]+')
 
78
  def segment_word(self, word: str) -> List[str]:
79
  if not word or not self.arabic_pattern.fullmatch(word):
80
  return [word]
81
+
82
+ if word in self.exceptions:
83
+ return [word]
84
 
85
  original = word
86
  segments = []
 
172
  phrases_file: Optional[str] = None,
173
  normalization_config: Optional[Dict[str, bool]] = None,
174
  min_stem_length: int = 2,
175
+ exceptions_file: Optional[str] = None,
176
  **kwargs
177
  ):
178
  self.normalizer_helper = ArabicNormalizer(**(normalization_config or {}))
179
+
180
+ # Load exceptions if provided
181
+ exceptions = []
182
+ if exceptions_file:
183
+ try:
184
+ with open(exceptions_file, 'r', encoding='utf-8') as f:
185
+ exceptions = [line.strip() for line in f if line.strip()]
186
+ except FileNotFoundError:
187
+ pass
188
+
189
+ self.morph_helper = MorphologicalPreTokenizer(
190
+ min_stem_length=min_stem_length,
191
+ exceptions=exceptions
192
+ )
193
  self.phrase_helper = PhraseMerger(phrases_file=phrases_file)
194
 
195
  super().__init__(
tokenizer_config.json CHANGED
@@ -7,6 +7,7 @@
7
  ]
8
  },
9
  "phrases_file": "phrase_vocab.json",
 
10
  "vocab_file": "tokenizer.json",
11
  "min_stem_length": 2,
12
  "normalization_config": {
 
7
  ]
8
  },
9
  "phrases_file": "phrase_vocab.json",
10
+ "exceptions_file": "exceptions.txt",
11
  "vocab_file": "tokenizer.json",
12
  "min_stem_length": 2,
13
  "normalization_config": {