Release v1.1: PMI Phrase Merging & Smart Morphology

Changed files:
- tokenization_df_arc.py +20 -2
- tokenizer_config.json +1 -0
tokenization_df_arc.py
CHANGED
|
@@ -68,8 +68,9 @@ class MorphologicalPreTokenizer:
|
|
| 68 |
PREFIXES = ['و', 'ف', 'ب', 'ك', 'ل', 'ال', 'س', 'وال', 'بال', 'فال', 'لل', 'كال']
|
| 69 |
SUFFIXES = ['ين', 'ها', 'ه', 'هم', 'ي', 'نا', 'كم', 'ون', 'ك', 'هن', 'ني', 'ان', 'ت', 'وا', 'ة']  # NOTE(review): reconstructed from mojibake (UTF-8 Arabic mis-decoded as Windows-874); item order/letters with dropped bytes are best-effort — verify against the original file
|
| 70 |
|
| 71 |
-
def __init__(self, min_stem_length: int = 2):
|
| 72 |
self.min_stem_length = min_stem_length
|
|
|
|
| 73 |
self.prefixes = sorted(self.PREFIXES, key=len, reverse=True)
|
| 74 |
self.suffixes = sorted(self.SUFFIXES, key=len, reverse=True)
|
| 75 |
self.arabic_pattern = re.compile(r'[\u0600-\u06FF]+')
|
|
@@ -77,6 +78,9 @@ class MorphologicalPreTokenizer:
|
|
| 77 |
def segment_word(self, word: str) -> List[str]:
|
| 78 |
if not word or not self.arabic_pattern.fullmatch(word):
|
| 79 |
return [word]
|
|
|
|
|
|
|
|
|
|
| 80 |
|
| 81 |
original = word
|
| 82 |
segments = []
|
|
@@ -168,10 +172,24 @@ class DFArcTokenizer(PreTrainedTokenizerFast):
|
|
| 168 |
phrases_file: Optional[str] = None,
|
| 169 |
normalization_config: Optional[Dict[str, bool]] = None,
|
| 170 |
min_stem_length: int = 2,
|
|
|
|
| 171 |
**kwargs
|
| 172 |
):
|
| 173 |
self.normalizer_helper = ArabicNormalizer(**(normalization_config or {}))
|
| 174 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 175 |
self.phrase_helper = PhraseMerger(phrases_file=phrases_file)
|
| 176 |
|
| 177 |
super().__init__(
|
|
|
|
| 68 |
PREFIXES = ['و', 'ف', 'ب', 'ك', 'ل', 'ال', 'س', 'وال', 'بال', 'فال', 'لل', 'كال']
|
| 69 |
SUFFIXES = ['ين', 'ها', 'ه', 'هم', 'ي', 'نا', 'كم', 'ون', 'ك', 'هن', 'ني', 'ان', 'ت', 'وا', 'ة']  # NOTE(review): reconstructed from mojibake (UTF-8 Arabic mis-decoded as Windows-874); item order/letters with dropped bytes are best-effort — verify against the original file
|
| 70 |
|
| 71 |
+
def __init__(self, min_stem_length: int = 2, exceptions: List[str] = None):
|
| 72 |
self.min_stem_length = min_stem_length
|
| 73 |
+
self.exceptions = set(exceptions) if exceptions else set()
|
| 74 |
self.prefixes = sorted(self.PREFIXES, key=len, reverse=True)
|
| 75 |
self.suffixes = sorted(self.SUFFIXES, key=len, reverse=True)
|
| 76 |
self.arabic_pattern = re.compile(r'[\u0600-\u06FF]+')
|
|
|
|
| 78 |
def segment_word(self, word: str) -> List[str]:
|
| 79 |
if not word or not self.arabic_pattern.fullmatch(word):
|
| 80 |
return [word]
|
| 81 |
+
|
| 82 |
+
if word in self.exceptions:
|
| 83 |
+
return [word]
|
| 84 |
|
| 85 |
original = word
|
| 86 |
segments = []
|
|
|
|
| 172 |
phrases_file: Optional[str] = None,
|
| 173 |
normalization_config: Optional[Dict[str, bool]] = None,
|
| 174 |
min_stem_length: int = 2,
|
| 175 |
+
exceptions_file: Optional[str] = None,
|
| 176 |
**kwargs
|
| 177 |
):
|
| 178 |
self.normalizer_helper = ArabicNormalizer(**(normalization_config or {}))
|
| 179 |
+
|
| 180 |
+
# Load exceptions if provided
|
| 181 |
+
exceptions = []
|
| 182 |
+
if exceptions_file:
|
| 183 |
+
try:
|
| 184 |
+
with open(exceptions_file, 'r', encoding='utf-8') as f:
|
| 185 |
+
exceptions = [line.strip() for line in f if line.strip()]
|
| 186 |
+
except FileNotFoundError:
|
| 187 |
+
pass
|
| 188 |
+
|
| 189 |
+
self.morph_helper = MorphologicalPreTokenizer(
|
| 190 |
+
min_stem_length=min_stem_length,
|
| 191 |
+
exceptions=exceptions
|
| 192 |
+
)
|
| 193 |
self.phrase_helper = PhraseMerger(phrases_file=phrases_file)
|
| 194 |
|
| 195 |
super().__init__(
|
tokenizer_config.json
CHANGED
|
@@ -7,6 +7,7 @@
|
|
| 7 |
]
|
| 8 |
},
|
| 9 |
"phrases_file": "phrase_vocab.json",
|
|
|
|
| 10 |
"vocab_file": "tokenizer.json",
|
| 11 |
"min_stem_length": 2,
|
| 12 |
"normalization_config": {
|
|
|
|
| 7 |
]
|
| 8 |
},
|
| 9 |
"phrases_file": "phrase_vocab.json",
|
| 10 |
+
"exceptions_file": "exceptions.txt",
|
| 11 |
"vocab_file": "tokenizer.json",
|
| 12 |
"min_stem_length": 2,
|
| 13 |
"normalization_config": {
|