fix: all model bugs — S1 S2 S3 G1 P1 (6 fixes across 4 files)
Browse files
S1 (P0): WordAligner now prefers ة over ه at word end when both forms are in-vocabulary (IV)
- araspell_rules.py _select_best_word: ه→ة preference for feminine nouns
S2 (P0): Gender preservation — reject corrections that drop feminine marker
- app.py _is_small_spelling_change: block بارده→بارد, منخفظه→منخفض
S3 (P1): Hamza whitelist — 50+ common Arabic hamza corrections
- araspell_rules.py HAMZA_WHITELIST + fix_common_hamza()
- Fixes: الي→إلى, انت→أنت, لان→لأن, امس→أمس, الايام→الأيام, etc.
G1 (P1): Verb-subject agreement for SVO word order
- grammar_rules.py fix_subject_verb_agreement()
- Handles: الطلاب ذهب→ذهبوا, الطالبات ذهب→ذهبن
P1 (P2): Punctuation model now only adds marks, no spelling/grammar changes
- punctuation_service.py _strip_non_punctuation_changes()
- Reverts PuncAra's baked-in spelling/grammar corrections, keeps only marks
S4 (P2): Mitigated by S1+S2+S3 — spelling now makes better corrections,
fewer bad locks blocking grammar
31/31 tests passing
- src/app.py +8 -0
- src/nlp/grammar/grammar_rules.py +81 -0
- src/nlp/punctuation/punctuation_service.py +97 -1
- src/nlp/spelling/araspell_rules.py +87 -5
|
@@ -751,6 +751,14 @@ def _is_small_spelling_change(orig_word, corr_word):
|
|
| 751 |
if re.search(r'[^ء-يآأإىa-zA-Z]', corr_word):
|
| 752 |
return False
|
| 753 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 754 |
dist = _levenshtein(orig_word, corr_word)
|
| 755 |
max_len = max(len(orig_word), len(corr_word))
|
| 756 |
|
|
|
|
| 751 |
if re.search(r'[^ء-يآأإىa-zA-Z]', corr_word):
|
| 752 |
return False
|
| 753 |
|
| 754 |
+
# Fix S2: Reject corrections that drop feminine marker (ه/ة)
|
| 755 |
+
# e.g. بارده→بارد, منخفظه→منخفض — these are WORSE than no correction
|
| 756 |
+
feminine_endings = ('ه', 'ة')
|
| 757 |
+
if orig_word.endswith(feminine_endings) and not corr_word.endswith(feminine_endings):
|
| 758 |
+
# Only reject if the correction is just the word minus the ending
|
| 759 |
+
if corr_word == orig_word[:-1] or len(corr_word) < len(orig_word):
|
| 760 |
+
return False
|
| 761 |
+
|
| 762 |
dist = _levenshtein(orig_word, corr_word)
|
| 763 |
max_len = max(len(orig_word), len(corr_word))
|
| 764 |
|
|
@@ -161,6 +161,85 @@ class ArabicGrammarGuard:
|
|
| 161 |
text = re.sub(r'\b([وف]?ل)([أ-ي]{4,})(ون|ان)\b', r'\1\2ين', text)
|
| 162 |
return text
|
| 163 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 164 |
def regex_rules_fallback(self, text):
|
| 165 |
# إن وأخواتها
|
| 166 |
text = re.sub(r'\b(إن|أن|كأن|لكن|لعل|ليت)\s+(أبوك|أخوك|ذو|فوك)\b',
|
|
@@ -183,6 +262,8 @@ class ArabicGrammarGuard:
|
|
| 183 |
text = self.fix_verbs_nasb_and_jazm(text)
|
| 184 |
text = self.fix_gender_agreement(text)
|
| 185 |
text = self.fix_prepositions_advanced(text)
|
|
|
|
| 186 |
text = self.regex_rules_fallback(text)
|
| 187 |
text = re.sub(r'\s+', ' ', text).strip()
|
| 188 |
return text
|
|
|
|
|
|
| 161 |
text = re.sub(r'\b([وف]?ل)([أ-ي]{4,})(ون|ان)\b', r'\1\2ين', text)
|
| 162 |
return text
|
| 163 |
|
| 164 |
+
def fix_subject_verb_agreement(self, text):
    """
    Fix G1: when a plural noun PRECEDES a singular verb (SVO order), make the
    verb agree with it in number and gender.

    Arabic rule: in VSO order the verb can stay singular even with a plural
    subject, but in SVO order subject-verb agreement is required.

    Only adjacent noun -> verb token pairs are examined. A verb is rewritten
    only when it has at least three characters and does not start with ي/ت
    (a rough "past tense" test), by appending ن (feminine plural) or
    وا (masculine plural).

    Args:
        text: The sentence to check.

    Returns:
        The tokens joined with single spaces, with corrected verbs
        substituted. ``text`` itself is returned when it has fewer than two
        tokens.
    """
    tokens = simple_word_tokenize(text)
    if len(tokens) < 2:
        return text
    disambig_tokens = self.mle.disambiguate(tokens)
    corrected_tokens = list(tokens)

    # Common broken plurals and collective nouns whose spelling gives no
    # plural hint. Built once per call instead of once per token pair.
    known_plurals_masc = frozenset((
        'الطلاب', 'طلاب', 'الرجال', 'رجال', 'الأولاد', 'أولاد',
        'الأطباء', 'أطباء', 'الاطباء', 'اطباء',
        'العمال', 'عمال', 'الناس', 'الشباب', 'الأبناء',
    ))
    known_plurals_fem = frozenset((
        'الطالبات', 'طالبات', 'النساء', 'نساء', 'البنات', 'بنات',
        'المعلمات', 'معلمات', 'الأمهات', 'أمهات',
    ))

    for i in range(len(disambig_tokens) - 1):
        noun_info = disambig_tokens[i].analyses[0] if disambig_tokens[i].analyses else None
        verb_info = disambig_tokens[i + 1].analyses[0] if disambig_tokens[i + 1].analyses else None
        if not noun_info or not verb_info:
            continue

        noun = noun_info.analysis
        verb = verb_info.analysis

        # Only process noun -> verb patterns (SVO order).
        if noun.get('pos', 'unknown') != 'noun' or verb.get('pos', 'unknown') != 'verb':
            continue

        # Skip verbs the analyzer already reports as non-singular.
        if verb.get('num', 's') != 's':
            continue

        noun_word = corrected_tokens[i]
        verb_word = corrected_tokens[i + 1]
        noun_num = noun.get('num')
        noun_gen = noun.get('gen', 'm')

        # The ون/ين/ات endings are only a spelling hint: many singular nouns
        # carry them (قانون, دين, نبات). Let spelling decide only when the
        # analyzer did not explicitly call the noun singular or dual.
        spelling_may_decide = noun_num not in ('s', 'd')

        is_plural_fem = (
            noun_word in known_plurals_fem
            or (noun_gen == 'f' and noun_num == 'p')
            or (spelling_may_decide and noun_word.endswith('ات'))
        )
        is_plural_masc = (
            noun_word in known_plurals_masc
            or noun_num == 'p'
            or (spelling_may_decide and noun_word.endswith(('ون', 'ين')))
        )
        if not is_plural_fem and not is_plural_masc:
            continue

        # Rough past-tense test: long enough and no ي/ت imperfect prefix.
        if len(verb_word) < 3 or verb_word.startswith(('ي', 'ت')):
            continue

        # Alif maqsura is only valid word-finally, so appending a suffix to a
        # verb such as مشى would create a misspelling. Leave it untouched.
        # NOTE(review): other weak verbs (final ا, hollow verbs such as قال
        # in the feminine plural) also need stem changes, not a plain suffix.
        if verb_word.endswith('ى'):
            continue

        if is_plural_fem:
            # Feminine plural: ذهب -> ذهبن
            if not verb_word.endswith(('ن', 'نَ')):
                corrected_tokens[i + 1] = verb_word + 'ن'
        elif not verb_word.endswith(('وا', 'ون', 'ين')):
            # Masculine plural: ذهب -> ذهبوا
            corrected_tokens[i + 1] = verb_word + 'وا'

    return " ".join(corrected_tokens)
|
| 242 |
+
|
| 243 |
def regex_rules_fallback(self, text):
|
| 244 |
# إن وأخواتها
|
| 245 |
text = re.sub(r'\b(إن|أن|كأن|لكن|لعل|ليت)\s+(أبوك|أخوك|ذو|فوك)\b',
|
|
|
|
| 262 |
text = self.fix_verbs_nasb_and_jazm(text)
|
| 263 |
text = self.fix_gender_agreement(text)
|
| 264 |
text = self.fix_prepositions_advanced(text)
|
| 265 |
+
text = self.fix_subject_verb_agreement(text) # Fix G1
|
| 266 |
text = self.regex_rules_fallback(text)
|
| 267 |
text = re.sub(r'\s+', ' ', text).strip()
|
| 268 |
return text
|
| 269 |
+
|
|
@@ -27,14 +27,108 @@ class PunctuationChecker:
|
|
| 27 |
Arabic punctuation restoration pipeline:
|
| 28 |
1. Preprocessing (remove diacritics)
|
| 29 |
2. Model inference (chunked, windowed — 50 words/chunk)
|
| 30 |
-
3. Postprocessing (
|
|
|
|
| 31 |
"""
|
| 32 |
|
|
|
|
|
|
|
|
|
|
| 33 |
def __init__(self, model, tokenizer, device):
|
| 34 |
self.model = model
|
| 35 |
self.tokenizer = tokenizer
|
| 36 |
self.device = device
|
| 37 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
def _predict_chunk(self, text_chunk: str) -> str:
|
| 39 |
"""Run model inference on a single chunk (max 128 tokens)."""
|
| 40 |
from nlp.punctuation.punctuation_rules import arabic_preprocessing
|
|
@@ -114,6 +208,8 @@ class PunctuationChecker:
|
|
| 114 |
|
| 115 |
for paragraph in paragraphs:
|
| 116 |
punctuated = self._fix_punctuation(paragraph)
|
|
|
|
|
|
|
| 117 |
cleaned = arabic_postprocessing(punctuated)
|
| 118 |
processed_paragraphs.append(cleaned)
|
| 119 |
|
|
|
|
| 27 |
Arabic punctuation restoration pipeline:
|
| 28 |
1. Preprocessing (remove diacritics)
|
| 29 |
2. Model inference (chunked, windowed — 50 words/chunk)
|
| 30 |
+
3. Postprocessing: strip non-punctuation changes (Fix P1)
|
| 31 |
+
4. Typographic cleanup
|
| 32 |
"""
|
| 33 |
|
| 34 |
+
# Arabic and common punctuation marks
|
| 35 |
+
PUNCTUATION_CHARS = set('.,;:!?،؛؟!.:«»"\'()-–—…')
|
| 36 |
+
|
| 37 |
def __init__(self, model, tokenizer, device):
|
| 38 |
self.model = model
|
| 39 |
self.tokenizer = tokenizer
|
| 40 |
self.device = device
|
| 41 |
|
| 42 |
+
@staticmethod
|
| 43 |
+
def _strip_punct(word: str) -> str:
|
| 44 |
+
"""Remove leading/trailing punctuation from a word."""
|
| 45 |
+
return word.strip('.,;:!?،؛؟!.:«»"\'()-–—…')
|
| 46 |
+
|
| 47 |
+
def _strip_non_punctuation_changes(self, original: str, punctuated: str) -> str:
|
| 48 |
+
"""
|
| 49 |
+
Fix P1: The PuncAra model was fine-tuned on data with spelling/grammar
|
| 50 |
+
corrections. We only want punctuation marks from this stage.
|
| 51 |
+
|
| 52 |
+
Strategy: Align original and punctuated word-by-word. For each word,
|
| 53 |
+
if the model changed the BASE text (not just added/moved punctuation),
|
| 54 |
+
revert to the original word but keep any punctuation the model added.
|
| 55 |
+
"""
|
| 56 |
+
orig_words = original.split()
|
| 57 |
+
punc_words = punctuated.split()
|
| 58 |
+
|
| 59 |
+
if not orig_words or not punc_words:
|
| 60 |
+
return punctuated
|
| 61 |
+
|
| 62 |
+
# Build result by aligning words
|
| 63 |
+
result = []
|
| 64 |
+
oi = 0 # index into orig_words
|
| 65 |
+
pi = 0 # index into punc_words
|
| 66 |
+
|
| 67 |
+
while oi < len(orig_words) and pi < len(punc_words):
|
| 68 |
+
o_word = orig_words[oi]
|
| 69 |
+
p_word = punc_words[pi]
|
| 70 |
+
|
| 71 |
+
o_base = self._strip_punct(o_word)
|
| 72 |
+
p_base = self._strip_punct(p_word)
|
| 73 |
+
|
| 74 |
+
if o_base == p_base:
|
| 75 |
+
# Same base word — keep punctuation changes from model
|
| 76 |
+
result.append(p_word)
|
| 77 |
+
oi += 1
|
| 78 |
+
pi += 1
|
| 79 |
+
elif self._is_only_punct_difference(o_word, p_word):
|
| 80 |
+
# Words differ only by punctuation — keep model's punctuation
|
| 81 |
+
result.append(p_word)
|
| 82 |
+
oi += 1
|
| 83 |
+
pi += 1
|
| 84 |
+
else:
|
| 85 |
+
# Model changed the actual word content (spelling/grammar/hamza)
|
| 86 |
+
# Revert to original word but transfer any NEW punctuation
|
| 87 |
+
punct_suffix = ''
|
| 88 |
+
punct_prefix = ''
|
| 89 |
+
for ch in reversed(p_word):
|
| 90 |
+
if ch in self.PUNCTUATION_CHARS:
|
| 91 |
+
punct_suffix = ch + punct_suffix
|
| 92 |
+
else:
|
| 93 |
+
break
|
| 94 |
+
for ch in p_word:
|
| 95 |
+
if ch in self.PUNCTUATION_CHARS:
|
| 96 |
+
punct_prefix += ch
|
| 97 |
+
else:
|
| 98 |
+
break
|
| 99 |
+
|
| 100 |
+
# Only add punctuation that wasn't already there
|
| 101 |
+
if not o_word.endswith(punct_suffix) and punct_suffix:
|
| 102 |
+
result.append(o_word + punct_suffix)
|
| 103 |
+
elif punct_prefix and not o_word.startswith(punct_prefix):
|
| 104 |
+
result.append(punct_prefix + o_word)
|
| 105 |
+
else:
|
| 106 |
+
result.append(o_word)
|
| 107 |
+
oi += 1
|
| 108 |
+
pi += 1
|
| 109 |
+
|
| 110 |
+
# Append remaining original words
|
| 111 |
+
while oi < len(orig_words):
|
| 112 |
+
result.append(orig_words[oi])
|
| 113 |
+
oi += 1
|
| 114 |
+
|
| 115 |
+
# Append remaining punctuation-only words from model
|
| 116 |
+
while pi < len(punc_words):
|
| 117 |
+
p_word = punc_words[pi]
|
| 118 |
+
if all(ch in self.PUNCTUATION_CHARS or ch.isspace() for ch in p_word):
|
| 119 |
+
result.append(p_word)
|
| 120 |
+
pi += 1
|
| 121 |
+
|
| 122 |
+
return ' '.join(result)
|
| 123 |
+
|
| 124 |
+
@staticmethod
|
| 125 |
+
def _is_only_punct_difference(word1: str, word2: str) -> bool:
|
| 126 |
+
"""Check if two words differ only by punctuation characters."""
|
| 127 |
+
PUNCT = set('.,;:!?،؛؟!.:«»"\'()-–—…')
|
| 128 |
+
base1 = ''.join(c for c in word1 if c not in PUNCT)
|
| 129 |
+
base2 = ''.join(c for c in word2 if c not in PUNCT)
|
| 130 |
+
return base1 == base2
|
| 131 |
+
|
| 132 |
def _predict_chunk(self, text_chunk: str) -> str:
|
| 133 |
"""Run model inference on a single chunk (max 128 tokens)."""
|
| 134 |
from nlp.punctuation.punctuation_rules import arabic_preprocessing
|
|
|
|
| 208 |
|
| 209 |
for paragraph in paragraphs:
|
| 210 |
punctuated = self._fix_punctuation(paragraph)
|
| 211 |
+
# Fix P1: Strip spelling/grammar changes, keep only punctuation
|
| 212 |
+
punctuated = self._strip_non_punctuation_changes(paragraph, punctuated)
|
| 213 |
cleaned = arabic_postprocessing(punctuated)
|
| 214 |
processed_paragraphs.append(cleaned)
|
| 215 |
|
|
@@ -114,6 +114,50 @@ class AraSpellPostProcessor:
|
|
| 114 |
|
| 115 |
# --- Hamza & Ta Marbuta Handling ---
|
| 116 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 117 |
@staticmethod
|
| 118 |
def fix_hamza_conservative(text: str) -> str:
|
| 119 |
"""Conservative Hamza normalization — only at word END, not middle."""
|
|
@@ -128,34 +172,62 @@ class AraSpellPostProcessor:
|
|
| 128 |
result.append(word)
|
| 129 |
return ' '.join(result)
|
| 130 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 131 |
@staticmethod
|
| 132 |
def fix_ha_ta_marbuta(text: str, vocab_manager=None) -> str:
|
| 133 |
"""
|
| 134 |
Smart ه → ة fix at end of words.
|
| 135 |
-
Strategy:
|
|
|
|
| 136 |
"""
|
| 137 |
PROTECTED_ENDINGS = ['لله']
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 138 |
words = text.split()
|
| 139 |
result = []
|
| 140 |
for word in words:
|
| 141 |
if any(word.endswith(e) for e in PROTECTED_ENDINGS):
|
| 142 |
result.append(word)
|
| 143 |
continue
|
| 144 |
-
if
|
|
|
|
|
|
|
|
|
|
| 145 |
if word[-2] in AraSpellPostProcessor.ARABIC_CONSONANTS:
|
| 146 |
candidate_with_ta = word[:-1] + 'ة'
|
|
|
|
| 147 |
if vocab_manager:
|
| 148 |
ta_iv = vocab_manager.is_iv(candidate_with_ta)
|
| 149 |
ha_iv = vocab_manager.is_iv(word)
|
| 150 |
if ta_iv:
|
|
|
|
| 151 |
result.append(candidate_with_ta)
|
| 152 |
continue
|
| 153 |
elif ha_iv:
|
| 154 |
result.append(word)
|
| 155 |
continue
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
result.append(word)
|
| 160 |
return ' '.join(result)
|
| 161 |
|
|
@@ -263,6 +335,7 @@ class AraSpellPostProcessor:
|
|
| 263 |
text = AraSpellPostProcessor.remove_hallucinations(text)
|
| 264 |
text = AraSpellPostProcessor.unified_collapse_repeated(text)
|
| 265 |
text = AraSpellPostProcessor.fix_hamza_conservative(text)
|
|
|
|
| 266 |
text = AraSpellPostProcessor.fix_ha_ta_marbuta(text, vocab_manager=vocab_manager)
|
| 267 |
text = AraSpellPostProcessor.remove_word_repetition_with_wa(text)
|
| 268 |
text = AraSpellPostProcessor.remove_duplicate_words(text)
|
|
@@ -588,6 +661,15 @@ class WordAligner:
|
|
| 588 |
if in_iv and not out_iv:
|
| 589 |
return input_word
|
| 590 |
if in_iv and out_iv:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 591 |
return input_word
|
| 592 |
if len(input_word) == len(output_word) and len(input_word) >= 3:
|
| 593 |
for i in range(len(input_word)):
|
|
|
|
| 114 |
|
| 115 |
# --- Hamza & Ta Marbuta Handling ---
|
| 116 |
|
| 117 |
+
# Lookup table of frequent hamza misspellings in informal Arabic writing.
# Key: the word as commonly typed with a bare alif; value: the spelling
# that replaces it. Matching against this table is done per whole token.
HAMZA_WHITELIST = {
    # Prepositions
    'الي': 'إلى',
    'الى': 'إلى',
    # Personal pronouns
    'انت': 'أنت',
    'انتم': 'أنتم',
    'انتي': 'أنتِ',
    'انتو': 'أنتم',
    'انتن': 'أنتن',
    'انا': 'أنا',
    # Time
    'امس': 'أمس',
    # "because" and its suffixed forms
    'لان': 'لأن',
    'لانه': 'لأنه',
    'لانها': 'لأنها',
    'لانهم': 'لأنهم',
    'لانك': 'لأنك',
    # Particles and question words
    'اذا': 'إذا',
    'اذ': 'إذ',
    'اي': 'أي',
    'اين': 'أين',
    'او': 'أو',
    'اما': 'أما',
    'ان': 'أن',
    'انه': 'أنه',
    'انها': 'أنها',
    'انهم': 'أنهم',
    # Ordinals and "other" / "now"
    'اخر': 'آخر',
    'اخرى': 'أخرى',
    'الان': 'الآن',
    'اول': 'أول',
    'اولى': 'أولى',
    # "became"
    'اصبح': 'أصبح',
    'اصبحت': 'أصبحت',
    # Comparatives
    'اكثر': 'أكثر',
    'اقل': 'أقل',
    'اعلى': 'أعلى',
    'ادنى': 'أدنى',
    'اسرع': 'أسرع',
    'ابطا': 'أبطأ',
    'اكبر': 'أكبر',
    'اصغر': 'أصغر',
    'احسن': 'أحسن',
    'اسوا': 'أسوأ',
    # Adverbials
    'امام': 'أمام',
    'اثناء': 'أثناء',
    'ايضا': 'أيضاً',
    'ايض': 'أيضاً',
    # Adjectives
    'اساسي': 'أساسي',
    'اساسية': 'أساسية',
    # Family and people
    'اخي': 'أخي',
    'اخت': 'أخت',
    'اخو': 'أخو',
    'ابي': 'أبي',
    'اب': 'أب',
    'ابو': 'أبو',
    'اهل': 'أهل',
    'اطفال': 'أطفال',
    'اصدقاء': 'أصدقاء',
    'اصدقائي': 'أصدقائي',
    # Common verbs
    'اعتقد': 'أعتقد',
    'اريد': 'أريد',
    'احب': 'أحب',
    'اعرف': 'أعرف',
    'اعلم': 'أعلم',
    'اخذ': 'أخذ',
    'اكل': 'أكل',
    # Forms with the definite article
    'الايام': 'الأيام',
    'الاطفال': 'الأطفال',
    'الاسعار': 'الأسعار',
    'الاولى': 'الأولى',
    'الاخير': 'الأخير',
    'الاخيرة': 'الأخيرة',
    # Form with an attached conjunction
    'واصدقائي': 'وأصدقائي',
}
|
| 160 |
+
|
| 161 |
@staticmethod
|
| 162 |
def fix_hamza_conservative(text: str) -> str:
|
| 163 |
"""Conservative Hamza normalization — only at word END, not middle."""
|
|
|
|
| 172 |
result.append(word)
|
| 173 |
return ' '.join(result)
|
| 174 |
|
| 175 |
+
@staticmethod
|
| 176 |
+
def fix_common_hamza(text: str) -> str:
|
| 177 |
+
"""
|
| 178 |
+
Fix common hamza placement errors using a whitelist.
|
| 179 |
+
These are the most frequent informal Arabic spelling mistakes.
|
| 180 |
+
"""
|
| 181 |
+
words = text.split()
|
| 182 |
+
result = []
|
| 183 |
+
for word in words:
|
| 184 |
+
# Check exact match first
|
| 185 |
+
if word in AraSpellPostProcessor.HAMZA_WHITELIST:
|
| 186 |
+
result.append(AraSpellPostProcessor.HAMZA_WHITELIST[word])
|
| 187 |
+
else:
|
| 188 |
+
result.append(word)
|
| 189 |
+
return ' '.join(result)
|
| 190 |
+
|
| 191 |
@staticmethod
|
| 192 |
def fix_ha_ta_marbuta(text: str, vocab_manager=None) -> str:
|
| 193 |
"""
|
| 194 |
Smart ه → ة fix at end of words.
|
| 195 |
+
Strategy: Always prefer ة when the previous char is a consonant,
|
| 196 |
+
UNLESS the ه form is specifically a known word and the ة form is NOT.
|
| 197 |
"""
|
| 198 |
PROTECTED_ENDINGS = ['لله']
|
| 199 |
+
# Words that genuinely end in ه (not ة)
|
| 200 |
+
PROTECTED_HA_WORDS = {
|
| 201 |
+
'الله', 'لله', 'فيه', 'عليه', 'منه', 'به', 'له', 'إليه',
|
| 202 |
+
'وجه', 'نزه', 'سفه', 'فقه', 'نبه', 'شبه', 'مكره', 'تنبه',
|
| 203 |
+
'اتجه', 'توجه', 'تشابه',
|
| 204 |
+
}
|
| 205 |
words = text.split()
|
| 206 |
result = []
|
| 207 |
for word in words:
|
| 208 |
if any(word.endswith(e) for e in PROTECTED_ENDINGS):
|
| 209 |
result.append(word)
|
| 210 |
continue
|
| 211 |
+
if word in PROTECTED_HA_WORDS:
|
| 212 |
+
result.append(word)
|
| 213 |
+
continue
|
| 214 |
+
if len(word) >= 3 and word.endswith('ه'):
|
| 215 |
if word[-2] in AraSpellPostProcessor.ARABIC_CONSONANTS:
|
| 216 |
candidate_with_ta = word[:-1] + 'ة'
|
| 217 |
+
# Default: prefer ة (correct Arabic orthography for feminine nouns)
|
| 218 |
if vocab_manager:
|
| 219 |
ta_iv = vocab_manager.is_iv(candidate_with_ta)
|
| 220 |
ha_iv = vocab_manager.is_iv(word)
|
| 221 |
if ta_iv:
|
| 222 |
+
# Always prefer ة when it's a valid word
|
| 223 |
result.append(candidate_with_ta)
|
| 224 |
continue
|
| 225 |
elif ha_iv:
|
| 226 |
result.append(word)
|
| 227 |
continue
|
| 228 |
+
# No vocab manager — default to ة
|
| 229 |
+
result.append(candidate_with_ta)
|
| 230 |
+
continue
|
| 231 |
result.append(word)
|
| 232 |
return ' '.join(result)
|
| 233 |
|
|
|
|
| 335 |
text = AraSpellPostProcessor.remove_hallucinations(text)
|
| 336 |
text = AraSpellPostProcessor.unified_collapse_repeated(text)
|
| 337 |
text = AraSpellPostProcessor.fix_hamza_conservative(text)
|
| 338 |
+
text = AraSpellPostProcessor.fix_common_hamza(text) # Fix S3: hamza whitelist
|
| 339 |
text = AraSpellPostProcessor.fix_ha_ta_marbuta(text, vocab_manager=vocab_manager)
|
| 340 |
text = AraSpellPostProcessor.remove_word_repetition_with_wa(text)
|
| 341 |
text = AraSpellPostProcessor.remove_duplicate_words(text)
|
|
|
|
| 661 |
if in_iv and not out_iv:
|
| 662 |
return input_word
|
| 663 |
if in_iv and out_iv:
|
| 664 |
+
# Fix S1: When only difference is ه→ة at word end, prefer ة
|
| 665 |
+
# (correct Arabic orthography — ة is the standard feminine ending)
|
| 666 |
+
if (input_word.endswith('ه') and output_word.endswith('ة')
|
| 667 |
+
and input_word[:-1] == output_word[:-1]):
|
| 668 |
+
return output_word
|
| 669 |
+
# Fix S1: Also handle ة→ه (don't regress a correct ة to ه)
|
| 670 |
+
if (input_word.endswith('ة') and output_word.endswith('ه')
|
| 671 |
+
and input_word[:-1] == output_word[:-1]):
|
| 672 |
+
return input_word
|
| 673 |
return input_word
|
| 674 |
if len(input_word) == len(output_word) and len(input_word) >= 3:
|
| 675 |
for i in range(len(input_word)):
|