Commit ·
17a3ac2
1
Parent(s): 27198e8
FIX-33b: Protect second+third preposition regexes with blocklist
Browse files
The بال...ون/ان and ل...ون/ان patterns had no blocklist protection, so root nouns were being corrupted:
بالامتحان would become بالامتحين, and للإنسان would become للإنسين.
Now all three preposition regexes use callbacks that apply the same protections:
- _PREP_BLOCKLIST check
- ان suffix → skip (almost always root noun form)
Tests: 39 passing (10 new for FIX-33b).
- src/nlp/grammar/grammar_rules.py +27 -2
- tests/test_recent_fixes.py +66 -0
src/nlp/grammar/grammar_rules.py
CHANGED
|
@@ -238,10 +238,35 @@ class ArabicGrammarGuard:
|
|
| 238 |
text = re.sub(r'\b([وف]?(?:في|من|إلى|على|عن|حتى))\s+([أ-ي]{4,})(ون|ان)\b', _prep_replace, text)
|
| 239 |
|
| 240 |
# (وبالمبرمجون) -> (وبالمبرمجين)
|
| 241 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 242 |
|
| 243 |
# (ولمهندسون) -> (ولمهندسين)
|
| 244 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 245 |
return text
|
| 246 |
|
| 247 |
def fix_subject_verb_agreement(self, text):
|
|
|
|
| 238 |
text = re.sub(r'\b([وف]?(?:في|من|إلى|على|عن|حتى))\s+([أ-ي]{4,})(ون|ان)\b', _prep_replace, text)
|
| 239 |
|
| 240 |
# (وبالمبرمجون) -> (وبالمبرمجين)
|
| 241 |
+
# FIX-33b: Same blocklist protection as first regex
|
| 242 |
+
def _attached_prep_replace(m):
|
| 243 |
+
prefix = m.group(1) # وب، ب، فب، ول، ل، etc.
|
| 244 |
+
stem = m.group(2)
|
| 245 |
+
suffix = m.group(3)
|
| 246 |
+
full_word = 'ال' + stem + suffix # reconstruct with ال for blocklist check
|
| 247 |
+
if full_word in self._PREP_BLOCKLIST:
|
| 248 |
+
return m.group(0)
|
| 249 |
+
# Words ending in ان with 4+ char stems are almost always root nouns
|
| 250 |
+
if suffix == 'ان':
|
| 251 |
+
return m.group(0)
|
| 252 |
+
return f'{prefix}ال{stem}ين'
|
| 253 |
+
|
| 254 |
+
text = re.sub(r'\b([وف]?[بلكف])ال([أ-ي]{4,})(ون|ان)\b', _attached_prep_replace, text)
|
| 255 |
|
| 256 |
# (ولمهندسون) -> (ولمهندسين)
|
| 257 |
+
# FIX-33b: Same protection — reconstruct full word for blocklist
|
| 258 |
+
def _lam_prep_replace(m):
|
| 259 |
+
prefix = m.group(1) # ول، ل، فل
|
| 260 |
+
stem = m.group(2)
|
| 261 |
+
suffix = m.group(3)
|
| 262 |
+
# Check blocklist against the word with the captured prefix removed (stem + suffix)
|
| 263 |
+
if (stem + suffix) in self._PREP_BLOCKLIST:
|
| 264 |
+
return m.group(0)
|
| 265 |
+
if suffix == 'ان':
|
| 266 |
+
return m.group(0)
|
| 267 |
+
return f'{prefix}{stem}ين'
|
| 268 |
+
|
| 269 |
+
text = re.sub(r'\b([وف]?ل)([أ-ي]{4,})(ون|ان)\b', _lam_prep_replace, text)
|
| 270 |
return text
|
| 271 |
|
| 272 |
def fix_subject_verb_agreement(self, text):
|
tests/test_recent_fixes.py
CHANGED
|
@@ -80,6 +80,72 @@ for input_text, expected in plurals:
|
|
| 80 |
result = fix_prepositions(input_text)
|
| 81 |
test(f"'{input_text}' → '{expected}'", result == expected, f"got '{result}'")
|
| 82 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
|
| 84 |
# ══════════════════════════════════════════════════════════════
|
| 85 |
# TEST 2: FIX-35 — Spelling doesn't strip conjugation suffixes
|
|
|
|
| 80 |
result = fix_prepositions(input_text)
|
| 81 |
test(f"'{input_text}' → '{expected}'", result == expected, f"got '{result}'")
|
| 82 |
|
| 83 |
+
# ── FIX-33b: Second regex (بال...ون/ان) ──
|
| 84 |
+
print("\n═══ FIX-33b: Attached preposition regex protection ═══")
|
| 85 |
+
|
| 86 |
+
def _attached_prep_replace(m):
|
| 87 |
+
prefix = m.group(1)
|
| 88 |
+
stem = m.group(2)
|
| 89 |
+
suffix = m.group(3)
|
| 90 |
+
full_word = 'ال' + stem + suffix
|
| 91 |
+
if full_word in _PREP_BLOCKLIST:
|
| 92 |
+
return m.group(0)
|
| 93 |
+
if suffix == 'ان':
|
| 94 |
+
return m.group(0)
|
| 95 |
+
return f'{prefix}ال{stem}ين'
|
| 96 |
+
|
| 97 |
+
def fix_attached_prep(text):
|
| 98 |
+
return re.sub(r'\b([وف]?[بلكف])ال([أ-ي]{4,})(ون|ان)\b', _attached_prep_replace, text)
|
| 99 |
+
|
| 100 |
+
# Root nouns with بال — should NOT be corrupted
|
| 101 |
+
attached_root = [
|
| 102 |
+
("بالامتحان", "بالامتحان"), # NOT بالامتحين
|
| 103 |
+
("بالإنسان", "بالإنسان"), # NOT بالإنسين
|
| 104 |
+
("بالميدان", "بالميدان"), # NOT بالميدين
|
| 105 |
+
("كالسلطان", "كالسلطان"), # NOT كالسلطين
|
| 106 |
+
("فبالبرلمان", "فبالبرلمان"), # NOT فبالبرلمين
|
| 107 |
+
]
|
| 108 |
+
for input_text, expected in attached_root:
|
| 109 |
+
result = fix_attached_prep(input_text)
|
| 110 |
+
test(f"'{input_text}' → unchanged", result == expected, f"got '{result}'")
|
| 111 |
+
|
| 112 |
+
# Actual plurals with بال — SHOULD be corrected
|
| 113 |
+
attached_plurals = [
|
| 114 |
+
("بالمهندسون", "بالمهندسين"),
|
| 115 |
+
("كالمعلمون", "كالمعلمين"),
|
| 116 |
+
]
|
| 117 |
+
for input_text, expected in attached_plurals:
|
| 118 |
+
result = fix_attached_prep(input_text)
|
| 119 |
+
test(f"'{input_text}' → '{expected}'", result == expected, f"got '{result}'")
|
| 120 |
+
|
| 121 |
+
# ── FIX-33b: Third regex (ل...ون/ان) ──
|
| 122 |
+
def _lam_prep_replace(m):
|
| 123 |
+
prefix = m.group(1)
|
| 124 |
+
stem = m.group(2)
|
| 125 |
+
suffix = m.group(3)
|
| 126 |
+
if (stem + suffix) in _PREP_BLOCKLIST:
|
| 127 |
+
return m.group(0)
|
| 128 |
+
if suffix == 'ان':
|
| 129 |
+
return m.group(0)
|
| 130 |
+
return f'{prefix}{stem}ين'
|
| 131 |
+
|
| 132 |
+
def fix_lam_prep(text):
|
| 133 |
+
return re.sub(r'\b([وف]?ل)([أ-ي]{4,})(ون|ان)\b', _lam_prep_replace, text)
|
| 134 |
+
|
| 135 |
+
# ل-prefixed root nouns
|
| 136 |
+
lam_root = [
|
| 137 |
+
("لامتحان", "لامتحان"),
|
| 138 |
+
("لإنسان", "لإنسان"),
|
| 139 |
+
]
|
| 140 |
+
for input_text, expected in lam_root:
|
| 141 |
+
result = fix_lam_prep(input_text)
|
| 142 |
+
test(f"'{input_text}' → unchanged", result == expected, f"got '{result}'")
|
| 143 |
+
|
| 144 |
+
# ل-prefixed plurals — SHOULD be corrected
|
| 145 |
+
test("'لمهندسون'→'لمهندسين'",
|
| 146 |
+
fix_lam_prep("لمهندسون") == "لمهندسين",
|
| 147 |
+
f"got '{fix_lam_prep('لمهندسون')}'")
|
| 148 |
+
|
| 149 |
|
| 150 |
# ══════════════════════════════════════════════════════════════
|
| 151 |
# TEST 2: FIX-35 — Spelling doesn't strip conjugation suffixes
|