Commit ·
eae5d36
1
Parent(s): 17a3ac2
FIX-38/39 + Layer 1/2/3: Benchmark normalization + spelling safety + grammar rules
Browse files
Layer 1: Strip trailing punctuation + diacritics in benchmark comparison
Layer 2: Add fix_tanween_fathah and fix_initial_hamza grammar rules
Layer 3 (FIX-38): Expand pronoun suffix guard — block ه→ة when stem is IV
Layer 3 (FIX-39): Add edit distance hallucination guard — block corrections
where levenshtein > 40% of word length
Inspired by legacy AraSpell WordAligner and OutputValidator patterns.
Tests: 39 passing.
- src/app.py +23 -6
- src/nlp/grammar/grammar_rules.py +79 -0
- tests/phase10/run_collision_benchmark.py +4 -2
src/app.py
CHANGED
|
@@ -821,6 +821,18 @@ def _is_small_spelling_change(orig_word, corr_word, vocab_manager=None):
|
|
| 821 |
if orig_word == corr_word:
|
| 822 |
return 0.0
|
| 823 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 824 |
# ── GUARD 1: Numeral protection (Phase 1, BUG-011/012/E1) ──
|
| 825 |
# Reject corrections that remove/change/introduce digits.
|
| 826 |
# Numeral hallucination is a complete-replacement failure mode.
|
|
@@ -901,13 +913,18 @@ def _is_small_spelling_change(orig_word, corr_word, vocab_manager=None):
|
|
| 901 |
# E.g., فتأملته (fataamaltahu) → فتأملتة is WRONG.
|
| 902 |
if (orig_word.endswith('ه') and corr_word.endswith('ة')
|
| 903 |
and orig_word[:-1] == corr_word[:-1]):
|
| 904 |
-
#
|
| 905 |
-
#
|
| 906 |
-
#
|
| 907 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 908 |
logger.info(
|
| 909 |
-
f"[SPELLING] Blocked ه→ة
|
| 910 |
-
f"'{orig_word}'→'{corr_word}' (
|
| 911 |
)
|
| 912 |
return 0.0
|
| 913 |
return 0.9
|
|
|
|
| 821 |
if orig_word == corr_word:
|
| 822 |
return 0.0
|
| 823 |
|
| 824 |
+
# ── FIX-39: Edit distance hallucination guard (from legacy AraSpell OutputValidator) ──
|
| 825 |
+
# Block corrections where the edit distance is too high relative to word length.
|
| 826 |
+
# This catches model hallucinations like والممرضات→والرضا, شجعتهم→يجعلهم, طبخ→طبي.
|
| 827 |
+
_ed_dist = _levenshtein(orig_word, corr_word)
|
| 828 |
+
_max_len = max(len(orig_word), len(corr_word))
|
| 829 |
+
if _max_len >= 3 and _ed_dist > max(2, _max_len * 0.4):
|
| 830 |
+
logger.info(
|
| 831 |
+
f"[SPELLING] Blocked hallucination: '{orig_word}'→'{corr_word}' "
|
| 832 |
+
f"(edit_dist={_ed_dist}, max_allowed={max(2, int(_max_len * 0.4))})"
|
| 833 |
+
)
|
| 834 |
+
return 0.0
|
| 835 |
+
|
| 836 |
# ── GUARD 1: Numeral protection (Phase 1, BUG-011/012/E1) ──
|
| 837 |
# Reject corrections that remove/change/introduce digits.
|
| 838 |
# Numeral hallucination is a complete-replacement failure mode.
|
|
|
|
| 913 |
# E.g., فتأملته (fataamaltahu) → فتأملتة is WRONG.
|
| 914 |
if (orig_word.endswith('ه') and corr_word.endswith('ة')
|
| 915 |
and orig_word[:-1] == corr_word[:-1]):
|
| 916 |
+
# FIX-38: Expanded pronoun suffix guard.
|
| 917 |
+
# ه at end can be: (a) ta marbuta (should be ة) OR (b) pronoun "him/it".
|
| 918 |
+
# The old guard only blocked ته. But كله (كل+ه), احبه (احب+ه),
|
| 919 |
+
# عنده (عند+ه) are ALL pronoun suffixes — the ه is NOT ta marbuta.
|
| 920 |
+
# Strategy (from legacy AraSpell WordAligner): if the STEM (word without ه)
|
| 921 |
+
# is itself IV, then ه is likely a pronoun suffix → block the change.
|
| 922 |
+
# If the stem is NOT IV, ه is likely a misspelled ة → allow.
|
| 923 |
+
stem = orig_word[:-1]
|
| 924 |
+
if len(stem) >= 2 and vocab_manager.is_iv(stem):
|
| 925 |
logger.info(
|
| 926 |
+
f"[SPELLING] Blocked ه→ة (pronoun suffix): "
|
| 927 |
+
f"'{orig_word}'→'{corr_word}' (stem '{stem}' is IV → ه is pronoun)"
|
| 928 |
)
|
| 929 |
return 0.0
|
| 930 |
return 0.9
|
src/nlp/grammar/grammar_rules.py
CHANGED
|
@@ -620,6 +620,8 @@ class ArabicGrammarGuard:
|
|
| 620 |
('fix_prepositions_advanced', self.fix_prepositions_advanced),
|
| 621 |
('fix_subject_verb_agreement', self.fix_subject_verb_agreement),
|
| 622 |
('fix_conditional_sentences', self.fix_conditional_sentences),
|
|
|
|
|
|
|
| 623 |
('regex_rules_fallback', self.regex_rules_fallback),
|
| 624 |
]:
|
| 625 |
try:
|
|
@@ -630,3 +632,80 @@ class ArabicGrammarGuard:
|
|
| 630 |
text = re.sub(r'\s+', ' ', text).strip()
|
| 631 |
return text
|
| 632 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 620 |
('fix_prepositions_advanced', self.fix_prepositions_advanced),
|
| 621 |
('fix_subject_verb_agreement', self.fix_subject_verb_agreement),
|
| 622 |
('fix_conditional_sentences', self.fix_conditional_sentences),
|
| 623 |
+
('fix_tanween_fathah', self.fix_tanween_fathah),
|
| 624 |
+
('fix_initial_hamza', self.fix_initial_hamza),
|
| 625 |
('regex_rules_fallback', self.regex_rules_fallback),
|
| 626 |
]:
|
| 627 |
try:
|
|
|
|
| 632 |
text = re.sub(r'\s+', ' ', text).strip()
|
| 633 |
return text
|
| 634 |
|
| 635 |
+
def fix_tanween_fathah(self, text):
    """
    Add tanween fathah (ً) to a fixed list of common accusative adverbs.

    Words such as جدا, كثيرا, شكرا are frequently typed without the
    tanween mark. Every whitespace-separated token whose core matches a
    key of the lookup table below is replaced by its tanween form
    (جداً, كثيراً, شكراً, ...).

    This is a pure whitelist lookup: no general "ends in ا" heuristic is
    applied, so function words such as ما, هذا, إذا are never touched.
    Punctuation attached to either edge of a token (e.g. "جدا." or
    "شكرا،") is preserved around the corrected word.

    Args:
        text: Input text.

    Returns:
        The text with matching tokens replaced. Tokens are re-joined
        with single spaces, so runs of whitespace are collapsed.
    """
    # Whitelisted word -> same word carrying tanween fathah.
    _ALWAYS_TANWEEN = {
        'جدا': 'جداً',
        'كثيرا': 'كثيراً',
        'شكرا': 'شكراً',
        'نظرا': 'نظراً',
        'قليلا': 'قليلاً',
        'أيضا': 'أيضاً',
        'فورا': 'فوراً',
        'سابقا': 'سابقاً',
        'لاحقا': 'لاحقاً',
        'حاليا': 'حالياً',
        'تقريبا': 'تقريباً',
        'خصوصا': 'خصوصاً',
        'عموما': 'عموماً',
        'دائما': 'دائماً',
        'مباشرا': 'مباشراً',
        'أبدا': 'أبداً',
        'غالبا': 'غالباً',
        'أحيانا': 'أحياناً',
        'مثلا': 'مثلاً',
    }
    # Punctuation that may be glued to a token without being part of the word.
    _EDGE_PUNCT = '.,:;!?،؛؟"\'()[]{}«»…'

    words = text.split()
    for i, w in enumerate(words):
        if w in _ALWAYS_TANWEEN:
            words[i] = _ALWAYS_TANWEEN[w]
            continue
        # Retry on the bare word so "جدا." / "(جدا)" are corrected as well.
        core = w.strip(_EDGE_PUNCT)
        if core in _ALWAYS_TANWEEN:
            start = len(w) - len(w.lstrip(_EDGE_PUNCT))
            words[i] = w[:start] + _ALWAYS_TANWEEN[core] + w[start + len(core):]
    return ' '.join(words)
|
| 682 |
+
|
| 683 |
+
def fix_initial_hamza(self, text):
    """
    Restore the hamza on an initial alef for a fixed list of common words.

    Form IV (أفعل) verbs and a few nouns are often typed with a bare ا:
      - اعلن → أعلن
      - اصدر → أصدر
      - اسلم → أسلم

    Only the words listed in the lookup table below are corrected; no
    general pattern matching is attempted. Punctuation attached to either
    edge of a token (e.g. "اعلن،") is preserved around the corrected word.

    Args:
        text: Input text.

    Returns:
        The text with matching tokens replaced. Tokens are re-joined
        with single spaces, so runs of whitespace are collapsed.
    """
    # Bare-alef spelling -> spelling with hamza on the initial alef.
    _HAMZA_FIXES = {
        'اعلن': 'أعلن', 'اعلنت': 'أعلنت', 'اعلنوا': 'أعلنوا',
        'اصدر': 'أصدر', 'اصدرت': 'أصدرت', 'اصدروا': 'أصدروا',
        'اسلم': 'أسلم', 'اسلمت': 'أسلمت', 'اسلموا': 'أسلموا',
        'اكد': 'أكد', 'اكدت': 'أكدت', 'اكدوا': 'أكدوا',
        'اعطى': 'أعطى', 'اعطت': 'أعطت', 'اعطوا': 'أعطوا',
        'انجز': 'أنجز', 'انجزت': 'أنجزت', 'انجزوا': 'أنجزوا',
        'ارسل': 'أرسل', 'ارسلت': 'أرسلت', 'ارسلوا': 'أرسلوا',
        'اخرج': 'أخرج', 'اخرجت': 'أخرجت', 'اخرجوا': 'أخرجوا',
        'انشأ': 'أنشأ', 'انشأت': 'أنشأت', 'انشأوا': 'أنشأوا',
        'اضاف': 'أضاف', 'اضافت': 'أضافت', 'اضافوا': 'أضافوا',
        'الامهات': 'الأمهات', 'الاطفال': 'الأطفال',
        'الامة': 'الأمة', 'الاستاذ': 'الأستاذ',
    }
    # Punctuation that may be glued to a token without being part of the word.
    _EDGE_PUNCT = '.,:;!?،؛؟"\'()[]{}«»…'

    words = text.split()
    for i, w in enumerate(words):
        if w in _HAMZA_FIXES:
            words[i] = _HAMZA_FIXES[w]
            continue
        # Retry on the bare word so "اعلن،" / "(اكد)" are corrected as well.
        core = w.strip(_EDGE_PUNCT)
        if core in _HAMZA_FIXES:
            start = len(w) - len(w.lstrip(_EDGE_PUNCT))
            words[i] = w[:start] + _HAMZA_FIXES[core] + w[start + len(core):]
    return ' '.join(words)
|
tests/phase10/run_collision_benchmark.py
CHANGED
|
@@ -30,8 +30,10 @@ def _strip_diacritics(text):
|
|
| 30 |
|
| 31 |
|
| 32 |
def _normalize(text):
|
| 33 |
-
"""Normalize for comparison: strip diacritics + collapse whitespace."""
|
| 34 |
-
|
|
|
|
|
|
|
| 35 |
|
| 36 |
|
| 37 |
def run_collision_benchmark(api: API, samples: list) -> List[BenchResult]:
|
|
|
|
| 30 |
|
| 31 |
|
| 32 |
def _normalize(text):
    """Normalize for comparison: strip diacritics + trailing punct + collapse whitespace.

    Whitespace is collapsed and trimmed BEFORE terminal punctuation is
    removed; otherwise a trailing space or newline (e.g. "نص. ") shields
    the punctuation from rstrip and the two sides compare unequal.
    """
    text = _strip_diacritics(text)
    text = re.sub(r'\s+', ' ', text).strip()
    # Terminal punct is not a correctness criterion. The space in the set
    # lets rstrip walk through spaced-out runs such as "نص . !".
    return text.rstrip('.،؛؟!? ')
|
| 37 |
|
| 38 |
|
| 39 |
def run_collision_benchmark(api: API, samples: list) -> List[BenchResult]:
|