youssefreda9 committed on
Commit
71021ea
·
1 Parent(s): beaf00c

REVERT FIX-48: ه→ة pass caused 12 regressions (30% with the pass → 54% after revert)

Browse files

The ه→ة pass converted الصغيره→الصغيرة BEFORE the grammar model
could decide if gender should change (الصغيره→الصغير for masculine).
The grammar model needs to see the original ه form to make correct
gender agreement decisions.

FIX-49 (trailing و in IV→IV guard) is preserved — only FIX-48 reverted.
Tests: 39 passing.

Files changed (1) hide show
  1. src/app.py +5 -42
src/app.py CHANGED
@@ -2109,48 +2109,11 @@ def analyze_text():
2109
  # ── FIX-07: Religious text already detected above (before spelling) ──
2110
  # _is_religious_text was set earlier to skip ALL stages for sacred text
2111
 
2112
- # ── FIX-48: Dedicated ه→ة pass (runs on ALL words, not just OOV) ──
2113
- # Words like الحكومه and الشركه are IV in BERT vocab, so OOV cleanup
2114
- # skips them. This pass converts ه→ة when the ة form is also IV,
2115
- # preferring standard orthography.
2116
- if not _is_religious_text:
2117
- try:
2118
- from nlp.spelling.araspell_service import get_spelling_model
2119
- _hata_checker = get_spelling_model()
2120
- _hata_text = ctx.current_text
2121
- _hata_words = _hata_text.split()
2122
- _hata_changed = False
2123
- _hata_result = []
2124
- # Words that genuinely end in ه (not ة)
2125
- _PROTECTED_HA = {
2126
- 'الله', 'لله', 'فيه', 'عليه', 'منه', 'به', 'له', 'إليه',
2127
- 'وجه', 'نزه', 'سفه', 'فقه', 'نبه', 'شبه', 'مكره', 'تنبه',
2128
- 'اتجه', 'توجه', 'تشابه', 'وفيه', 'وعليه', 'ومنه', 'وله',
2129
- 'دراسته', 'دراستها', 'حياته', 'حياتها',
2130
- }
2131
- _CONSONANTS = set('بتثجحخدذرزسشصضطظعغفقكلمنهوي')
2132
- for _hw_idx, _hw in enumerate(_hata_words):
2133
- _hw_clean = _hw.rstrip('.،؛؟!?!')
2134
- if (len(_hw_clean) >= 4 and _hw_clean.endswith('ه')
2135
- and _hw_clean not in _PROTECTED_HA
2136
- and _hw_clean[-2] in _CONSONANTS):
2137
- _ta_cand = _hw_clean[:-1] + 'ة'
2138
- if _hata_checker.vocab_manager.is_iv(_ta_cand):
2139
- _punct_suffix = _hw[len(_hw_clean):]
2140
- logger.info(
2141
- f"[HA-TA] ه→ة fix: '{_hw}'→'{_ta_cand}{_punct_suffix}'"
2142
- )
2143
- _hata_result.append(_ta_cand + _punct_suffix)
2144
- _hata_changed = True
2145
- continue
2146
- _hata_result.append(_hw)
2147
- if _hata_changed:
2148
- _hata_new = ' '.join(_hata_result)
2149
- logger.info(f"[HA-TA] Applied: '{_hata_text[:60]}' → '{_hata_new[:60]}'")
2150
- ctx.mutate_text(_hata_new, OffsetMapper)
2151
- current_text = ctx.current_text
2152
- except Exception as e:
2153
- logger.warning(f"[HA-TA] Failed: {type(e).__name__}: {e}")
2154
 
2155
  # ── FIX-03: Structured content protection ──
2156
  # Protect URLs, emails, dates, code etc. from grammar model destruction
 
2109
  # ── FIX-07: Religious text already detected above (before spelling) ──
2110
  # _is_religious_text was set earlier to skip ALL stages for sacred text
2111
 
2112
+
2113
+ # ── FIX-48: DISABLED — caused 12 regressions ──
2114
+ # The ه→ة pass converted الصغيره→الصغيرة BEFORE the grammar model
2115
+ # could decide if gender should change (الصغيره→الصغير for masculine).
2116
+ # The grammar model needs to see the original ه form.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2117
 
2118
  # ── FIX-03: Structured content protection ──
2119
  # Protect URLs, emails, dates, code etc. from grammar model destruction