Commit · 71021ea
1 Parent(s): beaf00c
REVERT FIX-48: ه→ة pass caused 12 regressions (30% → revert to 54%)
Browse files
The ه→ة pass converted الصغيره→الصغيرة BEFORE the grammar model
could decide if gender should change (الصغيره→الصغير for masculine).
The grammar model needs to see the original ه form to make correct
gender agreement decisions.
FIX-49 (trailing و in IV→IV guard) is preserved — only FIX-48 reverted.
Tests: 39 passing.
- src/app.py +5 -42
src/app.py
CHANGED
|
@@ -2109,48 +2109,11 @@ def analyze_text():
|
|
| 2109 |
# ── FIX-07: Religious text already detected above (before spelling) ──
|
| 2110 |
# _is_religious_text was set earlier to skip ALL stages for sacred text
|
| 2111 |
|
| 2112 |
-
|
| 2113 |
-
#
|
| 2114 |
-
#
|
| 2115 |
-
#
|
| 2116 |
-
|
| 2117 |
-
try:
|
| 2118 |
-
from nlp.spelling.araspell_service import get_spelling_model
|
| 2119 |
-
_hata_checker = get_spelling_model()
|
| 2120 |
-
_hata_text = ctx.current_text
|
| 2121 |
-
_hata_words = _hata_text.split()
|
| 2122 |
-
_hata_changed = False
|
| 2123 |
-
_hata_result = []
|
| 2124 |
-
# Words that genuinely end in ه (not ة)
|
| 2125 |
-
_PROTECTED_HA = {
|
| 2126 |
-
'الله', 'لله', 'فيه', 'عليه', 'منه', 'به', 'له', 'إليه',
|
| 2127 |
-
'وجه', 'نزه', 'سفه', 'فقه', 'نبه', 'شبه', 'مكره', 'تنبه',
|
| 2128 |
-
'اتجه', 'توجه', 'تشابه', 'وفيه', 'وعليه', 'ومنه', 'وله',
|
| 2129 |
-
'دراسته', 'دراستها', 'حياته', 'حياتها',
|
| 2130 |
-
}
|
| 2131 |
-
_CONSONANTS = set('بتثجحخدذرزسشصضطظعغفقكلمنهوي')
|
| 2132 |
-
for _hw_idx, _hw in enumerate(_hata_words):
|
| 2133 |
-
_hw_clean = _hw.rstrip('.،؛؟!?!')
|
| 2134 |
-
if (len(_hw_clean) >= 4 and _hw_clean.endswith('ه')
|
| 2135 |
-
and _hw_clean not in _PROTECTED_HA
|
| 2136 |
-
and _hw_clean[-2] in _CONSONANTS):
|
| 2137 |
-
_ta_cand = _hw_clean[:-1] + 'ة'
|
| 2138 |
-
if _hata_checker.vocab_manager.is_iv(_ta_cand):
|
| 2139 |
-
_punct_suffix = _hw[len(_hw_clean):]
|
| 2140 |
-
logger.info(
|
| 2141 |
-
f"[HA-TA] ه→ة fix: '{_hw}'→'{_ta_cand}{_punct_suffix}'"
|
| 2142 |
-
)
|
| 2143 |
-
_hata_result.append(_ta_cand + _punct_suffix)
|
| 2144 |
-
_hata_changed = True
|
| 2145 |
-
continue
|
| 2146 |
-
_hata_result.append(_hw)
|
| 2147 |
-
if _hata_changed:
|
| 2148 |
-
_hata_new = ' '.join(_hata_result)
|
| 2149 |
-
logger.info(f"[HA-TA] Applied: '{_hata_text[:60]}' → '{_hata_new[:60]}'")
|
| 2150 |
-
ctx.mutate_text(_hata_new, OffsetMapper)
|
| 2151 |
-
current_text = ctx.current_text
|
| 2152 |
-
except Exception as e:
|
| 2153 |
-
logger.warning(f"[HA-TA] Failed: {type(e).__name__}: {e}")
|
| 2154 |
|
| 2155 |
# ── FIX-03: Structured content protection ──
|
| 2156 |
# Protect URLs, emails, dates, code etc. from grammar model destruction
|
|
|
|
| 2109 |
# ── FIX-07: Religious text already detected above (before spelling) ──
|
| 2110 |
# _is_religious_text was set earlier to skip ALL stages for sacred text
|
| 2111 |
|
| 2112 |
+
|
| 2113 |
+
# ── FIX-48: DISABLED — Caused 12 regressions ──
|
| 2114 |
+
# The ه→ة pass converted الصغيره→الصغيرة BEFORE the grammar model
|
| 2115 |
+
# could decide if gender should change (الصغيره→الصغير for masculine).
|
| 2116 |
+
# The grammar model needs to see the original ه form.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2117 |
|
| 2118 |
# ── FIX-03: Structured content protection ──
|
| 2119 |
# Protect URLs, emails, dates, code etc. from grammar model destruction
|