Commit ·
2ff1e5e
1
Parent(s): 71021ea
FIX-48v2: ه→ة pass moved AFTER grammar model
Browse files
Key insight: the grammar model needs to see the ه form to make gender decisions.
- Old (FIX-48): ه→ة before grammar → 12 regressions
- New (FIX-48v2): ه→ة after grammar → grammar first decides gender,
then we fix any remaining ه→ة (الحكومه→الحكومة, etc.)
Protected words: فيه, عليه, له, دراسته, etc.
Tests: 39 passing.
- src/app.py +38 -0
src/app.py
CHANGED
|
@@ -2490,6 +2490,44 @@ def analyze_text():
|
|
| 2490 |
logger.error(traceback.format_exc())
|
| 2491 |
timing_ms['grammar_error'] = f"{type(e).__name__}: {str(e)[:200]}"
|
| 2492 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2493 |
# 3. Punctuation (runs on grammar-corrected text — PuncAra-v1 local model)
|
| 2494 |
# FIX-07: Skip punctuation for religious text
|
| 2495 |
if not _is_religious_text:
|
|
|
|
| 2490 |
logger.error(traceback.format_exc())
|
| 2491 |
timing_ms['grammar_error'] = f"{type(e).__name__}: {str(e)[:200]}"
|
| 2492 |
|
| 2493 |
+
# ── FIX-48v2: ه→ة pass AFTER grammar (not before!) ──
|
| 2494 |
+
# Must run AFTER grammar so grammar model can use ه for gender decisions.
|
| 2495 |
+
# Only fixes remaining ه words that grammar didn't change.
|
| 2496 |
+
if not _is_religious_text:
|
| 2497 |
+
try:
|
| 2498 |
+
from nlp.spelling.araspell_service import get_spelling_model
|
| 2499 |
+
_hata_checker = get_spelling_model()
|
| 2500 |
+
_hata_text = ctx.current_text
|
| 2501 |
+
_hata_words = _hata_text.split()
|
| 2502 |
+
_hata_changed = False
|
| 2503 |
+
_hata_result = []
|
| 2504 |
+
_PROTECTED_HA = {
|
| 2505 |
+
'الله', 'لله', 'فيه', 'عليه', 'منه', 'به', 'له', 'إليه',
|
| 2506 |
+
'وجه', 'نزه', 'سفه', 'فقه', 'نبه', 'شبه', 'مكره', 'تنبه',
|
| 2507 |
+
'اتجه', 'توجه', 'تشابه', 'وفيه', 'وعليه', 'ومنه', 'وله',
|
| 2508 |
+
'دراسته', 'دراستها', 'حياته', 'حياتها',
|
| 2509 |
+
}
|
| 2510 |
+
_CONSONANTS = set('بتثجحخدذرزسشصضطظعغفقكلمنهوي')
|
| 2511 |
+
for _hw in _hata_words:
|
| 2512 |
+
_hw_clean = _hw.rstrip('.،؛؟!?!')
|
| 2513 |
+
if (len(_hw_clean) >= 4 and _hw_clean.endswith('ه')
|
| 2514 |
+
and _hw_clean not in _PROTECTED_HA
|
| 2515 |
+
and _hw_clean[-2] in _CONSONANTS):
|
| 2516 |
+
_ta_cand = _hw_clean[:-1] + 'ة'
|
| 2517 |
+
if _hata_checker.vocab_manager.is_iv(_ta_cand):
|
| 2518 |
+
_punct_suffix = _hw[len(_hw_clean):]
|
| 2519 |
+
logger.info(f"[HA-TA] Post-grammar ه→ة: '{_hw}'→'{_ta_cand}{_punct_suffix}'")
|
| 2520 |
+
_hata_result.append(_ta_cand + _punct_suffix)
|
| 2521 |
+
_hata_changed = True
|
| 2522 |
+
continue
|
| 2523 |
+
_hata_result.append(_hw)
|
| 2524 |
+
if _hata_changed:
|
| 2525 |
+
_hata_new = ' '.join(_hata_result)
|
| 2526 |
+
ctx.mutate_text(_hata_new, OffsetMapper)
|
| 2527 |
+
current_text = ctx.current_text
|
| 2528 |
+
except Exception as e:
|
| 2529 |
+
logger.warning(f"[HA-TA] Failed: {type(e).__name__}: {e}")
|
| 2530 |
+
|
| 2531 |
# 3. Punctuation (runs on grammar-corrected text — PuncAra-v1 local model)
|
| 2532 |
# FIX-07: Skip punctuation for religious text
|
| 2533 |
if not _is_religious_text:
|