youssefreda9 committed on
Commit
2ff1e5e
·
1 Parent(s): 71021ea

FIX-48v2: ه→ة pass moved AFTER grammar model

Browse files

Key insight: the grammar model needs to see the ه form to make gender decisions.
- Old (FIX-48): ه→ة before grammar → 12 regressions
- New (FIX-48v2): ه→ة after grammar → the grammar model decides gender first,
then we convert any remaining word-final ه to ة (الحكومه→الحكومة, etc.)

Protected words: فيه, عليه, له, دراسته, etc.
Tests: 39 passing.

Files changed (1) hide show
  1. src/app.py +38 -0
src/app.py CHANGED
@@ -2490,6 +2490,44 @@ def analyze_text():
2490
  logger.error(traceback.format_exc())
2491
  timing_ms['grammar_error'] = f"{type(e).__name__}: {str(e)[:200]}"
2492
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2493
  # 3. Punctuation (runs on grammar-corrected text — PuncAra-v1 local model)
2494
  # FIX-07: Skip punctuation for religious text
2495
  if not _is_religious_text:
 
2490
  logger.error(traceback.format_exc())
2491
  timing_ms['grammar_error'] = f"{type(e).__name__}: {str(e)[:200]}"
2492
 
2493
+ # ── FIX-48v2: ه→ة pass AFTER grammar (not before!) ──
2494
+ # Must run AFTER grammar so grammar model can use ه for gender decisions.
2495
+ # Only fixes remaining ه words that grammar didn't change.
2496
+ if not _is_religious_text:
2497
+ try:
2498
+ from nlp.spelling.araspell_service import get_spelling_model
2499
+ _hata_checker = get_spelling_model()
2500
+ _hata_text = ctx.current_text
2501
+ _hata_words = _hata_text.split()
2502
+ _hata_changed = False
2503
+ _hata_result = []
2504
+ _PROTECTED_HA = {
2505
+ 'الله', 'لله', 'فيه', 'عليه', 'منه', 'به', 'له', 'إليه',
2506
+ 'وجه', 'نزه', 'سفه', 'فقه', 'نبه', 'شبه', 'مكره', 'تنبه',
2507
+ 'اتجه', 'توجه', 'تشابه', 'وفيه', 'وعليه', 'ومنه', 'وله',
2508
+ 'دراسته', 'دراستها', 'حياته', 'حياتها',
2509
+ }
2510
+ _CONSONANTS = set('بتثجحخدذرزسشصضطظعغفقكلمنهوي')
2511
+ for _hw in _hata_words:
2512
+ _hw_clean = _hw.rstrip('.،؛؟!?!')
2513
+ if (len(_hw_clean) >= 4 and _hw_clean.endswith('ه')
2514
+ and _hw_clean not in _PROTECTED_HA
2515
+ and _hw_clean[-2] in _CONSONANTS):
2516
+ _ta_cand = _hw_clean[:-1] + 'ة'
2517
+ if _hata_checker.vocab_manager.is_iv(_ta_cand):
2518
+ _punct_suffix = _hw[len(_hw_clean):]
2519
+ logger.info(f"[HA-TA] Post-grammar ه→ة: '{_hw}'→'{_ta_cand}{_punct_suffix}'")
2520
+ _hata_result.append(_ta_cand + _punct_suffix)
2521
+ _hata_changed = True
2522
+ continue
2523
+ _hata_result.append(_hw)
2524
+ if _hata_changed:
2525
+ _hata_new = ' '.join(_hata_result)
2526
+ ctx.mutate_text(_hata_new, OffsetMapper)
2527
+ current_text = ctx.current_text
2528
+ except Exception as e:
2529
+ logger.warning(f"[HA-TA] Failed: {type(e).__name__}: {e}")
2530
+
2531
  # 3. Punctuation (runs on grammar-corrected text — PuncAra-v1 local model)
2532
  # FIX-07: Skip punctuation for religious text
2533
  if not _is_religious_text: