youssefreda9 committed on
Commit
70184c4
·
1 Parent(s): 16ae935

UI/UX: Add Quran standalone tab, summarize textarea, floating selection bar, fix clear editor, remove duplicate button

Browse files
analyze_failures.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Analyze the remaining pipeline false negatives after the Layer 1/2/3 fixes.

Reads the collision benchmark report and, for every result whose
``pipeline_verdict`` is ``'FN'``, prints the input, the expected text, the
actual pipeline output and a word-by-word list of mismatches.
"""
import json
import re

RESULTS_PATH = 'tests/phase10/reports/collision_benchmark_results.json'

# Trailing punctuation that is ignored when two words are compared.
TRAILING_PUNCT = '.،؛؟!?!'

# Code points U+064B..U+065F plus U+0670; words that differ only in these
# characters are treated as equal.
DIACRITICS_RE = re.compile(r'[\u064B-\u065F\u0670]')

# Substrings whose presence in an output word triggers the TRAILING JUNK flag.
JUNK_MARKERS = ('وومن', '.و', 'ةل')


def norm(t):
    """Return *t* without diacritics or trailing punctuation, spaces collapsed."""
    t = DIACRITICS_RE.sub('', t)
    t = t.rstrip(TRAILING_PUNCT)
    return re.sub(r'\s+', ' ', t).strip()


def classify_cause(iw, aw, ew):
    """Label one mismatching word.

    Args:
        iw: input word, trailing punctuation already removed.
        aw: word produced by the pipeline, trailing punctuation removed.
        ew: expected word, trailing punctuation removed.

    Returns:
        ``"MODEL_MISS"`` when the output equals the input, ``"CORRUPTED"``
        when the input already equalled the expected word, otherwise
        ``"WRONG_FIX"``.
    """
    if iw == aw:
        return "MODEL_MISS"
    if iw == ew:
        return "CORRUPTED"
    return "WRONG_FIX"


def word_issues(inp_w, exp_w, act_w):
    """Compare expected and actual word lists position by position.

    Args:
        inp_w: words of the original input.
        exp_w: words of the expected text.
        act_w: words of the pipeline output.

    Returns:
        A list of printable issue lines: one per mismatching position within
        the shorter of ``exp_w``/``act_w``, plus a final word-count line when
        the two lengths differ.
    """
    issues = []
    for i, (ew_raw, aw_raw) in enumerate(zip(exp_w, act_w)):
        aw = aw_raw.rstrip(TRAILING_PUNCT)
        ew = ew_raw.rstrip(TRAILING_PUNCT)
        # Strip the input word the same way as aw/ew; comparing a raw input
        # word against stripped words mislabels unchanged words that carry
        # trailing punctuation as WRONG_FIX.
        iw = inp_w[i].rstrip(TRAILING_PUNCT) if i < len(inp_w) else '—'

        # Same diacritic class as norm(), so both agree on what counts as a
        # diacritic-only difference.
        if DIACRITICS_RE.sub('', aw) == DIACRITICS_RE.sub('', ew):
            continue  # tanween/diacritic only diff

        cause = classify_cause(iw, aw, ew)
        issues.append(f" [{i}] '{iw}'→'{aw}' (exp:'{ew}') {cause}")

    if len(exp_w) != len(act_w):
        issues.append(f" word count: {len(act_w)} vs {len(exp_w)}")
    return issues


def main():
    """Print a per-failure report followed by the fixability summary."""
    with open(RESULTS_PATH, 'r', encoding='utf-8') as f:
        data = json.load(f)

    for r in data['results']:
        if r['pipeline_verdict'] != 'FN':
            continue
        rid = r['id']
        exp = r['expected'].strip()
        act = r['pipeline_output'].strip()
        inp = r['input'].strip()

        act_w = act.split()
        issues = word_issues(inp.split(), exp.split(), act_w)

        # Classify
        has_junk = any(marker in a for a in act_w for marker in JUNK_MARKERS)

        cat = r['category']
        print(f"\n{rid} [{cat}]")
        print(f" IN: {inp[:60]}")
        print(f" EXP: {exp[:60]}")
        print(f" ACT: {act[:60]}")
        for iss in issues:
            print(iss)
        if has_junk:
            print(" >>> TRAILING JUNK")

    # Summary of what each failure needs
    # NOTE(review): the figures below are hard-coded, not derived from the
    # loaded report — update them by hand when the benchmark results change.
    print("\n" + "="*60)
    print("FIXABILITY ANALYSIS")
    print("="*60)
    print(f"\nTotal failures: 24")
    print(f"Need: 17 more passes to reach 85% (43/50)")


if __name__ == "__main__":
    main()
src/css/components.css CHANGED
@@ -3433,3 +3433,94 @@ select:focus-visible,
3433
  outline-offset: 2px;
3434
  }
3435
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3433
  outline-offset: 2px;
3434
  }
3435
 
/* ── Floating Selection Toolbar ──
   Toolbar shown above a text selection. `top`/`left` are set inline by
   script; translateX(-50%) centres the bar on the `left` coordinate. */
.selection-toolbar {
  position: absolute;
  z-index: 1100;
  display: flex;
  align-items: center;
  gap: 2px;
  padding: 4px 6px;
  border-radius: 12px;
  background: var(--color-surface-elevated);
  border: 1px solid var(--color-border-strong);
  box-shadow: 0 8px 32px rgba(0,0,0,0.18), 0 0 0 1px rgba(255,255,255,0.05);
  /* Prefixed form first: Safari releases before the unprefixed property
     was supported only honour -webkit-backdrop-filter. */
  -webkit-backdrop-filter: blur(16px);
  backdrop-filter: blur(16px);
  transform: translateX(-50%);
  animation: selbar-in 0.2s ease;
  pointer-events: auto;
}
.selection-toolbar.is-hidden {
  display: none;
}
/* Fade/slide-in; keeps the horizontal centring transform in both frames. */
@keyframes selbar-in {
  from { opacity: 0; transform: translateX(-50%) translateY(6px); }
  to { opacity: 1; transform: translateX(-50%) translateY(0); }
}
.sel-tool-btn {
  display: inline-flex;
  align-items: center;
  gap: 5px;
  padding: 6px 12px;
  border: none;
  border-radius: 8px;
  background: transparent;
  color: var(--color-text-secondary);
  font-family: inherit;
  font-size: 12px;
  font-weight: 600;
  cursor: pointer;
  transition: all 0.15s ease;
  white-space: nowrap;
}
.sel-tool-btn:hover {
  background: var(--color-surface);
  color: var(--color-text-primary);
}
/* Thin vertical divider between toolbar buttons. */
.sel-tool-sep {
  width: 1px;
  height: 20px;
  background: var(--color-border);
  flex-shrink: 0;
}

/* ── Summary Source Toggle ──
   Two-button segmented control; the selected button carries `.active`. */
.summary-source-toggle {
  display: flex;
  gap: 0;
  border-radius: 10px;
  background: var(--color-surface-elevated);
  border: 1px solid var(--color-border);
  padding: 3px;
  overflow: hidden;
}
.summary-source-btn {
  flex: 1;
  padding: 8px 16px;
  border: none;
  border-radius: 8px;
  background: transparent;
  color: var(--color-text-secondary);
  font-family: inherit;
  font-size: 13px;
  font-weight: 600;
  cursor: pointer;
  transition: all 0.2s ease;
}
.summary-source-btn:hover {
  color: var(--color-text-primary);
}
.summary-source-btn.active {
  background: linear-gradient(135deg, var(--color-primary), var(--color-secondary));
  color: var(--color-text-inverse);
  box-shadow: 0 2px 8px rgba(107, 163, 224, 0.25);
}

/* ── Editor Tab Icons ──
   Icons are dimmed on inactive tabs and fully opaque on the active one. */
.editor-tab svg {
  margin-left: 4px;
  opacity: 0.7;
}
.editor-tab.active svg {
  opacity: 1;
}
src/index.html CHANGED
@@ -668,6 +668,7 @@
668
  <button id="write-tab" onclick="switchTab('write')" class="editor-tab active" type="button">كتابة</button>
669
  <button id="summarize-tab" onclick="switchTab('summarize')" class="editor-tab" type="button">تلخيص</button>
670
  <button id="dialect-tab" onclick="switchTab('dialect')" class="editor-tab" type="button">تحويل للفصحى</button>
 
671
  <button id="docs-sidebar-toggle" class="docs-sidebar-toggle-mobile btn-ghost lg:hidden" type="button" aria-label="مستنداتي" aria-expanded="false" aria-controls="docs-sidebar">
672
  <svg width="18" height="18" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M3 7v10a2 2 0 002 2h14a2 2 0 002-2V9a2 2 0 00-2-2h-6l-2-2H5a2 2 0 00-2 2z"/></svg>
673
  </button>
@@ -679,7 +680,7 @@
679
  </div>
680
  <span id="auto-save-status" class="text-xs text-secondary" style="opacity:0;transition:opacity 0.3s;"></span>
681
  <div class="window-dots" aria-hidden="true">
682
- <span class="dot dot--red" title="مسح المحرر" onclick="showConfirmDialog('مسح المحرر','هل تريد مسح جميع محتويات المحرر؟ لا يمكن التراجع عن هذا.',function(){document.getElementById('editor-surface').innerHTML='';if(typeof updatePlaceholder==='function')updatePlaceholder();if(typeof updateEditorStats==='function')updateEditorStats()})" style="cursor:pointer;"></span>
683
  <span class="dot dot--yellow" title="طي لوحة الاقتراحات" onclick="document.querySelector('.sidebar-desktop')?.classList.toggle('collapsed')" style="cursor:pointer;"></span>
684
  <span class="dot dot--green" title="توسيع المحرر للعرض الكامل" onclick="document.querySelector('.editor-layout')?.classList.toggle('editor-fullscreen')" style="cursor:pointer;"></span>
685
  </div>
@@ -804,8 +805,25 @@
804
  <p class="popover-hint">اختر التصحيح المناسب · Escape للإغلاق</p>
805
  </div>
806
  </div>
807
- <div id="summarize-area" class="summarize-panel is-hidden">
808
- <!-- Item 11: Mode Toggle -->
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
809
  <div class="summary-mode-toggle">
810
  <button type="button" class="summary-mode-btn active" id="summary-mode-paragraph" onclick="setSummaryMode('paragraph')">
811
  <svg width="14" height="14" fill="currentColor" viewBox="0 0 24 24"><path d="M3 5h18v2H3V5zm0 8h18v2H3v-2zm0 4h12v2H3v-2z"/></svg>
@@ -875,7 +893,55 @@
875
  </div>
876
  <div id="dialect-result" class="text-right text-lg editor-content" dir="rtl" style="line-height: 2;"></div>
877
  </div>
878
- </div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
879
  <div class="editor-footer">
880
  <div class="editor-stats" role="status" aria-label="إحصائيات">
881
  <div class="flex items-center gap-2"><span class="stat-dot stat-dot--spelling" aria-hidden="true"></span><span class="text-sm text-secondary"><span id="spelling-count">٠</span> إملائي</span></div>
@@ -1213,18 +1279,201 @@
1213
 
1214
 
1215
  function switchTab(tab) {
1216
- const writeTab = document.getElementById('write-tab');
1217
- const summarizeTab = document.getElementById('summarize-tab');
1218
- const dialectTab = document.getElementById('dialect-tab');
1219
- const writeArea = document.getElementById('write-area');
1220
- const summarizeArea = document.getElementById('summarize-area');
1221
- const dialectArea = document.getElementById('dialect-area');
1222
  const formatToolbar = document.getElementById('format-toolbar');
1223
- [writeTab, summarizeTab, dialectTab].forEach(function(t){if(t)t.classList.remove('active');});
1224
- [writeArea, summarizeArea, dialectArea].forEach(function(a){if(a)a.classList.add('is-hidden');});
1225
- if (tab === 'write') { writeTab.classList.add('active'); writeArea.classList.remove('is-hidden'); if(formatToolbar)formatToolbar.style.display=''; }
1226
- else if (tab === 'summarize') { summarizeTab.classList.add('active'); summarizeArea.classList.remove('is-hidden'); if(formatToolbar)formatToolbar.style.display='none'; }
1227
- else if (tab === 'dialect') { dialectTab.classList.add('active'); dialectArea.classList.remove('is-hidden'); if(formatToolbar)formatToolbar.style.display='none'; }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1228
  }
1229
  let _dialectResult = '';
1230
  async function convertDialect() {
@@ -1533,11 +1782,17 @@
1533
  }
1534
 
1535
  async function generateSummary(event) {
1536
- const text = (typeof getEditorText === 'function' ? getEditorText() : '').trim();
 
 
 
 
 
 
1537
 
1538
  if (!text) {
1539
  const summaryText = document.getElementById('summary-text');
1540
- summaryText.innerHTML = '<p class="text-secondary text-center">الرجاء كتابة نص في المحرر أولاً</p>';
1541
  document.getElementById('summary-preview').classList.add('show');
1542
  return;
1543
  }
@@ -1946,5 +2201,22 @@
1946
  </div>
1947
  </div>
1948
  </div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1949
  </body>
1950
  </html>
 
668
  <button id="write-tab" onclick="switchTab('write')" class="editor-tab active" type="button">كتابة</button>
669
  <button id="summarize-tab" onclick="switchTab('summarize')" class="editor-tab" type="button">تلخيص</button>
670
  <button id="dialect-tab" onclick="switchTab('dialect')" class="editor-tab" type="button">تحويل للفصحى</button>
671
+ <button id="quran-tab" onclick="switchTab('quran')" class="editor-tab" type="button">القرآن</button>
672
  <button id="docs-sidebar-toggle" class="docs-sidebar-toggle-mobile btn-ghost lg:hidden" type="button" aria-label="مستنداتي" aria-expanded="false" aria-controls="docs-sidebar">
673
  <svg width="18" height="18" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M3 7v10a2 2 0 002 2h14a2 2 0 002-2V9a2 2 0 00-2-2h-6l-2-2H5a2 2 0 00-2 2z"/></svg>
674
  </button>
 
680
  </div>
681
  <span id="auto-save-status" class="text-xs text-secondary" style="opacity:0;transition:opacity 0.3s;"></span>
682
  <div class="window-dots" aria-hidden="true">
683
+ <span class="dot dot--red" title="مسح المحرر" onclick="if(typeof clearEditor==='function'){clearEditor();}" style="cursor:pointer;"></span>
684
  <span class="dot dot--yellow" title="طي لوحة الاقتراحات" onclick="document.querySelector('.sidebar-desktop')?.classList.toggle('collapsed')" style="cursor:pointer;"></span>
685
  <span class="dot dot--green" title="توسيع المحرر للعرض الكامل" onclick="document.querySelector('.editor-layout')?.classList.toggle('editor-fullscreen')" style="cursor:pointer;"></span>
686
  </div>
 
805
  <p class="popover-hint">اختر التصحيح المناسب · Escape للإغلاق</p>
806
  </div>
807
  </div>
808
+ <div id="summarize-area" class="summarize-panel is-hidden">
809
+ <!-- Source Toggle: Editor text vs Custom input -->
810
+ <div class="mb-4">
811
+ <div class="flex items-center gap-2 mb-3">
812
+ <svg width="18" height="18" fill="none" stroke="currentColor" viewBox="0 0 24 24" style="color: var(--color-primary);"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M9 12h6m-6 4h6m2 5H7a2 2 0 01-2-2V5a2 2 0 012-2h5.586a1 1 0 01.707.293l5.414 5.414a1 1 0 01.293.707V19a2 2 0 01-2 2z"/></svg>
813
+ <span class="text-base font-bold">تلخيص النصوص</span>
814
+ </div>
815
+ <div class="summary-source-toggle mb-3">
816
+ <button type="button" class="summary-source-btn active" id="summary-src-editor" onclick="setSummarySource('editor')">نص المحرر</button>
817
+ <button type="button" class="summary-source-btn" id="summary-src-custom" onclick="setSummarySource('custom')">نص مخصص</button>
818
+ </div>
819
+ <div id="summary-custom-input-wrap" class="is-hidden">
820
+ <textarea id="summary-custom-input" class="w-full p-4 rounded-xl text-right text-lg editor-content" dir="rtl" rows="6" placeholder="الصق أو اكتب النص الذي تريد تلخيصه هنا..." style="background: var(--color-surface); border: 1px solid var(--color-border); color: var(--color-text-primary); resize: vertical; font-family: inherit;"></textarea>
821
+ <div class="flex items-center justify-between mt-1 mb-2" dir="rtl">
822
+ <span id="summary-char-count" class="text-xs" style="color: var(--text-secondary);">٠ حرف</span>
823
+ </div>
824
+ </div>
825
+ </div>
826
+ <!-- Item 11: Mode Toggle -->
827
  <div class="summary-mode-toggle">
828
  <button type="button" class="summary-mode-btn active" id="summary-mode-paragraph" onclick="setSummaryMode('paragraph')">
829
  <svg width="14" height="14" fill="currentColor" viewBox="0 0 24 24"><path d="M3 5h18v2H3V5zm0 8h18v2H3v-2zm0 4h12v2H3v-2z"/></svg>
 
893
  </div>
894
  <div id="dialect-result" class="text-right text-lg editor-content" dir="rtl" style="line-height: 2;"></div>
895
  </div>
896
+ </div>
897
+ <!-- Quran Standalone Panel -->
898
+ <div id="quran-area" class="summarize-panel is-hidden">
899
+ <div class="mb-4">
900
+ <div class="flex items-center gap-2 mb-3">
901
+ <svg width="18" height="18" fill="none" stroke="#06b6d4" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M12 6.253v13m0-13C10.832 5.477 9.246 5 7.5 5S4.168 5.477 3 6.253v13C4.168 18.477 5.754 18 7.5 18s3.332.477 4.5 1.253m0-13C13.168 5.477 14.754 5 16.5 5c1.747 0 3.332.477 4.5 1.253v13C19.832 18.477 18.247 18 16.5 18c-1.746 0-3.332.477-4.5 1.253"/></svg>
902
+ <span class="text-base font-bold">تدقيق النص القرآني</span>
903
+ </div>
904
+ <p class="text-sm mb-4" style="color: var(--text-secondary);">اكتب أو الصق نصًا قرآنيًا وسنعرض لك النص الصحيح بالتشكيل مع اسم السورة ورقم الآية، مع إمكانية ترجمته إلى ١٤ لغة.</p>
905
+ </div>
906
+ <textarea id="quran-input" class="w-full p-4 rounded-xl text-right text-lg" dir="rtl" rows="4" placeholder="اكتب الآية أو جزءًا منها هنا..." style="background: var(--color-surface); border: 1px solid var(--color-border); color: var(--color-text-primary); resize: vertical; font-family: 'Amiri Quran', 'Cairo', serif; font-size: 20px; line-height: 2;"></textarea>
907
+ <button id="quran-search-btn" onclick="searchQuranStandalone()" class="btn-primary w-full py-4 text-lg mt-4 mb-4" type="button">بحث وتدقيق</button>
908
+ <div id="quran-inline-result" class="is-hidden" style="background: var(--color-surface); border: 1px solid rgba(6,182,212,0.2); border-radius: 1rem; padding: 1.5rem;">
909
+ <div class="flex items-center justify-between mb-3">
910
+ <div class="text-sm font-bold" style="color:#06b6d4;">✓ النص القرآني المدقق</div>
911
+ <div class="flex items-center gap-2">
912
+ <button onclick="copyQuranInlineResult()" class="quran-copy-btn" type="button" title="نسخ">📋</button>
913
+ <button id="quran-inline-apply-btn" onclick="applyQuranInlineResult()" class="quran-apply-btn" type="button">تطبيق في المحرر ✓</button>
914
+ </div>
915
+ </div>
916
+ <p id="quran-inline-uthmani" class="quran-uthmani" style="font-size: 24px; line-height: 2.2; text-align: center;"></p>
917
+ <p id="quran-inline-reference" class="quran-reference text-center mt-2"></p>
918
+ <div class="mt-4 pt-4" style="border-top:1px solid var(--color-border);">
919
+ <div class="flex items-center gap-3 mb-3 flex-wrap">
920
+ <svg width="16" height="16" fill="none" stroke="#06b6d4" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M3 5h12M9 3v2m1.048 9.5A18.022 18.022 0 016.412 9m6.088 9h7M11 21l5-10 5 10M12.751 5C11.783 10.77 8.07 15.61 3 18.129"/></svg>
921
+ <span class="text-sm font-bold">ترجمة الآية</span>
922
+ <select id="quran-inline-lang" onchange="translateQuranInline()" class="quran-lang-select">
923
+ <option value="">— اختر لغة —</option>
924
+ <option value="english">🇬🇧 English</option>
925
+ <option value="french">🇫🇷 Français</option>
926
+ <option value="turkish">🇹🇷 Türkçe</option>
927
+ <option value="persian">🇮🇷 فارسی</option>
928
+ <option value="russian">🇷🇺 Русский</option>
929
+ <option value="spanish">🇪🇸 Español</option>
930
+ <option value="german">🇩🇪 Deutsch</option>
931
+ <option value="indonesian">🇮🇩 Indonesia</option>
932
+ <option value="malay">🇲🇾 Melayu</option>
933
+ <option value="bengali">🇧🇩 বাংলা</option>
934
+ <option value="bosnian">🇧🇦 Bosanski</option>
935
+ <option value="portuguese">🇵🇹 Português</option>
936
+ <option value="uzbek">🇺🇿 O'zbek</option>
937
+ </select>
938
+ </div>
939
+ <div id="quran-inline-translation" class="is-hidden p-4 rounded-xl" style="background:rgba(6,182,212,0.06); border:1px solid rgba(6,182,212,0.15);">
940
+ <p id="quran-inline-trans-text" style="font-size:18px; line-height:2; color:var(--color-text-primary); text-align:center;"></p>
941
+ </div>
942
+ </div>
943
+ </div>
944
+ </div>
945
  <div class="editor-footer">
946
  <div class="editor-stats" role="status" aria-label="إحصائيات">
947
  <div class="flex items-center gap-2"><span class="stat-dot stat-dot--spelling" aria-hidden="true"></span><span class="text-sm text-secondary"><span id="spelling-count">٠</span> إملائي</span></div>
 
1279
 
1280
 
/**
 * Activate one editor tab and show its panel.
 *
 * Tab buttons use the id `<name>-tab` and panels use `<name>-area`. All
 * known tabs are deactivated and hidden first, then the requested one is
 * activated. The formatting toolbar stays visible only on the write tab.
 *
 * @param {string} tab - 'write', 'summarize', 'dialect' or 'quran'.
 */
function switchTab(tab) {
  var names = ['write', 'summarize', 'dialect', 'quran'];
  var toolbar = document.getElementById('format-toolbar');

  // Pass 1: reset every known tab button and panel.
  for (var i = 0; i < names.length; i++) {
    var button = document.getElementById(names[i] + '-tab');
    var panel = document.getElementById(names[i] + '-area');
    if (button) button.classList.remove('active');
    if (panel) panel.classList.add('is-hidden');
  }

  // Pass 2: turn on the requested tab, if its elements exist.
  var chosenButton = document.getElementById(tab + '-tab');
  var chosenPanel = document.getElementById(tab + '-area');
  if (chosenButton) chosenButton.classList.add('active');
  if (chosenPanel) chosenPanel.classList.remove('is-hidden');

  if (toolbar) {
    toolbar.style.display = tab === 'write' ? '' : 'none';
  }
}
1296
+
/* ═══════════════════════════════════════════
   Summarize — Source Toggle (editor vs custom)
   ═══════════════════════════════════════════ */

// Current summary source; read by generateSummary(). Starts as 'editor'.
window._summarySource = 'editor';

/**
 * Select where the summarizer takes its text from and sync the toggle UI.
 *
 * @param {string} src - 'editor' for the editor text, 'custom' for the
 *   pasted-text box.
 */
function setSummarySource(src) {
  window._summarySource = src;

  var useEditor = (src === 'editor');
  var useCustom = (src === 'custom');

  var editorButton = document.getElementById('summary-src-editor');
  var customButton = document.getElementById('summary-src-custom');
  var customBox = document.getElementById('summary-custom-input-wrap');

  if (editorButton) {
    editorButton.classList.toggle('active', useEditor);
  }
  if (customButton) {
    customButton.classList.toggle('active', useCustom);
  }
  // The custom text box is shown only while the custom source is selected.
  if (customBox) {
    customBox.classList.toggle('is-hidden', !useCustom);
  }
}
1310
+
/* ═══════════════════════════════════════════
   Floating Selection Toolbar
   ═══════════════════════════════════════════ */
(function() {
  var selBar = null;     // cached #selection-toolbar element, resolved lazily
  var hideTimer = null;  // debounce handle for selectionchange

  /** Return the toolbar element (cached after the first successful lookup). */
  function getBar() {
    if (!selBar) selBar = document.getElementById('selection-toolbar');
    return selBar;
  }

  /** True when the selection's anchor node lies inside #editor-container. */
  function isInsideEditor(sel) {
    var editor = document.getElementById('editor-container');
    return !!editor && editor.contains(sel.anchorNode);
  }

  /** Position the toolbar above the current selection and reveal it. */
  function showSelectionBar() {
    var sel = window.getSelection();
    if (!sel || sel.isCollapsed || sel.rangeCount === 0 || !sel.toString().trim()) {
      hideSelectionBar();
      return;
    }
    if (!isInsideEditor(sel)) { hideSelectionBar(); return; }
    var bar = getBar();
    if (!bar) return;
    var rect = sel.getRangeAt(0).getBoundingClientRect();
    // getBoundingClientRect() is viewport-relative, so BOTH axes need the
    // scroll offset added to become page coordinates. Leaving scrollX out
    // misplaces the bar whenever the page is scrolled horizontally.
    // NOTE(review): assumes the toolbar's containing block starts at the
    // page origin — confirm no positioned ancestor wraps it.
    bar.style.top = (rect.top + window.scrollY - 48) + 'px';
    bar.style.left = (rect.left + window.scrollX + rect.width / 2) + 'px';
    bar.classList.remove('is-hidden');
  }

  /** Hide the toolbar if it exists. */
  function hideSelectionBar() {
    var bar = getBar();
    if (bar) bar.classList.add('is-hidden');
  }

  // Debounced: react 300 ms after the selection stops changing, and only
  // for editor selections longer than two non-blank characters.
  document.addEventListener('selectionchange', function() {
    clearTimeout(hideTimer);
    hideTimer = setTimeout(function() {
      var sel = window.getSelection();
      if (sel && !sel.isCollapsed && sel.toString().trim().length > 2 && isInsideEditor(sel)) {
        showSelectionBar();
        return;
      }
      hideSelectionBar();
    }, 300);
  });

  // Pressing the mouse anywhere outside the toolbar dismisses it.
  document.addEventListener('mousedown', function(e) {
    var bar = getBar();
    if (bar && !bar.contains(e.target)) hideSelectionBar();
  });
})();
1350
+
/**
 * Send the currently selected text to one of the tool panels.
 *
 * Shows a warning toast and does nothing when no text is selected.
 * Otherwise hides the floating toolbar, switches to the tool's tab and
 * copies the selection into that tab's input field.
 *
 * @param {string} tool - 'summarize', 'dialect' or 'quran'.
 */
function selectionToolAction(tool) {
  var selection = window.getSelection();
  var picked = selection ? selection.toString().trim() : '';
  if (!picked) {
    if (typeof showToast === 'function') showToast('حدد نصًا أولاً', 'warning');
    return;
  }

  var toolbar = document.getElementById('selection-toolbar');
  if (toolbar) toolbar.classList.add('is-hidden');

  var field;
  switch (tool) {
    case 'summarize':
      switchTab('summarize');
      // The summarizer must read from the custom box, not the editor.
      setSummarySource('custom');
      field = document.getElementById('summary-custom-input');
      if (field) field.value = picked;
      break;
    case 'dialect':
      switchTab('dialect');
      field = document.getElementById('dialect-input');
      if (field) {
        field.value = picked;
        if (typeof updateDialectCharCount === 'function') updateDialectCharCount();
      }
      break;
    case 'quran':
      switchTab('quran');
      field = document.getElementById('quran-input');
      if (field) field.value = picked;
      break;
  }
}
1372
+
/* ═══════════════════════════════════════════
   Quran Standalone Panel Functions
   ═══════════════════════════════════════════ */
// State of the last successful lookup, shared by the copy / apply /
// translate handlers.
let _quranInlineVerse = '';   // verified verse text
let _quranInlineRef = '';     // reference extracted from the 【…】 marker
let _quranInlineQuery = '';   // user input that produced the result

/**
 * Look up the text typed in #quran-input via POST /api/quran and render the
 * verified verse and its reference in the inline result panel.
 *
 * The cached verse, reference and query are cleared before the request and
 * only stored again on success, so the copy/apply/translate actions can
 * never operate on an earlier result while the panel shows an error.
 */
async function searchQuranStandalone() {
  var input = document.getElementById('quran-input').value.trim();
  if (!input) { if (typeof showToast === 'function') showToast('الرجاء كتابة نص قرآني أولاً', 'warning'); return; }

  // Invalidate the previous result up front (see the function comment).
  _quranInlineVerse = '';
  _quranInlineRef = '';
  _quranInlineQuery = '';

  var esc = function(t) { return t.replace(/&/g,'&amp;').replace(/</g,'&lt;').replace(/>/g,'&gt;'); };
  var resultDiv = document.getElementById('quran-inline-result');
  var uthmaniEl = document.getElementById('quran-inline-uthmani');
  var refEl = document.getElementById('quran-inline-reference');
  var searchBtn = document.getElementById('quran-search-btn');

  // Loading state: placeholder text, translation hidden, language reset.
  uthmaniEl.innerHTML = '<span class="text-secondary">⏳ جاري البحث...</span>';
  refEl.textContent = '';
  resultDiv.classList.remove('is-hidden');
  document.getElementById('quran-inline-translation').classList.add('is-hidden');
  document.getElementById('quran-inline-lang').value = '';
  if (searchBtn) { searchBtn.disabled = true; searchBtn.textContent = '⏳ جاري البحث...'; }

  // Abort the request if it takes longer than 30 seconds.
  var _abortCtrl = new AbortController();
  var _timeout = setTimeout(function(){ _abortCtrl.abort(); }, 30000);
  try {
    var res = await fetch('/api/quran', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ text: input, language: 'تدقيق الايات' }),
      signal: _abortCtrl.signal
    });
    var data = await res.json();
    if (data.error) {
      // String() keeps a non-string error payload from throwing in replace().
      uthmaniEl.innerHTML = '<span class="text-secondary">' + esc(String(data.error)) + '</span>';
      return;
    }
    // matched_segment carries the verse followed by a trailing 【reference】
    // marker, optionally wrapped in parentheses.
    var seg = data.matched_segment || '';
    var refMatch = seg.match(/【([^】]+)】/);
    var verseText = seg.replace(/\s*【[^】]+】\s*$/, '').replace(/^\(/, '').replace(/\)$/, '');
    var reference = refMatch ? refMatch[1] : '';
    _quranInlineVerse = verseText;
    _quranInlineRef = reference;
    _quranInlineQuery = input;
    uthmaniEl.textContent = verseText;
    refEl.textContent = reference ? '[' + reference + ']' : '';
  } catch (err) {
    var msg = err.name === 'AbortError' ? 'انتهى وقت الانتظار — حاول مرة أخرى' : 'حدث خطأ أثناء البحث — تأكد من الاتصال';
    uthmaniEl.innerHTML = '<span class="text-secondary">' + msg + '</span>';
  } finally {
    clearTimeout(_timeout);
    if (searchBtn) { searchBtn.disabled = false; searchBtn.textContent = 'بحث وتدقيق'; }
  }
}
1424
+
/**
 * Translate the last searched verse into the language chosen in
 * #quran-inline-lang and show it in the inline translation box.
 *
 * Does nothing when no language is selected or no query is stored. A
 * response is discarded when the selected language or the stored query has
 * changed while the request was in flight, so a slow earlier request cannot
 * overwrite the translation the user is currently asking for.
 */
async function translateQuranInline() {
  var langSelect = document.getElementById('quran-inline-lang');
  var lang = langSelect.value;
  var query = _quranInlineQuery;
  if (!lang || !query) return;

  var esc = function(t) { return t.replace(/&/g,'&amp;').replace(/</g,'&lt;').replace(/>/g,'&gt;'); };
  // True once this request no longer matches what the panel is showing.
  var isStale = function() { return langSelect.value !== lang || _quranInlineQuery !== query; };

  var resultDiv = document.getElementById('quran-inline-translation');
  var textEl = document.getElementById('quran-inline-trans-text');
  textEl.innerHTML = '<span class="text-secondary">⏳ جاري الترجمة...</span>';
  resultDiv.classList.remove('is-hidden');

  // Abort the request if it takes longer than 30 seconds.
  var _abortCtrl = new AbortController();
  var _timeout = setTimeout(function(){ _abortCtrl.abort(); }, 30000);
  try {
    var res = await fetch('/api/quran', {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ text: query, language: lang }),
      signal: _abortCtrl.signal
    });
    var data = await res.json();
    if (isStale()) return;
    if (data.error) {
      // String() keeps a non-string error payload from throwing in replace().
      textEl.innerHTML = '<span class="text-secondary">' + esc(String(data.error)) + '</span>';
      return;
    }
    // Drop the trailing 【reference】 marker and any wrapping parentheses.
    var seg = data.matched_segment || '';
    var transText = seg.replace(/\s*【[^】]+】\s*$/, '').replace(/^\(/, '').replace(/\)$/, '');
    textEl.textContent = transText;
  } catch (err) {
    if (isStale()) return;
    var msg = err.name === 'AbortError' ? 'انتهى وقت الانتظار' : 'حدث خطأ في الترجمة';
    textEl.innerHTML = '<span class="text-secondary">' + msg + '</span>';
  } finally { clearTimeout(_timeout); }
}
1454
+
/**
 * Copy the verified verse, followed by its bracketed reference when there
 * is one, to the clipboard and confirm with a toast.
 *
 * Shows a warning toast instead of failing silently when the Clipboard API
 * is unavailable or the write is rejected.
 */
function copyQuranInlineResult() {
  var text = (_quranInlineVerse || '') + (_quranInlineRef ? ' [' + _quranInlineRef + ']' : '');
  if (!text.trim()) return;

  var notifyFailure = function() {
    if (typeof showToast === 'function') showToast('تعذر نسخ النص', 'warning');
  };

  // navigator.clipboard is undefined outside secure contexts.
  if (!navigator.clipboard || typeof navigator.clipboard.writeText !== 'function') {
    notifyFailure();
    return;
  }
  navigator.clipboard.writeText(text).then(function() {
    if (typeof showToast === 'function') showToast('✓ تم نسخ النص المدقق');
  }, notifyFailure);
}
1462
+
/**
 * Append the verified verse (and its reference) to the editor as a
 * non-editable span, then switch back to the write tab.
 *
 * Does nothing when there is no verified verse or the editor element is
 * missing. An undo state is pushed first when pushUndoState() exists.
 */
function applyQuranInlineResult() {
  if (!_quranInlineVerse) return;
  // NOTE(review): other code in this page addressed the editable area as
  // 'editor-surface' — confirm 'editor-container' is the intended target.
  var editor = document.getElementById('editor-container');
  if (!editor) return;
  if (typeof pushUndoState === 'function') pushUndoState();
  var esc = function(t) { return t.replace(/&/g,'&amp;').replace(/</g,'&lt;').replace(/>/g,'&gt;'); };
  var refHTML = _quranInlineRef ? ' <span class="quran-ref-inline">[' + esc(_quranInlineRef) + ']</span>' : '';
  // Separate from existing content with a line break; none for an empty editor.
  var separator = editor.innerHTML ? '<br>' : '';
  // Append instead of reassigning innerHTML: the existing editor nodes are
  // kept as they are rather than being serialized, re-parsed and replaced.
  editor.insertAdjacentHTML('beforeend', separator +
    '<span class="quran-applied" contenteditable="false" data-quran="true">' +
    esc(_quranInlineVerse) + refHTML + '</span>');
  // Notify listeners on the editor that its content changed.
  editor.dispatchEvent(new Event('input', { bubbles: true }));
  switchTab('write');
  if (typeof showToast === 'function') showToast('✓ تم إضافة النص القرآني في المحرر');
}
1478
  let _dialectResult = '';
1479
  async function convertDialect() {
 
1782
  }
1783
 
1784
  async function generateSummary(event) {
1785
+ let text = '';
1786
+ if (window._summarySource === 'custom') {
1787
+ var customInput = document.getElementById('summary-custom-input');
1788
+ text = customInput ? customInput.value.trim() : '';
1789
+ } else {
1790
+ text = (typeof getEditorText === 'function' ? getEditorText() : '').trim();
1791
+ }
1792
 
1793
  if (!text) {
1794
  const summaryText = document.getElementById('summary-text');
1795
+ summaryText.innerHTML = '<p class="text-secondary text-center">' + (window._summarySource === 'custom' ? 'الرجاء كتابة نص في مربع الإدخال أولاً' : 'الرجاء كتابة نص في المحرر أولاً') + '</p>';
1796
  document.getElementById('summary-preview').classList.add('show');
1797
  return;
1798
  }
 
2201
  </div>
2202
  </div>
2203
  </div>
2204
+ <!-- Floating Selection Toolbar -->
2205
+ <div id="selection-toolbar" class="selection-toolbar is-hidden" role="toolbar" aria-label="أدوات النص المحدد">
2206
+ <button type="button" class="sel-tool-btn" onclick="selectionToolAction('summarize')" title="تلخيص النص المحدد">
2207
+ <svg width="14" height="14" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M9 12h6m-6 4h6m2 5H7a2 2 0 01-2-2V5a2 2 0 012-2h5.586a1 1 0 01.707.293l5.414 5.414a1 1 0 01.293.707V19a2 2 0 01-2 2z"/></svg>
2208
+ تلخيص
2209
+ </button>
2210
+ <span class="sel-tool-sep"></span>
2211
+ <button type="button" class="sel-tool-btn" onclick="selectionToolAction('dialect')" title="تحويل النص المحدد للفصحى">
2212
+ <svg width="14" height="14" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M3 5h12M9 3v2m1.048 9.5A18.022 18.022 0 016.412 9m6.088 9h7M11 21l5-10 5 10M12.751 5C11.783 10.77 8.07 15.61 3 18.129"/></svg>
2213
+ فصحى
2214
+ </button>
2215
+ <span class="sel-tool-sep"></span>
2216
+ <button type="button" class="sel-tool-btn" onclick="selectionToolAction('quran')" title="تدقيق النص القرآني المحدد">
2217
+ <svg width="14" height="14" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M12 6.253v13m0-13C10.832 5.477 9.246 5 7.5 5S4.168 5.477 3 6.253v13C4.168 18.477 5.754 18 7.5 18s3.332.477 4.5 1.253m0-13C13.168 5.477 14.754 5 16.5 5c1.747 0 3.332.477 4.5 1.253v13C19.832 18.477 18.247 18 16.5 18c-1.746 0-3.332.477-4.5 1.253"/></svg>
2218
+ قرآن
2219
+ </button>
2220
+ </div>
2221
  </body>
2222
  </html>
src/js/editor.js CHANGED
@@ -409,8 +409,7 @@ function showTooltip(element) {
409
  const btnClass = isMain ? 'popover-alt-btn popover-alt-main' : 'popover-alt-btn';
410
  html += `<button class="${btnClass}" data-alt-correction="${escapeHtml(alt)}" type="button">${isMain ? '✓ ' : ''}${escapeHtml(alt)}</button>`;
411
  });
412
- // Render keep button at end
413
- html += `<button class="popover-alt-btn popover-alt-keep" data-alt-correction="${escapeHtml(suggestion.original)}" type="button">إبقاء كما هي</button>`;
414
  alternativesEl.innerHTML = html;
415
 
416
  // Bind click events for alternatives
 
409
  const btnClass = isMain ? 'popover-alt-btn popover-alt-main' : 'popover-alt-btn';
410
  html += `<button class="${btnClass}" data-alt-correction="${escapeHtml(alt)}" type="button">${isMain ? '✓ ' : ''}${escapeHtml(alt)}</button>`;
411
  });
412
+ // No separate "keep" button — the "تجاهل" popover button handles dismissal
 
413
  alternativesEl.innerHTML = html;
414
 
415
  // Bind click events for alternatives
src/nlp/punctuation/spelling/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # AraSpell — Arabic Spelling Correction
src/nlp/punctuation/spelling/araspell_rules.py ADDED
@@ -0,0 +1,1615 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # AraSpell — Arabic Spell Checker Pipeline (Rules & Classes)
2
+ # Extracted from AraSpell.py — NO global model loading, NO Gradio dependencies.
3
+ # All classes are imported by araspell_service.py.
4
+
5
+ import re
6
+ import math
7
+ import logging
8
+ import torch
9
+ from collections import Counter
10
+ from enum import Enum
11
+ from typing import List, Tuple, Optional
12
+
13
+ import Levenshtein
14
+ import jellyfish
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+ # ─────────────────────────────────────────────────────────────────────────────
19
+ # ERROR TYPE ENUM
20
+ # ─────────────────────────────────────────────────────────────────────────────
21
+
22
class ErrorType(Enum):
    """Labels for the kind of spelling error detected in a piece of text.

    Every member's value is the lower-case form of its name.
    """

    # Runs of the same character.
    CHAR_REPETITION = "char_repetition"
    # Words glued together without a space.
    WORD_MERGE = "word_merge"
    # Characters taken from a non-Arabic keyboard layout.
    CHAR_SUBSTITUTION = "char_substitution"
    # More than one of the above at once.
    MIXED = "mixed"
    # None of the above.
    CLEAN = "clean"
29
+
30
+ # ═══════════════════════════════════════════════════════════════════════════════
31
+ # KEYBOARD PROXIMITY (Phase 12 — from original AraSpell.py L475-520)
32
+ # ═══════════════════════════════════════════════════════════════════════════════
33
+
34
class RulesBasedCorrector:
    """Arabic keyboard-proximity lookup.

    NOTE(review): this module defines ``class RulesBasedCorrector`` a second
    time further down; that later definition rebinds the name and carries the
    same adjacency table, so this one is effectively shadowed at import time.
    """

    # For each key on the Arabic layout, the keys treated as adjacent to it.
    KEYBOARD_NEIGHBORS = {
        'ض': ['ص', 'ق'], 'ص': ['ض', 'ث', 'ق'], 'ث': ['ص', 'ق'],
        'ق': ['ض', 'ص', 'ث', 'ف', 'غ'],
        'ف': ['ق', 'غ', 'ع', 'ب'],
        'غ': ['ق', 'ف', 'ع', 'ه'],
        'ع': ['ف', 'غ', 'ه', 'خ'],
        'ه': ['غ', 'ع', 'خ', 'ح'],
        'خ': ['ع', 'ه', 'ح', 'ج'],
        'ح': ['ه', 'خ', 'ج'],
        'ج': ['خ', 'ح', 'د'],
        'د': ['ج', 'ذ'], 'ذ': ['د'],
        'ش': ['س', 'ي', 'ئ'],
        'س': ['ش', 'ي', 'ب'],
        'ي': ['ش', 'س', 'ب', 'ت'],
        'ب': ['ي', 'س', 'ف', 'ل', 'ن'],
        'ل': ['ب', 'ا', 'ن', 'م'],
        'ا': ['ل', 'ت', 'م'],
        'ت': ['ي', 'ا', 'ن'],
        'ن': ['ب', 'ل', 'ت', 'م', 'ك'],
        'م': ['ل', 'ا', 'ن', 'ك'],
        'ك': ['ن', 'م', 'ط'],
        'ط': ['ك', 'ظ'], 'ظ': ['ط'],
        'ئ': ['ش', 'ء', 'ر'],
        'ء': ['ئ', 'ؤ'],
        'ؤ': ['ء', 'ر'],
        'ر': ['ئ', 'ؤ', 'لا', 'ى', 'ز'],
        'لا': ['ر', 'ى'],
        'ى': ['ر', 'لا', 'ة', 'ز'],
        'ة': ['ى', 'و', 'ز'],
        'و': ['ة', 'ز'],
        'ز': ['ر', 'ى', 'ة', 'و'],
        'أ': ['ا', 'إ', 'آ'],
        'إ': ['ا', 'أ'],
        'آ': ['ا', 'أ'],
    }

    @staticmethod
    def is_keyboard_neighbor(char1: str, char2: str) -> bool:
        """Return True when ``char2`` is listed as a neighbour of ``char1``.

        A ``char1`` that has no entry in the table has no neighbours.
        """
        try:
            adjacent = RulesBasedCorrector.KEYBOARD_NEIGHBORS[char1]
        except KeyError:
            return False
        return char2 in adjacent
83
+
84
+ # ═══════════════════════════════════════════════════════════════════════════════
85
+ # POST PROCESSOR
86
+ # ═══════════════════════════════════════════════════════════════════════════════
87
+
88
class AraSpellPostProcessor:
    """Rule-based clean-up steps for Arabic text.

    Every method is static and works on plain strings.
    ``full_postprocess`` chains the individual steps in a fixed order.
    """

    ARABIC_HARAKAT = 'ًٌٍَُِّْ'
    TATWEEL = 'ـ'
    # Ligature code points mapped to their spelled-out letters.
    NORMALIZER_MAP = {
        'ﻹ': 'لإ', 'ﻷ': 'لأ', 'ﻵ': 'لآ', 'ﻻ': 'لا', 'ﷲ': 'الله'
    }
    ARABIC_CONSONANTS = set('بتثجحخدذرزسشصضطظعغفقكلمن')

    # --- Basic Normalization ---

    @staticmethod
    def remove_harakat(text: str) -> str:
        """Remove Arabic diacritics in the range U+064B..U+0652."""
        return re.sub(r'[\u064B-\u0652]', '', text)

    @staticmethod
    def remove_tatweel(text: str) -> str:
        """Remove the Arabic kashida/tatweel character."""
        return text.replace(AraSpellPostProcessor.TATWEEL, '')

    @staticmethod
    def normalize_special_chars(text: str) -> str:
        """Replace each ligature in ``NORMALIZER_MAP`` with its plain spelling."""
        for old, new in AraSpellPostProcessor.NORMALIZER_MAP.items():
            text = text.replace(old, new)
        return text

    # --- Core Functions ---

    @staticmethod
    def unified_collapse_repeated(text: str) -> str:
        """Collapse repeated characters.

        Arabic-block characters: 3+ consecutive -> 1.
        Latin letters: 2+ consecutive -> 1.
        """
        text = re.sub(r"([\u0600-\u06FF])\1{2,}", r"\1", text)
        text = re.sub(r"([a-zA-Z])\1+", r"\1", text)
        return text

    @staticmethod
    def remove_duplicate_words(text: str) -> str:
        """Remove consecutive duplicate words, e.g. كتاب كتاب → كتاب.

        Text with fewer than two words is returned untouched.
        """
        words = text.split()
        if len(words) < 2:
            return text
        result = [words[0]]
        for previous, current in zip(words, words[1:]):
            if current != previous:
                result.append(current)
        return ' '.join(result)

    @staticmethod
    def normalize_spaces(text: str) -> str:
        """Normalize spacing.

        Converts NBSP to a space, drops zero-width characters, collapses runs
        of spaces, and rewrites punctuation so that it is attached to the
        preceding word and followed by exactly one space.
        """
        # Unicode clean-up must happen BEFORE collapsing: replacing NBSP or
        # deleting a zero-width character can create a new run of spaces.
        text = text.replace('\u00A0', ' ')
        for zero_width in ('\u200B', '\u200C', '\u200D'):
            text = text.replace(zero_width, '')
        text = re.sub(r' +', ' ', text)
        text = text.strip()
        text = re.sub(r'\s*([،؛؟!.])\s*', r'\1 ', text)
        return text.strip()

    @staticmethod
    def remove_word_repetition_with_wa(text: str) -> str:
        """Collapse the pattern ``word و word`` into a single ``word``."""
        words = text.split()
        result = []
        i = 0
        while i < len(words):
            repeated = (
                i + 2 < len(words)
                and words[i] == words[i + 2]
                and words[i + 1] == 'و'
            )
            result.append(words[i])
            i += 3 if repeated else 1
        return ' '.join(result)

    # --- Hamza & Ta Marbuta Handling ---

    # Common Arabic words with hamza errors — covers the most frequent
    # spelling mistakes in informal Arabic writing.
    # Entries whose value equals the key are deliberate: an exact match
    # short-circuits fix_common_hamza before any prefix stripping is tried.
    HAMZA_WHITELIST = {
        'الي': 'إلى', 'الى': 'إلى',
        'انت': 'أنت', 'انتم': 'أنتم', 'انتي': 'أنتِ',
        'انتو': 'أنتم', 'انتن': 'أنتن',
        'انا': 'أنا',
        'امس': 'أمس',
        'لان': 'لأن', 'لانه': 'لأنه', 'لانها': 'لأنها',
        'لانهم': 'لأنهم', 'لانك': 'لأنك',
        'اذا': 'إذا', 'اذ': 'إذ',
        'اي': 'أي', 'اين': 'أين',
        'او': 'أو',
        'اما': 'أما',
        'ان': 'أن', 'انه': 'أنه', 'انها': 'أنها', 'انهم': 'أنهم',
        'اخر': 'آخر', 'اخرى': 'أخرى',
        'الان': 'الآن',
        'اول': 'أول', 'اولى': 'أولى',
        'اصبح': 'أصبح', 'اصبحت': 'أصبحت',
        'اكثر': 'أكثر', 'اقل': 'أقل',
        'اعلى': 'أعلى', 'ادنى': 'أدنى',
        'اسرع': 'أسرع', 'ابطا': 'أبطأ',
        'اكبر': 'أكبر', 'اصغر': 'أصغر',
        'احسن': 'أحسن', 'اسوا': 'أسوأ',
        'امام': 'أمام',
        'اثناء': 'أثناء',
        'ايضا': 'أيضاً', 'ايض': 'أيضاً',
        'اساسي': 'أساسي', 'اساسية': 'أساسية',
        'اخي': 'أخي', 'اخت': 'أخت', 'اخو': 'أخو',
        'ابي': 'أبي', 'اب': 'أب', 'ابو': 'أبو',
        'اهل': 'أهل',
        'اطفال': 'أطفال',
        'اصدقاء': 'أصدقاء', 'اصدقائي': 'أصدقائي',
        'اعتقد': 'أعتقد', 'اريد': 'أريد', 'احب': 'أحب',
        'اعرف': 'أعرف', 'اعلم': 'أعلم',
        'اخذ': 'أخذ', 'اكل': 'أكل',
        'الايام': 'الأيام',
        'الاطفال': 'الأطفال',
        'الاسعار': 'الأسعار',
        'الاولى': 'الأولى',
        'الاخير': 'الأخير', 'الاخيرة': 'الأخيرة',
        'واصدقائي': 'وأصدقائي',
        # FIX-14: Additional hamza entries
        'ابناء': 'أبناء',
        'اجمل': 'أجمل', 'اجمع': 'أجمع',
        'اعلن': 'أعلن', 'اعلنت': 'أعلنت',
        'اكد': 'أكد', 'اكدت': 'أكدت',
        'اشار': 'أشار', 'اشارت': 'أشارت',
        'ارسل': 'أرسل', 'ارسلت': 'أرسلت',
        'اضاف': 'أضاف', 'اضافت': 'أضافت',
        'اخيرا': 'أخيراً', 'اخيراً': 'أخيراً',
        'اساسا': 'أساساً', 'اساساً': 'أساساً',
        'احيانا': 'أحياناً', 'احياناً': 'أحياناً',
        'ابدا': 'أبداً', 'ابداً': 'أبداً',
        'اصلا': 'أصلاً', 'اصلاً': 'أصلاً',
        'اخبار': 'أخبار', 'اخبر': 'أخبر',
        'امر': 'أمر', 'امور': 'أمور',
        'اهم': 'أهم', 'اهمية': 'أهمية',
        'اصل': 'أصل',
        'اثر': 'أثر', 'اثار': 'آثار',
        'اساء': 'أساء', 'اساس': 'أساس',
        'استاذ': 'أستاذ', 'اسلام': 'إسلام',
        # Batch 3: More hamza entries for remaining FN cases
        'اسرة': 'أسرة', 'اسر': 'أسر',
        'اعضاء': 'أعضاء', 'اعداد': 'أعداد',
        'اعمال': 'أعمال', 'اعمار': 'أعمار',
        'انجاز': 'إنجاز', 'انجازات': 'إنجازات',
        'انشاء': 'إنشاء', 'انتاج': 'إنتاج',
        'انتخابات': 'انتخابات', 'انتظار': 'انتظار',
        'اسلامي': 'إسلامي', 'اسلامية': 'إسلامية',
        'امكانية': 'إمكانية', 'امكان': 'إمكان',
        'اشكالية': 'إشكالية',
        'ادارة': 'إدارة', 'ادارية': 'إدارية',
        'اعلام': 'إعلام', 'اعلامي': 'إعلامي',
        'احتمال': 'احتمال', 'احتفال': 'احتفال',
        'ازور': 'أزور', 'اذهب': 'أذهب', 'اكتب': 'أكتب',
        'اقرا': 'أقرأ', 'اقرأ': 'أقرأ',
        'اعمل': 'أعمل', 'ادرس': 'أدرس',
        'اشتري': 'أشتري', 'اسافر': 'أسافر',
        'مسؤول': 'مسؤول', 'مسؤولية': 'مسؤولية',
        'رؤية': 'رؤية', 'رؤيا': 'رؤيا',
        'مؤسسة': 'مؤسسة', 'مؤتمر': 'مؤتمر',
        'تأثير': 'تأثير', 'تأكيد': 'تأكيد',
        # FIX-14: Alif maqsura common errors
        'المستشفي': 'المستشفى',
        'مصطفي': 'مصطفى', 'موسي': 'موسى', 'عيسي': 'عيسى',
        'هدي': 'هدى', 'بني': 'بنى',
        'معني': 'معنى', 'مبني': 'مبنى',
        'علي': 'على',  # Common alif maqsura confusion
    }

    @staticmethod
    def fix_hamza_conservative(text: str) -> str:
        """Replace a trailing أ or إ with bare ا on words of 3+ characters.

        Only the last character of a word is ever touched.
        """
        result = []
        for word in text.split():
            if len(word) >= 3 and word.endswith(('أ', 'إ')):
                word = word[:-1] + 'ا'
            result.append(word)
        return ' '.join(result)

    # Attached prefixes that can precede hamza-whitelist words.
    # Ordered longest-first so وال is tried before و.
    HAMZA_PREFIXES = ['وبال', 'فبال', 'وال', 'بال', 'فال', 'كال', 'ول', 'فل',
                      'وب', 'فب', 'وك', 'فك', 'و', 'ف', 'ب', 'ك', 'ل']

    # Correctly spelled everyday words that merely LOOK like
    # "prefix + whitelist entry" (e.g. كان = ك + ان). Without this guard the
    # prefix rule rewrites them (كان → كأن, كانت → كأنت, باب → بأب).
    # The list is intentionally short; extend it as false positives surface.
    HAMZA_PREFIX_PROTECTED = frozenset({
        'كان', 'كانت', 'باب', 'فاهم', 'فاصل', 'واصل', 'فاخر',
    })

    @staticmethod
    def fix_common_hamza(text: str) -> str:
        """Fix common hamza placement errors using ``HAMZA_WHITELIST``.

        An exact whitelist hit wins. Otherwise each prefix in
        ``HAMZA_PREFIXES`` is stripped in turn and the remainder (at least two
        characters) is looked up, e.g. بالاسعار → بالأسعار. Words listed in
        ``HAMZA_PREFIX_PROTECTED`` are never subjected to prefix stripping.
        """
        whitelist = AraSpellPostProcessor.HAMZA_WHITELIST
        protected = AraSpellPostProcessor.HAMZA_PREFIX_PROTECTED
        result = []
        for word in text.split():
            # Check exact match first
            if word in whitelist:
                result.append(whitelist[word])
                continue
            if word not in protected:
                # Try stripping common prefixes and looking up the remainder
                for prefix in AraSpellPostProcessor.HAMZA_PREFIXES:
                    if word.startswith(prefix) and len(word) > len(prefix) + 1:
                        remainder = word[len(prefix):]
                        if remainder in whitelist:
                            word = prefix + whitelist[remainder]
                            break
            result.append(word)
        return ' '.join(result)

    @staticmethod
    def fix_ha_ta_marbuta(text: str, vocab_manager=None) -> str:
        """Rewrite a word-final ه as ة when it follows a consonant.

        Applies to words of 3+ characters whose second-to-last character is in
        ``ARABIC_CONSONANTS``. Words ending in لله and the words in the
        protected set below are left alone. When ``vocab_manager`` is given,
        the ة form is used if ``is_iv`` accepts it; otherwise the ه form is
        kept if ``is_iv`` accepts that; if neither is accepted (or no manager
        is given) the ة form is used.

        NOTE(review): if ``vocab_manager.is_iv`` treats final ة and ه as
        equivalent (as ``VocabularyManager.normalize_for_comparison`` in this
        module does), the "keep ه" branch cannot be reached — confirm whether
        an exact-match lookup was intended.
        """
        protected_endings = ('لله',)
        # Words that genuinely end in ه (not ة)
        protected_ha_words = {
            'الله', 'لله', 'فيه', 'عليه', 'منه', 'به', 'له', 'إليه',
            'وجه', 'نزه', 'سفه', 'فقه', 'نبه', 'شبه', 'مكره', 'تنبه',
            'اتجه', 'توجه', 'تشابه',
            # Particles carrying the pronoun suffix ـه. Several of these are
            # produced by fix_common_hamza (انه → أنه, لانه → لأنه) and were
            # then being turned into أنة / لأنة by this step.
            'أنه', 'إنه', 'انه', 'لأنه', 'لانه', 'بأنه', 'كأنه', 'فإنه',
            'لكنه', 'عنه', 'معه', 'عنده',
        }
        consonants = AraSpellPostProcessor.ARABIC_CONSONANTS
        result = []
        for word in text.split():
            if word.endswith(protected_endings) or word in protected_ha_words:
                result.append(word)
                continue
            convertible = (
                len(word) >= 3
                and word.endswith('ه')
                and word[-2] in consonants
            )
            if not convertible:
                result.append(word)
                continue
            candidate_with_ta = word[:-1] + 'ة'
            if vocab_manager:
                if vocab_manager.is_iv(candidate_with_ta):
                    # Always prefer ة when it's a valid word
                    result.append(candidate_with_ta)
                    continue
                if vocab_manager.is_iv(word):
                    result.append(word)
                    continue
            # No manager, or neither form recognised — default to ة
            result.append(candidate_with_ta)
        return ' '.join(result)

    # --- Hallucination Removal ---

    @staticmethod
    def remove_hallucinations(text: str) -> str:
        """Remove two artifacts: a stray trailing 'و' and near-duplicate pairs.

        * A word longer than 4 characters that ends in 'و' directly after one
          of ةهاأإآء loses that final 'و'.
        * Two adjacent words that are equal after dropping 'ال', mapping ة→ه
          and folding أ/إ/آ to ا are reduced to one; the form starting with
          'ال' is kept when only the second word has it.
        """
        words = text.split()
        if not words:
            return text
        result = []
        i = 0

        def normalize_word(w: str) -> str:
            w = w.replace('ال', '').replace('ة', 'ه')
            w = re.sub(r'[أإآ]', 'ا', w)
            return w

        while i < len(words):
            word = words[i]
            if len(word) > 4 and word.endswith('و') and word[-2] in 'ةهاأإآء':
                word = word[:-1]
            if i + 1 < len(words):
                next_word = words[i + 1]
                if normalize_word(word) == normalize_word(next_word):
                    prefer_next = (
                        next_word.startswith('ال')
                        and not word.startswith('ال')
                    )
                    result.append(next_word if prefer_next else word)
                    i += 2
                    continue
            result.append(word)
            i += 1
        return ' '.join(result)

    @staticmethod
    def remove_hallucinated_prefix(text: str, original: str) -> str:
        """Drop a leading standalone 'و' that the original did not have.

        The prefix is removed only when the remaining text equals ``original``
        after ligature normalization; otherwise ``text`` is returned as-is.
        """
        if not original:
            return text
        if text.startswith('و ') and not original.startswith('و'):
            rest = text[2:].strip()
            normalize = AraSpellPostProcessor.normalize_special_chars
            if normalize(rest) == normalize(original):
                return rest
        return text

    # --- Word Splitting & Merging ---

    @staticmethod
    def merge_separated_al(text: str) -> str:
        """Merge 'ال' separated by space: ال + كتاب → الكتاب"""
        return re.sub(r'\bال\s+(\w+)', r'ال\1', text)

    @staticmethod
    def join_fragments(text: str) -> str:
        """Glue short fragments back onto the preceding word.

        For each adjacent pair (unless both are standalone function words):
        a one-character second word is appended; two words of 2+ characters
        where the first ends with the letter the second starts with are merged
        with that letter written once; a 2-4 character word followed by a
        1-2 character word (combined length 3-7) is concatenated.
        """
        words = text.split()
        if len(words) < 2:
            return text
        STANDALONE_WORDS = {
            'من', 'في', 'على', 'عن', 'مع', 'إلى', 'الى', 'حتى', 'منذ', 'خلال',
            'بعد', 'قبل', 'ب', 'ل', 'ك', 'و', 'أو', 'لا', 'ما', 'لم', 'لن',
            'هو', 'هي', 'هم', 'أن', 'إن', 'كل', 'كان', 'قد', 'قال', 'ذلك',
            'هذا', 'هذه', 'تلك', 'التي', 'الذي', 'التى', 'اللذي'
        }
        result = []
        i = 0
        while i < len(words):
            word = words[i]
            if i + 1 < len(words):
                next_word = words[i + 1]
                if word in STANDALONE_WORDS and next_word in STANDALONE_WORDS:
                    result.append(word)
                    i += 1
                    continue
                # Past this point the pair is never "both standalone", so the
                # merge rules need no further standalone check.
                merged = None
                if len(next_word) == 1:
                    merged = word + next_word
                elif (len(word) >= 2 and len(next_word) >= 2
                        and word[-1] == next_word[0]):
                    merged = word[:-1] + next_word
                elif (2 <= len(word) <= 4
                        and 1 <= len(next_word) <= 2
                        and 3 <= len(word) + len(next_word) <= 7):
                    merged = word + next_word
                if merged is not None:
                    result.append(merged)
                    i += 2
                    continue
            result.append(word)
            i += 1
        return ' '.join(result)

    # --- Main Pipelines ---

    @staticmethod
    def full_postprocess(text: str, original: str = "", vocab_manager=None) -> str:
        """Apply all post-processing steps in their fixed order.

        Args:
            text: Text to clean up.
            original: Optional pre-correction text; when given, a leading
                standalone 'و' absent from it may be removed.
            vocab_manager: Optional object exposing ``is_iv(word)``; forwarded
                to ``fix_ha_ta_marbuta``.
        """
        cls = AraSpellPostProcessor
        if original:
            text = cls.remove_hallucinated_prefix(text, original)
        text = cls.normalize_special_chars(text)
        text = cls.remove_hallucinations(text)
        text = cls.unified_collapse_repeated(text)
        text = cls.fix_hamza_conservative(text)
        text = cls.fix_common_hamza(text)  # Fix S3: hamza whitelist
        text = cls.fix_ha_ta_marbuta(text, vocab_manager=vocab_manager)
        text = cls.remove_word_repetition_with_wa(text)
        text = cls.remove_duplicate_words(text)
        text = cls.normalize_spaces(text)
        return text
463
+
464
+
465
+ # ─────────────────────────────────────────────────────────────────────────────
466
+ # ERROR CLASSIFIER
467
+ # ─────────────────────────────────────────────────────────────────────────────
468
+
469
class ErrorClassifier:
    """Heuristically classify the type of spelling error in a text."""

    # Letters of the Arabic script that come from other keyboard layouts.
    # U+0693 (ړ) is added explicitly: RulesBasedCorrector.SUBSTITUTION_MAP
    # repairs it, so the classifier must recognise it as well.
    NON_ARABIC_KEYBOARD = set('پگچژکەڕڤڵڎےۀۃھیټډڼڑ') | {'\u0693'}

    @staticmethod
    def has_char_substitution(text: str) -> bool:
        """Return True if any character is in ``NON_ARABIC_KEYBOARD``."""
        return any(c in ErrorClassifier.NON_ARABIC_KEYBOARD for c in text)

    @staticmethod
    def has_char_repetition(text: str, threshold: int = 3) -> bool:
        """Return True if some character occurs ``threshold``+ times in a row.

        Any character except a newline counts, including spaces and digits.
        """
        pattern = r"(.)\1{" + str(threshold - 1) + ",}"
        return re.search(pattern, text) is not None

    @staticmethod
    def has_word_merge(text: str, max_word_len: int = 8) -> bool:
        """Return True if the text looks like words were glued together.

        Triggered by any word longer than ``max_word_len``, or by a
        single-word text longer than 6 characters.
        """
        words = text.split()
        if any(len(w) > max_word_len for w in words):
            return True
        return len(words) == 1 and len(text) > 6

    @staticmethod
    def classify(text: str) -> "ErrorType":
        """Map the three heuristics onto a single ``ErrorType``.

        Two or more positive heuristics give MIXED; a single positive gives
        its own type; none gives CLEAN.
        """
        has_rep = ErrorClassifier.has_char_repetition(text)
        has_merge = ErrorClassifier.has_word_merge(text)
        has_sub = ErrorClassifier.has_char_substitution(text)
        if sum([has_rep, has_merge, has_sub]) >= 2:
            return ErrorType.MIXED
        if has_sub:
            return ErrorType.CHAR_SUBSTITUTION
        if has_rep:
            return ErrorType.CHAR_REPETITION
        if has_merge:
            return ErrorType.WORD_MERGE
        return ErrorType.CLEAN
507
+
508
+
509
+ # ═══════════════════════════════════════════════════════════════════════════════
510
+ # RULES-BASED CORRECTOR
511
+ # ═══════════════════════════════════════════════════════════════════════════════
512
+
513
class RulesBasedCorrector:
    """Deterministic repairs: look-alike letters, repeated characters, glued words.

    NOTE(review): this is the second ``class RulesBasedCorrector`` in the
    module; being the later definition, it is the one the name refers to
    after import.
    """

    # Look-alike letters from other layouts -> standard Arabic letter.
    SUBSTITUTION_MAP = {
        'ک': 'ك', 'ی': 'ي', 'ے': 'ي',
        'پ': 'ب', 'چ': 'ج', 'ژ': 'ز',
        'گ': 'ك', 'ڤ': 'ف', 'ڵ': 'ل',
        'ڕ': 'ر', 'ڎ': 'د', 'ڼ': 'ن',
        'ټ': 'ت', 'ډ': 'د', 'ړ': 'ر',
        'ۀ': 'ه', 'ۃ': 'ة', 'ھ': 'ه',
        'ە': 'ه', 'ڑ': 'ر'
    }

    PREPOSITIONS = {
        'من', 'في', 'على', 'عن', 'مع', 'إلى', 'الى',
        'حتى', 'منذ', 'خلال', 'بعد', 'قبل',
        'ب', 'ل', 'ك', 'لل'
    }

    # For each key on the Arabic layout, the keys treated as adjacent to it.
    KEYBOARD_NEIGHBORS = {
        'ض': ['ص', 'ق'],
        'ص': ['ض', 'ث', 'ق'],
        'ث': ['ص', 'ق'],
        'ق': ['ض', 'ص', 'ث', 'ف', 'غ'],
        'ف': ['ق', 'غ', 'ع', 'ب'],
        'غ': ['ق', 'ف', 'ع', 'ه'],
        'ع': ['ف', 'غ', 'ه', 'خ'],
        'ه': ['غ', 'ع', 'خ', 'ح'],
        'خ': ['ع', 'ه', 'ح', 'ج'],
        'ح': ['ه', 'خ', 'ج'],
        'ج': ['خ', 'ح', 'د'],
        'د': ['ج', 'ذ'],
        'ذ': ['د'],
        'ش': ['س', 'ي', 'ئ'],
        'س': ['ش', 'ي', 'ب'],
        'ي': ['ش', 'س', 'ب', 'ت'],
        'ب': ['ي', 'س', 'ف', 'ل', 'ن'],
        'ل': ['ب', 'ا', 'ن', 'م'],
        'ا': ['ل', 'ت', 'م'],
        'ت': ['ي', 'ا', 'ن'],
        'ن': ['ب', 'ل', 'ت', 'م', 'ك'],
        'م': ['ل', 'ا', 'ن', 'ك'],
        'ك': ['ن', 'م', 'ط'],
        'ط': ['ك', 'ظ'],
        'ظ': ['ط'],
        'ئ': ['ش', 'ء', 'ر'],
        'ء': ['ئ', 'ؤ'],
        'ؤ': ['ء', 'ر'],
        'ر': ['ئ', 'ؤ', 'لا', 'ى', 'ز'],
        'لا': ['ر', 'ى'],
        'ى': ['ر', 'لا', 'ة', 'ز'],
        'ة': ['ى', 'و', 'ز'],
        'و': ['ة', 'ز'],
        'ز': ['ر', 'ى', 'ة', 'و'],
        'أ': ['ا', 'إ', 'آ'],
        'إ': ['ا', 'أ'],
        'آ': ['ا', 'أ'],
    }

    @staticmethod
    def is_keyboard_neighbor(char1: str, char2: str) -> bool:
        """Return True when ``char2`` is listed as a neighbour of ``char1``."""
        table = RulesBasedCorrector.KEYBOARD_NEIGHBORS
        if char1 not in table:
            return False
        return char2 in table[char1]

    @staticmethod
    def fix_char_substitution(text: str) -> str:
        """Replace every ``SUBSTITUTION_MAP`` key in ``text`` with its value."""
        repaired = text
        for foreign in RulesBasedCorrector.SUBSTITUTION_MAP:
            repaired = repaired.replace(
                foreign, RulesBasedCorrector.SUBSTITUTION_MAP[foreign]
            )
        return repaired

    @staticmethod
    def fix_char_repetition(text: str) -> str:
        """Collapse 3+ repeats of a non-digit, non-space character to one."""
        return re.sub(r'([^\d\s])\1{2,}', r'\1', text)

    @staticmethod
    def advanced_heuristic_repair(text: str) -> str:
        """Run substitution repair, repetition repair, then per-word splitting."""
        cleaned = RulesBasedCorrector.fix_char_repetition(
            RulesBasedCorrector.fix_char_substitution(text)
        )
        return ' '.join(
            RulesBasedCorrector._recursive_split(token)
            for token in cleaned.split()
        )

    @staticmethod
    def _recursive_split(word: str) -> str:
        """Peel leading particles off a glued word, inserting spaces.

        Words shorter than 4 characters are returned unchanged. A leading
        particle is split off only if at least 3 characters remain; the
        remainder is processed the same way. A leading 'يا' is split off
        words longer than 4 characters.
        """
        if len(word) < 4:
            return word
        # Longest particles first, so that منذ is tried before من.
        particles = ['من', 'في', 'على', 'عن', 'مع', 'إلى', 'الى', 'حتى', 'منذ', 'خلال', 'بعد', 'قبل']
        particles.sort(key=len, reverse=True)
        for particle in particles:
            if word == particle:
                return word
            if not word.startswith(particle):
                continue
            tail = word[len(particle):]
            if len(tail) >= 3:
                return particle + " " + RulesBasedCorrector._recursive_split(tail)
        if len(word) > 4 and word.startswith('يا'):
            return 'يا ' + RulesBasedCorrector._recursive_split(word[2:])
        return word
593
+
594
+
595
+ # ═══════════════════════════════════════════════════════════════════════════════
596
+ # OUTPUT VALIDATOR (Hallucination Prevention)
597
+ # ═══════════════════════════════════════════════════════════════════════════════
598
+
599
class OutputValidator:
    """Sanity checks that reject model outputs drifting too far from the input."""

    @staticmethod
    def calculate_edit_distance(s1: str, s2: str) -> int:
        """Return the Levenshtein distance between ``s1`` and ``s2``."""
        return Levenshtein.distance(s1, s2)

    @staticmethod
    def check_character_preservation(original: str, corrected: str) -> Tuple[bool, str]:
        """Compare the character sets of both strings.

        Returns ``(False, "low_character_similarity")`` when the Jaccard index
        of the two character sets is below 0.35, else ``(True, "valid")``.
        An empty ``original`` is always valid.
        """
        chars_original = set(original)
        if not chars_original:
            return True, "valid"
        chars_corrected = set(corrected)
        union = chars_original | chars_corrected
        jaccard = len(chars_original & chars_corrected) / len(union) if union else 0
        if jaccard < 0.35:
            return False, "low_character_similarity"
        return True, "valid"

    @staticmethod
    def check_word_count(original: str, corrected: str) -> Tuple[bool, str]:
        """Check that the word count did not change implausibly.

        A single-word original may become up to 3 words (up to 6 when it is
        longer than 12 characters). Otherwise the corrected/original word
        ratio must stay within [0.5, 2.0].
        """
        len_orig = len(original.split())
        len_corr = len(corrected.split())
        if len_orig == 1:
            if len_corr <= 3:
                return True, "valid"
            if len(original) > 12 and len_corr <= 6:
                return True, "valid"
        ratio = len_corr / len_orig if len_orig > 0 else 0
        if ratio > 2.0 or ratio < 0.5:
            return False, "word_count_mismatch"
        return True, "valid"

    def validate(self, original: str, corrected: str, error_type: "str | ErrorType") -> Tuple[bool, str]:
        """Validate ``corrected`` against ``original``.

        Args:
            original: Input text.
            corrected: Candidate output text.
            error_type: Either an ``ErrorType`` member or its string value.
                It only matters for the "too short" check, which is waived
                for character-repetition errors.

        Returns:
            ``(is_valid, reason)`` where ``reason`` is one of "valid",
            "space_leniency_accept", "empty_output", "too_long", "too_short",
            "word_count_mismatch" or "low_character_similarity".
        """
        if not corrected or not corrected.strip():
            return False, "empty_output"
        # Outputs that differ only in spacing (or ZWNJ) are always accepted.
        original_no_space = original.replace(' ', '').replace('\u200c', '')
        corrected_no_space = corrected.replace(' ', '').replace('\u200c', '')
        if original_no_space == corrected_no_space:
            return True, "space_leniency_accept"
        len_orig = len(original)
        len_corr = len(corrected)
        if len_corr > len_orig * 2.5:
            return False, "too_long"
        if len_corr < len_orig * 0.5:
            # The parameter is annotated as str but was compared to the enum
            # member, so a plain "char_repetition" string never matched.
            # Normalise to the value so both call styles behave the same.
            kind = getattr(error_type, "value", error_type)
            if kind != ErrorType.CHAR_REPETITION.value:
                return False, "too_short"
        is_valid_count, reason = self.check_word_count(original, corrected)
        if not is_valid_count:
            return False, reason
        is_valid_chars, reason = self.check_character_preservation(original, corrected)
        if not is_valid_chars:
            return False, reason
        return True, "valid"
656
+
657
+
658
+ # ═══════════════════════════════════════════════════════════════════════════════
659
+ # VOCABULARY MANAGER
660
+ # ═══════════════════════════════════════════════════════════════════════════════
661
+
662
class VocabularyManager:
    """Answers in-vocabulary (IV) / out-of-vocabulary (OOV) questions.

    The vocabulary is taken from a tokenizer's ``get_vocab()`` mapping.
    Lookups fall back to a normalized form in which hamza/alef variants,
    word-final ة/ه and ي/ى are treated as the same letter.
    """

    HAMZA_VARIANTS = {'أ', 'إ', 'آ', 'ء', 'ؤ', 'ئ', 'ا'}
    ALEF_NORMALIZED = 'ا'
    TA_MARBUTA = 'ة'
    HA = 'ه'
    YA_VARIANTS = {'ي', 'ى'}
    YA_NORMALIZED = 'ي'

    def __init__(self, tokenizer):
        """Build the lookup tables from ``tokenizer.get_vocab()``.

        Only alphabetic entries longer than one character that do not start
        with '##' are kept as words; the rank table keeps every entry.
        """
        self.tokenizer = tokenizer
        self.vocab = {
            w for w in tokenizer.get_vocab().keys()
            if w.isalpha() and not w.startswith('##') and len(w) > 1
        }
        self.vocab_rank = dict(tokenizer.get_vocab().items())
        self.normalized_vocab = {}
        for entry in self.vocab:
            self.normalized_vocab[self.normalize_for_comparison(entry)] = entry
        logger.info(f"VocabularyManager initialized: {len(self.vocab)} words")

    @classmethod
    def normalize_for_comparison(cls, word: str) -> str:
        """Return ``word`` with spelling variants folded together.

        Hamza/alef variants become ا, a ة in the last position becomes ه,
        and ي/ى become ي; everything else is kept.
        """
        final_index = len(word) - 1

        def fold(position: int, letter: str) -> str:
            if letter in cls.HAMZA_VARIANTS:
                return cls.ALEF_NORMALIZED
            if letter == cls.TA_MARBUTA and position == final_index:
                return cls.HA
            if letter in cls.YA_VARIANTS:
                return cls.YA_NORMALIZED
            return letter

        return ''.join(fold(pos, ch) for pos, ch in enumerate(word))

    def is_iv(self, word: str) -> bool:
        """Return True if ``word`` (minus non-word characters) is known.

        A word that is empty after cleaning counts as known.
        """
        stripped = re.sub(r'[^\w]', '', word)
        if not stripped or stripped in self.vocab:
            return True
        return self.normalize_for_comparison(stripped) in self.normalized_vocab

    def is_oov(self, word: str) -> bool:
        """Return the negation of ``is_iv``."""
        return not self.is_iv(word)

    def get_frequency_rank(self, word: str) -> int:
        """Return the tokenizer id of the cleaned word, or 999999 if absent."""
        stripped = re.sub(r'[^\w]', '', word)
        return self.vocab_rank.get(stripped, 999999)

    def all_words_iv(self, text: str) -> bool:
        """Return True if every whitespace-separated word is in vocabulary."""
        for token in text.split():
            if not self.is_iv(token):
                return False
        return True

    def count_oov_words(self, text: str) -> int:
        """Count the whitespace-separated words that are out of vocabulary."""
        return len([token for token in text.split() if self.is_oov(token)])

    def get_oov_words(self, text: str) -> List[str]:
        """Return the out-of-vocabulary words in their original order."""
        unknown = []
        for token in text.split():
            if self.is_oov(token):
                unknown.append(token)
        return unknown

    def words_are_equivalent(self, word1: str, word2: str) -> bool:
        """Return True if both words share the same normalized form."""
        return (
            self.normalize_for_comparison(word1)
            == self.normalize_for_comparison(word2)
        )

    @staticmethod
    def damerau_levenshtein_distance(s1: str, s2: str) -> int:
        """Return the Damerau-Levenshtein distance computed by jellyfish."""
        return jellyfish.damerau_levenshtein_distance(s1, s2)

    def calculate_similarity(self, original: str, corrected: str) -> float:
        """Return ``1 - distance / max(len)`` for the two strings."""
        longest = max(len(original), len(corrected), 1)
        distance = self.damerau_levenshtein_distance(original, corrected)
        return 1.0 - (distance / longest)
739
+
740
+
741
+ # ═══════════════════════════════════════════════════════════════════════════════
742
+ # WORD ALIGNER
743
+ # ═══════════════════════════════════════════════════════════════════════════════
744
+
745
class WordAligner:
    """Merges an input sentence and a model output word by word."""

    def __init__(self, vocab_manager):
        """Store the vocabulary object used for ``is_iv`` / ``count_oov_words``."""
        self.vocab = vocab_manager

    def align_words(self, input_text: str, output_text: str) -> str:
        """Pick, position by position, the better of input and output word.

        If the word counts differ by more than two, no alignment is tried:
        the output is returned when it has fewer OOV words, else the input.
        Extra output words are kept; extra input words are kept only if they
        are in vocabulary.
        """
        source_words = input_text.split()
        model_words = output_text.split()
        if abs(len(source_words) - len(model_words)) > 2:
            input_oov = self.vocab.count_oov_words(input_text)
            output_oov = self.vocab.count_oov_words(output_text)
            if output_oov < input_oov:
                return output_text
            return input_text

        shared = min(len(source_words), len(model_words))
        merged = [
            self._select_best_word(src, out)
            for src, out in zip(source_words, model_words)
        ]
        if len(model_words) > shared:
            merged.extend(model_words[shared:])
        elif len(source_words) > shared:
            merged.extend(
                leftover for leftover in source_words[shared:]
                if self.vocab.is_iv(leftover)
            )
        return ' '.join(merged)

    def _select_best_word(self, input_word: str, output_word: str) -> str:
        """Choose between one input word and the output word aligned to it.

        * identical words: returned as-is;
        * exactly one in vocabulary: that one;
        * both in vocabulary: the input word, except that a final ه → ة
          change proposed by the output is accepted (Fix S1);
        * neither in vocabulary: for equal-length words of 3+ characters a
          one-letter hybrid that is in vocabulary is tried at each differing
          position; otherwise the output word.
        """
        if input_word == output_word:
            return input_word

        input_known = bool(self.vocab.is_iv(input_word))
        output_known = bool(self.vocab.is_iv(output_word))

        if input_known != output_known:
            return output_word if output_known else input_word

        if input_known:
            # Fix S1: when the only difference is ه→ة at the word end,
            # prefer ة; the reverse direction (ة→ه) falls through to the
            # default below and keeps the input word.
            same_stem = input_word[:-1] == output_word[:-1]
            if same_stem and input_word.endswith('ه') and output_word.endswith('ة'):
                return output_word
            return input_word

        if len(input_word) == len(output_word) and len(input_word) >= 3:
            for pos, (src_ch, out_ch) in enumerate(zip(input_word, output_word)):
                if src_ch == out_ch:
                    continue
                take_output_letter = input_word[:pos] + out_ch + input_word[pos + 1:]
                if self.vocab.is_iv(take_output_letter):
                    return take_output_letter
                take_input_letter = output_word[:pos] + src_ch + output_word[pos + 1:]
                if self.vocab.is_iv(take_input_letter):
                    return take_input_letter
        return output_word
803
+
804
+
805
+ # ═══════════════════════════════════════════════════════════════════════════════
806
+ # SPLIT/MERGE SPECIALIST
807
+ # ═══════════════════════════════════════════════════════════════════════════════
808
+
809
class SplitMergeSpecialist:
    """Handles word splitting and merging with vocabulary validation."""

    # Stand-alone particles that may have been glued to the following word.
    SEPARABLE_PREFIXES = [
        'من', 'في', 'على', 'عن', 'مع', 'إلى', 'الى', 'حتى', 'منذ', 'خلال',
        'بعد', 'قبل', 'بين', 'حول', 'تحت', 'فوق', 'أمام', 'وراء', 'دون',
        'أن', 'لن', 'لم', 'قد', 'سوف', 'كي', 'إذا', 'لو', 'مثل', 'غير',
        'يا',
    ]

    # Words that split_word() returns untouched.
    PROTECTED_WORDS = {
        'في', 'من', 'على', 'عن', 'مع', 'إلى', 'الى', 'ان', 'أن', 'لا', 'ما', 'هو', 'هي',
        'لم', 'لن', 'قد', 'كل', 'كان', 'ذلك', 'هذا', 'هذه', 'التي', 'الذي', 'بين',
    }

    # Prefixes that are legitimately written attached; a word made of one of
    # these plus an in-vocabulary remainder is never split.
    ATTACHED_PREFIXES = [
        'وال', 'بال', 'فال', 'كال', 'لل',
        'وب', 'وف', 'ول', 'وك', 'وم', 'ون',
        'فب', 'فل', 'فك', 'فم',
    ]

    PRONOUN_SUFFIXES = {'كم', 'هم', 'ها', 'هن', 'كن', 'نا', 'هما', 'كما', 'تم', 'تن'}

    def __init__(self, vocab_manager):
        """Store the vocabulary manager and pre-sort the separable prefixes.

        Prefixes are tried longest first so that e.g. 'منذ' wins over 'من'.
        """
        self.vocab = vocab_manager
        self.separable_prefixes = sorted(
            self.SEPARABLE_PREFIXES, key=len, reverse=True
        )

    def split_word(self, word: str) -> str:
        """Split one token into two space-separated words when justified.

        Returns ``word`` unchanged when it is shorter than 5 characters,
        in-vocabulary, protected, or an attached prefix followed by an
        in-vocabulary remainder.  Otherwise returns the first split found
        (separable prefix + in-vocabulary remainder, then any cut leaving
        two in-vocabulary halves), or ``word`` if none exists.
        """
        known = self.vocab.is_iv

        if len(word) < 5 or known(word) or word in self.PROTECTED_WORDS:
            return word

        for attached in self.ATTACHED_PREFIXES:
            if word.startswith(attached) and known(word[len(attached):]):
                return word

        for particle in self.separable_prefixes:
            if word.startswith(particle) and len(word) > len(particle) + 2:
                rest = word[len(particle):]
                if known(rest):
                    return f"{particle} {rest}"

        # Generic fallback: each half must be at least 3 characters long.
        for cut in range(3, len(word) - 2):
            head, tail = word[:cut], word[cut:]
            if known(head) and known(tail):
                return f"{head} {tail}"

        return word

    def _merge_rule_applies(self, word: str, next_word: str) -> bool:
        """Return True when any merge heuristic fires for two adjacent tokens.

        The caller still requires the concatenation to be in-vocabulary.
        """
        vocab = self.vocab
        # A lone trailing letter that commonly detaches from its word.
        if len(next_word) == 1 and next_word in 'ةهاي':
            return True
        # A detached definite article.
        if word == 'ال' and len(next_word) >= 2:
            return True
        # Two unknown fragments next to each other.
        if vocab.is_oov(word) and vocab.is_oov(next_word):
            return True
        # A very short unknown fragment in front of anything.
        if len(word) <= 2 and vocab.is_oov(word):
            return True
        # A detached pronoun suffix after a word that is not itself known.
        if next_word in self.PRONOUN_SUFFIXES and not vocab.is_iv(word):
            return True
        # Two short pieces that form a word of at least 5 characters.
        if len(word) <= 3 and len(next_word) <= 3 and len(word) + len(next_word) >= 5:
            return True
        return False

    def merge_fragments(self, text: str) -> str:
        """Join adjacent tokens whose concatenation is an in-vocabulary word.

        Texts with fewer than two tokens are returned unchanged.  Tokens are
        scanned left to right; a merged pair is consumed as one unit.
        """
        tokens = text.split()
        if len(tokens) < 2:
            return text

        rebuilt = []
        last = len(tokens) - 1
        pos = 0
        while pos <= last:
            current = tokens[pos]
            if pos < last:
                following = tokens[pos + 1]
                joined = current + following
                if self._merge_rule_applies(current, following) and self.vocab.is_iv(joined):
                    rebuilt.append(joined)
                    pos += 2
                    continue
            rebuilt.append(current)
            pos += 1
        return ' '.join(rebuilt)

    def process_text(self, text: str) -> str:
        """Merge fragments, then try to split every OOV token of length >= 4."""
        merged_text = self.merge_fragments(text)
        return ' '.join(
            self.split_word(tok) if self.vocab.is_oov(tok) and len(tok) >= 4 else tok
            for tok in merged_text.split()
        )
920
+
921
+
922
+ # ═══════════════════════════════════════════════════════════════════════════════
923
+ # EDIT DISTANCE CORRECTOR
924
+ # ═══════════════════════════════════════════════════════════════════════════════
925
+
926
class EditDistanceCorrector:
    """Generates candidates based on Levenshtein distance.

    Unknown words are replaced by a vocabulary word reachable with one edit
    (or two edits for short words); ties are broken by the lowest
    tokenizer id.
    """

    def __init__(self, tokenizer):
        """Build the word set and the rank table from ``tokenizer.get_vocab()``.

        Only purely alphabetic, non-continuation (no ``##`` prefix) entries
        longer than one character are treated as words.
        """
        self.tokenizer = tokenizer
        token_ids = tokenizer.get_vocab()
        self.vocab = {
            w for w in token_ids
            if w.isalpha() and not w.startswith('##') and len(w) > 1
        }
        # token -> id; a smaller id is preferred when ranking candidates.
        self.vocab_rank = dict(token_ids)

    def edits1(self, word):
        """Return every string one delete/transpose/replace/insert away."""
        letters = 'أابتثجحخدذرزسشصضطظعغفقكلمنهويءآىةئؤ'
        splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
        deletes = [L + R[1:] for L, R in splits if R]
        transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
        replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
        inserts = [L + c + R for L, R in splits for c in letters]
        return set(deletes + transposes + replaces + inserts)

    def edits2(self, word):
        """Lazily yield every string two edits away from ``word``."""
        return (e2 for e1 in self.edits1(word) for e2 in self.edits1(e1))

    def known(self, words):
        """Return the subset of ``words`` present in the vocabulary."""
        return set(w for w in words if w in self.vocab)

    def generate_candidate(self, text: str) -> str:
        """Return ``text`` with each unknown word replaced by its best match.

        A token is left untouched when, after stripping non-word characters,
        it is empty or not purely alphabetic (punctuation, numbers, mixed
        tokens), when it is already in the vocabulary, or when no candidate
        is found.  Two-edit candidates are only searched for cleaned words
        shorter than 7 characters.
        """
        corrected_words = []
        for word in text.split():
            clean_word = re.sub(r'[^\w]', '', word)

            # Without this guard a bare "." or "12" is "corrected" into an
            # arbitrary short vocabulary word by the two-edit search.
            if not clean_word.isalpha() or clean_word in self.vocab:
                corrected_words.append(word)
                continue

            candidates = self.known(self.edits1(clean_word))
            if not candidates and len(clean_word) < 7:
                candidates = self.known(self.edits2(clean_word))

            if candidates:
                best_candidate = min(candidates, key=lambda w: self.vocab_rank.get(w, 999999))
                corrected_words.append(best_candidate)
            else:
                corrected_words.append(word)
        return ' '.join(corrected_words)
970
+
971
+
972
+ # ═══════════════════════════════════════════════════════════════════════════════
973
+ # CONTEXTUAL CORRECTOR (MLM-based) — Optional, disabled by default to save RAM
974
+ # ═══════════════════════════════════════════════════════════════════════════════
975
+
976
class ContextualCorrector:
    """MLM-based contextual correction for confusion pairs"""

    # Character pairs treated as mutually confusable.  The list contains a
    # self-pair and a mirrored duplicate; _build_confusion_map() filters them.
    CONFUSION_PAIRS = [
        ('ض', 'ظ'), ('ذ', 'ز'), ('ث', 'س'), ('ص', 'س'),
        ('ط', 'ت'), ('ق', 'ك'), ('ه', 'ة'), ('ا', 'ى'),
        ('ت', 'د'), ('د', 'ض'), ('ك', 'ق'), ('غ', 'ق'),
        ('ج', 'ش'), ('س', 'ز'), ('ف', 'ب'), ('و', 'و'),
        ('ؤ', 'و'), ('ئ', 'ي'), ('ء', 'أ'), ('إ', 'أ'),
    ]

    def __init__(self, model_name: str = 'aubmindlab/bert-base-arabertv02', cache_size: int = 10000):
        """Load the masked-LM and its tokenizer and set up the score cache.

        Args:
            model_name: Identifier passed to ``from_pretrained``.
            cache_size: Maximum number of entries kept in the score cache.
        """
        from transformers import AutoTokenizer, AutoModelForMaskedLM

        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForMaskedLM.from_pretrained(model_name)
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model = self.model.to(self.device)
        self.model.eval()
        self.confusion_map = self._build_confusion_map()
        self.cache_hits = 0
        self.cache_misses = 0
        self._score_cache = {}
        self.cache_size = cache_size
        self.vocab = self.tokenizer.get_vocab()

    def _build_confusion_map(self):
        """Turn CONFUSION_PAIRS into a symmetric ``char -> [chars]`` mapping.

        Self-pairs are ignored and each target is listed once, in order of
        first appearance, so callers never receive an identity substitution
        or the same substitution twice.
        """
        confusion_map = {}
        for char1, char2 in self.CONFUSION_PAIRS:
            if char1 == char2:
                continue
            for src, dst in ((char1, char2), (char2, char1)):
                targets = confusion_map.setdefault(src, [])
                if dst not in targets:
                    targets.append(dst)
        return confusion_map

    def get_confusable_chars(self, char: str) -> List[str]:
        """Return the characters confusable with ``char`` (empty if none)."""
        return self.confusion_map.get(char, [])

    def generate_candidates(self, word: str) -> List[str]:
        """Return ``word`` followed by its spelling variants, without repeats.

        Variants are produced in this order: confusable-character swaps and
        doubled-letter collapses (not vocabulary-filtered), then single
        insertions, single substitutions (only for words shorter than 7
        characters) and single deletions, each kept only if present in the
        tokenizer vocabulary.
        """
        candidates = [word]
        for i, char in enumerate(word):
            confusables = self.get_confusable_chars(char)
            for conf_char in confusables:
                candidate = word[:i] + conf_char + word[i+1:]
                if candidate not in candidates:
                    candidates.append(candidate)
        # Collapse one occurrence of a doubled letter.
        for i in range(len(word) - 1):
            if word[i] == word[i+1]:
                candidate = word[:i] + word[i+1:]
                if candidate not in candidates:
                    candidates.append(candidate)
        COMMON_CHARS = 'ابتثجحخدذرزسشصضطظعغفقكلمنهويأإآءئؤةى'
        # Insertions.
        for i in range(len(word) + 1):
            for char in COMMON_CHARS:
                candidate = word[:i] + char + word[i:]
                if candidate in self.vocab and candidate not in candidates:
                    candidates.append(candidate)
        # Substitutions, short words only.
        if len(word) < 7:
            for i in range(len(word)):
                for char in COMMON_CHARS:
                    if char != word[i]:
                        candidate = word[:i] + char + word[i+1:]
                        if candidate in self.vocab and candidate not in candidates:
                            candidates.append(candidate)
        # Deletions (results of a single character are discarded).
        for i in range(len(word)):
            candidate = word[:i] + word[i+1:]
            if len(candidate) > 1:
                if candidate in self.vocab and candidate not in candidates:
                    candidates.append(candidate)
        return candidates

    def score_with_mlm(self, text: str, position: int, word: str) -> float:
        """Score ``word`` at whitespace-token index ``position`` of ``text``.

        The token at ``position`` is replaced by ``[MASK]`` and the returned
        value is the model probability of the *first* sub-token of ``word``
        at the first mask position.  Returns 0.0 when ``position`` is out of
        range, no mask is found, or ``word`` encodes to nothing.  Computed
        scores are cached; when the cache is full the oldest inserted entry
        is dropped.
        """
        cache_key = f"{text}|{position}|{word}"
        if cache_key in self._score_cache:
            self.cache_hits += 1
            return self._score_cache[cache_key]
        self.cache_misses += 1
        words = text.split()
        if position >= len(words):
            return 0.0
        masked_words = words.copy()
        masked_words[position] = '[MASK]'
        masked_text = ' '.join(masked_words)
        inputs = self.tokenizer(masked_text, return_tensors='pt', padding=True, truncation=True)
        inputs = {k: v.to(self.device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = self.model(**inputs)
        predictions = outputs.logits
        mask_token_index = (inputs['input_ids'] == self.tokenizer.mask_token_id).nonzero(as_tuple=True)[1]
        if len(mask_token_index) == 0:
            return 0.0
        mask_token_logits = predictions[0, mask_token_index[0], :]
        probs = torch.softmax(mask_token_logits, dim=0)
        word_tokens = self.tokenizer.encode(word, add_special_tokens=False)
        if not word_tokens:
            return 0.0
        word_token_id = word_tokens[0]
        score = probs[word_token_id].item()
        if len(self._score_cache) >= self.cache_size:
            # dicts iterate in insertion order, so this evicts the oldest key.
            self._score_cache.pop(next(iter(self._score_cache)))
        self._score_cache[cache_key] = score
        return score

    def score_candidates_batch(self, text: str, position: int, candidates: List[str]) -> dict:
        """Return ``{candidate: score_with_mlm(...)}`` for every candidate."""
        scores = {}
        for candidate in candidates:
            scores[candidate] = self.score_with_mlm(text, position, candidate)
        return scores

    def predict_masked_token(self, text: str, position: int, top_k: int = 5) -> List[Tuple[str, float]]:
        """Return the model's top predictions for the token at ``position``.

        The result holds ``(token, probability)`` pairs in descending
        probability.  Tokens that decode to a ``##`` continuation piece or
        to a special token are dropped, so fewer than ``top_k`` pairs may
        be returned.  An out-of-range position yields an empty list.
        """
        words = text.split()
        if position >= len(words):
            return []
        masked_words = words.copy()
        masked_words[position] = '[MASK]'
        masked_text = ' '.join(masked_words)
        inputs = self.tokenizer(masked_text, return_tensors='pt', padding=True, truncation=True).to(self.device)
        with torch.no_grad():
            outputs = self.model(**inputs)
        predictions = outputs.logits
        mask_token_index = (inputs['input_ids'] == self.tokenizer.mask_token_id).nonzero(as_tuple=True)[1]
        if len(mask_token_index) == 0:
            return []
        mask_token_logits = predictions[0, mask_token_index[0], :]
        probs = torch.softmax(mask_token_logits, dim=0)
        top_k_weights, top_k_indices = torch.topk(probs, top_k, sorted=True)
        results = []
        for i in range(top_k):
            token_id = top_k_indices[i].item()
            score = top_k_weights[i].item()
            token = self.tokenizer.decode([token_id]).strip()
            if not token.startswith("##") and token not in self.tokenizer.all_special_tokens:
                results.append((token, score))
        return results

    def refine_sentence_with_mask(self, text: str, threshold: float = 0.001, vocab_manager=None, raw_model_output=None) -> str:
        """Replace implausible words with close, confident model predictions.

        A word is examined only if it is not in-vocabulary (when a
        ``vocab_manager`` is given), differs from the word at the same index
        of ``raw_model_output``, is longer than two characters and scores at
        most ``threshold``.  A prediction replaces it only when it is within
        one character of the same length, at least 0.90 similar by
        Levenshtein ratio, not OOV, and sufficiently more probable than the
        original.  All scores are computed against the unmodified ``text``.
        """
        words = text.split()
        refined_words = words.copy()
        raw_words = raw_model_output.split() if raw_model_output else []
        for i, word in enumerate(words):
            if vocab_manager and vocab_manager.is_iv(word):
                continue
            if i < len(raw_words) and word == raw_words[i]:
                continue
            if len(word) <= 2:
                continue
            current_score = self.score_with_mlm(text, i, word)
            if current_score > threshold:
                continue
            predictions = self.predict_masked_token(text, i, top_k=10)
            for pred_word, pred_score in predictions:
                if pred_word == word:
                    continue
                if abs(len(pred_word) - len(word)) > 1:
                    continue
                dist = Levenshtein.distance(word, pred_word)
                max_len = max(len(word), len(pred_word))
                similarity = 1.0 - (dist / max_len)
                if similarity < 0.90:
                    continue
                if vocab_manager and vocab_manager.is_oov(pred_word):
                    continue
                if pred_score < 0.12:
                    continue
                # With the default threshold (0.001) this is always False,
                # because words scoring above the threshold were skipped;
                # it only matters when a larger threshold is passed.
                is_original_common = current_score > 0.001
                if is_original_common:
                    if pred_score > current_score * 1000:
                        refined_words[i] = pred_word
                        break
                else:
                    if pred_score > current_score * 50 and pred_score > 0.2:
                        refined_words[i] = pred_word
                        break
        return ' '.join(refined_words)

    def calculate_sentence_score(self, text: str) -> float:
        """Return the mean ``score_with_mlm`` over all words (0.0 if empty)."""
        words = text.split()
        if not words:
            return 0.0
        total_score = 0.0
        scored_words = 0
        for i, word in enumerate(words):
            score = self.score_with_mlm(text, i, word)
            total_score += score
            scored_words += 1
        if scored_words == 0:
            return 0.0
        return total_score / scored_words
1166
+
1167
+
1168
+ # ═══════════════════════════════════════════════════════════════════════════════
1169
+ # MAIN SPELL CHECKER CLASS
1170
+ # ═══════════════════════════════════════════════════════════════════════════════
1171
+
1172
+ class ArabicSpellChecker:
1173
+ """Main Arabic Spell Checker class"""
1174
+
1175
+ def __init__(self, model, tokenizer, device, use_contextual: bool = True):
1176
+ self.model = model
1177
+ self.tokenizer = tokenizer
1178
+ self.device = device
1179
+
1180
+ self.postprocessor = AraSpellPostProcessor()
1181
+ self.classifier = ErrorClassifier()
1182
+ self.rules = RulesBasedCorrector()
1183
+ self.validator = OutputValidator()
1184
+ self.vocab_manager = VocabularyManager(tokenizer)
1185
+ self.edit_corrector = EditDistanceCorrector(tokenizer)
1186
+ self.split_merge = SplitMergeSpecialist(self.vocab_manager)
1187
+ self.word_aligner = WordAligner(self.vocab_manager)
1188
+
1189
+ self.use_contextual = use_contextual
1190
+ if use_contextual:
1191
+ try:
1192
+ self.contextual = ContextualCorrector()
1193
+ logger.info("Contextual correction enabled")
1194
+ except Exception as e:
1195
+ logger.warning(f"Contextual correction disabled: {e}")
1196
+ self.contextual = None
1197
+ self.use_contextual = False
1198
+ else:
1199
+ self.contextual = None
1200
+
1201
+ def _fix_repeated_end_chars(self, text: str) -> str:
1202
+ text = re.sub(r'([ا-ي])\1+\b', r'\1', text)
1203
+ return text
1204
+
1205
+ def _fix_merged_with_errors(self, text: str) -> str:
1206
+ text = re.sub(r'ال([ا-ي])\1+([ا-ي]{2,})', r'ال\2', text)
1207
+ text = re.sub(r'\b([ا-ي]{3,})([ا-ي])\2+\b', r'\1\2', text)
1208
+ return text
1209
+
1210
+ def _split_merged_words_linguistic(self, text: str) -> str:
1211
+ text = re.sub(
1212
+ r'\b(في|من|إلى|الى|حتى|منذ|خلال|بعد|قبل)(ال)?([ا-ي]{3,})',
1213
+ r'\1 \2\3', text
1214
+ )
1215
+ text = re.sub(r'\b(كل)([ا-ي]{3,})', r'\1 \2', text)
1216
+ text = re.sub(r'([ا-ي]{3,})(ال)([ا-ي]{3,})', r'\1 \2\3', text)
1217
+ text = re.sub(r'\b([بلك])(ال)?([ا-ي]{3,})', r'\1 \2\3', text)
1218
+ text = re.sub(r'([ا-ي]{4,})(عليكم|عليك|عليه|عليها)', r'\1 \2', text)
1219
+ text = re.sub(r'([ا-ي]{3,})(على|عن)([ا-ي]{3,})', r'\1 \2 \3', text)
1220
+ return text
1221
+
1222
+ def _split_long_words_heuristic(self, text: str, max_length: int = 15) -> str:
1223
+ words = text.split()
1224
+ result = []
1225
+ for word in words:
1226
+ if len(word) <= max_length:
1227
+ result.append(word)
1228
+ continue
1229
+ if 'ال' in word[2:]:
1230
+ parts = word.split('ال', 1)
1231
+ if len(parts[0]) >= 2 and len(parts[1]) >= 3:
1232
+ result.extend([parts[0], 'ال' + parts[1]])
1233
+ continue
1234
+ if len(word) >= 8:
1235
+ split_found = False
1236
+ for split_pos in [2, 3]:
1237
+ prefix = word[:split_pos]
1238
+ suffix = word[split_pos:]
1239
+ if prefix in ['في', 'من', 'على', 'عن', 'مع', 'كل', 'ب', 'ل', 'ك']:
1240
+ result.extend([prefix, suffix])
1241
+ split_found = True
1242
+ break
1243
+ if not split_found:
1244
+ result.append(word)
1245
+ else:
1246
+ result.append(word)
1247
+ return ' '.join(result)
1248
+
1249
+ def _normalize_tanween_patterns(self, text: str) -> str:
1250
+ text = re.sub(r'([ا-ي]{2,})أ\b', r'\1اً', text)
1251
+ text = re.sub(r'\s+أ\s+', ' ', text)
1252
+ text = re.sub(r'\b([بلك])\s+([ا-ي])', r'\1\2', text)
1253
+ return text
1254
+
1255
+ def preprocess(self, text: str) -> str:
1256
+ """Preprocessing pipeline"""
1257
+ text = self.postprocessor.remove_harakat(text)
1258
+ text = self.postprocessor.remove_tatweel(text)
1259
+ text = self.postprocessor.normalize_special_chars(text)
1260
+ text = self._fix_repeated_end_chars(text)
1261
+ text = self._fix_merged_with_errors(text)
1262
+ text = self._split_merged_words_linguistic(text)
1263
+ text = self._split_long_words_heuristic(text)
1264
+ text = self._normalize_tanween_patterns(text)
1265
+ text = self.postprocessor.merge_separated_al(text)
1266
+ text = self.postprocessor.unified_collapse_repeated(text)
1267
+ text = self.rules.fix_char_substitution(text)
1268
+ text = self.rules.fix_char_repetition(text)
1269
+ text = self.postprocessor.normalize_spaces(text)
1270
+ return text
1271
+
1272
+ def postprocess(self, text: str, original: str = "") -> str:
1273
+ """Postprocessing pipeline"""
1274
+ return self.postprocessor.full_postprocess(text, original, vocab_manager=self.vocab_manager)
1275
+
1276
+ def model_inference(self, text: str, num_return_sequences: int = 5) -> List[str]:
1277
+ """Run seq2seq model inference and return top candidates."""
1278
+ inputs = self.tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=128)
1279
+ inputs = {k: v.to(self.device) for k, v in inputs.items()}
1280
+ with torch.no_grad():
1281
+ outputs = self.model.generate(
1282
+ **inputs,
1283
+ num_beams=5,
1284
+ num_return_sequences=num_return_sequences,
1285
+ early_stopping=True,
1286
+ return_dict_in_generate=True,
1287
+ output_scores=True
1288
+ )
1289
+ candidates = self.tokenizer.batch_decode(outputs.sequences, skip_special_tokens=True)
1290
+ self._last_beam_scores = {}
1291
+ if hasattr(outputs, 'sequences_scores') and outputs.sequences_scores is not None:
1292
+ scores = outputs.sequences_scores.tolist()
1293
+ for cand, score in zip(candidates, scores):
1294
+ self._last_beam_scores[cand] = score
1295
+ return candidates
1296
+
1297
    def correct(self, text: str) -> str:
        """
        Main correction pipeline (RERANKING APPROACH)

        Steps:
        1. Preprocess
        2. Generate Candidates (Model Beams + Baseline)
        3. Rerank Candidates (Validator + Fluency)
        4. Select Best
        5. Postprocess

        After postprocessing, further guards run in this order: an IV-safe
        check on postprocessing changes, optional contextual refinement,
        fragment merging, an output-stability test, word-level comparison
        against the raw top model beam, a safety net that may fall back to
        the raw beam, and a final hamza / ta-marbuta pass.

        Empty or whitespace-only input is returned unchanged.  Failures of
        model inference are logged and the pipeline continues with the
        remaining candidates.
        """
        if not text or not text.strip():
            return text

        original = text

        # 1. Preprocess
        preprocessed_text = self.preprocess(text)

        # 2. Classify error type
        error_type = self.classifier.classify(preprocessed_text)

        # 3. Generate Candidates
        # The preprocessed text itself is always a candidate (the baseline).
        candidates = []
        candidates.append(preprocessed_text)

        # Note: the rule-based and edit-distance candidates are built from
        # the raw input, not from the preprocessed text.
        rules_candidate = self.rules.advanced_heuristic_repair(text)
        candidates.append(rules_candidate)

        edit_candidate = self.edit_corrector.generate_candidate(text)
        if edit_candidate != text and edit_candidate != rules_candidate:
            candidates.append(edit_candidate)

        # Top beam of the model; stays None when inference fails.
        raw_model_output = None
        try:
            model_candidates = self.model_inference(preprocessed_text, num_return_sequences=5)
            raw_model_output = model_candidates[0] if model_candidates else None
            candidates.extend(model_candidates)

            if model_candidates:
                # Word-level hybrids of the input with the first three beams.
                hybrid_candidate = self.word_aligner.align_words(preprocessed_text, model_candidates[0])
                if hybrid_candidate not in candidates:
                    candidates.append(hybrid_candidate)
                for beam in model_candidates[1:3]:
                    hybrid_beam = self.word_aligner.align_words(preprocessed_text, beam)
                    if hybrid_beam not in candidates:
                        candidates.append(hybrid_beam)

            if model_candidates and len(model_candidates) >= 3:
                try:
                    # Per-position majority vote across all beams.
                    beam_word_lists = [c.split() for c in model_candidates]
                    max_words = max(len(wl) for wl in beam_word_lists)
                    voted_words = []
                    for pos in range(max_words):
                        words_at_pos = []
                        for wl in beam_word_lists:
                            if pos < len(wl):
                                words_at_pos.append(wl[pos])
                        if words_at_pos:
                            most_common = Counter(words_at_pos).most_common(1)[0][0]
                            voted_words.append(most_common)
                    voted_candidate = ' '.join(voted_words)
                    if voted_candidate not in candidates:
                        candidates.append(voted_candidate)
                except Exception:
                    pass
        except Exception as e:
            logger.warning(f"Model inference failed: {e}")

        # Remove duplicates
        unique_candidates = []
        seen = set()
        for c in candidates:
            if c not in seen:
                unique_candidates.append(c)
                seen.add(c)
        candidates = unique_candidates

        # 4. Rerank Candidates
        best_candidate = preprocessed_text
        best_score = -1.0
        candidate_scores = []

        for cand in candidates:
            is_valid, reason = self.validator.validate(original, cand, error_type.value)
            if len(cand) < len(original) * 0.5:
                is_valid = False
                reason = "too_short"

            input_oov_count = self.vocab_manager.count_oov_words(original)
            cand_oov_count = self.vocab_manager.count_oov_words(cand)
            vocab_boost = 1.0

            # Reward candidates that remove OOV words, penalise ones that add them.
            if input_oov_count > 0 and cand_oov_count < input_oov_count:
                oov_reduction = input_oov_count - cand_oov_count
                vocab_boost = 1.0 + (oov_reduction * 0.3)
                if cand_oov_count == 0 and self.vocab_manager.all_words_iv(cand):
                    if not is_valid and reason not in ["empty_output"]:
                        is_valid = True
                        reason = "vocab_aware_accept"
            elif cand_oov_count > input_oov_count:
                vocab_boost = 0.5
            elif input_oov_count == 0 and cand_oov_count == 0:
                vocab_boost = 1.0

            # Invalid candidates are not dropped, only scaled down heavily.
            validity_factor = 1.0 if is_valid else 0.001

            fluency_score = 0.0
            if self.use_contextual and self.contextual:
                try:
                    fluency_score = self.contextual.calculate_sentence_score(cand)
                except Exception as e:
                    logger.warning(f"Scoring failed: {e}")
                    fluency_score = 0.5
            else:
                fluency_score = 1.0

            # Character-level similarity to the preprocessed input.
            dist = VocabularyManager.damerau_levenshtein_distance(preprocessed_text, cand)
            max_len = max(len(preprocessed_text), len(cand), 1)
            similarity = 1.0 - (dist / max_len)
            if cand == preprocessed_text:
                similarity = 1.0

            # Small multiplicative bonus per same-position character change
            # that is a keyboard-neighbour substitution.
            keyboard_bonus = 1.0
            input_words = preprocessed_text.split()
            cand_words = cand.split()
            if len(input_words) == len(cand_words):
                for iw, cw in zip(input_words, cand_words):
                    if iw != cw and len(iw) == len(cw):
                        for ic, cc in zip(iw, cw):
                            if ic != cc and RulesBasedCorrector.is_keyboard_neighbor(ic, cc):
                                keyboard_bonus *= 1.05

            # A very fluent, fully in-vocabulary candidate may override
            # some validator rejections.
            if fluency_score > 0.85 and cand_oov_count == 0:
                if not is_valid and reason in ["too_short", "low_character_similarity", "word_count_mismatch"]:
                    if len(cand) >= len(original) * 0.4:
                        is_valid = True
                        reason = "high_confidence_override"
                        vocab_boost *= 1.2
                        validity_factor = 1.0

            fluency_exp = 0.3
            similarity_exp = 3.0
            beam_boost = 1.0
            if raw_model_output and cand == raw_model_output:
                beam_boost = 1.15

            final_score = (fluency_score ** fluency_exp) * (similarity ** similarity_exp) * validity_factor * vocab_boost * keyboard_bonus * beam_boost

            candidate_scores.append({
                'text': cand, 'is_valid': is_valid, 'reason': reason,
                'fluency': fluency_score, 'similarity': similarity,
                'vocab_boost': vocab_boost, 'input_oov': input_oov_count,
                'cand_oov': cand_oov_count, 'final_score': final_score
            })

            if final_score > best_score:
                best_score = final_score
                best_candidate = cand

        # Output Quality Scoring
        # If the winner beats the baseline by less than 5% and has more OOV
        # words than it, keep the baseline.
        if best_candidate != preprocessed_text:
            preprocessed_score = 0.0
            for cs in candidate_scores:
                if cs['text'] == preprocessed_text:
                    preprocessed_score = cs['final_score']
                    break
            if preprocessed_score > 0 and best_score < preprocessed_score * 1.05:
                best_oov = self.vocab_manager.count_oov_words(best_candidate)
                prep_oov = self.vocab_manager.count_oov_words(preprocessed_text)
                if best_oov > prep_oov:
                    best_candidate = preprocessed_text
                    best_score = preprocessed_score

        # Contextual Validation Layer
        # Revert to the baseline when it is clearly more fluent (x1.5) and
        # has no more OOV words than the winner.
        if best_candidate != preprocessed_text and self.use_contextual and self.contextual:
            try:
                input_fluency = self.contextual.calculate_sentence_score(preprocessed_text)
                best_fluency = 0.0
                for cs in candidate_scores:
                    if cs['text'] == best_candidate:
                        best_fluency = cs['fluency']
                        break
                if input_fluency > 0 and best_fluency > 0:
                    if input_fluency > best_fluency * 1.5:
                        input_oov = self.vocab_manager.count_oov_words(preprocessed_text)
                        best_oov = self.vocab_manager.count_oov_words(best_candidate)
                        if input_oov <= best_oov:
                            best_candidate = preprocessed_text
            except Exception:
                pass

        # 5. Postprocess Winner
        result = self.postprocess(best_candidate, original)

        # IV-Safe Postprocessing Check
        # Undo any postprocessing edit that turned an in-vocabulary word
        # into an OOV one (only when the word count is unchanged).
        if result != best_candidate:
            result_words = result.split()
            best_words = best_candidate.split()
            if len(result_words) == len(best_words):
                fixed_words = []
                for idx_fw, (rw, bw) in enumerate(zip(result_words, best_words)):
                    if rw != bw:
                        bw_iv = self.vocab_manager.is_iv(bw)
                        rw_iv = self.vocab_manager.is_iv(rw)
                        if bw_iv and not rw_iv:
                            fixed_words.append(bw)
                        else:
                            fixed_words.append(rw)
                    else:
                        fixed_words.append(rw)
                result = ' '.join(fixed_words)

        # 6. Contextual fine-tuning
        if self.use_contextual and self.contextual:
            if len(result) > 3:
                result = self.contextual.refine_sentence_with_mask(
                    result, vocab_manager=self.vocab_manager,
                    raw_model_output=raw_model_output
                )

        # 7. Safe Split/Merge Post-processing
        result = self.split_merge.merge_fragments(result)

        # 8. Output Stability Test
        # Re-preprocess the result; if that changes more than 15% of its
        # characters and the raw beam is more stable with no more OOV
        # words, use the raw beam instead.
        if result != preprocessed_text and raw_model_output:
            try:
                re_preprocessed = self.preprocess(result)
                stability_dist = VocabularyManager.damerau_levenshtein_distance(result, re_preprocessed)
                result_len = max(len(result), 1)
                if stability_dist > 0:
                    stability_ratio = stability_dist / result_len
                    if stability_ratio > 0.15:
                        raw_re = self.preprocess(raw_model_output)
                        raw_stability = VocabularyManager.damerau_levenshtein_distance(
                            raw_model_output, raw_re
                        ) / max(len(raw_model_output), 1)
                        if raw_stability < stability_ratio:
                            raw_oov = self.vocab_manager.count_oov_words(raw_model_output)
                            our_oov = self.vocab_manager.count_oov_words(result)
                            if raw_oov <= our_oov:
                                result = raw_model_output
            except Exception:
                pass

        # 9. Bidirectional Word-Level Validation
        # Compare the result with the raw beam word by word (equal word
        # counts only) and take the raw word when ours is OOV and the raw
        # one is not, or when both are known and the raw one is closer to
        # the input word at that position.
        if raw_model_output and result != raw_model_output:
            result_words = result.split()
            raw_words = raw_model_output.split()
            if len(result_words) == len(raw_words):
                corrected_words = []
                changed = False
                for rw, raw_w in zip(result_words, raw_words):
                    if rw != raw_w:
                        rw_iv = self.vocab_manager.is_iv(rw)
                        raw_iv = self.vocab_manager.is_iv(raw_w)
                        if not rw_iv and raw_iv:
                            corrected_words.append(raw_w)
                            changed = True
                        elif rw_iv and raw_iv:
                            input_words_list = preprocessed_text.split()
                            # Exactly one word is appended per iteration, so
                            # this equals the current word position.
                            idx = len(corrected_words)
                            if idx < len(input_words_list):
                                input_w = input_words_list[idx]
                                rw_dist = Levenshtein.distance(input_w, rw)
                                raw_dist = Levenshtein.distance(input_w, raw_w)
                                if raw_dist < rw_dist:
                                    corrected_words.append(raw_w)
                                    changed = True
                                else:
                                    corrected_words.append(rw)
                            else:
                                corrected_words.append(rw)
                        else:
                            corrected_words.append(rw)
                    else:
                        corrected_words.append(rw)
                if changed:
                    new_result = ' '.join(corrected_words)
                    new_oov = self.vocab_manager.count_oov_words(new_result)
                    old_oov = self.vocab_manager.count_oov_words(result)
                    if new_oov <= old_oov:
                        result = new_result

        # 10. SAFETY NET
        if raw_model_output and raw_model_output != result:
            raw_oov = self.vocab_manager.count_oov_words(raw_model_output)
            our_oov = self.vocab_manager.count_oov_words(result)
            if raw_oov == 0 and our_oov > 0:
                is_valid, reason = self.validator.validate(original, raw_model_output, "mixed")
                if is_valid or reason == "space_leniency_accept":
                    result = raw_model_output
            elif raw_oov == 0 and our_oov == 0:
                raw_dist = VocabularyManager.damerau_levenshtein_distance(original, raw_model_output)
                our_dist = VocabularyManager.damerau_levenshtein_distance(original, result)
                result_vs_raw_dist = VocabularyManager.damerau_levenshtein_distance(result, raw_model_output)
                if raw_dist < our_dist and result_vs_raw_dist <= 3:
                    raw_valid, _ = self.validator.validate(original, raw_model_output, "mixed")
                    if raw_valid:
                        result = raw_model_output
            # NOTE(review): if this branch belongs to the outer chain and
            # count_oov_words never returns a negative number, the two
            # branches above already cover every raw_oov == 0 case and this
            # one cannot run — confirm the intended nesting.
            elif raw_oov == 0:
                raw_wc = len(raw_model_output.split())
                our_wc = len(result.split())
                if raw_wc != our_wc:
                    raw_dist = VocabularyManager.damerau_levenshtein_distance(original, raw_model_output)
                    our_dist = VocabularyManager.damerau_levenshtein_distance(original, result)
                    if raw_dist < our_dist:
                        raw_valid, _ = self.validator.validate(original, raw_model_output, "mixed")
                        if raw_valid:
                            result = raw_model_output
        # ── FINAL PASS: Hamza whitelist + Ta Marbuta fixes (unrevertable) ──
        # These are applied AFTER all validation/safety steps so they can't
        # be undone by Steps 8-10 which compare against raw_model_output.
        # The root issue: Steps 8-10 use edit distance to INPUT (which has errors)
        # so they revert corrections back to the erroneous form.
        result = AraSpellPostProcessor.fix_common_hamza(result)
        result = AraSpellPostProcessor.fix_ha_ta_marbuta(result, vocab_manager=self.vocab_manager)

        return result
src/nlp/punctuation/spelling/araspell_service.py ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ AraSpell Service — Lazy-loaded Arabic spelling correction.
3
+
4
+ Model is loaded on first request and kept in memory.
5
+ Pre-downloaded during Docker build; loaded from HF cache at runtime (no network needed).
6
+ """
7
+
8
+ import os
9
+ import logging
10
+ import time
11
+ import torch
12
+
13
# Module-scoped logger, named after this module's import path.
logger = logging.getLogger(__name__)

# -- Identifiers passed to the Hugging Face loaders in get_spelling_model --
MODEL_REPO = "bayan10/AraSpell-Model"
MODEL_FILENAME = "last_model.pt"
TOKENIZER_NAME = "aubmindlab/bert-base-arabertv02"

# -- Lazy-loaded singleton state (written by get_spelling_model) --
_spell_checker = None  # cached checker instance once loading has succeeded
_load_error = None  # message of a failed load attempt, if one happened
23
+
24
+
25
def get_spelling_model():
    """
    Return the shared ArabicSpellChecker, building it on the first call.

    The first call resolves the checkpoint with ``hf_hub_download``, builds an
    AraBERT-based ``EncoderDecoderModel``, loads the trained weights into it and
    wraps model + tokenizer in an ``ArabicSpellChecker``. The instance is cached
    in the module-level ``_spell_checker`` and returned directly afterwards.

    A failed load is remembered in ``_load_error``; later calls then raise
    immediately instead of retrying the load.

    Returns:
        The cached ``ArabicSpellChecker`` instance.

    Raises:
        RuntimeError: If loading fails on this call (chained to the underlying
            exception) or already failed on an earlier call.
    """
    global _spell_checker, _load_error

    if _spell_checker is not None:
        return _spell_checker

    if _load_error is not None:
        raise RuntimeError(f"Spelling model previously failed to load: {_load_error}")

    try:
        # perf_counter is monotonic, so the elapsed time cannot be skewed by
        # wall-clock adjustments during the load.
        started = time.perf_counter()
        logger.info("Loading AraSpell spelling model (lazy init)...")

        # Heavy third-party imports are deferred until the model is needed.
        from huggingface_hub import hf_hub_download
        from transformers import AutoTokenizer, EncoderDecoderModel

        # 1. Download checkpoint (from HF cache — pre-downloaded in Docker build)
        logger.info("Resolving checkpoint: %s/%s", MODEL_REPO, MODEL_FILENAME)
        model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILENAME)
        logger.info("Checkpoint path: %s", model_path)

        # 2. Load tokenizer
        logger.info("Loading tokenizer: %s", TOKENIZER_NAME)
        tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)

        # 3. Build encoder-decoder model from AraBERT (same checkpoint name for
        #    both the encoder and the decoder side)
        logger.info("Building EncoderDecoderModel from AraBERT...")
        model = EncoderDecoderModel.from_encoder_decoder_pretrained(
            TOKENIZER_NAME, TOKENIZER_NAME
        )

        # 4. Configure generation: CLS starts decoding, SEP ends it. The same
        #    ids are mirrored onto generation_config.
        model.config.decoder_start_token_id = tokenizer.cls_token_id
        model.config.pad_token_id = tokenizer.pad_token_id
        model.config.eos_token_id = tokenizer.sep_token_id
        model.generation_config.max_length = 128
        model.generation_config.decoder_start_token_id = tokenizer.cls_token_id
        model.generation_config.pad_token_id = tokenizer.pad_token_id
        model.generation_config.eos_token_id = tokenizer.sep_token_id

        # 5. Load trained weights
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        logger.info("Loading checkpoint weights on %s...", device)
        # NOTE(review): weights_only=False unpickles arbitrary objects from the
        # checkpoint file; this is only acceptable while MODEL_REPO is trusted.
        checkpoint = torch.load(model_path, map_location=device, weights_only=False)
        load_report = model.load_state_dict(checkpoint['model_state_dict'], strict=False)
        # strict=False tolerates key mismatches silently; surface them so a
        # checkpoint that does not match the architecture is visible in logs.
        if load_report.missing_keys or load_report.unexpected_keys:
            logger.warning(
                "Checkpoint/model key mismatch: %d missing, %d unexpected",
                len(load_report.missing_keys),
                len(load_report.unexpected_keys),
            )
        model = model.to(device)
        model.eval()

        epoch = checkpoint.get('epoch', 'N/A')
        logger.info("Spelling model loaded on %s, epoch: %s", device, epoch)

        # 6. Initialize the spell checker pipeline (contextual=True for MLM-based refinement)
        from nlp.spelling.araspell_rules import ArabicSpellChecker
        _spell_checker = ArabicSpellChecker(
            model, tokenizer, device, use_contextual=True
        )

        elapsed = time.perf_counter() - started
        logger.info("AraSpell ready in %.1fs", elapsed)
        return _spell_checker

    except Exception as e:
        # str(e) can be empty for message-less exceptions; fall back to repr so
        # get_load_error() never reports a failed load as "no error".
        _load_error = str(e) or repr(e)
        # logger.exception records the message together with the traceback.
        logger.exception("Failed to load spelling model: %s", e)
        raise RuntimeError(f"Spelling model load failed: {e}") from e
96
+
97
+
98
def is_loaded() -> bool:
    """Report whether the spell checker singleton has been created."""
    checker_ready = _spell_checker is not None
    return checker_ready
101
+
102
+
103
def get_load_error() -> str:
    """Return the recorded load-failure message, or "" when none is recorded."""
    if _load_error:
        return _load_error
    return ""
tests/phase10/reports/collision_benchmark_results.json ADDED
The diff for this file is too large to render. See raw diff
 
tests/phase10/reports/phase10_results.json CHANGED
The diff for this file is too large to render. See raw diff