Spaces:

bayan10
/

bayan-api

Running

App Files Files Community

youssefreda9 committed on 2 days ago

Commit

70184c4

1 Parent(s): 16ae935

UI/UX: Add Quran standalone tab, summarize textarea, floating selection bar, fix clear editor, remove duplicate button

Browse files

Files changed (9) hide show

analyze_failures.py +67 -0
src/css/components.css +91 -0
src/index.html +289 -17
src/js/editor.js +1 -2
src/nlp/punctuation/spelling/__init__.py +1 -0
src/nlp/punctuation/spelling/araspell_rules.py +1615 -0
src/nlp/punctuation/spelling/araspell_service.py +105 -0
tests/phase10/reports/collision_benchmark_results.json +0 -0
tests/phase10/reports/phase10_results.json +0 -0

analyze_failures.py ADDED Viewed

	@@ -0,0 +1,67 @@

+"""Analyze remaining 24 failures after Layer 1/2/3 fixes."""
+import json, re
+with open('tests/phase10/reports/collision_benchmark_results.json', 'r', encoding='utf-8') as f:
+    data = json.load(f)
+def norm(t):
+    t = re.sub(r'[\u064B-\u065F\u0670]', '', t)
+    t = t.rstrip('.،؛؟!?!')
+    return re.sub(r'\s+', ' ', t).strip()
+categories = {}
+for r in data['results']:
+    if r['pipeline_verdict'] != 'FN':
+        continue
+    rid = r['id']
+    exp = r['expected'].strip()
+    act = r['pipeline_output'].strip()
+    inp = r['input'].strip()
+    inp_w = inp.split()
+    exp_w = exp.split()
+    act_w = act.split()
+    issues = []
+    for i in range(min(len(exp_w), len(act_w))):
+        aw = act_w[i].rstrip('.،؛؟!?!')
+        ew = exp_w[i].rstrip('.،؛؟!?!')
+        iw = inp_w[i] if i < len(inp_w) else '—'
+        aw_n = re.sub(r'[\u064B-\u065F]', '', aw)
+        ew_n = re.sub(r'[\u064B-\u065F]', '', ew)
+        if aw_n == ew_n:
+            continue  # tanween/diacritic only diff
+        if aw != ew:
+            if iw == aw:
+                cause = "MODEL_MISS"
+            elif iw == ew:
+                cause = "CORRUPTED"
+            else:
+                cause = "WRONG_FIX"
+            issues.append(f"    [{i}] '{iw}'→'{aw}' (exp:'{ew}') {cause}")
+    if len(exp_w) != len(act_w):
+        issues.append(f"    word count: {len(act_w)} vs {len(exp_w)}")
+    # Classify
+    has_junk = any('وومن' in a or '.و' in a or 'ةل' in a for a in act_w)
+    has_trailing_و = any(a.endswith('و') and not e.endswith('و') and not e.endswith('وا')
+                         for a, e in zip(act_w, exp_w) if a != e)
+    cat = r['category']
+    print(f"\n{rid} [{cat}]")
+    print(f"  IN:  {inp[:60]}")
+    print(f"  EXP: {exp[:60]}")
+    print(f"  ACT: {act[:60]}")
+    for iss in issues:
+        print(iss)
+    if has_junk:
+        print("  >>> TRAILING JUNK")
+# Summary of what each failure needs
+print("\n" + "="*60)
+print("FIXABILITY ANALYSIS")
+print("="*60)
+print(f"\nTotal failures: 24")
+print(f"Need: 17 more passes to reach 85% (43/50)")

src/css/components.css CHANGED Viewed

@@ -3433,3 +3433,94 @@ select:focus-visible,
   outline-offset: 2px;
 }

   outline-offset: 2px;
 }
+/* ── Floating Selection Toolbar ── */
+/* top/left are set inline by script; translateX(-50%) centers the bar
+   horizontally on the `left` coordinate. */
+.selection-toolbar {
+  position: absolute;
+  z-index: 1100;
+  display: flex;
+  align-items: center;
+  gap: 2px;
+  padding: 4px 6px;
+  border-radius: 12px;
+  background: var(--color-surface-elevated);
+  border: 1px solid var(--color-border-strong);
+  box-shadow: 0 8px 32px rgba(0,0,0,0.18), 0 0 0 1px rgba(255,255,255,0.05);
+  /* Prefixed form first: older Safari releases only recognise -webkit-backdrop-filter. */
+  -webkit-backdrop-filter: blur(16px);
+  backdrop-filter: blur(16px);
+  transform: translateX(-50%);
+  animation: selbar-in 0.2s ease;
+  pointer-events: auto;
+}
+.selection-toolbar.is-hidden {
+  display: none;
+}
+/* Fade/slide-in; keeps the translateX(-50%) centering in both keyframes. */
+@keyframes selbar-in {
+  from { opacity: 0; transform: translateX(-50%) translateY(6px); }
+  to   { opacity: 1; transform: translateX(-50%) translateY(0); }
+}
+.sel-tool-btn {
+  display: inline-flex;
+  align-items: center;
+  gap: 5px;
+  padding: 6px 12px;
+  border: none;
+  border-radius: 8px;
+  background: transparent;
+  color: var(--color-text-secondary);
+  font-family: inherit;
+  font-size: 12px;
+  font-weight: 600;
+  cursor: pointer;
+  /* Only the two properties changed by :hover are animated. */
+  transition: background 0.15s ease, color 0.15s ease;
+  white-space: nowrap;
+}
+.sel-tool-btn:hover {
+  background: var(--color-surface);
+  color: var(--color-text-primary);
+}
+/* Thin vertical divider between toolbar buttons. */
+.sel-tool-sep {
+  width: 1px;
+  height: 20px;
+  background: var(--color-border);
+  flex-shrink: 0;
+}
+/* ── Summary Source Toggle ── */
+/* Two-button segmented control; the selected button carries `.active`. */
+.summary-source-toggle {
+  display: flex;
+  gap: 0;
+  border-radius: 10px;
+  background: var(--color-surface-elevated);
+  border: 1px solid var(--color-border);
+  padding: 3px;
+  overflow: hidden;
+}
+.summary-source-btn {
+  flex: 1;
+  padding: 8px 16px;
+  border: none;
+  border-radius: 8px;
+  background: transparent;
+  color: var(--color-text-secondary);
+  font-family: inherit;
+  font-size: 13px;
+  font-weight: 600;
+  cursor: pointer;
+  /* Limited to the properties that :hover / .active actually change. */
+  transition: background 0.2s ease, color 0.2s ease, box-shadow 0.2s ease;
+}
+.summary-source-btn:hover {
+  color: var(--color-text-primary);
+}
+.summary-source-btn.active {
+  background: linear-gradient(135deg, var(--color-primary), var(--color-secondary));
+  color: var(--color-text-inverse);
+  box-shadow: 0 2px 8px rgba(107, 163, 224, 0.25);
+}
+/* ── Editor Tab Icons ── */
+/* Icons are dimmed on inactive tabs and fully opaque on the active one. */
+.editor-tab svg {
+  margin-left: 4px;
+  opacity: 0.7;
+}
+.editor-tab.active svg {
+  opacity: 1;
+}

src/index.html CHANGED Viewed

@@ -668,6 +668,7 @@
             <button id="write-tab" onclick="switchTab('write')" class="editor-tab active" type="button">كتابة</button>
             <button id="summarize-tab" onclick="switchTab('summarize')" class="editor-tab" type="button">تلخيص</button>
              <button id="dialect-tab" onclick="switchTab('dialect')" class="editor-tab" type="button">تحويل للفصحى</button>
             <button id="docs-sidebar-toggle" class="docs-sidebar-toggle-mobile btn-ghost lg:hidden" type="button" aria-label="مستنداتي" aria-expanded="false" aria-controls="docs-sidebar">
              <svg width="18" height="18" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M3 7v10a2 2 0 002 2h14a2 2 0 002-2V9a2 2 0 00-2-2h-6l-2-2H5a2 2 0 00-2 2z"/></svg>
             </button>
@@ -679,7 +680,7 @@
             </div>
             <span id="auto-save-status" class="text-xs text-secondary" style="opacity:0;transition:opacity 0.3s;"></span>
             <div class="window-dots" aria-hidden="true">
-             <span class="dot dot--red" title="مسح المحرر" onclick="showConfirmDialog('مسح المحرر','هل تريد مسح جميع محتويات المحرر؟ لا يمكن التراجع عن هذا.',function(){document.getElementById('editor-surface').innerHTML='';if(typeof updatePlaceholder==='function')updatePlaceholder();if(typeof updateEditorStats==='function')updateEditorStats()})" style="cursor:pointer;"></span>
              <span class="dot dot--yellow" title="طي لوحة الاقتراحات" onclick="document.querySelector('.sidebar-desktop')?.classList.toggle('collapsed')" style="cursor:pointer;"></span>
              <span class="dot dot--green" title="توسيع المحرر للعرض الكامل" onclick="document.querySelector('.editor-layout')?.classList.toggle('editor-fullscreen')" style="cursor:pointer;"></span>
             </div>
@@ -804,8 +805,25 @@
            <p class="popover-hint">اختر التصحيح المناسب · Escape للإغلاق</p>
           </div>
          </div>
-         <div id="summarize-area" class="summarize-panel is-hidden">
-          <!-- Item 11: Mode Toggle -->
           <div class="summary-mode-toggle">
            <button type="button" class="summary-mode-btn active" id="summary-mode-paragraph" onclick="setSummaryMode('paragraph')">
             <svg width="14" height="14" fill="currentColor" viewBox="0 0 24 24"><path d="M3 5h18v2H3V5zm0 8h18v2H3v-2zm0 4h12v2H3v-2z"/></svg>
@@ -875,7 +893,55 @@
            </div>
            <div id="dialect-result" class="text-right text-lg editor-content" dir="rtl" style="line-height: 2;"></div>
           </div>
-         </div>
          <div class="editor-footer">
           <div class="editor-stats" role="status" aria-label="إحصائيات">
            <div class="flex items-center gap-2"><span class="stat-dot stat-dot--spelling" aria-hidden="true"></span><span class="text-sm text-secondary"><span id="spelling-count">٠</span> إملائي</span></div>
@@ -1213,18 +1279,201 @@
     function switchTab(tab) {
-      const writeTab = document.getElementById('write-tab');
-      const summarizeTab = document.getElementById('summarize-tab');
-      const dialectTab = document.getElementById('dialect-tab');
-      const writeArea = document.getElementById('write-area');
-      const summarizeArea = document.getElementById('summarize-area');
-      const dialectArea = document.getElementById('dialect-area');
       const formatToolbar = document.getElementById('format-toolbar');
-      [writeTab, summarizeTab, dialectTab].forEach(function(t){if(t)t.classList.remove('active');});
-      [writeArea, summarizeArea, dialectArea].forEach(function(a){if(a)a.classList.add('is-hidden');});
-      if (tab === 'write') { writeTab.classList.add('active'); writeArea.classList.remove('is-hidden'); if(formatToolbar)formatToolbar.style.display=''; }
-      else if (tab === 'summarize') { summarizeTab.classList.add('active'); summarizeArea.classList.remove('is-hidden'); if(formatToolbar)formatToolbar.style.display='none'; }
-      else if (tab === 'dialect') { dialectTab.classList.add('active'); dialectArea.classList.remove('is-hidden'); if(formatToolbar)formatToolbar.style.display='none'; }
     }
     let _dialectResult = '';
     async function convertDialect() {
@@ -1533,11 +1782,17 @@
     }
     async function generateSummary(event) {
-      const text = (typeof getEditorText === 'function' ? getEditorText() : '').trim();
       if (!text) {
         const summaryText = document.getElementById('summary-text');
-        summaryText.innerHTML = '<p class="text-secondary text-center">الرجاء كتابة نص في المحرر أولاً</p>';
         document.getElementById('summary-preview').classList.add('show');
         return;
       }
@@ -1946,5 +2201,22 @@
     </div>
    </div>
   </div>
  </body>
 </html>

             <button id="write-tab" onclick="switchTab('write')" class="editor-tab active" type="button">كتابة</button>
             <button id="summarize-tab" onclick="switchTab('summarize')" class="editor-tab" type="button">تلخيص</button>
              <button id="dialect-tab" onclick="switchTab('dialect')" class="editor-tab" type="button">تحويل للفصحى</button>
+            <button id="quran-tab" onclick="switchTab('quran')" class="editor-tab" type="button">القرآن</button>
             <button id="docs-sidebar-toggle" class="docs-sidebar-toggle-mobile btn-ghost lg:hidden" type="button" aria-label="مستنداتي" aria-expanded="false" aria-controls="docs-sidebar">
              <svg width="18" height="18" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M3 7v10a2 2 0 002 2h14a2 2 0 002-2V9a2 2 0 00-2-2h-6l-2-2H5a2 2 0 00-2 2z"/></svg>
             </button>
             </div>
             <span id="auto-save-status" class="text-xs text-secondary" style="opacity:0;transition:opacity 0.3s;"></span>
             <div class="window-dots" aria-hidden="true">
+             <span class="dot dot--red" title="مسح المحرر" onclick="if(typeof clearEditor==='function'){clearEditor();}" style="cursor:pointer;"></span>
              <span class="dot dot--yellow" title="طي لوحة الاقتراحات" onclick="document.querySelector('.sidebar-desktop')?.classList.toggle('collapsed')" style="cursor:pointer;"></span>
              <span class="dot dot--green" title="توسيع المحرر للعرض الكامل" onclick="document.querySelector('.editor-layout')?.classList.toggle('editor-fullscreen')" style="cursor:pointer;"></span>
             </div>
            <p class="popover-hint">اختر التصحيح المناسب · Escape للإغلاق</p>
           </div>
          </div>
+          <div id="summarize-area" class="summarize-panel is-hidden">
+           <!-- Source Toggle: Editor text vs Custom input -->
+           <div class="mb-4">
+            <div class="flex items-center gap-2 mb-3">
+             <svg width="18" height="18" fill="none" stroke="currentColor" viewBox="0 0 24 24" style="color: var(--color-primary);"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M9 12h6m-6 4h6m2 5H7a2 2 0 01-2-2V5a2 2 0 012-2h5.586a1 1 0 01.707.293l5.414 5.414a1 1 0 01.293.707V19a2 2 0 01-2 2z"/></svg>
+             <span class="text-base font-bold">تلخيص النصوص</span>
+            </div>
+            <div class="summary-source-toggle mb-3">
+             <button type="button" class="summary-source-btn active" id="summary-src-editor" onclick="setSummarySource('editor')">نص المحرر</button>
+             <button type="button" class="summary-source-btn" id="summary-src-custom" onclick="setSummarySource('custom')">نص مخصص</button>
+            </div>
+            <div id="summary-custom-input-wrap" class="is-hidden">
+             <!-- aria-label gives the textarea an accessible name (it has no <label>). -->
+             <textarea id="summary-custom-input" class="w-full p-4 rounded-xl text-right text-lg editor-content" dir="rtl" rows="6" aria-label="النص المراد تلخيصه" placeholder="الصق أو اكتب النص الذي تريد تلخيصه هنا..." style="background: var(--color-surface); border: 1px solid var(--color-border); color: var(--color-text-primary); resize: vertical; font-family: inherit;"></textarea>
+             <div class="flex items-center justify-between mt-1 mb-2" dir="rtl">
+              <!-- Uses the same --color-* custom property as the rest of this panel. -->
+              <span id="summary-char-count" class="text-xs" style="color: var(--color-text-secondary);">٠ حرف</span>
+             </div>
+            </div>
+           </div>
+           <!-- Item 11: Mode Toggle -->
           <div class="summary-mode-toggle">
            <button type="button" class="summary-mode-btn active" id="summary-mode-paragraph" onclick="setSummaryMode('paragraph')">
             <svg width="14" height="14" fill="currentColor" viewBox="0 0 24 24"><path d="M3 5h18v2H3V5zm0 8h18v2H3v-2zm0 4h12v2H3v-2z"/></svg>
            </div>
            <div id="dialect-result" class="text-right text-lg editor-content" dir="rtl" style="line-height: 2;"></div>
           </div>
+          </div>
+          <!-- Quran Standalone Panel -->
+          <div id="quran-area" class="summarize-panel is-hidden">
+           <div class="mb-4">
+            <div class="flex items-center gap-2 mb-3">
+             <svg width="18" height="18" fill="none" stroke="#06b6d4" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M12 6.253v13m0-13C10.832 5.477 9.246 5 7.5 5S4.168 5.477 3 6.253v13C4.168 18.477 5.754 18 7.5 18s3.332.477 4.5 1.253m0-13C13.168 5.477 14.754 5 16.5 5c1.747 0 3.332.477 4.5 1.253v13C19.832 18.477 18.247 18 16.5 18c-1.746 0-3.332.477-4.5 1.253"/></svg>
+             <span class="text-base font-bold">تدقيق النص القرآني</span>
+            </div>
+            <!-- Uses the same --color-* custom property as the rest of this panel. -->
+            <p class="text-sm mb-4" style="color: var(--color-text-secondary);">اكتب أو الصق نصًا قرآنيًا وسنعرض لك النص الصحيح بالتشكيل مع اسم السورة ورقم الآية، مع إمكانية ترجمته إلى ١٤ لغة.</p>
+           </div>
+           <textarea id="quran-input" class="w-full p-4 rounded-xl text-right text-lg" dir="rtl" rows="4" placeholder="اكتب الآية أو جزءًا منها هنا..." style="background: var(--color-surface); border: 1px solid var(--color-border); color: var(--color-text-primary); resize: vertical; font-family: 'Amiri Quran', 'Cairo', serif; font-size: 20px; line-height: 2;"></textarea>
+           <button id="quran-search-btn" onclick="searchQuranStandalone()" class="btn-primary w-full py-4 text-lg mt-4 mb-4" type="button">بحث وتدقيق</button>
+           <div id="quran-inline-result" class="is-hidden" style="background: var(--color-surface); border: 1px solid rgba(6,182,212,0.2); border-radius: 1rem; padding: 1.5rem;">
+            <div class="flex items-center justify-between mb-3">
+             <div class="text-sm font-bold" style="color:#06b6d4;">✓ النص القرآني المدقق</div>
+             <div class="flex items-center gap-2">
+              <button onclick="copyQuranInlineResult()" class="quran-copy-btn" type="button" title="نسخ">📋</button>
+              <button id="quran-inline-apply-btn" onclick="applyQuranInlineResult()" class="quran-apply-btn" type="button">تطبيق في المحرر ✓</button>
+             </div>
+            </div>
+            <p id="quran-inline-uthmani" class="quran-uthmani" style="font-size: 24px; line-height: 2.2; text-align: center;"></p>
+            <p id="quran-inline-reference" class="quran-reference text-center mt-2"></p>
+            <div class="mt-4 pt-4" style="border-top:1px solid var(--color-border);">
+             <div class="flex items-center gap-3 mb-3 flex-wrap">
+              <svg width="16" height="16" fill="none" stroke="#06b6d4" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M3 5h12M9 3v2m1.048 9.5A18.022 18.022 0 016.412 9m6.088 9h7M11 21l5-10 5 10M12.751 5C11.783 10.77 8.07 15.61 3 18.129"/></svg>
+              <span class="text-sm font-bold">ترجمة الآية</span>
+              <select id="quran-inline-lang" onchange="translateQuranInline()" class="quran-lang-select">
+               <option value="">— اختر لغة —</option>
+               <option value="english">🇬🇧 English</option>
+               <option value="french">🇫🇷 Français</option>
+               <option value="turkish">🇹🇷 Türkçe</option>
+               <option value="persian">🇮🇷 فارسی</option>
+               <option value="russian">🇷🇺 Русский</option>
+               <option value="spanish">🇪🇸 Español</option>
+               <option value="german">🇩🇪 Deutsch</option>
+               <option value="indonesian">🇮🇩 Indonesia</option>
+               <option value="malay">🇲🇾 Melayu</option>
+               <option value="bengali">🇧🇩 বাংলা</option>
+               <option value="bosnian">🇧🇦 Bosanski</option>
+               <option value="portuguese">🇵🇹 Português</option>
+               <option value="uzbek">🇺🇿 O'zbek</option>
+              </select>
+             </div>
+             <div id="quran-inline-translation" class="is-hidden p-4 rounded-xl" style="background:rgba(6,182,212,0.06); border:1px solid rgba(6,182,212,0.15);">
+              <p id="quran-inline-trans-text" style="font-size:18px; line-height:2; color:var(--color-text-primary); text-align:center;"></p>
+             </div>
+            </div>
+           </div>
+          </div>
          <div class="editor-footer">
           <div class="editor-stats" role="status" aria-label="إحصائيات">
            <div class="flex items-center gap-2"><span class="stat-dot stat-dot--spelling" aria-hidden="true"></span><span class="text-sm text-secondary"><span id="spelling-count">٠</span> إملائي</span></div>
     function switchTab(tab) {
+      const tabs = ['write', 'summarize', 'dialect', 'quran'];
       const formatToolbar = document.getElementById('format-toolbar');
+      tabs.forEach(function(t) {
+        var tabEl = document.getElementById(t + '-tab');
+        var areaEl = document.getElementById(t + '-area') || document.getElementById(t === 'write' ? 'write-area' : t + '-area');
+        if (tabEl) tabEl.classList.remove('active');
+        if (areaEl) areaEl.classList.add('is-hidden');
+      });
+      var activeTab = document.getElementById(tab + '-tab');
+      var activeArea = document.getElementById(tab === 'write' ? 'write-area' : tab + '-area');
+      if (activeTab) activeTab.classList.add('active');
+      if (activeArea) activeArea.classList.remove('is-hidden');
+      if (formatToolbar) formatToolbar.style.display = (tab === 'write') ? '' : 'none';
+    }
+    /* ═══════════════════════════════════════════
+       Summarize — Source Toggle (editor vs custom)
+       ═══════════════════════════════════════════ */
+    window._summarySource = 'editor';
+    function setSummarySource(src) {
+      window._summarySource = src;
+      var editorBtn = document.getElementById('summary-src-editor');
+      var customBtn = document.getElementById('summary-src-custom');
+      var customWrap = document.getElementById('summary-custom-input-wrap');
+      if (editorBtn) editorBtn.classList.toggle('active', src === 'editor');
+      if (customBtn) customBtn.classList.toggle('active', src === 'custom');
+      if (customWrap) customWrap.classList.toggle('is-hidden', src !== 'custom');
+    }
+    /* ═══════════════════════════════════════════
+       Floating Selection Toolbar
+       ═══════════════════════════════════════════ */
+    (function() {
+      var selBar = null;
+      var hideTimer = null;
+      function showSelectionBar() {
+        var sel = window.getSelection();
+        if (!sel || sel.isCollapsed || !sel.toString().trim()) { hideSelectionBar(); return; }
+        var editor = document.getElementById('editor-container');
+        if (!editor || !editor.contains(sel.anchorNode)) { hideSelectionBar(); return; }
+        if (!selBar) selBar = document.getElementById('selection-toolbar');
+        if (!selBar) return;
+        var range = sel.getRangeAt(0);
+        var rect = range.getBoundingClientRect();
+        selBar.style.top = (rect.top + window.scrollY - 48) + 'px';
+        selBar.style.left = (rect.left + rect.width / 2) + 'px';
+        selBar.classList.remove('is-hidden');
+      }
+      function hideSelectionBar() {
+        if (!selBar) selBar = document.getElementById('selection-toolbar');
+        if (selBar) selBar.classList.add('is-hidden');
+      }
+      document.addEventListener('selectionchange', function() {
+        clearTimeout(hideTimer);
+        hideTimer = setTimeout(function() {
+          var sel = window.getSelection();
+          if (sel && !sel.isCollapsed && sel.toString().trim().length > 2) {
+            var editor = document.getElementById('editor-container');
+            if (editor && editor.contains(sel.anchorNode)) { showSelectionBar(); return; }
+          }
+          hideSelectionBar();
+        }, 300);
+      });
+      document.addEventListener('mousedown', function(e) {
+        if (!selBar) selBar = document.getElementById('selection-toolbar');
+        if (selBar && !selBar.contains(e.target)) hideSelectionBar();
+      });
+    })();
+    function selectionToolAction(tool) {
+      var sel = window.getSelection();
+      var text = sel ? sel.toString().trim() : '';
+      if (!text) { if (typeof showToast === 'function') showToast('حدد نصًا أولاً', 'warning'); return; }
+      var selBar = document.getElementById('selection-toolbar');
+      if (selBar) selBar.classList.add('is-hidden');
+      if (tool === 'summarize') {
+        switchTab('summarize');
+        setSummarySource('custom');
+        var ta = document.getElementById('summary-custom-input');
+        if (ta) { ta.value = text; }
+      } else if (tool === 'dialect') {
+        switchTab('dialect');
+        var ta = document.getElementById('dialect-input');
+        if (ta) { ta.value = text; if (typeof updateDialectCharCount === 'function') updateDialectCharCount(); }
+      } else if (tool === 'quran') {
+        switchTab('quran');
+        var ta = document.getElementById('quran-input');
+        if (ta) ta.value = text;
+      }
+    }
+    /* ═══════════════════════════════════════════
+       Quran Standalone Panel Functions
+       ═══════════════════════════════════════════ */
+    let _quranInlineVerse = '';
+    let _quranInlineRef = '';
+    let _quranInlineQuery = '';
+    async function searchQuranStandalone() {
+      var input = document.getElementById('quran-input').value.trim();
+      if (!input) { if (typeof showToast === 'function') showToast('الرجاء كتابة نص قرآني أولاً', 'warning'); return; }
+      _quranInlineQuery = input;
+      var resultDiv = document.getElementById('quran-inline-result');
+      var uthmaniEl = document.getElementById('quran-inline-uthmani');
+      var refEl = document.getElementById('quran-inline-reference');
+      var searchBtn = document.getElementById('quran-search-btn');
+      uthmaniEl.innerHTML = '<span class="text-secondary">⏳ جاري البحث...</span>';
+      refEl.textContent = '';
+      resultDiv.classList.remove('is-hidden');
+      document.getElementById('quran-inline-translation').classList.add('is-hidden');
+      document.getElementById('quran-inline-lang').value = '';
+      if (searchBtn) { searchBtn.disabled = true; searchBtn.textContent = '⏳ جاري البحث...'; }
+      var _abortCtrl = new AbortController();
+      var _timeout = setTimeout(function(){ _abortCtrl.abort(); }, 30000);
+      try {
+        var res = await fetch('/api/quran', {
+          method: 'POST',
+          headers: { 'Content-Type': 'application/json' },
+          body: JSON.stringify({ text: input, language: 'تدقيق الايات' }),
+          signal: _abortCtrl.signal
+        });
+        var data = await res.json();
+        if (data.error) {
+          uthmaniEl.innerHTML = '<span class="text-secondary">' + data.error.replace(/&/g,'&amp;').replace(/</g,'&lt;').replace(/>/g,'&gt;') + '</span>';
+          return;
+        }
+        var seg = data.matched_segment || '';
+        var refMatch = seg.match(/【([^】]+)】/);
+        var verseText = seg.replace(/\s*【[^】]+】\s*$/, '').replace(/^\(/, '').replace(/\)$/, '');
+        var reference = refMatch ? refMatch[1] : '';
+        _quranInlineVerse = verseText;
+        _quranInlineRef = reference;
+        uthmaniEl.textContent = verseText;
+        refEl.textContent = reference ? '[' + reference + ']' : '';
+      } catch (err) {
+        var msg = err.name === 'AbortError' ? 'انتهى وقت الانتظار — حاول مرة أخرى' : 'حدث خطأ أثناء البحث — تأكد من الاتصال';
+        uthmaniEl.innerHTML = '<span class="text-secondary">' + msg + '</span>';
+      } finally {
+        clearTimeout(_timeout);
+        if (searchBtn) { searchBtn.disabled = false; searchBtn.textContent = 'بحث وتدقيق'; }
+      }
+    }
+    async function translateQuranInline() {
+      var lang = document.getElementById('quran-inline-lang').value;
+      if (!lang || !_quranInlineQuery) return;
+      var resultDiv = document.getElementById('quran-inline-translation');
+      var textEl = document.getElementById('quran-inline-trans-text');
+      textEl.innerHTML = '<span class="text-secondary">⏳ جاري الترجمة...</span>';
+      resultDiv.classList.remove('is-hidden');
+      var _abortCtrl = new AbortController();
+      var _timeout = setTimeout(function(){ _abortCtrl.abort(); }, 30000);
+      try {
+        var res = await fetch('/api/quran', {
+          method: 'POST',
+          headers: { 'Content-Type': 'application/json' },
+          body: JSON.stringify({ text: _quranInlineQuery, language: lang }),
+          signal: _abortCtrl.signal
+        });
+        var data = await res.json();
+        if (data.error) {
+          textEl.innerHTML = '<span class="text-secondary">' + data.error.replace(/&/g,'&amp;').replace(/</g,'&lt;').replace(/>/g,'&gt;') + '</span>';
+          return;
+        }
+        var seg = data.matched_segment || '';
+        var transText = seg.replace(/\s*【[^】]+】\s*$/, '').replace(/^\(/, '').replace(/\)$/, '');
+        textEl.textContent = transText;
+      } catch (err) {
+        var msg = err.name === 'AbortError' ? 'انتهى وقت الانتظار' : 'حدث خطأ في الترجمة';
+        textEl.innerHTML = '<span class="text-secondary">' + msg + '</span>';
+      } finally { clearTimeout(_timeout); }
+    }
+    function copyQuranInlineResult() {
+      var text = (_quranInlineVerse || '') + (_quranInlineRef ? ' [' + _quranInlineRef + ']' : '');
+      if (!text.trim()) return;
+      navigator.clipboard.writeText(text).then(function() {
+        if (typeof showToast === 'function') showToast('✓ تم نسخ النص المدقق');
+      });
+    }
+    function applyQuranInlineResult() {
+      if (!_quranInlineVerse) return;
+      var editor = document.getElementById('editor-container');
+      if (!editor) return;
+      if (typeof pushUndoState === 'function') pushUndoState();
+      var esc = function(t) { return t.replace(/&/g,'&amp;').replace(/</g,'&lt;').replace(/>/g,'&gt;'); };
+      var refHTML = _quranInlineRef ? ' <span class="quran-ref-inline">[' + esc(_quranInlineRef) + ']</span>' : '';
+      var existing = editor.innerHTML;
+      editor.innerHTML = existing + (existing ? '<br>' : '') +
+        '<span class="quran-applied" contenteditable="false" data-quran="true">' +
+        esc(_quranInlineVerse) + refHTML + '</span>';
+      editor.dispatchEvent(new Event('input', { bubbles: true }));
+      switchTab('write');
+      if (typeof showToast === 'function') showToast('✓ تم إضافة النص القرآني في المحرر');
     }
     let _dialectResult = '';
     async function convertDialect() {
     }
     async function generateSummary(event) {
+      let text = '';
+      if (window._summarySource === 'custom') {
+        var customInput = document.getElementById('summary-custom-input');
+        text = customInput ? customInput.value.trim() : '';
+      } else {
+        text = (typeof getEditorText === 'function' ? getEditorText() : '').trim();
+      }
       if (!text) {
         const summaryText = document.getElementById('summary-text');
+        summaryText.innerHTML = '<p class="text-secondary text-center">' + (window._summarySource === 'custom' ? 'الرجاء كتابة نص في مربع الإدخال أولاً' : 'الرجاء كتابة نص في المحرر أولاً') + '</p>';
         document.getElementById('summary-preview').classList.add('show');
         return;
       }
     </div>
    </div>
   </div>
+   <!-- Floating Selection Toolbar -->
+   <div id="selection-toolbar" class="selection-toolbar is-hidden" role="toolbar" aria-label="أدوات النص المحدد">
+    <button type="button" class="sel-tool-btn" onclick="selectionToolAction('summarize')" title="تلخيص النص المحدد">
+     <svg width="14" height="14" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M9 12h6m-6 4h6m2 5H7a2 2 0 01-2-2V5a2 2 0 012-2h5.586a1 1 0 01.707.293l5.414 5.414a1 1 0 01.293.707V19a2 2 0 01-2 2z"/></svg>
+     تلخيص
+    </button>
+    <span class="sel-tool-sep"></span>
+    <button type="button" class="sel-tool-btn" onclick="selectionToolAction('dialect')" title="تحويل النص المحدد للفصحى">
+     <svg width="14" height="14" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M3 5h12M9 3v2m1.048 9.5A18.022 18.022 0 016.412 9m6.088 9h7M11 21l5-10 5 10M12.751 5C11.783 10.77 8.07 15.61 3 18.129"/></svg>
+     فصحى
+    </button>
+    <span class="sel-tool-sep"></span>
+    <button type="button" class="sel-tool-btn" onclick="selectionToolAction('quran')" title="تدقيق النص القرآني المحدد">
+     <svg width="14" height="14" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M12 6.253v13m0-13C10.832 5.477 9.246 5 7.5 5S4.168 5.477 3 6.253v13C4.168 18.477 5.754 18 7.5 18s3.332.477 4.5 1.253m0-13C13.168 5.477 14.754 5 16.5 5c1.747 0 3.332.477 4.5 1.253v13C19.832 18.477 18.247 18 16.5 18c-1.746 0-3.332.477-4.5 1.253"/></svg>
+     قرآن
+    </button>
+   </div>
  </body>
 </html>

src/js/editor.js CHANGED Viewed

@@ -409,8 +409,7 @@ function showTooltip(element) {
       const btnClass = isMain ? 'popover-alt-btn popover-alt-main' : 'popover-alt-btn';
       html += `<button class="${btnClass}" data-alt-correction="${escapeHtml(alt)}" type="button">${isMain ? '✓ ' : ''}${escapeHtml(alt)}</button>`;
     });
-    // Render keep button at end
-    html += `<button class="popover-alt-btn popover-alt-keep" data-alt-correction="${escapeHtml(suggestion.original)}" type="button">إبقاء كما هي</button>`;
     alternativesEl.innerHTML = html;
     // Bind click events for alternatives

       const btnClass = isMain ? 'popover-alt-btn popover-alt-main' : 'popover-alt-btn';
       html += `<button class="${btnClass}" data-alt-correction="${escapeHtml(alt)}" type="button">${isMain ? '✓ ' : ''}${escapeHtml(alt)}</button>`;
     });
+    // No separate "keep" button — the "تجاهل" popover button handles dismissal
     alternativesEl.innerHTML = html;
     // Bind click events for alternatives

src/nlp/punctuation/spelling/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ # AraSpell — Arabic Spelling Correction

src/nlp/punctuation/spelling/araspell_rules.py ADDED Viewed

	@@ -0,0 +1,1615 @@

+# AraSpell — Arabic Spell Checker Pipeline (Rules & Classes)
+# Extracted from AraSpell.py — NO global model loading, NO Gradio dependencies.
+# All classes are imported by araspell_service.py.
+import re
+import math
+import logging
+import torch
+from collections import Counter
+from enum import Enum
+from typing import List, Tuple, Optional
+import Levenshtein
+import jellyfish
+logger = logging.getLogger(__name__)
+# ─────────────────────────────────────────────────────────────────────────────
+# ERROR TYPE ENUM
+# ─────────────────────────────────────────────────────────────────────────────
class ErrorType(Enum):
    """Labels for the kind of spelling problem detected in a piece of text.

    Every member's value is the lowercase spelling of its own name.
    """

    # Individual problem kinds.
    CHAR_REPETITION = "char_repetition"
    WORD_MERGE = "word_merge"
    CHAR_SUBSTITUTION = "char_substitution"

    # More than one problem kind at once.
    MIXED = "mixed"

    # No problem detected.
    CLEAN = "clean"
+# ═══════════════════════════════════════════════════════════════════════════════
+# KEYBOARD PROXIMITY (Phase 12 — from original AraSpell.py L475-520)
+# ═══════════════════════════════════════════════════════════════════════════════
class RulesBasedCorrector:
    """Keyboard-adjacency table for Arabic characters.

    NOTE(review): this module defines a second class with this same name
    further down; that later definition rebinds the name at import time.
    """

    # Maps a key to the list of keys treated as its neighbours.
    # The 'لا' entry is a two-character ligature key.
    KEYBOARD_NEIGHBORS = {
        'ض': ['ص', 'ق'], 'ص': ['ض', 'ث', 'ق'], 'ث': ['ص', 'ق'],
        'ق': ['ض', 'ص', 'ث', 'ف', 'غ'], 'ف': ['ق', 'غ', 'ع', 'ب'],
        'غ': ['ق', 'ف', 'ع', 'ه'], 'ع': ['ف', 'غ', 'ه', 'خ'],
        'ه': ['غ', 'ع', 'خ', 'ح'], 'خ': ['ع', 'ه', 'ح', 'ج'],
        'ح': ['ه', 'خ', 'ج'], 'ج': ['خ', 'ح', 'د'],
        'د': ['ج', 'ذ'], 'ذ': ['د'],
        'ش': ['س', 'ي', 'ئ'], 'س': ['ش', 'ي', 'ب'],
        'ي': ['ش', 'س', 'ب', 'ت'], 'ب': ['ي', 'س', 'ف', 'ل', 'ن'],
        'ل': ['ب', 'ا', 'ن', 'م'], 'ا': ['ل', 'ت', 'م'],
        'ت': ['ي', 'ا', 'ن'], 'ن': ['ب', 'ل', 'ت', 'م', 'ك'],
        'م': ['ل', 'ا', 'ن', 'ك'], 'ك': ['ن', 'م', 'ط'],
        'ط': ['ك', 'ظ'], 'ظ': ['ط'],
        'ئ': ['ش', 'ء', 'ر'], 'ء': ['ئ', 'ؤ'], 'ؤ': ['ء', 'ر'],
        'ر': ['ئ', 'ؤ', 'لا', 'ى', 'ز'], 'لا': ['ر', 'ى'],
        'ى': ['ر', 'لا', 'ة', 'ز'], 'ة': ['ى', 'و', 'ز'],
        'و': ['ة', 'ز'], 'ز': ['ر', 'ى', 'ة', 'و'],
        'أ': ['ا', 'إ', 'آ'], 'إ': ['ا', 'أ'], 'آ': ['ا', 'أ'],
    }

    @staticmethod
    def is_keyboard_neighbor(char1: str, char2: str) -> bool:
        """Return True when ``char2`` is listed as a neighbour of ``char1``.

        The lookup is one-directional: only the entry for ``char1`` is
        consulted. Characters without an entry have no neighbours.
        """
        adjacent = RulesBasedCorrector.KEYBOARD_NEIGHBORS.get(char1)
        if adjacent is None:
            return False
        return char2 in adjacent
+# ═══════════════════════════════════════════════════════════════════════════════
+# POST PROCESSOR
+# ═══════════════════════════════════════════════════════════════════════════════
class AraSpellPostProcessor:
    """Rule-based clean-up steps applied to Arabic text.

    All methods are static and operate on plain strings. Most of them split
    the text on whitespace and re-join the words with single spaces, so the
    original spacing is not preserved. ``full_postprocess`` chains the steps
    in a fixed order.
    """

    ARABIC_HARAKAT = 'ًٌٍَُِّْ'
    TATWEEL = 'ـ'
    # Presentation-form ligatures and their expanded spelling.
    NORMALIZER_MAP = {
        'ﻹ': 'لإ', 'ﻷ': 'لأ', 'ﻵ': 'لآ', 'ﻻ': 'لا', 'ﷲ': 'الله'
    }
    ARABIC_CONSONANTS = set('بتثجحخدذرزسشصضطظعغفقكلمن')

    # --- Basic Normalization ---
    @staticmethod
    def remove_harakat(text: str) -> str:
        """Remove Arabic diacritics (the character range ً through ْ)."""
        return re.sub(r'[ً-ْ]', '', text)

    @staticmethod
    def remove_tatweel(text: str) -> str:
        """Remove Arabic kashida/tatweel characters."""
        return text.replace(AraSpellPostProcessor.TATWEEL, '')

    @staticmethod
    def normalize_special_chars(text: str) -> str:
        """Expand the ligatures listed in ``NORMALIZER_MAP``."""
        for old, new in AraSpellPostProcessor.NORMALIZER_MAP.items():
            text = text.replace(old, new)
        return text

    # --- Core Functions ---
    @staticmethod
    def unified_collapse_repeated(text: str) -> str:
        """Collapse repeated characters.

        Arabic: 3+ consecutive -> 1 | Latin: 2+ consecutive -> 1
        """
        text = re.sub(r"([\u0600-\u06FF])\1{2,}", r"\1", text)
        text = re.sub(r"([a-zA-Z])\1+", r"\1", text)
        return text

    @staticmethod
    def remove_duplicate_words(text: str) -> str:
        """Remove consecutive duplicate words, e.g. كتاب كتاب -> كتاب.

        Text with fewer than two words is returned untouched.
        """
        words = text.split()
        if len(words) < 2:
            return text
        kept = [words[0]]
        kept.extend(cur for prev, cur in zip(words, words[1:]) if cur != prev)
        return ' '.join(kept)

    @staticmethod
    def normalize_spaces(text: str) -> str:
        """Normalize whitespace and the spacing around punctuation.

        Non-breaking spaces become regular spaces and zero-width characters
        are dropped *before* runs of spaces are collapsed, so that removing
        them cannot leave a double space behind. Each of ``،؛؟!.`` ends up
        with no whitespace before it and exactly one space after it (none at
        the end of the text).
        """
        text = text.replace('\u00A0', ' ')
        for zero_width in ('\u200B', '\u200C', '\u200D'):
            text = text.replace(zero_width, '')
        text = re.sub(r' +', ' ', text).strip()
        text = re.sub(r'\s*([،؛؟!.])\s*', r'\1 ', text)
        return text.strip()

    @staticmethod
    def remove_word_repetition_with_wa(text: str) -> str:
        """Reduce ``word و word`` to a single ``word``."""
        words = text.split()
        result = []
        i = 0
        while i < len(words):
            result.append(words[i])
            if i + 2 < len(words) and words[i] == words[i + 2] and words[i + 1] == 'و':
                # Skip the conjunction and the repeated word.
                i += 3
            else:
                i += 1
        return ' '.join(result)

    # --- Hamza & Ta Marbuta Handling ---
    # Common Arabic words with hamza errors — covers the most frequent
    # spelling mistakes in informal Arabic writing.
    # Entries that map a word to itself leave that word unchanged.
    HAMZA_WHITELIST = {
        'الي': 'إلى', 'الى': 'إلى',
        'انت': 'أنت', 'انتم': 'أنتم', 'انتي': 'أنتِ',
        'انتو': 'أنتم', 'انتن': 'أنتن',
        'انا': 'أنا',
        'امس': 'أمس',
        'لان': 'لأن', 'لانه': 'لأنه', 'لانها': 'لأنها',
        'لانهم': 'لأنهم', 'لانك': 'لأنك',
        'اذا': 'إذا', 'اذ': 'إذ',
        'اي': 'أي', 'اين': 'أين',
        'او': 'أو',
        'اما': 'أما',
        'ان': 'أن', 'انه': 'أنه', 'انها': 'أنها', 'انهم': 'أنهم',
        'اخر': 'آخر', 'اخرى': 'أخرى',
        'الان': 'الآن',
        'اول': 'أول', 'اولى': 'أولى',
        'اصبح': 'أصبح', 'اصبحت': 'أصبحت',
        'اكثر': 'أكثر', 'اقل': 'أقل',
        'اعلى': 'أعلى', 'ادنى': 'أدنى',
        'اسرع': 'أسرع', 'ابطا': 'أبطأ',
        'اكبر': 'أكبر', 'اصغر': 'أصغر',
        'احسن': 'أحسن', 'اسوا': 'أسوأ',
        'امام': 'أمام',
        'اثناء': 'أثناء',
        'ايضا': 'أيضاً', 'ايض': 'أيضاً',
        'اساسي': 'أساسي', 'اساسية': 'أساسية',
        'اخي': 'أخي', 'اخت': 'أخت', 'اخو': 'أخو',
        'ابي': 'أبي', 'اب': 'أب', 'ابو': 'أبو',
        'اهل': 'أهل',
        'اطفال': 'أطفال',
        'اصدقاء': 'أصدقاء', 'اصدقائي': 'أصدقائي',
        'اعتقد': 'أعتقد', 'اريد': 'أريد', 'احب': 'أحب',
        'اعرف': 'أعرف', 'اعلم': 'أعلم',
        'اخذ': 'أخذ', 'اكل': 'أكل',
        'الايام': 'الأيام',
        'الاطفال': 'الأطفال',
        'الاسعار': 'الأسعار',
        'الاولى': 'الأولى',
        'الاخير': 'الأخير', 'الاخيرة': 'الأخيرة',
        'واصدقائي': 'وأصدقائي',
        # FIX-14: Additional hamza entries
        'ابناء': 'أبناء',
        'اجمل': 'أجمل', 'اجمع': 'أجمع',
        'اعلن': 'أعلن', 'اعلنت': 'أعلنت',
        'اكد': 'أكد', 'اكدت': 'أكدت',
        'اشار': 'أشار', 'اشارت': 'أشارت',
        'ارسل': 'أرسل', 'ارسلت': 'أرسلت',
        'اضاف': 'أضاف', 'اضافت': 'أضافت',
        'اخيرا': 'أخيراً', 'اخيراً': 'أخيراً',
        'اساسا': 'أساساً', 'اساساً': 'أساساً',
        'احيانا': 'أحياناً', 'احياناً': 'أحياناً',
        'ابدا': 'أبداً', 'ابداً': 'أبداً',
        'اصلا': 'أصلاً', 'اصلاً': 'أصلاً',
        'اخبار': 'أخبار', 'اخبر': 'أخبر',
        'امر': 'أمر', 'امور': 'أمور',
        'اهم': 'أهم', 'اهمية': 'أهمية',
        'اصل': 'أصل',
        'اثر': 'أثر', 'اثار': 'آثار',
        'اساء': 'أساء', 'اساس': 'أساس',
        'استاذ': 'أستاذ', 'اسلام': 'إسلام',
        # Batch 3: More hamza entries for remaining FN cases
        'اسرة': 'أسرة', 'اسر': 'أسر',
        'اعضاء': 'أعضاء', 'اعداد': 'أعداد',
        'اعمال': 'أعمال', 'اعمار': 'أعمار',
        'انجاز': 'إنجاز', 'انجازات': 'إنجازات',
        'انشاء': 'إنشاء', 'انتاج': 'إنتاج',
        'انتخابات': 'انتخابات', 'انتظار': 'انتظار',
        'اسلامي': 'إسلامي', 'اسلامية': 'إسلامية',
        'امكانية': 'إمكانية', 'امكان': 'إمكان',
        'اشكالية': 'إشكالية',
        'ادارة': 'إدارة', 'ادارية': 'إدارية',
        'اعلام': 'إعلام', 'اعلامي': 'إعلامي',
        'احتمال': 'احتمال', 'احتفال': 'احتفال',
        'ازور': 'أزور', 'اذهب': 'أذهب', 'اكتب': 'أكتب',
        'اقرا': 'أقرأ', 'اقرأ': 'أقرأ',
        'اعمل': 'أعمل', 'ادرس': 'أدرس',
        'اشتري': 'أشتري', 'اسافر': 'أسافر',
        'مسؤول': 'مسؤول', 'مسؤولية': 'مسؤولية',
        'رؤية': 'رؤية', 'رؤيا': 'رؤيا',
        'مؤسسة': 'مؤسسة', 'مؤتمر': 'مؤتمر',
        'تأثير': 'تأثير', 'تأكيد': 'تأكيد',
        # FIX-14: Alif maqsura common errors
        'المستشفي': 'المستشفى',
        'مصطفي': 'مصطفى', 'موسي': 'موسى', 'عيسي': 'عيسى',
        'هدي': 'هدى', 'بني': 'بنى',
        'معني': 'معنى', 'مبني': 'مبنى',
        'علي': 'على',  # Common alif maqsura confusion
    }

    @staticmethod
    def fix_hamza_conservative(text: str) -> str:
        """Replace a word-final أ or إ with a bare ا.

        Only the last character is touched, and only in words of three or
        more characters; hamza elsewhere in the word is left alone.
        """
        result = []
        for word in text.split():
            if len(word) >= 3 and word.endswith(('أ', 'إ')):
                word = word[:-1] + 'ا'
            result.append(word)
        return ' '.join(result)

    # Attached prefixes that can precede hamza-whitelist words
    # Ordered longest-first so وال is tried before و
    HAMZA_PREFIXES = ['وبال', 'فبال', 'وال', 'بال', 'فال', 'كال', 'ول', 'فل',
                      'وب', 'فب', 'وك', 'فك', 'و', 'ف', 'ب', 'ك', 'ل']

    @staticmethod
    def fix_common_hamza(text: str) -> str:
        """Fix common hamza placement errors using ``HAMZA_WHITELIST``.

        A word is replaced when it matches a whitelist key exactly. Failing
        that, each entry of ``HAMZA_PREFIXES`` is tried in order: when the
        word starts with the prefix, has at least two characters after it,
        and the remainder is a whitelist key, the result is the prefix plus
        the corrected remainder.
        e.g. واصدقائي -> وأصدقائي, بالاسعار -> بالأسعار
        """
        whitelist = AraSpellPostProcessor.HAMZA_WHITELIST
        result = []
        for word in text.split():
            # Check exact match first
            if word in whitelist:
                result.append(whitelist[word])
                continue
            # Try stripping common prefixes and looking up the remainder
            replacement = word
            for prefix in AraSpellPostProcessor.HAMZA_PREFIXES:
                if word.startswith(prefix) and len(word) > len(prefix) + 1:
                    remainder = word[len(prefix):]
                    if remainder in whitelist:
                        replacement = prefix + whitelist[remainder]
                        break
            result.append(replacement)
        return ' '.join(result)

    # Word endings that are never rewritten by fix_ha_ta_marbuta.
    _PROTECTED_HA_ENDINGS = ('لله',)
    # Words that genuinely end in ه (not ة)
    _PROTECTED_HA_WORDS = frozenset({
        'الله', 'لله', 'فيه', 'عليه', 'منه', 'به', 'له', 'إليه',
        'وجه', 'نزه', 'سفه', 'فقه', 'نبه', 'شبه', 'مكره', 'تنبه',
        'اتجه', 'توجه', 'تشابه',
    })

    @staticmethod
    def fix_ha_ta_marbuta(text: str, vocab_manager=None) -> str:
        """Rewrite a word-final ه as ة when it follows a consonant.

        Only words of three or more characters whose second-to-last
        character is in ``ARABIC_CONSONANTS`` are candidates; protected
        words and endings are skipped. A candidate keeps its ه only when a
        ``vocab_manager`` is supplied and it reports the ه form as known
        while the ة form is not. In every other case (no vocabulary, ة form
        known, or neither form known) the ة form is used.

        ``vocab_manager`` must provide ``is_iv(word) -> bool``.
        """
        cls = AraSpellPostProcessor
        result = []
        for word in text.split():
            is_candidate = (
                not word.endswith(cls._PROTECTED_HA_ENDINGS)
                and word not in cls._PROTECTED_HA_WORDS
                and len(word) >= 3
                and word.endswith('ه')
                and word[-2] in cls.ARABIC_CONSONANTS
            )
            if not is_candidate:
                result.append(word)
                continue
            candidate_with_ta = word[:-1] + 'ة'
            if vocab_manager:
                ta_iv = vocab_manager.is_iv(candidate_with_ta)
                ha_iv = vocab_manager.is_iv(word)
                if ha_iv and not ta_iv:
                    # Only the ه spelling is a known word: keep it.
                    result.append(word)
                    continue
            result.append(candidate_with_ta)
        return ' '.join(result)

    # --- Hallucination Removal ---
    @staticmethod
    def _duplicate_key(word: str) -> str:
        """Return the form of ``word`` used to detect near-duplicate neighbours.

        Drops a leading ال, maps ة to ه and folds أ/إ/آ to ا. The article is
        stripped only as a prefix so that words merely containing the letters
        ال (e.g. كمال) are not reduced to an unrelated word (كم).
        """
        if word.startswith('ال'):
            word = word[2:]
        word = word.replace('ة', 'ه')
        return re.sub(r'[أإآ]', 'ا', word)

    @staticmethod
    def remove_hallucinations(text: str) -> str:
        """Remove near-duplicate adjacent words and trailing 'و' artifacts.

        A trailing و is dropped from words longer than four characters when
        the character before it is one of ``ةهاأإآء``. Two adjacent words
        that are equal after ``_duplicate_key`` are merged into one; the
        variant carrying the article ال is preferred when only the second
        one has it.
        """
        words = text.split()
        if not words:
            return text
        key = AraSpellPostProcessor._duplicate_key
        result = []
        i = 0
        while i < len(words):
            word = words[i]
            if len(word) > 4 and word.endswith('و') and word[-2] in 'ةهاأإآء':
                word = word[:-1]
            if i + 1 < len(words):
                next_word = words[i + 1]
                if key(word) == key(next_word):
                    prefer_next = next_word.startswith('ال') and not word.startswith('ال')
                    result.append(next_word if prefer_next else word)
                    i += 2
                    continue
            result.append(word)
            i += 1
        return ' '.join(result)

    @staticmethod
    def remove_hallucinated_prefix(text: str, original: str) -> str:
        """Drop a leading standalone 'و' that the original did not have.

        The prefix is removed only when ``text`` starts with ``'و '``,
        ``original`` does not start with و, and the remainder equals
        ``original`` after ligature normalization. Otherwise ``text`` is
        returned unchanged (also when ``original`` is empty).
        """
        if not original:
            return text
        if text.startswith('و ') and not original.startswith('و'):
            rest = text[2:].strip()
            normalize = AraSpellPostProcessor.normalize_special_chars
            if normalize(rest) == normalize(original):
                return rest
        return text

    # --- Word Splitting & Merging ---
    @staticmethod
    def merge_separated_al(text: str) -> str:
        """Merge 'ال' separated by space: ال + كتاب -> الكتاب"""
        return re.sub(r'\bال\s+(\w+)', r'ال\1', text)

    # Words that join_fragments leaves alone when two of them are adjacent.
    _STANDALONE_WORDS = frozenset({
        'من', 'في', 'على', 'عن', 'مع', 'إلى', 'الى', 'حتى', 'منذ', 'خلال',
        'بعد', 'قبل', 'ب', 'ل', 'ك', 'و', 'أو', 'لا', 'ما', 'لم', 'لن',
        'هو', 'هي', 'هم', 'أن', 'إن', 'كل', 'كان', 'قد', 'قال', 'ذلك',
        'هذا', 'هذه', 'تلك', 'التي', 'الذي', 'التى', 'اللذي'
    })

    @staticmethod
    def join_fragments(text: str) -> str:
        """Join short word fragments with their neighbour.

        Scanning left to right, for each word and the word after it:

        1. both in the standalone list: keep the first word as-is;
        2. the next word is a single character: glue it on;
        3. both have 2+ characters and the first ends with the character the
           second starts with: glue them, dropping the duplicated character;
        4. the first has 2-4 characters, the next has 1-2, and together they
           have 3-7: glue them.

        Text with fewer than two words is returned untouched.
        """
        words = text.split()
        if len(words) < 2:
            return text
        standalone = AraSpellPostProcessor._STANDALONE_WORDS
        result = []
        i = 0
        while i < len(words):
            word = words[i]
            merged = None
            if i + 1 < len(words):
                next_word = words[i + 1]
                if word in standalone and next_word in standalone:
                    # Two function words side by side are never merged.
                    pass
                elif len(next_word) == 1:
                    merged = word + next_word
                elif len(word) >= 2 and len(next_word) >= 2 and word[-1] == next_word[0]:
                    merged = word[:-1] + next_word
                elif (2 <= len(word) <= 4 and
                      1 <= len(next_word) <= 2 and
                      3 <= len(word) + len(next_word) <= 7):
                    merged = word + next_word
            if merged is None:
                result.append(word)
                i += 1
            else:
                result.append(merged)
                i += 2
        return ' '.join(result)

    # --- Main Pipelines ---
    @staticmethod
    def full_postprocess(text: str, original: str = "", vocab_manager=None) -> str:
        """Apply the post-processing steps in their fixed order.

        Args:
            text: The text to clean.
            original: Optional pre-correction text; when non-empty it is used
                to detect a leading 'و' that was added to ``text``.
            vocab_manager: Optional object with ``is_iv(word)``, forwarded to
                ``fix_ha_ta_marbuta``.

        Returns:
            The cleaned text.
        """
        cls = AraSpellPostProcessor
        if original:
            text = cls.remove_hallucinated_prefix(text, original)
        text = cls.normalize_special_chars(text)
        text = cls.remove_hallucinations(text)
        text = cls.unified_collapse_repeated(text)
        text = cls.fix_hamza_conservative(text)
        text = cls.fix_common_hamza(text)  # Fix S3: hamza whitelist
        text = cls.fix_ha_ta_marbuta(text, vocab_manager=vocab_manager)
        text = cls.remove_word_repetition_with_wa(text)
        text = cls.remove_duplicate_words(text)
        text = cls.normalize_spaces(text)
        return text
+# ─────────────────────────────────────────────────────────────────────────────
+# ERROR CLASSIFIER
+# ─────────────────────────────────────────────────────────────────────────────
+class ErrorClassifier:
+    """Classify type of spelling error"""
+    NON_ARABIC_KEYBOARD = set('پگچژکەڕڤڵڎےۀۃھیټډڼڑ')
+    @staticmethod
+    def has_char_substitution(text: str) -> bool:
+        return any(c in ErrorClassifier.NON_ARABIC_KEYBOARD for c in text)
+    @staticmethod
+    def has_char_repetition(text: str, threshold: int = 3) -> bool:
+        return bool(re.search(r"(.)\1{" + str(threshold - 1) + ",}", text))
+    @staticmethod
+    def has_word_merge(text: str, max_word_len: int = 8) -> bool:
+        words = text.split()
+        if any(len(w) > max_word_len for w in words):
+            return True
+        if len(words) == 1 and len(text) > 6:
+            return True
+        return False
+    @staticmethod
+    def classify(text: str) -> ErrorType:
+        has_rep = ErrorClassifier.has_char_repetition(text)
+        has_merge = ErrorClassifier.has_word_merge(text)
+        has_sub = ErrorClassifier.has_char_substitution(text)
+        error_count = sum([has_rep, has_merge, has_sub])
+        if error_count >= 2:
+            return ErrorType.MIXED
+        elif has_sub:
+            return ErrorType.CHAR_SUBSTITUTION
+        elif has_rep:
+            return ErrorType.CHAR_REPETITION
+        elif has_merge:
+            return ErrorType.WORD_MERGE
+        else:
+            return ErrorType.CLEAN
+# ═══════════════════════════════════════════════════════════════════════════════
+# RULES-BASED CORRECTOR
+# ═══════════════════════════════════════════════════════════════════════════════
class RulesBasedCorrector:
    """Model-free repairs: letter substitution, repetition collapse, word splitting.

    Also carries the keyboard-adjacency table used for proximity checks.
    """

    # Arabic-script letters mapped to the Arabic letter that replaces them.
    SUBSTITUTION_MAP = {
        'ک': 'ك', 'ی': 'ي', 'ے': 'ي',
        'پ': 'ب', 'چ': 'ج', 'ژ': 'ز',
        'گ': 'ك', 'ڤ': 'ف', 'ڵ': 'ل',
        'ڕ': 'ر', 'ڎ': 'د', 'ڼ': 'ن',
        'ټ': 'ت', 'ډ': 'د', 'ړ': 'ر',
        'ۀ': 'ه', 'ۃ': 'ة', 'ھ': 'ه',
        'ە': 'ه', 'ڑ': 'ر'
    }

    PREPOSITIONS = {
        'من', 'في', 'على', 'عن', 'مع', 'إلى', 'الى',
        'حتى', 'منذ', 'خلال', 'بعد', 'قبل',
        'ب', 'ل', 'ك', 'لل'
    }

    # Maps a key to the list of keys treated as its neighbours.
    KEYBOARD_NEIGHBORS = {
        'ض': ['ص', 'ق'],
        'ص': ['ض', 'ث', 'ق'],
        'ث': ['ص', 'ق'],
        'ق': ['ض', 'ص', 'ث', 'ف', 'غ'],
        'ف': ['ق', 'غ', 'ع', 'ب'],
        'غ': ['ق', 'ف', 'ع', 'ه'],
        'ع': ['ف', 'غ', 'ه', 'خ'],
        'ه': ['غ', 'ع', 'خ', 'ح'],
        'خ': ['ع', 'ه', 'ح', 'ج'],
        'ح': ['ه', 'خ', 'ج'],
        'ج': ['خ', 'ح', 'د'],
        'د': ['ج', 'ذ'],
        'ذ': ['د'],
        'ش': ['س', 'ي', 'ئ'],
        'س': ['ش', 'ي', 'ب'],
        'ي': ['ش', 'س', 'ب', 'ت'],
        'ب': ['ي', 'س', 'ف', 'ل', 'ن'],
        'ل': ['ب', 'ا', 'ن', 'م'],
        'ا': ['ل', 'ت', 'م'],
        'ت': ['ي', 'ا', 'ن'],
        'ن': ['ب', 'ل', 'ت', 'م', 'ك'],
        'م': ['ل', 'ا', 'ن', 'ك'],
        'ك': ['ن', 'م', 'ط'],
        'ط': ['ك', 'ظ'],
        'ظ': ['ط'],
        'ئ': ['ش', 'ء', 'ر'],
        'ء': ['ئ', 'ؤ'],
        'ؤ': ['ء', 'ر'],
        'ر': ['ئ', 'ؤ', 'لا', 'ى', 'ز'],
        'لا': ['ر', 'ى'],
        'ى': ['ر', 'لا', 'ة', 'ز'],
        'ة': ['ى', 'و', 'ز'],
        'و': ['ة', 'ز'],
        'ز': ['ر', 'ى', 'ة', 'و'],
        'أ': ['ا', 'إ', 'آ'],
        'إ': ['ا', 'أ'],
        'آ': ['ا', 'أ'],
    }

    @staticmethod
    def is_keyboard_neighbor(char1: str, char2: str) -> bool:
        """Return True when ``char2`` is listed as a neighbour of ``char1``."""
        table = RulesBasedCorrector.KEYBOARD_NEIGHBORS
        return char2 in table.get(char1, [])

    @staticmethod
    def fix_char_substitution(text: str) -> str:
        """Replace every ``SUBSTITUTION_MAP`` key in ``text`` with its value."""
        fixed = text
        for foreign, arabic in RulesBasedCorrector.SUBSTITUTION_MAP.items():
            fixed = fixed.replace(foreign, arabic)
        return fixed

    @staticmethod
    def fix_char_repetition(text: str) -> str:
        """Collapse 3+ repeats of a non-digit, non-space character to one."""
        return re.sub(r'([^\d\s])\1{2,}', r'\1', text)

    @staticmethod
    def advanced_heuristic_repair(text: str) -> str:
        """Run substitution, repetition collapse, then per-word splitting.

        Words are re-joined with single spaces.
        """
        cleaned = RulesBasedCorrector.fix_char_repetition(
            RulesBasedCorrector.fix_char_substitution(text)
        )
        return ' '.join(
            RulesBasedCorrector._recursive_split(token) for token in cleaned.split()
        )

    @staticmethod
    def _recursive_split(word: str) -> str:
        """Peel leading particles off ``word``, separating them with spaces.

        At each step the remaining text is left alone if it is shorter than
        four characters or is itself one of the particles. Otherwise the
        first particle (longest first) that prefixes it and leaves at least
        three characters is split off; failing that, a leading يا is split
        off when the text is longer than four characters.
        """
        particles = sorted(
            ['من', 'في', 'على', 'عن', 'مع', 'إلى', 'الى', 'حتى', 'منذ', 'خلال', 'بعد', 'قبل'],
            key=len,
            reverse=True,
        )
        pieces = []
        tail = word
        while len(tail) >= 4:
            head = None
            for particle in particles:
                if tail == particle:
                    # The remainder is a particle on its own: stop splitting.
                    break
                if tail.startswith(particle) and len(tail) - len(particle) >= 3:
                    head = particle
                    break
            else:
                # No particle applied; fall back to the vocative prefix.
                if tail.startswith('يا') and len(tail) > 4:
                    head = 'يا'
            if head is None:
                break
            pieces.append(head)
            tail = tail[len(head):]
        pieces.append(tail)
        return ' '.join(pieces)
+# ═══════════════════════════════════════════════════════════════════════════════
+# OUTPUT VALIDATOR (Hallucination Prevention)
+# ═══════════════════════════════════════════════════════════════════════════════
class OutputValidator:
    """Sanity checks that reject corrections drifting too far from the input."""

    @staticmethod
    def calculate_edit_distance(s1: str, s2: str) -> int:
        """Return ``Levenshtein.distance(s1, s2)``."""
        return Levenshtein.distance(s1, s2)

    @staticmethod
    def check_character_preservation(original: str, corrected: str) -> Tuple[bool, str]:
        """Check that both strings use a similar set of characters.

        Computes the Jaccard similarity of the two character sets and fails
        with ``"low_character_similarity"`` when it is below 0.35. An empty
        ``original`` always passes.
        """
        chars_original = set(original)
        if not chars_original:
            return True, "valid"
        chars_corrected = set(corrected)
        shared = chars_original & chars_corrected
        combined = chars_original | chars_corrected
        jaccard = len(shared) / len(combined) if combined else 0
        if jaccard < 0.35:
            return False, "low_character_similarity"
        return True, "valid"

    @staticmethod
    def check_word_count(original: str, corrected: str) -> Tuple[bool, str]:
        """Check that the word count did not change implausibly.

        A single-word original may become up to three words (up to six when
        the original text is longer than twelve characters). Otherwise the
        corrected/original word-count ratio must lie within [0.5, 2.0]; an
        original with no words gives a ratio of 0 and therefore fails.
        """
        len_orig = len(original.split())
        len_corr = len(corrected.split())
        if len_orig == 1:
            if len_corr <= 3:
                return True, "valid"
            if len(original) > 12 and len_corr <= 6:
                return True, "valid"
        ratio = len_corr / len_orig if len_orig > 0 else 0
        if ratio > 2.0 or ratio < 0.5:
            return False, "word_count_mismatch"
        return True, "valid"

    def validate(self, original: str, corrected: str, error_type: "str | ErrorType") -> Tuple[bool, str]:
        """Decide whether ``corrected`` is an acceptable rewrite of ``original``.

        Args:
            original: Text before correction.
            corrected: Candidate correction.
            error_type: Kind of error detected in ``original``. Either an
                ``ErrorType`` member or its string value is accepted; the
                previous enum-only comparison silently ignored the string
                form that the signature advertises.

        Returns:
            ``(is_valid, reason)`` where ``reason`` is ``"valid"``,
            ``"space_leniency_accept"`` or the name of the failed check.
        """
        if not corrected or not corrected.strip():
            return False, "empty_output"

        # Corrections that only add or remove spaces (or ZWNJ) are always fine.
        original_no_space = original.replace(' ', '').replace('\u200c', '')
        corrected_no_space = corrected.replace(' ', '').replace('\u200c', '')
        if original_no_space == corrected_no_space:
            return True, "space_leniency_accept"

        len_orig = len(original)
        len_corr = len(corrected)
        if len_corr > len_orig * 2.5:
            return False, "too_long"
        if len_corr < len_orig * 0.5:
            # Collapsing repeated characters may legitimately shrink the text
            # a lot. Compare on the enum's value so both forms match
            # ("char_repetition" mirrors ErrorType.CHAR_REPETITION.value).
            kind = getattr(error_type, "value", error_type)
            if kind != "char_repetition":
                return False, "too_short"

        is_valid_count, reason = self.check_word_count(original, corrected)
        if not is_valid_count:
            return False, reason
        is_valid_chars, reason = self.check_character_preservation(original, corrected)
        if not is_valid_chars:
            return False, reason
        return True, "valid"
+# ═══════════════════════════════════════════════════════════════════════════════
+# VOCABULARY MANAGER
+# ═══════════════════════════════════════════════════════════════════════════════
class VocabularyManager:
    """In-vocabulary (IV) / out-of-vocabulary (OOV) lookups over a tokenizer vocab.

    The tokenizer only needs a ``get_vocab()`` method returning a mapping of
    token string to integer.
    """

    HAMZA_VARIANTS = {'أ', 'إ', 'آ', 'ء', 'ؤ', 'ئ', 'ا'}
    ALEF_NORMALIZED = 'ا'
    TA_MARBUTA = 'ة'
    HA = 'ه'
    YA_VARIANTS = {'ي', 'ى'}
    YA_NORMALIZED = 'ي'

    def __init__(self, tokenizer):
        """Build the word set, the rank table and the normalized lookup."""
        self.tokenizer = tokenizer
        # Keep purely alphabetic, multi-character tokens that are not
        # '##' continuation pieces.
        self.vocab = {
            token for token in tokenizer.get_vocab().keys()
            if token.isalpha() and not token.startswith('##') and len(token) > 1
        }
        # The token's integer from get_vocab() doubles as its rank.
        self.vocab_rank = dict(tokenizer.get_vocab().items())
        self.normalized_vocab = {
            self.normalize_for_comparison(token): token for token in self.vocab
        }
        logger.info(f"VocabularyManager initialized: {len(self.vocab)} words")

    @classmethod
    def normalize_for_comparison(cls, word: str) -> str:
        """Fold spelling variants so near-identical words compare equal.

        Hamza/alef variants become ا, ي/ى become ي, and a ة in the final
        position becomes ه. Other characters pass through unchanged.
        """
        final_index = len(word) - 1

        def fold(position: int, char: str) -> str:
            if char in cls.HAMZA_VARIANTS:
                return cls.ALEF_NORMALIZED
            if char == cls.TA_MARBUTA and position == final_index:
                return cls.HA
            if char in cls.YA_VARIANTS:
                return cls.YA_NORMALIZED
            return char

        return ''.join(fold(position, char) for position, char in enumerate(word))

    def is_iv(self, word: str) -> bool:
        """Return True if ``word`` is known, exactly or after normalization.

        Non-word characters are stripped first; a word that becomes empty
        counts as known.
        """
        bare = re.sub(r'[^\w]', '', word)
        if not bare or bare in self.vocab:
            return True
        return self.normalize_for_comparison(bare) in self.normalized_vocab

    def is_oov(self, word: str) -> bool:
        """Return the negation of ``is_iv``."""
        return not self.is_iv(word)

    def get_frequency_rank(self, word: str) -> int:
        """Return the stored rank of ``word`` (exact match), else 999999."""
        bare = re.sub(r'[^\w]', '', word)
        return self.vocab_rank.get(bare, 999999)

    def all_words_iv(self, text: str) -> bool:
        """Return True if every whitespace-separated word is known."""
        for token in text.split():
            if not self.is_iv(token):
                return False
        return True

    def count_oov_words(self, text: str) -> int:
        """Return how many whitespace-separated words are unknown."""
        return len([token for token in text.split() if self.is_oov(token)])

    def get_oov_words(self, text: str) -> List[str]:
        """Return the unknown words of ``text`` in order of appearance."""
        unknown = []
        for token in text.split():
            if self.is_oov(token):
                unknown.append(token)
        return unknown

    def words_are_equivalent(self, word1: str, word2: str) -> bool:
        """Return True if both words are equal after normalization."""
        return self.normalize_for_comparison(word1) == self.normalize_for_comparison(word2)

    @staticmethod
    def damerau_levenshtein_distance(s1: str, s2: str) -> int:
        """Return ``jellyfish.damerau_levenshtein_distance(s1, s2)``."""
        return jellyfish.damerau_levenshtein_distance(s1, s2)

    def calculate_similarity(self, original: str, corrected: str) -> float:
        """Return 1 minus the edit distance divided by the longer length."""
        distance = self.damerau_levenshtein_distance(original, corrected)
        longest = max(len(original), len(corrected), 1)
        return 1.0 - (distance / longest)
+# ═══════════════════════════════════════════════════════════════════════════════
+# WORD ALIGNER
+# ═══════════════════════════════════════════════════════════════════════════════
class WordAligner:
    """Blend an input sentence and a model output word by word.

    ``vocab_manager`` must provide ``is_iv(word)`` and
    ``count_oov_words(text)``.
    """

    def __init__(self, vocab_manager):
        self.vocab = vocab_manager

    def align_words(self, input_text: str, output_text: str) -> str:
        """Return a hybrid of ``input_text`` and ``output_text``.

        When the word counts differ by more than two, no alignment is
        attempted: the output is returned if it has strictly fewer unknown
        words than the input, otherwise the input. In all other cases words
        are paired by position and the better one of each pair is kept.
        Surplus output words are all kept; surplus input words are kept only
        if they are known.
        """
        source_words = input_text.split()
        model_words = output_text.split()

        if abs(len(source_words) - len(model_words)) > 2:
            unknown_in = self.vocab.count_oov_words(input_text)
            unknown_out = self.vocab.count_oov_words(output_text)
            if unknown_out < unknown_in:
                return output_text
            return input_text

        paired = min(len(source_words), len(model_words))
        merged = [
            self._select_best_word(src, out)
            for src, out in zip(source_words[:paired], model_words[:paired])
        ]
        if len(model_words) > paired:
            merged += model_words[paired:]
        elif len(source_words) > paired:
            merged += [w for w in source_words[paired:] if self.vocab.is_iv(w)]
        return ' '.join(merged)

    def _select_best_word(self, input_word: str, output_word: str) -> str:
        """Pick between one input word and the output word at the same position."""
        if input_word == output_word:
            return input_word

        input_known = self.vocab.is_iv(input_word)
        output_known = self.vocab.is_iv(output_word)

        if input_known:
            if not output_known:
                return input_word
            # Both are known words. Fix S1: the output wins only when it
            # turns a final ه into ة on an otherwise identical word; any
            # other change (including ة -> ه) keeps the input.
            ha_to_ta = (
                input_word.endswith('ه')
                and output_word.endswith('ة')
                and input_word[:-1] == output_word[:-1]
            )
            return output_word if ha_to_ta else input_word

        if output_known:
            return output_word

        # Neither word is known: for same-length words of 3+ characters, try
        # swapping one differing character at a time and accept the first
        # known hybrid.
        if len(input_word) == len(output_word) and len(input_word) >= 3:
            for pos, (src_char, out_char) in enumerate(zip(input_word, output_word)):
                if src_char == out_char:
                    continue
                input_based = input_word[:pos] + out_char + input_word[pos + 1:]
                if self.vocab.is_iv(input_based):
                    return input_based
                output_based = output_word[:pos] + src_char + output_word[pos + 1:]
                if self.vocab.is_iv(output_based):
                    return output_based
        return output_word
+# ═══════════════════════════════════════════════════════════════════════════════
+# SPLIT/MERGE SPECIALIST
+# ═══════════════════════════════════════════════════════════════════════════════
+class SplitMergeSpecialist:
+    """Handles word splitting and merging with vocabulary validation."""
+    SEPARABLE_PREFIXES = [
+        'من', 'في', 'على', 'عن', 'مع', 'إلى', 'الى', 'حتى', 'منذ', 'خلال',
+        'بعد', 'قبل', 'بين', 'حول', 'تحت', 'فوق', 'أمام', 'وراء', 'دون',
+        'أن', 'لن', 'لم', 'قد', 'سوف', 'كي', 'إذا', 'لو', 'مثل', 'غير',
+        'يا',
+    ]
+    PROTECTED_WORDS = {
+        'في', 'من', 'على', 'عن', 'مع', 'إلى', 'الى', 'ان', 'أن', 'لا', 'ما', 'هو', 'هي',
+        'لم', 'لن', 'قد', 'كل', 'كان', 'ذلك', 'هذا', 'هذه', 'التي', 'الذي', 'بين',
+    }
+    ATTACHED_PREFIXES = [
+        'وال', 'بال', 'فال', 'كال', 'لل',
+        'وب', 'وف', 'ول', 'وك', 'وم', 'ون',
+        'فب', 'فل', 'فك', 'فم',
+    ]
+    PRONOUN_SUFFIXES = {'كم', 'هم', 'ها', 'هن', 'كن', 'نا', 'هما', 'كما', 'تم', 'تن'}
+    def __init__(self, vocab_manager):
+        self.vocab = vocab_manager
+        self.separable_prefixes = sorted(
+            self.SEPARABLE_PREFIXES, key=len, reverse=True
+        )
+    def split_word(self, word: str) -> str:
+        if len(word) < 5:
+            return word
+        if self.vocab.is_iv(word):
+            return word
+        if word in self.PROTECTED_WORDS:
+            return word
+        for prefix in self.ATTACHED_PREFIXES:
+            if word.startswith(prefix):
+                remainder = word[len(prefix):]
+                if self.vocab.is_iv(remainder):
+                    return word
+                if prefix.endswith('ال') and self.vocab.is_iv(remainder):
+                    return word
+        for prefix in self.separable_prefixes:
+            if word.startswith(prefix) and len(word) > len(prefix) + 2:
+                remainder = word[len(prefix):]
+                if self.vocab.is_iv(remainder):
+                    return f"{prefix} {remainder}"
+        for i in range(3, len(word) - 2):
+            left = word[:i]
+            right = word[i:]
+            if self.vocab.is_iv(left) and self.vocab.is_iv(right):
+                return f"{left} {right}"
+        return word
+    def merge_fragments(self, text: str) -> str:
+        words = text.split()
+        if len(words) < 2:
+            return text
+        result = []
+        i = 0
+        while i < len(words):
+            word = words[i]
+            if i + 1 < len(words):
+                next_word = words[i + 1]
+                merged = word + next_word
+                if len(next_word) == 1 and next_word in 'ةهاي':
+                    if self.vocab.is_iv(merged):
+                        result.append(merged)
+                        i += 2
+                        continue
+                if word == 'ال' and len(next_word) >= 2:
+                    if self.vocab.is_iv(merged):
+                        result.append(merged)
+                        i += 2
+                        continue
+                if self.vocab.is_oov(word) and self.vocab.is_oov(next_word):
+                    if self.vocab.is_iv(merged):
+                        result.append(merged)
+                        i += 2
+                        continue
+                if len(word) <= 2 and self.vocab.is_oov(word):
+                    if self.vocab.is_iv(merged):
+                        result.append(merged)
+                        i += 2
+                        continue
+                if next_word in self.PRONOUN_SUFFIXES:
+                    if self.vocab.is_iv(merged) and not self.vocab.is_iv(word):
+                        result.append(merged)
+                        i += 2
+                        continue
+                if len(word) <= 3 and len(next_word) <= 3:
+                    if len(merged) >= 5 and self.vocab.is_iv(merged):
+                        result.append(merged)
+                        i += 2
+                        continue
+            result.append(word)
+            i += 1
+        return ' '.join(result)
+    def process_text(self, text: str) -> str:
+        text = self.merge_fragments(text)
+        words = text.split()
+        processed = []
+        for word in words:
+            if self.vocab.is_oov(word) and len(word) >= 4:
+                split_result = self.split_word(word)
+                processed.append(split_result)
+            else:
+                processed.append(word)
+        return ' '.join(processed)
+# ═══════════════════════════════════════════════════════════════════════════════
+# EDIT DISTANCE CORRECTOR
+# ═══════════════════════════════════════════════════════════════════════════════
+class EditDistanceCorrector:
+    """Generates candidates based on Levenshtein distance."""
+    def __init__(self, tokenizer):
+        self.tokenizer = tokenizer
+        self.vocab = {
+            w for w in tokenizer.get_vocab().keys()
+            if w.isalpha() and not w.startswith('##') and len(w) > 1
+        }
+        self.vocab_rank = {w: i for w, i in tokenizer.get_vocab().items()}
+    def edits1(self, word):
+        letters    = 'أابتثجحخدذرزسشصضطظعغفقكلمنهويءآىةئؤ'
+        splits     = [(word[:i], word[i:])    for i in range(len(word) + 1)]
+        deletes    = [L + R[1:]               for L, R in splits if R]
+        transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1]
+        replaces   = [L + c + R[1:]           for L, R in splits if R for c in letters]
+        inserts    = [L + c + R               for L, R in splits for c in letters]
+        return set(deletes + transposes + replaces + inserts)
+    def edits2(self, word):
+        return (e2 for e1 in self.edits1(word) for e2 in self.edits1(e1))
+    def known(self, words):
+        return set(w for w in words if w in self.vocab)
+    def generate_candidate(self, text: str) -> str:
+        words = text.split()
+        corrected_words = []
+        for word in words:
+            clean_word = re.sub(r'[^\w]', '', word)
+            if clean_word in self.vocab:
+                corrected_words.append(word)
+                continue
+            candidates = self.known(self.edits1(clean_word))
+            if not candidates:
+                if len(clean_word) < 7:
+                    candidates = self.known(self.edits2(clean_word))
+            if candidates:
+                best_candidate = min(candidates, key=lambda w: self.vocab_rank.get(w, 999999))
+                corrected_words.append(best_candidate)
+            else:
+                corrected_words.append(word)
+        return ' '.join(corrected_words)
+# ═══════════════════════════════════════════════════════════════════════════════
+# CONTEXTUAL CORRECTOR (MLM-based) — Optional, disabled by default to save RAM
+# ═══════════════════════════════════════════════════════════════════════════════
+class ContextualCorrector:
+    """Masked-language-model scorer and refiner used for contextual spelling decisions.
+    Wraps a Hugging Face masked LM: it scores how well a word fits a position
+    in a sentence, proposes replacement tokens for a position, and builds
+    spelling candidates from letter-confusion pairs and single-character edits.
+    """
+    # Letter pairs treated as mutually confusable. Each pair is registered in
+    # both directions by _build_confusion_map().
+    # NOTE(review): ('و', 'و') maps a letter to itself and ('ق', 'ك') / ('ك', 'ق')
+    # repeat the same pair, so they add no new candidates — confirm whether
+    # other letters were intended.
+    CONFUSION_PAIRS = [
+        ('ض', 'ظ'), ('ذ', 'ز'), ('ث', 'س'), ('ص', 'س'),
+        ('ط', 'ت'), ('ق', 'ك'), ('ه', 'ة'), ('ا', 'ى'),
+        ('ت', 'د'), ('د', 'ض'), ('ك', 'ق'), ('غ', 'ق'),
+        ('ج', 'ش'), ('س', 'ز'), ('ف', 'ب'), ('و', 'و'),
+        ('ؤ', 'و'), ('ئ', 'ي'), ('ء', 'أ'), ('إ', 'أ'),
+    ]
+    def __init__(self, model_name: str = 'aubmindlab/bert-base-arabertv02', cache_size: int = 10000):
+        """Load the tokenizer and masked LM for ``model_name`` and set up the score cache.
+        Args:
+            model_name: Identifier passed to ``from_pretrained`` for both the
+                tokenizer and the model.
+            cache_size: Maximum number of entries kept in the score cache.
+        """
+        # Imported here rather than at module level, so transformers is only
+        # needed when a ContextualCorrector is actually constructed.
+        from transformers import AutoTokenizer, AutoModelForMaskedLM
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+        self.model = AutoModelForMaskedLM.from_pretrained(model_name)
+        # Use the GPU when one is available, otherwise the CPU.
+        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        self.model = self.model.to(self.device)
+        self.model.eval()
+        self.confusion_map = self._build_confusion_map()
+        # Hit/miss counters for the score cache maintained by score_with_mlm().
+        self.cache_hits = 0
+        self.cache_misses = 0
+        self._score_cache = {}
+        self.cache_size = cache_size
+        # Token -> id mapping; used as a membership test for edit candidates.
+        self.vocab = self.tokenizer.get_vocab()
+    def _build_confusion_map(self):
+        """Return a dict mapping each letter in CONFUSION_PAIRS to its confusable letters."""
+        confusion_map = {}
+        for char1, char2 in self.CONFUSION_PAIRS:
+            if char1 not in confusion_map:
+                confusion_map[char1] = []
+            if char2 not in confusion_map:
+                confusion_map[char2] = []
+            # Register the pair in both directions; repeated pairs produce
+            # repeated list entries.
+            confusion_map[char1].append(char2)
+            confusion_map[char2].append(char1)
+        return confusion_map
+    def get_confusable_chars(self, char: str) -> List[str]:
+        """Return the letters confusable with ``char`` (empty list if none)."""
+        return self.confusion_map.get(char, [])
+    def generate_candidates(self, word: str) -> List[str]:
+        """Return spelling candidates for ``word``, with ``word`` itself first.
+        Candidates are appended in this order, without duplicates:
+        confusion-pair substitutions and doubled-letter collapses (both kept
+        regardless of vocabulary), then single insertions, single
+        substitutions (only for words shorter than 7 characters) and single
+        deletions, each kept only when present in the tokenizer vocabulary.
+        """
+        candidates = [word]
+        # 1. Swap each letter for its confusable counterparts.
+        for i, char in enumerate(word):
+            confusables = self.get_confusable_chars(char)
+            for conf_char in confusables:
+                candidate = word[:i] + conf_char + word[i+1:]
+                if candidate not in candidates:
+                    candidates.append(candidate)
+        # 2. Drop one letter of every adjacent identical pair.
+        for i in range(len(word) - 1):
+            if word[i] == word[i+1]:
+                candidate = word[:i] + word[i+1:]
+                if candidate not in candidates:
+                    candidates.append(candidate)
+        COMMON_CHARS = 'ابتثجحخدذرزسشصضطظعغفقكلمنهويأإآءئؤةى'
+        # 3. Insert one letter at every position (vocabulary-filtered).
+        for i in range(len(word) + 1):
+            for char in COMMON_CHARS:
+                candidate = word[:i] + char + word[i:]
+                if candidate in self.vocab and candidate not in candidates:
+                    candidates.append(candidate)
+        # 4. Replace one letter (vocabulary-filtered), short words only.
+        if len(word) < 7:
+            for i in range(len(word)):
+                for char in COMMON_CHARS:
+                    if char != word[i]:
+                        candidate = word[:i] + char + word[i+1:]
+                        if candidate in self.vocab and candidate not in candidates:
+                            candidates.append(candidate)
+        # 5. Delete one letter (vocabulary-filtered), keeping at least two letters.
+        for i in range(len(word)):
+            candidate = word[:i] + word[i+1:]
+            if len(candidate) > 1:
+                if candidate in self.vocab and candidate not in candidates:
+                    candidates.append(candidate)
+        return candidates
+    def score_with_mlm(self, text: str, position: int, word: str) -> float:
+        """Return the model probability of ``word`` at whitespace position ``position`` of ``text``.
+        The token at ``position`` is replaced by ``[MASK]`` and the softmax
+        probability of the FIRST sub-token of ``word`` at the mask is returned.
+        Results are cached per (text, position, word). Returns 0.0 when the
+        position is out of range, when no mask token is found in the encoded
+        input, or when ``word`` encodes to no tokens; those 0.0 results are
+        not cached.
+        """
+        cache_key = f"{text}|{position}|{word}"
+        if cache_key in self._score_cache:
+            self.cache_hits += 1
+            return self._score_cache[cache_key]
+        self.cache_misses += 1
+        words = text.split()
+        if position >= len(words):
+            return 0.0
+        masked_words = words.copy()
+        masked_words[position] = '[MASK]'
+        masked_text = ' '.join(masked_words)
+        inputs = self.tokenizer(masked_text, return_tensors='pt', padding=True, truncation=True)
+        inputs = {k: v.to(self.device) for k, v in inputs.items()}
+        with torch.no_grad():
+            outputs = self.model(**inputs)
+            predictions = outputs.logits
+        # Column indices of the mask token in the encoded input.
+        mask_token_index = (inputs['input_ids'] == self.tokenizer.mask_token_id).nonzero(as_tuple=True)[1]
+        if len(mask_token_index) == 0:
+            return 0.0
+        mask_token_logits = predictions[0, mask_token_index[0], :]
+        probs = torch.softmax(mask_token_logits, dim=0)
+        word_tokens = self.tokenizer.encode(word, add_special_tokens=False)
+        if not word_tokens:
+            return 0.0
+        # Only the first sub-token is scored, even if the word spans several.
+        word_token_id = word_tokens[0]
+        score = probs[word_token_id].item()
+        # Once the cache is full, evict the first key in dict iteration order.
+        if len(self._score_cache) >= self.cache_size:
+            self._score_cache.pop(next(iter(self._score_cache)))
+        self._score_cache[cache_key] = score
+        return score
+    def score_candidates_batch(self, text: str, position: int, candidates: List[str]) -> dict:
+        """Return ``{candidate: score}`` by calling score_with_mlm() once per candidate."""
+        scores = {}
+        for candidate in candidates:
+            scores[candidate] = self.score_with_mlm(text, position, candidate)
+        return scores
+    def predict_masked_token(self, text: str, position: int, top_k: int = 5) -> List[Tuple[str, float]]:
+        """Return up to ``top_k`` ``(token, probability)`` predictions for ``position`` of ``text``.
+        Tokens starting with ``##`` and special tokens are dropped after the
+        top-k selection, so fewer than ``top_k`` items may be returned. An
+        empty list is returned when the position is out of range or no mask
+        token is found in the encoded input.
+        """
+        words = text.split()
+        if position >= len(words):
+            return []
+        masked_words = words.copy()
+        masked_words[position] = '[MASK]'
+        masked_text = ' '.join(masked_words)
+        inputs = self.tokenizer(masked_text, return_tensors='pt', padding=True, truncation=True).to(self.device)
+        with torch.no_grad():
+            outputs = self.model(**inputs)
+            predictions = outputs.logits
+        mask_token_index = (inputs['input_ids'] == self.tokenizer.mask_token_id).nonzero(as_tuple=True)[1]
+        if len(mask_token_index) == 0:
+            return []
+        mask_token_logits = predictions[0, mask_token_index[0], :]
+        probs = torch.softmax(mask_token_logits, dim=0)
+        top_k_weights, top_k_indices = torch.topk(probs, top_k, sorted=True)
+        results = []
+        for i in range(top_k):
+            token_id = top_k_indices[i].item()
+            score = top_k_weights[i].item()
+            token = self.tokenizer.decode([token_id]).strip()
+            if not token.startswith("##") and token not in self.tokenizer.all_special_tokens:
+                results.append((token, score))
+        return results
+    def refine_sentence_with_mask(self, text: str, threshold: float = 0.001, vocab_manager=None, raw_model_output=None) -> str:
+        """Replace low-scoring words of ``text`` with near-identical model predictions.
+        Args:
+            text: Sentence to refine; words are whitespace-separated.
+            threshold: Words whose own score exceeds this value are left alone.
+            vocab_manager: Optional object providing ``is_iv`` / ``is_oov``;
+                when given, in-vocabulary words are skipped and
+                out-of-vocabulary predictions are rejected.
+            raw_model_output: Optional sentence; a word equal to the word at
+                the same position in it is left alone.
+        Returns:
+            The sentence with accepted replacements, joined by single spaces.
+            All scoring is done against the original ``text``, not against
+            the partially refined sentence.
+        """
+        words = text.split()
+        refined_words = words.copy()
+        raw_words = raw_model_output.split() if raw_model_output else []
+        for i, word in enumerate(words):
+            if vocab_manager and vocab_manager.is_iv(word):
+                continue
+            if i < len(raw_words) and word == raw_words[i]:
+                continue
+            if len(word) <= 2:
+                continue
+            current_score = self.score_with_mlm(text, i, word)
+            if current_score > threshold:
+                continue
+            predictions = self.predict_masked_token(text, i, top_k=10)
+            for pred_word, pred_score in predictions:
+                if pred_word == word:
+                    continue
+                # Reject predictions whose length differs by more than one character.
+                if abs(len(pred_word) - len(word)) > 1:
+                     continue
+                # Require at least 90% similarity, computed from the edit
+                # distance relative to the longer word.
+                dist = Levenshtein.distance(word, pred_word)
+                max_len = max(len(word), len(pred_word))
+                similarity = 1.0 - (dist / max_len)
+                if similarity < 0.90:
+                    continue
+                if vocab_manager and vocab_manager.is_oov(pred_word):
+                    continue
+                if pred_score < 0.12:
+                    continue
+                # With the default threshold (0.001) this flag is always False
+                # here, because words scoring above the threshold were skipped
+                # earlier; it can only be True for a larger threshold.
+                is_original_common = current_score > 0.001
+                if is_original_common:
+                     if pred_score > current_score * 1000:
+                         refined_words[i] = pred_word
+                         break
+                else:
+                    if pred_score > current_score * 50 and pred_score > 0.2:
+                        refined_words[i] = pred_word
+                        break
+        return ' '.join(refined_words)
+    def calculate_sentence_score(self, text: str) -> float:
+        """Return the mean score_with_mlm() value over all words of ``text`` (0.0 if empty)."""
+        words = text.split()
+        if not words:
+            return 0.0
+        total_score = 0.0
+        scored_words = 0
+        for i, word in enumerate(words):
+            score = self.score_with_mlm(text, i, word)
+            total_score += score
+            scored_words += 1
+        if scored_words == 0:
+            return 0.0
+        return total_score / scored_words
+# ═══════════════════════════════════════════════════════════════════════════════
+# MAIN SPELL CHECKER CLASS
+# ═══════════════════════════════════════════════════════════════════════════════
+class ArabicSpellChecker:
+    """Main Arabic Spell Checker class"""
+    def __init__(self, model, tokenizer, device, use_contextual: bool = True):
+        self.model = model
+        self.tokenizer = tokenizer
+        self.device = device
+        self.postprocessor = AraSpellPostProcessor()
+        self.classifier = ErrorClassifier()
+        self.rules = RulesBasedCorrector()
+        self.validator = OutputValidator()
+        self.vocab_manager = VocabularyManager(tokenizer)
+        self.edit_corrector = EditDistanceCorrector(tokenizer)
+        self.split_merge = SplitMergeSpecialist(self.vocab_manager)
+        self.word_aligner = WordAligner(self.vocab_manager)
+        self.use_contextual = use_contextual
+        if use_contextual:
+            try:
+                self.contextual = ContextualCorrector()
+                logger.info("Contextual correction enabled")
+            except Exception as e:
+                logger.warning(f"Contextual correction disabled: {e}")
+                self.contextual = None
+                self.use_contextual = False
+        else:
+            self.contextual = None
+    def _fix_repeated_end_chars(self, text: str) -> str:
+        text = re.sub(r'([ا-ي])\1+\b', r'\1', text)
+        return text
+    def _fix_merged_with_errors(self, text: str) -> str:
+        text = re.sub(r'ال([ا-ي])\1+([ا-ي]{2,})', r'ال\2', text)
+        text = re.sub(r'\b([ا-ي]{3,})([ا-ي])\2+\b', r'\1\2', text)
+        return text
+    def _split_merged_words_linguistic(self, text: str) -> str:
+        text = re.sub(
+            r'\b(في|من|إلى|الى|حتى|منذ|خلال|بعد|قبل)(ال)?([ا-ي]{3,})',
+            r'\1 \2\3', text
+        )
+        text = re.sub(r'\b(كل)([ا-ي]{3,})', r'\1 \2', text)
+        text = re.sub(r'([ا-ي]{3,})(ال)([ا-ي]{3,})', r'\1 \2\3', text)
+        text = re.sub(r'\b([بلك])(ال)?([ا-ي]{3,})', r'\1 \2\3', text)
+        text = re.sub(r'([ا-ي]{4,})(عليكم|عليك|عليه|عليها)', r'\1 \2', text)
+        text = re.sub(r'([ا-ي]{3,})(على|عن)([ا-ي]{3,})', r'\1 \2 \3', text)
+        return text
+    def _split_long_words_heuristic(self, text: str, max_length: int = 15) -> str:
+        words = text.split()
+        result = []
+        for word in words:
+            if len(word) <= max_length:
+                result.append(word)
+                continue
+            if 'ال' in word[2:]:
+                parts = word.split('ال', 1)
+                if len(parts[0]) >= 2 and len(parts[1]) >= 3:
+                    result.extend([parts[0], 'ال' + parts[1]])
+                    continue
+            if len(word) >= 8:
+                split_found = False
+                for split_pos in [2, 3]:
+                    prefix = word[:split_pos]
+                    suffix = word[split_pos:]
+                    if prefix in ['في', 'من', 'على', 'عن', 'مع', 'كل', 'ب', 'ل', 'ك']:
+                        result.extend([prefix, suffix])
+                        split_found = True
+                        break
+                if not split_found:
+                    result.append(word)
+            else:
+                result.append(word)
+        return ' '.join(result)
+    def _normalize_tanween_patterns(self, text: str) -> str:
+        text = re.sub(r'([ا-ي]{2,})أ\b', r'\1اً', text)
+        text = re.sub(r'\s+أ\s+', ' ', text)
+        text = re.sub(r'\b([بلك])\s+([ا-ي])', r'\1\2', text)
+        return text
+    def preprocess(self, text: str) -> str:
+        """Preprocessing pipeline"""
+        text = self.postprocessor.remove_harakat(text)
+        text = self.postprocessor.remove_tatweel(text)
+        text = self.postprocessor.normalize_special_chars(text)
+        text = self._fix_repeated_end_chars(text)
+        text = self._fix_merged_with_errors(text)
+        text = self._split_merged_words_linguistic(text)
+        text = self._split_long_words_heuristic(text)
+        text = self._normalize_tanween_patterns(text)
+        text = self.postprocessor.merge_separated_al(text)
+        text = self.postprocessor.unified_collapse_repeated(text)
+        text = self.rules.fix_char_substitution(text)
+        text = self.rules.fix_char_repetition(text)
+        text = self.postprocessor.normalize_spaces(text)
+        return text
+    def postprocess(self, text: str, original: str = "") -> str:
+        """Postprocessing pipeline"""
+        return self.postprocessor.full_postprocess(text, original, vocab_manager=self.vocab_manager)
+    def model_inference(self, text: str, num_return_sequences: int = 5) -> List[str]:
+        """Run seq2seq model inference and return top candidates."""
+        inputs = self.tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=128)
+        inputs = {k: v.to(self.device) for k, v in inputs.items()}
+        with torch.no_grad():
+            outputs = self.model.generate(
+                **inputs,
+                num_beams=5,
+                num_return_sequences=num_return_sequences,
+                early_stopping=True,
+                return_dict_in_generate=True,
+                output_scores=True
+            )
+        candidates = self.tokenizer.batch_decode(outputs.sequences, skip_special_tokens=True)
+        self._last_beam_scores = {}
+        if hasattr(outputs, 'sequences_scores') and outputs.sequences_scores is not None:
+            scores = outputs.sequences_scores.tolist()
+            for cand, score in zip(candidates, scores):
+                self._last_beam_scores[cand] = score
+        return candidates
+    def correct(self, text: str) -> str:
+        """
+        Main correction pipeline (RERANKING APPROACH)
+        Steps:
+        1. Preprocess
+        2. Generate Candidates (Model Beams + Baseline)
+        3. Rerank Candidates (Validator + Fluency)
+        4. Select Best
+        5. Postprocess
+        """
+        if not text or not text.strip():
+            return text
+        original = text
+        # 1. Preprocess
+        preprocessed_text = self.preprocess(text)
+        # 2. Classify error type
+        error_type = self.classifier.classify(preprocessed_text)
+        # 3. Generate Candidates
+        candidates = []
+        candidates.append(preprocessed_text)
+        rules_candidate = self.rules.advanced_heuristic_repair(text)
+        candidates.append(rules_candidate)
+        edit_candidate = self.edit_corrector.generate_candidate(text)
+        if edit_candidate != text and edit_candidate != rules_candidate:
+            candidates.append(edit_candidate)
+        raw_model_output = None
+        try:
+            model_candidates = self.model_inference(preprocessed_text, num_return_sequences=5)
+            raw_model_output = model_candidates[0] if model_candidates else None
+            candidates.extend(model_candidates)
+            if model_candidates:
+                hybrid_candidate = self.word_aligner.align_words(preprocessed_text, model_candidates[0])
+                if hybrid_candidate not in candidates:
+                    candidates.append(hybrid_candidate)
+                for beam in model_candidates[1:3]:
+                    hybrid_beam = self.word_aligner.align_words(preprocessed_text, beam)
+                    if hybrid_beam not in candidates:
+                        candidates.append(hybrid_beam)
+            if model_candidates and len(model_candidates) >= 3:
+                try:
+                    beam_word_lists = [c.split() for c in model_candidates]
+                    max_words = max(len(wl) for wl in beam_word_lists)
+                    voted_words = []
+                    for pos in range(max_words):
+                        words_at_pos = []
+                        for wl in beam_word_lists:
+                            if pos < len(wl):
+                                words_at_pos.append(wl[pos])
+                        if words_at_pos:
+                            most_common = Counter(words_at_pos).most_common(1)[0][0]
+                            voted_words.append(most_common)
+                    voted_candidate = ' '.join(voted_words)
+                    if voted_candidate not in candidates:
+                        candidates.append(voted_candidate)
+                except Exception:
+                    pass
+        except Exception as e:
+            logger.warning(f"Model inference failed: {e}")
+        # Remove duplicates
+        unique_candidates = []
+        seen = set()
+        for c in candidates:
+            if c not in seen:
+                unique_candidates.append(c)
+                seen.add(c)
+        candidates = unique_candidates
+        # 4. Rerank Candidates
+        best_candidate = preprocessed_text
+        best_score = -1.0
+        candidate_scores = []
+        for cand in candidates:
+            is_valid, reason = self.validator.validate(original, cand, error_type.value)
+            if len(cand) < len(original) * 0.5:
+                is_valid = False
+                reason = "too_short"
+            input_oov_count = self.vocab_manager.count_oov_words(original)
+            cand_oov_count = self.vocab_manager.count_oov_words(cand)
+            vocab_boost = 1.0
+            if input_oov_count > 0 and cand_oov_count < input_oov_count:
+                oov_reduction = input_oov_count - cand_oov_count
+                vocab_boost = 1.0 + (oov_reduction * 0.3)
+                if cand_oov_count == 0 and self.vocab_manager.all_words_iv(cand):
+                    if not is_valid and reason not in ["empty_output"]:
+                        is_valid = True
+                        reason = "vocab_aware_accept"
+            elif cand_oov_count > input_oov_count:
+                vocab_boost = 0.5
+            elif input_oov_count == 0 and cand_oov_count == 0:
+                vocab_boost = 1.0
+            validity_factor = 1.0 if is_valid else 0.001
+            fluency_score = 0.0
+            if self.use_contextual and self.contextual:
+                try:
+                    fluency_score = self.contextual.calculate_sentence_score(cand)
+                except Exception as e:
+                    logger.warning(f"Scoring failed: {e}")
+                    fluency_score = 0.5
+            else:
+                fluency_score = 1.0
+            dist = VocabularyManager.damerau_levenshtein_distance(preprocessed_text, cand)
+            max_len = max(len(preprocessed_text), len(cand), 1)
+            similarity = 1.0 - (dist / max_len)
+            if cand == preprocessed_text:
+                similarity = 1.0
+            keyboard_bonus = 1.0
+            input_words = preprocessed_text.split()
+            cand_words = cand.split()
+            if len(input_words) == len(cand_words):
+                for iw, cw in zip(input_words, cand_words):
+                    if iw != cw and len(iw) == len(cw):
+                        for ic, cc in zip(iw, cw):
+                            if ic != cc and RulesBasedCorrector.is_keyboard_neighbor(ic, cc):
+                                keyboard_bonus *= 1.05
+            if fluency_score > 0.85 and cand_oov_count == 0:
+                 if not is_valid and reason in ["too_short", "low_character_similarity", "word_count_mismatch"]:
+                      if len(cand) >= len(original) * 0.4:
+                          is_valid = True
+                          reason = "high_confidence_override"
+                          vocab_boost *= 1.2
+                          validity_factor = 1.0
+            fluency_exp = 0.3
+            similarity_exp = 3.0
+            beam_boost = 1.0
+            if raw_model_output and cand == raw_model_output:
+                beam_boost = 1.15
+            final_score = (fluency_score ** fluency_exp) * (similarity ** similarity_exp) * validity_factor * vocab_boost * keyboard_bonus * beam_boost
+            candidate_scores.append({
+                'text': cand, 'is_valid': is_valid, 'reason': reason,
+                'fluency': fluency_score, 'similarity': similarity,
+                'vocab_boost': vocab_boost, 'input_oov': input_oov_count,
+                'cand_oov': cand_oov_count, 'final_score': final_score
+            })
+            if final_score > best_score:
+                best_score = final_score
+                best_candidate = cand
+        # Output Quality Scoring
+        if best_candidate != preprocessed_text:
+            preprocessed_score = 0.0
+            for cs in candidate_scores:
+                if cs['text'] == preprocessed_text:
+                    preprocessed_score = cs['final_score']
+                    break
+            if preprocessed_score > 0 and best_score < preprocessed_score * 1.05:
+                best_oov = self.vocab_manager.count_oov_words(best_candidate)
+                prep_oov = self.vocab_manager.count_oov_words(preprocessed_text)
+                if best_oov > prep_oov:
+                    best_candidate = preprocessed_text
+                    best_score = preprocessed_score
+        # Contextual Validation Layer
+        if best_candidate != preprocessed_text and self.use_contextual and self.contextual:
+            try:
+                input_fluency = self.contextual.calculate_sentence_score(preprocessed_text)
+                best_fluency = 0.0
+                for cs in candidate_scores:
+                    if cs['text'] == best_candidate:
+                        best_fluency = cs['fluency']
+                        break
+                if input_fluency > 0 and best_fluency > 0:
+                    if input_fluency > best_fluency * 1.5:
+                        input_oov = self.vocab_manager.count_oov_words(preprocessed_text)
+                        best_oov = self.vocab_manager.count_oov_words(best_candidate)
+                        if input_oov <= best_oov:
+                            best_candidate = preprocessed_text
+            except Exception:
+                pass
+        # 5. Postprocess Winner
+        result = self.postprocess(best_candidate, original)
+        # IV-Safe Postprocessing Check
+        if result != best_candidate:
+            result_words = result.split()
+            best_words = best_candidate.split()
+            if len(result_words) == len(best_words):
+                fixed_words = []
+                for idx_fw, (rw, bw) in enumerate(zip(result_words, best_words)):
+                    if rw != bw:
+                        bw_iv = self.vocab_manager.is_iv(bw)
+                        rw_iv = self.vocab_manager.is_iv(rw)
+                        if bw_iv and not rw_iv:
+                            fixed_words.append(bw)
+                        else:
+                            fixed_words.append(rw)
+                    else:
+                        fixed_words.append(rw)
+                result = ' '.join(fixed_words)
+        # 6. Contextual fine-tuning
+        if self.use_contextual and self.contextual:
+             if len(result) > 3:
+                 result = self.contextual.refine_sentence_with_mask(
+                     result, vocab_manager=self.vocab_manager,
+                     raw_model_output=raw_model_output
+                 )
+        # 7. Safe Split/Merge Post-processing
+        result = self.split_merge.merge_fragments(result)
+        # 8. Output Stability Test
+        if result != preprocessed_text and raw_model_output:
+            try:
+                re_preprocessed = self.preprocess(result)
+                stability_dist = VocabularyManager.damerau_levenshtein_distance(result, re_preprocessed)
+                result_len = max(len(result), 1)
+                if stability_dist > 0:
+                    stability_ratio = stability_dist / result_len
+                    if stability_ratio > 0.15:
+                        raw_re = self.preprocess(raw_model_output)
+                        raw_stability = VocabularyManager.damerau_levenshtein_distance(
+                            raw_model_output, raw_re
+                        ) / max(len(raw_model_output), 1)
+                        if raw_stability < stability_ratio:
+                            raw_oov = self.vocab_manager.count_oov_words(raw_model_output)
+                            our_oov = self.vocab_manager.count_oov_words(result)
+                            if raw_oov <= our_oov:
+                                result = raw_model_output
+            except Exception:
+                pass
+        # 9. Bidirectional Word-Level Validation
+        if raw_model_output and result != raw_model_output:
+            result_words = result.split()
+            raw_words = raw_model_output.split()
+            if len(result_words) == len(raw_words):
+                corrected_words = []
+                changed = False
+                for rw, raw_w in zip(result_words, raw_words):
+                    if rw != raw_w:
+                        rw_iv = self.vocab_manager.is_iv(rw)
+                        raw_iv = self.vocab_manager.is_iv(raw_w)
+                        if not rw_iv and raw_iv:
+                            corrected_words.append(raw_w)
+                            changed = True
+                        elif rw_iv and raw_iv:
+                            input_words_list = preprocessed_text.split()
+                            idx = len(corrected_words)
+                            if idx < len(input_words_list):
+                                input_w = input_words_list[idx]
+                                rw_dist = Levenshtein.distance(input_w, rw)
+                                raw_dist = Levenshtein.distance(input_w, raw_w)
+                                if raw_dist < rw_dist:
+                                    corrected_words.append(raw_w)
+                                    changed = True
+                                else:
+                                    corrected_words.append(rw)
+                            else:
+                                corrected_words.append(rw)
+                        else:
+                            corrected_words.append(rw)
+                    else:
+                        corrected_words.append(rw)
+                if changed:
+                    new_result = ' '.join(corrected_words)
+                    new_oov = self.vocab_manager.count_oov_words(new_result)
+                    old_oov = self.vocab_manager.count_oov_words(result)
+                    if new_oov <= old_oov:
+                        result = new_result
+        # 10. SAFETY NET
+        if raw_model_output and raw_model_output != result:
+            raw_oov = self.vocab_manager.count_oov_words(raw_model_output)
+            our_oov = self.vocab_manager.count_oov_words(result)
+            if raw_oov == 0 and our_oov > 0:
+                is_valid, reason = self.validator.validate(original, raw_model_output, "mixed")
+                if is_valid or reason == "space_leniency_accept":
+                    result = raw_model_output
+            elif raw_oov == 0 and our_oov == 0:
+                raw_dist = VocabularyManager.damerau_levenshtein_distance(original, raw_model_output)
+                our_dist = VocabularyManager.damerau_levenshtein_distance(original, result)
+                result_vs_raw_dist = VocabularyManager.damerau_levenshtein_distance(result, raw_model_output)
+                if raw_dist < our_dist and result_vs_raw_dist <= 3:
+                    raw_valid, _ = self.validator.validate(original, raw_model_output, "mixed")
+                    if raw_valid:
+                        result = raw_model_output
+            elif raw_oov == 0:
+                raw_wc = len(raw_model_output.split())
+                our_wc = len(result.split())
+                if raw_wc != our_wc:
+                    raw_dist = VocabularyManager.damerau_levenshtein_distance(original, raw_model_output)
+                    our_dist = VocabularyManager.damerau_levenshtein_distance(original, result)
+                    if raw_dist < our_dist:
+                        raw_valid, _ = self.validator.validate(original, raw_model_output, "mixed")
+                        if raw_valid:
+                            result = raw_model_output
+        # ── FINAL PASS: Hamza whitelist + Ta Marbuta fixes (unrevertable) ──
+        # These are applied AFTER all validation/safety steps so they can't
+        # be undone by Steps 8-10 which compare against raw_model_output.
+        # The root issue: Steps 8-10 use edit distance to INPUT (which has errors)
+        # so they revert corrections back to the erroneous form.
+        result = AraSpellPostProcessor.fix_common_hamza(result)
+        result = AraSpellPostProcessor.fix_ha_ta_marbuta(result, vocab_manager=self.vocab_manager)
+        return result

src/nlp/punctuation/spelling/araspell_service.py ADDED Viewed

	@@ -0,0 +1,105 @@

+"""
+AraSpell Service — Lazy-loaded Arabic spelling correction.
+Model is loaded on first request and kept in memory.
+Pre-downloaded during Docker build; loaded from HF cache at runtime (no network needed).
+"""
+import os
+import logging
+import time
+import torch
+logger = logging.getLogger(__name__)  # module-level logger, named after this module
+# ── Lazy-loaded singletons (populated by get_spelling_model) ──
+_spell_checker = None  # the ArabicSpellChecker instance once loading succeeds
+_load_error = None  # str(exception) from a failed load; once set, later calls raise without retrying
+# Model identifiers
+MODEL_REPO = 'bayan10/AraSpell-Model'  # Hugging Face Hub repo holding the checkpoint
+MODEL_FILENAME = 'last_model.pt'  # checkpoint file fetched from MODEL_REPO
+TOKENIZER_NAME = 'aubmindlab/bert-base-arabertv02'  # used for the tokenizer and for both encoder and decoder init
+def get_spelling_model():
+    """
+    Lazy-load the spelling model on first call.
+    Returns the ArabicSpellChecker instance, or raises RuntimeError if loading fails.
+    """
+    global _spell_checker, _load_error
+    if _spell_checker is not None:
+        return _spell_checker
+    if _load_error is not None:
+        raise RuntimeError(f"Spelling model previously failed to load: {_load_error}")
+    try:
+        t0 = time.time()
+        logger.info("Loading AraSpell spelling model (lazy init)...")
+        from huggingface_hub import hf_hub_download
+        from transformers import AutoTokenizer, EncoderDecoderModel
+        # 1. Download checkpoint (from HF cache — pre-downloaded in Docker build)
+        logger.info(f"Resolving checkpoint: {MODEL_REPO}/{MODEL_FILENAME}")
+        model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILENAME)
+        logger.info(f"Checkpoint path: {model_path}")
+        # 2. Load tokenizer
+        logger.info(f"Loading tokenizer: {TOKENIZER_NAME}")
+        tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)
+        # 3. Build encoder-decoder model from AraBERT
+        logger.info("Building EncoderDecoderModel from AraBERT...")
+        model = EncoderDecoderModel.from_encoder_decoder_pretrained(
+            TOKENIZER_NAME, TOKENIZER_NAME
+        )
+        # 4. Configure generation
+        model.config.decoder_start_token_id = tokenizer.cls_token_id
+        model.config.pad_token_id = tokenizer.pad_token_id
+        model.config.eos_token_id = tokenizer.sep_token_id
+        model.generation_config.max_length = 128
+        model.generation_config.decoder_start_token_id = tokenizer.cls_token_id
+        model.generation_config.pad_token_id = tokenizer.pad_token_id
+        model.generation_config.eos_token_id = tokenizer.sep_token_id
+        # 5. Load trained weights
+        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        logger.info(f"Loading checkpoint weights on {device}...")
+        checkpoint = torch.load(model_path, map_location=device, weights_only=False)
+        model.load_state_dict(checkpoint['model_state_dict'], strict=False)
+        model = model.to(device)
+        model.eval()
+        epoch = checkpoint.get('epoch', 'N/A')
+        logger.info(f"Spelling model loaded on {device}, epoch: {epoch}")
+        # 6. Initialize the spell checker pipeline (contextual=True for MLM-based refinement)
+        from nlp.spelling.araspell_rules import ArabicSpellChecker
+        _spell_checker = ArabicSpellChecker(
+            model, tokenizer, device, use_contextual=True
+        )
+        elapsed = time.time() - t0
+        logger.info(f"AraSpell ready in {elapsed:.1f}s")
+        return _spell_checker
+    except Exception as e:
+        import traceback
+        _load_error = str(e)
+        logger.error(f"Failed to load spelling model: {e}")
+        logger.error(traceback.format_exc())
+        raise RuntimeError(f"Spelling model load failed: {e}")
+def is_loaded() -> bool:
+    """Check if the spelling model is loaded."""
+    return _spell_checker is not None
+def get_load_error() -> str:
+    """Return the last load error, or empty string."""
+    return _load_error or ""

tests/phase10/reports/collision_benchmark_results.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tests/phase10/reports/phase10_results.json CHANGED Viewed

The diff for this file is too large to render. See raw diff