Commit ·
70184c4
1
Parent(s): 16ae935
UI/UX: Add Quran standalone tab, summarize textarea, floating selection bar, fix clear editor, remove duplicate button
Browse files- analyze_failures.py +67 -0
- src/css/components.css +91 -0
- src/index.html +289 -17
- src/js/editor.js +1 -2
- src/nlp/punctuation/spelling/__init__.py +1 -0
- src/nlp/punctuation/spelling/araspell_rules.py +1615 -0
- src/nlp/punctuation/spelling/araspell_service.py +105 -0
- tests/phase10/reports/collision_benchmark_results.json +0 -0
- tests/phase10/reports/phase10_results.json +0 -0
analyze_failures.py
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Analyze remaining 24 failures after Layer 1/2/3 fixes."""
|
| 2 |
+
import json
import re

# Benchmark report read by this script.
RESULTS_PATH = 'tests/phase10/reports/collision_benchmark_results.json'
# Sentence punctuation (Latin and Arabic) stripped from word ends before comparing.
TRAILING_PUNCT = '.،؛؟!?'
# Arabic combining marks U+064B-U+065F plus the superscript alef U+0670.
DIACRITICS_RE = re.compile(r'[\u064B-\u065F\u0670]')
# Substrings whose presence in an output word is reported as trailing junk.
JUNK_MARKERS = ('وومن', '.و', 'ةل')
# Pass rate (in percent) the summary measures the remaining gap against.
TARGET_PASS_PERCENT = 85


def strip_diacritics(text):
    """Return *text* with the Arabic marks matched by DIACRITICS_RE removed."""
    return DIACRITICS_RE.sub('', text)


def norm(t):
    """Return *t* without diacritics or trailing punctuation, whitespace collapsed."""
    t = strip_diacritics(t)
    t = t.rstrip(TRAILING_PUNCT)
    return re.sub(r'\s+', ' ', t).strip()


def diagnose_words(inp_w, exp_w, act_w):
    """Compare actual and expected words position by position.

    Args:
        inp_w: words of the original input.
        exp_w: words of the expected output.
        act_w: words of the pipeline output.

    Returns:
        A list of report lines: one per position whose actual word differs
        from the expected word by more than diacritics, plus a final line
        when the two word counts differ.
    """
    issues = []
    for i, (exp_raw, act_raw) in enumerate(zip(exp_w, act_w)):
        aw = act_raw.rstrip(TRAILING_PUNCT)
        ew = exp_raw.rstrip(TRAILING_PUNCT)
        iw = inp_w[i] if i < len(inp_w) else '—'
        if strip_diacritics(aw) == strip_diacritics(ew):
            continue  # tanween/diacritic only diff
        # Compare the input word on the same footing as the other two:
        # without this, an input word ending in punctuation never equals
        # the stripped output and is always reported as WRONG_FIX.
        iw_cmp = iw.rstrip(TRAILING_PUNCT)
        if iw_cmp == aw:
            cause = "MODEL_MISS"   # output left the input word unchanged
        elif iw_cmp == ew:
            cause = "CORRUPTED"    # input was already correct, output changed it
        else:
            cause = "WRONG_FIX"    # output changed the word, but not to the expected one
        issues.append(f" [{i}] '{iw}'→'{aw}' (exp:'{ew}') {cause}")
    if len(exp_w) != len(act_w):
        issues.append(f" word count: {len(act_w)} vs {len(exp_w)}")
    return issues


def has_trailing_junk(act_w):
    """Return True when any output word contains one of JUNK_MARKERS."""
    return any(marker in word for word in act_w for marker in JUNK_MARKERS)


def passes_needed(total, failures):
    """Return ``(extra_passes, target_passes)`` for TARGET_PASS_PERCENT.

    Integer ceiling division keeps the target exact (85% of 50 -> 43).
    """
    target = -(-TARGET_PASS_PERCENT * total // 100)
    return max(0, target - (total - failures)), target


def main():
    """Print a per-failure diagnosis followed by the fixability summary."""
    with open(RESULTS_PATH, 'r', encoding='utf-8') as f:
        data = json.load(f)

    results = data['results']
    failures = [r for r in results if r['pipeline_verdict'] == 'FN']

    for r in failures:
        exp = r['expected'].strip()
        act = r['pipeline_output'].strip()
        inp = r['input'].strip()
        act_w = act.split()

        print(f"\n{r['id']} [{r['category']}]")
        print(f" IN: {inp[:60]}")
        print(f" EXP: {exp[:60]}")
        print(f" ACT: {act[:60]}")
        for iss in diagnose_words(inp.split(), exp.split(), act_w):
            print(iss)
        if has_trailing_junk(act_w):
            print(" >>> TRAILING JUNK")

    # Summary of what each failure needs.
    # NOTE(review): treats every non-FN result as a pass - confirm that the
    # report has no other failing verdicts.
    needed, target = passes_needed(len(results), len(failures))
    print("\n" + "=" * 60)
    print("FIXABILITY ANALYSIS")
    print("=" * 60)
    print(f"\nTotal failures: {len(failures)}")
    print(f"Need: {needed} more passes to reach {TARGET_PASS_PERCENT}% ({target}/{len(results)})")


if __name__ == "__main__":
    main()
|
src/css/components.css
CHANGED
|
@@ -3433,3 +3433,94 @@ select:focus-visible,
|
|
| 3433 |
outline-offset: 2px;
|
| 3434 |
}
|
| 3435 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3433 |
outline-offset: 2px;
|
| 3434 |
}
|
| 3435 |
|
| 3436 |
+
/* ── Floating Selection Toolbar ── */
|
| 3437 |
+
.selection-toolbar {
|
| 3438 |
+
position: absolute;
|
| 3439 |
+
z-index: 1100;
|
| 3440 |
+
display: flex;
|
| 3441 |
+
align-items: center;
|
| 3442 |
+
gap: 2px;
|
| 3443 |
+
padding: 4px 6px;
|
| 3444 |
+
border-radius: 12px;
|
| 3445 |
+
background: var(--color-surface-elevated);
|
| 3446 |
+
border: 1px solid var(--color-border-strong);
|
| 3447 |
+
box-shadow: 0 8px 32px rgba(0,0,0,0.18), 0 0 0 1px rgba(255,255,255,0.05);
|
| 3448 |
+
backdrop-filter: blur(16px);
|
| 3449 |
+
transform: translateX(-50%);
|
| 3450 |
+
animation: selbar-in 0.2s ease;
|
| 3451 |
+
pointer-events: auto;
|
| 3452 |
+
}
|
| 3453 |
+
.selection-toolbar.is-hidden {
|
| 3454 |
+
display: none;
|
| 3455 |
+
}
|
| 3456 |
+
@keyframes selbar-in {
|
| 3457 |
+
from { opacity: 0; transform: translateX(-50%) translateY(6px); }
|
| 3458 |
+
to { opacity: 1; transform: translateX(-50%) translateY(0); }
|
| 3459 |
+
}
|
| 3460 |
+
.sel-tool-btn {
|
| 3461 |
+
display: inline-flex;
|
| 3462 |
+
align-items: center;
|
| 3463 |
+
gap: 5px;
|
| 3464 |
+
padding: 6px 12px;
|
| 3465 |
+
border: none;
|
| 3466 |
+
border-radius: 8px;
|
| 3467 |
+
background: transparent;
|
| 3468 |
+
color: var(--color-text-secondary);
|
| 3469 |
+
font-family: inherit;
|
| 3470 |
+
font-size: 12px;
|
| 3471 |
+
font-weight: 600;
|
| 3472 |
+
cursor: pointer;
|
| 3473 |
+
transition: all 0.15s ease;
|
| 3474 |
+
white-space: nowrap;
|
| 3475 |
+
}
|
| 3476 |
+
.sel-tool-btn:hover {
|
| 3477 |
+
background: var(--color-surface);
|
| 3478 |
+
color: var(--color-text-primary);
|
| 3479 |
+
}
|
| 3480 |
+
.sel-tool-sep {
|
| 3481 |
+
width: 1px;
|
| 3482 |
+
height: 20px;
|
| 3483 |
+
background: var(--color-border);
|
| 3484 |
+
flex-shrink: 0;
|
| 3485 |
+
}
|
| 3486 |
+
|
| 3487 |
+
/* ── Summary Source Toggle ── */
|
| 3488 |
+
.summary-source-toggle {
|
| 3489 |
+
display: flex;
|
| 3490 |
+
gap: 0;
|
| 3491 |
+
border-radius: 10px;
|
| 3492 |
+
background: var(--color-surface-elevated);
|
| 3493 |
+
border: 1px solid var(--color-border);
|
| 3494 |
+
padding: 3px;
|
| 3495 |
+
overflow: hidden;
|
| 3496 |
+
}
|
| 3497 |
+
.summary-source-btn {
|
| 3498 |
+
flex: 1;
|
| 3499 |
+
padding: 8px 16px;
|
| 3500 |
+
border: none;
|
| 3501 |
+
border-radius: 8px;
|
| 3502 |
+
background: transparent;
|
| 3503 |
+
color: var(--color-text-secondary);
|
| 3504 |
+
font-family: inherit;
|
| 3505 |
+
font-size: 13px;
|
| 3506 |
+
font-weight: 600;
|
| 3507 |
+
cursor: pointer;
|
| 3508 |
+
transition: all 0.2s ease;
|
| 3509 |
+
}
|
| 3510 |
+
.summary-source-btn:hover {
|
| 3511 |
+
color: var(--color-text-primary);
|
| 3512 |
+
}
|
| 3513 |
+
.summary-source-btn.active {
|
| 3514 |
+
background: linear-gradient(135deg, var(--color-primary), var(--color-secondary));
|
| 3515 |
+
color: var(--color-text-inverse);
|
| 3516 |
+
box-shadow: 0 2px 8px rgba(107, 163, 224, 0.25);
|
| 3517 |
+
}
|
| 3518 |
+
|
| 3519 |
+
/* ── Editor Tab Icons ── */
|
| 3520 |
+
.editor-tab svg {
|
| 3521 |
+
margin-left: 4px;
|
| 3522 |
+
opacity: 0.7;
|
| 3523 |
+
}
|
| 3524 |
+
.editor-tab.active svg {
|
| 3525 |
+
opacity: 1;
|
| 3526 |
+
}
|
src/index.html
CHANGED
|
@@ -668,6 +668,7 @@
|
|
| 668 |
<button id="write-tab" onclick="switchTab('write')" class="editor-tab active" type="button">كتابة</button>
|
| 669 |
<button id="summarize-tab" onclick="switchTab('summarize')" class="editor-tab" type="button">تلخيص</button>
|
| 670 |
<button id="dialect-tab" onclick="switchTab('dialect')" class="editor-tab" type="button">تحويل للفصحى</button>
|
|
|
|
| 671 |
<button id="docs-sidebar-toggle" class="docs-sidebar-toggle-mobile btn-ghost lg:hidden" type="button" aria-label="مستنداتي" aria-expanded="false" aria-controls="docs-sidebar">
|
| 672 |
<svg width="18" height="18" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M3 7v10a2 2 0 002 2h14a2 2 0 002-2V9a2 2 0 00-2-2h-6l-2-2H5a2 2 0 00-2 2z"/></svg>
|
| 673 |
</button>
|
|
@@ -679,7 +680,7 @@
|
|
| 679 |
</div>
|
| 680 |
<span id="auto-save-status" class="text-xs text-secondary" style="opacity:0;transition:opacity 0.3s;"></span>
|
| 681 |
<div class="window-dots" aria-hidden="true">
|
| 682 |
-
<span class="dot dot--red" title="مسح المحرر" onclick="
|
| 683 |
<span class="dot dot--yellow" title="طي لوحة الاقتراحات" onclick="document.querySelector('.sidebar-desktop')?.classList.toggle('collapsed')" style="cursor:pointer;"></span>
|
| 684 |
<span class="dot dot--green" title="توسيع المحرر للعرض الكامل" onclick="document.querySelector('.editor-layout')?.classList.toggle('editor-fullscreen')" style="cursor:pointer;"></span>
|
| 685 |
</div>
|
|
@@ -804,8 +805,25 @@
|
|
| 804 |
<p class="popover-hint">اختر التصحيح المناسب · Escape للإغلاق</p>
|
| 805 |
</div>
|
| 806 |
</div>
|
| 807 |
-
|
| 808 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 809 |
<div class="summary-mode-toggle">
|
| 810 |
<button type="button" class="summary-mode-btn active" id="summary-mode-paragraph" onclick="setSummaryMode('paragraph')">
|
| 811 |
<svg width="14" height="14" fill="currentColor" viewBox="0 0 24 24"><path d="M3 5h18v2H3V5zm0 8h18v2H3v-2zm0 4h12v2H3v-2z"/></svg>
|
|
@@ -875,7 +893,55 @@
|
|
| 875 |
</div>
|
| 876 |
<div id="dialect-result" class="text-right text-lg editor-content" dir="rtl" style="line-height: 2;"></div>
|
| 877 |
</div>
|
| 878 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 879 |
<div class="editor-footer">
|
| 880 |
<div class="editor-stats" role="status" aria-label="إحصائيات">
|
| 881 |
<div class="flex items-center gap-2"><span class="stat-dot stat-dot--spelling" aria-hidden="true"></span><span class="text-sm text-secondary"><span id="spelling-count">٠</span> إملائي</span></div>
|
|
@@ -1213,18 +1279,201 @@
|
|
| 1213 |
|
| 1214 |
|
| 1215 |
function switchTab(tab) {
|
| 1216 |
-
const
|
| 1217 |
-
const summarizeTab = document.getElementById('summarize-tab');
|
| 1218 |
-
const dialectTab = document.getElementById('dialect-tab');
|
| 1219 |
-
const writeArea = document.getElementById('write-area');
|
| 1220 |
-
const summarizeArea = document.getElementById('summarize-area');
|
| 1221 |
-
const dialectArea = document.getElementById('dialect-area');
|
| 1222 |
const formatToolbar = document.getElementById('format-toolbar');
|
| 1223 |
-
|
| 1224 |
-
|
| 1225 |
-
|
| 1226 |
-
|
| 1227 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1228 |
}
|
| 1229 |
let _dialectResult = '';
|
| 1230 |
async function convertDialect() {
|
|
@@ -1533,11 +1782,17 @@
|
|
| 1533 |
}
|
| 1534 |
|
| 1535 |
async function generateSummary(event) {
|
| 1536 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1537 |
|
| 1538 |
if (!text) {
|
| 1539 |
const summaryText = document.getElementById('summary-text');
|
| 1540 |
-
summaryText.innerHTML = '<p class="text-secondary text-center">الرجاء كتابة نص في المحرر أولاً</p>';
|
| 1541 |
document.getElementById('summary-preview').classList.add('show');
|
| 1542 |
return;
|
| 1543 |
}
|
|
@@ -1946,5 +2201,22 @@
|
|
| 1946 |
</div>
|
| 1947 |
</div>
|
| 1948 |
</div>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1949 |
</body>
|
| 1950 |
</html>
|
|
|
|
| 668 |
<button id="write-tab" onclick="switchTab('write')" class="editor-tab active" type="button">كتابة</button>
|
| 669 |
<button id="summarize-tab" onclick="switchTab('summarize')" class="editor-tab" type="button">تلخيص</button>
|
| 670 |
<button id="dialect-tab" onclick="switchTab('dialect')" class="editor-tab" type="button">تحويل للفصحى</button>
|
| 671 |
+
<button id="quran-tab" onclick="switchTab('quran')" class="editor-tab" type="button">القرآن</button>
|
| 672 |
<button id="docs-sidebar-toggle" class="docs-sidebar-toggle-mobile btn-ghost lg:hidden" type="button" aria-label="مستنداتي" aria-expanded="false" aria-controls="docs-sidebar">
|
| 673 |
<svg width="18" height="18" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M3 7v10a2 2 0 002 2h14a2 2 0 002-2V9a2 2 0 00-2-2h-6l-2-2H5a2 2 0 00-2 2z"/></svg>
|
| 674 |
</button>
|
|
|
|
| 680 |
</div>
|
| 681 |
<span id="auto-save-status" class="text-xs text-secondary" style="opacity:0;transition:opacity 0.3s;"></span>
|
| 682 |
<div class="window-dots" aria-hidden="true">
|
| 683 |
+
<span class="dot dot--red" title="مسح المحرر" onclick="if(typeof clearEditor==='function'){clearEditor();}" style="cursor:pointer;"></span>
|
| 684 |
<span class="dot dot--yellow" title="طي لوحة الاقتراحات" onclick="document.querySelector('.sidebar-desktop')?.classList.toggle('collapsed')" style="cursor:pointer;"></span>
|
| 685 |
<span class="dot dot--green" title="توسيع المحرر للعرض الكامل" onclick="document.querySelector('.editor-layout')?.classList.toggle('editor-fullscreen')" style="cursor:pointer;"></span>
|
| 686 |
</div>
|
|
|
|
| 805 |
<p class="popover-hint">اختر التصحيح المناسب · Escape للإغلاق</p>
|
| 806 |
</div>
|
| 807 |
</div>
|
| 808 |
+
<div id="summarize-area" class="summarize-panel is-hidden">
|
| 809 |
+
<!-- Source Toggle: Editor text vs Custom input -->
|
| 810 |
+
<div class="mb-4">
|
| 811 |
+
<div class="flex items-center gap-2 mb-3">
|
| 812 |
+
<svg width="18" height="18" fill="none" stroke="currentColor" viewBox="0 0 24 24" style="color: var(--color-primary);"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M9 12h6m-6 4h6m2 5H7a2 2 0 01-2-2V5a2 2 0 012-2h5.586a1 1 0 01.707.293l5.414 5.414a1 1 0 01.293.707V19a2 2 0 01-2 2z"/></svg>
|
| 813 |
+
<span class="text-base font-bold">تلخيص النصوص</span>
|
| 814 |
+
</div>
|
| 815 |
+
<div class="summary-source-toggle mb-3">
|
| 816 |
+
<button type="button" class="summary-source-btn active" id="summary-src-editor" onclick="setSummarySource('editor')">نص المحرر</button>
|
| 817 |
+
<button type="button" class="summary-source-btn" id="summary-src-custom" onclick="setSummarySource('custom')">نص مخصص</button>
|
| 818 |
+
</div>
|
| 819 |
+
<div id="summary-custom-input-wrap" class="is-hidden">
|
| 820 |
+
<textarea id="summary-custom-input" class="w-full p-4 rounded-xl text-right text-lg editor-content" dir="rtl" rows="6" placeholder="الصق أو اكتب النص الذي تريد تلخيصه هنا..." style="background: var(--color-surface); border: 1px solid var(--color-border); color: var(--color-text-primary); resize: vertical; font-family: inherit;"></textarea>
|
| 821 |
+
<div class="flex items-center justify-between mt-1 mb-2" dir="rtl">
|
| 822 |
+
<span id="summary-char-count" class="text-xs" style="color: var(--text-secondary);">٠ حرف</span>
|
| 823 |
+
</div>
|
| 824 |
+
</div>
|
| 825 |
+
</div>
|
| 826 |
+
<!-- Item 11: Mode Toggle -->
|
| 827 |
<div class="summary-mode-toggle">
|
| 828 |
<button type="button" class="summary-mode-btn active" id="summary-mode-paragraph" onclick="setSummaryMode('paragraph')">
|
| 829 |
<svg width="14" height="14" fill="currentColor" viewBox="0 0 24 24"><path d="M3 5h18v2H3V5zm0 8h18v2H3v-2zm0 4h12v2H3v-2z"/></svg>
|
|
|
|
| 893 |
</div>
|
| 894 |
<div id="dialect-result" class="text-right text-lg editor-content" dir="rtl" style="line-height: 2;"></div>
|
| 895 |
</div>
|
| 896 |
+
</div>
|
| 897 |
+
<!-- Quran Standalone Panel -->
|
| 898 |
+
<div id="quran-area" class="summarize-panel is-hidden">
|
| 899 |
+
<div class="mb-4">
|
| 900 |
+
<div class="flex items-center gap-2 mb-3">
|
| 901 |
+
<svg width="18" height="18" fill="none" stroke="#06b6d4" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M12 6.253v13m0-13C10.832 5.477 9.246 5 7.5 5S4.168 5.477 3 6.253v13C4.168 18.477 5.754 18 7.5 18s3.332.477 4.5 1.253m0-13C13.168 5.477 14.754 5 16.5 5c1.747 0 3.332.477 4.5 1.253v13C19.832 18.477 18.247 18 16.5 18c-1.746 0-3.332.477-4.5 1.253"/></svg>
|
| 902 |
+
<span class="text-base font-bold">تدقيق النص القرآني</span>
|
| 903 |
+
</div>
|
| 904 |
+
<p class="text-sm mb-4" style="color: var(--text-secondary);">اكتب أو الصق نصًا قرآنيًا وسنعرض لك النص الصحيح بالتشكيل مع اسم السورة ورقم الآية، مع إمكانية ترجمته إلى ١٤ لغة.</p>
|
| 905 |
+
</div>
|
| 906 |
+
<textarea id="quran-input" class="w-full p-4 rounded-xl text-right text-lg" dir="rtl" rows="4" placeholder="اكتب الآية أو جزءًا منها هنا..." style="background: var(--color-surface); border: 1px solid var(--color-border); color: var(--color-text-primary); resize: vertical; font-family: 'Amiri Quran', 'Cairo', serif; font-size: 20px; line-height: 2;"></textarea>
|
| 907 |
+
<button id="quran-search-btn" onclick="searchQuranStandalone()" class="btn-primary w-full py-4 text-lg mt-4 mb-4" type="button">بحث وتدقيق</button>
|
| 908 |
+
<div id="quran-inline-result" class="is-hidden" style="background: var(--color-surface); border: 1px solid rgba(6,182,212,0.2); border-radius: 1rem; padding: 1.5rem;">
|
| 909 |
+
<div class="flex items-center justify-between mb-3">
|
| 910 |
+
<div class="text-sm font-bold" style="color:#06b6d4;">✓ النص القرآني المدقق</div>
|
| 911 |
+
<div class="flex items-center gap-2">
|
| 912 |
+
<button onclick="copyQuranInlineResult()" class="quran-copy-btn" type="button" title="نسخ">📋</button>
|
| 913 |
+
<button id="quran-inline-apply-btn" onclick="applyQuranInlineResult()" class="quran-apply-btn" type="button">تطبيق في المحرر ✓</button>
|
| 914 |
+
</div>
|
| 915 |
+
</div>
|
| 916 |
+
<p id="quran-inline-uthmani" class="quran-uthmani" style="font-size: 24px; line-height: 2.2; text-align: center;"></p>
|
| 917 |
+
<p id="quran-inline-reference" class="quran-reference text-center mt-2"></p>
|
| 918 |
+
<div class="mt-4 pt-4" style="border-top:1px solid var(--color-border);">
|
| 919 |
+
<div class="flex items-center gap-3 mb-3 flex-wrap">
|
| 920 |
+
<svg width="16" height="16" fill="none" stroke="#06b6d4" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M3 5h12M9 3v2m1.048 9.5A18.022 18.022 0 016.412 9m6.088 9h7M11 21l5-10 5 10M12.751 5C11.783 10.77 8.07 15.61 3 18.129"/></svg>
|
| 921 |
+
<span class="text-sm font-bold">ترجمة الآية</span>
|
| 922 |
+
<select id="quran-inline-lang" onchange="translateQuranInline()" class="quran-lang-select">
|
| 923 |
+
<option value="">— اختر لغة —</option>
|
| 924 |
+
<option value="english">🇬🇧 English</option>
|
| 925 |
+
<option value="french">🇫🇷 Français</option>
|
| 926 |
+
<option value="turkish">🇹🇷 Türkçe</option>
|
| 927 |
+
<option value="persian">🇮🇷 فارسی</option>
|
| 928 |
+
<option value="russian">🇷🇺 Русский</option>
|
| 929 |
+
<option value="spanish">🇪🇸 Español</option>
|
| 930 |
+
<option value="german">🇩🇪 Deutsch</option>
|
| 931 |
+
<option value="indonesian">🇮🇩 Indonesia</option>
|
| 932 |
+
<option value="malay">🇲🇾 Melayu</option>
|
| 933 |
+
<option value="bengali">🇧🇩 বাংলা</option>
|
| 934 |
+
<option value="bosnian">🇧🇦 Bosanski</option>
|
| 935 |
+
<option value="portuguese">🇵🇹 Português</option>
|
| 936 |
+
<option value="uzbek">🇺🇿 O'zbek</option>
|
| 937 |
+
</select>
|
| 938 |
+
</div>
|
| 939 |
+
<div id="quran-inline-translation" class="is-hidden p-4 rounded-xl" style="background:rgba(6,182,212,0.06); border:1px solid rgba(6,182,212,0.15);">
|
| 940 |
+
<p id="quran-inline-trans-text" style="font-size:18px; line-height:2; color:var(--color-text-primary); text-align:center;"></p>
|
| 941 |
+
</div>
|
| 942 |
+
</div>
|
| 943 |
+
</div>
|
| 944 |
+
</div>
|
| 945 |
<div class="editor-footer">
|
| 946 |
<div class="editor-stats" role="status" aria-label="إحصائيات">
|
| 947 |
<div class="flex items-center gap-2"><span class="stat-dot stat-dot--spelling" aria-hidden="true"></span><span class="text-sm text-secondary"><span id="spelling-count">٠</span> إملائي</span></div>
|
|
|
|
| 1279 |
|
| 1280 |
|
| 1281 |
/**
 * Activate one editor tab and reveal its panel; deactivate and hide the rest.
 *
 * Tab buttons use the id "<name>-tab" and panels use "<name>-area".
 * The formatting toolbar is only displayed while the "write" tab is active.
 *
 * @param {string} tab - one of 'write', 'summarize', 'dialect', 'quran'.
 */
function switchTab(tab) {
    const tabs = ['write', 'summarize', 'dialect', 'quran'];
    const formatToolbar = document.getElementById('format-toolbar');

    // Reset every known tab to the inactive/hidden state first.
    for (const name of tabs) {
        const button = document.getElementById(name + '-tab');
        const panel = document.getElementById(name + '-area');
        if (button) button.classList.remove('active');
        if (panel) panel.classList.add('is-hidden');
    }

    // Then switch the requested one on.
    const activeTab = document.getElementById(tab + '-tab');
    const activeArea = document.getElementById(tab + '-area');
    if (activeTab) activeTab.classList.add('active');
    if (activeArea) activeArea.classList.remove('is-hidden');

    if (formatToolbar) {
        formatToolbar.style.display = tab === 'write' ? '' : 'none';
    }
}
|
| 1296 |
+
|
| 1297 |
+
/* ═══════════════════════════════════════════
|
| 1298 |
+
Summarize — Source Toggle (editor vs custom)
|
| 1299 |
+
═══════════════════════════════════════════ */
|
| 1300 |
+
window._summarySource = 'editor';
|
| 1301 |
+
/**
 * Record which text the summarizer should use and update the toggle UI.
 *
 * Stores the choice on window._summarySource, marks the matching toggle
 * button as active, and shows the custom textarea wrapper only for 'custom'.
 *
 * @param {string} src - 'editor' or 'custom'.
 */
function setSummarySource(src) {
    window._summarySource = src;
    const isCustom = src === 'custom';

    // [element id, class name, whether the class should be present]
    const toggles = [
        ['summary-src-editor', 'active', src === 'editor'],
        ['summary-src-custom', 'active', isCustom],
        ['summary-custom-input-wrap', 'is-hidden', !isCustom]
    ];
    toggles.forEach(function(entry) {
        const el = document.getElementById(entry[0]);
        if (el) el.classList.toggle(entry[1], entry[2]);
    });
}
|
| 1310 |
+
|
| 1311 |
+
/* ═══════════════════════════════════════════
|
| 1312 |
+
Floating Selection Toolbar
|
| 1313 |
+
═══════════════════════════════════════════ */
|
| 1314 |
+
(function() {
    // Floating toolbar shown above a text selection made inside #editor-container.
    var selBar = null;        // cached #selection-toolbar element (looked up lazily)
    var hideTimer = null;     // debounce handle for selectionchange
    var BAR_OFFSET_PX = 48;   // vertical gap between the selection top and the bar

    /** Return the toolbar element, caching it after the first successful lookup. */
    function getBar() {
        if (!selBar) selBar = document.getElementById('selection-toolbar');
        return selBar;
    }

    /** Hide the toolbar if it exists. */
    function hideSelectionBar() {
        var bar = getBar();
        if (bar) bar.classList.add('is-hidden');
    }

    /** Position the toolbar above the current selection and reveal it. */
    function showSelectionBar() {
        var sel = window.getSelection();
        // rangeCount guard: getRangeAt(0) throws when the selection has no range.
        if (!sel || sel.isCollapsed || sel.rangeCount === 0 || !sel.toString().trim()) {
            hideSelectionBar();
            return;
        }
        var editor = document.getElementById('editor-container');
        if (!editor || !editor.contains(sel.anchorNode)) { hideSelectionBar(); return; }
        var bar = getBar();
        if (!bar) return;
        var rect = sel.getRangeAt(0).getBoundingClientRect();
        // getBoundingClientRect is viewport-relative; add BOTH scroll offsets
        // so the bar also lands correctly on a horizontally scrolled page.
        bar.style.top = (rect.top + window.scrollY - BAR_OFFSET_PX) + 'px';
        bar.style.left = (rect.left + window.scrollX + rect.width / 2) + 'px';
        bar.classList.remove('is-hidden');
    }

    // Debounced: only show once the selection has settled for 300 ms and is
    // longer than two characters; otherwise hide.
    document.addEventListener('selectionchange', function() {
        clearTimeout(hideTimer);
        hideTimer = setTimeout(function() {
            var sel = window.getSelection();
            if (sel && !sel.isCollapsed && sel.toString().trim().length > 2) {
                var editor = document.getElementById('editor-container');
                if (editor && editor.contains(sel.anchorNode)) { showSelectionBar(); return; }
            }
            hideSelectionBar();
        }, 300);
    });

    document.addEventListener('mousedown', function(e) {
        var bar = getBar();
        if (!bar) return;
        if (bar.contains(e.target)) {
            // Keep the editor selection alive while a toolbar button is pressed;
            // the click handler reads window.getSelection() afterwards.
            e.preventDefault();
            return;
        }
        hideSelectionBar();
    });
})();
|
| 1350 |
+
|
| 1351 |
+
/**
 * Send the currently selected text to one of the tool panels.
 *
 * Switches to the requested tab and copies the trimmed selection into that
 * panel's textarea. Shows a warning toast when nothing is selected.
 *
 * @param {string} tool - 'summarize', 'dialect' or 'quran'; any other value
 *   only hides the toolbar.
 */
function selectionToolAction(tool) {
    var sel = window.getSelection();
    var text = sel ? sel.toString().trim() : '';
    if (!text) {
        if (typeof showToast === 'function') showToast('حدد نصًا أولاً', 'warning');
        return;
    }

    var selBar = document.getElementById('selection-toolbar');
    if (selBar) selBar.classList.add('is-hidden');

    // Destination textarea for each supported tool.
    var targetIds = {
        summarize: 'summary-custom-input',
        dialect: 'dialect-input',
        quran: 'quran-input'
    };
    if (!Object.prototype.hasOwnProperty.call(targetIds, tool)) return;

    switchTab(tool);
    if (tool === 'summarize') setSummarySource('custom');

    var ta = document.getElementById(targetIds[tool]);
    if (!ta) return;
    ta.value = text;

    if (tool === 'dialect') {
        if (typeof updateDialectCharCount === 'function') updateDialectCharCount();
    } else if (tool === 'summarize') {
        // Assigning .value does not fire "input"; dispatch it so any listener
        // bound to the textarea can react.
        // NOTE(review): presumably a listener updates #summary-char-count - confirm.
        ta.dispatchEvent(new Event('input', { bubbles: true }));
    }
}
|
| 1372 |
+
|
| 1373 |
+
/* ═══════════════════════════════════════════
|
| 1374 |
+
Quran Standalone Panel Functions
|
| 1375 |
+
═══════════════════════════════════════════ */
|
| 1376 |
+
let _quranInlineVerse = '';
|
| 1377 |
+
let _quranInlineRef = '';
|
| 1378 |
+
let _quranInlineQuery = '';
|
| 1379 |
+
|
| 1380 |
+
/**
 * Look up the text typed in #quran-input via POST /api/quran and render the
 * matched verse and its reference in the standalone Quran panel.
 *
 * The response is read as JSON with either an `error` string or a
 * `matched_segment` of the form "(verse) 【reference】". The request is
 * aborted after 30 seconds. On success the verse and reference are cached in
 * _quranInlineVerse / _quranInlineRef for the copy and apply buttons.
 */
async function searchQuranStandalone() {
    var inputEl = document.getElementById('quran-input');
    var input = inputEl ? inputEl.value.trim() : '';
    if (!input) {
        if (typeof showToast === 'function') showToast('الرجاء كتابة نص قرآني أولاً', 'warning');
        return;
    }

    _quranInlineQuery = input;
    // Drop the previous result so "copy"/"apply" cannot act on a stale verse
    // while a new search is running or after it failed.
    _quranInlineVerse = '';
    _quranInlineRef = '';

    var resultDiv = document.getElementById('quran-inline-result');
    var uthmaniEl = document.getElementById('quran-inline-uthmani');
    var refEl = document.getElementById('quran-inline-reference');
    var searchBtn = document.getElementById('quran-search-btn');
    var translationEl = document.getElementById('quran-inline-translation');
    var langEl = document.getElementById('quran-inline-lang');
    if (!resultDiv || !uthmaniEl || !refEl) return;

    // Render a muted status line. textContent is used instead of innerHTML so
    // server-provided messages are never interpreted as markup.
    function showStatus(message) {
        var span = document.createElement('span');
        span.className = 'text-secondary';
        span.textContent = String(message);
        uthmaniEl.textContent = '';
        uthmaniEl.appendChild(span);
    }

    showStatus('⏳ جاري البحث...');
    refEl.textContent = '';
    resultDiv.classList.remove('is-hidden');
    if (translationEl) translationEl.classList.add('is-hidden');
    if (langEl) langEl.value = '';
    if (searchBtn) { searchBtn.disabled = true; searchBtn.textContent = '⏳ جاري البحث...'; }

    var _abortCtrl = new AbortController();
    var _timeout = setTimeout(function() { _abortCtrl.abort(); }, 30000);
    try {
        var res = await fetch('/api/quran', {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify({ text: input, language: 'تدقيق الايات' }),
            signal: _abortCtrl.signal
        });
        var data = (await res.json()) || {};
        if (data.error) {
            showStatus(data.error);
            return;
        }
        // A failing status without an error payload falls through to the
        // generic failure message in the catch block.
        if (!res.ok) throw new Error('HTTP ' + res.status);

        var seg = data.matched_segment || '';
        var refMatch = seg.match(/【([^】]+)】/);
        // Strip the trailing 【reference】 and the surrounding parentheses.
        var verseText = seg.replace(/\s*【[^】]+】\s*$/, '').replace(/^\(/, '').replace(/\)$/, '');
        var reference = refMatch ? refMatch[1] : '';
        _quranInlineVerse = verseText;
        _quranInlineRef = reference;
        uthmaniEl.textContent = verseText;
        refEl.textContent = reference ? '[' + reference + ']' : '';
    } catch (err) {
        var msg = err.name === 'AbortError' ? 'انتهى وقت الانتظار — حاول مرة أخرى' : 'حدث خطأ أثناء البحث — تأكد من الاتصال';
        showStatus(msg);
    } finally {
        clearTimeout(_timeout);
        if (searchBtn) { searchBtn.disabled = false; searchBtn.textContent = 'بحث وتدقيق'; }
    }
}
|
| 1424 |
+
|
| 1425 |
+
/**
 * Translate the last searched Quran text into the language chosen in
 * #quran-inline-lang and show it in the translation box.
 *
 * Re-sends _quranInlineQuery to POST /api/quran with the selected language
 * and renders the returned `matched_segment` (minus its 【reference】 suffix
 * and surrounding parentheses). Does nothing when no language is selected or
 * no search has been made. The request is aborted after 30 seconds.
 */
async function translateQuranInline() {
    var langEl = document.getElementById('quran-inline-lang');
    var lang = langEl ? langEl.value : '';
    if (!lang || !_quranInlineQuery) return;

    var resultDiv = document.getElementById('quran-inline-translation');
    var textEl = document.getElementById('quran-inline-trans-text');
    if (!resultDiv || !textEl) return;

    // Render a muted status line; textContent keeps server-provided messages
    // from being interpreted as markup.
    function showStatus(message) {
        var span = document.createElement('span');
        span.className = 'text-secondary';
        span.textContent = String(message);
        textEl.textContent = '';
        textEl.appendChild(span);
    }

    // True when the user picked another language while this request was in
    // flight; that newer request owns the translation box.
    function isStale() {
        return langEl.value !== lang;
    }

    showStatus('⏳ جاري الترجمة...');
    resultDiv.classList.remove('is-hidden');

    var _abortCtrl = new AbortController();
    var _timeout = setTimeout(function() { _abortCtrl.abort(); }, 30000);
    try {
        var res = await fetch('/api/quran', {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify({ text: _quranInlineQuery, language: lang }),
            signal: _abortCtrl.signal
        });
        var data = (await res.json()) || {};
        if (isStale()) return;
        if (data.error) {
            showStatus(data.error);
            return;
        }
        // A failing status without an error payload gets the generic message.
        if (!res.ok) throw new Error('HTTP ' + res.status);

        var seg = data.matched_segment || '';
        var transText = seg.replace(/\s*【[^】]+】\s*$/, '').replace(/^\(/, '').replace(/\)$/, '');
        textEl.textContent = transText;
    } catch (err) {
        if (isStale()) return;
        var msg = err.name === 'AbortError' ? 'انتهى وقت الانتظار' : 'حدث خطأ في الترجمة';
        showStatus(msg);
    } finally {
        clearTimeout(_timeout);
    }
}
|
| 1454 |
+
|
| 1455 |
+
/**
 * Copy the cached verse, followed by " [reference]" when one is known, to the
 * clipboard and confirm with a toast. Does nothing when there is no verse.
 * Shows a warning toast when the Clipboard API is unavailable or the write is
 * rejected.
 */
function copyQuranInlineResult() {
    var text = (_quranInlineVerse || '') + (_quranInlineRef ? ' [' + _quranInlineRef + ']' : '');
    if (!text.trim()) return;

    function reportFailure() {
        if (typeof showToast === 'function') showToast('تعذر نسخ النص', 'warning');
    }

    // navigator.clipboard is undefined outside secure contexts.
    if (!navigator.clipboard || typeof navigator.clipboard.writeText !== 'function') {
        reportFailure();
        return;
    }

    navigator.clipboard.writeText(text).then(function() {
        if (typeof showToast === 'function') showToast('✓ تم نسخ النص المدقق');
    }).catch(reportFailure);
}
|
| 1462 |
+
|
| 1463 |
+
/**
 * Append the cached verse (and its reference, when known) to the end of the
 * editor as a non-editable span, then return to the "write" tab.
 *
 * Does nothing when no verse is cached or #editor-container is missing.
 * Saves an undo state first when pushUndoState is available and fires an
 * "input" event on the editor afterwards.
 */
function applyQuranInlineResult() {
    if (!_quranInlineVerse) return;
    var editor = document.getElementById('editor-container');
    if (!editor) return;
    if (typeof pushUndoState === 'function') pushUndoState();

    // Build the nodes directly instead of concatenating HTML: the verse and
    // reference come from the server and must be inserted as plain text, and
    // appending avoids re-parsing (and replacing) the existing editor content.
    var applied = document.createElement('span');
    applied.className = 'quran-applied';
    applied.setAttribute('contenteditable', 'false');
    applied.setAttribute('data-quran', 'true');
    applied.appendChild(document.createTextNode(_quranInlineVerse));

    if (_quranInlineRef) {
        var ref = document.createElement('span');
        ref.className = 'quran-ref-inline';
        ref.textContent = '[' + _quranInlineRef + ']';
        applied.appendChild(document.createTextNode(' '));
        applied.appendChild(ref);
    }

    // Separate from any existing content with a line break.
    if (editor.innerHTML) editor.appendChild(document.createElement('br'));
    editor.appendChild(applied);

    editor.dispatchEvent(new Event('input', { bubbles: true }));
    switchTab('write');
    if (typeof showToast === 'function') showToast('✓ تم إضافة النص القرآني في المحرر');
}
|
| 1478 |
let _dialectResult = '';
|
| 1479 |
async function convertDialect() {
|
|
|
|
| 1782 |
}
|
| 1783 |
|
| 1784 |
async function generateSummary(event) {
|
| 1785 |
+
let text = '';
|
| 1786 |
+
if (window._summarySource === 'custom') {
|
| 1787 |
+
var customInput = document.getElementById('summary-custom-input');
|
| 1788 |
+
text = customInput ? customInput.value.trim() : '';
|
| 1789 |
+
} else {
|
| 1790 |
+
text = (typeof getEditorText === 'function' ? getEditorText() : '').trim();
|
| 1791 |
+
}
|
| 1792 |
|
| 1793 |
if (!text) {
|
| 1794 |
const summaryText = document.getElementById('summary-text');
|
| 1795 |
+
summaryText.innerHTML = '<p class="text-secondary text-center">' + (window._summarySource === 'custom' ? 'الرجاء كتابة نص في مربع الإدخال أولاً' : 'الرجاء كتابة نص في المحرر أولاً') + '</p>';
|
| 1796 |
document.getElementById('summary-preview').classList.add('show');
|
| 1797 |
return;
|
| 1798 |
}
|
|
|
|
| 2201 |
</div>
|
| 2202 |
</div>
|
| 2203 |
</div>
|
| 2204 |
+
<!-- Floating Selection Toolbar -->
|
| 2205 |
+
<div id="selection-toolbar" class="selection-toolbar is-hidden" role="toolbar" aria-label="أدوات النص المحدد">
|
| 2206 |
+
<button type="button" class="sel-tool-btn" onclick="selectionToolAction('summarize')" title="تلخيص النص المحدد">
|
| 2207 |
+
<svg width="14" height="14" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M9 12h6m-6 4h6m2 5H7a2 2 0 01-2-2V5a2 2 0 012-2h5.586a1 1 0 01.707.293l5.414 5.414a1 1 0 01.293.707V19a2 2 0 01-2 2z"/></svg>
|
| 2208 |
+
تلخيص
|
| 2209 |
+
</button>
|
| 2210 |
+
<span class="sel-tool-sep"></span>
|
| 2211 |
+
<button type="button" class="sel-tool-btn" onclick="selectionToolAction('dialect')" title="تحويل النص المحدد للفصحى">
|
| 2212 |
+
<svg width="14" height="14" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M3 5h12M9 3v2m1.048 9.5A18.022 18.022 0 016.412 9m6.088 9h7M11 21l5-10 5 10M12.751 5C11.783 10.77 8.07 15.61 3 18.129"/></svg>
|
| 2213 |
+
فصحى
|
| 2214 |
+
</button>
|
| 2215 |
+
<span class="sel-tool-sep"></span>
|
| 2216 |
+
<button type="button" class="sel-tool-btn" onclick="selectionToolAction('quran')" title="تدقيق النص القرآني المحدد">
|
| 2217 |
+
<svg width="14" height="14" fill="none" stroke="currentColor" viewBox="0 0 24 24"><path stroke-linecap="round" stroke-linejoin="round" stroke-width="2" d="M12 6.253v13m0-13C10.832 5.477 9.246 5 7.5 5S4.168 5.477 3 6.253v13C4.168 18.477 5.754 18 7.5 18s3.332.477 4.5 1.253m0-13C13.168 5.477 14.754 5 16.5 5c1.747 0 3.332.477 4.5 1.253v13C19.832 18.477 18.247 18 16.5 18c-1.746 0-3.332.477-4.5 1.253"/></svg>
|
| 2218 |
+
قرآن
|
| 2219 |
+
</button>
|
| 2220 |
+
</div>
|
| 2221 |
</body>
|
| 2222 |
</html>
|
src/js/editor.js
CHANGED
|
@@ -409,8 +409,7 @@ function showTooltip(element) {
|
|
| 409 |
const btnClass = isMain ? 'popover-alt-btn popover-alt-main' : 'popover-alt-btn';
|
| 410 |
html += `<button class="${btnClass}" data-alt-correction="${escapeHtml(alt)}" type="button">${isMain ? '✓ ' : ''}${escapeHtml(alt)}</button>`;
|
| 411 |
});
|
| 412 |
-
//
|
| 413 |
-
html += `<button class="popover-alt-btn popover-alt-keep" data-alt-correction="${escapeHtml(suggestion.original)}" type="button">إبقاء كما هي</button>`;
|
| 414 |
alternativesEl.innerHTML = html;
|
| 415 |
|
| 416 |
// Bind click events for alternatives
|
|
|
|
| 409 |
const btnClass = isMain ? 'popover-alt-btn popover-alt-main' : 'popover-alt-btn';
|
| 410 |
html += `<button class="${btnClass}" data-alt-correction="${escapeHtml(alt)}" type="button">${isMain ? '✓ ' : ''}${escapeHtml(alt)}</button>`;
|
| 411 |
});
|
| 412 |
+
// No separate "keep" button — the "تجاهل" popover button handles dismissal
|
|
|
|
| 413 |
alternativesEl.innerHTML = html;
|
| 414 |
|
| 415 |
// Bind click events for alternatives
|
src/nlp/punctuation/spelling/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# AraSpell — Arabic Spelling Correction
|
src/nlp/punctuation/spelling/araspell_rules.py
ADDED
|
@@ -0,0 +1,1615 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# AraSpell — Arabic Spell Checker Pipeline (Rules & Classes)
|
| 2 |
+
# Extracted from AraSpell.py — NO global model loading, NO Gradio dependencies.
|
| 3 |
+
# All classes are imported by araspell_service.py.
|
| 4 |
+
|
| 5 |
+
import re
|
| 6 |
+
import math
|
| 7 |
+
import logging
|
| 8 |
+
import torch
|
| 9 |
+
from collections import Counter
|
| 10 |
+
from enum import Enum
|
| 11 |
+
from typing import List, Tuple, Optional
|
| 12 |
+
|
| 13 |
+
import Levenshtein
|
| 14 |
+
import jellyfish
|
| 15 |
+
|
| 16 |
+
logger = logging.getLogger(__name__)
|
| 17 |
+
|
| 18 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 19 |
+
# ERROR TYPE ENUM
|
| 20 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 21 |
+
|
| 22 |
+
class ErrorType(Enum):
    """Labels for the kind of spelling error detected in a text.

    Every member's value is the lowercase snake_case form of its name, so a
    member can be rebuilt from its string value, e.g. ``ErrorType("clean")``.
    """

    # A character repeated several times in a row.
    CHAR_REPETITION = "char_repetition"
    # Words written together without a separating space.
    WORD_MERGE = "word_merge"
    # A character replaced by a different one.
    CHAR_SUBSTITUTION = "char_substitution"
    # Presumably more than one of the error kinds above — confirm with the classifier.
    MIXED = "mixed"
    # Presumably no error detected — confirm with the classifier.
    CLEAN = "clean"
|
| 29 |
+
|
| 30 |
+
# ═══════════════════════════════════════════════════════════════════════════════
|
| 31 |
+
# KEYBOARD PROXIMITY (Phase 12 — from original AraSpell.py L475-520)
|
| 32 |
+
# ═══════════════════════════════════════════════════════════════════════════════
|
| 33 |
+
|
| 34 |
+
class RulesBasedCorrector:
    """Keyboard-adjacency lookup for Arabic characters."""

    # Maps a key to the list of keys treated as adjacent to it.
    # The relation is stored per key and is looked up one way only.
    KEYBOARD_NEIGHBORS = {
        'ض': ['ص', 'ق'],
        'ص': ['ض', 'ث', 'ق'],
        'ث': ['ص', 'ق'],
        'ق': ['ض', 'ص', 'ث', 'ف', 'غ'],
        'ف': ['ق', 'غ', 'ع', 'ب'],
        'غ': ['ق', 'ف', 'ع', 'ه'],
        'ع': ['ف', 'غ', 'ه', 'خ'],
        'ه': ['غ', 'ع', 'خ', 'ح'],
        'خ': ['ع', 'ه', 'ح', 'ج'],
        'ح': ['ه', 'خ', 'ج'],
        'ج': ['خ', 'ح', 'د'],
        'د': ['ج', 'ذ'],
        'ذ': ['د'],
        'ش': ['س', 'ي', 'ئ'],
        'س': ['ش', 'ي', 'ب'],
        'ي': ['ش', 'س', 'ب', 'ت'],
        'ب': ['ي', 'س', 'ف', 'ل', 'ن'],
        'ل': ['ب', 'ا', 'ن', 'م'],
        'ا': ['ل', 'ت', 'م'],
        'ت': ['ي', 'ا', 'ن'],
        'ن': ['ب', 'ل', 'ت', 'م', 'ك'],
        'م': ['ل', 'ا', 'ن', 'ك'],
        'ك': ['ن', 'م', 'ط'],
        'ط': ['ك', 'ظ'],
        'ظ': ['ط'],
        'ئ': ['ش', 'ء', 'ر'],
        'ء': ['ئ', 'ؤ'],
        'ؤ': ['ء', 'ر'],
        'ر': ['ئ', 'ؤ', 'لا', 'ى', 'ز'],
        'لا': ['ر', 'ى'],
        'ى': ['ر', 'لا', 'ة', 'ز'],
        'ة': ['ى', 'و', 'ز'],
        'و': ['ة', 'ز'],
        'ز': ['ر', 'ى', 'ة', 'و'],
        'أ': ['ا', 'إ', 'آ'],
        'إ': ['ا', 'أ'],
        'آ': ['ا', 'أ'],
    }

    @staticmethod
    def is_keyboard_neighbor(char1: str, char2: str) -> bool:
        """Return True when *char2* is listed as a neighbour of *char1*.

        Only the entry for *char1* is consulted; a key missing from the
        table has no neighbours.
        """
        adjacency = RulesBasedCorrector.KEYBOARD_NEIGHBORS
        return char1 in adjacency and char2 in adjacency[char1]
|
| 83 |
+
|
| 84 |
+
# ═══════════════════════════════════════════════════════════════════════════════
|
| 85 |
+
# POST PROCESSOR
|
| 86 |
+
# ═══════════════════════════════════════════════════════════════════════════════
|
| 87 |
+
|
| 88 |
+
class AraSpellPostProcessor:
|
| 89 |
+
"""Arabic text post-processing techniques."""
|
| 90 |
+
|
| 91 |
+
ARABIC_HARAKAT = 'ًٌٍَُِّْ'
|
| 92 |
+
TATWEEL = 'ـ'
|
| 93 |
+
NORMALIZER_MAP = {
|
| 94 |
+
'ﻹ': 'لإ', 'ﻷ': 'لأ', 'ﻵ': 'لآ', 'ﻻ': 'لا', 'ﷲ': 'الله'
|
| 95 |
+
}
|
| 96 |
+
ARABIC_CONSONANTS = set('بتثجحخدذرزسشصضطظعغفقكلمن')
|
| 97 |
+
|
| 98 |
+
# --- Basic Normalization ---
|
| 99 |
+
|
| 100 |
+
@staticmethod
|
| 101 |
+
def remove_harakat(text: str) -> str:
    """Return *text* with every character in the diacritic range stripped."""
    diacritic_range = re.compile(r'[ً-ْ]')
    return diacritic_range.sub('', text)
|
| 104 |
+
|
| 105 |
+
@staticmethod
|
| 106 |
+
def remove_tatweel(text: str) -> str:
|
| 107 |
+
"""Remove Arabic kashida/tatweel"""
|
| 108 |
+
return text.replace(AraSpellPostProcessor.TATWEEL, '')
|
| 109 |
+
|
| 110 |
+
@staticmethod
|
| 111 |
+
def normalize_special_chars(text: str) -> str:
|
| 112 |
+
"""Normalize special Arabic ligatures"""
|
| 113 |
+
for old, new in AraSpellPostProcessor.NORMALIZER_MAP.items():
|
| 114 |
+
text = text.replace(old, new)
|
| 115 |
+
return text
|
| 116 |
+
|
| 117 |
+
# --- Core Functions ---
|
| 118 |
+
|
| 119 |
+
@staticmethod
|
| 120 |
+
def unified_collapse_repeated(text: str) -> str:
    """Collapse runs of one repeated character down to a single character.

    Characters in U+0600-U+06FF are collapsed when repeated three or more
    times; ASCII letters are collapsed when repeated two or more times.
    """
    collapse_rules = (
        r"([\u0600-\u06FF])\1{2,}",  # Arabic block: 3+ in a row
        r"([a-zA-Z])\1+",            # ASCII letters: 2+ in a row
    )
    for rule in collapse_rules:
        text = re.sub(rule, r"\1", text)
    return text
|
| 128 |
+
|
| 129 |
+
@staticmethod
|
| 130 |
+
def remove_duplicate_words(text: str) -> str:
    """Drop every word that is identical to the word directly before it.

    Text with fewer than two words is returned untouched; otherwise the
    surviving words are re-joined with single spaces.
    """
    tokens = text.split()
    if len(tokens) < 2:
        return text
    survivors = [
        token
        for position, token in enumerate(tokens)
        if position == 0 or token != tokens[position - 1]
    ]
    return ' '.join(survivors)
|
| 140 |
+
|
| 141 |
+
@staticmethod
|
| 142 |
+
def normalize_spaces(text: str) -> str:
    """Normalize whitespace and the spacing around punctuation.

    Steps, in order:
      1. turn no-break spaces (U+00A0) into plain spaces and delete the
         zero-width characters U+200B, U+200C and U+200D;
      2. collapse runs of spaces into one space and trim both ends;
      3. remove whitespace before each of ``،؛؟!.`` and leave exactly one
         space after it, then trim again.

    Args:
        text: The text to clean up.

    Returns:
        The normalized text.
    """
    # Unicode spaces must be handled BEFORE collapsing: converting a no-break
    # space or deleting a zero-width character that sits between two spaces
    # would otherwise leave a run of spaces in the result.
    text = text.replace('\u00A0', ' ')
    for zero_width in ('\u200B', '\u200C', '\u200D'):
        text = text.replace(zero_width, '')
    text = re.sub(r' +', ' ', text)
    text = text.strip()
    text = re.sub(r'\s*([،؛؟!.])\s*', r'\1 ', text)
    return text.strip()
|
| 153 |
+
|
| 154 |
+
@staticmethod
|
| 155 |
+
def remove_word_repetition_with_wa(text: str) -> str:
    """Reduce the pattern ``word و word`` to a single ``word``.

    The scan moves left to right; once a triple is reduced, scanning resumes
    after it. The result is re-joined with single spaces.
    """
    tokens = text.split()
    total = len(tokens)
    kept = []
    cursor = 0
    while cursor < total:
        kept.append(tokens[cursor])
        echoed = (
            cursor + 2 < total
            and tokens[cursor + 1] == 'و'
            and tokens[cursor] == tokens[cursor + 2]
        )
        # Skip the conjunction and the repeated word when the triple matched.
        cursor += 3 if echoed else 1
    return ' '.join(kept)
|
| 168 |
+
|
| 169 |
+
# --- Hamza & Ta Marbuta Handling ---
|
| 170 |
+
|
| 171 |
+
# Common Arabic words with hamza errors — covers the most frequent
|
| 172 |
+
# spelling mistakes in informal Arabic writing
|
| 173 |
+
HAMZA_WHITELIST = {
|
| 174 |
+
'الي': 'إلى', 'الى': 'إلى',
|
| 175 |
+
'انت': 'أنت', 'انتم': 'أنتم', 'انتي': 'أنتِ',
|
| 176 |
+
'انتو': 'أنتم', 'انتن': 'أنتن',
|
| 177 |
+
'انا': 'أنا',
|
| 178 |
+
'امس': 'أمس',
|
| 179 |
+
'لان': 'لأن', 'لانه': 'لأنه', 'لانها': 'لأنها',
|
| 180 |
+
'لانهم': 'لأنهم', 'لانك': 'لأنك',
|
| 181 |
+
'اذا': 'إذا', 'اذ': 'إذ',
|
| 182 |
+
'اي': 'أي', 'اين': 'أين',
|
| 183 |
+
'او': 'أو',
|
| 184 |
+
'اما': 'أما',
|
| 185 |
+
'ان': 'أن', 'انه': 'أنه', 'انها': 'أنها', 'انهم': 'أنهم',
|
| 186 |
+
'اخر': 'آخر', 'اخرى': 'أخرى',
|
| 187 |
+
'الان': 'الآن',
|
| 188 |
+
'اول': 'أول', 'اولى': 'أولى',
|
| 189 |
+
'اصبح': 'أصبح', 'اصبحت': 'أصبحت',
|
| 190 |
+
'اكثر': 'أكثر', 'اقل': 'أقل',
|
| 191 |
+
'اعلى': 'أعلى', 'ادنى': 'أدنى',
|
| 192 |
+
'اسرع': 'أسرع', 'ابطا': 'أبطأ',
|
| 193 |
+
'اكبر': 'أكبر', 'اصغر': 'أصغر',
|
| 194 |
+
'احسن': 'أحسن', 'اسوا': 'أسوأ',
|
| 195 |
+
'امام': 'أمام',
|
| 196 |
+
'اثناء': 'أثناء',
|
| 197 |
+
'ايضا': 'أيضاً', 'ايض': 'أيضاً',
|
| 198 |
+
'اساسي': 'أساسي', 'اساسية': 'أساسية',
|
| 199 |
+
'اخي': 'أخي', 'اخت': 'أخت', 'اخو': 'أخو',
|
| 200 |
+
'ابي': 'أبي', 'اب': 'أب', 'ابو': 'أبو',
|
| 201 |
+
'اهل': 'أهل',
|
| 202 |
+
'اطفال': 'أطفال',
|
| 203 |
+
'اصدقاء': 'أصدقاء', 'اصدقائي': 'أصدقائي',
|
| 204 |
+
'اعتقد': 'أعتقد', 'اريد': 'أريد', 'احب': 'أحب',
|
| 205 |
+
'اعرف': 'أعرف', 'اعلم': 'أعلم',
|
| 206 |
+
'اخذ': 'أخذ', 'اكل': 'أكل',
|
| 207 |
+
'الايام': 'الأيام',
|
| 208 |
+
'الاطفال': 'الأطفال',
|
| 209 |
+
'الاسعار': 'الأسعار',
|
| 210 |
+
'الاولى': 'الأولى',
|
| 211 |
+
'الاخير': 'الأخير', 'الاخيرة': 'الأخيرة',
|
| 212 |
+
'واصدقائي': 'وأصدقائي',
|
| 213 |
+
# FIX-14: Additional hamza entries
|
| 214 |
+
'ابناء': 'أبناء',
|
| 215 |
+
'اجمل': 'أجمل', 'اجمع': 'أجمع',
|
| 216 |
+
'اعلن': 'أعلن', 'اعلنت': 'أعلنت',
|
| 217 |
+
'اكد': 'أكد', 'اكدت': 'أكدت',
|
| 218 |
+
'اشار': 'أشار', 'اشارت': 'أشارت',
|
| 219 |
+
'ارسل': 'أرسل', 'ارسلت': 'أرسلت',
|
| 220 |
+
'اضاف': 'أضاف', 'اضافت': 'أضافت',
|
| 221 |
+
'اخيرا': 'أخيراً', 'اخيراً': 'أخيراً',
|
| 222 |
+
'اساسا': 'أساساً', 'اساساً': 'أساساً',
|
| 223 |
+
'احيانا': 'أحياناً', 'احياناً': 'أحياناً',
|
| 224 |
+
'ابدا': 'أبداً', 'ابداً': 'أبداً',
|
| 225 |
+
'اصلا': 'أصلاً', 'اصلاً': 'أصلاً',
|
| 226 |
+
'اخبار': 'أخبار', 'اخبر': 'أخبر',
|
| 227 |
+
'امر': 'أمر', 'امور': 'أمور',
|
| 228 |
+
'اهم': 'أهم', 'اهمية': 'أهمية',
|
| 229 |
+
'اصبح': 'أصبح', 'اصل': 'أصل',
|
| 230 |
+
'اثر': 'أثر', 'اثار': 'آثار',
|
| 231 |
+
'اساء': 'أساء', 'اساس': 'أساس',
|
| 232 |
+
'استاذ': 'أستاذ', 'اسلام': 'إسلام',
|
| 233 |
+
# Batch 3: More hamza entries for remaining FN cases
|
| 234 |
+
'اسرة': 'أسرة', 'اسر': 'أسر',
|
| 235 |
+
'اعضاء': 'أعضاء', 'اعداد': 'أعداد',
|
| 236 |
+
'اعمال': 'أعمال', 'اعمار': 'أعمار',
|
| 237 |
+
'انجاز': 'إنجاز', 'انجازات': 'إنجازات',
|
| 238 |
+
'انشاء': 'إنشاء', 'انتاج': 'إنتاج',
|
| 239 |
+
'انتخابات': 'انتخابات', 'انتظار': 'انتظار',
|
| 240 |
+
'اسلامي': 'إسلامي', 'اسلامية': 'إسلامية',
|
| 241 |
+
'امكانية': 'إمكانية', 'امكان': 'إمكان',
|
| 242 |
+
'اشكالية': 'إشكالية',
|
| 243 |
+
'ادارة': 'إدارة', 'ادارية': 'إدارية',
|
| 244 |
+
'اعلام': 'إعلام', 'اعلامي': 'إعلامي',
|
| 245 |
+
'احتمال': 'احتمال', 'احتفال': 'احتفال',
|
| 246 |
+
'ازور': 'أزور', 'اذهب': 'أذهب', 'اكتب': 'أكتب',
|
| 247 |
+
'اقرا': 'أقرأ', 'اقرأ': 'أقرأ',
|
| 248 |
+
'اعمل': 'أعمل', 'ادرس': 'أدرس',
|
| 249 |
+
'اشتري': 'أشتري', 'اسافر': 'أسافر',
|
| 250 |
+
'مسؤول': 'مسؤول', 'مسؤولية': 'مسؤولية',
|
| 251 |
+
'رؤية': 'رؤية', 'رؤيا': 'رؤيا',
|
| 252 |
+
'مؤسسة': 'مؤسسة', 'مؤتمر': 'مؤتمر',
|
| 253 |
+
'تأثير': 'تأثير', 'تأكيد': 'تأكيد',
|
| 254 |
+
# FIX-14: Alif maqsura common errors
|
| 255 |
+
'المستشفي': 'المستشفى',
|
| 256 |
+
'مصطفي': 'مصطفى', 'موسي': 'موسى', 'عيسي': 'عيسى',
|
| 257 |
+
'هدي': 'هدى', 'بني': 'بنى',
|
| 258 |
+
'معني': 'معنى', 'مبني': 'مبنى',
|
| 259 |
+
'علي': 'على', # Common alif maqsura confusion
|
| 260 |
+
'الي': 'إلى',
|
| 261 |
+
}
|
| 262 |
+
|
| 263 |
+
@staticmethod
|
| 264 |
+
def fix_hamza_conservative(text: str) -> str:
    """Replace a word-final أ or إ with a bare ا.

    Only words of three or more characters are touched, and only their last
    character; hamza elsewhere in the word is left alone. Words are re-joined
    with single spaces.
    """
    def _soften_final(token: str) -> str:
        if len(token) >= 3 and token.endswith(('أ', 'إ')):
            return token[:-1] + 'ا'
        return token

    return ' '.join(_soften_final(token) for token in text.split())
|
| 276 |
+
|
| 277 |
+
# Attached prefixes that can precede hamza-whitelist words
|
| 278 |
+
# Ordered longest-first so وال is tried before و
|
| 279 |
+
HAMZA_PREFIXES = ['وبال', 'فبال', 'وال', 'بال', 'فال', 'كال', 'ول', 'فل',
|
| 280 |
+
'وب', 'فب', 'وك', 'فك', 'و', 'ف', 'ب', 'ك', 'ل']
|
| 281 |
+
|
| 282 |
+
@staticmethod
|
| 283 |
+
def fix_common_hamza(text: str) -> str:
|
| 284 |
+
"""
|
| 285 |
+
Fix common hamza placement errors using a whitelist.
|
| 286 |
+
Also handles prefixed words: و/ف/ب/ك/ل + whitelist word.
|
| 287 |
+
e.g. واصدقائي → وأصدقائي, بالاسعار → بالأسعار
|
| 288 |
+
"""
|
| 289 |
+
words = text.split()
|
| 290 |
+
result = []
|
| 291 |
+
for word in words:
|
| 292 |
+
# Check exact match first
|
| 293 |
+
if word in AraSpellPostProcessor.HAMZA_WHITELIST:
|
| 294 |
+
result.append(AraSpellPostProcessor.HAMZA_WHITELIST[word])
|
| 295 |
+
continue
|
| 296 |
+
|
| 297 |
+
# Try stripping common prefixes and looking up the remainder
|
| 298 |
+
fixed = False
|
| 299 |
+
for prefix in AraSpellPostProcessor.HAMZA_PREFIXES:
|
| 300 |
+
if word.startswith(prefix) and len(word) > len(prefix) + 1:
|
| 301 |
+
remainder = word[len(prefix):]
|
| 302 |
+
if remainder in AraSpellPostProcessor.HAMZA_WHITELIST:
|
| 303 |
+
result.append(prefix + AraSpellPostProcessor.HAMZA_WHITELIST[remainder])
|
| 304 |
+
fixed = True
|
| 305 |
+
break
|
| 306 |
+
if not fixed:
|
| 307 |
+
result.append(word)
|
| 308 |
+
return ' '.join(result)
|
| 309 |
+
|
| 310 |
+
@staticmethod
|
| 311 |
+
def fix_ha_ta_marbuta(text: str, vocab_manager=None) -> str:
|
| 312 |
+
"""
|
| 313 |
+
Smart ه → ة fix at end of words.
|
| 314 |
+
Strategy: Always prefer ة when the previous char is a consonant,
|
| 315 |
+
UNLESS the ه form is specifically a known word and the ة form is NOT.
|
| 316 |
+
"""
|
| 317 |
+
PROTECTED_ENDINGS = ['لله']
|
| 318 |
+
# Words that genuinely end in ه (not ة)
|
| 319 |
+
PROTECTED_HA_WORDS = {
|
| 320 |
+
'الله', 'لله', 'فيه', 'عليه', 'منه', 'به', 'له', 'إليه',
|
| 321 |
+
'وجه', 'نزه', 'سفه', 'فقه', 'نبه', 'شبه', 'مكره', 'تنبه',
|
| 322 |
+
'اتجه', 'توجه', 'تشابه',
|
| 323 |
+
}
|
| 324 |
+
words = text.split()
|
| 325 |
+
result = []
|
| 326 |
+
for word in words:
|
| 327 |
+
if any(word.endswith(e) for e in PROTECTED_ENDINGS):
|
| 328 |
+
result.append(word)
|
| 329 |
+
continue
|
| 330 |
+
if word in PROTECTED_HA_WORDS:
|
| 331 |
+
result.append(word)
|
| 332 |
+
continue
|
| 333 |
+
if len(word) >= 3 and word.endswith('ه'):
|
| 334 |
+
if word[-2] in AraSpellPostProcessor.ARABIC_CONSONANTS:
|
| 335 |
+
candidate_with_ta = word[:-1] + 'ة'
|
| 336 |
+
# Default: prefer ة (correct Arabic orthography for feminine nouns)
|
| 337 |
+
if vocab_manager:
|
| 338 |
+
ta_iv = vocab_manager.is_iv(candidate_with_ta)
|
| 339 |
+
ha_iv = vocab_manager.is_iv(word)
|
| 340 |
+
if ta_iv:
|
| 341 |
+
# Always prefer ة when it's a valid word
|
| 342 |
+
result.append(candidate_with_ta)
|
| 343 |
+
continue
|
| 344 |
+
elif ha_iv:
|
| 345 |
+
result.append(word)
|
| 346 |
+
continue
|
| 347 |
+
# No vocab manager — default to ة
|
| 348 |
+
result.append(candidate_with_ta)
|
| 349 |
+
continue
|
| 350 |
+
result.append(word)
|
| 351 |
+
return ' '.join(result)
|
| 352 |
+
|
| 353 |
+
# --- Hallucination Removal ---
|
| 354 |
+
|
| 355 |
+
@staticmethod
|
| 356 |
+
def remove_hallucinations(text: str) -> str:
    """Strip two kinds of artifact: a stray trailing و and an echoed word.

    * A word longer than four characters that ends in و, where the character
      before the و is one of ``ةهاأإآء``, loses that final و.
    * When a word and the word after it are equal once "ال" is removed,
      ة is read as ه, and أ/إ/آ are read as ا, only one of the pair is kept:
      the second if it alone starts with "ال", otherwise the first.

    Text without any words is returned unchanged; otherwise words are
    re-joined with single spaces.
    """
    tokens = text.split()
    if not tokens:
        return text

    def _fold(token: str) -> str:
        folded = token.replace('ال', '').replace('ة', 'ه')
        return re.sub(r'[أإآ]', 'ا', folded)

    total = len(tokens)
    cleaned = []
    cursor = 0
    while cursor < total:
        current = tokens[cursor]
        if len(current) > 4 and current.endswith('و') and current[-2] in 'ةهاأإآء':
            current = current[:-1]

        upcoming = tokens[cursor + 1] if cursor + 1 < total else None
        if upcoming is not None and _fold(current) == _fold(upcoming):
            prefer_upcoming = upcoming.startswith('ال') and not current.startswith('ال')
            cleaned.append(upcoming if prefer_upcoming else current)
            cursor += 2
        else:
            cleaned.append(current)
            cursor += 1
    return ' '.join(cleaned)
|
| 385 |
+
|
| 386 |
+
@staticmethod
|
| 387 |
+
def remove_hallucinated_prefix(text: str, original: str) -> str:
|
| 388 |
+
"""Remove particles (و/في) added by model if not in original"""
|
| 389 |
+
if not original:
|
| 390 |
+
return text
|
| 391 |
+
if text.startswith('و ') and not original.startswith('و'):
|
| 392 |
+
rest = text[2:].strip()
|
| 393 |
+
if AraSpellPostProcessor.normalize_special_chars(rest) == AraSpellPostProcessor.normalize_special_chars(original):
|
| 394 |
+
return rest
|
| 395 |
+
return text
|
| 396 |
+
|
| 397 |
+
# --- Word Splitting & Merging ---
|
| 398 |
+
|
| 399 |
+
@staticmethod
|
| 400 |
+
def merge_separated_al(text: str) -> str:
    """Re-attach a detached "ال" to the word after it (ال + كتاب → الكتاب).

    Only an "ال" that begins at a word boundary and is followed by
    whitespace and a word is merged.
    """
    detached_article = re.compile(r'\bال\s+(\w+)')
    return detached_article.sub(lambda found: 'ال' + found.group(1), text)
|
| 403 |
+
|
| 404 |
+
@staticmethod
|
| 405 |
+
def join_fragments(text: str) -> str:
    """Glue adjacent word fragments back together.

    Words are scanned in pairs from left to right. A pair where both words
    are in the standalone set is never joined. Otherwise the pair is joined
    when the first rule below applies:

    1. the second word is a single character → concatenate;
    2. both have at least two characters and the first ends with the
       character the second starts with → concatenate, keeping that
       character once;
    3. the first has 2-4 characters, the second 1-2, and together 3-7
       → concatenate.

    After a join, scanning resumes after the pair. Text with fewer than two
    words is returned unchanged; otherwise the result is re-joined with
    single spaces.
    """
    tokens = text.split()
    if len(tokens) < 2:
        return text

    standalone = {
        'من', 'في', 'على', 'عن', 'مع', 'إلى', 'الى', 'حتى', 'منذ', 'خلال',
        'بعد', 'قبل', 'ب', 'ل', 'ك', 'و', 'أو', 'لا', 'ما', 'لم', 'لن',
        'هو', 'هي', 'هم', 'أن', 'إن', 'كل', 'كان', 'قد', 'قال', 'ذلك',
        'هذا', 'هذه', 'تلك', 'التي', 'الذي', 'التى', 'اللذي'
    }

    def _fuse(head: str, tail: str):
        """Return the joined form of the pair, or None to keep them apart."""
        if head in standalone and tail in standalone:
            return None
        if len(tail) == 1:
            return head + tail
        if len(head) >= 2 and len(tail) >= 2 and head[-1] == tail[0]:
            return head[:-1] + tail
        if (2 <= len(head) <= 4
                and 1 <= len(tail) <= 2
                and 3 <= len(head) + len(tail) <= 7):
            return head + tail
        return None

    total = len(tokens)
    joined = []
    cursor = 0
    while cursor < total:
        head = tokens[cursor]
        fused = _fuse(head, tokens[cursor + 1]) if cursor + 1 < total else None
        if fused is None:
            joined.append(head)
            cursor += 1
        else:
            joined.append(fused)
            cursor += 2
    return ' '.join(joined)
|
| 445 |
+
|
| 446 |
+
# --- Main Pipelines ---
|
| 447 |
+
|
| 448 |
+
@staticmethod
def full_postprocess(text: str, original: str = "", vocab_manager=None) -> str:
    """Run the complete post-processing chain over ``text``.

    Args:
        text: Text to clean up.
        original: Optional source text; when non-empty it is passed to
            ``remove_hallucinated_prefix`` before any other step runs.
        vocab_manager: Optional object forwarded to ``fix_ha_ta_marbuta``.

    Returns:
        The text after every step has been applied in order.
    """
    processor = AraSpellPostProcessor

    if original:
        text = processor.remove_hallucinated_prefix(text, original)

    before_ta_marbuta = (
        processor.normalize_special_chars,
        processor.remove_hallucinations,
        processor.unified_collapse_repeated,
        processor.fix_hamza_conservative,
        processor.fix_common_hamza,  # Fix S3: hamza whitelist
    )
    for step in before_ta_marbuta:
        text = step(text)

    # The only step that takes the vocabulary manager.
    text = processor.fix_ha_ta_marbuta(text, vocab_manager=vocab_manager)

    after_ta_marbuta = (
        processor.remove_word_repetition_with_wa,
        processor.remove_duplicate_words,
        processor.normalize_spaces,
    )
    for step in after_ta_marbuta:
        text = step(text)
    return text
|
| 463 |
+
|
| 464 |
+
|
| 465 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 466 |
+
# ERROR CLASSIFIER
|
| 467 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 468 |
+
|
| 469 |
+
class ErrorClassifier:
    """Classify the type of spelling error present in a text."""

    # Look-alike letters that are not part of the standard Arabic alphabet.
    # U+0693 is added explicitly: RulesBasedCorrector.SUBSTITUTION_MAP repairs
    # it, so the classifier has to flag it too or such text is reported CLEAN.
    NON_ARABIC_KEYBOARD = set('پگچژکەڕڤڵڎےۀۃھیټډڼڑ') | {'\u0693'}

    @staticmethod
    def has_char_substitution(text: str) -> bool:
        """Return True when ``text`` contains any look-alike foreign letter."""
        return any(c in ErrorClassifier.NON_ARABIC_KEYBOARD for c in text)

    @staticmethod
    def has_char_repetition(text: str, threshold: int = 3) -> bool:
        """Return True when a character occurs ``threshold``+ times in a row."""
        return bool(re.search(r"(.)\1{" + str(threshold - 1) + ",}", text))

    @staticmethod
    def has_word_merge(text: str, max_word_len: int = 8) -> bool:
        """Heuristically detect words that were typed without a space.

        True when any token is longer than ``max_word_len``, or when the text
        is a single token and the whole text is longer than six characters.
        """
        words = text.split()
        if any(len(w) > max_word_len for w in words):
            return True
        if len(words) == 1 and len(text) > 6:
            return True
        return False

    @staticmethod
    def classify(text: str) -> ErrorType:
        """Return the error category of ``text``.

        ``MIXED`` is returned when at least two detectors fire; otherwise the
        single firing detector decides, and ``CLEAN`` means none fired.
        """
        has_rep = ErrorClassifier.has_char_repetition(text)
        has_merge = ErrorClassifier.has_word_merge(text)
        has_sub = ErrorClassifier.has_char_substitution(text)

        if sum([has_rep, has_merge, has_sub]) >= 2:
            return ErrorType.MIXED
        if has_sub:
            return ErrorType.CHAR_SUBSTITUTION
        if has_rep:
            return ErrorType.CHAR_REPETITION
        if has_merge:
            return ErrorType.WORD_MERGE
        return ErrorType.CLEAN
|
| 507 |
+
|
| 508 |
+
|
| 509 |
+
# ═══════════════════════════════════════════════════════════════════════════════
|
| 510 |
+
# RULES-BASED CORRECTOR
|
| 511 |
+
# ═══════════════════════════════════════════════════════════════════════════════
|
| 512 |
+
|
| 513 |
+
class RulesBasedCorrector:
    """Deterministic repairs: letter substitution, repetition and splitting."""

    # Look-alike letter -> standard Arabic letter.
    SUBSTITUTION_MAP = {
        'ک': 'ك', 'ی': 'ي', 'ے': 'ي',
        'پ': 'ب', 'چ': 'ج', 'ژ': 'ز',
        'گ': 'ك', 'ڤ': 'ف', 'ڵ': 'ل',
        'ڕ': 'ر', 'ڎ': 'د', 'ڼ': 'ن',
        'ټ': 'ت', 'ډ': 'د', 'ړ': 'ر',
        'ۀ': 'ه', 'ۃ': 'ة', 'ھ': 'ه',
        'ە': 'ه', 'ڑ': 'ر'
    }

    PREPOSITIONS = {
        'من', 'في', 'على', 'عن', 'مع', 'إلى', 'الى',
        'حتى', 'منذ', 'خلال', 'بعد', 'قبل',
        'ب', 'ل', 'ك', 'لل'
    }

    # Key -> keys treated as adjacent to it.
    KEYBOARD_NEIGHBORS = {
        'ض': ['ص', 'ق'], 'ص': ['ض', 'ث', 'ق'], 'ث': ['ص', 'ق'],
        'ق': ['ض', 'ص', 'ث', 'ف', 'غ'], 'ف': ['ق', 'غ', 'ع', 'ب'],
        'غ': ['ق', 'ف', 'ع', 'ه'], 'ع': ['ف', 'غ', 'ه', 'خ'],
        'ه': ['غ', 'ع', 'خ', 'ح'], 'خ': ['ع', 'ه', 'ح', 'ج'],
        'ح': ['ه', 'خ', 'ج'], 'ج': ['خ', 'ح', 'د'],
        'د': ['ج', 'ذ'], 'ذ': ['د'],
        'ش': ['س', 'ي', 'ئ'], 'س': ['ش', 'ي', 'ب'],
        'ي': ['ش', 'س', 'ب', 'ت'], 'ب': ['ي', 'س', 'ف', 'ل', 'ن'],
        'ل': ['ب', 'ا', 'ن', 'م'], 'ا': ['ل', 'ت', 'م'],
        'ت': ['ي', 'ا', 'ن'], 'ن': ['ب', 'ل', 'ت', 'م', 'ك'],
        'م': ['ل', 'ا', 'ن', 'ك'], 'ك': ['ن', 'م', 'ط'],
        'ط': ['ك', 'ظ'], 'ظ': ['ط'],
        'ئ': ['ش', 'ء', 'ر'], 'ء': ['ئ', 'ؤ'], 'ؤ': ['ء', 'ر'],
        'ر': ['ئ', 'ؤ', 'لا', 'ى', 'ز'], 'لا': ['ر', 'ى'],
        'ى': ['ر', 'لا', 'ة', 'ز'], 'ة': ['ى', 'و', 'ز'],
        'و': ['ة', 'ز'], 'ز': ['ر', 'ى', 'ة', 'و'],
        'أ': ['ا', 'إ', 'آ'], 'إ': ['ا', 'أ'], 'آ': ['ا', 'أ'],
    }

    @staticmethod
    def is_keyboard_neighbor(char1: str, char2: str) -> bool:
        """Return True when ``char2`` is listed as a neighbour of ``char1``."""
        return char2 in RulesBasedCorrector.KEYBOARD_NEIGHBORS.get(char1, [])

    @staticmethod
    def fix_char_substitution(text: str) -> str:
        """Replace every look-alike letter by its standard Arabic letter."""
        # No replacement value is itself a key of the map, so one translate
        # pass yields the same text as replacing the keys one after another.
        table = str.maketrans(RulesBasedCorrector.SUBSTITUTION_MAP)
        return text.translate(table)

    @staticmethod
    def fix_char_repetition(text: str) -> str:
        """Collapse runs of three or more identical characters to one.

        Digits and whitespace are left alone.
        """
        return re.sub(r'([^\d\s])\1{2,}', r'\1', text)

    @staticmethod
    def advanced_heuristic_repair(text: str) -> str:
        """Apply substitution, repetition collapsing and per-word splitting."""
        cleaned = RulesBasedCorrector.fix_char_repetition(
            RulesBasedCorrector.fix_char_substitution(text)
        )
        return ' '.join(
            RulesBasedCorrector._recursive_split(token)
            for token in cleaned.split()
        )

    @staticmethod
    def _leading_particle(word: str):
        """Return the particle to peel off the front of ``word``, or None."""
        separables = sorted(
            ['من', 'في', 'على', 'عن', 'مع', 'إلى', 'الى', 'حتى', 'منذ', 'خلال', 'بعد', 'قبل'],
            key=len, reverse=True,
        )
        for sep in separables:
            # A word that is itself a particle is never split.
            if word == sep:
                return None
            # Only split when at least three characters remain.
            if word.startswith(sep) and len(word) - len(sep) >= 3:
                return sep
        if word.startswith('يا') and len(word) > 4:
            return 'يا'
        return None

    @staticmethod
    def _recursive_split(word: str) -> str:
        """Peel leading particles off ``word`` and separate them with spaces.

        Words shorter than four characters are never split; peeling stops as
        soon as the remainder is that short or no particle applies.
        """
        pieces = []
        rest = word
        while len(rest) >= 4:
            particle = RulesBasedCorrector._leading_particle(rest)
            if particle is None:
                break
            pieces.append(particle)
            rest = rest[len(particle):]
        pieces.append(rest)
        return ' '.join(pieces)
|
| 593 |
+
|
| 594 |
+
|
| 595 |
+
# ═══════════════════════════════════════════════════════════════════════════════
|
| 596 |
+
# OUTPUT VALIDATOR (Hallucination Prevention)
|
| 597 |
+
# ═══════════════════════════════════════════════════════════════════════════════
|
| 598 |
+
|
| 599 |
+
class OutputValidator:
    """Reject corrections that drift too far from the text they came from."""

    @staticmethod
    def calculate_edit_distance(s1: str, s2: str) -> int:
        """Return ``Levenshtein.distance`` of the two strings."""
        return Levenshtein.distance(s1, s2)

    @staticmethod
    def check_character_preservation(original: str, corrected: str) -> Tuple[bool, str]:
        """Check that both strings use a similar set of characters.

        Returns:
            ``(True, "valid")`` when ``original`` is empty or the Jaccard
            similarity of the two character sets is at least 0.35, otherwise
            ``(False, "low_character_similarity")``.
        """
        source_chars = set(original)
        if not source_chars:
            return True, "valid"
        target_chars = set(corrected)
        # The union contains source_chars, so it cannot be empty here.
        overlap = len(source_chars & target_chars) / len(source_chars | target_chars)
        if overlap < 0.35:
            return False, "low_character_similarity"
        return True, "valid"

    @staticmethod
    def check_word_count(original: str, corrected: str) -> Tuple[bool, str]:
        """Check that the word count did not change by more than a factor of 2.

        A single-word original may expand to up to three words, or up to six
        when the original is longer than twelve characters.

        Returns:
            ``(True, "valid")`` or ``(False, "word_count_mismatch")``.
        """
        words_before = len(original.split())
        words_after = len(corrected.split())

        if words_before == 1:
            allowed_split = words_after <= 3 or (len(original) > 12 and words_after <= 6)
            if allowed_split:
                return True, "valid"

        ratio = words_after / words_before if words_before > 0 else 0
        if 0.5 <= ratio <= 2.0:
            return True, "valid"
        return False, "word_count_mismatch"

    def validate(self, original: str, corrected: str, error_type: str) -> Tuple[bool, str]:
        """Decide whether ``corrected`` is an acceptable fix for ``original``.

        Args:
            original: Text before correction.
            corrected: Candidate correction.
            error_type: Error category of ``original``; it is compared against
                ``ErrorType.CHAR_REPETITION`` to waive the "too short" check.

        Returns:
            ``(accepted, reason)`` where ``reason`` names the first check that
            decided the outcome.
        """
        if not corrected or not corrected.strip():
            return False, "empty_output"

        def without_spaces(value):
            return value.replace(' ', '').replace('\u200c', '')

        # Differences limited to spaces / U+200C are always accepted.
        if without_spaces(original) == without_spaces(corrected):
            return True, "space_leniency_accept"

        chars_before = len(original)
        chars_after = len(corrected)
        if chars_after > chars_before * 2.5:
            return False, "too_long"
        # Collapsing repeated characters may legitimately shrink the text.
        if chars_after < chars_before * 0.5 and error_type != ErrorType.CHAR_REPETITION:
            return False, "too_short"

        for check in (self.check_word_count, self.check_character_preservation):
            passed, reason = check(original, corrected)
            if not passed:
                return False, reason
        return True, "valid"
|
| 656 |
+
|
| 657 |
+
|
| 658 |
+
# ═══════════════════════════════════════════════════════════════════════════════
|
| 659 |
+
# VOCABULARY MANAGER
|
| 660 |
+
# ═══════════════════════════════════════════════════════════════════════════════
|
| 661 |
+
|
| 662 |
+
class VocabularyManager:
    """Single place for in-vocabulary (IV) / out-of-vocabulary (OOV) checks."""

    HAMZA_VARIANTS = {'أ', 'إ', 'آ', 'ء', 'ؤ', 'ئ', 'ا'}
    ALEF_NORMALIZED = 'ا'
    TA_MARBUTA = 'ة'
    HA = 'ه'
    YA_VARIANTS = {'ي', 'ى'}
    YA_NORMALIZED = 'ي'

    def __init__(self, tokenizer):
        """Build the word sets from ``tokenizer.get_vocab()``.

        Only purely alphabetic entries longer than one character that do not
        start with ``##`` are kept as words.
        """
        self.tokenizer = tokenizer
        token_ids = tokenizer.get_vocab()
        self.vocab = {
            token for token in token_ids
            if token.isalpha() and not token.startswith('##') and len(token) > 1
        }
        # The tokenizer's id for each entry is used as its rank.
        self.vocab_rank = dict(token_ids)
        self.normalized_vocab = {
            self.normalize_for_comparison(token): token for token in self.vocab
        }
        logger.info(f"VocabularyManager initialized: {len(self.vocab)} words")

    @classmethod
    def normalize_for_comparison(cls, word: str) -> str:
        """Fold spelling variants so that near-identical words compare equal.

        Every character in ``HAMZA_VARIANTS`` becomes a bare alef, a
        word-final ta marbuta becomes ha, and both ya forms become ``ي``.
        """
        last_index = len(word) - 1

        def fold(index, char):
            if char in cls.HAMZA_VARIANTS:
                return cls.ALEF_NORMALIZED
            if char == cls.TA_MARBUTA and index == last_index:
                return cls.HA
            if char in cls.YA_VARIANTS:
                return cls.YA_NORMALIZED
            return char

        return ''.join(fold(index, char) for index, char in enumerate(word))

    def is_iv(self, word: str) -> bool:
        """Return True when ``word`` (non-word characters removed) is known.

        A token with no word characters at all counts as known. A word also
        counts as known when its normalized form matches a vocabulary entry.
        """
        bare = re.sub(r'[^\w]', '', word)
        if not bare or bare in self.vocab:
            return True
        return self.normalize_for_comparison(bare) in self.normalized_vocab

    def is_oov(self, word: str) -> bool:
        """Return the negation of :meth:`is_iv`."""
        return not self.is_iv(word)

    def get_frequency_rank(self, word: str) -> int:
        """Return the stored rank of ``word``, or 999999 when it is unknown."""
        bare = re.sub(r'[^\w]', '', word)
        return self.vocab_rank.get(bare, 999999)

    def all_words_iv(self, text: str) -> bool:
        """Return True when every whitespace-separated token is known."""
        return all(self.is_iv(token) for token in text.split())

    def count_oov_words(self, text: str) -> int:
        """Return how many whitespace-separated tokens are unknown."""
        return len(self.get_oov_words(text))

    def get_oov_words(self, text: str) -> List[str]:
        """Return the unknown tokens of ``text`` in their original order."""
        return [token for token in text.split() if self.is_oov(token)]

    def words_are_equivalent(self, word1: str, word2: str) -> bool:
        """Return True when both words share the same normalized form."""
        return (self.normalize_for_comparison(word1)
                == self.normalize_for_comparison(word2))

    @staticmethod
    def damerau_levenshtein_distance(s1: str, s2: str) -> int:
        """Return ``jellyfish.damerau_levenshtein_distance`` of the strings."""
        return jellyfish.damerau_levenshtein_distance(s1, s2)

    def calculate_similarity(self, original: str, corrected: str) -> float:
        """Return ``1 - distance / longest_length`` for the two strings."""
        distance = self.damerau_levenshtein_distance(original, corrected)
        longest = max(len(original), len(corrected), 1)
        return 1.0 - (distance / longest)
|
| 739 |
+
|
| 740 |
+
|
| 741 |
+
# ═══════════════════════════════════════════════════════════════════════════════
|
| 742 |
+
# WORD ALIGNER
|
| 743 |
+
# ═══════════════════════════════════════════════════════════════════════════════
|
| 744 |
+
|
| 745 |
+
class WordAligner:
    """Combine input and model output word by word into a hybrid correction."""

    def __init__(self, vocab_manager):
        """Store the vocabulary object used for every IV/OOV decision."""
        self.vocab = vocab_manager

    def align_words(self, input_text: str, output_text: str) -> str:
        """Pick, position by position, the better of input and output words.

        When the word counts differ by more than two no alignment is tried:
        the output is returned if it has fewer OOV words than the input,
        otherwise the input. Surplus output words are kept as they are;
        surplus input words are kept only when they are in vocabulary.
        """
        source_words = input_text.split()
        target_words = output_text.split()

        if abs(len(source_words) - len(target_words)) > 2:
            input_oov = self.vocab.count_oov_words(input_text)
            output_oov = self.vocab.count_oov_words(output_text)
            return output_text if output_oov < input_oov else input_text

        aligned = [
            self._select_best_word(source, target)
            for source, target in zip(source_words, target_words)
        ]
        shared = len(aligned)
        if len(target_words) > shared:
            aligned.extend(target_words[shared:])
        elif len(source_words) > shared:
            aligned.extend(
                word for word in source_words[shared:] if self.vocab.is_iv(word)
            )
        return ' '.join(aligned)

    def _select_best_word(self, input_word: str, output_word: str) -> str:
        """Choose between one input word and the output word aligned to it."""
        if input_word == output_word:
            return input_word

        in_iv = self.vocab.is_iv(input_word)
        out_iv = self.vocab.is_iv(output_word)

        # Exactly one side is a known word: take that side.
        if in_iv != out_iv:
            return output_word if out_iv else input_word

        if in_iv:
            # Both are known words. Fix S1: the input is kept, except when
            # the only change is a final ه turned into ة (standard feminine
            # ending); a correct ة is never regressed to ه.
            restores_ta_marbuta = (
                input_word.endswith('ه')
                and output_word.endswith('ة')
                and input_word[:-1] == output_word[:-1]
            )
            return output_word if restores_ta_marbuta else input_word

        # Both unknown: try swapping a single differing letter either way.
        if len(input_word) == len(output_word) and len(input_word) >= 3:
            for pos, (in_char, out_char) in enumerate(zip(input_word, output_word)):
                if in_char == out_char:
                    continue
                hybrids = (
                    input_word[:pos] + out_char + input_word[pos + 1:],
                    output_word[:pos] + in_char + output_word[pos + 1:],
                )
                for hybrid in hybrids:
                    if self.vocab.is_iv(hybrid):
                        return hybrid
        return output_word
|
| 803 |
+
|
| 804 |
+
|
| 805 |
+
# ═══════════════════════════════════════════════════════════════════════════════
|
| 806 |
+
# SPLIT/MERGE SPECIALIST
|
| 807 |
+
# ═══════════════════════════════════════════════════════════════════════════════
|
| 808 |
+
|
| 809 |
+
class SplitMergeSpecialist:
    """Split run-together words and merge fragments, validated by vocabulary."""

    # Standalone words that may have been typed glued to the next word.
    SEPARABLE_PREFIXES = [
        'من', 'في', 'على', 'عن', 'مع', 'إلى', 'الى', 'حتى', 'منذ', 'خلال',
        'بعد', 'قبل', 'بين', 'حول', 'تحت', 'فوق', 'أمام', 'وراء', 'دون',
        'أن', 'لن', 'لم', 'قد', 'سوف', 'كي', 'إذا', 'لو', 'مثل', 'غير',
        'يا',
    ]

    PROTECTED_WORDS = {
        'في', 'من', 'على', 'عن', 'مع', 'إلى', 'الى', 'ان', 'أن', 'لا', 'ما', 'هو', 'هي',
        'لم', 'لن', 'قد', 'كل', 'كان', 'ذلك', 'هذا', 'هذه', 'التي', 'الذي', 'بين',
    }

    # Prefixes that are written attached; words carrying them are not split.
    ATTACHED_PREFIXES = [
        'وال', 'بال', 'فال', 'كال', 'لل',
        'وب', 'وف', 'ول', 'وك', 'وم', 'ون',
        'فب', 'فل', 'فك', 'فم',
    ]

    PRONOUN_SUFFIXES = {'كم', 'هم', 'ها', 'هن', 'كن', 'نا', 'هما', 'كما', 'تم', 'تن'}

    def __init__(self, vocab_manager):
        """Keep the vocabulary object and order the prefixes longest first."""
        self.vocab = vocab_manager
        self.separable_prefixes = sorted(
            self.SEPARABLE_PREFIXES, key=len, reverse=True
        )

    def split_word(self, word: str) -> str:
        """Split ``word`` into two known parts when that is possible.

        Words shorter than five characters, known words and protected words
        are returned unchanged, as are words made of an attached prefix plus
        a known remainder. Otherwise the longest separable prefix leaving a
        known remainder wins; failing that, the first cut that leaves at
        least three characters on each side and yields two known words.
        """
        if len(word) < 5 or self.vocab.is_iv(word) or word in self.PROTECTED_WORDS:
            return word

        # Attached prefix + known remainder is a valid single word.
        if any(
            word.startswith(prefix) and self.vocab.is_iv(word[len(prefix):])
            for prefix in self.ATTACHED_PREFIXES
        ):
            return word

        for prefix in self.separable_prefixes:
            if not word.startswith(prefix) or len(word) <= len(prefix) + 2:
                continue
            remainder = word[len(prefix):]
            if self.vocab.is_iv(remainder):
                return f"{prefix} {remainder}"

        for cut in range(3, len(word) - 2):
            left, right = word[:cut], word[cut:]
            if self.vocab.is_iv(left) and self.vocab.is_iv(right):
                return f"{left} {right}"
        return word

    def _is_fragment_pair(self, left: str, right: str) -> bool:
        """Return True when ``left`` and ``right`` should be merged."""
        joined = left + right
        # Every merge rule requires the joined form to be a known word.
        if not self.vocab.is_iv(joined):
            return False
        if len(right) == 1 and right in 'ةهاي':
            return True
        if left == 'ال' and len(right) >= 2:
            return True
        if self.vocab.is_oov(left) and self.vocab.is_oov(right):
            return True
        if len(left) <= 2 and self.vocab.is_oov(left):
            return True
        if right in self.PRONOUN_SUFFIXES and not self.vocab.is_iv(left):
            return True
        return len(left) <= 3 and len(right) <= 3 and len(joined) >= 5

    def merge_fragments(self, text: str) -> str:
        """Merge adjacent tokens whose concatenation is a known word.

        Text with fewer than two tokens is returned unchanged. A merged pair
        consumes both tokens; otherwise the scan moves on by one token.
        """
        tokens = text.split()
        if len(tokens) < 2:
            return text

        merged_tokens = []
        total = len(tokens)
        pos = 0
        while pos < total:
            current = tokens[pos]
            if pos + 1 < total and self._is_fragment_pair(current, tokens[pos + 1]):
                merged_tokens.append(current + tokens[pos + 1])
                pos += 2
            else:
                merged_tokens.append(current)
                pos += 1
        return ' '.join(merged_tokens)

    def process_text(self, text: str) -> str:
        """Merge fragments first, then split unknown words of 4+ characters."""
        merged = self.merge_fragments(text)
        return ' '.join(
            self.split_word(token)
            if self.vocab.is_oov(token) and len(token) >= 4
            else token
            for token in merged.split()
        )
|
| 920 |
+
|
| 921 |
+
|
| 922 |
+
# ═══════════════════════════════════════════════════════════════════════════════
|
| 923 |
+
# EDIT DISTANCE CORRECTOR
|
| 924 |
+
# ═══════════════════════════════════════════════════════════════════════════════
|
| 925 |
+
|
| 926 |
+
class EditDistanceCorrector:
    """Propose corrections that are one or two edits away from a known word."""

    def __init__(self, tokenizer):
        """Build the word set and rank table from ``tokenizer.get_vocab()``."""
        self.tokenizer = tokenizer
        token_ids = tokenizer.get_vocab()
        self.vocab = {
            token for token in token_ids
            if token.isalpha() and not token.startswith('##') and len(token) > 1
        }
        # The tokenizer's id for each entry is used as its rank.
        self.vocab_rank = dict(token_ids)

    def edits1(self, word):
        """Return every string one edit away from ``word``.

        An edit is a deletion, a transposition of adjacent characters, a
        replacement or an insertion, using the letters listed below.
        """
        letters = 'أابتثجحخدذرزسشصضطظعغفقكلمنهويءآىةئؤ'
        variants = set()
        for cut in range(len(word) + 1):
            head, tail = word[:cut], word[cut:]
            variants.update(head + letter + tail for letter in letters)  # insert
            if not tail:
                continue
            variants.add(head + tail[1:])  # delete
            variants.update(head + letter + tail[1:] for letter in letters)  # replace
            if len(tail) > 1:
                variants.add(head + tail[1] + tail[0] + tail[2:])  # transpose
        return variants

    def edits2(self, word):
        """Lazily yield every string two edits away from ``word``."""
        for first in self.edits1(word):
            yield from self.edits1(first)

    def known(self, words):
        """Return the subset of ``words`` that is in the vocabulary."""
        return {candidate for candidate in words if candidate in self.vocab}

    def generate_candidate(self, text: str) -> str:
        """Replace each unknown word by its best-ranked close vocabulary word.

        Known words are kept exactly as written. For an unknown word the
        one-edit neighbours are tried first; two-edit neighbours are tried
        only for words shorter than seven characters. The chosen candidate
        is the one with the smallest rank; a word with no candidate is kept.
        """
        rewritten = []
        for token in text.split():
            bare = re.sub(r'[^\w]', '', token)
            if bare in self.vocab:
                rewritten.append(token)
                continue

            options = self.known(self.edits1(bare))
            if not options and len(bare) < 7:
                options = self.known(self.edits2(bare))

            if not options:
                rewritten.append(token)
                continue
            rewritten.append(
                min(options, key=lambda w: self.vocab_rank.get(w, 999999))
            )
        return ' '.join(rewritten)
|
| 970 |
+
|
| 971 |
+
|
| 972 |
+
# ═══════════════════════════════════════════════════════════════════════════════
|
| 973 |
+
# CONTEXTUAL CORRECTOR (MLM-based) — Optional, disabled by default to save RAM
|
| 974 |
+
# ═══════════════════════════════════════════════════════════════════════════════
|
| 975 |
+
|
| 976 |
+
class ContextualCorrector:
    """MLM-based contextual correction for confusion pairs."""

    # Letter pairs treated as mutually confusable. The list contains a
    # self-pair and a pair listed in both orders; _build_confusion_map
    # filters those out.
    CONFUSION_PAIRS = [
        ('ض', 'ظ'), ('ذ', 'ز'), ('ث', 'س'), ('ص', 'س'),
        ('ط', 'ت'), ('ق', 'ك'), ('ه', 'ة'), ('ا', 'ى'),
        ('ت', 'د'), ('د', 'ض'), ('ك', 'ق'), ('غ', 'ق'),
        ('ج', 'ش'), ('س', 'ز'), ('ف', 'ب'), ('و', 'و'),
        ('ؤ', 'و'), ('ئ', 'ي'), ('ء', 'أ'), ('إ', 'أ'),
    ]

    def __init__(self, model_name: str = 'aubmindlab/bert-base-arabertv02', cache_size: int = 10000):
        """Load the masked-language model and prepare the score cache.

        Args:
            model_name: Name passed to ``from_pretrained`` for both the
                tokenizer and the model.
            cache_size: Maximum number of entries kept in the score cache.
        """
        from transformers import AutoTokenizer, AutoModelForMaskedLM

        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForMaskedLM.from_pretrained(model_name)
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model = self.model.to(self.device)
        self.model.eval()
        self.confusion_map = self._build_confusion_map()
        self.cache_hits = 0
        self.cache_misses = 0
        self._score_cache = {}
        self.cache_size = cache_size
        self.vocab = self.tokenizer.get_vocab()

    def _build_confusion_map(self):
        """Return ``{letter: [confusable letters]}`` built from the pairs.

        Each pair is registered in both directions. A letter is never mapped
        to itself and no target appears twice in a list, so callers iterating
        the lists do no redundant work.
        """
        confusion_map = {}
        for char1, char2 in self.CONFUSION_PAIRS:
            if char1 == char2:
                continue
            for source, target in ((char1, char2), (char2, char1)):
                targets = confusion_map.setdefault(source, [])
                if target not in targets:
                    targets.append(target)
        return confusion_map

    def get_confusable_chars(self, char: str) -> List[str]:
        """Return the letters confusable with ``char`` (empty when none)."""
        return self.confusion_map.get(char, [])

    def generate_candidates(self, word: str) -> List[str]:
        """Return ``word`` followed by its candidate corrections, no repeats.

        Candidates are produced in this order: confusable-letter swaps,
        removal of one letter from a doubled pair, then (only when the result
        is a key of ``self.vocab``) single insertions and, for words shorter
        than seven characters, single replacements and single deletions.
        """
        candidates = [word]

        def add(candidate):
            if candidate not in candidates:
                candidates.append(candidate)

        for i, char in enumerate(word):
            for conf_char in self.get_confusable_chars(char):
                add(word[:i] + conf_char + word[i + 1:])

        for i in range(len(word) - 1):
            if word[i] == word[i + 1]:
                add(word[:i] + word[i + 1:])

        COMMON_CHARS = 'ابتثجحخدذرزسشصضطظعغفقكلمنهويأإآءئؤةى'
        for i in range(len(word) + 1):
            for char in COMMON_CHARS:
                candidate = word[:i] + char + word[i:]
                if candidate in self.vocab:
                    add(candidate)

        if len(word) < 7:
            for i in range(len(word)):
                for char in COMMON_CHARS:
                    if char != word[i]:
                        candidate = word[:i] + char + word[i + 1:]
                        if candidate in self.vocab:
                            add(candidate)
            for i in range(len(word)):
                candidate = word[:i] + word[i + 1:]
                if len(candidate) > 1 and candidate in self.vocab:
                    add(candidate)
        return candidates

    def score_with_mlm(self, text: str, position: int, word: str) -> float:
        """Return the model probability of ``word`` at ``position`` in ``text``.

        The word at ``position`` is replaced by ``[MASK]`` and the probability
        of the first token id of ``word`` at the mask is returned. Results are
        cached per (text, position, word); when the cache is full the oldest
        inserted entry is dropped. Returns 0.0 when ``position`` is out of
        range, no mask token is found, or ``word`` encodes to no tokens.
        """
        cache_key = f"{text}|{position}|{word}"
        if cache_key in self._score_cache:
            self.cache_hits += 1
            return self._score_cache[cache_key]
        self.cache_misses += 1

        words = text.split()
        if position >= len(words):
            return 0.0
        masked_words = words.copy()
        masked_words[position] = '[MASK]'
        masked_text = ' '.join(masked_words)

        inputs = self.tokenizer(masked_text, return_tensors='pt', padding=True, truncation=True)
        inputs = {k: v.to(self.device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = self.model(**inputs)
            predictions = outputs.logits

        mask_token_index = (inputs['input_ids'] == self.tokenizer.mask_token_id).nonzero(as_tuple=True)[1]
        if len(mask_token_index) == 0:
            return 0.0
        mask_token_logits = predictions[0, mask_token_index[0], :]
        probs = torch.softmax(mask_token_logits, dim=0)

        word_tokens = self.tokenizer.encode(word, add_special_tokens=False)
        if not word_tokens:
            return 0.0
        # Only the first token id of the word is scored.
        score = probs[word_tokens[0]].item()

        if len(self._score_cache) >= self.cache_size:
            self._score_cache.pop(next(iter(self._score_cache)))
        self._score_cache[cache_key] = score
        return score

    def score_candidates_batch(self, text: str, position: int, candidates: List[str]) -> dict:
        """Return ``{candidate: score_with_mlm(...)}`` for every candidate."""
        return {
            candidate: self.score_with_mlm(text, position, candidate)
            for candidate in candidates
        }

    def predict_masked_token(self, text: str, position: int, top_k: int = 5) -> List[Tuple[str, float]]:
        """Return the model's best guesses for the word at ``position``.

        Returns:
            Up to ``top_k`` ``(token, probability)`` pairs, best first.
            Tokens starting with ``##`` and special tokens are dropped. The
            list is empty when ``position`` is out of range or no mask token
            is found.
        """
        words = text.split()
        if position >= len(words):
            return []
        masked_words = words.copy()
        masked_words[position] = '[MASK]'
        masked_text = ' '.join(masked_words)

        inputs = self.tokenizer(masked_text, return_tensors='pt', padding=True, truncation=True).to(self.device)
        with torch.no_grad():
            outputs = self.model(**inputs)
            predictions = outputs.logits

        mask_token_index = (inputs['input_ids'] == self.tokenizer.mask_token_id).nonzero(as_tuple=True)[1]
        if len(mask_token_index) == 0:
            return []
        mask_token_logits = predictions[0, mask_token_index[0], :]
        probs = torch.softmax(mask_token_logits, dim=0)
        top_k_weights, top_k_indices = torch.topk(probs, top_k, sorted=True)

        results = []
        for i in range(top_k):
            token_id = top_k_indices[i].item()
            score = top_k_weights[i].item()
            token = self.tokenizer.decode([token_id]).strip()
            if not token.startswith("##") and token not in self.tokenizer.all_special_tokens:
                results.append((token, score))
        return results

    def refine_sentence_with_mask(self, text: str, threshold: float = 0.001, vocab_manager=None, raw_model_output=None) -> str:
        """Replace implausible words by close, much more probable predictions.

        A word is left alone when it is in vocabulary (if ``vocab_manager`` is
        given), equals the word at the same position in ``raw_model_output``,
        has at most two characters, or scores above ``threshold``. Otherwise
        the first prediction passing every filter below replaces it.

        Returns:
            The sentence with the accepted replacements applied.
        """
        words = text.split()
        refined_words = words.copy()
        raw_words = raw_model_output.split() if raw_model_output else []

        for i, word in enumerate(words):
            if vocab_manager and vocab_manager.is_iv(word):
                continue
            if i < len(raw_words) and word == raw_words[i]:
                continue
            if len(word) <= 2:
                continue
            current_score = self.score_with_mlm(text, i, word)
            if current_score > threshold:
                continue

            predictions = self.predict_masked_token(text, i, top_k=10)
            for pred_word, pred_score in predictions:
                if pred_word == word:
                    continue
                if abs(len(pred_word) - len(word)) > 1:
                    continue
                dist = Levenshtein.distance(word, pred_word)
                max_len = max(len(word), len(pred_word))
                similarity = 1.0 - (dist / max_len)
                if similarity < 0.90:
                    continue
                if vocab_manager and vocab_manager.is_oov(pred_word):
                    continue
                if pred_score < 0.12:
                    continue
                # With the default threshold this is always False here; it
                # only matters when a caller passes a larger threshold.
                is_original_common = current_score > 0.001
                if is_original_common:
                    if pred_score > current_score * 1000:
                        refined_words[i] = pred_word
                        break
                else:
                    if pred_score > current_score * 50 and pred_score > 0.2:
                        refined_words[i] = pred_word
                        break
        return ' '.join(refined_words)

    def calculate_sentence_score(self, text: str) -> float:
        """Return the mean ``score_with_mlm`` over all words (0.0 if empty)."""
        words = text.split()
        if not words:
            return 0.0
        total_score = 0.0
        for i, word in enumerate(words):
            total_score += self.score_with_mlm(text, i, word)
        return total_score / len(words)
|
| 1166 |
+
|
| 1167 |
+
|
| 1168 |
+
# ═══════════════════════════════════════════════════════════════════════════════
|
| 1169 |
+
# MAIN SPELL CHECKER CLASS
|
| 1170 |
+
# ═══════════════════════════════════════════════════════════════════════════════
|
| 1171 |
+
|
| 1172 |
+
class ArabicSpellChecker:
|
| 1173 |
+
"""Main Arabic Spell Checker class"""
|
| 1174 |
+
|
| 1175 |
+
def __init__(self, model, tokenizer, device, use_contextual: bool = True):
    """Store the model handles and build every helper used by the pipeline.

    Args:
        model: seq2seq model; ``model_inference`` calls its ``generate``.
        tokenizer: tokenizer paired with ``model``; also passed to the
            vocabulary manager and the edit-distance corrector.
        device: device the encoded inputs are moved to before generation.
        use_contextual: when true, try to create a ``ContextualCorrector``.
            If its construction raises, a warning is logged and contextual
            correction is switched off instead of failing.
    """
    self.model, self.tokenizer, self.device = model, tokenizer, device

    self.postprocessor = AraSpellPostProcessor()
    self.classifier = ErrorClassifier()
    self.rules = RulesBasedCorrector()
    self.validator = OutputValidator()
    vocab = VocabularyManager(tokenizer)
    self.vocab_manager = vocab
    self.edit_corrector = EditDistanceCorrector(tokenizer)
    self.split_merge = SplitMergeSpecialist(vocab)
    self.word_aligner = WordAligner(vocab)

    # Default to "no contextual corrector"; only a successful construction
    # below replaces it.
    self.contextual = None
    self.use_contextual = use_contextual
    if not use_contextual:
        return
    try:
        self.contextual = ContextualCorrector()
        logger.info("Contextual correction enabled")
    except Exception as exc:
        logger.warning(f"Contextual correction disabled: {exc}")
        self.contextual = None
        self.use_contextual = False
|
| 1200 |
+
|
| 1201 |
+
def _fix_repeated_end_chars(self, text: str) -> str:
|
| 1202 |
+
text = re.sub(r'([ا-ي])\1+\b', r'\1', text)
|
| 1203 |
+
return text
|
| 1204 |
+
|
| 1205 |
+
def _fix_merged_with_errors(self, text: str) -> str:
|
| 1206 |
+
text = re.sub(r'ال([ا-ي])\1+([ا-ي]{2,})', r'ال\2', text)
|
| 1207 |
+
text = re.sub(r'\b([ا-ي]{3,})([ا-ي])\2+\b', r'\1\2', text)
|
| 1208 |
+
return text
|
| 1209 |
+
|
| 1210 |
+
def _split_merged_words_linguistic(self, text: str) -> str:
|
| 1211 |
+
text = re.sub(
|
| 1212 |
+
r'\b(في|من|إلى|الى|حتى|منذ|خلال|بعد|قبل)(ال)?([ا-ي]{3,})',
|
| 1213 |
+
r'\1 \2\3', text
|
| 1214 |
+
)
|
| 1215 |
+
text = re.sub(r'\b(كل)([ا-ي]{3,})', r'\1 \2', text)
|
| 1216 |
+
text = re.sub(r'([ا-ي]{3,})(ال)([ا-ي]{3,})', r'\1 \2\3', text)
|
| 1217 |
+
text = re.sub(r'\b([بلك])(ال)?([ا-ي]{3,})', r'\1 \2\3', text)
|
| 1218 |
+
text = re.sub(r'([ا-ي]{4,})(عليكم|عليك|عليه|عليها)', r'\1 \2', text)
|
| 1219 |
+
text = re.sub(r'([ا-ي]{3,})(على|عن)([ا-ي]{3,})', r'\1 \2 \3', text)
|
| 1220 |
+
return text
|
| 1221 |
+
|
| 1222 |
+
def _split_long_words_heuristic(self, text: str, max_length: int = 15) -> str:
|
| 1223 |
+
words = text.split()
|
| 1224 |
+
result = []
|
| 1225 |
+
for word in words:
|
| 1226 |
+
if len(word) <= max_length:
|
| 1227 |
+
result.append(word)
|
| 1228 |
+
continue
|
| 1229 |
+
if 'ال' in word[2:]:
|
| 1230 |
+
parts = word.split('ال', 1)
|
| 1231 |
+
if len(parts[0]) >= 2 and len(parts[1]) >= 3:
|
| 1232 |
+
result.extend([parts[0], 'ال' + parts[1]])
|
| 1233 |
+
continue
|
| 1234 |
+
if len(word) >= 8:
|
| 1235 |
+
split_found = False
|
| 1236 |
+
for split_pos in [2, 3]:
|
| 1237 |
+
prefix = word[:split_pos]
|
| 1238 |
+
suffix = word[split_pos:]
|
| 1239 |
+
if prefix in ['في', 'من', 'على', 'عن', 'مع', 'كل', 'ب', 'ل', 'ك']:
|
| 1240 |
+
result.extend([prefix, suffix])
|
| 1241 |
+
split_found = True
|
| 1242 |
+
break
|
| 1243 |
+
if not split_found:
|
| 1244 |
+
result.append(word)
|
| 1245 |
+
else:
|
| 1246 |
+
result.append(word)
|
| 1247 |
+
return ' '.join(result)
|
| 1248 |
+
|
| 1249 |
+
def _normalize_tanween_patterns(self, text: str) -> str:
|
| 1250 |
+
text = re.sub(r'([ا-ي]{2,})أ\b', r'\1اً', text)
|
| 1251 |
+
text = re.sub(r'\s+أ\s+', ' ', text)
|
| 1252 |
+
text = re.sub(r'\b([بلك])\s+([ا-ي])', r'\1\2', text)
|
| 1253 |
+
return text
|
| 1254 |
+
|
| 1255 |
+
def preprocess(self, text: str) -> str:
|
| 1256 |
+
"""Preprocessing pipeline"""
|
| 1257 |
+
text = self.postprocessor.remove_harakat(text)
|
| 1258 |
+
text = self.postprocessor.remove_tatweel(text)
|
| 1259 |
+
text = self.postprocessor.normalize_special_chars(text)
|
| 1260 |
+
text = self._fix_repeated_end_chars(text)
|
| 1261 |
+
text = self._fix_merged_with_errors(text)
|
| 1262 |
+
text = self._split_merged_words_linguistic(text)
|
| 1263 |
+
text = self._split_long_words_heuristic(text)
|
| 1264 |
+
text = self._normalize_tanween_patterns(text)
|
| 1265 |
+
text = self.postprocessor.merge_separated_al(text)
|
| 1266 |
+
text = self.postprocessor.unified_collapse_repeated(text)
|
| 1267 |
+
text = self.rules.fix_char_substitution(text)
|
| 1268 |
+
text = self.rules.fix_char_repetition(text)
|
| 1269 |
+
text = self.postprocessor.normalize_spaces(text)
|
| 1270 |
+
return text
|
| 1271 |
+
|
| 1272 |
+
def postprocess(self, text: str, original: str = "") -> str:
|
| 1273 |
+
"""Postprocessing pipeline"""
|
| 1274 |
+
return self.postprocessor.full_postprocess(text, original, vocab_manager=self.vocab_manager)
|
| 1275 |
+
|
| 1276 |
+
def model_inference(self, text: str, num_return_sequences: int = 5) -> List[str]:
|
| 1277 |
+
"""Run seq2seq model inference and return top candidates."""
|
| 1278 |
+
inputs = self.tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=128)
|
| 1279 |
+
inputs = {k: v.to(self.device) for k, v in inputs.items()}
|
| 1280 |
+
with torch.no_grad():
|
| 1281 |
+
outputs = self.model.generate(
|
| 1282 |
+
**inputs,
|
| 1283 |
+
num_beams=5,
|
| 1284 |
+
num_return_sequences=num_return_sequences,
|
| 1285 |
+
early_stopping=True,
|
| 1286 |
+
return_dict_in_generate=True,
|
| 1287 |
+
output_scores=True
|
| 1288 |
+
)
|
| 1289 |
+
candidates = self.tokenizer.batch_decode(outputs.sequences, skip_special_tokens=True)
|
| 1290 |
+
self._last_beam_scores = {}
|
| 1291 |
+
if hasattr(outputs, 'sequences_scores') and outputs.sequences_scores is not None:
|
| 1292 |
+
scores = outputs.sequences_scores.tolist()
|
| 1293 |
+
for cand, score in zip(candidates, scores):
|
| 1294 |
+
self._last_beam_scores[cand] = score
|
| 1295 |
+
return candidates
|
| 1296 |
+
|
| 1297 |
+
def correct(self, text: str) -> str:
    """
    Main correction pipeline (RERANKING APPROACH)

    Steps:
    1. Preprocess the input.
    2. Classify the error type of the preprocessed text.
    3. Generate candidates (preprocessed baseline, rules, edit distance,
       model beams, word-aligned hybrids, per-position beam vote).
    4. Rerank candidates (validator, vocabulary, fluency, similarity) and
       select the best one, with two guards that can fall back to the
       preprocessed text.
    5. Postprocess the winner, keeping in-vocabulary words intact.
    6. Contextual fine-tuning (only when a contextual corrector is active).
    7. Merge fragments.
    8-10. Stability / word-level / safety-net checks against the top
       model beam.
    Final pass: hamza and ta-marbuta fixes applied last.

    Empty or whitespace-only input is returned unchanged.
    """
    if not text or not text.strip():
        return text

    original = text

    # 1. Preprocess
    preprocessed_text = self.preprocess(text)

    # 2. Classify error type
    error_type = self.classifier.classify(preprocessed_text)

    # 3. Generate Candidates
    candidates = []
    candidates.append(preprocessed_text)

    # Rule-based and edit-distance candidates are built from the RAW input,
    # not from the preprocessed text.
    rules_candidate = self.rules.advanced_heuristic_repair(text)
    candidates.append(rules_candidate)

    edit_candidate = self.edit_corrector.generate_candidate(text)
    if edit_candidate != text and edit_candidate != rules_candidate:
        candidates.append(edit_candidate)

    # Top model beam; stays None when inference fails or returns nothing,
    # which disables steps 8-10 below.
    raw_model_output = None
    try:
        model_candidates = self.model_inference(preprocessed_text, num_return_sequences=5)
        raw_model_output = model_candidates[0] if model_candidates else None
        candidates.extend(model_candidates)

        if model_candidates:
            # Hybrids: align the top three beams word-by-word with the input.
            hybrid_candidate = self.word_aligner.align_words(preprocessed_text, model_candidates[0])
            if hybrid_candidate not in candidates:
                candidates.append(hybrid_candidate)
            for beam in model_candidates[1:3]:
                hybrid_beam = self.word_aligner.align_words(preprocessed_text, beam)
                if hybrid_beam not in candidates:
                    candidates.append(hybrid_beam)

        if model_candidates and len(model_candidates) >= 3:
            try:
                # Per-position majority vote across all beams.
                beam_word_lists = [c.split() for c in model_candidates]
                max_words = max(len(wl) for wl in beam_word_lists)
                voted_words = []
                for pos in range(max_words):
                    words_at_pos = []
                    for wl in beam_word_lists:
                        if pos < len(wl):
                            words_at_pos.append(wl[pos])
                    if words_at_pos:
                        most_common = Counter(words_at_pos).most_common(1)[0][0]
                        voted_words.append(most_common)
                voted_candidate = ' '.join(voted_words)
                if voted_candidate not in candidates:
                    candidates.append(voted_candidate)
            except Exception:
                # Best effort: the voted candidate is optional.
                pass
    except Exception as e:
        logger.warning(f"Model inference failed: {e}")

    # Remove duplicates (first occurrence wins, order preserved)
    unique_candidates = []
    seen = set()
    for c in candidates:
        if c not in seen:
            unique_candidates.append(c)
            seen.add(c)
    candidates = unique_candidates

    # 4. Rerank Candidates
    best_candidate = preprocessed_text
    best_score = -1.0
    candidate_scores = []

    for cand in candidates:
        is_valid, reason = self.validator.validate(original, cand, error_type.value)
        # Reject candidates shorter than half of the original input.
        if len(cand) < len(original) * 0.5:
            is_valid = False
            reason = "too_short"

        input_oov_count = self.vocab_manager.count_oov_words(original)
        cand_oov_count = self.vocab_manager.count_oov_words(cand)
        vocab_boost = 1.0

        if input_oov_count > 0 and cand_oov_count < input_oov_count:
            # Reward every OOV word the candidate got rid of.
            oov_reduction = input_oov_count - cand_oov_count
            vocab_boost = 1.0 + (oov_reduction * 0.3)
            if cand_oov_count == 0 and self.vocab_manager.all_words_iv(cand):
                # A fully in-vocabulary candidate overrides a validator
                # rejection, except for empty output.
                if not is_valid and reason not in ["empty_output"]:
                    is_valid = True
                    reason = "vocab_aware_accept"
        elif cand_oov_count > input_oov_count:
            # Penalize candidates that introduce new OOV words.
            vocab_boost = 0.5
        elif input_oov_count == 0 and cand_oov_count == 0:
            vocab_boost = 1.0

        # Invalid candidates are not dropped, only scaled down heavily.
        validity_factor = 1.0 if is_valid else 0.001

        fluency_score = 0.0
        if self.use_contextual and self.contextual:
            try:
                fluency_score = self.contextual.calculate_sentence_score(cand)
            except Exception as e:
                logger.warning(f"Scoring failed: {e}")
                fluency_score = 0.5
        else:
            # Without a contextual scorer, fluency is neutral.
            fluency_score = 1.0

        # Character-level similarity to the preprocessed input.
        dist = VocabularyManager.damerau_levenshtein_distance(preprocessed_text, cand)
        max_len = max(len(preprocessed_text), len(cand), 1)
        similarity = 1.0 - (dist / max_len)
        if cand == preprocessed_text:
            similarity = 1.0

        # Small multiplicative bonus for each substituted character that is
        # a keyboard neighbour of the input character (same-length words only).
        keyboard_bonus = 1.0
        input_words = preprocessed_text.split()
        cand_words = cand.split()
        if len(input_words) == len(cand_words):
            for iw, cw in zip(input_words, cand_words):
                if iw != cw and len(iw) == len(cw):
                    for ic, cc in zip(iw, cw):
                        if ic != cc and RulesBasedCorrector.is_keyboard_neighbor(ic, cc):
                            keyboard_bonus *= 1.05

        # High-confidence override: a very fluent, OOV-free candidate may be
        # re-validated after a length/similarity/word-count rejection.
        if fluency_score > 0.85 and cand_oov_count == 0:
            if not is_valid and reason in ["too_short", "low_character_similarity", "word_count_mismatch"]:
                if len(cand) >= len(original) * 0.4:
                    is_valid = True
                    reason = "high_confidence_override"
                    vocab_boost *= 1.2
                    validity_factor = 1.0

        fluency_exp = 0.3
        similarity_exp = 3.0
        # Slight preference for the unmodified top model beam.
        beam_boost = 1.0
        if raw_model_output and cand == raw_model_output:
            beam_boost = 1.15

        final_score = (fluency_score ** fluency_exp) * (similarity ** similarity_exp) * validity_factor * vocab_boost * keyboard_bonus * beam_boost

        candidate_scores.append({
            'text': cand, 'is_valid': is_valid, 'reason': reason,
            'fluency': fluency_score, 'similarity': similarity,
            'vocab_boost': vocab_boost, 'input_oov': input_oov_count,
            'cand_oov': cand_oov_count, 'final_score': final_score
        })

        # Strict '>' keeps the earliest candidate on ties.
        if final_score > best_score:
            best_score = final_score
            best_candidate = cand

    # Output Quality Scoring
    # If the winner beats the preprocessed text by less than 5% and has more
    # OOV words, fall back to the preprocessed text.
    if best_candidate != preprocessed_text:
        preprocessed_score = 0.0
        for cs in candidate_scores:
            if cs['text'] == preprocessed_text:
                preprocessed_score = cs['final_score']
                break
        if preprocessed_score > 0 and best_score < preprocessed_score * 1.05:
            best_oov = self.vocab_manager.count_oov_words(best_candidate)
            prep_oov = self.vocab_manager.count_oov_words(preprocessed_text)
            if best_oov > prep_oov:
                best_candidate = preprocessed_text
                best_score = preprocessed_score

    # Contextual Validation Layer
    # If the input is more than 1.5x as fluent as the winner and has no more
    # OOV words than it, keep the input.
    if best_candidate != preprocessed_text and self.use_contextual and self.contextual:
        try:
            input_fluency = self.contextual.calculate_sentence_score(preprocessed_text)
            best_fluency = 0.0
            for cs in candidate_scores:
                if cs['text'] == best_candidate:
                    best_fluency = cs['fluency']
                    break
            if input_fluency > 0 and best_fluency > 0:
                if input_fluency > best_fluency * 1.5:
                    input_oov = self.vocab_manager.count_oov_words(preprocessed_text)
                    best_oov = self.vocab_manager.count_oov_words(best_candidate)
                    if input_oov <= best_oov:
                        best_candidate = preprocessed_text
        except Exception:
            # Best effort: a scoring failure leaves the current winner in place.
            pass

    # 5. Postprocess Winner
    result = self.postprocess(best_candidate, original)

    # IV-Safe Postprocessing Check
    # Undo any postprocessing change that turned an in-vocabulary word into
    # an out-of-vocabulary one (only when the word count is unchanged).
    if result != best_candidate:
        result_words = result.split()
        best_words = best_candidate.split()
        if len(result_words) == len(best_words):
            fixed_words = []
            for idx_fw, (rw, bw) in enumerate(zip(result_words, best_words)):
                if rw != bw:
                    bw_iv = self.vocab_manager.is_iv(bw)
                    rw_iv = self.vocab_manager.is_iv(rw)
                    if bw_iv and not rw_iv:
                        fixed_words.append(bw)
                    else:
                        fixed_words.append(rw)
                else:
                    fixed_words.append(rw)
            result = ' '.join(fixed_words)

    # 6. Contextual fine-tuning
    if self.use_contextual and self.contextual:
        if len(result) > 3:
            result = self.contextual.refine_sentence_with_mask(
                result, vocab_manager=self.vocab_manager,
                raw_model_output=raw_model_output
            )

    # 7. Safe Split/Merge Post-processing
    result = self.split_merge.merge_fragments(result)

    # 8. Output Stability Test
    # Re-run preprocessing on the result; if that changes more than 15% of it
    # and the raw model output is more stable with no more OOV words, prefer
    # the raw model output.
    if result != preprocessed_text and raw_model_output:
        try:
            re_preprocessed = self.preprocess(result)
            stability_dist = VocabularyManager.damerau_levenshtein_distance(result, re_preprocessed)
            result_len = max(len(result), 1)
            if stability_dist > 0:
                stability_ratio = stability_dist / result_len
                if stability_ratio > 0.15:
                    raw_re = self.preprocess(raw_model_output)
                    raw_stability = VocabularyManager.damerau_levenshtein_distance(
                        raw_model_output, raw_re
                    ) / max(len(raw_model_output), 1)
                    if raw_stability < stability_ratio:
                        raw_oov = self.vocab_manager.count_oov_words(raw_model_output)
                        our_oov = self.vocab_manager.count_oov_words(result)
                        if raw_oov <= our_oov:
                            result = raw_model_output
        except Exception:
            # Best effort: keep the current result if the stability test fails.
            pass

    # 9. Bidirectional Word-Level Validation
    # Compare the result with the raw model output word by word (same word
    # count only) and take the model's word when ours is OOV, or when both
    # are in-vocabulary and the model's word is closer to the input word.
    if raw_model_output and result != raw_model_output:
        result_words = result.split()
        raw_words = raw_model_output.split()
        if len(result_words) == len(raw_words):
            corrected_words = []
            changed = False
            for rw, raw_w in zip(result_words, raw_words):
                if rw != raw_w:
                    rw_iv = self.vocab_manager.is_iv(rw)
                    raw_iv = self.vocab_manager.is_iv(raw_w)
                    if not rw_iv and raw_iv:
                        corrected_words.append(raw_w)
                        changed = True
                    elif rw_iv and raw_iv:
                        input_words_list = preprocessed_text.split()
                        # Exactly one word is appended per iteration, so this
                        # is the index of the word being examined.
                        idx = len(corrected_words)
                        if idx < len(input_words_list):
                            input_w = input_words_list[idx]
                            rw_dist = Levenshtein.distance(input_w, rw)
                            raw_dist = Levenshtein.distance(input_w, raw_w)
                            if raw_dist < rw_dist:
                                corrected_words.append(raw_w)
                                changed = True
                            else:
                                corrected_words.append(rw)
                        else:
                            corrected_words.append(rw)
                    else:
                        corrected_words.append(rw)
                else:
                    corrected_words.append(rw)
            if changed:
                # Accept the word-level merge only if it does not add OOV words.
                new_result = ' '.join(corrected_words)
                new_oov = self.vocab_manager.count_oov_words(new_result)
                old_oov = self.vocab_manager.count_oov_words(result)
                if new_oov <= old_oov:
                    result = new_result

    # 10. SAFETY NET
    if raw_model_output and raw_model_output != result:
        raw_oov = self.vocab_manager.count_oov_words(raw_model_output)
        our_oov = self.vocab_manager.count_oov_words(result)
        if raw_oov == 0 and our_oov > 0:
            # The model output is clean and ours is not: prefer the model
            # output if the validator accepts it.
            is_valid, reason = self.validator.validate(original, raw_model_output, "mixed")
            if is_valid or reason == "space_leniency_accept":
                result = raw_model_output
        elif raw_oov == 0 and our_oov == 0:
            # Both clean: prefer the model output when it is closer to the
            # original and differs from ours by at most 3 edits.
            raw_dist = VocabularyManager.damerau_levenshtein_distance(original, raw_model_output)
            our_dist = VocabularyManager.damerau_levenshtein_distance(original, result)
            result_vs_raw_dist = VocabularyManager.damerau_levenshtein_distance(result, raw_model_output)
            if raw_dist < our_dist and result_vs_raw_dist <= 3:
                raw_valid, _ = self.validator.validate(original, raw_model_output, "mixed")
                if raw_valid:
                    result = raw_model_output
        elif raw_oov == 0:
            # NOTE(review): at this nesting level the two branches above
            # already cover every raw_oov == 0 case, so this branch looks
            # unreachable. Confirm the intended nesting against the
            # original file.
            raw_wc = len(raw_model_output.split())
            our_wc = len(result.split())
            if raw_wc != our_wc:
                raw_dist = VocabularyManager.damerau_levenshtein_distance(original, raw_model_output)
                our_dist = VocabularyManager.damerau_levenshtein_distance(original, result)
                if raw_dist < our_dist:
                    raw_valid, _ = self.validator.validate(original, raw_model_output, "mixed")
                    if raw_valid:
                        result = raw_model_output
    # ── FINAL PASS: Hamza whitelist + Ta Marbuta fixes (unrevertable) ──
    # These are applied AFTER all validation/safety steps so they can't
    # be undone by Steps 8-10 which compare against raw_model_output.
    # The root issue: Steps 8-10 use edit distance to INPUT (which has errors)
    # so they revert corrections back to the erroneous form.
    result = AraSpellPostProcessor.fix_common_hamza(result)
    result = AraSpellPostProcessor.fix_ha_ta_marbuta(result, vocab_manager=self.vocab_manager)

    return result
|
src/nlp/punctuation/spelling/araspell_service.py
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
AraSpell Service — Lazy-loaded Arabic spelling correction.
|
| 3 |
+
|
| 4 |
+
Model is loaded on first request and kept in memory.
|
| 5 |
+
Pre-downloaded during Docker build; loaded from HF cache at runtime (no network needed).
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import os
|
| 9 |
+
import logging
|
| 10 |
+
import time
|
| 11 |
+
import torch
|
| 12 |
+
|
| 13 |
+
logger = logging.getLogger(__name__)
|
| 14 |
+
|
| 15 |
+
# ── Lazy-loaded singletons ──
|
| 16 |
+
_spell_checker = None
|
| 17 |
+
_load_error = None
|
| 18 |
+
|
| 19 |
+
# Model identifiers
|
| 20 |
+
MODEL_REPO = 'bayan10/AraSpell-Model'
|
| 21 |
+
MODEL_FILENAME = 'last_model.pt'
|
| 22 |
+
TOKENIZER_NAME = 'aubmindlab/bert-base-arabertv02'
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def get_spelling_model():
    """
    Lazy-load the spelling model on first call.

    Returns the ArabicSpellChecker instance, or raises RuntimeError if loading fails.

    The instance is cached in the module-level ``_spell_checker``. A failed
    load is remembered in ``_load_error`` and is NOT retried: every later
    call raises immediately with the stored message.

    Raises:
        RuntimeError: if loading fails now (chained to the underlying
            exception) or has failed on an earlier call.
    """
    global _spell_checker, _load_error

    if _spell_checker is not None:
        return _spell_checker

    if _load_error is not None:
        raise RuntimeError(f"Spelling model previously failed to load: {_load_error}")

    try:
        t0 = time.time()
        logger.info("Loading AraSpell spelling model (lazy init)...")

        from huggingface_hub import hf_hub_download
        from transformers import AutoTokenizer, EncoderDecoderModel

        # 1. Download checkpoint (from HF cache — pre-downloaded in Docker build)
        logger.info(f"Resolving checkpoint: {MODEL_REPO}/{MODEL_FILENAME}")
        model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILENAME)
        logger.info(f"Checkpoint path: {model_path}")

        # 2. Load tokenizer
        logger.info(f"Loading tokenizer: {TOKENIZER_NAME}")
        tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME)

        # 3. Build encoder-decoder model from AraBERT
        logger.info("Building EncoderDecoderModel from AraBERT...")
        model = EncoderDecoderModel.from_encoder_decoder_pretrained(
            TOKENIZER_NAME, TOKENIZER_NAME
        )

        # 4. Configure generation
        model.config.decoder_start_token_id = tokenizer.cls_token_id
        model.config.pad_token_id = tokenizer.pad_token_id
        model.config.eos_token_id = tokenizer.sep_token_id
        model.generation_config.max_length = 128
        model.generation_config.decoder_start_token_id = tokenizer.cls_token_id
        model.generation_config.pad_token_id = tokenizer.pad_token_id
        model.generation_config.eos_token_id = tokenizer.sep_token_id

        # 5. Load trained weights
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        logger.info(f"Loading checkpoint weights on {device}...")
        # SECURITY NOTE: weights_only=False unpickles arbitrary objects from
        # the checkpoint file. Only acceptable while MODEL_REPO is trusted.
        checkpoint = torch.load(model_path, map_location=device, weights_only=False)
        # strict=False tolerates key mismatches; report them so a checkpoint
        # that only partially matches the architecture is not loaded silently.
        load_report = model.load_state_dict(checkpoint['model_state_dict'], strict=False)
        if load_report.missing_keys or load_report.unexpected_keys:
            logger.warning(
                "Checkpoint/model key mismatch: %d missing, %d unexpected",
                len(load_report.missing_keys), len(load_report.unexpected_keys)
            )
        model = model.to(device)
        model.eval()

        epoch = checkpoint.get('epoch', 'N/A')
        logger.info(f"Spelling model loaded on {device}, epoch: {epoch}")
        # 6. Initialize the spell checker pipeline (contextual=True for MLM-based refinement)

        # NOTE(review): this commit adds araspell_rules.py under
        # nlp/punctuation/spelling/ — confirm that nlp.spelling.araspell_rules
        # is importable in the deployed layout.
        from nlp.spelling.araspell_rules import ArabicSpellChecker
        _spell_checker = ArabicSpellChecker(
            model, tokenizer, device, use_contextual=True
        )

        elapsed = time.time() - t0
        logger.info(f"AraSpell ready in {elapsed:.1f}s")
        return _spell_checker

    except Exception as e:
        _load_error = str(e)
        # logger.exception records the message at ERROR level with the traceback.
        logger.exception(f"Failed to load spelling model: {e}")
        # Chain the cause so callers can still inspect the original exception.
        raise RuntimeError(f"Spelling model load failed: {e}") from e
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
def is_loaded() -> bool:
    """Report whether a spell checker instance is currently cached."""
    cached_checker = _spell_checker
    return cached_checker is not None
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
def get_load_error() -> str:
    """Return the stored load-error message, or "" when there is none."""
    return _load_error if _load_error else ""
|
tests/phase10/reports/collision_benchmark_results.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
tests/phase10/reports/phase10_results.json
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|