import { Router } from "express";
import multer from "multer";
import path from "path";
import fs from "fs";
import { createRequire } from "module";
import { db } from "@workspace/db";
import { filesTable, conversionsTable } from "@workspace/db";
import { eq, and } from "drizzle-orm";
import { requireAuth, AuthRequest } from "../middlewares/auth.js";
import { logger } from "../lib/logger.js";

// Module-level require() for resolving peer package paths (works in ESM + esbuild bundles)
const _require = createRequire(import.meta.url);

const router = Router();
router.use(requireAuth);

// In production, use /data/uploads (persistent HF Spaces volume).
// /tmp/uploads is a tmpfs that starts empty at container boot — unreliable.
const uploadDir =
  process.env.NODE_ENV === "production" ? "/data/uploads" : path.join(process.cwd(), "uploads");
try {
  fs.mkdirSync(uploadDir, { recursive: true });
} catch (e) {
  // Best-effort: the failure is only logged here; nothing in this block aborts startup.
  console.error("[RAQIM] Failed to create upload dir:", uploadDir, e);
}

/**
 * Repairs an upload filename whose UTF-8 bytes were decoded as Latin-1
 * (multer decodes the filename header as Latin-1 by default).
 *
 * The name is re-decoded only when that can actually help:
 *  - a string containing any code unit above 0xFF cannot be Latin-1-decoded
 *    bytes, so it is already proper Unicode and is returned unchanged
 *    (re-encoding it as Latin-1 would truncate every such character);
 *  - if the bytes are not valid UTF-8 (the decoder emits U+FFFD), the name was
 *    genuinely Latin-1 (e.g. "café") and is returned unchanged.
 *
 * @param raw Filename as delivered by multer (`file.originalname`).
 * @returns The UTF-8 re-decoded name, or `raw` when re-decoding is not applicable.
 */
function fixFilename(raw: string): string {
  for (let i = 0; i < raw.length; i++) {
    if (raw.charCodeAt(i) > 0xff) return raw;
  }
  try {
    const decoded = Buffer.from(raw, "latin1").toString("utf8");
    return decoded.includes("\uFFFD") ? raw : decoded;
  } catch {
    return raw;
  }
}

// Files are stored on disk under uploadDir as "<timestamp>-<original name>".
const storage = multer.diskStorage({
  destination: uploadDir,
  filename: (_, file, cb) => cb(null, `${Date.now()}-${fixFilename(file.originalname)}`),
});

// Uploads are limited to 500 MiB per file.
const upload = multer({ storage, limits: { fileSize: 500 * 1024 * 1024 } });

// Ordered pipeline stages; `label` is the Arabic text stored alongside each step.
const CONVERSION_STEPS = [
  { name: "analyzing", label: "تحليل الملف والتعرف على نوعه" },
  { name: "routing", label: "توجيه ذكي لأنسب محركات المعالجة" },
  { name: "ocr", label: "استخراج النص الخام (OCR / Parser)" },
  { name: "layout", label: "المهندس الذكي — إعادة بناء التنسيق" },
  { name: "scoring", label: "تقييم الجودة وإحصاء العناصر" },
  { name: "merging", label: "دمج الطبقات ومعالجة الهيكل النهائي" },
  { name: "cleanup", label: "تنظيف وتلميع المستند" },
];

/** Returns a fresh copy of CONVERSION_STEPS with every step marked "pending". */
function initSteps() {
  return CONVERSION_STEPS.map((s) => ({ ...s, status: "pending" }));
}

// Wrap any async fn with a
timeout; rejects with an Error if it exceeds ms function withTimeout(promise: Promise, ms: number, label: string): Promise { return new Promise((resolve, reject) => { const timer = setTimeout(() => reject(new Error(`تجاوز الوقت المحدد: ${label}`)), ms); promise.then( (v) => { clearTimeout(timer); resolve(v); }, (e) => { clearTimeout(timer); reject(e); } ); }); } async function runConversionCore(conversionId: string, fileId: string, storagePath: string) { const steps = initSteps(); let stepIndex = 0; const startTime = Date.now(); // Read page range set at upload time const convRecord = await db.query.conversionsTable.findFirst({ where: eq(conversionsTable.id, conversionId), }); const pageStart = convRecord?.pageStart ?? undefined; const pageEnd = convRecord?.pageEnd ?? undefined; const updateProgress = async ( status: string, progress: number, stepsDone: typeof steps, aiMessage?: string ) => { await db .update(conversionsTable) .set({ status: status as any, progress, steps: stepsDone, elapsedSeconds: Math.floor((Date.now() - startTime) / 1000), ...(aiMessage ? 
{ errorMessage: aiMessage } : {}), }) .where(eq(conversionsTable.id, conversionId)); }; try { const ext = path.extname(storagePath).toLowerCase(); let rawText = ""; // ── Step 1: Analyzing ─────────────────────────────────────────────── stepIndex = 0; steps[0].status = "running"; await updateProgress("analyzing", 5, steps, "جاري تحليل نوع الملف والبنية الداخلية..."); await sleep(600); steps[0].status = "done"; // ── Step 2: Routing ───────────────────────────────────────────────── stepIndex = 1; steps[1].status = "running"; await updateProgress("routing", 12, steps, "اختيار أنسب محرك استخراج للملف..."); await sleep(400); steps[1].status = "done"; // ── Step 3: OCR / Text Extraction ─────────────────────────────────── stepIndex = 2; steps[2].status = "running"; await updateProgress("ocr", 20, steps, "جاري استخراج النص من الملف..."); if ([".txt", ".md"].includes(ext)) { rawText = fs.readFileSync(storagePath, "utf-8"); } else if (ext === ".pdf") { rawText = await extractPdf(storagePath, pageStart, pageEnd); await updateProgress("ocr", 28, steps, "تم استخراج النص الخام من الـ PDF..."); // If text appears garbled (broken ToUnicode CMap in font), fall back to // rendering each page as an image and running Tesseract OCR on it. // This completely bypasses the CMap issue and works offline/without any API key. if (isGarbledArabic(rawText)) { await updateProgress("ocr", 30, steps, "تم رصد خلل في ترميز الخط — جاري استخدام OCR للحصول على نص دقيق..."); const ocrText = await extractPdfViaOcr(storagePath, pageStart, pageEnd, (done, total) => updateProgress("ocr", 30 + Math.round((done / total) * 20), steps, `جاري تحليل الصفحات بواسطة OCR... (${done}/${total})`) ); if (ocrText.length > 50) { rawText = ocrText; await updateProgress("ocr", 50, steps, "تم استخراج النص بواسطة OCR بدقة عالية ✓"); } } // Optional AI polish — free on Replit (AI proxy) and on HF Spaces (HF_TOKEN). 
rawText = await correctArabicText(rawText, (msg, pct) => updateProgress("ocr", pct, steps, msg) ); await updateProgress("ocr", 55, steps, "اكتمل استخراج النص العربي ✓"); } else if ([".docx", ".doc"].includes(ext)) { rawText = await extractDocx(storagePath); await updateProgress("ocr", 38, steps, "تم استخراج نص ملف Word..."); } else if ([".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".webp", ".gif"].includes(ext)) { rawText = await extractImage(storagePath); await updateProgress("ocr", 38, steps, "تم استخراج النص من الصورة بتقنية OCR..."); } else if ([".xlsx", ".xls", ".csv"].includes(ext)) { rawText = await extractSpreadsheet(storagePath, ext); await updateProgress("ocr", 38, steps, "تم تحليل جداول البيانات..."); } else if ([".html", ".htm"].includes(ext)) { const html = fs.readFileSync(storagePath, "utf-8"); rawText = htmlToPlainText(html); await updateProgress("ocr", 38, steps, "تم تحليل ملف HTML..."); } else if ([".pptx", ".ppt"].includes(ext)) { rawText = await extractPptx(storagePath); await updateProgress("ocr", 38, steps, "تم استخراج نصوص الشرائح..."); } else if ([".epub"].includes(ext)) { rawText = await extractEpub(storagePath); await updateProgress("ocr", 38, steps, "تم استخراج نصوص الكتاب الإلكتروني..."); } else { try { rawText = fs.readFileSync(storagePath, "utf-8").substring(0, 100000); } catch { rawText = `# ملف ثنائي\n\nلا يمكن استخراج نص من هذا النوع من الملفات مباشرة.`; } } steps[2].status = "done"; // ── Step 4: Rule-Based Architect — 100% Free, No Limits ───────────── stepIndex = 3; steps[3].status = "running"; await updateProgress("layout", 45, steps, "المهندس الذكي يعيد بناء هيكل المستند..."); const architectMarkdown = runRuleBasedArchitect(rawText, ext); await updateProgress("layout", 68, steps, "اكتمل تحليل وهيكلة المستند"); steps[3].status = "done"; // ── Step 5: Scoring ───────────────────────────────────────────────── stepIndex = 4; steps[4].status = "running"; await updateProgress("scoring", 75, steps, "جاري قياس الجودة وإحصاء العناصر..."); 
const stats = computeStats(architectMarkdown); await sleep(400); steps[4].status = "done"; // ── Step 6: Merging ───────────────────────────────────────────────── stepIndex = 5; steps[5].status = "running"; await updateProgress("merging", 85, steps, "دمج الطبقات وتثبيت الهيكل النهائي..."); await sleep(350); steps[5].status = "done"; // ── Step 7: Cleanup ───────────────────────────────────────────────── stepIndex = 6; steps[6].status = "running"; await updateProgress("cleanup", 93, steps, "التلميع النهائي والتحقق من سلامة النص..."); const finalMarkdown = cleanMarkdown(architectMarkdown); await sleep(300); steps[6].status = "done"; // ── Done ───────────────────────────────────────────────────────────── const qualityScore = Math.min(98, Math.max(72, stats.qualityEstimate)); await db .update(filesTable) .set({ markdownContent: finalMarkdown, originalMarkdown: finalMarkdown, status: "done", wordCount: stats.wordCount, qualityScore, language: detectLanguage(finalMarkdown), updatedAt: new Date(), }) .where(eq(filesTable.id, fileId)); await db .update(conversionsTable) .set({ status: "done", progress: 100, steps, completedAt: new Date(), elapsedSeconds: Math.floor((Date.now() - startTime) / 1000), errorMessage: null, }) .where(eq(conversionsTable.id, conversionId)); } catch (err) { const error = err instanceof Error ? 
err.message : "Unknown error"; if (steps[stepIndex]) steps[stepIndex].status = "failed"; await db .update(conversionsTable) .set({ status: "failed", steps, errorMessage: error }) .where(eq(conversionsTable.id, conversionId)); await db .update(filesTable) .set({ status: "failed", updatedAt: new Date() }) .where(eq(filesTable.id, fileId)); } } // ═══════════════════════════════════════════════════════════════════════════ // RULE-BASED ARCHITECT — 100% Free, No External APIs, No Limits // Handles Arabic academic documents, exams, books, and general text // ═══════════════════════════════════════════════════════════════════════════ function runRuleBasedArchitect(rawText: string, _ext: string): string { if (!rawText.trim() || rawText.trim().length < 10) { return rawText || "# مستند فارغ\n\nلم يتم اكتشاف محتوى نصي في هذا الملف."; } const arabicChars = (rawText.match(/[\u0600-\u06FF]/g) || []).length; const latinChars = (rawText.match(/[a-zA-Z]/g) || []).length; return arabicChars >= latinChars * 0.4 ? 
formatArabicDocument(rawText) : formatLatinDocument(rawText); } // ── Helpers ───────────────────────────────────────────────────────────────── function cleanOcrLine(line: string): string { return line .replace(/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]/g, "") // Strip Unicode bidi / directional control chars that pdfjs embeds from broken-CMap fonts .replace(/[\u200B-\u200F\u202A-\u202E\u2066-\u2069\uFEFF]/g, "") .replace(/[□■▪▫▶◀►◄▲▼◆◇○●★☆✓✗✦✧]/g, "") .replace(/\s{2,}/g, " ") .trim(); } function isMetaLine(line: string): boolean { return /^(المادة|الزمن|النموذج|التاريخ|الصف|الشعبة|المدرسة|اسم الطالب|الاسم|الفصل|المرحلة|الفرقة|الدراسي|الفصل الدراسي|المستوى|الشعبة|المجموعة)\s*[::]/i.test(line); } function isSectionMarker(line: string): boolean { if (/^(أولاً|أولا|ثانياً|ثانيا|ثالثاً|ثالثا|رابعاً|رابعا|خامساً|خامسا|سادساً|سادسا|سابعاً|سابعا|ثامناً|ثامنا|تاسعاً|تاسعا|عاشراً|عاشرا)\s*[-:،\s]/.test(line)) return true; if (/^(Part|Section|Chapter|Unit)\s+[IVXivxA-Z\d]+/i.test(line)) return true; return false; } function isQuestion(line: string): boolean { // Arabic question starters if (/^سـ?\s*[\d\u0660-\u0669]+\s*[-:)،\s]/.test(line)) return true; if (/^سؤال\s+[\d\u0660-\u0669]+/i.test(line)) return true; if (/^السؤال\s+[\d\u0660-\u0669]+/i.test(line)) return true; if (/^س\s*[\d\u0660-\u0669]+\s*[-:)،]/.test(line)) return true; // Numbered with parens: (١) or (1) if (/^\([\d\u0660-\u0669]+\)\s+\S/.test(line)) return true; // Numbered with dash: "١- " or "1- " when followed by substantial content if (/^[\u0660-\u0669\d]+\s*[-–—]\s+.{8,}/.test(line)) return true; // English if (/^Q\s*\d+\s*[-:.)]/i.test(line)) return true; if (/^Question\s+\d+/i.test(line)) return true; return false; } function isKeywordLine(line: string): boolean { return /^(التعليل|الإجابة|الإجابه|المطلوب|الحل|الشرح|الدليل|السبب|العلة|ملاحظة|ملاحظه|تنبيه|الفائدة|المقصود|المراد|الاستنتاج|التحليل|التفسير|النتيجة|الخلاصة)\s*[::]/i.test(line); } function isHeadingCandidate(line: string, lineIndex: number, lines: 
string[]): boolean { if (line.length > 80 || line.length < 3) return false; if (/^#{1,6}\s/.test(line)) return false; if (/^[-*+\d]/.test(line)) return false; if (/[.،!؟?]$/.test(line) && line.length > 30) return false; const prevEmpty = lineIndex === 0 || lines[lineIndex - 1].trim() === ""; const nextEmpty = lineIndex >= lines.length - 1 || lines[lineIndex + 1].trim() === ""; return prevEmpty && nextEmpty; } // Expand inline multiple-choice options to a vertical list // Returns formatted list or null if not detected // NOTE: Runs on the ORIGINAL (uncleaned) line to detect multi-space separators function expandMultipleChoice(line: string): string | null { // Pattern 1: أ- text ب- text ج- text (Arabic with dash, any whitespace between) const arDashRe = /([أبجد])\s*[-–—]\s*([^أبجد\n-]{1,60}?)(?=\s+[أبجد]\s*[-–—]|\s*$)/g; const arDash: Array<[string, string]> = []; let m: RegExpExecArray | null; while ((m = arDashRe.exec(line)) !== null) { const text = m[2].trim(); if (text) arDash.push([m[1], text]); } if (arDash.length >= 2) { return arDash.map(([l, t]) => `- ${l}- ${t}`).join("\n"); } // Pattern 2: (أ) text (ب) text const arParenRe = /\(([أبجد])\)\s*([^()أبجد\n]{1,60}?)(?=\s*\([أبجد]\)|\s*$)/g; const arParen: Array<[string, string]> = []; while ((m = arParenRe.exec(line)) !== null) { const text = m[2].trim(); if (text) arParen.push([m[1], text]); } if (arParen.length >= 2) { return arParen.map(([l, t]) => `- (${l}) ${t}`).join("\n"); } // Pattern 3: أ) text ب) text (without outer parens) const arRParenRe = /([أبجد])\)\s*([^أبجد()]{1,60}?)(?=\s*[أبجد]\)|\s*$)/g; const arRParen: Array<[string, string]> = []; while ((m = arRParenRe.exec(line)) !== null) { const text = m[2].trim(); if (text) arRParen.push([m[1], text]); } if (arRParen.length >= 2) { return arRParen.map(([l, t]) => `- ${l}) ${t}`).join("\n"); } // Pattern 4: English a) b) c) d) — split by choice marker to avoid char-class issues const enSplit = line.split(/\s+(?=[a-d]\)\s)/i); if (enSplit.length >= 2) { 
const enChoices: Array<[string, string]> = enSplit .map(s => { const mx = s.match(/^([a-d])\)\s+(.*)/i); return mx ? ([mx[1].toLowerCase(), mx[2].trim()] as [string, string]) : null; }) .filter((x): x is [string, string] => x !== null); if (enChoices.length >= 2) { return enChoices.map(([l, t]) => `- ${l}) ${t}`).join("\n"); } } return null; } // ── Arabic document formatter ──────────────────────────────────────────────── // Extract all key:value pairs from a meta line that may contain multiple fields // e.g. "المادة: رياضيات الزمن: ساعة النموذج: أ" → [["المادة","رياضيات"],["الزمن","ساعة"],["النموذج","أ"]] function splitMetaFields(line: string): Array<[string, string]> { const pairs: Array<[string, string]> = []; // Split by 2+ spaces or known separators between fields // Each segment should start with a known meta key followed by colon const segments = line.split(/\s{2,}|\t|[|،,]/).map(s => s.trim()).filter(Boolean); for (const seg of segments) { const ci = seg.indexOf(":"); if (ci > 0 && isMetaLine(seg)) { const k = seg.slice(0, ci).trim(); const v = seg.slice(ci + 1).trim(); if (k) pairs.push([k, v]); } } // Fallback: treat whole line as single field if (pairs.length === 0) { const ci = line.indexOf(":"); if (ci > 0) { pairs.push([line.slice(0, ci).trim(), line.slice(ci + 1).trim()]); } } return pairs; } function formatArabicDocument(text: string): string { const rawLines = text.replace(/\r\n/g, "\n").replace(/\r/g, "\n").split("\n"); const lines = rawLines.map(cleanOcrLine); const output: string[] = []; let i = 0; // ── Detect and render metadata block from first 15 lines ── const metaIndices: number[] = []; for (let j = 0; j < Math.min(15, lines.length); j++) { if (lines[j] && isMetaLine(lines[j])) metaIndices.push(j); } // Handle metadata: each detected meta line may contain multiple inline fields // Use rawLines to preserve double-space separators if (metaIndices.length >= 1) { const allPairs: Array<[string, string]> = []; for (const idx of metaIndices) { 
for (const pair of splitMetaFields(rawLines[idx] || "")) allPairs.push(pair); } if (allPairs.length > 0) { output.push("| الحقل | القيمة |"); output.push("| --- | --- |"); for (const [k, v] of allPairs) output.push(`| ${k} | ${v} |`); output.push(""); i = Math.max(...metaIndices) + 1; } } // ── Check first content line for document title ── while (i < lines.length && !lines[i]) i++; if (i < lines.length) { const candidate = lines[i]; const isTitle = candidate.length > 3 && candidate.length < 100 && !isQuestion(candidate) && !isSectionMarker(candidate) && !isMetaLine(candidate) && !candidate.startsWith("-") && !candidate.startsWith("#"); // Only promote to title if metadata was found (strong signal) if (isTitle && metaIndices.length > 0) { output.push(`# ${candidate}`); output.push(""); i++; } } // ── Main pass ── while (i < lines.length) { const line = lines[i].trim(); const rawLine = rawLines[i] || ""; // original line before cleaning (for choice detection) if (!line) { if (output.length > 0 && output[output.length - 1] !== "") output.push(""); i++; continue; } // Already a Markdown heading — keep as-is if (/^#{1,6}\s/.test(line)) { if (output.length > 0 && output[output.length - 1] !== "") output.push(""); output.push(line); output.push(""); i++; continue; } // Section markers: أولاً / ثانياً / Part I if (isSectionMarker(line)) { if (output.length > 0 && output[output.length - 1] !== "") output.push(""); output.push(`## ${line}`); output.push(""); i++; continue; } // Question detection if (isQuestion(line)) { if (output.length > 0 && output[output.length - 1] !== "") output.push(""); output.push(`**${line}**`); output.push(""); i++; continue; } // Keyword lines: التعليل: / الإجابة: / المطلوب: if (isKeywordLine(line)) { if (output.length > 0 && output[output.length - 1] !== "") output.push(""); output.push(line); i++; continue; } // Inline multiple choice → vertical list (use rawLine to preserve original spacing) const expanded = expandMultipleChoice(rawLine) || 
expandMultipleChoice(line); if (expanded) { if (output.length > 0 && output[output.length - 1] !== "") output.push(""); output.push(expanded); output.push(""); i++; continue; } // Already-formatted list items if (/^[-*+]\s/.test(line) || /^\d+\.\s/.test(line)) { output.push(line); i++; continue; } // Lone short line surrounded by blanks → subheading if (isHeadingCandidate(line, i, lines)) { if (output.length > 0 && output[output.length - 1] !== "") output.push(""); output.push(`### ${line}`); output.push(""); i++; continue; } // Regular content line output.push(line); i++; } return output.join("\n").replace(/\n{3,}/g, "\n\n").trim(); } // ── Latin/English document formatter ──────────────────────────────────────── function formatLatinDocument(text: string): string { const rawLines = text.replace(/\r\n/g, "\n").replace(/\r/g, "\n").split("\n"); const lines = rawLines.map(cleanOcrLine); const output: string[] = []; for (let i = 0; i < lines.length; i++) { const line = lines[i].trim(); if (!line) { if (output.length > 0 && output[output.length - 1] !== "") output.push(""); continue; } if (/^#{1,6}\s/.test(line)) { if (output.length > 0 && output[output.length - 1] !== "") output.push(""); output.push(line); output.push(""); continue; } if (isSectionMarker(line)) { if (output.length > 0 && output[output.length - 1] !== "") output.push(""); output.push(`## ${line}`); output.push(""); continue; } if (isQuestion(line)) { if (output.length > 0 && output[output.length - 1] !== "") output.push(""); output.push(`**${line}**`); output.push(""); continue; } if (isKeywordLine(line)) { if (output.length > 0 && output[output.length - 1] !== "") output.push(""); output.push(line); continue; } const expanded = expandMultipleChoice(line); if (expanded) { if (output.length > 0 && output[output.length - 1] !== "") output.push(""); output.push(expanded); output.push(""); continue; } // ALL CAPS short line → subheading if (/^[A-Z][A-Z\s\d:,.-]{4,60}$/.test(line)) { if (output.length > 0 
&& output[output.length - 1] !== "") output.push(""); output.push(`### ${line}`); output.push(""); continue; } if (/^[-*+]\s/.test(line) || /^\d+\.\s/.test(line)) { output.push(line); continue; } if (isHeadingCandidate(line, i, lines)) { if (output.length > 0 && output[output.length - 1] !== "") output.push(""); output.push(`### ${line}`); output.push(""); continue; } output.push(line); } return output.join("\n").replace(/\n{3,}/g, "\n\n").trim(); } // ═══════════════════════════════════════════════════════════════════════════ // Extractors // ═══════════════════════════════════════════════════════════════════════════ // Max characters extracted from any single document (~2 MB of text ≈ 300 k words) const TEXT_CAP = 2_000_000; // ── Arabic PDF text post-processor ─────────────────────────────────────────── // Cleans up the artifacts introduced by PDF text extraction: // • "-- X of N --" page markers from pdf-parse default renderer // • Standalone page labels (single Arabic letters/numerals on their own line) // • Table-of-contents leader dots (". . . . . .") + trailing page numbers // • Unicode bidi control chars (LRM / RLM / directional overrides) // • Isolated short CAPS Latin sequences inline in Arabic lines (broken CMap) // • Collapse excess blank lines function cleanArabicPdfRaw(text: string): string { // 1. Strip all Unicode bidi / directional control characters that // pdfjs-dist embeds when the PDF uses broken ToUnicode CMap fonts. // These appear as ‎ (U+200E LRM) and ‏ (U+200F RLM) wrapping Latin chars. text = text.replace(/[\u200B-\u200F\u202A-\u202E\u2066-\u2069\uFEFF]/g, ""); // 2. For lines that are predominantly Arabic, remove short ALL-CAPS Latin // noise sequences — artefacts of broken CMap where Arabic glyphs are // mapped to Latin code points (e.g. "المبادئ OA العشرة" → OA = garbled Arabic). // Guard: don't remove if the "Latin" word is a common technical abbreviation. 
const KEEP_CAPS = new Set(["PDF", "OCR", "AI", "URL", "API", "HTML", "CSS", "JS", "TS"]); text = text.split("\n").map(line => { const arabicCount = (line.match(/[\u0600-\u06FF]/g) ?? []).length; if (arabicCount < 4) return line; // not an Arabic line — leave intact // Remove isolated 1-5 char ALL-CAPS sequences (not in safe-list) return line.replace(/(? KEEP_CAPS.has(match) ? match : "" ).replace(/ {2,}/g, " ").trim(); }).join("\n"); const lines = text.split("\n"); const out: string[] = []; for (const raw of lines) { const line = raw.trim(); // 1. Remove "-- X of N --" pdf-parse page markers if (/^--\s*\d+\s+of\s+\d+\s*--$/i.test(line)) continue; // 2. Remove standalone page labels: // • single Arabic letter (أ ب ج etc.) // • 1–3 Arabic/Eastern-Arabic/Western numerals alone on a line if (/^[\u0600-\u06FF]{1}$/.test(line)) continue; if (/^[٠-٩\u0660-\u06690-9]{1,3}$/.test(line)) continue; // 3. Collapse TOC leader-dot lines: ". . . . . . ." → clean title // A TOC line has 4+ consecutive dots (possibly space-separated) if (/\.(\s*\.){3,}/.test(line)) { const cleaned = line .replace(/\.(\s*\.)+\s*/g, " ") .replace(/\s+[٠-٩\u0660-\u06690-9]{1,4}\s*$/, "") .replace(/\s{2,}/g, " ") .trim(); if (cleaned.length > 2) out.push(cleaned); continue; } // 4. Strip trailing Arabic/Eastern-Arabic page-number from TOC lines that // lost their dot-leaders (e.g. "عنوان الكتاب ۰٣"). Heuristic: line is // mostly Arabic text ending in 1–4 Arabic/Eastern-Arabic digit(s), and // the Arabic content before the number is ≥10 chars. const tocTrailing = line.replace(/\s+[٠-٩\u0660-\u0669]{1,4}$/, ""); if (tocTrailing !== line && tocTrailing.length >= 10 && /[\u0600-\u06FF]/.test(tocTrailing)) { out.push(tocTrailing.trim()); continue; } // 5. 
Preserve empty lines (paragraph breaks) if (!line) { out.push(""); continue; } out.push(line); } // Collapse runs of 3+ blank lines to 2 return out.join("\n").replace(/\n{3,}/g, "\n\n").trim(); } // ── Arabic text AI correction — 100% free, full HF model access ────────────── // Priority chain (tried in order, falls back on rate-limit / error): // 1. Replit AI Integration proxy (AI_INTEGRATIONS_OPENAI_BASE_URL) — gpt-4o // 2. HF: Qwen/Qwen3-72B — best open-source Arabic, Apr 2025 // 3. HF: Qwen/Qwen3-30B-A3B — MoE, fast & very capable // 4. HF: Qwen/Qwen2.5-72B-Instruct — proven Arabic quality // 5. HF: meta-llama/Llama-3.3-70B-Instruct — strong multilingual // 6. HF: mistralai/Mistral-Nemo-Instruct-2407 — fast 12B fallback // const AI_CHUNK_CHARS = 3000; // larger chunks → fewer API calls const AI_CHUNK_TIMEOUT_MS = 120_000; const AI_SYSTEM_PROMPT = "أنت نظام متخصص في تصحيح نصوص PDF العربية المستخرجة آلياً. " + "المهمة: إزالة أخطاء الاستخراج مع الحفاظ التام على المعنى والمحتوى الأصيل. " + "أنواع الأخطاء الشائعة في هذه النصوص: " + "١) حروف ومقاطع لاتينية قصيرة مبعثرة داخل النص العربي (مثل OA، BW، Zz، dl، pl) — ضوضاء من ترميز الخط المكسور، احذفها. " + "٢) كلمات عربية مبتورة أو مشوهة واضحة يمكن تصحيحها من السياق. " + "٣) مسافات خاطئة داخل الكلمة العربية الواحدة — ادمجها. " + "٤) رموز متفرقة أو علامات ترقيم غريبة ليست جزءاً من المحتوى — احذفها. " + "القواعد الصارمة: " + "أ) احتفظ بالأسماء والمصطلحات التقنية اللاتينية الشائعة (PDF، AI، URL، API...). " + "ب) حافظ على هيكل الفقرات والعناوين والقوائم وعلامات Markdown كما هي تماماً. " + "ج) لا تضف أي محتوى جديد أو شروحات. " + "أعد النص العربي المُصحَح فقط بدون أي مقدمة أو خاتمة."; type AiEndpoint = { baseUrl: string; apiKey: string; model: string; label: string; noThink?: boolean }; // Returns a prioritised list of AI endpoints to try — best Arabic quality first. function resolveAiEndpoints(): AiEndpoint[] { const endpoints: AiEndpoint[] = []; // 1. 
Replit AI Integration proxy (zero-config on Replit dev environment) const replitUrl = process.env.AI_INTEGRATIONS_OPENAI_BASE_URL; if (replitUrl) { endpoints.push({ baseUrl: replitUrl, apiKey: process.env.AI_INTEGRATIONS_OPENAI_API_KEY ?? "placeholder", model: "gpt-4o", label: "Replit/gpt-4o", }); } // 2-8. HF Router — automatic provider selection (best available with HF_TOKEN) // As of 2026: router.huggingface.co/v1 routes to the best available provider // (novita, together, deepinfra, fireworks, hf-inference) based on model support. // Falls back gracefully: 429/402/503 → next model in chain. // noThink=true → appends /no_think to disable Qwen3 chain-of-thought for speed. const hfToken = process.env.HF_TOKEN; if (hfToken) { const HF = "https://router.huggingface.co/v1"; // generic router, best model coverage endpoints.push( // Qwen3-235B-A22B: #1 Arabic open-source 2026, MoE 235B (22B active) — fastest large model { baseUrl: HF, apiKey: hfToken, model: "Qwen/Qwen3-235B-A22B", label: "HF/Qwen3-235B", noThink: true }, // Qwen3-72B: #2 Arabic, dense 72B, excellent correction quality { baseUrl: HF, apiKey: hfToken, model: "Qwen/Qwen3-72B", label: "HF/Qwen3-72B", noThink: true }, // Llama 4 Scout: Meta's April 2025, 17B MoE (16E), strong Arabic + multimodal { baseUrl: HF, apiKey: hfToken, model: "meta-llama/Llama-4-Scout-17B-16E-Instruct", label: "HF/Llama4-Scout", noThink: false }, // Qwen3-30B-A3B: MoE 30B (3B active), fast and capable { baseUrl: HF, apiKey: hfToken, model: "Qwen/Qwen3-30B-A3B", label: "HF/Qwen3-30B-A3B", noThink: true }, // Qwen2.5-72B: proven, widely available, great Arabic { baseUrl: HF, apiKey: hfToken, model: "Qwen/Qwen2.5-72B-Instruct", label: "HF/Qwen2.5-72B", noThink: false }, // Llama 3.3 70B: reliable multilingual fallback { baseUrl: HF, apiKey: hfToken, model: "meta-llama/Llama-3.3-70B-Instruct", label: "HF/Llama3.3-70B", noThink: false }, // Mistral Nemo 12B: lightweight guaranteed fallback { baseUrl: HF, apiKey: hfToken, model: 
"mistralai/Mistral-Nemo-Instruct-2407", label: "HF/Mistral-Nemo", noThink: false }, ); } return endpoints; } function chunkForAiCorrection(text: string): string[] { const paras = text.split(/\n{2,}/); const chunks: string[] = []; let buf = ""; for (const para of paras) { const joined = buf ? buf + "\n\n" + para : para; if (joined.length <= AI_CHUNK_CHARS) { buf = joined; } else { if (buf) chunks.push(buf); if (para.length > AI_CHUNK_CHARS) { buf = ""; for (const line of para.split("\n")) { const lj = buf ? buf + "\n" + line : line; if (lj.length <= AI_CHUNK_CHARS) { buf = lj; } else { if (buf) chunks.push(buf); buf = line.slice(0, AI_CHUNK_CHARS); } } } else { buf = para; } } } if (buf.trim()) chunks.push(buf); return chunks.filter(c => c.trim().length > 0); } async function callAiCorrection( text: string, ep: AiEndpoint, ): Promise { const controller = new AbortController(); const timer = setTimeout(() => controller.abort(), AI_CHUNK_TIMEOUT_MS); try { // Qwen3 models support /no_think suffix to skip chain-of-thought reasoning, // giving 3-5× faster responses for straightforward correction tasks. const userContent = ep.noThink ? 
`النص المستخرج من PDF:\n\n${text}\n\nالنص المصحح: /no_think` : `النص المستخرج من PDF:\n\n${text}\n\nالنص المصحح:`; const body: Record = { model: ep.model, messages: [ { role: "system", content: AI_SYSTEM_PROMPT }, { role: "user", content: userContent }, ], max_tokens: Math.min(4096, Math.ceil(text.length * 2)), temperature: 0.1, // low temp = deterministic, less hallucination }; const resp = await fetch(`${ep.baseUrl}/chat/completions`, { method: "POST", headers: { Authorization: `Bearer ${ep.apiKey}`, "Content-Type": "application/json" }, body: JSON.stringify(body), signal: controller.signal, }); if (resp.status === 429) throw Object.assign(new Error("rate_limited"), { code: "rate_limited" }); if (resp.status === 503) throw Object.assign(new Error("unavailable"), { code: "unavailable" }); if (resp.status === 402) throw Object.assign(new Error("payment_required"), { code: "unavailable" }); // no credits → try next if (resp.status === 404) throw Object.assign(new Error("model_not_found"), { code: "unavailable" }); // unsupported model if (!resp.ok) throw new Error(`ai_http_${resp.status}`); const data = await resp.json() as any; let corrected = (data.choices?.[0]?.message?.content ?? "").trim(); // Strip any ... block Qwen3 might emit even with /no_think corrected = corrected.replace(/[\s\S]*?<\/think>\s*/gi, "").trim(); // Sanity: output must be 35%–300% of input length if (!corrected || corrected.length < text.length * 0.35 || corrected.length > text.length * 3) { return text; } return corrected; } finally { clearTimeout(timer); } } type ProgressFn = (msg: string, pct: number) => Promise; async function correctArabicText(rawText: string, onProgress?: ProgressFn): Promise { const endpoints = resolveAiEndpoints(); if (!endpoints.length) { logger.info("[arabic-ai] No AI endpoint configured — using OCR text as-is"); return rawText; } // Only correct predominantly Arabic text const arabicChars = (rawText.match(/[\u0600-\u06FF]/g) ?? 
[]).length; const nonSpaceChars = rawText.replace(/\s/g, "").length; if (nonSpaceChars < 50 || arabicChars / nonSpaceChars < 0.25) return rawText; const chunks = chunkForAiCorrection(rawText); // Find the first working endpoint (try each with a minimal probe if >1 model available) let activeEpIdx = 0; logger.info(`[arabic-ai] ${chunks.length} chunks, ${endpoints.length} endpoints available — primary: ${endpoints[0].label}`); const correctedParts: string[] = []; for (let i = 0; i < chunks.length; i++) { const pct = 33 + Math.round((i / chunks.length) * 21); const ep = endpoints[activeEpIdx]; await onProgress?.(`تصحيح النص عبر ${ep.label.split("/")[1]}... (${i + 1}/${chunks.length})`, pct); let succeeded = false; while (activeEpIdx < endpoints.length) { const cur = endpoints[activeEpIdx]; try { const result = await callAiCorrection(chunks[i], cur); correctedParts.push(result); succeeded = true; break; } catch (err: any) { const code = err?.code ?? err?.message ?? ""; if (code === "rate_limited" || code === "unavailable" || code.startsWith("ai_http_5")) { logger.warn(`[arabic-ai] ${cur.label} ${code} — switching to next endpoint`); activeEpIdx++; // update progress label for new endpoint if (activeEpIdx < endpoints.length) { await onProgress?.(`التحويل عبر ${endpoints[activeEpIdx].label.split("/")[1]}... 
(${i + 1}/${chunks.length})`, pct);
          }
        } else {
          logger.warn({ err }, `[arabic-ai] chunk ${i} error on ${cur.label} — keeping raw text`);
          break;
        }
      }
    }
    if (!succeeded) {
      // All endpoints exhausted or non-retryable error — keep original chunk
      correctedParts.push(chunks[i]);
      if (activeEpIdx >= endpoints.length) {
        // No more endpoints: pass remaining chunks through unchanged
        correctedParts.push(...chunks.slice(i + 1));
        logger.warn("[arabic-ai] All endpoints exhausted — remaining chunks kept as-is");
        break;
      }
    }
  }
  return correctedParts.join("\n\n");
}

// ── Garbled Arabic detector ───────────────────────────────────────────────────
// Detects whether pdfjs-dist returned broken CMap output for an Arabic PDF.
// Two root causes:
//   A) Character-pair transposition (RTL/LTR confusion): في → يف
//   B) Broken ToUnicode CMap: Arabic glyphs mapped to Latin code points,
//      producing "OA BW Zz" noise inline with Arabic text, often with
//      Unicode bidi control chars (LRM/RLM) wrapping the Latin sequences.

/**
 * Heuristically decides whether extracted text is garbled Arabic.
 *
 * Texts with fewer than 100 Arabic-block characters are never flagged.
 * Otherwise the first matching signal wins; see the inline comments for the
 * individual signals and their thresholds.
 *
 * @param text Raw text extracted from a PDF.
 * @returns `true` when any garbling signal fires, `false` otherwise.
 */
function isGarbledArabic(text: string): boolean {
  const arabicChars = (text.match(/[\u0600-\u06FF]/g) ?? []).length;
  if (arabicChars < 100) return false;

  // ── Type A: character-pair transposition ───────────────────────────────
  // Space-delimited يف → garbled في (≥3 occurrences is conclusive)
  const garbledFi = (text.match(/ يف /g) ?? []).length;
  if (garbledFi >= 3) return true;
  // Garbled الحمد (very common opening in Islamic texts)
  if (/امحلد/.test(text)) return true;
  // Garbled ordinal markers ثانياً / ثالثاً used as section headers
  if (/اثنياا|اثلثاا/.test(text)) return true;

  // ── Type B: broken CMap → Arabic mapped to Latin code points ───────────
  // Signal 1: bidi control chars (LRM U+200E / RLM U+200F) wrapping short
  // Latin sequences, e.g. <LRM>OA<RLM> <LRM>Zz<RLM>.
  const bidiLatinWraps = (text.match(/[\u200E\u200F][A-Za-z]{1,6}[\u200E\u200F]/g) ?? []).length;
  if (bidiLatinWraps >= 3) return true;

  // Signal 2: multiple short ALL-CAPS Latin sequences appearing INLINE
  // within predominantly-Arabic lines,
  // e.g. "المبادئ العشرة OA للعلوم BW أولاً" — OA/BW = garbled Arabic words.
  const IGNORE_CAPS = new Set([
    "PDF", "OCR", "AI", "URL", "API", "HTML", "CSS", "JS", "TS",
    "I", "II", "III", "IV", "VI", "VII", "VIII", "IX", "XI", "XII",
  ]);
  // NOTE(review): the original pattern was lost when this file was extracted
  // (everything between "(?<" and "=>" was stripped). This reconstruction
  // matches standalone runs of 2–4 capital letters — confirm against upstream.
  const standaloneCaps = /(?<![A-Za-z])[A-Z]{2,4}(?![A-Za-z])/g;
  const garbledLines = text.split("\n").filter(line => {
    const arabic = (line.match(/[\u0600-\u06FF]/g) ?? []).length;
    if (arabic < 3) return false;
    const noiseCaps = (line.match(standaloneCaps) ?? []).filter(m => !IGNORE_CAPS.has(m));
    return noiseCaps.length >= 2;
  }).length;
  if (garbledLines >= 4) return true;

  // Signal 3: suspiciously high ratio of Latin alphabetic chars in a
  // predominantly-Arabic document (more than 12% of the Arabic char count).
  const latinAlpha = (text.match(/[A-Za-z]/g) ?? []).length;
  if (arabicChars >= 300 && latinAlpha > arabicChars * 0.12) return true;

  return false;
}

// ── VLM-based OCR per page (olmOCR / Qwen2.5-VL via HF Inference API) ────────
// Uses vision-language models to extract text from rendered page images.
// Models are tried in the order listed in VLM_OCR_MODELS; the caller falls
// back to local Tesseract when this path fails or is rate-limited.
// Requests go through the generic HF router (router.huggingface.co/v1).
const VLM_OCR_ROUTER = "https://router.huggingface.co/v1";
const VLM_OCR_MODELS = [
  "allenai/olmOCR-7B-0225-preview", // #1: Allen Institute, fine-tuned doc OCR
  "Qwen/Qwen2.5-VL-72B-Instruct",   // #2: larger VLM
  "Qwen/Qwen2.5-VL-7B-Instruct",    // #3: smaller, faster fallback
];
const VLM_PAGE_TIMEOUT_MS = 90_000;
const VLM_OCR_PROMPT =
  "Extract all the text from this document page exactly as written. " +
  "Preserve Arabic text, paragraph structure, headings, and line breaks. " +
  "Do not add explanations or commentary — output only the extracted text.";

/**
 * OCRs one rendered page image through the VLM chat-completions endpoint.
 *
 * Each model in VLM_OCR_MODELS is tried in order. A model is skipped when the
 * request fails, times out (VLM_PAGE_TIMEOUT_MS), or yields 20 characters or
 * fewer. A 429 response is NOT retried on the next model: it is rethrown so
 * the caller can stop using the VLM path altogether.
 *
 * @param pngPath Path of the PNG page image to send.
 * @param hfToken Bearer token for the HF router.
 * @returns The trimmed text produced by the first model that succeeds.
 * @throws Error with `code === "rate_limited"` on HTTP 429, or
 *         `Error("all_vlm_models_failed")` when no model produced usable text.
 */
async function extractPageViaVlm(pngPath: string, hfToken: string): Promise<string> {
  const imgBase64 = fs.readFileSync(pngPath).toString("base64");

  for (const model of VLM_OCR_MODELS) {
    const shortName = model.split("/")[1];
    const ctrl = new AbortController();
    const timer = setTimeout(() => ctrl.abort(), VLM_PAGE_TIMEOUT_MS);
    try {
      const resp = await fetch(`${VLM_OCR_ROUTER}/chat/completions`, {
        method: "POST",
        headers: { Authorization: `Bearer ${hfToken}`, "Content-Type": "application/json" },
        body: JSON.stringify({
          model,
          messages: [{
            role: "user",
            content: [
              { type: "image_url", image_url: { url: `data:image/png;base64,${imgBase64}` } },
              { type: "text", text: VLM_OCR_PROMPT },
            ],
          }],
          max_tokens: 4096,
          temperature: 0.0,
        }),
        signal: ctrl.signal,
      });

      if (resp.status === 429) throw Object.assign(new Error("rate_limited"), { code: "rate_limited" });
      if (resp.status === 402) throw Object.assign(new Error("payment_required"), { code: "unavailable" });
      if (resp.status === 404) throw Object.assign(new Error("model_not_found"), { code: "unavailable" });
      if (!resp.ok) throw new Error(`vlm_http_${resp.status}`);

      // The timer is still armed here, so a stalled body read is aborted too.
      const data = await resp.json() as any;
      const rawContent: unknown = data?.choices?.[0]?.message?.content;
      const content = typeof rawContent === "string" ? rawContent.trim() : "";
      if (content.length > 20) {
        logger.info(`[vlm-ocr] ${shortName} → ${content.length} chars`);
        return content;
      }
      logger.warn(`[vlm-ocr] ${shortName} returned empty — trying next`);
    } catch (err: unknown) {
      const code = (err as { code?: unknown } | null | undefined)?.code;
      if (code === "rate_limited") {
        logger.warn(`[vlm-ocr] ${shortName} rate-limited`);
        throw err; // propagate so caller can switch to Tesseract
      }
      const message = err instanceof Error ? err.message : String(err);
      logger.warn({ err: message }, `[vlm-ocr] ${shortName} failed`);
    } finally {
      clearTimeout(timer);
    }
  }
  throw new Error("all_vlm_models_failed");
}

// ── OCR-based PDF extractor (fallback for broken-CMap PDFs) ──────────────────
// Pipeline:
//   1.
//      pdftoppm renders pages to PNG (200 DPI when the VLM path is enabled,
//      otherwise 300 DPI)
//   2. Per page: try VLM-OCR (olmOCR via HF API) first if HF_TOKEN available
//   3. Fall back to Tesseract (local) if VLM fails / rate-limited
// No page cap — processes the full document regardless of length.

/**
 * Filters OCR output line by line, dropping Latin-letter noise.
 *
 * Drops lines that are overwhelmingly Latin characters with little/no Arabic —
 * noise from decorative pages, page headers and misread ornaments
 * (e.g. "Me NY 1", "dl pl a gl", "Fy PIN ENA"). Blank lines are kept as
 * paragraph separators, and runs of 3+ newlines are collapsed to one blank line.
 *
 * @param text Raw OCR text.
 * @returns The filtered text, trimmed.
 */
function cleanOcrOutput(text: string): string {
  const kept: string[] = [];

  for (const rawLine of text.split("\n")) {
    const line = rawLine.trim();

    // Always keep blank lines (paragraph separators)
    if (line === "") {
      kept.push("");
      continue;
    }

    const arabicChars = (line.match(/[\u0600-\u06FF]/g) ?? []).length;
    const latinChars = (line.match(/[a-zA-Z]/g) ?? []).length;

    // Keep if there's meaningful Arabic content
    if (arabicChars >= 4) {
      kept.push(line);
      continue;
    }

    // Reject short lines that are purely Latin noise (≤30 chars, no Arabic).
    // The Latin-letter requirement keeps numeric / punctuation-only lines,
    // which carry no Latin letters and therefore are not this kind of noise.
    if (arabicChars === 0 && latinChars > 0 && line.length <= 30) continue;

    // Reject lines where Latin chars vastly outnumber Arabic (OCR artefact)
    const totalAlpha = arabicChars + latinChars;
    if (totalAlpha > 0 && latinChars / totalAlpha > 0.80) continue;

    // Keep everything else (numbers, punctuation, mixed headings, etc.)
    kept.push(line);
  }

  return kept.join("\n").replace(/\n{3,}/g, "\n\n").trim();
}

/**
 * Extracts text from a PDF by rendering its pages and running OCR on them.
 *
 * Pages are rendered with `pdftoppm`. Each page goes to the VLM path first
 * when `HF_TOKEN` is set; Tesseract handles pages where the VLM path fails,
 * and all remaining pages once the VLM path reports rate limiting. Tesseract
 * runs on the same rendered images (they are not re-rendered).
 *
 * @param filePath   PDF to process.
 * @param pageStart  First page (1-based); values that are missing or ≤ 0 mean page 1.
 * @param pageEnd    Last page; when missing or ≤ 0 the document is processed to its end.
 * @param onProgress Called after each page with (pagesDone, totalPages).
 * @returns Page texts joined by blank lines, capped at TEXT_CAP characters;
 *          "" when nothing was rendered or any step failed (the error is logged).
 */
async function extractPdfViaOcr(
  filePath: string,
  pageStart?: number,
  pageEnd?: number,
  onProgress?: (done: number, total: number) => void,
): Promise<string> {
  const { execFile } = await import("child_process");
  const { promisify } = await import("util");
  const execFileAsync = promisify(execFile);

  type OcrWorker = {
    recognize(image: string): Promise<{ data: { text: string } }>;
    terminate(): Promise<unknown>;
  };

  const hfToken = process.env.HF_TOKEN;
  const tmpDir = fs.mkdtempSync("/tmp/pdf-ocr-");
  let tessWorker: OcrWorker | null = null;

  try {
    const startPage = pageStart && pageStart > 0 ? pageStart : 1;

    // 200 DPI keeps images small for the VLM API; without a token only
    // Tesseract runs, so render at 300 DPI for it.
    const dpi = hfToken ? "200" : "300";

    const args = ["-r", dpi, "-png", "-f", String(startPage)];
    // Only bound the last page when the caller asked for one; leaving `-l`
    // out lets pdftoppm run to the end of the document (no page cap).
    if (pageEnd && pageEnd > 0) args.push("-l", String(pageEnd));
    args.push(filePath, path.join(tmpDir, "page"));

    await execFileAsync("pdftoppm", args, { timeout: 600_000 });

    const pngFiles = fs.readdirSync(tmpDir)
      .filter(f => f.endsWith(".png"))
      .sort()
      .map(f => path.join(tmpDir, f));

    if (pngFiles.length === 0) return "";

    const pageTexts: string[] = [];
    let vlmRateLimited = false;

    for (const [i, pngPath] of pngFiles.entries()) {
      let pageText = "";
      let needsTesseract = true;

      // ── Try VLM-OCR first (olmOCR / Qwen2.5-VL via HF) ────────────────
      if (hfToken && !vlmRateLimited) {
        try {
          pageText = await extractPageViaVlm(pngPath, hfToken);
          needsTesseract = false;
        } catch (err: unknown) {
          if ((err as { code?: unknown } | null | undefined)?.code === "rate_limited") {
            vlmRateLimited = true;
            logger.warn("[vlm-ocr] Rate limited — switching to Tesseract for all remaining pages");
          } else {
            const message = err instanceof Error ? err.message : String(err);
            logger.warn({ err: message }, `[vlm-ocr] page ${i + 1} failed — using Tesseract`);
          }
        }
      }

      // ── Fallback: Tesseract (local) ────────────────────────────────────
      if (needsTesseract) {
        if (!tessWorker) {
          // Lazy-initialise Tesseract only when actually needed
          const tessDataDir = process.env.NODE_ENV === "production"
            ? "/data/tessdata"
            : path.join(process.cwd(), "uploads", ".tessdata");
          fs.mkdirSync(tessDataDir, { recursive: true });
          const Tesseract = await import("tesseract.js");
          tessWorker = await Tesseract.createWorker(["ara", "eng"], 1, {
            cachePath: tessDataDir,
            workerPath: getTessWorkerPath(),
          }) as OcrWorker;
        }
        const { data: { text } } = await tessWorker.recognize(pngPath);
        pageText = cleanOcrOutput(text);
      }

      const trimmedPage = pageText.trim();
      if (trimmedPage) pageTexts.push(trimmedPage);
      onProgress?.(i + 1, pngFiles.length);
    }

    const result = pageTexts.join("\n\n");
    return result.length > TEXT_CAP ? result.slice(0, TEXT_CAP) : result;
  } catch (e) {
    logger.error({ err: e }, "[extractPdfViaOcr] failed");
    return "";
  } finally {
    // Cleanup is best-effort and must never replace the value being returned.
    if (tessWorker) {
      try { await (tessWorker as OcrWorker).terminate(); } catch { /* ignore */ }
    }
    try { fs.rmSync(tmpDir, { recursive: true, force: true }); } catch { /* ignore */ }
  }
}

// ── RTL-aware PDF extractor using pdfjs-dist directly ────────────────────────
// pdf-parse v2 has no `pagerender` callback, so we bypass it and use
// pdfjs-dist (already installed as pdf-parse's peer) directly.
//
// Algorithm per page:
//   1. getTextContent() → items with {x, y, width, height, str}
//   2. Bucket items into visual lines by quantised Y (Y_THRESH = 10 pt)
//   3. Sort each bucket right→left (descending X) → correct Arabic reading order
//   4. Join items; insert a space only when the visual gap between adjacent
//      items exceeds 25% of the item's font height — this threshold correctly
//      handles Arabic ligature sub-glyphs (gap ~1 pt) vs word gaps (gap ~4+ pt)
//      without the false positives caused by per-character avgCharWidth.
/**
 * Extracts text from a PDF with pdfjs-dist, rebuilding right-to-left lines.
 *
 * @param filePath  PDF to read. At most MAX_PDF_BYTES are read from the start
 *                  of the file, so a larger file is parsed truncated.
 * @param pageStart First page (1-based, clamped to the page count); defaults to 1.
 * @param pageEnd   Last page (clamped to the page count); defaults to the last page.
 * @returns Cleaned text capped at TEXT_CAP characters, or "" on any failure
 *          (the error is logged).
 */
async function extractPdf(filePath: string, pageStart?: number, pageEnd?: number): Promise<string> {
  // pdfjs-dist is imported through a computed path below, so it is untyped here.
  let pdfDoc: any = null;
  try {
    // Resolve pdfjs-dist via pdf-parse's own node_modules (it is a declared
    // dependency of pdf-parse v2, so it is guaranteed to be present there).
    const pdfParseReq = createRequire(_require.resolve("pdf-parse"));
    const pdfjsMjsPath = pdfParseReq.resolve("pdfjs-dist/legacy/build/pdf.mjs");
    const pdfjsWorkerPath = pdfParseReq.resolve("pdfjs-dist/legacy/build/pdf.worker.mjs");

    // Dynamic ESM import of pdfjs-dist (it is an ES module)
    const { getDocument, GlobalWorkerOptions, VerbosityLevel } = await import(pdfjsMjsPath) as any;
    GlobalWorkerOptions.workerSrc = pdfjsWorkerPath;

    const MAX_PDF_BYTES = 200 * 1024 * 1024;
    const readSize = Math.min(fs.statSync(filePath).size, MAX_PDF_BYTES);
    const buf = Buffer.alloc(readSize);
    const fd = fs.openSync(filePath, "r");
    let filled = 0;
    try {
      // readSync may return fewer bytes than requested, so loop until done.
      while (filled < readSize) {
        const n = fs.readSync(fd, buf, filled, readSize - filled, filled);
        if (n === 0) break;
        filled += n;
      }
    } finally {
      fs.closeSync(fd); // never leak the descriptor, even if a read throws
    }

    // VerbosityLevel.ERRORS = 0 → suppress "Warning: TT: undefined function" noise
    const verbosity: number = VerbosityLevel?.ERRORS ?? 0;

    pdfDoc = await getDocument({
      data: new Uint8Array(buf.subarray(0, filled)),
      useWorkerFetch: false,
      isEvalSupported: false,
      useSystemFonts: true,
      verbosity,
    }).promise;

    const totalPages = pdfDoc.numPages as number;
    const startPage = pageStart && pageStart > 0 ? Math.min(pageStart, totalPages) : 1;
    const endPage = pageEnd && pageEnd > 0 ? Math.min(pageEnd, totalPages) : totalPages;

    // Y_THRESH = 10 pt: groups diacritics / sub-glyphs on slightly different Y
    // into the same visual line.
    const Y_THRESH = 10;

    type TextItem = { x: number; y: number; str: string; width: number; height: number };

    const pageTexts: string[] = [];

    for (let p = startPage; p <= endPage; p++) {
      const page = await pdfDoc.getPage(p);
      try {
        const tc = await page.getTextContent({ includeMarkedContent: false });

        const items: TextItem[] = [];
        for (const it of (tc.items ?? [])) {
          if (typeof it.str !== "string" || !it.str.trim()) continue;
          items.push({
            x: it.transform[4],
            y: it.transform[5],
            str: it.str,
            width: it.width ?? 0,
            height: it.height ?? 12, // fallback to 12 pt if absent
          });
        }
        if (items.length === 0) continue;

        // Bucket by quantised Y
        const buckets = new Map<number, TextItem[]>();
        for (const it of items) {
          const key = Math.round(it.y / Y_THRESH) * Y_THRESH;
          const bucket = buckets.get(key);
          if (bucket) bucket.push(it);
          else buckets.set(key, [it]);
        }

        // Lines top→bottom (larger Y = higher on PDF page)
        const sortedYs = Array.from(buckets.keys()).sort((a, b) => b - a);

        const lines: string[] = [];
        for (const y of sortedYs) {
          const row = buckets.get(y) ?? [];
          // RTL: sort right-to-left (descending X)
          row.sort((a, b) => b.x - a.x);

          // Join items, inserting a space only when the gap between adjacent
          // items exceeds 25% of the item's font height. This skips ligature
          // sub-glyph gaps (~1 pt) while catching genuine inter-word spaces.
          let lineText = "";
          row.forEach((cur, idx) => {
            lineText += cur.str;
            const next = row[idx + 1];
            if (!next) return;
            // gap = distance between the right edge of `next` and the left edge of `cur`
            const gap = cur.x - (next.x + next.width);
            const spaceThreshold = (cur.height > 0 ? cur.height : 12) * 0.25;
            if (gap > spaceThreshold) lineText += " ";
          });

          const trimmed = lineText.trim();
          if (trimmed) lines.push(trimmed);
        }

        pageTexts.push(lines.join("\n"));
      } finally {
        page.cleanup();
      }
    }

    // Arabic-specific post-processing: strips page markers, TOC dots, etc.
    const text = cleanArabicPdfRaw(pageTexts.join("\n\n").trim());
    return text.length > TEXT_CAP ? text.slice(0, TEXT_CAP) : text;
  } catch (e) {
    logger.error({ err: e }, "[extractPdf] failed");
    return "";
  } finally {
    if (pdfDoc) {
      try { await pdfDoc.destroy(); } catch { /* ignore */ }
    }
  }
}

/**
 * Extracts the raw text of a .docx file with mammoth.
 *
 * @returns Trimmed text capped at TEXT_CAP characters, or "" on failure (logged).
 */
async function extractDocx(filePath: string): Promise<string> {
  try {
    const mammoth = await import("mammoth");
    const result = await mammoth.extractRawText({ path: filePath });
    const text = result.value?.trim() || "";
    return text.length > TEXT_CAP ? text.slice(0, TEXT_CAP) : text;
  } catch (e) {
    logger.error({ err: e }, "[extractDocx] failed");
    return "";
  }
}

// Resolves the Tesseract.js Node.js worker script path so it works even when
// the server code is bundled with esbuild (which breaks the default auto-resolution).
function getTessWorkerPath(): string {
  const pkgDir = path.dirname(_require.resolve("tesseract.js/package.json"));
  return path.join(pkgDir, "src/worker-script/node/index.js");
}

/**
 * OCRs a single image file with Tesseract (Arabic + English).
 *
 * @returns The trimmed recognised text, or "" on failure (logged).
 */
async function extractImage(filePath: string): Promise<string> {
  try {
    const Tesseract = await import("tesseract.js");
    const cacheDir = process.env.NODE_ENV === "production"
      ? "/data/tessdata"
      : path.join(process.cwd(), "uploads", ".tessdata");
    fs.mkdirSync(cacheDir, { recursive: true });

    const worker = await Tesseract.createWorker(["ara", "eng"], 1, {
      cachePath: cacheDir,
      workerPath: getTessWorkerPath(),
    });
    try {
      const { data: { text } } = await worker.recognize(filePath);
      return text?.trim() || "";
    } finally {
      // Release the worker even when recognition throws; a failed shutdown
      // must not discard text that was already recognised.
      try { await worker.terminate(); } catch { /* ignore */ }
    }
  } catch (e) {
    logger.error({ err: e }, "[extractImage] error");
    return "";
  }
}

/**
 * Converts a spreadsheet to Markdown tables.
 *
 * CSV files are split naively on "," (quoted fields are not interpreted) and
 * capped at 5000 non-empty lines. Other formats are read with `xlsx`, capped at
 * 5000 rows per sheet, producing one `## sheet` section per sheet. Generation
 * stops once the output exceeds TEXT_CAP characters.
 *
 * @param filePath Spreadsheet to read.
 * @param ext      Lower-case extension including the dot; ".csv" selects the CSV path.
 * @returns Markdown text, or "" for an empty CSV or on failure (logged).
 */
async function extractSpreadsheet(filePath: string, ext: string): Promise<string> {
  // Pipes and line breaks inside a cell would break the Markdown table row.
  const toCell = (value: unknown): string =>
    String(value ?? "").trim().replace(/\r?\n/g, " ").replace(/\|/g, "\\|");
  const toRow = (cells: string[]): string => `| ${cells.join(" | ")} |\n`;

  try {
    if (ext === ".csv") {
      const content = fs.readFileSync(filePath, "utf-8");
      const lines = content.split("\n").filter(Boolean).slice(0, 5000); // cap rows
      if (lines.length === 0) return "";

      const headers = lines[0].split(",").map((h) => toCell(h));
      let md = toRow(headers) + toRow(headers.map(() => "---"));
      for (const line of lines.slice(1)) {
        md += toRow(line.split(",").map((c) => toCell(c)));
        if (md.length > TEXT_CAP) break;
      }
      return md;
    }

    const XLSX = _require("xlsx");
    const workbook = XLSX.readFile(filePath, { sheetRows: 5000 }); // cap rows per sheet
    let md = "";

    for (const sheetName of workbook.SheetNames) {
      const sheet = workbook.Sheets[sheetName];
      const data: unknown[][] = XLSX.utils.sheet_to_json(sheet, { header: 1 });
      md += `## ${sheetName}\n\n`;
      if (data.length > 0) {
        // Array.from fills holes of a sparse header row, so the separator and
        // data rows always have one cell per column.
        const headers = Array.from(data[0] ?? [], (h) => toCell(h));
        md += toRow(headers) + toRow(headers.map(() => "---"));
        for (const row of data.slice(1)) {
          md += toRow(headers.map((_, idx) => toCell((row ?? [])[idx])));
          if (md.length > TEXT_CAP) break;
        }
        md += "\n";
      }
      if (md.length > TEXT_CAP) break;
    }
    return md;
  } catch (e) {
    logger.error({ err: e }, "[extractSpreadsheet] failed");
    return "";
  }
}

/**
 * Extracts slide text from a .pptx file, one paragraph per slide, in slide order.
 *
 * @returns Text capped at TEXT_CAP characters, or "" on failure (logged).
 */
async function extractPptx(filePath: string): Promise<string> {
  try {
    const JSZip = (await import("jszip")).default;
    const zip = await JSZip.loadAsync(fs.readFileSync(filePath));

    // Order by slide number: a plain string sort puts slide10 before slide2.
    const slidePattern = /^ppt\/slides\/slide(\d+)\.xml$/;
    const slides: Array<{ name: string; index: number }> = [];
    for (const name of Object.keys(zip.files)) {
      const m = slidePattern.exec(name);
      if (m) slides.push({ name, index: Number(m[1]) });
    }
    slides.sort((a, b) => a.index - b.index);

    let text = "";
    for (const slide of slides) {
      const xml = await zip.files[slide.name].async("string");
      const runs = xml.match(/<a:t(?:\s[^>]*)?>([\s\S]*?)<\/a:t>/g) || [];
      const slideText = runs
        .map((run) => stripTags(run)) // drops the <a:t> wrapper and decodes entities
        .filter(Boolean)
        .join(" ");
      if (slideText) text += slideText + "\n\n";
      if (text.length > TEXT_CAP) break;
    }
    return text.length > TEXT_CAP ? text.slice(0, TEXT_CAP) : text;
  } catch (e) {
    logger.error({ err: e }, "[extractPptx] failed");
    return "";
  }
}

/**
 * Extracts text from the HTML/XHTML documents inside an .epub archive,
 * in the order the archive lists them.
 *
 * @returns Text capped at TEXT_CAP characters, or "" on failure (logged).
 */
async function extractEpub(filePath: string): Promise<string> {
  try {
    const JSZip = (await import("jszip")).default;
    const zip = await JSZip.loadAsync(fs.readFileSync(filePath));
    let text = "";

    for (const filename of Object.keys(zip.files)) {
      if (!/\.(?:html|xhtml|htm)$/.test(filename)) continue;
      const html = await zip.files[filename].async("string");
      text += htmlToPlainText(html) + "\n\n";
      if (text.length > TEXT_CAP) break;
    }
    return text.length > TEXT_CAP ? text.slice(0, TEXT_CAP) : text;
  } catch (e) {
    logger.error({ err: e }, "[extractEpub] failed");
    return "";
  }
}

/**
 * Converts an HTML fragment to plain text with light Markdown structure:
 * `<h1>`–`<h6>` become `#` headings, `<p>` becomes a paragraph, `<li>` becomes
 * a `- ` bullet and `<br>` a newline. Remaining tags are removed, then the
 * basic entities are decoded.
 */
function htmlToPlainText(html: string): string {
  // Tags are removed BEFORE entities are decoded, and `&amp;` is decoded last:
  // otherwise escaped text such as "&lt;b&gt;" would turn into a tag and be
  // deleted, and "&amp;lt;" would be decoded twice.
  const dropTags = (fragment: string): string => fragment.replace(/<[^>]+>/g, "").trim();

  return html
    .replace(/<h([1-6])\b[^>]*>(.*?)<\/h\1>/gis, (_, level: string, inner: string) =>
      "\n" + "#".repeat(Number(level)) + " " + dropTags(inner) + "\n")
    .replace(/<p\b[^>]*>(.*?)<\/p>/gis, (_, inner: string) => "\n" + dropTags(inner) + "\n")
    .replace(/<li\b[^>]*>(.*?)<\/li>/gis, "- $1\n")
    .replace(/<br\s*\/?>/gi, "\n")
    .replace(/<[^>]+>/g, "")
    .replace(/&lt;/g, "<")
    .replace(/&gt;/g, ">")
    .replace(/&nbsp;/g, " ")
    .replace(/&amp;/g, "&")
    .replace(/\n{3,}/g, "\n\n")
    .trim();
}

/** Removes markup tags, decodes `&lt;`, `&gt;` and `&amp;`, and trims the result. */
function stripTags(s: string): string {
  return s
    .replace(/<[^>]+>/g, "")
    .replace(/&lt;/g, "<")
    .replace(/&gt;/g, ">")
    .replace(/&amp;/g, "&") // last, so "&amp;lt;" yields "&lt;" rather than "<"
    .trim();
}

// ═══════════════════════════════════════════════════════════════════════════
//  Stats & Utilities
// ═══════════════════════════════════════════════════════════════════════════

/**
 * Counts structural Markdown elements and derives a quality estimate.
 *
 * The estimate starts at 72 and adds capped bonuses for headings, bold spans,
 * list items, tables, code fences and word count; it never exceeds 98.
 */
function computeStats(md: string) {
  const countMatches = (pattern: RegExp): number => (md.match(pattern) || []).length;

  const wordCount = md.split(/\s+/).filter(Boolean).length;
  const headings = countMatches(/^#{1,6}\s/gm);
  const boldItems = countMatches(/\*\*[^*]+\*\*/g);
  const listItems = countMatches(/^[-*+]\s/gm);
  const tableRows = countMatches(/^\|/gm);
  const codeBlocks = countMatches(/```/g) / 2; // two fences per block

  const bonus =
    Math.min(headings * 3, 12) +
    Math.min(boldItems, 10) +
    Math.min(listItems, 8) +
    (tableRows > 0 ? 4 : 0) +
    (codeBlocks > 0 ? 2 : 0) +
    Math.min(wordCount / 50, 10);
  const qualityEstimate = Math.min(98, 72 + bonus);

  return { wordCount, headings, boldItems, listItems, qualityEstimate };
}

/** Normalises line endings, strips trailing spaces/tabs and collapses 4+ newlines to 3. */
function cleanMarkdown(md: string): string {
  return md
    .replace(/\r\n/g, "\n")
    .replace(/[ \t]+$/gm, "")
    .replace(/\n{4,}/g, "\n\n\n")
    .trim();
}

/**
 * Classifies text as "ar", "en" or "mixed" from its Arabic and Latin letter
 * counts. The Arabic check runs first, so it wins whenever both ratios pass.
 */
function detectLanguage(text: string): string {
  const arabicChars = (text.match(/[\u0600-\u06FF]/g) || []).length;
  const latinChars = (text.match(/[a-zA-Z]/g) || []).length;
  if (arabicChars > latinChars * 0.6) return "ar";
  if (latinChars > arabicChars * 0.6) return "en";
  return "mixed";
}

/** Resolves after `ms` milliseconds. */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((resolve) => setTimeout(resolve, ms));
}

// Public entry point — enforces a 15-minute hard limit per conversion job
const CONVERSION_TIMEOUT_MS = 15 * 60 * 1000;

/**
 * Runs one conversion job and records a failure if it throws or times out.
 *
 * On failure both the conversion row and the file row are marked "failed".
 * Both updates are attempted even if one of them fails; the first update
 * error is then rethrown so the caller's `.catch` can log it.
 */
async function runConversion(conversionId: string, fileId: string, storagePath: string): Promise<void> {
  try {
    await withTimeout(
      runConversionCore(conversionId, fileId, storagePath),
      CONVERSION_TIMEOUT_MS,
      "تحويل الملف"
    );
  } catch (err) {
    const errorMessage = err instanceof Error ? err.message : "انتهت مهلة التحويل";
    const outcomes = await Promise.allSettled([
      db.update(conversionsTable)
        .set({ status: "failed", errorMessage })
        .where(eq(conversionsTable.id, conversionId)),
      db.update(filesTable)
        .set({ status: "failed", updatedAt: new Date() })
        .where(eq(filesTable.id, fileId)),
    ]);
    const rejected = outcomes.find((o) => o.status === "rejected");
    if (rejected && rejected.status === "rejected") throw rejected.reason;
  }
}

// ═══════════════════════════════════════════════════════════════════════════
//  Routes
// ═══════════════════════════════════════════════════════════════════════════

/**
 * Parses an optional page number sent by a client.
 *
 * @returns `null` when the value is absent, empty or ≤ 0 (meaning "not
 *          specified"), the floored number when it is ≥ 1, and `undefined`
 *          when it is not a finite number at all.
 */
function parsePageNumber(raw: unknown): number | null | undefined {
  if (raw === undefined || raw === null || raw === "") return null;
  const n = typeof raw === "number" ? raw : Number(raw);
  if (!Number.isFinite(n)) return undefined;
  return n >= 1 ? Math.floor(n) : null;
}

type SplitRange = { start: number | null; end: number | null; label: string };

/**
 * Parses the `ranges` form field of /upload-split.
 *
 * @returns The validated ranges (possibly empty), or `undefined` when the
 *          value is not a JSON array of objects with usable page numbers.
 */
function parseSplitRanges(rawJson: unknown): SplitRange[] | undefined {
  let parsed: unknown;
  try {
    parsed = JSON.parse(typeof rawJson === "string" && rawJson ? rawJson : "[]");
  } catch {
    return undefined;
  }
  if (!Array.isArray(parsed)) return undefined;

  const ranges: SplitRange[] = [];
  for (const [i, item] of parsed.entries()) {
    if (typeof item !== "object" || item === null) return undefined;
    const { start, end, label } = item as { start?: unknown; end?: unknown; label?: unknown };
    const first = parsePageNumber(start);
    const last = parsePageNumber(end);
    if (first === undefined || last === undefined) return undefined;
    ranges.push({
      start: first,
      end: last,
      // Fall back to the 1-based position so a part is never named "undefined".
      label: typeof label === "string" && label.trim() ? label : String(i + 1),
    });
  }
  return ranges;
}

/** Best-effort removal of an uploaded file that no database row will reference. */
function discardUpload(filePath: string): void {
  fs.rm(filePath, { force: true }, () => { /* best effort */ });
}

type JsonResponder = { status(code: number): { json(body: unknown): unknown } };

/** Logs an upload failure with its root cause and answers with HTTP 500. */
function reportUploadFailure(req: AuthRequest, res: JsonResponder, route: string, err: unknown): void {
  const e = err instanceof Error ? err : new Error(String(err));
  const cause = (e as NodeJS.ErrnoException & { cause?: Error }).cause;
  const rootMsg = cause?.message ?? e.message;
  console.error(`[RAQIM] /${route} error:`, rootMsg, "\n outer:", e.message, "\n stack:", e.stack);
  req.log?.error({ err, cause: cause?.message }, `${route} error`);
  res.status(500).json({ error: "server_error", message: rootMsg || "فشل الرفع" });
}

// POST /api/convert/upload
router.post("/upload", upload.single("file"), async (req: AuthRequest, res) => {
  try {
    const uploaded = req.file;
    if (!uploaded) {
      res.status(400).json({ error: "validation", message: "لم يتم رفع أي ملف" });
      return;
    }

    const { pageStart, pageEnd, folderId } = req.body ?? {};
    const firstPage = parsePageNumber(pageStart);
    const lastPage = parsePageNumber(pageEnd);
    if (firstPage === undefined || lastPage === undefined) {
      discardUpload(uploaded.path);
      res.status(400).json({ error: "validation", message: "نطاقات الصفحات غير صالحة" });
      return;
    }

    const originalName = fixFilename(uploaded.originalname);
    const fileName = path.parse(originalName).name;

    const [file] = await db
      .insert(filesTable)
      .values({
        name: fileName + ".md",
        ownerId: req.userId!,
        folderId: folderId || null,
        originalName,
        originalType: uploaded.mimetype,
        sizeBytes: uploaded.size,
        storagePath: uploaded.path,
        status: "queued",
      })
      .returning();

    const [conversion] = await db
      .insert(conversionsTable)
      .values({
        fileId: file.id,
        userId: req.userId!,
        status: "queued",
        progress: 0,
        steps: initSteps(),
        pageStart: firstPage,
        pageEnd: lastPage,
      })
      .returning();

    // Fire and forget: the job reports its own outcome through the database.
    runConversion(conversion.id, file.id, uploaded.path).catch((err) =>
      req.log?.error({ err }, "background conversion error")
    );

    res.status(202).json({
      jobId: conversion.id,
      fileId: file.id,
      status: "queued",
      progress: 0,
      steps: initSteps(),
      createdAt: conversion.createdAt,
    });
  } catch (err) {
    reportUploadFailure(req, res, "upload", err);
  }
});

// POST /api/convert/upload-split — upload once, create N conversion jobs
router.post("/upload-split", upload.single("file"), async (req: AuthRequest, res) => {
  try {
    const uploaded = req.file;
    if (!uploaded) {
      res.status(400).json({ error: "validation", message: "لم يتم رفع أي ملف" });
      return;
    }

    const { ranges: rangesJson, folderId } = req.body ?? {};
    const ranges = parseSplitRanges(rangesJson);
    if (ranges === undefined) {
      discardUpload(uploaded.path);
      res.status(400).json({ error: "validation", message: "نطاقات الصفحات غير صالحة" });
      return;
    }
    if (ranges.length === 0) {
      discardUpload(uploaded.path);
      res.status(400).json({ error: "validation", message: "يجب تحديد نطاق واحد على الأقل" });
      return;
    }

    const originalName = fixFilename(uploaded.originalname);
    const baseName = path.parse(originalName).name;
    const jobs: Array<{ jobId: string; fileId: string; name: string }> = [];

    for (const range of ranges) {
      const partName = `${baseName} — ${range.label}.md`;

      const [file] = await db
        .insert(filesTable)
        .values({
          name: partName,
          ownerId: req.userId!,
          folderId: folderId || null,
          originalName,
          originalType: uploaded.mimetype,
          sizeBytes: uploaded.size,
          storagePath: uploaded.path,
          status: "queued",
        })
        .returning();

      const [conversion] = await db
        .insert(conversionsTable)
        .values({
          fileId: file.id,
          userId: req.userId!,
          status: "queued",
          progress: 0,
          steps: initSteps(),
          pageStart: range.start,
          pageEnd: range.end,
        })
        .returning();

      runConversion(conversion.id, file.id, uploaded.path).catch((err) =>
        req.log?.error({ err }, "split conversion error")
      );

      jobs.push({ jobId: conversion.id, fileId: file.id, name: partName });
    }

    res.status(202).json({ jobs });
  } catch (err) {
    reportUploadFailure(req, res, "upload-split", err);
  }
});

// POST /api/convert
router.post("/", async (req: AuthRequest, res) => {
  try {
    const { fileId, pageStart, pageEnd } = req.body ?? {};
    if (typeof fileId !== "string" || fileId === "") {
      res.status(400).json({ error: "validation", message: "معرّف الملف مطلوب" });
      return;
    }
    const firstPage = parsePageNumber(pageStart);
    const lastPage = parsePageNumber(pageEnd);
    if (firstPage === undefined || lastPage === undefined) {
      res.status(400).json({ error: "validation", message: "نطاقات الصفحات غير صالحة" });
      return;
    }

    // Ownership is part of the lookup, so other users' files read as "not found".
    const file = await db.query.filesTable.findFirst({
      where: and(eq(filesTable.id, fileId), eq(filesTable.ownerId, req.userId!)),
    });
    if (!file || !file.storagePath) {
      res.status(404).json({ error: "not_found", message: "الملف غير موجود" });
      return;
    }

    const [conversion] = await db
      .insert(conversionsTable)
      .values({
        fileId: file.id,
        userId: req.userId!,
        status: "queued",
        progress: 0,
        steps: initSteps(),
        pageStart: firstPage,
        pageEnd: lastPage,
      })
      .returning();

    runConversion(conversion.id, file.id, file.storagePath).catch((err) =>
      req.log?.error({ err }, "background conversion error")
    );

    res.status(202).json({
      jobId: conversion.id,
      fileId,
      status: "queued",
      progress: 0,
      steps: initSteps(),
      createdAt: conversion.createdAt,
    });
  } catch (err) {
    req.log?.error({ err }, "convert error");
    res.status(500).json({ error: "server_error", message: "فشل التحويل" });
  }
});

// GET /api/convert/:jobId
router.get("/:jobId", async (req: AuthRequest, res) => {
  try {
    const jobId = req.params.jobId as string;
    // Scoped to the requesting user, so other users' jobs read as "not found".
    const conv = await db.query.conversionsTable.findFirst({
      where: and(eq(conversionsTable.id, jobId), eq(conversionsTable.userId, req.userId!)),
    });
    if (!conv) {
      res.status(404).json({ error: "not_found", message: "المهمة غير موجودة" });
      return;
    }
    res.json({
      jobId: conv.id,
      fileId: conv.fileId,
      status: conv.status,
      progress: conv.progress,
      steps: conv.steps,
      queuePosition: null,
      elapsedSeconds: conv.elapsedSeconds,
      estimatedSeconds: conv.estimatedSeconds,
      errorMessage: conv.errorMessage,
      createdAt: conv.createdAt,
    });
  } catch (err) {
    req.log?.error({ err }, "get conversion error");
    res.status(500).json({ error: "server_error", message: "فشل جلب الحالة" });
  }
});

export default router;