Spaces:
Sleeping
Sleeping
| import { Router } from "express"; | |
| import multer from "multer"; | |
| import path from "path"; | |
| import fs from "fs"; | |
| import { createRequire } from "module"; | |
| import { db } from "@workspace/db"; | |
| import { filesTable, conversionsTable } from "@workspace/db"; | |
| import { eq, and } from "drizzle-orm"; | |
| import { requireAuth, AuthRequest } from "../middlewares/auth.js"; | |
| import { logger } from "../lib/logger.js"; | |
// CommonJS-style require() bound to this module's URL, used for resolving
// peer package paths (works in ESM + esbuild bundles).
const _require = createRequire(import.meta.url);

// Router for this module; requireAuth is installed before any route is added.
const router = Router();
router.use(requireAuth);

/**
 * Directory that receives uploaded files.
 *
 * Production uses /data/uploads (the persistent HF Spaces volume);
 * /tmp/uploads is a tmpfs that starts empty at container boot, so it is
 * unreliable. Every other environment uses ./uploads under the current
 * working directory.
 */
const uploadDir = (() => {
  if (process.env.NODE_ENV === "production") {
    return "/data/uploads";
  }
  return path.join(process.cwd(), "uploads");
})();

// Best effort: a failure is logged and module loading continues.
try {
  fs.mkdirSync(uploadDir, { recursive: true });
} catch (mkdirError) {
  console.error("[RAQIM] Failed to create upload dir:", uploadDir, mkdirError);
}
/**
 * Repairs an uploaded filename whose UTF-8 bytes were decoded as Latin-1
 * (the existing note in this file says Multer does this by default).
 *
 * The repair is applied only when it is provably safe:
 *  - if `raw` contains any code unit above 0xFF it cannot be a Latin-1
 *    mis-decoding (it is already real Unicode), so it is returned untouched;
 *  - if the recovered bytes are not valid UTF-8 (e.g. a genuine Latin-1 name
 *    such as "café.txt"), `raw` is returned instead of a string containing
 *    U+FFFD replacement characters.
 *
 * @param raw filename as received from the multipart parser
 * @returns the UTF-8 interpretation of `raw`, or `raw` itself when no safe
 *          re-interpretation exists
 */
function fixFilename(raw: string): string {
  for (let i = 0; i < raw.length; i++) {
    // Buffer.from(..., "latin1") keeps only the low byte of each code unit,
    // so re-decoding such a string would destroy it.
    if (raw.charCodeAt(i) > 0xff) return raw;
  }
  try {
    // fatal: throw on malformed UTF-8 instead of substituting U+FFFD.
    // ignoreBOM: keep a leading U+FEFF, matching Buffer#toString("utf8").
    const decoder = new TextDecoder("utf-8", { fatal: true, ignoreBOM: true });
    return decoder.decode(Buffer.from(raw, "latin1"));
  } catch {
    return raw;
  }
}
// Disk storage: files land in uploadDir and are named
// "<epoch-millis>-<original name re-decoded by fixFilename>".
const storage = multer.diskStorage({
  destination: uploadDir,
  filename(_req, file, done) {
    const storedName = `${Date.now()}-${fixFilename(file.originalname)}`;
    done(null, storedName);
  },
});

// Upload middleware; a single file may be at most 500 MiB.
const upload = multer({
  storage,
  limits: {
    fileSize: 500 * 1024 * 1024,
  },
});
/**
 * Ordered stages of the conversion pipeline. `name` is the machine-readable
 * stage id and `label` is the Arabic text stored with each step.
 */
const CONVERSION_STEPS: Array<{ name: string; label: string }> = [
  {
    name: "analyzing",
    label: "تحليل الملف والتعرف على نوعه",
  },
  {
    name: "routing",
    label: "توجيه ذكي لأنسب محركات المعالجة",
  },
  {
    name: "ocr",
    label: "استخراج النص الخام (OCR / Parser)",
  },
  {
    name: "layout",
    label: "المهندس الذكي — إعادة بناء التنسيق",
  },
  {
    name: "scoring",
    label: "تقييم الجودة وإحصاء العناصر",
  },
  {
    name: "merging",
    label: "دمج الطبقات ومعالجة الهيكل النهائي",
  },
  {
    name: "cleanup",
    label: "تنظيف وتلميع المستند",
  },
];

/**
 * Builds a fresh step list for one conversion run.
 *
 * @returns a new array with one independent copy of every entry of
 *          CONVERSION_STEPS, each carrying `status: "pending"`
 */
function initSteps() {
  return Array.from(CONVERSION_STEPS, (step) => ({ ...step, status: "pending" }));
}
/**
 * Races `promise` against a deadline.
 *
 * @param promise the work to wait for (it is not cancelled on timeout)
 * @param ms      deadline in milliseconds
 * @param label   text appended to the timeout error message
 * @returns a promise that settles like `promise`, or rejects with an Error
 *          if `ms` elapses first; the timer is cleared either way
 */
function withTimeout<T>(promise: Promise<T>, ms: number, label: string): Promise<T> {
  let timer: ReturnType<typeof setTimeout> | undefined;
  const deadline = new Promise<never>((_resolve, reject) => {
    timer = setTimeout(() => {
      reject(new Error(`تجاوز الوقت المحدد: ${label}`));
    }, ms);
  });
  return Promise.race([promise, deadline]).finally(() => {
    clearTimeout(timer);
  });
}
/**
 * Runs the conversion pipeline for one uploaded file and persists progress
 * and the final result.
 *
 * Stages (mirroring CONVERSION_STEPS): analyzing → routing → text extraction
 * (chosen by file extension) → rule-based structuring → scoring → merging →
 * cleanup. After each stage the conversions row is updated; on success the
 * files row receives the Markdown, word count, quality score and language.
 *
 * Failure handling: any error thrown inside the pipeline marks the current
 * step and both rows as "failed" and stores the error message. Both failure
 * writes are always attempted; if one of them fails, that error is rethrown
 * after the other has been tried.
 *
 * @param conversionId id of the conversions row to update
 * @param fileId       id of the files row to update
 * @param storagePath  path of the uploaded file on disk
 */
async function runConversionCore(conversionId: string, fileId: string, storagePath: string) {
  // Characters kept from a file of unknown type.
  const FALLBACK_MAX_CHARS = 100000;

  const steps = initSteps();
  let stepIndex = 0;
  // Highest progress value written so far; keeps the reported value monotonic.
  let lastProgress = 0;
  const startTime = Date.now();
  const elapsedSeconds = (): number => Math.floor((Date.now() - startTime) / 1000);

  // Read page range set at upload time
  const convRecord = await db.query.conversionsTable.findFirst({
    where: eq(conversionsTable.id, conversionId),
  });
  const pageStart = convRecord?.pageStart ?? undefined;
  const pageEnd = convRecord?.pageEnd ?? undefined;

  // Persists the current stage. `aiMessage` is stored in the errorMessage
  // column, which this pipeline also uses for live status text.
  const updateProgress = async (
    status: string,
    progress: number,
    stepsDone: typeof steps,
    aiMessage?: string
  ) => {
    // Never move the bar backwards and never store NaN/Infinity: the PDF
    // branch can reach 55 before the layout stage reports 45.
    if (Number.isFinite(progress)) {
      lastProgress = Math.max(lastProgress, progress);
    }
    await db
      .update(conversionsTable)
      .set({
        // eslint-disable-next-line @typescript-eslint/no-explicit-any -- column enum type is not visible here
        status: status as any,
        progress: lastProgress,
        steps: stepsDone,
        elapsedSeconds: elapsedSeconds(),
        ...(aiMessage ? { errorMessage: aiMessage } : {}),
      })
      .where(eq(conversionsTable.id, conversionId));
  };

  // Marks a step as running and remembers it as the one to blame on failure.
  const startStep = (index: number): void => {
    stepIndex = index;
    const step = steps[index];
    if (step) step.status = "running";
  };
  const finishStep = (index: number): void => {
    const step = steps[index];
    if (step) step.status = "done";
  };

  try {
    const ext = path.extname(storagePath).toLowerCase();
    let rawText = "";

    // ── Step 1: Analyzing ───────────────────────────────────────────────
    startStep(0);
    await updateProgress("analyzing", 5, steps, "جاري تحليل نوع الملف والبنية الداخلية...");
    await sleep(600);
    finishStep(0);

    // ── Step 2: Routing ─────────────────────────────────────────────────
    startStep(1);
    await updateProgress("routing", 12, steps, "اختيار أنسب محرك استخراج للملف...");
    await sleep(400);
    finishStep(1);

    // ── Step 3: OCR / Text Extraction ───────────────────────────────────
    startStep(2);
    await updateProgress("ocr", 20, steps, "جاري استخراج النص من الملف...");
    if ([".txt", ".md"].includes(ext)) {
      rawText = await fs.promises.readFile(storagePath, "utf-8");
    } else if (ext === ".pdf") {
      rawText = await extractPdf(storagePath, pageStart, pageEnd);
      await updateProgress("ocr", 28, steps, "تم استخراج النص الخام من الـ PDF...");
      // If text appears garbled (broken ToUnicode CMap in font), fall back to
      // rendering each page as an image and running OCR on it.
      if (isGarbledArabic(rawText)) {
        await updateProgress("ocr", 30, steps, "تم رصد خلل في ترميز الخط — جاري استخدام OCR للحصول على نص دقيق...");
        const ocrText = await extractPdfViaOcr(storagePath, pageStart, pageEnd,
          (done, total) => {
            // Guard against total === 0, which would otherwise yield NaN.
            const fraction = total > 0 ? done / total : 0;
            return updateProgress("ocr", 30 + Math.round(fraction * 20), steps,
              `جاري تحليل الصفحات بواسطة OCR... (${done}/${total})`);
          }
        );
        // Only trust the OCR result when it produced a meaningful amount of text.
        if (ocrText.length > 50) {
          rawText = ocrText;
          await updateProgress("ocr", 50, steps, "تم استخراج النص بواسطة OCR بدقة عالية ✓");
        }
      }
      // Optional AI polish of the extracted text.
      rawText = await correctArabicText(rawText, (msg, pct) =>
        updateProgress("ocr", pct, steps, msg)
      );
      await updateProgress("ocr", 55, steps, "اكتمل استخراج النص العربي ✓");
    } else if ([".docx", ".doc"].includes(ext)) {
      rawText = await extractDocx(storagePath);
      await updateProgress("ocr", 38, steps, "تم استخراج نص ملف Word...");
    } else if ([".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".webp", ".gif"].includes(ext)) {
      rawText = await extractImage(storagePath);
      await updateProgress("ocr", 38, steps, "تم استخراج النص من الصورة بتقنية OCR...");
    } else if ([".xlsx", ".xls", ".csv"].includes(ext)) {
      rawText = await extractSpreadsheet(storagePath, ext);
      await updateProgress("ocr", 38, steps, "تم تحليل جداول البيانات...");
    } else if ([".html", ".htm"].includes(ext)) {
      const html = await fs.promises.readFile(storagePath, "utf-8");
      rawText = htmlToPlainText(html);
      await updateProgress("ocr", 38, steps, "تم تحليل ملف HTML...");
    } else if ([".pptx", ".ppt"].includes(ext)) {
      rawText = await extractPptx(storagePath);
      await updateProgress("ocr", 38, steps, "تم استخراج نصوص الشرائح...");
    } else if ([".epub"].includes(ext)) {
      rawText = await extractEpub(storagePath);
      await updateProgress("ocr", 38, steps, "تم استخراج نصوص الكتاب الإلكتروني...");
    } else {
      // Unknown type: keep the first FALLBACK_MAX_CHARS characters of the file
      // read as UTF-8. Only the leading bytes are read; 4 bytes per character
      // is the UTF-8 maximum, so the prefix always covers the characters kept.
      try {
        const handle = await fs.promises.open(storagePath, "r");
        try {
          const buffer = Buffer.alloc(FALLBACK_MAX_CHARS * 4);
          const { bytesRead } = await handle.read(buffer, 0, buffer.length, 0);
          rawText = buffer.toString("utf-8", 0, bytesRead).substring(0, FALLBACK_MAX_CHARS);
        } finally {
          await handle.close();
        }
      } catch {
        rawText = `# ملف ثنائي\n\nلا يمكن استخراج نص من هذا النوع من الملفات مباشرة.`;
      }
    }
    finishStep(2);

    // ── Step 4: Rule-Based Architect ────────────────────────────────────
    startStep(3);
    await updateProgress("layout", 45, steps, "المهندس الذكي يعيد بناء هيكل المستند...");
    const architectMarkdown = runRuleBasedArchitect(rawText, ext);
    await updateProgress("layout", 68, steps, "اكتمل تحليل وهيكلة المستند");
    finishStep(3);

    // ── Step 5: Scoring ─────────────────────────────────────────────────
    startStep(4);
    await updateProgress("scoring", 75, steps, "جاري قياس الجودة وإحصاء العناصر...");
    const stats = computeStats(architectMarkdown);
    await sleep(400);
    finishStep(4);

    // ── Step 6: Merging ─────────────────────────────────────────────────
    startStep(5);
    await updateProgress("merging", 85, steps, "دمج الطبقات وتثبيت الهيكل النهائي...");
    await sleep(350);
    finishStep(5);

    // ── Step 7: Cleanup ─────────────────────────────────────────────────
    startStep(6);
    await updateProgress("cleanup", 93, steps, "التلميع النهائي والتحقق من سلامة النص...");
    const finalMarkdown = cleanMarkdown(architectMarkdown);
    await sleep(300);
    finishStep(6);

    // ── Done ─────────────────────────────────────────────────────────────
    // The stored score is clamped to the 72–98 range.
    const qualityScore = Math.min(98, Math.max(72, stats.qualityEstimate));
    await db
      .update(filesTable)
      .set({
        markdownContent: finalMarkdown,
        originalMarkdown: finalMarkdown,
        status: "done",
        wordCount: stats.wordCount,
        qualityScore,
        language: detectLanguage(finalMarkdown),
        updatedAt: new Date(),
      })
      .where(eq(filesTable.id, fileId));
    await db
      .update(conversionsTable)
      .set({
        status: "done",
        progress: 100,
        steps,
        completedAt: new Date(),
        elapsedSeconds: elapsedSeconds(),
        // Clears the live status text that was stored in this column.
        errorMessage: null,
      })
      .where(eq(conversionsTable.id, conversionId));
  } catch (err) {
    const error = err instanceof Error ? err.message : "Unknown error";
    const failedStep = steps[stepIndex];
    if (failedStep) failedStep.status = "failed";

    // Attempt BOTH failure writes even if the first one throws, so a file is
    // not left looking "in progress" just because the other row could not be
    // updated.
    let persistError: unknown;
    let persistFailed = false;
    try {
      await db
        .update(conversionsTable)
        .set({ status: "failed", steps, errorMessage: error, elapsedSeconds: elapsedSeconds() })
        .where(eq(conversionsTable.id, conversionId));
    } catch (writeError) {
      persistFailed = true;
      persistError = writeError;
    }
    try {
      await db
        .update(filesTable)
        .set({ status: "failed", updatedAt: new Date() })
        .where(eq(filesTable.id, fileId));
    } catch (writeError) {
      if (!persistFailed) persistError = writeError;
      persistFailed = true;
    }
    if (persistFailed) {
      // The pipeline error could not be stored; keep it visible in the logs.
      console.error("[RAQIM] Failed to persist conversion failure:", conversionId, err);
      throw persistError;
    }
  }
}
// ═══════════════════════════════════════════════════════════════════════════
// RULE-BASED ARCHITECT — 100% Free, No External APIs, No Limits
// Handles Arabic academic documents, exams, books, and general text
// ═══════════════════════════════════════════════════════════════════════════
/**
 * Turns extracted raw text into structured Markdown using local rules only.
 *
 * - Empty or whitespace-only input yields an "empty document" placeholder.
 * - Input with fewer than 10 non-padding characters is returned unchanged.
 * - Otherwise the Arabic formatter is used when the number of characters in
 *   U+0600–U+06FF is at least 40% of the number of ASCII letters, and the
 *   Latin formatter is used in all other cases.
 *
 * @param rawText text produced by one of the extractors
 * @param _ext    file extension; currently unused
 * @returns Markdown text
 */
function runRuleBasedArchitect(rawText: string, _ext: string): string {
  const trimmed = rawText.trim();
  if (!trimmed) {
    // Covers "" as well as strings made only of spaces/newlines.
    return "# مستند فارغ\n\nلم يتم اكتشاف محتوى نصي في هذا الملف.";
  }
  if (trimmed.length < 10) {
    // Too short to structure; hand it back as-is.
    return rawText;
  }
  const arabicChars = (rawText.match(/[\u0600-\u06FF]/g) || []).length;
  const latinChars = (rawText.match(/[a-zA-Z]/g) || []).length;
  return arabicChars >= latinChars * 0.4
    ? formatArabicDocument(rawText)
    : formatLatinDocument(rawText);
}
// ── Helpers ─────────────────────────────────────────────────────────────────
/**
 * Normalises one extracted line.
 *
 * Removes, in order: ASCII control characters (tab, LF and CR are kept),
 * zero-width / bidi control characters, and a fixed set of bullet and shape
 * glyphs. Afterwards every run of two or more whitespace characters becomes a
 * single space and the result is trimmed.
 *
 * @param line a single line of raw text
 * @returns the cleaned line (possibly empty)
 */
function cleanOcrLine(line: string): string {
  const removals: RegExp[] = [
    /[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]/g,
    // Strip Unicode bidi / directional control chars that pdfjs embeds from broken-CMap fonts
    /[\u200B-\u200F\u202A-\u202E\u2066-\u2069\uFEFF]/g,
    /[□■▪▫▶◀►◄▲▼◆◇○●★☆✓✗✦✧]/g,
  ];
  let cleaned = line;
  for (const pattern of removals) {
    cleaned = cleaned.replace(pattern, "");
  }
  cleaned = cleaned.replace(/\s{2,}/g, " ");
  return cleaned.trim();
}
// Field names that introduce a metadata line.
const META_LINE_KEYS: string[] = [
  "المادة",
  "الزمن",
  "النموذج",
  "التاريخ",
  "الصف",
  "الشعبة",
  "المدرسة",
  "اسم الطالب",
  "الاسم",
  "الفصل",
  "المرحلة",
  "الفرقة",
  "الدراسي",
  "الفصل الدراسي",
  "المستوى",
  "المجموعة",
];

// One of the keys at the very start of the line, optional whitespace, then a colon.
const META_LINE_PATTERN = new RegExp("^(" + META_LINE_KEYS.join("|") + ")\\s*[::]", "i");

/**
 * Tells whether `line` starts with a known metadata field name followed by a
 * colon (e.g. "المادة: رياضيات"). The match is anchored at the first
 * character, so leading whitespace prevents a match.
 */
function isMetaLine(line: string): boolean {
  return META_LINE_PATTERN.test(line);
}
/**
 * Detects a section-opening line.
 *
 * Matches either an Arabic ordinal adverb (first … tenth, with or without
 * tanween) that is followed by a dash, colon, Arabic comma or whitespace, or
 * an English "Part / Section / Chapter / Unit" followed by whitespace and a
 * Roman numeral, capital letter or digit (case-insensitive).
 */
function isSectionMarker(line: string): boolean {
  const arabicOrdinal =
    /^(أولاً|أولا|ثانياً|ثانيا|ثالثاً|ثالثا|رابعاً|رابعا|خامساً|خامسا|سادساً|سادسا|سابعاً|سابعا|ثامناً|ثامنا|تاسعاً|تاسعا|عاشراً|عاشرا)\s*[-:،\s]/;
  const latinDivision = /^(Part|Section|Chapter|Unit)\s+[IVXivxA-Z\d]+/i;
  return arabicOrdinal.test(line) || latinDivision.test(line);
}
/**
 * Detects a line that opens a question. A line qualifies when it matches any
 * one of the patterns below (Arabic "س"/"سؤال"/"السؤال" + number, a number
 * in parentheses, a number followed by a dash and at least 8 characters, or
 * English "Q<n>" / "Question <n>"). Digits may be Western or Arabic-Indic.
 */
function isQuestion(line: string): boolean {
  const questionOpeners: RegExp[] = [
    // Arabic question starters
    /^سـ?\s*[\d\u0660-\u0669]+\s*[-:)،\s]/,
    /^سؤال\s+[\d\u0660-\u0669]+/i,
    /^السؤال\s+[\d\u0660-\u0669]+/i,
    /^س\s*[\d\u0660-\u0669]+\s*[-:)،]/,
    // Numbered with parens: (١) or (1)
    /^\([\d\u0660-\u0669]+\)\s+\S/,
    // Numbered with dash: "١- " or "1- " when followed by substantial content
    /^[\u0660-\u0669\d]+\s*[-–—]\s+.{8,}/,
    // English
    /^Q\s*\d+\s*[-:.)]/i,
    /^Question\s+\d+/i,
  ];
  return questionOpeners.some((opener) => opener.test(line));
}
// Labels that introduce an answer / explanation / note line.
const KEYWORD_LINE_LABELS: string[] = [
  "التعليل",
  "الإجابة",
  "الإجابه",
  "المطلوب",
  "الحل",
  "الشرح",
  "الدليل",
  "السبب",
  "العلة",
  "ملاحظة",
  "ملاحظه",
  "تنبيه",
  "الفائدة",
  "المقصود",
  "المراد",
  "الاستنتاج",
  "التحليل",
  "التفسير",
  "النتيجة",
  "الخلاصة",
];

// One of the labels at the very start of the line, optional whitespace, then a colon.
const KEYWORD_LINE_PATTERN = new RegExp("^(" + KEYWORD_LINE_LABELS.join("|") + ")\\s*[::]", "i");

/**
 * Tells whether `line` starts with a known label followed by a colon
 * (e.g. "الإجابة: ..."). The match is anchored at the first character.
 */
function isKeywordLine(line: string): boolean {
  return KEYWORD_LINE_PATTERN.test(line);
}
/**
 * Heuristic: is `line` a short, free-standing line that should become a
 * sub-heading?
 *
 * A candidate is 3–80 characters long, is not already a Markdown heading,
 * does not start with a list marker or ASCII digit, is not a sentence
 * (longer than 30 characters and ending in punctuation), and has a blank
 * line — or the document edge — both before and after it.
 *
 * @param line      the line under test
 * @param lineIndex index of `line` within `lines`
 * @param lines     all lines of the document
 */
function isHeadingCandidate(line: string, lineIndex: number, lines: string[]): boolean {
  const lengthInRange = line.length >= 3 && line.length <= 80;
  if (!lengthInRange) return false;

  const isMarkdownHeading = /^#{1,6}\s/.test(line);
  const startsLikeListOrNumber = /^[-*+\d]/.test(line);
  if (isMarkdownHeading || startsLikeListOrNumber) return false;

  const endsLikeSentence = /[.،!؟?]$/.test(line);
  if (endsLikeSentence && line.length > 30) return false;

  const blankBefore = lineIndex === 0 || lines[lineIndex - 1].trim() === "";
  if (!blankBefore) return false;

  const blankAfter = lineIndex >= lines.length - 1 || lines[lineIndex + 1].trim() === "";
  return blankAfter;
}
/**
 * Expands inline multiple-choice options into a vertical Markdown list.
 *
 * Four marker styles are recognised, tried in this order:
 *   1. "أ- …  ب- …"   (Arabic letter + dash)
 *   2. "(أ) … (ب) …"  (Arabic letter in parentheses)
 *   3. "أ) …  ب) …"   (Arabic letter + closing parenthesis)
 *   4. "a) …  b) …"   (English letter + closing parenthesis, any case)
 *
 * Detection works on marker POSITIONS: each option's text is everything
 * between its marker and the next one, so option text may contain any
 * characters (including the letters أ ب ج د and hyphens) and no option is
 * ever dropped. To limit false positives, un-parenthesised markers must
 * stand at the start of the line or after whitespace, a style needs at least
 * two markers, marker letters must appear in strictly ascending order, and
 * every option must have non-empty text.
 *
 * Text preceding the first marker (a question stem) is preserved as its own
 * paragraph above the list.
 *
 * @param line one line of text; cleaned or raw, runs of whitespace inside
 *             the emitted text are collapsed
 * @returns the list (optionally preceded by the stem and a blank line), or
 *          null when the line does not contain inline choices
 */
function expandMultipleChoice(line: string): string | null {
  type ChoiceStyle = {
    // Global regex; capture group 1 is the choice letter.
    marker: RegExp;
    // Letters in the order they are expected to appear.
    alphabet: string;
    render: (letter: string, text: string) => string;
  };

  const ARABIC_LETTERS = "أبجد";
  const styles: ChoiceStyle[] = [
    {
      marker: /(?:^|\s)([أبجد])\s*[-–—]\s*/g,
      alphabet: ARABIC_LETTERS,
      render: (letter, text) => `- ${letter}- ${text}`,
    },
    {
      marker: /\(([أبجد])\)\s*/g,
      alphabet: ARABIC_LETTERS,
      render: (letter, text) => `- (${letter}) ${text}`,
    },
    {
      marker: /(?:^|\s)([أبجد])\)\s*/g,
      alphabet: ARABIC_LETTERS,
      render: (letter, text) => `- ${letter}) ${text}`,
    },
    {
      marker: /(?:^|\s)([a-d])\)\s+/gi,
      alphabet: "abcd",
      render: (letter, text) => `- ${letter.toLowerCase()}) ${text}`,
    },
  ];

  const squash = (text: string): string => text.replace(/\s+/g, " ").trim();

  for (const style of styles) {
    const hits = [...line.matchAll(style.marker)];
    if (hits.length < 2) continue;

    // Letters must be strictly ascending (أ < ب < ج < د / a < b < c < d).
    const ranks = hits.map((hit) => style.alphabet.indexOf(hit[1].toLowerCase()));
    const ascending = ranks.every((rank, k) => rank >= 0 && (k === 0 || rank > ranks[k - 1]));
    if (!ascending) continue;

    const choices = hits.map((hit, k) => {
      const textStart = (hit.index ?? 0) + hit[0].length;
      const textEnd = k + 1 < hits.length ? hits[k + 1].index ?? line.length : line.length;
      return { letter: hit[1], text: squash(line.slice(textStart, textEnd)) };
    });
    // An empty option means this is not a well-formed choice line.
    if (choices.some((choice) => choice.text === "")) continue;

    const stem = squash(line.slice(0, hits[0].index ?? 0));
    const list = choices.map((choice) => style.render(choice.letter, choice.text)).join("\n");
    return stem ? `${stem}\n\n${list}` : list;
  }
  return null;
}
// ── Arabic document formatter ────────────────────────────────────────────────
/**
 * Extracts all key/value pairs from a metadata line that may hold several
 * fields, e.g.
 *   "المادة: رياضيات الزمن: ساعة النموذج: أ"
 *   → [["المادة","رياضيات"],["الزمن","ساعة"],["النموذج","أ"]]
 *
 * A field starts wherever `isMetaLine` recognises a known key followed by a
 * colon, at the start of the line or right after whitespace / "|" / "،" /
 * ",". Everything up to the next field start is that field's value, so
 * values may themselves contain commas or extra words without being
 * truncated. Fields may be separated by one or more spaces, tabs or the
 * separator characters above.
 *
 * Control and bidi characters are removed first so they can neither hide a
 * key from detection nor end up in a table cell.
 *
 * @param line a metadata line (raw or cleaned)
 * @returns `[key, value]` pairs in source order; if no known key is found the
 *          whole line is split at its first colon; empty if there is no colon
 */
function splitMetaFields(line: string): Array<[string, string]> {
  const text = line
    .replace(/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]/g, "")
    .replace(/[\u200B-\u200F\u202A-\u202E\u2066-\u2069\uFEFF]/g, "")
    .trim();

  const isSeparator = (ch: string): boolean => /[\s|،,]/.test(ch);

  // Collect the offsets at which a new "key:" begins.
  const starts: number[] = [];
  for (let pos = 0; pos < text.length; pos++) {
    const atBoundary = pos === 0 || isSeparator(text[pos - 1]);
    if (!atBoundary || isSeparator(text[pos])) continue;
    if (!isMetaLine(text.slice(pos))) continue;
    // A multi-word key such as "الفصل الدراسي" contains a shorter key as its
    // second word; only accept a new field once the previous one has its colon.
    const previous = starts[starts.length - 1];
    if (previous !== undefined && !text.slice(previous, pos).includes(":")) continue;
    starts.push(pos);
  }

  const pairs: Array<[string, string]> = [];
  starts.forEach((start, k) => {
    const end = k + 1 < starts.length ? starts[k + 1] : text.length;
    const field = text.slice(start, end);
    const colonAt = field.indexOf(":");
    if (colonAt <= 0) return;
    const key = field.slice(0, colonAt).trim();
    const value = field
      .slice(colonAt + 1)
      .replace(/^[\s|،,]+|[\s|،,]+$/g, "") // drop separators left at either edge
      .replace(/\s+/g, " ");
    if (key) pairs.push([key, value]);
  });

  // Fallback: treat whole line as single field
  if (pairs.length === 0) {
    const colonAt = text.indexOf(":");
    if (colonAt > 0) {
      pairs.push([text.slice(0, colonAt).trim(), text.slice(colonAt + 1).trim()]);
    }
  }
  return pairs;
}
/**
 * Formats Arabic (or mixed) extracted text as Markdown.
 *
 * 1. Metadata: lines among the first 15 that `isMetaLine` accepts are
 *    rendered as a two-column table at the top of the output. Only those
 *    lines are consumed — any other line above or between them stays in the
 *    document.
 * 2. Title: when metadata was found, the first remaining non-empty line is
 *    promoted to an H1 if it is 4–99 characters long and is not a question,
 *    section marker, metadata line, list item or heading.
 * 3. Main pass, per remaining line, first match wins: existing Markdown
 *    heading → kept; section marker → H2; question → bold; keyword line →
 *    kept on its own line; inline choices → vertical list; list item → kept;
 *    isolated short line → H3; anything else → plain text.
 *
 * Runs of blank lines are collapsed and the result is trimmed.
 *
 * @param text raw extracted text
 * @returns Markdown
 */
function formatArabicDocument(text: string): string {
  const rawLines = text.replace(/\r\n/g, "\n").replace(/\r/g, "\n").split("\n");
  const lines = rawLines.map(cleanOcrLine);
  const output: string[] = [];

  // Inserts a separating blank line unless the output is empty or already ends with one.
  const ensureBlankLine = (): void => {
    if (output.length > 0 && output[output.length - 1] !== "") output.push("");
  };
  // A literal "|" inside a cell would otherwise start a new table column.
  const escapeCell = (value: string): string => value.replace(/\|/g, "\\|");

  // ── Detect and render metadata block from first 15 lines ──
  const metaIndices: number[] = [];
  for (let j = 0; j < Math.min(15, lines.length); j++) {
    if (lines[j] && isMetaLine(lines[j])) metaIndices.push(j);
  }

  // Indices already rendered (metadata rows, title) that the main pass must skip.
  const consumed = new Set<number>();

  // Each detected meta line may contain multiple inline fields.
  // rawLines is used to preserve double-space separators.
  if (metaIndices.length >= 1) {
    const allPairs: Array<[string, string]> = [];
    for (const idx of metaIndices) {
      for (const pair of splitMetaFields(rawLines[idx] || "")) allPairs.push(pair);
    }
    if (allPairs.length > 0) {
      output.push("| الحقل | القيمة |");
      output.push("| --- | --- |");
      for (const [k, v] of allPairs) output.push(`| ${escapeCell(k)} | ${escapeCell(v)} |`);
      output.push("");
      for (const idx of metaIndices) consumed.add(idx);
    }
  }

  // ── Check first content line for document title ──
  const titleIndex = lines.findIndex((entry, idx) => entry !== "" && !consumed.has(idx));
  if (titleIndex !== -1) {
    const candidate = lines[titleIndex];
    const isTitle =
      candidate.length > 3 &&
      candidate.length < 100 &&
      !isQuestion(candidate) &&
      !isSectionMarker(candidate) &&
      !isMetaLine(candidate) &&
      !candidate.startsWith("-") &&
      !candidate.startsWith("#");
    // Only promote to title if metadata was found (strong signal)
    if (isTitle && metaIndices.length > 0) {
      output.push(`# ${candidate}`);
      output.push("");
      consumed.add(titleIndex);
    }
  }

  // ── Main pass ──
  for (let i = 0; i < lines.length; i++) {
    if (consumed.has(i)) continue;
    const line = lines[i].trim();

    if (!line) {
      ensureBlankLine();
      continue;
    }
    // Already a Markdown heading — keep as-is
    if (/^#{1,6}\s/.test(line)) {
      ensureBlankLine();
      output.push(line);
      output.push("");
      continue;
    }
    // Section markers: أولاً / ثانياً / Part I
    if (isSectionMarker(line)) {
      ensureBlankLine();
      output.push(`## ${line}`);
      output.push("");
      continue;
    }
    // Question detection
    if (isQuestion(line)) {
      ensureBlankLine();
      output.push(`**${line}**`);
      output.push("");
      continue;
    }
    // Keyword lines: التعليل: / الإجابة: / المطلوب:
    if (isKeywordLine(line)) {
      ensureBlankLine();
      output.push(line);
      continue;
    }
    // Inline multiple choice → vertical list. The CLEANED line is used so
    // control / bidi characters stripped by cleanOcrLine cannot reappear in
    // the output; the detectors accept single-space separators.
    const expanded = expandMultipleChoice(line);
    if (expanded) {
      ensureBlankLine();
      output.push(expanded);
      output.push("");
      continue;
    }
    // Already-formatted list items
    if (/^[-*+]\s/.test(line) || /^\d+\.\s/.test(line)) {
      output.push(line);
      continue;
    }
    // Lone short line surrounded by blanks → subheading
    if (isHeadingCandidate(line, i, lines)) {
      ensureBlankLine();
      output.push(`### ${line}`);
      output.push("");
      continue;
    }
    // Regular content line
    output.push(line);
  }
  return output.join("\n").replace(/\n{3,}/g, "\n\n").trim();
}
// ── Latin/English document formatter ────────────────────────────────────────
/**
 * Formats predominantly Latin-script extracted text as Markdown.
 *
 * Every line is cleaned with `cleanOcrLine` and classified; the first rule
 * that matches decides the output: existing Markdown heading → kept; section
 * marker → H2; question → bold; keyword line → kept on its own line; inline
 * choices → vertical list; short ALL-CAPS line → H3; list item → kept;
 * isolated short line → H3; anything else → plain text. Runs of blank lines
 * are collapsed and the result is trimmed.
 *
 * @param text raw extracted text
 * @returns Markdown
 */
function formatLatinDocument(text: string): string {
  const normalized = text.replace(/\r\n/g, "\n").replace(/\r/g, "\n");
  const lines = normalized.split("\n").map(cleanOcrLine);
  const output: string[] = [];

  // Adds a blank separator unless the output is empty or already ends with one.
  const separate = (): void => {
    if (output.length > 0 && output[output.length - 1] !== "") output.push("");
  };
  // Emits `content` as its own block, optionally followed by a blank line.
  const emitBlock = (content: string, closeWithBlank = true): void => {
    separate();
    output.push(content);
    if (closeWithBlank) output.push("");
  };

  lines.forEach((entry, index) => {
    const line = entry.trim();

    if (!line) {
      separate();
      return;
    }
    if (/^#{1,6}\s/.test(line)) {
      emitBlock(line);
      return;
    }
    if (isSectionMarker(line)) {
      emitBlock(`## ${line}`);
      return;
    }
    if (isQuestion(line)) {
      emitBlock(`**${line}**`);
      return;
    }
    if (isKeywordLine(line)) {
      emitBlock(line, false);
      return;
    }
    const choices = expandMultipleChoice(line);
    if (choices) {
      emitBlock(choices);
      return;
    }
    // ALL CAPS short line → subheading
    if (/^[A-Z][A-Z\s\d:,.-]{4,60}$/.test(line)) {
      emitBlock(`### ${line}`);
      return;
    }
    // Existing list items pass through without extra spacing.
    if (/^[-*+]\s/.test(line) || /^\d+\.\s/.test(line)) {
      output.push(line);
      return;
    }
    if (isHeadingCandidate(line, index, lines)) {
      emitBlock(`### ${line}`);
      return;
    }
    output.push(line);
  });

  return output.join("\n").replace(/\n{3,}/g, "\n\n").trim();
}
// ═══════════════════════════════════════════════════════════════════════════
// Extractors
// ═══════════════════════════════════════════════════════════════════════════
/** Upper bound, in characters, on the text taken from a single document (two million). */
const TEXT_CAP = 2e6;
// ── Arabic PDF text post-processor ───────────────────────────────────────────
/**
 * Cleans up artifacts introduced by PDF text extraction.
 *
 * Whole-text passes:
 *   • removes zero-width / bidi control characters;
 *   • on lines containing at least 4 Arabic characters, removes isolated
 *     runs of 1–5 capital Latin letters (broken-CMap noise) unless the run
 *     is one of a few known abbreviations.
 *
 * Per-line passes, in order:
 *   1. drops "-- X of N --" page markers;
 *   2. drops standalone page labels: a single Arabic-block character, or
 *      1–3 digits alone on a line;
 *   3. rewrites table-of-contents lines (4+ leader dots) to just the title;
 *   4. strips a trailing 1–4 digit page number from lines whose remaining
 *      text is at least 10 characters and contains Arabic (heuristic);
 *   5. keeps everything else, preserving blank lines as paragraph breaks.
 *
 * "Digits" means Western 0-9 (rules 2 and 3 only), Arabic-Indic ٠-٩
 * (U+0660–U+0669) and Extended Arabic-Indic ۰-۹ (U+06F0–U+06F9).
 *
 * @param text raw text extracted from a PDF
 * @returns cleaned text with runs of blank lines collapsed
 */
function cleanArabicPdfRaw(text: string): string {
  // Strip Unicode bidi / directional control characters that the extractor
  // emits for fonts with a broken ToUnicode CMap.
  text = text.replace(/[\u200B-\u200F\u202A-\u202E\u2066-\u2069\uFEFF]/g, "");

  // For predominantly Arabic lines, remove short ALL-CAPS Latin noise
  // (e.g. "المبادئ OA العشرة"), but keep common technical abbreviations.
  const KEEP_CAPS = new Set(["PDF", "OCR", "AI", "URL", "API", "HTML", "CSS", "JS", "TS"]);
  text = text.split("\n").map(line => {
    const arabicCount = (line.match(/[\u0600-\u06FF]/g) ?? []).length;
    if (arabicCount < 4) return line; // not an Arabic line — leave intact
    // Remove isolated 1-5 char ALL-CAPS sequences (not in safe-list)
    return line.replace(/(?<![A-Za-z])([A-Z]{1,5})(?![A-Za-z])/g, (match) =>
      KEEP_CAPS.has(match) ? match : ""
    ).replace(/ {2,}/g, " ").trim();
  }).join("\n");

  const lines = text.split("\n");
  const out: string[] = [];
  for (const raw of lines) {
    const line = raw.trim();

    // 1. Remove "-- X of N --" page markers
    if (/^--\s*\d+\s+of\s+\d+\s*--$/i.test(line)) continue;

    // 2. Remove standalone page labels:
    //    • a single Arabic-block character (letter or digit)
    //    • 1–3 Western / Arabic-Indic / Extended Arabic-Indic digits
    if (/^[\u0600-\u06FF]{1}$/.test(line)) continue;
    if (/^[0-9\u0660-\u0669\u06F0-\u06F9]{1,3}$/.test(line)) continue;

    // 3. Collapse TOC leader-dot lines: ". . . . . . ." → clean title.
    //    A TOC line has 4+ consecutive dots (possibly space-separated).
    if (/\.(\s*\.){3,}/.test(line)) {
      const cleaned = line
        .replace(/\.(\s*\.)+\s*/g, " ")
        .replace(/\s+[0-9\u0660-\u0669\u06F0-\u06F9]{1,4}\s*$/, "")
        .replace(/\s{2,}/g, " ")
        .trim();
      if (cleaned.length > 2) out.push(cleaned);
      continue;
    }

    // 4. Strip a trailing page number from TOC lines that lost their
    //    dot-leaders (e.g. "عنوان الكتاب ۰٣"). Western digits are deliberately
    //    not matched here, as in the rule's original form.
    const tocTrailing = line.replace(/\s+[\u0660-\u0669\u06F0-\u06F9]{1,4}$/, "");
    if (tocTrailing !== line && tocTrailing.length >= 10 && /[\u0600-\u06FF]/.test(tocTrailing)) {
      out.push(tocTrailing.trim());
      continue;
    }

    // 5. Preserve empty lines (paragraph breaks)
    if (!line) { out.push(""); continue; }
    out.push(line);
  }
  // Collapse runs of 3+ newlines to a single blank line
  return out.join("\n").replace(/\n{3,}/g, "\n\n").trim();
}
| // ── Arabic text AI correction — optional polish via OpenAI-compatible chat endpoints ── | |
| // Priority chain (tried in order, falls back on rate-limit / error). | |
| // resolveAiEndpoints() below is the authoritative list; this summary mirrors it: | |
| // 1. Replit AI Integration proxy (AI_INTEGRATIONS_OPENAI_BASE_URL) — gpt-4o | |
| // 2. HF: Qwen/Qwen3-235B-A22B — MoE 235B (22B active) | |
| // 3. HF: Qwen/Qwen3-72B | |
| // 4. HF: meta-llama/Llama-4-Scout-17B-16E-Instruct | |
| // 5. HF: Qwen/Qwen3-30B-A3B — MoE, fast & very capable | |
| // 6. HF: Qwen/Qwen2.5-72B-Instruct — proven Arabic quality | |
| // 7. HF: meta-llama/Llama-3.3-70B-Instruct — strong multilingual | |
| // 8. HF: mistralai/Mistral-Nemo-Instruct-2407 — fast 12B fallback | |
| // Entries 2-8 are only available when HF_TOKEN is set. | |
| // | |
/** Target size, in characters, of each piece of text sent for AI correction (larger chunks → fewer API calls). */
const AI_CHUNK_CHARS = 3_000;

/** Per-request timeout for one AI correction call: two minutes, in milliseconds. */
const AI_CHUNK_TIMEOUT_MS = 2 * 60 * 1000;

/**
 * System prompt (Arabic) for the correction model. It describes the common
 * extraction errors to fix and the strict rules: keep common Latin technical
 * terms, keep paragraph/heading/list/Markdown structure, add nothing, and
 * return only the corrected text. The segments are concatenated with no
 * separator; each one carries its own trailing space.
 */
const AI_SYSTEM_PROMPT = [
  "أنت نظام متخصص في تصحيح نصوص PDF العربية المستخرجة آلياً. ",
  "المهمة: إزالة أخطاء الاستخراج مع الحفاظ التام على المعنى والمحتوى الأصيل. ",
  "أنواع الأخطاء الشائعة في هذه النصوص: ",
  "١) حروف ومقاطع لاتينية قصيرة مبعثرة داخل النص العربي (مثل OA، BW، Zz، dl، pl) — ضوضاء من ترميز الخط المكسور، احذفها. ",
  "٢) كلمات عربية مبتورة أو مشوهة واضحة يمكن تصحيحها من السياق. ",
  "٣) مسافات خاطئة داخل الكلمة العربية الواحدة — ادمجها. ",
  "٤) رموز متفرقة أو علامات ترقيم غريبة ليست جزءاً من المحتوى — احذفها. ",
  "القواعد الصارمة: ",
  "أ) احتفظ بالأسماء والمصطلحات التقنية اللاتينية الشائعة (PDF، AI، URL، API...). ",
  "ب) حافظ على هيكل الفقرات والعناوين والقوائم وعلامات Markdown كما هي تماماً. ",
  "ج) لا تضف أي محتوى جديد أو شروحات. ",
  "أعد النص العربي المُصحَح فقط بدون أي مقدمة أو خاتمة.",
].join("");
/**
 * One OpenAI-compatible chat endpoint.
 * `noThink` asks the caller to append "/no_think" to the prompt (used for
 * Qwen3 models to skip chain-of-thought output).
 */
type AiEndpoint = { baseUrl: string; apiKey: string; model: string; label: string; noThink?: boolean };

/**
 * Builds the prioritised list of AI endpoints to try, in order.
 *
 * 1. The Replit AI Integration proxy, when AI_INTEGRATIONS_OPENAI_BASE_URL
 *    is set (model gpt-4o; key from AI_INTEGRATIONS_OPENAI_API_KEY or the
 *    literal "placeholder").
 * 2-8. Seven models behind the Hugging Face router, when HF_TOKEN is set.
 *
 * Environment values are trimmed, and trailing slashes are removed from the
 * proxy URL, because callers append "/chat/completions" to `baseUrl` and
 * send `apiKey` inside an HTTP header (where stray whitespace or a newline
 * is invalid). Blank values count as "not set".
 *
 * @returns endpoints in priority order; empty when neither variable is set
 */
function resolveAiEndpoints(): AiEndpoint[] {
  const endpoints: AiEndpoint[] = [];

  // 1. Replit AI Integration proxy (zero-config on Replit dev environment)
  const replitUrl = process.env.AI_INTEGRATIONS_OPENAI_BASE_URL?.trim().replace(/\/+$/, "");
  if (replitUrl) {
    endpoints.push({
      baseUrl: replitUrl,
      apiKey: process.env.AI_INTEGRATIONS_OPENAI_API_KEY?.trim() || "placeholder",
      model: "gpt-4o",
      label: "Replit/gpt-4o",
    });
  }

  // 2-8. HF Router — the generic router picks a provider per model.
  //      Callers fall through to the next entry on 429/402/503/404.
  const hfToken = process.env.HF_TOKEN?.trim();
  if (hfToken) {
    const HF = "https://router.huggingface.co/v1";
    // [model id, label, noThink] in priority order.
    const hfModels: Array<[string, string, boolean]> = [
      ["Qwen/Qwen3-235B-A22B", "HF/Qwen3-235B", true],
      ["Qwen/Qwen3-72B", "HF/Qwen3-72B", true],
      ["meta-llama/Llama-4-Scout-17B-16E-Instruct", "HF/Llama4-Scout", false],
      ["Qwen/Qwen3-30B-A3B", "HF/Qwen3-30B-A3B", true],
      ["Qwen/Qwen2.5-72B-Instruct", "HF/Qwen2.5-72B", false],
      ["meta-llama/Llama-3.3-70B-Instruct", "HF/Llama3.3-70B", false],
      ["mistralai/Mistral-Nemo-Instruct-2407", "HF/Mistral-Nemo", false],
    ];
    for (const [model, label, noThink] of hfModels) {
      endpoints.push({ baseUrl: HF, apiKey: hfToken, model, label, noThink });
    }
  }
  return endpoints;
}
/**
 * Splits text into chunks of at most `maxChars` characters for AI correction.
 *
 * Paragraphs (separated by one or more blank lines) are packed greedily and
 * re-joined with a blank line. A paragraph that is too large on its own is
 * packed line by line, and a single line that is still too large is cut into
 * several pieces — at the last space inside the window when there is one,
 * otherwise at exactly `maxChars`. No text is discarded.
 *
 * @param text     text to split
 * @param maxChars maximum chunk length; defaults to AI_CHUNK_CHARS, values
 *                 below 1 are treated as 1
 * @returns non-blank chunks in source order
 */
function chunkForAiCorrection(text: string, maxChars: number = AI_CHUNK_CHARS): string[] {
  const limit = maxChars >= 1 ? Math.floor(maxChars) : 1;

  // Cuts one over-long line into pieces of at most `limit` characters.
  const splitOversizedLine = (line: string): string[] => {
    if (line.length <= limit) return [line];
    const pieces: string[] = [];
    let rest = line;
    while (rest.length > limit) {
      const lastSpace = rest.lastIndexOf(" ", limit);
      const cut = lastSpace > 0 ? lastSpace : limit;
      pieces.push(rest.slice(0, cut));
      // Drop the whitespace the cut landed on so pieces do not start with it.
      rest = rest.slice(cut).trimStart();
    }
    if (rest) pieces.push(rest);
    return pieces;
  };

  const chunks: string[] = [];
  let buf = "";
  for (const para of text.split(/\n{2,}/)) {
    const joined = buf ? buf + "\n\n" + para : para;
    if (joined.length <= limit) {
      buf = joined;
      continue;
    }
    if (buf) chunks.push(buf);
    if (para.length <= limit) {
      buf = para;
      continue;
    }
    // Paragraph larger than one chunk: pack it line by line.
    buf = "";
    for (const line of para.split("\n")) {
      for (const piece of splitOversizedLine(line)) {
        const candidate = buf ? buf + "\n" + piece : piece;
        if (candidate.length <= limit) {
          buf = candidate;
        } else {
          if (buf) chunks.push(buf);
          buf = piece;
        }
      }
    }
  }
  if (buf.trim()) chunks.push(buf);
  return chunks.filter(c => c.trim().length > 0);
}
/**
 * Sends one text chunk to an OpenAI-compatible `/chat/completions` endpoint
 * and returns the corrected text.
 *
 * The request is aborted after AI_CHUNK_TIMEOUT_MS. HTTP 429 is raised with
 * `code: "rate_limited"`; 503, 402 and 404 are raised with
 * `code: "unavailable"`; any other non-OK status is raised as `ai_http_<status>`.
 * If the model's answer is empty or its length falls outside 35%–300% of the
 * input length, the input text is returned unchanged.
 */
async function callAiCorrection(
  text: string,
  ep: AiEndpoint,
): Promise<string> {
  const abort = new AbortController();
  const deadline = setTimeout(() => abort.abort(), AI_CHUNK_TIMEOUT_MS);
  try {
    // Endpoints flagged `noThink` get a trailing /no_think marker, which asks
    // Qwen3 models to skip chain-of-thought output.
    let userContent = `النص المستخرج من PDF:\n\n${text}\n\nالنص المصحح:`;
    if (ep.noThink) userContent += " /no_think";

    const payload: Record<string, unknown> = {
      model: ep.model,
      messages: [
        { role: "system", content: AI_SYSTEM_PROMPT },
        { role: "user", content: userContent },
      ],
      max_tokens: Math.min(4096, Math.ceil(text.length * 2)),
      temperature: 0.1, // low temperature keeps the output close to deterministic
    };

    const resp = await fetch(`${ep.baseUrl}/chat/completions`, {
      method: "POST",
      headers: { Authorization: `Bearer ${ep.apiKey}`, "Content-Type": "application/json" },
      body: JSON.stringify(payload),
      signal: abort.signal,
    });

    // status → [error message, error code]; these are the statuses the caller
    // reacts to by moving on to the next endpoint.
    const knownFailures: Record<number, [string, string]> = {
      429: ["rate_limited", "rate_limited"],
      503: ["unavailable", "unavailable"],
      402: ["payment_required", "unavailable"],
      404: ["model_not_found", "unavailable"],
    };
    const failure = knownFailures[resp.status];
    if (failure) throw Object.assign(new Error(failure[0]), { code: failure[1] });
    if (!resp.ok) throw new Error(`ai_http_${resp.status}`);

    const data = await resp.json() as any;
    const answer = (data.choices?.[0]?.message?.content ?? "").trim();
    // Drop any <think>…</think> block the model may still emit.
    const corrected = answer.replace(/<think>[\s\S]*?<\/think>\s*/gi, "").trim();

    const tooShort = corrected.length < text.length * 0.35;
    const tooLong = corrected.length > text.length * 3;
    return !corrected || tooShort || tooLong ? text : corrected;
  } finally {
    clearTimeout(deadline);
  }
}
type ProgressFn = (msg: string, pct: number) => Promise<void>;

/**
 * Runs AI correction over predominantly-Arabic text, chunk by chunk.
 *
 * Returns `rawText` untouched when no endpoint is configured, when the text
 * has fewer than 50 non-space characters, or when less than 25% of them are
 * Arabic. Otherwise each chunk is sent to the currently active endpoint; on a
 * rate-limit / unavailable / HTTP 5xx failure the next endpoint becomes active
 * for this and all following chunks. Any other failure (including a request
 * timeout) keeps the raw chunk and continues with the same endpoint. Once all
 * endpoints are exhausted the remaining chunks are passed through unchanged.
 *
 * @param rawText    Extracted text to correct.
 * @param onProgress Optional callback receiving a status message and a percentage (33–54).
 * @returns Corrected chunks joined with blank lines.
 */
async function correctArabicText(rawText: string, onProgress?: ProgressFn): Promise<string> {
  const endpoints = resolveAiEndpoints();
  if (!endpoints.length) {
    logger.info("[arabic-ai] No AI endpoint configured — using OCR text as-is");
    return rawText;
  }
  // Only correct predominantly Arabic text
  const arabicChars = (rawText.match(/[\u0600-\u06FF]/g) ?? []).length;
  const nonSpaceChars = rawText.replace(/\s/g, "").length;
  if (nonSpaceChars < 50 || arabicChars / nonSpaceChars < 0.25) return rawText;

  const chunks = chunkForAiCorrection(rawText);
  // Index of the endpoint currently in use; only ever moves forward.
  let activeEpIdx = 0;
  logger.info(`[arabic-ai] ${chunks.length} chunks, ${endpoints.length} endpoints available — primary: ${endpoints[0].label}`);

  const correctedParts: string[] = [];
  for (let i = 0; i < chunks.length; i++) {
    const pct = 33 + Math.round((i / chunks.length) * 21);
    const ep = endpoints[activeEpIdx];
    await onProgress?.(`تصحيح النص عبر ${ep.label.split("/")[1]}... (${i + 1}/${chunks.length})`, pct);

    let succeeded = false;
    while (activeEpIdx < endpoints.length) {
      const cur = endpoints[activeEpIdx];
      try {
        const result = await callAiCorrection(chunks[i], cur);
        correctedParts.push(result);
        succeeded = true;
        break;
      } catch (err: unknown) {
        // `code` is not always a string: an aborted fetch rejects with a
        // DOMException whose legacy `.code` is the number 20. Coerce before
        // calling string methods so a timeout cannot crash the whole run.
        const info = err as { code?: unknown; message?: unknown } | null | undefined;
        const code = String(info?.code ?? info?.message ?? "");
        if (code === "rate_limited" || code === "unavailable" || code.startsWith("ai_http_5")) {
          logger.warn(`[arabic-ai] ${cur.label} ${code} — switching to next endpoint`);
          activeEpIdx++;
          // update progress label for new endpoint
          if (activeEpIdx < endpoints.length) {
            await onProgress?.(`التحويل عبر ${endpoints[activeEpIdx].label.split("/")[1]}... (${i + 1}/${chunks.length})`, pct);
          }
        } else {
          logger.warn({ err }, `[arabic-ai] chunk ${i} error on ${cur.label} — keeping raw text`);
          break;
        }
      }
    }

    if (!succeeded) {
      // All endpoints exhausted or non-retryable error — keep original chunk
      correctedParts.push(chunks[i]);
      if (activeEpIdx >= endpoints.length) {
        // No more endpoints: pass remaining chunks through unchanged
        correctedParts.push(...chunks.slice(i + 1));
        logger.warn("[arabic-ai] All endpoints exhausted — remaining chunks kept as-is");
        break;
      }
    }
  }
  return correctedParts.join("\n\n");
}
// ── Garbled Arabic detector ───────────────────────────────────────────────────
/**
 * Heuristically decides whether extracted PDF text is broken Arabic.
 *
 * Texts with fewer than 100 Arabic characters are never flagged. Otherwise the
 * text is flagged when any of these signals fires:
 *  - Type A (character-pair transposition, e.g. في → يف): three or more
 *    space-delimited "يف", or the known transposed words امحلد / اثنياا / اثلثاا.
 *  - Type B, signal 1: three or more short Latin runs (1–6 letters) wrapped in
 *    LRM/RLM bidi control characters.
 *  - Type B, signal 2: four or more lines that contain at least 3 Arabic
 *    characters and at least 2 stand-alone upper-case tokens (1–5 letters)
 *    that are not common abbreviations or Roman numerals.
 *  - Type B, signal 3: at least 300 Arabic characters and Latin letters
 *    exceeding 12% of the Arabic count.
 */
function isGarbledArabic(text: string): boolean {
  const count = (s: string, re: RegExp): number => (s.match(re) ?? []).length;

  const arabicTotal = count(text, /[\u0600-\u06FF]/g);
  if (arabicTotal < 100) return false;

  // Type A: transposed character pairs.
  if (count(text, / يف /g) >= 3) return true;
  if (/امحلد/.test(text)) return true;
  if (/اثنياا|اثلثاا/.test(text)) return true;

  // Type B, signal 1: bidi-wrapped Latin fragments.
  if (count(text, /[\u200E\u200F][A-Za-z]{1,6}[\u200E\u200F]/g) >= 3) return true;

  // Type B, signal 2: unexplained upper-case tokens inside Arabic lines.
  const knownCaps = new Set(["PDF", "OCR", "AI", "URL", "API", "HTML", "CSS", "JS", "TS",
    "I", "II", "III", "IV", "VI", "VII", "VIII", "IX", "XI", "XII"]);
  let noisyLines = 0;
  for (const line of text.split("\n")) {
    if (count(line, /[\u0600-\u06FF]/g) < 3) continue;
    const capsTokens = line.match(/(?<![A-Za-z])([A-Z]{1,5})(?![A-Za-z])/g) ?? [];
    let unexplained = 0;
    for (const token of capsTokens) {
      if (!knownCaps.has(token)) unexplained++;
    }
    if (unexplained >= 2) noisyLines++;
  }
  if (noisyLines >= 4) return true;

  // Type B, signal 3: too many Latin letters for an Arabic document.
  const latinTotal = count(text, /[A-Za-z]/g);
  return arabicTotal >= 300 && latinTotal > arabicTotal * 0.12;
}
// ── VLM-based OCR per page (olmOCR / Qwen2.5-VL via HF Inference API) ────────
// Uses vision-language models to extract text from rendered page images.
// olmOCR (Allen Institute) is specifically fine-tuned for document OCR; the
// authors of this file cite it as top-1 for Arabic on KITAB-Bench.
// Model priority, tried in array order:
//   olmOCR-7B → Qwen2.5-VL-72B → Qwen2.5-VL-7B
// Tesseract is not part of this list: it is the local fallback the OCR
// pipeline switches to when the VLM call fails or is rate-limited.
// Requests go through the generic HF router (router.huggingface.co/v1) for
// maximum model availability.
const VLM_OCR_ROUTER = "https://router.huggingface.co/v1";
const VLM_OCR_MODELS = [
  "allenai/olmOCR-7B-0225-preview", // #1: Allen Institute, fine-tuned doc OCR, KITAB-Bench winner
  "Qwen/Qwen2.5-VL-72B-Instruct", // #2: larger VLM, best Arabic accuracy (NEW 2026 upgrade)
  "Qwen/Qwen2.5-VL-7B-Instruct", // #3: smaller, faster fallback
];
// Per-model request timeout for a single page (90 seconds).
const VLM_PAGE_TIMEOUT_MS = 90_000;
// Instruction sent alongside the page image; asks for verbatim text only.
const VLM_OCR_PROMPT =
  "Extract all the text from this document page exactly as written. " +
  "Preserve Arabic text, paragraph structure, headings, and line breaks. " +
  "Do not add explanations or commentary — output only the extracted text.";
/**
 * OCRs one rendered page image with the vision-language models listed in
 * VLM_OCR_MODELS, trying them in order.
 *
 * A model's answer is accepted when it is longer than 20 characters. A model
 * that errors, times out or returns a shorter answer is skipped in favour of
 * the next one.
 *
 * @param pngPath Path to the page image (PNG).
 * @param hfToken Hugging Face token used as the Bearer credential.
 * @returns The extracted text of the first model that succeeds.
 * @throws An error with `code: "rate_limited"` as soon as any model answers
 *         HTTP 429 (so the caller can stop using the VLM path), or
 *         `all_vlm_models_failed` when no model produced usable text.
 */
async function extractPageViaVlm(pngPath: string, hfToken: string): Promise<string> {
  const imgBase64 = fs.readFileSync(pngPath).toString("base64");
  for (const model of VLM_OCR_MODELS) {
    const shortName = model.split("/")[1];
    const ctrl = new AbortController();
    const timer = setTimeout(() => ctrl.abort(), VLM_PAGE_TIMEOUT_MS);
    try {
      const resp = await fetch(`${VLM_OCR_ROUTER}/chat/completions`, {
        method: "POST",
        headers: { Authorization: `Bearer ${hfToken}`, "Content-Type": "application/json" },
        body: JSON.stringify({
          model,
          messages: [{
            role: "user",
            content: [
              { type: "image_url", image_url: { url: `data:image/png;base64,${imgBase64}` } },
              { type: "text", text: VLM_OCR_PROMPT },
            ],
          }],
          max_tokens: 4096,
          temperature: 0.0,
        }),
        signal: ctrl.signal,
      });
      if (resp.status === 429) throw Object.assign(new Error("rate_limited"), { code: "rate_limited" });
      if (resp.status === 402) throw Object.assign(new Error("payment_required"), { code: "unavailable" });
      if (resp.status === 404) throw Object.assign(new Error("model_not_found"), { code: "unavailable" });
      if (!resp.ok) throw new Error(`vlm_http_${resp.status}`);
      // The abort timer is still armed here, so a stalled response body is
      // cut off by the same deadline as the request itself.
      const data = await resp.json() as any;
      const content = (data.choices?.[0]?.message?.content ?? "").trim();
      if (content.length > 20) {
        logger.info(`[vlm-ocr] ${shortName} → ${content.length} chars`);
        return content;
      }
      logger.warn(`[vlm-ocr] ${shortName} returned empty — trying next`);
    } catch (err: any) {
      if (err?.code === "rate_limited") {
        logger.warn(`[vlm-ocr] ${shortName} rate-limited`);
        throw err; // propagate so caller can switch to Tesseract
      }
      logger.warn({ err: err?.message }, `[vlm-ocr] ${shortName} failed`);
    } finally {
      clearTimeout(timer);
    }
  }
  throw new Error("all_vlm_models_failed");
}
| // ── OCR-based PDF extractor (fallback for broken-CMap PDFs) ────────────────── | |
| // Pipeline: | |
| // 1. pdftoppm renders pages to PNG (200 DPI — optimal for VLM API) | |
| // 2. Per page: try VLM-OCR (olmOCR via HF API) first if HF_TOKEN available | |
| // 3. Fall back to Tesseract (local) if VLM fails / rate-limited | |
| // No page cap — processes the full document regardless of length. | |
/**
 * Filters OCR output, dropping lines that are overwhelmingly Latin letters
 * with little or no Arabic — noise from decorative pages, page headers and
 * misread ornaments (e.g. "Me NY 1", "dl pl a gl", "Fy PIN ENA").
 *
 * Rules, applied per trimmed line:
 *  1. Blank lines are kept (paragraph separators).
 *  2. Lines with at least 4 Arabic characters are kept.
 *  3. Lines of at most 30 characters that contain Latin letters and no Arabic
 *     are dropped.
 *  4. Lines where Latin letters make up more than 80% of all letters are dropped.
 *  5. Everything else (numbers, punctuation, mixed headings) is kept.
 *
 * Runs of three or more newlines are collapsed to one blank line and the
 * result is trimmed.
 */
function cleanOcrOutput(text: string): string {
  const kept: string[] = [];
  for (const raw of text.split("\n")) {
    const line = raw.trim();
    if (!line) {
      kept.push("");
      continue;
    }
    const arabicChars = (line.match(/[\u0600-\u06FF]/g) ?? []).length;
    const latinChars = (line.match(/[a-zA-Z]/g) ?? []).length;
    const totalAlpha = arabicChars + latinChars;

    if (arabicChars >= 4) {
      kept.push(line);
      continue;
    }
    // Require at least one Latin letter: a short line of digits or punctuation
    // is not "Latin noise" and must survive.
    if (arabicChars === 0 && latinChars > 0 && line.length <= 30) continue;
    if (totalAlpha > 0 && latinChars / totalAlpha > 0.80) continue;
    kept.push(line);
  }
  return kept.join("\n").replace(/\n{3,}/g, "\n\n").trim();
}
/**
 * Extracts text from a PDF by rendering its pages to PNG and running OCR.
 *
 * Pages are rendered once with `pdftoppm` (200 DPI when HF_TOKEN is set,
 * otherwise 300 DPI). Each page is first sent to the VLM OCR when a token is
 * available; if that fails the page is recognised locally with Tesseract
 * using the already-rendered image. After the first rate-limit error the VLM
 * is skipped for all remaining pages.
 *
 * @param filePath   Path to the PDF.
 * @param pageStart  First page (1-based); defaults to 1.
 * @param pageEnd    Last page; defaults to the end of the document.
 * @param onProgress Called after every page with (pagesDone, totalPages).
 * @returns Page texts joined by blank lines and capped at TEXT_CAP characters,
 *          or "" when nothing was rendered or any step failed.
 */
async function extractPdfViaOcr(
  filePath: string,
  pageStart?: number,
  pageEnd?: number,
  onProgress?: (done: number, total: number) => void,
): Promise<string> {
  const { execFile } = await import("child_process");
  const { promisify } = await import("util");
  const execFileAsync = promisify(execFile);
  const hfToken = process.env.HF_TOKEN;
  const tmpDir = fs.mkdtempSync("/tmp/pdf-ocr-");
  let tessWorker: any = null;
  try {
    const startPage = pageStart && pageStart > 0 ? pageStart : 1;
    const endPage = pageEnd && pageEnd > 0 ? pageEnd : 9999;
    // 200 DPI keeps images small for the VLM API; 300 DPI is used when only
    // Tesseract will run. Pages are rendered once: if the VLM fails later,
    // Tesseract works on the same 200 DPI images.
    const dpi = hfToken ? "200" : "300";
    await execFileAsync(
      "pdftoppm",
      ["-r", dpi, "-png", "-f", String(startPage), "-l", String(endPage),
        filePath, path.join(tmpDir, "page")],
      { timeout: 600_000 },
    );
    const pngFiles = fs.readdirSync(tmpDir)
      .filter(f => f.endsWith(".png"))
      .sort()
      .map(f => path.join(tmpDir, f));
    if (pngFiles.length === 0) return "";

    const pageTexts: string[] = [];
    let vlmRateLimited = false;
    for (const [i, png] of pngFiles.entries()) {
      let pageText = "";
      let needTesseract = true;

      // ── Try VLM-OCR first (olmOCR / Qwen2.5-VL via HF) ────────────────
      if (hfToken && !vlmRateLimited) {
        try {
          pageText = await extractPageViaVlm(png, hfToken);
          needTesseract = false;
        } catch (err: any) {
          if (err?.code === "rate_limited") {
            vlmRateLimited = true;
            logger.warn("[vlm-ocr] Rate limited — switching to Tesseract for all remaining pages");
          } else {
            logger.warn({ err: err?.message }, `[vlm-ocr] page ${i + 1} failed — using Tesseract`);
          }
        }
      }

      // ── Fallback: Tesseract (local) ────────────────────────────────────
      if (needTesseract) {
        if (!tessWorker) {
          // Lazy-initialise Tesseract only when actually needed
          const tessDataDir =
            process.env.NODE_ENV === "production"
              ? "/data/tessdata"
              : path.join(process.cwd(), "uploads", ".tessdata");
          if (!fs.existsSync(tessDataDir)) fs.mkdirSync(tessDataDir, { recursive: true });
          const Tesseract = await import("tesseract.js");
          tessWorker = await Tesseract.createWorker(["ara", "eng"], 1, {
            cachePath: tessDataDir,
            workerPath: getTessWorkerPath(),
          });
        }
        const { data: { text } } = await tessWorker.recognize(png);
        pageText = cleanOcrOutput(text);
      }

      if (pageText.trim()) pageTexts.push(pageText.trim());
      onProgress?.(i + 1, pngFiles.length);
    }

    const result = pageTexts.join("\n\n");
    return result.length > TEXT_CAP ? result.slice(0, TEXT_CAP) : result;
  } catch (e) {
    logger.error({ err: e }, "[extractPdfViaOcr] failed");
    return "";
  } finally {
    // Shut the worker down on every path. A failing terminate() must not
    // discard an OCR result that was already produced.
    if (tessWorker) {
      try { await tessWorker.terminate(); } catch { /* ignore */ }
    }
    try { fs.rmSync(tmpDir, { recursive: true, force: true }); } catch { /* ignore */ }
  }
}
// ── RTL-aware PDF extractor using pdfjs-dist directly ────────────────────────
/**
 * Extracts text from a PDF with pdfjs-dist, rebuilding right-to-left reading
 * order. pdf-parse v2 has no `pagerender` callback, so pdfjs-dist is resolved
 * through pdf-parse's own dependency tree and used directly.
 *
 * Algorithm per page:
 *  1. getTextContent() → items with {x, y, width, height, str}.
 *  2. Bucket items into visual lines by quantised Y (Y_THRESH = 10 pt).
 *  3. Sort each bucket right→left (descending X).
 *  4. Join items, inserting a space only when the gap between neighbours
 *     exceeds 25% of the item's height, so that ligature sub-glyph gaps are
 *     not mistaken for word gaps.
 *
 * At most the first 200 MiB of the file are read.
 *
 * @param filePath  Path to the PDF.
 * @param pageStart First page (1-based, clamped to the page count); defaults to 1.
 * @param pageEnd   Last page (clamped to the page count); defaults to the last page.
 * @returns Cleaned text capped at TEXT_CAP characters, or "" on any failure.
 */
async function extractPdf(filePath: string, pageStart?: number, pageEnd?: number): Promise<string> {
  let pdfDoc: any = null;
  try {
    // Resolve pdfjs-dist via pdf-parse's own node_modules (it is a declared
    // dependency of pdf-parse v2).
    const pdfParseCjsPath = _require.resolve("pdf-parse");
    const pdfParseReq = createRequire(pdfParseCjsPath);
    const pdfjsMjsPath = pdfParseReq.resolve("pdfjs-dist/legacy/build/pdf.mjs");
    const pdfjsWorkerPath = pdfParseReq.resolve("pdfjs-dist/legacy/build/pdf.worker.mjs");
    // Dynamic ESM import of pdfjs-dist (it is an ES module)
    const { getDocument, GlobalWorkerOptions, VerbosityLevel } =
      await import(pdfjsMjsPath) as any;
    GlobalWorkerOptions.workerSrc = pdfjsWorkerPath;

    const MAX_PDF_BYTES = 200 * 1024 * 1024;
    const readSize = Math.min(fs.statSync(filePath).size, MAX_PDF_BYTES);
    const buf = Buffer.alloc(readSize);
    const fd = fs.openSync(filePath, "r");
    try {
      fs.readSync(fd, buf, 0, readSize, 0);
    } finally {
      // Always release the descriptor, even when the read throws.
      fs.closeSync(fd);
    }

    // VerbosityLevel.ERRORS = 0 → suppress "Warning: TT: undefined function" noise
    const verbosity: number = (VerbosityLevel as any)?.ERRORS ?? 0;
    pdfDoc = await getDocument({
      data: new Uint8Array(buf),
      useWorkerFetch: false,
      isEvalSupported: false,
      useSystemFonts: true,
      verbosity,
    }).promise;

    const totalPages = pdfDoc.numPages as number;
    const startPage = pageStart && pageStart > 0 ? Math.min(pageStart, totalPages) : 1;
    const endPage = pageEnd && pageEnd > 0 ? Math.min(pageEnd, totalPages) : totalPages;

    // Y_THRESH = 10 pt: groups diacritics / sub-glyphs on slightly different Y
    // into the same visual line.
    const Y_THRESH = 10;
    type TextItem = { x: number; y: number; str: string; width: number; height: number };
    const pageTexts: string[] = [];

    for (let p = startPage; p <= endPage; p++) {
      const page = await pdfDoc.getPage(p);
      const tc = await page.getTextContent({ includeMarkedContent: false });

      const items: TextItem[] = [];
      for (const it of (tc.items ?? [])) {
        if (typeof it.str !== "string" || !it.str.trim()) continue;
        items.push({
          x: it.transform[4],
          y: it.transform[5],
          str: it.str,
          width: it.width ?? 0,
          height: it.height ?? 12, // fallback to 12 pt if absent
        });
      }
      if (!items.length) {
        page.cleanup();
        continue;
      }

      // Bucket by quantised Y
      const buckets = new Map<number, TextItem[]>();
      for (const it of items) {
        const key = Math.round(it.y / Y_THRESH) * Y_THRESH;
        const bucket = buckets.get(key);
        if (bucket) bucket.push(it);
        else buckets.set(key, [it]);
      }

      // Lines top→bottom (larger Y = higher on PDF page)
      const sortedYs = Array.from(buckets.keys()).sort((a, b) => b - a);
      const lines: string[] = [];
      for (const y of sortedYs) {
        const row = buckets.get(y) ?? [];
        // RTL: sort right-to-left (descending X)
        row.sort((a, b) => b.x - a.x);

        let lineText = "";
        row.forEach((cur, i) => {
          lineText += cur.str;
          const next = row[i + 1];
          if (!next) return;
          // gap = distance between the right edge of `next` and the left edge of `cur`
          const gap = cur.x - (next.x + next.width);
          const spaceThreshold = (cur.height > 0 ? cur.height : 12) * 0.25;
          if (gap > spaceThreshold) lineText += " ";
        });

        const trimmed = lineText.trim();
        if (trimmed) lines.push(trimmed);
      }
      page.cleanup();
      pageTexts.push(lines.join("\n"));
    }

    let text = pageTexts.join("\n\n").trim();
    // Arabic-specific post-processing: strips page markers, TOC dots, etc.
    text = cleanArabicPdfRaw(text);
    return text.length > TEXT_CAP ? text.slice(0, TEXT_CAP) : text;
  } catch (e) {
    logger.error({ err: e }, "[extractPdf] failed");
    return "";
  } finally {
    if (pdfDoc) {
      try { await pdfDoc.destroy(); } catch { /* ignore */ }
    }
  }
}
/**
 * Extracts the raw text of a .docx file with mammoth.
 *
 * @param filePath Path to the document.
 * @returns Trimmed text capped at TEXT_CAP characters, or "" when extraction
 *          fails (the failure is logged).
 */
async function extractDocx(filePath: string): Promise<string> {
  try {
    const mammoth = await import("mammoth");
    const result = await mammoth.extractRawText({ path: filePath });
    const text = result.value?.trim() || "";
    return text.length > TEXT_CAP ? text.slice(0, TEXT_CAP) : text;
  } catch (e) {
    // Still best-effort, but leave a trace like the PDF and image extractors do.
    logger.error({ err: e }, "[extractDocx] failed");
    return "";
  }
}
/**
 * Returns the path of the Tesseract.js Node.js worker script.
 *
 * The path is derived from the resolved location of the package's
 * package.json, so it keeps working when the server code is bundled with
 * esbuild (which breaks Tesseract.js's default auto-resolution).
 */
function getTessWorkerPath(): string {
  const packageRoot = path.dirname(_require.resolve("tesseract.js/package.json"));
  return path.join(packageRoot, "src/worker-script/node/index.js");
}
/**
 * OCRs a single image with Tesseract (Arabic + English).
 *
 * Language data is cached in /data/tessdata in production and in
 * <cwd>/uploads/.tessdata otherwise.
 *
 * @param filePath Path to the image.
 * @returns Trimmed recognised text, or "" on failure (the failure is logged).
 */
async function extractImage(filePath: string): Promise<string> {
  let worker: any = null;
  try {
    const Tesseract = await import("tesseract.js");
    const cacheDir =
      process.env.NODE_ENV === "production"
        ? "/data/tessdata"
        : path.join(process.cwd(), "uploads", ".tessdata");
    if (!fs.existsSync(cacheDir)) fs.mkdirSync(cacheDir, { recursive: true });
    worker = await Tesseract.createWorker(["ara", "eng"], 1, {
      cachePath: cacheDir,
      workerPath: getTessWorkerPath(),
    });
    const { data: { text } } = await worker.recognize(filePath);
    return text?.trim() || "";
  } catch (e) {
    logger.error({ err: e }, "[extractImage] error");
    return "";
  } finally {
    // Terminate on every path: a failed recognize() would otherwise leave the
    // worker running.
    if (worker) {
      try { await worker.terminate(); } catch { /* ignore */ }
    }
  }
}
/**
 * Converts a spreadsheet into Markdown tables.
 *
 * `.csv` files are split on commas (quoted fields are NOT interpreted) and
 * rendered as one table using the first line as the header. Any other
 * extension is read with the `xlsx` package and rendered as one `## <sheet>`
 * section per sheet. At most 5000 rows per CSV file / per sheet are read and
 * output stops growing once it exceeds TEXT_CAP characters.
 *
 * @param filePath Path to the spreadsheet.
 * @param ext      Lower-case extension including the dot (".csv" selects the CSV path).
 * @returns Markdown, or "" for an empty CSV or on failure (the failure is logged).
 */
async function extractSpreadsheet(filePath: string, ext: string): Promise<string> {
  // A literal "|" or a line break inside a cell would break the table row.
  const toCell = (value: unknown): string =>
    String(value ?? "").replace(/\|/g, "\\|").replace(/\r?\n/g, " ").trim();
  const toRow = (cells: string[]): string => `| ${cells.join(" | ")} |\n`;
  const toSeparator = (width: number): string =>
    toRow(Array.from({ length: width }, () => "---"));

  try {
    if (ext === ".csv") {
      // Strip a UTF-8 BOM and accept both LF and CRLF line endings.
      const content = fs.readFileSync(filePath, "utf-8").replace(/^\uFEFF/, "");
      const lines = content.split(/\r?\n/).filter((l) => l.trim()).slice(0, 5000); // cap rows
      if (lines.length === 0) return "";
      const headers = lines[0].split(",").map(toCell);
      let md = toRow(headers) + toSeparator(headers.length);
      for (const line of lines.slice(1)) {
        md += toRow(line.split(",").map(toCell));
        if (md.length > TEXT_CAP) break;
      }
      return md;
    }

    const XLSX = _require("xlsx");
    const workbook = XLSX.readFile(filePath, { sheetRows: 5000 }); // cap rows per sheet
    let md = "";
    for (const sheetName of workbook.SheetNames) {
      const sheet = workbook.Sheets[sheetName];
      const data: unknown[][] = XLSX.utils.sheet_to_json(sheet, { header: 1 });
      md += `## ${sheetName}\n\n`;
      if (data.length > 0) {
        // Array.from visits holes of sparse rows, so empty cells become "".
        const headers = Array.from(data[0] ?? [], toCell);
        md += toRow(headers) + toSeparator(headers.length);
        for (const row of data.slice(1)) {
          md += toRow(headers.map((_, idx) => toCell(row[idx])));
          if (md.length > TEXT_CAP) break;
        }
        md += "\n";
      }
      if (md.length > TEXT_CAP) break;
    }
    return md;
  } catch (e) {
    logger.error({ err: e }, "[extractSpreadsheet] failed");
    return "";
  }
}
/**
 * Extracts the text runs (`<a:t>` elements) of every slide in a .pptx file.
 *
 * Slides are processed in numeric order (slide2 before slide10). The runs of
 * one slide are joined with spaces and slides are separated by blank lines.
 *
 * @param filePath Path to the presentation.
 * @returns Text capped at TEXT_CAP characters, or "" on failure (the failure is logged).
 */
async function extractPptx(filePath: string): Promise<string> {
  const slideNumber = (name: string): number =>
    Number(/slide(\d+)\.xml$/.exec(name)?.[1] ?? 0);
  // Decode the five predefined XML entities; "&amp;" last to avoid double-decoding.
  const decodeXml = (s: string): string =>
    s
      .replace(/&lt;/g, "<")
      .replace(/&gt;/g, ">")
      .replace(/&quot;/g, '"')
      .replace(/&apos;/g, "'")
      .replace(/&amp;/g, "&");

  try {
    const JSZip = (await import("jszip")).default;
    const content = fs.readFileSync(filePath);
    const zip = await JSZip.loadAsync(content);

    const slideFiles = Object.keys(zip.files)
      .filter((f) => /^ppt\/slides\/slide\d+\.xml$/.test(f))
      // A plain string sort would order slide10 before slide2.
      .sort((a, b) => slideNumber(a) - slideNumber(b));

    let text = "";
    for (const slideFile of slideFiles) {
      const xml = await zip.files[slideFile].async("string");
      const runs: string[] = [];
      // Allow attributes on <a:t> and line breaks inside the run text.
      for (const m of xml.matchAll(/<a:t(?:\s[^>]*)?>([\s\S]*?)<\/a:t>/g)) {
        const run = decodeXml((m[1] ?? "").replace(/<[^>]+>/g, "")).trim();
        if (run) runs.push(run);
      }
      const slideText = runs.join(" ");
      if (slideText) text += slideText + "\n\n";
      if (text.length > TEXT_CAP) break;
    }
    return text.length > TEXT_CAP ? text.slice(0, TEXT_CAP) : text;
  } catch (e) {
    logger.error({ err: e }, "[extractPptx] failed");
    return "";
  }
}
/**
 * Extracts plain text from every (X)HTML document inside an EPUB archive.
 *
 * Documents are visited in the order the archive lists them.
 * NOTE(review): that order is not necessarily the reading order defined by
 * the OPF spine — confirm whether spine ordering is needed.
 *
 * @param filePath Path to the .epub file.
 * @returns Text capped at TEXT_CAP characters, or "" on failure (the failure is logged).
 */
async function extractEpub(filePath: string): Promise<string> {
  try {
    const JSZip = (await import("jszip")).default;
    const content = fs.readFileSync(filePath);
    const zip = await JSZip.loadAsync(content);
    let text = "";
    for (const [filename, entry] of Object.entries(zip.files)) {
      // Matches .html, .xhtml and .htm (case-insensitive); skip directory entries.
      if (entry.dir || !/\.x?html?$/i.test(filename)) continue;
      const html = await entry.async("string");
      text += htmlToPlainText(html) + "\n\n";
      if (text.length > TEXT_CAP) break;
    }
    return text.length > TEXT_CAP ? text.slice(0, TEXT_CAP) : text;
  } catch (e) {
    logger.error({ err: e }, "[extractEpub] failed");
    return "";
  }
}
/**
 * Decodes the handful of HTML entities this module cares about.
 * "&amp;" is decoded last so that "&amp;lt;" becomes the literal text "&lt;"
 * rather than "<".
 */
function decodeBasicHtmlEntities(s: string): string {
  return s
    .replace(/&lt;/g, "<")
    .replace(/&gt;/g, ">")
    .replace(/&nbsp;/g, " ")
    .replace(/&quot;/g, '"')
    .replace(/&#39;|&apos;/g, "'")
    .replace(/&amp;/g, "&");
}

/**
 * Converts an HTML fragment to lightweight Markdown-flavoured plain text.
 *
 * `<h1>`–`<h6>` become `#` headings, `<p>` becomes a paragraph, `<li>` becomes
 * a `- ` bullet and `<br>` a newline. `<script>` / `<style>` blocks are
 * removed with their content, all remaining tags are stripped, entities are
 * decoded once at the very end, and runs of 3+ newlines are collapsed.
 */
function htmlToPlainText(html: string): string {
  // Tag removal only — entities must stay encoded until every tag is gone,
  // otherwise escaped text such as "&lt;b&gt;" would be stripped as a tag.
  const inner = (s: string): string => s.replace(/<[^>]+>/g, "").trim();

  const text = html
    .replace(/<(script|style)\b[^>]*>[\s\S]*?<\/\1\s*>/gi, "")
    .replace(
      /<h([1-6])\b[^>]*>(.*?)<\/h\1>/gis,
      (_m: string, level: string, content: string) =>
        "\n" + "#".repeat(Number(level)) + " " + inner(content) + "\n",
    )
    // \b keeps <pre>, <param>, … from being mistaken for <p>.
    .replace(/<p\b[^>]*>(.*?)<\/p>/gis, (_m: string, content: string) => "\n" + inner(content) + "\n")
    .replace(/<li\b[^>]*>(.*?)<\/li>/gis, "- $1\n")
    .replace(/<br\s*\/?>/gi, "\n")
    .replace(/<[^>]+>/g, "");

  return decodeBasicHtmlEntities(text)
    .replace(/\n{3,}/g, "\n\n")
    .trim();
}

/** Removes all tags from an HTML fragment, decodes basic entities and trims. */
function stripTags(s: string): string {
  return decodeBasicHtmlEntities(s.replace(/<[^>]+>/g, "")).trim();
}
| // ═══════════════════════════════════════════════════════════════════════════ | |
| // Stats & Utilities | |
| // ═══════════════════════════════════════════════════════════════════════════ | |
/**
 * Computes simple structural statistics for a Markdown document and a
 * heuristic quality score.
 *
 * The score starts at 72 and earns bonuses for headings (3 each, max 12),
 * bold spans (max 10), list items (max 8), the presence of table rows (+4),
 * the presence of fenced code (+2) and length (1 per 50 words, max 10); it is
 * capped at 98.
 *
 * @returns `{ wordCount, headings, boldItems, listItems, qualityEstimate }`.
 */
function computeStats(md: string) {
  const occurrences = (pattern: RegExp): number => (md.match(pattern) || []).length;

  const wordCount = md.split(/\s+/).filter(Boolean).length;
  const headings = occurrences(/^#{1,6}\s/gm);
  const boldItems = occurrences(/\*\*[^*]+\*\*/g);
  const listItems = occurrences(/^[-*+]\s/gm);
  const tableRows = occurrences(/^\|/gm);
  const codeBlocks = occurrences(/```/g) / 2;

  // Bonuses are summed in a fixed order on top of the base score.
  const bonuses = [
    Math.min(headings * 3, 12),
    Math.min(boldItems, 10),
    Math.min(listItems, 8),
    tableRows > 0 ? 4 : 0,
    codeBlocks > 0 ? 2 : 0,
    Math.min(wordCount / 50, 10),
  ];
  let score = 72;
  for (const bonus of bonuses) score += bonus;
  const qualityEstimate = Math.min(98, score);

  return { wordCount, headings, boldItems, listItems, qualityEstimate };
}
/**
 * Normalises Markdown whitespace: CRLF → LF, trailing spaces/tabs removed from
 * every line, runs of 4+ newlines reduced to 3, and the result trimmed.
 */
function cleanMarkdown(md: string): string {
  const unixNewlines = md.replace(/\r\n/g, "\n");
  const withoutTrailingBlanks = unixNewlines.replace(/[ \t]+$/gm, "");
  const collapsed = withoutTrailingBlanks.replace(/\n{4,}/g, "\n\n\n");
  return collapsed.trim();
}
/**
 * Classifies text as "ar", "en" or "mixed" by comparing the number of Arabic
 * characters (U+0600–U+06FF) with the number of ASCII Latin letters.
 *
 * Arabic is checked first and wins whenever its count exceeds 60% of the
 * Latin count; otherwise Latin wins when its count exceeds 60% of the Arabic
 * count. Text containing neither script is "mixed".
 */
function detectLanguage(text: string): string {
  let arabic = 0;
  let latin = 0;
  for (let i = 0; i < text.length; i++) {
    const code = text.charCodeAt(i);
    if (code >= 0x0600 && code <= 0x06ff) {
      arabic++;
    } else if ((code >= 0x41 && code <= 0x5a) || (code >= 0x61 && code <= 0x7a)) {
      latin++;
    }
  }
  if (arabic > latin * 0.6) return "ar";
  if (latin > arabic * 0.6) return "en";
  return "mixed";
}
/** Returns a promise that resolves after roughly `ms` milliseconds. */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((resolve) => {
    setTimeout(resolve, ms);
  });
}
// Hard upper bound for a single conversion job: 15 minutes.
const CONVERSION_TIMEOUT_MS = 15 * 60 * 1000;

/**
 * Public entry point for a conversion job.
 *
 * Runs `runConversionCore` under `withTimeout` with CONVERSION_TIMEOUT_MS.
 * If that rejects, the conversion row is marked "failed" with the error
 * message and the file row is marked "failed" as well.
 */
async function runConversion(conversionId: string, fileId: string, storagePath: string) {
  try {
    await withTimeout(
      runConversionCore(conversionId, fileId, storagePath),
      CONVERSION_TIMEOUT_MS,
      "تحويل الملف"
    );
  } catch (failure) {
    let errorMessage = "انتهت مهلة التحويل";
    if (failure instanceof Error) {
      errorMessage = failure.message;
    }

    // Conversion row first, then the owning file row.
    await db
      .update(conversionsTable)
      .set({ status: "failed", errorMessage })
      .where(eq(conversionsTable.id, conversionId));

    await db
      .update(filesTable)
      .set({ status: "failed", updatedAt: new Date() })
      .where(eq(filesTable.id, fileId));
  }
}
| // ═══════════════════════════════════════════════════════════════════════════ | |
| // Routes | |
| // ═══════════════════════════════════════════════════════════════════════════ | |
// POST /api/convert/upload
// Accepts one multipart file (field "file") plus optional `pageStart`,
// `pageEnd` and `folderId` fields. Creates a file row and a conversion row,
// starts the conversion in the background and answers 202 with the job
// descriptor. Answers 400 when no file was sent or a page bound is not a
// non-negative integer.
router.post("/upload", upload.single("file"), async (req: AuthRequest, res) => {
  try {
    if (!req.file) {
      res.status(400).json({ error: "validation", message: "لم يتم رفع أي ملف" });
      return;
    }
    const { pageStart, pageEnd, folderId } = req.body;

    // Empty / missing → null (no bound); valid → number; anything else → undefined.
    const toPage = (value: unknown): number | null | undefined => {
      if (!value) return null;
      const n = Number(value);
      return Number.isInteger(n) && n >= 0 ? n : undefined;
    };
    const startPage = toPage(pageStart);
    const endPage = toPage(pageEnd);
    if (startPage === undefined || endPage === undefined) {
      // Nothing references the upload yet, so remove it (best effort).
      void fs.promises.unlink(req.file.path).catch(() => { /* ignore */ });
      res.status(400).json({ error: "validation", message: "نطاقات الصفحات غير صالحة" });
      return;
    }

    const originalName = fixFilename(req.file.originalname);
    const fileName = path.parse(originalName).name;
    const [file] = await db
      .insert(filesTable)
      .values({
        name: fileName + ".md",
        ownerId: req.userId!,
        folderId: folderId || null,
        originalName,
        originalType: req.file.mimetype,
        sizeBytes: req.file.size,
        storagePath: req.file.path,
        status: "queued",
      })
      .returning();
    const [conversion] = await db
      .insert(conversionsTable)
      .values({
        fileId: file.id,
        userId: req.userId!,
        status: "queued",
        progress: 0,
        steps: initSteps(),
        pageStart: startPage,
        pageEnd: endPage,
      })
      .returning();

    // Fire and forget: the job reports its outcome through the database.
    runConversion(conversion.id, file.id, req.file.path).catch((err) =>
      req.log?.error({ err }, "background conversion error")
    );

    res.status(202).json({
      jobId: conversion.id,
      fileId: file.id,
      status: "queued",
      progress: 0,
      steps: initSteps(),
      createdAt: conversion.createdAt,
    });
  } catch (err) {
    const e = err instanceof Error ? err : new Error(String(err));
    const cause = (e as NodeJS.ErrnoException & { cause?: Error }).cause;
    const rootMsg = cause?.message ?? e.message;
    console.error("[RAQIM] /upload error:", rootMsg, "\n outer:", e.message, "\n stack:", e.stack);
    req.log?.error({ err, cause: cause?.message }, "upload error");
    // NOTE(review): rootMsg may expose internal (e.g. database) error text to the client.
    res.status(500).json({ error: "server_error", message: rootMsg || "فشل الرفع" });
  }
});
// POST /api/convert/upload-split — upload once, create N conversion jobs
// Expects one multipart file (field "file") and a `ranges` field holding a
// JSON array of { start, end, label }. Every range gets its own file row and
// conversion job, all pointing at the same uploaded file. All ranges are
// validated before anything is written, so a bad entry cannot leave a
// half-created batch behind. Answers 202 with the list of created jobs.
router.post("/upload-split", upload.single("file"), async (req: AuthRequest, res) => {
  try {
    if (!req.file) {
      res.status(400).json({ error: "validation", message: "لم يتم رفع أي ملف" });
      return;
    }
    const uploaded = req.file;
    const { ranges: rangesJson, folderId } = req.body;

    const reject = (message: string): void => {
      // Nothing references the upload yet, so remove it (best effort).
      void fs.promises.unlink(uploaded.path).catch(() => { /* ignore */ });
      res.status(400).json({ error: "validation", message });
    };

    let parsed: unknown;
    try {
      parsed = JSON.parse(rangesJson || "[]");
    } catch {
      reject("نطاقات الصفحات غير صالحة");
      return;
    }
    if (!Array.isArray(parsed)) {
      reject("نطاقات الصفحات غير صالحة");
      return;
    }
    if (!parsed.length) {
      reject("يجب تحديد نطاق واحد على الأقل");
      return;
    }

    // Missing / falsy bound → null (no bound); otherwise it must be a non-negative integer.
    const toBound = (value: unknown): number | null | undefined => {
      if (!value) return null;
      const n = Number(value);
      return Number.isInteger(n) && n >= 0 ? n : undefined;
    };
    const ranges: Array<{ start: number | null; end: number | null; label: string }> = [];
    for (const entry of parsed) {
      if (typeof entry !== "object" || entry === null) {
        reject("نطاقات الصفحات غير صالحة");
        return;
      }
      const raw = entry as { start?: unknown; end?: unknown; label?: unknown };
      const start = toBound(raw.start);
      const end = toBound(raw.end);
      if (start === undefined || end === undefined) {
        reject("نطاقات الصفحات غير صالحة");
        return;
      }
      const label =
        typeof raw.label === "string" && raw.label.trim()
          ? raw.label
          : `${start ?? ""}-${end ?? ""}`;
      ranges.push({ start, end, label });
    }

    const originalName = fixFilename(uploaded.originalname);
    const baseName = path.parse(originalName).name;
    const jobs = [];
    for (const range of ranges) {
      const partName = `${baseName} — ${range.label}.md`;
      const [file] = await db
        .insert(filesTable)
        .values({
          name: partName,
          ownerId: req.userId!,
          folderId: folderId || null,
          originalName,
          originalType: uploaded.mimetype,
          sizeBytes: uploaded.size,
          storagePath: uploaded.path,
          status: "queued",
        })
        .returning();
      const [conversion] = await db
        .insert(conversionsTable)
        .values({
          fileId: file.id,
          userId: req.userId!,
          status: "queued",
          progress: 0,
          steps: initSteps(),
          pageStart: range.start || null,
          pageEnd: range.end || null,
        })
        .returning();

      // Fire and forget: each job reports its outcome through the database.
      runConversion(conversion.id, file.id, uploaded.path).catch((err) =>
        req.log?.error({ err }, "split conversion error")
      );
      jobs.push({ jobId: conversion.id, fileId: file.id, name: partName });
    }
    res.status(202).json({ jobs });
  } catch (err) {
    const e = err instanceof Error ? err : new Error(String(err));
    const cause = (e as NodeJS.ErrnoException & { cause?: Error }).cause;
    const rootMsg = cause?.message ?? e.message;
    console.error("[RAQIM] /upload-split error:", rootMsg, "\n outer:", e.message, "\n stack:", e.stack);
    req.log?.error({ err, cause: cause?.message }, "upload-split error");
    // NOTE(review): rootMsg may expose internal (e.g. database) error text to the client.
    res.status(500).json({ error: "server_error", message: rootMsg || "فشل الرفع" });
  }
});
// POST /api/convert — start a conversion for a file the caller already owns
//
// JSON body: { fileId, pageStart?, pageEnd? }.
// Responds 202 with the queued job, 400 on malformed input, 404 when the file
// does not exist, is not owned by the caller, or has no stored upload.
router.post("/", async (req: AuthRequest, res) => {
  // Normalises one page bound taken from the request body. Falsy values mean
  // "no bound" (null); anything else must be a positive integer (numeric
  // strings are accepted). Returns undefined when the value is invalid.
  const toPageBound = (value: unknown): number | null | undefined => {
    if (!value) return null;
    const page = typeof value === "string" ? Number(value) : value;
    return typeof page === "number" && Number.isInteger(page) && page > 0 ? page : undefined;
  };

  try {
    // Tolerate a missing body so the request fails validation instead of
    // throwing on destructuring.
    const { fileId, pageStart, pageEnd } = req.body ?? {};

    // Only scalar ids can be compared against the id column; reject anything
    // else up front rather than letting the query fail with a 500.
    const isScalarId =
      (typeof fileId === "string" && fileId !== "") || typeof fileId === "number";
    if (!isScalarId) {
      res.status(400).json({ error: "validation", message: "معرّف الملف مطلوب" });
      return;
    }

    const startPage = toPageBound(pageStart);
    const endPage = toPageBound(pageEnd);
    if (startPage === undefined || endPage === undefined) {
      res.status(400).json({ error: "validation", message: "نطاقات الصفحات غير صالحة" });
      return;
    }

    // The owner filter keeps other users' files indistinguishable from missing ones.
    const file = await db.query.filesTable.findFirst({
      where: and(eq(filesTable.id, fileId), eq(filesTable.ownerId, req.userId!)),
    });
    if (!file || !file.storagePath) {
      res.status(404).json({ error: "not_found", message: "الملف غير موجود" });
      return;
    }

    const [conversion] = await db
      .insert(conversionsTable)
      .values({
        fileId: file.id,
        userId: req.userId!,
        status: "queued",
        progress: 0,
        steps: initSteps(),
        pageStart: startPage,
        pageEnd: endPage,
      })
      .returning();

    // The conversion runs in the background and is not awaited; failures are only logged.
    runConversion(conversion.id, file.id, file.storagePath).catch((err) =>
      req.log?.error({ err }, "background conversion error")
    );

    res.status(202).json({
      jobId: conversion.id,
      fileId,
      status: "queued",
      progress: 0,
      steps: initSteps(),
      createdAt: conversion.createdAt,
    });
  } catch (err) {
    req.log?.error({ err }, "convert error");
    res.status(500).json({ error: "server_error", message: "فشل التحويل" });
  }
});
// GET /api/convert/:jobId — report the current state of one of the caller's jobs
//
// Responds 200 with the job snapshot, 404 when no conversion with that id
// belongs to the authenticated user, and 500 on any lookup failure.
router.get("/:jobId", async (req: AuthRequest, res) => {
  try {
    const requestedId = req.params.jobId as string;

    // Match on both id and owner so other users' jobs look like missing ones.
    const ownedJob = and(
      eq(conversionsTable.id, requestedId),
      eq(conversionsTable.userId, req.userId!)
    );
    const job = await db.query.conversionsTable.findFirst({ where: ownedJob });

    if (job) {
      const {
        id,
        fileId,
        status,
        progress,
        steps,
        elapsedSeconds,
        estimatedSeconds,
        errorMessage,
        createdAt,
      } = job;
      res.json({
        jobId: id,
        fileId,
        status,
        progress,
        steps,
        // Always reported as null by this endpoint.
        queuePosition: null,
        elapsedSeconds,
        estimatedSeconds,
        errorMessage,
        createdAt,
      });
      return;
    }

    res.status(404).json({ error: "not_found", message: "المهمة غير موجودة" });
  } catch (err) {
    req.log?.error({ err }, "get conversion error");
    res.status(500).json({ error: "server_error", message: "فشل جلب الحالة" });
  }
});
| export default router; | |