// raqim/artifacts/api-server/src/routes/convert.ts
// RAQIM Deploy
// Deploy RAQIM 2026-05-02 23:08
// 3e9069b
import { Router } from "express";
import multer from "multer";
import path from "path";
import fs from "fs";
import { createRequire } from "module";
import { db } from "@workspace/db";
import { filesTable, conversionsTable } from "@workspace/db";
import { eq, and } from "drizzle-orm";
import { requireAuth, AuthRequest } from "../middlewares/auth.js";
import { logger } from "../lib/logger.js";
// Module-level require() for resolving peer package paths (works in ESM + esbuild bundles)
// It is created from this module's own URL (import.meta.url).
const _require = createRequire(import.meta.url);
// Express router that carries the routes declared in this file.
const router = Router();
// requireAuth is registered as router-level middleware before any route handler.
router.use(requireAuth);
// In production, use /data/uploads (persistent HF Spaces volume).
// /tmp/uploads is a tmpfs that starts empty at container boot — unreliable.
// In any other NODE_ENV the directory is "<current working directory>/uploads".
const uploadDir =
process.env.NODE_ENV === "production"
? "/data/uploads"
: path.join(process.cwd(), "uploads");
// Create the directory (and any missing parents) synchronously at module load.
// A failure is only logged, so the module still loads.
// NOTE(review): the rest of this file logs through `logger`; this uses console.error — confirm intended.
try {
fs.mkdirSync(uploadDir, { recursive: true });
} catch (e) {
console.error("[RAQIM] Failed to create upload dir:", uploadDir, e);
}
/**
 * Repairs an upload filename whose UTF-8 bytes were decoded as Latin-1.
 *
 * Multer decodes the filename header as Latin-1 by default, so a UTF-8 name
 * arrives as mojibake; re-encoding to Latin-1 bytes and decoding as UTF-8
 * restores it. The conversion is skipped when it would damage the name:
 *   - the name contains a code point above U+00FF, which a Latin-1 decode can
 *     never produce, so it is already correctly decoded;
 *   - the bytes are not valid UTF-8 (the decode yields U+FFFD), so the name
 *     was genuine Latin-1 text.
 *
 * @param raw Filename as provided by multer.
 * @returns The repaired filename, or `raw` unchanged when no repair applies.
 */
function fixFilename(raw: string): string {
  // Anything outside U+0000–U+00FF proves the string is not a Latin-1 decode.
  if (/[^\u0000-\u00ff]/.test(raw)) return raw;
  const decoded = Buffer.from(raw, "latin1").toString("utf8");
  // Buffer#toString never throws; malformed UTF-8 shows up as U+FFFD instead.
  return decoded.includes("\uFFFD") ? raw : decoded;
}
// Disk storage: files are written into uploadDir and named
// "<Date.now() timestamp>-<original filename after fixFilename()>".
const storage = multer.diskStorage({
destination: uploadDir,
filename: (_, file, cb) => cb(null, `${Date.now()}-${fixFilename(file.originalname)}`),
});
// Multer instance using the storage above; fileSize limit is 500 * 1024 * 1024 bytes (500 MiB).
const upload = multer({ storage, limits: { fileSize: 500 * 1024 * 1024 } });
// Ordered pipeline stages. `name` is the machine identifier, `label` the
// Arabic text attached to the stage.
const CONVERSION_STEPS = [
  { name: "analyzing", label: "تحليل الملف والتعرف على نوعه" },
  { name: "routing", label: "توجيه ذكي لأنسب محركات المعالجة" },
  { name: "ocr", label: "استخراج النص الخام (OCR / Parser)" },
  { name: "layout", label: "المهندس الذكي — إعادة بناء التنسيق" },
  { name: "scoring", label: "تقييم الجودة وإحصاء العناصر" },
  { name: "merging", label: "دمج الطبقات ومعالجة الهيكل النهائي" },
  { name: "cleanup", label: "تنظيف وتلميع المستند" },
];

/**
 * Builds a fresh, independently mutable step list for one conversion run.
 *
 * @returns A copy of every entry in CONVERSION_STEPS with `status: "pending"` added.
 */
function initSteps() {
  const asPending = (step: (typeof CONVERSION_STEPS)[number]) => ({
    ...step,
    status: "pending",
  });
  return CONVERSION_STEPS.map((step) => asPending(step));
}
/**
 * Races a promise against a deadline.
 *
 * @param promise Work to wait for.
 * @param ms Deadline in milliseconds.
 * @param label Text appended to the timeout error message.
 * @returns A promise that settles like `promise`, or rejects with an Error
 *          when `ms` elapses first. The timer is cleared once the result settles.
 */
function withTimeout<T>(promise: Promise<T>, ms: number, label: string): Promise<T> {
  let timer: ReturnType<typeof setTimeout> | undefined;
  const deadline = new Promise<never>((_resolve, reject) => {
    timer = setTimeout(() => reject(new Error(`تجاوز الوقت المحدد: ${label}`)), ms);
  });
  return Promise.race([promise, deadline]).finally(() => clearTimeout(timer));
}
/**
 * Runs the seven-stage conversion pipeline for one uploaded file and persists
 * progress after every stage.
 *
 * Stages (see CONVERSION_STEPS): analyzing → routing → ocr (text extraction,
 * chosen by file extension) → layout (rule-based Markdown structuring) →
 * scoring → merging → cleanup. On success the Markdown, word count, quality
 * score and language are written to the file row and the conversion row is
 * marked "done". On any thrown error the current step is marked "failed" and
 * both rows are set to "failed" with the error message.
 *
 * Reported progress is monotonic: a stage can never write a percentage lower
 * than one already stored, and non-finite values are ignored.
 *
 * @param conversionId Primary key of the conversions row to update.
 * @param fileId Primary key of the files row to update.
 * @param storagePath Path of the uploaded file on disk; its extension selects the extractor.
 */
async function runConversionCore(conversionId: string, fileId: string, storagePath: string) {
  const steps = initSteps();
  let stepIndex = 0;
  const startTime = Date.now();
  const elapsedSeconds = () => Math.floor((Date.now() - startTime) / 1000);
  // Highest percentage written so far; keeps the stored progress from moving backwards.
  let lastProgress = 0;

  // Read page range set at upload time
  const convRecord = await db.query.conversionsTable.findFirst({
    where: eq(conversionsTable.id, conversionId),
  });
  const pageStart = convRecord?.pageStart ?? undefined;
  const pageEnd = convRecord?.pageEnd ?? undefined;

  // Persists status/progress/steps. The optional human-readable message is
  // stored in the errorMessage column while the conversion is running.
  const updateProgress = async (
    status: string,
    progress: number,
    stepsDone: typeof steps,
    aiMessage?: string
  ) => {
    // Different stages use overlapping percentage ranges (e.g. the PDF path
    // reaches 55 before "layout" reports 45); clamp so progress never regresses.
    // Number.isFinite also guards against NaN from a zero page total.
    if (Number.isFinite(progress) && progress > lastProgress) lastProgress = progress;
    await db
      .update(conversionsTable)
      .set({
        status: status as any,
        progress: lastProgress,
        steps: stepsDone,
        elapsedSeconds: elapsedSeconds(),
        ...(aiMessage ? { errorMessage: aiMessage } : {}),
      })
      .where(eq(conversionsTable.id, conversionId));
  };

  try {
    const ext = path.extname(storagePath).toLowerCase();
    let rawText = "";

    // ── Step 1: Analyzing ───────────────────────────────────────────────
    stepIndex = 0;
    steps[0].status = "running";
    await updateProgress("analyzing", 5, steps, "جاري تحليل نوع الملف والبنية الداخلية...");
    await sleep(600);
    steps[0].status = "done";

    // ── Step 2: Routing ─────────────────────────────────────────────────
    stepIndex = 1;
    steps[1].status = "running";
    await updateProgress("routing", 12, steps, "اختيار أنسب محرك استخراج للملف...");
    await sleep(400);
    steps[1].status = "done";

    // ── Step 3: OCR / Text Extraction ───────────────────────────────────
    stepIndex = 2;
    steps[2].status = "running";
    await updateProgress("ocr", 20, steps, "جاري استخراج النص من الملف...");
    if ([".txt", ".md"].includes(ext)) {
      rawText = fs.readFileSync(storagePath, "utf-8");
    } else if (ext === ".pdf") {
      rawText = await extractPdf(storagePath, pageStart, pageEnd);
      await updateProgress("ocr", 28, steps, "تم استخراج النص الخام من الـ PDF...");
      // If text appears garbled (broken ToUnicode CMap in font), fall back to
      // rendering each page as an image and running Tesseract OCR on it.
      // This completely bypasses the CMap issue and works offline/without any API key.
      if (isGarbledArabic(rawText)) {
        await updateProgress("ocr", 30, steps, "تم رصد خلل في ترميز الخط — جاري استخدام OCR للحصول على نص دقيق...");
        const ocrText = await extractPdfViaOcr(storagePath, pageStart, pageEnd,
          (done, total) => updateProgress("ocr", 30 + Math.round((done / total) * 20), steps,
            `جاري تحليل الصفحات بواسطة OCR... (${done}/${total})`)
        );
        // Only trust the OCR result when it produced a meaningful amount of text.
        if (ocrText.length > 50) {
          rawText = ocrText;
          await updateProgress("ocr", 50, steps, "تم استخراج النص بواسطة OCR بدقة عالية ✓");
        }
      }
      // Optional AI polish — free on Replit (AI proxy) and on HF Spaces (HF_TOKEN).
      rawText = await correctArabicText(rawText, (msg, pct) =>
        updateProgress("ocr", pct, steps, msg)
      );
      await updateProgress("ocr", 55, steps, "اكتمل استخراج النص العربي ✓");
    } else if ([".docx", ".doc"].includes(ext)) {
      rawText = await extractDocx(storagePath);
      await updateProgress("ocr", 38, steps, "تم استخراج نص ملف Word...");
    } else if ([".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".webp", ".gif"].includes(ext)) {
      rawText = await extractImage(storagePath);
      await updateProgress("ocr", 38, steps, "تم استخراج النص من الصورة بتقنية OCR...");
    } else if ([".xlsx", ".xls", ".csv"].includes(ext)) {
      rawText = await extractSpreadsheet(storagePath, ext);
      await updateProgress("ocr", 38, steps, "تم تحليل جداول البيانات...");
    } else if ([".html", ".htm"].includes(ext)) {
      const html = fs.readFileSync(storagePath, "utf-8");
      rawText = htmlToPlainText(html);
      await updateProgress("ocr", 38, steps, "تم تحليل ملف HTML...");
    } else if ([".pptx", ".ppt"].includes(ext)) {
      rawText = await extractPptx(storagePath);
      await updateProgress("ocr", 38, steps, "تم استخراج نصوص الشرائح...");
    } else if ([".epub"].includes(ext)) {
      rawText = await extractEpub(storagePath);
      await updateProgress("ocr", 38, steps, "تم استخراج نصوص الكتاب الإلكتروني...");
    } else {
      // Unknown extension: best-effort read as UTF-8 text, capped at 100 000 characters.
      try {
        rawText = fs.readFileSync(storagePath, "utf-8").substring(0, 100000);
      } catch {
        rawText = `# ملف ثنائي\n\nلا يمكن استخراج نص من هذا النوع من الملفات مباشرة.`;
      }
    }
    steps[2].status = "done";

    // ── Step 4: Rule-Based Architect — 100% Free, No Limits ─────────────
    stepIndex = 3;
    steps[3].status = "running";
    await updateProgress("layout", 45, steps, "المهندس الذكي يعيد بناء هيكل المستند...");
    const architectMarkdown = runRuleBasedArchitect(rawText, ext);
    await updateProgress("layout", 68, steps, "اكتمل تحليل وهيكلة المستند");
    steps[3].status = "done";

    // ── Step 5: Scoring ─────────────────────────────────────────────────
    stepIndex = 4;
    steps[4].status = "running";
    await updateProgress("scoring", 75, steps, "جاري قياس الجودة وإحصاء العناصر...");
    const stats = computeStats(architectMarkdown);
    await sleep(400);
    steps[4].status = "done";

    // ── Step 6: Merging ─────────────────────────────────────────────────
    stepIndex = 5;
    steps[5].status = "running";
    await updateProgress("merging", 85, steps, "دمج الطبقات وتثبيت الهيكل النهائي...");
    await sleep(350);
    steps[5].status = "done";

    // ── Step 7: Cleanup ─────────────────────────────────────────────────
    stepIndex = 6;
    steps[6].status = "running";
    await updateProgress("cleanup", 93, steps, "التلميع النهائي والتحقق من سلامة النص...");
    const finalMarkdown = cleanMarkdown(architectMarkdown);
    await sleep(300);
    steps[6].status = "done";

    // ── Done ─────────────────────────────────────────────────────────────
    // The stored quality score is clamped into the 72–98 range.
    const qualityScore = Math.min(98, Math.max(72, stats.qualityEstimate));
    await db
      .update(filesTable)
      .set({
        markdownContent: finalMarkdown,
        originalMarkdown: finalMarkdown,
        status: "done",
        wordCount: stats.wordCount,
        qualityScore,
        language: detectLanguage(finalMarkdown),
        updatedAt: new Date(),
      })
      .where(eq(filesTable.id, fileId));
    await db
      .update(conversionsTable)
      .set({
        status: "done",
        progress: 100,
        steps,
        completedAt: new Date(),
        elapsedSeconds: elapsedSeconds(),
        // Clears the last in-progress message that updateProgress stored here.
        errorMessage: null,
      })
      .where(eq(conversionsTable.id, conversionId));
  } catch (err) {
    const error = err instanceof Error ? err.message : "Unknown error";
    // Mark whichever stage was active when the error was thrown.
    if (steps[stepIndex]) steps[stepIndex].status = "failed";
    await db
      .update(conversionsTable)
      .set({ status: "failed", steps, errorMessage: error, elapsedSeconds: elapsedSeconds() })
      .where(eq(conversionsTable.id, conversionId));
    await db
      .update(filesTable)
      .set({ status: "failed", updatedAt: new Date() })
      .where(eq(filesTable.id, fileId));
  }
}
// ═══════════════════════════════════════════════════════════════════════════
// RULE-BASED ARCHITECT — 100% Free, No External APIs, No Limits
// Handles Arabic academic documents, exams, books, and general text
// ═══════════════════════════════════════════════════════════════════════════
/**
 * Entry point of the rule-based formatter: picks the Arabic or Latin formatter
 * based on the character mix of the extracted text.
 *
 * @param rawText Text produced by an extractor.
 * @param _ext File extension (currently unused).
 * @returns Markdown. Empty or whitespace-only input yields a fixed
 *          "empty document" placeholder; text shorter than 10 characters
 *          (after trimming) is returned unchanged.
 */
function runRuleBasedArchitect(rawText: string, _ext: string): string {
  const trimmed = rawText.trim();
  // Whitespace-only input is truthy, so it must be checked on the trimmed
  // text or the placeholder is never produced for it.
  if (!trimmed) {
    return "# مستند فارغ\n\nلم يتم اكتشاف محتوى نصي في هذا الملف.";
  }
  if (trimmed.length < 10) return rawText;

  const arabicChars = (rawText.match(/[\u0600-\u06FF]/g) || []).length;
  const latinChars = (rawText.match(/[a-zA-Z]/g) || []).length;
  // Arabic formatting wins once Arabic letters reach 40% of the Latin letter count.
  return arabicChars >= latinChars * 0.4
    ? formatArabicDocument(rawText)
    : formatLatinDocument(rawText);
}
// ── Helpers ─────────────────────────────────────────────────────────────────
/**
 * Normalises a single extracted line.
 *
 * Applies, in order: removal of ASCII control characters (tab, LF and CR are
 * kept), removal of zero-width / bidi control characters, removal of a fixed
 * set of decorative symbols, collapsing of whitespace runs to one space, and
 * a final trim.
 *
 * @param line One line of extracted text.
 * @returns The cleaned line (possibly empty).
 */
function cleanOcrLine(line: string): string {
  const passes: Array<[RegExp, string]> = [
    [/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]/g, ""],
    // Strip Unicode bidi / directional control chars that pdfjs embeds from broken-CMap fonts
    [/[\u200B-\u200F\u202A-\u202E\u2066-\u2069\uFEFF]/g, ""],
    [/[□■▪▫▶◀►◄▲▼◆◇○●★☆✓✗✦✧]/g, ""],
    [/\s{2,}/g, " "],
  ];
  let cleaned = line;
  for (const [pattern, replacement] of passes) {
    cleaned = cleaned.replace(pattern, replacement);
  }
  return cleaned.trim();
}
/**
 * Tells whether a line starts with one of the known Arabic header-field names
 * (subject, time, date, class, school, student name, …) followed by optional
 * whitespace and a colon.
 *
 * @param line Line to test; the match is anchored at the first character.
 */
function isMetaLine(line: string): boolean {
  const metaField = /^(المادة|الزمن|النموذج|التاريخ|الصف|الشعبة|المدرسة|اسم الطالب|الاسم|الفصل|المرحلة|الفرقة|الدراسي|الفصل الدراسي|المستوى|الشعبة|المجموعة)\s*[::]/i;
  return metaField.test(line);
}
/**
 * Detects section-opening lines: an Arabic ordinal adverb ("first" … "tenth",
 * with or without tanween) followed by a dash, colon, Arabic comma or
 * whitespace, or "Part/Section/Chapter/Unit" followed by a number, Roman
 * numeral or letter (case-insensitive).
 *
 * @param line Line to test; matches are anchored at the first character.
 */
function isSectionMarker(line: string): boolean {
  const arabicOrdinal = /^(أولاً|أولا|ثانياً|ثانيا|ثالثاً|ثالثا|رابعاً|رابعا|خامساً|خامسا|سادساً|سادسا|سابعاً|سابعا|ثامناً|ثامنا|تاسعاً|تاسعا|عاشراً|عاشرا)\s*[-:،\s]/;
  const latinDivision = /^(Part|Section|Chapter|Unit)\s+[IVXivxA-Z\d]+/i;
  return arabicOrdinal.test(line) || latinDivision.test(line);
}
/**
 * Recognises lines that open a numbered question, in Arabic or English.
 * A line qualifies when it matches any of the patterns below; all patterns
 * are anchored at the start of the line and accept Western and Arabic-Indic digits.
 *
 * @param line Line to test.
 */
function isQuestion(line: string): boolean {
  const starters: RegExp[] = [
    // Arabic question starters
    /^سـ?\s*[\d\u0660-\u0669]+\s*[-:)،\s]/,
    /^سؤال\s+[\d\u0660-\u0669]+/i,
    /^السؤال\s+[\d\u0660-\u0669]+/i,
    /^س\s*[\d\u0660-\u0669]+\s*[-:)،]/,
    // Numbered with parens: (١) or (1)
    /^\([\d\u0660-\u0669]+\)\s+\S/,
    // Numbered with dash: "١- " or "1- " when followed by substantial content
    /^[\u0660-\u0669\d]+\s*[-–—]\s+.{8,}/,
    // English
    /^Q\s*\d+\s*[-:.)]/i,
    /^Question\s+\d+/i,
  ];
  return starters.some((pattern) => pattern.test(line));
}
/**
 * Tells whether a line starts with one of the known Arabic answer/explanation
 * keywords (answer, solution, explanation, note, conclusion, …) followed by
 * optional whitespace and a colon.
 *
 * @param line Line to test; the match is anchored at the first character.
 */
function isKeywordLine(line: string): boolean {
  const keywordLead = /^(التعليل|الإجابة|الإجابه|المطلوب|الحل|الشرح|الدليل|السبب|العلة|ملاحظة|ملاحظه|تنبيه|الفائدة|المقصود|المراد|الاستنتاج|التحليل|التفسير|النتيجة|الخلاصة)\s*[::]/i;
  return keywordLead.test(line);
}
/**
 * Heuristic for "a lone short line that looks like a heading".
 *
 * A line qualifies when it is 3–80 characters long, is not already a Markdown
 * heading, does not start with a list marker or digit, is not a sentence
 * (longer than 30 characters and ending in sentence punctuation), and both
 * neighbouring lines are blank or missing.
 *
 * @param line The line under test.
 * @param lineIndex Index of `line` inside `lines`.
 * @param lines All lines of the document.
 */
function isHeadingCandidate(line: string, lineIndex: number, lines: string[]): boolean {
  const size = line.length;
  const wrongSize = size < 3 || size > 80;
  if (wrongSize) return false;

  const alreadyHeading = /^#{1,6}\s/.test(line);
  if (alreadyHeading) return false;

  const startsLikeList = /^[-*+\d]/.test(line);
  if (startsLikeList) return false;

  const looksLikeSentence = size > 30 && /[.،!؟?]$/.test(line);
  if (looksLikeSentence) return false;

  const isBlankAt = (idx: number) => lines[idx].trim() === "";
  const blankBefore = lineIndex === 0 || isBlankAt(lineIndex - 1);
  const blankAfter = lineIndex >= lines.length - 1 || isBlankAt(lineIndex + 1);
  return blankBefore && blankAfter;
}
/**
 * Expands inline multiple-choice options into a vertical Markdown list.
 *
 * Four notations are tried in order, and the first that yields at least two
 * options wins:
 *   1. Arabic letter + dash      (أ- … ب- …)
 *   2. Arabic letter in parens   ((أ) … (ب) …)
 *   3. Arabic letter + ")"       (أ) … ب) …)
 *   4. Latin a)–d), case-insensitive, emitted in lower case
 * Callers may pass the original (uncleaned) line so multi-space separators survive.
 *
 * NOTE(review): in notations 1–3 the option text is matched with a character
 * class that excludes the marker letters (أ ب ج د) themselves, so an option
 * containing any of those letters is not captured — confirm this is intended.
 *
 * @param line Line to inspect.
 * @returns One "- …" item per option joined by newlines, or null when no notation matches.
 */
function expandMultipleChoice(line: string): string | null {
  // Runs a global pattern over the line and returns [marker, text] for every
  // hit whose text is non-empty after trimming.
  const harvest = (pattern: RegExp): Array<[string, string]> => {
    const found: Array<[string, string]> = [];
    for (let hit = pattern.exec(line); hit !== null; hit = pattern.exec(line)) {
      const body = hit[2].trim();
      if (body) found.push([hit[1], body]);
    }
    return found;
  };
  const asList = (
    options: Array<[string, string]>,
    render: (marker: string, body: string) => string,
  ): string => options.map(([marker, body]) => render(marker, body)).join("\n");

  // Pattern 1: أ- text ب- text ج- text (Arabic with dash, any whitespace between)
  const dashed = harvest(/([أبجد])\s*[-–—]\s*([^أبجد\n-]{1,60}?)(?=\s+[أبجد]\s*[-–—]|\s*$)/g);
  if (dashed.length >= 2) return asList(dashed, (l, t) => `- ${l}- ${t}`);

  // Pattern 2: (أ) text (ب) text
  const parenthesised = harvest(/\(([أبجد])\)\s*([^()أبجد\n]{1,60}?)(?=\s*\([أبجد]\)|\s*$)/g);
  if (parenthesised.length >= 2) return asList(parenthesised, (l, t) => `- (${l}) ${t}`);

  // Pattern 3: أ) text ب) text (without outer parens)
  const rightParen = harvest(/([أبجد])\)\s*([^أبجد()]{1,60}?)(?=\s*[أبجد]\)|\s*$)/g);
  if (rightParen.length >= 2) return asList(rightParen, (l, t) => `- ${l}) ${t}`);

  // Pattern 4: English a) b) c) d) — split by choice marker to avoid char-class issues
  const segments = line.split(/\s+(?=[a-d]\)\s)/i);
  if (segments.length < 2) return null;
  const latin: Array<[string, string]> = [];
  for (const segment of segments) {
    const hit = segment.match(/^([a-d])\)\s+(.*)/i);
    if (hit) latin.push([hit[1].toLowerCase(), hit[2].trim()]);
  }
  return latin.length >= 2 ? asList(latin, (l, t) => `- ${l}) ${t}`) : null;
}
// ── Arabic document formatter ────────────────────────────────────────────────
/**
 * Extracts key/value pairs from a metadata line that may hold several fields.
 *
 * Fields are recognised only when separated by two or more whitespace
 * characters, a tab, "|", "،" or ",", and only when the segment itself passes
 * isMetaLine(). e.g. "المادة: رياضيات   الزمن: ساعة" → [["المادة","رياضيات"],["الزمن","ساعة"]].
 * When no segment qualifies, the whole line is split at its first ":" and
 * returned as a single pair.
 *
 * @param line Metadata line (callers pass the uncleaned line to keep multi-space separators).
 * @returns Zero or more [key, value] tuples, both trimmed.
 */
function splitMetaFields(line: string): Array<[string, string]> {
  const cutAt = (source: string, colonAt: number): [string, string] => [
    source.slice(0, colonAt).trim(),
    source.slice(colonAt + 1).trim(),
  ];

  const pairs: Array<[string, string]> = [];
  for (const piece of line.split(/\s{2,}|\t|[|،,]/)) {
    const segment = piece.trim();
    if (!segment) continue;
    const colonAt = segment.indexOf(":");
    // Each segment must start with a known meta key followed by a colon.
    if (colonAt <= 0 || !isMetaLine(segment)) continue;
    const pair = cutAt(segment, colonAt);
    if (pair[0]) pairs.push(pair);
  }

  // Fallback: treat whole line as single field
  if (pairs.length === 0) {
    const colonAt = line.indexOf(":");
    if (colonAt > 0) pairs.push(cutAt(line, colonAt));
  }
  return pairs;
}
/**
 * Converts extracted Arabic text into Markdown using line-level heuristics.
 *
 * 1. Metadata lines found among the first 15 lines are rendered as a
 *    two-column table at the top of the document.
 * 2. When metadata was found, the first remaining non-empty line is promoted
 *    to an H1 title if it does not look like a question, section marker,
 *    metadata, list item or heading.
 * 3. Every remaining line is classified as: existing heading, section marker
 *    (H2), question (bold), keyword line, inline multiple choice (expanded to
 *    a list), list item, lone short line (H3) or plain content.
 *
 * Only the metadata lines themselves (and the promoted title) are consumed by
 * steps 1–2. Other lines located before or between metadata lines are kept
 * and flow through step 3 instead of being dropped.
 *
 * @param text Raw extracted text.
 * @returns Markdown with runs of three or more newlines collapsed, trimmed.
 */
function formatArabicDocument(text: string): string {
  const rawLines = text.replace(/\r\n/g, "\n").replace(/\r/g, "\n").split("\n");
  const lines = rawLines.map(cleanOcrLine);
  const output: string[] = [];
  // Indices already rendered (metadata rows, promoted title); skipped by the main pass.
  const consumed = new Set<number>();

  // Inserts a blank separator unless the output is empty or already ends with one.
  const ensureBlank = () => {
    if (output.length > 0 && output[output.length - 1] !== "") output.push("");
  };
  // Emits a stand-alone block surrounded by blank lines.
  const pushBlock = (content: string) => {
    ensureBlank();
    output.push(content, "");
  };

  // ── Detect and render metadata block from first 15 lines ──
  const metaIndices: number[] = [];
  for (let j = 0; j < Math.min(15, lines.length); j++) {
    if (lines[j] && isMetaLine(lines[j])) metaIndices.push(j);
  }
  // Each detected meta line may contain multiple inline fields.
  // Use rawLines to preserve double-space separators.
  if (metaIndices.length >= 1) {
    const allPairs: Array<[string, string]> = [];
    for (const idx of metaIndices) {
      for (const pair of splitMetaFields(rawLines[idx] || "")) allPairs.push(pair);
    }
    if (allPairs.length > 0) {
      output.push("| الحقل | القيمة |");
      output.push("| --- | --- |");
      for (const [k, v] of allPairs) output.push(`| ${k} | ${v} |`);
      output.push("");
      // Consume only the metadata lines; jumping past the last one would
      // silently discard any ordinary line that sits before or between them.
      for (const idx of metaIndices) consumed.add(idx);
    }
  }

  // ── Check first content line for document title ──
  let first = 0;
  while (first < lines.length && (consumed.has(first) || !lines[first])) first++;
  if (first < lines.length) {
    const candidate = lines[first];
    const isTitle =
      candidate.length > 3 &&
      candidate.length < 100 &&
      !isQuestion(candidate) &&
      !isSectionMarker(candidate) &&
      !isMetaLine(candidate) &&
      !candidate.startsWith("-") &&
      !candidate.startsWith("#");
    // Only promote to title if metadata was found (strong signal)
    if (isTitle && metaIndices.length > 0) {
      output.push(`# ${candidate}`);
      output.push("");
      consumed.add(first);
    }
  }

  // ── Main pass ──
  for (let i = 0; i < lines.length; i++) {
    if (consumed.has(i)) continue;
    const line = lines[i].trim();
    const rawLine = rawLines[i] || ""; // original line before cleaning (for choice detection)

    if (!line) {
      ensureBlank();
      continue;
    }
    // Already a Markdown heading — keep as-is
    if (/^#{1,6}\s/.test(line)) {
      pushBlock(line);
      continue;
    }
    // Section markers: أولاً / ثانياً / Part I
    if (isSectionMarker(line)) {
      pushBlock(`## ${line}`);
      continue;
    }
    // Question detection
    if (isQuestion(line)) {
      pushBlock(`**${line}**`);
      continue;
    }
    // Keyword lines: التعليل: / الإجابة: / المطلوب:
    if (isKeywordLine(line)) {
      ensureBlank();
      output.push(line);
      continue;
    }
    // Inline multiple choice → vertical list (use rawLine to preserve original spacing)
    const expanded = expandMultipleChoice(rawLine) || expandMultipleChoice(line);
    if (expanded) {
      pushBlock(expanded);
      continue;
    }
    // Already-formatted list items
    if (/^[-*+]\s/.test(line) || /^\d+\.\s/.test(line)) {
      output.push(line);
      continue;
    }
    // Lone short line surrounded by blanks → subheading
    if (isHeadingCandidate(line, i, lines)) {
      pushBlock(`### ${line}`);
      continue;
    }
    // Regular content line
    output.push(line);
  }
  return output.join("\n").replace(/\n{3,}/g, "\n\n").trim();
}
// ── Latin/English document formatter ────────────────────────────────────────
/**
 * Converts extracted Latin-script text into Markdown using line-level heuristics.
 *
 * Each cleaned line is classified, in this order, as: blank, existing Markdown
 * heading, section marker (H2), question (bold), keyword line, inline multiple
 * choice (expanded to a list), ALL-CAPS short line (H3), list item, lone short
 * line (H3) or plain content.
 *
 * @param text Raw extracted text.
 * @returns Markdown with runs of three or more newlines collapsed, trimmed.
 */
function formatLatinDocument(text: string): string {
  const normalised = text.replace(/\r\n/g, "\n").replace(/\r/g, "\n");
  const lines = normalised.split("\n").map(cleanOcrLine);
  const output: string[] = [];

  // Adds a blank separator unless the output is empty or already ends with one.
  const gap = () => {
    const last = output[output.length - 1];
    if (output.length > 0 && last !== "") output.push("");
  };
  // Emits a stand-alone block: separator, content, trailing blank line.
  const block = (content: string) => {
    gap();
    output.push(content);
    output.push("");
  };

  for (const [i, entry] of lines.entries()) {
    const line = entry.trim();

    if (!line) {
      gap();
    } else if (/^#{1,6}\s/.test(line)) {
      block(line);
    } else if (isSectionMarker(line)) {
      block(`## ${line}`);
    } else if (isQuestion(line)) {
      block(`**${line}**`);
    } else if (isKeywordLine(line)) {
      gap();
      output.push(line);
    } else {
      const expanded = expandMultipleChoice(line);
      if (expanded) {
        block(expanded);
      } else if (/^[A-Z][A-Z\s\d:,.-]{4,60}$/.test(line)) {
        // ALL CAPS short line → subheading
        block(`### ${line}`);
      } else if (/^[-*+]\s/.test(line) || /^\d+\.\s/.test(line)) {
        output.push(line);
      } else if (isHeadingCandidate(line, i, lines)) {
        block(`### ${line}`);
      } else {
        output.push(line);
      }
    }
  }
  return output.join("\n").replace(/\n{3,}/g, "\n\n").trim();
}
// ═══════════════════════════════════════════════════════════════════════════
// Extractors
// ═══════════════════════════════════════════════════════════════════════════
// Max characters extracted from any single document (~2 MB of text ≈ 300 k words)
// NOTE(review): not referenced in the portion of the file reviewed here — presumably applied by the extractors; confirm.
const TEXT_CAP = 2_000_000;
// ── Arabic PDF text post-processor ───────────────────────────────────────────
/**
 * Cleans up artifacts introduced by PDF text extraction.
 *
 * Whole text:
 *   - strips zero-width / bidi control characters (LRM, RLM, overrides, BOM).
 * Per line:
 *   - on lines with at least 4 Arabic characters, removes isolated 1–5 letter
 *     ALL-CAPS Latin sequences (broken-CMap noise) unless they are in a short
 *     allow-list of common abbreviations;
 *   - drops "-- X of N --" page markers;
 *   - drops stand-alone page labels: a single Arabic-block character, or 1–3
 *     digits alone on a line;
 *   - collapses table-of-contents leader dots and removes the trailing page number;
 *   - strips a trailing 1–4 digit Arabic page number from lines whose remaining
 *     text is at least 10 characters and contains Arabic.
 * Finally, runs of three or more newlines are collapsed to two.
 *
 * Digits are recognised in Western (0-9), Arabic-Indic (U+0660–U+0669) and
 * Extended Arabic-Indic / Persian (U+06F0–U+06F9) form.
 *
 * NOTE(review): the trailing-number rule also removes legitimate numbers that
 * end an Arabic sentence (e.g. a year written in Arabic digits) — heuristic.
 *
 * @param text Raw text returned by the PDF extractor.
 * @returns Cleaned text, trimmed.
 */
function cleanArabicPdfRaw(text: string): string {
  // Guard: don't remove if the "Latin" word is a common technical abbreviation.
  const KEEP_CAPS = new Set(["PDF", "OCR", "AI", "URL", "API", "HTML", "CSS", "JS", "TS"]);

  // Strip all Unicode bidi / directional control characters that pdfjs-dist
  // embeds when the PDF uses broken ToUnicode CMap fonts.
  const withoutControls = text.replace(/[\u200B-\u200F\u202A-\u202E\u2066-\u2069\uFEFF]/g, "");

  const out: string[] = [];
  for (const original of withoutControls.split("\n")) {
    let line = original;

    // Predominantly Arabic line: remove short ALL-CAPS Latin noise sequences
    // (Arabic glyphs mapped to Latin code points by a broken CMap).
    const arabicCount = (line.match(/[\u0600-\u06FF]/g) ?? []).length;
    if (arabicCount >= 4) {
      line = line
        .replace(/(?<![A-Za-z])([A-Z]{1,5})(?![A-Za-z])/g, (match) =>
          KEEP_CAPS.has(match) ? match : ""
        )
        .replace(/ {2,}/g, " ");
    }
    line = line.trim();

    // Remove "-- X of N --" pdf-parse page markers
    if (/^--\s*\d+\s+of\s+\d+\s*--$/i.test(line)) continue;

    // Remove standalone page labels: a single Arabic letter, or 1–3 numerals alone
    if (/^[\u0600-\u06FF]$/.test(line)) continue;
    if (/^[0-9\u0660-\u0669\u06F0-\u06F9]{1,3}$/.test(line)) continue;

    // Collapse TOC leader-dot lines (4+ dots, possibly space-separated) → clean title
    if (/\.(\s*\.){3,}/.test(line)) {
      const cleaned = line
        .replace(/\.(\s*\.)+\s*/g, " ")
        .replace(/\s+[0-9\u0660-\u0669\u06F0-\u06F9]{1,4}\s*$/, "")
        .replace(/\s{2,}/g, " ")
        .trim();
      if (cleaned.length > 2) out.push(cleaned);
      continue;
    }

    // Strip a trailing Arabic page number from TOC lines that lost their dot-leaders
    const tocTrailing = line.replace(/\s+[\u0660-\u0669\u06F0-\u06F9]{1,4}$/, "");
    if (tocTrailing !== line && tocTrailing.length >= 10 && /[\u0600-\u06FF]/.test(tocTrailing)) {
      out.push(tocTrailing.trim());
      continue;
    }

    // Empty lines are preserved as paragraph breaks; everything else is kept as-is.
    out.push(line);
  }

  // Collapse runs of 3+ newlines to 2
  return out.join("\n").replace(/\n{3,}/g, "\n\n").trim();
}
// ── Arabic text AI correction ────────────────────────────────────────────────
// Endpoints are tried in priority order; on rate-limit / unavailability the
// next one is used. Current chain (the authoritative list is resolveAiEndpoints()):
// 1. Replit AI Integration proxy (AI_INTEGRATIONS_OPENAI_BASE_URL) — gpt-4o
// 2. HF: Qwen/Qwen3-235B-A22B
// 3. HF: Qwen/Qwen3-72B
// 4. HF: meta-llama/Llama-4-Scout-17B-16E-Instruct
// 5. HF: Qwen/Qwen3-30B-A3B
// 6. HF: Qwen/Qwen2.5-72B-Instruct
// 7. HF: meta-llama/Llama-3.3-70B-Instruct
// 8. HF: mistralai/Mistral-Nemo-Instruct-2407
//
// Maximum characters sent to the AI endpoint in one request.
const AI_CHUNK_CHARS = 3000; // larger chunks → fewer API calls
// Per-request timeout: 120 000 ms (two minutes).
const AI_CHUNK_TIMEOUT_MS = 120_000;
// System prompt (Arabic) sent with every correction request. It tells the model to
// remove extraction errors only — stray short Latin fragments, truncated or garbled
// Arabic words, wrong spaces inside words, stray symbols — while keeping common Latin
// technical terms, keeping paragraph / heading / list / Markdown structure, adding
// no new content, and returning only the corrected text.
const AI_SYSTEM_PROMPT =
"أنت نظام متخصص في تصحيح نصوص PDF العربية المستخرجة آلياً. " +
"المهمة: إزالة أخطاء الاستخراج مع الحفاظ التام على المعنى والمحتوى الأصيل. " +
"أنواع الأخطاء الشائعة في هذه النصوص: " +
"١) حروف ومقاطع لاتينية قصيرة مبعثرة داخل النص العربي (مثل OA، BW، Zz، dl، pl) — ضوضاء من ترميز الخط المكسور، احذفها. " +
"٢) كلمات عربية مبتورة أو مشوهة واضحة يمكن تصحيحها من السياق. " +
"٣) مسافات خاطئة داخل الكلمة العربية الواحدة — ادمجها. " +
"٤) رموز متفرقة أو علامات ترقيم غريبة ليست جزءاً من المحتوى — احذفها. " +
"القواعد الصارمة: " +
"أ) احتفظ بالأسماء والمصطلحات التقنية اللاتينية الشائعة (PDF، AI، URL، API...). " +
"ب) حافظ على هيكل الفقرات والعناوين والقوائم وعلامات Markdown كما هي تماماً. " +
"ج) لا تضف أي محتوى جديد أو شروحات. " +
"أعد النص العربي المُصحَح فقط بدون أي مقدمة أو خاتمة.";
/** One OpenAI-compatible chat-completions endpoint the corrector may call. */
type AiEndpoint = { baseUrl: string; apiKey: string; model: string; label: string; noThink?: boolean };

/**
 * Builds the prioritised list of AI endpoints from the environment.
 *
 * - AI_INTEGRATIONS_OPENAI_BASE_URL set → the Replit proxy (gpt-4o) comes first;
 *   its key is AI_INTEGRATIONS_OPENAI_API_KEY, or the literal "placeholder" when unset.
 * - HF_TOKEN set → seven models behind the Hugging Face router follow, in the
 *   order listed below.
 *
 * `noThink: true` makes the caller append "/no_think" to the prompt.
 *
 * NOTE(review): confirm that every model id below is actually served by the
 * router; the caller treats a 404 as "unavailable" and moves to the next entry.
 *
 * @returns Endpoints in the order they should be tried; empty when neither variable is set.
 */
function resolveAiEndpoints(): AiEndpoint[] {
  const {
    AI_INTEGRATIONS_OPENAI_BASE_URL: replitUrl,
    AI_INTEGRATIONS_OPENAI_API_KEY: replitKey,
    HF_TOKEN: hfToken,
  } = process.env;

  // 1. Replit AI Integration proxy (zero-config on Replit dev environment)
  const replit: AiEndpoint[] = replitUrl
    ? [
        {
          baseUrl: replitUrl,
          apiKey: replitKey ?? "placeholder",
          model: "gpt-4o",
          label: "Replit/gpt-4o",
        },
      ]
    : [];
  if (!hfToken) return replit;

  // 2-8. HF Router — generic router endpoint; 429/402/503 → next model in chain.
  const HF = "https://router.huggingface.co/v1";
  // [model id, label, noThink]
  const hfModels: Array<[string, string, boolean]> = [
    ["Qwen/Qwen3-235B-A22B", "HF/Qwen3-235B", true],
    ["Qwen/Qwen3-72B", "HF/Qwen3-72B", true],
    ["meta-llama/Llama-4-Scout-17B-16E-Instruct", "HF/Llama4-Scout", false],
    ["Qwen/Qwen3-30B-A3B", "HF/Qwen3-30B-A3B", true],
    ["Qwen/Qwen2.5-72B-Instruct", "HF/Qwen2.5-72B", false],
    ["meta-llama/Llama-3.3-70B-Instruct", "HF/Llama3.3-70B", false],
    ["mistralai/Mistral-Nemo-Instruct-2407", "HF/Mistral-Nemo", false],
  ];
  const hosted = hfModels.map(
    ([model, label, noThink]): AiEndpoint => ({ baseUrl: HF, apiKey: hfToken, model, label, noThink }),
  );
  return [...replit, ...hosted];
}
/**
 * Splits text into chunks of at most `maxChars` characters for AI correction.
 *
 * Paragraphs (separated by blank lines) are packed greedily into a chunk. A
 * paragraph that is too large on its own is packed line by line. A single
 * line that still exceeds the limit is split into consecutive pieces,
 * preferably at a space, so no text is discarded.
 *
 * @param text Text to split.
 * @param maxChars Maximum chunk length; defaults to AI_CHUNK_CHARS.
 * @returns Non-blank chunks in document order.
 */
function chunkForAiCorrection(text: string, maxChars: number = AI_CHUNK_CHARS): string[] {
  const limit = Math.max(1, Math.floor(maxChars));
  const chunks: string[] = [];
  let buf = "";
  const flush = () => {
    if (buf) chunks.push(buf);
    buf = "";
  };

  for (const para of text.split(/\n{2,}/)) {
    const joined = buf ? `${buf}\n\n${para}` : para;
    if (joined.length <= limit) {
      buf = joined;
      continue;
    }
    flush();
    if (para.length <= limit) {
      buf = para;
      continue;
    }
    // Oversized paragraph: pack it line by line.
    for (const line of para.split("\n")) {
      const withLine = buf ? `${buf}\n${line}` : line;
      if (withLine.length <= limit) {
        buf = withLine;
        continue;
      }
      flush();
      // Oversized line: emit full-size pieces and keep the remainder buffered.
      // Cutting at the last space inside the window avoids splitting a word;
      // when there is none, fall back to a hard cut at the limit.
      let rest = line;
      while (rest.length > limit) {
        let cut = rest.lastIndexOf(" ", limit);
        if (cut <= 0) cut = limit;
        chunks.push(rest.slice(0, cut));
        rest = rest.slice(cut).trimStart();
      }
      buf = rest;
    }
  }
  if (buf.trim()) chunks.push(buf);
  return chunks.filter((c) => c.trim().length > 0);
}
/**
 * Sends one chunk to an OpenAI-compatible chat-completions endpoint and
 * returns the corrected text.
 *
 * The request is aborted after AI_CHUNK_TIMEOUT_MS. HTTP 429 rejects with
 * code "rate_limited"; 503, 402 and 404 reject with code "unavailable"; any
 * other non-OK status rejects with Error("ai_http_<status>"). A reply that is
 * empty, shorter than 35% or longer than 300% of the input is discarded and
 * the original text is returned instead.
 *
 * @param text Chunk to correct.
 * @param ep Endpoint to call.
 * @returns Corrected text, or `text` when the reply fails the sanity check.
 */
async function callAiCorrection(
  text: string,
  ep: AiEndpoint,
): Promise<string> {
  const abort = new AbortController();
  const deadline = setTimeout(() => abort.abort(), AI_CHUNK_TIMEOUT_MS);
  try {
    // Qwen3 models support /no_think suffix to skip chain-of-thought reasoning.
    let userContent: string;
    if (ep.noThink) {
      userContent = `النص المستخرج من PDF:\n\n${text}\n\nالنص المصحح: /no_think`;
    } else {
      userContent = `النص المستخرج من PDF:\n\n${text}\n\nالنص المصحح:`;
    }

    const payload: Record<string, unknown> = {
      model: ep.model,
      messages: [
        { role: "system", content: AI_SYSTEM_PROMPT },
        { role: "user", content: userContent },
      ],
      max_tokens: Math.min(4096, Math.ceil(text.length * 2)),
      temperature: 0.1, // low temp = deterministic, less hallucination
    };

    const resp = await fetch(`${ep.baseUrl}/chat/completions`, {
      method: "POST",
      headers: { Authorization: `Bearer ${ep.apiKey}`, "Content-Type": "application/json" },
      body: JSON.stringify(payload),
      signal: abort.signal,
    });

    // status → [error message, error code]; codes drive the caller's endpoint switching.
    const switchable = new Map<number, [string, string]>([
      [429, ["rate_limited", "rate_limited"]],
      [503, ["unavailable", "unavailable"]],
      [402, ["payment_required", "unavailable"]], // no credits → try next
      [404, ["model_not_found", "unavailable"]], // unsupported model
    ]);
    const known = switchable.get(resp.status);
    if (known) throw Object.assign(new Error(known[0]), { code: known[1] });
    if (!resp.ok) throw new Error(`ai_http_${resp.status}`);

    const data = await resp.json() as any;
    const reply: string = (data.choices?.[0]?.message?.content ?? "").trim();
    // Strip any <think>...</think> block Qwen3 might emit even with /no_think
    const corrected = reply.replace(/<think>[\s\S]*?<\/think>\s*/gi, "").trim();

    // Sanity: output must be 35%–300% of input length
    const plausible =
      corrected.length > 0 &&
      corrected.length >= text.length * 0.35 &&
      corrected.length <= text.length * 3;
    return plausible ? corrected : text;
  } finally {
    clearTimeout(deadline);
  }
}
/** Progress callback: human-readable message plus percentage. */
type ProgressFn = (msg: string, pct: number) => Promise<void>;

/**
 * Optionally polishes extracted Arabic text through the configured AI endpoints.
 *
 * The text is returned unchanged when no endpoint is configured, when it has
 * fewer than 50 non-space characters, or when less than 25% of those are
 * Arabic. Otherwise it is chunked and each chunk is corrected in order.
 *
 * Endpoint handling: a rate-limit, "unavailable" or HTTP 5xx error switches
 * permanently to the next endpoint and retries the same chunk. Any other
 * error (including a request timeout) keeps that chunk's raw text and carries
 * on with the same endpoint. Once every endpoint is exhausted the remaining
 * chunks pass through unchanged. This function never fails the conversion
 * because of an AI error.
 *
 * @param rawText Extracted text.
 * @param onProgress Optional callback; percentages reported are 33–54.
 * @returns The chunks (corrected or raw) joined by blank lines.
 */
async function correctArabicText(rawText: string, onProgress?: ProgressFn): Promise<string> {
  const endpoints = resolveAiEndpoints();
  if (!endpoints.length) {
    logger.info("[arabic-ai] No AI endpoint configured — using OCR text as-is");
    return rawText;
  }

  // Only correct predominantly Arabic text
  const arabicChars = (rawText.match(/[\u0600-\u06FF]/g) ?? []).length;
  const nonSpaceChars = rawText.replace(/\s/g, "").length;
  if (nonSpaceChars < 50 || arabicChars / nonSpaceChars < 0.25) return rawText;

  const chunks = chunkForAiCorrection(rawText);
  // Index of the endpoint currently in use; it only ever moves forward.
  let activeEpIdx = 0;
  logger.info(`[arabic-ai] ${chunks.length} chunks, ${endpoints.length} endpoints available — primary: ${endpoints[0].label}`);

  const correctedParts: string[] = [];
  for (let i = 0; i < chunks.length; i++) {
    const pct = 33 + Math.round((i / chunks.length) * 21);
    const ep = endpoints[activeEpIdx];
    await onProgress?.(`تصحيح النص عبر ${ep.label.split("/")[1]}... (${i + 1}/${chunks.length})`, pct);

    let succeeded = false;
    while (activeEpIdx < endpoints.length) {
      const cur = endpoints[activeEpIdx];
      try {
        const result = await callAiCorrection(chunks[i], cur);
        correctedParts.push(result);
        succeeded = true;
        break;
      } catch (err: unknown) {
        // `code` is not always a string: an aborted fetch rejects with a
        // DOMException whose `code` is the number 20. Coerce before using
        // string methods so a timeout cannot throw out of this handler.
        const details = err as { code?: unknown; message?: unknown } | null | undefined;
        const code = String(details?.code ?? details?.message ?? "");
        if (code === "rate_limited" || code === "unavailable" || code.startsWith("ai_http_5")) {
          logger.warn(`[arabic-ai] ${cur.label} ${code} — switching to next endpoint`);
          activeEpIdx++;
          // update progress label for new endpoint
          if (activeEpIdx < endpoints.length) {
            await onProgress?.(`التحويل عبر ${endpoints[activeEpIdx].label.split("/")[1]}... (${i + 1}/${chunks.length})`, pct);
          }
        } else {
          logger.warn({ err }, `[arabic-ai] chunk ${i} error on ${cur.label} — keeping raw text`);
          break;
        }
      }
    }

    if (!succeeded) {
      // All endpoints exhausted or non-retryable error — keep original chunk
      correctedParts.push(chunks[i]);
      if (activeEpIdx >= endpoints.length) {
        // No more endpoints: pass remaining chunks through unchanged
        correctedParts.push(...chunks.slice(i + 1));
        logger.warn("[arabic-ai] All endpoints exhausted — remaining chunks kept as-is");
        break;
      }
    }
  }
  return correctedParts.join("\n\n");
}
// ── Garbled Arabic detector ───────────────────────────────────────────────────
// Heuristic check for Arabic text that was extracted through a broken CMap.
// Texts with fewer than 100 Arabic characters are never flagged.
//
// Type A — character-pair transposition (RTL/LTR confusion), recognised by
//          well-known words appearing with swapped letters.
// Type B — Arabic glyphs mapped to Latin code points, recognised by
//          bidi-wrapped Latin runs, inline ALL-CAPS noise, or an unusually
//          high share of Latin letters.
function isGarbledArabic(text: string): boolean {
  const countMatches = (pattern: RegExp): number => (text.match(pattern) ?? []).length;

  const arabicChars = countMatches(/[\u0600-\u06FF]/g);
  if (arabicChars < 100) return false;

  // ── Type A ────────────────────────────────────────────────────────────────
  // Three or more space-delimited transposed prepositions, a transposed common
  // opening word, or transposed ordinal section markers are each conclusive.
  const hasTransposedWords =
    countMatches(/ يف /g) >= 3 ||
    /امحلد/.test(text) ||
    /اثنياا|اثلثاا/.test(text);
  if (hasTransposedWords) return true;

  // ── Type B, signal 1 ──────────────────────────────────────────────────────
  // Short Latin runs enclosed by LRM (U+200E) / RLM (U+200F) marks.
  if (countMatches(/[\u200E\u200F][A-Za-z]{1,6}[\u200E\u200F]/g) >= 3) return true;

  // ── Type B, signal 2 ──────────────────────────────────────────────────────
  // Lines that contain Arabic plus at least two short upper-case Latin runs
  // which are not known acronyms or Roman numerals.
  const IGNORE_CAPS = new Set(["PDF", "OCR", "AI", "URL", "API", "HTML", "CSS", "JS", "TS",
    "I", "II", "III", "IV", "VI", "VII", "VIII", "IX", "XI", "XII"]);
  let garbledLines = 0;
  for (const line of text.split("\n")) {
    const arabicInLine = (line.match(/[\u0600-\u06FF]/g) ?? []).length;
    if (arabicInLine < 3) continue;
    const capsRuns = line.match(/(?<![A-Za-z])([A-Z]{1,5})(?![A-Za-z])/g) ?? [];
    let noiseRuns = 0;
    for (const run of capsRuns) {
      if (!IGNORE_CAPS.has(run)) noiseRuns += 1;
    }
    if (noiseRuns >= 2) garbledLines += 1;
  }
  if (garbledLines >= 4) return true;

  // ── Type B, signal 3 ──────────────────────────────────────────────────────
  // Latin letters exceed 12% of the Arabic letter count in a long document.
  const latinAlpha = countMatches(/[A-Za-z]/g);
  return arabicChars >= 300 && latinAlpha > arabicChars * 0.12;
}
// ── VLM-based OCR per page (olmOCR / Qwen2.5-VL via HF Inference API) ────────
// Vision-language models extract text from rendered page images.
// Requests go through the generic HF router (router.huggingface.co/v1).
// Per the original author's notes, olmOCR (Allen Institute) is fine-tuned for
// document OCR and ranked first for Arabic on KITAB-Bench — not verified here.
// extractPageViaVlm tries VLM_OCR_MODELS in array order:
//   olmOCR-7B → Qwen2.5-VL-72B → Qwen2.5-VL-7B
// Tesseract is not part of this list; it is the local fallback applied by the
// caller (extractPdfViaOcr) when VLM extraction fails or is rate-limited.
const VLM_OCR_ROUTER = "https://router.huggingface.co/v1";
const VLM_OCR_MODELS = [
"allenai/olmOCR-7B-0225-preview", // #1: Allen Institute, fine-tuned doc OCR, KITAB-Bench winner
"Qwen/Qwen2.5-VL-72B-Instruct", // #2: larger VLM, best Arabic accuracy (NEW 2026 upgrade)
"Qwen/Qwen2.5-VL-7B-Instruct", // #3: smaller, faster fallback
];
// Each single model request is aborted after this many milliseconds (90 s).
const VLM_PAGE_TIMEOUT_MS = 90_000;
// Instruction sent alongside the page image in every VLM request.
const VLM_OCR_PROMPT =
"Extract all the text from this document page exactly as written. " +
"Preserve Arabic text, paragraph structure, headings, and line breaks. " +
"Do not add explanations or commentary — output only the extracted text.";
/**
 * Extracts the text of one rendered page image with a vision-language model.
 *
 * The models in VLM_OCR_MODELS are tried in order; the first answer longer
 * than 20 characters wins. Each attempt is bounded by VLM_PAGE_TIMEOUT_MS,
 * and the timeout stays armed until the response body has been read.
 *
 * @param pngPath Path of the PNG page image.
 * @param hfToken Hugging Face token sent as the Bearer credential.
 * @returns The trimmed text returned by the first model that produced a usable answer.
 * @throws An error with `code === "rate_limited"` as soon as any model answers
 *         HTTP 429 (later models are not tried), or `all_vlm_models_failed`
 *         when no model produced usable text.
 */
async function extractPageViaVlm(pngPath: string, hfToken: string): Promise<string> {
  const imgBase64 = (await fs.promises.readFile(pngPath)).toString("base64");

  for (const model of VLM_OCR_MODELS) {
    const modelName = model.split("/")[1] ?? model;
    const ctrl = new AbortController();
    const timer = setTimeout(() => ctrl.abort(), VLM_PAGE_TIMEOUT_MS);
    try {
      const resp = await fetch(`${VLM_OCR_ROUTER}/chat/completions`, {
        method: "POST",
        headers: { Authorization: `Bearer ${hfToken}`, "Content-Type": "application/json" },
        body: JSON.stringify({
          model,
          messages: [{
            role: "user",
            content: [
              { type: "image_url", image_url: { url: `data:image/png;base64,${imgBase64}` } },
              { type: "text", text: VLM_OCR_PROMPT },
            ],
          }],
          max_tokens: 4096,
          temperature: 0.0,
        }),
        signal: ctrl.signal,
      });
      if (resp.status === 429) throw Object.assign(new Error("rate_limited"), { code: "rate_limited" });
      if (resp.status === 402) throw Object.assign(new Error("payment_required"), { code: "unavailable" });
      if (resp.status === 404) throw Object.assign(new Error("model_not_found"), { code: "unavailable" });
      if (!resp.ok) throw new Error(`vlm_http_${resp.status}`);

      // The abort timer is still armed here, so a stalled body read is
      // interrupted instead of hanging the whole conversion.
      const data = (await resp.json()) as any;
      const rawContent: unknown = data?.choices?.[0]?.message?.content;
      // Anything that is not a plain string is treated as an empty answer.
      const content = typeof rawContent === "string" ? rawContent.trim() : "";
      if (content.length > 20) {
        logger.info(`[vlm-ocr] ${modelName}: ${content.length} chars`);
        return content;
      }
      logger.warn(`[vlm-ocr] ${modelName} returned empty — trying next`);
    } catch (err: unknown) {
      const details = err as { code?: unknown; message?: unknown } | null | undefined;
      if (details?.code === "rate_limited") {
        logger.warn(`[vlm-ocr] ${modelName} rate-limited`);
        throw err; // propagate so caller can switch to Tesseract
      }
      logger.warn({ err: details?.message }, `[vlm-ocr] ${modelName} failed`);
    } finally {
      clearTimeout(timer);
    }
  }
  throw new Error("all_vlm_models_failed");
}
// ── OCR-based PDF extractor (fallback for broken-CMap PDFs) ──────────────────
// Pipeline (implemented by extractPdfViaOcr, defined after cleanOcrOutput):
// 1. pdftoppm renders pages to PNG (200 DPI when HF_TOKEN is set, 300 DPI otherwise)
// 2. Per page: try VLM-OCR (olmOCR via HF API) first if HF_TOKEN is available
// 3. Fall back to Tesseract (local) if VLM fails or is rate-limited
// No page cap — every rendered page is processed; only the joined text is
// truncated to TEXT_CAP.
// Removes OCR noise lines: text that is (almost) entirely Latin with little or
// no Arabic, as produced by decorative pages, page headers and misread
// ornaments (e.g. "Me NY 1", "dl pl a gl", "Fy PIN ENA").
// Every line is trimmed, blank lines are preserved as paragraph separators,
// and runs of three or more newlines are collapsed to a single blank line.
function cleanOcrOutput(text: string): string {
  const isWorthKeeping = (line: string): boolean => {
    // Blank lines separate paragraphs and always survive.
    if (line === "") return true;

    const arabicCount = (line.match(/[\u0600-\u06FF]/g) ?? []).length;
    // Four or more Arabic letters count as meaningful content.
    if (arabicCount >= 4) return true;

    // Short lines (≤30 chars) without a single Arabic letter are dropped.
    if (arabicCount === 0 && line.length <= 30) return false;

    // Remaining lines are dropped only when Latin letters make up more than
    // 80% of all letters; digits, punctuation and mixed headings stay.
    const latinCount = (line.match(/[a-zA-Z]/g) ?? []).length;
    const letterCount = arabicCount + latinCount;
    const mostlyLatin = letterCount > 0 && latinCount / letterCount > 0.80;
    return !mostlyLatin;
  };

  return text
    .split("\n")
    .map((rawLine) => rawLine.trim())
    .filter(isWorthKeeping)
    .join("\n")
    .replace(/\n{3,}/g, "\n\n")
    .trim();
}
/**
 * OCR-based PDF text extraction.
 *
 * Renders the requested pages to PNG with `pdftoppm`, then processes each page:
 * VLM-OCR first when HF_TOKEN is set, local Tesseract otherwise or when the
 * VLM attempt fails. After the first rate-limit error every remaining page
 * goes straight to Tesseract. Tesseract output is passed through
 * cleanOcrOutput; VLM output is used as returned.
 *
 * @param filePath   Path of the PDF file.
 * @param pageStart  First page to render (1-based); values ≤ 0 or missing mean page 1.
 * @param pageEnd    Last page to render; values ≤ 0 or missing mean "up to page 9999".
 * @param onProgress Called after each page with (pages done, total pages).
 * @returns Page texts joined by blank lines and truncated to TEXT_CAP, or ""
 *          when rendering/recognition failed or produced no pages.
 */
async function extractPdfViaOcr(
  filePath: string,
  pageStart?: number,
  pageEnd?: number,
  onProgress?: (done: number, total: number) => void,
): Promise<string> {
  const { execFile } = await import("child_process");
  const { promisify } = await import("util");
  const execFileAsync = promisify(execFile);
  const hfToken = process.env.HF_TOKEN;
  const tmpDir = fs.mkdtempSync("/tmp/pdf-ocr-");
  let tessWorker: any = null;
  try {
    const startPage = pageStart && pageStart > 0 ? pageStart : 1;
    const endPage = pageEnd && pageEnd > 0 ? pageEnd : 9999;

    // Resolution is chosen once, up front: 200 DPI when a VLM token is present
    // (smaller images for the API), 300 DPI when only Tesseract will run.
    // Pages are NOT re-rendered if VLM fails later — Tesseract then works on
    // the 200 DPI images.
    const dpi = hfToken ? "200" : "300";
    await execFileAsync(
      "pdftoppm",
      ["-r", dpi, "-png", "-f", String(startPage), "-l", String(endPage),
        filePath, path.join(tmpDir, "page")],
      { timeout: 600_000 },
    );

    // Rendered files are processed in sorted filename order.
    const pngFiles = fs.readdirSync(tmpDir)
      .filter(f => f.endsWith(".png"))
      .sort()
      .map(f => path.join(tmpDir, f));
    if (pngFiles.length === 0) return "";

    const pageTexts: string[] = [];
    let vlmRateLimited = false;
    for (const [i, pngFile] of pngFiles.entries()) {
      let pageText = "";
      let needTesseract = true;

      // ── Try VLM-OCR first (olmOCR / Qwen2.5-VL via HF) ────────────────
      if (hfToken && !vlmRateLimited) {
        try {
          pageText = await extractPageViaVlm(pngFile, hfToken);
          needTesseract = false;
        } catch (err: unknown) {
          const details = err as { code?: unknown; message?: unknown } | null | undefined;
          if (details?.code === "rate_limited") {
            vlmRateLimited = true;
            logger.warn("[vlm-ocr] Rate limited — switching to Tesseract for all remaining pages");
          } else {
            logger.warn({ err: details?.message }, `[vlm-ocr] page ${i + 1} failed — using Tesseract`);
          }
        }
      }

      // ── Fallback: Tesseract (local) ────────────────────────────────────
      if (needTesseract) {
        if (!tessWorker) {
          // Lazy-initialise Tesseract only when actually needed
          const tessDataDir =
            process.env.NODE_ENV === "production"
              ? "/data/tessdata"
              : path.join(process.cwd(), "uploads", ".tessdata");
          if (!fs.existsSync(tessDataDir)) fs.mkdirSync(tessDataDir, { recursive: true });
          const Tesseract = await import("tesseract.js");
          tessWorker = await Tesseract.createWorker(["ara", "eng"], 1, {
            cachePath: tessDataDir,
            workerPath: getTessWorkerPath(),
          });
        }
        const { data: { text } } = await tessWorker.recognize(pngFile);
        pageText = cleanOcrOutput(text);
      }

      if (pageText.trim()) pageTexts.push(pageText.trim());
      onProgress?.(i + 1, pngFiles.length);
    }

    const result = pageTexts.join("\n\n");
    return result.length > TEXT_CAP ? result.slice(0, TEXT_CAP) : result;
  } catch (e) {
    logger.error({ err: e }, "[extractPdfViaOcr] failed");
    return "";
  } finally {
    // Shut the worker down here rather than on the success path, so that a
    // failing terminate() can no longer discard an already-extracted document.
    if (tessWorker) {
      try { await tessWorker.terminate(); } catch { /* ignore */ }
    }
    try { fs.rmSync(tmpDir, { recursive: true, force: true }); } catch { /* ignore */ }
  }
}
// ── RTL-aware PDF extractor using pdfjs-dist directly ────────────────────────
// pdf-parse v2 has no `pagerender` callback, so we bypass it and use
// pdfjs-dist (already installed as pdf-parse's peer) directly.
//
// Algorithm per page:
// 1. getTextContent() → items with {x, y, width, height, str}
// 2. Bucket items into visual lines by quantised Y (Y_THRESH = 10 pt)
// 3. Sort each bucket right→left (descending X) → correct Arabic reading order
// 4. Join items; insert a space only when the visual gap between adjacent
//    items exceeds 25% of the item's font height — this threshold correctly
//    handles Arabic ligature sub-glyphs (gap ~1 pt) vs word gaps (gap ~4+ pt)
//    without the false positives caused by per-character avgCharWidth.
/**
 * Extracts text from a PDF in right-to-left reading order.
 *
 * @param filePath  Path of the PDF file. At most the first 200 MiB are read.
 * @param pageStart First page (1-based, clamped to the page count); ≤ 0 or missing means page 1.
 * @param pageEnd   Last page (clamped to the page count); ≤ 0 or missing means the last page.
 * @returns The cleaned text, truncated to TEXT_CAP, or "" when extraction fails.
 */
async function extractPdf(filePath: string, pageStart?: number, pageEnd?: number): Promise<string> {
  let pdfDoc: any = null;
  try {
    const { createRequire } = await import("module");
    const req = createRequire(import.meta.url);
    // Resolve pdfjs-dist via pdf-parse's own node_modules (it is a declared
    // dependency of pdf-parse v2, so it is guaranteed to be present there).
    const pdfParseCjsPath = req.resolve("pdf-parse");
    const pdfParseReq = createRequire(pdfParseCjsPath);
    const pdfjsMjsPath = pdfParseReq.resolve("pdfjs-dist/legacy/build/pdf.mjs");
    const pdfjsWorkerPath = pdfParseReq.resolve("pdfjs-dist/legacy/build/pdf.worker.mjs");
    // Dynamic ESM import of pdfjs-dist (it is an ES module)
    const { getDocument, GlobalWorkerOptions, VerbosityLevel } =
      await import(pdfjsMjsPath) as any;
    GlobalWorkerOptions.workerSrc = pdfjsWorkerPath;

    const MAX_PDF_BYTES = 200 * 1024 * 1024;
    const stat = fs.statSync(filePath);
    const readSize = Math.min(stat.size, MAX_PDF_BYTES);
    if (stat.size > MAX_PDF_BYTES) {
      logger.warn(`[extractPdf] file is ${stat.size} bytes — only the first ${MAX_PDF_BYTES} bytes are parsed`);
    }
    const buf = Buffer.alloc(readSize);
    const fd = fs.openSync(filePath, "r");
    try {
      // readSync may return fewer bytes than requested, so loop until the
      // buffer is full or the file ends.
      let offset = 0;
      while (offset < readSize) {
        const bytesRead = fs.readSync(fd, buf, offset, readSize - offset, offset);
        if (bytesRead === 0) break;
        offset += bytesRead;
      }
    } finally {
      // Always release the descriptor, even when the read throws.
      fs.closeSync(fd);
    }

    // VerbosityLevel.ERRORS = 0 → suppress "Warning: TT: undefined function" noise
    const verbosity: number = (VerbosityLevel as any)?.ERRORS ?? 0;
    pdfDoc = await getDocument({
      data: new Uint8Array(buf),
      useWorkerFetch: false,
      isEvalSupported: false,
      useSystemFonts: true,
      verbosity,
    }).promise;

    const totalPages = pdfDoc.numPages as number;
    const startPage = pageStart && pageStart > 0 ? Math.min(pageStart, totalPages) : 1;
    const endPage = pageEnd && pageEnd > 0 ? Math.min(pageEnd, totalPages) : totalPages;

    // Y_THRESH = 10 pt: groups diacritics / sub-glyphs on slightly different Y
    // into the same visual line.
    const Y_THRESH = 10;
    type TextItem = { x: number; y: number; str: string; width: number; height: number };
    const pageTexts: string[] = [];

    for (let p = startPage; p <= endPage; p++) {
      const page = await pdfDoc.getPage(p);
      const tc = await page.getTextContent({ includeMarkedContent: false });

      const items: TextItem[] = [];
      for (const it of (tc.items ?? [])) {
        if (typeof it.str !== "string" || !it.str.trim()) continue;
        items.push({
          x: it.transform[4],
          y: it.transform[5],
          str: it.str,
          width: it.width ?? 0,
          height: it.height ?? 12, // fallback to 12 pt if absent
        });
      }
      if (!items.length) {
        page.cleanup();
        continue;
      }

      // Bucket by quantised Y
      const buckets = new Map<number, TextItem[]>();
      for (const it of items) {
        const key = Math.round(it.y / Y_THRESH) * Y_THRESH;
        if (!buckets.has(key)) buckets.set(key, []);
        buckets.get(key)!.push(it);
      }

      // Lines top→bottom (larger Y = higher on PDF page)
      const sortedYs = Array.from(buckets.keys()).sort((a, b) => b - a);
      const lines: string[] = [];
      for (const y of sortedYs) {
        const row = buckets.get(y)!;
        // RTL: sort right-to-left (descending X)
        row.sort((a, b) => b.x - a.x);

        // Join items, inserting a space only when the gap between adjacent
        // items exceeds 25% of the item's font height.
        let lineText = "";
        for (let i = 0; i < row.length; i++) {
          lineText += row[i].str;
          if (i < row.length - 1) {
            const cur = row[i];
            const next = row[i + 1];
            // gap = horizontal distance between right edge of `next` and left edge of `cur`
            const gap = cur.x - (next.x + next.width);
            const spaceThreshold = (cur.height > 0 ? cur.height : 12) * 0.25;
            if (gap > spaceThreshold) lineText += " ";
          }
        }
        const trimmed = lineText.trim();
        if (trimmed) lines.push(trimmed);
      }
      page.cleanup();
      pageTexts.push(lines.join("\n"));
    }

    let text = pageTexts.join("\n\n").trim();
    // Arabic-specific post-processing: strips page markers, TOC dots, etc.
    text = cleanArabicPdfRaw(text);
    return text.length > TEXT_CAP ? text.slice(0, TEXT_CAP) : text;
  } catch (e) {
    logger.error({ err: e }, "[extractPdf] failed");
    return "";
  } finally {
    if (pdfDoc) {
      try { await pdfDoc.destroy(); } catch { /* ignore */ }
    }
  }
}
/**
 * Extracts the raw text of a .docx file with mammoth.
 *
 * @param filePath Path of the .docx file.
 * @returns The trimmed text, truncated to TEXT_CAP, or "" when extraction fails
 *          (the failure is reported on stderr instead of being dropped silently).
 */
async function extractDocx(filePath: string): Promise<string> {
  try {
    const mammoth = await import("mammoth");
    const result = await mammoth.extractRawText({ path: filePath });
    const text = result.value?.trim() || "";
    return text.length > TEXT_CAP ? text.slice(0, TEXT_CAP) : text;
  } catch (e) {
    console.error("[RAQIM] extractDocx failed:", e);
    return "";
  }
}
// Locates the Node.js worker script shipped inside the tesseract.js package.
// The path is derived from the package's own package.json so that it is still
// found after esbuild bundling, which breaks tesseract.js' default lookup.
function getTessWorkerPath(): string {
  const packageRoot = path.dirname(_require.resolve("tesseract.js/package.json"));
  return path.join(packageRoot, "src/worker-script/node/index.js");
}
/**
 * Runs Tesseract OCR (Arabic + English) on a single image file.
 *
 * @param filePath Path of the image.
 * @returns The trimmed recognised text, or "" when recognition fails.
 */
async function extractImage(filePath: string): Promise<string> {
  let worker: any = null;
  try {
    const Tesseract = await import("tesseract.js");
    const cacheDir =
      process.env.NODE_ENV === "production"
        ? "/data/tessdata"
        : path.join(process.cwd(), "uploads", ".tessdata");
    if (!fs.existsSync(cacheDir)) fs.mkdirSync(cacheDir, { recursive: true });
    worker = await Tesseract.createWorker(["ara", "eng"], 1, {
      cachePath: cacheDir,
      workerPath: getTessWorkerPath(),
    });
    const { data: { text } } = await worker.recognize(filePath);
    return text?.trim() || "";
  } catch (e) {
    logger.error({ err: e }, "[extractImage] error");
    return "";
  } finally {
    // Terminate in `finally` so the worker is released even when
    // recognize() throws; previously it was leaked on that path.
    if (worker) {
      try { await worker.terminate(); } catch { /* ignore */ }
    }
  }
}
/**
 * Converts a spreadsheet into Markdown tables.
 *
 * CSV files are split naively on commas (quoted fields are not interpreted)
 * and capped at 5000 non-blank lines. Other formats are read with `xlsx`,
 * capped at 5000 rows per sheet, one `## sheet name` section per sheet.
 * The first row of each table is used as the header; rows are cut or padded
 * to the header width. Output generation stops once TEXT_CAP is exceeded.
 *
 * @param filePath Path of the spreadsheet file.
 * @param ext      Lower-case extension including the dot; ".csv" selects the CSV path.
 * @returns The Markdown text, or "" for an empty CSV or when reading fails.
 */
async function extractSpreadsheet(filePath: string, ext: string): Promise<string> {
  try {
    if (ext === ".csv") {
      const content = fs.readFileSync(filePath, "utf-8");
      // Accept both LF and CRLF files and drop whitespace-only lines, which
      // would otherwise become empty table rows.
      const lines = content
        .split(/\r?\n/)
        .filter((line) => line.trim() !== "")
        .slice(0, 5000); // cap rows
      if (lines.length === 0) return "";
      const headers = lines[0].split(",").map((h) => h.trim());
      let md = `| ${headers.join(" | ")} |\n`;
      md += `| ${headers.map(() => "---").join(" | ")} |\n`;
      for (const line of lines.slice(1)) {
        const cells = line.split(",").map((c) => c.trim());
        md += `| ${cells.join(" | ")} |\n`;
        if (md.length > TEXT_CAP) break;
      }
      return md;
    }

    const { createRequire } = await import("module");
    const req = createRequire(import.meta.url);
    const XLSX = req("xlsx");
    const workbook = XLSX.readFile(filePath, { sheetRows: 5000 }); // cap rows per sheet
    let md = "";
    for (const sheetName of workbook.SheetNames) {
      const sheet = workbook.Sheets[sheetName];
      const data: unknown[][] = XLSX.utils.sheet_to_json(sheet, { header: 1 });
      md += `## ${sheetName}\n\n`;
      if (data.length > 0) {
        // Array.from visits holes too. Header rows with empty cells can be
        // sparse, and Array.prototype.map skips holes, which used to yield a
        // separator row with missing "---" cells.
        const headers = Array.from(data[0] ?? [], (cell) => String(cell ?? ""));
        md += `| ${headers.join(" | ")} |\n`;
        md += `| ${headers.map(() => "---").join(" | ")} |\n`;
        for (const row of data.slice(1)) {
          md += `| ${headers.map((_, idx) => String(row[idx] ?? "")).join(" | ")} |\n`;
          if (md.length > TEXT_CAP) break;
        }
        md += "\n";
      }
      if (md.length > TEXT_CAP) break;
    }
    return md;
  } catch (e) {
    console.error("[RAQIM] extractSpreadsheet failed:", e);
    return "";
  }
}
/**
 * Extracts the text runs (`<a:t>` elements) of every slide in a .pptx file.
 *
 * Slides are visited in ascending slide-file number (slide2 before slide10).
 * NOTE(review): file numbering normally matches presentation order, but the
 * authoritative order lives in the presentation's relationship files — confirm
 * if exact ordering matters.
 *
 * @param filePath Path of the .pptx file.
 * @returns One paragraph per slide, truncated to TEXT_CAP, or "" when reading fails.
 */
async function extractPptx(filePath: string): Promise<string> {
  try {
    const JSZip = (await import("jszip")).default;
    const content = fs.readFileSync(filePath);
    const zip = await JSZip.loadAsync(content);

    const slideNumber = (name: string): number =>
      Number(/slide(\d+)\.xml/.exec(name)?.[1] ?? 0);
    // Slide text is XML, so the predefined entities must be turned back into
    // characters; "&amp;" goes last to avoid decoding "&amp;lt;" twice.
    const decodeXml = (s: string): string =>
      s
        .replace(/&lt;/g, "<")
        .replace(/&gt;/g, ">")
        .replace(/&quot;/g, '"')
        .replace(/&apos;/g, "'")
        .replace(/&amp;/g, "&");

    let text = "";
    const slideFiles = Object.keys(zip.files)
      .filter((f) => f.match(/ppt\/slides\/slide\d+\.xml/))
      // Numeric order: a plain string sort would place slide10 before slide2.
      .sort((a, b) => slideNumber(a) - slideNumber(b));

    for (const slideFile of slideFiles) {
      const xml = await zip.files[slideFile].async("string");
      // Tolerate attributes on <a:t> and line breaks inside the run text.
      const matches = xml.match(/<a:t(?:\s[^>]*)?>([\s\S]*?)<\/a:t>/g) || [];
      const slideText = matches
        .map((m) => decodeXml(m.replace(/<[^>]+>/g, "")).trim())
        .filter(Boolean)
        .join(" ");
      if (slideText) text += slideText + "\n\n";
      if (text.length > TEXT_CAP) break;
    }
    return text.length > TEXT_CAP ? text.slice(0, TEXT_CAP) : text;
  } catch (e) {
    console.error("[RAQIM] extractPptx failed:", e);
    return "";
  }
}
/**
 * Extracts plain text from the (X)HTML documents inside an EPUB archive.
 *
 * Entries are visited in the order the archive lists them, which is not
 * necessarily the book's reading order.
 *
 * @param filePath Path of the .epub file.
 * @returns The concatenated text, truncated to TEXT_CAP, or "" when reading fails.
 */
async function extractEpub(filePath: string): Promise<string> {
  try {
    const JSZip = (await import("jszip")).default;
    const content = fs.readFileSync(filePath);
    const zip = await JSZip.loadAsync(content);
    let text = "";
    for (const filename of Object.keys(zip.files)) {
      // .html / .xhtml as before, plus .htm and upper-case variants.
      if (!/\.x?html?$/i.test(filename)) continue;
      const html = await zip.files[filename].async("string");
      text += htmlToPlainText(html) + "\n\n";
      if (text.length > TEXT_CAP) break;
    }
    return text.length > TEXT_CAP ? text.slice(0, TEXT_CAP) : text;
  } catch (e) {
    console.error("[RAQIM] extractEpub failed:", e);
    return "";
  }
}
/**
 * Converts an HTML document into Markdown-flavoured plain text.
 *
 * Headings become `#`-prefixed lines, paragraphs become separate lines, list
 * items become `- ` bullets, `<br>` becomes a newline and all other tags are
 * removed. `<script>` and `<style>` blocks are dropped together with their
 * contents so CSS/JS never leaks into the text.
 *
 * Entities are decoded only after every tag has been stripped; decoding
 * earlier would turn escaped text such as `a &lt; b &gt; c` into something
 * that looks like a tag and gets deleted. `&amp;` is decoded last so that
 * `&amp;lt;` yields the literal text `&lt;`.
 *
 * @param html Raw HTML / XHTML markup.
 * @returns Trimmed text with at most one blank line between blocks.
 */
function htmlToPlainText(html: string): string {
  const innerText = (fragment: string): string => fragment.replace(/<[^>]+>/g, "").trim();
  return html
    .replace(/<(script|style)\b[^>]*>.*?<\/\1\s*>/gis, "")
    .replace(
      /<h([1-6])\b[^>]*>(.*?)<\/h\1>/gis,
      (_match: string, level: string, inner: string) =>
        "\n" + "#".repeat(Number(level)) + " " + innerText(inner) + "\n",
    )
    // `\b` keeps <p …> from matching <pre>/<param> and <li …> from matching <link>.
    .replace(/<p\b[^>]*>(.*?)<\/p>/gis, (_match: string, inner: string) => "\n" + innerText(inner) + "\n")
    .replace(/<li\b[^>]*>(.*?)<\/li>/gis, "- $1\n")
    .replace(/<br\s*\/?>/gi, "\n")
    .replace(/<[^>]+>/g, "")
    .replace(/&nbsp;/g, " ")
    .replace(/&lt;/g, "<")
    .replace(/&gt;/g, ">")
    .replace(/&amp;/g, "&")
    .replace(/\n{3,}/g, "\n\n")
    .trim();
}
/**
 * Removes all tags from an HTML fragment, decodes `&lt;`, `&gt;` and `&amp;`,
 * and trims the result.
 *
 * `&amp;` is decoded last: decoding it first turned `&amp;lt;` into `&lt;`
 * and then into `<`, i.e. the text was unescaped twice.
 */
function stripTags(s: string): string {
  return s
    .replace(/<[^>]+>/g, "")
    .replace(/&lt;/g, "<")
    .replace(/&gt;/g, ">")
    .replace(/&amp;/g, "&")
    .trim();
}
// ═══════════════════════════════════════════════════════════════════════════
// Stats & Utilities
// ═══════════════════════════════════════════════════════════════════════════
/**
 * Counts structural Markdown elements and derives a heuristic quality score.
 *
 * The score starts at 72 and gains capped bonuses for headings (3 each, max
 * 12), bold spans (max 10), list items (max 8), the presence of a table (+4),
 * the presence of a fenced code block (+2) and length (one point per 50
 * words, max 10). It never exceeds 98 and may be fractional.
 *
 * @param md Markdown text.
 * @returns Word, heading, bold and list-item counts plus the quality estimate.
 */
function computeStats(md: string) {
  const tally = (pattern: RegExp): number => (md.match(pattern) || []).length;

  const wordCount = md.split(/\s+/).filter(Boolean).length;
  const headings = tally(/^#{1,6}\s/gm);
  const boldItems = tally(/\*\*[^*]+\*\*/g);
  const listItems = tally(/^[-*+]\s/gm);
  const hasTable = tally(/^\|/gm) > 0;
  // Two fence markers delimit one block; any fence at all earns the bonus.
  const hasCode = tally(/```/g) / 2 > 0;

  const bonuses = [
    Math.min(headings * 3, 12),
    Math.min(boldItems, 10),
    Math.min(listItems, 8),
    hasTable ? 4 : 0,
    hasCode ? 2 : 0,
    Math.min(wordCount / 50, 10),
  ];
  const uncapped = bonuses.reduce((sum, bonus) => sum + bonus, 72);
  const qualityEstimate = Math.min(98, uncapped);

  return { wordCount, headings, boldItems, listItems, qualityEstimate };
}
/**
 * Normalises whitespace in Markdown: CRLF becomes LF, trailing spaces and tabs
 * are removed from every line, runs of four or more newlines shrink to three,
 * and the result is trimmed.
 */
function cleanMarkdown(md: string): string {
  const unixNewlines = md.replace(/\r\n/g, "\n");
  const withoutTrailingBlanks = unixNewlines.replace(/[ \t]+$/gm, "");
  const collapsedGaps = withoutTrailingBlanks.replace(/\n{4,}/g, "\n\n\n");
  return collapsedGaps.trim();
}
/**
 * Classifies text as Arabic ("ar"), English ("en") or "mixed" by comparing the
 * number of Arabic-block characters (U+0600–U+06FF) with ASCII letters.
 * Arabic is checked first, so it wins whenever both thresholds are met.
 */
function detectLanguage(text: string): string {
  let arabicCount = 0;
  let latinCount = 0;
  for (let i = 0; i < text.length; i++) {
    const unit = text.charCodeAt(i);
    if (unit >= 0x0600 && unit <= 0x06ff) {
      arabicCount += 1;
    } else if ((unit >= 0x41 && unit <= 0x5a) || (unit >= 0x61 && unit <= 0x7a)) {
      latinCount += 1;
    }
  }

  if (arabicCount > latinCount * 0.6) {
    return "ar";
  }
  if (latinCount > arabicCount * 0.6) {
    return "en";
  }
  return "mixed";
}
/** Returns a promise that resolves after roughly `ms` milliseconds. */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((resolve) => {
    setTimeout(resolve, ms);
  });
}
// Hard upper bound for one conversion job: 15 minutes.
const CONVERSION_TIMEOUT_MS = 15 * 60 * 1000;

/**
 * Public entry point for a conversion job.
 *
 * Runs runConversionCore under the CONVERSION_TIMEOUT_MS limit. If it fails or
 * times out, the conversion row and then the file row are marked "failed",
 * with the error message stored on the conversion. Errors raised by those two
 * database updates propagate to the caller.
 */
async function runConversion(conversionId: string, fileId: string, storagePath: string) {
  try {
    const job = runConversionCore(conversionId, fileId, storagePath);
    await withTimeout(job, CONVERSION_TIMEOUT_MS, "تحويل الملف");
  } catch (err) {
    // Anything that is not an Error instance gets the generic timeout message.
    let errorMessage = "انتهت مهلة التحويل";
    if (err instanceof Error) {
      errorMessage = err.message;
    }

    await db
      .update(conversionsTable)
      .set({ status: "failed", errorMessage })
      .where(eq(conversionsTable.id, conversionId));

    await db
      .update(filesTable)
      .set({ status: "failed", updatedAt: new Date() })
      .where(eq(filesTable.id, fileId));
  }
}
// ═══════════════════════════════════════════════════════════════════════════
// Routes
// ═══════════════════════════════════════════════════════════════════════════
// POST /api/convert/upload
// Stores the uploaded file, records a file row and a queued conversion row,
// starts the conversion in the background and answers 202 with the job info.
// Responds 400 when no file was sent or when pageStart/pageEnd are not
// integers. The stored upload is deleted whenever the request is rejected
// before a file row references it.
router.post("/upload", upload.single("file"), async (req: AuthRequest, res) => {
  // Best-effort removal of the stored upload.
  const discardUpload = async (): Promise<void> => {
    if (!req.file) return;
    try {
      await fs.promises.unlink(req.file.path);
    } catch {
      /* ignore — nothing useful can be done if the file is already gone */
    }
  };
  // Multipart fields arrive as strings: missing/empty → null, integer → number,
  // anything else → undefined (invalid). Without this check "abc" became NaN
  // and surfaced as a database error (HTTP 500).
  const parsePage = (value: unknown): number | null | undefined => {
    if (value === undefined || value === null || value === "") return null;
    const n = Number(value);
    return Number.isInteger(n) ? n : undefined;
  };

  let fileRecorded = false;
  try {
    if (!req.file) {
      res.status(400).json({ error: "validation", message: "لم يتم رفع أي ملف" });
      return;
    }
    const { pageStart, pageEnd, folderId } = req.body;
    const firstPage = parsePage(pageStart);
    const lastPage = parsePage(pageEnd);
    if (firstPage === undefined || lastPage === undefined) {
      await discardUpload();
      res.status(400).json({ error: "validation", message: "نطاقات الصفحات غير صالحة" });
      return;
    }

    const originalName = fixFilename(req.file.originalname);
    const fileName = path.parse(originalName).name;
    const [file] = await db
      .insert(filesTable)
      .values({
        name: fileName + ".md",
        ownerId: req.userId!,
        folderId: folderId || null,
        originalName,
        originalType: req.file.mimetype,
        sizeBytes: req.file.size,
        storagePath: req.file.path,
        status: "queued",
      })
      .returning();
    fileRecorded = true;

    const [conversion] = await db
      .insert(conversionsTable)
      .values({
        fileId: file.id,
        userId: req.userId!,
        status: "queued",
        progress: 0,
        steps: initSteps(),
        pageStart: firstPage,
        pageEnd: lastPage,
      })
      .returning();

    // Fire-and-forget: progress is polled through GET /api/convert/:jobId.
    runConversion(conversion.id, file.id, req.file.path).catch((err) =>
      req.log?.error({ err }, "background conversion error")
    );

    res.status(202).json({
      jobId: conversion.id,
      fileId: file.id,
      status: "queued",
      progress: 0,
      steps: initSteps(),
      createdAt: conversion.createdAt,
    });
  } catch (err) {
    // No file row points at the upload yet, so it would only waste disk space.
    if (!fileRecorded) await discardUpload();
    const e = err instanceof Error ? err : new Error(String(err));
    const cause = (e as NodeJS.ErrnoException & { cause?: Error }).cause;
    const rootMsg = cause?.message ?? e.message;
    console.error("[RAQIM] /upload error:", rootMsg, "\n outer:", e.message, "\n stack:", e.stack);
    req.log?.error({ err, cause: cause?.message }, "upload error");
    res.status(500).json({ error: "server_error", message: rootMsg || "فشل الرفع" });
  }
});
// POST /api/convert/upload-split — upload once, create N conversion jobs
// `ranges` is a JSON array of { start, end, label } objects. It must parse to a
// non-empty array of objects whose start/end are integers (or empty / 0 for
// "unbounded"); otherwise the request is rejected with 400. The stored upload
// is deleted whenever the request is rejected before a file row references it.
router.post("/upload-split", upload.single("file"), async (req: AuthRequest, res) => {
  // Best-effort removal of the stored upload.
  const discardUpload = async (): Promise<void> => {
    if (!req.file) return;
    try {
      await fs.promises.unlink(req.file.path);
    } catch {
      /* ignore — nothing useful can be done if the file is already gone */
    }
  };
  // missing / empty / 0 → null (unbounded), integer → number, else undefined (invalid)
  const parsePage = (value: unknown): number | null | undefined => {
    if (value === undefined || value === null || value === "" || value === 0) return null;
    const n = Number(value);
    if (!Number.isInteger(n)) return undefined;
    return n === 0 ? null : n;
  };
  const rejectRanges = async (message: string): Promise<void> => {
    await discardUpload();
    res.status(400).json({ error: "validation", message });
  };

  let fileRecorded = false;
  try {
    if (!req.file) {
      res.status(400).json({ error: "validation", message: "لم يتم رفع أي ملف" });
      return;
    }
    const { ranges: rangesJson, folderId } = req.body;

    let parsed: unknown;
    try {
      parsed = JSON.parse(rangesJson || "[]");
    } catch {
      await rejectRanges("نطاقات الصفحات غير صالحة");
      return;
    }
    // JSON.parse accepts any JSON value, so the shape has to be checked too.
    if (!Array.isArray(parsed)) {
      await rejectRanges("نطاقات الصفحات غير صالحة");
      return;
    }
    if (parsed.length === 0) {
      await rejectRanges("يجب تحديد نطاق واحد على الأقل");
      return;
    }

    const ranges: Array<{ start: number | null; end: number | null; label: string }> = [];
    for (const [index, item] of parsed.entries()) {
      const candidate = (item ?? {}) as { start?: unknown; end?: unknown; label?: unknown };
      const start = parsePage(candidate.start);
      const end = parsePage(candidate.end);
      if (typeof item !== "object" || item === null || start === undefined || end === undefined) {
        await rejectRanges("نطاقات الصفحات غير صالحة");
        return;
      }
      // A missing label used to put the literal text "undefined" into the name.
      const label = typeof candidate.label === "string" ? candidate.label : `-${index + 1}`;
      ranges.push({ start, end, label });
    }

    const originalName = fixFilename(req.file.originalname);
    const baseName = path.parse(originalName).name;
    const jobs = [];
    for (const range of ranges) {
      const partName = `${baseName}${range.label}.md`;
      const [file] = await db
        .insert(filesTable)
        .values({
          name: partName,
          ownerId: req.userId!,
          folderId: folderId || null,
          originalName,
          originalType: req.file.mimetype,
          sizeBytes: req.file.size,
          storagePath: req.file.path,
          status: "queued",
        })
        .returning();
      fileRecorded = true;

      const [conversion] = await db
        .insert(conversionsTable)
        .values({
          fileId: file.id,
          userId: req.userId!,
          status: "queued",
          progress: 0,
          steps: initSteps(),
          pageStart: range.start,
          pageEnd: range.end,
        })
        .returning();

      // Fire-and-forget: every part is converted independently.
      runConversion(conversion.id, file.id, req.file.path).catch((err) =>
        req.log?.error({ err }, "split conversion error")
      );
      jobs.push({ jobId: conversion.id, fileId: file.id, name: partName });
    }
    res.status(202).json({ jobs });
  } catch (err) {
    // No file row points at the upload yet, so it would only waste disk space.
    if (!fileRecorded) await discardUpload();
    const e = err instanceof Error ? err : new Error(String(err));
    const cause = (e as NodeJS.ErrnoException & { cause?: Error }).cause;
    const rootMsg = cause?.message ?? e.message;
    console.error("[RAQIM] /upload-split error:", rootMsg, "\n outer:", e.message, "\n stack:", e.stack);
    req.log?.error({ err, cause: cause?.message }, "upload-split error");
    res.status(500).json({ error: "server_error", message: rootMsg || "فشل الرفع" });
  }
});
// POST /api/convert
// Starts a new conversion for a file that was uploaded earlier and is owned by
// the caller. Responds 400 when fileId is missing or pageStart/pageEnd are not
// integers, 404 when the file is unknown or has no stored content, and 202
// with the job info once the conversion has been queued.
router.post("/", async (req: AuthRequest, res) => {
  // missing / empty / 0 → null (unbounded), integer → number, else undefined (invalid)
  const parsePage = (value: unknown): number | null | undefined => {
    if (value === undefined || value === null || value === "" || value === 0) return null;
    const n = Number(value);
    if (!Number.isInteger(n)) return undefined;
    return n === 0 ? null : n;
  };

  try {
    // A request without a parsed body used to crash the destructuring (→ 500).
    const { fileId, pageStart, pageEnd } = req.body ?? {};
    if (fileId === undefined || fileId === null || fileId === "") {
      res.status(400).json({ error: "validation", message: "معرّف الملف مطلوب" });
      return;
    }
    const firstPage = parsePage(pageStart);
    const lastPage = parsePage(pageEnd);
    if (firstPage === undefined || lastPage === undefined) {
      res.status(400).json({ error: "validation", message: "نطاقات الصفحات غير صالحة" });
      return;
    }

    // Ownership is part of the lookup, so foreign files look like missing ones.
    const file = await db.query.filesTable.findFirst({
      where: and(eq(filesTable.id, fileId), eq(filesTable.ownerId, req.userId!)),
    });
    if (!file || !file.storagePath) {
      res.status(404).json({ error: "not_found", message: "الملف غير موجود" });
      return;
    }

    const [conversion] = await db
      .insert(conversionsTable)
      .values({
        fileId: file.id,
        userId: req.userId!,
        status: "queued",
        progress: 0,
        steps: initSteps(),
        pageStart: firstPage,
        pageEnd: lastPage,
      })
      .returning();

    // Fire-and-forget: progress is polled through GET /api/convert/:jobId.
    runConversion(conversion.id, file.id, file.storagePath).catch((err) =>
      req.log?.error({ err }, "background conversion error")
    );

    res.status(202).json({
      jobId: conversion.id,
      fileId,
      status: "queued",
      progress: 0,
      steps: initSteps(),
      createdAt: conversion.createdAt,
    });
  } catch (err) {
    req.log?.error({ err }, "convert error");
    res.status(500).json({ error: "server_error", message: "فشل التحويل" });
  }
});
// GET /api/convert/:jobId
// Reports the state of one conversion job. Jobs belonging to other users are
// answered with 404, exactly like unknown ids. queuePosition is always null.
router.get("/:jobId", async (req: AuthRequest, res) => {
  try {
    const jobId = req.params.jobId as string;
    const ownedJobFilter = and(
      eq(conversionsTable.id, jobId),
      eq(conversionsTable.userId, req.userId!),
    );
    const conv = await db.query.conversionsTable.findFirst({ where: ownedJobFilter });

    if (!conv) {
      res.status(404).json({ error: "not_found", message: "المهمة غير موجودة" });
      return;
    }

    const {
      id,
      fileId,
      status,
      progress,
      steps,
      elapsedSeconds,
      estimatedSeconds,
      errorMessage,
      createdAt,
    } = conv;
    res.json({
      jobId: id,
      fileId,
      status,
      progress,
      steps,
      queuePosition: null,
      elapsedSeconds,
      estimatedSeconds,
      errorMessage,
      createdAt,
    });
  } catch (err) {
    req.log?.error({ err }, "get conversion error");
    res.status(500).json({ error: "server_error", message: "فشل جلب الحالة" });
  }
});
export default router;