Spaces:

abedelbahnasy55
/

raqim

Sleeping

raqim / artifacts /api-server /src /routes /convert.ts

RAQIM Deploy

Deploy RAQIM 2026-05-02 23:08

3e9069b about 1 month ago

71 kB

	import { Router } from "express";
	import multer from "multer";
	import path from "path";
	import fs from "fs";
	import { createRequire } from "module";
	import { db } from "@workspace/db";
	import { filesTable, conversionsTable } from "@workspace/db";
	import { eq, and } from "drizzle-orm";
	import { requireAuth, AuthRequest } from "../middlewares/auth.js";
	import { logger } from "../lib/logger.js";

	// Module-level require() for resolving peer package paths (works in ESM + esbuild bundles)
	const _require = createRequire(import.meta.url);

	// Router for this module. `requireAuth` is registered on it before any route,
	// so it is part of the middleware chain for everything mounted on this router.
	const router = Router();
	router.use(requireAuth);

	// Upload directory.
	// Production: /data/uploads (persistent HF Spaces volume) — /tmp/uploads is a
	// tmpfs that starts empty at container boot, so it is deliberately not used.
	// Anything else: ./uploads under the current working directory.
	const uploadDir = (() => {
	  if (process.env.NODE_ENV === "production") {
	    return "/data/uploads";
	  }
	  return path.join(process.cwd(), "uploads");
	})();
	try {
	  fs.mkdirSync(uploadDir, { recursive: true });
	} catch (mkdirError) {
	  // Best effort: report the failure and let the module keep loading.
	  console.error("[RAQIM] Failed to create upload dir:", uploadDir, mkdirError);
	}

	/**
	 * Repairs an uploaded file name that was decoded as Latin-1 although the
	 * client sent UTF-8 bytes (Multer's default for the filename header).
	 *
	 * The name is re-decoded only when that is safe:
	 *  - if it contains any character above U+00FF it cannot be the product of a
	 *    Latin-1 decode, so it is already proper Unicode and is returned as-is;
	 *  - if re-decoding the bytes as UTF-8 produces U+FFFD, the bytes were not
	 *    valid UTF-8 (e.g. a genuine Latin-1 name such as "café"), so the
	 *    original is returned instead of a corrupted string.
	 *
	 * @param raw File name as reported by Multer.
	 * @returns The UTF-8 interpretation of the name, or `raw` when no repair applies.
	 */
	function fixFilename(raw: string): string {
	  for (let i = 0; i < raw.length; i++) {
	    if (raw.charCodeAt(i) > 0xff) return raw;
	  }
	  try {
	    const decoded = Buffer.from(raw, "latin1").toString("utf8");
	    return decoded.includes("\uFFFD") ? raw : decoded;
	  } catch {
	    return raw;
	  }
	}

	// Disk storage: each file lands in `uploadDir` as "<epoch-ms>-<repaired original name>".
	const storage = multer.diskStorage({
	  destination: uploadDir,
	  filename(_req, file, done) {
	    const storedName = `${Date.now()}-${fixFilename(file.originalname)}`;
	    done(null, storedName);
	  },
	});
	// Per-file size limit: 500 MiB.
	const upload = multer({
	  storage,
	  limits: { fileSize: 500 * 1024 * 1024 },
	});

	// Ordered pipeline stages. `name` is the machine identifier (the same strings are
	// written as the conversion status by runConversionCore); `label` is the Arabic
	// text stored alongside each step. runConversionCore addresses the steps by
	// array index, so the order here must not change.
	const CONVERSION_STEPS = [
	{ name: "analyzing", label: "تحليل الملف والتعرف على نوعه" },
	{ name: "routing", label: "توجيه ذكي لأنسب محركات المعالجة" },
	{ name: "ocr", label: "استخراج النص الخام (OCR / Parser)" },
	{ name: "layout", label: "المهندس الذكي — إعادة بناء التنسيق" },
	{ name: "scoring", label: "تقييم الجودة وإحصاء العناصر" },
	{ name: "merging", label: "دمج الطبقات ومعالجة الهيكل النهائي" },
	{ name: "cleanup", label: "تنظيف وتلميع المستند" },
	];

	/** Builds a fresh step list: one copy of every CONVERSION_STEPS entry, all marked "pending". */
	function initSteps() {
	  return CONVERSION_STEPS.map(({ name, label }) => ({ name, label, status: "pending" }));
	}

	/**
	 * Settles with the outcome of `promise`, unless `ms` milliseconds pass first,
	 * in which case it rejects with an Error whose message names `label`.
	 * The timer is cleared as soon as `promise` settles.
	 */
	function withTimeout<T>(promise: Promise<T>, ms: number, label: string): Promise<T> {
	  return new Promise<T>((resolve, reject) => {
	    const onTimeout = () => {
	      reject(new Error(`تجاوز الوقت المحدد: ${label}`));
	    };
	    const timer = setTimeout(onTimeout, ms);
	    const stopTimer = () => clearTimeout(timer);

	    promise.then(
	      (value) => {
	        stopTimer();
	        resolve(value);
	      },
	      (reason) => {
	        stopTimer();
	        reject(reason);
	      },
	    );
	  });
	}

	/**
	 * Runs the whole conversion pipeline for one stored file and persists progress.
	 *
	 * Flow: load the conversion row (for the optional page range), walk the seven
	 * CONVERSION_STEPS, extract text according to the file extension, structure it
	 * with runRuleBasedArchitect, compute stats, clean the Markdown, then write the
	 * result to `filesTable` and mark the conversion "done".
	 *
	 * Progress written to the database never decreases: different branches report
	 * overlapping percentages (e.g. the PDF branch reaches 55 before the layout
	 * step reports 45), so every update is clamped to the highest value written
	 * so far, and non-finite values are ignored.
	 *
	 * Any error thrown inside the pipeline marks the current step, the conversion
	 * and the file as "failed" and stores the error message.
	 *
	 * @param conversionId Row id in `conversionsTable` to update.
	 * @param fileId Row id in `filesTable` that receives the Markdown.
	 * @param storagePath Path of the uploaded file on disk.
	 */
	async function runConversionCore(conversionId: string, fileId: string, storagePath: string) {
	  const steps = initSteps();
	  let stepIndex = 0;
	  const startTime = Date.now();
	  // Highest progress value persisted so far (keeps the reported value monotonic).
	  let lastProgress = 0;

	  // Read page range set at upload time
	  const convRecord = await db.query.conversionsTable.findFirst({
	    where: eq(conversionsTable.id, conversionId),
	  });
	  const pageStart = convRecord?.pageStart ?? undefined;
	  const pageEnd = convRecord?.pageEnd ?? undefined;

	  // Persists status / progress / steps. The optional message is stored in the
	  // `errorMessage` column while the conversion is running.
	  const updateProgress = async (
	    status: string,
	    progress: number,
	    stepsDone: typeof steps,
	    aiMessage?: string
	  ) => {
	    if (Number.isFinite(progress)) {
	      lastProgress = Math.max(lastProgress, progress);
	    }
	    await db
	      .update(conversionsTable)
	      .set({
	        status: status as any,
	        progress: lastProgress,
	        steps: stepsDone,
	        elapsedSeconds: Math.floor((Date.now() - startTime) / 1000),
	        ...(aiMessage ? { errorMessage: aiMessage } : {}),
	      })
	      .where(eq(conversionsTable.id, conversionId));
	  };

	  try {
	    const ext = path.extname(storagePath).toLowerCase();
	    let rawText = "";

	    // ── Step 1: Analyzing ───────────────────────────────────────────────
	    stepIndex = 0;
	    steps[0].status = "running";
	    await updateProgress("analyzing", 5, steps, "جاري تحليل نوع الملف والبنية الداخلية...");
	    await sleep(600);
	    steps[0].status = "done";

	    // ── Step 2: Routing ─────────────────────────────────────────────────
	    stepIndex = 1;
	    steps[1].status = "running";
	    await updateProgress("routing", 12, steps, "اختيار أنسب محرك استخراج للملف...");
	    await sleep(400);
	    steps[1].status = "done";

	    // ── Step 3: OCR / Text Extraction ───────────────────────────────────
	    stepIndex = 2;
	    steps[2].status = "running";
	    await updateProgress("ocr", 20, steps, "جاري استخراج النص من الملف...");

	    if ([".txt", ".md"].includes(ext)) {
	      rawText = fs.readFileSync(storagePath, "utf-8");
	    } else if (ext === ".pdf") {
	      rawText = await extractPdf(storagePath, pageStart, pageEnd);
	      await updateProgress("ocr", 28, steps, "تم استخراج النص الخام من الـ PDF...");

	      // If text appears garbled (broken ToUnicode CMap in font), fall back to
	      // rendering each page as an image and running Tesseract OCR on it.
	      // This completely bypasses the CMap issue and works offline/without any API key.
	      if (isGarbledArabic(rawText)) {
	        await updateProgress("ocr", 30, steps, "تم رصد خلل في ترميز الخط — جاري استخدام OCR للحصول على نص دقيق...");
	        const ocrText = await extractPdfViaOcr(storagePath, pageStart, pageEnd,
	          (done, total) => updateProgress("ocr", 30 + Math.round((done / total) * 20), steps,
	            `جاري تحليل الصفحات بواسطة OCR... (${done}/${total})`)
	        );
	        // Only trust the OCR result when it produced a meaningful amount of text.
	        if (ocrText.length > 50) {
	          rawText = ocrText;
	          await updateProgress("ocr", 50, steps, "تم استخراج النص بواسطة OCR بدقة عالية ✓");
	        }
	      }

	      // Optional AI polish — free on Replit (AI proxy) and on HF Spaces (HF_TOKEN).
	      rawText = await correctArabicText(rawText, (msg, pct) =>
	        updateProgress("ocr", pct, steps, msg)
	      );
	      await updateProgress("ocr", 55, steps, "اكتمل استخراج النص العربي ✓");
	    } else if ([".docx", ".doc"].includes(ext)) {
	      rawText = await extractDocx(storagePath);
	      await updateProgress("ocr", 38, steps, "تم استخراج نص ملف Word...");
	    } else if ([".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".webp", ".gif"].includes(ext)) {
	      rawText = await extractImage(storagePath);
	      await updateProgress("ocr", 38, steps, "تم استخراج النص من الصورة بتقنية OCR...");
	    } else if ([".xlsx", ".xls", ".csv"].includes(ext)) {
	      rawText = await extractSpreadsheet(storagePath, ext);
	      await updateProgress("ocr", 38, steps, "تم تحليل جداول البيانات...");
	    } else if ([".html", ".htm"].includes(ext)) {
	      const html = fs.readFileSync(storagePath, "utf-8");
	      rawText = htmlToPlainText(html);
	      await updateProgress("ocr", 38, steps, "تم تحليل ملف HTML...");
	    } else if ([".pptx", ".ppt"].includes(ext)) {
	      rawText = await extractPptx(storagePath);
	      await updateProgress("ocr", 38, steps, "تم استخراج نصوص الشرائح...");
	    } else if ([".epub"].includes(ext)) {
	      rawText = await extractEpub(storagePath);
	      await updateProgress("ocr", 38, steps, "تم استخراج نصوص الكتاب الإلكتروني...");
	    } else {
	      // Unknown extension: try to read it as UTF-8 text, capped at 100 000 chars.
	      try {
	        rawText = fs.readFileSync(storagePath, "utf-8").substring(0, 100000);
	      } catch {
	        rawText = `# ملف ثنائي\n\nلا يمكن استخراج نص من هذا النوع من الملفات مباشرة.`;
	      }
	    }

	    steps[2].status = "done";

	    // ── Step 4: Rule-Based Architect — 100% Free, No Limits ─────────────
	    stepIndex = 3;
	    steps[3].status = "running";
	    await updateProgress("layout", 45, steps, "المهندس الذكي يعيد بناء هيكل المستند...");

	    const architectMarkdown = runRuleBasedArchitect(rawText, ext);
	    await updateProgress("layout", 68, steps, "اكتمل تحليل وهيكلة المستند");
	    steps[3].status = "done";

	    // ── Step 5: Scoring ─────────────────────────────────────────────────
	    stepIndex = 4;
	    steps[4].status = "running";
	    await updateProgress("scoring", 75, steps, "جاري قياس الجودة وإحصاء العناصر...");
	    const stats = computeStats(architectMarkdown);
	    await sleep(400);
	    steps[4].status = "done";

	    // ── Step 6: Merging ─────────────────────────────────────────────────
	    stepIndex = 5;
	    steps[5].status = "running";
	    await updateProgress("merging", 85, steps, "دمج الطبقات وتثبيت الهيكل النهائي...");
	    await sleep(350);
	    steps[5].status = "done";

	    // ── Step 7: Cleanup ─────────────────────────────────────────────────
	    stepIndex = 6;
	    steps[6].status = "running";
	    await updateProgress("cleanup", 93, steps, "التلميع النهائي والتحقق من سلامة النص...");
	    const finalMarkdown = cleanMarkdown(architectMarkdown);
	    await sleep(300);
	    steps[6].status = "done";

	    // ── Done ─────────────────────────────────────────────────────────────
	    // The stored quality score is clamped to the 72–98 range.
	    const qualityScore = Math.min(98, Math.max(72, stats.qualityEstimate));

	    await db
	      .update(filesTable)
	      .set({
	        markdownContent: finalMarkdown,
	        originalMarkdown: finalMarkdown,
	        status: "done",
	        wordCount: stats.wordCount,
	        qualityScore,
	        language: detectLanguage(finalMarkdown),
	        updatedAt: new Date(),
	      })
	      .where(eq(filesTable.id, fileId));

	    await db
	      .update(conversionsTable)
	      .set({
	        status: "done",
	        progress: 100,
	        steps,
	        completedAt: new Date(),
	        elapsedSeconds: Math.floor((Date.now() - startTime) / 1000),
	        // Clears the in-flight progress message stored by updateProgress.
	        errorMessage: null,
	      })
	      .where(eq(conversionsTable.id, conversionId));
	  } catch (err) {
	    const error = err instanceof Error ? err.message : "Unknown error";
	    if (steps[stepIndex]) steps[stepIndex].status = "failed";
	    await db
	      .update(conversionsTable)
	      .set({ status: "failed", steps, errorMessage: error })
	      .where(eq(conversionsTable.id, conversionId));
	    await db
	      .update(filesTable)
	      .set({ status: "failed", updatedAt: new Date() })
	      .where(eq(filesTable.id, fileId));
	  }
	}

	// ═══════════════════════════════════════════════════════════════════════════
	// RULE-BASED ARCHITECT — 100% Free, No External APIs, No Limits
	// Handles Arabic academic documents, exams, books, and general text
	// ═══════════════════════════════════════════════════════════════════════════

	/**
	 * Entry point of the rule-based formatter: picks the Arabic or the Latin
	 * formatter depending on which script dominates the text.
	 *
	 * - Blank / whitespace-only input returns an "empty document" placeholder.
	 * - Input with fewer than 10 non-padding characters is returned unchanged.
	 * - Otherwise the Arabic formatter is used when the number of Arabic letters
	 *   is at least 40% of the number of ASCII letters.
	 *
	 * @param rawText Extracted plain text.
	 * @param _ext File extension (currently unused).
	 * @returns Markdown.
	 */
	function runRuleBasedArchitect(rawText: string, _ext: string): string {
	  const trimmed = rawText.trim();
	  if (trimmed.length < 10) {
	    return trimmed ? rawText : "# مستند فارغ\n\nلم يتم اكتشاف محتوى نصي في هذا الملف.";
	  }
	  const arabicChars = (rawText.match(/[\u0600-\u06FF]/g) ?? []).length;
	  const latinChars = (rawText.match(/[a-zA-Z]/g) ?? []).length;
	  return arabicChars >= latinChars * 0.4
	    ? formatArabicDocument(rawText)
	    : formatLatinDocument(rawText);
	}

	// ── Helpers ─────────────────────────────────────────────────────────────────

	/**
	 * Normalises one extracted line: drops control characters, bidi / zero-width
	 * marks and decorative bullet glyphs, then collapses whitespace runs and trims.
	 */
	function cleanOcrLine(line: string): string {
	  const withoutControls = line.replace(/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]/g, "");
	  // Strip Unicode bidi / directional control chars that pdfjs embeds from broken-CMap fonts
	  const withoutBidi = withoutControls.replace(/[\u200B-\u200F\u202A-\u202E\u2066-\u2069\uFEFF]/g, "");
	  const withoutGlyphs = withoutBidi.replace(/[□■▪▫▶◀►◄▲▼◆◇○●★☆✓✗✦✧]/g, "");
	  const singleSpaced = withoutGlyphs.replace(/\s{2,}/g, " ");
	  return singleSpaced.trim();
	}

	/**
	 * True when the line starts with one of the known exam/document header keys
	 * (subject, time, model, date, class, school, student name, …) followed by an
	 * ASCII or fullwidth colon.
	 */
	function isMetaLine(line: string): boolean {
	  return /^(المادة|الزمن|النموذج|التاريخ|الصف|الشعبة|المدرسة|اسم الطالب|الاسم|الفصل|المرحلة|الفرقة|الدراسي|الفصل الدراسي|المستوى|الشعبة|المجموعة)\s*[:：]/i.test(line);
	}

	/**
	 * True for section openers: an Arabic ordinal adverb (أولاً … عاشراً, with or
	 * without tanween) followed by a separator, or an English
	 * "Part / Section / Chapter / Unit <number or roman numeral>".
	 */
	function isSectionMarker(line: string): boolean {
	  if (/^(أولاً|أولا|ثانياً|ثانيا|ثالثاً|ثالثا|رابعاً|رابعا|خامساً|خامسا|سادساً|سادسا|سابعاً|سابعا|ثامناً|ثامنا|تاسعاً|تاسعا|عاشراً|عاشرا)\s*[-:،\s]/.test(line)) return true;
	  if (/^(Part|Section|Chapter|Unit)\s+[IVXivxA-Z\d]+/i.test(line)) return true;
	  return false;
	}

	/**
	 * True when the line looks like the start of a numbered question.
	 * Digits may be Western (0-9) or Arabic-Indic (٠-٩).
	 */
	function isQuestion(line: string): boolean {
	  // Arabic question starters: "س1:", "سـ ١ -", "سؤال 3", "السؤال ٣"
	  if (/^سـ?\s*[\d\u0660-\u0669]+\s*[-:)،\s]/.test(line)) return true;
	  if (/^سؤال\s+[\d\u0660-\u0669]+/i.test(line)) return true;
	  if (/^السؤال\s+[\d\u0660-\u0669]+/i.test(line)) return true;
	  if (/^س\s*[\d\u0660-\u0669]+\s*[-:)،]/.test(line)) return true;
	  // Numbered with parens: (١) or (1)
	  if (/^\([\d\u0660-\u0669]+\)\s+\S/.test(line)) return true;
	  // Numbered with dash: "١- " or "1- " when followed by substantial content
	  if (/^[\u0660-\u0669\d]+\s*[-–—]\s+.{8,}/.test(line)) return true;
	  // English
	  if (/^Q\s*\d+\s*[-:.)]/i.test(line)) return true;
	  if (/^Question\s+\d+/i.test(line)) return true;
	  return false;
	}

	/**
	 * True when the line starts with an answer/explanation keyword
	 * (التعليل, الإجابة, المطلوب, الحل, ملاحظة, …) followed by an ASCII or
	 * fullwidth colon.
	 */
	function isKeywordLine(line: string): boolean {
	  return /^(التعليل|الإجابة|الإجابه|المطلوب|الحل|الشرح|الدليل|السبب|العلة|ملاحظة|ملاحظه|تنبيه|الفائدة|المقصود|المراد|الاستنتاج|التحليل|التفسير|النتيجة|الخلاصة)\s*[:：]/i.test(line);
	}

	/**
	 * Heuristic for "lone short line" sub-headings.
	 *
	 * A line qualifies when it is 3–80 characters long, is not already a Markdown
	 * heading, does not start with a list marker or digit, is not a long sentence
	 * (over 30 chars ending in punctuation), and has a blank line — or the
	 * document edge — on both sides.
	 *
	 * @param line The cleaned line under test.
	 * @param lineIndex Index of `line` inside `lines`.
	 * @param lines All cleaned lines of the document.
	 */
	function isHeadingCandidate(line: string, lineIndex: number, lines: string[]): boolean {
	  if (line.length > 80 || line.length < 3) return false;
	  if (/^#{1,6}\s/.test(line)) return false;
	  if (/^[-*+\d]/.test(line)) return false;
	  if (/[.،!؟?]$/.test(line) && line.length > 30) return false;
	  const prevEmpty = lineIndex === 0 || lines[lineIndex - 1].trim() === "";
	  const nextEmpty = lineIndex >= lines.length - 1 || lines[lineIndex + 1].trim() === "";
	  return prevEmpty && nextEmpty;
	}

	/**
	 * Expands inline multiple-choice options into a vertical Markdown list.
	 *
	 * Recognised layouts, tried in this order (at least two options are required):
	 *   1. `أ- text ب- text ج- text`   (Arabic letter + dash)
	 *   2. `(أ) text (ب) text`          (Arabic letter in parentheses)
	 *   3. `أ) text ب) text`            (Arabic letter + closing parenthesis)
	 *   4. `a) text b) text`            (English a–d + closing parenthesis)
	 *
	 * In the Arabic layouts an option's text may not itself contain one of the
	 * marker letters أ ب ج د and is limited to 60 characters.
	 *
	 * NOTE: callers may pass the ORIGINAL (uncleaned) line so multi-space
	 * separators are still present.
	 *
	 * @returns The list (one `- …` item per line) or `null` when nothing matched.
	 */
	function expandMultipleChoice(line: string): string | null {
	  // Collects [letter, text] pairs for every match of a global two-group regex.
	  const collect = (re: RegExp): Array<[string, string]> => {
	    const found: Array<[string, string]> = [];
	    for (const m of line.matchAll(re)) {
	      const text = m[2].trim();
	      if (text) found.push([m[1], text]);
	    }
	    return found;
	  };

	  // Pattern 1: أ- text ب- text ج- text (Arabic with dash, any whitespace between)
	  const arDash = collect(/([أبجد])\s*[-–—]\s*([^أبجد\n-]{1,60}?)(?=\s+[أبجد]\s*[-–—]|\s*$)/g);
	  if (arDash.length >= 2) {
	    return arDash.map(([l, t]) => `- ${l}- ${t}`).join("\n");
	  }

	  // Pattern 2: (أ) text (ب) text
	  const arParen = collect(/\(([أبجد])\)\s*([^()أبجد\n]{1,60}?)(?=\s*\([أبجد]\)|\s*$)/g);
	  if (arParen.length >= 2) {
	    return arParen.map(([l, t]) => `- (${l}) ${t}`).join("\n");
	  }

	  // Pattern 3: أ) text ب) text (without outer parens)
	  const arRParen = collect(/([أبجد])\)\s*([^أبجد()]{1,60}?)(?=\s*[أبجد]\)|\s*$)/g);
	  if (arRParen.length >= 2) {
	    return arRParen.map(([l, t]) => `- ${l}) ${t}`).join("\n");
	  }

	  // Pattern 4: English a) b) c) d) — split by choice marker to avoid char-class issues
	  const enSplit = line.split(/\s+(?=[a-d]\)\s)/i);
	  if (enSplit.length >= 2) {
	    const enChoices: Array<[string, string]> = [];
	    for (const part of enSplit) {
	      const mx = part.match(/^([a-d])\)\s+(.*)/i);
	      if (mx) enChoices.push([mx[1].toLowerCase(), mx[2].trim()]);
	    }
	    if (enChoices.length >= 2) {
	      return enChoices.map(([l, t]) => `- ${l}) ${t}`).join("\n");
	    }
	  }

	  return null;
	}

	// ── Arabic document formatter ────────────────────────────────────────────────

	/**
	 * Extracts all key/value pairs from a metadata line that may hold several fields,
	 * e.g. "المادة: رياضيات  الزمن: ساعة  النموذج: أ"
	 *   → [["المادة","رياضيات"],["الزمن","ساعة"],["النموذج","أ"]]
	 *
	 * Fields are separated by two or more spaces, a tab, `|`, `،` or `,`. A segment
	 * is kept only when it starts with a known meta key (see isMetaLine). Both the
	 * ASCII colon and the fullwidth colon are accepted as the key/value separator,
	 * matching what isMetaLine accepts.
	 *
	 * When no segment qualifies, the whole line is split at its first colon.
	 *
	 * @param line Raw (uncleaned) line, so double-space separators are still present.
	 * @returns Trimmed [key, value] pairs; empty when the line has no usable colon.
	 */
	function splitMetaFields(line: string): Array<[string, string]> {
	  const colonIndex = (s: string): number => s.search(/[:：]/);
	  const toPair = (s: string, ci: number): [string, string] => [
	    s.slice(0, ci).trim(),
	    s.slice(ci + 1).trim(),
	  ];

	  const pairs: Array<[string, string]> = [];
	  const segments = line
	    .split(/\s{2,}|\t|[|،,]/)
	    .map((s) => s.trim())
	    .filter(Boolean);
	  for (const seg of segments) {
	    const ci = colonIndex(seg);
	    if (ci > 0 && isMetaLine(seg)) {
	      const [k, v] = toPair(seg, ci);
	      if (k) pairs.push([k, v]);
	    }
	  }

	  // Fallback: treat whole line as single field
	  if (pairs.length === 0) {
	    const ci = colonIndex(line);
	    if (ci > 0) pairs.push(toPair(line, ci));
	  }
	  return pairs;
	}

	/**
	 * Converts extracted Arabic text into Markdown using line-level rules.
	 *
	 * 1. Metadata lines found in the first 15 lines are rendered as a two-column
	 *    table. Only those lines are consumed — any other line in that region
	 *    (e.g. a school name between two meta lines) is kept and formatted.
	 * 2. If metadata was found, the first remaining content line is promoted to
	 *    a level-1 title when it looks like one.
	 * 3. Every other line is classified, in this order: blank, existing Markdown
	 *    heading, section marker (`##`), question, keyword line, inline
	 *    multiple choice (expanded to a list), list item, lone short line
	 *    (`###`), regular text.
	 *
	 * @param text Raw extracted text (any newline convention).
	 * @returns Markdown with runs of blank lines collapsed.
	 */
	function formatArabicDocument(text: string): string {
	  const rawLines = text.replace(/\r\n/g, "\n").replace(/\r/g, "\n").split("\n");
	  const lines = rawLines.map(cleanOcrLine);
	  const output: string[] = [];

	  // Appends a blank separator unless the output is empty or already ends with one.
	  const ensureBlankLine = (): void => {
	    if (output.length > 0 && output[output.length - 1] !== "") output.push("");
	  };
	  // Emits `content` as its own block, with a blank line before and after.
	  const emitBlock = (content: string): void => {
	    ensureBlankLine();
	    output.push(content);
	    output.push("");
	  };

	  // ── Detect and render metadata block from first 15 lines ──
	  const metaIndices: number[] = [];
	  for (let j = 0; j < Math.min(15, lines.length); j++) {
	    if (lines[j] && isMetaLine(lines[j])) metaIndices.push(j);
	  }

	  // Indices already rendered in the metadata table; skipped by the later passes.
	  const consumedMeta = new Set<number>();

	  // Handle metadata: each detected meta line may contain multiple inline fields
	  // Use rawLines to preserve double-space separators
	  if (metaIndices.length >= 1) {
	    const allPairs = metaIndices.flatMap((idx) => splitMetaFields(rawLines[idx] ?? ""));
	    if (allPairs.length > 0) {
	      output.push("| الحقل | القيمة |");
	      output.push("| --- | --- |");
	      for (const [k, v] of allPairs) output.push(`| ${k} | ${v} |`);
	      output.push("");
	      for (const idx of metaIndices) consumedMeta.add(idx);
	    }
	  }

	  // ── Check first content line for document title ──
	  let i = 0;
	  while (i < lines.length && (!lines[i] || consumedMeta.has(i))) i++;
	  if (i < lines.length) {
	    const candidate = lines[i];
	    const isTitle =
	      candidate.length > 3 &&
	      candidate.length < 100 &&
	      !isQuestion(candidate) &&
	      !isSectionMarker(candidate) &&
	      !isMetaLine(candidate) &&
	      !candidate.startsWith("-") &&
	      !candidate.startsWith("#");
	    // Only promote to title if metadata was found (strong signal)
	    if (isTitle && metaIndices.length > 0) {
	      output.push(`# ${candidate}`);
	      output.push("");
	      i++;
	    }
	  }

	  // ── Main pass ──
	  for (; i < lines.length; i++) {
	    if (consumedMeta.has(i)) continue; // already shown in the metadata table

	    const line = lines[i].trim();
	    const rawLine = rawLines[i] ?? ""; // original line before cleaning (for choice detection)

	    if (!line) {
	      ensureBlankLine();
	      continue;
	    }

	    // Already a Markdown heading — keep as-is
	    if (/^#{1,6}\s/.test(line)) {
	      emitBlock(line);
	      continue;
	    }

	    // Section markers: أولاً / ثانياً / Part I
	    if (isSectionMarker(line)) {
	      emitBlock(`## ${line}`);
	      continue;
	    }

	    // Question detection: emitted as its own paragraph.
	    // NOTE(review): the previous code pushed a template containing only the line;
	    // emphasis markers may have been intended around it — confirm.
	    if (isQuestion(line)) {
	      emitBlock(line);
	      continue;
	    }

	    // Keyword lines: التعليل: / الإجابة: / المطلوب: (blank before, none after)
	    if (isKeywordLine(line)) {
	      ensureBlankLine();
	      output.push(line);
	      continue;
	    }

	    // Inline multiple choice → vertical list (use rawLine to preserve original spacing)
	    const expanded = expandMultipleChoice(rawLine) || expandMultipleChoice(line);
	    if (expanded) {
	      emitBlock(expanded);
	      continue;
	    }

	    // Already-formatted list items
	    if (/^[-*+]\s/.test(line) || /^\d+\.\s/.test(line)) {
	      output.push(line);
	      continue;
	    }

	    // Lone short line surrounded by blanks → subheading
	    if (isHeadingCandidate(line, i, lines)) {
	      emitBlock(`### ${line}`);
	      continue;
	    }

	    // Regular content line
	    output.push(line);
	  }

	  return output.join("\n").replace(/\n{3,}/g, "\n\n").trim();
	}

	// ── Latin/English document formatter ────────────────────────────────────────

	/**
	 * Converts extracted Latin-script text into Markdown using line-level rules.
	 *
	 * Each cleaned line is classified, in this order: blank, existing Markdown
	 * heading, section marker (`##`), question, keyword line, inline multiple
	 * choice (expanded to a list), ALL-CAPS short line (`###`), list item, lone
	 * short line (`###`), regular text.
	 *
	 * @param text Raw extracted text (any newline convention).
	 * @returns Markdown with runs of blank lines collapsed.
	 */
	function formatLatinDocument(text: string): string {
	  const rawLines = text.replace(/\r\n/g, "\n").replace(/\r/g, "\n").split("\n");
	  const lines = rawLines.map(cleanOcrLine);
	  const output: string[] = [];

	  // Appends a blank separator unless the output is empty or already ends with one.
	  const ensureBlankLine = (): void => {
	    if (output.length > 0 && output[output.length - 1] !== "") output.push("");
	  };
	  // Emits `content` as its own block, with a blank line before and after.
	  const emitBlock = (content: string): void => {
	    ensureBlankLine();
	    output.push(content);
	    output.push("");
	  };

	  for (let i = 0; i < lines.length; i++) {
	    const line = lines[i].trim();

	    if (!line) {
	      ensureBlankLine();
	      continue;
	    }

	    // Already a Markdown heading — keep as-is
	    if (/^#{1,6}\s/.test(line)) {
	      emitBlock(line);
	      continue;
	    }

	    if (isSectionMarker(line)) {
	      emitBlock(`## ${line}`);
	      continue;
	    }

	    if (isQuestion(line)) {
	      emitBlock(line);
	      continue;
	    }

	    // Keyword lines get a blank line before but none after.
	    if (isKeywordLine(line)) {
	      ensureBlankLine();
	      output.push(line);
	      continue;
	    }

	    const expanded = expandMultipleChoice(line);
	    if (expanded) {
	      emitBlock(expanded);
	      continue;
	    }

	    // ALL CAPS short line → subheading
	    if (/^[A-Z][A-Z\s\d:,.-]{4,60}$/.test(line)) {
	      emitBlock(`### ${line}`);
	      continue;
	    }

	    // Already-formatted list items
	    if (/^[-*+]\s/.test(line) || /^\d+\.\s/.test(line)) {
	      output.push(line);
	      continue;
	    }

	    // Lone short line surrounded by blanks → subheading
	    if (isHeadingCandidate(line, i, lines)) {
	      emitBlock(`### ${line}`);
	      continue;
	    }

	    output.push(line);
	  }

	  return output.join("\n").replace(/\n{3,}/g, "\n\n").trim();
	}

	// ═══════════════════════════════════════════════════════════════════════════
	// Extractors
	// ═══════════════════════════════════════════════════════════════════════════
	// Max characters extracted from any single document (~2 MB of text ≈ 300 k words)
	// NOTE(review): not referenced in the visible part of this file — presumably
	// applied by the extractor functions; confirm.
	const TEXT_CAP = 2_000_000;

	// ── Arabic PDF text post-processor ───────────────────────────────────────────
	/**
	 * Cleans up artifacts introduced by PDF text extraction:
	 *  • Unicode bidi control chars (LRM / RLM / directional overrides)
	 *  • Isolated short ALL-CAPS Latin sequences inside Arabic lines (broken CMap)
	 *  • "-- X of N --" page markers from the pdf-parse default renderer
	 *  • Standalone page labels (a single Arabic character, or 1–3 digits, alone on a line)
	 *  • Table-of-contents leader dots ("......" or ". . . . .") + trailing page numbers
	 *  • Trailing Arabic-Indic page numbers on TOC lines that lost their leaders
	 *  • Runs of 3+ newlines (collapsed to one blank line)
	 *
	 * @param text Raw text as returned by the PDF extractor.
	 * @returns Cleaned text, trimmed.
	 */
	function cleanArabicPdfRaw(text: string): string {
	  // 1. Strip all Unicode bidi / directional control characters that
	  // pdfjs-dist embeds when the PDF uses broken ToUnicode CMap fonts.
	  text = text.replace(/[\u200B-\u200F\u202A-\u202E\u2066-\u2069\uFEFF]/g, "");

	  // 2. For lines that are predominantly Arabic, remove short ALL-CAPS Latin
	  // noise sequences — artefacts of broken CMap where Arabic glyphs are
	  // mapped to Latin code points (e.g. "المبادئ OA العشرة" → OA = garbled Arabic).
	  // Guard: don't remove if the "Latin" word is a common technical abbreviation.
	  const KEEP_CAPS = new Set(["PDF", "OCR", "AI", "URL", "API", "HTML", "CSS", "JS", "TS"]);
	  text = text
	    .split("\n")
	    .map((line) => {
	      const arabicCount = (line.match(/[\u0600-\u06FF]/g) ?? []).length;
	      if (arabicCount < 4) return line; // not an Arabic line — leave intact
	      // Remove isolated 1-5 char ALL-CAPS sequences (not in safe-list)
	      return line
	        .replace(/(?<![A-Za-z])([A-Z]{1,5})(?![A-Za-z])/g, (match) =>
	          KEEP_CAPS.has(match) ? match : ""
	        )
	        .replace(/ {2,}/g, " ")
	        .trim();
	    })
	    .join("\n");

	  const out: string[] = [];

	  for (const raw of text.split("\n")) {
	    const line = raw.trim();

	    // Remove "-- X of N --" pdf-parse page markers (spacing around the dashes is optional)
	    if (/^--\s*\d+\s+of\s+\d+\s*--$/i.test(line)) continue;

	    // Remove standalone page labels:
	    // • single Arabic letter (أ ب ج etc.)
	    // • 1–3 Arabic-Indic / Western numerals alone on a line
	    if (/^[\u0600-\u06FF]{1}$/.test(line)) continue;
	    if (/^[٠-٩\u0660-\u06690-9]{1,3}$/.test(line)) continue;

	    // Collapse TOC leader-dot lines: ". . . . . . ." or "......." → clean title.
	    // A TOC line has 4+ consecutive dots (possibly space-separated); the removal
	    // regex accepts the same optional spacing as the detector.
	    if (/\.(\s*\.){3,}/.test(line)) {
	      const cleaned = line
	        .replace(/\.(\s*\.)+\s*/g, " ")
	        .replace(/\s+[٠-٩\u0660-\u06690-9]{1,4}\s*$/, "")
	        .replace(/\s{2,}/g, " ")
	        .trim();
	      if (cleaned.length > 2) out.push(cleaned);
	      continue;
	    }

	    // Strip trailing Arabic-Indic page-number from TOC lines that lost their
	    // dot-leaders. Heuristic: the line ends in 1–4 such digits and the text
	    // before them is ≥10 chars and contains Arabic.
	    const tocTrailing = line.replace(/\s+[٠-٩\u0660-\u0669]{1,4}$/, "");
	    if (tocTrailing !== line && tocTrailing.length >= 10 && /[\u0600-\u06FF]/.test(tocTrailing)) {
	      out.push(tocTrailing.trim());
	      continue;
	    }

	    // Preserve empty lines (paragraph breaks); everything else passes through trimmed.
	    out.push(line);
	  }

	  // Collapse runs of 3+ newlines into a single blank line
	  return out.join("\n").replace(/\n{3,}/g, "\n\n").trim();
	}

	// ── Arabic text AI correction — 100% free, full HF model access ──────────────
	// Priority chain (tried in order, falls back on rate-limit / unavailable / 5xx):
	// 1. Replit AI Integration proxy (AI_INTEGRATIONS_OPENAI_BASE_URL) — gpt-4o
	// 2. HF: Qwen/Qwen3-235B-A22B — MoE 235B (22B active)
	// 3. HF: Qwen/Qwen3-72B — dense 72B
	// 4. HF: meta-llama/Llama-4-Scout-17B-16E-Instruct — 17B MoE
	// 5. HF: Qwen/Qwen3-30B-A3B — MoE, fast & very capable
	// 6. HF: Qwen/Qwen2.5-72B-Instruct — proven Arabic quality
	// 7. HF: meta-llama/Llama-3.3-70B-Instruct — strong multilingual
	// 8. HF: mistralai/Mistral-Nemo-Instruct-2407 — fast 12B fallback
	//
	// Maximum characters per chunk sent to the correction model.
	const AI_CHUNK_CHARS = 3000; // larger chunks → fewer API calls
	// Per-request abort timeout for one chunk (120 s).
	const AI_CHUNK_TIMEOUT_MS = 120_000;

	// System prompt (Arabic) sent with every correction request. It tells the model
	// to remove extraction noise (stray Latin fragments, broken words, wrong
	// intra-word spaces, stray symbols), to keep common Latin technical terms and
	// the existing paragraph/heading/list/Markdown structure, to add nothing new,
	// and to return only the corrected text.
	const AI_SYSTEM_PROMPT =
	"أنت نظام متخصص في تصحيح نصوص PDF العربية المستخرجة آلياً. " +
	"المهمة: إزالة أخطاء الاستخراج مع الحفاظ التام على المعنى والمحتوى الأصيل. " +
	"أنواع الأخطاء الشائعة في هذه النصوص: " +
	"١) حروف ومقاطع لاتينية قصيرة مبعثرة داخل النص العربي (مثل OA، BW، Zz، dl، pl) — ضوضاء من ترميز الخط المكسور، احذفها. " +
	"٢) كلمات عربية مبتورة أو مشوهة واضحة يمكن تصحيحها من السياق. " +
	"٣) مسافات خاطئة داخل الكلمة العربية الواحدة — ادمجها. " +
	"٤) رموز متفرقة أو علامات ترقيم غريبة ليست جزءاً من المحتوى — احذفها. " +
	"القواعد الصارمة: " +
	"أ) احتفظ بالأسماء والمصطلحات التقنية اللاتينية الشائعة (PDF، AI، URL، API...). " +
	"ب) حافظ على هيكل الفقرات والعناوين والقوائم وعلامات Markdown كما هي تماماً. " +
	"ج) لا تضف أي محتوى جديد أو شروحات. " +
	"أعد النص العربي المُصحَح فقط بدون أي مقدمة أو خاتمة.";

	/**
	 * One chat-completions endpoint to try for text correction.
	 * `baseUrl` is the API root ("/chat/completions" is appended by the caller),
	 * `apiKey` is sent as a Bearer token, `label` is a "<provider>/<model>" display
	 * name used in logs and progress messages, and `noThink` makes the caller
	 * append "/no_think" to the user message.
	 */
	type AiEndpoint = { baseUrl: string; apiKey: string; model: string; label: string; noThink?: boolean };

	/**
	 * Builds the prioritised list of AI endpoints to try, read from the environment.
	 *
	 * - AI_INTEGRATIONS_OPENAI_BASE_URL set → Replit proxy with gpt-4o comes first
	 *   (API key from AI_INTEGRATIONS_OPENAI_API_KEY, or "placeholder").
	 * - HF_TOKEN set → seven models behind the Hugging Face router follow, in the
	 *   order listed below.
	 *
	 * @returns Endpoints in the order they should be attempted; empty when neither
	 *          variable is set.
	 */
	function resolveAiEndpoints(): AiEndpoint[] {
	  const resolved: AiEndpoint[] = [];

	  // Replit AI Integration proxy (zero-config on Replit dev environment)
	  const proxyBase = process.env.AI_INTEGRATIONS_OPENAI_BASE_URL;
	  if (proxyBase) {
	    resolved.push({
	      baseUrl: proxyBase,
	      apiKey: process.env.AI_INTEGRATIONS_OPENAI_API_KEY ?? "placeholder",
	      model: "gpt-4o",
	      label: "Replit/gpt-4o",
	    });
	  }

	  const token = process.env.HF_TOKEN;
	  if (!token) return resolved;

	  // Generic HF router; the caller moves to the next model on 429/402/503/404.
	  const routerBase = "https://router.huggingface.co/v1";
	  // [model id, display label, noThink]; noThink=true appends /no_think for Qwen3 models.
	  const hfModels: ReadonlyArray<readonly [string, string, boolean]> = [
	    ["Qwen/Qwen3-235B-A22B", "HF/Qwen3-235B", true],
	    ["Qwen/Qwen3-72B", "HF/Qwen3-72B", true],
	    ["meta-llama/Llama-4-Scout-17B-16E-Instruct", "HF/Llama4-Scout", false],
	    ["Qwen/Qwen3-30B-A3B", "HF/Qwen3-30B-A3B", true],
	    ["Qwen/Qwen2.5-72B-Instruct", "HF/Qwen2.5-72B", false],
	    ["meta-llama/Llama-3.3-70B-Instruct", "HF/Llama3.3-70B", false],
	    ["mistralai/Mistral-Nemo-Instruct-2407", "HF/Mistral-Nemo", false],
	  ];
	  for (const [model, label, noThink] of hfModels) {
	    resolved.push({ baseUrl: routerBase, apiKey: token, model, label, noThink });
	  }

	  return resolved;
	}

	/**
	 * Splits text into chunks of at most `maxChars` characters for AI correction.
	 *
	 * Paragraphs (separated by blank lines) are packed greedily. A paragraph that
	 * is itself too long is packed line by line, and a single line that is too
	 * long is cut into consecutive slices — no text is dropped.
	 *
	 * @param text Text to split.
	 * @param maxChars Upper bound per chunk; defaults to AI_CHUNK_CHARS. Values
	 *                 below 1 are treated as 1.
	 * @returns Non-blank chunks in document order.
	 */
	function chunkForAiCorrection(text: string, maxChars: number = AI_CHUNK_CHARS): string[] {
	  const limit = Math.max(1, Math.floor(maxChars));
	  const chunks: string[] = [];
	  let buf = "";
	  const flush = (): void => {
	    if (buf) chunks.push(buf);
	    buf = "";
	  };

	  for (const para of text.split(/\n{2,}/)) {
	    const joined = buf ? `${buf}\n\n${para}` : para;
	    if (joined.length <= limit) {
	      buf = joined;
	      continue;
	    }

	    flush();
	    if (para.length <= limit) {
	      buf = para;
	      continue;
	    }

	    // Oversized paragraph: pack it line by line.
	    for (const line of para.split("\n")) {
	      const lineJoined = buf ? `${buf}\n${line}` : line;
	      if (lineJoined.length <= limit) {
	        buf = lineJoined;
	        continue;
	      }
	      flush();
	      // Oversized line: emit full-size slices and keep the remainder buffered.
	      let rest = line;
	      while (rest.length > limit) {
	        chunks.push(rest.slice(0, limit));
	        rest = rest.slice(limit);
	      }
	      buf = rest;
	    }
	  }

	  if (buf.trim()) chunks.push(buf);
	  return chunks.filter((c) => c.trim().length > 0);
	}

	/**
	 * Sends one chunk to a chat-completions endpoint and returns the corrected text.
	 *
	 * The request is aborted after AI_CHUNK_TIMEOUT_MS. Any `<think>…</think>`
	 * block in the reply is removed. If the reply is empty, not a string, or its
	 * length is outside 35%–300% of the input length, the input is returned
	 * unchanged.
	 *
	 * @param text Chunk to correct.
	 * @param ep Endpoint to call.
	 * @returns Corrected text, or `text` when the reply fails the sanity check.
	 * @throws Error with `code: "rate_limited"` on HTTP 429; with
	 *         `code: "unavailable"` on 503, 402 and 404; with message
	 *         `ai_http_<status>` on any other non-OK status. Network and abort
	 *         errors from `fetch` propagate as-is.
	 */
	async function callAiCorrection(
	  text: string,
	  ep: AiEndpoint,
	): Promise<string> {
	  const controller = new AbortController();
	  const timer = setTimeout(() => controller.abort(), AI_CHUNK_TIMEOUT_MS);
	  try {
	    // Endpoints flagged noThink get a /no_think suffix, which asks Qwen3 models
	    // to skip chain-of-thought reasoning for this straightforward correction task.
	    const userContent = ep.noThink
	      ? `النص المستخرج من PDF:\n\n${text}\n\nالنص المصحح: /no_think`
	      : `النص المستخرج من PDF:\n\n${text}\n\nالنص المصحح:`;

	    const body: Record<string, unknown> = {
	      model: ep.model,
	      messages: [
	        { role: "system", content: AI_SYSTEM_PROMPT },
	        { role: "user", content: userContent },
	      ],
	      max_tokens: Math.min(4096, Math.ceil(text.length * 2)),
	      temperature: 0.1, // low temp = deterministic, less hallucination
	    };

	    const resp = await fetch(`${ep.baseUrl}/chat/completions`, {
	      method: "POST",
	      headers: { Authorization: `Bearer ${ep.apiKey}`, "Content-Type": "application/json" },
	      body: JSON.stringify(body),
	      signal: controller.signal,
	    });

	    if (resp.status === 429) throw Object.assign(new Error("rate_limited"), { code: "rate_limited" });
	    if (resp.status === 503) throw Object.assign(new Error("unavailable"), { code: "unavailable" });
	    if (resp.status === 402) throw Object.assign(new Error("payment_required"), { code: "unavailable" }); // no credits → try next
	    if (resp.status === 404) throw Object.assign(new Error("model_not_found"), { code: "unavailable" }); // unsupported model
	    if (!resp.ok) throw new Error(`ai_http_${resp.status}`);

	    const data = (await resp.json()) as
	      | { choices?: Array<{ message?: { content?: unknown } }> }
	      | null;
	    const content = data?.choices?.[0]?.message?.content;
	    let corrected = (typeof content === "string" ? content : "").trim();

	    // Strip any <think>...</think> block Qwen3 might emit even with /no_think
	    corrected = corrected.replace(/<think>[\s\S]*?<\/think>\s*/gi, "").trim();

	    // Sanity: output must be 35%–300% of input length
	    if (!corrected || corrected.length < text.length * 0.35 || corrected.length > text.length * 3) {
	      return text;
	    }
	    return corrected;
	  } finally {
	    clearTimeout(timer);
	  }
	}

	/** Progress callback: receives a status message and a percentage, and is awaited by the caller. */
	type ProgressFn = (msg: string, pct: number) => Promise<void>;

	/**
	 * Optionally polishes extracted Arabic text with an AI model, chunk by chunk.
	 *
	 * Returns the input untouched when no endpoint is configured, when the text
	 * has fewer than 50 non-space characters, or when less than 25% of them are
	 * Arabic.
	 *
	 * Endpoint fallback: on "rate_limited", "unavailable" or an `ai_http_5xx`
	 * error the next endpoint is used for this and all later chunks. Any other
	 * error (including a timeout abort) keeps the original chunk and stays on the
	 * same endpoint. Once every endpoint is exhausted the remaining chunks are
	 * passed through unchanged. This function does not throw because of an AI
	 * failure.
	 *
	 * @param rawText Extracted text.
	 * @param onProgress Optional callback; receives a message and a percentage
	 *                   in the 33–54 range.
	 * @returns Corrected chunks joined with blank lines.
	 */
	async function correctArabicText(rawText: string, onProgress?: ProgressFn): Promise<string> {
	  const endpoints = resolveAiEndpoints();
	  if (!endpoints.length) {
	    logger.info("[arabic-ai] No AI endpoint configured — using OCR text as-is");
	    return rawText;
	  }

	  // Only correct predominantly Arabic text
	  const arabicChars = (rawText.match(/[\u0600-\u06FF]/g) ?? []).length;
	  const nonSpaceChars = rawText.replace(/\s/g, "").length;
	  if (nonSpaceChars < 50 || arabicChars / nonSpaceChars < 0.25) return rawText;

	  const chunks = chunkForAiCorrection(rawText);

	  // Index of the endpoint currently in use; only ever moves forward.
	  let activeEpIdx = 0;
	  logger.info(`[arabic-ai] ${chunks.length} chunks, ${endpoints.length} endpoints available — primary: ${endpoints[0].label}`);

	  const correctedParts: string[] = [];

	  for (let i = 0; i < chunks.length; i++) {
	    const pct = 33 + Math.round((i / chunks.length) * 21);
	    const ep = endpoints[activeEpIdx];
	    await onProgress?.(`تصحيح النص عبر ${ep.label.split("/")[1]}... (${i + 1}/${chunks.length})`, pct);

	    let succeeded = false;
	    while (activeEpIdx < endpoints.length) {
	      const cur = endpoints[activeEpIdx];
	      try {
	        const result = await callAiCorrection(chunks[i], cur);
	        correctedParts.push(result);
	        succeeded = true;
	        break;
	      } catch (err: unknown) {
	        // `code` is not always a string (an aborted fetch rejects with a
	        // DOMException whose `code` is the number 20), so coerce before using
	        // string methods.
	        const details = err as { code?: unknown; message?: unknown } | null | undefined;
	        const code = String(details?.code ?? details?.message ?? "");
	        if (code === "rate_limited" || code === "unavailable" || code.startsWith("ai_http_5")) {
	          logger.warn(`[arabic-ai] ${cur.label} ${code} — switching to next endpoint`);
	          activeEpIdx++;
	          // update progress label for new endpoint
	          if (activeEpIdx < endpoints.length) {
	            await onProgress?.(`التحويل عبر ${endpoints[activeEpIdx].label.split("/")[1]}... (${i + 1}/${chunks.length})`, pct);
	          }
	        } else {
	          logger.warn({ err }, `[arabic-ai] chunk ${i} error on ${cur.label} — keeping raw text`);
	          break;
	        }
	      }
	    }

	    if (!succeeded) {
	      // All endpoints exhausted or non-retryable error — keep original chunk
	      correctedParts.push(chunks[i]);
	      if (activeEpIdx >= endpoints.length) {
	        // No more endpoints: pass remaining chunks through unchanged
	        correctedParts.push(...chunks.slice(i + 1));
	        logger.warn("[arabic-ai] All endpoints exhausted — remaining chunks kept as-is");
	        break;
	      }
	    }
	  }

	  return correctedParts.join("\n\n");
	}

	// ── Garbled Arabic detector ───────────────────────────────────────────────────
	/**
	 * Heuristically decides whether extracted PDF text is broken Arabic that
	 * should be re-extracted another way.
	 *
	 * Two failure families are checked:
	 *  A) Character-pair transposition (RTL/LTR confusion), e.g. في → يف.
	 *  B) Broken ToUnicode CMap: Arabic glyphs mapped to Latin code points,
	 *     producing short Latin noise inline with Arabic text, often wrapped in
	 *     bidi control characters (LRM U+200E / RLM U+200F).
	 *
	 * @param text Extracted text to inspect.
	 * @returns `true` when any signal fires; always `false` for text with fewer
	 *          than 100 Arabic-block characters.
	 */
	function isGarbledArabic(text: string): boolean {
	  const arabicChars = (text.match(/[\u0600-\u06FF]/g) ?? []).length;
	  if (arabicChars < 100) return false;

	  // ── Type A: character-pair transposition ───────────────────────────────
	  // Space-delimited يف → garbled في (≥3 occurrences is conclusive)
	  const garbledFi = (text.match(/ يف /g) ?? []).length;
	  if (garbledFi >= 3) return true;

	  // Garbled الحمد (very common opening in Islamic texts)
	  if (/امحلد/.test(text)) return true;

	  // Garbled ordinal markers ثانياً / ثالثاً used as section headers.
	  // The `|` must be a real alternation: an escaped pipe would only match
	  // the two words joined by a literal "|" and the check would never fire.
	  if (/اثنياا|اثلثاا/.test(text)) return true;

	  // ── Type B: broken CMap → Arabic mapped to Latin code points ───────────
	  // Signal 1: 1–6 Latin letters wrapped on both sides by LRM (U+200E) or
	  // RLM (U+200F), e.g. <LRM>OA<RLM> <LRM>Zz<RLM> <LRM>BW<RLM>.
	  const bidiLatinWraps = (text.match(/[\u200E\u200F][A-Za-z]{1,6}[\u200E\u200F]/g) ?? []).length;
	  if (bidiLatinWraps >= 3) return true;

	  // Signal 2: multiple short ALL-CAPS Latin sequences appearing INLINE
	  // within lines that also contain Arabic.
	  // e.g. "المبادئ العشرة OA للعلوم BW أولاً" — OA/BW = garbled Arabic words.
	  // Common acronyms and Roman numerals are not counted as noise.
	  const IGNORE_CAPS = new Set(["PDF", "OCR", "AI", "URL", "API", "HTML", "CSS", "JS", "TS",
	    "I", "II", "III", "IV", "VI", "VII", "VIII", "IX", "XI", "XII"]);
	  const garbledLines = text.split("\n").filter(line => {
	    const arabic = (line.match(/[\u0600-\u06FF]/g) ?? []).length;
	    if (arabic < 3) return false;
	    // 1–5 capitals not adjacent to any other Latin letter
	    const noiseCaps = (line.match(/(?<![A-Za-z])([A-Z]{1,5})(?![A-Za-z])/g) ?? [])
	      .filter(m => !IGNORE_CAPS.has(m));
	    return noiseCaps.length >= 2;
	  }).length;
	  if (garbledLines >= 4) return true;

	  // Signal 3: suspiciously high ratio of Latin alphabetic chars in
	  // a predominantly-Arabic document (broken CMap maps Arabic → Latin).
	  const latinAlpha = (text.match(/[A-Za-z]/g) ?? []).length;
	  if (arabicChars >= 300 && latinAlpha > arabicChars * 0.12) return true;

	  return false;
	}

	// ── VLM-based OCR per page (olmOCR / Qwen2.5-VL via HF Inference API) ────────
	// Uses vision-language models to extract text from rendered page images.
	// olmOCR (Allen Institute) is specifically fine-tuned for document OCR and
	// achieves top-1 Arabic accuracy on KITAB-Bench benchmarks.
	// Model priority: olmOCR-7B → Qwen2.5-VL-72B → Qwen2.5-VL-7B → Tesseract (local fallback)

	// VLM OCR configuration. Requests go to the generic Hugging Face router;
	// models are listed in the order they are tried for each page.
	const VLM_OCR_ROUTER = "https://router.huggingface.co/v1";
	const VLM_OCR_MODELS = [
	  // 1st choice: document-OCR fine-tune from the Allen Institute
	  "allenai/olmOCR-7B-0225-preview",
	  // 2nd choice: larger general-purpose VLM
	  "Qwen/Qwen2.5-VL-72B-Instruct",
	  // 3rd choice: smaller, faster fallback
	  "Qwen/Qwen2.5-VL-7B-Instruct",
	];
	// Per-model, per-page request timeout (90 seconds).
	const VLM_PAGE_TIMEOUT_MS = 90 * 1000;
	// Instruction sent alongside each page image.
	const VLM_OCR_PROMPT = [
	  "Extract all the text from this document page exactly as written.",
	  "Preserve Arabic text, paragraph structure, headings, and line breaks.",
	  "Do not add explanations or commentary — output only the extracted text.",
	].join(" ");

	/**
	 * OCRs a single rendered page image with a vision-language model.
	 *
	 * Each model in `VLM_OCR_MODELS` is tried in order; the first response longer
	 * than 20 characters wins. Failures other than rate limiting (HTTP errors,
	 * timeouts, malformed payloads, near-empty output) are logged and the next
	 * model is tried.
	 *
	 * @param pngPath Path of the PNG page image to send.
	 * @param hfToken Bearer token for the Hugging Face router.
	 * @returns The extracted page text.
	 * @throws An error with `code === "rate_limited"` on HTTP 429 (so the caller
	 *         can stop using the VLM), or `Error("all_vlm_models_failed")` when
	 *         no model produced usable text.
	 */
	async function extractPageViaVlm(pngPath: string, hfToken: string): Promise<string> {
	  // Non-blocking read: page images can be several MB and this runs once per page.
	  const imgBase64 = (await fs.promises.readFile(pngPath)).toString("base64");
	  const imageUrl = `data:image/png;base64,${imgBase64}`;

	  for (const model of VLM_OCR_MODELS) {
	    const shortName = model.split("/")[1] ?? model;
	    const ctrl = new AbortController();
	    const timer = setTimeout(() => ctrl.abort(), VLM_PAGE_TIMEOUT_MS);
	    try {
	      const resp = await fetch(`${VLM_OCR_ROUTER}/chat/completions`, {
	        method: "POST",
	        headers: { Authorization: `Bearer ${hfToken}`, "Content-Type": "application/json" },
	        body: JSON.stringify({
	          model,
	          messages: [{
	            role: "user",
	            content: [
	              { type: "image_url", image_url: { url: imageUrl } },
	              { type: "text", text: VLM_OCR_PROMPT },
	            ],
	          }],
	          max_tokens: 4096,
	          temperature: 0.0,
	        }),
	        signal: ctrl.signal,
	      });
	      if (resp.status === 429) throw Object.assign(new Error("rate_limited"), { code: "rate_limited" });
	      if (resp.status === 402) throw Object.assign(new Error("payment_required"), { code: "unavailable" });
	      if (resp.status === 404) throw Object.assign(new Error("model_not_found"), { code: "unavailable" });
	      if (!resp.ok) throw new Error(`vlm_http_${resp.status}`);

	      // The timer stays armed while the body is read, so a response that
	      // stalls mid-stream is aborted too (it is cleared in `finally`).
	      const data = (await resp.json()) as any;
	      const raw = data?.choices?.[0]?.message?.content;
	      const content = typeof raw === "string" ? raw.trim() : "";
	      if (content.length > 20) {
	        logger.info(`[vlm-ocr] ${shortName} → ${content.length} chars`);
	        return content;
	      }
	      logger.warn(`[vlm-ocr] ${shortName} returned empty — trying next`);
	    } catch (err: any) {
	      if (err?.code === "rate_limited") {
	        logger.warn(`[vlm-ocr] ${shortName} rate-limited`);
	        throw err; // propagate so caller can switch to Tesseract
	      }
	      logger.warn({ err: err?.message }, `[vlm-ocr] ${shortName} failed`);
	    } finally {
	      clearTimeout(timer);
	    }
	  }
	  throw new Error("all_vlm_models_failed");
	}

	// ── OCR-based PDF extractor (fallback for broken-CMap PDFs) ──────────────────
	// Pipeline:
	// 1. pdftoppm renders pages to PNG (200 DPI — optimal for VLM API)
	// 2. Per page: try VLM-OCR (olmOCR via HF API) first if HF_TOKEN available
	// 3. Fall back to Tesseract (local) if VLM fails / rate-limited
	// No page cap — processes the full document regardless of length.

	/**
	 * Removes Latin-only noise lines from OCR output of Arabic documents
	 * (decorative pages, headers, misread ornaments such as "Me NY 1").
	 *
	 * Per trimmed line:
	 *  - blank lines are kept as paragraph separators;
	 *  - lines with at least 4 Arabic characters are kept;
	 *  - lines with no Arabic and at most 30 characters are dropped;
	 *  - lines whose letters are more than 80% Latin are dropped;
	 *  - everything else (numbers, punctuation, mixed headings) is kept.
	 * Runs of three or more newlines are then collapsed to two.
	 */
	function cleanOcrOutput(text: string): string {
	  const countOf = (s: string, re: RegExp): number => (s.match(re) ?? []).length;

	  const shouldKeep = (line: string): boolean => {
	    if (line === "") return true;

	    const arabic = countOf(line, /[\u0600-\u06FF]/g);
	    if (arabic >= 4) return true;
	    if (arabic === 0 && line.length <= 30) return false;

	    const latin = countOf(line, /[a-zA-Z]/g);
	    const letters = arabic + latin;
	    const mostlyLatin = letters > 0 && latin / letters > 0.80;
	    return !mostlyLatin;
	  };

	  const kept = text
	    .split("\n")
	    .map((raw) => raw.trim())
	    .filter(shouldKeep);

	  return kept.join("\n").replace(/\n{3,}/g, "\n\n").trim();
	}

	/**
	 * OCR fallback for PDFs whose embedded text is unusable.
	 *
	 * Renders the requested page range to PNG with `pdftoppm` into a temporary
	 * directory, then for each page tries VLM OCR (when `HF_TOKEN` is set) and
	 * falls back to a local Tesseract worker. After the first rate-limit error
	 * the VLM is skipped for all remaining pages. Tesseract output is passed
	 * through `cleanOcrOutput`; VLM output is used as returned.
	 *
	 * @param filePath   Path of the PDF.
	 * @param pageStart  First page (1-based); defaults to 1.
	 * @param pageEnd    Last page; defaults to 9999, i.e. "to the end".
	 * @param onProgress Called after each page with (pages done, total pages).
	 * @returns Page texts joined by blank lines, capped at `TEXT_CAP` characters,
	 *          or "" when rendering/OCR fails or no pages were rendered.
	 */
	async function extractPdfViaOcr(
	  filePath: string,
	  pageStart?: number,
	  pageEnd?: number,
	  onProgress?: (done: number, total: number) => void,
	): Promise<string> {
	  const { execFile } = await import("child_process");
	  const { promisify } = await import("util");
	  const { tmpdir } = await import("os");
	  const execFileAsync = promisify(execFile);

	  const hfToken = process.env.HF_TOKEN;

	  // Use the platform temp directory rather than a hard-coded "/tmp".
	  const tmpDir = fs.mkdtempSync(path.join(tmpdir(), "pdf-ocr-"));
	  let tessWorker: any = null;

	  try {
	    const startPage = pageStart && pageStart > 0 ? pageStart : 1;
	    const endPage = pageEnd && pageEnd > 0 ? pageEnd : 9999;

	    // 200 DPI keeps images small for the VLM API; 300 DPI suits Tesseract.
	    // The choice is made once up front: pages are NOT re-rendered at 300 DPI
	    // if the VLM later fails and Tesseract takes over.
	    const dpi = hfToken ? "200" : "300";
	    await execFileAsync(
	      "pdftoppm",
	      ["-r", dpi, "-png", "-f", String(startPage), "-l", String(endPage),
	        filePath, path.join(tmpDir, "page")],
	      { timeout: 600_000 },
	    );

	    // Sorted by file name, which is how page order is recovered here.
	    const pngFiles = fs.readdirSync(tmpDir)
	      .filter(f => f.endsWith(".png"))
	      .sort()
	      .map(f => path.join(tmpDir, f));

	    if (pngFiles.length === 0) return "";

	    const pageTexts: string[] = [];
	    let vlmRateLimited = false;

	    for (let i = 0; i < pngFiles.length; i++) {
	      let pageText = "";
	      let needTesseract = true;

	      // ── Try VLM-OCR first (olmOCR / Qwen2.5-VL via HF) ────────────────
	      if (hfToken && !vlmRateLimited) {
	        try {
	          pageText = await extractPageViaVlm(pngFiles[i], hfToken);
	          needTesseract = false;
	        } catch (err: any) {
	          if (err?.code === "rate_limited") {
	            vlmRateLimited = true;
	            logger.warn("[vlm-ocr] Rate limited — switching to Tesseract for all remaining pages");
	          } else {
	            logger.warn({ err: err?.message }, `[vlm-ocr] page ${i + 1} failed — using Tesseract`);
	          }
	        }
	      }

	      // ── Fallback: Tesseract (local) ────────────────────────────────────
	      if (needTesseract) {
	        if (!tessWorker) {
	          // Lazy-initialise Tesseract only when actually needed
	          const tessDataDir =
	            process.env.NODE_ENV === "production"
	              ? "/data/tessdata"
	              : path.join(process.cwd(), "uploads", ".tessdata");
	          if (!fs.existsSync(tessDataDir)) fs.mkdirSync(tessDataDir, { recursive: true });
	          const Tesseract = await import("tesseract.js");
	          tessWorker = await Tesseract.createWorker(["ara", "eng"], 1, {
	            cachePath: tessDataDir,
	            workerPath: getTessWorkerPath(),
	          });
	        }
	        const { data: { text } } = await tessWorker.recognize(pngFiles[i]);
	        pageText = cleanOcrOutput(text);
	      }

	      if (pageText.trim()) pageTexts.push(pageText.trim());
	      onProgress?.(i + 1, pngFiles.length);
	    }

	    const result = pageTexts.join("\n\n");
	    return result.length > TEXT_CAP ? result.slice(0, TEXT_CAP) : result;
	  } catch (e) {
	    logger.error({ err: e }, "[extractPdfViaOcr] failed");
	    return "";
	  } finally {
	    // Cleanup runs on every path and never throws, so a failing terminate()
	    // can no longer discard text that was already extracted successfully.
	    if (tessWorker) {
	      try { await tessWorker.terminate(); } catch { /* ignore */ }
	    }
	    try { fs.rmSync(tmpDir, { recursive: true, force: true }); } catch { /* ignore */ }
	  }
	}

	// ── RTL-aware PDF extractor using pdfjs-dist directly ────────────────────────
	/**
	 * Extracts text from a PDF in right-to-left reading order.
	 *
	 * pdf-parse v2 has no `pagerender` callback, so pdfjs-dist is loaded directly
	 * (resolved through pdf-parse's own dependency tree).
	 *
	 * Algorithm per page:
	 *  1. getTextContent() → items with {x, y, width, height, str}
	 *  2. Bucket items into visual lines by quantised Y (Y_THRESH = 10 pt)
	 *  3. Sort each bucket right→left (descending X) → Arabic reading order
	 *  4. Join items; insert a space only when the visual gap between adjacent
	 *     items exceeds 25% of the item's font height — this separates ligature
	 *     sub-glyph gaps (~1 pt) from word gaps (~4+ pt).
	 *
	 * @param filePath  Path of the PDF; at most the first 200 MiB are read.
	 * @param pageStart First page (1-based, clamped to the page count); default 1.
	 * @param pageEnd   Last page (clamped to the page count); default last page.
	 * @returns Extracted text after `cleanArabicPdfRaw`, capped at `TEXT_CAP`
	 *          characters, or "" on any failure (the error is logged).
	 */
	async function extractPdf(filePath: string, pageStart?: number, pageEnd?: number): Promise<string> {
	  let pdfDoc: any = null;
	  try {
	    const { createRequire } = await import("module");
	    const req = createRequire(import.meta.url);

	    // Resolve pdfjs-dist via pdf-parse's own node_modules (it is a declared
	    // dependency of pdf-parse v2, so it is expected to be present there).
	    const pdfParseCjsPath = req.resolve("pdf-parse");
	    const pdfParseReq = createRequire(pdfParseCjsPath);
	    const pdfjsMjsPath = pdfParseReq.resolve("pdfjs-dist/legacy/build/pdf.mjs");
	    const pdfjsWorkerPath = pdfParseReq.resolve("pdfjs-dist/legacy/build/pdf.worker.mjs");

	    // Dynamic ESM import of pdfjs-dist (it is an ES module)
	    const { getDocument, GlobalWorkerOptions, VerbosityLevel } =
	      (await import(pdfjsMjsPath)) as any;

	    GlobalWorkerOptions.workerSrc = pdfjsWorkerPath;

	    // Read at most MAX_PDF_BYTES from the start of the file.
	    const MAX_PDF_BYTES = 200 * 1024 * 1024;
	    const readSize = Math.min(fs.statSync(filePath).size, MAX_PDF_BYTES);
	    const buf = Buffer.alloc(readSize);
	    const fd = fs.openSync(filePath, "r");
	    try {
	      fs.readSync(fd, buf, 0, readSize, 0);
	    } finally {
	      // Always release the descriptor, even when the read throws.
	      fs.closeSync(fd);
	    }

	    // VerbosityLevel.ERRORS = 0 → suppress "Warning: TT: undefined function" noise
	    const verbosity: number = (VerbosityLevel as any)?.ERRORS ?? 0;
	    pdfDoc = await getDocument({
	      data: new Uint8Array(buf),
	      useWorkerFetch: false,
	      isEvalSupported: false,
	      useSystemFonts: true,
	      verbosity,
	    }).promise;

	    const totalPages = pdfDoc.numPages as number;
	    const startPage = pageStart && pageStart > 0 ? Math.min(pageStart, totalPages) : 1;
	    const endPage = pageEnd && pageEnd > 0 ? Math.min(pageEnd, totalPages) : totalPages;

	    // Y_THRESH = 10 pt: groups diacritics / sub-glyphs on slightly different Y
	    // into the same visual line.
	    const Y_THRESH = 10;

	    type TextItem = { x: number; y: number; str: string; width: number; height: number };

	    const pageTexts: string[] = [];

	    for (let p = startPage; p <= endPage; p++) {
	      const page = await pdfDoc.getPage(p);
	      const tc = await page.getTextContent({ includeMarkedContent: false });

	      const items: TextItem[] = [];
	      for (const it of (tc.items ?? [])) {
	        // Skip non-text entries and whitespace-only runs
	        if (typeof it.str !== "string" || !it.str.trim()) continue;
	        items.push({
	          x: it.transform[4],
	          y: it.transform[5],
	          str: it.str,
	          width: it.width ?? 0,
	          height: it.height ?? 12, // fallback to 12 pt if absent
	        });
	      }

	      if (!items.length) {
	        page.cleanup();
	        continue;
	      }

	      // Bucket by quantised Y
	      const buckets = new Map<number, TextItem[]>();
	      for (const it of items) {
	        const key = Math.round(it.y / Y_THRESH) * Y_THRESH;
	        if (!buckets.has(key)) buckets.set(key, []);
	        buckets.get(key)!.push(it);
	      }

	      // Lines top→bottom (larger Y = higher on PDF page)
	      const sortedYs = Array.from(buckets.keys()).sort((a, b) => b - a);

	      const lines: string[] = [];
	      for (const y of sortedYs) {
	        const row = buckets.get(y)!;
	        // RTL: sort right-to-left (descending X)
	        row.sort((a, b) => b.x - a.x);

	        // Join items, inserting a space only when the gap between adjacent
	        // items exceeds 25% of the item's font height.
	        let lineText = "";
	        for (let i = 0; i < row.length; i++) {
	          lineText += row[i].str;
	          if (i < row.length - 1) {
	            const cur = row[i];
	            const next = row[i + 1];
	            // gap = horizontal distance between right edge of `next` and left edge of `cur`
	            const gap = cur.x - (next.x + next.width);
	            const spaceThreshold = (cur.height > 0 ? cur.height : 12) * 0.25;
	            if (gap > spaceThreshold) lineText += " ";
	          }
	        }

	        const trimmed = lineText.trim();
	        if (trimmed) lines.push(trimmed);
	      }

	      page.cleanup();
	      pageTexts.push(lines.join("\n"));
	    }

	    let text = pageTexts.join("\n\n").trim();

	    // Arabic-specific post-processing (delegated to cleanArabicPdfRaw)
	    text = cleanArabicPdfRaw(text);

	    return text.length > TEXT_CAP ? text.slice(0, TEXT_CAP) : text;
	  } catch (e) {
	    logger.error({ err: e }, "[extractPdf] failed");
	    return "";
	  } finally {
	    if (pdfDoc) {
	      try { await pdfDoc.destroy(); } catch { /* ignore */ }
	    }
	  }
	}

	/**
	 * Extracts the raw text of a .docx file using mammoth.
	 *
	 * @param filePath Path of the document.
	 * @returns Trimmed text capped at `TEXT_CAP` characters, or "" on failure
	 *          (the error is reported instead of being silently dropped).
	 */
	async function extractDocx(filePath: string): Promise<string> {
	  try {
	    const mammoth = await import("mammoth");
	    const result = await mammoth.extractRawText({ path: filePath });
	    const text = result.value?.trim() ?? "";
	    return text.length > TEXT_CAP ? text.slice(0, TEXT_CAP) : text;
	  } catch (e) {
	    console.error("[RAQIM] [extractDocx] failed:", e);
	    return "";
	  }
	}

	/**
	 * Returns the absolute path of the Tesseract.js Node worker script.
	 *
	 * The path is derived from the installed package's `package.json` location,
	 * so it stays valid when the server is bundled with esbuild (where the
	 * library's default auto-resolution breaks).
	 */
	function getTessWorkerPath(): string {
	  const packageRoot = path.dirname(_require.resolve("tesseract.js/package.json"));
	  return path.join(packageRoot, "src/worker-script/node/index.js");
	}

	/**
	 * OCRs an image file with a local Tesseract worker (Arabic + English).
	 *
	 * @param filePath Path of the image.
	 * @returns The trimmed recognised text, or "" on failure (error is logged).
	 */
	async function extractImage(filePath: string): Promise<string> {
	  try {
	    const Tesseract = await import("tesseract.js");
	    const cacheDir =
	      process.env.NODE_ENV === "production"
	        ? "/data/tessdata"
	        : path.join(process.cwd(), "uploads", ".tessdata");
	    if (!fs.existsSync(cacheDir)) fs.mkdirSync(cacheDir, { recursive: true });
	    const worker = await Tesseract.createWorker(["ara", "eng"], 1, {
	      cachePath: cacheDir,
	      workerPath: getTessWorkerPath(),
	    });
	    try {
	      const { data: { text } } = await worker.recognize(filePath);
	      return text?.trim() ?? "";
	    } finally {
	      // Terminate on every path so a failed recognize() does not leak the
	      // worker; a terminate() failure must not discard the recognised text.
	      try { await worker.terminate(); } catch { /* ignore */ }
	    }
	  } catch (e) {
	    logger.error({ err: e }, "[extractImage] error");
	    return "";
	  }
	}

	/**
	 * Converts a CSV or Excel-style spreadsheet into Markdown tables.
	 *
	 * CSV: the first non-blank line is the header; at most 5000 lines are read.
	 * Other extensions: read with `xlsx` (5000 rows per sheet); every sheet
	 * becomes a `## <sheet name>` section followed by a table.
	 * Generation stops once the output exceeds `TEXT_CAP` characters.
	 *
	 * NOTE(review): CSV fields are split on every comma; quoted fields that
	 * contain commas or line breaks are not supported.
	 *
	 * @param filePath Path of the spreadsheet.
	 * @param ext      Lower-case extension including the dot (".csv" selects the CSV path).
	 * @returns The Markdown text, or "" when the file is empty or cannot be read.
	 */
	async function extractSpreadsheet(filePath: string, ext: string): Promise<string> {
	  // Renders one table row. Pipes are escaped and line breaks flattened, since
	  // either would break the Markdown table. Array.from also fills the holes
	  // of sparse rows with empty cells.
	  const toRow = (cells: readonly unknown[]): string => {
	    const rendered = Array.from(cells, (cell) =>
	      String(cell ?? "").replace(/\r?\n/g, " ").replace(/\|/g, "\\|").trim(),
	    );
	    return `| ${rendered.join(" | ")} |\n`;
	  };
	  const separatorRow = (width: number): string =>
	    `| ${Array.from({ length: width }, () => "---").join(" | ")} |\n`;

	  try {
	    if (ext === ".csv") {
	      // Drop a UTF-8 BOM so it does not end up inside the first header cell.
	      const content = fs.readFileSync(filePath, "utf-8").replace(/^\uFEFF/, "");
	      const lines = content
	        .split(/\r?\n/) // accept both LF and CRLF files
	        .filter((l) => l.trim() !== "")
	        .slice(0, 5000); // cap rows
	      if (lines.length === 0) return "";
	      const headers = lines[0].split(",");
	      let md = toRow(headers) + separatorRow(headers.length);
	      for (const line of lines.slice(1)) {
	        md += toRow(line.split(","));
	        if (md.length > TEXT_CAP) break;
	      }
	      return md;
	    }
	    const { createRequire } = await import("module");
	    const req = createRequire(import.meta.url);
	    const XLSX = req("xlsx");
	    const workbook = XLSX.readFile(filePath, { sheetRows: 5000 }); // cap rows per sheet
	    let md = "";
	    for (const sheetName of workbook.SheetNames) {
	      const sheet = workbook.Sheets[sheetName];
	      const data: unknown[][] = XLSX.utils.sheet_to_json(sheet, { header: 1 });
	      md += `## ${sheetName}\n\n`;
	      if (data.length > 0) {
	        // The header row fixes the column count for the whole table.
	        const width = data[0].length;
	        md += toRow(data[0]) + separatorRow(width);
	        for (const row of data.slice(1)) {
	          md += toRow(Array.from({ length: width }, (_, idx) => row[idx]));
	          if (md.length > TEXT_CAP) break;
	        }
	        md += "\n";
	      }
	      if (md.length > TEXT_CAP) break;
	    }
	    return md;
	  } catch (e) {
	    console.error("[RAQIM] [extractSpreadsheet] failed:", e);
	    return "";
	  }
	}

	/**
	 * Extracts slide text from a .pptx file.
	 *
	 * Reads `ppt/slides/slideN.xml` entries from the archive in numeric slide
	 * order and concatenates the contents of their `<a:t>` text runs, one
	 * paragraph per slide.
	 *
	 * @param filePath Path of the presentation.
	 * @returns Text capped at `TEXT_CAP` characters, or "" on failure.
	 */
	async function extractPptx(filePath: string): Promise<string> {
	  try {
	    const JSZip = (await import("jszip")).default;
	    const content = fs.readFileSync(filePath);
	    const zip = await JSZip.loadAsync(content);

	    const slideNumber = (name: string): number =>
	      Number(/slide(\d+)\.xml$/.exec(name)?.[1] ?? 0);
	    const slideFiles = Object.keys(zip.files)
	      .filter((f) => /^ppt\/slides\/slide\d+\.xml$/.test(f))
	      // Numeric sort: a plain string sort puts slide10 before slide2.
	      .sort((a, b) => slideNumber(a) - slideNumber(b));

	    let text = "";
	    for (const slideFile of slideFiles) {
	      const xml = await zip.files[slideFile].async("string");
	      // Text runs; tolerate attributes on <a:t> and line breaks inside it.
	      const matches = xml.match(/<a:t(?:\s[^>]*)?>([\s\S]*?)<\/a:t>/g) ?? [];
	      const slideText = matches
	        .map((m) => m.replace(/<[^>]+>/g, "").trim())
	        .filter(Boolean)
	        .join(" ");
	      if (slideText) text += slideText + "\n\n";
	      if (text.length > TEXT_CAP) break;
	    }
	    return text.length > TEXT_CAP ? text.slice(0, TEXT_CAP) : text;
	  } catch (e) {
	    console.error("[RAQIM] [extractPptx] failed:", e);
	    return "";
	  }
	}

	/**
	 * Extracts readable text from an EPUB by converting every HTML/XHTML entry
	 * of the archive with `htmlToPlainText`.
	 *
	 * NOTE(review): entries are visited in archive order, not in the order of
	 * the OPF spine, so chapter order follows how the file was packed.
	 *
	 * @param filePath Path of the EPUB.
	 * @returns Text capped at `TEXT_CAP` characters, or "" on failure.
	 */
	async function extractEpub(filePath: string): Promise<string> {
	  try {
	    const JSZip = (await import("jszip")).default;
	    const content = fs.readFileSync(filePath);
	    const zip = await JSZip.loadAsync(content);
	    let text = "";
	    for (const [filename, entry] of Object.entries(zip.files)) {
	      // Content documents only: .html / .xhtml / .htm, in any letter case.
	      if (entry.dir || !/\.x?html?$/i.test(filename)) continue;
	      const html = await entry.async("string");
	      text += htmlToPlainText(html) + "\n\n";
	      if (text.length > TEXT_CAP) break;
	    }
	    return text.length > TEXT_CAP ? text.slice(0, TEXT_CAP) : text;
	  } catch (e) {
	    console.error("[RAQIM] [extractEpub] failed:", e);
	    return "";
	  }
	}

	/**
	 * Decodes the handful of HTML entities this module cares about.
	 * `&amp;` is decoded last so that `&amp;lt;` becomes the text `&lt;`
	 * instead of being decoded twice into `<`.
	 */
	function decodeBasicHtmlEntities(s: string): string {
	  return s
	    .replace(/&lt;/g, "<")
	    .replace(/&gt;/g, ">")
	    .replace(/&nbsp;/g, " ")
	    .replace(/&amp;/g, "&");
	}

	/**
	 * Converts an HTML document or fragment into lightweight Markdown-ish text.
	 *
	 *  - `<script>` / `<style>` blocks are dropped together with their content.
	 *  - `<h1>`–`<h6>` become `#`-prefixed heading lines.
	 *  - `<p>` becomes its own paragraph, `<li>` becomes a `- ` bullet line,
	 *    `<br>` becomes a newline.
	 *  - Every remaining tag is removed, then entities are decoded once, so
	 *    escaped text such as `&lt;b&gt;` survives as literal `<b>`.
	 *  - Runs of three or more newlines collapse to a single blank line.
	 */
	function htmlToPlainText(html: string): string {
	  const withoutTags = (fragment: string): string => fragment.replace(/<[^>]+>/g, "").trim();

	  const text = html
	    .replace(/<(script|style)\b[^>]*>[\s\S]*?<\/\1>/gi, "")
	    // `\b` keeps <p>/<li> from also matching <pre>, <param>, <link>, …
	    .replace(/<h([1-6])\b[^>]*>(.*?)<\/h\1>/gis, (_m, level: string, inner: string) =>
	      "\n" + "#".repeat(Number(level)) + " " + withoutTags(inner) + "\n")
	    .replace(/<p\b[^>]*>(.*?)<\/p>/gis, (_m, inner: string) => "\n" + withoutTags(inner) + "\n")
	    .replace(/<li\b[^>]*>(.*?)<\/li>/gis, "- $1\n")
	    .replace(/<br\s*\/?>/gi, "\n")
	    .replace(/<[^>]+>/g, "");

	  return decodeBasicHtmlEntities(text)
	    .replace(/\n{3,}/g, "\n\n")
	    .trim();
	}

	/** Removes all tags from an HTML fragment, decodes basic entities and trims. */
	function stripTags(s: string): string {
	  return decodeBasicHtmlEntities(s.replace(/<[^>]+>/g, "")).trim();
	}

	// ═══════════════════════════════════════════════════════════════════════════
	// Stats & Utilities
	// ═══════════════════════════════════════════════════════════════════════════
	/**
	 * Computes simple structural statistics for a Markdown document plus a
	 * heuristic quality score.
	 *
	 * The score starts at 72 and earns capped bonuses for headings (3 each, max
	 * 12), bold spans (max 10), list items (max 8), the presence of table rows
	 * (+4) and code fences (+2), and length (one point per 50 words, max 10).
	 * It never exceeds 98 and may be fractional.
	 *
	 * @param md Markdown text.
	 * @returns Word, heading, bold and list-item counts with the quality estimate.
	 */
	function computeStats(md: string) {
	  const count = (re: RegExp): number => (md.match(re) ?? []).length;

	  const wordCount = md.split(/\s+/).filter(Boolean).length;
	  const headings = count(/^#{1,6}\s/gm);
	  // **bold** spans: two asterisks, at least one non-asterisk char, two asterisks
	  const boldItems = count(/\*\*[^*]+\*\*/g);
	  const listItems = count(/^[-*+]\s/gm);
	  // Table rows are lines that start with a literal pipe
	  const tableRows = count(/^\|/gm);
	  // Each fenced block contributes an opening and a closing fence
	  const codeBlocks = count(/```/g) / 2;

	  const qualityEstimate = Math.min(
	    98,
	    72 +
	      Math.min(headings * 3, 12) +
	      Math.min(boldItems, 10) +
	      Math.min(listItems, 8) +
	      (tableRows > 0 ? 4 : 0) +
	      (codeBlocks > 0 ? 2 : 0) +
	      Math.min(wordCount / 50, 10)
	  );

	  return { wordCount, headings, boldItems, listItems, qualityEstimate };
	}

	/**
	 * Normalises whitespace in generated Markdown: CRLF becomes LF, trailing
	 * spaces/tabs are removed from every line, runs of four or more newlines
	 * shrink to three, and the result is trimmed.
	 */
	function cleanMarkdown(md: string): string {
	  const unixNewlines = md.replace(/\r\n/g, "\n");
	  const noTrailingBlanks = unixNewlines.replace(/[ \t]+$/gm, "");
	  const squeezed = noTrailingBlanks.replace(/\n{4,}/g, "\n\n\n");
	  return squeezed.trim();
	}

	/**
	 * Classifies text by comparing Arabic-block (U+0600–U+06FF) and ASCII Latin
	 * letter counts.
	 *
	 * @returns "ar" when the Arabic count exceeds 60% of the Latin count;
	 *          otherwise "en" when the Latin count exceeds 60% of the Arabic
	 *          count; otherwise "mixed".
	 *
	 * NOTE(review): with these thresholds "mixed" is only reachable when the
	 * text has neither Arabic nor Latin letters; genuinely bilingual text is
	 * reported as "ar". Thresholds are kept as-is — confirm whether a
	 * share-of-total rule was intended before changing them.
	 */
	function detectLanguage(text: string): string {
	  const arabicChars = (text.match(/[\u0600-\u06FF]/g) ?? []).length;
	  const latinChars = (text.match(/[a-zA-Z]/g) ?? []).length;
	  if (arabicChars > latinChars * 0.6) return "ar";
	  if (latinChars > arabicChars * 0.6) return "en";
	  return "mixed";
	}

	/** Resolves after roughly `ms` milliseconds. */
	function sleep(ms: number): Promise<void> {
	  return new Promise<void>((resolve) => {
	    setTimeout(resolve, ms);
	  });
	}

	// Hard ceiling for a single conversion job: 15 minutes.
	const CONVERSION_TIMEOUT_MS = 15 * 60_000;

	/**
	 * Public entry point for a conversion job.
	 *
	 * Runs `runConversionCore` under the time limit above. If the job throws or
	 * times out, the conversion row is marked "failed" with the error message
	 * and then the file row is marked "failed".
	 */
	async function runConversion(conversionId: string, fileId: string, storagePath: string) {
	  try {
	    const job = runConversionCore(conversionId, fileId, storagePath);
	    await withTimeout(job, CONVERSION_TIMEOUT_MS, "تحويل الملف");
	  } catch (err) {
	    // Non-Error rejections fall back to a generic timeout message.
	    let errorMessage = "انتهت مهلة التحويل";
	    if (err instanceof Error) errorMessage = err.message;

	    const failedConversion = { status: "failed", errorMessage };
	    await db.update(conversionsTable)
	      .set(failedConversion)
	      .where(eq(conversionsTable.id, conversionId));

	    const failedFile = { status: "failed", updatedAt: new Date() };
	    await db.update(filesTable)
	      .set(failedFile)
	      .where(eq(filesTable.id, fileId));
	  }
	}

	// ═══════════════════════════════════════════════════════════════════════════
	// Routes
	// ═══════════════════════════════════════════════════════════════════════════

	// POST /api/convert/upload
	// Stores the uploaded file, creates a file row and a queued conversion row,
	// starts the conversion in the background and answers 202 with the job id.
	// Optional form fields: pageStart, pageEnd (positive integers), folderId.
	router.post("/upload", upload.single("file"), async (req: AuthRequest, res) => {
	  try {
	    if (!req.file) {
	      res.status(400).json({ error: "validation", message: "لم يتم رفع أي ملف" });
	      return;
	    }
	    const { pageStart, pageEnd, folderId } = req.body ?? {};

	    // Multipart fields arrive as strings. Anything that is not a positive
	    // integer is stored as null ("no limit") instead of NaN or a fraction.
	    const toPage = (value: unknown): number | null => {
	      const n = Number(value);
	      return Number.isInteger(n) && n > 0 ? n : null;
	    };

	    const originalName = fixFilename(req.file.originalname);
	    const fileName = path.parse(originalName).name;

	    const [file] = await db
	      .insert(filesTable)
	      .values({
	        name: fileName + ".md",
	        ownerId: req.userId!,
	        folderId: folderId || null,
	        originalName,
	        originalType: req.file.mimetype,
	        sizeBytes: req.file.size,
	        storagePath: req.file.path,
	        status: "queued",
	      })
	      .returning();

	    const [conversion] = await db
	      .insert(conversionsTable)
	      .values({
	        fileId: file.id,
	        userId: req.userId!,
	        status: "queued",
	        progress: 0,
	        steps: initSteps(),
	        pageStart: toPage(pageStart),
	        pageEnd: toPage(pageEnd),
	      })
	      .returning();

	    // Fire-and-forget: the client polls GET /:jobId for progress.
	    runConversion(conversion.id, file.id, req.file.path).catch((err) =>
	      req.log?.error({ err }, "background conversion error")
	    );

	    res.status(202).json({
	      jobId: conversion.id,
	      fileId: file.id,
	      status: "queued",
	      progress: 0,
	      steps: initSteps(),
	      createdAt: conversion.createdAt,
	    });
	  } catch (err) {
	    const e = err instanceof Error ? err : new Error(String(err));
	    // Prefer the wrapped root cause when the error carries one.
	    const cause = (e as NodeJS.ErrnoException & { cause?: Error }).cause;
	    const rootMsg = cause?.message ?? e.message;
	    console.error("[RAQIM] /upload error:", rootMsg, "\n outer:", e.message, "\n stack:", e.stack);
	    req.log?.error({ err, cause: cause?.message }, "upload error");
	    res.status(500).json({ error: "server_error", message: rootMsg || "فشل الرفع" });
	  }
	});

	// POST /api/convert/upload-split — upload once, create N conversion jobs
	// Form field `ranges` is a JSON array of { start, end, label }; one file row
	// and one conversion job are created per range, all sharing the same upload.
	router.post("/upload-split", upload.single("file"), async (req: AuthRequest, res) => {
	  try {
	    if (!req.file) {
	      res.status(400).json({ error: "validation", message: "لم يتم رفع أي ملف" });
	      return;
	    }
	    const { ranges: rangesJson, folderId } = req.body ?? {};

	    let parsed: unknown;
	    try {
	      parsed = JSON.parse(rangesJson || "[]");
	    } catch {
	      res.status(400).json({ error: "validation", message: "نطاقات الصفحات غير صالحة" });
	      return;
	    }
	    // Valid JSON is not enough: the payload must actually be an array.
	    if (!Array.isArray(parsed)) {
	      res.status(400).json({ error: "validation", message: "نطاقات الصفحات غير صالحة" });
	      return;
	    }
	    if (!parsed.length) {
	      res.status(400).json({ error: "validation", message: "يجب تحديد نطاق واحد على الأقل" });
	      return;
	    }

	    // Anything that is not a positive integer is stored as null ("no limit").
	    const toPage = (value: unknown): number | null => {
	      const n = Number(value);
	      return Number.isInteger(n) && n > 0 ? n : null;
	    };
	    // Normalise untrusted client input into a well-typed list. A missing
	    // label falls back to the 1-based position of the range.
	    const ranges = parsed.map((raw: unknown, idx: number) => {
	      const r = (raw !== null && typeof raw === "object" ? raw : {}) as {
	        start?: unknown; end?: unknown; label?: unknown;
	      };
	      const label =
	        typeof r.label === "string" || typeof r.label === "number"
	          ? String(r.label)
	          : String(idx + 1);
	      return { start: toPage(r.start), end: toPage(r.end), label };
	    });

	    const uploaded = req.file;
	    const originalName = fixFilename(uploaded.originalname);
	    const baseName = path.parse(originalName).name;
	    const jobs: Array<{ jobId: string; fileId: string; name: string }> = [];

	    for (const range of ranges) {
	      const partName = `${baseName} — ${range.label}.md`;
	      const [file] = await db
	        .insert(filesTable)
	        .values({
	          name: partName,
	          ownerId: req.userId!,
	          folderId: folderId || null,
	          originalName,
	          originalType: uploaded.mimetype,
	          sizeBytes: uploaded.size,
	          storagePath: uploaded.path,
	          status: "queued",
	        })
	        .returning();

	      const [conversion] = await db
	        .insert(conversionsTable)
	        .values({
	          fileId: file.id,
	          userId: req.userId!,
	          status: "queued",
	          progress: 0,
	          steps: initSteps(),
	          pageStart: range.start,
	          pageEnd: range.end,
	        })
	        .returning();

	      // Fire-and-forget: each part is converted in the background.
	      runConversion(conversion.id, file.id, uploaded.path).catch((err) =>
	        req.log?.error({ err }, "split conversion error")
	      );

	      jobs.push({ jobId: conversion.id, fileId: file.id, name: partName });
	    }

	    res.status(202).json({ jobs });
	  } catch (err) {
	    const e = err instanceof Error ? err : new Error(String(err));
	    // Prefer the wrapped root cause when the error carries one.
	    const cause = (e as NodeJS.ErrnoException & { cause?: Error }).cause;
	    const rootMsg = cause?.message ?? e.message;
	    console.error("[RAQIM] /upload-split error:", rootMsg, "\n outer:", e.message, "\n stack:", e.stack);
	    req.log?.error({ err, cause: cause?.message }, "upload-split error");
	    res.status(500).json({ error: "server_error", message: rootMsg || "فشل الرفع" });
	  }
	});

	// POST /api/convert
	// Re-runs conversion for a file the caller already owns. Body: fileId
	// (required), pageStart / pageEnd (optional positive integers).
	router.post("/", async (req: AuthRequest, res) => {
	  try {
	    const { fileId, pageStart, pageEnd } = req.body ?? {};
	    // Reject a missing or non-string id up front instead of passing it to the query.
	    if (typeof fileId !== "string" || !fileId) {
	      res.status(400).json({ error: "validation", message: "معرّف الملف مطلوب" });
	      return;
	    }

	    // Ownership is part of the lookup, so other users' files read as "not found".
	    const file = await db.query.filesTable.findFirst({
	      where: and(eq(filesTable.id, fileId), eq(filesTable.ownerId, req.userId!)),
	    });
	    if (!file || !file.storagePath) {
	      res.status(404).json({ error: "not_found", message: "الملف غير موجود" });
	      return;
	    }

	    // Anything that is not a positive integer is stored as null ("no limit").
	    const toPage = (value: unknown): number | null => {
	      const n = Number(value);
	      return Number.isInteger(n) && n > 0 ? n : null;
	    };

	    const [conversion] = await db
	      .insert(conversionsTable)
	      .values({
	        fileId: file.id,
	        userId: req.userId!,
	        status: "queued",
	        progress: 0,
	        steps: initSteps(),
	        pageStart: toPage(pageStart),
	        pageEnd: toPage(pageEnd),
	      })
	      .returning();

	    // Fire-and-forget: the client polls GET /:jobId for progress.
	    runConversion(conversion.id, file.id, file.storagePath).catch((err) =>
	      req.log?.error({ err }, "background conversion error")
	    );

	    res.status(202).json({
	      jobId: conversion.id,
	      fileId,
	      status: "queued",
	      progress: 0,
	      steps: initSteps(),
	      createdAt: conversion.createdAt,
	    });
	  } catch (err) {
	    req.log?.error({ err }, "convert error");
	    res.status(500).json({ error: "server_error", message: "فشل التحويل" });
	  }
	});

	// GET /api/convert/:jobId
	// Returns the current state of one of the caller's conversion jobs, or 404
	// when no job with that id belongs to the caller.
	router.get("/:jobId", async (req: AuthRequest, res) => {
	  try {
	    const jobId = req.params.jobId as string;
	    const ownedJob = and(
	      eq(conversionsTable.id, jobId),
	      eq(conversionsTable.userId, req.userId!),
	    );
	    const conv = await db.query.conversionsTable.findFirst({ where: ownedJob });

	    if (!conv) {
	      res.status(404).json({ error: "not_found", message: "المهمة غير موجودة" });
	      return;
	    }

	    const {
	      id,
	      fileId,
	      status,
	      progress,
	      steps,
	      elapsedSeconds,
	      estimatedSeconds,
	      errorMessage,
	      createdAt,
	    } = conv;

	    res.json({
	      jobId: id,
	      fileId,
	      status,
	      progress,
	      steps,
	      queuePosition: null, // always reported as null by this endpoint
	      elapsedSeconds,
	      estimatedSeconds,
	      errorMessage,
	      createdAt,
	    });
	  } catch (err) {
	    req.log?.error({ err }, "get conversion error");
	    res.status(500).json({ error: "server_error", message: "فشل جلب الحالة" });
	  }
	});

	export default router;