Spaces:

abedelbahnasy55
/

raqim

Sleeping

File size: 70,981 Bytes

import { Router } from "express";
import multer from "multer";
import path from "path";
import fs from "fs";
import { createRequire } from "module";
import { db } from "@workspace/db";
import { filesTable, conversionsTable } from "@workspace/db";
import { eq, and } from "drizzle-orm";
import { requireAuth, AuthRequest } from "../middlewares/auth.js";
import { logger } from "../lib/logger.js";

// Module-level require() for resolving peer package paths (works in ESM + esbuild bundles).
// createRequire gives this ES module a CommonJS-style require anchored at its own URL.
const _require = createRequire(import.meta.url);

// Router for this module; requireAuth is installed first, so it runs before
// every route handler registered on the router afterwards.
const router = Router();
router.use(requireAuth);

// In production, use /data/uploads (persistent HF Spaces volume).
// /tmp/uploads is a tmpfs that starts empty at container boot — unreliable.
// Outside production, uploads go to ./uploads under the current working directory.
const uploadDir =
  process.env.NODE_ENV === "production"
    ? "/data/uploads"
    : path.join(process.cwd(), "uploads");
// Best-effort creation at module load: a failure is only reported, module
// initialisation continues regardless.
// NOTE(review): this reports through console.error although a structured
// `logger` is imported above — confirm whether that is intentional.
try {
  fs.mkdirSync(uploadDir, { recursive: true });
} catch (e) {
  console.error("[RAQIM] Failed to create upload dir:", uploadDir, e);
}

/**
 * Repairs an upload filename whose UTF-8 bytes were decoded as Latin-1
 * (Multer decodes the filename header as Latin-1 by default).
 *
 * The re-decode is applied only when it is safe:
 *  - if the name contains any code unit above 0xFF it cannot be the product of
 *    a Latin-1 decode, so it is already proper Unicode and is returned as-is
 *    (re-encoding it as Latin-1 would truncate those characters);
 *  - if the bytes are not valid UTF-8 (the decode yields U+FFFD), the name was
 *    genuinely Latin-1 and is returned as-is.
 *
 * @param raw Filename as delivered by the upload middleware.
 * @returns The UTF-8 interpretation of the name, or `raw` when no repair applies.
 */
function fixFilename(raw: string): string {
  for (let i = 0; i < raw.length; i++) {
    if (raw.charCodeAt(i) > 0xff) return raw;
  }
  const decoded = Buffer.from(raw, "latin1").toString("utf8");
  // U+FFFD is emitted for every invalid UTF-8 sequence.
  return decoded.includes("\uFFFD") ? raw : decoded;
}

// Disk storage for uploads: files land in uploadDir and are named
// "<epoch-ms>-<repaired original name>".
const storage = multer.diskStorage({
  destination: uploadDir,
  filename: (_, file, cb) => cb(null, `${Date.now()}-${fixFilename(file.originalname)}`),
});
// Upload middleware; a single file may be at most 500 MiB (500 * 1024 * 1024 bytes).
const upload = multer({ storage, limits: { fileSize: 500 * 1024 * 1024 } });

// Ordered list of pipeline stages: `name` is the machine identifier, `label`
// the Arabic display text. runConversionCore addresses these entries by
// position (steps[0] … steps[6]), so the order here must match the order in
// which that function runs its stages.
const CONVERSION_STEPS = [
  { name: "analyzing",  label: "تحليل الملف والتعرف على نوعه" },
  { name: "routing",    label: "توجيه ذكي لأنسب محركات المعالجة" },
  { name: "ocr",        label: "استخراج النص الخام (OCR / Parser)" },
  { name: "layout",     label: "المهندس الذكي — إعادة بناء التنسيق" },
  { name: "scoring",    label: "تقييم الجودة وإحصاء العناصر" },
  { name: "merging",    label: "دمج الطبقات ومعالجة الهيكل النهائي" },
  { name: "cleanup",    label: "تنظيف وتلميع المستند" },
];

/**
 * Creates a fresh, independently mutable copy of the pipeline step list in
 * which every step starts with status "pending".
 */
function initSteps() {
  return CONVERSION_STEPS.map((step) => {
    const pendingStep = { ...step, status: "pending" };
    return pendingStep;
  });
}

/**
 * Races `promise` against a deadline.
 *
 * @param promise Work to wait for.
 * @param ms      Deadline in milliseconds.
 * @param label   Text appended to the timeout error message.
 * @returns A promise that settles like `promise`, or rejects with an Error if
 *          `ms` elapses first. The timer is cleared once the race settles.
 */
function withTimeout<T>(promise: Promise<T>, ms: number, label: string): Promise<T> {
  let timer: ReturnType<typeof setTimeout> | undefined;
  const deadline = new Promise<never>((_resolve, reject) => {
    timer = setTimeout(() => reject(new Error(`تجاوز الوقت المحدد: ${label}`)), ms);
  });
  return Promise.race([promise, deadline]).finally(() => clearTimeout(timer));
}

/**
 * Runs the seven-stage conversion for one uploaded file and persists the result.
 *
 * After every stage the conversions row is updated with the current status,
 * progress percentage, step list and elapsed seconds. While the job is running
 * the human-readable status message is stored in the `errorMessage` column; it
 * is reset to null on success and replaced by the failure message on error.
 *
 * Reported progress is monotonic: a stage that reports a lower percentage than
 * one already written (the PDF path reaches 55 before the layout stage reports
 * 45) is recorded at the highest value seen so far.
 *
 * Errors thrown inside the pipeline are caught: the current step is marked
 * "failed", the error is logged, and both the conversion and the file row are
 * set to status "failed". Errors from the initial lookup or from those
 * failure-recording writes propagate to the caller.
 *
 * @param conversionId Id of the conversions row to update.
 * @param fileId       Id of the files row that receives the Markdown result.
 * @param storagePath  Path of the uploaded file; its extension selects the extractor.
 */
async function runConversionCore(conversionId: string, fileId: string, storagePath: string) {
  const steps = initSteps();
  let stepIndex = 0;
  const startTime = Date.now();
  // Highest progress value written so far (keeps the reported progress from moving backwards).
  let lastProgress = 0;

  // Read page range set at upload time
  const convRecord = await db.query.conversionsTable.findFirst({
    where: eq(conversionsTable.id, conversionId),
  });
  const pageStart = convRecord?.pageStart ?? undefined;
  const pageEnd = convRecord?.pageEnd ?? undefined;

  const updateProgress = async (
    status: string,
    progress: number,
    stepsDone: typeof steps,
    aiMessage?: string
  ) => {
    lastProgress = Math.max(lastProgress, progress);
    await db
      .update(conversionsTable)
      .set({
        status: status as any,
        progress: lastProgress,
        steps: stepsDone,
        elapsedSeconds: Math.floor((Date.now() - startTime) / 1000),
        // The status message travels in the errorMessage column while the job runs.
        ...(aiMessage ? { errorMessage: aiMessage } : {}),
      })
      .where(eq(conversionsTable.id, conversionId));
  };

  try {
    const ext = path.extname(storagePath).toLowerCase();
    let rawText = "";

    // ── Step 1: Analyzing ───────────────────────────────────────────────
    stepIndex = 0;
    steps[0].status = "running";
    await updateProgress("analyzing", 5, steps, "جاري تحليل نوع الملف والبنية الداخلية...");
    await sleep(600);
    steps[0].status = "done";

    // ── Step 2: Routing ─────────────────────────────────────────────────
    stepIndex = 1;
    steps[1].status = "running";
    await updateProgress("routing", 12, steps, "اختيار أنسب محرك استخراج للملف...");
    await sleep(400);
    steps[1].status = "done";

    // ── Step 3: OCR / Text Extraction ───────────────────────────────────
    stepIndex = 2;
    steps[2].status = "running";
    await updateProgress("ocr", 20, steps, "جاري استخراج النص من الملف...");

    if ([".txt", ".md"].includes(ext)) {
      rawText = fs.readFileSync(storagePath, "utf-8");
    } else if (ext === ".pdf") {
      rawText = await extractPdf(storagePath, pageStart, pageEnd);
      await updateProgress("ocr", 28, steps, "تم استخراج النص الخام من الـ PDF...");

      // If text appears garbled (broken ToUnicode CMap in font), fall back to
      // rendering each page as an image and running Tesseract OCR on it.
      // This completely bypasses the CMap issue and works offline/without any API key.
      if (isGarbledArabic(rawText)) {
        await updateProgress("ocr", 30, steps, "تم رصد خلل في ترميز الخط — جاري استخدام OCR للحصول على نص دقيق...");
        const ocrText = await extractPdfViaOcr(storagePath, pageStart, pageEnd,
          // Guard against total === 0 so the percentage never becomes NaN.
          (done, total) => updateProgress("ocr", 30 + (total > 0 ? Math.round((done / total) * 20) : 0), steps,
            `جاري تحليل الصفحات بواسطة OCR... (${done}/${total})`)
        );
        // Only trust the OCR result when it produced a non-trivial amount of text.
        if (ocrText.length > 50) {
          rawText = ocrText;
          await updateProgress("ocr", 50, steps, "تم استخراج النص بواسطة OCR بدقة عالية ✓");
        }
      }

      // Optional AI polish — free on Replit (AI proxy) and on HF Spaces (HF_TOKEN).
      rawText = await correctArabicText(rawText, (msg, pct) =>
        updateProgress("ocr", pct, steps, msg)
      );
      await updateProgress("ocr", 55, steps, "اكتمل استخراج النص العربي ✓");
    } else if ([".docx", ".doc"].includes(ext)) {
      rawText = await extractDocx(storagePath);
      await updateProgress("ocr", 38, steps, "تم استخراج نص ملف Word...");
    } else if ([".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".webp", ".gif"].includes(ext)) {
      rawText = await extractImage(storagePath);
      await updateProgress("ocr", 38, steps, "تم استخراج النص من الصورة بتقنية OCR...");
    } else if ([".xlsx", ".xls", ".csv"].includes(ext)) {
      rawText = await extractSpreadsheet(storagePath, ext);
      await updateProgress("ocr", 38, steps, "تم تحليل جداول البيانات...");
    } else if ([".html", ".htm"].includes(ext)) {
      const html = fs.readFileSync(storagePath, "utf-8");
      rawText = htmlToPlainText(html);
      await updateProgress("ocr", 38, steps, "تم تحليل ملف HTML...");
    } else if ([".pptx", ".ppt"].includes(ext)) {
      rawText = await extractPptx(storagePath);
      await updateProgress("ocr", 38, steps, "تم استخراج نصوص الشرائح...");
    } else if ([".epub"].includes(ext)) {
      rawText = await extractEpub(storagePath);
      await updateProgress("ocr", 38, steps, "تم استخراج نصوص الكتاب الإلكتروني...");
    } else {
      // Unknown extension: read as UTF-8 text, capped at the first 100,000 characters.
      try {
        rawText = fs.readFileSync(storagePath, "utf-8").substring(0, 100000);
      } catch {
        rawText = `# ملف ثنائي\n\nلا يمكن استخراج نص من هذا النوع من الملفات مباشرة.`;
      }
    }

    steps[2].status = "done";

    // ── Step 4: Rule-Based Architect — 100% Free, No Limits ─────────────
    stepIndex = 3;
    steps[3].status = "running";
    await updateProgress("layout", 45, steps, "المهندس الذكي يعيد بناء هيكل المستند...");

    const architectMarkdown = runRuleBasedArchitect(rawText, ext);
    await updateProgress("layout", 68, steps, "اكتمل تحليل وهيكلة المستند");
    steps[3].status = "done";

    // ── Step 5: Scoring ─────────────────────────────────────────────────
    stepIndex = 4;
    steps[4].status = "running";
    await updateProgress("scoring", 75, steps, "جاري قياس الجودة وإحصاء العناصر...");
    const stats = computeStats(architectMarkdown);
    await sleep(400);
    steps[4].status = "done";

    // ── Step 6: Merging ─────────────────────────────────────────────────
    stepIndex = 5;
    steps[5].status = "running";
    await updateProgress("merging", 85, steps, "دمج الطبقات وتثبيت الهيكل النهائي...");
    await sleep(350);
    steps[5].status = "done";

    // ── Step 7: Cleanup ─────────────────────────────────────────────────
    stepIndex = 6;
    steps[6].status = "running";
    await updateProgress("cleanup", 93, steps, "التلميع النهائي والتحقق من سلامة النص...");
    const finalMarkdown = cleanMarkdown(architectMarkdown);
    await sleep(300);
    steps[6].status = "done";

    // ── Done ─────────────────────────────────────────────────────────────
    // The stored quality score is clamped to the range 72–98.
    const qualityScore = Math.min(98, Math.max(72, stats.qualityEstimate));

    await db
      .update(filesTable)
      .set({
        markdownContent: finalMarkdown,
        originalMarkdown: finalMarkdown,
        status: "done",
        wordCount: stats.wordCount,
        qualityScore,
        language: detectLanguage(finalMarkdown),
        updatedAt: new Date(),
      })
      .where(eq(filesTable.id, fileId));

    await db
      .update(conversionsTable)
      .set({
        status: "done",
        progress: 100,
        steps,
        completedAt: new Date(),
        elapsedSeconds: Math.floor((Date.now() - startTime) / 1000),
        errorMessage: null,
      })
      .where(eq(conversionsTable.id, conversionId));
  } catch (err) {
    const error = err instanceof Error ? err.message : "Unknown error";
    // Keep the full error (including stack) in the logs; only the message is stored in the DB.
    logger.error({ err, conversionId, fileId }, "[conversion] pipeline failed");
    if (steps[stepIndex]) steps[stepIndex].status = "failed";
    await db
      .update(conversionsTable)
      .set({
        status: "failed",
        steps,
        errorMessage: error,
        elapsedSeconds: Math.floor((Date.now() - startTime) / 1000),
      })
      .where(eq(conversionsTable.id, conversionId));
    await db
      .update(filesTable)
      .set({ status: "failed", updatedAt: new Date() })
      .where(eq(filesTable.id, fileId));
  }
}

// ═══════════════════════════════════════════════════════════════════════════
// RULE-BASED ARCHITECT — 100% Free, No External APIs, No Limits
// Handles Arabic academic documents, exams, books, and general text
// ═══════════════════════════════════════════════════════════════════════════

/**
 * Entry point of the rule-based formatter: picks the Arabic or the Latin
 * formatter depending on which script dominates the text.
 *
 * - Empty or whitespace-only input yields the Arabic "empty document" placeholder.
 * - Input with fewer than 10 non-padding characters is returned unchanged.
 * - Otherwise the Arabic formatter is used when the number of Arabic letters
 *   is at least 40% of the number of ASCII Latin letters.
 *
 * @param rawText Extracted plain text.
 * @param _ext    File extension (currently unused).
 * @returns Markdown text.
 */
function runRuleBasedArchitect(rawText: string, _ext: string): string {
  const trimmed = rawText.trim();
  if (!trimmed) {
    return "# مستند فارغ\n\nلم يتم اكتشاف محتوى نصي في هذا الملف.";
  }
  if (trimmed.length < 10) {
    return rawText;
  }
  const arabicChars = (rawText.match(/[\u0600-\u06FF]/g) || []).length;
  const latinChars = (rawText.match(/[a-zA-Z]/g) || []).length;
  return arabicChars >= latinChars * 0.4
    ? formatArabicDocument(rawText)
    : formatLatinDocument(rawText);
}

// ── Helpers ─────────────────────────────────────────────────────────────────

/**
 * Normalises one extracted line: removes control characters, invisible
 * bidi/zero-width marks and decorative bullet/box symbols, then collapses
 * whitespace runs to a single space and trims the ends.
 */
function cleanOcrLine(line: string): string {
  const controlChars = /[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]/g;
  // Unicode bidi / directional control chars that pdfjs embeds from broken-CMap fonts
  const invisibleMarks = /[\u200B-\u200F\u202A-\u202E\u2066-\u2069\uFEFF]/g;
  const decorativeSymbols = /[□■▪▫▶◀►◄▲▼◆◇○●★☆✓✗✦✧]/g;
  const whitespaceRun = /\s{2,}/g;

  let cleaned = line;
  for (const junk of [controlChars, invisibleMarks, decorativeSymbols]) {
    cleaned = cleaned.replace(junk, "");
  }
  return cleaned.replace(whitespaceRun, " ").trim();
}

/**
 * True when the line starts with one of the known document-header field names
 * (subject, time, date, class, school, …) followed by an ASCII or full-width colon.
 */
function isMetaLine(line: string): boolean {
  const metaFieldPrefix =
    /^(المادة|الزمن|النموذج|التاريخ|الصف|الشعبة|المدرسة|اسم الطالب|الاسم|الفصل|المرحلة|الفرقة|الدراسي|الفصل الدراسي|المستوى|الشعبة|المجموعة)\s*[:：]/i;
  return metaFieldPrefix.test(line);
}

/**
 * True when the line opens with an Arabic ordinal marker (first … tenth)
 * followed by a separator, or with "Part/Section/Chapter/Unit" plus a
 * number, Roman numeral or letter.
 */
function isSectionMarker(line: string): boolean {
  const arabicOrdinal =
    /^(أولاً|أولا|ثانياً|ثانيا|ثالثاً|ثالثا|رابعاً|رابعا|خامساً|خامسا|سادساً|سادسا|سابعاً|سابعا|ثامناً|ثامنا|تاسعاً|تاسعا|عاشراً|عاشرا)\s*[-:،\s]/;
  const englishDivision = /^(Part|Section|Chapter|Unit)\s+[IVXivxA-Z\d]+/i;
  return arabicOrdinal.test(line) || englishDivision.test(line);
}

/**
 * True when the line looks like the start of a numbered question, in any of
 * the Arabic or English notations listed below.
 */
function isQuestion(line: string): boolean {
  const questionPatterns: RegExp[] = [
    // Arabic question starters
    /^سـ?\s*[\d\u0660-\u0669]+\s*[-:)،\s]/,
    /^سؤال\s+[\d\u0660-\u0669]+/i,
    /^السؤال\s+[\d\u0660-\u0669]+/i,
    /^س\s*[\d\u0660-\u0669]+\s*[-:)،]/,
    // Number in parentheses: (١) or (1)
    /^\([\d\u0660-\u0669]+\)\s+\S/,
    // Number + dash, only when at least 8 characters of content follow
    /^[\u0660-\u0669\d]+\s*[-–—]\s+.{8,}/,
    // English
    /^Q\s*\d+\s*[-:.)]/i,
    /^Question\s+\d+/i,
  ];
  return questionPatterns.some((pattern) => pattern.test(line));
}

/**
 * True when the line starts with one of the known answer/explanation keywords
 * (answer, solution, note, conclusion, …) followed by an ASCII or full-width colon.
 */
function isKeywordLine(line: string): boolean {
  const keywordPrefix =
    /^(التعليل|الإجابة|الإجابه|المطلوب|الحل|الشرح|الدليل|السبب|العلة|ملاحظة|ملاحظه|تنبيه|الفائدة|المقصود|المراد|الاستنتاج|التحليل|التفسير|النتيجة|الخلاصة)\s*[:：]/i;
  return keywordPrefix.test(line);
}

/**
 * Decides whether a line should be promoted to a sub-heading.
 *
 * A candidate is 3–80 characters long, is not already a Markdown heading, does
 * not start with a list marker or digit, does not read like a sentence (more
 * than 30 characters ending in punctuation), and stands alone: the lines
 * directly before and after it are blank or do not exist.
 *
 * @param line      The line under test.
 * @param lineIndex Position of `line` within `lines`.
 * @param lines     All lines of the document.
 */
function isHeadingCandidate(line: string, lineIndex: number, lines: string[]): boolean {
  const length = line.length;
  if (length < 3 || length > 80) return false;

  const alreadyStructured = /^#{1,6}\s/.test(line) || /^[-*+\d]/.test(line);
  if (alreadyStructured) return false;

  const readsLikeSentence = length > 30 && /[.،!؟?]$/.test(line);
  if (readsLikeSentence) return false;

  const isBlankAt = (idx: number): boolean => lines[idx].trim() === "";
  const blankBefore = lineIndex === 0 || isBlankAt(lineIndex - 1);
  if (!blankBefore) return false;

  return lineIndex >= lines.length - 1 || isBlankAt(lineIndex + 1);
}

/**
 * Expands multiple-choice options written inline on one line into a vertical
 * Markdown list.
 *
 * Supported notations, tried in this order:
 *   1. أ- text  ب- text        (Arabic letter + dash)
 *   2. (أ) text  (ب) text      (Arabic letter in parentheses)
 *   3. أ) text  ب) text        (Arabic letter + closing parenthesis)
 *   4. a) text  b) text        (Latin a–d, case-insensitive, emitted in lower case)
 *
 * The line is cut at the option markers, so option text may itself contain the
 * letters أ ب ج د or a hyphen. An Arabic marker letter only counts when it is
 * not the tail of an Arabic word. Text in front of the first marker is kept
 * and emitted as its own paragraph above the list.
 *
 * A notation is accepted only if it yields at least two options; for the
 * Arabic notations every option must also be at most 60 characters long, which
 * keeps ordinary prose from being mistaken for a choice list.
 *
 * @param line A single line (callers may pass the uncleaned original line).
 * @returns The formatted list, or null when no choice list is detected.
 */
function expandMultipleChoice(line: string): string | null {
  const MAX_OPTION_CHARS = 60;

  const render = (lead: string, items: string[]): string =>
    (lead ? `${lead}\n\n` : "") + items.join("\n");

  // Cuts the line at every occurrence of markerRe (capture group 1 = option letter).
  const splitAtMarkers = (
    markerRe: RegExp,
  ): { lead: string; choices: Array<[string, string]> } | null => {
    const marks = Array.from(line.matchAll(markerRe));
    if (marks.length < 2) return null;

    const choices: Array<[string, string]> = [];
    for (let k = 0; k < marks.length; k++) {
      const mark = marks[k];
      const textStart = (mark.index ?? 0) + mark[0].length;
      const textEnd = k + 1 < marks.length ? (marks[k + 1].index ?? line.length) : line.length;
      const optionText = line.slice(textStart, textEnd).trim();
      if (!optionText) continue; // marker with nothing after it
      if (optionText.length > MAX_OPTION_CHARS) return null; // reads like prose, not an option
      choices.push([mark[1], optionText]);
    }
    if (choices.length < 2) return null;
    return { lead: line.slice(0, marks[0].index ?? 0).trim(), choices };
  };

  const arabicStyles: Array<{ marker: RegExp; format: (letter: string, text: string) => string }> = [
    // Pattern 1: أ- text  ب- text
    { marker: /(?<![\u0600-\u06FF])([أبجد])\s*[-–—]\s*/g, format: (l, t) => `- ${l}- ${t}` },
    // Pattern 2: (أ) text  (ب) text
    { marker: /\(([أبجد])\)\s*/g, format: (l, t) => `- (${l}) ${t}` },
    // Pattern 3: أ) text  ب) text (without an opening parenthesis)
    { marker: /(?<![\u0600-\u06FF(])([أبجد])\)\s*/g, format: (l, t) => `- ${l}) ${t}` },
  ];
  for (const { marker, format } of arabicStyles) {
    const found = splitAtMarkers(marker);
    if (found) {
      return render(found.lead, found.choices.map(([l, t]) => format(l, t)));
    }
  }

  // Pattern 4: English a) b) c) d) — split by choice marker to avoid char-class issues
  const enParts = line.split(/\s+(?=[a-d]\)\s)/i);
  if (enParts.length >= 2) {
    const enChoices: Array<[string, string]> = [];
    let enLead = "";
    enParts.forEach((part, idx) => {
      const mx = part.match(/^([a-d])\)\s+(.*)/i);
      if (mx) {
        enChoices.push([mx[1].toLowerCase(), mx[2].trim()]);
      } else if (idx === 0) {
        enLead = part.trim(); // text in front of the first option
      }
    });
    if (enChoices.length >= 2) {
      return render(enLead, enChoices.map(([l, t]) => `- ${l}) ${t}`));
    }
  }

  return null;
}

// ── Arabic document formatter ────────────────────────────────────────────────

/**
 * Extracts the key/value pairs from a metadata line that may hold several
 * fields, e.g.
 * "المادة: رياضيات  الزمن: ساعة  النموذج: أ" → [["المادة","رياضيات"],["الزمن","ساعة"],["النموذج","أ"]]
 *
 * The line is cut at runs of 2+ whitespace characters, tabs, "|" and commas
 * ("،" or ","). A segment that starts with a known metadata key (see
 * isMetaLine) opens a new pair; the key/value boundary is the first ASCII or
 * full-width colon. Any other segment is treated as the continuation of the
 * previous value, so a value containing a comma is not cut short.
 * If no segment is recognised, the whole line is split once at its first colon.
 *
 * @param line Original (uncleaned) line, so double-space separators are still present.
 * @returns Pairs in order of appearance; empty when the line contains no colon.
 */
function splitMetaFields(line: string): Array<[string, string]> {
  const pairs: Array<[string, string]> = [];
  const colonRe = /[:：]/;

  // The capturing group keeps the separators: even indexes are segments, odd indexes separators.
  const tokens = line.split(/(\s{2,}|\t|[|،,])/);
  for (let t = 0; t < tokens.length; t += 2) {
    const seg = tokens[t].trim();
    if (!seg) continue;

    const ci = seg.search(colonRe);
    if (ci > 0 && isMetaLine(seg)) {
      const k = seg.slice(0, ci).trim();
      const v = seg.slice(ci + 1).trim();
      if (k) pairs.push([k, v]);
      continue;
    }

    if (pairs.length > 0) {
      // Continuation of the previous value. Commas are restored; "|" and
      // whitespace separators become a single space.
      const sep = t > 0 ? tokens[t - 1].trim() : "";
      const glue = sep === "،" || sep === "," ? `${sep} ` : " ";
      const last = pairs[pairs.length - 1];
      last[1] = last[1] ? last[1] + glue + seg : seg;
    }
  }

  // Fallback: treat whole line as single field
  if (pairs.length === 0) {
    const ci = line.search(colonRe);
    if (ci > 0) {
      pairs.push([line.slice(0, ci).trim(), line.slice(ci + 1).trim()]);
    }
  }
  return pairs;
}

/**
 * Converts extracted Arabic text into Markdown.
 *
 * 1. Metadata lines found within the first 15 lines are rendered as a
 *    two-column table at the top of the output.
 * 2. If metadata was found, the first remaining non-blank line is promoted to
 *    the document title ("# …") unless it is a question, a section marker, a
 *    metadata line, or starts with "-" or "#".
 * 3. Every remaining line is classified in this order: existing Markdown
 *    heading, section marker ("## …"), question (bold), keyword line, inline
 *    multiple choice (vertical list), existing list item, stand-alone short
 *    line ("### …"), plain text.
 *
 * Only the metadata lines themselves are removed from the body: lines located
 * before or between metadata lines are kept and formatted like any other line.
 *
 * @param text Raw extracted text (any newline convention).
 * @returns Markdown with at most one consecutive blank line, trimmed.
 */
function formatArabicDocument(text: string): string {
  const rawLines = text.replace(/\r\n/g, "\n").replace(/\r/g, "\n").split("\n");
  const lines = rawLines.map(cleanOcrLine);
  const output: string[] = [];

  // Adds one blank separator unless the output is empty or already ends with one.
  const ensureGap = (): void => {
    if (output.length > 0 && output[output.length - 1] !== "") output.push("");
  };
  // Emits content as its own paragraph (blank line before and after).
  const pushBlock = (content: string): void => {
    ensureGap();
    output.push(content);
    output.push("");
  };

  // ── Detect and render metadata block from first 15 lines ──
  const metaIndices: number[] = [];
  for (let j = 0; j < Math.min(15, lines.length); j++) {
    if (lines[j] && isMetaLine(lines[j])) metaIndices.push(j);
  }

  // Indexes of lines already rendered in the table; skipped by the passes below.
  const consumed = new Set<number>();

  // Handle metadata: each detected meta line may contain multiple inline fields
  // Use rawLines to preserve double-space separators
  if (metaIndices.length >= 1) {
    const allPairs: Array<[string, string]> = [];
    for (const idx of metaIndices) {
      for (const pair of splitMetaFields(rawLines[idx] || "")) allPairs.push(pair);
    }
    if (allPairs.length > 0) {
      output.push("| الحقل | القيمة |");
      output.push("| --- | --- |");
      for (const [k, v] of allPairs) output.push(`| ${k} | ${v} |`);
      output.push("");
      for (const idx of metaIndices) consumed.add(idx);
    }
  }

  // ── Check first content line for document title ──
  // Everything skipped here is either blank or already in the table, so
  // starting the main pass at `i` loses nothing.
  let i = 0;
  while (i < lines.length && (consumed.has(i) || !lines[i])) i++;
  if (i < lines.length) {
    const candidate = lines[i];
    const isTitle =
      candidate.length > 3 &&
      candidate.length < 100 &&
      !isQuestion(candidate) &&
      !isSectionMarker(candidate) &&
      !isMetaLine(candidate) &&
      !candidate.startsWith("-") &&
      !candidate.startsWith("#");
    // Only promote to title if metadata was found (strong signal)
    if (isTitle && metaIndices.length > 0) {
      output.push(`# ${candidate}`);
      output.push("");
      i++;
    }
  }

  // ── Main pass ──
  for (; i < lines.length; i++) {
    if (consumed.has(i)) continue; // metadata line, already in the table

    const line = lines[i].trim();
    const rawLine = rawLines[i] || "";   // original line before cleaning (for choice detection)

    if (!line) {
      ensureGap();
      continue;
    }

    // Already a Markdown heading — keep as-is
    if (/^#{1,6}\s/.test(line)) {
      pushBlock(line);
      continue;
    }

    // Section markers: أولاً / ثانياً / Part I
    if (isSectionMarker(line)) {
      pushBlock(`## ${line}`);
      continue;
    }

    // Question detection
    if (isQuestion(line)) {
      pushBlock(`**${line}**`);
      continue;
    }

    // Keyword lines: التعليل: / الإجابة: / المطلوب: (blank line before, none after)
    if (isKeywordLine(line)) {
      ensureGap();
      output.push(line);
      continue;
    }

    // Inline multiple choice → vertical list (use rawLine to preserve original spacing)
    const expanded = expandMultipleChoice(rawLine) || expandMultipleChoice(line);
    if (expanded) {
      pushBlock(expanded);
      continue;
    }

    // Already-formatted list items
    if (/^[-*+]\s/.test(line) || /^\d+\.\s/.test(line)) {
      output.push(line);
      continue;
    }

    // Lone short line surrounded by blanks → subheading
    if (isHeadingCandidate(line, i, lines)) {
      pushBlock(`### ${line}`);
      continue;
    }

    // Regular content line
    output.push(line);
  }

  return output.join("\n").replace(/\n{3,}/g, "\n\n").trim();
}

// ── Latin/English document formatter ────────────────────────────────────────

/**
 * Converts extracted Latin-script text into Markdown.
 *
 * Each cleaned line is classified in this order: blank, existing Markdown
 * heading, section marker ("## …"), question (bold), keyword line, inline
 * multiple choice (vertical list), short ALL-CAPS line ("### …"), existing
 * list item, stand-alone short line ("### …"), plain text.
 *
 * @param text Raw extracted text (any newline convention).
 * @returns Markdown with at most one consecutive blank line, trimmed.
 */
function formatLatinDocument(text: string): string {
  const lines = text
    .replace(/\r\n/g, "\n")
    .replace(/\r/g, "\n")
    .split("\n")
    .map(cleanOcrLine);
  const result: string[] = [];

  // Adds one blank separator unless the result is empty or already ends with one.
  const separate = (): void => {
    if (result.length > 0 && result[result.length - 1] !== "") result.push("");
  };
  // Emits content as its own paragraph (blank line before and after).
  const emitBlock = (content: string): void => {
    separate();
    result.push(content, "");
  };

  lines.forEach((cleaned, idx) => {
    const current = cleaned.trim();

    if (!current) {
      separate();
    } else if (/^#{1,6}\s/.test(current)) {
      emitBlock(current);
    } else if (isSectionMarker(current)) {
      emitBlock(`## ${current}`);
    } else if (isQuestion(current)) {
      emitBlock(`**${current}**`);
    } else if (isKeywordLine(current)) {
      // Keyword lines get a blank line before but none after.
      separate();
      result.push(current);
    } else {
      const choices = expandMultipleChoice(current);
      if (choices) {
        emitBlock(choices);
      } else if (/^[A-Z][A-Z\s\d:,.-]{4,60}$/.test(current)) {
        // ALL CAPS short line → subheading
        emitBlock(`### ${current}`);
      } else if (/^[-*+]\s/.test(current) || /^\d+\.\s/.test(current)) {
        result.push(current);
      } else if (isHeadingCandidate(current, idx, lines)) {
        emitBlock(`### ${current}`);
      } else {
        result.push(current);
      }
    }
  });

  return result.join("\n").replace(/\n{3,}/g, "\n\n").trim();
}

// ═══════════════════════════════════════════════════════════════════════════
// Extractors
// ═══════════════════════════════════════════════════════════════════════════
// Max characters extracted from any single document (~2 MB of text ≈ 300 k words).
// NOTE(review): not referenced in the part of the file reviewed here — presumably
// applied by the extractor functions; confirm.
const TEXT_CAP = 2_000_000;

// ── Arabic PDF text post-processor ───────────────────────────────────────────
/**
 * Cleans up artifacts introduced by PDF text extraction:
 *   • Unicode bidi / zero-width control characters
 *   • isolated 1–5 letter ALL-CAPS Latin sequences inside Arabic lines
 *     (broken-CMap noise), except a small list of common abbreviations
 *   • "-- X of N --" page markers
 *   • stand-alone page labels (a single Arabic character, or 1–3 digits, alone on a line)
 *   • table-of-contents leader dots (". . . . .") together with the trailing page number
 *   • a trailing 1–4 digit Arabic page number on lines with ≥10 remaining characters
 *   • runs of 3+ newlines (collapsed to one blank line)
 *
 * Digits are recognised in all three forms: Western (0-9), Arabic-Indic
 * (U+0660–0669) and Extended Arabic-Indic / Persian (U+06F0–06F9).
 *
 * NOTE(review): the trailing-number rule is a heuristic and also strips a
 * genuine number at the end of an Arabic sentence (e.g. a year written with
 * Arabic digits) — confirm this trade-off is acceptable.
 *
 * @param text Raw text from the PDF extractor.
 * @returns Cleaned text, trimmed.
 */
function cleanArabicPdfRaw(text: string): string {
  // 1. Strip all Unicode bidi / directional control characters that
  //    pdfjs-dist embeds when the PDF uses broken ToUnicode CMap fonts.
  text = text.replace(/[\u200B-\u200F\u202A-\u202E\u2066-\u2069\uFEFF]/g, "");

  // 2. For lines that are predominantly Arabic, remove short ALL-CAPS Latin
  //    noise sequences — artefacts of broken CMap where Arabic glyphs are
  //    mapped to Latin code points (e.g. "المبادئ OA العشرة" → OA = garbled Arabic).
  //    Guard: don't remove if the "Latin" word is a common technical abbreviation.
  const KEEP_CAPS = new Set(["PDF", "OCR", "AI", "URL", "API", "HTML", "CSS", "JS", "TS"]);
  text = text.split("\n").map(line => {
    const arabicCount = (line.match(/[\u0600-\u06FF]/g) ?? []).length;
    if (arabicCount < 4) return line; // not an Arabic line — leave intact
    // Remove isolated 1-5 char ALL-CAPS sequences (not in safe-list)
    return line.replace(/(?<![A-Za-z])([A-Z]{1,5})(?![A-Za-z])/g, (match) =>
      KEEP_CAPS.has(match) ? match : ""
    ).replace(/ {2,}/g, " ").trim();
  }).join("\n");

  const out: string[] = [];

  for (const raw of text.split("\n")) {
    const line = raw.trim();

    // 3. Remove "-- X of N --" pdf-parse page markers
    if (/^--\s*\d+\s+of\s+\d+\s*--$/i.test(line)) continue;

    // 4. Remove standalone page labels:
    //    • a single character from the Arabic block (أ ب ج etc.)
    //    • 1–3 digits (Western, Arabic-Indic or Extended Arabic-Indic) alone on a line
    if (/^[\u0600-\u06FF]{1}$/.test(line)) continue;
    if (/^[\u0660-\u0669\u06F0-\u06F90-9]{1,3}$/.test(line)) continue;

    // 5. Collapse TOC leader-dot lines: ". . . . . . ." → clean title
    //    A TOC line has 4+ consecutive dots (possibly space-separated)
    if (/\.(\s*\.){3,}/.test(line)) {
      const cleaned = line
        .replace(/\.(\s*\.)+\s*/g, " ")
        .replace(/\s+[\u0660-\u0669\u06F0-\u06F90-9]{1,4}\s*$/, "")
        .replace(/\s{2,}/g, " ")
        .trim();
      if (cleaned.length > 2) out.push(cleaned);
      continue;
    }

    // 6. Strip a trailing Arabic-digit page number from TOC lines that lost
    //    their dot-leaders (e.g. "عنوان الكتاب ۰٣"). Heuristic: the line ends
    //    in 1–4 Arabic-Indic / Extended Arabic-Indic digits and what remains
    //    is ≥10 chars and contains Arabic. Western digits are left alone here.
    const tocTrailing = line.replace(/\s+[\u0660-\u0669\u06F0-\u06F9]{1,4}$/, "");
    if (tocTrailing !== line && tocTrailing.length >= 10 && /[\u0600-\u06FF]/.test(tocTrailing)) {
      out.push(tocTrailing.trim());
      continue;
    }

    // 7. Preserve empty lines (paragraph breaks)
    if (!line) { out.push(""); continue; }

    out.push(line);
  }

  // Collapse runs of 3+ newlines to a single blank line
  return out.join("\n").replace(/\n{3,}/g, "\n\n").trim();
}

// ── Arabic text AI correction ────────────────────────────────────────────────
// Endpoints are tried in the order built by resolveAiEndpoints(); when one is
// rate-limited or unavailable the next one is used:
//   1. Replit AI Integration proxy (AI_INTEGRATIONS_OPENAI_BASE_URL) — gpt-4o
//   2. HF: Qwen/Qwen3-235B-A22B
//   3. HF: Qwen/Qwen3-72B
//   4. HF: meta-llama/Llama-4-Scout-17B-16E-Instruct
//   5. HF: Qwen/Qwen3-30B-A3B
//   6. HF: Qwen/Qwen2.5-72B-Instruct
//   7. HF: meta-llama/Llama-3.3-70B-Instruct
//   8. HF: mistralai/Mistral-Nemo-Instruct-2407
// Entry 1 requires AI_INTEGRATIONS_OPENAI_BASE_URL, entries 2-8 require HF_TOKEN.
//
const AI_CHUNK_CHARS      = 3000;   // larger chunks → fewer API calls
const AI_CHUNK_TIMEOUT_MS = 120_000; // a single chunk request is aborted after this many milliseconds

// System prompt (Arabic) sent with every correction request. It tells the model
// to remove extraction errors from machine-extracted Arabic PDF text — stray
// short Latin fragments, obviously mangled words, spaces inside a word, stray
// symbols — while keeping common Latin technical terms, preserving paragraph,
// heading, list and Markdown structure, adding no new content, and returning
// only the corrected text.
const AI_SYSTEM_PROMPT =
  "أنت نظام متخصص في تصحيح نصوص PDF العربية المستخرجة آلياً. " +
  "المهمة: إزالة أخطاء الاستخراج مع الحفاظ التام على المعنى والمحتوى الأصيل. " +
  "أنواع الأخطاء الشائعة في هذه النصوص: " +
  "١) حروف ومقاطع لاتينية قصيرة مبعثرة داخل النص العربي (مثل OA، BW، Zz، dl، pl) — ضوضاء من ترميز الخط المكسور، احذفها. " +
  "٢) كلمات عربية مبتورة أو مشوهة واضحة يمكن تصحيحها من السياق. " +
  "٣) مسافات خاطئة داخل الكلمة العربية الواحدة — ادمجها. " +
  "٤) رموز متفرقة أو علامات ترقيم غريبة ليست جزءاً من المحتوى — احذفها. " +
  "القواعد الصارمة: " +
  "أ) احتفظ بالأسماء والمصطلحات التقنية اللاتينية الشائعة (PDF، AI، URL، API...). " +
  "ب) حافظ على هيكل الفقرات والعناوين والقوائم وعلامات Markdown كما هي تماماً. " +
  "ج) لا تضف أي محتوى جديد أو شروحات. " +
  "أعد النص العربي المُصحَح فقط بدون أي مقدمة أو خاتمة.";

// Connection details for one chat-completions endpoint.
//   baseUrl / apiKey / model — used by callAiCorrection to build the request
//   label   — "<provider>/<model>" display name used in logs and progress messages
//   noThink — when true, "/no_think" is appended to the user prompt
type AiEndpoint = { baseUrl: string; apiKey: string; model: string; label: string; noThink?: boolean };

/**
 * Builds the ordered list of AI endpoints to try, based on the environment.
 *
 * - AI_INTEGRATIONS_OPENAI_BASE_URL set → the Replit proxy (gpt-4o) comes first;
 *   its key is AI_INTEGRATIONS_OPENAI_API_KEY, or "placeholder" when unset.
 * - HF_TOKEN set → seven Hugging Face router models follow, in the order of
 *   the table below.
 *
 * @returns The endpoints in priority order; empty when neither variable is set.
 */
function resolveAiEndpoints(): AiEndpoint[] {
  const replitUrl = process.env.AI_INTEGRATIONS_OPENAI_BASE_URL;
  const replitEndpoints: AiEndpoint[] = replitUrl
    ? [
        {
          baseUrl: replitUrl,
          apiKey: process.env.AI_INTEGRATIONS_OPENAI_API_KEY ?? "placeholder",
          model: "gpt-4o",
          label: "Replit/gpt-4o",
        },
      ]
    : [];

  const hfToken = process.env.HF_TOKEN;
  if (!hfToken) return replitEndpoints;

  // Generic HF router URL shared by every HF entry.
  const HF = "https://router.huggingface.co/v1";

  // [model id, label, noThink] — noThink=true makes the caller append
  // "/no_think" to the prompt (set for the Qwen3 models only).
  const hfModels: Array<[string, string, boolean]> = [
    ["Qwen/Qwen3-235B-A22B", "HF/Qwen3-235B", true],
    ["Qwen/Qwen3-72B", "HF/Qwen3-72B", true],
    ["meta-llama/Llama-4-Scout-17B-16E-Instruct", "HF/Llama4-Scout", false],
    ["Qwen/Qwen3-30B-A3B", "HF/Qwen3-30B-A3B", true],
    ["Qwen/Qwen2.5-72B-Instruct", "HF/Qwen2.5-72B", false],
    ["meta-llama/Llama-3.3-70B-Instruct", "HF/Llama3.3-70B", false],
    ["mistralai/Mistral-Nemo-Instruct-2407", "HF/Mistral-Nemo", false],
  ];
  const hfEndpoints: AiEndpoint[] = hfModels.map(([model, label, noThink]) => ({
    baseUrl: HF,
    apiKey: hfToken,
    model,
    label,
    noThink,
  }));

  return [...replitEndpoints, ...hfEndpoints];
}

/**
 * Splits text into chunks of at most `maxChars` characters for AI correction.
 *
 * Paragraphs (separated by blank lines) are packed together while they fit.
 * A paragraph that is too long on its own is packed line by line. A single
 * line that is still too long is cut into several pieces — at the last space
 * inside the limit when there is one, otherwise exactly at the limit — so no
 * text is ever discarded.
 *
 * @param text     Text to split.
 * @param maxChars Maximum chunk length in UTF-16 code units (defaults to AI_CHUNK_CHARS).
 * @returns The non-blank chunks in document order.
 */
function chunkForAiCorrection(text: string, maxChars: number = AI_CHUNK_CHARS): string[] {
  const limit = Math.max(1, Math.floor(maxChars));
  const chunks: string[] = [];
  let buf = "";

  // Emits the pending buffer (unless it is blank) and starts a new one.
  const flush = (): void => {
    if (buf.trim()) chunks.push(buf);
    buf = "";
  };

  // Emits limit-sized pieces of an over-long line and returns the remainder (≤ limit).
  const takeOversizedLine = (line: string): string => {
    let rest = line;
    while (rest.length > limit) {
      const spaceAt = rest.lastIndexOf(" ", limit);
      const cut = spaceAt > 0 ? spaceAt : limit; // cut ≥ 1, so the loop always advances
      const piece = rest.slice(0, cut);
      if (piece.trim()) chunks.push(piece);
      rest = rest.slice(cut).trimStart();
    }
    return rest;
  };

  for (const para of text.split(/\n{2,}/)) {
    const joined = buf ? buf + "\n\n" + para : para;
    if (joined.length <= limit) {
      buf = joined;
      continue;
    }

    flush();
    if (para.length <= limit) {
      buf = para;
      continue;
    }

    // Paragraph exceeds the limit on its own: pack it line by line.
    for (const line of para.split("\n")) {
      const withLine = buf ? buf + "\n" + line : line;
      if (withLine.length <= limit) {
        buf = withLine;
      } else {
        flush();
        buf = takeOversizedLine(line);
      }
    }
  }

  flush();
  return chunks;
}

/**
 * Sends one text chunk to a chat-completions endpoint and returns the
 * corrected text.
 *
 * The request is aborted after AI_CHUNK_TIMEOUT_MS. The original `text` is
 * returned unchanged when the response contains no usable text or when the
 * result is implausible (shorter than 35% or longer than 300% of the input).
 *
 * @param text Chunk to correct.
 * @param ep   Endpoint to call.
 * @returns Corrected text, or `text` itself when the response is rejected.
 * @throws Error with `code: "rate_limited"` on HTTP 429; Error with
 *         `code: "unavailable"` on HTTP 503, 402 and 404; Error with message
 *         `ai_http_<status>` on any other non-2xx status; whatever `fetch`
 *         throws on network failure or abort.
 */
async function callAiCorrection(
  text: string,
  ep: AiEndpoint,
): Promise<string> {
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), AI_CHUNK_TIMEOUT_MS);
  try {
    // Qwen3 models support /no_think suffix to skip chain-of-thought reasoning,
    // giving 3-5× faster responses for straightforward correction tasks.
    const userContent = ep.noThink
      ? `النص المستخرج من PDF:\n\n${text}\n\nالنص المصحح: /no_think`
      : `النص المستخرج من PDF:\n\n${text}\n\nالنص المصحح:`;

    const body: Record<string, unknown> = {
      model:      ep.model,
      messages:   [
        { role: "system", content: AI_SYSTEM_PROMPT },
        { role: "user",   content: userContent },
      ],
      max_tokens: Math.min(4096, Math.ceil(text.length * 2)),
      temperature: 0.1,   // low temp = deterministic, less hallucination
    };

    // Tolerate a base URL configured with a trailing slash.
    const endpointUrl = `${ep.baseUrl.replace(/\/+$/, "")}/chat/completions`;
    const resp = await fetch(endpointUrl, {
      method:  "POST",
      headers: { Authorization: `Bearer ${ep.apiKey}`, "Content-Type": "application/json" },
      body:    JSON.stringify(body),
      signal:  controller.signal,
    });

    if (resp.status === 429) throw Object.assign(new Error("rate_limited"),     { code: "rate_limited" });
    if (resp.status === 503) throw Object.assign(new Error("unavailable"),       { code: "unavailable"  });
    if (resp.status === 402) throw Object.assign(new Error("payment_required"),  { code: "unavailable"  }); // no credits → try next
    if (resp.status === 404) throw Object.assign(new Error("model_not_found"),   { code: "unavailable"  }); // unsupported model
    if (!resp.ok) throw new Error(`ai_http_${resp.status}`);

    // Read only the one field needed, without assuming the payload shape.
    const payload = (await resp.json()) as
      | { choices?: Array<{ message?: { content?: unknown } | null } | null> }
      | null;
    const content = payload?.choices?.[0]?.message?.content;
    let corrected = typeof content === "string" ? content.trim() : "";

    // Strip any <think>...</think> block Qwen3 might emit even with /no_think
    corrected = corrected.replace(/<think>[\s\S]*?<\/think>\s*/gi, "").trim();

    // Sanity: output must be 35%–300% of input length
    if (!corrected || corrected.length < text.length * 0.35 || corrected.length > text.length * 3) {
      return text;
    }
    return corrected;
  } finally {
    clearTimeout(timer);
  }
}

// Progress callback: receives a human-readable status message and a percentage.
// It returns a promise, which correctArabicText awaits before continuing.
type ProgressFn = (msg: string, pct: number) => Promise<void>;

/**
 * Sends extracted text through the configured AI endpoints, chunk by chunk,
 * to repair extraction errors, failing over to the next endpoint when one is
 * rate-limited or unavailable.
 *
 * Returns `rawText` untouched when no endpoint is configured, when the text
 * has fewer than 50 non-space characters, or when less than 25% of those
 * characters are Arabic. A chunk that cannot be corrected is kept as-is.
 * Chunks (corrected or raw) are joined with a blank line.
 *
 * @param rawText    Text to correct.
 * @param onProgress Optional callback, awaited, receiving a status message and
 *                   a percentage that moves from 33 towards 54.
 * @returns The corrected text, or the original text where correction was skipped.
 */
async function correctArabicText(rawText: string, onProgress?: ProgressFn): Promise<string> {
  const endpoints = resolveAiEndpoints();
  if (!endpoints.length) {
    logger.info("[arabic-ai] No AI endpoint configured — using OCR text as-is");
    return rawText;
  }

  // Only correct predominantly Arabic text
  const arabicChars   = (rawText.match(/[\u0600-\u06FF]/g) ?? []).length;
  const nonSpaceChars = rawText.replace(/\s/g, "").length;
  if (nonSpaceChars < 50 || arabicChars / nonSpaceChars < 0.25) return rawText;

  const chunks = chunkForAiCorrection(rawText);

  // Index of the endpoint currently in use. It only ever moves forward: once an
  // endpoint is abandoned it is not retried for later chunks.
  let activeEpIdx = 0;
  logger.info(`[arabic-ai] ${chunks.length} chunks, ${endpoints.length} endpoints available — primary: ${endpoints[0].label}`);

  const correctedParts: string[] = [];

  for (let i = 0; i < chunks.length; i++) {
    const pct = 33 + Math.round((i / chunks.length) * 21);
    const ep  = endpoints[activeEpIdx];
    await onProgress?.(`تصحيح النص عبر ${ep.label.split("/")[1]}... (${i + 1}/${chunks.length})`, pct);

    let succeeded = false;
    while (activeEpIdx < endpoints.length) {
      const cur = endpoints[activeEpIdx];
      try {
        const result = await callAiCorrection(chunks[i], cur);
        correctedParts.push(result);
        succeeded = true;
        break;
      } catch (err: any) {
        // `code` is not always a string: an aborted fetch rejects with a
        // DOMException whose `code` is the number 20. Calling `.startsWith` on
        // that would throw from inside this handler and abort the whole
        // correction, so only string codes are used and the message is the fallback.
        const code: string =
          typeof err?.code === "string" ? err.code : String(err?.message ?? "");
        if (code === "rate_limited" || code === "unavailable" || code.startsWith("ai_http_5")) {
          logger.warn(`[arabic-ai] ${cur.label} ${code} — switching to next endpoint`);
          activeEpIdx++;
          // update progress label for new endpoint
          if (activeEpIdx < endpoints.length) {
            await onProgress?.(`التحويل عبر ${endpoints[activeEpIdx].label.split("/")[1]}... (${i + 1}/${chunks.length})`, pct);
          }
        } else {
          // Non-retryable failure (timeout, bad response, ...): keep this chunk raw
          // but stay on the same endpoint for the next chunk.
          logger.warn({ err }, `[arabic-ai] chunk ${i} error on ${cur.label} — keeping raw text`);
          break;
        }
      }
    }

    if (!succeeded) {
      // All endpoints exhausted or non-retryable error — keep original chunk
      correctedParts.push(chunks[i]);
      if (activeEpIdx >= endpoints.length) {
        // No more endpoints: pass remaining chunks through unchanged
        correctedParts.push(...chunks.slice(i + 1));
        logger.warn("[arabic-ai] All endpoints exhausted — remaining chunks kept as-is");
        break;
      }
    }
  }

  return correctedParts.join("\n\n");
}

// ── Garbled Arabic detector ───────────────────────────────────────────────────
/**
 * Heuristic check for Arabic text that came out of PDF extraction broken.
 *
 * Texts with fewer than 100 Arabic characters are never flagged. Otherwise the
 * text is flagged as soon as one of these signals fires:
 *  - Type A (character-pair transposition): three or more space-delimited
 *    transposed "fi" particles, or one of a few known transposed words.
 *  - Type B (Arabic glyphs mapped to Latin code points):
 *      1. three or more short Latin runs wrapped in LRM/RLM bidi marks;
 *      2. four or more lines that contain Arabic plus at least two short
 *         all-caps Latin tokens that are not common acronyms or Roman numerals;
 *      3. at least 300 Arabic characters with Latin letters exceeding 12% of them.
 *
 * @param text Extracted text to inspect.
 * @returns `true` when the text looks garbled.
 */
function isGarbledArabic(text: string): boolean {
  const tally = (pattern: RegExp, subject: string = text): number =>
    (subject.match(pattern) ?? []).length;

  const arabicTotal = tally(/[\u0600-\u06FF]/g);
  if (arabicTotal < 100) return false;

  // Type A: transposed particle (needs 3+ hits) or well-known transposed words.
  const looksTransposed =
    tally(/ يف /g) >= 3 ||
    /امحلد/.test(text) ||
    /اثنياا|اثلثاا/.test(text);
  if (looksTransposed) return true;

  // Type B, signal 1: Latin runs of 1-6 letters enclosed by LRM/RLM marks.
  if (tally(/[\u200E\u200F][A-Za-z]{1,6}[\u200E\u200F]/g) >= 3) return true;

  // Type B, signal 2: count Arabic lines carrying two or more stray caps tokens.
  const expectedCaps = new Set(["PDF", "OCR", "AI", "URL", "API", "HTML", "CSS", "JS", "TS",
    "I", "II", "III", "IV", "VI", "VII", "VIII", "IX", "XI", "XII"]);
  let suspectLines = 0;
  for (const line of text.split("\n")) {
    if (tally(/[\u0600-\u06FF]/g, line) < 3) continue;
    const capsTokens = line.match(/(?<![A-Za-z])([A-Z]{1,5})(?![A-Za-z])/g) ?? [];
    let strayCaps = 0;
    for (const token of capsTokens) {
      if (!expectedCaps.has(token)) strayCaps++;
    }
    if (strayCaps >= 2) suspectLines++;
  }
  if (suspectLines >= 4) return true;

  // Type B, signal 3: too many Latin letters for a large, mostly-Arabic text.
  return arabicTotal >= 300 && tally(/[A-Za-z]/g) > arabicTotal * 0.12;
}

// ── VLM-based OCR per page (vision-language models via the Hugging Face router) ──
// Each rendered page image is sent to a vision-language model, which returns
// the text of the page. Models are tried in the order they appear in
// VLM_OCR_MODELS: olmOCR-7B → Qwen2.5-VL-72B → Qwen2.5-VL-7B.
// The local Tesseract fallback is not part of this list; it is applied by the
// caller (extractPdfViaOcr) when the VLM path fails or is rate-limited.
// NOTE(review): the accuracy/benchmark remarks next to each model are the
// original author's and cannot be verified from this file.

// Base URL of the OpenAI-compatible chat-completions API used for VLM OCR.
const VLM_OCR_ROUTER = "https://router.huggingface.co/v1";
const VLM_OCR_MODELS = [
  "allenai/olmOCR-7B-0225-preview",    // #1: Allen Institute model fine-tuned for document OCR
  "Qwen/Qwen2.5-VL-72B-Instruct",      // #2: larger general-purpose VLM
  "Qwen/Qwen2.5-VL-7B-Instruct",       // #3: smaller, faster fallback
];
// Per-model request timeout for a single page (90 seconds).
const VLM_PAGE_TIMEOUT_MS = 90_000;
// Instruction sent with every page image.
const VLM_OCR_PROMPT =
  "Extract all the text from this document page exactly as written. " +
  "Preserve Arabic text, paragraph structure, headings, and line breaks. " +
  "Do not add explanations or commentary — output only the extracted text.";

/**
 * OCRs a single rendered page image with the vision-language models listed in
 * VLM_OCR_MODELS, trying each in order until one returns usable text.
 *
 * A model is skipped (and the next one tried) on any HTTP/network error other
 * than a 429, on timeout, or when it returns 20 characters or fewer.
 *
 * @param pngPath Path of the PNG page image.
 * @param hfToken Bearer token sent to the router.
 * @returns The trimmed text returned by the first model that succeeds.
 * @throws An error with `code === "rate_limited"` as soon as any model answers
 *         429, so the caller can stop using the VLM path.
 * @throws `Error("all_vlm_models_failed")` when no model produced text.
 */
async function extractPageViaVlm(pngPath: string, hfToken: string): Promise<string> {
  const imgBase64 = fs.readFileSync(pngPath).toString("base64");

  for (const model of VLM_OCR_MODELS) {
    const shortName = model.split("/")[1];
    const ctrl  = new AbortController();
    const timer = setTimeout(() => ctrl.abort(), VLM_PAGE_TIMEOUT_MS);
    try {
      const resp = await fetch(`${VLM_OCR_ROUTER}/chat/completions`, {
        method:  "POST",
        headers: { Authorization: `Bearer ${hfToken}`, "Content-Type": "application/json" },
        body: JSON.stringify({
          model,
          messages: [{
            role: "user",
            content: [
              { type: "image_url", image_url: { url: `data:image/png;base64,${imgBase64}` } },
              { type: "text",      text: VLM_OCR_PROMPT },
            ],
          }],
          max_tokens:  4096,
          temperature: 0.0,
        }),
        signal: ctrl.signal,
      });
      if (resp.status === 429) throw Object.assign(new Error("rate_limited"),    { code: "rate_limited" });
      if (resp.status === 402) throw Object.assign(new Error("payment_required"), { code: "unavailable"  });
      if (resp.status === 404) throw Object.assign(new Error("model_not_found"),  { code: "unavailable"  });
      if (!resp.ok) throw new Error(`vlm_http_${resp.status}`);
      // The timer is still armed here on purpose: reading the body can stall
      // too, and the abort signal also cancels an in-flight body read.
      const data    = await resp.json() as any;
      const content = (data.choices?.[0]?.message?.content ?? "").trim();
      if (content.length > 20) {
        logger.info(`[vlm-ocr] ${shortName} → ${content.length} chars`);
        return content;
      }
      logger.warn(`[vlm-ocr] ${shortName} returned empty — trying next`);
    } catch (err: any) {
      if (err?.code === "rate_limited") {
        logger.warn(`[vlm-ocr] ${shortName} rate-limited`);
        throw err;           // propagate so caller can switch to Tesseract
      }
      logger.warn({ err: err?.message }, `[vlm-ocr] ${shortName} failed`);
    } finally {
      // Single place to disarm the timeout, whatever path left the try block.
      clearTimeout(timer);
    }
  }
  throw new Error("all_vlm_models_failed");
}

// ── OCR-based PDF extractor (fallback for broken-CMap PDFs) ──────────────────
// Pipeline:
//   1. pdftoppm renders pages to PNG (200 DPI — optimal for VLM API)
//   2. Per page: try VLM-OCR (olmOCR via HF API) first if HF_TOKEN available
//   3. Fall back to Tesseract (local) if VLM fails / rate-limited
// No page cap — processes the full document regardless of length.

/**
 * Filters OCR output line by line, removing Latin-looking noise that OCR tends
 * to produce on decorative pages, headers and ornaments.
 *
 * Every line is trimmed, then:
 *  - blank lines are kept (they separate paragraphs);
 *  - lines with 4 or more Arabic characters are kept;
 *  - lines with no Arabic at all and at most 30 characters are dropped;
 *  - lines whose letters are more than 80% Latin are dropped;
 *  - everything else is kept.
 * Finally runs of 3+ newlines collapse to one blank line and the result is trimmed.
 *
 * @param text Raw OCR text.
 * @returns The filtered text.
 */
function cleanOcrOutput(text: string): string {
  const countIn = (line: string, pattern: RegExp): number => (line.match(pattern) ?? []).length;

  const shouldKeep = (line: string): boolean => {
    if (line === "") return true;

    const arabic = countIn(line, /[\u0600-\u06FF]/g);
    if (arabic >= 4) return true;
    if (arabic === 0 && line.length <= 30) return false;

    // From here on the line has fewer than 4 Arabic characters.
    const latin = countIn(line, /[a-zA-Z]/g);
    const letters = arabic + latin;
    const mostlyLatin = letters > 0 && latin / letters > 0.80;
    return !mostlyLatin;
  };

  return text
    .split("\n")
    .map((rawLine) => rawLine.trim())
    .filter(shouldKeep)
    .join("\n")
    .replace(/\n{3,}/g, "\n\n")
    .trim();
}

/**
 * Extracts text from a PDF by rendering its pages to PNG with `pdftoppm` and
 * running OCR on every page image.
 *
 * Per page, the VLM route (extractPageViaVlm) is tried first when HF_TOKEN is
 * set; on failure, or for all remaining pages after a rate-limit error, the
 * page is recognised with a lazily created Tesseract worker instead. Tesseract
 * output is filtered through cleanOcrOutput; VLM output is used as returned.
 *
 * @param filePath   Path of the PDF file.
 * @param pageStart  First page to render; values that are missing or not
 *                   greater than 0 fall back to page 1.
 * @param pageEnd    Last page to render; values that are missing or not
 *                   greater than 0 fall back to 9999.
 * @param onProgress Called after each page with (pages done, total pages).
 * @returns Page texts joined by blank lines and cut to TEXT_CAP characters,
 *          or "" when no page was rendered or when any step throws.
 */
async function extractPdfViaOcr(
  filePath: string,
  pageStart?: number,
  pageEnd?: number,
  onProgress?: (done: number, total: number) => void,
): Promise<string> {
  const { execFile } = await import("child_process");
  const { promisify } = await import("util");
  const execFileAsync = promisify(execFile);

  // The VLM route is enabled purely by the presence of HF_TOKEN.
  const hfToken  = process.env.HF_TOKEN;
  const useVlm   = !!hfToken;

  // Scratch directory for the rendered pages; removed in the `finally` below.
  const tmpDir = fs.mkdtempSync("/tmp/pdf-ocr-");
  let tessWorker: any = null;

  try {
    const startPage = pageStart && pageStart > 0 ? pageStart : 1;
    const endPage   = pageEnd   && pageEnd   > 0 ? pageEnd   : 9999;

    // Pages are rendered once: at 200 DPI when the VLM route is enabled
    // (smaller images for the API), otherwise at 300 DPI for Tesseract.
    // NOTE(review): there is no re-render step — when the VLM route fails
    // mid-document, Tesseract runs on the same 200-DPI images.
    const dpi = useVlm ? "200" : "300";
    await execFileAsync(
      "pdftoppm",
      ["-r", dpi, "-png", "-f", String(startPage), "-l", String(endPage),
       filePath, path.join(tmpDir, "page")],
      { timeout: 600_000 },
    );

    // Page order relies on a lexical sort of the generated file names.
    // NOTE(review): presumably pdftoppm zero-pads page numbers — confirm.
    const pngFiles = fs.readdirSync(tmpDir)
      .filter(f => f.endsWith(".png"))
      .sort()
      .map(f => path.join(tmpDir, f));

    if (pngFiles.length === 0) return "";

    const pageTexts: string[] = [];
    // Set once a VLM call reports rate limiting; disables the VLM route for
    // every later page.
    let vlmRateLimited = false;

    for (let i = 0; i < pngFiles.length; i++) {
      let pageText = "";
      // Despite the name, this flag means "this page still needs Tesseract".
      let usedTesseract = false;

      // ── Try VLM-OCR first (olmOCR / Qwen2.5-VL via HF) ────────────────
      if (useVlm && !vlmRateLimited) {
        try {
          pageText     = await extractPageViaVlm(pngFiles[i], hfToken);
          usedTesseract = false;
        } catch (err: any) {
          if (err?.code === "rate_limited") {
            vlmRateLimited = true;
            logger.warn("[vlm-ocr] Rate limited — switching to Tesseract for all remaining pages");
          } else {
            logger.warn({ err: err?.message }, `[vlm-ocr] page ${i + 1} failed — using Tesseract`);
          }
          usedTesseract = true;
        }
      } else {
        usedTesseract = true;
      }

      // ── Fallback: Tesseract (local) ────────────────────────────────────
      if (usedTesseract) {
        if (!tessWorker) {
          // Lazy-initialise Tesseract only when actually needed
          const tessDataDir =
            process.env.NODE_ENV === "production"
              ? "/data/tessdata"
              : path.join(process.cwd(), "uploads", ".tessdata");
          if (!fs.existsSync(tessDataDir)) fs.mkdirSync(tessDataDir, { recursive: true });
          const Tesseract = await import("tesseract.js");
          tessWorker = await Tesseract.createWorker(["ara", "eng"], 1, {
            cachePath: tessDataDir,
            workerPath: getTessWorkerPath(),
          });
        }
        // A failure here is not handled per page: it reaches the outer catch
        // and the whole extraction returns "".
        const { data: { text } } = await tessWorker.recognize(pngFiles[i]);
        pageText = cleanOcrOutput(text);
      }

      // Pages that yield only whitespace are left out of the result.
      if (pageText.trim()) pageTexts.push(pageText.trim());
      onProgress?.(i + 1, pngFiles.length);
    }

    if (tessWorker) await tessWorker.terminate();

    let result = pageTexts.join("\n\n");
    if (result.length > TEXT_CAP) result = result.slice(0, TEXT_CAP);
    return result;
  } catch (e) {
    logger.error({ err: e }, "[extractPdfViaOcr] failed");
    // Best-effort shutdown of the worker; text collected so far is discarded.
    if (tessWorker) { try { await tessWorker.terminate(); } catch { /* ignore */ } }
    return "";
  } finally {
    try { fs.rmSync(tmpDir, { recursive: true, force: true }); } catch { /* ignore */ }
  }
}

// ── RTL-aware PDF extractor using pdfjs-dist directly ────────────────────────
// pdf-parse v2 has no `pagerender` callback, so we bypass it and use
// pdfjs-dist (already installed as pdf-parse's peer) directly.
//
// Algorithm per page:
//  1. getTextContent() → items with {x, y, width, height, str}
//  2. Bucket items into visual lines by quantised Y (Y_THRESH = 10 pt)
//  3. Sort each bucket right→left (descending X) → correct Arabic reading order
//  4. Join items; insert a space only when the visual gap between adjacent
//     items exceeds 25% of the item's font height — this threshold correctly
//     handles Arabic ligature sub-glyphs (gap ~1 pt) vs word gaps (gap ~4+ pt)
//     without the false positives caused by per-character avgCharWidth.
/**
 * Extracts the text layer of a PDF in right-to-left reading order.
 *
 * @param filePath  Path of the PDF file. At most the first 200 MiB are read.
 * @param pageStart First page (1-based); missing or non-positive → page 1.
 *                  Clamped to the document's page count.
 * @param pageEnd   Last page (1-based); missing or non-positive → last page.
 *                  Clamped to the document's page count.
 * @returns The extracted text after cleanArabicPdfRaw, cut to TEXT_CAP
 *          characters; "" when anything fails (the error is logged).
 */
async function extractPdf(filePath: string, pageStart?: number, pageEnd?: number): Promise<string> {
  let pdfDoc: any = null;
  try {
    const { createRequire } = await import("module");
    const req = createRequire(import.meta.url);

    // Resolve pdfjs-dist via pdf-parse's own node_modules (it is a declared
    // dependency of pdf-parse v2, so it is guaranteed to be present there).
    const pdfParseCjsPath = req.resolve("pdf-parse");
    const pdfParseReq = createRequire(pdfParseCjsPath);
    const pdfjsMjsPath   = pdfParseReq.resolve("pdfjs-dist/legacy/build/pdf.mjs");
    const pdfjsWorkerPath = pdfParseReq.resolve("pdfjs-dist/legacy/build/pdf.worker.mjs");

    // Dynamic ESM import of pdfjs-dist (it is an ES module)
    const { getDocument, GlobalWorkerOptions, VerbosityLevel } =
      await import(pdfjsMjsPath) as any;

    GlobalWorkerOptions.workerSrc = pdfjsWorkerPath;

    const MAX_PDF_BYTES = 200 * 1024 * 1024;
    const stat = fs.statSync(filePath);
    const readSize = Math.min(stat.size, MAX_PDF_BYTES);
    const buf = Buffer.alloc(readSize);
    const fd = fs.openSync(filePath, "r");
    try {
      fs.readSync(fd, buf, 0, readSize, 0);
    } finally {
      // Always release the descriptor, even when the read throws; otherwise
      // every failed read leaks one fd for the lifetime of the process.
      fs.closeSync(fd);
    }

    // VerbosityLevel.ERRORS = 0 → suppress "Warning: TT: undefined function" noise
    const verbosity: number = (VerbosityLevel as any)?.ERRORS ?? 0;
    pdfDoc = await getDocument({
      data: new Uint8Array(buf),
      useWorkerFetch: false,
      isEvalSupported: false,
      useSystemFonts: true,
      verbosity,
    }).promise;

    const totalPages = pdfDoc.numPages as number;
    const startPage = pageStart && pageStart > 0 ? Math.min(pageStart, totalPages) : 1;
    const endPage   = pageEnd   && pageEnd   > 0 ? Math.min(pageEnd,   totalPages) : totalPages;

    // Y_THRESH = 10 pt: groups diacritics / sub-glyphs on slightly different Y
    // into the same visual line.
    const Y_THRESH = 10;

    type TextItem = { x: number; y: number; str: string; width: number; height: number };

    const pageTexts: string[] = [];

    for (let p = startPage; p <= endPage; p++) {
      const page = await pdfDoc.getPage(p);
      const tc   = await page.getTextContent({ includeMarkedContent: false });

      // Keep only items that carry visible text; transform[4] / transform[5]
      // are used as the item's x / y position.
      const items: TextItem[] = [];
      for (const it of (tc.items ?? [])) {
        if (typeof it.str !== "string" || !it.str.trim()) continue;
        items.push({
          x:      it.transform[4],
          y:      it.transform[5],
          str:    it.str,
          width:  it.width  ?? 0,
          height: it.height ?? 12, // fallback to 12 pt if absent
        });
      }

      if (!items.length) {
        page.cleanup();
        continue;
      }

      // Bucket by quantised Y
      const buckets = new Map<number, TextItem[]>();
      for (const it of items) {
        const key = Math.round(it.y / Y_THRESH) * Y_THRESH;
        if (!buckets.has(key)) buckets.set(key, []);
        buckets.get(key)!.push(it);
      }

      // Lines top→bottom (larger Y = higher on PDF page)
      const sortedYs = Array.from(buckets.keys()).sort((a, b) => b - a);

      const lines: string[] = [];
      for (const y of sortedYs) {
        const row = buckets.get(y)!;
        // RTL: sort right-to-left (descending X)
        row.sort((a, b) => b.x - a.x);

        // Join items, inserting a space only when the gap between adjacent
        // items exceeds 25% of the item's font height.
        // This correctly skips ligature sub-glyph gaps (~1 pt) while catching
        // genuine inter-word spaces (~4+ pt for typical Arabic body text).
        let lineText = "";
        for (let i = 0; i < row.length; i++) {
          lineText += row[i].str;
          if (i < row.length - 1) {
            const cur  = row[i];
            const next = row[i + 1];
            // gap = horizontal distance between right edge of `next` and left edge of `cur`
            const gap = cur.x - (next.x + next.width);
            const spaceThreshold = (cur.height > 0 ? cur.height : 12) * 0.25;
            if (gap > spaceThreshold) lineText += " ";
          }
        }

        const trimmed = lineText.trim();
        if (trimmed) lines.push(trimmed);
      }

      page.cleanup();
      pageTexts.push(lines.join("\n"));
    }

    let text = pageTexts.join("\n\n").trim();

    // Arabic-specific post-processing: strips page markers, TOC dots, etc.
    text = cleanArabicPdfRaw(text);

    return text.length > TEXT_CAP ? text.slice(0, TEXT_CAP) : text;
  } catch (e) {
    logger.error({ err: e }, "[extractPdf] failed");
    return "";
  } finally {
    if (pdfDoc) {
      try { await pdfDoc.destroy(); } catch { /* ignore */ }
    }
  }
}

/**
 * Extracts the raw text of a .docx file with mammoth.
 *
 * @param filePath Path of the .docx file.
 * @returns The trimmed text, cut to TEXT_CAP characters; "" when extraction
 *          fails (the failure is reported on stderr instead of being dropped).
 */
async function extractDocx(filePath: string): Promise<string> {
  try {
    const mammoth = await import("mammoth");
    const { value } = await mammoth.extractRawText({ path: filePath });
    const text = value?.trim() || "";
    return text.length > TEXT_CAP ? text.slice(0, TEXT_CAP) : text;
  } catch (e) {
    // Still best-effort (callers get ""), but no longer silent: without this
    // a corrupt document is indistinguishable from an empty one in the logs.
    console.error("[RAQIM] extractDocx failed:", filePath, e);
    return "";
  }
}

/**
 * Returns the absolute path of the Tesseract.js Node worker script.
 *
 * The path is derived from the location of the package's own package.json,
 * so it can be passed explicitly as `workerPath` when the server code is
 * bundled with esbuild and the library's default auto-resolution breaks.
 */
function getTessWorkerPath(): string {
  const packageRoot = path.dirname(_require.resolve("tesseract.js/package.json"));
  const workerScript = "src/worker-script/node/index.js";
  return path.join(packageRoot, workerScript);
}

/**
 * OCRs a single image file with Tesseract (Arabic + English).
 *
 * @param filePath Path of the image.
 * @returns The trimmed recognised text, or "" when OCR fails (error is logged).
 */
async function extractImage(filePath: string): Promise<string> {
  try {
    const Tesseract = await import("tesseract.js");
    // Trained-data cache location: a fixed path in production, a folder under
    // the working directory otherwise.
    const cacheDir =
      process.env.NODE_ENV === "production"
        ? "/data/tessdata"
        : path.join(process.cwd(), "uploads", ".tessdata");
    if (!fs.existsSync(cacheDir)) fs.mkdirSync(cacheDir, { recursive: true });
    const worker = await Tesseract.createWorker(["ara", "eng"], 1, {
      cachePath: cacheDir,
      workerPath: getTessWorkerPath(),
    });
    try {
      const { data: { text } } = await worker.recognize(filePath);
      return text?.trim() || "";
    } finally {
      // Terminate on every path: previously a failed recognize() skipped this
      // and left the worker running.
      await worker.terminate();
    }
  } catch (e) {
    logger.error({ err: e }, "[extractImage] error");
    return "";
  }
}

/**
 * Converts a spreadsheet into Markdown tables.
 *
 * - `.csv`: the first non-empty line is the header; at most 5000 lines are
 *   used. Fields are split on plain commas — quoted fields that contain commas
 *   are NOT handled.
 * - anything else: read with `xlsx`, at most 5000 rows per sheet; each sheet
 *   becomes a `## <sheet name>` section followed by a table.
 *
 * Output stops growing once it exceeds TEXT_CAP characters (the row that
 * crosses the cap is kept, so the result may be slightly longer).
 *
 * @param filePath Path of the spreadsheet.
 * @param ext      Lower-case extension including the dot; only ".csv" is
 *                 treated specially.
 * @returns Markdown text, or "" for an empty CSV or on failure.
 */
async function extractSpreadsheet(filePath: string, ext: string): Promise<string> {
  // Renders one cell value so it cannot break the table: line breaks become
  // spaces and literal pipes are escaped.
  const toCell = (value: unknown): string =>
    String(value ?? "").replace(/\r?\n/g, " ").replace(/\|/g, "\\|").trim();
  const toRow = (cells: string[]): string => `| ${cells.join(" | ")} |\n`;
  const separatorFor = (headers: string[]): string => toRow(headers.map(() => "---"));

  try {
    if (ext === ".csv") {
      const content = fs.readFileSync(filePath, "utf-8");
      // Split on CRLF as well as LF so Windows files do not leave "\r"-only
      // lines behind that would render as empty rows.
      const lines = content.split(/\r?\n/).filter(Boolean).slice(0, 5000); // cap rows
      if (lines.length === 0) return "";
      const headers = lines[0].split(",").map(toCell);
      let md = toRow(headers) + separatorFor(headers);
      for (const line of lines.slice(1)) {
        md += toRow(line.split(",").map(toCell));
        if (md.length > TEXT_CAP) break;
      }
      return md;
    }
    const { createRequire } = await import("module");
    const req = createRequire(import.meta.url);
    const XLSX = req("xlsx");
    const workbook = XLSX.readFile(filePath, { sheetRows: 5000 }); // cap rows per sheet
    let md = "";
    for (const sheetName of workbook.SheetNames) {
      const sheet = workbook.Sheets[sheetName];
      const data: unknown[][] = XLSX.utils.sheet_to_json(sheet, { header: 1 });
      md += `## ${sheetName}\n\n`;
      if (data.length > 0) {
        // Array.from visits holes in a sparse header row, so every column gets
        // both a header cell and a separator cell.
        const headers = Array.from(data[0] ?? [], (cell) => toCell(cell));
        md += toRow(headers) + separatorFor(headers);
        for (const row of data.slice(1)) {
          md += toRow(headers.map((_, idx) => toCell(row?.[idx])));
          if (md.length > TEXT_CAP) break;
        }
        md += "\n";
      }
      if (md.length > TEXT_CAP) break;
    }
    return md;
  } catch (e) {
    // Best-effort: callers get "", but the cause is reported.
    console.error("[RAQIM] extractSpreadsheet failed:", filePath, e);
    return "";
  }
}

/**
 * Extracts the text of a .pptx presentation, one paragraph per slide.
 *
 * The file is opened as a zip; every `ppt/slides/slideN.xml` entry is read in
 * slide-number order and the contents of its `<a:t>` text runs are joined
 * with spaces.
 *
 * @param filePath Path of the .pptx file.
 * @returns Slide texts separated by blank lines, cut to TEXT_CAP characters;
 *          "" on failure.
 */
async function extractPptx(filePath: string): Promise<string> {
  // Numeric slide index taken from the entry name (0 when absent).
  const slideNumber = (entry: string): number =>
    Number(/slide(\d+)\.xml/.exec(entry)?.[1] ?? 0);
  // Decodes the five predefined XML entities; `&amp;` goes last so that
  // "&amp;lt;" ends up as the literal text "&lt;" rather than "<".
  const decodeXml = (s: string): string =>
    s
      .replace(/&lt;/g, "<")
      .replace(/&gt;/g, ">")
      .replace(/&quot;/g, '"')
      .replace(/&apos;/g, "'")
      .replace(/&amp;/g, "&");

  try {
    const JSZip = (await import("jszip")).default;
    const content = fs.readFileSync(filePath);
    const zip = await JSZip.loadAsync(content);
    let text = "";
    const slideFiles = Object.keys(zip.files)
      .filter((f) => f.match(/ppt\/slides\/slide\d+\.xml/))
      // A plain lexical sort puts slide10 before slide2; order by number.
      .sort((a, b) => slideNumber(a) - slideNumber(b));
    for (const slideFile of slideFiles) {
      const xml = await zip.files[slideFile].async("string");
      const matches = xml.match(/<a:t>(.*?)<\/a:t>/g) || [];
      const slideText = matches
        .map((m) => decodeXml(m.replace(/<[^>]+>/g, "")).trim())
        .filter(Boolean)
        .join(" ");
      if (slideText) text += slideText + "\n\n";
      if (text.length > TEXT_CAP) break;
    }
    return text.length > TEXT_CAP ? text.slice(0, TEXT_CAP) : text;
  } catch (e) {
    // Best-effort: callers get "", but the cause is reported.
    console.error("[RAQIM] extractPptx failed:", filePath, e);
    return "";
  }
}

/**
 * Extracts the text of an EPUB by converting every (X)HTML entry of the
 * archive with htmlToPlainText.
 *
 * Entries are processed in the order the zip library lists them.
 * NOTE(review): that is archive order, not the reading order declared in the
 * book's package document — confirm this is acceptable.
 *
 * @param filePath Path of the .epub file.
 * @returns Entry texts separated by blank lines, cut to TEXT_CAP characters;
 *          "" on failure.
 */
async function extractEpub(filePath: string): Promise<string> {
  // .html / .xhtml as before, plus .htm / .xhtm, in any letter case.
  const isContentDocument = (name: string): boolean => /\.x?html?$/i.test(name);

  try {
    const JSZip = (await import("jszip")).default;
    const content = fs.readFileSync(filePath);
    const zip = await JSZip.loadAsync(content);
    let text = "";
    for (const filename of Object.keys(zip.files)) {
      if (!isContentDocument(filename)) continue;
      const html = await zip.files[filename].async("string");
      text += htmlToPlainText(html) + "\n\n";
      if (text.length > TEXT_CAP) break;
    }
    return text.length > TEXT_CAP ? text.slice(0, TEXT_CAP) : text;
  } catch (e) {
    // Best-effort: callers get "", but the cause is reported.
    console.error("[RAQIM] extractEpub failed:", filePath, e);
    return "";
  }
}

/**
 * Converts an HTML fragment to Markdown-flavoured plain text.
 *
 * `<h1>`–`<h6>` become `#` headings, `<p>` becomes a paragraph, `<li>` becomes
 * a `- ` bullet and `<br>` a newline; every remaining tag is removed. Entities
 * (`&lt;`, `&gt;`, `&nbsp;`, `&amp;`) are decoded exactly once, after all tags
 * are gone, so escaped markup such as `&lt;div&gt;` survives as the text
 * `<div>` instead of being stripped as if it were a real tag.
 *
 * @param html HTML source.
 * @returns Trimmed text with runs of 3+ newlines collapsed to one blank line.
 */
function htmlToPlainText(html: string): string {
  // Tag removal only — deliberately no entity decoding at this stage.
  const dropTags = (fragment: string): string => fragment.replace(/<[^>]+>/g, "").trim();

  return html
    .replace(/<h([1-6])[^>]*>(.*?)<\/h\1>/gis, (_, level, inner) => "\n" + "#".repeat(Number(level)) + " " + dropTags(inner) + "\n")
    .replace(/<p[^>]*>(.*?)<\/p>/gis, (_, inner) => "\n" + dropTags(inner) + "\n")
    .replace(/<li[^>]*>(.*?)<\/li>/gis, "- $1\n")
    .replace(/<br\s*\/?>/gi, "\n")
    .replace(/<[^>]+>/g, "")
    // `&amp;` is decoded last so "&amp;lt;" yields "&lt;", not "<".
    .replace(/&lt;/g, "<").replace(/&gt;/g, ">").replace(/&nbsp;/g, " ").replace(/&amp;/g, "&")
    .replace(/\n{3,}/g, "\n\n")
    .trim();
}

/**
 * Removes every tag from an HTML fragment and decodes `&lt;`, `&gt;` and
 * `&amp;`, then trims the result.
 *
 * `&amp;` is decoded last: decoding it first would turn the escaped text
 * `&amp;lt;` into `&lt;` and then into `<`, i.e. unescape it twice.
 */
function stripTags(s: string): string {
  return s
    .replace(/<[^>]+>/g, "")
    .replace(/&lt;/g, "<")
    .replace(/&gt;/g, ">")
    .replace(/&amp;/g, "&")
    .trim();
}

// ═══════════════════════════════════════════════════════════════════════════
// Stats & Utilities
// ═══════════════════════════════════════════════════════════════════════════
/**
 * Derives simple structure counts and a heuristic quality score from Markdown.
 *
 * The score starts at 72 and earns capped bonuses: 3 per heading (max 12),
 * 1 per bold span (max 10), 1 per list item (max 8), 4 if any table row
 * exists, 2 if any code fence exists, and wordCount / 50 (max 10). The total
 * is capped at 98 and is not rounded.
 *
 * @param md Markdown text.
 * @returns `{ wordCount, headings, boldItems, listItems, qualityEstimate }`.
 */
function computeStats(md: string) {
  const occurrences = (pattern: RegExp): number => md.match(pattern)?.length ?? 0;

  const wordCount = md.split(/\s+/).filter((word) => word.length > 0).length;
  const headings = occurrences(/^#{1,6}\s/gm);
  const boldItems = occurrences(/\*\*[^*]+\*\*/g);
  const listItems = occurrences(/^[-*+]\s/gm);
  const hasTable = occurrences(/^\|/gm) > 0;
  // A code block is delimited by a pair of fences, hence the division.
  const hasCode = occurrences(/```/g) / 2 > 0;

  // Bonuses are added in a fixed order, starting from the base score.
  let score = 72;
  score += Math.min(headings * 3, 12);
  score += Math.min(boldItems, 10);
  score += Math.min(listItems, 8);
  score += hasTable ? 4 : 0;
  score += hasCode ? 2 : 0;
  score += Math.min(wordCount / 50, 10);

  const qualityEstimate = Math.min(98, score);
  return { wordCount, headings, boldItems, listItems, qualityEstimate };
}

/**
 * Normalises whitespace in Markdown: CRLF becomes LF, trailing spaces and tabs
 * are removed from every line, runs of four or more newlines shrink to three,
 * and the whole text is trimmed.
 */
function cleanMarkdown(md: string): string {
  const unixNewlines = md.replace(/\r\n/g, "\n");
  const noTrailingBlanks = unixNewlines.replace(/[ \t]+$/gm, "");
  const limitedGaps = noTrailingBlanks.replace(/\n{4,}/g, "\n\n\n");
  return limitedGaps.trim();
}

/**
 * Classifies text as "ar", "en" or "mixed" by comparing the number of Arabic
 * characters (U+0600–U+06FF) with the number of ASCII Latin letters.
 *
 * Arabic is tested first and wins when its count exceeds 60% of the Latin
 * count; otherwise "en" wins when the Latin count exceeds 60% of the Arabic
 * count. With these two thresholds "mixed" is only returned when the text
 * contains neither script.
 */
function detectLanguage(text: string): string {
  const tally = (pattern: RegExp): number => text.match(pattern)?.length ?? 0;

  const arabic = tally(/[\u0600-\u06FF]/g);
  const latin = tally(/[a-zA-Z]/g);

  return arabic > latin * 0.6 ? "ar" : latin > arabic * 0.6 ? "en" : "mixed";
}

/** Returns a promise that resolves (with no value) after `ms` milliseconds. */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((resolve) => {
    setTimeout(resolve, ms);
  });
}

// Hard ceiling for a single conversion job: 15 minutes.
const CONVERSION_TIMEOUT_MS = 15 * 60 * 1000;

/**
 * Entry point for a conversion job: runs runConversionCore under the
 * 15-minute limit and, when it throws or times out, marks the conversion row
 * and then the file row as "failed".
 *
 * @param conversionId Id of the conversions row.
 * @param fileId       Id of the files row.
 * @param storagePath  Path of the uploaded source file.
 */
async function runConversion(conversionId: string, fileId: string, storagePath: string) {
  // Writes the failure to both tables — conversion first, then file.
  const markFailed = async (errorMessage: string): Promise<void> => {
    await db
      .update(conversionsTable)
      .set({ status: "failed", errorMessage })
      .where(eq(conversionsTable.id, conversionId));
    await db
      .update(filesTable)
      .set({ status: "failed", updatedAt: new Date() })
      .where(eq(filesTable.id, fileId));
  };

  try {
    await withTimeout(
      runConversionCore(conversionId, fileId, storagePath),
      CONVERSION_TIMEOUT_MS,
      "تحويل الملف"
    );
  } catch (err) {
    // Non-Error rejections fall back to a generic "timed out" message.
    await markFailed(err instanceof Error ? err.message : "انتهت مهلة التحويل");
  }
}

// ═══════════════════════════════════════════════════════════════════════════
// Routes
// ═══════════════════════════════════════════════════════════════════════════

// POST /api/convert/upload
// Accepts one uploaded file (form field "file"), creates the file row and a
// queued conversion row, starts the conversion in the background and answers
// 202 with the job description. Optional body fields: pageStart, pageEnd,
// folderId.
router.post("/upload", upload.single("file"), async (req: AuthRequest, res) => {
  // Page numbers come from the request body and may be strings. Anything that
  // is not a positive integer ("abc", "3.5", "0", "") is stored as null, i.e.
  // "no limit", instead of inserting NaN or a fraction into the table.
  const toPage = (value: unknown): number | null => {
    const n = Number(value);
    return Number.isInteger(n) && n > 0 ? n : null;
  };

  try {
    if (!req.file) {
      res.status(400).json({ error: "validation", message: "لم يتم رفع أي ملف" });
      return;
    }
    const { pageStart, pageEnd, folderId } = req.body;
    const originalName = fixFilename(req.file.originalname);
    // The Markdown file is named after the upload, minus its extension.
    const fileName = path.parse(originalName).name;

    const [file] = await db
      .insert(filesTable)
      .values({
        name: fileName + ".md",
        ownerId: req.userId!,
        folderId: folderId || null,
        originalName,
        originalType: req.file.mimetype,
        sizeBytes: req.file.size,
        storagePath: req.file.path,
        status: "queued",
      })
      .returning();

    const [conversion] = await db
      .insert(conversionsTable)
      .values({
        fileId: file.id,
        userId: req.userId!,
        status: "queued",
        progress: 0,
        steps: initSteps(),
        pageStart: toPage(pageStart),
        pageEnd: toPage(pageEnd),
      })
      .returning();

    // Fire and forget: the response does not wait for the conversion.
    runConversion(conversion.id, file.id, req.file.path).catch((err) =>
      req.log?.error({ err }, "background conversion error")
    );

    res.status(202).json({
      jobId: conversion.id,
      fileId: file.id,
      status: "queued",
      progress: 0,
      steps: initSteps(),
      createdAt: conversion.createdAt,
    });
  } catch (err) {
    const e = err instanceof Error ? err : new Error(String(err));
    // Prefer the underlying cause's message when the error wraps one.
    const cause = (e as NodeJS.ErrnoException & { cause?: Error }).cause;
    const rootMsg = cause?.message ?? e.message;
    console.error("[RAQIM] /upload error:", rootMsg, "\n  outer:", e.message, "\n  stack:", e.stack);
    req.log?.error({ err, cause: cause?.message }, "upload error");
    res.status(500).json({ error: "server_error", message: rootMsg || "فشل الرفع" });
  }
});

// POST /api/convert/upload-split  — upload once, create N conversion jobs
// Body field "ranges" is a JSON array of { start, end, label }. One file row
// and one conversion row are created per range, all pointing at the same
// uploaded file; every conversion is started in the background. Answers 202
// with the list of created jobs.
router.post("/upload-split", upload.single("file"), async (req: AuthRequest, res) => {
  // Positive integers pass through; anything else means "no limit" (null).
  const toPage = (value: unknown): number | null => {
    const n = Number(value);
    return Number.isInteger(n) && n > 0 ? n : null;
  };

  try {
    if (!req.file) {
      res.status(400).json({ error: "validation", message: "لم يتم رفع أي ملف" });
      return;
    }
    const { ranges: rangesJson, folderId } = req.body;

    // The payload is client-controlled: it must parse, be an array, and
    // contain only objects. Previously `null` crashed with a 500 and a JSON
    // string was iterated character by character.
    let parsed: unknown;
    try {
      parsed = JSON.parse(rangesJson || "[]");
    } catch {
      res.status(400).json({ error: "validation", message: "نطاقات الصفحات غير صالحة" });
      return;
    }
    if (!Array.isArray(parsed) || parsed.some((r) => typeof r !== "object" || r === null)) {
      res.status(400).json({ error: "validation", message: "نطاقات الصفحات غير صالحة" });
      return;
    }
    if (!parsed.length) {
      res.status(400).json({ error: "validation", message: "يجب تحديد نطاق واحد على الأقل" });
      return;
    }

    const ranges = (parsed as Array<Record<string, unknown>>).map((r, idx) => ({
      start: toPage(r.start),
      end: toPage(r.end),
      // Fall back to the 1-based position when no usable label was sent.
      label: typeof r.label === "string" && r.label.trim() ? r.label : String(idx + 1),
    }));

    const originalName = fixFilename(req.file.originalname);
    const baseName = path.parse(originalName).name;
    const jobs: Array<{ jobId: unknown; fileId: unknown; name: string }> = [];

    for (const range of ranges) {
      const partName = `${baseName} — ${range.label}.md`;
      const [file] = await db
        .insert(filesTable)
        .values({
          name: partName,
          ownerId: req.userId!,
          folderId: folderId || null,
          originalName,
          originalType: req.file!.mimetype,
          sizeBytes: req.file!.size,
          storagePath: req.file!.path,
          status: "queued",
        })
        .returning();

      const [conversion] = await db
        .insert(conversionsTable)
        .values({
          fileId: file.id,
          userId: req.userId!,
          status: "queued",
          progress: 0,
          steps: initSteps(),
          pageStart: range.start,
          pageEnd: range.end,
        })
        .returning();

      // Fire and forget: each part converts independently in the background.
      runConversion(conversion.id, file.id, req.file!.path).catch((err) =>
        req.log?.error({ err }, "split conversion error")
      );

      jobs.push({ jobId: conversion.id, fileId: file.id, name: partName });
    }

    res.status(202).json({ jobs });
  } catch (err) {
    const e = err instanceof Error ? err : new Error(String(err));
    // Prefer the underlying cause's message when the error wraps one.
    const cause = (e as NodeJS.ErrnoException & { cause?: Error }).cause;
    const rootMsg = cause?.message ?? e.message;
    console.error("[RAQIM] /upload-split error:", rootMsg, "\n  outer:", e.message, "\n  stack:", e.stack);
    req.log?.error({ err, cause: cause?.message }, "upload-split error");
    res.status(500).json({ error: "server_error", message: rootMsg || "فشل الرفع" });
  }
});

// POST /api/convert
// Starts a new conversion for a file the caller already owns. Body:
// { fileId, pageStart?, pageEnd? }. Answers 202 with the job description, or
// 404 when the file does not exist, is not owned by the caller, or has no
// stored source.
router.post("/", async (req: AuthRequest, res) => {
  // Positive integers pass through; anything else (strings that are not
  // numbers, fractions, zero, missing values) becomes null = "no limit".
  const toPage = (value: unknown): number | null => {
    const n = Number(value);
    return Number.isInteger(n) && n > 0 ? n : null;
  };

  try {
    const { fileId, pageStart, pageEnd } = req.body;
    // A missing id cannot match any file; answer 404 without sending an
    // undefined value to the query builder.
    const file =
      fileId == null || fileId === ""
        ? undefined
        : await db.query.filesTable.findFirst({
            where: and(eq(filesTable.id, fileId), eq(filesTable.ownerId, req.userId!)),
          });
    if (!file || !file.storagePath) {
      res.status(404).json({ error: "not_found", message: "الملف غير موجود" });
      return;
    }

    const [conversion] = await db
      .insert(conversionsTable)
      .values({
        fileId: file.id,
        userId: req.userId!,
        status: "queued",
        progress: 0,
        steps: initSteps(),
        pageStart: toPage(pageStart),
        pageEnd: toPage(pageEnd),
      })
      .returning();

    // Fire and forget: the response does not wait for the conversion.
    runConversion(conversion.id, file.id, file.storagePath).catch((err) =>
      req.log?.error({ err }, "background conversion error")
    );

    res.status(202).json({
      jobId: conversion.id,
      fileId,
      status: "queued",
      progress: 0,
      steps: initSteps(),
      createdAt: conversion.createdAt,
    });
  } catch (err) {
    req.log?.error({ err }, "convert error");
    res.status(500).json({ error: "server_error", message: "فشل التحويل" });
  }
});

// GET /api/convert/:jobId
// Returns the current state of one conversion job owned by the caller, or 404
// when no such job exists for this user.
router.get("/:jobId", async (req: AuthRequest, res) => {
  try {
    const jobId = req.params.jobId as string;
    // Scope the lookup to the caller so other users' jobs look non-existent.
    const ownedByCaller = and(
      eq(conversionsTable.id, jobId),
      eq(conversionsTable.userId, req.userId!),
    );
    const conv = await db.query.conversionsTable.findFirst({ where: ownedByCaller });

    if (conv) {
      const payload = {
        jobId: conv.id,
        fileId: conv.fileId,
        status: conv.status,
        progress: conv.progress,
        steps: conv.steps,
        queuePosition: null, // always null in this handler
        elapsedSeconds: conv.elapsedSeconds,
        estimatedSeconds: conv.estimatedSeconds,
        errorMessage: conv.errorMessage,
        createdAt: conv.createdAt,
      };
      res.json(payload);
      return;
    }

    res.status(404).json({ error: "not_found", message: "المهمة غير موجودة" });
  } catch (err) {
    req.log?.error({ err }, "get conversion error");
    res.status(500).json({ error: "server_error", message: "فشل جلب الحالة" });
  }
});

export default router;