RAQIM Deploy committed on
Commit
eec1fbf
·
1 Parent(s): c2e719f

Deploy RAQIM 2026-05-02 21:13

Browse files
artifacts/api-server/src/routes/convert.ts CHANGED
@@ -6,6 +6,7 @@ import { db } from "@workspace/db";
6
  import { filesTable, conversionsTable } from "@workspace/db";
7
  import { eq, and } from "drizzle-orm";
8
  import { requireAuth, AuthRequest } from "../middlewares/auth.js";
 
9
 
10
  const router = Router();
11
  router.use(requireAuth);
@@ -119,7 +120,13 @@ async function runConversionCore(conversionId: string, fileId: string, storagePa
119
  rawText = fs.readFileSync(storagePath, "utf-8");
120
  } else if (ext === ".pdf") {
121
  rawText = await extractPdf(storagePath, pageStart, pageEnd);
122
- await updateProgress("ocr", 38, steps, "تم تحليل الـ PDF واستخراج النص...");
 
 
 
 
 
 
123
  } else if ([".docx", ".doc"].includes(ext)) {
124
  rawText = await extractDocx(storagePath);
125
  await updateProgress("ocr", 38, steps, "تم استخراج نص ملف Word...");
@@ -634,6 +641,195 @@ function cleanArabicPdfRaw(text: string): string {
634
  return out.join("\n").replace(/\n{3,}/g, "\n\n").trim();
635
  }
636
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
637
  // ── RTL-aware PDF extractor using pdfjs-dist directly ────────────────────────
638
  // pdf-parse v2 has no `pagerender` callback, so we bypass it and use
639
  // pdfjs-dist (already installed as pdf-parse's peer) directly.
 
6
  import { filesTable, conversionsTable } from "@workspace/db";
7
  import { eq, and } from "drizzle-orm";
8
  import { requireAuth, AuthRequest } from "../middlewares/auth.js";
9
+ import { logger } from "../lib/logger.js";
10
 
11
  const router = Router();
12
  router.use(requireAuth);
 
120
  rawText = fs.readFileSync(storagePath, "utf-8");
121
  } else if (ext === ".pdf") {
122
  rawText = await extractPdf(storagePath, pageStart, pageEnd);
123
+ await updateProgress("ocr", 32, steps, "تم استخراج النص الخام من الـ PDF...");
124
+ // AI-powered Arabic text correction — fixes broken font CMap character
125
+ // transpositions that pdfjs-dist cannot resolve algorithmically.
126
+ rawText = await correctArabicPdfText(rawText, (msg, pct) =>
127
+ updateProgress("ocr", pct, steps, msg)
128
+ );
129
+ await updateProgress("ocr", 55, steps, "اكتمل التصحيح الذكي للنص العربي ✓");
130
  } else if ([".docx", ".doc"].includes(ext)) {
131
  rawText = await extractDocx(storagePath);
132
  await updateProgress("ocr", 38, steps, "تم استخراج نص ملف Word...");
 
641
  return out.join("\n").replace(/\n{3,}/g, "\n\n").trim();
642
  }
643
 
644
+ // ── Arabic PDF AI correction ──────────────────────────────────────────────────
645
+ // Many Arabic PDFs have broken ToUnicode CMaps in their fonts — the glyph→Unicode
646
+ // mapping is wrong, so pdfjs-dist returns valid Unicode codepoints that stand for
647
+ // the wrong characters (e.g. ل and م swapped). This cannot be fixed via X-sort
648
+ // or bidi algorithms since the characters themselves are wrong, not their order.
649
+ //
650
+ // Solution: use the Replit AI Integration proxy (OpenAI-compatible, no extra API key
651
+ // needed — auto-provisioned via AI_INTEGRATIONS_OPENAI_BASE_URL) to run GPT-4.1,
652
+ // which has excellent Arabic language support. Falls back to raw text gracefully.
653
+ //
654
+ // Models tried in order via Replit AI Integration proxy (OpenAI-compatible, free via Replit):
655
/** Model identifiers tried in order; later entries are fallbacks. */
const AI_CORRECTION_MODELS = [
  "gpt-4.1", // Best Arabic quality
  "gpt-4.1-mini", // Faster fallback
];
/** Upper bound on chunks sent for correction; caps total waiting time. */
const MAX_AI_CHUNKS = 30;
/** Default maximum characters per chunk (leaves room for the system prompt). */
const AI_CHUNK_CHARS = 1600;
/** Per-request timeout for one correction call, in milliseconds. */
const AI_CHUNK_TIMEOUT_MS = 90_000;

/**
 * Cuts a single line into consecutive pieces of at most `limit` characters.
 *
 * Concatenating the returned pieces reproduces `line` exactly. A cut is made
 * just after the last space inside the window when there is one, so words are
 * not broken needlessly; otherwise the cut is made at `limit`, moved back by
 * one code unit if it would separate a UTF-16 surrogate pair.
 */
function splitOverlongLine(line: string, limit: number): string[] {
  const pieces: string[] = [];
  let rest = line;
  while (rest.length > limit) {
    let cut = limit;
    const lastSpace = rest.lastIndexOf(" ", limit - 1);
    if (lastSpace > 0) {
      cut = lastSpace + 1;
    } else if (cut > 1) {
      const code = rest.charCodeAt(cut - 1);
      // Do not leave a lone high surrogate at the end of a piece.
      if (code >= 0xd800 && code <= 0xdbff) cut -= 1;
    }
    pieces.push(rest.slice(0, cut));
    rest = rest.slice(cut);
  }
  pieces.push(rest);
  return pieces;
}

/**
 * Splits text into chunks of at most `maxChars` characters for AI correction.
 *
 * Paragraphs (separated by two or more newlines) are packed together, joined
 * by a blank line, while they fit. A paragraph that is itself too long is
 * packed line by line, and a single line that is still too long is cut into
 * pieces by `splitOverlongLine`. No input text is dropped or emitted twice;
 * whitespace-only chunks are omitted.
 *
 * @param text - The extracted text to split.
 * @param maxChars - Maximum chunk length; values that are not finite or are
 *   below 1 fall back to `AI_CHUNK_CHARS`.
 * @returns The non-empty chunks in document order.
 */
function chunkForAiCorrection(text: string, maxChars: number = AI_CHUNK_CHARS): string[] {
  const limit =
    Number.isFinite(maxChars) && maxChars >= 1 ? Math.floor(maxChars) : AI_CHUNK_CHARS;
  const chunks: string[] = [];
  let buf = "";

  const pushChunk = (chunk: string): void => {
    if (chunk.trim().length > 0) chunks.push(chunk);
  };
  // Emits the pending buffer and clears it, so that already-emitted text is
  // never carried into the next chunk.
  const flush = (): void => {
    pushChunk(buf);
    buf = "";
  };

  for (const para of text.split(/\n{2,}/)) {
    const joined = buf ? `${buf}\n\n${para}` : para;
    if (joined.length <= limit) {
      buf = joined;
      continue;
    }

    flush();
    if (para.length <= limit) {
      buf = para;
      continue;
    }

    // Long single paragraph: pack it line by line.
    for (const line of para.split("\n")) {
      const withLine = buf ? `${buf}\n${line}` : line;
      if (withLine.length <= limit) {
        buf = withLine;
        continue;
      }
      flush();
      const pieces = splitOverlongLine(line, limit);
      // The last piece stays pending so following lines can still join it.
      for (const piece of pieces.slice(0, -1)) pushChunk(piece);
      buf = pieces[pieces.length - 1] ?? "";
    }
  }

  flush();
  return chunks;
}
689
+
690
/**
 * Resolves the OpenAI-compatible endpoint to use for AI correction.
 *
 * Priority:
 *   1. `AI_INTEGRATIONS_OPENAI_BASE_URL` (Replit AI Integration proxy), with
 *      `AI_INTEGRATIONS_OPENAI_API_KEY` or the literal "placeholder" as key.
 *   2. `OPENAI_API_KEY`, with `OPENAI_BASE_URL` or the public OpenAI URL.
 *   3. Neither is set: returns `null` so the caller can skip AI correction.
 *
 * Environment values that are empty or whitespace-only are treated as unset,
 * and trailing slashes are removed from the base URL so that appending
 * "/chat/completions" never yields a double slash.
 *
 * @returns The base URL and API key, or `null` when nothing is configured.
 */
function resolveAiEndpoint(): { baseUrl: string; apiKey: string } | null {
  const readEnv = (name: string): string | undefined => {
    const value = process.env[name]?.trim();
    return value ? value : undefined;
  };
  const stripTrailingSlashes = (url: string): string => url.replace(/\/+$/, "");

  const replitUrl = readEnv("AI_INTEGRATIONS_OPENAI_BASE_URL");
  if (replitUrl) {
    return {
      baseUrl: stripTrailingSlashes(replitUrl),
      apiKey: readEnv("AI_INTEGRATIONS_OPENAI_API_KEY") ?? "placeholder",
    };
  }

  const customKey = readEnv("OPENAI_API_KEY");
  if (!customKey) return null;

  const customUrl = readEnv("OPENAI_BASE_URL") ?? "https://api.openai.com/v1";
  return { baseUrl: stripTrailingSlashes(customUrl), apiKey: customKey };
}
710
+
711
/**
 * Sends one chunk of extracted text to the chat-completions endpoint and
 * returns the model's corrected version.
 *
 * The original `text` is returned unchanged when the reply is unusable:
 * missing or non-string content, a reply cut off by the token cap
 * (`finish_reason` of "length"), or a reply whose length is below 40% or
 * above 300% of the input.
 *
 * @param text - The chunk to correct.
 * @param model - Model identifier placed in the request body.
 * @returns The corrected text, or `text` itself when the reply is rejected.
 * @throws Error with message "no_ai_proxy" when no endpoint is configured.
 * @throws Error with message and `code` "rate_limited" on HTTP 429.
 * @throws Error with message `ai_http_<status>` on any other non-OK status.
 *   Errors from `fetch` itself (including the timeout abort) propagate as-is.
 */
async function callAiCorrection(text: string, model: string): Promise<string> {
  const endpoint = resolveAiEndpoint();
  if (!endpoint) throw new Error("no_ai_proxy");
  const { baseUrl, apiKey } = endpoint;

  const systemMsg =
    "أنت نظام متخصص في تصحيح نصوص PDF العربية المستخرجة آلياً. " +
    "النص مصدره كتب إسلامية وأكاديمية عربية. " +
    "بعض حروف النص قد تبدلت أو انقلبت بسبب خلل في جدول ترميز خط الـ PDF (ToUnicode CMap). " +
    "مثال على الخلل: 'امحلد هلل' الصحيحة 'الحمد لله'، و'اثنياا' الصحيحة 'ثانياً'، و'عرشة' الصحيحة 'عشرة'، " +
    "و'عىل' الصحيحة 'على'، و'اذلي' الصحيحة 'الذي'، و'اليت' الصحيحة 'التي'، " +
    "و'فيرسان' الصحيحة 'فبإمكاننا' أو ما شابه حسب السياق. " +
    "مهمتك: تصحيح الكلمات المعطوبة فقط مع الحفاظ التام على: " +
    "١) المعنى والمحتوى ٢) التنسيق (أسطر، فقرات، علامات Markdown) ٣) الكلمات الصحيحة كما هي. " +
    "لا تضف ولا تحذف محتوى. أعد النص المصحح فقط بدون أي شرح.";

  // Abort the request if it runs longer than the per-chunk timeout.
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), AI_CHUNK_TIMEOUT_MS);

  try {
    // Strip trailing slashes so the path never contains "//chat/completions".
    const url = `${baseUrl.replace(/\/+$/, "")}/chat/completions`;
    const resp = await fetch(url, {
      method: "POST",
      headers: {
        Authorization: `Bearer ${apiKey}`,
        "Content-Type": "application/json",
      },
      body: JSON.stringify({
        model,
        messages: [
          { role: "system", content: systemMsg },
          { role: "user", content: `النص المستخرج من PDF:\n\n${text}\n\nالنص المصحح:` },
        ],
        max_completion_tokens: Math.min(3000, Math.ceil(text.length * 2)),
      }),
      signal: controller.signal,
    });

    if (resp.status === 429) throw Object.assign(new Error("rate_limited"), { code: "rate_limited" });
    if (!resp.ok) throw new Error(`ai_http_${resp.status}`);

    // Only the fields read below are described; every one is checked at runtime.
    type ChatChoice = { finish_reason?: unknown; message?: { content?: unknown } };
    const payload = (await resp.json()) as { choices?: ChatChoice[] } | null;
    const choice = payload?.choices?.[0];
    const content = choice?.message?.content;
    const corrected = typeof content === "string" ? content.trim() : "";

    // A reply stopped by the token cap is incomplete even if it is long
    // enough to pass the length check below, so keep the original chunk.
    if (choice?.finish_reason === "length") return text;

    // Sanity: corrected must be at least 40% the length of input (not truncated)
    // and no more than 300% (not hallucinated).
    if (!corrected || corrected.length < text.length * 0.4 || corrected.length > text.length * 3) {
      return text;
    }
    return corrected;
  } finally {
    clearTimeout(timer);
  }
}
764
+
765
/** Callback that receives a status message and a progress percentage. */
type ProgressFn = (msg: string, pct: number) => Promise<void>;

/**
 * Runs AI correction over extracted PDF text, chunk by chunk.
 *
 * The input is returned untouched when no AI endpoint is configured, when it
 * has fewer than 50 non-whitespace characters, or when under 25% of those
 * characters fall in U+0600–U+06FF. Otherwise the text is chunked and the
 * first `MAX_AI_CHUNKS` chunks are corrected; later chunks, and any chunk for
 * which every model attempt failed, are kept as extracted. The resulting
 * parts are joined with blank lines.
 *
 * @param rawText - Text extracted from the PDF.
 * @param onProgress - Optional callback invoked before each chunk with a
 *   percentage between 33 and 54.
 * @returns The text with corrected chunks substituted in.
 */
async function correctArabicPdfText(rawText: string, onProgress?: ProgressFn): Promise<string> {
  if (!resolveAiEndpoint()) {
    logger.info("[arabic-ai] No AI endpoint configured — skipping AI correction (set OPENAI_API_KEY in HF Space secrets to enable in production)");
    return rawText;
  }

  // Only correct text that is long enough and sufficiently Arabic.
  const arabicCount = (rawText.match(/[\u0600-\u06FF]/g) ?? []).length;
  const visibleCount = rawText.replace(/\s/g, "").length;
  const worthCorrecting = visibleCount >= 50 && arabicCount / visibleCount >= 0.25;
  if (!worthCorrecting) {
    return rawText;
  }

  const chunks = chunkForAiCorrection(rawText);
  const total = Math.min(chunks.length, MAX_AI_CHUNKS);
  logger.info(`[arabic-ai] Correcting ${total}/${chunks.length} chunks with GPT`);

  const output: string[] = [];
  // Index of the model currently in use; it persists across chunks and is
  // reset only after a chunk has failed on every remaining model.
  let modelIdx = 0;

  for (let i = 0; i < total; i++) {
    const chunk = chunks[i];

    // Map the chunk index onto the 33%–54% progress range.
    const pct = 33 + Math.round((i / total) * 21);
    await onProgress?.(`تصحيح النص بالذكاء الاصطناعي... (${i + 1}/${total})`, pct);

    let settled = false;
    let failures = 0;
    while (!settled && modelIdx < AI_CORRECTION_MODELS.length) {
      const model = AI_CORRECTION_MODELS[modelIdx];
      try {
        output.push(await callAiCorrection(chunk, model));
        settled = true;
      } catch (err: unknown) {
        failures += 1;
        const info = err as { code?: unknown; message?: unknown } | null | undefined;
        const reason = info?.code ?? info?.message ?? "";

        if (reason === "no_ai_proxy") {
          logger.warn("[arabic-ai] No AI proxy — using raw text for all remaining chunks");
          output.push(...chunks.slice(i));
          return output.join("\n\n");
        }

        if (reason === "rate_limited") {
          logger.warn(`[arabic-ai] Rate-limited on ${model}, switching model`);
        } else {
          logger.warn({ err }, `[arabic-ai] Error on chunk ${i}, trying next model`);
        }
        modelIdx += 1;

        if (failures > 3) break;
      }
    }

    if (!settled) {
      // Keep the chunk as extracted and start the next one from the first model.
      output.push(chunk);
      modelIdx = 0;
    }
  }

  // Chunks beyond the cap pass through uncorrected (document too large).
  output.push(...chunks.slice(total));

  return output.join("\n\n");
}
832
+
833
  // ── RTL-aware PDF extractor using pdfjs-dist directly ────────────────────────
834
  // pdf-parse v2 has no `pagerender` callback, so we bypass it and use
835
  // pdfjs-dist (already installed as pdf-parse's peer) directly.
replit.md CHANGED
@@ -123,6 +123,21 @@ All timestamps stored as `INTEGER` (milliseconds since epoch). Enums stored as `
123
  - `GET /api/admin/trash` — list all trashed items
124
  - `DELETE /api/admin/trash/empty` — empty trash
125
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
126
  ## Architect Engine — 100% Free, No External APIs, No Limits
127
 
128
  The "Super Architect" is a fully deterministic rule-based engine (`runRuleBasedArchitect` in `convert.ts`).
 
123
  - `GET /api/admin/trash` — list all trashed items
124
  - `DELETE /api/admin/trash/empty` — empty trash
125
 
126
+ ## Arabic PDF AI Correction
127
+
128
+ For Arabic PDFs with broken ToUnicode CMap fonts (pdfjs-dist returns wrong characters, e.g. `امحلد هلل` instead of `الحمد لله`), a post-extraction AI correction step is applied.
129
+
130
+ **How it works**: After pdfjs-dist text extraction, if the text is ≥25% Arabic, it's split into ~1600-char chunks and each chunk is corrected by GPT-4.1 (max 30 chunks ≈ 48,000 chars; chunks beyond that cap pass through uncorrected). The system prompt explains the CMap corruption with examples, enabling the model to fix character transpositions without changing content.
131
+
132
+ **Endpoint priority** (checked in order):
133
+ 1. `AI_INTEGRATIONS_OPENAI_BASE_URL` — Replit AI Integration proxy (auto-provisioned on Replit, no API key needed)
134
+ 2. `OPENAI_BASE_URL` + `OPENAI_API_KEY` — custom OpenAI-compatible endpoint (set in HF Space secrets for production)
135
+ 3. Falls back gracefully to uncorrected text if neither is available
136
+
137
+ **Models tried in order**: `gpt-4.1` → `gpt-4.1-mini` (falls back to the next model on a 429 rate limit or any other request error)
138
+
139
+ **To enable on HF Spaces**: Add `OPENAI_API_KEY` (and optionally `OPENAI_BASE_URL` for non-OpenAI endpoints like Together AI or Groq) to the Space secrets.
140
+
141
  ## Architect Engine — 100% Free, No External APIs, No Limits
142
 
143
  The "Super Architect" is a fully deterministic rule-based engine (`runRuleBasedArchitect` in `convert.ts`).