Spaces:
Sleeping
Sleeping
RAQIM Deploy committed on
Commit ·
eec1fbf
1
Parent(s): c2e719f
Deploy RAQIM 2026-05-02 21:13
Browse files- artifacts/api-server/src/routes/convert.ts +197 -1
- replit.md +15 -0
artifacts/api-server/src/routes/convert.ts
CHANGED
|
@@ -6,6 +6,7 @@ import { db } from "@workspace/db";
|
|
| 6 |
import { filesTable, conversionsTable } from "@workspace/db";
|
| 7 |
import { eq, and } from "drizzle-orm";
|
| 8 |
import { requireAuth, AuthRequest } from "../middlewares/auth.js";
|
|
|
|
| 9 |
|
| 10 |
const router = Router();
|
| 11 |
router.use(requireAuth);
|
|
@@ -119,7 +120,13 @@ async function runConversionCore(conversionId: string, fileId: string, storagePa
|
|
| 119 |
rawText = fs.readFileSync(storagePath, "utf-8");
|
| 120 |
} else if (ext === ".pdf") {
|
| 121 |
rawText = await extractPdf(storagePath, pageStart, pageEnd);
|
| 122 |
-
await updateProgress("ocr",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 123 |
} else if ([".docx", ".doc"].includes(ext)) {
|
| 124 |
rawText = await extractDocx(storagePath);
|
| 125 |
await updateProgress("ocr", 38, steps, "تم استخراج نص ملف Word...");
|
|
@@ -634,6 +641,195 @@ function cleanArabicPdfRaw(text: string): string {
|
|
| 634 |
return out.join("\n").replace(/\n{3,}/g, "\n\n").trim();
|
| 635 |
}
|
| 636 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 637 |
// ── RTL-aware PDF extractor using pdfjs-dist directly ────────────────────────
|
| 638 |
// pdf-parse v2 has no `pagerender` callback, so we bypass it and use
|
| 639 |
// pdfjs-dist (already installed as pdf-parse's peer) directly.
|
|
|
|
| 6 |
import { filesTable, conversionsTable } from "@workspace/db";
|
| 7 |
import { eq, and } from "drizzle-orm";
|
| 8 |
import { requireAuth, AuthRequest } from "../middlewares/auth.js";
|
| 9 |
+
import { logger } from "../lib/logger.js";
|
| 10 |
|
| 11 |
const router = Router();
|
| 12 |
router.use(requireAuth);
|
|
|
|
| 120 |
rawText = fs.readFileSync(storagePath, "utf-8");
|
| 121 |
} else if (ext === ".pdf") {
|
| 122 |
rawText = await extractPdf(storagePath, pageStart, pageEnd);
|
| 123 |
+
await updateProgress("ocr", 32, steps, "تم استخراج النص الخام من الـ PDF...");
|
| 124 |
+
// AI-powered Arabic text correction — fixes broken font CMap character
|
| 125 |
+
// transpositions that pdfjs-dist cannot resolve algorithmically.
|
| 126 |
+
rawText = await correctArabicPdfText(rawText, (msg, pct) =>
|
| 127 |
+
updateProgress("ocr", pct, steps, msg)
|
| 128 |
+
);
|
| 129 |
+
await updateProgress("ocr", 55, steps, "اكتمل التصحيح الذكي للنص العربي ✓");
|
| 130 |
} else if ([".docx", ".doc"].includes(ext)) {
|
| 131 |
rawText = await extractDocx(storagePath);
|
| 132 |
await updateProgress("ocr", 38, steps, "تم استخراج نص ملف Word...");
|
|
|
|
| 641 |
return out.join("\n").replace(/\n{3,}/g, "\n\n").trim();
|
| 642 |
}
|
| 643 |
|
| 644 |
+
// ── Arabic PDF AI correction ──────────────────────────────────────────────────
|
| 645 |
+
// Many Arabic PDFs have broken ToUnicode CMaps in their fonts — the glyph→Unicode
|
| 646 |
+
// mapping is wrong, so pdfjs-dist returns valid Unicode codepoints that stand for
|
| 647 |
+
// the wrong characters (e.g. ل and م swapped). This cannot be fixed via X-sort
|
| 648 |
+
// or bidi algorithms since the characters themselves are wrong, not their order.
|
| 649 |
+
//
|
| 650 |
+
// Solution: use the Replit AI Integration proxy (OpenAI-compatible, no extra API key
|
| 651 |
+
// needed — auto-provisioned via AI_INTEGRATIONS_OPENAI_BASE_URL) to run GPT-4.1,
|
| 652 |
+
// which has excellent Arabic language support. Falls back to raw text gracefully.
|
| 653 |
+
//
|
| 654 |
+
// Models tried in order via Replit AI Integration proxy (OpenAI-compatible, free via Replit):
|
| 655 |
+
/** Model identifiers for the AI correction step, listed in the order they are tried. */
const AI_CORRECTION_MODELS = [
  "gpt-4.1", // Best Arabic quality
  "gpt-4.1-mini", // Faster fallback
];
/** Maximum number of chunks sent for correction; cap to avoid excessively long waits. */
const MAX_AI_CHUNKS = 30;
/** Default maximum characters per chunk (leaves room for the system prompt). */
const AI_CHUNK_CHARS = 1600;
/** Per-chunk request timeout in milliseconds. */
const AI_CHUNK_TIMEOUT_MS = 90_000;

/**
 * Splits extracted text into chunks of at most `maxChars` characters.
 *
 * Paragraphs (separated by two or more newlines) are packed greedily and
 * re-joined with a blank line. A paragraph longer than the limit is packed line
 * by line (joined with a single newline), and a single line longer than the
 * limit is cut into consecutive slices so that no text is dropped.
 *
 * @param text - The text to split.
 * @param maxChars - Maximum chunk length in UTF-16 code units. Defaults to
 *   `AI_CHUNK_CHARS`; values that are not finite or are below 1 also fall back
 *   to that default.
 * @returns The non-blank chunks, in document order.
 */
function chunkForAiCorrection(text: string, maxChars: number = AI_CHUNK_CHARS): string[] {
  const limit = Number.isFinite(maxChars) && maxChars >= 1 ? Math.floor(maxChars) : AI_CHUNK_CHARS;
  const chunks: string[] = [];
  let buf = "";

  // Emit the pending buffer (if it has content) and ALWAYS clear it, so text
  // that was already emitted can never be pushed twice or glued onto the
  // following chunk.
  const flush = (): void => {
    if (buf.trim().length > 0) chunks.push(buf);
    buf = "";
  };

  // Append `piece` to the pending buffer, starting a new chunk when it no
  // longer fits. A piece that is itself longer than the limit is cut into
  // limit-sized slices instead of being truncated.
  const add = (piece: string, separator: string): void => {
    const joined = buf ? buf + separator + piece : piece;
    if (joined.length <= limit) {
      buf = joined;
      return;
    }
    flush();
    let rest = piece;
    while (rest.length > limit) {
      let cut = limit;
      // Do not cut between the two halves of a surrogate pair.
      const last = rest.charCodeAt(cut - 1);
      if (cut > 1 && last >= 0xd800 && last <= 0xdbff) cut -= 1;
      chunks.push(rest.slice(0, cut));
      rest = rest.slice(cut);
    }
    buf = rest;
  };

  for (const para of text.split(/\n{2,}/)) {
    if (para.length <= limit) {
      add(para, "\n\n");
      continue;
    }
    // Over-long paragraph: close the current chunk, then pack it line by line.
    flush();
    for (const line of para.split("\n")) add(line, "\n");
  }
  flush();

  // Hard-cut slices may be whitespace-only; drop those.
  return chunks.filter((c) => c.trim().length > 0);
}
|
| 689 |
+
|
| 690 |
+
/**
 * Resolves the OpenAI-compatible endpoint to use for AI correction.
 *
 * Priority:
 *   1. `AI_INTEGRATIONS_OPENAI_BASE_URL` (with `AI_INTEGRATIONS_OPENAI_API_KEY`,
 *      or the literal key "placeholder" when that variable is not set).
 *   2. `OPENAI_API_KEY` with `OPENAI_BASE_URL` (defaulting to
 *      "https://api.openai.com/v1").
 *   3. Neither configured: `null`, meaning AI correction should be skipped.
 *
 * Environment values that are empty or whitespace-only are treated as unset,
 * and trailing slashes are removed from the base URL so that appending a path
 * such as "/chat/completions" does not produce a double slash.
 *
 * @returns The base URL and API key to use, or `null` when no endpoint is configured.
 */
function resolveAiEndpoint(): { baseUrl: string; apiKey: string } | null {
  // An env var set to "" is a common way to "unset" a secret; `??` alone would
  // accept it and yield an unusable URL or key.
  const readEnv = (name: string): string | undefined => {
    const value = process.env[name]?.trim();
    return value ? value : undefined;
  };
  const stripTrailingSlashes = (url: string): string => url.replace(/\/+$/, "");

  const replitUrl = readEnv("AI_INTEGRATIONS_OPENAI_BASE_URL");
  if (replitUrl) {
    return {
      baseUrl: stripTrailingSlashes(replitUrl),
      apiKey: readEnv("AI_INTEGRATIONS_OPENAI_API_KEY") ?? "placeholder",
    };
  }

  const customKey = readEnv("OPENAI_API_KEY");
  if (!customKey) return null;

  return {
    baseUrl: stripTrailingSlashes(readEnv("OPENAI_BASE_URL") ?? "https://api.openai.com/v1"),
    apiKey: customKey,
  };
}
|
| 710 |
+
|
| 711 |
+
/**
 * Sends one chunk of extracted Arabic text to the configured OpenAI-compatible
 * chat-completions endpoint and returns the corrected text.
 *
 * The original `text` is returned unchanged when the response cannot be
 * trusted: the completion was cut off by the token limit, the content is
 * empty, or its length is below 40% or above 300% of the input length.
 *
 * @param text - The chunk to correct.
 * @param model - Model identifier sent in the request body.
 * @returns The corrected chunk, or `text` itself when the response is rejected.
 * @throws Error with message "no_ai_proxy" when no endpoint is configured.
 * @throws Error with message and `code` "rate_limited" on HTTP 429.
 * @throws Error with message `ai_http_<status>` on any other non-OK status.
 *   Network failures and the timeout abort propagate from `fetch`.
 */
async function callAiCorrection(text: string, model: string): Promise<string> {
  const endpoint = resolveAiEndpoint();
  if (!endpoint) throw new Error("no_ai_proxy");
  const { baseUrl, apiKey } = endpoint;

  const systemMsg =
    "أنت نظام متخصص في تصحيح نصوص PDF العربية المستخرجة آلياً. " +
    "النص مصدره كتب إسلامية وأكاديمية عربية. " +
    "بعض حروف النص قد تبدلت أو انقلبت بسبب خلل في جدول ترميز خط الـ PDF (ToUnicode CMap). " +
    "مثال على الخلل: 'امحلد هلل' الصحيحة 'الحمد لله'، و'اثنياا' الصحيحة 'ثانياً'، و'عرشة' الصحيحة 'عشرة'، " +
    "و'عىل' الصحيحة 'على'، و'اذلي' الصحيحة 'الذي'، و'اليت' الصحيحة 'التي'، " +
    "و'فيرسان' الصحيحة 'فبإمكاننا' أو ما شابه حسب السياق. " +
    "مهمتك: تصحيح الكلمات المعطوبة فقط مع الحفاظ التام على: " +
    "١) المعنى والمحتوى ٢) التنسيق (أسطر، فقرات، علامات Markdown) ٣) الكلمات الصحيحة كما هي. " +
    "لا تضف ولا تحذف محتوى. أعد النص المصحح فقط بدون أي شرح.";

  // Abort the request if it has not completed within the per-chunk timeout.
  const controller = new AbortController();
  const timer = setTimeout(() => controller.abort(), AI_CHUNK_TIMEOUT_MS);

  try {
    const resp = await fetch(`${baseUrl}/chat/completions`, {
      method: "POST",
      headers: {
        Authorization: `Bearer ${apiKey}`,
        "Content-Type": "application/json",
      },
      body: JSON.stringify({
        model,
        messages: [
          { role: "system", content: systemMsg },
          { role: "user", content: `النص المستخرج من PDF:\n\n${text}\n\nالنص المصحح:` },
        ],
        max_completion_tokens: Math.min(3000, Math.ceil(text.length * 2)),
      }),
      signal: controller.signal,
    });

    // 429 carries a `code` so the caller can tell it apart from other failures.
    if (resp.status === 429) throw Object.assign(new Error("rate_limited"), { code: "rate_limited" });
    if (!resp.ok) throw new Error(`ai_http_${resp.status}`);

    // Only the fields read below are described; everything else is ignored.
    const data = (await resp.json()) as {
      choices?: Array<{
        finish_reason?: string | null;
        message?: { content?: string | null };
      }>;
    } | null;
    const choice = data?.choices?.[0];

    // A completion stopped by the token limit is cut off mid-text. It can still
    // be longer than 40% of the input, so the length check below would accept
    // it and the tail of the chunk would be lost. Keep the original instead.
    if (choice?.finish_reason === "length") return text;

    const content = choice?.message?.content;
    const corrected = typeof content === "string" ? content.trim() : "";

    // Sanity: corrected must be at least 40% the length of input (not truncated)
    // and no more than 300% (not hallucinated).
    if (!corrected || corrected.length < text.length * 0.4 || corrected.length > text.length * 3) {
      return text;
    }
    return corrected;
  } finally {
    clearTimeout(timer);
  }
}
|
| 764 |
+
|
| 765 |
+
/** Callback used to report a status message and a percentage while chunks are corrected. */
type ProgressFn = (msg: string, pct: number) => Promise<void>;

/**
 * Runs AI correction over text extracted from a PDF.
 *
 * The text is returned untouched when no AI endpoint is configured, when it has
 * fewer than 50 non-whitespace characters, or when fewer than 25% of those
 * characters fall in the U+0600–U+06FF range. Otherwise it is split into chunks,
 * the first `MAX_AI_CHUNKS` chunks are sent for correction one at a time, any
 * remaining chunks are appended as-is, and all parts are joined with a blank line.
 *
 * @param rawText - Text as produced by the PDF extractor.
 * @param onProgress - Optional callback invoked before each chunk is processed.
 * @returns The corrected text, or `rawText` when correction is skipped.
 */
async function correctArabicPdfText(rawText: string, onProgress?: ProgressFn): Promise<string> {
  const endpoint = resolveAiEndpoint();
  if (!endpoint) {
    logger.info("[arabic-ai] No AI endpoint configured — skipping AI correction (set OPENAI_API_KEY in HF Space secrets to enable in production)");
    return rawText;
  }

  // Only correct if text is predominantly Arabic
  const arabicCount = rawText.match(/[\u0600-\u06FF]/g)?.length ?? 0;
  const visibleCount = rawText.replace(/\s/g, "").length;
  const worthCorrecting = visibleCount >= 50 && arabicCount / visibleCount >= 0.25;
  if (!worthCorrecting) {
    return rawText;
  }

  const chunks = chunkForAiCorrection(rawText);
  const total = Math.min(chunks.length, MAX_AI_CHUNKS);
  logger.info(`[arabic-ai] Correcting ${total}/${chunks.length} chunks with GPT`);

  const output: string[] = [];
  // Index of the model currently in use; it persists across chunks and is only
  // reset after a chunk has exhausted every model.
  let modelCursor = 0;

  for (let idx = 0; idx < total; idx += 1) {
    const piece = chunks[idx];

    // Report progress: map chunk index to 33%–54% range
    const pct = 33 + Math.round((idx / total) * 21);
    await onProgress?.(`تصحيح النص بالذكاء الاصطناعي... (${idx + 1}/${total})`, pct);

    let settled = false;
    let failures = 0;
    while (!settled && modelCursor < AI_CORRECTION_MODELS.length) {
      const model = AI_CORRECTION_MODELS[modelCursor];
      try {
        output.push(await callAiCorrection(piece, model));
        settled = true;
      } catch (err: any) {
        failures += 1;
        const code = err?.code ?? err?.message ?? "";
        if (code === "no_ai_proxy") {
          logger.warn("[arabic-ai] No AI proxy — using raw text for all remaining chunks");
          output.push(...chunks.slice(idx));
          return output.join("\n\n");
        }
        if (code === "rate_limited") {
          logger.warn(`[arabic-ai] Rate-limited on ${model}, switching model`);
        } else {
          logger.warn({ err }, `[arabic-ai] Error on chunk ${idx}, trying next model`);
        }
        modelCursor += 1;
        if (failures > 3) break;
      }
    }

    if (!settled) {
      // Every model failed for this chunk: keep its raw text.
      output.push(piece);
      modelCursor = 0; // reset; models might free up for next chunk
    }
  }

  // Chunks beyond the cap pass through uncorrected (document too large).
  output.push(...chunks.slice(total));

  return output.join("\n\n");
}
|
| 832 |
+
|
| 833 |
// ── RTL-aware PDF extractor using pdfjs-dist directly ────────────────────────
|
| 834 |
// pdf-parse v2 has no `pagerender` callback, so we bypass it and use
|
| 835 |
// pdfjs-dist (already installed as pdf-parse's peer) directly.
|
replit.md
CHANGED
|
@@ -123,6 +123,21 @@ All timestamps stored as `INTEGER` (milliseconds since epoch). Enums stored as `
|
|
| 123 |
- `GET /api/admin/trash` — list all trashed items
|
| 124 |
- `DELETE /api/admin/trash/empty` — empty trash
|
| 125 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 126 |
## Architect Engine — 100% Free, No External APIs, No Limits
|
| 127 |
|
| 128 |
The "Super Architect" is a fully deterministic rule-based engine (`runRuleBasedArchitect` in `convert.ts`).
|
|
|
|
| 123 |
- `GET /api/admin/trash` — list all trashed items
|
| 124 |
- `DELETE /api/admin/trash/empty` — empty trash
|
| 125 |
|
| 126 |
+
## Arabic PDF AI Correction
|
| 127 |
+
|
| 128 |
+
For Arabic PDFs with broken ToUnicode CMap fonts (pdfjs-dist returns wrong characters, e.g. `امحلد هلل` instead of `الحمد لله`), a post-extraction AI correction step is applied.
|
| 129 |
+
|
| 130 |
+
**How it works**: After pdfjs-dist text extraction, if the text has at least 50 non-whitespace characters and is ≥25% Arabic, it's split into chunks of up to ~1600 characters and each chunk is corrected by GPT-4.1. At most 30 chunks (~48,000 chars) are corrected; any text beyond that cap is passed through uncorrected. The system prompt explains the CMap corruption with examples, enabling the model to fix character transpositions without changing content.
|
| 131 |
+
|
| 132 |
+
**Endpoint priority** (checked in order):
|
| 133 |
+
1. `AI_INTEGRATIONS_OPENAI_BASE_URL` — Replit AI Integration proxy (auto-provisioned on Replit, no API key needed)
|
| 134 |
+
2. `OPENAI_BASE_URL` + `OPENAI_API_KEY` — custom OpenAI-compatible endpoint (set in HF Space secrets for production)
|
| 135 |
+
3. Falls back gracefully to uncorrected text if neither is available
|
| 136 |
+
|
| 137 |
+
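**Models tried in order**: `gpt-4.1` → `gpt-4.1-mini` (falls back to the next model on a 429 rate limit or any other request error; if every model fails for a chunk, that chunk is kept uncorrected)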
|
| 138 |
+
|
| 139 |
+
**To enable on HF Spaces**: Add `OPENAI_API_KEY` (and optionally `OPENAI_BASE_URL` for non-OpenAI endpoints like Together AI or Groq) to the Space secrets.
|
| 140 |
+
|
| 141 |
## Architect Engine — 100% Free, No External APIs, No Limits
|
| 142 |
|
| 143 |
The "Super Architect" is a fully deterministic rule-based engine (`runRuleBasedArchitect` in `convert.ts`).
|