// Client-side text extraction for PDF, DOCX and image files.
import { extractText, getDocumentProxy } from "unpdf";
import mammoth from "mammoth";
import { createWorker } from "tesseract.js";

/** A ready-to-use Tesseract worker, i.e. whatever `createWorker` resolves to. */
type OcrWorker = Awaited<ReturnType<typeof createWorker>>;

/** The document handle resolved by unpdf's `getDocumentProxy`. */
type PdfDocument = Awaited<ReturnType<typeof getDocumentProxy>>;

/** Progress callback; receives a percentage between 0 and 100. */
type ProgressCallback = (p: number) => void;

/** Upper bound on the number of OCR workers spawned by one pool. */
const MAX_OCR_WORKERS = 4;

/** Scale factor used when rasterising a PDF page for OCR. */
const PDF_RENDER_SCALE = 2.0;

/** OCR output of this many characters or fewer is dropped from the PDF page output. */
const OCR_NOISE_THRESHOLD = 20;

/** File extensions that are sent straight to single-image OCR. */
const IMAGE_EXTENSIONS: readonly string[] = ["png", "jpg", "jpeg", "webp", "bmp"];

/** Creates a Tesseract worker for the English language model. */
async function createOptimizedWorker(): Promise<OcrWorker> {
  return createWorker("eng");
}

/**
 * Picks the pool size: the reported core count (4 when unavailable), capped at
 * MAX_OCR_WORKERS. `navigator` is probed defensively so that merely loading
 * this module where `navigator` is undefined does not throw.
 */
function defaultConcurrency(): number {
  const cores =
    typeof navigator !== "undefined" ? navigator.hardwareConcurrency : undefined;
  // `||` is deliberate: a missing or zero core count falls back to 4.
  return Math.min(cores || 4, MAX_OCR_WORKERS);
}

/**
 * Parallel OCR processor: manages a pool of Tesseract workers to speed up
 * multi-item (multi-page / multi-image) extraction.
 *
 * Each extraction creates its own pool and terminates it when done, so
 * overlapping extractions never tear down each other's workers.
 */
class OcrPool {
  private workers: OcrWorker[] = [];
  /** In-flight start-up, shared by concurrent `init()` callers. */
  private initializing: Promise<void> | null = null;
  private readonly concurrency = defaultConcurrency();

  /**
   * Starts the workers if none are running yet.
   *
   * @param size Number of workers wanted; clamped to `[1, concurrency]`.
   *   Ignored when workers already exist (the pool is not resized).
   * @throws Whatever `createWorker` rejected with; workers that did start
   *   are terminated before the error propagates.
   */
  async init(size: number = this.concurrency): Promise<void> {
    if (this.workers.length > 0) return;
    if (this.initializing === null) {
      const count = Math.max(1, Math.min(size, this.concurrency));
      this.initializing = this.spawn(count).finally(() => {
        this.initializing = null;
      });
    }
    await this.initializing;
  }

  /** Creates `count` workers; on any failure the ones that started are cleaned up. */
  private async spawn(count: number): Promise<void> {
    const outcomes = await Promise.allSettled(
      Array.from({ length: count }, () => createOptimizedWorker()),
    );
    const started: OcrWorker[] = [];
    for (const outcome of outcomes) {
      if (outcome.status === "fulfilled") started.push(outcome.value);
    }
    const failed = outcomes.find(
      (o): o is PromiseRejectedResult => o.status === "rejected",
    );
    if (failed) {
      await Promise.allSettled(started.map((w) => w.terminate()));
      throw failed.reason;
    }
    this.workers = started;
  }

  /**
   * Runs `task` once per item, spreading the items over the pool's workers.
   *
   * @param items Work items; an empty list resolves to `[]` without starting workers.
   * @param task Receives a worker, the item and the item's index.
   * @param onProgress Called after each completed item with the rounded
   *   percentage of items finished.
   * @returns The task results, in the same order as `items`.
   * @throws The first task error, after every runner has stopped so that the
   *   caller can terminate the pool without racing in-flight work.
   */
  async process<T, R>(
    items: readonly T[],
    task: (worker: OcrWorker, item: T, index: number) => Promise<R>,
    onProgress?: ProgressCallback,
  ): Promise<R[]> {
    if (items.length === 0) return [];
    // No point starting more workers than there are items.
    await this.init(items.length);

    const results = new Array<R>(items.length);
    // One iterator shared by all runners acts as the work queue: every
    // `next()` hands out the next unclaimed [index, item] pair.
    const pending = items.entries();
    let completed = 0;
    let aborted = false;

    const runners = this.workers.map(async (worker) => {
      for (const [index, item] of pending) {
        if (aborted) return;
        try {
          results[index] = await task(worker, item, index);
          completed++;
          onProgress?.(Math.round((completed / items.length) * 100));
        } catch (err) {
          // Tell the other runners to stop picking up new items.
          aborted = true;
          throw err;
        }
      }
    });

    const outcomes = await Promise.allSettled(runners);
    const failed = outcomes.find(
      (o): o is PromiseRejectedResult => o.status === "rejected",
    );
    if (failed) throw failed.reason;
    return results;
  }

  /** Terminates all workers. The pool may be initialised again afterwards. */
  async terminate(): Promise<void> {
    if (this.initializing !== null) {
      // Wait for a start-up in flight so its workers are not orphaned; the
      // start-up error (if any) is reported to the init() caller, not here.
      await this.initializing.catch(() => undefined);
    }
    // Detach first so a failing terminate() cannot leave dead workers behind.
    const doomed = this.workers;
    this.workers = [];
    await Promise.all(doomed.map((w) => w.terminate()));
  }
}

/**
 * Runs OCR on a single image.
 *
 * @param file Image data to recognise.
 * @param onProgress Receives 10 once the worker is ready and 100 when done.
 * @returns The recognised text, trimmed.
 */
export async function extractImageText(
  file: File | Blob,
  onProgress?: ProgressCallback,
): Promise<string> {
  const worker = await createOptimizedWorker();
  try {
    onProgress?.(10);
    const {
      data: { text },
    } = await worker.recognize(file);
    onProgress?.(100);
    return text.trim();
  } finally {
    // Release the worker even when recognition fails.
    await worker.terminate();
  }
}

/**
 * Extracts one PDF page: its selectable text plus, when a 2D canvas is
 * available, OCR of the rendered page.
 */
async function extractPdfPage(
  pdf: PdfDocument,
  pageNum: number,
  worker: OcrWorker,
): Promise<string> {
  const page = await pdf.getPage(pageNum);

  // 1. Selectable text, with whitespace runs collapsed to single spaces.
  const textContent = await page.getTextContent();
  const pageText = textContent.items
    // Items without a `str` field contribute an empty string.
    .map((item) => ("str" in item ? item.str : ""))
    .join(" ")
    .replace(/\s+/g, " ")
    .trim();
  const textOnly = `--- Page ${pageNum} ---\n${pageText}\n`;

  // 2. Render the page and OCR it to pick up text that only exists as pixels.
  const viewport = page.getViewport({ scale: PDF_RENDER_SCALE });
  const canvas = document.createElement("canvas");
  const context = canvas.getContext("2d");
  if (!context) return textOnly;

  canvas.height = viewport.height;
  canvas.width = viewport.width;
  // NOTE(review): presumably the render-parameter typings of the bundled
  // pdf.js differ from this call shape — confirm before removing.
  // @ts-ignore
  await page.render({ canvasContext: context, viewport }).promise;

  const {
    data: { text: ocrText },
  } = await worker.recognize(canvas);
  const cleanedOcr = ocrText.trim();
  // Very short OCR output is left out of the page result.
  if (cleanedOcr.length <= OCR_NOISE_THRESHOLD) return textOnly;
  return `${textOnly}\n[Detected in images/formatting]:\n${cleanedOcr}\n`;
}

/**
 * Extracts text from every page of a PDF, combining selectable text with OCR
 * of the rendered page.
 *
 * @param file The PDF file.
 * @param onProgress Receives 0 at the start, then the percentage of pages done.
 * @returns Page sections (each headed `--- Page N ---`) joined by blank lines.
 */
export async function extractPdfText(
  file: File,
  onProgress?: ProgressCallback,
): Promise<string> {
  const ab = await file.arrayBuffer();
  const pdf = await getDocumentProxy(new Uint8Array(ab));
  const pageNumbers = Array.from({ length: pdf.numPages }, (_, i) => i + 1);
  onProgress?.(0);

  const ocrPool = new OcrPool();
  try {
    const pages = await ocrPool.process(
      pageNumbers,
      (worker, pageNum) => extractPdfPage(pdf, pageNum, worker),
      onProgress,
    );
    return pages.join("\n\n").trim();
  } finally {
    // Workers are not kept between documents, to limit memory pressure in a
    // long session; this also runs when a page fails.
    await ocrPool.terminate();
  }
}

/**
 * Extracts the raw text of a DOCX file and appends OCR text for each embedded image.
 *
 * @param file The DOCX file.
 * @param onProgress Receives 10 after the file is read, 30 once images are
 *   collected, then 30-100 while images are OCR'd (100 directly when the
 *   document has no images).
 * @returns The document text followed by one `[Extracted from image]:`
 *   section per image that produced text.
 */
export async function extractDocxText(
  file: File,
  onProgress?: ProgressCallback,
): Promise<string> {
  const ab = await file.arrayBuffer();
  onProgress?.(10);

  const textResult = await mammoth.extractRawText({ arrayBuffer: ab });
  let fullText = (textResult.value ?? "").trim();

  // The HTML conversion is run only for its image callback; the HTML itself
  // is discarded.
  const images: Blob[] = [];
  await mammoth.convertToHtml(
    { arrayBuffer: ab },
    {
      convertImage: mammoth.images.imgElement(async (image) => {
        const buffer = await image.read();
        images.push(new Blob([new Uint8Array(buffer)], { type: image.contentType }));
        return { src: "" };
      }),
    },
  );

  if (images.length === 0) {
    onProgress?.(100);
    return fullText;
  }

  onProgress?.(30);
  const ocrPool = new OcrPool();
  try {
    const texts = await ocrPool.process(
      images,
      async (worker, imgBlob) => {
        const {
          data: { text },
        } = await worker.recognize(imgBlob);
        return text.trim();
      },
      // Scale OCR progress into the 30-100% band; rounded so callers never
      // see values such as 69.99999999999999.
      (p) => onProgress?.(Math.round(30 + p * 0.7)),
    );
    for (const t of texts) {
      if (t) fullText += "\n\n[Extracted from image]:\n" + t;
    }
  } finally {
    await ocrPool.terminate();
  }
  return fullText.trim();
}

/**
 * Dispatches a file to the matching extractor based on its file-name extension.
 *
 * @param file The file to extract text from.
 * @param onProgress Forwarded to the chosen extractor.
 * @returns The extracted text and the kind of source it came from; images
 *   are reported as `"text"`.
 * @throws Error when the extension is not one of the supported types.
 */
export async function extractFileText(
  file: File,
  onProgress?: ProgressCallback,
): Promise<{ text: string; sourceType: "pdf" | "docx" | "text" }> {
  const ext = file.name.split(".").pop()?.toLowerCase() ?? "";

  if (ext === "pdf") {
    const text = await extractPdfText(file, onProgress);
    return { text, sourceType: "pdf" };
  }

  // NOTE(review): legacy `.doc` files are routed to the DOCX extractor;
  // mammoth documents support for .docx only, so these presumably fail
  // inside mammoth — confirm and consider rejecting them explicitly.
  if (ext === "docx" || ext === "doc") {
    const text = await extractDocxText(file, onProgress);
    return { text, sourceType: "docx" };
  }

  if (IMAGE_EXTENSIONS.includes(ext)) {
    const text = await extractImageText(file, onProgress);
    return { text, sourceType: "text" };
  }

  throw new Error("Unsupported file type for client-side extraction");
}