| |
| import { extractText, getDocumentProxy } from "unpdf"; |
| import mammoth from "mammoth"; |
| import { createWorker } from "tesseract.js"; |
|
|
/**
 * Creates a Tesseract OCR worker for the English ("eng") language.
 *
 * @returns A promise resolving to the worker produced by `createWorker`.
 */
async function createOptimizedWorker() {
  return createWorker("eng");
}
|
|
| |
| |
| |
/**
 * A small pool of OCR workers that spreads a list of items across the
 * workers, running one task per worker at a time.
 *
 * Workers are created lazily by `init()` (via `createOptimizedWorker`) and
 * stay alive until `terminate()` is called.
 */
class OcrPool {
  // Workers are whatever createOptimizedWorker() resolves to. The pool itself
  // only calls `terminate()` on them; everything else is up to the task.
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  private workers: any[] = [];

  /** In-flight initialisation, shared so overlapping init() calls spawn a single set of workers. */
  private initializing: Promise<void> | null = null;

  /**
   * Number of workers to spawn: the reported core count, capped at 4.
   * Falls back to 4 when `navigator` is not defined or reports nothing, so
   * constructing the pool never throws outside a browser.
   */
  private readonly concurrency = Math.min(
    (typeof navigator !== "undefined" && navigator.hardwareConcurrency) || 4,
    4,
  );

  /**
   * Ensures the pool has its workers. A no-op when workers already exist;
   * concurrent callers share the same in-flight initialisation.
   *
   * @throws The first worker-creation error; any workers that did start are
   *   terminated before the error is rethrown.
   */
  async init(): Promise<void> {
    if (this.workers.length > 0) return;
    if (this.initializing === null) {
      this.initializing = this.spawnWorkers().finally(() => {
        this.initializing = null;
      });
    }
    await this.initializing;
  }

  /** Starts `concurrency` workers in parallel and installs them only if all of them started. */
  private async spawnWorkers(): Promise<void> {
    const pending = Array.from({ length: this.concurrency }, () => createOptimizedWorker());
    const settled = await Promise.allSettled(pending);

    const ready = settled.flatMap((outcome) => (outcome.status === "fulfilled" ? [outcome.value] : []));
    const failure = settled.find(
      (outcome): outcome is PromiseRejectedResult => outcome.status === "rejected",
    );

    if (failure) {
      // Do not leak the workers that did start when a sibling failed.
      await Promise.allSettled(ready.map((worker) => worker.terminate()));
      throw failure.reason;
    }
    this.workers = ready;
  }

  /**
   * Runs `task` once per item, distributing items over the pool's workers.
   *
   * @param items Items to process; `results[i]` corresponds to `items[i]`.
   * @param task Receives a worker, the item and the item's index; resolves to the item's text.
   * @param onProgress Called after each completed item with the rounded percentage (0-100) done.
   * @returns The task results in the same order as `items`.
   * @throws The first error raised by `task` or `onProgress`. Once an error
   *   occurs no further items are started, and the error is rethrown after
   *   the tasks already in flight have settled.
   */
  async process<T>(
    items: T[],
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    task: (worker: any, item: T, index: number) => Promise<string>,
    onProgress?: (p: number) => void,
  ): Promise<string[]> {
    const results = new Array<string>(items.length);
    // Nothing to do: avoid spawning workers just to return an empty list.
    if (items.length === 0) return results;

    await this.init();

    let completed = 0;
    let nextIndex = 0; // shared cursor; each runner claims the next unprocessed index
    const failures: unknown[] = [];

    const runners = this.workers.map(async (worker) => {
      while (failures.length === 0 && nextIndex < items.length) {
        const index = nextIndex++;
        try {
          results[index] = await task(worker, items[index] as T, index);
          completed++;
          onProgress?.(Math.round((completed / items.length) * 100));
        } catch (err) {
          // Record the error; the loop condition stops every runner from
          // claiming further items.
          failures.push(err);
          return;
        }
      }
    });

    // Runners never reject, so this waits until no task is in flight.
    await Promise.all(runners);
    if (failures.length > 0) throw failures[0];
    return results;
  }

  /**
   * Terminates every worker and empties the pool. The pool is emptied before
   * the workers are shut down, so a failing `terminate()` cannot leave dead
   * workers behind for the next `init()` to reuse.
   */
  async terminate(): Promise<void> {
    const retiring = this.workers;
    this.workers = [];
    await Promise.all(retiring.map((worker) => worker.terminate()));
  }
}

// Single pool instance for this module.
// NOTE(review): the instance is module-wide state; if two extractions can run
// at the same time, one caller's terminate() would affect the other — confirm
// callers are serialized.
const pool = new OcrPool();
|
|
/**
 * Runs OCR on a single image and returns the recognized text, trimmed.
 *
 * A dedicated worker is created for the call and is always terminated,
 * including when recognition fails.
 *
 * @param file Image to recognize.
 * @param onProgress Called with 10 before recognition starts and 100 once it has finished.
 * @returns The recognized text with surrounding whitespace removed.
 * @throws Whatever worker creation, `recognize` or `terminate` rejects with.
 */
export async function extractImageText(file: File | Blob, onProgress?: (p: number) => void): Promise<string> {
  const worker = await createOptimizedWorker();
  try {
    onProgress?.(10);
    const { data: { text } } = await worker.recognize(file);
    onProgress?.(100);
    return text.trim();
  } finally {
    // Release the worker even if recognize() threw.
    await worker.terminate();
  }
}
|
|
/**
 * Extracts text from a PDF, page by page.
 *
 * For every page the embedded text layer is read, then the page is rendered
 * to a canvas at 2x scale and run through OCR. The OCR output is appended
 * under a "[Detected in images/formatting]" heading when it is longer than
 * 20 characters. Pages are processed in parallel through the shared OCR pool.
 *
 * @param file The PDF file.
 * @param onProgress Called with 0 at the start, then with the percentage of pages completed.
 * @returns All page sections joined by blank lines, trimmed.
 * @throws Whatever PDF loading, rendering or OCR rejects with. The OCR pool
 *   is terminated and the PDF document destroyed in every case.
 */
export async function extractPdfText(file: File, onProgress?: (p: number) => void): Promise<string> {
  const ab = await file.arrayBuffer();
  const pdf = await getDocumentProxy(new Uint8Array(ab));

  try {
    onProgress?.(0);

    // PDF pages are 1-based.
    const pageNumbers = Array.from({ length: pdf.numPages }, (_, i) => i + 1);

    const sections = await pool.process(pageNumbers, async (worker, pageNum) => {
      const page = await pdf.getPage(pageNum);

      // Text layer: join the item strings and collapse whitespace runs.
      const textContent = await page.getTextContent();
      const pageText = textContent.items
        // Items without a `str` contribute nothing rather than the text "undefined".
        .map((item: any) => item.str ?? "")
        .join(" ")
        .replace(/\s+/g, " ")
        .trim();

      const section = `--- Page ${pageNum} ---\n${pageText}\n`;

      const viewport = page.getViewport({ scale: 2.0 });
      const canvas = document.createElement("canvas");
      const context = canvas.getContext("2d");
      // Without a 2D context the page cannot be rendered: return the text layer only.
      if (!context) return section;

      canvas.height = viewport.height;
      canvas.width = viewport.width;
      await page.render({ canvasContext: context, viewport }).promise;

      const { data: { text: ocrText } } = await worker.recognize(canvas);
      const cleanedOcr = ocrText.trim();

      // OCR results of 20 characters or fewer are dropped.
      return cleanedOcr.length > 20
        ? `${section}\n[Detected in images/formatting]:\n${cleanedOcr}\n`
        : section;
    }, onProgress);

    return sections.join("\n\n").trim();
  } finally {
    // Always release the OCR workers and the PDF document, even when a page failed.
    try {
      await pool.terminate();
    } finally {
      await pdf.destroy();
    }
  }
}
|
|
/**
 * Extracts text from a DOCX file: the document's raw text plus OCR text for
 * every embedded image.
 *
 * The raw text comes from `mammoth.extractRawText`. Embedded images are
 * collected through a `mammoth.convertToHtml` pass whose HTML output is
 * discarded, then run through the shared OCR pool. Each non-empty OCR result
 * is appended under an "[Extracted from image]" heading.
 *
 * @param file The DOCX file.
 * @param onProgress Called with 10 after the file is read; with 30 and then
 *   rounded values up to 100 while images are recognized; or with 100 directly
 *   when there are no images.
 * @returns The combined text, trimmed.
 * @throws Whatever mammoth or OCR rejects with. The OCR pool is terminated
 *   whenever image processing was started.
 */
export async function extractDocxText(file: File, onProgress?: (p: number) => void): Promise<string> {
  const ab = await file.arrayBuffer();
  onProgress?.(10);

  const textResult = await mammoth.extractRawText({ arrayBuffer: ab });
  let fullText = (textResult.value ?? "").trim();

  // Second pass used only for its image callback; every image is collected as a Blob.
  const images: Blob[] = [];
  await mammoth.convertToHtml({ arrayBuffer: ab }, {
    convertImage: mammoth.images.imgElement(async (image) => {
      const buffer = await image.read();
      images.push(new Blob([new Uint8Array(buffer)], { type: image.contentType }));
      return { src: "" };
    })
  });

  if (images.length === 0) {
    onProgress?.(100);
    return fullText.trim();
  }

  onProgress?.(30);
  try {
    const ocrTexts = await pool.process(
      images,
      async (worker, imgBlob) => {
        const { data: { text } } = await worker.recognize(imgBlob);
        return text.trim();
      },
      // Map the pool's 0-100 onto the remaining 30-100 range, rounded so callers
      // only ever see whole percentages.
      (p) => onProgress?.(Math.round(30 + p * 0.7)),
    );

    for (const ocrText of ocrTexts) {
      if (ocrText) fullText += "\n\n[Extracted from image]:\n" + ocrText;
    }
  } finally {
    // Release the OCR workers even when recognition failed.
    await pool.terminate();
  }

  return fullText.trim();
}
|
|
/**
 * Extracts text from a file, choosing the extractor from the file name's
 * extension (case-insensitive).
 *
 * - `pdf` uses `extractPdfText`, sourceType "pdf"
 * - `docx` / `doc` use `extractDocxText`, sourceType "docx"
 * - `png` / `jpg` / `jpeg` / `webp` / `bmp` use `extractImageText`, sourceType "text"
 *
 * A name without any "." has no extension and is rejected, so a file that is
 * literally named "pdf" is not treated as a PDF.
 *
 * @param file The file to read.
 * @param onProgress Forwarded unchanged to the selected extractor.
 * @returns The extracted text and the kind of source it came from.
 * @throws Error("Unsupported file type for client-side extraction") for any other extension.
 */
export async function extractFileText(file: File, onProgress?: (p: number) => void): Promise<{ text: string; sourceType: "pdf" | "docx" | "text" }> {
  const dot = file.name.lastIndexOf(".");
  const ext = dot === -1 ? "" : file.name.slice(dot + 1).toLowerCase();

  if (ext === "pdf") {
    const text = await extractPdfText(file, onProgress);
    return { text, sourceType: "pdf" };
  }

  // NOTE(review): "doc" is routed to the DOCX extractor; confirm mammoth can
  // actually read legacy binary .doc files, otherwise this branch only defers
  // the failure.
  if (ext === "docx" || ext === "doc") {
    const text = await extractDocxText(file, onProgress);
    return { text, sourceType: "docx" };
  }

  const imageExtensions = ["png", "jpg", "jpeg", "webp", "bmp"];
  if (imageExtensions.includes(ext)) {
    const text = await extractImageText(file, onProgress);
    return { text, sourceType: "text" };
  }

  throw new Error("Unsupported file type for client-side extraction");
}
|
|