// Client-side text extraction for PDF, DOCX and Image files.
import { extractText, getDocumentProxy } from "unpdf";
import mammoth from "mammoth";
import { createWorker } from "tesseract.js";
/** Spawns a single Tesseract worker preloaded with English language data. */
async function createOptimizedWorker() {
  return await createWorker("eng");
}
/**
 * Parallel OCR processor: manages a fixed pool of Tesseract workers so that
 * multi-page / multi-image extraction jobs can run concurrently instead of
 * serially. Results are always returned in input order.
 */
class OcrPool {
  /** Live Tesseract workers; empty until init() runs (and again after terminate()). */
  private workers: any[] = [];
  /** Pool size: one worker per logical core, capped at 4 to bound WASM memory use. */
  private concurrency = Math.min(navigator.hardwareConcurrency || 4, 4);

  /** Lazily spins up the worker pool; a no-op if workers already exist. */
  async init(): Promise<void> {
    if (this.workers.length > 0) return;
    const pending: Promise<any>[] = [];
    for (let i = 0; i < this.concurrency; i++) {
      pending.push(createOptimizedWorker());
    }
    this.workers = await Promise.all(pending);
  }

  /**
   * Runs `task` once per item, distributing items across the worker pool.
   *
   * @param items      Work items (e.g. page numbers or image blobs).
   * @param task       Async callback invoked with (worker, item, index); returns the item's text.
   * @param onProgress Optional callback receiving 0-100 completion percentage.
   * @returns          Per-item results, index-aligned with `items` regardless of completion order.
   */
  async process<T>(
    items: readonly T[],
    task: (worker: any, item: T, index: number) => Promise<string>,
    onProgress?: (p: number) => void,
  ): Promise<string[]> {
    await this.init();
    const results = new Array<string>(items.length);
    let completed = 0;

    const execute = async (worker: any, index: number): Promise<void> => {
      results[index] = await task(worker, items[index], index);
      completed++;
      // Guard the division so an empty batch can never report NaN progress.
      if (onProgress && items.length > 0) {
        onProgress(Math.round((completed / items.length) * 100));
      }
    };

    // Shared pull-queue: each worker drains the next pending index until empty.
    // queue.shift() is synchronous, so two runners can never claim the same index.
    const queue = [...items.keys()];
    const runners = this.workers.map(async (worker) => {
      while (queue.length > 0) {
        const index = queue.shift();
        if (index !== undefined) {
          await execute(worker, index);
        }
      }
    });
    await Promise.all(runners);
    return results;
  }

  /** Shuts down every worker and empties the pool; init() may be called again later. */
  async terminate(): Promise<void> {
    await Promise.all(this.workers.map((w) => w.terminate()));
    this.workers = [];
  }
}
// Module-level singleton shared by the PDF and DOCX extractors below.
const pool = new OcrPool();
/**
 * OCRs a single image file/blob with a dedicated Tesseract worker.
 *
 * @param file       Image to recognize (png/jpg/jpeg/webp/bmp).
 * @param onProgress Optional 0-100 progress callback (coarse: 10 on start, 100 on finish).
 * @returns          Trimmed recognized text.
 */
export async function extractImageText(file: File | Blob, onProgress?: (p: number) => void): Promise<string> {
  const worker = await createOptimizedWorker();
  try {
    if (onProgress) onProgress(10);
    const { data: { text } } = await worker.recognize(file);
    if (onProgress) onProgress(100);
    return text.trim();
  } finally {
    // BUG FIX: terminate in finally so a failed recognize() doesn't leak the worker.
    await worker.terminate();
  }
}
/**
 * Extracts text from every page of a PDF: the selectable text layer plus an
 * OCR pass over the rendered page, so text embedded in images is captured too.
 *
 * @param file       The PDF file to process.
 * @param onProgress Optional 0-100 progress callback (per completed page).
 * @returns          All pages joined, each prefixed with a "--- Page N ---" header.
 */
export async function extractPdfText(file: File, onProgress?: (p: number) => void): Promise<string> {
  const ab = await file.arrayBuffer();
  const pdf = await getDocumentProxy(new Uint8Array(ab));
  const numPages = pdf.numPages;
  if (onProgress) onProgress(0);
  const pageIndices = Array.from({ length: numPages }, (_, i) => i + 1);
  try {
    const results = await pool.process(pageIndices, async (worker, pageNum) => {
      const page = await pdf.getPage(pageNum);
      // 1. Selectable text layer.
      const textContent = await page.getTextContent();
      const pageText = textContent.items
        .map((item: any) => item.str)
        .join(" ")
        .replace(/\s+/g, " ")
        .trim();
      let out = `--- Page ${pageNum} ---\n${pageText}\n`;
      // 2. Render the page to a canvas and OCR it to capture image-embedded text.
      const viewport = page.getViewport({ scale: 2.0 }); // 2x scale improves OCR accuracy
      const canvas = document.createElement("canvas");
      const context = canvas.getContext("2d");
      if (context) {
        canvas.height = viewport.height;
        canvas.width = viewport.width;
        // @ts-ignore - the render params type doesn't know about the DOM canvas context
        await page.render({ canvasContext: context, viewport }).promise;
        const { data: { text: ocrText } } = await worker.recognize(canvas);
        const cleanedOcr = ocrText.trim();
        // Very short OCR output on a text page is usually noise; keep only substantial hits.
        if (cleanedOcr.length > 20) {
          out += `\n[Detected in images/formatting]:\n${cleanedOcr}\n`;
        }
      }
      // If no 2d context is available, fall back to the text layer alone.
      return out;
    }, onProgress);
    return results.join("\n\n").trim();
  } finally {
    // BUG FIX: terminate in finally so a failed page doesn't leak the whole worker
    // pool. We release rather than keep workers alive to avoid memory pressure
    // in a long session; the pool re-initializes on the next document.
    await pool.terminate();
  }
}
/**
 * Extracts text from a DOCX file: the raw document text via mammoth, plus an
 * OCR pass over any embedded images.
 *
 * @param file       The DOCX file to process.
 * @param onProgress Optional 0-100 progress callback (10 after load, 30-100 during image OCR).
 * @returns          Document text, with OCR'd image text appended in labeled sections.
 */
export async function extractDocxText(file: File, onProgress?: (p: number) => void): Promise<string> {
  const ab = await file.arrayBuffer();
  if (onProgress) onProgress(10);
  // 1. Plain document text.
  const textResult = await mammoth.extractRawText({ arrayBuffer: ab });
  let fullText = (textResult.value ?? "").trim();
  // 2. Collect embedded images; the HTML conversion output itself is discarded —
  //    convertToHtml is only used here as a way to visit every image.
  const images: Blob[] = [];
  await mammoth.convertToHtml({ arrayBuffer: ab }, {
    convertImage: mammoth.images.imgElement(async (image) => {
      const buffer = await image.read();
      images.push(new Blob([new Uint8Array(buffer)], { type: image.contentType }));
      return { src: "" }; // we only need the bytes, not a usable <img> tag
    })
  });
  if (images.length > 0) {
    if (onProgress) onProgress(30);
    try {
      const results = await pool.process(images, async (worker, imgBlob) => {
        const { data: { text } } = await worker.recognize(imgBlob);
        return text.trim();
      }, (p) => onProgress?.(Math.round(30 + p * 0.7))); // map pool's 0-100 onto 30-100, integer
      for (const t of results) {
        if (t) fullText += "\n\n[Extracted from image]:\n" + t;
      }
    } finally {
      // BUG FIX: release the workers even if OCR fails mid-batch.
      await pool.terminate();
    }
  } else {
    if (onProgress) onProgress(100);
  }
  return fullText.trim();
}
/**
 * Routes a file to the appropriate extractor based on its filename extension.
 *
 * @param file       The file to extract text from.
 * @param onProgress Optional 0-100 progress callback forwarded to the extractor.
 * @returns          The extracted text plus the source type it was extracted as.
 * @throws Error when the extension is not a supported type.
 */
export async function extractFileText(file: File, onProgress?: (p: number) => void): Promise<{ text: string; sourceType: "pdf" | "docx" | "text" }> {
  const ext = file.name.split(".").pop()?.toLowerCase() ?? "";

  if (ext === "pdf") {
    return { text: await extractPdfText(file, onProgress), sourceType: "pdf" };
  }

  // NOTE(review): legacy ".doc" is routed through the DOCX extractor as well —
  // confirm mammoth actually handles the old binary format for these inputs.
  if (ext === "docx" || ext === "doc") {
    return { text: await extractDocxText(file, onProgress), sourceType: "docx" };
  }

  const imageExtensions = new Set(["png", "jpg", "jpeg", "webp", "bmp"]);
  if (imageExtensions.has(ext)) {
    return { text: await extractImageText(file, onProgress), sourceType: "text" };
  }

  throw new Error("Unsupported file type for client-side extraction");
}