// SOURCE.IO / src/lib/extract.ts
// Adeen
// feat: Expert Academic Data Extraction Agent upgrade with parallel OCR and premium UI
// 40d16d9
// Client-side text extraction for PDF, DOCX and Image files.
import { extractText, getDocumentProxy } from "unpdf";
import mammoth from "mammoth";
import { createWorker } from "tesseract.js";
/**
 * Creates a Tesseract worker configured for the "eng" language.
 *
 * @returns The worker produced by `createWorker("eng")`.
 */
async function createOptimizedWorker() {
  return await createWorker("eng");
}
/**
 * Parallel OCR processor: manages a pool of Tesseract workers to speed up
 * multi-page extraction.
 *
 * Workers are created lazily by `init()` (at most 4 of them) and stay alive
 * until `terminate()` is called. `process()` hands items to whichever worker
 * is free next and returns the results in input order.
 */
class OcrPool {
  // Workers come from createOptimizedWorker(); kept loosely typed so the task
  // callback signature stays unchanged for callers.
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  private workers: any[] = [];

  // In-flight initialisation, shared by concurrent init() callers so that only
  // one set of workers is ever spawned.
  private initPromise: Promise<void> | null = null;

  // Cap at 4 workers. `navigator` is guarded so constructing the pool does not
  // throw in environments where it is not defined.
  private readonly concurrency = Math.min(
    (typeof navigator !== "undefined" && navigator.hardwareConcurrency) || 4,
    4,
  );

  /**
   * Spawns the workers if the pool is empty. Safe to call repeatedly and
   * concurrently: overlapping calls wait on the same initialisation.
   *
   * @throws Whatever worker creation rejected with; workers that did start
   *         are terminated before the error is rethrown.
   */
  async init(): Promise<void> {
    if (this.workers.length > 0) return;
    if (!this.initPromise) {
      this.initPromise = this.spawnWorkers().finally(() => {
        this.initPromise = null;
      });
    }
    await this.initPromise;
  }

  /** Creates `concurrency` workers, cleaning up the survivors if any creation fails. */
  private async spawnWorkers(): Promise<void> {
    const settled = await Promise.allSettled(
      Array.from({ length: this.concurrency }, () => createOptimizedWorker()),
    );

    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    const ready: any[] = [];
    let failure: PromiseRejectedResult | undefined;
    for (const outcome of settled) {
      if (outcome.status === "fulfilled") {
        ready.push(outcome.value);
      } else if (!failure) {
        failure = outcome;
      }
    }

    if (failure) {
      // Do not leak the workers that started successfully.
      await Promise.allSettled(ready.map((w) => w.terminate()));
      throw failure.reason;
    }
    this.workers = ready;
  }

  /**
   * Runs `task` once per item, distributing items across the pool's workers.
   *
   * @param items      Inputs to process; an empty array resolves to `[]`
   *                   without spawning workers.
   * @param task       Called with a worker, the item and the item's index;
   *                   resolves to the text for that item.
   * @param onProgress Optional callback receiving the rounded percentage of
   *                   completed items (0-100) after each item finishes.
   * @returns Task results in the same order as `items`.
   * @throws The first error raised by `task` (or `onProgress`). No further
   *         items are started after a failure, and the promise only rejects
   *         once every in-flight task has settled, so no worker is still busy
   *         when the caller reacts (e.g. by terminating the pool).
   */
  async process<T>(
    items: T[],
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    task: (worker: any, item: T, index: number) => Promise<string>,
    onProgress?: (p: number) => void,
  ): Promise<string[]> {
    const total = items.length;
    if (total === 0) {
      if (onProgress) onProgress(100);
      return [];
    }

    await this.init();

    const results = new Array<string>(total);
    let completed = 0;
    let nextIndex = 0; // shared cursor: each runner claims the next unprocessed item
    let failed = false;
    let firstError: unknown;

    // Never start more runners than there are items.
    const runners = this.workers.slice(0, total).map(async (worker) => {
      while (!failed && nextIndex < total) {
        const index = nextIndex++;
        try {
          results[index] = await task(worker, items[index] as T, index);
          completed++;
          if (onProgress) onProgress(Math.round((completed / total) * 100));
        } catch (err) {
          // Remember only the first failure and let the other runners wind down.
          if (!failed) {
            failed = true;
            firstError = err;
          }
          return;
        }
      }
    });

    await Promise.all(runners);
    if (failed) throw firstError;
    return results;
  }

  /**
   * Terminates every worker and empties the pool. The pool is emptied before
   * termination starts, so a failing `terminate()` cannot leave dead workers
   * behind for the next `init()`.
   *
   * @throws The first rejection from a worker's `terminate()`, after all
   *         workers have been given the chance to shut down.
   */
  async terminate(): Promise<void> {
    const doomed = this.workers;
    this.workers = [];
    const outcomes = await Promise.allSettled(doomed.map((w) => w.terminate()));
    const rejected = outcomes.find(
      (o): o is PromiseRejectedResult => o.status === "rejected",
    );
    if (rejected) throw rejected.reason;
  }
}
// Module-level OCR pool shared by extractPdfText and extractDocxText; both
// call pool.terminate() once they are done with it.
const pool = new OcrPool();
/**
 * Runs OCR on a single image and returns the recognised text.
 *
 * A dedicated worker is created for the call and is always terminated
 * afterwards, including when recognition fails.
 *
 * @param file       Image to recognise.
 * @param onProgress Optional callback; receives 10 once the worker is ready
 *                   and 100 once recognition has finished.
 * @returns The recognised text with surrounding whitespace trimmed.
 * @throws Whatever worker creation or `recognize` rejects with.
 */
export async function extractImageText(file: File | Blob, onProgress?: (p: number) => void): Promise<string> {
  const worker = await createOptimizedWorker();
  try {
    if (onProgress) onProgress(10);
    const { data: { text } } = await worker.recognize(file);
    if (onProgress) onProgress(100);
    return text.trim();
  } finally {
    // Release the worker even if recognition (or the progress callback) threw.
    await worker.terminate();
  }
}
/**
 * Extracts text from every page of a PDF, combining the page's selectable
 * text layer with OCR of the rendered page.
 *
 * Each page contributes a section starting with `--- Page N ---` followed by
 * the text-layer content. When OCR of the rendered page yields more than 20
 * characters, it is appended under `[Detected in images/formatting]:`.
 * Sections are joined with blank lines.
 *
 * @param file       The PDF file to read.
 * @param onProgress Optional callback; receives 0 at the start and then the
 *                   percentage of pages completed.
 * @returns The combined text of all pages, trimmed.
 * @throws Whatever PDF loading, rendering or OCR rejects with. The OCR pool
 *         and the PDF document are released in either case.
 */
export async function extractPdfText(file: File, onProgress?: (p: number) => void): Promise<string> {
  const ab = await file.arrayBuffer();
  const pdf = await getDocumentProxy(new Uint8Array(ab));

  try {
    if (onProgress) onProgress(0);

    // Page numbers are 1-based.
    const pageNumbers = Array.from({ length: pdf.numPages }, (_, i) => i + 1);

    const results = await pool.process(pageNumbers, async (worker, pageNum) => {
      const page = await pdf.getPage(pageNum);

      // 1. Selectable text, with whitespace runs collapsed to single spaces.
      const textContent = await page.getTextContent();
      const pageText = textContent.items
        // eslint-disable-next-line @typescript-eslint/no-explicit-any
        .map((item: any) => item.str)
        .join(" ")
        .replace(/\s+/g, " ")
        .trim();
      const section = `--- Page ${pageNum} ---\n${pageText}\n`;

      // 2. Render the page at 2x scale and OCR it to pick up text inside images.
      const viewport = page.getViewport({ scale: 2.0 });
      const canvas = document.createElement("canvas");
      const context = canvas.getContext("2d");
      if (!context) return section; // no 2D context: fall back to the text layer only

      try {
        canvas.height = viewport.height;
        canvas.width = viewport.width;
        // NOTE(review): suppression kept from the original; presumably the
        // render-parameter typings differ between pdf.js versions.
        // @ts-ignore
        await page.render({ canvasContext: context, viewport }).promise;
        const { data: { text: ocrText } } = await worker.recognize(canvas);
        const cleanedOcr = ocrText.trim();

        // Ignore very short OCR output (20 characters or fewer).
        return cleanedOcr.length > 20
          ? `${section}\n[Detected in images/formatting]:\n${cleanedOcr}\n`
          : section;
      } finally {
        // Shrink the canvas so its backing store can be reclaimed promptly.
        canvas.width = 0;
        canvas.height = 0;
      }
    }, onProgress);

    return results.join("\n\n").trim();
  } finally {
    // Workers are not kept alive between documents, to avoid memory pressure
    // in a long session; release them and the parsed PDF even on failure.
    try {
      await pool.terminate();
    } finally {
      await pdf.destroy();
    }
  }
}
/**
 * Extracts text from a DOCX file: the document's raw text plus OCR of every
 * embedded image.
 *
 * Non-empty OCR results are appended to the raw text, each under an
 * `[Extracted from image]:` heading.
 *
 * @param file       The DOCX file to read.
 * @param onProgress Optional callback; receives 10 after the file is read,
 *                   30 once images have been collected, then integer values
 *                   scaled into 30-100 while images are recognised (or 100
 *                   directly when the document has no images).
 * @returns The combined text, trimmed.
 * @throws Whatever mammoth or OCR rejects with. The OCR pool is terminated
 *         whether or not image recognition succeeds.
 */
export async function extractDocxText(file: File, onProgress?: (p: number) => void): Promise<string> {
  const ab = await file.arrayBuffer();
  if (onProgress) onProgress(10);

  const textResult = await mammoth.extractRawText({ arrayBuffer: ab });
  let fullText = (textResult.value ?? "").trim();

  // The HTML conversion is run only for its image callback: each embedded
  // image is captured as a Blob and the generated HTML is discarded.
  const images: Blob[] = [];
  await mammoth.convertToHtml({ arrayBuffer: ab }, {
    convertImage: mammoth.images.imgElement(async (image) => {
      const buffer = await image.read();
      images.push(new Blob([new Uint8Array(buffer)], { type: image.contentType }));
      return { src: "" };
    })
  });

  if (images.length === 0) {
    if (onProgress) onProgress(100);
    return fullText;
  }

  if (onProgress) onProgress(30);
  try {
    const results = await pool.process(images, async (worker, imgBlob) => {
      const { data: { text } } = await worker.recognize(imgBlob);
      return text.trim();
    }, (p) => onProgress?.(Math.round(30 + p * 0.7))); // scale 0-100 into 30-100, kept integral

    for (const t of results) {
      if (t) fullText += "\n\n[Extracted from image]:\n" + t;
    }
  } finally {
    // Release the OCR workers even if recognition failed part-way through.
    await pool.terminate();
  }

  return fullText.trim();
}
/**
 * Dispatches a file to the matching extractor based on its file-name
 * extension (case-insensitive).
 *
 * - `pdf`                               -> extractPdfText,   sourceType "pdf"
 * - `docx`, `doc`                       -> extractDocxText,  sourceType "docx"
 * - `png`, `jpg`, `jpeg`, `webp`, `bmp` -> extractImageText, sourceType "text"
 *
 * @param file       The file to extract text from.
 * @param onProgress Optional progress callback forwarded to the extractor.
 * @returns The extracted text together with the source type.
 * @throws Error when the extension is not one of the supported types.
 */
export async function extractFileText(file: File, onProgress?: (p: number) => void): Promise<{ text: string; sourceType: "pdf" | "docx" | "text" }> {
  const nameParts = file.name.split(".");
  const ext = nameParts[nameParts.length - 1]?.toLowerCase() ?? "";

  switch (ext) {
    case "pdf":
      return { text: await extractPdfText(file, onProgress), sourceType: "pdf" };

    case "docx":
    case "doc":
      return { text: await extractDocxText(file, onProgress), sourceType: "docx" };

    case "png":
    case "jpg":
    case "jpeg":
    case "webp":
    case "bmp":
      return { text: await extractImageText(file, onProgress), sourceType: "text" };

    default:
      throw new Error("Unsupported file type for client-side extraction");
  }
}