Spaces:

Aynkhn
/

SOURCE.IO

Sleeping

Adeen

feat: Expert Academic Data Extraction Agent upgrade with parallel OCR and premium UI

40d16d9 12 days ago

5.63 kB

	// Client-side text extraction for PDF, DOCX and Image files.
	import { extractText, getDocumentProxy } from "unpdf";
	import mammoth from "mammoth";
	import { createWorker } from "tesseract.js";

	/**
	 * Creates a tesseract.js worker for the "eng" language.
	 *
	 * Single creation point for OCR workers in this module, so every caller gets
	 * the same configuration.
	 */
	async function createOptimizedWorker() {
		return await createWorker("eng");
	}

	/**
	 * Parallel OCR processor: manages a pool of Tesseract workers to speed up
	 * multi-page extraction.
	 *
	 * Workers are created lazily by the first non-empty `process()` call (or an
	 * explicit `init()`) and stay alive until `terminate()` is called. Workers are
	 * kept loosely typed (`any`) because the pool only forwards them to the
	 * caller-supplied task and calls `terminate()` on them.
	 */
	class OcrPool {
		private workers: any[] = [];

		/** In-flight start-up, shared so overlapping `init()` calls spawn one set of workers. */
		private starting: Promise<void> | null = null;

		/**
		 * Pool size: the reported core count capped at 4. Falls back to 4 when the
		 * count is missing or 0 (hence `||`, not `??`) and when `navigator` does not
		 * exist, so constructing the pool never throws outside a browser.
		 */
		private concurrency = Math.min(
			(typeof navigator !== "undefined" ? navigator.hardwareConcurrency : 0) || 4,
			4,
		);

		/** Starts the workers if the pool is empty; a no-op when workers already exist. */
		async init(): Promise<void> {
			if (this.workers.length > 0) return;
			if (this.starting === null) {
				this.starting = this.spawnWorkers().finally(() => {
					this.starting = null;
				});
			}
			await this.starting;
		}

		/** Creates `concurrency` workers; if any creation fails, terminates the rest and rethrows. */
		private async spawnWorkers(): Promise<void> {
			const attempts = await Promise.allSettled(
				Array.from({ length: this.concurrency }, () => createOptimizedWorker()),
			);

			const started: any[] = [];
			const errors: unknown[] = [];
			for (const attempt of attempts) {
				if (attempt.status === "fulfilled") started.push(attempt.value);
				else errors.push(attempt.reason);
			}

			if (errors.length > 0) {
				// Do not leak the workers that did start when a sibling failed.
				await Promise.allSettled(started.map((w) => w.terminate()));
				throw errors[0];
			}
			this.workers = started;
		}

		/**
		 * Runs `task` once per item, spreading the items over the pool's workers.
		 *
		 * @param items - Inputs to process; an empty array returns `[]` without starting workers.
		 * @param task - Called with a worker, the item and the item's index; resolves to that item's text.
		 * @param onProgress - Optional callback receiving the rounded percentage of completed items.
		 * @returns The task results in the same order as `items`.
		 * @throws The first task (or progress callback) error. Once an error is recorded no
		 *   new items are dispatched, and the error is rethrown only after in-flight tasks
		 *   have settled, so the caller can safely terminate the pool afterwards.
		 */
		async process<T>(
			items: T[],
			task: (worker: any, item: T, index: number) => Promise<string>,
			onProgress?: (p: number) => void,
		): Promise<string[]> {
			if (items.length === 0) return [];
			await this.init();

			const results = new Array<string>(items.length);
			const pending = Array.from(items.entries());
			const failures: unknown[] = [];
			let cursor = 0;
			let completed = 0;

			// Each worker keeps claiming the next unclaimed item until none are left
			// or some task has failed.
			const drain = async (worker: any): Promise<void> => {
				while (failures.length === 0) {
					const next = pending[cursor++];
					if (next === undefined) return;
					const [index, item] = next;
					try {
						results[index] = await task(worker, item, index);
						completed++;
						onProgress?.(Math.round((completed / items.length) * 100));
					} catch (error) {
						failures.push(error);
						return;
					}
				}
			};

			await Promise.all(this.workers.map((worker) => drain(worker)));
			if (failures.length > 0) throw failures[0];
			return results;
		}

		/** Terminates every worker and empties the pool; the next `process()` re-creates them. */
		async terminate(): Promise<void> {
			// Detach first so a failing terminate() cannot leave dead workers in the pool.
			const retired = this.workers;
			this.workers = [];
			await Promise.all(retired.map((w) => w.terminate()));
		}
	}

	// Module-level OCR pool used by extractPdfText and extractDocxText; both
	// terminate its workers once they are done with a document.
	const pool = new OcrPool();

	/**
	 * Runs OCR on a single image and returns the recognised text.
	 *
	 * Uses a dedicated worker (not the shared pool) that lives only for this call.
	 *
	 * @param file - Image data passed to the worker's `recognize()`.
	 * @param onProgress - Optional callback; receives 10 once the worker exists and 100 after recognition.
	 * @returns The recognised text with surrounding whitespace removed.
	 * @throws Whatever worker creation or recognition rejects with; the worker is terminated first.
	 */
	export async function extractImageText(file: File | Blob, onProgress?: (p: number) => void): Promise<string> {
		const worker = await createOptimizedWorker();
		try {
			onProgress?.(10);
			const { data: { text } } = await worker.recognize(file);
			onProgress?.(100);
			return text.trim();
		} finally {
			// Always release the worker, including when recognition or a progress callback throws.
			await worker.terminate();
		}
	}

	/**
	 * Extracts text from every page of a PDF, combining the embedded (selectable)
	 * text with OCR of the rendered page.
	 *
	 * Each page contributes a `--- Page N ---` header followed by its selectable
	 * text. The page is also rendered to a canvas at scale 2.0 and OCR'd; the OCR
	 * text is appended under `[Detected in images/formatting]:` only when it is
	 * longer than 20 characters. Pages are processed through the shared OCR pool.
	 *
	 * @param file - The PDF file to read.
	 * @param onProgress - Optional callback; receives 0 up front, then the pool's completion percentage.
	 * @returns All page sections joined by blank lines, trimmed.
	 * @throws Whatever loading, rendering or OCR rejects with. The pool workers and the
	 *   PDF document are released in every case.
	 */
	export async function extractPdfText(file: File, onProgress?: (p: number) => void): Promise<string> {
		const ab = await file.arrayBuffer();
		const pdf = await getDocumentProxy(new Uint8Array(ab));

		try {
			onProgress?.(0);

			const pageNumbers = Array.from({ length: pdf.numPages }, (_, i) => i + 1);

			const pages = await pool.process(pageNumbers, async (worker, pageNum) => {
				const page = await pdf.getPage(pageNum);

				// 1. Selectable text. Items without a `str` field contribute an empty
				//    string; the whitespace collapse below absorbs the extra separators.
				const textContent = await page.getTextContent();
				const pageText = textContent.items
					.map((item) => ("str" in item ? item.str : ""))
					.join(" ")
					.replace(/\s+/g, " ")
					.trim();

				let out = `--- Page ${pageNum} ---\n${pageText}\n`;

				// 2. Render and OCR the page as well, to pick up text that only exists in images.
				const viewport = page.getViewport({ scale: 2.0 });
				const canvas = document.createElement("canvas");
				const context = canvas.getContext("2d");
				if (context) {
					try {
						canvas.height = viewport.height;
						canvas.width = viewport.width;
						// @ts-ignore -- kept from the original; presumably the render() parameter typing rejects this call shape.
						await page.render({ canvasContext: context, viewport }).promise;

						const { data: { text: ocrText } } = await worker.recognize(canvas);
						const cleanedOcr = ocrText.trim();
						// Very short OCR output is treated as noise and dropped.
						if (cleanedOcr.length > 20) {
							out += `\n[Detected in images/formatting]:\n${cleanedOcr}\n`;
						}
					} finally {
						// Shrink the canvas so its pixel buffer can be reclaimed promptly
						// instead of lingering until garbage collection.
						canvas.width = 0;
						canvas.height = 0;
					}
				}
				return out;
			}, onProgress);

			return pages.join("\n\n").trim();
		} finally {
			// Workers are terminated after every document to avoid memory pressure in a
			// long session; doing it in `finally` also covers failed extractions.
			try {
				await pool.terminate();
			} finally {
				await pdf.destroy();
			}
		}
	}

	/**
	 * Extracts the raw text of a DOCX file and appends OCR text for each embedded image.
	 *
	 * The document is run through `mammoth.convertToHtml` purely to collect its
	 * images via the image-converter hook; the generated HTML is discarded. Every
	 * image that yields non-empty OCR text adds an `[Extracted from image]:` section.
	 *
	 * @param file - The DOCX file to read.
	 * @param onProgress - Optional callback; receives 10 after reading the file, then either
	 *   100 (no images) or 30 followed by whole-number values scaled into 30-100 during OCR.
	 * @returns The document text plus any image sections, trimmed.
	 * @throws Whatever mammoth or OCR rejects with; pool workers are terminated even on failure.
	 */
	export async function extractDocxText(file: File, onProgress?: (p: number) => void): Promise<string> {
		const ab = await file.arrayBuffer();
		onProgress?.(10);

		const textResult = await mammoth.extractRawText({ arrayBuffer: ab });
		const bodyText = (textResult.value ?? "").trim();

		const images: Blob[] = [];
		await mammoth.convertToHtml({ arrayBuffer: ab }, {
			convertImage: mammoth.images.imgElement(async (image) => {
				const buffer = await image.read();
				images.push(new Blob([new Uint8Array(buffer)], { type: image.contentType }));
				// The HTML output is thrown away, so the <img> source does not matter.
				return { src: "" };
			}),
		});

		if (images.length === 0) {
			onProgress?.(100);
			return bodyText;
		}

		onProgress?.(30);
		let imageTexts: string[];
		try {
			imageTexts = await pool.process(
				images,
				async (worker, imgBlob) => {
					const { data: { text } } = await worker.recognize(imgBlob);
					return text.trim();
				},
				// Scale the pool's 0-100 into 30-100, rounded so callers only see whole numbers.
				(p) => onProgress?.(Math.round(30 + p * 0.7)),
			);
		} finally {
			await pool.terminate();
		}

		let fullText = bodyText;
		for (const imageText of imageTexts) {
			if (imageText) fullText += "\n\n[Extracted from image]:\n" + imageText;
		}
		return fullText.trim();
	}

	/** Image extensions that are routed to OCR by extractFileText. */
	const OCR_IMAGE_EXTENSIONS: readonly string[] = ["png", "jpg", "jpeg", "webp", "bmp"];

	/**
	 * Dispatches a file to the matching extractor based on its file-name extension.
	 *
	 * The extension is the text after the last `.` in `file.name`, compared
	 * case-insensitively; a name without any `.` has no extension and is rejected.
	 *
	 * @param file - The file to extract text from.
	 * @param onProgress - Optional progress callback forwarded to the chosen extractor.
	 * @returns The extracted text and its source type. Images are reported as `"text"`.
	 * @throws Error when the extension is not one of pdf, docx, doc, png, jpg, jpeg, webp, bmp.
	 */
	export async function extractFileText(file: File, onProgress?: (p: number) => void): Promise<{ text: string; sourceType: "pdf" | "docx" | "text" }> {
		const dot = file.name.lastIndexOf(".");
		const ext = dot >= 0 ? file.name.slice(dot + 1).toLowerCase() : "";

		if (ext === "pdf") {
			const text = await extractPdfText(file, onProgress);
			return { text, sourceType: "pdf" };
		}

		// NOTE(review): legacy ".doc" goes through the same mammoth-based path as
		// ".docx"; mammoth is presumably .docx-only, so true binary .doc files may
		// fail there — confirm whether ".doc" should be accepted at all.
		if (ext === "docx" || ext === "doc") {
			const text = await extractDocxText(file, onProgress);
			return { text, sourceType: "docx" };
		}

		if (OCR_IMAGE_EXTENSIONS.includes(ext)) {
			const text = await extractImageText(file, onProgress);
			return { text, sourceType: "text" };
		}

		throw new Error("Unsupported file type for client-side extraction");
	}