// Client-side text extraction for PDF, DOCX and Image files.
import { extractText, getDocumentProxy } from "unpdf";
import mammoth from "mammoth";
import { createWorker } from "tesseract.js";
/** Spawns a single Tesseract worker preloaded with English language data. */
async function createOptimizedWorker() {
  return await createWorker("eng");
}
/**
 * Parallel OCR processor: manages a fixed pool of Tesseract workers so that
 * multi-page / multi-image extraction jobs can run concurrently instead of
 * serially. Results are always returned in input order.
 */
class OcrPool {
  /** Live Tesseract workers; empty until init() runs (and again after terminate()). */
  private workers: any[] = [];
  /** Pool size: one worker per logical core, capped at 4 to bound WASM memory use. */
  private concurrency = Math.min(navigator.hardwareConcurrency || 4, 4);

  /** Lazily spins up the worker pool; a no-op if workers already exist. */
  async init(): Promise<void> {
    if (this.workers.length > 0) return;
    const pending: Promise<any>[] = [];
    for (let i = 0; i < this.concurrency; i++) {
      pending.push(createOptimizedWorker());
    }
    this.workers = await Promise.all(pending);
  }

  /**
   * Runs `task` once per item, distributing items across the worker pool.
   *
   * @param items      Work items (e.g. page numbers or image blobs).
   * @param task       Async callback invoked with (worker, item, index); returns the item's text.
   * @param onProgress Optional callback receiving 0-100 completion percentage.
   * @returns          Per-item results, index-aligned with `items` regardless of completion order.
   */
  async process<T>(
    items: readonly T[],
    task: (worker: any, item: T, index: number) => Promise<string>,
    onProgress?: (p: number) => void,
  ): Promise<string[]> {
    await this.init();
    const results = new Array<string>(items.length);
    let completed = 0;

    const execute = async (worker: any, index: number): Promise<void> => {
      results[index] = await task(worker, items[index], index);
      completed++;
      // Guard the division so an empty batch can never report NaN progress.
      if (onProgress && items.length > 0) {
        onProgress(Math.round((completed / items.length) * 100));
      }
    };

    // Shared pull-queue: each worker drains the next pending index until empty.
    // queue.shift() is synchronous, so two runners can never claim the same index.
    const queue = [...items.keys()];
    const runners = this.workers.map(async (worker) => {
      while (queue.length > 0) {
        const index = queue.shift();
        if (index !== undefined) {
          await execute(worker, index);
        }
      }
    });
    await Promise.all(runners);
    return results;
  }

  /** Shuts down every worker and empties the pool; init() may be called again later. */
  async terminate(): Promise<void> {
    await Promise.all(this.workers.map((w) => w.terminate()));
    this.workers = [];
  }
}
// Module-level singleton shared by the PDF and DOCX extractors below.
const pool = new OcrPool();
/**
 * OCRs a single image file/blob with a dedicated Tesseract worker.
 *
 * @param file       Image to recognize (png/jpg/jpeg/webp/bmp).
 * @param onProgress Optional 0-100 progress callback (coarse: 10 on start, 100 on finish).
 * @returns          Trimmed recognized text.
 */
export async function extractImageText(file: File | Blob, onProgress?: (p: number) => void): Promise<string> {
  const worker = await createOptimizedWorker();
  try {
    if (onProgress) onProgress(10);
    const { data: { text } } = await worker.recognize(file);
    if (onProgress) onProgress(100);
    return text.trim();
  } finally {
    // BUG FIX: terminate in finally so a failed recognize() doesn't leak the worker.
    await worker.terminate();
  }
}
/**
 * Extracts text from every page of a PDF: the selectable text layer plus an
 * OCR pass over the rendered page, so text embedded in images is captured too.
 *
 * @param file       The PDF file to process.
 * @param onProgress Optional 0-100 progress callback (per completed page).
 * @returns          All pages joined, each prefixed with a "--- Page N ---" header.
 */
export async function extractPdfText(file: File, onProgress?: (p: number) => void): Promise<string> {
  const ab = await file.arrayBuffer();
  const pdf = await getDocumentProxy(new Uint8Array(ab));
  const numPages = pdf.numPages;
  if (onProgress) onProgress(0);
  const pageIndices = Array.from({ length: numPages }, (_, i) => i + 1);
  try {
    const results = await pool.process(pageIndices, async (worker, pageNum) => {
      const page = await pdf.getPage(pageNum);
      // 1. Selectable text layer.
      const textContent = await page.getTextContent();
      const pageText = textContent.items
        .map((item: any) => item.str)
        .join(" ")
        .replace(/\s+/g, " ")
        .trim();
      let out = `--- Page ${pageNum} ---\n${pageText}\n`;
      // 2. Render the page to a canvas and OCR it to capture image-embedded text.
      const viewport = page.getViewport({ scale: 2.0 }); // 2x scale improves OCR accuracy
      const canvas = document.createElement("canvas");
      const context = canvas.getContext("2d");
      if (context) {
        canvas.height = viewport.height;
        canvas.width = viewport.width;
        // @ts-ignore - the render params type doesn't know about the DOM canvas context
        await page.render({ canvasContext: context, viewport }).promise;
        const { data: { text: ocrText } } = await worker.recognize(canvas);
        const cleanedOcr = ocrText.trim();
        // Very short OCR output on a text page is usually noise; keep only substantial hits.
        if (cleanedOcr.length > 20) {
          out += `\n[Detected in images/formatting]:\n${cleanedOcr}\n`;
        }
      }
      // If no 2d context is available, fall back to the text layer alone.
      return out;
    }, onProgress);
    return results.join("\n\n").trim();
  } finally {
    // BUG FIX: terminate in finally so a failed page doesn't leak the whole worker
    // pool. We release rather than keep workers alive to avoid memory pressure
    // in a long session; the pool re-initializes on the next document.
    await pool.terminate();
  }
}
/**
 * Extracts text from a DOCX file: the raw document text via mammoth, plus an
 * OCR pass over any embedded images.
 *
 * @param file       The DOCX file to process.
 * @param onProgress Optional 0-100 progress callback (10 after load, 30-100 during image OCR).
 * @returns          Document text, with OCR'd image text appended in labeled sections.
 */
export async function extractDocxText(file: File, onProgress?: (p: number) => void): Promise<string> {
  const ab = await file.arrayBuffer();
  if (onProgress) onProgress(10);
  // 1. Plain document text.
  const textResult = await mammoth.extractRawText({ arrayBuffer: ab });
  let fullText = (textResult.value ?? "").trim();
  // 2. Collect embedded images; the HTML conversion output itself is discarded —
  //    convertToHtml is only used here as a way to visit every image.
  const images: Blob[] = [];
  await mammoth.convertToHtml({ arrayBuffer: ab }, {
    convertImage: mammoth.images.imgElement(async (image) => {
      const buffer = await image.read();
      images.push(new Blob([new Uint8Array(buffer)], { type: image.contentType }));
      return { src: "" }; // we only need the bytes, not a usable <img> tag
    })
  });
  if (images.length > 0) {
    if (onProgress) onProgress(30);
    try {
      const results = await pool.process(images, async (worker, imgBlob) => {
        const { data: { text } } = await worker.recognize(imgBlob);
        return text.trim();
      }, (p) => onProgress?.(Math.round(30 + p * 0.7))); // map pool's 0-100 onto 30-100, integer
      for (const t of results) {
        if (t) fullText += "\n\n[Extracted from image]:\n" + t;
      }
    } finally {
      // BUG FIX: release the workers even if OCR fails mid-batch.
      await pool.terminate();
    }
  } else {
    if (onProgress) onProgress(100);
  }
  return fullText.trim();
}
/**
 * Routes a file to the appropriate extractor based on its filename extension.
 *
 * @param file       The file to extract text from.
 * @param onProgress Optional 0-100 progress callback forwarded to the extractor.
 * @returns          The extracted text plus the source type it was extracted as.
 * @throws Error when the extension is not a supported type.
 */
export async function extractFileText(file: File, onProgress?: (p: number) => void): Promise<{ text: string; sourceType: "pdf" | "docx" | "text" }> {
  const ext = file.name.split(".").pop()?.toLowerCase() ?? "";

  if (ext === "pdf") {
    return { text: await extractPdfText(file, onProgress), sourceType: "pdf" };
  }

  // NOTE(review): legacy ".doc" is routed through the DOCX extractor as well —
  // confirm mammoth actually handles the old binary format for these inputs.
  if (ext === "docx" || ext === "doc") {
    return { text: await extractDocxText(file, onProgress), sourceType: "docx" };
  }

  const imageExtensions = new Set(["png", "jpg", "jpeg", "webp", "bmp"]);
  if (imageExtensions.has(ext)) {
    return { text: await extractImageText(file, onProgress), sourceType: "text" };
  }

  throw new Error("Unsupported file type for client-side extraction");
}