// Client-side text extraction for PDF, DOCX, and image files.
import { extractText, getDocumentProxy } from "unpdf";
import mammoth from "mammoth";
import { createWorker } from "tesseract.js";

/**
 * Creates a Tesseract worker by calling `createWorker` with the "eng"
 * language argument and resolves with that worker.
 */
async function createOptimizedWorker() {
  return createWorker("eng");
}

/** Worker instance type produced by `createOptimizedWorker`. */
type OcrWorker = Awaited<ReturnType<typeof createOptimizedWorker>>;

/**
 * Parallel OCR processor: manages a pool of Tesseract workers to speed up
 * multi-page / multi-image extraction.
 *
 * Workers are created lazily by `init()` (at most 4, fewer when the host
 * reports fewer hardware threads) and released by `terminate()`.
 */
class OcrPool {
  private workers: OcrWorker[] = [];
  /** In-flight initialization, shared by overlapping `init()` callers. */
  private initializing: Promise<void> | null = null;
  // Guard `navigator` so the module can also be loaded where it is undefined.
  private readonly concurrency = Math.min(
    (typeof navigator !== "undefined" && navigator.hardwareConcurrency) || 4,
    4,
  );

  /**
   * Ensures the pool has workers. Safe to call repeatedly and concurrently:
   * overlapping calls await the same initialization instead of each spawning
   * (and then orphaning) their own set of workers.
   *
   * @throws the first worker-creation error; workers that did start are
   *         terminated before the error is rethrown.
   */
  async init(): Promise<void> {
    if (this.workers.length > 0) return;
    if (!this.initializing) {
      this.initializing = this.spawnWorkers().finally(() => {
        this.initializing = null;
      });
    }
    await this.initializing;
  }

  /** Creates `concurrency` workers in parallel, cleaning up on partial failure. */
  private async spawnWorkers(): Promise<void> {
    const attempts = await Promise.allSettled(
      Array.from({ length: this.concurrency }, () => createOptimizedWorker()),
    );

    const started: OcrWorker[] = [];
    let failure: PromiseRejectedResult | undefined;
    for (const attempt of attempts) {
      if (attempt.status === "fulfilled") started.push(attempt.value);
      else failure ??= attempt;
    }

    if (failure) {
      // Do not leak the workers that did start when a sibling failed.
      await Promise.allSettled(started.map((w) => w.terminate()));
      throw failure.reason;
    }
    this.workers = started;
  }

  /**
   * Runs `task` once per item, spreading the items over the pool's workers.
   *
   * @param items      inputs to process; an empty list returns `[]` without
   *                   creating any workers.
   * @param task       called with a worker, the item and the item's index.
   * @param onProgress called after each completed item with the rounded
   *                   percentage (0-100) of items completed so far.
   * @returns the task results, in the same order as `items`.
   * @throws the first error raised by `task` or `onProgress`. Once an error
   *         occurs no further items are dispatched, and the error is rethrown
   *         only after tasks already in flight have settled, so the caller can
   *         safely terminate the pool afterwards.
   */
  async process<T>(
    items: readonly T[],
    task: (worker: OcrWorker, item: T, index: number) => Promise<string>,
    onProgress?: (p: number) => void,
  ): Promise<string[]> {
    const total = items.length;
    if (total === 0) return [];

    await this.init();

    const results = new Array<string>(total);
    let completed = 0;
    let nextIndex = 0;
    let failed = false;
    let firstError: unknown;

    // Each worker pulls the next unclaimed index until the work runs out.
    const runners = this.workers.map(async (worker) => {
      while (!failed && nextIndex < total) {
        const index = nextIndex++;
        try {
          // `index < total`, so the element is always present.
          results[index] = await task(worker, items[index] as T, index);
          completed++;
          onProgress?.(Math.round((completed / total) * 100));
        } catch (err) {
          if (!failed) {
            failed = true;
            firstError = err;
          }
          return;
        }
      }
    });

    await Promise.all(runners);
    if (failed) throw firstError;
    return results;
  }

  /**
   * Terminates every worker and empties the pool. The worker list is cleared
   * before awaiting so a failed termination cannot leave dead workers behind
   * for the next `init()` to reuse.
   */
  async terminate(): Promise<void> {
    const stopping = this.workers;
    this.workers = [];
    await Promise.all(stopping.map((w) => w.terminate()));
  }
}

// Module-level OCR pool used by extractPdfText and extractDocxText. Both call
// pool.terminate() when they finish, so workers are re-created by the next run.
// NOTE(review): because the instance is shared, overlapping extractions would
// terminate each other's workers — confirm callers run one extraction at a time.
const pool = new OcrPool();

/**
 * Runs OCR on a single image and returns the recognized text, trimmed.
 *
 * A dedicated worker is created for the call and is always terminated, even
 * when recognition fails, so a bad image cannot leak a worker.
 *
 * @param file       image data to recognize.
 * @param onProgress optional callback; receives 10 once the worker is ready
 *                   and 100 after recognition succeeds.
 * @returns the recognized text with surrounding whitespace removed.
 * @throws whatever worker creation or `recognize` rejects with.
 */
export async function extractImageText(file: File | Blob, onProgress?: (p: number) => void): Promise<string> {
  const worker = await createOptimizedWorker();
  try {
    onProgress?.(10);
    const { data: { text } } = await worker.recognize(file);
    onProgress?.(100);
    return text.trim();
  } finally {
    await worker.terminate();
  }
}

/**
 * Extracts text from every page of a PDF, combining the page's selectable
 * text with OCR of the rendered page.
 *
 * Each page contributes a `--- Page N ---` section with its selectable text.
 * When OCR of the rendered page yields more than 20 characters, that text is
 * appended under a `[Detected in images/formatting]:` heading. Sections are
 * joined with blank lines.
 *
 * The OCR pool and the PDF document are released whether or not extraction
 * succeeds, so a failing page cannot leak workers or the loaded document.
 *
 * @param file       the PDF file to read.
 * @param onProgress optional callback; receives 0 at the start and then the
 *                   percentage of pages completed.
 * @returns the concatenated per-page text, trimmed.
 * @throws whatever PDF loading, rendering or OCR rejects with.
 */
export async function extractPdfText(file: File, onProgress?: (p: number) => void): Promise<string> {
  const ab = await file.arrayBuffer();
  const pdf = await getDocumentProxy(new Uint8Array(ab));

  try {
    if (onProgress) onProgress(0);

    // PDF pages are addressed 1-based.
    const pageNumbers = Array.from({ length: pdf.numPages }, (_, i) => i + 1);

    const results = await pool.process(pageNumbers, async (worker, pageNum: number) => {
      const page = await pdf.getPage(pageNum);

      // 1. Selectable text; items without a `str` field contribute nothing.
      const textContent = await page.getTextContent();
      const pageText = textContent.items
        .map((item) => ("str" in item ? item.str : ""))
        .join(" ")
        .replace(/\s+/g, " ")
        .trim();
      const section = `--- Page ${pageNum} ---\n${pageText}\n`;

      // 2. Render the page and OCR it as well (both sources are wanted).
      const viewport = page.getViewport({ scale: 2.0 });
      const canvas = document.createElement("canvas");
      const context = canvas.getContext("2d");
      if (!context) return section;

      canvas.height = viewport.height;
      canvas.width = viewport.width;
      try {
        // Type suppression kept from the original code; presumably the render
        // parameter typing differs between pdf.js versions — TODO confirm.
        // @ts-ignore
        await page.render({ canvasContext: context, viewport }).promise;

        const { data: { text: ocrText } } = await worker.recognize(canvas);
        const cleanedOcr = ocrText.trim();

        // Very short OCR output is dropped.
        return cleanedOcr.length > 20
          ? `${section}\n[Detected in images/formatting]:\n${cleanedOcr}\n`
          : section;
      } finally {
        // Shrink the canvas so its (2x-scaled) backing store can be reclaimed
        // without waiting for garbage collection.
        canvas.width = 0;
        canvas.height = 0;
      }
    }, onProgress);

    return results.join("\n\n").trim();
  } finally {
    // Terminate the workers after every document to avoid memory pressure in
    // a long session, then release the PDF document itself.
    try {
      await pool.terminate();
    } finally {
      await pdf.destroy();
    }
  }
}

/**
 * Extracts the raw text of a DOCX file and appends OCR text for each embedded
 * image.
 *
 * The document is first read with `mammoth.extractRawText`. It is then run
 * through `mammoth.convertToHtml` solely to collect the embedded images, which
 * are OCR'd via the shared pool. Every non-empty OCR result is appended under
 * an `[Extracted from image]:` heading.
 *
 * @param file       the DOCX file to read.
 * @param onProgress optional callback; receives 10 after the file is read,
 *                   30 before OCR starts, and integer values scaled into
 *                   30-100 as images complete (or 100 when there are none).
 * @returns the combined text, trimmed.
 * @throws whatever mammoth or OCR rejects with; the pool is terminated either
 *         way once OCR has been attempted.
 */
export async function extractDocxText(file: File, onProgress?: (p: number) => void): Promise<string> {
  const ab = await file.arrayBuffer();
  if (onProgress) onProgress(10);

  const textResult = await mammoth.extractRawText({ arrayBuffer: ab });
  let fullText = (textResult.value ?? "").trim();

  // The HTML output is discarded; the conversion is only used to visit images.
  const images: Blob[] = [];
  await mammoth.convertToHtml({ arrayBuffer: ab }, {
    convertImage: mammoth.images.imgElement(async (image) => {
      const buffer = await image.read();
      images.push(new Blob([new Uint8Array(buffer)], { type: image.contentType }));
      return { src: "" };
    })
  });

  if (images.length === 0) {
    if (onProgress) onProgress(100);
    return fullText.trim();
  }

  if (onProgress) onProgress(30);
  try {
    const results = await pool.process(
      images,
      async (worker, imgBlob: Blob) => {
        const { data: { text } } = await worker.recognize(imgBlob);
        return text.trim();
      },
      // Scale pool progress into 30-100%, rounded so callers never see values
      // such as 53.099999999999994.
      (p) => onProgress?.(Math.round(30 + p * 0.7)),
    );

    for (const t of results) {
      if (t) fullText += "\n\n[Extracted from image]:\n" + t;
    }
  } finally {
    // Release the workers even when OCR of an image fails.
    await pool.terminate();
  }

  return fullText.trim();
}

/**
 * Picks an extractor from the file name's extension (case-insensitive) and
 * returns the extracted text together with a source-type tag.
 *
 * - `pdf` uses `extractPdfText` and is tagged "pdf".
 * - `docx` / `doc` use `extractDocxText` and are tagged "docx".
 * - `png` / `jpg` / `jpeg` / `webp` / `bmp` use `extractImageText` and are
 *   tagged "text".
 *
 * @param file       the file to extract text from.
 * @param onProgress optional progress callback forwarded to the extractor.
 * @throws Error when the extension matches none of the above.
 */
export async function extractFileText(file: File, onProgress?: (p: number) => void): Promise<{ text: string; sourceType: "pdf" | "docx" | "text" }> {
  const segments = file.name.split(".");
  // Text after the last dot; a name without dots yields the whole name.
  const extension = (segments[segments.length - 1] ?? "").toLowerCase();

  switch (extension) {
    case "pdf":
      return { text: await extractPdfText(file, onProgress), sourceType: "pdf" };

    case "docx":
    case "doc":
      return { text: await extractDocxText(file, onProgress), sourceType: "docx" };

    case "png":
    case "jpg":
    case "jpeg":
    case "webp":
    case "bmp":
      return { text: await extractImageText(file, onProgress), sourceType: "text" };

    default:
      throw new Error("Unsupported file type for client-side extraction");
  }
}