File size: 1,679 Bytes
676fc08
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import * as pdfjsLib from "pdfjs-dist";

async function getTextContent(file: string | ArrayBuffer) {
  try {
    pdfjsLib.GlobalWorkerOptions.workerSrc = `./scripts/pdf.worker.min.mjs`;

    // 加载 PDF 文件
    const loadingTask = pdfjsLib.getDocument(file);
    const pdfDocument = await loadingTask.promise;

    let fullText = "";

    // 循环处理每一页
    for (let pageNum = 1; pageNum <= pdfDocument.numPages; pageNum++) {
      const page = await pdfDocument.getPage(pageNum);
      const textContent = await page.getTextContent();

      // 将文本内容组合起来
      const pageText = textContent.items
        .filter((item) => "str" in item)
        .map((item) => item.str)
        .join(" ");
      fullText += pageText + "\n"; // 可以添加换行符来区分页面
    }

    return fullText;
  } catch (error) {
    console.error("Error extracting text:", error);
    throw new Error("Error extracting text");
  }
}

export async function readTextFromPDF(file: File): Promise<string> {
  if (!file) {
    throw new Error("No file provided");
  }

  return new Promise((resolve, reject) => {
    const reader = new FileReader();

    reader.onload = async () => {
      if (reader.result) {
        try {
          const text = await getTextContent(reader.result);
          resolve(text);
        } catch (error) {
          console.error("Error processing PDF:", error);
          reject(new Error("Error processing PDF"));
        }
      } else {
        reject(new Error("File reading failed"));
      }
    };

    reader.onerror = () => {
      reject(new Error("Error reading file"));
    };

    reader.readAsArrayBuffer(file);
  });
}