|
|
import * as pdfjsLib from "pdfjs-dist"; |
|
|
|
|
|
/**
 * Extracts the plain text of every page of a PDF using pdf.js.
 *
 * Text items on a page are joined with single spaces and every page is
 * terminated by a newline, so a document with at least one page yields a
 * string ending in "\n".
 *
 * @param file - PDF source handed directly to `pdfjsLib.getDocument`
 *   (binary data, or a string, which pdf.js interprets as a URL).
 * @returns The concatenated text of all pages, in page order.
 * @throws Error with the message "Error extracting text" when the document
 *   cannot be loaded or read; the underlying error is logged to the console.
 */
async function getTextContent(file: string | ArrayBuffer): Promise<string> {
  try {
    // pdf.js needs to know where its worker script lives before loading.
    pdfjsLib.GlobalWorkerOptions.workerSrc = `./scripts/pdf.worker.min.mjs`;

    const loadingTask = pdfjsLib.getDocument(file);

    try {
      const pdfDocument = await loadingTask.promise;

      let fullText = "";

      // pdf.js page numbers are 1-based.
      for (let pageNum = 1; pageNum <= pdfDocument.numPages; pageNum++) {
        const page = await pdfDocument.getPage(pageNum);
        const textContent = await page.getTextContent();

        // Keep only entries that carry a `str` (entries without one, such as
        // marked-content markers, contribute no text). Narrowing inside the
        // callback keeps `item.str` type-safe without a separate predicate.
        const pageText = textContent.items
          .flatMap((item) => ("str" in item ? [item.str] : []))
          .join(" ");

        fullText += pageText + "\n";
      }

      return fullText;
    } finally {
      // Release the document and its worker whether extraction succeeded or
      // not. Cleanup is best-effort: a failure here must not discard text
      // that was already extracted, so it is logged rather than rethrown.
      await loadingTask.destroy().catch((cleanupError: unknown) => {
        console.warn("Error releasing PDF resources:", cleanupError);
      });
    }
  } catch (error) {
    console.error("Error extracting text:", error);
    throw new Error("Error extracting text");
  }
}
|
|
|
|
|
/**
 * Reads a user-supplied PDF `File` and resolves with its extracted text.
 *
 * The file is loaded into memory through a `FileReader` and the resulting
 * buffer is passed to `getTextContent`.
 *
 * @param file - The PDF file to read.
 * @returns The text produced by `getTextContent` for the file's contents.
 * @throws Error "No file provided" when `file` is falsy.
 * @throws Error "Error reading file" when the reader reports an error.
 * @throws Error "File reading failed" when the reader finishes with no result.
 * @throws Error "Error processing PDF" when text extraction fails (the
 *   underlying error is logged to the console).
 */
export async function readTextFromPDF(file: File): Promise<string> {
  if (!file) {
    throw new Error("No file provided");
  }

  // Wrap the callback-based FileReader so the rest of the function can await.
  const contents = await new Promise<string | ArrayBuffer>(
    (resolve, reject) => {
      const reader = new FileReader();

      reader.onerror = () => {
        reject(new Error("Error reading file"));
      };

      reader.onload = () => {
        const loaded = reader.result;
        if (!loaded) {
          reject(new Error("File reading failed"));
          return;
        }
        resolve(loaded);
      };

      reader.readAsArrayBuffer(file);
    },
  );

  try {
    return await getTextContent(contents);
  } catch (error) {
    console.error("Error processing PDF:", error);
    throw new Error("Error processing PDF");
  }
}
|
|
|