Spaces:
Build error
Build error
File size: 2,609 Bytes
dca8ede |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 |
import * as pdfjs from 'pdfjs-dist';
import { join } from 'path';
import { readFile } from 'fs/promises';
import { existsSync } from 'fs';
// PDF.js worker setup: build the path to the worker script under the
// current working directory's node_modules folder, and register it with
// PDF.js only when no `window` global exists (i.e. outside a browser).
const WORKER_PATH = join(
  process.cwd(),
  'node_modules',
  'pdfjs-dist',
  'build',
  'pdf.worker.js'
);
const runningOutsideBrowser = typeof window === 'undefined';
if (runningOutsideBrowser) {
  pdfjs.GlobalWorkerOptions.workerSrc = WORKER_PATH;
}
/**
 * Read a PDF from disk and return its text content.
 *
 * The file is checked for existence, loaded fully into memory, and handed to
 * `extractTextFromPdfBuffer`. Progress is reported through `console.log`.
 *
 * @param pdfPath - Path of the PDF file to read.
 * @returns The text produced by `extractTextFromPdfBuffer` for the file's bytes.
 * @throws Error prefixed with "Failed to extract text from PDF file" when the
 *   file is missing, cannot be read, or extraction fails.
 */
export async function extractTextFromPdfFile(pdfPath: string): Promise<string> {
  console.log("=== Starting simplified PDF text extraction ===");

  try {
    console.log("PDF path:", pdfPath);

    const fileIsPresent = existsSync(pdfPath);
    if (!fileIsPresent) {
      console.error("PDF file does not exist:", pdfPath);
      throw new Error(`PDF file does not exist: ${pdfPath}`);
    }

    console.log("Reading file...");
    const pdfBytes = await readFile(pdfPath);
    console.log(`Read ${pdfBytes.length} bytes`);

    const extractedText = await extractTextFromPdfBuffer(pdfBytes);
    return extractedText;
  } catch (err) {
    console.error("Error extracting text from PDF file:", err);

    // Keep the underlying message when there is one; otherwise fall back to
    // the generic failure text.
    const failureMessage =
      err instanceof Error
        ? `Failed to extract text from PDF file: ${err.message}`
        : "Failed to extract text from PDF file";
    throw new Error(failureMessage);
  }
}
/**
 * Extract the text of every page of an in-memory PDF using PDF.js.
 *
 * Pages are processed sequentially. Within a page, the `str` of each text
 * item is joined with a single space; each page's text is followed by a
 * blank line (`\n\n`). Progress is reported through `console.log`.
 *
 * All PDF.js resources created for the document are released before the
 * function settles, whether it succeeds or fails.
 *
 * @param data - Raw bytes of the PDF document. The buffer is copied, so the
 *   caller's buffer is never handed to PDF.js.
 * @returns The concatenated text of all pages.
 * @throws Error prefixed with "Failed to extract text from PDF buffer" when
 *   the document cannot be loaded or a page cannot be processed.
 */
export async function extractTextFromPdfBuffer(data: Buffer): Promise<string> {
  // Declared outside the try block so the finally block can release it.
  let loadingTask: ReturnType<typeof pdfjs.getDocument> | undefined;

  try {
    console.log(`Processing PDF buffer of size ${data.length} bytes`);

    // Hand PDF.js a plain Uint8Array copy rather than the Node Buffer itself:
    // newer PDF.js releases reject Buffer input, and a copy also keeps the
    // caller's (possibly pooled) memory from being transferred to the worker.
    const pdfBytes = new Uint8Array(data);

    // Load the PDF document
    loadingTask = pdfjs.getDocument({ data: pdfBytes });
    const pdfDocument = await loadingTask.promise;
    console.log(`PDF loaded successfully with ${pdfDocument.numPages} pages`);

    // Collect per-page text and join once at the end instead of growing a
    // string inside the loop.
    const pageTexts: string[] = [];

    // Process each page (PDF.js page numbers are 1-based)
    for (let i = 1; i <= pdfDocument.numPages; i++) {
      console.log(`Processing page ${i}/${pdfDocument.numPages}`);
      const page = await pdfDocument.getPage(i);
      try {
        const textContent = await page.getTextContent();

        // Only items carrying a `str` field contribute text; any other item
        // kind contributes an empty string.
        const pageText = textContent.items
          .map(item => ('str' in item ? item.str : ''))
          .join(' ');
        pageTexts.push(pageText + '\n\n');
      } finally {
        // Release per-page resources as soon as the page is done.
        page.cleanup();
      }
    }

    const fullText = pageTexts.join('');
    console.log(`Extracted ${fullText.length} characters of text`);
    return fullText;
  } catch (error) {
    console.error("Error extracting text from PDF buffer:", error);
    if (error instanceof Error) {
      throw new Error(`Failed to extract text from PDF buffer: ${error.message}`);
    }
    throw new Error("Failed to extract text from PDF buffer");
  } finally {
    // Destroying the loading task tears down the document and its worker.
    // Cleanup is best-effort: a failure here is logged but must not replace
    // the extraction result or the original error.
    if (loadingTask) {
      try {
        await loadingTask.destroy();
      } catch (destroyError) {
        console.error("Error releasing PDF resources:", destroyError);
      }
    }
  }
}