// Source: resume-parser/lib/custom-pdf-parser.ts
// (Repository page residue: "PPSA's picture", commit "Upload 235 files", dca8ede, verified.)
import * as pdfjsLib from 'pdfjs-dist';
import { join } from 'path';
import { existsSync } from 'fs';
import { mkdir } from 'fs/promises';
// PDF.js worker setup.
// WORKER_SRC is the path of the worker script inside the pdfjs-dist package,
// resolved relative to the directory the process was started from.
const WORKER_SRC = join(
  process.cwd(),
  'node_modules',
  'pdfjs-dist',
  'build',
  'pdf.worker.js'
);

// Only override the worker location when no `window` global is defined.
if (typeof window === 'undefined') {
  pdfjsLib.GlobalWorkerOptions.workerSrc = WORKER_SRC;
}
/**
 * Extracts the text of every page of a PDF using PDF.js.
 *
 * The result mimics the shape returned by `pdf-parse` so it can be used as a
 * drop-in replacement.
 *
 * @param dataBuffer Raw bytes of the PDF file.
 * @returns An object with:
 *   - `text`: the text of all pages; the items of a page are joined with a
 *     single space and every page is terminated by `'\n'`;
 *   - `metadata.info`: the full result of `pdfDocument.getMetadata()`;
 *   - `metadata.pageInfo.pageCount`, `numpages`, `numrender`: the page count;
 *   - `version`: the fixed string `'1.0.0'`.
 * @throws Re-throws any error raised while loading or reading the document,
 *   after logging it.
 */
export default async function customPdfParse(dataBuffer: Buffer) {
  // Kept outside the `try` so the `finally` clause can release it even when
  // loading fails part-way through.
  let loadingTask: ReturnType<typeof pdfjsLib.getDocument> | undefined;

  try {
    console.log("Loading PDF document with PDF.js");

    // Create a temp directory for PDF.js files if needed.
    // NOTE(review): nothing in this function reads or writes this directory;
    // it is kept in case other code relies on it existing - confirm.
    const tempDir = join(process.cwd(), 'temp');
    if (!existsSync(tempDir)) {
      await mkdir(tempDir, { recursive: true });
    }

    // Newer PDF.js releases deprecate/reject a Node `Buffer` as `data` and ask
    // for a plain `Uint8Array`. Copying (rather than wrapping the same memory)
    // also guarantees PDF.js never takes ownership of the caller's buffer.
    const data = new Uint8Array(dataBuffer);

    // Load the PDF document
    loadingTask = pdfjsLib.getDocument({ data });
    const pdfDocument = await loadingTask.promise;
    const pageCount = pdfDocument.numPages;
    console.log(`PDF loaded with ${pageCount} pages`);

    // Extract text from all pages (PDF.js page numbers are 1-based)
    let fullText = '';
    for (let pageNumber = 1; pageNumber <= pageCount; pageNumber++) {
      console.log(`Processing page ${pageNumber}/${pageCount}`);
      const page = await pdfDocument.getPage(pageNumber);
      const textContent = await page.getTextContent();

      // Only items carrying a `str` property contribute text; any other item
      // kind contributes an empty string.
      const pageText = textContent.items
        .map(item => ('str' in item ? item.str : ''))
        .join(' ');
      fullText += pageText + '\n';

      // Drop per-page resources as soon as the page has been read so large
      // documents do not accumulate them.
      page.cleanup();
    }
    console.log(`Extracted ${fullText.length} characters of text`);

    // Return data in a format compatible with the original pdf-parse
    return {
      text: fullText,
      metadata: {
        info: await pdfDocument.getMetadata(),
        pageInfo: {
          pageCount
        }
      },
      numpages: pageCount,
      numrender: pageCount,
      version: '1.0.0'
    };
  } catch (error) {
    console.error('Error parsing PDF:', error);
    throw error;
  } finally {
    // Always release the document and its worker-side state, on success and
    // on failure. A cleanup failure must not mask the real result or error.
    if (loadingTask) {
      try {
        await loadingTask.destroy();
      } catch (cleanupError) {
        console.warn('Failed to release PDF.js resources:', cleanupError);
      }
    }
  }
}