Spaces:

I2E
/

resume-parser

Build error

App Files Files Community

resume-parser / lib /document-converter.ts

PPSA

Upload 235 files

dca8ede verified 10 months ago

raw

history blame contribute delete

17.3 kB

	import { exec } from "child_process"
	import { promisify } from "util"
	import { join, extname, basename } from "path"
	import { v4 as uuidv4 } from "uuid"
	import { mkdir, writeFile, readFile, copyFile } from "fs/promises"
	import { existsSync, mkdirSync } from "fs"
	import mammoth from "mammoth"
	import * as puppeteer from 'puppeteer'

	const execAsync = promisify(exec)

	/**
	* Converts a DOC or DOCX file to PDF
	* @param filePath Path to the DOC or DOCX file
	* @returns Path to the generated PDF file
	*/
	export async function convertDocToPdf(filePath: string): Promise<string> {
	console.log(`Converting document: ${filePath}`);

	// Verify file exists
	if (!existsSync(filePath)) {
	console.error(`File does not exist: ${filePath}`);
	throw new Error(`File does not exist: ${filePath}`);
	}

	// Create uploads directory if it doesn't exist
	const uploadsDir = join(process.cwd(), 'uploads');
	if (!existsSync(uploadsDir)) {
	mkdirSync(uploadsDir, { recursive: true });
	}

	// Generate PDF path
	const pdfPath = filePath.replace(/\.(doc\|docx)$/i, '.pdf');

	try {
	// Check if it's a DOCX file
	if (filePath.toLowerCase().endsWith('.docx')) {
	console.log(`Converting DOCX to PDF: ${filePath}`);
	return await convertDocxToPdf(filePath, pdfPath);
	}
	// Check if it's a DOC file
	else if (filePath.toLowerCase().endsWith('.doc')) {
	console.log(`Converting DOC to PDF: ${filePath}`);
	return await convertDocToPdfFallback(filePath, pdfPath);
	} else {
	console.error(`Unsupported file format: ${filePath}`);
	throw new Error(`Unsupported file format: ${filePath}`);
	}
	} catch (error: unknown) {
	console.error('Error in document conversion:', error);
	// Try fallback method if the main conversion fails
	try {
	console.log("Attempting fallback conversion method");
	return await createPlaceholderPdfWithContent(filePath, pdfPath);
	} catch (fallbackError) {
	console.error("Fallback conversion also failed:", fallbackError);
	if (error instanceof Error) {
	throw new Error(`Failed to convert document: ${error.message}`);
	} else {
	throw new Error(`Failed to convert document: ${String(error)}`);
	}
	}
	}
	}

	/**
	* Convert DOCX file to PDF using mammoth and puppeteer
	*/
	async function convertDocxToPdf(docxPath: string, pdfPath: string): Promise<string> {
	try {
	// Convert DOCX to HTML
	const buffer = await readFile(docxPath);
	const result = await mammoth.convertToHtml({ buffer });
	const html = result.value;

	// Create a temporary HTML file
	const htmlPath = docxPath.replace(/\.docx$/i, '.html');
	const fullHtml = `
	<!DOCTYPE html>
	<html>
	<head>
	<meta charset="UTF-8">
	<meta name="viewport" content="width=device-width, initial-scale=1.0">
	<title>Document</title>
	<style>
	body {
	font-family: Arial, sans-serif;
	line-height: 1.5;
	margin: 1cm;
	}
	h1, h2, h3, h4, h5, h6 {
	margin-top: 1em;
	margin-bottom: 0.5em;
	}
	p {
	margin-bottom: 0.5em;
	}
	table {
	border-collapse: collapse;
	width: 100%;
	}
	td, th {
	border: 1px solid #ddd;
	padding: 8px;
	}
	</style>
	</head>
	<body>
	${html}
	</body>
	</html>
	`;

	await writeFile(htmlPath, fullHtml);
	console.log(`Created HTML file: ${htmlPath}`);

	// Convert HTML to PDF using Puppeteer
	const browser = await puppeteer.launch({
	headless: true,
	args: ['--no-sandbox', '--disable-setuid-sandbox'],
	});
	const page = await browser.newPage();
	await page.goto(`file://${htmlPath}`, { waitUntil: 'networkidle0' });
	await page.pdf({ path: pdfPath, format: 'A4' });
	await browser.close();

	console.log(`Generated PDF at: ${pdfPath}`);
	return pdfPath;
	} catch (error) {
	console.error("Error converting DOCX to PDF:", error);
	throw error;
	}
	}

	/**
	* Convert DOC file to PDF using a fallback approach for older Word formats
	*/
	async function convertDocToPdfFallback(docPath: string, pdfPath: string): Promise<string> {
	try {
	console.log("Using fallback method for DOC conversion");

	// Read the file content
	const buffer = await readFile(docPath);

	// First try using mammoth (might work for some DOC files)
	try {
	const result = await mammoth.convertToHtml({ buffer });
	if (result.value && result.value.length > 100) {
	// If we got substantial text, use the HTML conversion route
	console.log("Mammoth extracted text from DOC, using HTML conversion");

	const htmlPath = docPath.replace(/\.doc$/i, '.html');
	const fullHtml = `
	<!DOCTYPE html>
	<html>
	<head>
	<meta charset="UTF-8">
	<meta name="viewport" content="width=device-width, initial-scale=1.0">
	<title>Document</title>
	<style>
	body {
	font-family: Arial, sans-serif;
	line-height: 1.5;
	margin: 1cm;
	}
	h1, h2, h3, h4, h5, h6 {
	margin-top: 1em;
	margin-bottom: 0.5em;
	}
	p {
	margin-bottom: 0.5em;
	}
	</style>
	</head>
	<body>
	${result.value}
	<div style="margin-top: 20px; padding: 10px; background-color: #f8f9fa; border: 1px solid #ddd; border-radius: 4px;">
	<p style="font-style: italic; color: #666;">Note: This document was converted from a DOC file. Some formatting may have been lost.</p>
	</div>
	</body>
	</html>
	`;

	await writeFile(htmlPath, fullHtml);

	// Convert HTML to PDF using Puppeteer
	const browser = await puppeteer.launch({
	headless: true,
	args: ['--no-sandbox', '--disable-setuid-sandbox'],
	});
	const page = await browser.newPage();
	await page.goto(`file://${htmlPath}`, { waitUntil: 'networkidle0' });
	await page.pdf({ path: pdfPath, format: 'A4' });
	await browser.close();

	console.log(`Generated PDF from DOC at: ${pdfPath}`);
	return pdfPath;
	}
	} catch (mammothError) {
	console.log("Mammoth could not convert DOC file:", mammothError);
	// Continue to fallback method
	}

	// If mammoth fails, create a PDF with the content of the DOC as binary data
	// and a message saying it's a DOC file
	console.log("Creating placeholder PDF for DOC file");
	return await createPlaceholderPdfWithContent(docPath, pdfPath);
	} catch (error) {
	console.error("Error in DOC conversion fallback:", error);
	throw error;
	}
	}

	/**
	* Creates a PDF with placeholder content that includes file info and any extractable text
	*/
	async function createPlaceholderPdfWithContent(originalPath: string, pdfPath: string): Promise<string> {
	try {
	console.log("Creating placeholder PDF...");

	// Extract filename from path
	const filename = basename(originalPath);

	// Try to extract some text from the document using a simple binary read
	let extractedText = "";
	try {
	const buffer = await readFile(originalPath);
	// Convert buffer to string and look for readable text
	const content = buffer.toString('utf8');
	// Extract what looks like readable text (basic approach)
	const textMatches = content.match(/[A-Za-z0-9\s.,;:'"!?()-]{5,100}/g);
	if (textMatches && textMatches.length > 0) {
	extractedText = textMatches.slice(0, 50).join(' ');
	}
	} catch (error) {
	console.log("Could not extract text from binary file:", error);
	}

	// Launch browser
	const browser = await puppeteer.launch({
	headless: true,
	args: ['--no-sandbox', '--disable-setuid-sandbox']
	});

	// Create page
	const page = await browser.newPage();

	// Set content with information about the original file
	await page.setContent(`
	<!DOCTYPE html>
	<html>
	<head>
	<meta charset="utf-8">
	<title>Document Conversion Notice</title>
	<style>
	body {
	font-family: Arial, sans-serif;
	margin: 40px;
	line-height: 1.5;
	}
	.container {
	max-width: 600px;
	margin: 0 auto;
	border: 1px solid #ddd;
	padding: 30px;
	border-radius: 5px;
	background-color: #f9f9f9;
	}
	h1 {
	color: #333;
	}
	.filename {
	font-family: monospace;
	background-color: #eee;
	padding: 5px 10px;
	border-radius: 3px;
	margin: 10px 0;
	display: inline-block;
	}
	.extracted-text {
	margin-top: 20px;
	border-top: 1px solid #ddd;
	padding-top: 20px;
	}
	.extracted-text pre {
	background-color: #f5f5f5;
	padding: 15px;
	border-radius: 5px;
	overflow-x: auto;
	font-size: 0.9em;
	}
	</style>
	</head>
	<body>
	<div class="container">
	<h1>Document Preview</h1>
	<p>This is a preview of the document. For full document functionality, please open the original file in Microsoft Word or another compatible document editor.</p>
	<p>Original file: <span class="filename">${filename}</span></p>

	${extractedText ? `
	<div class="extracted-text">
	<h2>Document Preview</h2>
	<p>Below is some extracted text from the document:</p>
	<pre>${extractedText}</pre>
	</div>
	` : ''}
	</div>
	</body>
	</html>
	`);

	// Generate PDF
	await page.pdf({
	path: pdfPath,
	format: 'A4',
	printBackground: true,
	margin: {
	top: '20mm',
	right: '20mm',
	bottom: '20mm',
	left: '20mm'
	}
	});

	// Close browser
	await browser.close();
	console.log("Placeholder PDF created:", pdfPath);

	// Also copy the original file to ensure it's preserved
	const preservedOriginal = pdfPath.replace('.pdf', '.original.doc');
	await copyFile(originalPath, preservedOriginal);
	console.log("Original DOC preserved at:", preservedOriginal);

	return pdfPath;
	} catch (error) {
	console.error("Error creating placeholder PDF:", error);

	// Last resort - just copy the file with PDF extension
	console.log("Last resort - copying original file with PDF extension");
	await copyFile(originalPath, pdfPath);
	return pdfPath;
	}
	}

	/**
	* Converts a DOCX file to HTML using mammoth
	*/
	async function convertDocxToHtml(docPath: string, outputDir: string): Promise<string> {
	try {
	// Generate unique filename for the HTML
	const htmlFilename = `${uuidv4()}.html`
	const htmlPath = join(outputDir, htmlFilename)

	// Read and convert the document
	const buffer = await readFile(docPath)
	const result = await mammoth.convertToHtml({ buffer })

	// Create a nicely formatted HTML document
	const html = `
	<!DOCTYPE html>
	<html>
	<head>
	<meta charset="utf-8">
	<title>Converted Document</title>
	<style>
	body {
	font-family: Arial, sans-serif;
	margin: 40px;
	line-height: 1.5;
	font-size: 12pt;
	}
	h1, h2, h3, h4, h5, h6 {
	margin-top: 20px;
	margin-bottom: 10px;
	}
	p {
	margin-bottom: 10px;
	}
	table {
	border-collapse: collapse;
	width: 100%;
	}
	table, th, td {
	border: 1px solid #ddd;
	padding: 8px;
	}
	img {
	max-width: 100%;
	height: auto;
	}
	</style>
	</head>
	<body>
	${result.value}
	</body>
	</html>
	`

	await writeFile(htmlPath, html)
	console.log("HTML conversion complete:", htmlPath)
	return htmlPath
	} catch (error) {
	console.error("Error converting DOCX to HTML:", error)
	throw error
	}
	}

	/**
	* Converts an HTML file to PDF using puppeteer
	*/
	async function convertHtmlToPdf(htmlPath: string, pdfPath: string): Promise<string> {
	try {
	console.log("Converting HTML to PDF...")

	// Launch browser with appropriate settings
	const browser = await puppeteer.launch({
	headless: true,
	args: ['--no-sandbox', '--disable-setuid-sandbox']
	})

	// Create new page and set viewport
	const page = await browser.newPage()
	await page.setViewport({ width: 1024, height: 768 })

	// Load the HTML file and wait for content to load
	await page.goto(`file://${htmlPath}`, { waitUntil: 'networkidle0' })

	// Generate PDF with appropriate margins and settings
	await page.pdf({
	path: pdfPath,
	format: 'A4',
	printBackground: true,
	margin: {
	top: '20mm',
	right: '20mm',
	bottom: '20mm',
	left: '20mm'
	}
	})

	// Close browser
	await browser.close()
	console.log("PDF conversion complete:", pdfPath)
	return pdfPath
	} catch (error) {
	console.error("Error converting HTML to PDF:", error)
	throw error
	}
	}

	/**
	* Creates a PDF with placeholder content
	*/
	async function createPlaceholderPdf(originalPath: string, pdfPath: string): Promise<string> {
	try {
	console.log("Creating placeholder PDF...")

	// Extract filename from path
	const filename = originalPath.split(/[\/\\]/).pop() \|\| "unknown.doc"

	// Launch browser
	const browser = await puppeteer.launch({
	headless: true,
	args: ['--no-sandbox', '--disable-setuid-sandbox']
	})

	// Create page
	const page = await browser.newPage()

	// Set content with information about the original file
	await page.setContent(`
	<!DOCTYPE html>
	<html>
	<head>
	<meta charset="utf-8">
	<title>Document Conversion Notice</title>
	<style>
	body {
	font-family: Arial, sans-serif;
	margin: 40px;
	line-height: 1.5;
	text-align: center;
	padding-top: 100px;
	}
	.container {
	max-width: 600px;
	margin: 0 auto;
	border: 1px solid #ddd;
	padding: 30px;
	border-radius: 5px;
	background-color: #f9f9f9;
	}
	h1 {
	color: #333;
	}
	.filename {
	font-family: monospace;
	background-color: #eee;
	padding: 5px 10px;
	border-radius: 3px;
	margin: 10px 0;
	display: inline-block;
	}
	</style>
	</head>
	<body>
	<div class="container">
	<h1>Document Conversion Notice</h1>
	<p>The DOC file could not be fully converted to PDF. To view the original document accurately, please open it in Microsoft Word or another compatible document editor.</p>
	<p>Original file: <span class="filename">${filename}</span></p>
	</div>
	</body>
	</html>
	`)

	// Generate PDF
	await page.pdf({
	path: pdfPath,
	format: 'A4',
	printBackground: true,
	margin: {
	top: '20mm',
	right: '20mm',
	bottom: '20mm',
	left: '20mm'
	}
	})

	// Close browser
	await browser.close()
	console.log("Placeholder PDF created:", pdfPath)
	return pdfPath
	} catch (error) {
	console.error("Error creating placeholder PDF:", error)

	// Last resort - just copy the file with PDF extension
	await copyFile(originalPath, pdfPath)
	return pdfPath
	}
	}

	/**
	* Alternative implementation for DOC files
	*/
	export async function convertDocToPdfWithLibrary(docPath: string): Promise<string> {
	try {
	// Verify file exists
	if (!existsSync(docPath)) {
	console.error("DOC file does not exist:", docPath)
	throw new Error(`DOC file does not exist: ${docPath}`)
	}

	// For DOC files, we'll use a different approach
	const outputDir = join(process.cwd(), "uploads")
	await mkdir(outputDir, { recursive: true })

	const outputFileName = `${uuidv4()}.pdf`
	const outputPath = join(outputDir, outputFileName)

	// For now, we'll just copy the file since we don't have LibreOffice installed
	// In a production environment, you would want to use LibreOffice or a similar tool
	const fileContent = await readFile(docPath)
	await writeFile(outputPath, fileContent)

	console.log("File copied successfully")
	return outputPath
	} catch (error) {
	console.error("Error converting DOC to PDF with library:", error)
	throw new Error("Failed to convert DOC to PDF with library")
	}
	}