import { exec } from "child_process";
import { promisify } from "util";
import { join, extname, basename } from "path";
import { pathToFileURL } from "url";
import { v4 as uuidv4 } from "uuid";
import { mkdir, writeFile, readFile, copyFile } from "fs/promises";
import { existsSync, mkdirSync } from "fs";
import mammoth from "mammoth";
import * as puppeteer from "puppeteer";

const execAsync = promisify(exec);

/** Launch flags shared by every headless Chromium instance started by this module. */
const BROWSER_LAUNCH_OPTIONS = {
  headless: true,
  args: ["--no-sandbox", "--disable-setuid-sandbox"],
};

/** Page margins used for the placeholder / standalone PDF renderings. */
const PRINT_MARGIN = { top: "20mm", right: "20mm", bottom: "20mm", left: "20mm" };

/** Returns a printable reason for a value caught in a `catch` clause. */
function describeError(error: unknown): string {
  return error instanceof Error ? error.message : String(error);
}

/** Escapes text so it can be interpolated into HTML element content or attributes. */
function escapeHtml(text: string): string {
  return text
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;")
    .replace(/"/g, "&quot;")
    .replace(/'/g, "&#39;");
}

/**
 * Wraps an HTML fragment in a complete document.
 * The charset declaration matters: the files are written as UTF-8 and loaded
 * through file:// URLs, where no HTTP header can announce the encoding.
 */
function buildHtmlDocument(title: string, bodyHtml: string): string {
  return `<!DOCTYPE html>
<html>
  <head>
    <meta charset="utf-8">
    <title>${escapeHtml(title)}</title>
  </head>
  <body>
${bodyHtml}
  </body>
</html>
`;
}

/**
 * Launches a headless browser, runs `task` against a fresh page and always
 * closes the browser afterwards, even when `task` throws, so that failed
 * conversions do not leave Chromium processes behind.
 */
async function withBrowserPage<T>(task: (page: puppeteer.Page) => Promise<T>): Promise<T> {
  const browser = await puppeteer.launch(BROWSER_LAUNCH_OPTIONS);
  try {
    const page = await browser.newPage();
    return await task(page);
  } finally {
    // A close failure must not mask the task's own result or error.
    await browser.close().catch((closeError: unknown) => {
      console.error("Failed to close browser:", closeError);
    });
  }
}

/**
 * Converts a DOC or DOCX file to PDF.
 *
 * The PDF is written next to the input, with the extension replaced by `.pdf`.
 * If the primary conversion fails, a placeholder PDF is generated instead.
 *
 * @param filePath Path to the DOC or DOCX file
 * @returns Path to the generated PDF file
 * @throws Error if the file does not exist, has an unsupported extension, or
 *   both the primary conversion and the placeholder fallback fail
 */
export async function convertDocToPdf(filePath: string): Promise<string> {
  console.log(`Converting document: ${filePath}`);

  // Verify file exists
  if (!existsSync(filePath)) {
    console.error(`File does not exist: ${filePath}`);
    throw new Error(`File does not exist: ${filePath}`);
  }

  // Reject unsupported formats up front. If this were thrown inside the
  // try-block below, the fallback would run with pdfPath === filePath (the
  // extension rewrite would not match) and overwrite the caller's file.
  const lowerPath = filePath.toLowerCase();
  const isDocx = lowerPath.endsWith(".docx");
  const isDoc = !isDocx && lowerPath.endsWith(".doc");
  if (!isDocx && !isDoc) {
    console.error(`Unsupported file format: ${filePath}`);
    throw new Error(`Unsupported file format: ${filePath}`);
  }

  // Create uploads directory if it doesn't exist (recursive mkdir is a no-op when present)
  mkdirSync(join(process.cwd(), "uploads"), { recursive: true });

  // Generate PDF path; the extension check above guarantees the pattern matches
  const pdfPath = filePath.replace(/\.(doc|docx)$/i, ".pdf");

  try {
    if (isDocx) {
      console.log(`Converting DOCX to PDF: ${filePath}`);
      return await convertDocxToPdf(filePath, pdfPath);
    }
    console.log(`Converting DOC to PDF: ${filePath}`);
    return await convertDocToPdfFallback(filePath, pdfPath);
  } catch (error: unknown) {
    console.error("Error in document conversion:", error);

    // Try fallback method if the main conversion fails
    try {
      console.log("Attempting fallback conversion method");
      return await createPlaceholderPdfWithContent(filePath, pdfPath);
    } catch (fallbackError: unknown) {
      console.error("Fallback conversion also failed:", fallbackError);
      // Report the primary failure; the fallback failure is already logged.
      throw new Error(`Failed to convert document: ${describeError(error)}`);
    }
  }
}

/**
 * Convert DOCX file to PDF using mammoth and puppeteer.
 *
 * Writes an intermediate `.html` file next to the DOCX and renders it to
 * `pdfPath`.
 *
 * @returns `pdfPath`
 */
async function convertDocxToPdf(docxPath: string, pdfPath: string): Promise<string> {
  try {
    // Convert DOCX to HTML
    const buffer = await readFile(docxPath);
    const result = await mammoth.convertToHtml({ buffer });

    // Create the intermediate HTML file
    const htmlPath = docxPath.replace(/\.docx$/i, ".html");
    await writeFile(htmlPath, buildHtmlDocument("Document", result.value));
    console.log(`Created HTML file: ${htmlPath}`);

    // Convert HTML to PDF using Puppeteer. pathToFileURL resolves relative
    // paths and percent-encodes characters that would corrupt a hand-built URL.
    await withBrowserPage(async (page) => {
      await page.goto(pathToFileURL(htmlPath).href, { waitUntil: "networkidle0" });
      await page.pdf({ path: pdfPath, format: "A4" });
    });

    console.log(`Generated PDF at: ${pdfPath}`);
    return pdfPath;
  } catch (error: unknown) {
    console.error("Error converting DOCX to PDF:", error);
    throw error;
  }
}

/**
 * Convert DOC file to PDF using a fallback approach for older Word formats.
 *
 * First tries mammoth; if that yields more than 100 characters of HTML, the
 * HTML is rendered to PDF. Otherwise (or when that attempt throws) a
 * placeholder PDF is produced.
 *
 * @returns `pdfPath`
 */
async function convertDocToPdfFallback(docPath: string, pdfPath: string): Promise<string> {
  try {
    console.log("Using fallback method for DOC conversion");

    // Read the file content
    const buffer = await readFile(docPath);

    // First try using mammoth (might work for some DOC files)
    try {
      const result = await mammoth.convertToHtml({ buffer });

      if (result.value && result.value.length > 100) {
        // If we got substantial text, use the HTML conversion route
        console.log("Mammoth extracted text from DOC, using HTML conversion");

        const htmlPath = docPath.replace(/\.doc$/i, ".html");
        const bodyHtml = `${result.value}
<p><em>Note: This document was converted from a DOC file. Some formatting may have been lost.</em></p>`;
        await writeFile(htmlPath, buildHtmlDocument("Document", bodyHtml));

        // Convert HTML to PDF using Puppeteer
        await withBrowserPage(async (page) => {
          await page.goto(pathToFileURL(htmlPath).href, { waitUntil: "networkidle0" });
          await page.pdf({ path: pdfPath, format: "A4" });
        });

        console.log(`Generated PDF from DOC at: ${pdfPath}`);
        return pdfPath;
      }
    } catch (mammothError: unknown) {
      // Also reached when the HTML rendering step fails, not only mammoth.
      console.log("Mammoth could not convert DOC file:", mammothError);
      // Continue to fallback method
    }

    // If mammoth fails, create a placeholder PDF for the DOC file
    console.log("Creating placeholder PDF for DOC file");
    return await createPlaceholderPdfWithContent(docPath, pdfPath);
  } catch (error: unknown) {
    console.error("Error in DOC conversion fallback:", error);
    throw error;
  }
}

/**
 * Best-effort extraction of readable text from a (possibly binary) document.
 * Returns an empty string when the file cannot be read or nothing matches.
 */
async function extractReadableText(originalPath: string): Promise<string> {
  try {
    const buffer = await readFile(originalPath);
    // Convert buffer to string and look for runs of 5-100 printable characters
    const content = buffer.toString("utf8");
    const textMatches = content.match(/[A-Za-z0-9\s.,;:'"!?()-]{5,100}/g);
    if (textMatches && textMatches.length > 0) {
      // Cap the preview at the first 50 runs
      return textMatches.slice(0, 50).join(" ");
    }
  } catch (error: unknown) {
    console.log("Could not extract text from binary file:", error);
  }
  return "";
}

/**
 * Creates a PDF with placeholder content that includes file info and any extractable text.
 *
 * After a successful render, a copy of the original is stored next to the PDF
 * as `<name>.original.doc` (best effort). If rendering fails, the original
 * file is copied to `pdfPath` as a last resort.
 *
 * @returns `pdfPath`
 */
async function createPlaceholderPdfWithContent(originalPath: string, pdfPath: string): Promise<string> {
  try {
    console.log("Creating placeholder PDF...");

    // The file name may come from an upload, so escape it before it reaches the page.
    const filename = escapeHtml(basename(originalPath));
    const extractedText = await extractReadableText(originalPath);

    const previewSection = extractedText
      ? `
<h2>Document Preview</h2>
<p>Below is some extracted text from the document:</p>
<pre style="white-space: pre-wrap;">${escapeHtml(extractedText)}</pre>
`
      : "";

    const html = buildHtmlDocument(
      "Document Conversion Notice",
      `<h1>Document Preview</h1>
<p>This is a preview of the document. For full document functionality, please open the original file in Microsoft Word or another compatible document editor.</p>
<p>Original file: ${filename}</p>
${previewSection}`
    );

    await withBrowserPage(async (page) => {
      await page.setContent(html);
      await page.pdf({
        path: pdfPath,
        format: "A4",
        printBackground: true,
        margin: PRINT_MARGIN,
      });
    });

    console.log("Placeholder PDF created:", pdfPath);
  } catch (error: unknown) {
    console.error("Error creating placeholder PDF:", error);

    // Last resort - just copy the file with PDF extension.
    // NOTE(review): the result is not a valid PDF; kept as deliberate best effort.
    console.log("Last resort - copying original file with PDF extension");
    await copyFile(originalPath, pdfPath);
    return pdfPath;
  }

  // Also copy the original file to ensure it's preserved. This runs outside
  // the try-block above so that a failed copy cannot trigger the last-resort
  // path and overwrite the PDF that was just generated.
  const preservedOriginal = pdfPath.replace(/\.pdf$/i, ".original.doc");
  if (preservedOriginal !== pdfPath) {
    try {
      await copyFile(originalPath, preservedOriginal);
      console.log("Original DOC preserved at:", preservedOriginal);
    } catch (copyError: unknown) {
      console.error("Could not preserve original document:", copyError);
    }
  }

  return pdfPath;
}

/**
 * Converts a DOCX file to HTML using mammoth.
 *
 * @param docPath Path to the DOCX file
 * @param outputDir Directory that receives the uniquely named HTML file
 * @returns Path to the written HTML file
 */
async function convertDocxToHtml(docPath: string, outputDir: string): Promise<string> {
  try {
    // Generate unique filename for the HTML
    const htmlPath = join(outputDir, `${uuidv4()}.html`);

    // Read and convert the document
    const buffer = await readFile(docPath);
    const result = await mammoth.convertToHtml({ buffer });

    await writeFile(htmlPath, buildHtmlDocument("Converted Document", result.value));

    console.log("HTML conversion complete:", htmlPath);
    return htmlPath;
  } catch (error: unknown) {
    console.error("Error converting DOCX to HTML:", error);
    throw error;
  }
}

/**
 * Converts an HTML file to PDF using puppeteer.
 *
 * @returns `pdfPath`
 */
async function convertHtmlToPdf(htmlPath: string, pdfPath: string): Promise<string> {
  try {
    console.log("Converting HTML to PDF...");

    await withBrowserPage(async (page) => {
      await page.setViewport({ width: 1024, height: 768 });

      // Load the HTML file and wait for content to load
      await page.goto(pathToFileURL(htmlPath).href, { waitUntil: "networkidle0" });

      // Generate PDF with margins and background graphics
      await page.pdf({
        path: pdfPath,
        format: "A4",
        printBackground: true,
        margin: PRINT_MARGIN,
      });
    });

    console.log("PDF conversion complete:", pdfPath);
    return pdfPath;
  } catch (error: unknown) {
    console.error("Error converting HTML to PDF:", error);
    throw error;
  }
}

/**
 * Creates a PDF with placeholder content.
 *
 * If rendering fails, the original file is copied to `pdfPath` as a last resort.
 *
 * @returns `pdfPath`
 */
async function createPlaceholderPdf(originalPath: string, pdfPath: string): Promise<string> {
  try {
    console.log("Creating placeholder PDF...");

    // Extract filename from path (either separator style) and escape it for HTML
    const filename = escapeHtml(originalPath.split(/[\/\\]/).pop() || "unknown.doc");

    const html = buildHtmlDocument(
      "Document Conversion Notice",
      `<h1>Document Conversion Notice</h1>
<p>The DOC file could not be fully converted to PDF. To view the original document accurately, please open it in Microsoft Word or another compatible document editor.</p>
<p>Original file: ${filename}</p>`
    );

    await withBrowserPage(async (page) => {
      await page.setContent(html);
      await page.pdf({
        path: pdfPath,
        format: "A4",
        printBackground: true,
        margin: PRINT_MARGIN,
      });
    });

    console.log("Placeholder PDF created:", pdfPath);
    return pdfPath;
  } catch (error: unknown) {
    console.error("Error creating placeholder PDF:", error);

    // Last resort - just copy the file with PDF extension.
    // NOTE(review): the result is not a valid PDF; kept as deliberate best effort.
    await copyFile(originalPath, pdfPath);
    return pdfPath;
  }
}

/**
 * Alternative implementation for DOC files.
 *
 * NOTE: no real conversion happens here. The input bytes are copied verbatim to
 * a uniquely named `.pdf` file under `<cwd>/uploads`; a real converter (e.g.
 * LibreOffice) would be needed to produce an actual PDF.
 *
 * @param docPath Path to the DOC file
 * @returns Path to the copied file
 * @throws Error if the file does not exist or cannot be copied
 */
export async function convertDocToPdfWithLibrary(docPath: string): Promise<string> {
  try {
    // Verify file exists
    if (!existsSync(docPath)) {
      console.error("DOC file does not exist:", docPath);
      throw new Error(`DOC file does not exist: ${docPath}`);
    }

    const outputDir = join(process.cwd(), "uploads");
    await mkdir(outputDir, { recursive: true });

    const outputPath = join(outputDir, `${uuidv4()}.pdf`);

    // For now, we'll just copy the file since we don't have LibreOffice installed
    const fileContent = await readFile(docPath);
    await writeFile(outputPath, fileContent);

    console.log("File copied successfully");
    return outputPath;
  } catch (error: unknown) {
    console.error("Error converting DOC to PDF with library:", error);
    // Keep the established message prefix, but surface the underlying reason.
    throw new Error(`Failed to convert DOC to PDF with library: ${describeError(error)}`);
  }
}