File size: 17,282 Bytes
dca8ede
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
import { exec } from "child_process"
import { promisify } from "util"
import { join, extname, basename } from "path"
import { v4 as uuidv4 } from "uuid"
import { mkdir, writeFile, readFile, copyFile } from "fs/promises"
import { existsSync, mkdirSync } from "fs"
import mammoth from "mammoth"
import * as puppeteer from 'puppeteer'

const execAsync = promisify(exec)

/**
 * Converts a DOC or DOCX file to PDF
 * @param filePath Path to the DOC or DOCX file
 * @returns Path to the generated PDF file
 */
export async function convertDocToPdf(filePath: string): Promise<string> {
  console.log(`Converting document: ${filePath}`);
  
  // Verify file exists
  if (!existsSync(filePath)) {
    console.error(`File does not exist: ${filePath}`);
    throw new Error(`File does not exist: ${filePath}`);
  }

  // Create uploads directory if it doesn't exist
  const uploadsDir = join(process.cwd(), 'uploads');
  if (!existsSync(uploadsDir)) {
    mkdirSync(uploadsDir, { recursive: true });
  }

  // Generate PDF path
  const pdfPath = filePath.replace(/\.(doc|docx)$/i, '.pdf');

  try {
    // Check if it's a DOCX file
    if (filePath.toLowerCase().endsWith('.docx')) {
      console.log(`Converting DOCX to PDF: ${filePath}`);
      return await convertDocxToPdf(filePath, pdfPath);
    } 
    // Check if it's a DOC file
    else if (filePath.toLowerCase().endsWith('.doc')) {
      console.log(`Converting DOC to PDF: ${filePath}`);
      return await convertDocToPdfFallback(filePath, pdfPath);
    } else {
      console.error(`Unsupported file format: ${filePath}`);
      throw new Error(`Unsupported file format: ${filePath}`);
    }
  } catch (error: unknown) {
    console.error('Error in document conversion:', error);
    // Try fallback method if the main conversion fails
    try {
      console.log("Attempting fallback conversion method");
      return await createPlaceholderPdfWithContent(filePath, pdfPath);
    } catch (fallbackError) {
      console.error("Fallback conversion also failed:", fallbackError);
      if (error instanceof Error) {
        throw new Error(`Failed to convert document: ${error.message}`);
      } else {
        throw new Error(`Failed to convert document: ${String(error)}`);
      }
    }
  }
}

/**
 * Convert DOCX file to PDF using mammoth and puppeteer
 */
async function convertDocxToPdf(docxPath: string, pdfPath: string): Promise<string> {
  try {
    // Convert DOCX to HTML
    const buffer = await readFile(docxPath);
    const result = await mammoth.convertToHtml({ buffer });
    const html = result.value;
    
    // Create a temporary HTML file
    const htmlPath = docxPath.replace(/\.docx$/i, '.html');
    const fullHtml = `
      <!DOCTYPE html>
      <html>
      <head>
        <meta charset="UTF-8">
        <meta name="viewport" content="width=device-width, initial-scale=1.0">
        <title>Document</title>
        <style>
          body {
            font-family: Arial, sans-serif;
            line-height: 1.5;
            margin: 1cm;
          }
          h1, h2, h3, h4, h5, h6 {
            margin-top: 1em;
            margin-bottom: 0.5em;
          }
          p {
            margin-bottom: 0.5em;
          }
          table {
            border-collapse: collapse;
            width: 100%;
          }
          td, th {
            border: 1px solid #ddd;
            padding: 8px;
          }
        </style>
      </head>
      <body>
        ${html}
      </body>
      </html>
    `;
    
    await writeFile(htmlPath, fullHtml);
    console.log(`Created HTML file: ${htmlPath}`);
    
    // Convert HTML to PDF using Puppeteer
    const browser = await puppeteer.launch({
      headless: true,
      args: ['--no-sandbox', '--disable-setuid-sandbox'],
    });
    const page = await browser.newPage();
    await page.goto(`file://${htmlPath}`, { waitUntil: 'networkidle0' });
    await page.pdf({ path: pdfPath, format: 'A4' });
    await browser.close();
    
    console.log(`Generated PDF at: ${pdfPath}`);
    return pdfPath;
  } catch (error) {
    console.error("Error converting DOCX to PDF:", error);
    throw error;
  }
}

/**
 * Convert DOC file to PDF using a fallback approach for older Word formats
 */
async function convertDocToPdfFallback(docPath: string, pdfPath: string): Promise<string> {
  try {
    console.log("Using fallback method for DOC conversion");
    
    // Read the file content
    const buffer = await readFile(docPath);
    
    // First try using mammoth (might work for some DOC files)
    try {
      const result = await mammoth.convertToHtml({ buffer });
      if (result.value && result.value.length > 100) {
        // If we got substantial text, use the HTML conversion route
        console.log("Mammoth extracted text from DOC, using HTML conversion");
        
        const htmlPath = docPath.replace(/\.doc$/i, '.html');
        const fullHtml = `
          <!DOCTYPE html>
          <html>
          <head>
            <meta charset="UTF-8">
            <meta name="viewport" content="width=device-width, initial-scale=1.0">
            <title>Document</title>
            <style>
              body {
                font-family: Arial, sans-serif;
                line-height: 1.5;
                margin: 1cm;
              }
              h1, h2, h3, h4, h5, h6 {
                margin-top: 1em;
                margin-bottom: 0.5em;
              }
              p {
                margin-bottom: 0.5em;
              }
            </style>
          </head>
          <body>
            ${result.value}
            <div style="margin-top: 20px; padding: 10px; background-color: #f8f9fa; border: 1px solid #ddd; border-radius: 4px;">
              <p style="font-style: italic; color: #666;">Note: This document was converted from a DOC file. Some formatting may have been lost.</p>
            </div>
          </body>
          </html>
        `;
        
        await writeFile(htmlPath, fullHtml);
        
        // Convert HTML to PDF using Puppeteer
        const browser = await puppeteer.launch({
          headless: true,
          args: ['--no-sandbox', '--disable-setuid-sandbox'],
        });
        const page = await browser.newPage();
        await page.goto(`file://${htmlPath}`, { waitUntil: 'networkidle0' });
        await page.pdf({ path: pdfPath, format: 'A4' });
        await browser.close();
        
        console.log(`Generated PDF from DOC at: ${pdfPath}`);
        return pdfPath;
      }
    } catch (mammothError) {
      console.log("Mammoth could not convert DOC file:", mammothError);
      // Continue to fallback method
    }
    
    // If mammoth fails, create a PDF with the content of the DOC as binary data
    // and a message saying it's a DOC file
    console.log("Creating placeholder PDF for DOC file");
    return await createPlaceholderPdfWithContent(docPath, pdfPath);
  } catch (error) {
    console.error("Error in DOC conversion fallback:", error);
    throw error;
  }
}

/**
 * Creates a PDF with placeholder content that includes file info and any extractable text
 */
async function createPlaceholderPdfWithContent(originalPath: string, pdfPath: string): Promise<string> {
  try {
    console.log("Creating placeholder PDF...");
    
    // Extract filename from path
    const filename = basename(originalPath);
    
    // Try to extract some text from the document using a simple binary read
    let extractedText = "";
    try {
      const buffer = await readFile(originalPath);
      // Convert buffer to string and look for readable text
      const content = buffer.toString('utf8');
      // Extract what looks like readable text (basic approach)
      const textMatches = content.match(/[A-Za-z0-9\s.,;:'"!?()-]{5,100}/g);
      if (textMatches && textMatches.length > 0) {
        extractedText = textMatches.slice(0, 50).join(' ');
      }
    } catch (error) {
      console.log("Could not extract text from binary file:", error);
    }
    
    // Launch browser
    const browser = await puppeteer.launch({
      headless: true,
      args: ['--no-sandbox', '--disable-setuid-sandbox']
    });
    
    // Create page
    const page = await browser.newPage();
    
    // Set content with information about the original file
    await page.setContent(`
      <!DOCTYPE html>
      <html>
      <head>
        <meta charset="utf-8">
        <title>Document Conversion Notice</title>
        <style>
          body {
            font-family: Arial, sans-serif;
            margin: 40px;
            line-height: 1.5;
          }
          .container {
            max-width: 600px;
            margin: 0 auto;
            border: 1px solid #ddd;
            padding: 30px;
            border-radius: 5px;
            background-color: #f9f9f9;
          }
          h1 {
            color: #333;
          }
          .filename {
            font-family: monospace;
            background-color: #eee;
            padding: 5px 10px;
            border-radius: 3px;
            margin: 10px 0;
            display: inline-block;
          }
          .extracted-text {
            margin-top: 20px;
            border-top: 1px solid #ddd;
            padding-top: 20px;
          }
          .extracted-text pre {
            background-color: #f5f5f5;
            padding: 15px;
            border-radius: 5px;
            overflow-x: auto;
            font-size: 0.9em;
          }
        </style>
      </head>
      <body>
        <div class="container">
          <h1>Document Preview</h1>
          <p>This is a preview of the document. For full document functionality, please open the original file in Microsoft Word or another compatible document editor.</p>
          <p>Original file: <span class="filename">${filename}</span></p>
          
          ${extractedText ? `
          <div class="extracted-text">
            <h2>Document Preview</h2>
            <p>Below is some extracted text from the document:</p>
            <pre>${extractedText}</pre>
          </div>
          ` : ''}
        </div>
      </body>
      </html>
    `);
    
    // Generate PDF
    await page.pdf({
      path: pdfPath,
      format: 'A4',
      printBackground: true,
      margin: {
        top: '20mm',
        right: '20mm',
        bottom: '20mm',
        left: '20mm'
      }
    });
    
    // Close browser
    await browser.close();
    console.log("Placeholder PDF created:", pdfPath);
    
    // Also copy the original file to ensure it's preserved
    const preservedOriginal = pdfPath.replace('.pdf', '.original.doc');
    await copyFile(originalPath, preservedOriginal);
    console.log("Original DOC preserved at:", preservedOriginal);
    
    return pdfPath;
  } catch (error) {
    console.error("Error creating placeholder PDF:", error);
    
    // Last resort - just copy the file with PDF extension
    console.log("Last resort - copying original file with PDF extension");
    await copyFile(originalPath, pdfPath);
    return pdfPath;
  }
}

/**
 * Converts a DOCX file to HTML using mammoth
 */
async function convertDocxToHtml(docPath: string, outputDir: string): Promise<string> {
  try {
    // Generate unique filename for the HTML
    const htmlFilename = `${uuidv4()}.html`
    const htmlPath = join(outputDir, htmlFilename)
    
    // Read and convert the document
    const buffer = await readFile(docPath)
    const result = await mammoth.convertToHtml({ buffer })
    
    // Create a nicely formatted HTML document
    const html = `
      <!DOCTYPE html>
      <html>
      <head>
        <meta charset="utf-8">
        <title>Converted Document</title>
        <style>
          body {
            font-family: Arial, sans-serif;
            margin: 40px;
            line-height: 1.5;
            font-size: 12pt;
          }
          h1, h2, h3, h4, h5, h6 {
            margin-top: 20px;
            margin-bottom: 10px;
          }
          p {
            margin-bottom: 10px;
          }
          table {
            border-collapse: collapse;
            width: 100%;
          }
          table, th, td {
            border: 1px solid #ddd;
            padding: 8px;
          }
          img {
            max-width: 100%;
            height: auto;
          }
        </style>
      </head>
      <body>
        ${result.value}
      </body>
      </html>
    `
    
    await writeFile(htmlPath, html)
    console.log("HTML conversion complete:", htmlPath)
    return htmlPath
  } catch (error) {
    console.error("Error converting DOCX to HTML:", error)
    throw error
  }
}

/**
 * Converts an HTML file to PDF using puppeteer
 */
async function convertHtmlToPdf(htmlPath: string, pdfPath: string): Promise<string> {
  try {
    console.log("Converting HTML to PDF...")
    
    // Launch browser with appropriate settings
    const browser = await puppeteer.launch({
      headless: true,
      args: ['--no-sandbox', '--disable-setuid-sandbox']
    })
    
    // Create new page and set viewport
    const page = await browser.newPage()
    await page.setViewport({ width: 1024, height: 768 })
    
    // Load the HTML file and wait for content to load
    await page.goto(`file://${htmlPath}`, { waitUntil: 'networkidle0' })
    
    // Generate PDF with appropriate margins and settings
    await page.pdf({
      path: pdfPath,
      format: 'A4',
      printBackground: true,
      margin: {
        top: '20mm',
        right: '20mm',
        bottom: '20mm',
        left: '20mm'
      }
    })
    
    // Close browser
    await browser.close()
    console.log("PDF conversion complete:", pdfPath)
    return pdfPath
  } catch (error) {
    console.error("Error converting HTML to PDF:", error)
    throw error
  }
}

/**
 * Creates a PDF with placeholder content
 */
async function createPlaceholderPdf(originalPath: string, pdfPath: string): Promise<string> {
  try {
    console.log("Creating placeholder PDF...")
    
    // Extract filename from path
    const filename = originalPath.split(/[\/\\]/).pop() || "unknown.doc"
    
    // Launch browser
    const browser = await puppeteer.launch({
      headless: true,
      args: ['--no-sandbox', '--disable-setuid-sandbox']
    })
    
    // Create page
    const page = await browser.newPage()
    
    // Set content with information about the original file
    await page.setContent(`
      <!DOCTYPE html>
      <html>
      <head>
        <meta charset="utf-8">
        <title>Document Conversion Notice</title>
        <style>
          body {
            font-family: Arial, sans-serif;
            margin: 40px;
            line-height: 1.5;
            text-align: center;
            padding-top: 100px;
          }
          .container {
            max-width: 600px;
            margin: 0 auto;
            border: 1px solid #ddd;
            padding: 30px;
            border-radius: 5px;
            background-color: #f9f9f9;
          }
          h1 {
            color: #333;
          }
          .filename {
            font-family: monospace;
            background-color: #eee;
            padding: 5px 10px;
            border-radius: 3px;
            margin: 10px 0;
            display: inline-block;
          }
        </style>
      </head>
      <body>
        <div class="container">
          <h1>Document Conversion Notice</h1>
          <p>The DOC file could not be fully converted to PDF. To view the original document accurately, please open it in Microsoft Word or another compatible document editor.</p>
          <p>Original file: <span class="filename">${filename}</span></p>
        </div>
      </body>
      </html>
    `)
    
    // Generate PDF
    await page.pdf({
      path: pdfPath,
      format: 'A4',
      printBackground: true,
      margin: {
        top: '20mm',
        right: '20mm',
        bottom: '20mm',
        left: '20mm'
      }
    })
    
    // Close browser
    await browser.close()
    console.log("Placeholder PDF created:", pdfPath)
    return pdfPath
  } catch (error) {
    console.error("Error creating placeholder PDF:", error)
    
    // Last resort - just copy the file with PDF extension
    await copyFile(originalPath, pdfPath)
    return pdfPath
  }
}

/**
 * Alternative implementation for DOC files
 */
export async function convertDocToPdfWithLibrary(docPath: string): Promise<string> {
  try {
    // Verify file exists
    if (!existsSync(docPath)) {
      console.error("DOC file does not exist:", docPath)
      throw new Error(`DOC file does not exist: ${docPath}`)
    }
    
    // For DOC files, we'll use a different approach
    const outputDir = join(process.cwd(), "uploads")
    await mkdir(outputDir, { recursive: true })
    
    const outputFileName = `${uuidv4()}.pdf`
    const outputPath = join(outputDir, outputFileName)

    // For now, we'll just copy the file since we don't have LibreOffice installed
    // In a production environment, you would want to use LibreOffice or a similar tool
    const fileContent = await readFile(docPath)
    await writeFile(outputPath, fileContent)
    
    console.log("File copied successfully")
    return outputPath
  } catch (error) {
    console.error("Error converting DOC to PDF with library:", error)
    throw new Error("Failed to convert DOC to PDF with library")
  }
}