// File size: 4,376 Bytes · 4d35814
/**
* PDF processing utilities using PDF.js
* Handles PDF text extraction and image conversion in the browser
*/
import { browser } from '$app/environment';
import { MimeTypeApplication, MimeTypeImage } from '$lib/enums/files';
import * as pdfjs from 'pdfjs-dist';
/**
 * Minimal view of the per-page text content returned by PDF.js.
 *
 * Only the fields this module reads are declared. `str` is optional because
 * PDF.js text content may include marked-content entries that carry no string,
 * which is why consumers fall back with `item.str ?? ''`.
 */
type TextContent = {
  items: Array<{ str?: string }>;
};
if (browser) {
  // Load the PDF.js worker script as raw text and expose it through a blob URL,
  // so the worker is bundled inline with the application.
  void (async () => {
    try {
      const { default: workerSource } = await import('pdfjs-dist/build/pdf.worker.min.mjs?raw');
      const workerUrl = URL.createObjectURL(
        new Blob([workerSource], { type: 'application/javascript' })
      );
      pdfjs.GlobalWorkerOptions.workerSrc = workerUrl;
    } catch {
      // Non-fatal: the module still loads, but later PDF calls may fail.
      console.warn('Failed to load PDF.js worker, PDF processing may not work');
    }
  })();
}
/**
 * Read a File fully into memory as an ArrayBuffer using FileReader.
 *
 * @param file - The PDF file to convert
 * @returns Promise resolving to the file's ArrayBuffer
 * @throws Error with message "Failed to read file." when the reader reports an
 *   error or finishes without producing a result
 */
async function getFileAsBuffer(file: File): Promise<ArrayBuffer> {
  return new Promise<ArrayBuffer>((resolve, reject) => {
    // Both failure paths reject with the same error, so share one helper.
    const fail = (): void => reject(new Error('Failed to read file.'));

    const fileReader = new FileReader();
    fileReader.onerror = fail;
    fileReader.onload = (loadEvent) => {
      const contents = loadEvent.target?.result;
      if (!contents) {
        fail();
        return;
      }
      resolve(contents as ArrayBuffer);
    };

    fileReader.readAsArrayBuffer(file);
  });
}
/**
 * Extract the text content of every page of a PDF file.
 *
 * Pages are requested concurrently and their text items are concatenated in
 * page order, one item per line (items without a string contribute an empty
 * line). The PDF.js loading task is destroyed once extraction succeeds or
 * fails, releasing the resources PDF.js holds for the document.
 *
 * @param file - The PDF file to process
 * @returns Promise resolving to the extracted text content
 * @throws Error when called outside the browser, or when the file cannot be
 *   read or parsed (message prefixed with "Failed to convert PDF to text:")
 */
export async function convertPDFToText(file: File): Promise<string> {
  if (!browser) {
    throw new Error('PDF processing is only available in the browser');
  }

  try {
    const buffer = await getFileAsBuffer(file);
    const loadingTask = pdfjs.getDocument(buffer);

    try {
      const pdf = await loadingTask.promise;
      const textContentPromises: Promise<TextContent>[] = [];

      // PDF.js pages are 1-indexed.
      for (let pageNumber = 1; pageNumber <= pdf.numPages; pageNumber++) {
        textContentPromises.push(
          // eslint-disable-next-line @typescript-eslint/no-explicit-any
          pdf.getPage(pageNumber).then((page: any) => page.getTextContent())
        );
      }

      const textContents = await Promise.all(textContentPromises);

      return textContents
        .flatMap((textContent) => textContent.items.map((item) => item.str ?? ''))
        .join('\n');
    } finally {
      // Best-effort teardown: a failing destroy must not mask the extraction
      // result or the original error.
      await loadingTask.destroy().catch(() => undefined);
    }
  } catch (error) {
    console.error('Error converting PDF to text:', error);
    throw new Error(
      `Failed to convert PDF to text: ${error instanceof Error ? error.message : 'Unknown error'}`
    );
  }
}
/**
 * Convert PDF pages to PNG images as data URLs.
 *
 * Pages are rendered one at a time onto an off-screen canvas sized to the
 * scaled viewport and encoded immediately, so no render is still in flight
 * when the document is torn down. The PDF.js loading task is destroyed once
 * conversion succeeds or fails, releasing the resources PDF.js holds for the
 * document.
 *
 * @param file - The PDF file to convert
 * @param scale - Rendering scale factor (default: 1.5)
 * @returns Promise resolving to array of PNG data URLs, in page order
 * @throws Error when called outside the browser, or when reading, parsing or
 *   rendering fails (message prefixed with "Failed to convert PDF to images:")
 */
export async function convertPDFToImage(file: File, scale: number = 1.5): Promise<string[]> {
  if (!browser) {
    throw new Error('PDF processing is only available in the browser');
  }

  try {
    const buffer = await getFileAsBuffer(file);
    const loadingTask = pdfjs.getDocument(buffer);

    try {
      const doc = await loadingTask.promise;
      const pages: string[] = [];

      // PDF.js pages are 1-indexed.
      for (let pageNumber = 1; pageNumber <= doc.numPages; pageNumber++) {
        const page = await doc.getPage(pageNumber);
        const viewport = page.getViewport({ scale });

        const canvas = document.createElement('canvas');
        const ctx = canvas.getContext('2d');
        if (!ctx) {
          throw new Error('Failed to get 2D context from canvas');
        }
        canvas.width = viewport.width;
        canvas.height = viewport.height;

        await page.render({
          canvasContext: ctx,
          viewport: viewport,
          canvas: canvas
        }).promise;

        pages.push(canvas.toDataURL(MimeTypeImage.PNG));
      }

      return pages;
    } finally {
      // Best-effort teardown: a failing destroy must not mask the conversion
      // result or the original error.
      await loadingTask.destroy().catch(() => undefined);
    }
  } catch (error) {
    console.error('Error converting PDF to images:', error);
    throw new Error(
      `Failed to convert PDF to images: ${error instanceof Error ? error.message : 'Unknown error'}`
    );
  }
}
/**
 * Tell whether a file is a PDF, judged solely by its reported MIME type.
 * The file's contents are not inspected.
 *
 * @param file - The file to check
 * @returns True if the file's MIME type equals the PDF MIME type
 */
export function isPdfFile(file: File): boolean {
  const { type: reportedType } = file;
  return reportedType === MimeTypeApplication.PDF;
}
/**
 * Tell whether a MIME type string is the PDF MIME type.
 *
 * Despite the general-sounding name, the only value that matches is
 * `MimeTypeApplication.PDF`; the comparison is exact.
 *
 * @param mimeType - The MIME type to check
 * @returns True if the MIME type equals the PDF MIME type
 */
export function isApplicationMimeType(mimeType: string): boolean {
  const pdfMimeType: string = MimeTypeApplication.PDF;
  return mimeType === pdfMimeType;
}
|