| import { PDFDocument, PDFName } from 'pdf-lib'; |
|
|
/**
 * Flags selecting which sanitization passes `sanitizePdf` runs on a document.
 * A pass is executed only when its flag is truthy.
 */
export interface SanitizeOptions {
  /** Flatten interactive form fields through pdf-lib's `form.flatten()`. */
  flattenForms: boolean;
  /** Clear the Info dictionary, XMP `Metadata`, trailer `ID` and `PieceInfo`. */
  removeMetadata: boolean;
  /** Delete the `Annots` entry from every page. */
  removeAnnotations: boolean;
  /** Strip the JavaScript name tree, `OpenAction`/`AA` hooks and scripted annotation/field actions. */
  removeJavascript: boolean;
  /** Remove `EmbeddedFiles` entries, `FileAttachment` annotations and the `Collection` entry. */
  removeEmbeddedFiles: boolean;
  /** Remove `OCProperties` entries and each page's `Resources/Properties`. */
  removeLayers: boolean;
  /** Remove link annotations, annotations carrying navigation actions, and named destinations. */
  removeLinks: boolean;
  /** Remove `StructTreeRoot`, `ParentTree` and per-page `StructParents`. */
  removeStructureTree: boolean;
  /** Remove the catalog's `MarkInfo` and `Marked` entries. */
  removeMarkInfo: boolean;
  /** Remove embedded font programs (`FontFile`, `FontFile2`, `FontFile3`) from font descriptors. */
  removeFonts: boolean;
}
|
|
/**
 * Default sanitization profile: every pass is switched on except
 * `removeFonts`, which stays off.
 */
export const defaultSanitizeOptions: SanitizeOptions = {
  // Form handling
  flattenForms: true,

  // Document-level information
  removeMetadata: true,

  // Interactive / active content
  removeAnnotations: true,
  removeJavascript: true,
  removeEmbeddedFiles: true,

  // Structural extras
  removeLayers: true,
  removeLinks: true,
  removeStructureTree: true,
  removeMarkInfo: true,

  // Font programs are kept unless the caller opts in
  removeFonts: false,
};
|
|
/**
 * Clears descriptive metadata from a document.
 *
 * Steps, in order:
 *  1. delete every entry of the Info dictionary, then write empty
 *     title/author/subject/keywords/creator/producer values;
 *  2. delete the catalog's `Metadata` entry;
 *  3. delete the `ID` property from the context's trailer info;
 *  4. delete the catalog's `PieceInfo` entry.
 *
 * Steps 2-4 are best-effort: a failure is logged with `console.warn` and the
 * remaining steps still run. Step 1 is not guarded and may throw.
 *
 * @param pdfDoc Document to modify in place.
 */
function removeMetadataFromDoc(pdfDoc: PDFDocument) {
  // `getInfoDict` is not part of pdf-lib's typed surface, hence the cast.
  const info = (pdfDoc as any).getInfoDict();
  const entryNames = info.keys();
  for (const entryName of entryNames) {
    info.delete(entryName);
  }

  pdfDoc.setTitle('');
  pdfDoc.setAuthor('');
  pdfDoc.setSubject('');
  pdfDoc.setKeywords([]);
  pdfDoc.setCreator('');
  pdfDoc.setProducer('');

  // Deletes one catalog entry if present; logs `failureLabel` on any error.
  const dropCatalogEntry = (entryName: string, failureLabel: string): void => {
    try {
      const rawCatalog = (pdfDoc.catalog as any).dict;
      const entryKey = PDFName.of(entryName);
      if (rawCatalog.has(entryKey)) {
        rawCatalog.delete(entryKey);
      }
    } catch (err: any) {
      console.warn(failureLabel, err.message);
    }
  };

  dropCatalogEntry('Metadata', 'Could not remove XMP metadata:');

  try {
    const trailerInfo = (pdfDoc.context as any).trailerInfo;
    if (trailerInfo) {
      delete trailerInfo.ID;
    }
  } catch (err: any) {
    console.warn('Could not remove document IDs:', err.message);
  }

  dropCatalogEntry('PieceInfo', 'Could not remove PieceInfo:');
}
|
|
/**
 * Deletes the `Annots` entry from every page of the document.
 *
 * A failure on one page is logged with `console.warn` and does not stop the
 * remaining pages from being processed.
 *
 * @param pdfDoc Document to modify in place.
 */
function removeAnnotationsFromDoc(pdfDoc: PDFDocument) {
  pdfDoc.getPages().forEach((pdfPage) => {
    try {
      pdfPage.node.delete(PDFName.of('Annots'));
    } catch (err: any) {
      console.warn('Could not remove annotations from page:', err.message);
    }
  });
}
|
|
/**
 * Flattens the document's interactive form, if it has one.
 *
 * pdf-lib's `getForm()` creates and registers an empty AcroForm when the
 * catalog has none, so calling it unconditionally would add a form dictionary
 * to documents that never had one. The catalog is therefore checked first and
 * form-less documents are left untouched.
 *
 * @param pdfDoc Document to modify in place.
 * @throws Whatever `getForm()` or `flatten()` throws for a malformed form;
 *         the caller is expected to handle it.
 */
function flattenFormsInDoc(pdfDoc: PDFDocument) {
  if (!pdfDoc.catalog.has(PDFName.of('AcroForm'))) {
    return;
  }

  pdfDoc.getForm().flatten();
}
|
|
/**
 * Deletes the `A` and `AA` entries from every field reachable from the given
 * root field references, following `Kids` arrays so nested fields (and the
 * widgets below them) are covered as well.
 *
 * The walk is iterative and remembers visited references, so a malformed
 * document with cyclic or very deep `Kids` chains cannot loop forever or
 * overflow the stack. A failure on one field is logged and the walk continues.
 */
function stripActionsFromFieldTree(
  pdfDoc: PDFDocument,
  rootFieldRefs: any[]
): void {
  const queue: any[] = [...rootFieldRefs];
  const visited = new Set<any>();

  for (let i = 0; i < queue.length; i++) {
    const fieldRef = queue[i];
    if (visited.has(fieldRef)) continue;
    visited.add(fieldRef);

    try {
      const field = pdfDoc.context.lookup(fieldRef) as any;

      if (field.has(PDFName.of('A'))) {
        field.delete(PDFName.of('A'));
      }

      if (field.has(PDFName.of('AA'))) {
        field.delete(PDFName.of('AA'));
      }

      const kidsRef = field.get(PDFName.of('Kids'));
      if (kidsRef) {
        const kids = pdfDoc.context.lookup(kidsRef) as any;
        for (const kidRef of kids.asArray()) {
          queue.push(kidRef);
        }
      }
    } catch (e: any) {
      console.warn('Could not process field for JS:', e.message);
    }
  }
}

/**
 * Strips JavaScript and automatic-action hooks from a document.
 *
 * Removed, in order:
 *  - the document's `javaScripts` list (reset to empty when non-empty);
 *  - the `JavaScript` entry of the catalog's `Names` dictionary;
 *  - the catalog's `OpenAction` and `AA` entries;
 *  - each page's `AA` entry;
 *  - on each page annotation: the `A` entry when its action type (`S`) is
 *    `JavaScript`, and the `AA` entry unconditionally;
 *  - on every AcroForm field, including nested ones reached through `Kids`:
 *    the `A` and `AA` entries unconditionally.
 *
 * Each sub-step is best-effort: failures are logged with `console.warn` and
 * processing continues. Reading the catalog or the page list is not guarded
 * and may throw.
 *
 * @param pdfDoc Document to modify in place.
 */
function removeJavascriptFromDoc(pdfDoc: PDFDocument) {
  // NOTE(review): `javaScripts` is an untyped internal of PDFDocument —
  // presumably the scripts queued for embedding on save; confirm against pdf-lib.
  if ((pdfDoc as any).javaScripts && (pdfDoc as any).javaScripts.length > 0) {
    (pdfDoc as any).javaScripts = [];
  }

  const catalogDict = (pdfDoc.catalog as any).dict;

  // Document-level script name tree.
  const namesRef = catalogDict.get(PDFName.of('Names'));
  if (namesRef) {
    try {
      const namesDict = pdfDoc.context.lookup(namesRef) as any;
      if (namesDict.has(PDFName.of('JavaScript'))) {
        namesDict.delete(PDFName.of('JavaScript'));
      }
    } catch (e: any) {
      console.warn('Could not access Names/JavaScript:', e.message);
    }
  }

  // Actions triggered when the document is opened or on document events.
  if (catalogDict.has(PDFName.of('OpenAction'))) {
    catalogDict.delete(PDFName.of('OpenAction'));
  }

  if (catalogDict.has(PDFName.of('AA'))) {
    catalogDict.delete(PDFName.of('AA'));
  }

  const pages = pdfDoc.getPages();
  for (const page of pages) {
    try {
      const pageDict = page.node;

      if (pageDict.has(PDFName.of('AA'))) {
        pageDict.delete(PDFName.of('AA'));
      }

      const annotRefs = pageDict.Annots()?.asArray() || [];
      for (const annotRef of annotRefs) {
        try {
          const annot = pdfDoc.context.lookup(annotRef) as any;

          if (annot.has(PDFName.of('A'))) {
            const actionRef = annot.get(PDFName.of('A'));
            try {
              const actionDict = pdfDoc.context.lookup(actionRef) as any;
              // PDF names stringify with a leading '/', which is dropped here.
              const actionType = actionDict
                .get(PDFName.of('S'))
                ?.toString()
                .substring(1);

              // Non-script actions on annotations are left in place.
              if (actionType === 'JavaScript') {
                annot.delete(PDFName.of('A'));
              }
            } catch (e: any) {
              console.warn('Could not read action:', e.message);
            }
          }

          if (annot.has(PDFName.of('AA'))) {
            annot.delete(PDFName.of('AA'));
          }
        } catch (e: any) {
          console.warn('Could not process annotation for JS:', e.message);
        }
      }
    } catch (e: any) {
      console.warn('Could not remove page actions:', e.message);
    }
  }

  try {
    const acroFormRef = catalogDict.get(PDFName.of('AcroForm'));
    if (acroFormRef) {
      const acroFormDict = pdfDoc.context.lookup(acroFormRef) as any;
      const fieldsRef = acroFormDict.get(PDFName.of('Fields'));

      if (fieldsRef) {
        const fieldsArray = pdfDoc.context.lookup(fieldsRef) as any;
        // Walk the full field hierarchy, not just the root entries.
        stripActionsFromFieldTree(pdfDoc, fieldsArray.asArray());
      }
    }
  } catch (e: any) {
    console.warn('Could not process form fields for JS:', e.message);
  }
}
|
|
/**
 * Removes file attachments from a document.
 *
 * In order: deletes `EmbeddedFiles` from the catalog's `Names` dictionary and
 * from the catalog itself, drops every page annotation whose subtype is
 * `FileAttachment`, resets the document's `embeddedFiles` list when it is
 * non-empty, and deletes the catalog's `Collection` entry.
 *
 * Annotations that cannot be read are kept. Per-page and name-tree failures
 * are logged with `console.warn` and do not abort the pass.
 *
 * @param pdfDoc Document to modify in place.
 */
function removeEmbeddedFilesFromDoc(pdfDoc: PDFDocument) {
  const rawCatalog = (pdfDoc.catalog as any).dict;

  const namesEntry = rawCatalog.get(PDFName.of('Names'));
  if (namesEntry) {
    try {
      const nameTrees = pdfDoc.context.lookup(namesEntry) as any;
      if (nameTrees.has(PDFName.of('EmbeddedFiles'))) {
        nameTrees.delete(PDFName.of('EmbeddedFiles'));
      }
    } catch (err: any) {
      console.warn('Could not access Names/EmbeddedFiles:', err.message);
    }
  }

  if (rawCatalog.has(PDFName.of('EmbeddedFiles'))) {
    rawCatalog.delete(PDFName.of('EmbeddedFiles'));
  }

  // True only when the annotation can be read and is a FileAttachment;
  // unreadable annotations report false so that they are retained.
  const isFileAttachment = (ref: any): boolean => {
    try {
      const annot = pdfDoc.context.lookup(ref) as any;
      const subtypeName = annot.get(PDFName.of('Subtype'));
      return subtypeName?.toString().substring(1) === 'FileAttachment';
    } catch (_err) {
      return false;
    }
  };

  pdfDoc.getPages().forEach((pdfPage) => {
    try {
      const currentRefs = pdfPage.node.Annots()?.asArray() || [];
      const retained = currentRefs.filter(
        (ref: any) => !isFileAttachment(ref)
      );

      // Nothing was filtered out: leave the page's array untouched.
      if (retained.length === currentRefs.length) {
        return;
      }

      if (retained.length === 0) {
        pdfPage.node.delete(PDFName.of('Annots'));
      } else {
        const replacement = pdfDoc.context.obj(retained);
        pdfPage.node.set(PDFName.of('Annots'), replacement);
      }
    } catch (pageError: any) {
      console.warn(
        `Could not process page for attachments: ${pageError.message}`
      );
    }
  });

  const pendingFiles = (pdfDoc as any).embeddedFiles;
  if (pendingFiles && pendingFiles.length > 0) {
    (pdfDoc as any).embeddedFiles = [];
  }

  if (rawCatalog.has(PDFName.of('Collection'))) {
    rawCatalog.delete(PDFName.of('Collection'));
  }
}
|
|
/**
 * Removes optional-content (layer) bookkeeping from a document.
 *
 * Deletes `OCProperties` from the catalog and from each page dictionary, and
 * deletes the `Properties` entry of each page's own `Resources` dictionary.
 * Per-page failures are logged with `console.warn` and skipped.
 *
 * @param pdfDoc Document to modify in place.
 */
function removeLayersFromDoc(pdfDoc: PDFDocument) {
  const rawCatalog = (pdfDoc.catalog as any).dict;
  if (rawCatalog.has(PDFName.of('OCProperties'))) {
    rawCatalog.delete(PDFName.of('OCProperties'));
  }

  pdfDoc.getPages().forEach((pdfPage) => {
    try {
      const node = pdfPage.node;

      if (node.has(PDFName.of('OCProperties'))) {
        node.delete(PDFName.of('OCProperties'));
      }

      const resourcesEntry = node.get(PDFName.of('Resources'));
      if (!resourcesEntry) {
        return;
      }

      try {
        const resources = pdfDoc.context.lookup(resourcesEntry) as any;
        if (resources.has(PDFName.of('Properties'))) {
          resources.delete(PDFName.of('Properties'));
        }
      } catch (err: any) {
        console.warn('Could not access Resources:', err.message);
      }
    } catch (err: any) {
      console.warn('Could not remove page layers:', err.message);
    }
  });
}
|
|
/**
 * Removes links and named destinations from a document.
 *
 * For every page, an annotation is dropped when any of these holds:
 *  - its subtype is `Link`;
 *  - its `A` action has type `URI`, `Launch`, `GoTo` or `GoToR`;
 *  - it has a `Dest` entry.
 * A page's `Annots` array is rewritten (or deleted when nothing remains) only
 * if at least one annotation was dropped. Annotations that cannot be read are
 * kept. Afterwards `Dests` is deleted from the catalog's `Names` dictionary
 * and from the catalog itself.
 *
 * Failures are logged with `console.warn` and do not abort the pass.
 *
 * @param pdfDoc Document to modify in place.
 */
function removeLinksFromDoc(pdfDoc: PDFDocument) {
  const navigationActionTypes = ['URI', 'Launch', 'GoTo', 'GoToR'];
  const allPages = pdfDoc.getPages();

  for (const [pageIndex, currentPage] of allPages.entries()) {
    try {
      const node = currentPage.node;

      const annotsEntry = node.get(PDFName.of('Annots'));
      if (!annotsEntry) continue;

      const refs = (pdfDoc.context.lookup(annotsEntry) as any).asArray();
      if (refs.length === 0) continue;

      const survivors = [];
      let removedCount = 0;

      for (const ref of refs) {
        try {
          const annot = pdfDoc.context.lookup(ref) as any;
          const subtypeName = annot.get(PDFName.of('Subtype'));
          const subtype = subtypeName?.toString().substring(1);

          let drop = subtype === 'Link';

          if (drop) {
            removedCount += 1;
          } else {
            // Non-link annotation: check for a navigation-style action first.
            const actionEntry = annot.get(PDFName.of('A'));
            if (actionEntry) {
              try {
                const action = pdfDoc.context.lookup(actionEntry) as any;
                const actionKind = action
                  .get(PDFName.of('S'))
                  ?.toString()
                  .substring(1);

                if (navigationActionTypes.includes(actionKind)) {
                  drop = true;
                  removedCount += 1;
                }
              } catch (err: any) {
                console.warn('Could not read action:', err.message);
              }
            }

            // A direct destination also counts, but only once per annotation.
            const destination = annot.get(PDFName.of('Dest'));
            if (destination && !drop) {
              drop = true;
              removedCount += 1;
            }
          }

          if (!drop) {
            survivors.push(ref);
          }
        } catch (err: any) {
          console.warn('Could not process annotation:', err.message);
          survivors.push(ref);
        }
      }

      if (removedCount > 0) {
        if (survivors.length === 0) {
          node.delete(PDFName.of('Annots'));
        } else {
          const replacement = pdfDoc.context.obj(survivors);
          node.set(PDFName.of('Annots'), replacement);
        }
      }
    } catch (pageError: any) {
      console.warn(
        `Could not process page ${pageIndex + 1} for links: ${pageError.message}`
      );
    }
  }

  try {
    const rawCatalog = (pdfDoc.catalog as any).dict;

    const namesEntry = rawCatalog.get(PDFName.of('Names'));
    if (namesEntry) {
      try {
        const nameTrees = pdfDoc.context.lookup(namesEntry) as any;
        if (nameTrees.has(PDFName.of('Dests'))) {
          nameTrees.delete(PDFName.of('Dests'));
        }
      } catch (err: any) {
        console.warn('Could not access Names/Dests:', err.message);
      }
    }

    if (rawCatalog.has(PDFName.of('Dests'))) {
      rawCatalog.delete(PDFName.of('Dests'));
    }
  } catch (err: any) {
    console.warn('Could not remove named destinations:', err.message);
  }
}
|
|
/**
 * Removes logical-structure entries from a document.
 *
 * Deletes `StructTreeRoot` from the catalog, `StructParents` from every page
 * and finally `ParentTree` from the catalog. A failure on one page is logged
 * with `console.warn` and the other pages are still processed.
 *
 * @param pdfDoc Document to modify in place.
 */
function removeStructureTreeFromDoc(pdfDoc: PDFDocument) {
  const rawCatalog = (pdfDoc.catalog as any).dict;

  // Deletes a catalog entry only when it is present.
  const dropFromCatalog = (entryName: string): void => {
    const entryKey = PDFName.of(entryName);
    if (rawCatalog.has(entryKey)) {
      rawCatalog.delete(entryKey);
    }
  };

  dropFromCatalog('StructTreeRoot');

  pdfDoc.getPages().forEach((pdfPage) => {
    try {
      const node = pdfPage.node;
      if (node.has(PDFName.of('StructParents'))) {
        node.delete(PDFName.of('StructParents'));
      }
    } catch (err: any) {
      console.warn('Could not remove page StructParents:', err.message);
    }
  });

  dropFromCatalog('ParentTree');
}
|
|
/**
 * Deletes the `MarkInfo` and `Marked` entries from the document catalog when
 * they are present.
 *
 * @param pdfDoc Document to modify in place.
 */
function removeMarkInfoFromDoc(pdfDoc: PDFDocument) {
  const rawCatalog = (pdfDoc.catalog as any).dict;

  for (const entryName of ['MarkInfo', 'Marked']) {
    const entryKey = PDFName.of(entryName);
    if (rawCatalog.has(entryKey)) {
      rawCatalog.delete(entryKey);
    }
  }
}
|
|
/**
 * Deletes the embedded font program entries (`FontFile`, `FontFile2`,
 * `FontFile3`) from the `FontDescriptor` of a single font dictionary.
 * Does nothing when the font has no descriptor.
 */
function dropFontProgramsFromFont(pdfDoc: PDFDocument, fontDict: any): void {
  if (!fontDict.has(PDFName.of('FontDescriptor'))) {
    return;
  }

  const descriptorRef = fontDict.get(PDFName.of('FontDescriptor'));
  const descriptor = pdfDoc.context.lookup(descriptorRef) as any;

  for (const key of ['FontFile', 'FontFile2', 'FontFile3']) {
    if (descriptor.has(PDFName.of(key))) {
      descriptor.delete(PDFName.of(key));
    }
  }
}

/**
 * Removes embedded font programs from the fonts referenced by each page.
 *
 * For every font listed in a page's own `Resources/Font` dictionary, the
 * font-file entries are deleted from its descriptor. Composite (Type0) fonts
 * keep their descriptor on the entries of `DescendantFonts` rather than on
 * the top-level font dictionary, so those descendants are processed as well;
 * without that step the embedded programs of composite fonts would survive.
 * Finally the document's `fonts` list is reset when it is non-empty.
 *
 * Every level is best-effort: failures are logged with `console.warn` and the
 * next font or page is processed.
 *
 * @param pdfDoc Document to modify in place.
 */
function removeFontsFromDoc(pdfDoc: PDFDocument) {
  const pages = pdfDoc.getPages();

  for (let pageIndex = 0; pageIndex < pages.length; pageIndex++) {
    try {
      const pageDict = pages[pageIndex].node;
      const resourcesRef = pageDict.get(PDFName.of('Resources'));
      if (!resourcesRef) continue;

      try {
        const resourcesDict = pdfDoc.context.lookup(resourcesRef) as any;
        if (!resourcesDict.has(PDFName.of('Font'))) continue;

        const fontRef = resourcesDict.get(PDFName.of('Font'));

        try {
          const fontDict = pdfDoc.context.lookup(fontRef) as any;

          for (const fontKey of fontDict.keys()) {
            try {
              const specificFont = pdfDoc.context.lookup(
                fontDict.get(fontKey)
              ) as any;

              dropFontProgramsFromFont(pdfDoc, specificFont);

              // Composite fonts: the descriptors live on the descendants.
              const descendantsRef = specificFont.get(
                PDFName.of('DescendantFonts')
              );
              if (descendantsRef) {
                const descendants = pdfDoc.context.lookup(
                  descendantsRef
                ) as any;
                for (const descendantRef of descendants.asArray()) {
                  const descendant = pdfDoc.context.lookup(
                    descendantRef
                  ) as any;
                  dropFontProgramsFromFont(pdfDoc, descendant);
                }
              }
            } catch (e: any) {
              console.warn(`Could not process font ${fontKey}:`, e.message);
            }
          }
        } catch (e: any) {
          console.warn('Could not access font dictionary:', e.message);
        }
      } catch (e: any) {
        console.warn('Could not access Resources for fonts:', e.message);
      }
    } catch (e: any) {
      console.warn(
        `Could not remove fonts from page ${pageIndex + 1}:`,
        e.message
      );
    }
  }

  // NOTE(review): `fonts` is an untyped internal of PDFDocument — presumably
  // the fonts queued for embedding on save; confirm against pdf-lib.
  if ((pdfDoc as any).fonts && (pdfDoc as any).fonts.length > 0) {
    (pdfDoc as any).fonts = [];
  }
}
|
|
/**
 * Loads a PDF, applies the selected sanitization passes and serializes it.
 *
 * Passes run in a fixed order: flatten forms, metadata, annotations,
 * JavaScript, embedded files, layers, links, structure tree, mark info,
 * fonts. All passes except metadata and annotation removal are best-effort:
 * a failure is logged with `console.warn` and the next pass still runs.
 * If flattening fails, the catalog's `AcroForm` entry is deleted instead.
 *
 * @param pdfBytes Raw bytes of the PDF to sanitize.
 * @param options  Which passes to run. Defaults to `defaultSanitizeOptions`.
 * @returns The modified document together with a fresh copy of its saved bytes.
 * @throws If the document cannot be loaded or saved, or if the metadata or
 *         annotation pass throws (these two are not guarded).
 */
export async function sanitizePdf(
  pdfBytes: Uint8Array,
  options: SanitizeOptions = defaultSanitizeOptions
): Promise<{ pdfDoc: PDFDocument; bytes: Uint8Array }> {
  const pdfDoc = await PDFDocument.load(pdfBytes);

  // Runs one best-effort pass; a failure is logged as "<label>: <message>".
  const attempt = (
    failureLabel: string,
    pass: (doc: PDFDocument) => void
  ): void => {
    try {
      pass(pdfDoc);
    } catch (e: any) {
      console.warn(`${failureLabel}: ${e.message}`);
    }
  };

  // Set when flattening threw and the form was dropped from the catalog.
  let formIsBroken = false;

  if (options.flattenForms) {
    try {
      flattenFormsInDoc(pdfDoc);
    } catch (e: any) {
      formIsBroken = true;
      console.warn(`Could not flatten forms: ${e.message}`);
      try {
        const catalogDict = (pdfDoc.catalog as any).dict;
        if (catalogDict.has(PDFName.of('AcroForm'))) {
          catalogDict.delete(PDFName.of('AcroForm'));
        }
      } catch (removeError: any) {
        console.warn('Could not remove AcroForm:', removeError.message);
      }
    }
  }

  // These two passes are intentionally not wrapped: their errors propagate.
  if (options.removeMetadata) {
    removeMetadataFromDoc(pdfDoc);
  }

  if (options.removeAnnotations) {
    removeAnnotationsFromDoc(pdfDoc);
  }

  if (options.removeJavascript) {
    attempt('Could not remove JavaScript', removeJavascriptFromDoc);
  }

  if (options.removeEmbeddedFiles) {
    attempt('Could not remove embedded files', removeEmbeddedFilesFromDoc);
  }

  if (options.removeLayers) {
    attempt('Could not remove layers', removeLayersFromDoc);
  }

  if (options.removeLinks) {
    attempt('Could not remove links', removeLinksFromDoc);
  }

  if (options.removeStructureTree) {
    attempt('Could not remove structure tree', removeStructureTreeFromDoc);
  }

  if (options.removeMarkInfo) {
    attempt('Could not remove MarkInfo', removeMarkInfoFromDoc);
  }

  if (options.removeFonts) {
    attempt('Could not remove fonts', removeFontsFromDoc);
  }

  // By default `save()` refreshes field appearances on an already-accessed
  // form. When flattening failed, that form is known to be unusable and has
  // been dropped from the catalog, so the refresh is skipped to avoid hitting
  // the same error again during save. In every other case the default
  // behaviour (`true`) is kept.
  const savedBytes = await pdfDoc.save({
    updateFieldAppearances: !formIsBroken,
  });
  return { pdfDoc, bytes: new Uint8Array(savedBytes) };
}
|
|