|
|
|
|
|
|
|
|
|
|
|
import { ZipReader, BlobReader, BlobWriter, type Entry } from "@zip.js/zip.js"; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * One entry pulled out of the zip archive that backs an Office document.
 * (Named in the plural by the original author although it describes a
 * single file; kept for compatibility with existing references.)
 */
interface ExtractedFiles {
  /** Path of the entry inside the archive, e.g. "word/document.xml". */
  filename: string;

  /** Uncompressed contents of the entry. */
  data: Blob;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/** Options controlling how an Office document is parsed. */
interface OfficeParserConfig {
  /**
   * "text" extracts plain text; "file" returns the extracted XML parts
   * merged into a single File instead.
   */
  type: "text" | "file";

  /** When true, errors are logged with console.error before being thrown. */
  outputErrorToConsole: boolean;

  /**
   * Delimiter used to join extracted text fragments (paragraphs, slides,
   * cells). Code falls back to "\n" wherever this is undefined.
   */
  newlineDelimiter: string;

  /** PowerPoint/OpenOffice only: skip notes content entirely. */
  ignoreNotes: boolean;

  /**
   * PowerPoint/OpenOffice only: collect notes text and append it after the
   * main content instead of keeping it inline.
   */
  putNotesAtLast: boolean;
}
|
|
|
|
|
|
|
|
const ERRORHEADER = "[OfficeParser]: "; |
|
|
|
|
|
const ERRORMSG = { |
|
|
extensionUnsupported: (ext: string) => |
|
|
`Sorry, OfficeParser currently support docx, pptx, xlsx, odt, odp, ods files only. Create a ticket in Issues on github to add support for ${ext} files. Stay tuned for further updates.`, |
|
|
fileCorrupted: (filepath: string) => |
|
|
`Your file ${filepath} seems to be corrupted. If you are sure it is fine, please create a ticket in Issues on github with the file to reproduce error.`, |
|
|
}; |
|
|
|
|
|
function handleError(error: string, outputErrorToConsole = false) { |
|
|
if (error && outputErrorToConsole) console.error(ERRORHEADER + error); |
|
|
throw new Error(ERRORHEADER + error); |
|
|
} |
|
|
|
|
|
const officeFileTypes = { |
|
|
docx: "application/vnd.openxmlformats-officedocument.wordprocessingml.document", |
|
|
pptx: "application/vnd.openxmlformats-officedocument.presentationml.presentation", |
|
|
xlsx: "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", |
|
|
odt: "application/vnd.oasis.opendocument.text", |
|
|
odp: "application/vnd.oasis.opendocument.presentation", |
|
|
ods: "application/vnd.oasis.opendocument.spreadsheet", |
|
|
}; |
|
|
|
|
|
function parseXMLString(xml: string) { |
|
|
const parser = new DOMParser(); |
|
|
return parser.parseFromString(xml, "text/xml"); |
|
|
} |
|
|
|
|
|
function createMergedXmlDocument(): Document { |
|
|
return document.implementation.createDocument(null, "root"); |
|
|
} |
|
|
|
|
|
function mergeXmlDocuments(mergedDoc: Document, xmlDocs: Document[]): void { |
|
|
const mergedRoot = mergedDoc.documentElement; |
|
|
xmlDocs.forEach((xmlDoc) => { |
|
|
const root = xmlDoc.documentElement; |
|
|
if (root) { |
|
|
Array.from(root.childNodes).forEach((node) => { |
|
|
mergedRoot.appendChild(mergedDoc.importNode(node, true)); |
|
|
}); |
|
|
} |
|
|
}); |
|
|
} |
|
|
|
|
|
function serializeXmlDocument(xmlDoc: Document): string { |
|
|
const serializer = new XMLSerializer(); |
|
|
return serializer.serializeToString(xmlDoc); |
|
|
} |
|
|
|
|
|
function stringToBlob(xmlString: string): Blob { |
|
|
return new Blob([xmlString], { type: "text/xml" }); |
|
|
} |
|
|
|
|
|
function blobToFile(blob: Blob, fileName: string): File { |
|
|
return new File([blob], fileName, { type: blob.type }); |
|
|
} |
|
|
|
|
|
async function mergeXmlBlobs(blobs: Blob[]): Promise<Blob> { |
|
|
const xmlStrings = await Promise.all(blobs.map((blob) => blob.text())); |
|
|
const xmlDocs = xmlStrings.map((xmlString) => parseXMLString(xmlString)); |
|
|
|
|
|
const mergedDoc = createMergedXmlDocument(); |
|
|
mergeXmlDocuments(mergedDoc, xmlDocs); |
|
|
|
|
|
const mergedXmlString = serializeXmlDocument(mergedDoc); |
|
|
return stringToBlob(mergedXmlString); |
|
|
} |
|
|
|
|
|
function extractFiles( |
|
|
zipInput: File, |
|
|
filterFn: (filename: string) => boolean |
|
|
): Promise<ExtractedFiles[]> { |
|
|
return new Promise(async (resolve, reject) => { |
|
|
const extractedFiles: ExtractedFiles[] = []; |
|
|
const processZipfile = async (entry: Entry) => { |
|
|
if (filterFn(entry.filename)) { |
|
|
if (entry.getData) { |
|
|
const data = await entry.getData(new BlobWriter()); |
|
|
extractedFiles.push({ |
|
|
filename: entry.filename, |
|
|
data, |
|
|
}); |
|
|
} |
|
|
} |
|
|
}; |
|
|
|
|
|
try { |
|
|
|
|
|
const entrys = await new ZipReader(new BlobReader(zipInput)).getEntries(); |
|
|
for (const entry of entrys) { |
|
|
await processZipfile(entry); |
|
|
} |
|
|
resolve(extractedFiles); |
|
|
} catch (err) { |
|
|
reject(err); |
|
|
} |
|
|
}); |
|
|
} |
|
|
|
|
|
export function parseWord( |
|
|
file: File, |
|
|
config: Partial<OfficeParserConfig> |
|
|
): Promise<string | File> { |
|
|
|
|
|
const mainContentFileRegex = /word\/document[\d+]?.xml/g; |
|
|
const footnotesFileRegex = /word\/footnotes[\d+]?.xml/g; |
|
|
const endnotesFileRegex = /word\/endnotes[\d+]?.xml/g; |
|
|
|
|
|
return new Promise((resolve, reject) => { |
|
|
extractFiles(file, (x) => |
|
|
[mainContentFileRegex, footnotesFileRegex, endnotesFileRegex].some( |
|
|
(fileRegex) => x.match(fileRegex) |
|
|
) |
|
|
) |
|
|
.then((files: ExtractedFiles[]) => { |
|
|
|
|
|
if (!files.some((file) => file.filename.match(mainContentFileRegex))) |
|
|
handleError( |
|
|
ERRORMSG.fileCorrupted(file.name), |
|
|
config.outputErrorToConsole |
|
|
); |
|
|
|
|
|
return files.filter( |
|
|
(file) => |
|
|
file.filename.match(mainContentFileRegex) || |
|
|
file.filename.match(footnotesFileRegex) || |
|
|
file.filename.match(endnotesFileRegex) |
|
|
); |
|
|
}) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
.then(async (files: ExtractedFiles[]) => { |
|
|
if (config.type === "file") { |
|
|
const mergedBlob = await mergeXmlBlobs( |
|
|
files.map((item) => item.data) |
|
|
); |
|
|
return resolve(blobToFile(mergedBlob, file.name)); |
|
|
} |
|
|
|
|
|
|
|
|
const responseText: string[] = []; |
|
|
|
|
|
const xmlContentArray: string[] = []; |
|
|
for await (const file of files) { |
|
|
xmlContentArray.push(await file.data.text()); |
|
|
} |
|
|
|
|
|
xmlContentArray.forEach((xmlContent) => { |
|
|
|
|
|
const xmlParagraphNodesList = |
|
|
parseXMLString(xmlContent).getElementsByTagName("w:p"); |
|
|
|
|
|
responseText.push( |
|
|
Array.from(xmlParagraphNodesList) |
|
|
|
|
|
.filter( |
|
|
(paragraphNode) => |
|
|
paragraphNode.getElementsByTagName("w:t").length != 0 |
|
|
) |
|
|
.map((paragraphNode) => { |
|
|
|
|
|
const xmlTextNodeList = |
|
|
paragraphNode.getElementsByTagName("w:t"); |
|
|
|
|
|
return Array.from(xmlTextNodeList) |
|
|
.filter( |
|
|
(textNode) => |
|
|
textNode.childNodes[0] && textNode.childNodes[0].nodeValue |
|
|
) |
|
|
.map((textNode) => textNode.childNodes[0].nodeValue) |
|
|
.join(""); |
|
|
}) |
|
|
|
|
|
.join(config.newlineDelimiter ?? "\n") |
|
|
); |
|
|
}); |
|
|
|
|
|
resolve(responseText.join(config.newlineDelimiter ?? "\n")); |
|
|
}) |
|
|
.catch(reject); |
|
|
}); |
|
|
} |
|
|
|
|
|
export function parsePowerPoint( |
|
|
file: File, |
|
|
config: Partial<OfficeParserConfig> |
|
|
): Promise<string | File> { |
|
|
|
|
|
const allFilesRegex = /ppt\/(notesSlides|slides)\/(notesSlide|slide)\d+.xml/g; |
|
|
const slidesRegex = /ppt\/slides\/slide\d+.xml/g; |
|
|
const slideNumberRegex = /lide(\d+)\.xml/; |
|
|
|
|
|
return new Promise((resolve, reject) => { |
|
|
extractFiles( |
|
|
file, |
|
|
(x) => !!x.match(config.ignoreNotes ? slidesRegex : allFilesRegex) |
|
|
) |
|
|
.then((files: ExtractedFiles[]) => { |
|
|
|
|
|
files.sort((a, b) => { |
|
|
const matchedANumber = parseInt( |
|
|
a.filename.match(slideNumberRegex)?.at(1) || "0", |
|
|
10 |
|
|
); |
|
|
const matchedBNumber = parseInt( |
|
|
b.filename.match(slideNumberRegex)?.at(1) || "0", |
|
|
10 |
|
|
); |
|
|
|
|
|
const aNumber = isNaN(matchedANumber) ? Infinity : matchedANumber; |
|
|
const bNumber = isNaN(matchedBNumber) ? Infinity : matchedBNumber; |
|
|
|
|
|
return ( |
|
|
aNumber - bNumber || |
|
|
Number(a.filename.includes("notes")) - |
|
|
Number(b.filename.includes("notes")) |
|
|
); |
|
|
}); |
|
|
|
|
|
|
|
|
if ( |
|
|
files.length == 0 || |
|
|
!files |
|
|
.map((file) => file.filename) |
|
|
.some((filename) => filename.match(slidesRegex)) |
|
|
) |
|
|
handleError( |
|
|
ERRORMSG.fileCorrupted(file.name), |
|
|
config.outputErrorToConsole |
|
|
); |
|
|
|
|
|
|
|
|
if (!config.ignoreNotes && config.putNotesAtLast) |
|
|
|
|
|
|
|
|
files.sort( |
|
|
(a, b) => a.filename.indexOf("notes") - b.filename.indexOf("notes") |
|
|
); |
|
|
|
|
|
|
|
|
return files; |
|
|
}) |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
.then(async (files: ExtractedFiles[]) => { |
|
|
if (config.type === "file") { |
|
|
const mergedBlob = await mergeXmlBlobs( |
|
|
files.map((item) => item.data) |
|
|
); |
|
|
return resolve(blobToFile(mergedBlob, file.name)); |
|
|
} |
|
|
|
|
|
|
|
|
const responseText: string[] = []; |
|
|
|
|
|
const xmlContentArray: string[] = []; |
|
|
for await (const file of files) { |
|
|
xmlContentArray.push(await file.data.text()); |
|
|
} |
|
|
xmlContentArray.forEach((xmlContent) => { |
|
|
|
|
|
const xmlParagraphNodesList = |
|
|
parseXMLString(xmlContent).getElementsByTagName("a:p"); |
|
|
|
|
|
responseText.push( |
|
|
Array.from(xmlParagraphNodesList) |
|
|
|
|
|
.filter( |
|
|
(paragraphNode) => |
|
|
paragraphNode.getElementsByTagName("a:t").length != 0 |
|
|
) |
|
|
.map((paragraphNode) => { |
|
|
|
|
|
const xmlTextNodeList = |
|
|
paragraphNode.getElementsByTagName("a:t"); |
|
|
return Array.from(xmlTextNodeList) |
|
|
.filter( |
|
|
(textNode) => |
|
|
textNode.childNodes[0] && textNode.childNodes[0].nodeValue |
|
|
) |
|
|
.map((textNode) => textNode.childNodes[0].nodeValue) |
|
|
.join(""); |
|
|
}) |
|
|
.join(config.newlineDelimiter ?? "\n") |
|
|
); |
|
|
}); |
|
|
|
|
|
|
|
|
resolve(responseText.join(config.newlineDelimiter ?? "\n")); |
|
|
}) |
|
|
.catch(reject); |
|
|
}); |
|
|
} |
|
|
|
|
|
/**
 * Parses a .xlsx workbook.
 *
 * Extracts sheet, drawing and chart XML parts plus the shared-strings part,
 * then either returns the extracted cell/drawing/chart text joined with
 * `config.newlineDelimiter` (default "\n"), or — when `config.type` is
 * "file" — a single File containing all extracted XML merged together.
 * Rejects (via handleError) if the archive has no sheet parts.
 */
export function parseExcel(
  file: File,
  config: Partial<OfficeParserConfig>
): Promise<string | File> {
  // Part-name patterns inside the xlsx zip container.
  // NOTE(review): the "." before "xml" is unescaped, so it matches any
  // character there; harmless for well-formed archives but worth confirming.
  const sheetsRegex = /xl\/worksheets\/sheet\d+.xml/g;
  const drawingsRegex = /xl\/drawings\/drawing\d+.xml/g;
  const chartsRegex = /xl\/charts\/chart\d+.xml/g;
  const stringsFilePath = "xl/sharedStrings.xml";

  return new Promise((resolve, reject) => {
    // Pull only the parts we care about out of the zip.
    extractFiles(
      file,
      (x) =>
        [sheetsRegex, drawingsRegex, chartsRegex].some((fileRegex) =>
          x.match(fileRegex)
        ) || x == stringsFilePath
    )
      .then((files: ExtractedFiles[]) => {
        // A workbook without any sheet part is unusable: throw -> .catch(reject).
        if (
          files.length == 0 ||
          !files
            .map((file) => file.filename)
            .some((filename) => filename.match(sheetsRegex))
        )
          handleError(
            ERRORMSG.fileCorrupted(file.name),
            config.outputErrorToConsole
          );

        // Bucket the extracted parts by kind for the processing step below.
        return {
          sheetFiles: files.filter((file) => file.filename.match(sheetsRegex)),
          drawingFiles: files.filter((file) =>
            file.filename.match(drawingsRegex)
          ),
          chartFiles: files.filter((file) => file.filename.match(chartsRegex)),
          // May be undefined when the workbook has no shared strings part.
          sharedStringsFile: files.filter(
            (file) => file.filename == stringsFilePath
          )[0],
        };
      })
      .then(async (xmlContentFilesObject) => {
        // "file" mode: flatten all buckets and return one merged XML File.
        if (config.type === "file") {
          const files: ExtractedFiles[] = [];
          for (const fileOrFiles of Object.values(xmlContentFilesObject)) {
            if (Array.isArray(fileOrFiles)) {
              fileOrFiles.forEach((file) => files.push(file));
            } else {
              files.push(fileOrFiles);
            }
          }
          const mergedBlob = await mergeXmlBlobs(
            files.map((item) => item.data)
          );
          return resolve(blobToFile(mergedBlob, file.name));
        }

        // "text" mode: one joined string per sheet/drawing/chart part.
        const responseText: string[] = [];

        // True when `cNode` is a <c t="inlineStr"> cell holding exactly one
        // <is> with exactly one non-empty <t> text element.
        function isValidInlineStringCNode(cNode: Element) {
          if (cNode.tagName.toLowerCase() != "c") return false;
          if (cNode.getAttribute("t") != "inlineStr") return false;
          const childNodesNamedIs = cNode.getElementsByTagName("is");
          if (childNodesNamedIs.length != 1) return false;
          const childNodesNamedT =
            childNodesNamedIs[0].getElementsByTagName("t");
          if (childNodesNamedT.length != 1) return false;
          return (
            childNodesNamedT[0].childNodes[0] &&
            childNodesNamedT[0].childNodes[0].nodeValue != ""
          );
        }

        // True when the cell has a <v> value element with non-empty content.
        function hasValidVNodeInCNode(cNode: Element) {
          return (
            cNode.getElementsByTagName("v")[0] &&
            cNode.getElementsByTagName("v")[0].childNodes[0] &&
            cNode.getElementsByTagName("v")[0].childNodes[0].nodeValue != ""
          );
        }

        // The shared-strings table: all <t> elements, in document order, so
        // a cell's numeric index maps straight into this array.
        const sharedStringsXmlTNodesList =
          xmlContentFilesObject.sharedStringsFile != undefined
            ? parseXMLString(
                await xmlContentFilesObject.sharedStringsFile.data.text()
              ).getElementsByTagName("t")
            : [];

        const sharedStrings = Array.from(sharedStringsXmlTNodesList).map(
          (tNode) => tNode.childNodes[0]?.nodeValue ?? ""
        );

        // Sheets: emit each cell's text in document order (cell coordinates
        // are not preserved), joined by the configured delimiter.
        for await (const sheetFile of xmlContentFilesObject.sheetFiles) {
          const sheetXmlContent = await sheetFile.data.text();

          const sheetsXmlCNodesList =
            parseXMLString(sheetXmlContent).getElementsByTagName("c");

          responseText.push(
            Array.from(sheetsXmlCNodesList)
              .filter(
                (cNode) =>
                  isValidInlineStringCNode(cNode) || hasValidVNodeInCNode(cNode)
              )
              .map((cNode) => {
                // Inline string cell: take the single <is><t> text directly.
                if (isValidInlineStringCNode(cNode))
                  return cNode
                    .getElementsByTagName("is")[0]
                    .getElementsByTagName("t")[0].childNodes[0].nodeValue;

                if (hasValidVNodeInCNode(cNode)) {
                  // t="s" means <v> holds an index into sharedStrings.
                  const isIndexInSharedStrings = cNode.getAttribute("t") == "s";

                  const cNodeValue =
                    cNode.getElementsByTagName("v")[0].childNodes[0].nodeValue;
                  if (cNodeValue) {
                    const value = parseInt(cNodeValue, 10);

                    // An out-of-range shared-string index means corruption.
                    if (isIndexInSharedStrings && value >= sharedStrings.length)
                      handleError(
                        ERRORMSG.fileCorrupted(file.name),
                        config.outputErrorToConsole
                      );

                    // Non-shared cells yield the parsed number itself
                    // (stringified implicitly by the join below).
                    return isIndexInSharedStrings
                      ? sharedStrings[value]
                      : value;
                  }
                }

                return "";
              })
              .join(config.newlineDelimiter ?? "\n")
          );
        }

        // Drawings: DrawingML paragraphs (<a:p>) with text runs (<a:t>).
        for await (const drawingFile of xmlContentFilesObject.drawingFiles) {
          const drawingXmlContent = await drawingFile.data.text();

          const drawingsXmlParagraphNodesList =
            parseXMLString(drawingXmlContent).getElementsByTagName("a:p");

          responseText.push(
            Array.from(drawingsXmlParagraphNodesList)
              .filter(
                (paragraphNode) =>
                  paragraphNode.getElementsByTagName("a:t").length != 0
              )
              .map((paragraphNode) => {
                const xmlTextNodeList =
                  paragraphNode.getElementsByTagName("a:t");
                return Array.from(xmlTextNodeList)
                  .filter(
                    (textNode) =>
                      textNode.childNodes[0] && textNode.childNodes[0].nodeValue
                  )
                  .map((textNode) => textNode.childNodes[0].nodeValue)
                  .join("");
              })
              .join(config.newlineDelimiter ?? "\n")
          );
        }

        // Charts: every non-empty <c:v> value node.
        for await (const chartFile of xmlContentFilesObject.chartFiles) {
          const chartXmlContent = await chartFile.data.text();

          const chartsXmlCVNodesList =
            parseXMLString(chartXmlContent).getElementsByTagName("c:v");

          responseText.push(
            Array.from(chartsXmlCVNodesList)
              .filter(
                (cVNode) =>
                  cVNode.childNodes[0] && cVNode.childNodes[0].nodeValue
              )
              .map((cVNode) => cVNode.childNodes[0].nodeValue)
              .join(config.newlineDelimiter ?? "\n")
          );
        }

        resolve(responseText.join(config.newlineDelimiter ?? "\n"));
      })
      .catch(reject);
  });
}
|
|
|
|
|
/**
 * Parses an OpenDocument file (.odt / .odp / .ods).
 *
 * Extracts the top-level content.xml plus any embedded-object content files
 * ("Object N/content.xml"), then either returns the extracted text joined
 * with `config.newlineDelimiter` (default "\n"), or — when `config.type` is
 * "file" — a single File of merged XML. Rejects (via handleError) if
 * content.xml is missing.
 */
export function parseOpenOffice(
  file: File,
  config: Partial<OfficeParserConfig>
): Promise<string | File> {
  const mainContentFilePath = "content.xml";
  // NOTE(review): the "." before "xml" is unescaped; harmless in practice.
  const objectContentFilesRegex = /Object \d+\/content.xml/g;

  return new Promise((resolve, reject) => {
    extractFiles(
      file,
      (x) => x == mainContentFilePath || !!x.match(objectContentFilesRegex)
    )
      .then((files) => {
        // content.xml is mandatory; throw -> .catch(reject) below.
        if (!files.map((file) => file.filename).includes(mainContentFilePath))
          handleError(
            ERRORMSG.fileCorrupted(file.name),
            config.outputErrorToConsole
          );

        return {
          mainContentFile: files.filter(
            (file) => file.filename == mainContentFilePath
          )[0],
          objectContentFiles: files.filter((file) =>
            file.filename.match(objectContentFilesRegex)
          ),
        };
      })
      .then(async (xmlContentFilesObject) => {
        // "file" mode: flatten main + object parts and return merged XML.
        if (config.type === "file") {
          const files: ExtractedFiles[] = [];
          for (const fileOrFiles of Object.values(xmlContentFilesObject)) {
            if (Array.isArray(fileOrFiles)) {
              fileOrFiles.forEach((file) => files.push(file));
            } else {
              files.push(fileOrFiles);
            }
          }
          const mergedBlob = await mergeXmlBlobs(
            files.map((item) => item.data)
          );
          return resolve(blobToFile(mergedBlob, file.name));
        }

        // Text collected from presentation notes (side channel filled by
        // traversal() when notes are deferred or ignored).
        const notesText: string[] = [];

        // Main extracted text; reassigned once when notes are appended.
        let responseText: string[] = [];

        // Elements whose text is extracted: paragraphs and headings.
        const allowedTextTags = ["text:p", "text:h"];

        // Ancestor tag marking presentation notes content.
        const notesTag = "presentation:notes";

        // Collects all descendant text of `root` into one string.
        function extractAllTextsFromNode(root: Element) {
          const xmlTextArray: string[] = [];
          for (let i = 0; i < root.childNodes.length; i++)
            traversal(root.childNodes[i], xmlTextArray, true);
          return xmlTextArray.join("");
        }

        // Depth-first walk. Leaf nodes whose parent's tag starts with
        // "text" contribute their nodeValue; notes content is diverted to
        // notesText when it should be deferred or dropped. A delimiter is
        // appended after a completed paragraph/heading, but not at the
        // first recursion level (avoids a trailing delimiter per root).
        function traversal(
          node: ChildNode,
          xmlTextArray: string[],
          isFirstRecursion: boolean
        ) {
          if (!node.childNodes || node.childNodes.length == 0) {
            if (
              node.parentNode?.nodeName.indexOf("text") == 0 &&
              node.nodeValue
            ) {
              if (
                isNotesNode(node.parentNode) &&
                (config.putNotesAtLast || config.ignoreNotes)
              ) {
                notesText.push(node.nodeValue);
                if (
                  allowedTextTags.includes(node.parentNode.nodeName) &&
                  !isFirstRecursion
                )
                  notesText.push(config.newlineDelimiter ?? "\n");
              } else {
                xmlTextArray.push(node.nodeValue);
                if (
                  allowedTextTags.includes(node.parentNode.nodeName) &&
                  !isFirstRecursion
                )
                  xmlTextArray.push(config.newlineDelimiter ?? "\n");
              }
            }
            return;
          }

          for (let i = 0; i < node.childNodes.length; i++)
            traversal(node.childNodes[i], xmlTextArray, false);
        }

        // True when `node` is, or is nested inside, a presentation:notes
        // element (walks up the parent chain).
        function isNotesNode(node: Node) {
          if (node.nodeName == notesTag) return true;
          if (node.parentNode) return isNotesNode(node.parentNode);
          return false;
        }

        // Despite the name, returns true when `node` OR any ancestor is
        // itself a paragraph/heading — used below (starting from the
        // parent) to skip text elements nested inside other text elements,
        // so each top-level paragraph is processed exactly once.
        function isInvalidTextNode(node: Node) {
          if (allowedTextTags.includes(node.nodeName)) return true;
          if (node.parentNode) return isInvalidTextNode(node.parentNode);
          return false;
        }

        // Read main content first, then each embedded object's content.
        const xmlContentArray: string[] = [];
        xmlContentArray.push(
          await xmlContentFilesObject.mainContentFile.data.text()
        );
        for await (const file of xmlContentFilesObject.objectContentFiles) {
          xmlContentArray.push(await file.data.text());
        }

        xmlContentArray.forEach((content) => {
          const xmlContent = parseXMLString(content);

          // Top-level paragraphs/headings only (no text tag in ancestry).
          const xmlTextNodesList = [
            ...Array.from(xmlContent.getElementsByTagName("*")).filter(
              (node) =>
                allowedTextTags.includes(node.tagName) &&
                !isInvalidTextNode(node.parentNode!)
            ),
          ];

          responseText.push(
            xmlTextNodesList
              .map((textNode) => extractAllTextsFromNode(textNode))
              .filter((text) => text != "")
              .join(config.newlineDelimiter ?? "\n")
          );
        });

        // Deferred notes go after the main content. (When ignoreNotes is
        // set, notesText was filled but is intentionally discarded here.)
        if (!config.ignoreNotes && config.putNotesAtLast)
          responseText = [...responseText, ...notesText];

        resolve(responseText.join(config.newlineDelimiter ?? "\n"));
      })
      .catch(reject);
  });
}
|
|
|
|
|
export function readTextFromOffice( |
|
|
file: File, |
|
|
config?: Partial<OfficeParserConfig> |
|
|
) { |
|
|
|
|
|
|
|
|
let internalConfig: OfficeParserConfig = { |
|
|
type: "text", |
|
|
ignoreNotes: false, |
|
|
newlineDelimiter: "\n", |
|
|
putNotesAtLast: false, |
|
|
outputErrorToConsole: false, |
|
|
}; |
|
|
|
|
|
if (config) internalConfig = { ...internalConfig, ...config }; |
|
|
|
|
|
|
|
|
switch (file.type) { |
|
|
case officeFileTypes.docx: |
|
|
return parseWord(file, internalConfig); |
|
|
case officeFileTypes.pptx: |
|
|
return parsePowerPoint(file, internalConfig); |
|
|
case officeFileTypes.xlsx: |
|
|
return parseExcel(file, internalConfig); |
|
|
case officeFileTypes.odt: |
|
|
case officeFileTypes.odp: |
|
|
case officeFileTypes.ods: |
|
|
return parseOpenOffice(file, internalConfig); |
|
|
default: |
|
|
return handleError( |
|
|
ERRORMSG.extensionUnsupported(file.type), |
|
|
internalConfig.outputErrorToConsole |
|
|
); |
|
|
} |
|
|
} |
|
|
|