/**
 * Shape describing a parsed document (name, extracted text, and type).
 * It is not referenced by the parser functions in this module, which all
 * resolve to the extracted plain text.
 */
export interface ParsedDocument {
  fileName: string;
  content: string;
  fileType: string;
}

/**
 * Worker script used when the host application has not configured
 * `GlobalWorkerOptions.workerSrc` itself.
 * NOTE(review): the version is pinned in the URL; pdf.js refuses to run when
 * the API and worker versions differ, so keep this in sync with the installed
 * `pdfjs-dist` version.
 */
const PDF_WORKER_SRC = 'https://esm.sh/pdfjs-dist@3.11.174/build/pdf.worker.min.js';

/**
 * Wraps the callback-based `FileReader` API in a promise.
 *
 * @param file - The file to read.
 * @param method - Which `FileReader` read method to start.
 * @returns The reader's `result` once loading completes.
 * @throws Rejects with `reader.error` (or a generic `Error` if the reader
 *   reports none) so callers always receive an object with a `message`.
 */
const readWithFileReader = (
  file: File,
  method: 'readAsText' | 'readAsArrayBuffer',
): Promise<string | ArrayBuffer | null> =>
  new Promise<string | ArrayBuffer | null>((resolve, reject) => {
    const reader = new FileReader();
    reader.onload = () => resolve(reader.result);
    reader.onerror = () => reject(reader.error ?? new Error('文件读取失败'));
    reader[method](file);
  });

/** Reads the whole file into an `ArrayBuffer`. */
const readArrayBuffer = async (file: File): Promise<ArrayBuffer> =>
  (await readWithFileReader(file, 'readAsArrayBuffer')) as ArrayBuffer;

/**
 * Extracts plain text from a `.txt`, `.docx` or `.pdf` file.
 *
 * The format is chosen from the file name's extension (case-insensitive).
 *
 * @param file - The file selected by the user.
 * @returns The extracted text.
 * @throws Error when the extension is not one of the supported formats, or
 *   when reading/parsing the file fails.
 */
export const parseDocument = async (file: File): Promise<string> => {
  const fileType = file.name.split('.').pop()?.toLowerCase();

  switch (fileType) {
    case 'txt':
      return await parseTxt(file);
    case 'docx':
      return await parseDocx(file);
    case 'pdf':
      return await parsePdf(file);
    default:
      throw new Error('不支持的文件格式。仅支持 .txt, .docx, .pdf');
  }
};

/** Reads a text file using `FileReader.readAsText` with its default encoding. */
const parseTxt = async (file: File): Promise<string> =>
  (await readWithFileReader(file, 'readAsText')) as string;

/**
 * Extracts the raw text of a `.docx` file with mammoth.
 *
 * mammoth is imported dynamically so that a missing module surfaces as a
 * rejected promise here instead of breaking the page at load time.
 *
 * @throws Error when mammoth cannot be loaded, or when reading/extraction fails.
 */
const parseDocx = async (file: File): Promise<string> => {
  // The library is loaded untyped, hence `any`.
  let mammoth: any;
  try {
    // @ts-ignore -- module typings may be unavailable in this build setup
    const mammothModule = await import('mammoth');
    // Depending on the bundler's CJS/ESM interop the API is either on the
    // namespace object itself or under `default`.
    mammoth =
      typeof mammothModule.extractRawText === 'function'
        ? mammothModule
        : mammothModule.default ?? mammothModule;
  } catch {
    throw new Error('无法加载文档解析组件(mammoth),请检查网络连接。');
  }

  const arrayBuffer = await readArrayBuffer(file);
  const result = await mammoth.extractRawText({ arrayBuffer });
  return result.value;
};

/**
 * Extracts the text of every page of a PDF with pdf.js.
 *
 * Text items of a page are joined with a single space and each page is
 * terminated with a newline.
 *
 * @throws Error when pdf.js cannot be loaded, or when reading/parsing fails.
 */
const parsePdf = async (file: File): Promise<string> => {
  // The library is loaded untyped, hence `any`.
  let pdfjsLib: any;
  try {
    // @ts-ignore -- module typings may be unavailable in this build setup
    const pdfjsModule = await import('pdfjs-dist');
    pdfjsLib = pdfjsModule.default || pdfjsModule;
    // Respect a worker that the application has already configured.
    if (!pdfjsLib.GlobalWorkerOptions.workerSrc) {
      pdfjsLib.GlobalWorkerOptions.workerSrc = PDF_WORKER_SRC;
    }
  } catch {
    throw new Error('无法加载 PDF 解析组件,请检查网络连接。');
  }

  const data = new Uint8Array(await readArrayBuffer(file));
  const loadingTask = pdfjsLib.getDocument({ data });
  try {
    const pdf = await loadingTask.promise;
    let fullText = '';
    // pdf.js page numbers are 1-based.
    for (let pageNumber = 1; pageNumber <= pdf.numPages; pageNumber++) {
      const page = await pdf.getPage(pageNumber);
      const textContent = await page.getTextContent();
      const pageText = textContent.items.map((item: any) => item.str).join(' ');
      fullText += pageText + '\n';
    }
    return fullText;
  } finally {
    // Release the document and its worker. Cleanup is best-effort and must
    // not mask the parse result or the original error.
    try {
      await loadingTask.destroy();
    } catch {
      /* ignore cleanup failure */
    }
  }
};