Spaces:
Running
Running
/**
 * Record describing a document after parsing.
 *
 * NOTE(review): none of the parser functions visible alongside this interface
 * return it (they resolve to plain strings); presumably callers assemble it
 * from the file metadata and the extracted text — confirm against usage.
 */
export interface ParsedDocument {
  /** File name of the document. */
  fileName: string;
  /** File type of the document (presumably the extension, e.g. "pdf" — confirm). */
  fileType: string;
  /** Content of the document as a string. */
  content: string;
}
/**
 * Extracts the text of a file, choosing the parser from the file name's
 * extension (compared case-insensitively).
 *
 * @param file - The file to parse; only its `name` is inspected here.
 * @returns The text produced by the matching parser.
 * @throws Error when the extension is not `txt`, `docx` or `pdf`. Failures
 *   raised by the selected parser propagate unchanged.
 */
export const parseDocument = async (file: File): Promise<string> => {
  // Only the text after the LAST dot counts as an extension. A name without
  // any dot (e.g. a file literally called "pdf") has no extension and must not
  // be routed to a parser.
  const dotIndex = file.name.lastIndexOf('.');
  const extension = dotIndex >= 0 ? file.name.slice(dotIndex + 1).toLowerCase() : '';

  switch (extension) {
    case 'txt':
      return parseTxt(file);
    case 'docx':
      return parseDocx(file);
    case 'pdf':
      return parsePdf(file);
    default:
      throw new Error('不支持的文件格式。仅支持 .txt, .docx, .pdf');
  }
};
/**
 * Reads a file as text with `FileReader.readAsText`.
 *
 * FileReader is kept (instead of switching to another read API) so the text
 * decoding behavior stays exactly as before.
 *
 * @param file - The file to read.
 * @returns A promise resolving to the file's text.
 * @throws Rejects with an `Error` (the reader's own error when available)
 *   if the read fails, so callers can rely on `.message`.
 */
const parseTxt = (file: File): Promise<string> => {
  return new Promise<string>((resolve, reject) => {
    const reader = new FileReader();

    reader.onload = () => {
      const text = reader.result;
      if (typeof text === 'string') {
        resolve(text);
      } else {
        // readAsText should always yield a string; guard instead of asserting.
        reject(new Error('读取文件失败。'));
      }
    };

    // Reject with a real error object rather than the raw progress event.
    reader.onerror = () => reject(reader.error ?? new Error('读取文件失败。'));

    // Handlers are attached before the read starts.
    reader.readAsText(file);
  });
};
/**
 * Extracts the raw text of a .docx file using the `mammoth` library.
 *
 * The library is loaded with a dynamic import so that a missing or unreachable
 * module surfaces as a rejected promise instead of breaking module load.
 *
 * @param file - The .docx file to read.
 * @returns A promise resolving to the `value` of mammoth's `extractRawText` result.
 * @throws Error with a user-facing message when mammoth cannot be loaded;
 *   file-read and extraction failures propagate as rejections.
 */
const parseDocx = async (file: File): Promise<string> => {
  // Minimal local description of the one mammoth API used here.
  type RawTextExtractor = {
    extractRawText(input: { arrayBuffer: ArrayBuffer }): Promise<{ value: string }>;
  };

  let mammoth: RawTextExtractor | undefined;
  try {
    // @ts-ignore -- type declarations for 'mammoth' may be unavailable in this setup
    const loaded: any = await import('mammoth');
    // Depending on the bundler/CDN, the API sits on the namespace or on `default`.
    mammoth = typeof loaded?.extractRawText === 'function' ? loaded : loaded?.default;
  } catch {
    throw new Error('无法加载文档解析组件(mammoth),请检查网络连接。');
  }
  if (typeof mammoth?.extractRawText !== 'function') {
    // The module loaded but does not expose the expected API: treat as a load failure.
    throw new Error('无法加载文档解析组件(mammoth),请检查网络连接。');
  }

  // Blob.arrayBuffer() rejects with a proper error object on read failure and
  // removes the need to wrap FileReader callbacks in a Promise constructor.
  const arrayBuffer = await file.arrayBuffer();
  const result = await mammoth.extractRawText({ arrayBuffer });
  return result.value;
};
/**
 * Extracts the text of a PDF file using pdf.js (`pdfjs-dist`).
 *
 * The library is loaded with a dynamic import. If no worker script has been
 * configured yet, a CDN-hosted worker is set on `GlobalWorkerOptions`.
 * Text items of each page are joined with a single space and every page is
 * terminated by a newline.
 *
 * @param file - The PDF file to read.
 * @returns A promise resolving to the concatenated text of all pages.
 * @throws Error with a user-facing message when pdf.js cannot be loaded;
 *   file-read and PDF parsing failures propagate as rejections.
 */
const parsePdf = async (file: File): Promise<string> => {
  let pdfjsLib: any;
  try {
    // @ts-ignore -- type declarations for 'pdfjs-dist' may be unavailable in this setup
    const pdfjsModule = await import('pdfjs-dist');
    pdfjsLib = pdfjsModule.default || pdfjsModule;
    if (!pdfjsLib.GlobalWorkerOptions.workerSrc) {
      // NOTE(review): the worker version is pinned here; it presumably has to
      // match the loaded pdfjs-dist version — confirm against the dependency setup.
      pdfjsLib.GlobalWorkerOptions.workerSrc = 'https://esm.sh/pdfjs-dist@3.11.174/build/pdf.worker.min.js';
    }
  } catch {
    throw new Error('无法加载 PDF 解析组件,请检查网络连接。');
  }

  // Blob.arrayBuffer() rejects with a proper error object on read failure and
  // avoids running an async handler inside a Promise constructor.
  const bytes = new Uint8Array(await file.arrayBuffer());
  const loadingTask = pdfjsLib.getDocument(bytes);

  try {
    const pdf = await loadingTask.promise;
    const pageTexts: string[] = [];

    // pdf.js page numbers are 1-based.
    for (let pageNumber = 1; pageNumber <= pdf.numPages; pageNumber++) {
      const page = await pdf.getPage(pageNumber);
      const textContent = await page.getTextContent();
      const pageText = textContent.items
        .map((item: { str?: string }) => item.str)
        .join(' ');
      pageTexts.push(pageText);
    }

    return pageTexts.map((pageText) => `${pageText}\n`).join('');
  } finally {
    // Release the document and its worker resources whether parsing succeeded
    // or failed. Cleanup errors are ignored so they cannot mask the real outcome.
    try {
      await loadingTask.destroy();
    } catch {
      /* best-effort cleanup */
    }
  }
};