// File: stud-manager/utils/documentParser.ts
// Source-page metadata (not code): dvc890's picture · Upload 67 files · b0770b8 verified
/**
 * Shape describing a document by name, textual content and type.
 *
 * NOTE(review): nothing in the visible part of this file constructs or
 * returns this interface (`parseDocument` resolves to a plain string), so the
 * field descriptions below are inferred from the field names — confirm
 * against the callers.
 */
export interface ParsedDocument {
/** Presumably the name of the uploaded file. */
fileName: string;
/** Presumably the text extracted from the file. */
content: string;
/** Presumably the file's extension/format identifier (e.g. "txt", "docx", "pdf"). */
fileType: string;
}
/**
 * Extracts the plain text of an uploaded file, choosing the parser from the
 * file name's extension (case-insensitive).
 *
 * Supported extensions: `txt`, `docx`, `pdf`.
 *
 * @param file - The file to parse; only `file.name` is inspected here, the
 *   file itself is handed to the format-specific parser.
 * @returns A promise resolving to the extracted text.
 * @throws Rejects with an `Error` when the extension is missing or not one of
 *   the supported formats; parser failures are propagated unchanged.
 */
export const parseDocument = async (file: File): Promise<string> => {
  // Only the text after the LAST dot is an extension. A name without any dot
  // (e.g. a file literally called "pdf") has no extension and must not be
  // dispatched to a parser just because the whole name matches a format.
  const dotIndex = file.name.lastIndexOf('.');
  const fileType = dotIndex === -1 ? '' : file.name.slice(dotIndex + 1).toLowerCase();

  switch (fileType) {
    case 'txt':
      return parseTxt(file);
    case 'docx':
      return parseDocx(file);
    case 'pdf':
      return parsePdf(file);
    default:
      throw new Error('不支持的文件格式。仅支持 .txt, .docx, .pdf');
  }
};
/**
 * Reads a file as text using `FileReader.readAsText`.
 *
 * No encoding is passed to `readAsText`, so the platform default decoding
 * applies.
 *
 * @param file - The file to read.
 * @returns A promise resolving to the file's text content.
 * @throws Rejects with the reader's own error object (`reader.error`) when the
 *   read fails, so callers get a value with a usable `message` instead of a
 *   bare progress event.
 */
const parseTxt = (file: File): Promise<string> => {
  return new Promise<string>((resolve, reject) => {
    const reader = new FileReader();

    reader.onload = () => {
      const { result } = reader;
      // readAsText should always yield a string; guard instead of casting so
      // an unexpected result surfaces as an error rather than as `undefined`.
      if (typeof result === 'string') {
        resolve(result);
      } else {
        reject(new Error('读取文件失败:未获得文本内容。'));
      }
    };

    // Reject with the underlying error, not the event that announced it.
    reader.onerror = () => reject(reader.error ?? new Error('读取文件失败。'));

    reader.readAsText(file);
  });
};
/**
 * Extracts the raw text of a .docx file using mammoth's `extractRawText`.
 *
 * mammoth is loaded with a dynamic import so that a missing or unreachable
 * module produces a readable error instead of breaking the page at load time.
 *
 * @param file - The .docx file to parse.
 * @returns A promise resolving to the `value` of mammoth's extraction result.
 * @throws Rejects with an `Error` carrying a user-facing message when mammoth
 *   cannot be loaded; rejects with the reader's error when the file cannot be
 *   read; propagates mammoth's own errors when extraction fails.
 */
const parseDocx = async (file: File): Promise<string> => {
  // Dynamic import to prevent page crash if module is missing
  let mammoth: any;
  try {
    // @ts-ignore -- the module may have no resolvable typings in this setup
    const mammothModule: any = await import('mammoth');
    // Depending on the bundler/CDN, the API lives on the namespace itself or
    // on its `default` export (same interop handling as the PDF parser).
    mammoth =
      typeof mammothModule.extractRawText === 'function' ? mammothModule : mammothModule.default;
    if (typeof mammoth?.extractRawText !== 'function') {
      throw new Error('mammoth.extractRawText is unavailable');
    }
  } catch {
    throw new Error('无法加载文档解析组件(mammoth),请检查网络连接。');
  }

  const arrayBuffer = await new Promise<ArrayBuffer>((resolve, reject) => {
    const reader = new FileReader();
    reader.onload = () => {
      const { result } = reader;
      if (result instanceof ArrayBuffer) {
        resolve(result);
      } else {
        reject(new Error('读取文件失败:未获得二进制内容。'));
      }
    };
    // Reject with the underlying error, not the event that announced it.
    reader.onerror = () => reject(reader.error ?? new Error('读取文件失败。'));
    reader.readAsArrayBuffer(file);
  });

  const result = await mammoth.extractRawText({ arrayBuffer });
  return result.value;
};
/**
 * Extracts the text of a PDF file using pdf.js (`pdfjs-dist`).
 *
 * The text items of each page are joined with single spaces and every page is
 * terminated with a newline, in page order.
 *
 * @param file - The PDF file to parse.
 * @returns A promise resolving to the concatenated text of all pages.
 * @throws Rejects with an `Error` carrying a user-facing message when pdf.js
 *   cannot be loaded; rejects with the reader's error when the file cannot be
 *   read; propagates pdf.js errors raised while opening or reading the document.
 */
const parsePdf = async (file: File): Promise<string> => {
  // Dynamic import
  let pdfjsLib: any;
  try {
    // @ts-ignore -- the module may have no resolvable typings in this setup
    const pdfjsModule: any = await import('pdfjs-dist');
    pdfjsLib = pdfjsModule.default || pdfjsModule;
    // Only set a worker location when the host page has not configured one.
    // NOTE(review): the URL pins worker version 3.11.174; pdf.js expects the
    // worker to match the loaded library version — confirm they stay in sync.
    if (!pdfjsLib.GlobalWorkerOptions.workerSrc) {
      pdfjsLib.GlobalWorkerOptions.workerSrc = 'https://esm.sh/pdfjs-dist@3.11.174/build/pdf.worker.min.js';
    }
  } catch {
    throw new Error('无法加载 PDF 解析组件,请检查网络连接。');
  }

  const buffer = await new Promise<ArrayBuffer>((resolve, reject) => {
    const reader = new FileReader();
    reader.onload = () => {
      const { result } = reader;
      if (result instanceof ArrayBuffer) {
        resolve(result);
      } else {
        reject(new Error('读取文件失败:未获得二进制内容。'));
      }
    };
    // Reject with the underlying error, not the event that announced it.
    reader.onerror = () => reject(reader.error ?? new Error('读取文件失败。'));
    reader.readAsArrayBuffer(file);
  });

  const loadingTask = pdfjsLib.getDocument(new Uint8Array(buffer));
  try {
    const pdf = await loadingTask.promise;
    let fullText = '';
    // pdf.js pages are 1-indexed; pages are read sequentially to keep order.
    for (let pageNumber = 1; pageNumber <= pdf.numPages; pageNumber++) {
      const page = await pdf.getPage(pageNumber);
      const textContent = await page.getTextContent();
      const pageText = textContent.items.map((item: any) => item.str).join(' ');
      fullText += pageText + '\n';
    }
    return fullText;
  } finally {
    // Release the document and its worker whether parsing succeeded or not;
    // without this every parsed file leaks them.
    try {
      await loadingTask.destroy?.();
    } catch {
      // Best-effort cleanup: a teardown failure must not mask the result/error.
    }
  }
};