File size: 3,108 Bytes
c071bb8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b0770b8
 
 
 
 
 
 
 
 
 
c071bb8
 
 
 
 
 
 
 
 
 
 
 
 
 
b0770b8
 
151e61b
b0770b8
 
 
 
151e61b
 
 
b0770b8
151e61b
b0770b8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c071bb8
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91

/**
 * Record describing a document together with its text content.
 *
 * NOTE(review): nothing in the visible part of this file constructs or
 * consumes this type; the field meanings below are inferred from their names
 * and should be confirmed against the callers.
 */
export interface ParsedDocument {
    /** Name of the source file (presumably `File.name` — TODO confirm). */
    fileName: string;
    /** Text content of the document (presumably the parser output — TODO confirm). */
    content: string;
    /** File type identifier (presumably an extension such as `txt` — TODO confirm). */
    fileType: string;
}

/**
 * Extracts the text content of a file, choosing the parser from the file
 * name's extension (compared case-insensitively).
 *
 * Supported extensions: `txt`, `docx`, `pdf`.
 *
 * @param file - The file to parse; only `file.name` is inspected here, the
 *   file itself is handed to the matching parser.
 * @returns A promise resolving to the text produced by the matching parser.
 * @throws Error when the file name has no extension or an unsupported one.
 */
export const parseDocument = async (file: File): Promise<string> => {
    // Only the text after the last dot counts as an extension. A bare name
    // such as "pdf" has no extension and must not be routed to the PDF parser.
    const dotIndex = file.name.lastIndexOf('.');
    const fileType = dotIndex === -1 ? '' : file.name.slice(dotIndex + 1).toLowerCase();

    switch (fileType) {
        case 'txt':
            return parseTxt(file);
        case 'docx':
            return parseDocx(file);
        case 'pdf':
            return parsePdf(file);
        default:
            throw new Error('不支持的文件格式。仅支持 .txt, .docx, .pdf');
    }
};

/**
 * Reads a file as text via `FileReader.readAsText`.
 *
 * @param file - The file to read.
 * @returns A promise resolving to the text held in the reader's `result`.
 *   The promise rejects with an `Error` (the reader's own `error` when one is
 *   set) if reading fails or no string result is produced.
 */
const parseTxt = (file: File): Promise<string> => {
    return new Promise<string>((resolve, reject) => {
        const reader = new FileReader();

        reader.onload = () => {
            const text = reader.result;
            if (typeof text === 'string') {
                resolve(text);
            } else {
                // Never hand a non-string to callers typed as Promise<string>.
                reject(new Error('读取文件失败。'));
            }
        };

        // Reject with a real Error rather than the raw progress event so that
        // callers can rely on `message` / `instanceof Error`.
        reader.onerror = () => reject(reader.error ?? new Error('读取文件失败。'));

        reader.readAsText(file);
    });
};

/**
 * Extracts the raw text of a `.docx` file using the `mammoth` library.
 *
 * `mammoth` is loaded with a dynamic import so that a missing or unreachable
 * module surfaces as a rejected promise instead of breaking page load.
 *
 * @param file - The `.docx` file to parse.
 * @returns A promise resolving to the `value` of mammoth's `extractRawText`
 *   result.
 * @throws Error when `mammoth` cannot be loaded or does not expose
 *   `extractRawText`; rejects with the underlying error when reading the file
 *   or extracting the text fails.
 */
const parseDocx = async (file: File): Promise<string> => {
    // Minimal view of the part of mammoth's API that is used here.
    type MammothApi = {
        extractRawText(input: { arrayBuffer: ArrayBuffer }): Promise<{ value: string }>;
    };

    // Dynamic import to prevent page crash if module is missing
    let mammoth: MammothApi;
    try {
        // @ts-ignore
        const mammothModule: any = await import('mammoth');
        // Accept the API either directly on the module namespace or on its
        // `default` export, whichever actually provides `extractRawText`.
        const candidate =
            typeof mammothModule?.extractRawText === 'function' ? mammothModule : mammothModule?.default;
        if (typeof candidate?.extractRawText !== 'function') {
            throw new Error('mammoth.extractRawText is unavailable');
        }
        mammoth = candidate;
    } catch (e) {
        throw new Error('无法加载文档解析组件(mammoth),请检查网络连接。');
    }

    // Only the callback-based FileReader is wrapped in a Promise; everything
    // after it is awaited normally so that any failure rejects the result.
    const arrayBuffer = await new Promise<ArrayBuffer>((resolve, reject) => {
        const reader = new FileReader();
        reader.onload = () => resolve(reader.result as ArrayBuffer);
        // Reject with a real Error rather than the raw progress event.
        reader.onerror = () => reject(reader.error ?? new Error('读取文件失败。'));
        reader.readAsArrayBuffer(file);
    });

    const result = await mammoth.extractRawText({ arrayBuffer });
    return result.value;
};

/**
 * Extracts the text of a PDF file using `pdfjs-dist`.
 *
 * The library is loaded with a dynamic import. If no worker script has been
 * configured yet, `GlobalWorkerOptions.workerSrc` is pointed at a pinned
 * worker build.
 *
 * The text items of each page are joined with single spaces and every page is
 * terminated by a newline, in page order.
 *
 * @param file - The PDF file to parse.
 * @returns A promise resolving to the concatenated text of all pages.
 * @throws Error when `pdfjs-dist` cannot be loaded; rejects with the
 *   underlying error when reading the file or parsing the PDF fails.
 */
const parsePdf = async (file: File): Promise<string> => {
    // Dynamic import
    let pdfjsLib: any;
    try {
        // @ts-ignore
        const pdfjsModule: any = await import('pdfjs-dist');
        pdfjsLib = pdfjsModule.default || pdfjsModule;
        if (!pdfjsLib.GlobalWorkerOptions.workerSrc) {
            // NOTE(review): the worker version is pinned here; confirm it
            // matches the pdfjs-dist version that the import resolves to.
            pdfjsLib.GlobalWorkerOptions.workerSrc = 'https://esm.sh/pdfjs-dist@3.11.174/build/pdf.worker.min.js';
        }
    } catch (e) {
        throw new Error('无法加载 PDF 解析组件,请检查网络连接。');
    }

    // Only the callback-based FileReader is wrapped in a Promise; the pdf.js
    // calls below are awaited normally so every failure rejects the result.
    const buffer = await new Promise<ArrayBuffer>((resolve, reject) => {
        const reader = new FileReader();
        reader.onload = () => resolve(reader.result as ArrayBuffer);
        // Reject with a real Error rather than the raw progress event.
        reader.onerror = () => reject(reader.error ?? new Error('读取文件失败。'));
        reader.readAsArrayBuffer(file);
    });

    const loadingTask = pdfjsLib.getDocument({ data: new Uint8Array(buffer) });
    try {
        const pdf = await loadingTask.promise;
        let fullText = '';

        // Pages are 1-indexed and processed sequentially to keep page order.
        for (let pageNumber = 1; pageNumber <= pdf.numPages; pageNumber++) {
            const page = await pdf.getPage(pageNumber);
            const textContent = await page.getTextContent();
            const pageText = textContent.items.map((item: any) => item.str).join(' ');
            fullText += pageText + '\n';
        }
        return fullText;
    } finally {
        // Release the document once the text is extracted (or parsing
        // failed). Cleanup is best-effort and must not mask the real outcome.
        try {
            await loadingTask.destroy();
        } catch {
            /* ignore cleanup failures */
        }
    }
};