// File size: 4,862 Bytes
// ce37a9c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
const fs = require('fs-extra');
const path = require('path');
const pdf = require('pdf-parse');
const mammoth = require('mammoth');
const officeParser = require('officeparser');
const XLSX = require('xlsx');
const Tesseract = require('tesseract.js');
const mime = require('mime-types');

/**
 * Extracts plain text from uploaded documents (PDF, DOCX, spreadsheets,
 * presentations, images, plain text) and persists one extracted-text
 * "context" file per chat JID under the session document store.
 */
class RagHandler {
    /**
     * Resolves the document store directory relative to this module and
     * creates it (synchronously) if it does not exist yet.
     */
    constructor() {
        this.storePath = path.join(__dirname, '../session/doc_store');
        fs.ensureDirSync(this.storePath);
    }

    /**
     * Cleans extracted text of excess whitespace.
     *
     * Line endings (CRLF and lone CR) are normalized to LF, tabs become
     * spaces, runs of spaces collapse to one, and any whitespace-only gap
     * between two newlines collapses to exactly one blank line.
     *
     * @param {string} text - Raw extracted text; falsy values yield "".
     * @returns {string} The normalized, trimmed text.
     */
    cleanText(text) {
        if (!text) return "";
        return text
            .replace(/\r\n?/g, '\n')     // CRLF and lone CR -> LF
            .replace(/\t/g, ' ')         // Tabs to spaces
            .replace(/ +/g, ' ')         // Multiple spaces to single space
            .replace(/\n\s*\n/g, '\n\n') // At most one blank line in a row
            .trim();
    }

    /**
     * Builds the path of the context file for a chat JID.
     * Only the digits of the JID are used as the file name.
     *
     * @param {string} jid - Chat identifier.
     * @returns {string} Absolute path of the `.txt` context file.
     */
    _contextFilePath(jid) {
        return path.join(this.storePath, `${jid.replace(/\D/g, '')}.txt`);
    }

    /**
     * Smart extraction router: picks an extractor based on the MIME type
     * and/or the file extension and returns the cleaned text.
     *
     * Never rejects: any failure is logged and reported as a bracketed
     * error string so callers can forward it as-is.
     *
     * @param {string} filePath - Path of the file to extract text from.
     * @param {string} [mimeType] - MIME type, if known. When missing, routing
     *   falls back to the file extension alone.
     * @returns {Promise<string>} Cleaned text, or an error placeholder string.
     */
    async extractText(filePath, mimeType) {
        try {
            // A missing/non-string MIME type must not crash the `.includes` /
            // `.startsWith` checks below; treat it as "unknown" instead.
            const type = typeof mimeType === 'string' ? mimeType : '';
            const ext = path.extname(filePath).toLowerCase();
            let extractedText = "";

            // 1. PDF
            if (type === 'application/pdf' || ext === '.pdf') {
                const dataBuffer = await fs.readFile(filePath);
                const data = await pdf(dataBuffer);
                extractedText = data.text || "";

                // Very little text usually means a scanned PDF. No OCR is run
                // here (it would require splitting pages, which is heavy); a
                // system note is appended instead.
                if (extractedText.trim().length < 50) {
                    extractedText += "\n[Catatan Sistem: Teks sangat sedikit. Dokumen ini mungkin berisi gambar scan yang sulit dibaca secara langsung.]";
                }
            }
            // 2. Word (DOCX)
            else if (type === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' || ext === '.docx') {
                const result = await mammoth.extractRawText({ path: filePath });
                extractedText = result.value;
            }
            // 3. Excel / CSV (XLSX, XLS, CSV)
            else if (
                type.includes('spreadsheet') ||
                type.includes('excel') ||
                ext === '.xlsx' || ext === '.xls' || ext === '.csv'
            ) {
                const workbook = XLSX.readFile(filePath);
                // One CSV section per non-empty sheet, each with a header line.
                extractedText = workbook.SheetNames
                    .map((name) => ({ name, csv: XLSX.utils.sheet_to_csv(workbook.Sheets[name]) }))
                    .filter(({ csv }) => csv && csv.trim().length > 0)
                    .map(({ name, csv }) => `--- Sheet: ${name} ---\n${csv}`)
                    .join('\n\n');
            }
            // 4. PowerPoint (PPTX, PPT)
            else if (type.includes('presentation') || type.includes('powerpoint') || ext === '.pptx' || ext === '.ppt') {
                extractedText = await new Promise((resolve) => {
                    officeParser.parseOffice(filePath, (data, err) => {
                        if (err) {
                            // Deliberately best-effort: log the failure and
                            // continue with empty text instead of rejecting.
                            console.error('PowerPoint extraction failed:', err);
                            resolve("");
                            return;
                        }
                        resolve(data);
                    });
                });
            }
            // 5. Images (OCR)
            else if (type.startsWith('image/') || ['.jpg', '.jpeg', '.png', '.bmp'].includes(ext)) {
                console.log(`Starting OCR for ${filePath}...`);
                const { data: { text } } = await Tesseract.recognize(filePath, 'ind', { // 'ind' = Indonesian
                    logger: () => {} // Silent logger
                });
                extractedText = text;
            }
            // 6. Plain text / code (.txt, .js, .py, .json, ...)
            else {
                extractedText = await fs.readFile(filePath, 'utf8');
            }

            return this.cleanText(extractedText);

        } catch (error) {
            console.error('Smart Extraction Error:', error);
            return `[Error: Gagal mengekstrak teks dari file ini. Format mungkin rusak atau tidak didukung.]`;
        }
    }

    /**
     * Stores the extracted text for a chat, overwriting any previous context.
     *
     * @param {string} jid - Chat identifier.
     * @param {string} text - Text to persist.
     * @returns {Promise<string>} Path of the written file.
     */
    async saveDocumentContext(jid, text) {
        const filePath = this._contextFilePath(jid);
        await fs.writeFile(filePath, text, 'utf8');
        return filePath;
    }

    /**
     * Loads the stored text for a chat.
     *
     * @param {string} jid - Chat identifier.
     * @returns {Promise<string|null>} Stored text, or null when no context
     *   file exists for this chat.
     */
    async getDocumentContext(jid) {
        const filePath = this._contextFilePath(jid);
        try {
            // Read directly instead of exists-then-read: avoids a sync call
            // and the race where the file disappears between the two steps.
            return await fs.readFile(filePath, 'utf8');
        } catch (error) {
            if (error && error.code === 'ENOENT') return null;
            throw error;
        }
    }
}

// Export one shared RagHandler instance; constructing it here creates the
// document store directory as soon as this module is first loaded.
module.exports = new RagHandler();