File size: 4,614 Bytes
34367da
 
 
 
 
f59c492
 
 
 
 
34367da
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
import { spawn } from 'child_process';
import path from 'path';
import { EventEmitter } from 'events';
import { fileURLToPath } from 'url';

// Resolve the directory used to locate the bundled Python scripts, for both ESM and CJS builds.
// When `import.meta.url` is defined it is converted to a filesystem path and its directory is used;
// otherwise (import.meta.url is undefined in bundled CJS) the process's current working directory is used.
// NOTE(review): the cwd fallback only finds the scripts if the process is started from the
// directory that contains `python/` — confirm against the deployment layout.
const __dirname = typeof import.meta?.url !== 'undefined'
    ? path.dirname(fileURLToPath(import.meta.url))
    : process.cwd();
// Absolute path of the `python` subdirectory; scripts are looked up here and spawned with it as cwd.
const PYTHON_DIR = path.resolve(__dirname, 'python');

/**
 * Outcome of one Python harvester script run.
 */
export interface HarvestResult {
    /** True when the script process exited with code 0; false on a non-zero exit or a spawn failure. */
    success: boolean;
    /** Everything the script wrote to stdout, as text. */
    output: string;
    /** Failure description (stderr text, or a fallback message about the exit/spawn failure). Only set when `success` is false. */
    error?: string;
    /** JSON value parsed from the end of stdout on a successful run; null when nothing parseable was found. */
    data?: any;
}

/**
 * Extract the JSON object that terminates a script's stdout, if there is one.
 *
 * Scripts typically print log lines followed by a final JSON object and a
 * trailing newline. Trailing whitespace is ignored, and every `{` is tried
 * from left to right so that braces inside earlier log lines do not prevent
 * the real trailing object from being found.
 *
 * @param stdout Complete stdout text of the script.
 * @returns The parsed object, or `null` when stdout does not end with a valid JSON object.
 */
function extractTrailingJson(stdout: string): unknown {
    const text = stdout.trimEnd();
    if (!text.endsWith('}')) {
        return null;
    }

    // The first suffix that parses is the complete trailing object: a suffix that
    // starts earlier contains extra log text, one that starts later is cut short.
    for (let start = text.indexOf('{'); start !== -1; start = text.indexOf('{', start + 1)) {
        try {
            return JSON.parse(text.slice(start));
        } catch {
            // This suffix is not valid JSON (e.g. it starts inside a log line); try the next '{'.
        }
    }
    return null;
}

/**
 * Singleton adapter that runs the bundled Python harvester scripts as child
 * processes and reports their outcome as a {@link HarvestResult}.
 *
 * The interpreter defaults to `python` and can be overridden with the
 * `PYTHON_PATH` environment variable. Run methods never reject: failures are
 * reported through `success: false` and `error`.
 */
export class PythonHarvesterAdapter extends EventEmitter {
    private static instance: PythonHarvesterAdapter | undefined;
    private pythonPath: string = 'python'; // Or 'python3', configurable via env

    private constructor() {
        super();
        // An unset or empty PYTHON_PATH keeps the default interpreter name.
        if (process.env.PYTHON_PATH) {
            this.pythonPath = process.env.PYTHON_PATH;
        }
    }

    /**
     * Return the process-wide adapter, creating it on first use.
     */
    public static getInstance(): PythonHarvesterAdapter {
        if (!PythonHarvesterAdapter.instance) {
            PythonHarvesterAdapter.instance = new PythonHarvesterAdapter();
        }
        return PythonHarvesterAdapter.instance;
    }

    /**
     * Run the M365 Harvester (SharePoint/OneDrive)
     */
    public async runM365Harvest(): Promise<HarvestResult> {
        return this.runScript('m365_harvester.py', []);
    }

    /**
     * Run the Scribd Harvester
     * @param downloadDocs Download PDFs (default: true)
     * @param extractImages Extract images for PPT (default: true)
     */
    public async runScribdHarvest(downloadDocs: boolean = true, extractImages: boolean = true): Promise<HarvestResult> {
        const args: string[] = [];
        if (!downloadDocs) args.push('--no-download');
        if (!extractImages) args.push('--no-images');

        return this.runScript('scribd_harvester_v2.py', args);
    }

    /**
     * Generic script runner with environment injection.
     *
     * Spawns the interpreter on `scriptName` (resolved inside the Python
     * directory) with the parent environment plus `PYTHONIOENCODING=utf-8`,
     * collects stdout/stderr, and resolves once the process has closed or
     * failed to start.
     *
     * @param scriptName File name of the script inside the Python directory.
     * @param args Command-line arguments passed to the script.
     * @returns A promise that always resolves (never rejects) with the run outcome.
     */
    private runScript(scriptName: string, args: string[]): Promise<HarvestResult> {
        return new Promise((resolve) => {
            const scriptPath = path.join(PYTHON_DIR, scriptName);
            console.log(`🐍 [PYTHON] Executing: ${scriptName}`, args);

            const startTime = Date.now();
            let stdout = '';
            let stderr = '';

            // A failed spawn emits 'error' and then 'close'; make sure only the
            // first outcome is reported.
            let settled = false;
            const settle = (result: HarvestResult): void => {
                if (settled) return;
                settled = true;
                resolve(result);
            };

            const child = spawn(this.pythonPath, [scriptPath, ...args], {
                cwd: PYTHON_DIR,
                env: {
                    ...process.env,
                    // Ensure UTF-8 for reliable parsing
                    PYTHONIOENCODING: 'utf-8'
                }
            });

            // Decode at the stream level so a multi-byte character split across
            // two chunks is reassembled instead of turning into replacement chars.
            child.stdout.setEncoding('utf8');
            child.stderr.setEncoding('utf8');

            child.stdout.on('data', (chunk: string) => {
                stdout += chunk;
            });

            child.stderr.on('data', (chunk: string) => {
                stderr += chunk;
                console.error(`[${scriptName} ERR] ${chunk.trim()}`);
            });

            child.on('error', (err) => {
                console.error(`🐍 [PYTHON] Failed to spawn: ${err.message}`);
                settle({
                    success: false,
                    output: stdout,
                    error: err.message
                });
            });

            child.on('close', (code, signal) => {
                if (settled) {
                    // The failure was already reported by the 'error' handler.
                    return;
                }

                const duration = Date.now() - startTime;
                console.log(`🐍 [PYTHON] ${scriptName} finished with code ${code} (${duration}ms)`);

                if (code === 0) {
                    settle({
                        success: true,
                        output: stdout,
                        // Structured result if the script ended with a JSON object, else null.
                        data: extractTrailingJson(stdout)
                    });
                    return;
                }

                // `code` is null when the process was killed by a signal.
                const reason = signal
                    ? `Process terminated by signal ${signal}`
                    : `Process exited with code ${code}`;
                settle({
                    success: false,
                    output: stdout,
                    error: stderr || reason
                });
            });
        });
    }
}

/** Shared adapter instance, obtained from `PythonHarvesterAdapter.getInstance()` when this module is loaded. */
export const pythonHarvester = PythonHarvesterAdapter.getInstance();