import { spawn } from 'child_process';
import path from 'path';
import { EventEmitter } from 'events';
import { fileURLToPath } from 'url';

// Handle both ESM and CJS contexts (import.meta.url is undefined in bundled CJS).
// The constant is deliberately not called `__dirname`: in a CommonJS module that
// identifier is already bound by the module wrapper and cannot be redeclared.
const moduleDir =
  typeof import.meta?.url !== 'undefined'
    ? path.dirname(fileURLToPath(import.meta.url))
    : process.cwd();

/** Directory that holds the Python harvester scripts; also used as their working directory. */
const PYTHON_DIR = path.resolve(moduleDir, 'python');

/** Outcome of a single Python script run. */
export interface HarvestResult {
  /** `true` only when the process exited with code 0. */
  success: boolean;
  /** Everything the script wrote to stdout. */
  output: string;
  /** stderr text, or a description of how the process ended, when the run failed. */
  error?: string;
  /** Parsed trailing JSON object from stdout, or `null` when none could be parsed. */
  // eslint-disable-next-line @typescript-eslint/no-explicit-any -- kept as `any` so existing callers keep compiling
  data?: any;
}

/**
 * Extract the JSON object that terminates a script's stdout.
 *
 * Trailing whitespace (e.g. the newline appended by Python's `print`) is
 * ignored. Each `{` is tried in order as the start of the object, so log lines
 * printed before the final JSON payload do not prevent it from being parsed.
 *
 * @param stdout Complete stdout text of the script.
 * @returns The parsed object, or `null` when stdout does not end with a valid JSON object.
 */
function extractTrailingJson(stdout: string): unknown {
  const text = stdout.trimEnd();
  if (!text.endsWith('}')) {
    return null;
  }

  for (let start = text.indexOf('{'); start !== -1; start = text.indexOf('{', start + 1)) {
    try {
      return JSON.parse(text.slice(start));
    } catch {
      // Not valid JSON from this brace onwards; try the next candidate.
    }
  }
  return null;
}

/**
 * Singleton adapter that runs the Python harvester scripts located in
 * `PYTHON_DIR` as child processes and reports their outcome as a
 * {@link HarvestResult}.
 *
 * The interpreter defaults to `python` and can be overridden with the
 * `PYTHON_PATH` environment variable (read once, at construction time).
 */
export class PythonHarvesterAdapter extends EventEmitter {
  private static instance?: PythonHarvesterAdapter;

  /** Interpreter executable: `PYTHON_PATH` if set and non-empty, otherwise `python`. */
  private readonly pythonPath: string;

  private constructor() {
    super();
    this.pythonPath = process.env.PYTHON_PATH || 'python';
  }

  /** Return the shared adapter instance, creating it on first use. */
  public static getInstance(): PythonHarvesterAdapter {
    if (!PythonHarvesterAdapter.instance) {
      PythonHarvesterAdapter.instance = new PythonHarvesterAdapter();
    }
    return PythonHarvesterAdapter.instance;
  }

  /**
   * Run the M365 Harvester (SharePoint/OneDrive)
   */
  public async runM365Harvest(): Promise<HarvestResult> {
    return this.runScript('m365_harvester.py', []);
  }

  /**
   * Run the Scribd Harvester
   * @param downloadDocs Download PDFs (default: true); `false` passes `--no-download`
   * @param extractImages Extract images for PPT (default: true); `false` passes `--no-images`
   */
  public async runScribdHarvest(
    downloadDocs: boolean = true,
    extractImages: boolean = true,
  ): Promise<HarvestResult> {
    const args: string[] = [];
    if (!downloadDocs) args.push('--no-download');
    if (!extractImages) args.push('--no-images');
    return this.runScript('scribd_harvester_v2.py', args);
  }

  /**
   * Generic script runner with environment injection.
   *
   * The returned promise never rejects: spawn failures and non-zero exits are
   * reported as `{ success: false, ... }`.
   *
   * @param scriptName File name of the script, resolved relative to `PYTHON_DIR`.
   * @param args Command-line arguments passed to the script.
   * @returns The captured stdout, plus parsed trailing JSON on success or an error description on failure.
   */
  private runScript(scriptName: string, args: string[]): Promise<HarvestResult> {
    return new Promise<HarvestResult>((resolve) => {
      const scriptPath = path.join(PYTHON_DIR, scriptName);
      console.log(`🐍 [PYTHON] Executing: ${scriptName}`, args);
      const startTime = Date.now();

      const child = spawn(this.pythonPath, [scriptPath, ...args], {
        cwd: PYTHON_DIR,
        env: {
          ...process.env,
          // Ensure UTF-8 for reliable parsing
          PYTHONIOENCODING: 'utf-8',
        },
      });

      // Decode at the stream level so a multi-byte character split across two
      // chunks is not corrupted by decoding each chunk separately.
      child.stdout.setEncoding('utf8');
      child.stderr.setEncoding('utf8');

      let stdout = '';
      let stderr = '';

      // 'close' may still be emitted after a spawn 'error'; only the first
      // outcome is reported.
      let settled = false;
      const settle = (result: HarvestResult): void => {
        if (settled) return;
        settled = true;
        resolve(result);
      };

      child.stdout.on('data', (chunk: string) => {
        stdout += chunk;
      });

      child.stderr.on('data', (chunk: string) => {
        stderr += chunk;
        console.error(`[${scriptName} ERR] ${chunk.trim()}`);
      });

      child.on('close', (code, signal) => {
        if (settled) return;

        const duration = Date.now() - startTime;
        console.log(`🐍 [PYTHON] ${scriptName} finished with code ${code} (${duration}ms)`);

        if (code === 0) {
          // Parse the trailing JSON object if there is one, otherwise data stays null.
          settle({ success: true, output: stdout, data: extractTrailingJson(stdout) });
          return;
        }

        // `code` is null when the process was terminated by a signal.
        const fallback = signal
          ? `Process terminated by signal ${signal}`
          : `Process exited with code ${code}`;
        settle({ success: false, output: stdout, error: stderr || fallback });
      });

      child.on('error', (err) => {
        console.error(`🐍 [PYTHON] Failed to spawn: ${err.message}`);
        settle({ success: false, output: stdout, error: err.message });
      });
    });
  }
}

export const pythonHarvester = PythonHarvesterAdapter.getInstance();