Spaces:
Paused
Paused
import { spawn } from 'child_process';
import path from 'path';
import { EventEmitter } from 'events';
import { fileURLToPath } from 'url';
// Resolve the directory this module lives in for both ESM and CJS contexts.
// In a bundled CJS build `import.meta.url` is undefined, so fall back to the
// process working directory in that case.
const __dirname = typeof import.meta?.url !== 'undefined'
  ? path.dirname(fileURLToPath(import.meta.url))
  : process.cwd();

// Absolute path of the `python` folder resolved against `__dirname`; used as
// both the script location and the working directory when scripts are spawned.
const PYTHON_DIR = path.resolve(__dirname, 'python');
/**
 * Outcome of a single harvester script run.
 */
export interface HarvestResult {
  /** `true` when the script process exited with code 0. */
  success: boolean;
  /** Everything the script wrote to standard output. */
  output: string;
  /** Failure description (stderr text, exit status, or spawn error); set only on failure. */
  error?: string;
  /** JSON payload parsed from the script's stdout on success; `null` when none could be parsed. */
  data?: any;
}
/**
 * Extracts the JSON object that terminates a script's stdout, if there is one.
 *
 * Surrounding whitespace is ignored, so the trailing newline emitted by a final
 * `print()` does not prevent a match. Each `{` is tried from left to right and
 * the first suffix that parses as JSON is returned, which skips over earlier
 * log lines that merely contain braces.
 *
 * @param stdout Raw captured standard output.
 * @returns The parsed value, or `null` when stdout does not end in valid JSON.
 */
function extractTrailingJson(stdout: string): unknown {
  const text = stdout.trim();
  if (!text.endsWith('}')) {
    return null;
  }
  for (let start = text.indexOf('{'); start !== -1; start = text.indexOf('{', start + 1)) {
    try {
      return JSON.parse(text.slice(start));
    } catch {
      // Not a complete JSON document from this brace; try the next candidate.
    }
  }
  return null;
}

/**
 * Singleton adapter that runs the Python harvester scripts as child processes
 * and reports their outcome as a {@link HarvestResult}.
 *
 * The interpreter defaults to `python` and can be overridden with the
 * `PYTHON_PATH` environment variable.
 */
export class PythonHarvesterAdapter extends EventEmitter {
  private static instance: PythonHarvesterAdapter | undefined;

  /** Interpreter executable passed to `spawn`. */
  private readonly pythonPath: string;

  private constructor() {
    super();
    // An unset or empty PYTHON_PATH keeps the default interpreter name.
    this.pythonPath = process.env.PYTHON_PATH || 'python';
  }

  /**
   * Returns the shared adapter, creating it on first use.
   */
  public static getInstance(): PythonHarvesterAdapter {
    if (!PythonHarvesterAdapter.instance) {
      PythonHarvesterAdapter.instance = new PythonHarvesterAdapter();
    }
    return PythonHarvesterAdapter.instance;
  }

  /**
   * Run the M365 Harvester (SharePoint/OneDrive).
   *
   * @returns The result of running `m365_harvester.py` with no arguments.
   */
  public async runM365Harvest(): Promise<HarvestResult> {
    return this.runScript('m365_harvester.py', []);
  }

  /**
   * Run the Scribd Harvester.
   *
   * @param downloadDocs Download PDFs (default: true); `false` passes `--no-download`.
   * @param extractImages Extract images for PPT (default: true); `false` passes `--no-images`.
   * @returns The result of running `scribd_harvester_v2.py`.
   */
  public async runScribdHarvest(downloadDocs: boolean = true, extractImages: boolean = true): Promise<HarvestResult> {
    const args: string[] = [];
    if (!downloadDocs) args.push('--no-download');
    if (!extractImages) args.push('--no-images');
    return this.runScript('scribd_harvester_v2.py', args);
  }

  /**
   * Generic script runner with environment injection.
   *
   * Spawns the interpreter on a script inside `PYTHON_DIR`, captures stdout and
   * stderr, and resolves once the process closes or fails to start. The
   * returned promise never rejects; failures are reported through
   * `success: false` and `error`.
   *
   * @param scriptName File name of the script, relative to `PYTHON_DIR`.
   * @param args Extra command-line arguments appended after the script path.
   */
  private runScript(scriptName: string, args: string[]): Promise<HarvestResult> {
    return new Promise((resolve) => {
      const scriptPath = path.join(PYTHON_DIR, scriptName);
      console.log(`🐍 [PYTHON] Executing: ${scriptName}`, args);
      const startTime = Date.now();

      const child = spawn(this.pythonPath, [scriptPath, ...args], {
        cwd: PYTHON_DIR,
        env: {
          ...process.env,
          // Ensure UTF-8 for reliable parsing
          PYTHONIOENCODING: 'utf-8'
        }
      });

      let stdout = '';
      let stderr = '';

      // 'error' and 'close' can both fire for one child; only the first
      // outcome is reported.
      let settled = false;
      const settle = (result: HarvestResult): void => {
        if (settled) return;
        settled = true;
        resolve(result);
      };

      // Decode at the stream level so a multi-byte UTF-8 character split
      // across two chunks is not corrupted.
      child.stdout.setEncoding('utf8');
      child.stderr.setEncoding('utf8');

      child.stdout.on('data', (chunk: string) => {
        stdout += chunk;
      });

      child.stderr.on('data', (chunk: string) => {
        stderr += chunk;
        console.error(`[${scriptName} ERR] ${chunk.trim()}`);
      });

      child.on('error', (err) => {
        console.error(`🐍 [PYTHON] Failed to spawn: ${err.message}`);
        settle({
          success: false,
          output: stdout,
          error: err.message
        });
      });

      child.on('close', (code, signal) => {
        // A failure was already reported through 'error'; nothing more to do.
        if (settled) return;

        const duration = Date.now() - startTime;
        console.log(`🐍 [PYTHON] ${scriptName} finished with code ${code} (${duration}ms)`);

        if (code === 0) {
          settle({
            success: true,
            output: stdout,
            // Parsed JSON payload when stdout ends with one, otherwise null.
            data: extractTrailingJson(stdout)
          });
          return;
        }

        // `code` is null when the process was ended by a signal.
        const exitDescription = signal
          ? `Process terminated by signal ${signal}`
          : `Process exited with code ${code}`;
        settle({
          success: false,
          output: stdout,
          error: stderr || exitDescription
        });
      });
    });
  }
}
/** Shared adapter instance, obtained from `PythonHarvesterAdapter.getInstance()`. */
export const pythonHarvester = PythonHarvesterAdapter.getInstance();