// Scraped file-viewer header (Spaces: Paused; file size: 4,614 bytes; revisions 34367da, f59c492) - not part of the module.
import { spawn } from 'child_process';
import path from 'path';
import { EventEmitter } from 'events';
import { fileURLToPath } from 'url';
// Handle both ESM and CJS contexts (import.meta.url is undefined in bundled CJS).
// When import.meta.url is available, __dirname is the directory containing this module.
// NOTE(review): the fallback is the process working directory, not this module's
// directory, so the 'python' folder is then looked up relative to wherever the
// process was started - confirm that is the intended layout for bundled builds.
const __dirname = typeof import.meta?.url !== 'undefined'
? path.dirname(fileURLToPath(import.meta.url))
: process.cwd();
// Absolute path of the 'python' directory under __dirname; scripts are resolved
// against it and it is also used as the child process working directory.
const PYTHON_DIR = path.resolve(__dirname, 'python');
/**
 * Outcome of one Python harvester script run.
 */
export interface HarvestResult {
  /** Whether the run is reported as successful. */
  success: boolean;

  /** Text the script wrote to standard output. */
  output: string;

  /** Failure description; only set on unsuccessful runs. */
  error?: string;

  /** Structured payload parsed from the script output, when one was found. */
  data?: any;
}
/**
 * Singleton adapter that runs the Python harvester scripts as child processes
 * and reports each run as a HarvestResult. Failures are reported through the
 * resolved result (success: false), never by rejecting the promise.
 */
export class PythonHarvesterAdapter extends EventEmitter {
  private static instance: PythonHarvesterAdapter;

  /** Interpreter executable; 'python' unless PYTHON_PATH is set in the environment. */
  private pythonPath: string = 'python'; // Or 'python3', configurable via env

  private constructor() {
    super();
    if (process.env.PYTHON_PATH) {
      this.pythonPath = process.env.PYTHON_PATH;
    }
  }

  /**
   * Return the shared adapter, creating it on first use.
   */
  public static getInstance(): PythonHarvesterAdapter {
    if (!PythonHarvesterAdapter.instance) {
      PythonHarvesterAdapter.instance = new PythonHarvesterAdapter();
    }
    return PythonHarvesterAdapter.instance;
  }

  /**
   * Run the M365 Harvester (SharePoint/OneDrive)
   */
  public async runM365Harvest(): Promise<HarvestResult> {
    return this.runScript('m365_harvester.py', []);
  }

  /**
   * Run the Scribd Harvester
   * @param downloadDocs Download PDFs (default: true); false passes `--no-download`
   * @param extractImages Extract images for PPT (default: true); false passes `--no-images`
   */
  public async runScribdHarvest(downloadDocs: boolean = true, extractImages: boolean = true): Promise<HarvestResult> {
    const args: string[] = [];
    if (!downloadDocs) args.push('--no-download');
    if (!extractImages) args.push('--no-images');
    return this.runScript('scribd_harvester_v2.py', args);
  }

  /**
   * Find the last JSON object at the end of a script's stdout.
   *
   * Trailing whitespace is ignored, because output written with a final newline
   * would otherwise never end in '}'. Candidate '{' positions are tried from
   * the end of the text backwards, so log lines that contain braces before the
   * payload do not prevent it from being parsed.
   *
   * @param stdout Complete standard output of the script
   * @returns The parsed value, or null when the output does not end with a JSON object
   */
  private static extractTrailingJson(stdout: string): any {
    const text = stdout.trimEnd();
    if (!text.endsWith('}')) {
      return null;
    }
    let start = text.lastIndexOf('{');
    while (start !== -1) {
      try {
        return JSON.parse(text.slice(start));
      } catch {
        // This '{' does not open a complete object (e.g. it is nested); widen the search.
      }
      // lastIndexOf clamps a negative fromIndex to 0, so stop explicitly at the start.
      start = start === 0 ? -1 : text.lastIndexOf('{', start - 1);
    }
    return null;
  }

  /**
   * Generic script runner with environment injection.
   *
   * Spawns the interpreter on a script inside PYTHON_DIR (also used as the
   * working directory) with PYTHONIOENCODING forced to utf-8, and collects
   * stdout/stderr. stderr chunks are echoed to console.error as they arrive.
   *
   * @param scriptName File name of the script, relative to PYTHON_DIR
   * @param args Command-line arguments passed after the script path
   * @returns A promise that always resolves: success is true only for exit code 0
   */
  private runScript(scriptName: string, args: string[]): Promise<HarvestResult> {
    return new Promise((resolve) => {
      const scriptPath = path.join(PYTHON_DIR, scriptName);
      console.log(`🐍 [PYTHON] Executing: ${scriptName}`, args);
      const startTime = Date.now();

      const child = spawn(this.pythonPath, [scriptPath, ...args], {
        cwd: PYTHON_DIR,
        env: {
          ...process.env,
          // Ensure UTF-8 for reliable parsing
          PYTHONIOENCODING: 'utf-8'
        }
      });

      let stdout = '';
      let stderr = '';

      // 'error' and 'close' can both fire for the same child; report only the first.
      let settled = false;
      const settle = (result: HarvestResult): void => {
        if (settled) return;
        settled = true;
        resolve(result);
      };

      // Decode in the stream so a multi-byte character split across two chunks
      // is not corrupted by decoding each chunk on its own.
      child.stdout.setEncoding('utf8');
      child.stderr.setEncoding('utf8');

      child.stdout.on('data', (chunk: string) => {
        stdout += chunk;
      });

      child.stderr.on('data', (chunk: string) => {
        stderr += chunk;
        console.error(`[${scriptName} ERR] ${chunk.trim()}`);
      });

      child.on('close', (code, signal) => {
        if (settled) return; // the failure was already reported by the 'error' handler
        const duration = Date.now() - startTime;
        console.log(`🐍 [PYTHON] ${scriptName} finished with code ${code} (${duration}ms)`);

        if (code === 0) {
          // Try to parse JSON output if possible, otherwise data stays null
          settle({
            success: true,
            output: stdout,
            data: PythonHarvesterAdapter.extractTrailingJson(stdout)
          });
          return;
        }

        // code is null when the process was ended by a signal; name the signal instead.
        const reason = signal
          ? `Process terminated by signal ${signal}`
          : `Process exited with code ${code}`;
        settle({
          success: false,
          output: stdout,
          error: stderr || reason
        });
      });

      child.on('error', (err) => {
        console.error(`🐍 [PYTHON] Failed to spawn: ${err.message}`);
        settle({
          success: false,
          output: stdout,
          error: err.message
        });
      });
    });
  }
}
/** Module-level adapter instance, obtained through PythonHarvesterAdapter.getInstance(). */
export const pythonHarvester = PythonHarvesterAdapter.getInstance();
|