// widgettdc-api/apps/backend/src/services/harvester/PythonHarvesterAdapter.ts
// Last commit by Kraft102: "Fix import.meta.url CJS compatibility" (f59c492, verified)
import { spawn } from 'child_process';
import path from 'path';
import { EventEmitter } from 'events';
import { fileURLToPath } from 'url';
// Base directory used to locate the Python harvester scripts.
// Handle both ESM and CJS contexts (import.meta.url is undefined in bundled CJS):
// when import.meta.url is available, use the directory containing this module;
// otherwise fall back to the process working directory.
// NOTE(review): in the fallback case scripts are looked up under `<cwd>/python`,
// which depends on where the process was started — confirm this matches the deployment layout.
const __dirname = typeof import.meta?.url !== 'undefined'
? path.dirname(fileURLToPath(import.meta.url))
: process.cwd();
// Absolute path of the `python` subdirectory; scripts are resolved against it and run with it as cwd.
const PYTHON_DIR = path.resolve(__dirname, 'python');
/**
 * Outcome of a single harvester script run.
 */
export interface HarvestResult {
/** True when the script process exited with code 0. */
success: boolean;
/** Everything the script wrote to stdout (possibly partial if the run failed). */
output: string;
/** Set on failure: the script's stderr, an exit description, or the spawn error message. */
error?: string;
/** Set on success: JSON parsed from the script's stdout, or null when none could be parsed. */
data?: any;
}
/**
 * Singleton adapter that runs the Python harvester scripts found in PYTHON_DIR
 * as child processes and reports their outcome as a HarvestResult.
 *
 * The interpreter defaults to `python` and can be overridden with the
 * PYTHON_PATH environment variable (read once, when the instance is created).
 */
export class PythonHarvesterAdapter extends EventEmitter {
  private static instance: PythonHarvesterAdapter;
  private pythonPath: string = 'python'; // Or 'python3', configurable via env

  private constructor() {
    super();
    if (process.env.PYTHON_PATH) {
      this.pythonPath = process.env.PYTHON_PATH;
    }
  }

  /**
   * Return the shared adapter, creating it on first use.
   */
  public static getInstance(): PythonHarvesterAdapter {
    if (!PythonHarvesterAdapter.instance) {
      PythonHarvesterAdapter.instance = new PythonHarvesterAdapter();
    }
    return PythonHarvesterAdapter.instance;
  }

  /**
   * Run the M365 Harvester (SharePoint/OneDrive)
   */
  public async runM365Harvest(): Promise<HarvestResult> {
    return this.runScript('m365_harvester.py', []);
  }

  /**
   * Run the Scribd Harvester
   * @param downloadDocs Download PDFs (default: true)
   * @param extractImages Extract images for PPT (default: true)
   */
  public async runScribdHarvest(downloadDocs: boolean = true, extractImages: boolean = true): Promise<HarvestResult> {
    const args: string[] = [];
    if (!downloadDocs) args.push('--no-download');
    if (!extractImages) args.push('--no-images');
    return this.runScript('scribd_harvester_v2.py', args);
  }

  /**
   * Find the JSON object that ends `text`, ignoring trailing whitespace.
   *
   * Candidates are tried from each `{` in turn, so log lines that contain
   * braces before the final payload do not prevent it from being parsed.
   *
   * @param text Raw text, typically the full stdout of a script.
   * @returns The parsed object, or null when the text does not end with a parsable JSON object.
   */
  private static extractTrailingJson(text: string): unknown {
    // print() in Python appends a newline; strip it so the payload's closing brace is last.
    const trimmed = text.trimEnd();
    if (!trimmed.endsWith('}')) {
      return null;
    }
    for (let start = trimmed.indexOf('{'); start !== -1; start = trimmed.indexOf('{', start + 1)) {
      try {
        return JSON.parse(trimmed.slice(start));
      } catch {
        // Not valid JSON from this brace; try the next one.
      }
    }
    return null;
  }

  /**
   * Generic script runner with environment injection.
   *
   * Spawns the interpreter on `scriptName` inside PYTHON_DIR, with the current
   * environment plus PYTHONIOENCODING=utf-8, and collects stdout and stderr.
   *
   * @param scriptName File name of the script, resolved against PYTHON_DIR.
   * @param args Extra command-line arguments passed to the script.
   * @returns A promise that resolves (it is never rejected by this code) with
   *   success=true and the parsed trailing JSON object (or null) when the exit
   *   code is 0, otherwise with success=false and an error description.
   */
  private runScript(scriptName: string, args: string[]): Promise<HarvestResult> {
    return new Promise((resolve) => {
      const scriptPath = path.join(PYTHON_DIR, scriptName);
      console.log(`🐍 [PYTHON] Executing: ${scriptName}`, args);
      const startTime = Date.now();

      const child = spawn(this.pythonPath, [scriptPath, ...args], {
        cwd: PYTHON_DIR,
        env: {
          ...process.env,
          // Ensure UTF-8 for reliable parsing
          PYTHONIOENCODING: 'utf-8'
        }
      });

      let stdout = '';
      let stderr = '';

      // Decode at the stream level so a multi-byte UTF-8 character split across
      // two chunks is reassembled instead of being corrupted.
      child.stdout.setEncoding('utf8');
      child.stderr.setEncoding('utf8');

      child.stdout.on('data', (chunk: string) => {
        stdout += chunk;
        // Emit real-time log events if needed
        // console.log(`[${scriptName}] ${chunk.trim()}`);
      });

      child.stderr.on('data', (chunk: string) => {
        stderr += chunk;
        console.error(`[${scriptName} ERR] ${chunk.trim()}`);
      });

      child.on('close', (code, signal) => {
        const duration = Date.now() - startTime;
        console.log(`🐍 [PYTHON] ${scriptName} finished with code ${code} (${duration}ms)`);

        if (code === 0) {
          // Parse the JSON object at the end of stdout if there is one, otherwise data stays null.
          const data = PythonHarvesterAdapter.extractTrailingJson(stdout);
          resolve({
            success: true,
            output: stdout,
            data
          });
          return;
        }

        // A null exit code means the process was ended by a signal; report that instead of "code null".
        const exitDescription =
          code === null && signal
            ? `Process terminated by signal ${signal}`
            : `Process exited with code ${code}`;
        resolve({
          success: false,
          output: stdout,
          error: stderr || exitDescription
        });
      });

      // 'close' may still fire after 'error'; the promise keeps whichever result was resolved first.
      child.on('error', (err) => {
        console.error(`🐍 [PYTHON] Failed to spawn: ${err.message}`);
        resolve({
          success: false,
          output: stdout,
          error: err.message
        });
      });
    });
  }
}
/** Shared adapter instance, obtained via getInstance() when this module is loaded. */
export const pythonHarvester = PythonHarvesterAdapter.getInstance();