// tdc-api/apps/backend/src/services/ingestion/AutonomousHarvester.ts
import { dataIngestionEngine } from './DataIngestionEngine.js';
import { WebCrawlerAdapter } from './WebCrawlerAdapter.js';
import { neo4jAdapter } from '../../adapters/Neo4jAdapter.js';
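// Note: the `.js` extensions on relative imports suggest this project compiles
// TypeScript to native ESM (NodeNext-style resolution), where compiled import
// specifiers must keep their extension; this is inferred, not confirmed here.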
// Targets for the "200% Data Growth" KPI
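// A minimal shape for the target records below; the name `HarvestTarget` and
// the priority union are inferred from the literals, not from a shared type.
type HarvestTarget = {
url: string;
category: string;
priority: 'critical' | 'high' | 'medium';
};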
const TARGETS: HarvestTarget[] = [
{ url: 'https://tdc.dk/erhverv', category: 'TDC_Product', priority: 'high' },
{ url: 'https://tdc.dk/om-tdc', category: 'TDC_Corporate', priority: 'medium' },
{ url: 'https://www.mckinsey.com/capabilities/mckinsey-digital/our-insights', category: 'Strategy', priority: 'high' },
{ url: 'https://www.cisa.gov/news-events/cybersecurity-advisories', category: 'Cybersecurity', priority: 'critical' },
{ url: 'https://thehackernews.com/', category: 'Cybersecurity', priority: 'high' }
];
export class AutonomousHarvester {
private isRunning = false;
private stats = {
ingested: 0,
errors: 0,
bytesProcessed: 0
};
async startHarvest() {
if (this.isRunning) {
console.log('🌾 Harvester already running.');
return;
}
this.isRunning = true;
console.log('🌾 STARTING AUTONOMOUS HARVEST SEQUENCE...');
console.log(`🎯 Targets: ${TARGETS.length} high-value sources`);
// Ensure the crawler is registered. The try/finally guarantees the
// isRunning latch is released even if registration or ingestion throws.
try {
const crawler = new WebCrawlerAdapter();
await dataIngestionEngine.registerAdapter(crawler, 'SystemCrawler', true);
// Execute harvest loop
for (const target of TARGETS) {
try {
console.log(`🚜 Harvesting: ${target.url} [${target.category}]...`);
// Queue the URL so the engine's registered adapter will fetch it.
crawler.addUrls([target.url]);
// The engine drives fetch/transform through the adapter itself and is
// generic, so it exposes no hook for per-target metadata. Rather than
// fetching here as well and discarding the result, we let the engine own
// the pipeline and apply the category/priority tags directly in Neo4j
// after ingestion (step 2 below).
// 1. Ingest
const count = await dataIngestionEngine.ingestFrom(target.url);
// 2. Tag in Graph
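// Assumption: the engine persists each source as a node whose `id` is the
// URL it was ingested from; if it keys nodes differently, the MATCH below
// matches nothing and the tags are silently skipped.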
if (count > 0) {
await neo4jAdapter.executeQuery(`
MATCH (n {id: $url})
SET n.category = $category, n.priority = $priority,
    n.harvested_by = 'AutonomousHarvester v1', n.harvest_timestamp = datetime()
MERGE (c:Category {name: $category})
MERGE (n)-[:BELONGS_TO]->(c)
`, {
url: target.url,
category: target.category,
priority: target.priority
});
}
this.stats.ingested += count;
this.stats.bytesProcessed += 1024 * count; // Rough estimate (~1 KiB per node); actual payload sizes are not reported here.
console.log(` ✅ Harvested ${count} nodes from ${target.url}`);
} catch (error: any) {
console.error(` ❌ Failed to harvest ${target.url}:`, error.message);
this.stats.errors++;
}
}
} finally {
this.isRunning = false;
}
console.log('🌾 HARVEST COMPLETE.');
console.log(JSON.stringify(this.stats, null, 2));
}
getStats() {
return this.stats;
}
}
export const autonomousHarvester = new AutonomousHarvester();
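// Usage sketch (illustrative: only the exported singleton and its methods
// come from this file; how the harvest is triggered is up to the caller,
// e.g. a cron job or an admin route):
//
//   import { autonomousHarvester } from './AutonomousHarvester.js';
//
//   await autonomousHarvester.startHarvest();
//   console.log(autonomousHarvester.getStats());
//
// Reading the tags back with Cypher that mirrors the tagging query above:
//
//   MATCH (n)-[:BELONGS_TO]->(c:Category {name: 'Cybersecurity'})
//   RETURN n.id, n.priority ORDER BY n.harvest_timestamp DESC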