// tdc-api/apps/backend/src/services/ingestion/AutonomousHarvester.ts
import { dataIngestionEngine } from './DataIngestionEngine.js';
import { WebCrawlerAdapter } from './WebCrawlerAdapter.js';
import { neo4jAdapter } from '../../adapters/Neo4jAdapter.js';
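// Note: the `.js` extensions on relative imports suggest this project compiles
// TypeScript to native ESM (NodeNext-style resolution), where compiled import
// specifiers must keep their extension; this is inferred, not confirmed here.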
// Targets for the "200% Data Growth" KPI
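// A minimal shape for the target records below; the name `HarvestTarget` and
// the priority union are inferred from the literals, not from a shared type.
type HarvestTarget = {
url: string;
category: string;
priority: 'critical' | 'high' | 'medium';
};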
const TARGETS: HarvestTarget[] = [
{ url: 'https://tdc.dk/erhverv', category: 'TDC_Product', priority: 'high' },
{ url: 'https://tdc.dk/om-tdc', category: 'TDC_Corporate', priority: 'medium' },
{ url: 'https://www.mckinsey.com/capabilities/mckinsey-digital/our-insights', category: 'Strategy', priority: 'high' },
{ url: 'https://www.cisa.gov/news-events/cybersecurity-advisories', category: 'Cybersecurity', priority: 'critical' },
{ url: 'https://thehackernews.com/', category: 'Cybersecurity', priority: 'high' }
];
export class AutonomousHarvester {
private isRunning = false;
private stats = {
ingested: 0,
errors: 0,
bytesProcessed: 0
};
async startHarvest() {
if (this.isRunning) {
console.log('🌾 Harvester already running.');
return;
}
this.isRunning = true;
console.log('🌾 STARTING AUTONOMOUS HARVEST SEQUENCE...');
console.log(`🎯 Targets: ${TARGETS.length} high-value sources`);
// Ensure the crawler is registered. The try/finally guarantees the
// isRunning latch is released even if registration or ingestion throws.
try {
const crawler = new WebCrawlerAdapter();
await dataIngestionEngine.registerAdapter(crawler, 'SystemCrawler', true);
// Execute harvest loop
for (const target of TARGETS) {
try {
console.log(`🚜 Harvesting: ${target.url} [${target.category}]...`);
// Queue the URL so the engine's registered adapter will fetch it.
crawler.addUrls([target.url]);
// The engine drives fetch/transform through the adapter itself and is
// generic, so it exposes no hook for per-target metadata. Rather than
// fetching here as well and discarding the result, we let the engine own
// the pipeline and apply the category/priority tags directly in Neo4j
// after ingestion (step 2 below).
// 1. Ingest
const count = await dataIngestionEngine.ingestFrom(target.url);
// 2. Tag in Graph
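// Assumption: the engine persists each source as a node whose `id` is the
// URL it was ingested from; if it keys nodes differently, the MATCH below
// matches nothing and the tags are silently skipped.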
if (count > 0) {
await neo4jAdapter.executeQuery(`
MATCH (n {id: $url})
SET n.category = $category, n.priority = $priority,
    n.harvested_by = 'AutonomousHarvester v1', n.harvest_timestamp = datetime()
MERGE (c:Category {name: $category})
MERGE (n)-[:BELONGS_TO]->(c)
`, {
url: target.url,
category: target.category,
priority: target.priority
});
}
this.stats.ingested += count;
this.stats.bytesProcessed += 1024 * count; // Rough estimate (~1 KiB per node); actual payload sizes are not reported here.
console.log(` ✅ Harvested ${count} nodes from ${target.url}`);
} catch (error: any) {
console.error(` ❌ Failed to harvest ${target.url}:`, error.message);
this.stats.errors++;
}
}
} finally {
this.isRunning = false;
}
console.log('🌾 HARVEST COMPLETE.');
console.log(JSON.stringify(this.stats, null, 2));
}
getStats() {
return this.stats;
}
}
export const autonomousHarvester = new AutonomousHarvester();
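// Usage sketch (illustrative: only the exported singleton and its methods
// come from this file; how the harvest is triggered is up to the caller,
// e.g. a cron job or an admin route):
//
//   import { autonomousHarvester } from './AutonomousHarvester.js';
//
//   await autonomousHarvester.startHarvest();
//   console.log(autonomousHarvester.getStats());
//
// Reading the tags back with Cypher that mirrors the tagging query above:
//
//   MATCH (n)-[:BELONGS_TO]->(c:Category {name: 'Cybersecurity'})
//   RETURN n.id, n.priority ORDER BY n.harvest_timestamp DESC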