Spaces:
Paused
Paused
| import { dataIngestionEngine } from './DataIngestionEngine.js'; | |
| import { WebCrawlerAdapter } from './WebCrawlerAdapter.js'; | |
| import { neo4jAdapter } from '../../adapters/Neo4jAdapter.js'; | |
| // Targets for the "200% Data Growth" KPI | |
| const TARGETS = [ | |
| { url: 'https://tdc.dk/erhverv', category: 'TDC_Product', priority: 'high' }, | |
| { url: 'https://tdc.dk/om-tdc', category: 'TDC_Corporate', priority: 'medium' }, | |
| { url: 'https://www.mckinsey.com/capabilities/mckinsey-digital/our-insights', category: 'Strategy', priority: 'high' }, | |
| { url: 'https://www.cisa.gov/news-events/cybersecurity-advisories', category: 'Cybersecurity', priority: 'critical' }, | |
| { url: 'https://thehackernews.com/', category: 'Cybersecurity', priority: 'high' } | |
| ]; | |
| export class AutonomousHarvester { | |
| private isRunning = false; | |
| private stats = { | |
| ingested: 0, | |
| errors: 0, | |
| bytesProcessed: 0 | |
| }; | |
| async startHarvest() { | |
| if (this.isRunning) { | |
| console.log('πΎ Harvester already running.'); | |
| return; | |
| } | |
| this.isRunning = true; | |
| console.log('πΎ STARTING AUTONOMOUS HARVEST SEQUENCE...'); | |
| console.log(`π― Targets: ${TARGETS.length} high-value sources`); | |
| // Ensure crawler is registered | |
| const crawler = new WebCrawlerAdapter(); | |
| await dataIngestionEngine.registerAdapter(crawler, 'SystemCrawler', true); | |
| // Execute harvest loop | |
| for (const target of TARGETS) { | |
| try { | |
| console.log(`π Harvesting: ${target.url} [${target.category}]...`); | |
| // Use new adapter interface | |
| crawler.addUrls([target.url]); | |
| const rawData = await crawler.fetch(); | |
| const entities = await crawler.transform(rawData); | |
| // Enhance metadata | |
| const enhancedEntities = entities.map(e => ({ | |
| ...e, | |
| metadata: { | |
| ...e.metadata, | |
| category: target.category, | |
| priority: target.priority, | |
| harvestedBy: 'AutonomousHarvester v1' | |
| } | |
| })); | |
| // Ingest via engine | |
| // Note: We bypass the engine's internal fetch to inject our enhanced entities directly | |
| // by using the engine's persistence logic if exposed, or just relying on the engine to re-process. | |
| // Since dataIngestionEngine.ingestFrom(url) calls the adapter, we can just call that. | |
| // However, to ensure the CATEGORY is saved, we might need to update the node AFTER ingestion | |
| // using Neo4j directly since the engine is generic. | |
| // 1. Ingest | |
| const count = await dataIngestionEngine.ingestFrom(target.url); | |
| // 2. Tag in Graph | |
| if (count > 0) { | |
| await neo4jAdapter.executeQuery(` | |
| MATCH (n {id: $url}) | |
| SET n.category = $category, n.priority = $priority, n.harvest_timestamp = datetime() | |
| MERGE (c:Category {name: $category}) | |
| MERGE (n)-[:BELONGS_TO]->(c) | |
| `, { | |
| url: target.url, | |
| category: target.category, | |
| priority: target.priority | |
| }); | |
| } | |
| this.stats.ingested += count; | |
| this.stats.bytesProcessed += 1024 * count; // Estimate | |
| console.log(` β Harvested ${count} nodes from ${target.url}`); | |
| } catch (error: any) { | |
| console.error(` β Failed to harvest ${target.url}:`, error.message); | |
| this.stats.errors++; | |
| } | |
| } | |
| this.isRunning = false; | |
| console.log('πΎ HARVEST COMPLETE.'); | |
| console.log(JSON.stringify(this.stats, null, 2)); | |
| } | |
| getStats() { | |
| return this.stats; | |
| } | |
| } | |
// Module-level singleton: ES module caching means every importer shares this
// one instance (and therefore its isRunning guard and cumulative stats).
export const autonomousHarvester = new AutonomousHarvester();