import axios from 'axios';
import * as cheerio from 'cheerio';
import { DataSourceAdapter, IngestedEntity } from './DataIngestionEngine.js';
import { URL } from 'url';

/** Settings that control which links the crawler follows and which pages it keeps. */
interface CrawlerConfig {
  /**
   * 'single' never follows links; 'recursive' follows same-host links;
   * 'directory' additionally requires links to stay under the directory of
   * the page they were found on.
   */
  mode: 'single' | 'directory' | 'recursive';
  /** Links are only followed from pages whose depth is below this value (seeds are depth 0). */
  maxDepth: number;
  /** When non-empty, a page is kept only if its body text contains at least one keyword (case-insensitive). */
  keywords: string[];
}

/** A page downloaded by `fetch()` and later consumed by `transform()`. */
interface CrawledPage {
  url: string;
  html: string;
}

/** An entry of the breadth-first crawl queue. */
interface QueueItem {
  url: string;
  depth: number;
}

/**
 * Data source adapter that downloads web pages with axios, optionally follows
 * links found on them, and converts the downloaded HTML into ingestible entities.
 */
export class WebCrawlerAdapter implements DataSourceAdapter {
  name = 'Web Crawler';
  type: 'other' = 'other';

  /** URLs already dequeued by any `fetch()` call on this instance; they are never fetched again. */
  private visitedUrls: Set<string> = new Set();
  /** Seed URLs waiting for the next `fetch()` call. */
  private urlsToFetch: string[] = [];
  private config: CrawlerConfig = {
    mode: 'single',
    maxDepth: 1,
    keywords: []
  };

  constructor() {}

  /**
   * Merge the supplied settings into the current configuration.
   * Fields that are absent from `config` keep their current value.
   */
  configure(config: Partial<CrawlerConfig>): void {
    if (config.mode) this.config.mode = config.mode;
    if (config.maxDepth !== undefined) this.config.maxDepth = config.maxDepth;
    if (config.keywords) this.config.keywords = config.keywords;
  }

  /** Add URLs to fetch queue. Entries that do not start with 'http' are dropped. */
  addUrls(urls: string[]): void {
    const validUrls = urls.filter(u => u.startsWith('http'));
    this.urlsToFetch.push(...validUrls);
  }

  /**
   * Crawl every queued seed URL breadth-first and return the pages that were
   * downloaded successfully and passed the keyword filter.
   *
   * Failures for individual URLs are logged and do not abort the crawl.
   * Pages rejected by the keyword filter are not scanned for further links.
   *
   * @returns one `{ url, html }` record per accepted page, in crawl order.
   */
  async fetch(): Promise<CrawledPage[]> {
    const results: CrawledPage[] = [];
    const queue: QueueItem[] = this.urlsToFetch.map(url => ({ url, depth: 0 }));

    // Clear initial queue as we are processing it now
    this.urlsToFetch = [];

    while (queue.length > 0) {
      const item = queue.shift();
      if (!item) break;

      const { url, depth } = item;
      if (this.visitedUrls.has(url)) continue;
      this.visitedUrls.add(url);

      console.log(`🕷️ Crawling: ${url} (Depth: ${depth}/${this.config.maxDepth})`);

      try {
        const response = await axios.get<unknown>(url, {
          timeout: 10000,
          headers: { 'User-Agent': 'WidgeTDC-Search-Bot/1.0' }
        });

        const html = response.data;
        // axios may hand back a parsed object (e.g. for JSON responses); only
        // textual bodies can be parsed as HTML and stored for transform().
        if (typeof html !== 'string') {
          console.log(`⏭️ Skipping ${url} - response is not text/HTML`);
          continue;
        }

        const $ = cheerio.load(html);

        // Check keywords if configured
        if (!this.matchesKeywords($('body').text())) {
          console.log(`⏭️ Skipping ${url} - no matching keywords`);
          continue;
        }

        results.push({ url, html });

        // Find matching links if we haven't reached max depth
        if (depth < this.config.maxDepth && this.config.mode !== 'single') {
          const hrefs: string[] = $('a[href]')
            .map((_, el) => $(el).attr('href'))
            .get();

          for (const next of this.resolveFollowableLinks(hrefs, url)) {
            queue.push({ url: next, depth: depth + 1 });
          }
        }
      } catch (error: unknown) {
        const reason = error instanceof Error ? error.message : String(error);
        console.error(`Crawl failed for ${url}:`, reason);
      }
    }

    return results;
  }

  /**
   * Convert downloaded pages into entities: strips scripts, styles and common
   * page chrome, then extracts the title, meta description and main text.
   *
   * @param raw records produced by `fetch()` (each with `url` and `html`).
   */
  async transform(raw: any[]): Promise<IngestedEntity[]> {
    return raw.map((item: CrawledPage) => {
      const $ = cheerio.load(item.html);

      // Remove noise
      $('script, style, nav, footer, header, .cookie-banner, .ads').remove();

      // Fall back to the URL when the document has no usable <title>.
      const title = $('title').text().trim() || item.url;

      // Prefer the most specific content container that yields any text.
      let content = $('article').text() || $('main').text() || $('body').text();
      content = content.replace(/\s+/g, ' ').trim();

      const description = $('meta[name="description"]').attr('content') || '';

      return {
        id: item.url,
        type: 'web_page',
        source: 'web_crawler',
        title: title,
        content: `Title: ${title}\nDescription: ${description}\n\n${content}`,
        metadata: {
          url: item.url,
          description: description,
          crawledAt: new Date().toISOString(),
          keywordsMatched: this.config.keywords
        },
        timestamp: new Date()
      };
    });
  }

  /** Always reports the adapter as available. */
  async isAvailable(): Promise<boolean> {
    return true;
  }

  /** True when no keywords are configured, or when `text` contains at least one of them (case-insensitive). */
  private matchesKeywords(text: string): boolean {
    if (this.config.keywords.length === 0) return true;
    const haystack = text.toLowerCase();
    return this.config.keywords.some(kw => haystack.includes(kw.toLowerCase()));
  }

  /**
   * Resolve the raw `href` values found on `pageUrl` into the absolute URLs the
   * crawler is allowed to follow under the current mode.
   *
   * Fragments are removed so that `page#a` and `page#b` are recognised as the
   * same document, and only http(s) links on the same host are kept.
   */
  private resolveFollowableLinks(hrefs: string[], pageUrl: string): string[] {
    const base = new URL(pageUrl);
    // Directory of the page being crawled, e.g. '/docs/' for '/docs/index.html'.
    const basePath = base.pathname.substring(0, base.pathname.lastIndexOf('/') + 1);
    const accepted = new Set<string>();

    for (const href of hrefs) {
      let target: URL;
      try {
        target = new URL(href, pageUrl);
      } catch {
        // Invalid URL, ignore
        continue;
      }

      // Only HTTP(S) resources can be fetched by the crawler.
      if (target.protocol !== 'http:' && target.protocol !== 'https:') continue;

      // Domain constraint
      if (target.hostname !== base.hostname) continue;

      // Mode constraint: Directory
      if (this.config.mode === 'directory' && !target.pathname.startsWith(basePath)) continue;

      // The fragment never changes which document is served.
      target.hash = '';
      const absoluteUrl = target.toString();

      if (!this.visitedUrls.has(absoluteUrl)) {
        accepted.add(absoluteUrl);
      }
    }

    return [...accepted];
  }
}