Spaces:
Paused
Paused
| import axios from 'axios'; | |
| import * as cheerio from 'cheerio'; | |
| import { DataSourceAdapter, IngestedEntity } from './DataIngestionEngine.js'; | |
| import { URL } from 'url'; | |
/** Settings controlling how the crawler follows links and filters pages. */
interface CrawlerConfig {
  /** 'single' fetches seed URLs only; 'directory' and 'recursive' also follow same-host links. */
  mode: 'single' | 'directory' | 'recursive';
  /** Maximum link-following depth; seed URLs are crawled at depth 0. */
  maxDepth: number;
  /** If non-empty, a page is kept only when its body text contains at least one keyword (case-insensitive). */
  keywords: string[];
}
| export class WebCrawlerAdapter implements DataSourceAdapter { | |
| name = 'Web Crawler'; | |
| type: 'other' = 'other'; | |
| private visitedUrls: Set<string> = new Set(); | |
| private urlsToFetch: string[] = []; | |
| private config: CrawlerConfig = { | |
| mode: 'single', | |
| maxDepth: 1, | |
| keywords: [] | |
| }; | |
| constructor() { } | |
| configure(config: Partial<CrawlerConfig>) { | |
| if (config.mode) this.config.mode = config.mode; | |
| if (config.maxDepth !== undefined) this.config.maxDepth = config.maxDepth; | |
| if (config.keywords) this.config.keywords = config.keywords; | |
| } | |
| /** Add URLs to fetch queue */ | |
| addUrls(urls: string[]): void { | |
| const validUrls = urls.filter(u => u.startsWith('http')); | |
| this.urlsToFetch.push(...validUrls); | |
| } | |
| async fetch(): Promise<any[]> { | |
| const results: any[] = []; | |
| const queue = this.urlsToFetch.map(url => ({ url, depth: 0 })); | |
| // Clear initial queue as we are processing it now | |
| this.urlsToFetch = []; | |
| while (queue.length > 0) { | |
| const item = queue.shift(); | |
| if (!item) break; | |
| const { url, depth } = item; | |
| if (this.visitedUrls.has(url)) continue; | |
| this.visitedUrls.add(url); | |
| console.log(`🕷️ Crawling: ${url} (Depth: ${depth}/${this.config.maxDepth})`); | |
| try { | |
| const response = await axios.get(url, { | |
| timeout: 10000, | |
| headers: { 'User-Agent': 'WidgeTDC-Search-Bot/1.0' } | |
| }); | |
| const html = response.data; | |
| const $ = cheerio.load(html); | |
| const textContent = $('body').text().toLowerCase(); | |
| // Check keywords if configured | |
| if (this.config.keywords.length > 0) { | |
| const hasKeyword = this.config.keywords.some(kw => textContent.includes(kw.toLowerCase())); | |
| if (!hasKeyword) { | |
| console.log(`⏭️ Skipping ${url} - no matching keywords`); | |
| continue; | |
| } | |
| } | |
| results.push({ url, html }); | |
| // Find matching links if we haven't reached max depth | |
| if (depth < this.config.maxDepth && this.config.mode !== 'single') { | |
| const baseUrl = new URL(url); | |
| const links = $('a[href]').map((_, el) => $(el).attr('href')).get(); | |
| for (const link of links) { | |
| try { | |
| const absoluteUrl = new URL(link, url).toString(); | |
| // Domain constraint | |
| if (new URL(absoluteUrl).hostname !== baseUrl.hostname) continue; | |
| // Mode constraint: Directory | |
| if (this.config.mode === 'directory') { | |
| const basePath = baseUrl.pathname.substring(0, baseUrl.pathname.lastIndexOf('/') + 1); | |
| const linkPath = new URL(absoluteUrl).pathname; | |
| if (!linkPath.startsWith(basePath)) continue; | |
| } | |
| if (!this.visitedUrls.has(absoluteUrl)) { | |
| queue.push({ url: absoluteUrl, depth: depth + 1 }); | |
| } | |
| } catch (e) { | |
| // Invalid URL, ignore | |
| } | |
| } | |
| } | |
| } catch (error: any) { | |
| console.error(`Crawl failed for ${url}:`, error.message); | |
| } | |
| } | |
| return results; | |
| } | |
| async transform(raw: any[]): Promise<IngestedEntity[]> { | |
| return raw.map(item => { | |
| const $ = cheerio.load(item.html); | |
| // Remove noise | |
| $('script, style, nav, footer, header, .cookie-banner, .ads').remove(); | |
| const title = $('title').text().trim() || item.url; | |
| let content = $('article').text() || $('main').text() || $('body').text(); | |
| content = content.replace(/\s+/g, ' ').trim(); | |
| const description = $('meta[name="description"]').attr('content') || ''; | |
| return { | |
| id: item.url, | |
| type: 'web_page', | |
| source: 'web_crawler', | |
| title: title, | |
| content: `Title: ${title}\nDescription: ${description}\n\n${content}`, | |
| metadata: { | |
| url: item.url, | |
| description: description, | |
| crawledAt: new Date().toISOString(), | |
| keywordsMatched: this.config.keywords | |
| }, | |
| timestamp: new Date() | |
| }; | |
| }); | |
| } | |
| async isAvailable(): Promise<boolean> { | |
| return true; | |
| } | |
| } | |