// Deploy from GitHub Actions 2025-12-15_13-51-53 (commit a84b07b, verified)
// NOTE(review): the three lines above the imports were GitHub web-UI scrape
// residue that broke compilation; preserved here as a provenance comment.
import axios from 'axios';
import * as cheerio from 'cheerio';
import { DataSourceAdapter, IngestedEntity } from './DataIngestionEngine.js';
import { URL } from 'url';
/**
 * Options controlling how the crawler walks outward from its seed URLs.
 */
interface CrawlerConfig {
  /**
   * 'single' fetches only the seed URLs; 'directory' follows links under the
   * seed page's directory path; 'recursive' follows any same-host link.
   */
  mode: 'single' | 'directory' | 'recursive';
  /** Maximum link depth to follow from a seed (seeds themselves are depth 0). */
  maxDepth: number;
  /** If non-empty, pages whose body text contains none of these terms are skipped. */
  keywords: string[];
}
/**
 * Data-source adapter that crawls web pages over HTTP(S), optionally following
 * same-host links up to a configured depth, and transforms the fetched HTML
 * into {@link IngestedEntity} records.
 *
 * State is cumulative across calls: URLs remembered in `visitedUrls` are never
 * re-fetched by a later `fetch()`.
 */
export class WebCrawlerAdapter implements DataSourceAdapter {
  name = 'Web Crawler';
  type: 'other' = 'other';

  /** URLs already crawled (fragment-stripped); prevents re-fetching across fetch() calls. */
  private visitedUrls: Set<string> = new Set();
  /** Seed URLs queued via addUrls(), consumed (and cleared) by the next fetch(). */
  private urlsToFetch: string[] = [];
  private config: CrawlerConfig = {
    mode: 'single',
    maxDepth: 1,
    keywords: []
  };

  constructor() { }

  /** Merge partial crawl settings into the current configuration. */
  configure(config: Partial<CrawlerConfig>): void {
    if (config.mode) this.config.mode = config.mode;
    if (config.maxDepth !== undefined) this.config.maxDepth = config.maxDepth;
    if (config.keywords) this.config.keywords = config.keywords;
  }

  /**
   * Add seed URLs to the fetch queue.
   *
   * Only well-formed http/https URLs are accepted (the previous prefix check
   * admitted schemes like `httpfoo:` and malformed strings that later threw
   * inside fetch()). Fragments are stripped so `page#a` and `page#b` dedupe
   * to a single crawl target.
   */
  addUrls(urls: string[]): void {
    for (const raw of urls) {
      try {
        const parsed = new URL(raw);
        if (parsed.protocol === 'http:' || parsed.protocol === 'https:') {
          parsed.hash = '';
          this.urlsToFetch.push(parsed.toString());
        }
      } catch {
        // Malformed URL — skip, matching the original's silent filtering.
      }
    }
  }

  /** Strip the fragment from a URL so "#section" variants dedupe to one page. */
  private normalizeUrl(rawUrl: string): string {
    const parsed = new URL(rawUrl);
    parsed.hash = '';
    return parsed.toString();
  }

  /**
   * Crawl all queued seed URLs breadth-first, returning raw `{ url, html }`
   * pairs for every page that matched the keyword filter (if any).
   *
   * Link-following honours the configured mode, stays on the seed's hostname,
   * and stops at `maxDepth`. Individual page failures are logged and skipped.
   */
  async fetch(): Promise<any[]> {
    const results: any[] = [];
    const queue = this.urlsToFetch.map(url => ({ url, depth: 0 }));
    // The seed queue is consumed by this crawl; reset for future addUrls() calls.
    this.urlsToFetch = [];

    while (queue.length > 0) {
      const item = queue.shift();
      if (!item) break;
      const { url, depth } = item;

      if (this.visitedUrls.has(url)) continue;
      this.visitedUrls.add(url);

      console.log(`🕷️ Crawling: ${url} (Depth: ${depth}/${this.config.maxDepth})`);

      try {
        const response = await axios.get(url, {
          timeout: 10000,
          headers: { 'User-Agent': 'WidgeTDC-Search-Bot/1.0' }
        });
        const html = response.data;
        const $ = cheerio.load(html);
        const textContent = $('body').text().toLowerCase();

        // Keyword filter: skip pages that match none of the configured terms.
        if (this.config.keywords.length > 0) {
          const hasKeyword = this.config.keywords.some(kw => textContent.includes(kw.toLowerCase()));
          if (!hasKeyword) {
            console.log(`⏭️ Skipping ${url} - no matching keywords`);
            continue;
          }
        }

        results.push({ url, html });

        // Enqueue same-host links if we haven't reached max depth.
        if (depth < this.config.maxDepth && this.config.mode !== 'single') {
          const baseUrl = new URL(url);
          const links = $('a[href]').map((_, el) => $(el).attr('href')).get();

          for (const link of links) {
            try {
              const childUrl = new URL(link, url);
              // Normalize: drop "#fragment" so the same page is not crawled
              // once per anchor (previous behaviour caused duplicate fetches).
              childUrl.hash = '';
              const absoluteUrl = childUrl.toString();

              // Domain constraint: never leave the current host.
              if (childUrl.hostname !== baseUrl.hostname) continue;

              // Directory mode: stay under the current page's directory path.
              if (this.config.mode === 'directory') {
                const basePath = baseUrl.pathname.substring(0, baseUrl.pathname.lastIndexOf('/') + 1);
                if (!childUrl.pathname.startsWith(basePath)) continue;
              }

              if (!this.visitedUrls.has(absoluteUrl)) {
                queue.push({ url: absoluteUrl, depth: depth + 1 });
              }
            } catch (e) {
              // Invalid URL, ignore
            }
          }
        }
      } catch (error: unknown) {
        // Narrow before use — `any` in catch defeats strict checking.
        const message = error instanceof Error ? error.message : String(error);
        console.error(`Crawl failed for ${url}:`, message);
      }
    }

    return results;
  }

  /**
   * Convert raw `{ url, html }` crawl results into IngestedEntity records:
   * strips boilerplate elements, prefers article/main text, and collapses
   * whitespace.
   */
  async transform(raw: any[]): Promise<IngestedEntity[]> {
    return raw.map(item => {
      const $ = cheerio.load(item.html);

      // Remove navigation/script noise before extracting readable text.
      $('script, style, nav, footer, header, .cookie-banner, .ads').remove();

      const title = $('title').text().trim() || item.url;
      // Prefer semantic containers; fall back to the whole body.
      let content = $('article').text() || $('main').text() || $('body').text();
      content = content.replace(/\s+/g, ' ').trim();
      const description = $('meta[name="description"]').attr('content') || '';

      return {
        id: item.url,
        type: 'web_page',
        source: 'web_crawler',
        title: title,
        content: `Title: ${title}\nDescription: ${description}\n\n${content}`,
        metadata: {
          url: item.url,
          description: description,
          crawledAt: new Date().toISOString(),
          keywordsMatched: this.config.keywords
        },
        timestamp: new Date()
      };
    });
  }

  /** The crawler has no external prerequisites; always available. */
  async isAvailable(): Promise<boolean> {
    return true;
  }
}