// Deploy from GitHub Actions 2025-12-15_13-51-53 (commit a84b07b, verified)
// NOTE(review): the three lines above the imports were GitHub web-UI scrape
// residue that broke compilation; preserved here as a provenance comment.
import axios from 'axios';
import * as cheerio from 'cheerio';
import { DataSourceAdapter, IngestedEntity } from './DataIngestionEngine.js';
import { URL } from 'url';
/**
 * Options controlling how the crawler walks outward from its seed URLs.
 */
interface CrawlerConfig {
  /**
   * 'single' fetches only the seed URLs; 'directory' follows links under the
   * seed page's directory path; 'recursive' follows any same-host link.
   */
  mode: 'single' | 'directory' | 'recursive';
  /** Maximum link depth to follow from a seed (seeds themselves are depth 0). */
  maxDepth: number;
  /** If non-empty, pages whose body text contains none of these terms are skipped. */
  keywords: string[];
}
/**
 * Data-source adapter that crawls web pages over HTTP(S), optionally following
 * same-host links up to a configured depth, and transforms the fetched HTML
 * into {@link IngestedEntity} records.
 *
 * State is cumulative across calls: URLs remembered in `visitedUrls` are never
 * re-fetched by a later `fetch()`.
 */
export class WebCrawlerAdapter implements DataSourceAdapter {
  name = 'Web Crawler';
  type: 'other' = 'other';

  /** URLs already crawled (fragment-stripped); prevents re-fetching across fetch() calls. */
  private visitedUrls: Set<string> = new Set();
  /** Seed URLs queued via addUrls(), consumed (and cleared) by the next fetch(). */
  private urlsToFetch: string[] = [];
  private config: CrawlerConfig = {
    mode: 'single',
    maxDepth: 1,
    keywords: []
  };

  constructor() { }

  /** Merge partial crawl settings into the current configuration. */
  configure(config: Partial<CrawlerConfig>): void {
    if (config.mode) this.config.mode = config.mode;
    if (config.maxDepth !== undefined) this.config.maxDepth = config.maxDepth;
    if (config.keywords) this.config.keywords = config.keywords;
  }

  /**
   * Add seed URLs to the fetch queue.
   *
   * Only well-formed http/https URLs are accepted (the previous prefix check
   * admitted schemes like `httpfoo:` and malformed strings that later threw
   * inside fetch()). Fragments are stripped so `page#a` and `page#b` dedupe
   * to a single crawl target.
   */
  addUrls(urls: string[]): void {
    for (const raw of urls) {
      try {
        const parsed = new URL(raw);
        if (parsed.protocol === 'http:' || parsed.protocol === 'https:') {
          parsed.hash = '';
          this.urlsToFetch.push(parsed.toString());
        }
      } catch {
        // Malformed URL — skip, matching the original's silent filtering.
      }
    }
  }

  /** Strip the fragment from a URL so "#section" variants dedupe to one page. */
  private normalizeUrl(rawUrl: string): string {
    const parsed = new URL(rawUrl);
    parsed.hash = '';
    return parsed.toString();
  }

  /**
   * Crawl all queued seed URLs breadth-first, returning raw `{ url, html }`
   * pairs for every page that matched the keyword filter (if any).
   *
   * Link-following honours the configured mode, stays on the seed's hostname,
   * and stops at `maxDepth`. Individual page failures are logged and skipped.
   */
  async fetch(): Promise<any[]> {
    const results: any[] = [];
    const queue = this.urlsToFetch.map(url => ({ url, depth: 0 }));
    // The seed queue is consumed by this crawl; reset for future addUrls() calls.
    this.urlsToFetch = [];

    while (queue.length > 0) {
      const item = queue.shift();
      if (!item) break;
      const { url, depth } = item;

      if (this.visitedUrls.has(url)) continue;
      this.visitedUrls.add(url);

      console.log(`🕷️ Crawling: ${url} (Depth: ${depth}/${this.config.maxDepth})`);

      try {
        const response = await axios.get(url, {
          timeout: 10000,
          headers: { 'User-Agent': 'WidgeTDC-Search-Bot/1.0' }
        });
        const html = response.data;
        const $ = cheerio.load(html);
        const textContent = $('body').text().toLowerCase();

        // Keyword filter: skip pages that match none of the configured terms.
        if (this.config.keywords.length > 0) {
          const hasKeyword = this.config.keywords.some(kw => textContent.includes(kw.toLowerCase()));
          if (!hasKeyword) {
            console.log(`⏭️ Skipping ${url} - no matching keywords`);
            continue;
          }
        }

        results.push({ url, html });

        // Enqueue same-host links if we haven't reached max depth.
        if (depth < this.config.maxDepth && this.config.mode !== 'single') {
          const baseUrl = new URL(url);
          const links = $('a[href]').map((_, el) => $(el).attr('href')).get();

          for (const link of links) {
            try {
              const childUrl = new URL(link, url);
              // Normalize: drop "#fragment" so the same page is not crawled
              // once per anchor (previous behaviour caused duplicate fetches).
              childUrl.hash = '';
              const absoluteUrl = childUrl.toString();

              // Domain constraint: never leave the current host.
              if (childUrl.hostname !== baseUrl.hostname) continue;

              // Directory mode: stay under the current page's directory path.
              if (this.config.mode === 'directory') {
                const basePath = baseUrl.pathname.substring(0, baseUrl.pathname.lastIndexOf('/') + 1);
                if (!childUrl.pathname.startsWith(basePath)) continue;
              }

              if (!this.visitedUrls.has(absoluteUrl)) {
                queue.push({ url: absoluteUrl, depth: depth + 1 });
              }
            } catch (e) {
              // Invalid URL, ignore
            }
          }
        }
      } catch (error: unknown) {
        // Narrow before use — `any` in catch defeats strict checking.
        const message = error instanceof Error ? error.message : String(error);
        console.error(`Crawl failed for ${url}:`, message);
      }
    }

    return results;
  }

  /**
   * Convert raw `{ url, html }` crawl results into IngestedEntity records:
   * strips boilerplate elements, prefers article/main text, and collapses
   * whitespace.
   */
  async transform(raw: any[]): Promise<IngestedEntity[]> {
    return raw.map(item => {
      const $ = cheerio.load(item.html);

      // Remove navigation/script noise before extracting readable text.
      $('script, style, nav, footer, header, .cookie-banner, .ads').remove();

      const title = $('title').text().trim() || item.url;
      // Prefer semantic containers; fall back to the whole body.
      let content = $('article').text() || $('main').text() || $('body').text();
      content = content.replace(/\s+/g, ' ').trim();
      const description = $('meta[name="description"]').attr('content') || '';

      return {
        id: item.url,
        type: 'web_page',
        source: 'web_crawler',
        title: title,
        content: `Title: ${title}\nDescription: ${description}\n\n${content}`,
        metadata: {
          url: item.url,
          description: description,
          crawledAt: new Date().toISOString(),
          keywordsMatched: this.config.keywords
        },
        timestamp: new Date()
      };
    });
  }

  /** The crawler has no external prerequisites; always available. */
  async isAvailable(): Promise<boolean> {
    return true;
  }
}