import axios from 'axios';
import * as cheerio from 'cheerio';
import { DataSourceAdapter, IngestedEntity } from './DataIngestionEngine.js';
import { URL } from 'url';
/** Options controlling crawl breadth and content filtering. */
interface CrawlerConfig {
  // 'single': only seed URLs; 'directory': follow same-directory links; 'recursive': any same-host link
  mode: 'single' | 'directory' | 'recursive';
  // Maximum link depth to follow from a seed URL (seeds are depth 0)
  maxDepth: number;
  // If non-empty, pages whose body text contains none of these are skipped
  keywords: string[];
}
/**
 * Breadth-first web crawler adapter.
 *
 * Fetches seed URLs added via {@link addUrls}, optionally follows same-host
 * links up to `maxDepth` (per {@link CrawlerConfig.mode}), filters pages by
 * keyword, and transforms the raw HTML into {@link IngestedEntity} records.
 */
export class WebCrawlerAdapter implements DataSourceAdapter {
  name = 'Web Crawler';
  type: 'other' = 'other';

  /** Fragment-stripped URLs already crawled; persists across fetch() calls. */
  private visitedUrls: Set<string> = new Set();
  /** Pending seed URLs; drained at the start of each fetch(). */
  private urlsToFetch: string[] = [];
  private config: CrawlerConfig = {
    mode: 'single',
    maxDepth: 1,
    keywords: []
  };

  constructor() { }

  /** Merge partial settings into the current configuration (unset keys keep their value). */
  configure(config: Partial<CrawlerConfig>) {
    if (config.mode) this.config.mode = config.mode;
    if (config.maxDepth !== undefined) this.config.maxDepth = config.maxDepth;
    if (config.keywords) this.config.keywords = config.keywords;
  }

  /**
   * Add URLs to fetch queue.
   * Only URLs that parse and use http/https are accepted; the old
   * `startsWith('http')` check let through garbage like `httpfoo://x`.
   */
  addUrls(urls: string[]): void {
    const validUrls = urls.filter(u => {
      try {
        const parsed = new URL(u);
        return parsed.protocol === 'http:' || parsed.protocol === 'https:';
      } catch {
        return false;
      }
    });
    this.urlsToFetch.push(...validUrls);
  }

  /**
   * Crawl all queued URLs breadth-first and return `{ url, html }` pairs.
   * Fragments (`#…`) are stripped from discovered links so `page#a` and
   * `page#b` are recognized as the same page by the visited-set dedupe.
   */
  async fetch(): Promise<any[]> {
    const results: any[] = [];
    // Seed the BFS queue from the pending list, then clear it — URLs added
    // while this crawl runs will be picked up by the next fetch().
    const queue = this.urlsToFetch.map(url => ({ url, depth: 0 }));
    this.urlsToFetch = [];

    while (queue.length > 0) {
      const item = queue.shift();
      if (!item) break;
      const { url, depth } = item;

      if (this.visitedUrls.has(url)) continue;
      this.visitedUrls.add(url);

      console.log(`🕷️ Crawling: ${url} (Depth: ${depth}/${this.config.maxDepth})`);

      try {
        const response = await axios.get(url, {
          timeout: 10000,
          headers: { 'User-Agent': 'WidgeTDC-Search-Bot/1.0' }
        });
        const html = response.data;
        const $ = cheerio.load(html);
        const textContent = $('body').text().toLowerCase();

        // Keyword gate: pages with no matching keyword are neither kept
        // nor mined for further links.
        if (this.config.keywords.length > 0) {
          const hasKeyword = this.config.keywords.some(kw => textContent.includes(kw.toLowerCase()));
          if (!hasKeyword) {
            console.log(`⏭️ Skipping ${url} - no matching keywords`);
            continue;
          }
        }

        results.push({ url, html });

        // Discover links unless this is a single-page crawl or max depth is reached.
        if (depth < this.config.maxDepth && this.config.mode !== 'single') {
          const baseUrl = new URL(url);
          const links = $('a[href]').map((_, el) => $(el).attr('href')).get();
          for (const link of links) {
            try {
              const absolute = new URL(link, url);
              absolute.hash = ''; // strip fragment so the visited-set dedupes properly
              // Domain constraint: never leave the current host.
              if (absolute.hostname !== baseUrl.hostname) continue;
              // Directory mode: only follow links under the current page's directory.
              if (this.config.mode === 'directory') {
                const basePath = baseUrl.pathname.substring(0, baseUrl.pathname.lastIndexOf('/') + 1);
                if (!absolute.pathname.startsWith(basePath)) continue;
              }
              const absoluteUrl = absolute.toString();
              if (!this.visitedUrls.has(absoluteUrl)) {
                queue.push({ url: absoluteUrl, depth: depth + 1 });
              }
            } catch {
              // Invalid href (e.g. "javascript:", malformed) — ignore
            }
          }
        }
      } catch (error) {
        // Narrow the unknown catch value instead of assuming `any`.
        const msg = error instanceof Error ? error.message : String(error);
        console.error(`Crawl failed for ${url}:`, msg);
      }
    }
    return results;
  }

  /**
   * Convert raw `{ url, html }` pairs into IngestedEntity records:
   * strips boilerplate markup, extracts title/description/body text,
   * and records which configured keywords actually appear in the page
   * (previously `keywordsMatched` wrongly listed ALL configured keywords).
   */
  async transform(raw: any[]): Promise<IngestedEntity[]> {
    return raw.map(item => {
      const $ = cheerio.load(item.html);
      // Remove noise
      $('script, style, nav, footer, header, .cookie-banner, .ads').remove();

      const title = $('title').text().trim() || item.url;
      let content = $('article').text() || $('main').text() || $('body').text();
      content = content.replace(/\s+/g, ' ').trim();
      const description = $('meta[name="description"]').attr('content') || '';

      // Report only the keywords that actually occur in the extracted text.
      const contentLower = content.toLowerCase();
      const keywordsMatched = this.config.keywords.filter(
        kw => contentLower.includes(kw.toLowerCase())
      );

      return {
        id: item.url,
        type: 'web_page',
        source: 'web_crawler',
        title: title,
        content: `Title: ${title}\nDescription: ${description}\n\n${content}`,
        metadata: {
          url: item.url,
          description: description,
          crawledAt: new Date().toISOString(),
          keywordsMatched: keywordsMatched
        },
        timestamp: new Date()
      };
    });
  }

  /** The crawler has no external prerequisites; it is always available. */
  async isAvailable(): Promise<boolean> {
    return true;
  }
}