File size: 5,199 Bytes
529090e
 
 
a84b07b
 
 
 
 
 
 
529090e
 
 
 
 
 
 
a84b07b
 
 
 
 
529090e
a84b07b
 
 
 
 
 
 
529090e
 
 
a84b07b
 
529090e
 
 
 
a84b07b
 
 
 
 
 
 
 
 
 
529090e
 
 
a84b07b
 
529090e
 
 
a84b07b
529090e
a84b07b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
529090e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a84b07b
 
529090e
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
import axios from 'axios';
import * as cheerio from 'cheerio';
import { DataSourceAdapter, IngestedEntity } from './DataIngestionEngine.js';
import { URL } from 'url';

/** Settings controlling crawl breadth and content filtering. */
interface CrawlerConfig {
    // 'single'    = fetch seed pages only (no link following);
    // 'directory' = follow links on the same host whose path stays under the
    //               seed page's directory;
    // 'recursive' = follow any link on the same host.
    mode: 'single' | 'directory' | 'recursive';
    // Maximum link depth to follow from a seed URL (seeds are depth 0;
    // links are followed while depth < maxDepth).
    maxDepth: number;
    // If non-empty, pages whose body text contains none of these keywords
    // (case-insensitive) are skipped and their links are not followed.
    keywords: string[];
}

/**
 * Web crawler data-source adapter.
 *
 * Fetches seed URLs (queued via {@link addUrls}), optionally follows
 * same-host links up to a configured depth, filters pages by keyword,
 * and transforms the raw HTML into IngestedEntity records.
 */
export class WebCrawlerAdapter implements DataSourceAdapter {
    name = 'Web Crawler';
    type: 'other' = 'other';

    /** URLs already fetched (or attempted). Persists across fetch() calls to avoid re-crawling. */
    private visitedUrls: Set<string> = new Set();
    /** Seed URLs queued by addUrls(), consumed by the next fetch(). */
    private urlsToFetch: string[] = [];
    private config: CrawlerConfig = {
        mode: 'single',
        maxDepth: 1,
        keywords: []
    };

    constructor() { }

    /** Merge partial crawler settings into the current configuration. */
    configure(config: Partial<CrawlerConfig>) {
        if (config.mode) this.config.mode = config.mode;
        if (config.maxDepth !== undefined) this.config.maxDepth = config.maxDepth;
        if (config.keywords) this.config.keywords = config.keywords;
    }

    /**
     * Add seed URLs to the fetch queue.
     *
     * Only well-formed http(s) URLs are accepted; anything else is silently
     * dropped. (The previous prefix check accepted any string starting with
     * "http", e.g. "httpgarbage" or "httpx://…".)
     */
    addUrls(urls: string[]): void {
        const validUrls = urls.filter(u => {
            try {
                const protocol = new URL(u).protocol;
                return protocol === 'http:' || protocol === 'https:';
            } catch {
                return false; // not a parseable URL
            }
        });
        this.urlsToFetch.push(...validUrls);
    }

    /**
     * Crawl all queued URLs (breadth-first) and return the raw pages.
     *
     * Depending on config.mode, same-host links are followed up to
     * config.maxDepth. Pages matching none of the configured keywords are
     * skipped (and their links are not followed). Failed requests are
     * logged and skipped; the crawl continues.
     *
     * @returns Array of `{ url, html }` objects for every page kept.
     */
    async fetch(): Promise<any[]> {
        const results: any[] = [];
        const queue = this.urlsToFetch.map(url => ({ url, depth: 0 }));

        // Clear the seed queue as we are processing it now.
        this.urlsToFetch = [];

        // Hoisted: keyword list is loop-invariant, lowercase it once.
        const keywordsLower = this.config.keywords.map(kw => kw.toLowerCase());

        while (queue.length > 0) {
            const item = queue.shift();
            if (!item) break;
            const { url, depth } = item;

            if (this.visitedUrls.has(url)) continue;
            this.visitedUrls.add(url);

            console.log(`🕷️ Crawling: ${url} (Depth: ${depth}/${this.config.maxDepth})`);

            try {
                const response = await axios.get(url, {
                    timeout: 10000,
                    headers: { 'User-Agent': 'WidgeTDC-Search-Bot/1.0' }
                });

                const html = response.data;
                const $ = cheerio.load(html);
                const textContent = $('body').text().toLowerCase();

                // Keyword filter: skip pages that match none of the keywords.
                if (keywordsLower.length > 0) {
                    const hasKeyword = keywordsLower.some(kw => textContent.includes(kw));
                    if (!hasKeyword) {
                        console.log(`⏭️ Skipping ${url} - no matching keywords`);
                        continue;
                    }
                }

                results.push({ url, html });

                // Follow links if we haven't reached max depth and mode allows it.
                if (depth < this.config.maxDepth && this.config.mode !== 'single') {
                    const baseUrl = new URL(url);
                    // Hoisted out of the link loop: directory prefix of the
                    // current page, used only in 'directory' mode.
                    const basePath = baseUrl.pathname.substring(0, baseUrl.pathname.lastIndexOf('/') + 1);
                    const links = $('a[href]').map((_, el) => $(el).attr('href')).get();

                    for (const link of links) {
                        try {
                            const linkUrl = new URL(link, url);
                            const absoluteUrl = linkUrl.toString();

                            // Domain constraint: never leave the seed's host.
                            if (linkUrl.hostname !== baseUrl.hostname) continue;

                            // Mode constraint: 'directory' keeps links under
                            // the current page's directory.
                            if (this.config.mode === 'directory' && !linkUrl.pathname.startsWith(basePath)) {
                                continue;
                            }

                            if (!this.visitedUrls.has(absoluteUrl)) {
                                queue.push({ url: absoluteUrl, depth: depth + 1 });
                            }
                        } catch (e) {
                            // Invalid URL, ignore
                        }
                    }
                }

            } catch (error: unknown) {
                // Log and continue — one bad page must not abort the crawl.
                const message = error instanceof Error ? error.message : String(error);
                console.error(`Crawl failed for ${url}:`, message);
            }
        }
        return results;
    }

    /**
     * Convert raw crawled pages into IngestedEntity records: strip obvious
     * noise (scripts, nav, ads...), extract title/description/body text.
     *
     * metadata.keywordsMatched now reports the configured keywords that
     * actually occur in the extracted content (it previously echoed the
     * whole configured list regardless of the page).
     */
    async transform(raw: any[]): Promise<IngestedEntity[]> {
        return raw.map(item => {
            const $ = cheerio.load(item.html);

            // Remove noise
            $('script, style, nav, footer, header, .cookie-banner, .ads').remove();

            const title = $('title').text().trim() || item.url;
            let content = $('article').text() || $('main').text() || $('body').text();
            content = content.replace(/\s+/g, ' ').trim();
            const description = $('meta[name="description"]').attr('content') || '';

            // Which configured keywords actually appear in this page's text.
            const contentLower = content.toLowerCase();
            const keywordsMatched = this.config.keywords.filter(
                kw => contentLower.includes(kw.toLowerCase())
            );

            return {
                id: item.url,
                type: 'web_page',
                source: 'web_crawler',
                title: title,
                content: `Title: ${title}\nDescription: ${description}\n\n${content}`,
                metadata: {
                    url: item.url,
                    description: description,
                    crawledAt: new Date().toISOString(),
                    keywordsMatched: keywordsMatched
                },
                timestamp: new Date()
            };
        });
    }

    /** The crawler has no external dependency to probe; always available. */
    async isAvailable(): Promise<boolean> {
        return true;
    }
}