import axios from 'axios';
import * as cheerio from 'cheerio';
import { DataSourceAdapter, IngestedEntity } from './DataIngestionEngine.js';
import { URL } from 'url';

/**
 * Settings that control how far the crawler travels from its seed URLs and
 * which of the pages it downloads are kept.
 */
interface CrawlerConfig {
    /**
     * Case-insensitive substrings looked for in a page's body text. A page is
     * kept when at least one of them occurs; an empty list disables the filter.
     */
    keywords: Array<string>;

    /**
     * Link-following limit. Seed URLs are at depth 0, and links are only
     * followed from pages whose depth is strictly below this value.
     */
    maxDepth: number;

    /**
     * Link-following strategy:
     * - `single`: never follow links;
     * - `directory`: follow same-host links whose path lies under the
     *   directory of the page that contains them;
     * - `recursive`: follow any same-host link.
     */
    mode: 'single' | 'directory' | 'recursive';
}

/**
 * Data source adapter that downloads web pages breadth-first, starting from
 * the URLs queued with {@link WebCrawlerAdapter.addUrls}, and converts them
 * into ingestible entities.
 *
 * Links are only followed within the host of the page that contains them,
 * subject to the configured mode and maximum depth. Pages already crawled by
 * this instance are remembered across `fetch()` calls and are not downloaded
 * again.
 */
export class WebCrawlerAdapter implements DataSourceAdapter {
    name = 'Web Crawler';
    type: 'other' = 'other';

    /** Fragment-free keys of every URL this instance has already attempted. */
    private visitedUrls: Set<string> = new Set();
    /** Seed URLs waiting for the next `fetch()` call. */
    private urlsToFetch: string[] = [];
    private config: CrawlerConfig = {
        mode: 'single',
        maxDepth: 1,
        keywords: []
    };

    constructor() { }

    /**
     * Overrides the provided settings; omitted settings keep their current value.
     *
     * @param config Subset of crawler settings to apply.
     */
    configure(config: Partial<CrawlerConfig>): void {
        if (config.mode) this.config.mode = config.mode;
        if (config.maxDepth !== undefined) this.config.maxDepth = config.maxDepth;
        // Copy so later mutation of the caller's array cannot change the filter.
        if (config.keywords) this.config.keywords = [...config.keywords];
    }

    /**
     * Add URLs to fetch queue.
     *
     * Entries that are not parseable absolute `http:`/`https:` URLs are
     * silently dropped. Accepted entries are stored exactly as given.
     *
     * @param urls Candidate seed URLs.
     */
    addUrls(urls: string[]): void {
        for (const candidate of urls) {
            if (WebCrawlerAdapter.parseHttpUrl(candidate) !== null) {
                this.urlsToFetch.push(candidate);
            }
        }
    }

    /**
     * Crawls the queued seed URLs (and, depending on the mode, the pages they
     * link to) and returns the raw HTML of every page that passed the keyword
     * filter.
     *
     * The seed queue is emptied by this call. A failed download is logged and
     * skipped; it never aborts the crawl. Links found on a page that is
     * rejected by the keyword filter are not followed.
     *
     * @returns One `{ url, html }` record per accepted page, in crawl order.
     */
    async fetch(): Promise<any[]> {
        const results: any[] = [];
        const queue = this.urlsToFetch.map(url => ({
            url,
            key: WebCrawlerAdapter.dedupeKey(url),
            depth: 0
        }));
        // Keys already placed on the queue during this run, so a link that
        // appears on many pages is enqueued once instead of once per mention.
        const enqueued = new Set<string>(queue.map(entry => entry.key));

        // Clear initial queue as we are processing it now
        this.urlsToFetch = [];

        const { mode, maxDepth } = this.config;
        const keywords = this.config.keywords.map(kw => kw.toLowerCase());

        while (queue.length > 0) {
            const item = queue.shift();
            if (!item) break;
            const { url, key, depth } = item;

            if (this.visitedUrls.has(key)) continue;
            this.visitedUrls.add(key);

            console.log(`🕷️ Crawling: ${url} (Depth: ${depth}/${maxDepth})`);

            try {
                const response = await axios.get(url, {
                    timeout: 10000,
                    headers: { 'User-Agent': 'WidgeTDC-Search-Bot/1.0' }
                });

                // axios parses JSON bodies into objects; only text can be
                // treated as an HTML document.
                const html: unknown = response.data;
                if (typeof html !== 'string') {
                    console.log(`⏭️ Skipping ${url} - response body is not text`);
                    continue;
                }

                const $ = cheerio.load(html);

                // Check keywords if configured
                if (keywords.length > 0) {
                    const textContent = $('body').text().toLowerCase();
                    const hasKeyword = keywords.some(kw => textContent.includes(kw));
                    if (!hasKeyword) {
                        console.log(`⏭️ Skipping ${url} - no matching keywords`);
                        continue;
                    }
                }

                results.push({ url, html });

                // Find matching links if we haven't reached max depth
                if (depth < maxDepth && mode !== 'single') {
                    for (const next of this.collectLinks($, url)) {
                        if (enqueued.has(next) || this.visitedUrls.has(next)) continue;
                        enqueued.add(next);
                        queue.push({ url: next, key: next, depth: depth + 1 });
                    }
                }

            } catch (error: unknown) {
                const message = error instanceof Error ? error.message : String(error);
                console.error(`Crawl failed for ${url}:`, message);
            }
        }
        return results;
    }

    /**
     * Converts raw `{ url, html }` records produced by `fetch()` into entities.
     *
     * Scripts, styles, navigation, header/footer and cookie/ad containers are
     * stripped before the text is extracted. The text comes from `<article>`,
     * else `<main>`, else `<body>`, with whitespace collapsed.
     *
     * @param raw Records as returned by `fetch()`.
     * @returns One entity per record, in the same order.
     */
    async transform(raw: any[]): Promise<IngestedEntity[]> {
        const keywords = this.config.keywords;

        return raw.map(item => {
            const $ = cheerio.load(item.html);

            // Match against the same text the fetch-time filter sees, i.e. the
            // body text before any noise is removed, and report only the
            // keywords that are actually present on this page.
            const fullText = keywords.length > 0 ? $('body').text().toLowerCase() : '';
            const keywordsMatched = keywords.filter(kw => fullText.includes(kw.toLowerCase()));

            // Remove noise
            $('script, style, nav, footer, header, .cookie-banner, .ads').remove();

            const title = $('title').text().trim() || item.url;
            let content = $('article').text() || $('main').text() || $('body').text();
            content = content.replace(/\s+/g, ' ').trim();
            const description = $('meta[name="description"]').attr('content') || '';

            // Single clock reading so both timestamps of an entity agree.
            const now = new Date();

            return {
                id: item.url,
                type: 'web_page',
                source: 'web_crawler',
                title: title,
                content: `Title: ${title}\nDescription: ${description}\n\n${content}`,
                metadata: {
                    url: item.url,
                    description: description,
                    crawledAt: now.toISOString(),
                    keywordsMatched: keywordsMatched
                },
                timestamp: now
            };
        });
    }

    /** Always reports the adapter as available; no connectivity check is made. */
    async isAvailable(): Promise<boolean> {
        return true;
    }

    /**
     * Extracts the followable links of a page: http(s) links on the page's own
     * host and, in `directory` mode, under the page's own directory.
     *
     * @param $ Parsed document of the page.
     * @param pageUrl URL the page was requested from; relative links resolve against it.
     * @returns Absolute, fragment-free URLs in document order (may contain repeats).
     */
    private collectLinks($: ReturnType<typeof cheerio.load>, pageUrl: string): string[] {
        const page = new URL(pageUrl);
        const directoryPrefix = page.pathname.substring(0, page.pathname.lastIndexOf('/') + 1);
        const hrefs = $('a[href]').map((_, el) => $(el).attr('href')).get();

        const found: string[] = [];
        for (const href of hrefs) {
            const target = WebCrawlerAdapter.parseHttpUrl(href, pageUrl);
            if (target === null) continue;

            // Domain constraint
            if (target.hostname !== page.hostname) continue;

            // Mode constraint: Directory
            if (this.config.mode === 'directory' && !target.pathname.startsWith(directoryPrefix)) continue;

            found.push(target.href);
        }
        return found;
    }

    /**
     * Parses a URL and drops its fragment, since fragments address positions
     * inside a document rather than distinct documents.
     *
     * @param raw Absolute URL, or a reference relative to `base`.
     * @param base Optional URL that `raw` is resolved against.
     * @returns The parsed URL, or `null` when it is invalid or not http(s).
     */
    private static parseHttpUrl(raw: string, base?: string): URL | null {
        try {
            const parsed = new URL(raw, base);
            if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') return null;
            parsed.hash = '';
            return parsed;
        } catch (e) {
            // Invalid URL, ignore
            return null;
        }
    }

    /** Returns the fragment-free form of `url` used for de-duplication, or `url` itself if unparseable. */
    private static dedupeKey(url: string): string {
        const parsed = WebCrawlerAdapter.parseHttpUrl(url);
        return parsed === null ? url : parsed.href;
    }
}