#!/usr/bin/env python3
"""
Unified, optimized discovery module for Scrap-Dji.

Designed to be fast and resource-efficient (async + httpx): seed pages are
fetched concurrently, outbound links are filtered down to African news
domains, and the deduplicated results are merged into a JSON catalog.
"""
import asyncio
import httpx
import json
import re
import os
from urllib.parse import urljoin, urlparse
from typing import List, Dict, Set
from bs4 import BeautifulSoup
from datetime import datetime
from utils.logger import setup_logger
from utils.config import SCRAPER_DELAY, SCRAPER_USER_AGENT

logger = setup_logger(__name__)


class UnifiedDiscovery:
    """Discovers African news sources by crawling a fixed set of seed pages."""

    def __init__(self):
        self.headers = {'User-Agent': SCRAPER_USER_AGENT}
        # African ccTLD suffixes: a fast, network-free signal that a site is African.
        self.african_domains = {'.sn', '.ml', '.ci', '.ng', '.gh', '.ke',
                                '.ma', '.tn', '.dz', '.cm', '.cd', '.ga',
                                '.bj', '.tg'}
        # Aggregator/landing pages whose outbound links are mined for sources.
        self.seed_sources = [
            "https://www.allafrica.com",
            "https://www.africanews.com",
            "https://www.bbc.com/africa",
            "https://www.rfi.fr/fr/afrique"
        ]
        # Keywords used for content-based classification when the TLD is inconclusive.
        self.african_keywords = ['afrique', 'africa', 'actualités', 'news', 'journal', 'presse']

    async def is_african_site(self, client: httpx.AsyncClient, url: str) -> bool:
        """Fast domain-suffix check, then content analysis only if needed.

        Returns True when the domain ends with a known African ccTLD, or when
        the page body mentions African keywords more than 3 times. Network or
        parsing failures are treated as "not African" (best-effort check).
        """
        domain = urlparse(url).netloc.lower()
        if any(domain.endswith(ext) for ext in self.african_domains):
            return True
        try:
            resp = await client.get(url, timeout=5.0)
            if resp.status_code == 200:
                text = resp.text.lower()
                # Heuristic threshold: >3 keyword occurrences across the page.
                return sum(text.count(kw) for kw in self.african_keywords) > 3
        except Exception as e:
            # Best-effort probe: failures are expected during discovery,
            # so log for diagnostics instead of silently discarding them.
            logger.debug(f"Erreur vérification {url}: {e}")
        return False

    async def explore_source(self, client: httpx.AsyncClient, seed_url: str) -> List[Dict]:
        """Asynchronously explore one seed page and collect African news domains.

        Returns a list of source records (name, url, type, active,
        discovered_at). Errors on a single seed are logged and yield an
        empty/partial list rather than aborting the whole run.
        """
        discovered = []
        try:
            logger.info(f"🔍 Exploration de {seed_url}...")
            resp = await client.get(seed_url, timeout=10.0)
            soup = BeautifulSoup(resp.content, 'lxml')
            links = soup.find_all('a', href=True)
            for link in links:
                href = link.get('href')
                # Only absolute http(s) links are considered; relative links
                # point back into the seed site itself.
                if not href or not href.startswith('http'):
                    continue
                full_url = urljoin(seed_url, href)
                domain = urlparse(full_url).netloc
                # endswith (not substring) to avoid false positives such as
                # ".ma" matching "macworld.com"; consistent with is_african_site.
                if any(domain.endswith(ext) for ext in self.african_domains):
                    discovered.append({
                        'name': domain.replace('.', '_'),
                        'url': f"{urlparse(full_url).scheme}://{domain}",
                        'type': 'news',
                        'active': True,
                        'discovered_at': datetime.now().isoformat()
                    })
        except Exception as e:
            logger.error(f"Erreur seed {seed_url}: {e}")
        return discovered

    async def run_discovery(self):
        """Explore all seeds concurrently, deduplicate by URL, and persist.

        Returns the list of unique source records discovered this run.
        """
        async with httpx.AsyncClient(headers=self.headers, follow_redirects=True) as client:
            tasks = [self.explore_source(client, seed) for seed in self.seed_sources]
            results = await asyncio.gather(*tasks)

        all_sources = []
        seen = set()
        for batch in results:
            for s in batch:
                if s['url'] not in seen:
                    all_sources.append(s)
                    seen.add(s['url'])

        logger.info(f"✅ {len(all_sources)} sources uniques découvertes.")
        self.save_sources(all_sources)
        return all_sources

    def save_sources(self, sources: List[Dict], filename: str = "sources.json"):
        """Merge newly discovered sources into *filename*, deduplicating by URL.

        Existing entries are preserved; only sources whose URL is not already
        present are appended. A corrupt or unreadable existing file is logged
        and replaced rather than crashing the run.
        """
        existing = {"sources": []}
        if os.path.exists(filename):
            try:
                with open(filename, 'r', encoding='utf-8') as f:
                    existing = json.load(f)
            except (json.JSONDecodeError, OSError) as e:
                # Corrupt/unreadable catalog: start fresh instead of aborting.
                logger.warning(f"Fichier {filename} illisible, réinitialisation: {e}")
        existing_urls = {s['url'] for s in existing.get('sources', [])}
        new_sources = [s for s in sources if s['url'] not in existing_urls]
        # setdefault guards against a loaded file that lacks the 'sources' key.
        existing.setdefault('sources', []).extend(new_sources)
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(existing, f, indent=2, ensure_ascii=False)
        logger.info(f"💾 {len(new_sources)} nouvelles sources ajoutées à {filename}")


if __name__ == "__main__":
    discovery = UnifiedDiscovery()
    asyncio.run(discovery.run_discovery())