#!/usr/bin/env python3
"""
Unified, optimized discovery module for Scrap-Dji.

Designed to be fast and resource-efficient (async + httpx): seed pages are
fetched concurrently, outbound links are filtered down to African news
domains, and the deduplicated results are merged into a JSON catalog.
"""
import asyncio
import httpx
import json
import re
import os
from urllib.parse import urljoin, urlparse
from typing import List, Dict, Set
from bs4 import BeautifulSoup
from datetime import datetime
from utils.logger import setup_logger
from utils.config import SCRAPER_DELAY, SCRAPER_USER_AGENT

logger = setup_logger(__name__)


class UnifiedDiscovery:
    """Discovers African news sources by crawling a fixed set of seed pages."""

    def __init__(self):
        self.headers = {'User-Agent': SCRAPER_USER_AGENT}
        # African ccTLD suffixes: a fast, network-free signal that a site is African.
        self.african_domains = {'.sn', '.ml', '.ci', '.ng', '.gh', '.ke',
                                '.ma', '.tn', '.dz', '.cm', '.cd', '.ga',
                                '.bj', '.tg'}
        # Aggregator/landing pages whose outbound links are mined for sources.
        self.seed_sources = [
            "https://www.allafrica.com",
            "https://www.africanews.com",
            "https://www.bbc.com/africa",
            "https://www.rfi.fr/fr/afrique"
        ]
        # Keywords used for content-based classification when the TLD is inconclusive.
        self.african_keywords = ['afrique', 'africa', 'actualités', 'news', 'journal', 'presse']

    async def is_african_site(self, client: httpx.AsyncClient, url: str) -> bool:
        """Fast domain-suffix check, then content analysis only if needed.

        Returns True when the domain ends with a known African ccTLD, or when
        the page body mentions African keywords more than 3 times. Network or
        parsing failures are treated as "not African" (best-effort check).
        """
        domain = urlparse(url).netloc.lower()
        if any(domain.endswith(ext) for ext in self.african_domains):
            return True
        try:
            resp = await client.get(url, timeout=5.0)
            if resp.status_code == 200:
                text = resp.text.lower()
                # Heuristic threshold: >3 keyword occurrences across the page.
                return sum(text.count(kw) for kw in self.african_keywords) > 3
        except Exception as e:
            # Best-effort probe: failures are expected during discovery,
            # so log for diagnostics instead of silently discarding them.
            logger.debug(f"Erreur vérification {url}: {e}")
        return False

    async def explore_source(self, client: httpx.AsyncClient, seed_url: str) -> List[Dict]:
        """Asynchronously explore one seed page and collect African news domains.

        Returns a list of source records (name, url, type, active,
        discovered_at). Errors on a single seed are logged and yield an
        empty/partial list rather than aborting the whole run.
        """
        discovered = []
        try:
            logger.info(f"🔍 Exploration de {seed_url}...")
            resp = await client.get(seed_url, timeout=10.0)
            soup = BeautifulSoup(resp.content, 'lxml')
            links = soup.find_all('a', href=True)
            for link in links:
                href = link.get('href')
                # Only absolute http(s) links are considered; relative links
                # point back into the seed site itself.
                if not href or not href.startswith('http'):
                    continue
                full_url = urljoin(seed_url, href)
                domain = urlparse(full_url).netloc
                # endswith (not substring) to avoid false positives such as
                # ".ma" matching "macworld.com"; consistent with is_african_site.
                if any(domain.endswith(ext) for ext in self.african_domains):
                    discovered.append({
                        'name': domain.replace('.', '_'),
                        'url': f"{urlparse(full_url).scheme}://{domain}",
                        'type': 'news',
                        'active': True,
                        'discovered_at': datetime.now().isoformat()
                    })
        except Exception as e:
            logger.error(f"Erreur seed {seed_url}: {e}")
        return discovered

    async def run_discovery(self):
        """Explore all seeds concurrently, deduplicate by URL, and persist.

        Returns the list of unique source records discovered this run.
        """
        async with httpx.AsyncClient(headers=self.headers, follow_redirects=True) as client:
            tasks = [self.explore_source(client, seed) for seed in self.seed_sources]
            results = await asyncio.gather(*tasks)

        all_sources = []
        seen = set()
        for batch in results:
            for s in batch:
                if s['url'] not in seen:
                    all_sources.append(s)
                    seen.add(s['url'])

        logger.info(f"✅ {len(all_sources)} sources uniques découvertes.")
        self.save_sources(all_sources)
        return all_sources

    def save_sources(self, sources: List[Dict], filename: str = "sources.json"):
        """Merge newly discovered sources into *filename*, deduplicating by URL.

        Existing entries are preserved; only sources whose URL is not already
        present are appended. A corrupt or unreadable existing file is logged
        and replaced rather than crashing the run.
        """
        existing = {"sources": []}
        if os.path.exists(filename):
            try:
                with open(filename, 'r', encoding='utf-8') as f:
                    existing = json.load(f)
            except (json.JSONDecodeError, OSError) as e:
                # Corrupt/unreadable catalog: start fresh instead of aborting.
                logger.warning(f"Fichier {filename} illisible, réinitialisation: {e}")
        existing_urls = {s['url'] for s in existing.get('sources', [])}
        new_sources = [s for s in sources if s['url'] not in existing_urls]
        # setdefault guards against a loaded file that lacks the 'sources' key.
        existing.setdefault('sources', []).extend(new_sources)
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(existing, f, indent=2, ensure_ascii=False)
        logger.info(f"💾 {len(new_sources)} nouvelles sources ajoutées à {filename}")


if __name__ == "__main__":
    discovery = UnifiedDiscovery()
    asyncio.run(discovery.run_discovery())