Spaces:
Sleeping
Sleeping
#!/usr/bin/env python3
"""
Script de scraping massif pour le Togo
Collecte des données depuis toutes les sources togolaises configurées
"""
import asyncio
import json
import sys
from collections import Counter
from pathlib import Path
from urllib.parse import urlparse

# Ajout du répertoire racine au path
sys.path.append(str(Path(__file__).parent))

from scraper.main import ScrapDjiScraper
from utils.logger import setup_logger

logger = setup_logger(__name__)
def print_banner():
    """Print the startup banner for the Togo mass-scraping script."""
    bar = "=" * 60
    print(bar)
    print("🇹🇬 SCRAPING MASSIF - DONNÉES TOGO 🇹🇬")
    print(bar)
    print()
def print_stats(data_file: str = "data/search_index.json") -> None:
    """Display scraping statistics read from the collected search index.

    Args:
        data_file: Path to the JSON index — a list of document dicts that
            carry at least the 'pays' and 'source_url' keys.

    Prints a summary to stdout; if the file does not exist yet a friendly
    notice is shown instead, and any other read/parse error is logged.
    """
    try:
        with open(data_file, 'r', encoding='utf-8') as f:
            data = json.load(f)

        total = len(data)
        togo_docs = [d for d in data if d.get('pays') == 'Togo']

        print("\n" + "=" * 60)
        print("📊 STATISTIQUES DE SCRAPING")
        print("=" * 60)
        print(f"📄 Total de documents collectés: {total}")
        print(f"🇹🇬 Documents sur le Togo: {len(togo_docs)}")

        if togo_docs:
            # Count documents per source domain. urlparse().netloc is more
            # robust than splitting the URL on '/' (and avoids splitting
            # the same string twice per document, as the old code did).
            sources = Counter(
                urlparse(doc.get('source_url', '')).netloc or 'unknown'
                for doc in togo_docs
            )

            print("\n📰 Répartition par source:")
            # most_common() yields (domain, count) pairs sorted by count desc.
            for source, count in sources.most_common():
                print(f" • {source}: {count} articles")

        print("=" * 60)
        print()
    except FileNotFoundError:
        # Expected on the very first run, before any scraping happened.
        print("⚠️ Aucune donnée trouvée. Le scraping va commencer...")
    except Exception as e:
        logger.error(f"Erreur lecture stats: {e}")
async def main():
    """Run the full pipeline: banner, config check, scrape, then stats."""
    print_banner()

    # Bail out early when the configuration file is missing.
    if not Path("sources.json").exists():
        print("❌ Fichier sources.json manquant!")
        return

    # Load the source configuration and keep only the active Togolese entries.
    with open("sources.json", 'r', encoding='utf-8') as cfg_file:
        config = json.load(cfg_file)

    togo_sources = [
        src for src in config.get('sources', [])
        if src.get('active', True) and src.get('pays') == 'Togo'
    ]

    print(f"🎯 Sources togolaises actives: {len(togo_sources)}")
    for src in togo_sources:
        print(f" • {src['name']} - {src['url']}")
    print()

    # Show whatever was already collected before starting this run.
    print_stats()

    print("🚀 Démarrage du scraping massif...")
    print("⏳ Cela peut prendre plusieurs minutes...\n")

    scraper = ScrapDjiScraper("sources.json")
    await scraper.run()

    # Final summary after the run completes.
    print("\n✅ Scraping terminé!")
    print_stats()
# Script entry point: drive the async pipeline and handle interruptions.
if __name__ == "__main__":
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        # Ctrl+C: tell the user, then still show the stats gathered so far.
        print("\n\n⚠️ Scraping interrompu par l'utilisateur")
        print_stats()
    except Exception as e:
        # Top-level boundary: log the failure and surface it on stdout.
        logger.error(f"Erreur fatale: {e}")
        print(f"\n❌ Erreur: {e}")