import logging
from datetime import datetime, timezone
from typing import Dict, List

import feedparser

from parser.cleaner import clean_html
from parser.hasher import hash_text
from indexer.typesense_indexer import create_collection_if_not_exists, index_document

logger = logging.getLogger(__name__)

# Typesense collection schema for indexed documents.
# Facet fields (langue / type_document / pays) support filtered search.
SCHEMA = {
    "name": "documents",
    "fields": [
        {"name": "id", "type": "string"},
        {"name": "titre", "type": "string"},
        {"name": "texte", "type": "string"},
        {"name": "langue", "type": "string", "facet": True},
        {"name": "type_document", "type": "string", "facet": True},
        {"name": "pays", "type": "string", "facet": True},
        {"name": "source_url", "type": "string"},
        {"name": "date", "type": "string"},
    ],
}


def fetch_and_index_feeds(feed_urls: List[str]) -> Dict[str, int]:
    """Fetch RSS/Atom feeds and index their entries into Typesense.

    For each entry, the HTML summary is cleaned to plain text, a stable
    document id is derived by hashing the link/title plus the text, and
    the resulting document is indexed into the "documents" collection.
    Indexing failures are logged but do not abort the run (best-effort).

    Args:
        feed_urls: List of RSS/Atom feed URLs to fetch.

    Returns:
        Dict with "processed" (entries seen) and "indexed" (entries
        successfully sent to the index) counts.
    """
    create_collection_if_not_exists(SCHEMA)
    total, ok = 0, 0
    for url in feed_urls:
        parsed = feedparser.parse(url)
        for entry in parsed.entries:
            total += 1
            title = entry.get('title', '')
            link = entry.get('link', '')
            # Some feeds use <summary>, others <description>.
            summary = entry.get('summary', '') or entry.get('description', '')
            text = clean_html(summary)
            if not text and title:
                # Fall back to the bare title when the body is empty.
                text = title
            if not text:
                continue
            # Stable deduplication: identical (link|text) always yields
            # the same 16-hex-char id, so re-runs upsert instead of dup.
            doc_id = hash_text((link or title) + '|' + text)[:16]
            # Prefer the entry's own published/updated date; fall back
            # to the (timezone-aware) indexing timestamp.
            entry_date = entry.get('published', '') or entry.get('updated', '')
            doc = {
                "id": doc_id,
                "titre": title or link,
                "texte": text[:8000],  # cap body size for the index
                "langue": "fr",  # best effort; language detection TODO
                "type_document": "rss",
                "pays": "",  # geo enrichment TODO
                "source_url": link,
                "date": entry_date or datetime.now(timezone.utc).isoformat(),
            }
            try:
                index_document("documents", doc)
                ok += 1
            except Exception:
                # Best-effort POC: keep going, but leave a trace instead
                # of swallowing the failure silently.
                logger.exception("Failed to index document %s from %s", doc_id, url)
    return {"processed": total, "indexed": ok}