# RSS ingestion: fetch feeds, clean each entry, and index it into Typesense.
import feedparser
from datetime import datetime, timezone
from typing import List, Dict
from parser.cleaner import clean_html
from parser.hasher import hash_text
from indexer.typesense_indexer import create_collection_if_not_exists, index_document
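# For context, the parser helpers imported above are assumed to behave roughly
# as sketched below (an assumption, not the project's actual implementation):
#
#   import hashlib
#   import re
#
#   def clean_html(raw: str) -> str:
#       # Strip tags and collapse runs of whitespace into single spaces.
#       return re.sub(r"\s+", " ", re.sub(r"<[^>]+>", " ", raw or "")).strip()
#
#   def hash_text(text: str) -> str:
#       # Hex digest used below as a stable document id.
#       return hashlib.sha256(text.encode("utf-8")).hexdigest()
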
# Typesense collection schema. Field names are French identifiers
# (titre = title, texte = text, langue = language, pays = country) and are kept as-is.
SCHEMA = {
    "name": "documents",
    "fields": [
        {"name": "id", "type": "string"},
        {"name": "titre", "type": "string"},
        {"name": "texte", "type": "string"},
        {"name": "langue", "type": "string", "facet": True},
        {"name": "type_document", "type": "string", "facet": True},
        {"name": "pays", "type": "string", "facet": True},
        {"name": "source_url", "type": "string"},
        {"name": "date", "type": "string"}
    ]
}
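# The Typesense helpers imported above are assumed to wrap the official
# typesense-python client roughly as follows (a sketch under that assumption,
# with placeholder connection settings):
#
#   import typesense
#
#   client = typesense.Client({
#       "api_key": "CHANGE_ME",
#       "nodes": [{"host": "localhost", "port": "8108", "protocol": "http"}],
#       "connection_timeout_seconds": 2,
#   })
#
#   def create_collection_if_not_exists(schema):
#       try:
#           client.collections.create(schema)
#       except typesense.exceptions.ObjectAlreadyExists:
#           pass
#
#   def index_document(collection_name, doc):
#       client.collections[collection_name].documents.upsert(doc)
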
def fetch_and_index_feeds(feed_urls: List[str]) -> Dict[str, int]:
    """Fetch each RSS feed, clean its entries, and index them into Typesense."""
    create_collection_if_not_exists(SCHEMA)
    total, ok = 0, 0
    for url in feed_urls:
        parsed = feedparser.parse(url)
        for entry in parsed.entries:
            total += 1
            title = entry.get('title', '')
            link = entry.get('link', '')
            summary = entry.get('summary', '') or entry.get('description', '')
            content = summary
            text = clean_html(content)
            if not text and title:
                text = title
            if not text:
                continue
            # Stable deduplication: the id is derived from the link (or title) plus the text
            doc_id = hash_text((link or title) + '|' + text)[:16]
            doc = {
                "id": doc_id,
                "titre": title or link,
                "texte": text[:8000],
                "langue": "fr",  # best effort; language detection still to be added
                "type_document": "rss",
                "pays": "",  # geo enrichment still to be added
                "source_url": link,
                "date": datetime.now(timezone.utc).isoformat()
            }
            try:
                index_document("documents", doc)
                ok += 1
            except Exception:
                # Failures are silently skipped to keep the POC robust
                pass
    return {"processed": total, "indexed": ok}
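
# Minimal usage sketch; the feed URL below is illustrative only.
if __name__ == "__main__":
    stats = fetch_and_index_feeds(["https://example.com/feed.xml"])
    print(f"Processed {stats['processed']} entries, indexed {stats['indexed']}.")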