import feedparser
import logging
from datetime import datetime, timezone
from typing import List, Dict

from parser.cleaner import clean_html
from parser.hasher import hash_text
from indexer.typesense_indexer import create_collection_if_not_exists, index_document
# Typesense collection schema for the "documents" collection.
# Field names are French ("titre", "texte", "langue", "pays") and must match
# the keys produced in fetch_and_index_feeds; fields marked facet=True are
# available for faceted filtering in search queries.
SCHEMA = {
    "name": "documents",
    "fields": [
        {"name": "id", "type": "string"},
        {"name": "titre", "type": "string"},
        {"name": "texte", "type": "string"},
        {"name": "langue", "type": "string", "facet": True},
        {"name": "type_document", "type": "string", "facet": True},
        {"name": "pays", "type": "string", "facet": True},
        {"name": "source_url", "type": "string"},
        {"name": "date", "type": "string"}
    ]
}
def fetch_and_index_feeds(feed_urls: List[str]) -> Dict[str, int]:
    """Fetch RSS/Atom feeds and index every usable entry into Typesense.

    Ensures the "documents" collection exists, then walks each feed's
    entries, cleans the HTML body, and indexes one document per entry.
    Indexing is best-effort: a failure on one document is logged and the
    crawl continues.

    Args:
        feed_urls: Feed URLs to fetch and parse.

    Returns:
        Dict with "processed" (entries seen) and "indexed" (entries
        successfully written to the index) counts.
    """
    create_collection_if_not_exists(SCHEMA)
    total, ok = 0, 0
    for url in feed_urls:
        parsed = feedparser.parse(url)
        for entry in parsed.entries:
            total += 1
            title = entry.get('title', '')
            link = entry.get('link', '')
            # Some feeds put the body under 'summary', others 'description'.
            summary = entry.get('summary', '') or entry.get('description', '')
            text = clean_html(summary)
            # Fall back to the bare title when the cleaned body is empty.
            if not text and title:
                text = title
            if not text:
                continue
            # Stable deduplication: the same link+text always hashes to the
            # same id, so re-crawls upsert instead of duplicating.
            doc_id = hash_text((link or title) + '|' + text)[:16]
            # Prefer the entry's own publication date; fall back to the
            # (timezone-aware, UTC) indexing time when the feed omits it.
            published = entry.get('published', '') or entry.get('updated', '')
            doc = {
                "id": doc_id,
                "titre": title or link,
                "texte": text[:8000],  # cap payload size
                "langue": "fr",  # best effort; language detection TODO
                "type_document": "rss",
                "pays": "",  # geo enrichment TODO
                "source_url": link,
                "date": published or datetime.now(timezone.utc).isoformat()
            }
            try:
                index_document("documents", doc)
                ok += 1
            except Exception:
                # Best-effort POC behaviour kept, but never silently: log the
                # failure with traceback instead of swallowing it.
                logging.getLogger(__name__).warning(
                    "Failed to index document %s from %s", doc_id, url,
                    exc_info=True)
    return {"processed": total, "indexed": ok}