# Scrap-Dji — scraper/rss.py
# Author: joel
# Initial deployment: Scrap-Dji with API (commit dfdddb1)
import logging
from datetime import datetime, timezone
from typing import Dict, List

import feedparser

from indexer.typesense_indexer import create_collection_if_not_exists, index_document
from parser.cleaner import clean_html
from parser.hasher import hash_text
# Typesense collection schema for scraped documents.
# Field names are French ("titre", "texte", ...) and are part of the
# indexed API — do not rename. Faceted fields support filtering/grouping.
SCHEMA = {
    "name": "documents",
    "fields": [
        {"name": field_name, "type": "string", **({"facet": True} if faceted else {})}
        for field_name, faceted in (
            ("id", False),
            ("titre", False),
            ("texte", False),
            ("langue", True),
            ("type_document", True),
            ("pays", True),
            ("source_url", False),
            ("date", False),
        )
    ],
}
def fetch_and_index_feeds(feed_urls: List[str]) -> Dict[str, int]:
    """Fetch RSS/Atom feeds and index each entry into Typesense.

    Ensures the "documents" collection exists, then downloads every feed,
    cleans each entry's HTML, and indexes it with a stable content hash as
    the document id (so re-running on the same feed deduplicates).

    Args:
        feed_urls: Feed URLs to download and parse with feedparser.

    Returns:
        Counters: {"processed": entries seen, "indexed": entries
        successfully written to Typesense}.
    """
    log = logging.getLogger(__name__)
    create_collection_if_not_exists(SCHEMA)
    total, ok = 0, 0
    for url in feed_urls:
        parsed = feedparser.parse(url)
        for entry in parsed.entries:
            total += 1
            title = entry.get('title', '')
            link = entry.get('link', '')
            # Some feeds use "description" instead of "summary".
            summary = entry.get('summary', '') or entry.get('description', '')
            text = clean_html(summary)
            if not text and title:
                # No usable body: fall back to the bare title.
                text = title
            if not text:
                continue  # nothing indexable in this entry
            # Stable deduplication: the same link+text always yields the same id.
            doc_id = hash_text((link or title) + '|' + text)[:16]
            doc = {
                "id": doc_id,
                "titre": title or link,
                "texte": text[:8000],  # cap payload size for the index
                "langue": "fr",  # best effort; language detection to be added
                "type_document": "rss",
                "pays": "",  # geo enrichment to be added
                "source_url": link,
                # Timezone-aware UTC timestamp (datetime.utcnow() is naive
                # and deprecated since Python 3.12).
                "date": datetime.now(timezone.utc).isoformat()
            }
            try:
                index_document("documents", doc)
                ok += 1
            except Exception:
                # Best effort for robustness: keep processing the remaining
                # entries, but record the failure instead of swallowing it.
                log.exception("Failed to index entry %s from %s", doc_id, url)
    return {"processed": total, "indexed": ok}