"""Periodic scraper POC: fetch seed URLs, clean the HTML, and index the text
into a Typesense ``documents`` collection, forever, on a fixed interval."""

import json
import logging
import os
import time
from datetime import datetime, timezone

import requests

from indexer.typesense_indexer import create_collection_if_not_exists, index_document
from parser.cleaner import clean_html
from parser.hasher import hash_text

logger = logging.getLogger(__name__)

# Typesense collection schema for scraped documents.
# Faceted fields (langue / type_document / pays) support filtered search.
SCHEMA = {
    "name": "documents",
    "fields": [
        {"name": "id", "type": "string"},
        {"name": "titre", "type": "string"},
        {"name": "texte", "type": "string"},
        {"name": "langue", "type": "string", "facet": True},
        {"name": "type_document", "type": "string", "facet": True},
        {"name": "pays", "type": "string", "facet": True},
        {"name": "source_url", "type": "string"},
        {"name": "date", "type": "string"},
    ],
}

SEED_SOURCES = [
    # Simple public URLs for the POC; replace with dedicated scrapers (RSS, APIs, etc.)
    "https://example.com",
]


def fetch_url(url: str) -> str:
    """Fetch *url* over HTTP and return the response body.

    Best-effort: returns "" on any network/HTTP failure (timeout, connection
    error, non-2xx status) so the scrape loop can skip the source and continue.
    """
    try:
        r = requests.get(url, timeout=10)
        r.raise_for_status()
        return r.text
    except requests.RequestException:
        # Narrowed from a bare `except Exception`: only transport/HTTP errors
        # are expected here; anything else is a bug that should surface.
        logger.warning("fetch failed for %s", url, exc_info=True)
        return ""


def run_loop(interval_seconds: int = 300) -> None:
    """Run the scrape -> clean -> index cycle forever.

    Ensures the Typesense collection exists, then on each pass fetches every
    seed URL, extracts its text, and indexes a document. Sleeps
    *interval_seconds* between passes. Never returns.
    """
    create_collection_if_not_exists(SCHEMA)
    while True:
        for url in SEED_SOURCES:
            html = fetch_url(url)
            if not html:
                continue
            text = clean_html(html)
            if not text:
                continue
            # Content-derived id: the same url+text re-scraped yields the same
            # id, so re-runs upsert instead of duplicating documents.
            doc_id = hash_text(url + "|" + text)[:16]
            doc = {
                "id": doc_id,
                "titre": url,
                "texte": text[:5000],  # cap stored text size
                "langue": "fr",
                "type_document": "texte",
                "pays": "",
                "source_url": url,
                # Timezone-aware UTC; datetime.utcnow() is deprecated and naive.
                "date": datetime.now(timezone.utc).isoformat(),
            }
            try:
                index_document("documents", doc)
            except Exception:
                # Best-effort indexing: log (instead of the old silent `pass`)
                # and move on so one bad document doesn't kill the daemon.
                logger.exception("indexing failed for %s", url)
        time.sleep(interval_seconds)


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    interval = int(os.environ.get("SCRAPER_INTERVAL", "300"))
    run_loop(interval)