# Scraper service: periodically fetches seed URLs, cleans the HTML, and
# indexes the resulting documents into Typesense.
import json
import os
import time
from datetime import datetime, timezone

import requests

from indexer.typesense_indexer import index_document, create_collection_if_not_exists
from parser.cleaner import clean_html
from parser.hasher import hash_text
# Typesense collection schema for scraped documents.
# Field names are French (titre=title, texte=text, langue=language,
# pays=country); the facet fields enable filtered search on those values.
SCHEMA = {
    "name": "documents",
    "fields": [
        {"name": "id", "type": "string"},
        {"name": "titre", "type": "string"},
        {"name": "texte", "type": "string"},
        {"name": "langue", "type": "string", "facet": True},
        {"name": "type_document", "type": "string", "facet": True},
        {"name": "pays", "type": "string", "facet": True},
        {"name": "source_url", "type": "string"},
        {"name": "date", "type": "string"}
    ]
}
# URLs polled by run_loop() on every iteration.
SEED_SOURCES = [
    # Simple public URLs for the POC; replace with dedicated scrapers (RSS, APIs, etc.)
    "https://example.com",
]
def fetch_url(url: str) -> str:
    """Fetch *url* over HTTP and return the response body, or "" on failure.

    Best-effort by design: any request-level failure (timeout, DNS error,
    connection reset, HTTP 4xx/5xx) yields "" so the scrape loop can keep
    going. The handler is narrowed to ``requests.RequestException`` —
    previously a bare ``except Exception`` also hid programming errors
    (e.g. NameError, TypeError), which should surface instead.
    """
    try:
        r = requests.get(url, timeout=10)
        r.raise_for_status()
    except requests.RequestException:
        return ""
    return r.text
def run_loop(interval_seconds: int = 300):
    """Run the scrape-and-index loop forever.

    Ensures the Typesense collection exists, then every *interval_seconds*
    fetches each seed URL, cleans the HTML, and indexes the resulting
    document into the "documents" collection. Never returns.
    """
    create_collection_if_not_exists(SCHEMA)
    while True:
        for url in SEED_SOURCES:
            _scrape_and_index(url)
        time.sleep(interval_seconds)


def _scrape_and_index(url: str) -> None:
    """Fetch one URL, build a document from it, and index it (best-effort)."""
    html = fetch_url(url)
    if not html:
        return
    text = clean_html(html)
    if not text:
        return
    # Content-derived id: same URL + same cleaned text => same document,
    # so re-scraping unchanged pages upserts rather than duplicates.
    doc_id = hash_text(url + "|" + text)[:16]
    doc = {
        "id": doc_id,
        "titre": url,  # no title extraction yet; the URL stands in for the title
        "texte": text[:5000],  # cap stored text to keep index entries small
        "langue": "fr",
        "type_document": "texte",
        "pays": "",
        "source_url": url,
        # Timezone-aware UTC timestamp; datetime.utcnow() is deprecated
        # (Python 3.12) and returned a naive datetime.
        "date": datetime.now(timezone.utc).isoformat(),
    }
    try:
        index_document("documents", doc)
    except Exception:
        # Deliberate best-effort: one bad document must not kill the loop.
        # TODO: log the failure instead of discarding it silently.
        pass
if __name__ == "__main__":
    # Poll interval is configurable via SCRAPER_INTERVAL (seconds); default 5 min.
    poll_seconds = int(os.environ.get("SCRAPER_INTERVAL", "300"))
    run_loop(poll_seconds)