Spaces:
Sleeping
Sleeping
File size: 1,977 Bytes
dfdddb1 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 | import os
import time
import json
import requests
from datetime import datetime
from parser.cleaner import clean_html
from parser.hasher import hash_text
from indexer.typesense_indexer import index_document, create_collection_if_not_exists
# Typesense collection schema for scraped documents.
# Field names are French ("titre" = title, "texte" = body text,
# "langue" = language, "pays" = country). Fields marked facet=True can
# be used for faceted filtering in search queries.
SCHEMA = {
    "name": "documents",
    "fields": [
        {"name": "id", "type": "string"},
        {"name": "titre", "type": "string"},
        {"name": "texte", "type": "string"},
        {"name": "langue", "type": "string", "facet": True},
        {"name": "type_document", "type": "string", "facet": True},
        {"name": "pays", "type": "string", "facet": True},
        {"name": "source_url", "type": "string"},
        {"name": "date", "type": "string"}
    ]
}
SEED_SOURCES = [
    # Simple public URLs for the proof of concept; replace with dedicated
    # scrapers (RSS feeds, APIs, etc.).
    "https://example.com",
]
def fetch_url(url: str) -> str:
    """Fetch *url* over HTTP and return the response body, or "" on failure.

    Network errors, timeouts (10 s), and non-2xx statuses are all treated
    as "no content" so the polling loop can simply skip the source.
    """
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
    except requests.RequestException:
        # Narrowed from a bare `except Exception`: only network/HTTP
        # failures should be swallowed here; programming errors (typos,
        # bad arguments) must surface instead of being hidden.
        return ""
    return response.text
def run_loop(interval_seconds: int = 300) -> None:
    """Poll SEED_SOURCES forever, cleaning and indexing each page.

    Parameters
    ----------
    interval_seconds:
        Pause between full passes over the seed list (default 300 s).

    Ensures the Typesense collection exists, then loops: fetch each URL,
    strip it to text, and index a document keyed by a content hash so
    re-runs of identical content produce the same id.
    """
    # Function-local stdlib imports so this block stays self-contained;
    # `sys` is used for best-effort error reporting, `timezone` because
    # datetime.utcnow() is deprecated (naive timestamps) since 3.12.
    import sys
    from datetime import timezone

    create_collection_if_not_exists(SCHEMA)
    while True:
        for url in SEED_SOURCES:
            html = fetch_url(url)
            if not html:
                continue  # fetch failed or returned an empty body
            text = clean_html(html)
            if not text:
                continue  # nothing indexable left after cleanup
            # Stable 16-char id derived from URL + content: the same page
            # content re-scraped maps to the same document.
            doc_id = hash_text(url + "|" + text)[:16]
            doc = {
                "id": doc_id,
                "titre": url,
                "texte": text[:5000],  # cap stored text at 5000 chars
                "langue": "fr",
                "type_document": "texte",
                "pays": "",
                "source_url": url,
                # Timezone-aware UTC timestamp (ISO 8601 with +00:00
                # offset); replaces deprecated naive utcnow().
                "date": datetime.now(timezone.utc).isoformat(),
            }
            try:
                index_document("documents", doc)
            except Exception as exc:
                # Best-effort indexing: keep the loop alive, but report
                # the failure instead of silently discarding it.
                print(f"indexing failed for {url}: {exc}", file=sys.stderr)
        time.sleep(interval_seconds)
if __name__ == "__main__":
    # Poll interval (seconds) is configurable via the environment;
    # falls back to 5 minutes when SCRAPER_INTERVAL is unset.
    run_loop(int(os.environ.get("SCRAPER_INTERVAL", "300")))
|