# Scrap-Dji / workers / scraper_worker.py
# Author: joel — "Initial deployment: Scrap-Dji with API" (commit dfdddb1)
import json
import logging
import os
import time
from datetime import datetime, timezone

import requests

from indexer.typesense_indexer import create_collection_if_not_exists, index_document
from parser.cleaner import clean_html
from parser.hasher import hash_text
# Typesense collection schema for scraped documents.
# Fields marked facet=True can be used for faceted filtering in search
# (language, document type, country).
SCHEMA = {
    "name": "documents",
    "fields": [
        {"name": "id", "type": "string"},
        {"name": "titre", "type": "string"},
        {"name": "texte", "type": "string"},
        {"name": "langue", "type": "string", "facet": True},
        {"name": "type_document", "type": "string", "facet": True},
        {"name": "pays", "type": "string", "facet": True},
        {"name": "source_url", "type": "string"},
        {"name": "date", "type": "string"}
    ]
}
# Seed URLs crawled on every pass of the worker loop.
SEED_SOURCES = [
    # Simple public URLs for the POC; replace with dedicated scrapers (RSS, APIs, etc.)
    "https://example.com",
]
def fetch_url(url: str) -> str:
    """Fetch *url* and return the response body, or "" on any request failure.

    Best-effort by design: timeouts, connection errors, and non-2xx status
    codes all yield an empty string so the caller can simply skip the source.
    Catches only requests.RequestException (which HTTPError from
    raise_for_status() subclasses) so genuine programming errors still surface,
    unlike the previous bare ``except Exception``.
    """
    try:
        r = requests.get(url, timeout=10)
        r.raise_for_status()
    except requests.RequestException:
        return ""
    return r.text
def run_loop(interval_seconds: int = 300) -> None:
    """Scrape every seed source, index the results, sleep, and repeat forever.

    Args:
        interval_seconds: Pause between full passes over SEED_SOURCES.

    Side effects: ensures the Typesense collection exists, then upserts one
    document per successfully fetched and cleaned source on each pass.
    """
    create_collection_if_not_exists(SCHEMA)
    log = logging.getLogger(__name__)
    while True:
        for url in SEED_SOURCES:
            html = fetch_url(url)
            if not html:
                continue
            text = clean_html(html)
            if not text:
                continue
            # Stable id derived from (url, content) so re-scraping identical
            # content re-indexes the same document instead of duplicating it.
            doc_id = hash_text(url + "|" + text)[:16]
            doc = {
                "id": doc_id,
                "titre": url,
                "texte": text[:5000],  # cap stored text to keep documents small
                "langue": "fr",
                "type_document": "texte",
                "pays": "",
                "source_url": url,
                # Timezone-aware UTC timestamp; datetime.utcnow() is deprecated
                # and produced a naive datetime.
                "date": datetime.now(timezone.utc).isoformat()
            }
            try:
                index_document("documents", doc)
            except Exception:
                # Best-effort: one failed index must not kill the loop, but
                # leave a trace instead of swallowing the error silently.
                log.exception("Failed to index document for %s", url)
        time.sleep(interval_seconds)
if __name__ == "__main__":
    # Poll interval (seconds) is configurable through the environment;
    # defaults to five minutes.
    poll_every = int(os.environ.get("SCRAPER_INTERVAL", "300"))
    run_loop(poll_every)