# Scrap-Dji / workers / scraper_worker.py
# Author: joel — "Initial deployment: Scrap-Dji with API" (commit dfdddb1)
import json
import logging
import os
import time
from datetime import datetime, timezone

import requests

from indexer.typesense_indexer import create_collection_if_not_exists, index_document
from parser.cleaner import clean_html
from parser.hasher import hash_text
# Typesense collection schema for scraped documents.
# Fields marked facet=True can be used for faceted filtering in search
# (language, document type, country).
SCHEMA = {
    "name": "documents",
    "fields": [
        {"name": "id", "type": "string"},
        {"name": "titre", "type": "string"},
        {"name": "texte", "type": "string"},
        {"name": "langue", "type": "string", "facet": True},
        {"name": "type_document", "type": "string", "facet": True},
        {"name": "pays", "type": "string", "facet": True},
        {"name": "source_url", "type": "string"},
        {"name": "date", "type": "string"}
    ]
}
# Seed URLs crawled on every pass of the worker loop.
SEED_SOURCES = [
    # Simple public URLs for the POC; replace with dedicated scrapers (RSS, APIs, etc.)
    "https://example.com",
]
def fetch_url(url: str) -> str:
    """Fetch *url* and return the response body, or "" on any request failure.

    Best-effort by design: timeouts, connection errors, and non-2xx status
    codes all yield an empty string so the caller can simply skip the source.
    Catches only requests.RequestException (which HTTPError from
    raise_for_status() subclasses) so genuine programming errors still surface,
    unlike the previous bare ``except Exception``.
    """
    try:
        r = requests.get(url, timeout=10)
        r.raise_for_status()
    except requests.RequestException:
        return ""
    return r.text
def run_loop(interval_seconds: int = 300) -> None:
    """Scrape every seed source, index the results, sleep, and repeat forever.

    Args:
        interval_seconds: Pause between full passes over SEED_SOURCES.

    Side effects: ensures the Typesense collection exists, then upserts one
    document per successfully fetched and cleaned source on each pass.
    """
    create_collection_if_not_exists(SCHEMA)
    log = logging.getLogger(__name__)
    while True:
        for url in SEED_SOURCES:
            html = fetch_url(url)
            if not html:
                continue
            text = clean_html(html)
            if not text:
                continue
            # Stable id derived from (url, content) so re-scraping identical
            # content re-indexes the same document instead of duplicating it.
            doc_id = hash_text(url + "|" + text)[:16]
            doc = {
                "id": doc_id,
                "titre": url,
                "texte": text[:5000],  # cap stored text to keep documents small
                "langue": "fr",
                "type_document": "texte",
                "pays": "",
                "source_url": url,
                # Timezone-aware UTC timestamp; datetime.utcnow() is deprecated
                # and produced a naive datetime.
                "date": datetime.now(timezone.utc).isoformat()
            }
            try:
                index_document("documents", doc)
            except Exception:
                # Best-effort: one failed index must not kill the loop, but
                # leave a trace instead of swallowing the error silently.
                log.exception("Failed to index document for %s", url)
        time.sleep(interval_seconds)
if __name__ == "__main__":
    # Poll interval (seconds) is configurable through the environment;
    # defaults to five minutes.
    poll_every = int(os.environ.get("SCRAPER_INTERVAL", "300"))
    run_loop(poll_every)