File size: 1,977 Bytes
dfdddb1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import json
import logging
import os
import time
from datetime import datetime, timezone

import requests

from indexer.typesense_indexer import index_document, create_collection_if_not_exists
from parser.cleaner import clean_html
from parser.hasher import hash_text

# Typesense collection schema for scraped documents.
# Field names are French (titre/texte/langue/pays), matching the document
# dict built in run_loop(); `facet: True` fields allow filtered search.
SCHEMA = {
    "name": "documents",
    "fields": [
        {"name": "id", "type": "string"},        # 16-char hash of url+text (see run_loop)
        {"name": "titre", "type": "string"},     # title (currently the source URL)
        {"name": "texte", "type": "string"},     # cleaned body text, truncated to 5000 chars
        {"name": "langue", "type": "string", "facet": True},         # language code
        {"name": "type_document", "type": "string", "facet": True},  # document kind
        {"name": "pays", "type": "string", "facet": True},           # country (empty for now)
        {"name": "source_url", "type": "string"},
        {"name": "date", "type": "string"}       # ISO-8601 indexing timestamp
    ]
}

SEED_SOURCES = [
    # Simple public URLs for the POC; replace with dedicated scrapers (RSS, APIs, etc.)
    "https://example.com",
]

def fetch_url(url: str) -> str:
    """Fetch *url* over HTTP and return the response body as text.

    Returns an empty string on any request failure (connection error,
    timeout, non-2xx status) so the polling loop can simply skip the
    source and retry on the next cycle.
    """
    try:
        r = requests.get(url, timeout=10)
        r.raise_for_status()
        return r.text
    except requests.RequestException:
        # Narrowed from bare `Exception`: RequestException is the base of
        # all requests failures (ConnectionError, Timeout, HTTPError, ...).
        # Anything else is a programming error and should surface.
        return ""

def run_loop(interval_seconds: int = 300):
    """Poll every SEED_SOURCES URL forever, indexing cleaned text into Typesense.

    Ensures the "documents" collection exists once, then every
    ``interval_seconds`` seconds fetches each seed URL, strips its HTML,
    and indexes a document whose id is derived from (url, text) — so an
    unchanged page maps to the same id on every cycle.

    This function never returns; it is the scraper's main loop.
    """
    create_collection_if_not_exists(SCHEMA)
    logger = logging.getLogger(__name__)
    while True:
        for url in SEED_SOURCES:
            html = fetch_url(url)
            if not html:
                continue  # fetch failed; retry next cycle
            text = clean_html(html)
            if not text:
                continue  # no extractable text on this page
            # Deterministic id: same url + same content -> same document.
            doc_id = hash_text(url + "|" + text)[:16]
            doc = {
                "id": doc_id,
                "titre": url,
                "texte": text[:5000],  # cap stored text size
                "langue": "fr",
                "type_document": "texte",
                "pays": "",
                "source_url": url,
                # Timezone-aware UTC timestamp; datetime.utcnow() is
                # deprecated (3.12+) and returns a naive datetime.
                "date": datetime.now(timezone.utc).isoformat()
            }
            try:
                index_document("documents", doc)
            except Exception:
                # Best-effort: one failed index must not kill the loop,
                # but log it so failures are not silently swallowed.
                logger.exception("Failed to index document from %s", url)
        time.sleep(interval_seconds)

if __name__ == "__main__":
    interval = int(os.environ.get("SCRAPER_INTERVAL", "300"))
    run_loop(interval)