File size: 4,313 Bytes
b4d63a8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
"""
web_loader.py
--------------
Crawl the list of URLs (data/urls.txt) once and save cleaned text to:
  /home/user/app/persistent/web_cache.json

Designed for Option 2 (crawl once, then use cached file).
"""

import os
import json
import time
import re
from urllib.parse import urlparse
from urllib import robotparser

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

USER_AGENT = "CT-Chat-WebLoader/1.0 (+https://example)"
REQUEST_TIMEOUT = 20
SLEEP_BETWEEN = 0.8
MAX_RETRIES = 2
MIN_WORDS_THRESHOLD = 80

PERSISTENT_DIR = "/home/user/app/persistent"
WEB_CACHE_PATH = os.path.join(PERSISTENT_DIR, "web_cache.json")
URLS_PATH = "/home/user/app/data/urls.txt"


def is_allowed_by_robots(url, agent=USER_AGENT):
    """Consult the target site's robots.txt for permission to fetch *url*.

    Any failure to retrieve or parse robots.txt is treated as permission
    granted (fail-open), so unreachable robots files never block a crawl.
    """
    try:
        parts = urlparse(url)
        robots_url = "{0}://{1}/robots.txt".format(parts.scheme, parts.netloc)
        parser = robotparser.RobotFileParser()
        parser.set_url(robots_url)
        parser.read()
        return parser.can_fetch(agent, url)
    except Exception:
        # Unreadable/broken robots.txt -> fail open and allow the fetch.
        return True


def safe_get(url):
    """GET *url*, retrying transient request failures.

    Performs up to ``MAX_RETRIES`` additional attempts with a linearly
    growing backoff (1.0s, 2.5s, ...).  Non-2xx responses are turned into
    exceptions via ``raise_for_status`` and retried like network errors.

    Returns:
        The successful ``requests.Response``.

    Raises:
        requests.RequestException: when every attempt failed; the last
            error is re-raised unchanged.
    """
    for attempt in range(MAX_RETRIES + 1):
        try:
            resp = requests.get(
                url,
                headers={"User-Agent": USER_AGENT},
                timeout=REQUEST_TIMEOUT,
            )
            resp.raise_for_status()
            return resp
        except requests.RequestException:
            # Narrowed from bare `except Exception`: only genuine request
            # failures are retried; programming errors propagate at once.
            if attempt < MAX_RETRIES:
                time.sleep(1 + attempt * 1.5)  # linear backoff between tries
                continue
            raise


def html_to_text(html, domain=None):
    """Extract readable text from an HTML page.

    Removes boilerplate tags and cookie/consent-style widgets, then keeps
    the text of heading, paragraph, list-item and table-cell elements,
    joined as paragraphs separated by blank lines.

    Args:
        html: raw HTML string.
        domain: accepted for API compatibility; currently unused.

    Returns:
        Cleaned text with paragraphs separated by blank lines.
    """
    soup = BeautifulSoup(html, "lxml")
    # remove noisy structural tags
    for t in soup(["script", "style", "header", "footer", "nav", "form", "aside", "noscript", "svg"]):
        t.decompose()
    # remove cookie banners / modals identified by class name
    for c in soup.find_all(class_=re.compile(r"(cookie|consent|banner|subscribe|modal)", re.I)):
        c.decompose()

    blocks = []
    for el in soup.find_all(["h1", "h2", "h3", "h4", "p", "li", "td"]):
        text = el.get_text(separator=" ", strip=True)
        if not text:
            continue
        # drop short legal/boilerplate snippets (copyright, privacy, terms)
        if len(text) < 30 and re.search(r"©|\bprivacy\b|\bterms\b", text, re.I):
            continue
        blocks.append(text)
    joined = "\n\n".join(blocks)
    # BUG FIX: the original used r"\s{2,}" here, which also matched the
    # paragraph separators inserted by the join above and flattened the
    # whole page onto one line.  Collapse only runs of spaces/tabs so the
    # blank-line paragraph boundaries survive.
    joined = re.sub(r"[ \t]{2,}", " ", joined).strip()
    return joined


def load_urls(path):
    """Read a URL list from *path*, one URL per line.

    Blank lines and lines beginning with '#' are skipped; surrounding
    whitespace is stripped.  A missing file yields an empty list.
    """
    if not os.path.exists(path):
        return []
    urls = []
    with open(path, "r", encoding="utf-8") as handle:
        for raw in handle:
            candidate = raw.strip()
            if candidate and not candidate.startswith("#"):
                urls.append(candidate)
    return urls


def _page_title(html):
    """Best-effort extraction of the <title> text from *html* ("" on failure)."""
    try:
        soup = BeautifulSoup(html, "lxml")
        return (soup.title.string or "").strip() if soup.title else ""
    except Exception:
        return ""


def crawl_once(urls_file=URLS_PATH, out_path=WEB_CACHE_PATH, max_pages=50, force=False):
    """Crawl the configured URL list once and cache the results as JSON.

    If a non-trivial cache already exists at *out_path* it is reused and no
    network traffic occurs (Option 2 behaviour: crawl once, then read cache).

    Args:
        urls_file: path to the newline-separated URL list.
        out_path: destination JSON cache file.
        max_pages: stop after this many successfully cached pages.
        force: when True, skip the robots.txt permission check.

    Returns:
        Path to the cache file, or None when there was nothing to crawl.
    """
    os.makedirs(PERSISTENT_DIR, exist_ok=True)
    # Reuse an existing cache (size check guards against an empty stub file).
    if os.path.exists(out_path) and os.path.getsize(out_path) > 100:
        print(f"Using existing cache at {out_path} (Option 2 behaviour).")
        return out_path

    urls = load_urls(urls_file)
    if not urls:
        print("No urls.txt found or empty — nothing to crawl.")
        return None

    results = {}
    count = 0
    for url in tqdm(urls, desc="Crawling URLs"):
        if count >= max_pages:
            break
        try:
            if not force and not is_allowed_by_robots(url):
                print(f"Skipping by robots.txt: {url}")
                continue
            resp = safe_get(url)
            domain = urlparse(url).netloc.lower()
            text = html_to_text(resp.text, domain=domain)
            # Reject pages whose cleaned text is too short to be useful.
            if not text or len(text.split()) < MIN_WORDS_THRESHOLD:
                print(f"Skipping short page: {url} ({len(text.split())} words)")
                time.sleep(SLEEP_BETWEEN)
                continue
            # Original re-imported BeautifulSoup inside the loop; it is
            # already imported at module level, so title extraction now
            # lives in the _page_title helper above.
            results[url] = {"title": _page_title(resp.text), "text": text}
            count += 1
            time.sleep(SLEEP_BETWEEN)  # politeness delay between requests
        except Exception as e:
            # Best-effort crawl: log the failure and move to the next URL.
            print(f"Error fetching {url}: {e}")
            continue

    with open(out_path, "w", encoding="utf-8") as fh:
        json.dump(results, fh, indent=2, ensure_ascii=False)

    print(f"Saved {len(results)} pages to {out_path}")
    return out_path


if __name__ == "__main__":
    # Script entry point: perform the one-time crawl (no-op if a cache exists).
    crawl_once()