""" web_loader.py -------------- Crawl the list of URLs (data/urls.txt) once and save cleaned text to: /home/user/app/persistent/web_cache.json Designed for Option 2 (crawl once, then use cached file). """ import os import json import time import re from urllib.parse import urlparse from urllib import robotparser import requests from bs4 import BeautifulSoup from tqdm import tqdm USER_AGENT = "CT-Chat-WebLoader/1.0 (+https://example)" REQUEST_TIMEOUT = 20 SLEEP_BETWEEN = 0.8 MAX_RETRIES = 2 MIN_WORDS_THRESHOLD = 80 PERSISTENT_DIR = "/home/user/app/persistent" WEB_CACHE_PATH = os.path.join(PERSISTENT_DIR, "web_cache.json") URLS_PATH = "/home/user/app/data/urls.txt" def is_allowed_by_robots(url, agent=USER_AGENT): try: parsed = urlparse(url) base = f"{parsed.scheme}://{parsed.netloc}/robots.txt" rp = robotparser.RobotFileParser() rp.set_url(base) rp.read() return rp.can_fetch(agent, url) except Exception: # If robots cannot be read, allow by default (you can change) return True def safe_get(url): for attempt in range(MAX_RETRIES + 1): try: resp = requests.get(url, headers={"User-Agent": USER_AGENT}, timeout=REQUEST_TIMEOUT) resp.raise_for_status() return resp except Exception as e: if attempt < MAX_RETRIES: time.sleep(1 + attempt * 1.5) continue raise def html_to_text(html, domain=None): soup = BeautifulSoup(html, "lxml") # remove noisy tags for t in soup(["script", "style", "header", "footer", "nav", "form", "aside", "noscript", "svg"]): t.decompose() for c in soup.find_all(class_=re.compile(r"(cookie|consent|banner|subscribe|modal)", re.I)): c.decompose() blocks = [] for el in soup.find_all(["h1", "h2", "h3", "h4", "p", "li", "td"]): text = el.get_text(separator=" ", strip=True) if not text: continue if len(text) < 30 and re.search(r"©|\bprivacy\b|\bterms\b", text, re.I): continue blocks.append(text) joined = "\n\n".join(blocks) joined = re.sub(r"\s{2,}", " ", joined).strip() return joined def load_urls(path): if not os.path.exists(path): return [] with open(path, "r", encoding="utf-8") as fh: lines = [l.strip() for l in fh if l.strip() and not l.strip().startswith("#")] return lines def crawl_once(urls_file=URLS_PATH, out_path=WEB_CACHE_PATH, max_pages=50, force=False): os.makedirs(PERSISTENT_DIR, exist_ok=True) # If cache exists, do not crawl if os.path.exists(out_path) and os.path.getsize(out_path) > 100: print(f"Using existing cache at {out_path} (Option 2 behaviour).") return out_path urls = load_urls(urls_file) if not urls: print("No urls.txt found or empty — nothing to crawl.") return None results = {} count = 0 for url in tqdm(urls, desc="Crawling URLs"): if count >= max_pages: break try: if not force and not is_allowed_by_robots(url): print(f"Skipping by robots.txt: {url}") continue resp = safe_get(url) domain = urlparse(url).netloc.lower() text = html_to_text(resp.text, domain=domain) if not text or len(text.split()) < MIN_WORDS_THRESHOLD: print(f"Skipping short page: {url} ({len(text.split())} words)") time.sleep(SLEEP_BETWEEN) continue title = "" try: from bs4 import BeautifulSoup soup = BeautifulSoup(resp.text, "lxml") title = (soup.title.string or "").strip() if soup.title else "" except Exception: title = "" results[url] = {"title": title, "text": text} count += 1 time.sleep(SLEEP_BETWEEN) except Exception as e: print(f"Error fetching {url}: {e}") continue with open(out_path, "w", encoding="utf-8") as fh: json.dump(results, fh, indent=2, ensure_ascii=False) print(f"Saved {len(results)} pages to {out_path}") return out_path if __name__ == "__main__": crawl_once()