# CT-Chat-V2 / web_loader.py
# (Hugging Face upload metadata: essprasad — "Upload 13 files" — commit b4d63a8, verified)
"""
web_loader.py
--------------
Crawl the list of URLs (data/urls.txt) once and save cleaned text to:
/home/user/app/persistent/web_cache.json
Designed for Option 2 (crawl once, then use cached file).
"""
import os
import json
import time
import re
from urllib.parse import urlparse
from urllib import robotparser
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
# Identifies this crawler in HTTP requests and robots.txt checks.
USER_AGENT = "CT-Chat-WebLoader/1.0 (+https://example)"
# Per-request timeout in seconds for page fetches.
REQUEST_TIMEOUT = 20
# Politeness delay (seconds) between consecutive page fetches.
SLEEP_BETWEEN = 0.8
# Number of additional attempts after the first failed fetch (3 tries total).
MAX_RETRIES = 2
# Pages with fewer words than this are treated as boilerplate and skipped.
MIN_WORDS_THRESHOLD = 80
# Output: crawled text is cached as JSON under the persistent directory.
PERSISTENT_DIR = "/home/user/app/persistent"
WEB_CACHE_PATH = os.path.join(PERSISTENT_DIR, "web_cache.json")
# Input: one URL per line; blank lines and '#' comment lines are ignored.
URLS_PATH = "/home/user/app/data/urls.txt"
# Cache of RobotFileParser instances keyed by robots.txt URL, so each
# host's robots.txt is fetched at most once per process instead of once
# per crawled URL.
_ROBOTS_CACHE = {}

def is_allowed_by_robots(url, agent=USER_AGENT):
    """Return True if *agent* may fetch *url* per the site's robots.txt.

    Each host's robots.txt is downloaded once and cached for the life of
    the process. If robots.txt cannot be retrieved or parsed, crawling is
    allowed by default (fail-open), matching the original behaviour.
    """
    try:
        parsed = urlparse(url)
        robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
        rp = _ROBOTS_CACHE.get(robots_url)
        if rp is None:
            rp = robotparser.RobotFileParser()
            rp.set_url(robots_url)
            rp.read()  # network fetch; may raise for unreachable hosts
            # Only cache after a successful read, so a transient failure
            # does not pin a half-initialized parser.
            _ROBOTS_CACHE[robots_url] = rp
        return rp.can_fetch(agent, url)
    except Exception:
        # If robots cannot be read, allow by default (you can change)
        return True
def safe_get(url):
    """GET *url* with retries; re-raise the last error if all attempts fail.

    Performs up to MAX_RETRIES + 1 attempts, sleeping with a slowly
    growing back-off between failures.
    """
    attempt = 0
    while True:
        try:
            response = requests.get(
                url,
                headers={"User-Agent": USER_AGENT},
                timeout=REQUEST_TIMEOUT,
            )
            response.raise_for_status()
            return response
        except Exception:
            if attempt >= MAX_RETRIES:
                raise
            # Wait a little longer after each failed attempt.
            time.sleep(1 + attempt * 1.5)
            attempt += 1
def html_to_text(html, domain=None):
    """Extract readable text from *html* as paragraph-separated blocks.

    Strips script/style/navigation tags and cookie/consent overlays,
    collects text from heading, paragraph, list and table-cell elements,
    and drops short legal boilerplate ("© ...", privacy/terms snippets).
    *domain* is accepted for interface compatibility but currently unused.
    """
    soup = BeautifulSoup(html, "lxml")
    # Remove tags that never carry article content.
    for t in soup(["script", "style", "header", "footer", "nav", "form", "aside", "noscript", "svg"]):
        t.decompose()
    # Remove cookie/consent/subscription overlays by class name.
    for c in soup.find_all(class_=re.compile(r"(cookie|consent|banner|subscribe|modal)", re.I)):
        c.decompose()
    blocks = []
    for el in soup.find_all(["h1", "h2", "h3", "h4", "p", "li", "td"]):
        text = el.get_text(separator=" ", strip=True)
        if not text:
            continue
        # Skip tiny legal/footer fragments.
        if len(text) < 30 and re.search(r"©|\bprivacy\b|\bterms\b", text, re.I):
            continue
        blocks.append(text)
    joined = "\n\n".join(blocks)
    # BUG FIX: collapse runs of spaces/tabs only. The previous pattern
    # (r"\s{2,}") also matched the "\n\n" separators inserted just above,
    # flattening the paragraph structure into one space-joined blob.
    joined = re.sub(r"[ \t]{2,}", " ", joined).strip()
    return joined
def load_urls(path):
    """Read URLs from *path*, one per line.

    Blank lines and lines starting with '#' are skipped. Returns an empty
    list when the file does not exist.
    """
    if not os.path.exists(path):
        return []
    urls = []
    with open(path, "r", encoding="utf-8") as fh:
        for raw in fh:
            entry = raw.strip()
            if entry and not entry.startswith("#"):
                urls.append(entry)
    return urls
def _page_title(html):
    """Best-effort <title> extraction; returns "" on any parse problem."""
    try:
        # Uses the module-level BeautifulSoup import; the original
        # re-imported bs4 inside the crawl loop, which was redundant.
        soup = BeautifulSoup(html, "lxml")
        return (soup.title.string or "").strip() if soup.title else ""
    except Exception:
        return ""

def crawl_once(urls_file=URLS_PATH, out_path=WEB_CACHE_PATH, max_pages=50, force=False):
    """Crawl the URL list once and cache cleaned page text as JSON.

    If *out_path* already holds a non-trivial cache, it is reused and no
    network requests are made ("crawl once" / Option 2 behaviour). Pages
    disallowed by robots.txt (unless *force* is True), fetch failures and
    pages below MIN_WORDS_THRESHOLD words are skipped. At most *max_pages*
    pages are stored. Returns the cache path, or None when no URLs exist.
    """
    os.makedirs(PERSISTENT_DIR, exist_ok=True)
    # If cache exists, do not crawl; >100 bytes rules out an empty stub.
    if os.path.exists(out_path) and os.path.getsize(out_path) > 100:
        print(f"Using existing cache at {out_path} (Option 2 behaviour).")
        return out_path
    urls = load_urls(urls_file)
    if not urls:
        print("No urls.txt found or empty — nothing to crawl.")
        return None
    results = {}
    count = 0
    for url in tqdm(urls, desc="Crawling URLs"):
        if count >= max_pages:
            break
        try:
            if not force and not is_allowed_by_robots(url):
                print(f"Skipping by robots.txt: {url}")
                continue
            resp = safe_get(url)
            domain = urlparse(url).netloc.lower()
            text = html_to_text(resp.text, domain=domain)
            if not text or len(text.split()) < MIN_WORDS_THRESHOLD:
                print(f"Skipping short page: {url} ({len(text.split())} words)")
                time.sleep(SLEEP_BETWEEN)
                continue
            results[url] = {"title": _page_title(resp.text), "text": text}
            count += 1
            time.sleep(SLEEP_BETWEEN)  # politeness delay between fetches
        except Exception as e:
            # Best-effort crawl: report the failure and move on.
            print(f"Error fetching {url}: {e}")
            continue
    with open(out_path, "w", encoding="utf-8") as fh:
        json.dump(results, fh, indent=2, ensure_ascii=False)
    print(f"Saved {len(results)} pages to {out_path}")
    return out_path
if __name__ == "__main__":
    # Script entry point: run a one-off crawl using the default paths.
    crawl_once()