# CT-Chat-V2 / web_loader.py
# (Hugging Face upload metadata: essprasad — "Upload 13 files" — commit b4d63a8, verified)
"""
web_loader.py
--------------
Crawl the list of URLs (data/urls.txt) once and save cleaned text to:
/home/user/app/persistent/web_cache.json
Designed for Option 2 (crawl once, then use cached file).
"""
import os
import json
import time
import re
from urllib.parse import urlparse
from urllib import robotparser
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
# Identifies this crawler in HTTP requests and robots.txt checks.
USER_AGENT = "CT-Chat-WebLoader/1.0 (+https://example)"
# Per-request timeout in seconds for page fetches.
REQUEST_TIMEOUT = 20
# Politeness delay (seconds) between consecutive page fetches.
SLEEP_BETWEEN = 0.8
# Number of additional attempts after the first failed fetch (3 tries total).
MAX_RETRIES = 2
# Pages with fewer words than this are treated as boilerplate and skipped.
MIN_WORDS_THRESHOLD = 80
# Output: crawled text is cached as JSON under the persistent directory.
PERSISTENT_DIR = "/home/user/app/persistent"
WEB_CACHE_PATH = os.path.join(PERSISTENT_DIR, "web_cache.json")
# Input: one URL per line; blank lines and '#' comment lines are ignored.
URLS_PATH = "/home/user/app/data/urls.txt"
# Cache of RobotFileParser instances keyed by robots.txt URL, so each
# host's robots.txt is fetched at most once per process instead of once
# per crawled URL.
_ROBOTS_CACHE = {}

def is_allowed_by_robots(url, agent=USER_AGENT):
    """Return True if *agent* may fetch *url* per the site's robots.txt.

    Each host's robots.txt is downloaded once and cached for the life of
    the process. If robots.txt cannot be retrieved or parsed, crawling is
    allowed by default (fail-open), matching the original behaviour.
    """
    try:
        parsed = urlparse(url)
        robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
        rp = _ROBOTS_CACHE.get(robots_url)
        if rp is None:
            rp = robotparser.RobotFileParser()
            rp.set_url(robots_url)
            rp.read()  # network fetch; may raise for unreachable hosts
            # Only cache after a successful read, so a transient failure
            # does not pin a half-initialized parser.
            _ROBOTS_CACHE[robots_url] = rp
        return rp.can_fetch(agent, url)
    except Exception:
        # If robots cannot be read, allow by default (you can change)
        return True
def safe_get(url):
    """GET *url* with retries; re-raise the last error if all attempts fail.

    Performs up to MAX_RETRIES + 1 attempts, sleeping with a slowly
    growing back-off between failures.
    """
    attempt = 0
    while True:
        try:
            response = requests.get(
                url,
                headers={"User-Agent": USER_AGENT},
                timeout=REQUEST_TIMEOUT,
            )
            response.raise_for_status()
            return response
        except Exception:
            if attempt >= MAX_RETRIES:
                raise
            # Wait a little longer after each failed attempt.
            time.sleep(1 + attempt * 1.5)
            attempt += 1
def html_to_text(html, domain=None):
    """Extract readable text from *html* as paragraph-separated blocks.

    Strips script/style/navigation tags and cookie/consent overlays,
    collects text from heading, paragraph, list and table-cell elements,
    and drops short legal boilerplate ("© ...", privacy/terms snippets).
    *domain* is accepted for interface compatibility but currently unused.
    """
    soup = BeautifulSoup(html, "lxml")
    # Remove tags that never carry article content.
    for t in soup(["script", "style", "header", "footer", "nav", "form", "aside", "noscript", "svg"]):
        t.decompose()
    # Remove cookie/consent/subscription overlays by class name.
    for c in soup.find_all(class_=re.compile(r"(cookie|consent|banner|subscribe|modal)", re.I)):
        c.decompose()
    blocks = []
    for el in soup.find_all(["h1", "h2", "h3", "h4", "p", "li", "td"]):
        text = el.get_text(separator=" ", strip=True)
        if not text:
            continue
        # Skip tiny legal/footer fragments.
        if len(text) < 30 and re.search(r"©|\bprivacy\b|\bterms\b", text, re.I):
            continue
        blocks.append(text)
    joined = "\n\n".join(blocks)
    # BUG FIX: collapse runs of spaces/tabs only. The previous pattern
    # (r"\s{2,}") also matched the "\n\n" separators inserted just above,
    # flattening the paragraph structure into one space-joined blob.
    joined = re.sub(r"[ \t]{2,}", " ", joined).strip()
    return joined
def load_urls(path):
    """Read URLs from *path*, one per line.

    Blank lines and lines starting with '#' are skipped. Returns an empty
    list when the file does not exist.
    """
    if not os.path.exists(path):
        return []
    urls = []
    with open(path, "r", encoding="utf-8") as fh:
        for raw in fh:
            entry = raw.strip()
            if entry and not entry.startswith("#"):
                urls.append(entry)
    return urls
def _page_title(html):
    """Best-effort <title> extraction; returns "" on any parse problem."""
    try:
        # Uses the module-level BeautifulSoup import; the original
        # re-imported bs4 inside the crawl loop, which was redundant.
        soup = BeautifulSoup(html, "lxml")
        return (soup.title.string or "").strip() if soup.title else ""
    except Exception:
        return ""

def crawl_once(urls_file=URLS_PATH, out_path=WEB_CACHE_PATH, max_pages=50, force=False):
    """Crawl the URL list once and cache cleaned page text as JSON.

    If *out_path* already holds a non-trivial cache, it is reused and no
    network requests are made ("crawl once" / Option 2 behaviour). Pages
    disallowed by robots.txt (unless *force* is True), fetch failures and
    pages below MIN_WORDS_THRESHOLD words are skipped. At most *max_pages*
    pages are stored. Returns the cache path, or None when no URLs exist.
    """
    os.makedirs(PERSISTENT_DIR, exist_ok=True)
    # If cache exists, do not crawl; >100 bytes rules out an empty stub.
    if os.path.exists(out_path) and os.path.getsize(out_path) > 100:
        print(f"Using existing cache at {out_path} (Option 2 behaviour).")
        return out_path
    urls = load_urls(urls_file)
    if not urls:
        print("No urls.txt found or empty — nothing to crawl.")
        return None
    results = {}
    count = 0
    for url in tqdm(urls, desc="Crawling URLs"):
        if count >= max_pages:
            break
        try:
            if not force and not is_allowed_by_robots(url):
                print(f"Skipping by robots.txt: {url}")
                continue
            resp = safe_get(url)
            domain = urlparse(url).netloc.lower()
            text = html_to_text(resp.text, domain=domain)
            if not text or len(text.split()) < MIN_WORDS_THRESHOLD:
                print(f"Skipping short page: {url} ({len(text.split())} words)")
                time.sleep(SLEEP_BETWEEN)
                continue
            results[url] = {"title": _page_title(resp.text), "text": text}
            count += 1
            time.sleep(SLEEP_BETWEEN)  # politeness delay between fetches
        except Exception as e:
            # Best-effort crawl: report the failure and move on.
            print(f"Error fetching {url}: {e}")
            continue
    with open(out_path, "w", encoding="utf-8") as fh:
        json.dump(results, fh, indent=2, ensure_ascii=False)
    print(f"Saved {len(results)} pages to {out_path}")
    return out_path
if __name__ == "__main__":
    # Script entry point: run a one-off crawl using the default paths.
    crawl_once()