| """ | |
| web_loader.py | |
| -------------- | |
| Crawl the list of URLs (data/urls.txt) once and save cleaned text to: | |
| /home/user/app/persistent/web_cache.json | |
| Designed for Option 2 (crawl once, then use cached file). | |
| """ | |
| import os | |
| import json | |
| import time | |
| import re | |
| from urllib.parse import urlparse | |
| from urllib import robotparser | |
| import requests | |
| from bs4 import BeautifulSoup | |
| from tqdm import tqdm | |
| USER_AGENT = "CT-Chat-WebLoader/1.0 (+https://example)" | |
| REQUEST_TIMEOUT = 20 | |
| SLEEP_BETWEEN = 0.8 | |
| MAX_RETRIES = 2 | |
| MIN_WORDS_THRESHOLD = 80 | |
| PERSISTENT_DIR = "/home/user/app/persistent" | |
| WEB_CACHE_PATH = os.path.join(PERSISTENT_DIR, "web_cache.json") | |
| URLS_PATH = "/home/user/app/data/urls.txt" | |
def is_allowed_by_robots(url, agent=USER_AGENT):
    """Return True when robots.txt of *url*'s host permits *agent* to fetch it.

    Any failure while locating, downloading, or parsing robots.txt is
    treated as permission granted — the crawl errs on the side of
    fetching rather than silently dropping URLs.
    """
    try:
        parts = urlparse(url)
        robots_url = f"{parts.scheme}://{parts.netloc}/robots.txt"
        parser = robotparser.RobotFileParser()
        parser.set_url(robots_url)
        parser.read()
        return parser.can_fetch(agent, url)
    except Exception:
        # Robots file unreadable: allow by default (deliberate policy).
        return True
def safe_get(url):
    """GET *url* with retries; return the Response or raise the last error.

    Makes up to MAX_RETRIES + 1 attempts, sleeping with a linearly
    growing backoff (1s, 2.5s, ...) between failures. Non-2xx statuses
    count as failures via raise_for_status().
    """
    attempt = 0
    while True:
        try:
            response = requests.get(
                url,
                headers={"User-Agent": USER_AGENT},
                timeout=REQUEST_TIMEOUT,
            )
            response.raise_for_status()
            return response
        except Exception:
            if attempt >= MAX_RETRIES:
                raise  # out of retries: propagate the final error
            time.sleep(1 + attempt * 1.5)
            attempt += 1
def html_to_text(html, domain=None):
    """Extract readable article text from an HTML document.

    Removes script/style/navigation chrome and cookie/consent widgets,
    then collects text from headings, paragraphs, list items, and table
    cells into paragraphs separated by blank lines.

    Parameters
    ----------
    html : str
        Raw HTML of the page.
    domain : str | None
        Unused; kept for interface compatibility (callers pass the
        page's netloc).

    Returns
    -------
    str
        Cleaned text, blocks joined by "\\n\\n".
    """
    soup = BeautifulSoup(html, "lxml")
    # Remove tags that never carry article content.
    for t in soup(["script", "style", "header", "footer", "nav", "form", "aside", "noscript", "svg"]):
        t.decompose()
    # Remove cookie banners / consent modals / subscribe popups by class name.
    for c in soup.find_all(class_=re.compile(r"(cookie|consent|banner|subscribe|modal)", re.I)):
        c.decompose()
    blocks = []
    for el in soup.find_all(["h1", "h2", "h3", "h4", "p", "li", "td"]):
        text = el.get_text(separator=" ", strip=True)
        if not text:
            continue
        # Drop short boilerplate such as copyright / privacy / terms links.
        if len(text) < 30 and re.search(r"©|\bprivacy\b|\bterms\b", text, re.I):
            continue
        # BUG FIX: collapse runs of spaces/tabs inside each block here.
        # The old code ran re.sub(r"\s{2,}", " ", joined) AFTER the
        # "\n\n".join, which destroyed the very paragraph separators
        # the join had just inserted.
        blocks.append(re.sub(r"[ \t]{2,}", " ", text).strip())
    return "\n\n".join(blocks)
def load_urls(path):
    """Read URLs from *path*, one per line.

    Blank lines and lines starting with '#' (comments) are skipped;
    surrounding whitespace is stripped. Returns [] when the file does
    not exist.
    """
    if not os.path.exists(path):
        return []
    urls = []
    with open(path, "r", encoding="utf-8") as handle:
        for raw in handle:
            candidate = raw.strip()
            if candidate and not candidate.startswith("#"):
                urls.append(candidate)
    return urls
def crawl_once(urls_file=URLS_PATH, out_path=WEB_CACHE_PATH, max_pages=50, force=False):
    """Crawl the configured URL list once and cache cleaned text as JSON.

    Parameters
    ----------
    urls_file : str
        Path to a newline-separated URL list ('#' lines are comments).
    out_path : str
        Destination JSON file mapping url -> {"title", "text"}.
    max_pages : int
        Stop after this many successfully captured pages.
    force : bool
        When True, skip the robots.txt check. NOTE: it does NOT force a
        re-crawl when a cache already exists — delete the cache file for
        that.

    Returns
    -------
    str | None
        Path to the cache file, or None when there was nothing to crawl.
    """
    os.makedirs(PERSISTENT_DIR, exist_ok=True)
    # Option 2 behaviour: a non-trivial existing cache short-circuits the crawl.
    if os.path.exists(out_path) and os.path.getsize(out_path) > 100:
        print(f"Using existing cache at {out_path} (Option 2 behaviour).")
        return out_path
    urls = load_urls(urls_file)
    if not urls:
        print("No urls.txt found or empty — nothing to crawl.")
        return None
    results = {}
    count = 0
    for url in tqdm(urls, desc="Crawling URLs"):
        if count >= max_pages:
            break
        try:
            if not force and not is_allowed_by_robots(url):
                print(f"Skipping by robots.txt: {url}")
                continue
            resp = safe_get(url)
            domain = urlparse(url).netloc.lower()
            text = html_to_text(resp.text, domain=domain)
            if not text or len(text.split()) < MIN_WORDS_THRESHOLD:
                print(f"Skipping short page: {url} ({len(text.split())} words)")
                time.sleep(SLEEP_BETWEEN)
                continue
            # FIX: use the module-level BeautifulSoup import instead of
            # re-importing bs4 locally on every page (the old code shadowed
            # the top-of-file import inside the loop).
            title = ""
            try:
                title_soup = BeautifulSoup(resp.text, "lxml")
                if title_soup.title:
                    title = (title_soup.title.string or "").strip()
            except Exception:
                title = ""
            results[url] = {"title": title, "text": text}
            count += 1
            time.sleep(SLEEP_BETWEEN)  # politeness delay between requests
        except Exception as e:
            # Best-effort crawl: log the failure and move on to the next URL.
            print(f"Error fetching {url}: {e}")
            continue
    with open(out_path, "w", encoding="utf-8") as fh:
        json.dump(results, fh, indent=2, ensure_ascii=False)
    print(f"Saved {len(results)} pages to {out_path}")
    return out_path
| if __name__ == "__main__": | |
| crawl_once() | |