Spaces:

Debanna
/

Customer-Support-Copilot

Sleeping

Customer-Support-Copilot / src /scrape_docs.py

dasdebanna

Prepare app for Hugging Face Space (include index files)

37a70cc 4 months ago

5.72 kB

	# src/scrape_docs.py
	"""
	Crawl allowed Atlan docs and write a cleaned docs_corpus.jsonl.
	Improvements:
	- robust cleaning of encoding artifacts (utf-8 replace + ftfy optional)
	- removes paragraph markers ¶, <placeholders>, group-id--digits tokens
	- strips boilerplate lines and tiny nav lines
	- collapses and normalizes whitespace / encoding
	- removes script/style/header/footer/nav/form tags before extracting
	Output: docs_corpus.jsonl (overwrites)
	"""
	import requests
	import html
	import re
	from bs4 import BeautifulSoup
	from urllib.parse import urljoin, urlparse
	from collections import deque
	from pathlib import Path
	from url_normalize import url_normalize
	import ujson as json
	from tqdm import tqdm

	OUTPUT = Path(__file__).parent.parent.joinpath("docs_corpus.jsonl")
	SEEDS = [
	"https://docs.atlan.com/",
	"https://developer.atlan.com/"
	]
	ALLOWED_DOMAINS = {"docs.atlan.com", "developer.atlan.com"}
	HEADERS = {"User-Agent": "atlan-rag-bot/0.1 (+your_email@example.com)"}

	# heuristics
	MIN_LINE_WORDS = 3
	MIN_PAGE_WORDS = 30

	# regex cleanup
	RE_CONTROL = re.compile(r"[\x00-\x1f\x7f-\x9f]")
	RE_PARAGRAPH_MARK = re.compile(r"¶")
	RE_ANGLE_PLACEHOLDER = re.compile(r"<[^>\n]{1,200}>")
	RE_DOUBLE_DASH_ID = re.compile(r"\b[a-zA-Z0-9_-]{3,}--\d{3,}\b")
	RE_MULTIPLE_SPACES = re.compile(r"\s+")
	RE_REPEATED_CHAR = re.compile(r"(.)\1{5,}") # long repeated chars
	RE_BAD_ELLIPSIS = re.compile(r"\.{2,}") # multiple dots

	BOILERPLATE_KEYWORDS = [
	"table of contents", "overview", "read more", "privacy", "terms", "©", "cookie",
	"search", "related articles", "last updated", "release notes", "subscribe", "breadcrumb"
	]

	# optional: try to import ftfy for robust fixes (if installed)
	try:
	import ftfy
	except Exception:
	ftfy = None


	def is_allowed(url):
	try:
	return urlparse(url).netloc in ALLOWED_DOMAINS
	except:
	return False

	def _keep_line(line: str) -> bool:
	s = line.strip().lower()
	if not s:
	return False
	if len(s.split()) < MIN_LINE_WORDS:
	return False
	if s.startswith("http") or s.startswith("www."):
	return False
	for k in BOILERPLATE_KEYWORDS:
	if k in s:
	return False
	# short code-like lines
	if len(s) < 10 and any(ch in s for ch in ['/', '.', '#']):
	return False
	return True

	def clean_text(soup):
	# remove undesired blocks
	for tag in soup(["script", "style", "noscript", "header", "footer", "nav", "form", "aside"]):
	tag.decompose()
	parts = []
	# only consider headings, paragraphs and list items
	for el in soup.find_all(["h1", "h2", "h3", "p", "li"]):
	t = el.get_text(separator=" ", strip=True)
	if not t:
	continue
	# HTML unescape
	t = html.unescape(t)
	# remove paragraph mark and placeholders
	t = RE_PARAGRAPH_MARK.sub(" ", t)
	t = RE_ANGLE_PLACEHOLDER.sub(" ", t)
	t = RE_DOUBLE_DASH_ID.sub(" ", t)
	# remove control chars
	t = RE_CONTROL.sub(" ", t)
	# remove excessive repeated chars
	t = RE_REPEATED_CHAR.sub(" ", t)
	# normalize ellipsis
	t = RE_BAD_ELLIPSIS.sub(". ", t)
	# collapse whitespace
	t = RE_MULTIPLE_SPACES.sub(" ", t).strip()
	if _keep_line(t):
	parts.append(t)
	joined = "\n\n".join(parts).strip()
	# final normalization: force utf-8 safe output & fix broken chars
	joined = joined.encode('utf-8', errors='replace').decode('utf-8')
	joined = joined.replace("\ufffd", " ")
	# optional stronger fix using ftfy if available
	if ftfy is not None:
	joined = ftfy.fix_text(joined)
	# Remove common weird bytes sequences left by encoding (Â, â etc.)
	joined = joined.replace("Â", "").replace("â", "")
	joined = RE_MULTIPLE_SPACES.sub(" ", joined).strip()
	return joined

	def crawl(seeds=SEEDS, max_pages=1000, max_depth=2):
	seen = set()
	out = []
	q = deque()
	for s in seeds:
	q.append((s, 0))
	pbar = tqdm(total=max_pages, desc="Crawl", unit="page")
	while q and len(out) < max_pages:
	url, depth = q.popleft()
	url = url_normalize(url)
	if url in seen:
	continue
	if depth > max_depth:
	continue
	if not is_allowed(url):
	seen.add(url)
	continue
	try:
	r = requests.get(url, headers=HEADERS, timeout=12)
	if r.status_code != 200:
	seen.add(url)
	continue
	soup = BeautifulSoup(r.text, "html.parser")
	title = soup.title.string.strip() if soup.title else url
	text = clean_text(soup)
	if text and len(text.split()) >= MIN_PAGE_WORDS:
	out.append({"url": url, "title": title, "text": text})
	pbar.update(1)
	seen.add(url)
	# find links
	for a in soup.find_all("a", href=True):
	href = urljoin(url, a["href"])
	href = url_normalize(href)
	if is_allowed(href) and href not in seen:
	# skip common media files
	if any(href.lower().endswith(ext) for ext in [".pdf", ".zip", ".png", ".jpg", ".jpeg", ".svg"]):
	continue
	q.append((href, depth + 1))
	except Exception as e:
	# keep going
	seen.add(url)
	continue
	pbar.close()
	# write JSONL (overwrite)
	with OUTPUT.open("w", encoding="utf-8") as f:
	for doc in out:
	f.write(json.dumps(doc, ensure_ascii=False) + "\n")
	print(f"Wrote {len(out)} docs to {OUTPUT}")

	if __name__ == "__main__":
	crawl(max_pages=400, max_depth=2)