# src/scrape_docs.py
"""
Crawl allowed Atlan docs and write a cleaned docs_corpus.jsonl.

Improvements:
- robust cleaning of encoding artifacts (utf-8 replace + optional ftfy)
- removes paragraph markers (¶), angle-bracket placeholders, and group-id--digits tokens
- strips boilerplate lines and tiny nav lines
- collapses and normalizes whitespace / encoding
- removes script/style/header/footer/nav/form tags before extracting

Output: docs_corpus.jsonl (overwrites)
"""
import html
import re
from collections import deque
from pathlib import Path
from urllib.parse import urljoin, urlparse

import requests
import ujson as json
from bs4 import BeautifulSoup
from tqdm import tqdm
from url_normalize import url_normalize

OUTPUT = Path(__file__).parent.parent.joinpath("docs_corpus.jsonl")
SEEDS = [
    "https://docs.atlan.com/",
    "https://developer.atlan.com/",
]
ALLOWED_DOMAINS = {"docs.atlan.com", "developer.atlan.com"}
HEADERS = {"User-Agent": "atlan-rag-bot/0.1 (+your_email@example.com)"}

# heuristics
MIN_LINE_WORDS = 3    # drop lines shorter than this many words
MIN_PAGE_WORDS = 30   # drop pages shorter than this many words

# regex cleanup
RE_CONTROL = re.compile(r"[\x00-\x1f\x7f-\x9f]")
RE_PARAGRAPH_MARK = re.compile(r"¶")
RE_ANGLE_PLACEHOLDER = re.compile(r"<[^>\n]{1,200}>")
RE_DOUBLE_DASH_ID = re.compile(r"\b[a-zA-Z0-9_-]{3,}--\d{3,}\b")
RE_MULTIPLE_SPACES = re.compile(r"\s+")
RE_REPEATED_CHAR = re.compile(r"(.)\1{5,}")  # long runs of a repeated char
RE_BAD_ELLIPSIS = re.compile(r"\.{2,}")      # multiple dots

BOILERPLATE_KEYWORDS = [
    "table of contents", "overview", "read more", "privacy", "terms", "©",
    "cookie", "search", "related articles", "last updated", "release notes",
    "subscribe", "breadcrumb",
]

# optional: try to import ftfy for more robust mojibake fixes (if installed)
try:
    import ftfy
except Exception:
    ftfy = None


def is_allowed(url):
    """Return True if the URL belongs to one of the allowed docs domains."""
    try:
        return urlparse(url).netloc in ALLOWED_DOMAINS
    except Exception:
        return False


def _keep_line(line: str) -> bool:
    """Heuristic filter: drop empty, tiny, URL-only, boilerplate and code-like lines."""
    s = line.strip().lower()
    if not s:
        return False
    if len(s.split()) < MIN_LINE_WORDS:
        return False
    if s.startswith("http") or s.startswith("www."):
        return False
    for k in BOILERPLATE_KEYWORDS:
        if k in s:
            return False
    # short code-like lines
    if len(s) < 10 and any(ch in s for ch in ["/", ".", "#"]):
        return False
    return True


def clean_text(soup):
    """Extract and clean readable text from a parsed page."""
    # remove undesired blocks
    for tag in soup(["script", "style", "noscript", "header", "footer", "nav", "form", "aside"]):
        tag.decompose()

    parts = []
    # only consider headings, paragraphs and list items
    for el in soup.find_all(["h1", "h2", "h3", "p", "li"]):
        t = el.get_text(separator=" ", strip=True)
        if not t:
            continue
        # HTML unescape
        t = html.unescape(t)
        # remove paragraph marks, angle-bracket placeholders and group-id--digits tokens
        t = RE_PARAGRAPH_MARK.sub(" ", t)
        t = RE_ANGLE_PLACEHOLDER.sub(" ", t)
        t = RE_DOUBLE_DASH_ID.sub(" ", t)
        # remove control chars
        t = RE_CONTROL.sub(" ", t)
        # remove excessive repeated chars
        t = RE_REPEATED_CHAR.sub(" ", t)
        # normalize ellipsis
        t = RE_BAD_ELLIPSIS.sub(". ", t)
        # collapse whitespace
        t = RE_MULTIPLE_SPACES.sub(" ", t).strip()
        if _keep_line(t):
            parts.append(t)

    joined = "\n\n".join(parts).strip()

    # final normalization: force utf-8-safe output and drop replacement chars
    joined = joined.encode("utf-8", errors="replace").decode("utf-8")
    joined = joined.replace("\ufffd", " ")
    # optional stronger fix using ftfy if available
    if ftfy is not None:
        joined = ftfy.fix_text(joined)
    # remove common stray byte sequences left by mis-decoded utf-8 (Â, â, etc.)
    joined = joined.replace("Â", "").replace("â", "")
    # collapse runs of spaces/tabs but keep the paragraph breaks introduced above
    joined = re.sub(r"[ \t]+", " ", joined)
    joined = re.sub(r"\n{3,}", "\n\n", joined).strip()
    return joined
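
# A minimal, illustrative sanity check for clean_text(). The sample markup below is
# invented for this sketch (it is not taken from Atlan's docs) but exercises the same
# tag stripping and line filtering used during the crawl; the function is never called
# by the crawler itself.
def _demo_clean_text():
    sample = (
        "<html><head><title>Demo</title><script>var x = 1;</script></head>"
        "<body><nav>Home / Docs</nav>"
        "<h1>Connecting to a data source¶</h1>"
        "<p>Follow these steps to configure the connection and verify access.</p>"
        "<footer>© Example - cookie settings - privacy</footer></body></html>"
    )
    # script/nav/footer are stripped; only the heading and paragraph should survive
    print(clean_text(BeautifulSoup(sample, "html.parser")))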
def crawl(seeds=SEEDS, max_pages=1000, max_depth=2):
    """Breadth-first crawl of the seed domains; writes accepted pages to OUTPUT."""
    seen = set()
    out = []
    q = deque()
    for s in seeds:
        q.append((s, 0))

    pbar = tqdm(total=max_pages, desc="Crawl", unit="page")
    while q and len(out) < max_pages:
        url, depth = q.popleft()
        url = url_normalize(url)
        if url in seen:
            continue
        if depth > max_depth:
            continue
        if not is_allowed(url):
            seen.add(url)
            continue
        try:
            r = requests.get(url, headers=HEADERS, timeout=12)
            if r.status_code != 200:
                seen.add(url)
                continue
            soup = BeautifulSoup(r.text, "html.parser")
            # get_text() avoids an AttributeError when <title> is empty or nested
            title = soup.title.get_text(strip=True) if soup.title else url
            text = clean_text(soup)
            if text and len(text.split()) >= MIN_PAGE_WORDS:
                out.append({"url": url, "title": title, "text": text})
                pbar.update(1)
            seen.add(url)
            # find links
            for a in soup.find_all("a", href=True):
                href = urljoin(url, a["href"])
                href = url_normalize(href)
                if is_allowed(href) and href not in seen:
                    # skip common media files
                    if any(href.lower().endswith(ext) for ext in [".pdf", ".zip", ".png", ".jpg", ".jpeg", ".svg"]):
                        continue
                    q.append((href, depth + 1))
        except Exception:
            # network/parse error: mark as seen and keep going
            seen.add(url)
            continue
    pbar.close()

    # write JSONL (overwrite)
    with OUTPUT.open("w", encoding="utf-8") as f:
        for doc in out:
            f.write(json.dumps(doc, ensure_ascii=False) + "\n")
    print(f"Wrote {len(out)} docs to {OUTPUT}")


if __name__ == "__main__":
    crawl(max_pages=400, max_depth=2)
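
# Usage sketch (assuming the repo layout implied by OUTPUT above): run
# `python src/scrape_docs.py` to rebuild docs_corpus.jsonl. Each output line is one
# JSON object with "url", "title" and "text" keys, so downstream indexing code can
# stream the corpus with json.loads() one line at a time instead of loading it whole.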