# src/scrape_docs.py
"""
Crawl allowed Atlan docs and write a cleaned docs_corpus.jsonl.
Improvements:
- robust cleaning of encoding artifacts (utf-8 replace + ftfy optional)
- removes paragraph markers ¶, <placeholders>, group-id--digits tokens
- strips boilerplate lines and tiny nav lines
- collapses and normalizes whitespace / encoding
- removes script/style/header/footer/nav/form tags before extracting
Output: docs_corpus.jsonl (overwrites)
"""
import requests
import html
import re
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from collections import deque
from pathlib import Path
from url_normalize import url_normalize
import ujson as json
from tqdm import tqdm
OUTPUT = Path(__file__).parent.parent.joinpath("docs_corpus.jsonl")
SEEDS = [
    "https://docs.atlan.com/",
    "https://developer.atlan.com/",
]
ALLOWED_DOMAINS = {"docs.atlan.com", "developer.atlan.com"}
HEADERS = {"User-Agent": "atlan-rag-bot/0.1 (+your_email@example.com)"}
# heuristics
MIN_LINE_WORDS = 3
MIN_PAGE_WORDS = 30
# regex cleanup
RE_CONTROL = re.compile(r"[\x00-\x1f\x7f-\x9f]")
RE_PARAGRAPH_MARK = re.compile(r"¶")
RE_ANGLE_PLACEHOLDER = re.compile(r"<[^>\n]{1,200}>")
RE_DOUBLE_DASH_ID = re.compile(r"\b[a-zA-Z0-9_-]{3,}--\d{3,}\b")
RE_MULTIPLE_SPACES = re.compile(r"\s+")
RE_REPEATED_CHAR = re.compile(r"(.)\1{5,}") # long repeated chars
RE_BAD_ELLIPSIS = re.compile(r"\.{2,}") # multiple dots
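# For illustration (assumed artifact shapes, not from any spec):
#   RE_ANGLE_PLACEHOLDER drops tokens like "<your-tenant>",
#   RE_DOUBLE_DASH_ID drops generated ids like "group-id--12345",
#   RE_REPEATED_CHAR collapses separator runs like "------".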
BOILERPLATE_KEYWORDS = [
    "table of contents", "overview", "read more", "privacy", "terms", "©", "cookie",
    "search", "related articles", "last updated", "release notes", "subscribe", "breadcrumb",
]
# optional: try to import ftfy for robust fixes (if installed)
try:
    import ftfy
except ImportError:
    ftfy = None
def is_allowed(url):
    try:
        return urlparse(url).netloc in ALLOWED_DOMAINS
    except ValueError:
        return False
def _keep_line(line: str) -> bool:
    s = line.strip().lower()
    if not s:
        return False
    if len(s.split()) < MIN_LINE_WORDS:
        return False
    if s.startswith("http") or s.startswith("www."):
        return False
    for k in BOILERPLATE_KEYWORDS:
        if k in s:
            return False
    # drop short code-like fragments (paths, anchors, filenames)
    if len(s) < 10 and any(ch in s for ch in ['/', '.', '#']):
        return False
    return True
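# For illustration: _keep_line("Read more") is False (too few words and a
# boilerplate keyword); _keep_line("Atlan supports tag propagation across assets") is True.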
def clean_text(soup):
    # remove undesired blocks before extracting text
    for tag in soup(["script", "style", "noscript", "header", "footer", "nav", "form", "aside"]):
        tag.decompose()
    parts = []
    # only consider headings, paragraphs and list items
    for el in soup.find_all(["h1", "h2", "h3", "p", "li"]):
        t = el.get_text(separator=" ", strip=True)
        if not t:
            continue
        # HTML unescape
        t = html.unescape(t)
        # remove paragraph marks, <placeholders> and group-id--digits tokens
        t = RE_PARAGRAPH_MARK.sub(" ", t)
        t = RE_ANGLE_PLACEHOLDER.sub(" ", t)
        t = RE_DOUBLE_DASH_ID.sub(" ", t)
        # remove control chars and long runs of a repeated character
        t = RE_CONTROL.sub(" ", t)
        t = RE_REPEATED_CHAR.sub(" ", t)
        # normalize multi-dot ellipses to a sentence break
        t = RE_BAD_ELLIPSIS.sub(". ", t)
        # collapse whitespace
        t = RE_MULTIPLE_SPACES.sub(" ", t).strip()
        if _keep_line(t):
            parts.append(t)
    joined = "\n\n".join(parts).strip()
    # final normalization: force UTF-8-safe output and drop replacement chars
    joined = joined.encode("utf-8", errors="replace").decode("utf-8")
    joined = joined.replace("\ufffd", " ")
    if ftfy is not None:
        # stronger mojibake repair when ftfy is available
        joined = ftfy.fix_text(joined)
    else:
        # crude fallback for common UTF-8-as-Latin-1 artifacts; note this also
        # strips legitimate "Â"/"â" characters, so prefer installing ftfy
        joined = joined.replace("Â", "").replace("â", "")
    # collapse spaces/tabs but preserve the paragraph breaks inserted above
    joined = re.sub(r"[ \t]+", " ", joined)
    joined = re.sub(r"\n{3,}", "\n\n", joined).strip()
    return joined
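# Illustrative one-page usage (a sketch, not part of the crawl pipeline):
#   r = requests.get(SEEDS[0], headers=HEADERS, timeout=12)
#   print(clean_text(BeautifulSoup(r.text, "html.parser"))[:300])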
def crawl(seeds=SEEDS, max_pages=1000, max_depth=2):
    seen = set()
    out = []
    q = deque()
    for s in seeds:
        q.append((s, 0))
    pbar = tqdm(total=max_pages, desc="Crawl", unit="page")
    while q and len(out) < max_pages:
        url, depth = q.popleft()
        url = url_normalize(url)
        if url in seen:
            continue
        if depth > max_depth:
            continue
        if not is_allowed(url):
            seen.add(url)
            continue
        try:
            r = requests.get(url, headers=HEADERS, timeout=12)
            if r.status_code != 200:
                seen.add(url)
                continue
            soup = BeautifulSoup(r.text, "html.parser")
            # soup.title.string can be None even when a <title> tag exists
            title = soup.title.string.strip() if soup.title and soup.title.string else url
            text = clean_text(soup)
            if text and len(text.split()) >= MIN_PAGE_WORDS:
                out.append({"url": url, "title": title, "text": text})
                pbar.update(1)
            seen.add(url)
            # enqueue in-domain links, skipping common media files
            for a in soup.find_all("a", href=True):
                href = urljoin(url, a["href"])
                href = url_normalize(href)
                if is_allowed(href) and href not in seen:
                    if any(href.lower().endswith(ext) for ext in [".pdf", ".zip", ".png", ".jpg", ".jpeg", ".svg"]):
                        continue
                    q.append((href, depth + 1))
        except Exception:
            # network or parse error: mark as seen and keep crawling
            seen.add(url)
            continue
    pbar.close()
    # write JSONL (overwrite)
    with OUTPUT.open("w", encoding="utf-8") as f:
        for doc in out:
            f.write(json.dumps(doc, ensure_ascii=False) + "\n")
    print(f"Wrote {len(out)} docs to {OUTPUT}")
if __name__ == "__main__":
    crawl(max_pages=400, max_depth=2)
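# To run (assuming the dependencies listed at the top are installed):
#   python src/scrape_docs.py
# Writes docs_corpus.jsonl in the project root (one level above src/),
# one {"url", "title", "text"} JSON object per line.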