# src/scrape_docs.py
"""
Crawl allowed Atlan docs and write a cleaned docs_corpus.jsonl.
Improvements:
 - robust cleaning of encoding artifacts (utf-8 replace + ftfy optional)
 - removes paragraph markers ¶, <placeholders>, group-id--digits tokens
 - strips boilerplate lines and tiny nav lines
 - collapses and normalizes whitespace / encoding
 - removes script/style/header/footer/nav/form tags before extracting
Output: docs_corpus.jsonl (overwrites)
"""
import requests
import html
import re
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from collections import deque
from pathlib import Path
from url_normalize import url_normalize
import ujson as json
from tqdm import tqdm

OUTPUT = Path(__file__).parent.parent.joinpath("docs_corpus.jsonl")
SEEDS = [
    "https://docs.atlan.com/",
    "https://developer.atlan.com/"
]
ALLOWED_DOMAINS = {"docs.atlan.com", "developer.atlan.com"}
HEADERS = {"User-Agent": "atlan-rag-bot/0.1 (+your_email@example.com)"}

# heuristics
MIN_LINE_WORDS = 3
MIN_PAGE_WORDS = 30

# regex cleanup
RE_CONTROL = re.compile(r"[\x00-\x1f\x7f-\x9f]")                   # ASCII/C1 control characters
RE_PARAGRAPH_MARK = re.compile(r"¶")                               # pilcrow heading anchors
RE_ANGLE_PLACEHOLDER = re.compile(r"<[^>\n]{1,200}>")              # <placeholder>-style tokens
RE_DOUBLE_DASH_ID = re.compile(r"\b[a-zA-Z0-9_-]{3,}--\d{3,}\b")   # auto-generated ids like name--12345
RE_MULTIPLE_SPACES = re.compile(r"\s+")                            # whitespace runs
RE_REPEATED_CHAR = re.compile(r"(.)\1{5,}")   # long repeated chars
RE_BAD_ELLIPSIS = re.compile(r"\.{2,}")       # multiple dots
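# Illustrative matches for the patterns above (hypothetical tokens, not taken from the docs):
#   RE_DOUBLE_DASH_ID    matches auto-generated anchors such as "tabs-group--1234"
#   RE_ANGLE_PLACEHOLDER matches rendered placeholders such as "<your-tenant>"
#   RE_REPEATED_CHAR     matches separator runs such as "------" (replaced with a space)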

BOILERPLATE_KEYWORDS = [
    "table of contents", "overview", "read more", "privacy", "terms", "©", "cookie",
    "search", "related articles", "last updated", "release notes", "subscribe", "breadcrumb"
]

# optional: try to import ftfy for robust fixes (if installed)
try:
    import ftfy
except Exception:
    ftfy = None


def is_allowed(url):
    try:
        return urlparse(url).netloc in ALLOWED_DOMAINS
    except Exception:
        return False

def _keep_line(line: str) -> bool:
    s = line.strip().lower()
    if not s:
        return False
    if len(s.split()) < MIN_LINE_WORDS:
        return False
    if s.startswith("http") or s.startswith("www."):
        return False
    for k in BOILERPLATE_KEYWORDS:
        if k in s:
            return False
    # short code-like lines
    if len(s) < 10 and any(ch in s for ch in ['/', '.', '#']):
        return False
    return True
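# Illustrative behaviour of the filter above (hypothetical inputs):
#   _keep_line("Read more about our plans")  -> False  (contains the boilerplate keyword "read more")
#   _keep_line("https://docs.atlan.com/")    -> False  (single-token URL, below MIN_LINE_WORDS)
#   _keep_line("Create a connection to start crawling assets.")  -> True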

def clean_text(soup):
    # remove undesired blocks
    for tag in soup(["script", "style", "noscript", "header", "footer", "nav", "form", "aside"]):
        tag.decompose()
    parts = []
    # only consider headings, paragraphs and list items
    for el in soup.find_all(["h1", "h2", "h3", "p", "li"]):
        t = el.get_text(separator=" ", strip=True)
        if not t:
            continue
        # HTML unescape
        t = html.unescape(t)
        # remove paragraph mark and placeholders
        t = RE_PARAGRAPH_MARK.sub(" ", t)
        t = RE_ANGLE_PLACEHOLDER.sub(" ", t)
        t = RE_DOUBLE_DASH_ID.sub(" ", t)
        # remove control chars
        t = RE_CONTROL.sub(" ", t)
        # remove excessive repeated chars
        t = RE_REPEATED_CHAR.sub(" ", t)
        # normalize ellipsis
        t = RE_BAD_ELLIPSIS.sub(". ", t)
        # collapse whitespace
        t = RE_MULTIPLE_SPACES.sub(" ", t).strip()
        if _keep_line(t):
            parts.append(t)
    joined = "\n\n".join(parts).strip()
    # final normalization: force utf-8 safe output & fix broken chars
    joined = joined.encode('utf-8', errors='replace').decode('utf-8')
    joined = joined.replace("\ufffd", " ")
    # optional stronger fix using ftfy if available
    if ftfy is not None:
        joined = ftfy.fix_text(joined)
    # Remove common mojibake left by bad encodings (Â, â, etc.).
    # Note: this is blunt and also strips these characters from legitimate accented text.
    joined = joined.replace("Â", "").replace("â", "")
    # Collapse runs of spaces/tabs but keep the paragraph breaks inserted above.
    joined = re.sub(r"[ \t]+", " ", joined).strip()
    return joined
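# Minimal sketch of clean_text on a tiny page (hypothetical HTML):
#   soup = BeautifulSoup("<nav>Home</nav><p>Create a crawler¶ to ingest assets.</p>", "html.parser")
#   clean_text(soup)  -> "Create a crawler to ingest assets."
# The <nav> block is decomposed and the pilcrow is stripped before the line-level filter runs.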

def crawl(seeds=SEEDS, max_pages=1000, max_depth=2):
    seen = set()
    out = []
    q = deque()
    for s in seeds:
        q.append((s, 0))
    pbar = tqdm(total=max_pages, desc="Crawl", unit="page")
    while q and len(out) < max_pages:
        url, depth = q.popleft()
        url = url_normalize(url)
        if url in seen:
            continue
        if depth > max_depth:
            continue
        if not is_allowed(url):
            seen.add(url)
            continue
        try:
            r = requests.get(url, headers=HEADERS, timeout=12)
            if r.status_code != 200:
                seen.add(url)
                continue
            soup = BeautifulSoup(r.text, "html.parser")
            title = soup.title.string.strip() if soup.title and soup.title.string else url
            text = clean_text(soup)
            if text and len(text.split()) >= MIN_PAGE_WORDS:
                out.append({"url": url, "title": title, "text": text})
                pbar.update(1)
            seen.add(url)
            # find links
            for a in soup.find_all("a", href=True):
                href = urljoin(url, a["href"])
                href = url_normalize(href)
                if is_allowed(href) and href not in seen:
                    # skip common media files
                    if any(href.lower().endswith(ext) for ext in [".pdf", ".zip", ".png", ".jpg", ".jpeg", ".svg"]):
                        continue
                    q.append((href, depth + 1))
        except Exception:
            # network or parsing error: mark the URL as seen and keep crawling
            seen.add(url)
            continue
    pbar.close()
    # write JSONL (overwrite)
    with OUTPUT.open("w", encoding="utf-8") as f:
        for doc in out:
            f.write(json.dumps(doc, ensure_ascii=False) + "\n")
    print(f"Wrote {len(out)} docs to {OUTPUT}")

if __name__ == "__main__":
    crawl(max_pages=400, max_depth=2)
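# Typical invocation from the repository root (assuming the file lives at src/scrape_docs.py
# as noted above and the imported packages are installed):
#   python src/scrape_docs.py
# This overwrites docs_corpus.jsonl one level above src/ with one JSON object per kept page.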