File size: 1,095 Bytes
1d0b8d4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

BASE_URL = "https://www.spjimr.org/"


def get_links():
    """Fetch the SPJIMR homepage and return its unique internal links.

    Returns:
        list[str]: absolute URLs belonging to the site's own domain.

    Raises:
        requests.RequestException: if the homepage cannot be fetched.
    """
    # timeout prevents the crawl from hanging forever on a stalled server;
    # raise_for_status avoids silently parsing an HTTP error page.
    r = requests.get(BASE_URL, timeout=10)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")

    links = set()

    for a in soup.find_all("a", href=True):
        # urljoin resolves every relative form ("/x", "x", "../x"), not just
        # root-relative paths starting with "/"; absolute URLs pass through.
        href = urljoin(BASE_URL, a["href"])

        # startswith (not substring containment) so foreign URLs that merely
        # embed the base URL in a query string are excluded.
        if href.startswith(BASE_URL):
            links.add(href)

    return list(links)


def extract_text(url):
    """Download *url* and return the text of all its <p> elements.

    Args:
        url: absolute URL of the page to fetch.

    Returns:
        str: paragraph texts joined by newlines, or "" if the page
        cannot be downloaded (best-effort: failures are not fatal).
    """
    try:
        r = requests.get(url, timeout=10)
        # Treat HTTP error pages (404/500) as "no content" rather than
        # scraping their boilerplate paragraphs.
        r.raise_for_status()
    except requests.RequestException:
        # Narrow catch: a bare `except:` would also swallow
        # KeyboardInterrupt/SystemExit and hide programming errors.
        return ""

    soup = BeautifulSoup(r.text, "html.parser")
    paragraphs = soup.find_all("p")

    return "\n".join(p.get_text(strip=True) for p in paragraphs)


def scrape(max_pages=40):
    """Crawl up to *max_pages* internal pages and collect their text.

    Args:
        max_pages: maximum number of discovered links to visit.

    Returns:
        list[dict]: one entry per page, with keys "source" (the URL)
        and "text", for every page yielding more than 200 characters.
    """
    collected = []

    for url in get_links()[:max_pages]:
        print("Scraping:", url)
        page_text = extract_text(url)

        # Skip near-empty pages (menus, redirects, stubs).
        if len(page_text) > 200:
            collected.append({"source": url, "text": page_text})

    return collected