File size: 1,095 Bytes
1d0b8d4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

BASE_URL = "https://www.spjimr.org/"


def get_links():
    """Fetch the SPJIMR homepage and return its unique internal links.

    Returns:
        list[str]: absolute URLs belonging to the site's own domain.

    Raises:
        requests.RequestException: if the homepage cannot be fetched.
    """
    # timeout prevents the crawl from hanging forever on a stalled server;
    # raise_for_status avoids silently parsing an HTTP error page.
    r = requests.get(BASE_URL, timeout=10)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")

    links = set()

    for a in soup.find_all("a", href=True):
        # urljoin resolves every relative form ("/x", "x", "../x"), not just
        # root-relative paths starting with "/"; absolute URLs pass through.
        href = urljoin(BASE_URL, a["href"])

        # startswith (not substring containment) so foreign URLs that merely
        # embed the base URL in a query string are excluded.
        if href.startswith(BASE_URL):
            links.add(href)

    return list(links)


def extract_text(url):
    """Download *url* and return the text of all its <p> elements.

    Args:
        url: absolute URL of the page to fetch.

    Returns:
        str: paragraph texts joined by newlines, or "" if the page
        cannot be downloaded (best-effort: failures are not fatal).
    """
    try:
        r = requests.get(url, timeout=10)
        # Treat HTTP error pages (404/500) as "no content" rather than
        # scraping their boilerplate paragraphs.
        r.raise_for_status()
    except requests.RequestException:
        # Narrow catch: a bare `except:` would also swallow
        # KeyboardInterrupt/SystemExit and hide programming errors.
        return ""

    soup = BeautifulSoup(r.text, "html.parser")
    paragraphs = soup.find_all("p")

    return "\n".join(p.get_text(strip=True) for p in paragraphs)


def scrape(max_pages=40):
    """Crawl up to *max_pages* internal pages and collect their text.

    Args:
        max_pages: maximum number of discovered links to visit.

    Returns:
        list[dict]: one entry per page, with keys "source" (the URL)
        and "text", for every page yielding more than 200 characters.
    """
    collected = []

    for url in get_links()[:max_pages]:
        print("Scraping:", url)
        page_text = extract_text(url)

        # Skip near-empty pages (menus, redirects, stubs).
        if len(page_text) > 200:
            collected.append({"source": url, "text": page_text})

    return collected