Spaces:
Running
Running
| import requests | |
| from bs4 import BeautifulSoup | |
| from urllib.parse import urljoin | |
# Root of the site being crawled; also used to keep the crawl on-domain.
BASE_URL = "https://www.spjimr.org/"
def get_links():
    """Fetch the site homepage and return same-site links as a sorted list.

    Returns:
        list[str]: unique absolute URLs containing BASE_URL, sorted for
        deterministic ordering (a bare ``list(set)`` varies run to run).

    Raises:
        requests.RequestException: on network failure or a non-2xx response.
    """
    # timeout prevents an indefinite hang on a stalled connection
    r = requests.get(BASE_URL, timeout=10)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")
    links = set()
    for a in soup.find_all("a", href=True):
        href = a["href"]
        # skip in-page anchors and non-navigational schemes
        if href.startswith(("#", "mailto:", "tel:", "javascript:")):
            continue
        # resolve ALL relative links (e.g. "about.html"), not only "/..." —
        # the original dropped root-relative-less paths entirely
        href = urljoin(BASE_URL, href)
        if BASE_URL in href:
            links.add(href)
    return sorted(links)
def extract_text(url):
    """Download *url* and return the text of its <p> elements, newline-joined.

    Best-effort: returns "" when the page cannot be fetched, so callers can
    skip unreachable pages without crashing the crawl.

    Args:
        url: absolute URL to fetch.

    Returns:
        str: concatenated paragraph text, or "" on network/HTTP failure.
    """
    try:
        r = requests.get(url, timeout=10)
        # treat HTTP error pages (404, 500, ...) as "no text" too
        r.raise_for_status()
    # narrow except: only network/HTTP failures are best-effort; a bare
    # `except:` would also swallow KeyboardInterrupt and programming errors
    except requests.RequestException:
        return ""
    soup = BeautifulSoup(r.text, "html.parser")
    return "\n".join(p.get_text(strip=True) for p in soup.find_all("p"))
def scrape(max_pages=40):
    """Crawl up to *max_pages* same-site links and collect their page text.

    Args:
        max_pages: maximum number of links (from get_links) to visit.

    Returns:
        list[dict]: one {"source": url, "text": body} entry per page whose
        extracted text is longer than 200 characters.
    """
    collected = []
    for url in get_links()[:max_pages]:
        print("Scraping:", url)
        body = extract_text(url)
        # keep only pages with a meaningful amount of text
        if len(body) > 200:
            collected.append({"source": url, "text": body})
    return collected