# spjimr-chatbot / scraper.py
# Prof-Hunter's picture
# Create scraper.py
# 1d0b8d4 verified
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
# Root of the site to crawl: get_links() fetches this page and also uses it
# as the base for resolving and filtering the links it finds.
BASE_URL = "https://www.spjimr.org/"
def get_links():
    """Collect the unique same-site links found on the BASE_URL page.

    Every ``href`` is resolved against BASE_URL, so root-relative
    (``/about``) and document-relative (``about/``) links are both kept.
    Fragments are dropped so that ``page#a`` and ``page#b`` count as a
    single page.

    Returns:
        A sorted list of absolute URLs that start with BASE_URL.

    Raises:
        requests.RequestException: If the landing page cannot be fetched,
            including when the request times out.
    """
    # Same timeout as extract_text so a stalled server cannot hang the crawl.
    response = requests.get(BASE_URL, timeout=10)
    soup = BeautifulSoup(response.text, "html.parser")

    links = set()
    for anchor in soup.find_all("a", href=True):
        # urljoin leaves absolute URLs untouched and resolves relative ones.
        absolute = urljoin(BASE_URL, anchor["href"].strip())
        page_url, _, _ = absolute.partition("#")
        # Prefix match rather than substring match: an off-site URL that
        # merely embeds BASE_URL (e.g. in a query string) must not qualify.
        if page_url.startswith(BASE_URL):
            links.add(page_url)

    # Sorted so that slicing the result (as scrape does with max_pages)
    # selects the same pages on every run; set order is arbitrary.
    return sorted(links)
def extract_text(url):
    """Fetch *url* and return the text of its ``<p>`` elements.

    Args:
        url: Absolute URL of the page to download.

    Returns:
        The stripped text of every paragraph joined with newlines, or an
        empty string when the page cannot be fetched or the server answers
        with an HTTP error status.
    """
    try:
        response = requests.get(url, timeout=10)
        # Treat 4xx/5xx as failures so error pages are not returned as content.
        response.raise_for_status()
    except requests.RequestException as exc:
        # Best-effort scraping: report the failure and move on, but let
        # KeyboardInterrupt/SystemExit propagate (a bare except would not).
        print("Failed to fetch:", url, exc)
        return ""

    soup = BeautifulSoup(response.text, "html.parser")
    return "\n".join(p.get_text(strip=True) for p in soup.find_all("p"))
def scrape(max_pages=40):
    """Scrape up to *max_pages* links and return their paragraph text.

    Args:
        max_pages: Upper bound on how many links from get_links() are
            visited.

    Returns:
        A list of ``{"source": url, "text": text}`` dicts, one per visited
        page whose extracted text is longer than 200 characters.
    """
    collected = []
    for url in get_links()[:max_pages]:
        print("Scraping:", url)
        body = extract_text(url)
        # Skip pages with too little text to be useful (200 chars or fewer).
        if len(body) <= 200:
            continue
        collected.append({"source": url, "text": body})
    return collected