import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Root of the site being scraped; also used to decide whether a link is internal.
BASE_URL = "https://www.spjimr.org/"

# One timeout for every request so a slow server can never hang the scraper.
REQUEST_TIMEOUT = 10


def get_links():
    """Fetch the site's landing page and return its internal links.

    Returns:
        list[str]: absolute, de-duplicated URLs that live under BASE_URL.
    """
    r = requests.get(BASE_URL, timeout=REQUEST_TIMEOUT)
    r.raise_for_status()  # don't harvest links from an error page
    soup = BeautifulSoup(r.text, "html.parser")
    links = set()
    for a in soup.find_all("a", href=True):
        # urljoin handles root-relative ("/x"), page-relative ("x.html"),
        # and already-absolute hrefs uniformly.
        href = urljoin(BASE_URL, a["href"])
        # startswith (not substring containment) so an external URL that merely
        # mentions BASE_URL in a query string is not treated as internal.
        if href.startswith(BASE_URL):
            links.add(href)
    return list(links)


def extract_text(url):
    """Download *url* and return the concatenated text of its <p> tags.

    Args:
        url: absolute URL to fetch.

    Returns:
        str: newline-joined paragraph text, or "" on any network/HTTP failure
        (best-effort by design — one bad page must not abort the crawl).
    """
    try:
        r = requests.get(url, timeout=REQUEST_TIMEOUT)
        r.raise_for_status()
    except requests.RequestException:
        # Narrow catch: only network/HTTP errors are expected and tolerated;
        # programming errors still propagate.
        return ""
    soup = BeautifulSoup(r.text, "html.parser")
    return "\n".join(p.get_text(strip=True) for p in soup.find_all("p"))


def scrape(max_pages=40):
    """Crawl up to *max_pages* internal pages and collect their text.

    Args:
        max_pages: cap on how many links from the landing page to visit.

    Returns:
        list[dict]: one {"source": url, "text": text} per page whose extracted
        text exceeds 200 characters (filters out boilerplate-only pages).
    """
    docs = []
    for link in get_links()[:max_pages]:
        print("Scraping:", link)
        text = extract_text(link)
        if len(text) > 200:
            docs.append({"source": link, "text": text})
    return docs


if __name__ == "__main__":
    scrape()