Spaces:
Running
Running
| import requests | |
| from bs4 import BeautifulSoup | |
| from urllib.parse import urljoin | |
# Root of the site being crawled; also used to keep the crawl on-domain.
BASE_URL = "https://www.spjimr.org/"
def get_links():
    """Fetch the site homepage and return same-site links as a sorted list.

    Returns:
        list[str]: unique absolute URLs containing BASE_URL, sorted for
        deterministic ordering (a bare ``list(set)`` varies run to run).

    Raises:
        requests.RequestException: on network failure or a non-2xx response.
    """
    # timeout prevents an indefinite hang on a stalled connection
    r = requests.get(BASE_URL, timeout=10)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")
    links = set()
    for a in soup.find_all("a", href=True):
        href = a["href"]
        # skip in-page anchors and non-navigational schemes
        if href.startswith(("#", "mailto:", "tel:", "javascript:")):
            continue
        # resolve ALL relative links (e.g. "about.html"), not only "/..." —
        # the original dropped root-relative-less paths entirely
        href = urljoin(BASE_URL, href)
        if BASE_URL in href:
            links.add(href)
    return sorted(links)
def extract_text(url):
    """Download *url* and return the text of its <p> elements, newline-joined.

    Best-effort: returns "" when the page cannot be fetched, so callers can
    skip unreachable pages without crashing the crawl.

    Args:
        url: absolute URL to fetch.

    Returns:
        str: concatenated paragraph text, or "" on network/HTTP failure.
    """
    try:
        r = requests.get(url, timeout=10)
        # treat HTTP error pages (404, 500, ...) as "no text" too
        r.raise_for_status()
    # narrow except: only network/HTTP failures are best-effort; a bare
    # `except:` would also swallow KeyboardInterrupt and programming errors
    except requests.RequestException:
        return ""
    soup = BeautifulSoup(r.text, "html.parser")
    return "\n".join(p.get_text(strip=True) for p in soup.find_all("p"))
def scrape(max_pages=40):
    """Crawl up to *max_pages* same-site links and collect their page text.

    Args:
        max_pages: maximum number of links (from get_links) to visit.

    Returns:
        list[dict]: one {"source": url, "text": body} entry per page whose
        extracted text is longer than 200 characters.
    """
    collected = []
    for url in get_links()[:max_pages]:
        print("Scraping:", url)
        body = extract_text(url)
        # keep only pages with a meaningful amount of text
        if len(body) > 200:
            collected.append({"source": url, "text": body})
    return collected