Spaces:
Sleeping
Sleeping
File size: 1,095 Bytes
1d0b8d4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 |
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
# Root of the site being crawled; also serves as the "is this link internal?"
# test (substring check) and as the base for resolving relative hrefs.
BASE_URL = "https://www.spjimr.org/"
def get_links():
    """Fetch the site homepage and return its internal links.

    Returns:
        list[str]: de-duplicated absolute URLs found in ``<a href>`` tags
        that contain BASE_URL (i.e. point back into the same site).
        Relative hrefs (starting with "/") are resolved against BASE_URL.

    Raises:
        requests.RequestException: if the homepage cannot be fetched
        or returns an HTTP error status.
    """
    # timeout added for consistency with extract_text; without it a stalled
    # connection would hang the whole crawl indefinitely.
    r = requests.get(BASE_URL, timeout=10)
    r.raise_for_status()  # an error page is useless as a link source
    soup = BeautifulSoup(r.text, "html.parser")
    links = set()  # set for de-duplication; order of the result is arbitrary
    for a in soup.find_all("a", href=True):
        href = a["href"]
        if href.startswith("/"):
            href = urljoin(BASE_URL, href)
        # NOTE(review): substring test also matches external URLs that merely
        # embed BASE_URL (e.g. redirect params) — acceptable for this crawler.
        if BASE_URL in href:
            links.add(href)
    return list(links)
def extract_text(url):
    """Fetch *url* and return the text of all its ``<p>`` elements.

    Args:
        url: absolute URL of the page to fetch.

    Returns:
        str: paragraph texts joined with newlines (whitespace-stripped),
        or "" when the page cannot be fetched within 10 seconds or the
        server answers with an HTTP error status.
    """
    try:
        r = requests.get(url, timeout=10)
        # Treat HTTP errors (404/500) as fetch failures: otherwise the text
        # of an error page could be scraped as if it were real content.
        r.raise_for_status()
    except requests.RequestException:
        # Narrowed from a bare `except:`, which silently swallowed
        # everything — including KeyboardInterrupt and programming errors.
        return ""
    soup = BeautifulSoup(r.text, "html.parser")
    paragraphs = soup.find_all("p")
    return "\n".join(p.get_text(strip=True) for p in paragraphs)
def scrape(max_pages=40, min_chars=200):
    """Crawl internal pages of the site and collect their paragraph text.

    Args:
        max_pages: maximum number of links (taken from ``get_links()``)
            to visit.
        min_chars: pages whose extracted text is not longer than this are
            skipped — filters out near-empty or boilerplate pages.
            Default 200 preserves the previously hard-coded threshold.

    Returns:
        list[dict]: one ``{"source": url, "text": text}`` entry per page
        that yielded enough text.
    """
    docs = []
    for link in get_links()[:max_pages]:
        print("Scraping:", link)
        text = extract_text(link)
        # strict > matches the original behavior (exactly min_chars is skipped)
        if len(text) > min_chars:
            docs.append({
                "source": link,
                "text": text,
            })
    return docs