scholar-rag-engine / scraper.py
snakeeee's picture
Initial commit - Scholar RAG Engine
1505bbf
raw
history blame contribute delete
399 Bytes
import requests
from bs4 import BeautifulSoup
def scrape_url(url, *, timeout=10.0):
    """Fetch a web page and return its heading, paragraph and list text.

    The page is downloaded with a browser-like ``User-Agent`` header and
    parsed with BeautifulSoup's ``html.parser``. The stripped text of every
    ``h1``, ``h2``, ``h3``, ``p`` and ``li`` element is collected in document
    order; elements with no text are skipped and the rest are joined with a
    single space.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``.

    Returns:
        The extracted text as one space-separated string (empty if the page
        has no matching elements with text).

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with a 4xx/5xx status.
    """
    headers = {"User-Agent": "Mozilla/5.0"}

    # Without a timeout, requests waits forever on an unresponsive server.
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly instead of returning the text of an HTTP error page.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")
    elements = soup.find_all(["h1", "h2", "h3", "p", "li"])

    # Extract each element's text once, then drop the empty ones.
    fragments = (el.get_text(strip=True) for el in elements)
    return " ".join(fragment for fragment in fragments if fragment)