# deepdive-IR / agent/tools.py
# Revision 9710aee: "Update agent/tools.py" (Ritabanm)
import httpx
from bs4 import BeautifulSoup
from settings import USER_AGENT
class FetchTools:
    """Helpers for downloading a web page and reducing it to plain text."""

    async def get_text_from_url(self, url: str) -> str:
        """Fetch *url* and return its text content with whitespace collapsed.

        The page is requested with the configured ``USER_AGENT`` header,
        redirects are followed, and the client is created with ``timeout=60``.
        ``<script>``, ``<style>`` and ``<noscript>`` elements are removed
        before text extraction, and every run of whitespace in the result is
        replaced by a single space.

        Args:
            url: Address of the page to download.

        Returns:
            The remaining text of the document as a single-spaced string.

        Raises:
            httpx.HTTPStatusError: Raised by ``raise_for_status()`` when the
                response carries an error status code.
        """
        async with httpx.AsyncClient(timeout=60, follow_redirects=True) as session:
            response = await session.get(url, headers={"User-Agent": USER_AGENT})
            response.raise_for_status()
            markup = response.text

        document = BeautifulSoup(markup, "html.parser")

        # Drop elements whose contents are not meant to be read as page text.
        for element in document.find_all(["script", "style", "noscript"]):
            element.decompose()

        # split() with no arguments discards all whitespace runs, so joining
        # on a single space normalises newlines, tabs and repeated spaces.
        words = document.get_text(separator=" ").split()
        return " ".join(words)