|
|
from langchain_core.tools import tool
|
|
|
from playwright.sync_api import sync_playwright
|
|
|
from bs4 import BeautifulSoup
|
|
|
from urllib.parse import urljoin
|
|
|
|
|
|
@tool
|
|
|
def get_rendered_html(url: str) -> dict:
|
|
|
"""
|
|
|
Fetch and return the fully rendered HTML of a webpage.
|
|
|
"""
|
|
|
print("\nFetching and rendering:", url)
|
|
|
try:
|
|
|
with sync_playwright() as p:
|
|
|
browser = p.chromium.launch(headless=True)
|
|
|
page = browser.new_page()
|
|
|
|
|
|
page.goto(url, wait_until="networkidle")
|
|
|
content = page.content()
|
|
|
|
|
|
browser.close()
|
|
|
|
|
|
|
|
|
soup = BeautifulSoup(content, "html.parser")
|
|
|
imgs = [urljoin(url, img["src"]) for img in soup.find_all("img", src=True)]
|
|
|
|
|
|
return {
|
|
|
"html": content,
|
|
|
"images": imgs,
|
|
|
"url": url
|
|
|
}
|
|
|
|
|
|
except Exception as e:
|
|
|
return {"error": f"Error fetching/rendering page: {str(e)}"}
|
|
|
|
|
|
|
|
|
|