Spaces:
Sleeping
Sleeping
File size: 1,433 Bytes
29f4cdf |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 |
from langchain_core.tools import tool
from playwright.sync_api import sync_playwright
from bs4 import BeautifulSoup
@tool
def get_rendered_html(url: str) -> str:
    """
    Fetch and return the fully rendered HTML of a webpage.

    This function uses Playwright to load a webpage in a headless Chromium
    browser, allowing all JavaScript on the page to execute. Use this for
    dynamic websites that require rendering.

    IMPORTANT RESTRICTIONS:
    - ONLY use this for actual HTML webpages (articles, documentation, dashboards).
    - DO NOT use this for direct file links (URLs ending in .csv, .pdf, .zip, .png).
      Playwright cannot render these and will crash. Use the 'download_file' tool instead.

    Parameters
    ----------
    url : str
        The URL of the webpage to retrieve and render.

    Returns
    -------
    str
        The fully rendered HTML content of the page as returned by the
        browser. If anything goes wrong while launching the browser or
        loading the page, no exception is raised; instead a message
        beginning with "Error fetching/rendering page: " is returned.
    """
    print("\nFetching and rendering:", url)

    # Errors are reported as a returned string rather than raised, so the
    # caller always gets a string back.
    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                page = browser.new_page()

                # Wait until the network is idle so that JavaScript-driven
                # content has had a chance to load before the DOM is read.
                page.goto(url, wait_until="networkidle")

                # Serialize the DOM as it stands after rendering.
                return page.content()
            finally:
                # Close the browser even when navigation or extraction
                # fails, not only on the success path.
                browser.close()
    except Exception as e:
        return f"Error fetching/rendering page: {e}"