Spaces:

Navyssh
/

TDS-Project2

Sleeping

File size: 1,433 Bytes

29f4cdf

from langchain_core.tools import tool
from playwright.sync_api import sync_playwright
from bs4 import BeautifulSoup

@tool
def get_rendered_html(url: str) -> str:
    """
    Fetch and return the fully rendered HTML of a webpage.

    This function uses Playwright to load a webpage in a headless Chromium
    browser, allowing all JavaScript on the page to execute. Use this for
    dynamic websites that require rendering.

    IMPORTANT RESTRICTIONS:
    - ONLY use this for actual HTML webpages (articles, documentation, dashboards).
    - DO NOT use this for direct file links (URLs ending in .csv, .pdf, .zip, .png).
      Playwright cannot render these; the call will fail and only an error
      message is returned. Use the 'download_file' tool instead.

    Parameters
    ----------
    url : str
        The URL of the webpage to retrieve and render.

    Returns
    -------
    str
        The rendered HTML of the page exactly as reported by the browser
        (no post-processing is applied). If loading or rendering fails, a
        message starting with "Error fetching/rendering page:" is returned
        instead of raising.
    """
    # NOTE(review): with langchain's @tool the docstring above presumably
    # doubles as the tool description shown to the model -- keep it accurate.
    print("\nFetching and rendering:", url)
    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                page = browser.new_page()

                # Wait until the network goes idle so that content injected
                # by JavaScript is present in the DOM before we snapshot it.
                page.goto(url, wait_until="networkidle")

                # Serialize the live DOM (post-JS) to an HTML string.
                return page.content()
            finally:
                # Close the browser on every path; previously a failed
                # navigation skipped the explicit close.
                browser.close()

    except Exception as e:
        # Tool boundary: report the failure as text rather than raising, so
        # the caller always receives a string.
        return f"Error fetching/rendering page: {str(e)}"