llmt1 / tools /web_scraper.py
Samarth Naik
updated new code
cc42847
from langchain_core.tools import tool
from playwright.sync_api import sync_playwright
from bs4 import BeautifulSoup
@tool
def get_rendered_html(url: str) -> str:
    """
    Fetch and return the fully rendered HTML of a webpage.

    This function uses Playwright to load a webpage in a headless Chromium
    browser, allowing all JavaScript on the page to execute. Use this for
    dynamic websites that require rendering.

    IMPORTANT RESTRICTIONS:
    - ONLY use this for actual HTML webpages (articles, documentation, dashboards).
    - DO NOT use this for direct file links (URLs ending in .csv, .pdf, .zip, .png).
      Playwright cannot render these and will crash. Use the 'download_file' tool instead.

    Parameters
    ----------
    url : str
        The URL of the webpage to retrieve and render.

    Returns
    -------
    str
        The fully rendered HTML content of the page. If loading or rendering
        fails, a string starting with "Error fetching/rendering page:" is
        returned instead of raising.
    """
    print("\nFetching and rendering:", url)
    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                page = browser.new_page()
                # Wait for the network to go idle so JS-driven content has
                # had a chance to load before the DOM is serialized.
                page.goto(url, wait_until="networkidle")
                return page.content()
            finally:
                # Close the browser on every path, including navigation
                # failures, so a failed fetch does not skip the cleanup.
                browser.close()
    except Exception as e:
        # Tool boundary: report the failure as text to the caller rather
        # than propagating the exception.
        return f"Error fetching/rendering page: {e}"