Spaces:

samarthnaikk
/

llmt1

Sleeping

llmt1 / tools /web_scraper.py

Samarth Naik

updated new code

cc42847 2 months ago

1.43 kB

	from langchain_core.tools import tool
	from playwright.sync_api import sync_playwright
	from bs4 import BeautifulSoup

	@tool
	def get_rendered_html(url: str) -> str:
	    """
	    Fetch and return the fully rendered HTML of a webpage.

	    This function uses Playwright to load a webpage in a headless Chromium
	    browser, allowing all JavaScript on the page to execute. Use this for
	    dynamic websites that require rendering.

	    IMPORTANT RESTRICTIONS:
	    - ONLY use this for actual HTML webpages (articles, documentation, dashboards).
	    - DO NOT use this for direct file links (URLs ending in .csv, .pdf, .zip, .png).
	      Playwright cannot render these and the call will fail. Use the
	      'download_file' tool instead.

	    Parameters
	    ----------
	    url : str
	        The URL of the webpage to retrieve and render.

	    Returns
	    -------
	    str
	        The rendered HTML exactly as reported by the browser (no
	        post-processing is applied). If anything goes wrong, a message
	        starting with "Error fetching/rendering page: " is returned
	        instead of raising.
	    """
	    print("\nFetching and rendering:", url)
	    try:
	        with sync_playwright() as p:
	            browser = p.chromium.launch(headless=True)
	            try:
	                page = browser.new_page()

	                # Load the page (let JS execute)
	                page.goto(url, wait_until="networkidle")

	                # Extract rendered HTML
	                return page.content()
	            finally:
	                # Close the browser on every path, including failed
	                # navigation, so a bad URL does not skip the cleanup.
	                browser.close()

	    except Exception as e:
	        # The failure is reported through the return value rather than raised.
	        return f"Error fetching/rendering page: {str(e)}"