Spaces:

Agents-MCP-Hackathon
/

BuyVerse

Runtime error

App Files Files Community

BuyVerse / Solution /src /tools /visit_webpage.py

abdo-Mansour

done

b8ee1a5 11 months ago

raw

history blame contribute delete

3.21 kB

	import re
	import time
	import requests
	import markdownify
	from typing import Any, Optional
	from llama_index.core.tools import FunctionTool
	from bs4 import BeautifulSoup
	from bs4 import Comment

	# Pause before every request so repeated tool calls do not overwhelm the server.
	_REQUEST_DELAY_SECONDS = 3
	# Upper bound on how long a single HTTP request may take.
	_REQUEST_TIMEOUT_SECONDS = 20
	# Maximum number of characters of page text handed back to the caller.
	_MAX_CONTENT_LENGTH = 10000
	# Marker appended when the page text had to be cut to _MAX_CONTENT_LENGTH.
	_TRUNCATION_SUFFIX = "... (content truncated)"

	# Browser-like request headers sent with every fetch.
	_REQUEST_HEADERS = {
	    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36",
	    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
	    "Accept-Language": "en-US,en;q=0.6",
	    "Cache-Control": "max-age=0",
	    "Sec-Ch-Ua": "\"Not(A:Brand\";v=\"99\", \"Brave\";v=\"133\", \"Chromium\";v=\"133\"",
	    "Sec-Ch-Ua-Mobile": "?0",
	    "Sec-Ch-Ua-Platform": "\"Windows\"",
	    "Sec-Fetch-Dest": "document",
	    "Sec-Fetch-Mode": "navigate",
	    "Sec-Fetch-Site": "none",
	    "Sec-Fetch-User": "?1",
	    "Upgrade-Insecure-Requests": "1",
	}


	def visit_webpage(url: str) -> str:
	    """
	    Fetch a webpage and return its visible text as a single-line string.

	    The page is downloaded with browser-like headers, <script>/<style>
	    elements and HTML comments are removed, and the remaining text is
	    collapsed so that every run of whitespace becomes one space. Output
	    longer than 10000 characters is cut and marked as truncated.

	    Args:
	        url (str): The url of the webpage to visit.

	    Returns:
	        str: The cleaned page text, or a human-readable error message if
	        the request timed out, failed, or any other error occurred. Errors
	        are reported through the return value rather than raised.
	    """
	    try:
	        # Throttle: wait before contacting the server.
	        time.sleep(_REQUEST_DELAY_SECONDS)

	        response = requests.get(
	            url, headers=_REQUEST_HEADERS, timeout=_REQUEST_TIMEOUT_SECONDS
	        )
	        # Turn 4xx/5xx responses into exceptions handled below.
	        response.raise_for_status()

	        soup = BeautifulSoup(response.text, "html.parser")

	        # Drop non-content markup: scripts, stylesheets and HTML comments.
	        for tag in soup(["script", "style"]):
	            tag.decompose()
	        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
	            comment.extract()

	        # Flatten to plain text and collapse all whitespace runs to one space.
	        text = soup.get_text(separator=" ", strip=True)
	        clean_text = re.sub(r"\s+", " ", text)

	        # Keep the result to a size that is reasonable to hand to an agent.
	        if len(clean_text) > _MAX_CONTENT_LENGTH:
	            return clean_text[:_MAX_CONTENT_LENGTH] + _TRUNCATION_SUFFIX
	        return clean_text

	    except requests.exceptions.Timeout:
	        return "The request timed out. Please try again later or check the URL."
	    except requests.exceptions.RequestException as e:
	        return f"Error fetching the webpage: {str(e)}"
	    except Exception as e:
	        # Tool boundary: report anything unexpected as text instead of raising.
	        return f"An unexpected error occurred: {str(e)}"


	# Wrap visit_webpage in a LlamaIndex FunctionTool under the name "visit_webpage".
	visit_webpage_tool = FunctionTool.from_defaults(
	    fn=visit_webpage,
	    name="visit_webpage",
	    description="Visits a webpage at the given url and reads its content as a markdown string. Use this to browse webpages.",
	)