Spaces:
Runtime error
Runtime error
| import re | |
| import time | |
| import requests | |
| import markdownify | |
| from typing import Any, Optional | |
| from llama_index.core.tools import FunctionTool | |
| from bs4 import BeautifulSoup | |
| from bs4 import Comment | |
def visit_webpage(url: str) -> str:
    """
    Visits a webpage at the given url and returns its visible text content.

    The page is fetched with browser-like headers, ``<script>``/``<style>``
    elements and HTML comments are removed, and the remaining text is
    collapsed to single-space-separated plain text (no markdown conversion
    is performed). Output longer than 10000 characters is truncated and
    suffixed with "... (content truncated)".

    Failures are never raised to the caller; they are reported as a
    human-readable error string instead.

    Args:
        url (str): The url of the webpage to visit.

    Returns:
        str: The cleaned page text, or an error message if the request or
            parsing failed.
    """
    # Cap on returned characters so a single page cannot flood the caller.
    max_length = 10000

    # Browser-like headers; some sites reject the default requests User-Agent.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.6",
        "Cache-Control": "max-age=0",
        "Sec-Ch-Ua": "\"Not(A:Brand\";v=\"99\", \"Brave\";v=\"133\", \"Chromium\";v=\"133\"",
        "Sec-Ch-Ua-Mobile": "?0",
        "Sec-Ch-Ua-Platform": "\"Windows\"",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-User": "?1",
        "Upgrade-Insecure-Requests": "1",
    }

    try:
        # Sleep for 3 seconds to avoid overwhelming the server
        time.sleep(3)

        # Make the HTTP GET request with a 20-second timeout.
        response = requests.get(url, headers=headers, timeout=20)
        response.raise_for_status()  # Raise an exception for bad status codes

        # Parse the HTML content
        soup = BeautifulSoup(response.text, "html.parser")

        # Remove script and style elements
        for tag in soup(["script", "style"]):
            tag.decompose()

        # Remove HTML comments
        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
            comment.extract()

        # Extract the visible text and collapse all whitespace runs
        text = soup.get_text(separator=" ", strip=True)
        clean_text = re.sub(r"\s+", " ", text)

        # Truncate to a reasonable size, flagging that content was cut
        if len(clean_text) > max_length:
            return clean_text[:max_length] + "... (content truncated)"
        return clean_text
    except requests.exceptions.Timeout:
        return "The request timed out. Please try again later or check the URL."
    except requests.exceptions.RequestException as e:
        return f"Error fetching the webpage: {str(e)}"
    except Exception as e:
        # Tool boundary: report any other failure as text rather than raising.
        return f"An unexpected error occurred: {str(e)}"
# Wrap visit_webpage in a LlamaIndex FunctionTool.
visit_webpage_tool = FunctionTool.from_defaults(
    fn=visit_webpage,
    name="visit_webpage",
    description=(
        "Visits a webpage at the given url and reads its content as a markdown string. "
        "Use this to browse webpages."
    ),
)