Spaces:
Sleeping
Sleeping
| import re | |
| import requests | |
| from markdownify import markdownify | |
| from requests.exceptions import RequestException | |
| from langchain_core.tools import tool | |
| import requests | |
| from langchain_community.tools import DuckDuckGoSearchResults | |
# Default HTTP request headers that imitate a desktop Chrome browser.
DEFAULT_HEADERS = {
    # Generic, browser-like UA. For Wikipedia, better to identify your app & contact.
    "User-Agent": (
        "Mozilla/5.0 (X11; Linux x86_64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0 Safari/537.36"
    ),
    "Accept": (
        "text/html,application/xhtml+xml,application/xml;"
        "q=0.9,image/avif,image/webp,*/*;q=0.8"
    ),
    "Accept-Language": "en-US,en;q=0.5",
    # Advertise only encodings that can always be decoded. Brotli ("br") is
    # decoded by requests/urllib3 only when an optional brotli package is
    # installed; offering it otherwise risks receiving an unreadable body.
    "Accept-Encoding": "gzip, deflate",
}
def visit_webpage(url: str) -> str:
    """Fetch a webpage via the urltomarkdown service and return it as Markdown.

    The page is not requested directly: the URL is handed to
    https://urltomarkdown.herokuapp.com/, and that service's response text is
    passed through ``markdownify`` and whitespace cleanup.

    Args:
        url: The URL of the webpage to visit.

    Returns:
        The content of the webpage converted to Markdown, with runs of three
        or more newlines collapsed to a single blank line, or an error message
        string if the request fails. Errors are returned, never raised.
    """
    try:
        response = requests.get(
            "https://urltomarkdown.herokuapp.com/",
            # Let requests percent-encode the target URL; raw concatenation
            # breaks on targets containing '&', '#' or '?'.
            params={"url": url},
            # Without a timeout requests can wait forever on a stalled server.
            timeout=30,
        )
        response.raise_for_status()  # Raise an exception for bad status codes

        # Normalise the response body to Markdown and trim surrounding whitespace.
        markdown_content = markdownify(response.text).strip()

        # Remove multiple line breaks
        markdown_content = re.sub(r"\n{3,}", "\n\n", markdown_content)
        return markdown_content
    except RequestException as e:
        return f"Error fetching the webpage: {e}"
    except Exception as e:
        # Deliberate catch-all: callers expect an error string, not an exception.
        return f"An unexpected error occurred: {e}"