Spaces:
Sleeping
Sleeping
| from src.agent.utils.tooling import tool | |
| from src.agent.utils.vector_store import chunk_content, load_in_vector_db | |
def visit_webpage(url: str) -> str:
    """
    Visits a webpage at the given URL and reads its content as a markdown string.

    This tool is useful for extracting information from web pages in a structured
    format after a search. The converted markdown is not returned to the caller;
    it is chunked and persisted in the vector store for later retrieval.

    Args:
        url (str): The URL of the webpage to visit.

    Returns:
        str: A status message — success confirmation, a forbidden-domain notice,
        or a description of the fetch error.

    Raises:
        ImportError: If the optional scraping dependencies are not installed.
    """
    from urllib.parse import urlparse

    # Check blocked domains before touching any optional dependencies, so the
    # rejection message is returned even when the scraping stack is missing.
    forbidden_domains = ["universetoday.com"]
    # NOTE(review): exact netloc match only — "www.universetoday.com" would NOT
    # be blocked; confirm whether subdomains should be covered as well.
    if urlparse(url).netloc in forbidden_domains:
        return "This domain is forbidden and cannot be accessed, please try another one."

    try:
        # Lazy imports keep the optional scraping stack off the module import path.
        # (`re`, `markdownify`, and `truncate_content` were imported here before
        # but never used — removed.)
        from src.web2llm.app.converter import html_to_markdown
        from src.web2llm.app.scraper import scrape_url
        from requests.exceptions import RequestException, Timeout
    except ImportError as e:
        raise ImportError(
            f"You must install package `requests` to run this tool: for instance run `pip install requests` : {e}"
        ) from e

    try:
        # Web2LLM app: fetch the page, strip boilerplate, convert to markdown.
        result = scrape_url(url, clean=True)
        markdown_content = html_to_markdown(result["clean_html"])
        # Persist the markdown so later queries can hit the knowledge base.
        load_in_vector_db(
            markdown_content,
            metadatas={
                "title": result["title"],
                "url": url,
            },
        )
        return "The webpage has been successfully visited: content has been vectorized and stored in the knowledge base."
    except Timeout:
        return "The request timed out. Please try again later or check the URL."
    except RequestException as e:
        return f"Error fetching the webpage: {str(e)}"
    except Exception as e:
        return f"An unexpected error occurred: {str(e)}"