| | from src.agent.utils.tooling import tool |
| | from src.agent.utils.vector_store import chunk_content, load_in_vector_db |
| |
|
| |
|
| |
|
@tool
def visit_webpage(url: str) -> str:
    """
    Visits a webpage at the given URL, converts its content to markdown,
    and stores it in the vector knowledge base.

    This tool is useful for extracting information from web pages in a
    structured format after a search.

    Args:
        url (str): The URL of the webpage to visit.

    Returns:
        str: A status message — success, forbidden domain, or an error
        description (network errors are reported, not raised).

    Raises:
        ImportError: If a required third-party package is missing.
    """
    try:
        import requests
        from requests.exceptions import RequestException
        from urllib.parse import urlparse

        from src.web2llm.app.converter import html_to_markdown
        from src.web2llm.app.scraper import scrape_url
    except ImportError as e:
        raise ImportError(
            f"You must install package `requests` to run this tool: for instance run `pip install requests` : {e}"
        ) from e

    # Domains the agent must not scrape. Matched as a suffix so that
    # subdomains (e.g. www.universetoday.com) are blocked as well.
    forbidden_domains = ["universetoday.com"]

    domain = urlparse(url).netloc
    if any(domain == d or domain.endswith("." + d) for d in forbidden_domains):
        return "This domain is forbidden and cannot be accessed, please try another one."

    try:
        result = scrape_url(url, clean=True)
        markdown_content = html_to_markdown(result["clean_html"])

        # Vectorize and persist the page so later queries can retrieve it.
        load_in_vector_db(
            markdown_content,
            metadatas={
                "title": result["title"],
                "url": url,
            },
        )
        return "The webpage has been successfully visited: content has been vectorized and stored in the knowledge base."

    # Timeout is a RequestException subclass, so it must be caught first.
    except requests.exceptions.Timeout:
        return "The request timed out. Please try again later or check the URL."

    except RequestException as e:
        return f"Error fetching the webpage: {str(e)}"

    except Exception as e:
        return f"An unexpected error occurred: {str(e)}"