# scievo/tools/web_tool.py
"""
Toolset for web search and web access.
"""
from typing import Any
from urllib.parse import urlparse
import requests
from bs4 import BeautifulSoup
from jinja2 import Template
from .registry import register_tool, register_toolset_desc

register_toolset_desc(
    "web",
    "Web toolset for searching the web and fetching content from URLs. Use it only when web access is genuinely needed.",
)
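
# Assumed contract of the local registry module (the real implementation
# lives in .registry and may differ): register_toolset_desc(name, description)
# records a human-readable summary for a toolset, and register_tool(toolset,
# schema) is a decorator factory that attaches the OpenAI-style function
# schema below to the decorated implementation.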

WEB_SEARCH_TEMPLATE: Template = Template(
    """\
Results for web search query '{{ query }}':
{% for result in results %}
=== Web Result {{ loop.index }} ===
{% for key, value in result.items() %}
{{ key }}: {{ value }}
{%- endfor %}
{% endfor %}
"""
)
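
# Rough illustration of what the template renders, assuming DDGS().text()
# returns dicts with keys such as "title", "href", and "body" (the exact keys
# come from the ddgs library and are not guaranteed here):
#
#   Results for web search query 'beautifulsoup':
#
#   === Web Result 1 ===
#   title: Beautiful Soup Documentation
#   href: https://www.crummy.com/software/BeautifulSoup/
#   body: ...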


@register_tool(
    "web",
    {
        "type": "function",
        "function": {
            "name": "web_search",
            "description": "Perform a general web search and return the top results.",
            "parameters": {
                "type": "object",
                "properties": {
                    "query": {"type": "string", "description": "The search query to perform"},
                    "max_results": {
                        "type": "integer",
                        "description": "Maximum number of search results to return",
                        "default": 5,
                    },
                },
                "required": ["query"],
            },
        },
    },
)
def web_search(query: str, max_results: int = 5) -> str:
    """
    Perform a general web search and return the top results.

    Args:
        query: The search query to perform
        max_results: Maximum number of search results to return

    Returns:
        The rendered search results, or an error message on failure
    """
    # Imported lazily so the toolset can be loaded even when ddgs is absent.
    from ddgs import DDGS

    try:
        results: list[dict[str, Any]] = DDGS().text(query, max_results=max_results)
        output_text = WEB_SEARCH_TEMPLATE.render(query=query, results=results)
    except Exception as e:
        output_text = f"web_search tool error: {e}"
    return output_text


@register_tool(
    "web",
    {
        "type": "function",
        "function": {
            "name": "get_url_content",
            "description": "Fetch and extract textual content from a web URL. Supports HTML and other textual content, but excludes binary files like PDFs, ZIP files, etc.",
            "parameters": {
                "type": "object",
                "properties": {
                    "url": {"type": "string", "description": "The URL to fetch content from"},
                    "timeout": {
                        "type": "integer",
                        "description": "Request timeout in seconds",
                        "default": 10,
                    },
                },
                "required": ["url"],
            },
        },
    },
)
def get_url_content(url: str, timeout: int = 10) -> str:
    """
    Fetch textual content from a web URL.

    Args:
        url: The URL to fetch content from
        timeout: Request timeout in seconds

    Returns:
        The textual content of the web page, or an error message on failure
    """
    try:
        # Parse the URL to validate it
        parsed_url = urlparse(url)
        if not parsed_url.scheme or not parsed_url.netloc:
            return f"Error: Invalid URL format: {url}"
        # Set headers to mimic a browser request
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        }
        # Make the request
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
        # Check the content type to ensure it is textual
        content_type = response.headers.get("content-type", "").lower()
        # Reject binary file types (matched as substrings of the content type,
        # e.g. "application/pdf" or "application/gzip")
        binary_types = ["pdf", "zip", "rar", "7z", "tar", "gz", "exe", "dmg", "pkg", "deb", "rpm"]
        if any(binary_type in content_type for binary_type in binary_types):
            return f"Error: Binary content type detected ({content_type}). Only textual content is supported."
        # Handle HTML content
        if "text/html" in content_type:
            soup = BeautifulSoup(response.content, "html.parser")
            # Remove script and style elements
            for tag in soup(["script", "style"]):
                tag.decompose()
            # Extract the title
            title = soup.find("title")
            title_text = title.get_text().strip() if title else "No title"
            # Extract the main content
            text = soup.get_text()
            # Clean up whitespace: strip each line, break multi-headlines on
            # double spaces, and drop empty chunks
            lines = (line.strip() for line in text.splitlines())
            chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
            text = "\n".join(chunk for chunk in chunks if chunk)
            return f"Title: {title_text}\n\nContent:\n{text}"
        # Handle other textual content types
        elif any(
            text_type in content_type
            for text_type in ["text/", "application/json", "application/xml"]
        ):
            return f"Content from {url}:\n\n{response.text}"
        else:
            return f"Warning: Unknown content type ({content_type}). Attempting to extract as text:\n\n{response.text[:5000]}{'...' if len(response.text) > 5000 else ''}"
    except requests.exceptions.Timeout:
        return f"Error: Request timeout after {timeout} seconds for URL: {url}"
    except requests.exceptions.RequestException as e:
        return f"Error fetching URL {url}: {str(e)}"
    except Exception as e:
        return f"Unexpected error processing URL {url}: {str(e)}"