# scievo/tools/web_tool.py
"""
Toolset for web search and web access.
"""
from typing import Any
from urllib.parse import urlparse
import requests
from bs4 import BeautifulSoup
from jinja2 import Template
from .registry import register_tool, register_toolset_desc

register_toolset_desc(
    "web",
    "Web toolset for searching the web and fetching content from URLs. Use it only when web access is genuinely needed.",
)
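
# Assumed contract of the local registry module (the real implementation
# lives in .registry and may differ): register_toolset_desc(name, description)
# records a human-readable summary for a toolset, and register_tool(toolset,
# schema) is a decorator factory that attaches the OpenAI-style function
# schema below to the decorated implementation.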

WEB_SEARCH_TEMPLATE: Template = Template(
    """\
Results for web search query '{{ query }}':
{% for result in results %}
=== Web Result {{ loop.index }} ===
{% for key, value in result.items() %}
{{ key }}: {{ value }}
{%- endfor %}
{% endfor %}
"""
)
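
# Rough illustration of what the template renders, assuming DDGS().text()
# returns dicts with keys such as "title", "href", and "body" (the exact keys
# come from the ddgs library and are not guaranteed here):
#
#   Results for web search query 'beautifulsoup':
#
#   === Web Result 1 ===
#   title: Beautiful Soup Documentation
#   href: https://www.crummy.com/software/BeautifulSoup/
#   body: ...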


@register_tool(
    "web",
    {
        "type": "function",
        "function": {
            "name": "web_search",
            "description": "Perform a general web search and return the top results.",
            "parameters": {
                "type": "object",
                "properties": {
                    "query": {"type": "string", "description": "The search query to perform"},
                    "max_results": {
                        "type": "integer",
                        "description": "Maximum number of search results to return",
                        "default": 5,
                    },
                },
                "required": ["query"],
            },
        },
    },
)
def web_search(query: str, max_results: int = 5) -> str:
    """
    Perform a general web search and return the top results.

    Args:
        query: The search query to perform
        max_results: Maximum number of search results to return

    Returns:
        The rendered search results, or an error message on failure
    """
    # Imported lazily so the toolset can be loaded even when ddgs is absent.
    from ddgs import DDGS

    try:
        results: list[dict[str, Any]] = DDGS().text(query, max_results=max_results)
        output_text = WEB_SEARCH_TEMPLATE.render(query=query, results=results)
    except Exception as e:
        output_text = f"web_search tool error: {e}"
    return output_text


@register_tool(
    "web",
    {
        "type": "function",
        "function": {
            "name": "get_url_content",
            "description": "Fetch and extract textual content from a web URL. Supports HTML and other textual content, but excludes binary files like PDFs, ZIP files, etc.",
            "parameters": {
                "type": "object",
                "properties": {
                    "url": {"type": "string", "description": "The URL to fetch content from"},
                    "timeout": {
                        "type": "integer",
                        "description": "Request timeout in seconds",
                        "default": 10,
                    },
                },
                "required": ["url"],
            },
        },
    },
)
def get_url_content(url: str, timeout: int = 10) -> str:
    """
    Fetch textual content from a web URL.

    Args:
        url: The URL to fetch content from
        timeout: Request timeout in seconds

    Returns:
        The textual content of the web page, or an error message on failure
    """
    try:
        # Parse the URL to validate it
        parsed_url = urlparse(url)
        if not parsed_url.scheme or not parsed_url.netloc:
            return f"Error: Invalid URL format: {url}"
        # Set headers to mimic a browser request
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        }
        # Make the request
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
        # Check the content type to ensure it is textual
        content_type = response.headers.get("content-type", "").lower()
        # Reject binary file types (matched as substrings of the content type,
        # e.g. "application/pdf" or "application/gzip")
        binary_types = ["pdf", "zip", "rar", "7z", "tar", "gz", "exe", "dmg", "pkg", "deb", "rpm"]
        if any(binary_type in content_type for binary_type in binary_types):
            return f"Error: Binary content type detected ({content_type}). Only textual content is supported."
        # Handle HTML content
        if "text/html" in content_type:
            soup = BeautifulSoup(response.content, "html.parser")
            # Remove script and style elements
            for tag in soup(["script", "style"]):
                tag.decompose()
            # Extract the title
            title = soup.find("title")
            title_text = title.get_text().strip() if title else "No title"
            # Extract the main content
            text = soup.get_text()
            # Clean up whitespace: strip each line, break multi-headlines on
            # double spaces, and drop empty chunks
            lines = (line.strip() for line in text.splitlines())
            chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
            text = "\n".join(chunk for chunk in chunks if chunk)
            return f"Title: {title_text}\n\nContent:\n{text}"
        # Handle other textual content types
        elif any(
            text_type in content_type
            for text_type in ["text/", "application/json", "application/xml"]
        ):
            return f"Content from {url}:\n\n{response.text}"
        else:
            return f"Warning: Unknown content type ({content_type}). Attempting to extract as text:\n\n{response.text[:5000]}{'...' if len(response.text) > 5000 else ''}"
    except requests.exceptions.Timeout:
        return f"Error: Request timeout after {timeout} seconds for URL: {url}"
    except requests.exceptions.RequestException as e:
        return f"Error fetching URL {url}: {str(e)}"
    except Exception as e:
        return f"Unexpected error processing URL {url}: {str(e)}"