Spaces:

Saffn
/

llm

Running

App Files Files Community

llm / src /tools.py

Saffn

Silence duckduckgo_search package rename warning in src/tools.py

2ddc644 13 days ago

Raw

History Blame Contribute Delete

3.74 kB

	import re
	import urllib.parse
	import requests
	from bs4 import BeautifulSoup
	import html2text

	import warnings
	# Suppress the duckduckgo_search package-rename warning.
	# `message` is a regular expression matched against the START of the warning
	# text, so the leading `.*` is required for the filter to hit a message that
	# only mentions the package name somewhere in the middle.
	warnings.filterwarnings("ignore", message=".*duckduckgo_search.*")

	from duckduckgo_search import DDGS

	# Standard browser-like headers sent with every scrape request so that
	# websites are less likely to block it.
	HEADERS = {
	    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
	    # The final entry must be the `*/*` wildcard media range; a bare `/` is
	    # not a valid media type.
	    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
	    "Accept-Language": "en-US,en;q=0.5",
	    "Referer": "https://www.google.com/",
	}

	def clean_text(text: str) -> str:
	    """Collapse runs of newlines and runs of spaces, then trim both ends."""
	    # Newlines first, then spaces; each run shrinks to a single character.
	    single_newlines = re.sub(r'\n+', '\n', text)
	    single_spaces = re.sub(r' +', ' ', single_newlines)
	    return single_spaces.strip()

	def web_search(query: str, max_results: int = 3) -> list:
	    """
	    Run a DuckDuckGo text search and return the hits in a normalized shape.

	    Each hit is a dict with the keys "title", "url" and "snippet", taken
	    from the result's "title", "href" and "body" fields.
	    If anything goes wrong, the error is printed and an empty list is returned.
	    """
	    try:
	        with DDGS() as ddgs:
	            hits = ddgs.text(query, max_results=max_results)
	            return [
	                {
	                    "title": hit.get("title", "No Title"),
	                    "url": hit.get("href", ""),
	                    "snippet": hit.get("body", ""),
	                }
	                for hit in hits
	            ]
	    except Exception as e:
	        # Best-effort lookup: report the problem and fall back to no results.
	        print(f"Error during DuckDuckGo search: {e}")
	        return []

	def scrape_url(url: str, max_chars: int = 4000) -> str:
	    """
	    Fetch a web page and return its content as cleaned-up markdown.

	    Network, HTTP and conversion failures are not raised; they are reported
	    as a short human-readable message in the returned string.

	    Args:
	        url: Absolute ``http://`` or ``https://`` address of the page.
	        max_chars: Maximum number of markdown characters to keep. Longer
	            output is cut at this length and a truncation notice is appended.

	    Returns:
	        The page rendered as markdown (possibly truncated), or a message
	        describing why the page could not be scraped.
	    """
	    # Require a full scheme prefix: a bare "http" check would also let through
	    # strings such as "httpx://..." that can only fail later inside requests.
	    if not url.startswith(("http://", "https://")):
	        return "Invalid URL format."

	    try:
	        response = requests.get(url, headers=HEADERS, timeout=8)
	        if response.status_code != 200:
	            return f"Failed to retrieve page. Status code: {response.status_code}"

	        # Only HTML responses are converted to markdown.
	        content_type = response.headers.get('Content-Type', '').lower()
	        if 'text/html' not in content_type:
	            return f"Scraping is limited to HTML content. Content-Type received: {content_type}"

	        # When the server declares no charset, requests decodes text/* bodies
	        # as ISO-8859-1, which garbles UTF-8 pages. Use the encoding detected
	        # from the body instead.
	        if 'charset' not in content_type:
	            response.encoding = response.apparent_encoding

	        # Configure the HTML -> markdown converter.
	        h = html2text.HTML2Text()
	        h.ignore_links = False
	        h.ignore_images = True
	        h.ignore_emphasis = False
	        h.body_width = 0  # 0 disables line wrapping

	        markdown_content = clean_text(h.handle(response.text))

	        # Keep the result small enough for the caller's context window.
	        if len(markdown_content) > max_chars:
	            return markdown_content[:max_chars] + "\n\n... [Content Truncated due to size constraints] ..."

	        return markdown_content

	    except requests.exceptions.Timeout:
	        return "Scraping error: Connection timed out."
	    except Exception as e:
	        return f"Scraping error occurred: {str(e)}"

	def format_search_results_for_prompt(query: str, search_results: list) -> str:
	    """Render search hits as a numbered plain-text context block for a prompt."""
	    if not search_results:
	        return "No search results returned for the query."

	    # Collect the pieces and join once at the end.
	    parts = [
	        f"### WEB SEARCH RESULTS FOR: '{query}'\n",
	        "Below are relevant snippets retrieved from the web. Use these to formulate a factually correct answer:\n\n",
	    ]
	    for position, hit in enumerate(search_results, start=1):
	        parts.append(
	            f"Source [{position}]: {hit['title']}\n"
	            f"URL: {hit['url']}\n"
	            f"Snippet: {hit['snippet']}\n\n"
	        )
	    parts.append("---\n")
	    return "".join(parts)