# DIY_Agent / ai_tool.py
# (HuggingFace page header preserved as comments so the file parses:)
# bwilkie's picture
# Update ai_tool.py
# aa39e68 verified
from typing import Callable, Dict, List, Any
import time
import wikipedia
from googlesearch import search
from bs4 import BeautifulSoup
from PIL import Image
from io import BytesIO
import pytesseract
import requests
from PyPDF2 import PdfReader
import re
#from duckduckgo_search import DDGS
import random
from tavily import TavilyClient
from playwright.sync_api import sync_playwright
from bs4 import BeautifulSoup
import os
# Tool definitions
def tool(func: Callable) -> Callable:
    """Decorator that tags *func* as an agent tool.

    Sets the attribute ``is_tool = True`` on the function and returns the
    very same function object (no wrapping, signature untouched).
    """
    setattr(func, "is_tool", True)
    return func
###########################
import requests
import xml.etree.ElementTree as ET
from datetime import datetime
@tool
def ArxivPaperSearcher(topic: str, max_results: int = 5):
    """
    Search arXiv for papers on a topic.

    Args:
        topic (str): Search term(s), e.g. "machine learning".
        max_results (int): Maximum number of results to fetch.

    Returns:
        list[dict]: One dict per paper with keys "title", "authors",
        "summary", and "url".

    Raises:
        requests.HTTPError: If the arXiv API responds with an error status.
    """
    base_url = "http://export.arxiv.org/api/query"
    timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
    print(f"[{timestamp}] Searching arXiv for: {topic}")
    # Build query
    params = {
        "search_query": f"all:{topic}",
        "start": 0,
        "max_results": max_results,
    }
    # Timeout keeps the agent from hanging forever on a stalled connection.
    response = requests.get(base_url, params=params, timeout=30)
    response.raise_for_status()
    # Parse the Atom XML feed: one <entry> element per paper.
    root = ET.fromstring(response.text)
    ns = {"atom": "http://www.w3.org/2005/Atom"}
    results = []
    for entry in root.findall("atom:entry", ns):
        # Guard against malformed entries missing expected child elements
        # (the original dereferenced .text unconditionally and could raise
        # AttributeError on a None find() result).
        title_el = entry.find("atom:title", ns)
        summary_el = entry.find("atom:summary", ns)
        link_el = entry.find("atom:id", ns)
        if title_el is None or summary_el is None or link_el is None:
            continue
        authors = [
            a.find("atom:name", ns).text
            for a in entry.findall("atom:author", ns)
            if a.find("atom:name", ns) is not None
        ]
        results.append({
            "title": title_el.text.strip(),
            "authors": authors,
            "summary": summary_el.text.strip(),
            "url": link_el.text.strip(),
        })
    print(f"Found {len(results)} results.")
    return results
##############################
# --- heuristics for cookie/consent elements ---
# Matches id/class/data-*/aria-label attribute values typical of cookie and
# consent widgets, including the major consent-management platforms
# (OneTrust, Osano, Iubenda, Didomi, TrustArc, Quantcast, Axeptio, ...).
_COOKIE_ATTR_RE = re.compile(
    r"(cookie|consent|gdpr|cmp|onetrust|ot-sdk|osano|iubenda|didomi|trustarc|truste|"
    r"quantcast|qc-cmp|axeptio|sp_message|sp-cc|privacy|manage-choices|preferences)",
    re.I,
)
# Matches visible banner text such as "Accept all", "Reject all",
# "Manage cookies", "We value your privacy", etc. (case-insensitive).
_COOKIE_TEXT_RE = re.compile(
    r"(cookies?|cookie settings|we (use|value) your privacy|consent|gdpr|"
    r"manage (cookies|choices)|your choices|accept all|reject all|use of cookies|"
    r"privacy (policy|preferences))",
    re.I,
)
def _normalize_text(txt: str) -> str:
# neat paragraphs, no extra blanks
lines = [line.strip() for line in txt.splitlines()]
lines = [ln for ln in lines if ln]
return "\n".join(lines)
def _strip_noise_and_cookies(soup: BeautifulSoup) -> None:
    """Remove non-content markup and cookie/consent UI from *soup* in place.

    Drops scripts/styles/iframes and HTML comments, removes elements that
    look like cookie-consent banners (matched by attribute, text, ARIA role,
    or fixed/sticky positioning) together with up to three fixed-position
    ancestors, and finally deletes near-empty fixed overlays/backdrops.

    Args:
        soup (BeautifulSoup): Parsed document; mutated in place.

    Returns:
        None
    """
    # BUG FIX: `Comment` was referenced below but never imported anywhere in
    # the file, which raised NameError at runtime. Import it locally.
    from bs4 import Comment

    # Remove obvious non-content
    for tag in soup(["script", "style", "noscript", "template", "svg", "iframe"]):
        tag.decompose()
    for c in soup.find_all(string=lambda t: isinstance(t, Comment)):
        c.extract()

    def looks_like_cookie(el) -> bool:
        # Heuristic: does this element look like a cookie/consent widget?
        # attributes
        attrs = []
        for k in ("id", "class", "data-component", "data-testid", "aria-label"):
            v = el.get(k)
            if isinstance(v, list):
                v = " ".join(v)
            if v:
                attrs.append(v)
        attr_str = " ".join(attrs)
        # text
        text = el.get_text(" ", strip=True)
        style = (el.get("style") or "")
        role = (el.get("role") or "").lower()
        if _COOKIE_ATTR_RE.search(attr_str):
            return True
        if role in ("dialog", "alert", "banner") and (_COOKIE_ATTR_RE.search(attr_str) or _COOKIE_TEXT_RE.search(text)):
            return True
        # shorter text is typical for banners; don't nuke long articles accidentally
        if text and len(text) < 1200 and _COOKIE_TEXT_RE.search(text):
            return True
        style_l = style.replace(" ", "").lower()
        if ("position:fixed" in style_l or "position:sticky" in style_l) and (
            _COOKIE_TEXT_RE.search(text) or _COOKIE_ATTR_RE.search(attr_str)
        ):
            return True
        return False

    # Remove cookie/consent blocks and their fixed-position parents (up to 3 levels)
    for el in soup.find_all(["div", "section", "aside", "form", "nav", "footer", "header", "dialog"]):
        if looks_like_cookie(el):
            target = el
            for _ in range(3):
                p = target.parent
                if not p or p.name in ("body", "html"):
                    break
                p_style = (p.get("style") or "").replace(" ", "").lower()
                p_attrs = " ".join(
                    filter(
                        None,
                        [
                            p.get("id") or "",
                            " ".join(p.get("class", [])) if isinstance(p.get("class"), list) else (p.get("class") or ""),
                        ],
                    )
                )
                # Climb only while the parent itself looks like part of the
                # banner (fixed-position wrapper or consent-named attrs).
                if "position:fixed" in p_style or _COOKIE_ATTR_RE.search(p_attrs):
                    target = p
                else:
                    break
            target.decompose()
    # Kill generic fixed overlays/backdrops with almost no text (safety net)
    for el in soup.find_all(style=True):
        s = el["style"].replace(" ", "").lower()
        if "position:fixed" in s and ("width:100%" in s or "inset:" in s or "top:0" in s):
            if len(el.get_text(strip=True)) < 200:
                el.decompose()
@tool
def extract_content_with_playwright(url):
    """Fetch *url* with a real Chromium browser and return the rendered HTML.

    Uses Playwright so JavaScript-rendered content is included, and tries to
    dismiss cookie-consent dialogs by clicking an "Accept" button if present.

    Args:
        url (str): Fully-qualified URL to load.

    Returns:
        str: The rendered HTML of the page.
    """
    with sync_playwright() as p:
        # headless=True: a visible browser window cannot open on a
        # display-less server/CI host and would crash there.
        browser = p.chromium.launch(headless=True)
        try:
            context = browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                           "AppleWebKit/537.36 (KHTML, like Gecko) "
                           "Chrome/115.0.0.0 Safari/537.36",
                viewport={"width": 1280, "height": 800},
                locale="en-US",
            )
            page = context.new_page()
            response = page.goto(url, wait_until="networkidle", timeout=30000)
            # page.goto can return None (e.g. same-document navigation);
            # guard before dereferencing .status.
            if response is not None:
                print(f"Page response status: {response.status}")
            # Try to accept cookies if the button is present
            try:
                page.click('button:has-text("Accept")', timeout=5000)
                page.wait_for_load_state("networkidle")
                print("Clicked Accept on cookie consent.")
            except Exception as e:
                print("No cookie accept button found or clicking failed:", e)
            content = page.content()
        finally:
            # Always release the browser, even if navigation fails.
            browser.close()
    return content
def extract_webpage_content(url: str) -> str:
    """Fetch *url* (rendering JS via Playwright) and return its main text.

    Normalizes the URL scheme, strips scripts/cookie banners from the DOM,
    then returns the text of the largest content container, capped at
    10,000 characters.

    Args:
        url (str): Page address; "https://" is prepended if missing.

    Returns:
        str: Extracted text, or an "ERROR: ..." string for an empty URL.
    """
    # Validate first so None/"" short-circuits before any string methods.
    if not url:
        return "ERROR: Empty URL provided"
    # Fix URL format if missing protocol
    if not url.startswith(('http://', 'https://')):
        url = f"https://{url}"
    html = extract_content_with_playwright(url)
    soup = BeautifulSoup(html, 'html.parser')
    _strip_noise_and_cookies(soup)  # modifies soup in-place
    # Candidate containers: anything with a meaningful amount of text.
    candidates = soup.find_all(['article', 'main', 'section', 'div'])
    candidates = [c for c in candidates if len(c.get_text(strip=True)) > 300]
    best = max(candidates, key=lambda c: len(c.get_text()), default=soup.body)
    # soup.body can be None (fragment / malformed HTML); the original would
    # have raised AttributeError here.
    if best is None:
        return ""
    return best.get_text(separator="\n", strip=True)[:10000]
# @tool
# def extract_webpage_content(url: str) -> str:
# """Extracts the main visible content from a webpage, omitting headers, footers, and nav bars."""
# import requests
# from bs4 import BeautifulSoup
# headers = {
# "User-Agent": (
# "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
# "AppleWebKit/537.36 (KHTML, like Gecko) "
# "Chrome/115.0.0.0 Safari/537.36"
# ),
# "Accept-Language": "en-US,en;q=0.9",
# "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
# "Referer": "https://www.google.com/",
# "Connection": "keep-alive",
# }
# try:
# session = requests.Session()
# response = session.get(url, headers=headers, timeout=10)
# response.raise_for_status()
# soup = BeautifulSoup(response.text, 'html.parser')
# # Find likely main content containers
# candidates = soup.find_all(['article', 'main', 'section', 'div'])
# candidates = [c for c in candidates if len(c.get_text(strip=True)) > 300]
# # Pick the largest one
# best = max(candidates, key=lambda c: len(c.get_text()), default=soup.body)
# return best.get_text(separator="\n", strip=True)[:10000]
# except Exception as e:
# return f"[Error extracting content] {e}"
def preprocess_search_results(results, n_results=4, min_score=0):
    """Filter, rank, and trim raw search hits.

    Keeps entries whose "score" is at least *min_score*, orders them by
    score (highest first, stable), and returns the top *n_results* as dicts
    holding only "title", "url", and "summary" (taken from "content").
    """
    # Keep only sufficiently-scored hits; sort is stable, so equal scores
    # preserve their original order.
    kept = [hit for hit in results if hit["score"] >= min_score]
    kept.sort(key=lambda hit: hit["score"], reverse=True)
    # Project down to the public fields, dropping the score.
    return [
        {"title": hit["title"], "url": hit["url"], "summary": hit["content"]}
        for hit in kept[:n_results]
    ]
@tool
def web_search_tool(query: str) -> str:
    """Performs a web search using Tavily and return url, title, and summary.

    Args:
        query (str): Search query text.

    Returns:
        list[dict] | None: Top results as dicts with "title", "url", and
        "summary" keys, or None if both attempts fail.
    """
    # NOTE: the old duck_duck_go() fallback was removed: it referenced DDGS
    # whose import is commented out at the top of the file (NameError), and
    # iterated over `results = None` after a failure (TypeError).
    def tavily(query):
        print("Print:🔁 Falling back to Tavily Search...")
        try:
            tavily_client = TavilyClient(api_key=os.getenv("TAVILY_API_KEY"))
            response_out = tavily_client.search(query)
            return preprocess_search_results(response_out['results'])
        except Exception as e:
            # BUG FIX: the original fell through to `return response_out`
            # here, raising UnboundLocalError when search() failed.
            print(f"Print:❌ tavily search failed: {e}")
            return None

    query_out = tavily(query)
    if query_out is None:
        # One retry after a short pause in case of a transient failure.
        time.sleep(3)
        query_out = tavily(query)
    print('Print Search results:', query_out)
    return query_out
# @tool
# def web_search_tool(query: str) -> str:
# """Perform a web search using Google Search with retry logic."""
# print('Tool: Performing Search')
# retries = 3
# delay = 3 # seconds
# for attempt in range(retries):
# try:
# # Convert generator to list and limit results
# search_results = list(search(query, num_results=5))
# if not search_results:
# raise Exception("No results found.")
# # Format results nicely
# formatted_results = "## Search Results\n\n"
# for i, url in enumerate(search_results, 1):
# formatted_results += f"{i}. {url}\n"
# return formatted_results
# except Exception as e:
# print(f"[Attempt {attempt+1}] Web search failed: {e}")
# if attempt < retries - 1: # Don't sleep on last attempt
# time.sleep(delay)
# return "Failed to retrieve search results after retries."
@tool
def retrieve_images_from_url(url: str) -> list:
    """Extract image URLs from a webpage.

    Args:
        url (str): Page to scan for <img> tags.

    Returns:
        list: Absolute image URLs; ["No images found."] when the page has
        no images; a single-element error message list on failure.
    """
    from urllib.parse import urljoin  # stdlib; resolves relative src values

    print('Print:Tool: Retrieving Image')
    try:
        # Timeout prevents the agent from hanging on a dead server.
        response = requests.get(url, timeout=15)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        # urljoin turns relative paths (e.g. "/img/x.png") into usable
        # absolute URLs; already-absolute src values pass through unchanged.
        images = [urljoin(url, img['src']) for img in soup.find_all('img') if img.get('src')]
        return images if images else ["No images found."]
    except Exception as e:
        print(f"Print:Image retrieval error: {e}")
        return [f"Failed to retrieve images. {e}"]
@tool
def perform_ocr_on_image(image_url: str) -> str:
    """Download an image and perform OCR to extract text.

    Args:
        image_url (str): Direct URL to an image file.

    Returns:
        str: Extracted text (stripped), a "No text found" notice, or an
        error message string on failure.
    """
    print('Print:Tool: Performing OCR')
    try:
        # Timeout prevents the agent from hanging on a dead server.
        response = requests.get(image_url, timeout=15)
        response.raise_for_status()
        image = Image.open(BytesIO(response.content))
        text = pytesseract.image_to_string(image)
        return text.strip() if text else "No text found in image."
    except Exception as e:
        print(f"Print:OCR failed: {e}")
        return f"OCR failed. {e}"
@tool
def wikipedia_tool(topic: str) -> str:
    """Look up *topic* on Wikipedia and return its title and summary.

    Never raises: returns a disambiguation hint, a not-found notice, or an
    error-message string instead.
    """
    print('Print:Tool: Performing Wiki Search')
    try:
        # auto_suggest lets Wikipedia correct near-miss topic spellings.
        article = wikipedia.page(topic, auto_suggest=True)
        return f"Title: {article.title}\n\nSummary:\n{article.summary}"
    except wikipedia.exceptions.DisambiguationError as e:
        return f"Disambiguation required. Options: {e.options[:5]}"
    except wikipedia.exceptions.PageError:
        return "Page not found on Wikipedia."
    except Exception as e:
        print(f"Print:Wikipedia fetch error: {e}")
        return f"Wikipedia lookup failed. {e}"
@tool
def extract_text_from_pdf_url(pdf_url: str) -> str:
    """
    Downloads a PDF from the provided URL and extracts text from the first few pages.

    Args:
        pdf_url (str): Direct URL to a PDF document.

    Returns:
        str: Up to 3,000 characters of text from the first three pages, a
        "No text found" notice, or an error message string on failure.
    """
    print('Tool: Extracting PDF content')
    try:
        # Timeout prevents the agent from hanging on a dead server.
        response = requests.get(pdf_url, timeout=30)
        response.raise_for_status()
        pdf_reader = PdfReader(BytesIO(response.content))
        # Read text from the first few pages only to bound work and output.
        text = ""
        for i, page in enumerate(pdf_reader.pages[:3]):
            page_text = page.extract_text()
            if page_text:
                text += page_text
        print('Print:PDF txt', text[:3000])
        return text[:3000] if text else "No text found in PDF."
    except Exception as e:
        print(f"Print:PDF extraction failed: {e}")
        return f"Failed to extract PDF content from {pdf_url}: {e}"
#
# NOT A REAL TOOL, only for debuggin
#
import re
# Simple checker for potentially dangerous code
def llm_content_checker(llm_input: str):
    """
    Simple security checker that flags dangerous imports and patterns.

    Args:
        llm_input (str): Raw text (e.g. LLM output) to scan.

    Returns:
        str: *llm_input* unchanged if safe, otherwise a JSON string whose
        "final_answer" states the request was blocked.
    """
    # List of dangerous imports to check for
    dangerous_imports = [
        'os', 'subprocess', 'shutil', 'sys', 'socket', 'urllib',
        'requests', 'pickle', 'eval', 'exec', 'compile', '__import__'
    ]
    # List of dangerous patterns
    dangerous_patterns = [
        r'rm\s+-rf',             # Delete commands
        r'del\s+/[fs]',          # Windows delete
        r'format\s+c:',          # Format drive
        r'shutdown',             # System shutdown
        r'system\s*\(',          # System calls
        r'exec\s*\(',            # Code execution
        r'eval\s*\(',            # Code evaluation
        r'open\s*\([\'"][/\\]',  # File access with absolute paths
        r'\.\./',                # Directory traversal
        r'DROP\s+TABLE',         # SQL deletion
        r'DELETE\s+FROM',        # SQL deletion
    ]
    # Lowercase once for the substring-based import checks.
    content_lower = llm_input.lower()
    # Check for dangerous imports
    for dangerous_import in dangerous_imports:
        if f'import {dangerous_import}' in content_lower or f'from {dangerous_import}' in content_lower:
            print(f"🚫 Blocked dangerous import: {dangerous_import}")
            return '{"final_answer": "Request blocked due to security restrictions."}'
    # BUG FIX: the original searched these patterns in the lowercased text,
    # so the uppercase SQL patterns (DROP TABLE / DELETE FROM) could never
    # match. Search the original text case-insensitively instead.
    for pattern in dangerous_patterns:
        if re.search(pattern, llm_input, re.IGNORECASE):
            print(f"🚫 Blocked dangerous pattern: {pattern}")
            return '{"final_answer": "Request blocked due to security restrictions."}'
    # If no dangerous content found, return original input
    return llm_input