import os
import tempfile
from urllib.parse import unquote

import bs4
import pandas as pd
import wikipedia
from dotenv import load_dotenv
# Alias Playwright's TimeoutError so it does not shadow the built-in TimeoutError.
from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeoutError

from langchain_openai import ChatOpenAI
from langchain_community.document_loaders import UnstructuredHTMLLoader
from langchain_community.utilities import ArxivAPIWrapper
from langchain_google_community import GoogleSearchAPIWrapper
from langchain_core.tools import tool

# Load API keys and model configuration from the environment (.env file).
load_dotenv()

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
OPENAI_API_MODEL = os.getenv("OPENAI_API_MODEL")
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
GOOGLE_CSE_ID = os.getenv("GOOGLE_CSE_ID")
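
# Expected .env contents, as a minimal sketch. All values are placeholders, and
# the model name is only an illustrative example, not a requirement:
#   OPENAI_API_KEY=sk-...
#   OPENAI_API_MODEL=gpt-4o-mini
#   GOOGLE_API_KEY=...
#   GOOGLE_CSE_ID=...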


@tool("google_search_tool")
def google_search_tool(query: str) -> str:
    """
    Performs a Google search and returns the top results.
    Use this for general web searches, finding articles, or recent information.
    """
    print(f"--- TOOL: Executing Google Search for: '{query}' ---")
    google_search = GoogleSearchAPIWrapper(google_api_key=GOOGLE_API_KEY, google_cse_id=GOOGLE_CSE_ID)
    # .results() returns a list of result dicts; stringify it so the declared
    # str return type holds.
    return str(google_search.results(query, num_results=3))
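
# Usage sketch (the query is a hypothetical example; LangChain tools take a
# dict of arguments via .invoke):
# google_search_tool.invoke({"query": "latest LangChain release notes"})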


@tool("wikipedia_search_tool")
def wikipedia_search_tool(query_or_url: str) -> str:
    """
    Fetches content from a Wikipedia page. This tool is dual-purpose:
    1. If the input is a search query, it finds the most relevant Wikipedia page and returns its full content.
    2. If the input is a full Wikipedia URL, it directly fetches and returns the content of that page.

    This is the preferred tool for all interactions with Wikipedia.

    Args:
        query_or_url (str): A search query (e.g., "Mercedes Sosa discography") or a full Wikipedia URL.
    """
    print(f"--- WIKIPEDIA TOOL (Dual-Purpose): Input is '{query_or_url}' ---")

    wikipedia.set_lang("en")

    try:
        if query_or_url.startswith(("http://", "https://")):
            # The page title is the last path segment, e.g.
            # "https://en.wikipedia.org/wiki/Mercedes_Sosa" -> "Mercedes Sosa".
            raw_title = query_or_url.split('/')[-1]
            page_title = unquote(raw_title).replace('_', ' ')
            print(f"Input is a URL. Decoded page title: '{page_title}'")
            page = wikipedia.page(page_title, auto_suggest=False, redirect=True)
        else:
            print("Input is a search query. Finding best page...")
            search_results = wikipedia.search(query_or_url, results=1)
            if not search_results:
                return f"Error: No Wikipedia page found for query '{query_or_url}'."
            page_title = search_results[0]
            page = wikipedia.page(page_title, auto_suggest=False, redirect=True)

        print(f"Fetching HTML for page: '{page.title}'")
        html_content_str = page.html()

        # Unstructured's HTML loader parses files, so round-trip the HTML
        # through a temporary file and remove it after loading.
        with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix=".html", encoding='utf-8') as temp_file:
            temp_file.write(html_content_str)
            temp_filepath = temp_file.name

        try:
            loader = UnstructuredHTMLLoader(temp_filepath, strategy="fast")
            docs = loader.load()
        finally:
            os.remove(temp_filepath)

        if not docs:
            return f"Content from Wikipedia page '{page.title}': Could not extract any content."

        # Cap the output so very long pages do not blow up the agent's context window.
        page_content = docs[0].page_content
        formatted_output = f"Content from Wikipedia page: '{page.title}'\nURL: {page.url}\n\n{page_content[:20000]}"
        return formatted_output

    except wikipedia.exceptions.DisambiguationError as e:
        return f"Error: Your query '{query_or_url}' is ambiguous. Options: {e.options[:5]}"
    except wikipedia.exceptions.PageError:
        return f"Error: Could not find or load the Wikipedia page for title derived from '{query_or_url}'."
    except Exception as e:
        return f"An unexpected error occurred in the Wikipedia tool: {e}"


@tool("browse_web_page_tool")
def browse_web_page_tool(url: str) -> str:
    """
    Navigates to a web page using a headless browser, then uses Unstructured to
    extract the full, clean content, including text and tables.

    Args:
        url (str): The full URL of the page to browse and extract content from.
    """
    print(f"--- TOOL: Browsing and extracting from: {url} ---")

    try:
        # Render the page with headless Chromium so JavaScript-driven content loads.
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            page = browser.new_page()
            page.goto(url, timeout=30000, wait_until="domcontentloaded")
            html_content = page.content()
            browser.close()

        # As in the Wikipedia tool: write the rendered HTML to a temporary file
        # for Unstructured, then clean it up.
        with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix=".html", encoding='utf-8') as temp_file:
            temp_file.write(html_content)
            temp_filepath = temp_file.name

        try:
            loader = UnstructuredHTMLLoader(temp_filepath, strategy="fast")
            docs = loader.load()
        finally:
            os.remove(temp_filepath)

        if not docs:
            return f"Content from URL '{url}': Could not extract any content using Unstructured."

        page_content = docs[0].page_content
        formatted_output = f"Content from URL: '{url}'\n\n{page_content[:20000]}"
        return formatted_output

    except PlaywrightTimeoutError:
        return f"Error browsing '{url}': The page took too long to load and timed out."
    except Exception as e:
        return f"An unexpected error occurred while browsing '{url}': {e}"


@tool("text_analyzer_tool")
def text_analyzer_tool(text_to_analyze: str, question: str) -> str:
    """
    Analyzes a given text to answer a specific question or extract information.
    Use this tool when you have already gathered content (e.g., from browsing a page)
    and need to find a specific answer within that text.

    Args:
        text_to_analyze (str): The text content to be analyzed.
        question (str): The specific question to answer based on the text.
    """
    print(f"--- TOOL: Analyzing text to answer: '{question}' ---")

    # temperature=0 keeps extraction answers deterministic and reproducible.
    analyzer_llm = ChatOpenAI(model=OPENAI_API_MODEL, temperature=0)

    prompt = f"""
You are a text analysis expert. Your task is to carefully read the provided text and answer the user's question based ONLY on that text.
Provide a concise and direct answer.

**Text to Analyze:**
---
{text_to_analyze}
---

**Question to Answer:**
"{question}"

Your concise answer:
"""
    response = analyzer_llm.invoke(prompt)
    return response.content
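

if __name__ == "__main__":
    # Minimal smoke test: a sketch, assuming the .env above is populated and
    # network access is available. The inputs are illustrative placeholders.
    print(google_search_tool.invoke({"query": "Mercedes Sosa discography"}))
    print(wikipedia_search_tool.invoke({"query_or_url": "Mercedes Sosa"}))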