File size: 7,734 Bytes
158ed3d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 |
import os
from dotenv import load_dotenv
from urllib.parse import unquote
import tempfile
import wikipedia
from playwright.sync_api import sync_playwright, TimeoutError
import bs4
import pandas as pd
from langchain_openai import ChatOpenAI
from langchain_community.document_loaders import UnstructuredHTMLLoader
from langchain_google_community import GoogleSearchAPIWrapper
from langchain_community.utilities import ArxivAPIWrapper
from langchain_core.tools import tool
# Load the environment variables used by the tools below (.env file, if present).
load_dotenv()
# NOTE(review): any of these may be None when the .env file or the variables
# are missing; the tools below do not guard against that — verify at startup.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
OPENAI_API_MODEL = os.getenv("OPENAI_API_MODEL")
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
GOOGLE_CSE_ID = os.getenv("GOOGLE_CSE_ID")
# --- Google Search Tool ---
@tool("google_search_tool")
def google_search_tool(query: str) -> str:
    """
    Performs a Google search and returns the top results.
    Use this for general web searches, finding articles, or recent information.

    Args:
        query (str): The search query to send to Google.

    Returns:
        str: Up to 3 results formatted as "Title / Snippet / Link" paragraphs,
             or a not-found message when the search returns nothing.
    """
    print(f"--- TOOL: Executing Google Search for: '{query}' ---")
    google_search = GoogleSearchAPIWrapper(google_api_key=GOOGLE_API_KEY, google_cse_id=GOOGLE_CSE_ID)
    # BUG FIX: .results() returns a list of dicts, but the declared return
    # type (and the LLM consuming this tool) expects a string — format it.
    results = google_search.results(query, num_results=3)
    if not results:
        return f"No Google results found for query '{query}'."
    formatted = [
        f"Title: {r.get('title', 'N/A')}\n"
        f"Snippet: {r.get('snippet', 'N/A')}\n"
        f"Link: {r.get('link', 'N/A')}"
        for r in results
    ]
    return "\n\n".join(formatted)
# --- Wikipedia Search Tool ---
@tool("wikipedia_search_tool")
def wikipedia_search_tool(query_or_url: str, max_results: int = 1) -> str:
    """
    Fetches content from a Wikipedia page. This tool is dual-purpose:
    1. If the input is a search query, it finds the most relevant Wikipedia page and returns its full content.
    2. If the input is a full Wikipedia URL, it directly fetches and returns the content of that page.
    This is the preferred tool for all interactions with Wikipedia.

    Args:
        query_or_url (str): A search query (e.g., "Mercedes Sosa discography") or a full Wikipedia URL.
        max_results (int): How many candidate pages to request when the input is
            a search query; the best (first) match is loaded. Defaults to 1.

    Returns:
        str: The page title, URL and extracted content (capped at 20k chars),
             or an "Error: ..." string on failure.
    """
    print(f"--- WIKIPEDIA TOOL (Dual-Purpose): Input is '{query_or_url}' ---")
    wikipedia.set_lang("en")
    page_title = ""
    try:
        # --- Decide how to interpret the input ---
        if query_or_url.startswith("http://") or query_or_url.startswith("https://"):
            # Case 1: the input is a URL — take the last path segment as the title.
            raw_title = query_or_url.split('/')[-1]
            # Decode percent-escapes (e.g. %C4%85 -> ą) and turn underscores into spaces.
            page_title = unquote(raw_title).replace('_', ' ')
            print(f"Input is a URL. Decoded page title: '{page_title}'")
        else:
            # Case 2: the input is a search query — resolve it to the best page title.
            print("Input is a search query. Finding best page...")
            # BUG FIX: `max_results` was accepted but never used; it now controls
            # how many candidates the search returns (the first is still used).
            search_results = wikipedia.search(query_or_url, results=max_results)
            if not search_results:
                return f"Error: No Wikipedia page found for query '{query_or_url}'."
            page_title = search_results[0]
        page = wikipedia.page(page_title, auto_suggest=False, redirect=True)
        # --- HTML extraction and parsing ---
        print(f"Fetching HTML for page: '{page.title}'")
        html_content_str = page.html()
        # Unstructured's HTML loader is file-based, so stage the HTML in a
        # temporary file and make sure it is always removed afterwards.
        with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix=".html", encoding='utf-8') as temp_file:
            temp_file.write(html_content_str)
            temp_filepath = temp_file.name
        try:
            loader = UnstructuredHTMLLoader(temp_filepath, strategy="fast")
            docs = loader.load()
        finally:
            os.remove(temp_filepath)
        if not docs:
            return f"Content from Wikipedia page '{page.title}': Could not extract any content."
        page_content = docs[0].page_content
        # Cap the payload at 20k characters to keep the LLM context manageable.
        formatted_output = f"Content from Wikipedia page: '{page.title}'\nURL: {page.url}\n\n{page_content[:20000]}"
        return formatted_output
    except wikipedia.exceptions.DisambiguationError as e:
        return f"Error: Your query '{query_or_url}' is ambiguous. Options: {e.options[:5]}"
    except wikipedia.exceptions.PageError:
        return f"Error: Could not find or load the Wikipedia page for title derived from '{query_or_url}'."
    except Exception as e:
        return f"An unexpected error occurred in the Wikipedia tool: {e}"
# --- Web Browsing Tool ---
@tool("browse_web_page_tool")
def browse_web_page_tool(url: str) -> str:
    """
    Navigates a web page using a headless browser, then uses Unstructured to extract
    the full, clean content, including text and tables.

    Args:
        url (str): The full URL of the page to browse and extract content from.

    Returns:
        str: The extracted page content (capped at 20k characters), or an
             error message if navigation, loading, or extraction failed.
    """
    print(f"--- TOOL: Browsing and extracting from: {url} ---")
    try:
        # 1. Use Playwright to render the page and capture the full HTML.
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                page = browser.new_page()
                page.goto(url, timeout=30000, wait_until="domcontentloaded")
                html_content = page.content()
            finally:
                # FIX: close the browser even when goto()/content() raises,
                # so a timeout does not leak a running Chromium instance.
                browser.close()
        # 2. Stage the HTML in a temporary file on disk (Unstructured's loader
        #    is file-based) and always clean it up afterwards.
        #    (Previous comment claimed this was in-memory — it is not.)
        with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix=".html", encoding='utf-8') as temp_file:
            temp_file.write(html_content)
            temp_filepath = temp_file.name
        try:
            # 3. Load and parse the HTML with UnstructuredHTMLLoader.
            loader = UnstructuredHTMLLoader(temp_filepath, strategy="fast")
            docs = loader.load()
        finally:
            # Always delete the temporary file.
            os.remove(temp_filepath)
        # 4. Format the output.
        if not docs:
            return f"Content from URL '{url}': Could not extract any content using Unstructured."
        # Unstructured usually returns everything as a single document.
        page_content = docs[0].page_content
        formatted_output = f"Content from URL: '{url}'\n\n{page_content[:20000]}"
        return formatted_output
    except TimeoutError:
        # Playwright's TimeoutError (imported above), not the builtin.
        return f"Error browsing '{url}': The page took too long to load and timed out."
    except Exception as e:
        return f"An unexpected error occurred while browsing '{url}': {e}"
# --- Web Content Analysis Tool ---
@tool("text_analyzer_tool")
def text_analyzer_tool(text_to_analyze: str, question: str) -> str:
    """
    Analyzes a given text to answer a specific question or extract information.
    Use this tool when you have already gathered content (e.g., from browsing a page)
    and need to find a specific answer within that text.

    Args:
        text_to_analyze (str): The text content to be analyzed.
        question (str): The specific question to answer based on the text.
    """
    print(f"--- TOOL: Analyzing text to answer: '{question}' ---")
    # Delegate the analysis to a deterministic (temperature=0) LLM call.
    analysis_prompt = f"""
    You are a text analysis expert. Your task is to carefully read the provided text and answer the user's question based ONLY on that text.
    Provide a concise and direct answer.
    **Text to Analyze:**
    ---
    {text_to_analyze}
    ---
    **Question to Answer:**
    "{question}"
    Your concise answer:
    """
    llm = ChatOpenAI(model=OPENAI_API_MODEL, temperature=0)
    return llm.invoke(analysis_prompt).content