File size: 20,214 Bytes

5374a2d

import os
import requests
from typing import Dict, Any, Optional, List
from pydantic import Field
from .search_base import SearchBase
from .tool import Tool, Toolkit
from evoagentx.core.logging import logger
import dotenv

dotenv.load_dotenv()

class SearchSerpAPI(SearchBase):
    """
    SerpAPI search tool that provides access to multiple search engines including
    Google, Bing, Baidu, Yahoo, and DuckDuckGo through a unified interface.
    """
    
    api_key: Optional[str] = Field(default=None, description="SerpAPI authentication key")
    default_engine: Optional[str] = Field(default="google", description="Default search engine")
    default_location: Optional[str] = Field(default=None, description="Default geographic location")
    default_language: Optional[str] = Field(default="en", description="Default interface language")
    default_country: Optional[str] = Field(default="us", description="Default country code")
    enable_content_scraping: Optional[bool] = Field(default=True, description="Enable full content scraping")
    
    def __init__(
        self,
        name: str = "SearchSerpAPI",
        num_search_pages: Optional[int] = 5,
        max_content_words: Optional[int] = None,
        api_key: Optional[str] = None,
        default_engine: Optional[str] = "google",
        default_location: Optional[str] = None,
        default_language: Optional[str] = "en",
        default_country: Optional[str] = "us",
        enable_content_scraping: Optional[bool] = True,
        **kwargs
    ):
        """
        Initialize the SerpAPI Search tool.
        
        Args:
            name (str): Name of the tool
            num_search_pages (int): Number of search results to retrieve
            max_content_words (int): Maximum number of words to include in content
            api_key (str): SerpAPI authentication key (can also use SERPAPI_KEY env var)
            default_engine (str): Default search engine (google, bing, baidu, yahoo, duckduckgo)
            default_location (str): Default geographic location for searches
            default_language (str): Default interface language
            default_country (str): Default country code
            enable_content_scraping (bool): Whether to scrape full page content
            **kwargs: Additional keyword arguments for parent class initialization
        """
        super().__init__(
            name=name,
            num_search_pages=num_search_pages,
            max_content_words=max_content_words,
            api_key=api_key,
            default_engine=default_engine,
            default_location=default_location,
            default_language=default_language,
            default_country=default_country,
            enable_content_scraping=enable_content_scraping,
            **kwargs
        )
        
        # Get API key from parameter or environment variable
        self.api_key = api_key or os.getenv('SERPAPI_KEY', '')
        self.base_url = "https://serpapi.com/search.json"
        
        if not self.api_key:
            logger.warning("SerpAPI key not found. Set SERPAPI_KEY environment variable or pass api_key parameter.")

    def _build_serpapi_params(self, query: str, engine: str = None, location: str = None, 
                             language: str = None, country: str = None, search_type: str = None,
                             num_results: int = None) -> Dict[str, Any]:
        """
        Build SerpAPI request parameters.
        
        Args:
            query (str): Search query
            engine (str): Search engine to use
            location (str): Geographic location
            language (str): Interface language
            country (str): Country code
            search_type (str): Type of search (web, images, news, shopping, maps)
            num_results (int): Number of results to retrieve
            
        Returns:
            Dict[str, Any]: SerpAPI request parameters
        """
        params = {
            "q": query,
            "api_key": self.api_key,
            "num": num_results or self.num_search_pages,
        }
        
        # Add optional parameters if provided
        if location or self.default_location:
            params["location"] = location or self.default_location
            
        if language or self.default_language:
            params["hl"] = language or self.default_language
            
        if country or self.default_country:
            params["gl"] = country or self.default_country
            
        # Handle different search types for Google
        if search_type and search_type != "web":
            search_type_map = {
                "images": "isch",
                "news": "nws", 
                "shopping": "shop",
                "maps": "lcl"
            }
            if search_type in search_type_map:
                params["tbm"] = search_type_map[search_type]
        
        return params

    def _execute_serpapi_search(self, params: Dict[str, Any]) -> Dict[str, Any]:
        """
        Execute search using direct HTTP requests to SerpAPI.
        
        Args:
            params (Dict[str, Any]): Search parameters
            
        Returns:
            Dict[str, Any]: SerpAPI response data
            
        Raises:
            Exception: For API errors
        """
        try:
            response = requests.get(self.base_url, params=params, timeout=30)
            response.raise_for_status()
            
            data = response.json()
            
            # Check for SerpAPI errors in response
            if "error" in data:
                raise Exception(f"SerpAPI error: {data['error']}")
                
            return data
            
        except requests.exceptions.RequestException as e:
            raise Exception(f"SerpAPI request failed: {str(e)}")
        except Exception as e:
            raise Exception(f"SerpAPI search failed: {str(e)}")

    def _process_serpapi_results(self, serpapi_data: Dict[str, Any], max_content_words: int = None) -> Dict[str, Any]:
        """
        Process SerpAPI results into structured format with processed results + raw data.
        
        Args:
            serpapi_data (Dict[str, Any]): Raw SerpAPI response
            max_content_words (int): Maximum words per result content
            
        Returns:
            Dict[str, Any]: Structured response with processed results and raw data
        """
        processed_results = []
        
        # 1. Process Knowledge Graph (highest priority)
        if knowledge_graph := serpapi_data.get("knowledge_graph", {}):
            if description := knowledge_graph.get("description"):
                title = knowledge_graph.get("title", "Unknown")
                content = f"**{title}**"
                
                # Add type if available
                if kg_type := knowledge_graph.get("type"):
                    content += f" ({kg_type})"
                content += f"\n\n{description}"
                
                # Add key attributes if available
                if kg_list := knowledge_graph.get("list", {}):
                    content += "\n\n**Key Information:**"
                    for key, value in list(kg_list.items())[:5]:  # Limit to 5 attributes
                        if isinstance(value, list) and value:
                            formatted_key = key.replace('_', ' ').title()
                            formatted_value = ', '.join(str(v) for v in value[:3])  # Max 3 values
                            content += f"\n• {formatted_key}: {formatted_value}"
                
                processed_results.append({
                    "title": f"Knowledge: {title}",
                    "content": self._truncate_content(content, max_content_words or 200),
                    "url": knowledge_graph.get("source", {}).get("link", ""),
                    "type": "knowledge_graph",
                    "priority": 1
                })
        
        # 2. Process Organic Results with scraping
        for item in serpapi_data.get("organic_results", []):
            url = item.get("link", "")
            title = item.get("title", "No Title")
            snippet = item.get("snippet", "")
            position = item.get("position", 0)
            
            # Prepare the result dict
            result = {
                "title": title,
                "content": self._truncate_content(snippet, max_content_words or 400),
                "url": url,
                "type": "organic",
                "priority": 2,
                "position": position
            }
            
            # Try to scrape full content if enabled and add as site_content
            if self.enable_content_scraping and url and url.startswith(('http://', 'https://')):
                try:
                    scraped_title, scraped_content = self._scrape_page(url)
                    if scraped_content and scraped_content.strip():
                        # Update title if scraped title is better
                        if scraped_title and scraped_title.strip():
                            result["title"] = scraped_title
                        # Add scraped content as site_content
                        result["site_content"] = self._truncate_content(scraped_content, max_content_words or 400)
                    else:
                        result["site_content"] = None
                except Exception as e:
                    logger.debug(f"Content scraping failed for {url}: {str(e)}")
                    result["site_content"] = None
            else:
                result["site_content"] = None
            
            # Only include results that have either snippet or scraped content
            if snippet or result.get("site_content"):
                processed_results.append(result)
        
        # 3. Collect raw data sections for LLM processing
        raw_data = {}
        raw_sections = [
            "local_results", "news_results", "shopping_results", 
            "related_questions", "recipes_results", "images_results"
        ]
        
        for section in raw_sections:
            if section in serpapi_data and serpapi_data[section]:
                # Limit raw data to prevent overwhelming response
                if section == "local_results":
                    # Local results have nested structure
                    places = serpapi_data[section].get("places", [])[:3]
                    if places:
                        raw_data[section] = {"places": places}
                else:
                    # Other sections are arrays
                    raw_data[section] = serpapi_data[section][:3]
        
        # 4. Extract search metadata
        search_metadata = {}
        if search_meta := serpapi_data.get("search_metadata", {}):
            search_metadata = {
                "query": search_meta.get("query", ""),
                "location": search_meta.get("location", ""),
                "total_results": search_meta.get("total_results", ""),
                "search_time": search_meta.get("total_time_taken", "")
            }
        
        # Sort processed results by priority and position
        processed_results.sort(key=lambda x: (x.get("priority", 999), x.get("position", 0)))
        
        return {
            "results": processed_results,
            "raw_data": raw_data if raw_data else None,
            "search_metadata": search_metadata if search_metadata else None,
            "error": None
        }

    def _handle_api_errors(self, error: Exception) -> str:
        """
        Handle SerpAPI specific errors with appropriate messages.
        
        Args:
            error (Exception): The exception that occurred
            
        Returns:
            str: User-friendly error message
        """
        error_str = str(error).lower()
        
        if "api key" in error_str or "unauthorized" in error_str:
            return "Invalid or missing SerpAPI key. Please set SERPAPI_KEY environment variable."
        elif "rate limit" in error_str or "too many requests" in error_str:
            return "SerpAPI rate limit exceeded. Please try again later."
        elif "quota" in error_str or "credit" in error_str:
            return "SerpAPI quota exceeded. Please check your plan limits."
        elif "timeout" in error_str:
            return "SerpAPI request timeout. Please try again."
        else:
            return f"SerpAPI error: {str(error)}"

    def search(self, query: str, num_search_pages: int = None, max_content_words: int = None,
               engine: str = None, location: str = None, language: str = None, 
               country: str = None, search_type: str = None) -> Dict[str, Any]:
        """
        Search using SerpAPI with comprehensive parameter support.
        
        Args:
            query (str): The search query
            num_search_pages (int): Number of search results to retrieve
            max_content_words (int): Maximum number of words to include in content
            engine (str): Search engine (google, bing, baidu, yahoo, duckduckgo)
            location (str): Geographic location for localized results
            language (str): Interface language (e.g., 'en', 'es', 'fr')
            country (str): Country code for country-specific results (e.g., 'us', 'uk')
            search_type (str): Type of search (web, images, news, shopping, maps)
            
        Returns:
            Dict[str, Any]: Contains search results and optional error message
        """
        # Use instance defaults if parameters not provided
        num_search_pages = num_search_pages or self.num_search_pages
        max_content_words = max_content_words or self.max_content_words
        
        if not self.api_key:
            error_msg = (
                "SerpAPI key is required. Please set SERPAPI_KEY environment variable "
                "or pass api_key parameter. Get your key from: https://serpapi.com/"
            )
            logger.error(error_msg)
            return {"results": [], "raw_data": None, "search_metadata": None, "error": error_msg}
        
        try:
            search_engine = engine or self.default_engine
            logger.info(f"Searching {search_engine} via SerpAPI: {query}, "
                       f"num_results={num_search_pages}, max_content_words={max_content_words}")
            
            # Build request parameters
            params = self._build_serpapi_params(
                query=query,
                engine=search_engine,
                location=location,
                language=language,
                country=country,
                search_type=search_type,
                num_results=num_search_pages
            )
            
            # Execute search using direct HTTP request
            serpapi_data = self._execute_serpapi_search(params)
            
            # Process results
            response_data = self._process_serpapi_results(serpapi_data, max_content_words)
            
            logger.info(f"Successfully retrieved {len(response_data['results'])} processed results")
            return response_data
            
        except Exception as e:
            error_msg = self._handle_api_errors(e)
            logger.error(f"SerpAPI search failed: {error_msg}")
            return {"results": [], "raw_data": None, "search_metadata": None, "error": error_msg}


class SerpAPITool(Tool):
    name: str = "serpapi_search"
    description: str = "Search multiple search engines using SerpAPI with comprehensive result processing and content scraping"
    inputs: Dict[str, Dict[str, str]] = {
        "query": {
            "type": "string",
            "description": "The search query to execute"
        },
        "num_search_pages": {
            "type": "integer", 
            "description": "Number of search results to retrieve. Default: 5"
        },
        "max_content_words": {
            "type": "integer",
            "description": "Maximum number of words to include in content per result. None means no limit. Default: None"
        },
        "engine": {
            "type": "string",
            "description": "Search engine to use: google, bing, baidu, yahoo, duckduckgo. Default: google"
        },
        "location": {
            "type": "string", 
            "description": "Geographic location for localized results (e.g., 'New York, NY', 'London, UK')"
        },
        "language": {
            "type": "string",
            "description": "Interface language code (e.g., 'en', 'es', 'fr', 'de'). Default: en"
        },
        "country": {
            "type": "string",
            "description": "Country code for country-specific results (e.g., 'us', 'uk', 'ca'). Default: us"
        },
        "search_type": {
            "type": "string",
            "description": "Type of search: web, images, news, shopping, maps. Default: web"
        }
    }
    required: Optional[List[str]] = ["query"]
    
    def __init__(self, search_serpapi: SearchSerpAPI = None):
        super().__init__()
        self.search_serpapi = search_serpapi
    
    def __call__(self, query: str, num_search_pages: int = None, max_content_words: int = None,
                 engine: str = None, location: str = None, language: str = None, 
                 country: str = None, search_type: str = None) -> Dict[str, Any]:
        """Execute SerpAPI search using the SearchSerpAPI instance."""
        if not self.search_serpapi:
            raise RuntimeError("SerpAPI search instance not initialized")
        
        try:
            return self.search_serpapi.search(
                query=query,
                num_search_pages=num_search_pages,
                max_content_words=max_content_words,
                engine=engine,
                location=location,
                language=language,
                country=country,
                search_type=search_type
            )
        except Exception as e:
            return {"results": [], "error": f"Error executing SerpAPI search: {str(e)}"}


class SerpAPIToolkit(Toolkit):
    def __init__(
        self,
        name: str = "SerpAPIToolkit",
        api_key: Optional[str] = None,
        num_search_pages: Optional[int] = 5,
        max_content_words: Optional[int] = None,
        default_engine: Optional[str] = "google",
        default_location: Optional[str] = None,
        default_language: Optional[str] = "en",
        default_country: Optional[str] = "us",
        enable_content_scraping: Optional[bool] = True,
        **kwargs
    ):
        """
        Initialize SerpAPI Toolkit.
        
        Args:
            name (str): Name of the toolkit
            api_key (str): SerpAPI authentication key
            num_search_pages (int): Default number of search results to retrieve
            max_content_words (int): Default maximum words per result content
            default_engine (str): Default search engine
            default_location (str): Default geographic location
            default_language (str): Default interface language
            default_country (str): Default country code
            enable_content_scraping (bool): Whether to enable content scraping
            **kwargs: Additional keyword arguments
        """
        # Create the shared SerpAPI search instance
        search_serpapi = SearchSerpAPI(
            name="SearchSerpAPI",
            api_key=api_key,
            num_search_pages=num_search_pages,
            max_content_words=max_content_words,
            default_engine=default_engine,
            default_location=default_location,
            default_language=default_language,
            default_country=default_country,
            enable_content_scraping=enable_content_scraping,
            **kwargs
        )
        
        # Create tools with the shared search instance
        tools = [
            SerpAPITool(search_serpapi=search_serpapi)
        ]
        
        # Initialize parent with tools
        super().__init__(name=name, tools=tools)
        
        # Store search_serpapi as instance variable
        self.search_serpapi = search_serpapi