Spaces:

BrightData
/

brightdata-search-tool

Sleeping

File size: 3,566 Bytes

from __future__ import annotations

import json
import os
from typing import Any, Dict, List, Tuple

import requests
from smolagents.tools import Tool


class BrightDataSearchTool(Tool):
    """smolagents tool that runs a web search through Bright Data's request API.

    The search-engine results page is fetched by POSTing its URL to
    ``https://api.brightdata.com/request``. Google is requested with
    ``brd_json=1`` and its JSON response is trimmed to a few sections;
    Bing and Yandex are requested with ``data_format="markdown"`` and the
    response text is returned unchanged.

    Configuration is read from the environment on every call:
    ``BRIGHT_DATA_API_TOKEN`` (required) and ``BRIGHT_DATA_UNLOCKER_ZONE``
    (optional, defaults to ``web_unlocker1``).
    """

    name = "brightdata_search_engine"
    description = """
    Search Google, Bing, or Yandex and get structured results.
    Returns search results with URLs, titles, and descriptions.
    Ideal for gathering current information and news.
    """
    output_type = "string"

    # Supported engine identifiers; also published as the "enum" of the
    # "engine" input below.
    engines: Tuple[str, ...] = ("google", "bing", "yandex")

    def __init__(self) -> None:
        # The inputs schema is assigned before super().__init__() so that it
        # is already in place when the base class initializer runs.
        self.inputs = {
            "query": {
                "type": "string",
                "description": "The search query",
            },
            "engine": {
                "type": "string",
                "description": "Search engine to use",
                "enum": list(self.engines),
                "nullable": True,
                "default": "google",
            },
        }
        super().__init__()

    def forward(self, query: str, engine: str = "google") -> str:
        """
        Search using Bright Data's search API.

        Args:
            query: The search query.
            engine: Search engine to use (google, bing, or yandex).

        Returns:
            JSON string with search results or markdown for non-Google engines.
            Request failures and undecodable Google responses are returned as a
            JSON object with a single "error" key instead of being raised.

        Raises:
            ValueError: If BRIGHT_DATA_API_TOKEN is not set.
        """
        api_token = os.getenv("BRIGHT_DATA_API_TOKEN")
        unlocker_zone = os.getenv("BRIGHT_DATA_UNLOCKER_ZONE", "web_unlocker1")

        if not api_token:
            raise ValueError("BRIGHT_DATA_API_TOKEN not found in environment variables")

        # "engine" is declared nullable in the inputs schema, so None can
        # arrive here; None, empty and unknown values all fall back to Google.
        normalized_engine = (engine or "google").strip().lower()
        if normalized_engine not in self.engines:
            normalized_engine = "google"

        search_url = self._build_search_urls(query)[normalized_engine]
        is_google = normalized_engine == "google"

        api_url = "https://api.brightdata.com/request"
        headers = {
            "Authorization": f"Bearer {api_token}",
            "Content-Type": "application/json",
        }

        payload = {
            "url": search_url,
            "zone": unlocker_zone,
            "format": "raw",
        }

        # Google is asked for JSON through the brd_json=1 URL flag; the other
        # engines are asked for a markdown rendering of the results page.
        if not is_google:
            payload["data_format"] = "markdown"

        # Keep the try body limited to the network call and JSON decoding.
        try:
            response = requests.post(api_url, json=payload, headers=headers, timeout=120)
            response.raise_for_status()

            if not is_google:
                return response.text

            data = response.json()
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers a non-JSON body on requests releases whose
            # JSON decode error is not a RequestException subclass.
            return json.dumps({"error": str(e)})

        return self._format_google_results(data)

    def _build_search_urls(self, query: str) -> Dict[str, str]:
        """Return the results-page URL of every supported engine for ``query``.

        The query is percent-encoded before being placed in the query string.
        The returned dict is keyed by the names listed in ``engines``.
        """
        encoded_query = requests.utils.quote(query)
        return {
            "google": f"https://www.google.com/search?q={encoded_query}&brd_json=1",
            "bing": f"https://www.bing.com/search?q={encoded_query}",
            "yandex": f"https://yandex.com/search/?text={encoded_query}",
        }

    def _format_google_results(self, data: Dict[str, Any]) -> str:
        """Reduce a decoded Google response to selected sections, as JSON text.

        Keeps ``organic``, ``related`` and ``ai_overview`` as received and
        flattens ``images`` to a list of link strings. The result is
        serialized with two-space indentation.
        """
        results: Dict[str, Any] = {
            "organic": data.get("organic", []),
            # "images" may be present but null; treat that like "no images".
            "images": self._extract_image_links(data.get("images") or []),
            "related": data.get("related", []),
            "ai_overview": data.get("ai_overview"),
        }
        return json.dumps(results, indent=2)

    def _extract_image_links(self, images: List[Dict[str, Any]]) -> List[str]:
        """Return the non-empty ``link`` value of each image entry.

        Entries that are not dicts, or that have no truthy ``link``, are
        skipped.
        """
        return [
            img["link"]
            for img in images
            if isinstance(img, dict) and img.get("link")
        ]