Spaces:
Sleeping
Sleeping
File size: 3,566 Bytes
e5e2636 3567fa5 e5e2636 fc420de e5e2636 b78b32a e5e2636 8bb17bf e5e2636 8bb17bf b78b32a e5e2636 b78b32a e5e2636 b78b32a e5e2636 b78b32a e5e2636 b78b32a d624fb3 e5e2636 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 |
from __future__ import annotations
import json
import os
from typing import Any, Dict, List, Tuple
import requests
from smolagents.tools import Tool
class BrightDataSearchTool(Tool):
    """Tool that runs a web search through Bright Data's request API.

    Google searches are requested with ``brd_json=1`` and the parsed JSON is
    reduced to a small set of fields. Bing and Yandex searches are requested
    with ``data_format="markdown"`` and the response body is returned as-is.
    """

    name = "brightdata_search_engine"
    description = """
    Search Google, Bing, or Yandex and get structured results.
    Returns search results with URLs, titles, and descriptions.
    Ideal for gathering current information and news.
    """
    output_type = "string"
    # Supported engines; any other value falls back to "google" in forward().
    engines: Tuple[str, ...] = ("google", "bing", "yandex")

    def __init__(self) -> None:
        # The input schema is assigned before the base-class initializer runs.
        self.inputs = {
            "query": {
                "type": "string",
                "description": "The search query",
            },
            "engine": {
                "type": "string",
                "description": "Search engine to use",
                "enum": list(self.engines),
                "nullable": True,
                "default": "google",
            },
        }
        super().__init__()

    def forward(self, query: str, engine: str = "google") -> str:
        """
        Search using Bright Data's search API.

        Args:
            query: The search query.
            engine: Search engine to use (google, bing, or yandex). ``None``,
                non-string values and unknown engines fall back to google.

        Returns:
            JSON string with search results for Google, markdown text for the
            other engines, or a JSON object with an ``error`` key when the
            request fails or the Google response is not valid JSON.

        Raises:
            ValueError: If BRIGHT_DATA_API_TOKEN is not set in the environment.
        """
        api_token = os.getenv("BRIGHT_DATA_API_TOKEN")
        if not api_token:
            raise ValueError("BRIGHT_DATA_API_TOKEN not found in environment variables")
        unlocker_zone = os.getenv("BRIGHT_DATA_UNLOCKER_ZONE", "web_unlocker1")

        # The "engine" input is declared nullable, so None must not reach .strip().
        normalized_engine = engine.strip().lower() if isinstance(engine, str) else "google"
        if normalized_engine not in self.engines:
            normalized_engine = "google"
        is_google = normalized_engine == "google"

        api_url = "https://api.brightdata.com/request"
        headers = {
            "Authorization": f"Bearer {api_token}",
            "Content-Type": "application/json",
        }
        payload = {
            "url": self._build_search_urls(query)[normalized_engine],
            "zone": unlocker_zone,
            "format": "raw",
        }
        if not is_google:
            payload["data_format"] = "markdown"

        try:
            response = requests.post(api_url, json=payload, headers=headers, timeout=120)
            response.raise_for_status()
            # Only the Google URL asks for JSON (brd_json=1), so only that body is parsed.
            data = response.json() if is_google else None
        except (requests.exceptions.RequestException, ValueError) as e:
            # ValueError covers requests versions whose .json() decode error
            # is not a RequestException subclass.
            return json.dumps({"error": str(e)})

        if is_google:
            return self._format_google_results(data)
        return response.text

    def _build_search_urls(self, query: str) -> Dict[str, str]:
        """Return the search URL for every supported engine, keyed by engine name."""
        encoded_query = requests.utils.quote(query)
        return {
            "google": f"https://www.google.com/search?q={encoded_query}&brd_json=1",
            "bing": f"https://www.bing.com/search?q={encoded_query}",
            "yandex": f"https://yandex.com/search/?text={encoded_query}",
        }

    def _format_google_results(self, data: Dict[str, Any]) -> str:
        """Serialize the organic, images, related and ai_overview fields of *data*.

        Missing or null list fields become empty lists. A payload that is not
        a JSON object yields a JSON object with an ``error`` key instead.
        """
        if not isinstance(data, dict):
            return json.dumps({"error": "Unexpected response format from Bright Data"})
        results: Dict[str, Any] = {
            "organic": data.get("organic") or [],
            "images": self._extract_image_links(data.get("images") or []),
            "related": data.get("related") or [],
            "ai_overview": data.get("ai_overview"),
        }
        return json.dumps(results, indent=2)

    def _extract_image_links(self, images: List[Dict[str, Any]]) -> List[str]:
        """Return the non-empty ``link`` values from *images*, skipping non-dict entries."""
        return [
            img["link"]
            for img in images or []
            if isinstance(img, dict) and img.get("link")
        ]
|