| from typing import Any, Optional |
| from smolagents.tools import Tool |
| import requests |
| import os |
|
|
class BrightDataScraperTool(Tool):
    """Tool that fetches a webpage through Bright Data's request API.

    The page is requested via the zone named by the
    ``BRIGHT_DATA_UNLOCKER_ZONE`` environment variable (default
    ``"web_unlocker_1"``) and the response body is returned as text, with
    Markdown requested as the data format.
    """

    name = "brightdata_web_scraper"
    description = """
    Scrape any webpage and return content in Markdown format.
    This tool can bypass bot detection and CAPTCHAs.
    Use this when you need to extract content from websites.
    """
    inputs = {'url': {'type': 'string', 'description': 'The URL of the webpage to scrape'}}
    output_type = "string"

    def forward(self, url: str) -> str:
        """Scrape a webpage using Bright Data's API.

        Args:
            url: The URL to scrape.

        Returns:
            The response body returned by the API (Markdown is requested),
            or a string starting with ``"Error scraping URL: "`` when the
            HTTP request fails, times out, or returns an error status.

        Raises:
            ValueError: If ``BRIGHT_DATA_API_TOKEN`` is not set in the
                environment.
        """
        # Imported at function scope so the method stays self-contained.
        import os
        import requests

        api_token = os.getenv("BRIGHT_DATA_API_TOKEN")
        unlocker_zone = os.getenv("BRIGHT_DATA_UNLOCKER_ZONE", "web_unlocker_1")

        if not api_token:
            raise ValueError("BRIGHT_DATA_API_TOKEN not found in environment variables")

        api_url = "https://api.brightdata.com/request"
        headers = {
            "Authorization": f"Bearer {api_token}",
            "Content-Type": "application/json",
        }
        payload = {
            "url": url,
            "zone": unlocker_zone,
            "format": "raw",
            "data_format": "markdown",
        }

        # (connect, read) timeouts in seconds. Without a timeout, requests
        # waits indefinitely on a stalled connection, which would hang the
        # calling agent. The read timeout is generous because the upstream
        # fetch of the target page can be slow.
        timeout = (10, 120)

        try:
            response = requests.post(
                api_url, json=payload, headers=headers, timeout=timeout
            )
            response.raise_for_status()
            return response.text
        except requests.exceptions.RequestException as e:
            # Covers connection errors, timeouts and non-2xx statuses; the
            # failure is reported as text instead of being raised.
            return f"Error scraping URL: {str(e)}"

    def __init__(self, *args, **kwargs):
        """Create the tool; positional and keyword arguments are ignored."""
        # NOTE(review): the base-class initializer is not called here;
        # presumably this flag is all the base class needs — confirm.
        self.is_initialized = False
|
|