Spaces:

BrightData
/

brightdata-scraper-tool

Sleeping

App Files Files Community

BrightData committed on Dec 7, 2025

Commit

fbd126d

verified ·

1 Parent(s): 9f0cf15

Add Bright Data Scraper Tool

Browse files

Files changed (3) hide show

app.py +5 -0
requirements.txt +2 -0
tool.py +56 -0

app.py ADDED Viewed

	@@ -0,0 +1,5 @@

+from smolagents import launch_gradio_demo
+from tool import BrightDataScraperTool
+# Build the scraper tool and hand it to smolagents' Gradio demo launcher.
+tool = BrightDataScraperTool()
+launch_gradio_demo(tool)

requirements.txt ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ requests
2	+ smolagents

tool.py ADDED Viewed

	@@ -0,0 +1,56 @@

+from typing import Any, Optional
+from smolagents.tools import Tool
+import requests
+import os
+class BrightDataScraperTool(Tool):
+    name = "brightdata_web_scraper"
+    description = """
+    Scrape any webpage and return content in Markdown format.
+    This tool can bypass bot detection and CAPTCHAs.
+    Use this when you need to extract content from websites.
+    """
+    inputs = {'url': {'type': 'string', 'description': 'The URL of the webpage to scrape'}}
+    output_type = "string"
+    def forward(self, url: str) -> str:
+        """
+        Scrape a webpage using Bright Data's API.
+        Args:
+            url: The URL to scrape
+        Returns:
+            The scraped content in Markdown format
+        """
+        import os
+        import requests
+        api_token = os.getenv("BRIGHT_DATA_API_TOKEN")
+        unlocker_zone = os.getenv("BRIGHT_DATA_UNLOCKER_ZONE", "web_unlocker_1")
+        if not api_token:
+            raise ValueError("BRIGHT_DATA_API_TOKEN not found in environment variables")
+        api_url = "https://api.brightdata.com/request"
+        headers = {
+            "Authorization": f"Bearer {api_token}",
+            "Content-Type": "application/json",
+        }
+        payload = {
+            "url": url,
+            "zone": unlocker_zone,
+            "format": "raw",
+            "data_format": "markdown",
+        }
+        try:
+            response = requests.post(api_url, json=payload, headers=headers)
+            response.raise_for_status()
+            return response.text
+        except requests.exceptions.RequestException as e:
+            return f"Error scraping URL: {str(e)}"
+    def __init__(self, *args, **kwargs):
+        self.is_initialized = False