BrightData committed on
Commit
fbd126d
·
verified ·
1 Parent(s): 9f0cf15

Add Bright Data Scraper Tool

Browse files
Files changed (3) hide show
  1. app.py +5 -0
  2. requirements.txt +2 -0
  3. tool.py +56 -0
app.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
from smolagents import launch_gradio_demo
from tool import BrightDataScraperTool

# Expose the Bright Data scraper tool through smolagents'
# auto-generated Gradio demo UI.
scraper_tool = BrightDataScraperTool()
launch_gradio_demo(scraper_tool)
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ requests
2
+ smolagents
tool.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any, Optional
2
+ from smolagents.tools import Tool
3
+ import requests
4
+ import os
5
+
6
class BrightDataScraperTool(Tool):
    """Scrape a webpage through Bright Data's Web Unlocker API and return Markdown.

    Reads the API token from the ``BRIGHT_DATA_API_TOKEN`` environment
    variable; the unlocker zone defaults to ``web_unlocker_1`` and can be
    overridden via ``BRIGHT_DATA_UNLOCKER_ZONE``.
    """

    name = "brightdata_web_scraper"
    description = """
    Scrape any webpage and return content in Markdown format.
    This tool can bypass bot detection and CAPTCHAs.
    Use this when you need to extract content from websites.
    """
    inputs = {'url': {'type': 'string', 'description': 'The URL of the webpage to scrape'}}
    output_type = "string"

    def __init__(self, *args, **kwargs):
        # Bug fix: the original never called super().__init__(), so the
        # smolagents Tool base-class setup (attribute/IO-spec validation)
        # was skipped entirely.
        super().__init__(*args, **kwargs)
        self.is_initialized = False

    def forward(self, url: str) -> str:
        """
        Scrape a webpage using Bright Data's API.

        Args:
            url: The URL to scrape

        Returns:
            The scraped content in Markdown format, or an
            "Error scraping URL: ..." string if the HTTP request fails.

        Raises:
            ValueError: if BRIGHT_DATA_API_TOKEN is not set in the environment.
        """
        # Note: the redundant function-local `import os` / `import requests`
        # were removed; both modules are already imported at module level.
        api_token = os.getenv("BRIGHT_DATA_API_TOKEN")
        unlocker_zone = os.getenv("BRIGHT_DATA_UNLOCKER_ZONE", "web_unlocker_1")

        if not api_token:
            raise ValueError("BRIGHT_DATA_API_TOKEN not found in environment variables")

        api_url = "https://api.brightdata.com/request"
        headers = {
            "Authorization": f"Bearer {api_token}",
            "Content-Type": "application/json",
        }

        # "raw" + "markdown" asks Bright Data to return the rendered page
        # body directly as Markdown text rather than a JSON envelope.
        payload = {
            "url": url,
            "zone": unlocker_zone,
            "format": "raw",
            "data_format": "markdown",
        }

        try:
            # Bug fix: added a timeout so a stalled connection cannot hang
            # the agent forever; a timeout raises a RequestException and is
            # reported through the existing error path below.
            response = requests.post(api_url, json=payload, headers=headers, timeout=60)
            response.raise_for_status()
            return response.text
        except requests.exceptions.RequestException as e:
            return f"Error scraping URL: {str(e)}"