Commit 98eec6c by BrightData · verified · 1 parent: 323b54a

Add Bright Data Dataset Tool

Files changed (3)
  1. app.py +5 -0
  2. requirements.txt +2 -0
  3. tool.py +195 -0
app.py ADDED
@@ -0,0 +1,5 @@
+ from smolagents import launch_gradio_demo
+ from tool import BrightDataDatasetTool
+
+ tool = BrightDataDatasetTool()
+ launch_gradio_demo(tool)
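
For reference, a minimal sketch of exercising the tool directly instead of through the Gradio demo (assumes BRIGHT_DATA_API_TOKEN is set in the environment and that the DATASETS catalogue in tool.py defines the linkedin_person_profile key used in the tool's own description; smolagents tools are callable, which routes the call to forward()):

    from tool import BrightDataDatasetTool

    tool = BrightDataDatasetTool()
    # Triggers a collection run and blocks until the snapshot is ready (up to ~10 minutes).
    result = tool(dataset="linkedin_person_profile", url="https://linkedin.com/in/...")
    print(result)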
requirements.txt ADDED
@@ -0,0 +1,2 @@
+ requests
+ smolagents
tool.py ADDED
@@ -0,0 +1,195 @@
+ from typing import Optional
+ from smolagents.tools import Tool
+ import time
+ import json
+ import requests
+ import os
+
+ class BrightDataDatasetTool(Tool):
+     name = "brightdata_dataset_fetch"
+     description = "Trigger a Bright Data dataset collection and poll until the snapshot is ready. Choose a dataset key (e.g., amazon_product, linkedin_company_profile, google_maps_reviews). For most datasets, you only need to provide the 'url' parameter. For example: brightdata_dataset_fetch(dataset='linkedin_person_profile', url='https://linkedin.com/in/...')"
+     output_type = "string"
+
+     def __init__(self):
+         # Keep the dataset catalogue on the instance and build the inputs schema
+         # dynamically so that the tool validation in smolagents passes.
+         self.datasets = globals().get("DATASETS", {})
+         if not self.datasets:
+             raise ValueError("Dataset catalogue is not available: DATASETS must be defined at module level.")
+         self.inputs = {
+             "dataset": {
+                 "type": "string",
+                 "description": f"Dataset key. Options: {', '.join(sorted(self.datasets.keys()))}",
+             },
+             "url": {
+                 "type": "string",
+                 "description": "URL for the dataset (required for most datasets)",
+                 "nullable": True,
+             },
+             "keyword": {
+                 "type": "string",
+                 "description": "Search keyword (for search datasets like amazon_product_search)",
+                 "nullable": True,
+             },
+             "first_name": {
+                 "type": "string",
+                 "description": "First name (for datasets like linkedin_people_search)",
+                 "nullable": True,
+             },
+             "last_name": {
+                 "type": "string",
+                 "description": "Last name (for datasets like linkedin_people_search)",
+                 "nullable": True,
+             },
+             "days_limit": {
+                 "type": "string",
+                 "description": "Days limit (for datasets like google_maps_reviews, default: 3)",
+                 "nullable": True,
+             },
+             "num_of_reviews": {
+                 "type": "string",
+                 "description": "Number of reviews (for datasets like facebook_company_reviews)",
+                 "nullable": True,
+             },
+             "num_of_comments": {
+                 "type": "string",
+                 "description": "Number of comments (for datasets like youtube_comments, default: 10)",
+                 "nullable": True,
+             },
+         }
+         super().__init__()
+
+     def _prepare_payload(self, dataset_key: str, params: dict) -> dict:
+         """Validate required fields, apply defaults, and merge fixed values."""
+         config = self.datasets[dataset_key]
+         payload = {}
+
+         defaults = config.get("defaults", {})
+         fixed_values = config.get("fixed_values", {})
+
+         for field in config["inputs"]:
+             if field in params:
+                 payload[field] = params[field]
+             elif field in defaults:
+                 payload[field] = defaults[field]
+             else:
+                 raise ValueError(f"Missing required field '{field}' for dataset '{dataset_key}'")
+
+         # Apply fixed values that should always be sent
+         payload.update(fixed_values)
+         return payload
+
+     def forward(
+         self,
+         dataset: str,
+         url: Optional[str] = None,
+         keyword: Optional[str] = None,
+         first_name: Optional[str] = None,
+         last_name: Optional[str] = None,
+         days_limit: Optional[str] = None,
+         num_of_reviews: Optional[str] = None,
+         num_of_comments: Optional[str] = None,
+     ) -> str:
+         """
+         Trigger a dataset run and poll until results are ready.
+
+         Args:
+             dataset: The dataset key from DATASETS.
+             url: URL for the dataset (required for most datasets).
+             keyword: Search keyword (for search datasets).
+             first_name: First name (for people search datasets).
+             last_name: Last name (for people search datasets).
+             days_limit: Days limit (for time-based datasets).
+             num_of_reviews: Number of reviews to fetch.
+             num_of_comments: Number of comments to fetch.
+
+         Returns:
+             JSON string of the snapshot data once ready.
+         """
+         api_token = os.getenv("BRIGHT_DATA_API_TOKEN")
+         if not api_token:
+             raise ValueError("BRIGHT_DATA_API_TOKEN not found in environment variables")
+
+         if dataset not in self.datasets:
+             raise ValueError(f"Unknown dataset '{dataset}'. Valid options: {', '.join(sorted(self.datasets.keys()))}")
+
+         # Build the params dict from the provided arguments
+         params = {}
+         if url is not None:
+             params["url"] = url
+         if keyword is not None:
+             params["keyword"] = keyword
+         if first_name is not None:
+             params["first_name"] = first_name
+         if last_name is not None:
+             params["last_name"] = last_name
+         if days_limit is not None:
+             params["days_limit"] = days_limit
+         if num_of_reviews is not None:
+             params["num_of_reviews"] = num_of_reviews
+         if num_of_comments is not None:
+             params["num_of_comments"] = num_of_comments
+
+         payload = self._prepare_payload(dataset, params)
+         dataset_id = self.datasets[dataset]["dataset_id"]
+
+         # Trigger the collection run for the chosen dataset
+         trigger_url = "https://api.brightdata.com/datasets/v3/trigger"
+         trigger_headers = {
+             "Authorization": f"Bearer {api_token}",
+             "Content-Type": "application/json",
+         }
+
+         trigger_response = requests.post(
+             trigger_url,
+             params={"dataset_id": dataset_id, "include_errors": "true"},
+             json=[payload],
+             headers=trigger_headers,
+             timeout=60,
+         )
+         trigger_response.raise_for_status()
+         snapshot_id = trigger_response.json().get("snapshot_id")
+
+         if not snapshot_id:
+             raise RuntimeError("No snapshot ID returned from Bright Data.")
+
+         # Poll for completion (up to 10 minutes, matching the MCP logic)
+         snapshot_url = f"https://api.brightdata.com/datasets/v3/snapshot/{snapshot_id}"
+         max_attempts = 600
+         attempts = 0
+
+         while attempts < max_attempts:
+             try:
+                 response = requests.get(
+                     snapshot_url,
+                     params={"format": "json"},
+                     headers={"Authorization": f"Bearer {api_token}"},
+                     timeout=30,
+                 )
+
+                 # If Bright Data returns a 400 error response, we don't want to loop forever
+                 if response.status_code == 400:
+                     response.raise_for_status()
+
+                 data = response.json()
+                 if isinstance(data, list):
+                     return json.dumps(data, indent=2)
+
+                 status = data.get("status") if isinstance(data, dict) else None
+                 if status not in {"running", "building"}:
+                     return json.dumps(data, indent=2)
+
+                 attempts += 1
+                 time.sleep(1)
+
+             except requests.exceptions.RequestException as exc:
+                 # Mirror the JS logic: tolerate transient failures, but break on 400
+                 if getattr(getattr(exc, "response", None), "status_code", None) == 400:
+                     raise
+                 attempts += 1
+                 time.sleep(1)
+
+         raise TimeoutError(f"Timeout waiting for snapshot {snapshot_id} after {max_attempts} polling attempts (~10 minutes)")
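
Note that tool.py reads a module-level DATASETS catalogue via globals().get("DATASETS", {}), but this diff does not show its definition, and __init__ raises if it is missing. A hypothetical entry sketching the shape that _prepare_payload and forward expect (the dataset_id value is a placeholder, not a real Bright Data ID; the days_limit default of 3 follows the tool's own inputs schema):

    DATASETS = {
        "google_maps_reviews": {
            "dataset_id": "gd_xxxxxxxxxx",      # placeholder; real IDs come from Bright Data
            "inputs": ["url", "days_limit"],    # fields copied into the trigger payload
            "defaults": {"days_limit": "3"},    # applied when the caller omits a field
            "fixed_values": {},                 # merged into every payload unconditionally
        },
    }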