Spaces:

BrightData
/

brightdata-dataset-tool

Sleeping

App Files Community

meirk-brd committed on Dec 8, 2025

Commit

bcba5ba

1 Parent(s): 86503c7

refactor

Browse files

Files changed (2) hide show

app.py +41 -33
tool.py +166 -168

app.py CHANGED Viewed

@@ -1,20 +1,23 @@
 import gradio as gr
-import importlib
-BrightDataDatasetTool = importlib.import_module("tool").BrightDataDatasetTool
 tool = BrightDataDatasetTool()
-DATASET_FIELDS = {'amazon_product': ['url'], 'amazon_product_reviews': ['url'], 'amazon_product_search': ['keyword', 'url'], 'walmart_product': ['url'], 'walmart_seller': ['url'], 'ebay_product': ['url'], 'homedepot_products': ['url'], 'zara_products': ['url'], 'etsy_products': ['url'], 'bestbuy_products': ['url'], 'linkedin_person_profile': ['url'], 'linkedin_company_profile': ['url'], 'linkedin_job_listings': ['url'], 'linkedin_posts': ['url'], 'linkedin_people_search': ['url', 'first_name', 'last_name'], 'crunchbase_company': ['url'], 'zoominfo_company_profile': ['url'], 'instagram_profiles': ['url'], 'instagram_posts': ['url'], 'instagram_reels': ['url'], 'instagram_comments': ['url'], 'facebook_posts': ['url'], 'facebook_marketplace_listings': ['url'], 'facebook_company_reviews': ['url', 'num_of_reviews'], 'facebook_events': ['url'], 'tiktok_profiles': ['url'], 'tiktok_posts': ['url'], 'tiktok_shop': ['url'], 'tiktok_comments': ['url'], 'google_maps_reviews': ['url', 'days_limit'], 'google_shopping': ['url'], 'google_play_store': ['url'], 'apple_app_store': ['url'], 'reuter_news': ['url'], 'github_repository_file': ['url'], 'yahoo_finance_business': ['url'], 'x_posts': ['url'], 'zillow_properties_listing': ['url'], 'booking_hotel_listings': ['url'], 'youtube_profiles': ['url'], 'youtube_comments': ['url', 'num_of_comments'], 'reddit_posts': ['url'], 'youtube_videos': ['url']}
-CHOICES = ['amazon_product', 'amazon_product_reviews', 'amazon_product_search', 'apple_app_store', 'bestbuy_products', 'booking_hotel_listings', 'crunchbase_company', 'ebay_product', 'etsy_products', 'facebook_company_reviews', 'facebook_events', 'facebook_marketplace_listings', 'facebook_posts', 'github_repository_file', 'google_maps_reviews', 'google_play_store', 'google_shopping', 'homedepot_products', 'instagram_comments', 'instagram_posts', 'instagram_profiles', 'instagram_reels', 'linkedin_company_profile', 'linkedin_job_listings', 'linkedin_people_search', 'linkedin_person_profile', 'linkedin_posts', 'reddit_posts', 'reuter_news', 'tiktok_comments', 'tiktok_posts', 'tiktok_profiles', 'tiktok_shop', 'walmart_product', 'walmart_seller', 'x_posts', 'yahoo_finance_business', 'youtube_comments', 'youtube_profiles', 'youtube_videos', 'zara_products', 'zillow_properties_listing', 'zoominfo_company_profile']
-def toggle_fields(selected):
     inputs = ["url", "keyword", "first_name", "last_name", "days_limit", "num_of_reviews", "num_of_comments"]
     wanted = set(DATASET_FIELDS.get(selected, []))
-    def vis(name):
         return gr.update(visible=name in wanted)
-    return tuple(vis(n) for n in inputs)
-def run(dataset, url, keyword, first_name, last_name, days_limit, num_of_reviews, num_of_comments):
     return tool(
         dataset=dataset,
         url=url,
@@ -26,29 +29,34 @@ def run(dataset, url, keyword, first_name, last_name, days_limit, num_of_reviews
         num_of_comments=num_of_comments,
     )
-with gr.Blocks() as demo:
-    gr.Markdown("### Bright Data dataset fetch")
-    dataset = gr.Dropdown(choices=CHOICES, label="Dataset", multiselect=False, value=CHOICES[0])
-    url = gr.Textbox(label="URL", placeholder="https://...", visible=True)
-    keyword = gr.Textbox(label="Keyword", visible=False)
-    first_name = gr.Textbox(label="First name", visible=False)
-    last_name = gr.Textbox(label="Last name", visible=False)
-    days_limit = gr.Textbox(label="Days limit (e.g. 3)", visible=False)
-    num_of_reviews = gr.Textbox(label="Number of reviews", visible=False)
-    num_of_comments = gr.Textbox(label="Number of comments", visible=False)
-    dataset.change(
-        toggle_fields,
-        inputs=[dataset],
-        outputs=[url, keyword, first_name, last_name, days_limit, num_of_reviews, num_of_comments],
-    )
-    run_btn = gr.Button("Run")
-    output = gr.Textbox(label="Output", lines=12)
-    run_btn.click(
-        run,
-        inputs=[dataset, url, keyword, first_name, last_name, days_limit, num_of_reviews, num_of_comments],
-        outputs=output,
-    )
-demo.launch()

+from __future__ import annotations
 import gradio as gr
+from tool import BrightDataDatasetTool, DATASET_CHOICES, DATASET_FIELDS
 tool = BrightDataDatasetTool()
+def toggle_fields(selected: str):
     inputs = ["url", "keyword", "first_name", "last_name", "days_limit", "num_of_reviews", "num_of_comments"]
     wanted = set(DATASET_FIELDS.get(selected, []))
+    def visibility(name: str):
         return gr.update(visible=name in wanted)
+    return tuple(visibility(name) for name in inputs)
+def run(dataset: str, url: str, keyword: str, first_name: str, last_name: str, days_limit: str, num_of_reviews: str, num_of_comments: str) -> str:
     return tool(
         dataset=dataset,
         url=url,
         num_of_comments=num_of_comments,
     )
+def create_demo() -> gr.Blocks:
+    with gr.Blocks() as demo:
+        gr.Markdown("### Bright Data dataset fetch")
+        dataset = gr.Dropdown(choices=DATASET_CHOICES, label="Dataset", value=DATASET_CHOICES[0])
+        url = gr.Textbox(label="URL", placeholder="https://...", visible=True)
+        keyword = gr.Textbox(label="Keyword", visible=False)
+        first_name = gr.Textbox(label="First name", visible=False)
+        last_name = gr.Textbox(label="Last name", visible=False)
+        days_limit = gr.Textbox(label="Days limit (e.g. 3)", visible=False)
+        num_of_reviews = gr.Textbox(label="Number of reviews", visible=False)
+        num_of_comments = gr.Textbox(label="Number of comments", visible=False)
+        dataset.change(
+            toggle_fields,
+            inputs=[dataset],
+            outputs=[url, keyword, first_name, last_name, days_limit, num_of_reviews, num_of_comments],
+        )
+        run_btn = gr.Button("Run")
+        output = gr.Textbox(label="Output", lines=12)
+        run_btn.click(
+            run,
+            inputs=[dataset, url, keyword, first_name, last_name, days_limit, num_of_reviews, num_of_comments],
+            outputs=output,
+        )
+    return demo
+if __name__ == "__main__":
+    create_demo().launch()

tool.py CHANGED Viewed

@@ -1,118 +1,83 @@
-from typing import Any, Optional
-from smolagents.tools import Tool
 import json
-import time
 import os
 import requests
 class BrightDataDatasetTool(Tool):
     name = "brightdata_dataset_fetch"
-    description = "Trigger a Bright Data dataset collection and poll until the snapshot is ready. Choose a dataset key (e.g., amazon_product, linkedin_company_profile, google_maps_reviews). For most datasets, you only need to provide the URL parameter. For example: brightdata_dataset_fetch(dataset='linkedin_person_profile', url='https://linkedin.com/in/...')"
     output_type = "string"
-    def __init__(self):
-        # Keep dataset catalogue on the instance and build the inputs schema dynamically to satisfy tool validation.
-        self.datasets = globals().get("DATASETS")
-        if not self.datasets:
-            import json
-            fallback_json = r'{"amazon_product": {"dataset_id": "gd_l7q7dkf244hwjntr0", "description": "Quickly read structured amazon product data.\nRequires a valid product URL with /dp/ in it.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "amazon_product_reviews": {"dataset_id": "gd_le8e811kzy4ggddlq", "description": "Quickly read structured amazon product review data.\nRequires a valid product URL with /dp/ in it.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "amazon_product_search": {"dataset_id": "gd_lwdb4vjm1ehb499uxs", "description": "Quickly read structured amazon product search data.\nRequires a valid search keyword and amazon domain URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["keyword", "url"], "fixed_values": {"pages_to_search": "1"}}, "walmart_product": {"dataset_id": "gd_l95fol7l1ru6rlo116", "description": "Quickly read structured walmart product data.\nRequires a valid product URL with /ip/ in it.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "walmart_seller": {"dataset_id": "gd_m7ke48w81ocyu4hhz0", "description": "Quickly read structured walmart seller data.\nRequires a valid walmart seller URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "ebay_product": {"dataset_id": "gd_ltr9mjt81n0zzdk1fb", "description": "Quickly read structured ebay product data.\nRequires a valid ebay product URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "homedepot_products": {"dataset_id": "gd_lmusivh019i7g97q2n", "description": "Quickly read structured homedepot product data.\nRequires a valid homedepot product URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "zara_products": {"dataset_id": "gd_lct4vafw1tgx27d4o0", "description": "Quickly read 
structured zara product data.\nRequires a valid zara product URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "etsy_products": {"dataset_id": "gd_ltppk0jdv1jqz25mz", "description": "Quickly read structured etsy product data.\nRequires a valid etsy product URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "bestbuy_products": {"dataset_id": "gd_ltre1jqe1jfr7cccf", "description": "Quickly read structured bestbuy product data.\nRequires a valid bestbuy product URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "linkedin_person_profile": {"dataset_id": "gd_l1viktl72bvl7bjuj0", "description": "Quickly read structured linkedin people profile data.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "linkedin_company_profile": {"dataset_id": "gd_l1vikfnt1wgvvqz95w", "description": "Quickly read structured linkedin company profile data.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "linkedin_job_listings": {"dataset_id": "gd_lpfll7v5hcqtkxl6l", "description": "Quickly read structured linkedin job listings data.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "linkedin_posts": {"dataset_id": "gd_lyy3tktm25m4avu764", "description": "Quickly read structured linkedin posts data.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "linkedin_people_search": {"dataset_id": "gd_m8d03he47z8nwb5xc", "description": "Quickly read structured linkedin people search data.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url", "first_name", "last_name"]}, "crunchbase_company": {"dataset_id": "gd_l1vijqt9jfj7olije", "description": "Quickly read structured crunchbase company data.\nThis can be a cache lookup, so it can be more reliable than 
scraping.", "inputs": ["url"]}, "zoominfo_company_profile": {"dataset_id": "gd_m0ci4a4ivx3j5l6nx", "description": "Quickly read structured ZoomInfo company profile data.\nRequires a valid ZoomInfo company URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "instagram_profiles": {"dataset_id": "gd_l1vikfch901nx3by4", "description": "Quickly read structured Instagram profile data.\nRequires a valid Instagram URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "instagram_posts": {"dataset_id": "gd_lk5ns7kz21pck8jpis", "description": "Quickly read structured Instagram post data.\nRequires a valid Instagram URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "instagram_reels": {"dataset_id": "gd_lyclm20il4r5helnj", "description": "Quickly read structured Instagram reel data.\nRequires a valid Instagram URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "instagram_comments": {"dataset_id": "gd_ltppn085pokosxh13", "description": "Quickly read structured Instagram comments data.\nRequires a valid Instagram URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "facebook_posts": {"dataset_id": "gd_lyclm1571iy3mv57zw", "description": "Quickly read structured Facebook post data.\nRequires a valid Facebook post URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "facebook_marketplace_listings": {"dataset_id": "gd_lvt9iwuh6fbcwmx1a", "description": "Quickly read structured Facebook marketplace listing data.\nRequires a valid Facebook marketplace listing URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "facebook_company_reviews": {"dataset_id": "gd_m0dtqpiu1mbcyc2g86", "description": "Quickly read structured Facebook company reviews data.\nRequires a valid 
Facebook company URL and number of reviews.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url", "num_of_reviews"]}, "facebook_events": {"dataset_id": "gd_m14sd0to1jz48ppm51", "description": "Quickly read structured Facebook events data.\nRequires a valid Facebook event URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "tiktok_profiles": {"dataset_id": "gd_l1villgoiiidt09ci", "description": "Quickly read structured Tiktok profiles data.\nRequires a valid Tiktok profile URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "tiktok_posts": {"dataset_id": "gd_lu702nij2f790tmv9h", "description": "Quickly read structured Tiktok post data.\nRequires a valid Tiktok post URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "tiktok_shop": {"dataset_id": "gd_m45m1u911dsa4274pi", "description": "Quickly read structured Tiktok shop data.\nRequires a valid Tiktok shop product URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "tiktok_comments": {"dataset_id": "gd_lkf2st302ap89utw5k", "description": "Quickly read structured Tiktok comments data.\nRequires a valid Tiktok video URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "google_maps_reviews": {"dataset_id": "gd_luzfs1dn2oa0teb81", "description": "Quickly read structured Google maps reviews data.\nRequires a valid Google maps URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url", "days_limit"], "defaults": {"days_limit": "3"}}, "google_shopping": {"dataset_id": "gd_ltppk50q18kdw67omz", "description": "Quickly read structured Google shopping data.\nRequires a valid Google shopping product URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "google_play_store": {"dataset_id": 
"gd_lsk382l8xei8vzm4u", "description": "Quickly read structured Google play store data.\nRequires a valid Google play store app URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "apple_app_store": {"dataset_id": "gd_lsk9ki3u2iishmwrui", "description": "Quickly read structured apple app store data.\nRequires a valid apple app store app URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "reuter_news": {"dataset_id": "gd_lyptx9h74wtlvpnfu", "description": "Quickly read structured reuter news data.\nRequires a valid reuter news report URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "github_repository_file": {"dataset_id": "gd_lyrexgxc24b3d4imjt", "description": "Quickly read structured github repository data.\nRequires a valid github repository file URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "yahoo_finance_business": {"dataset_id": "gd_lmrpz3vxmz972ghd7", "description": "Quickly read structured yahoo finance business data.\nRequires a valid yahoo finance business URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "x_posts": {"dataset_id": "gd_lwxkxvnf1cynvib9co", "description": "Quickly read structured X post data.\nRequires a valid X post URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "zillow_properties_listing": {"dataset_id": "gd_lfqkr8wm13ixtbd8f5", "description": "Quickly read structured zillow properties listing data.\nRequires a valid zillow properties listing URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "booking_hotel_listings": {"dataset_id": "gd_m5mbdl081229ln6t4a", "description": "Quickly read structured booking hotel listings data.\nRequires a valid booking hotel listing URL.\nThis can be a cache lookup, so it 
can be more reliable than scraping.", "inputs": ["url"]}, "youtube_profiles": {"dataset_id": "gd_lk538t2k2p1k3oos71", "description": "Quickly read structured youtube profiles data.\nRequires a valid youtube profile URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "youtube_comments": {"dataset_id": "gd_lk9q0ew71spt1mxywf", "description": "Quickly read structured youtube comments data.\nRequires a valid youtube video URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url", "num_of_comments"], "defaults": {"num_of_comments": "10"}}, "reddit_posts": {"dataset_id": "gd_lvz8ah06191smkebj4", "description": "Quickly read structured reddit posts data.\nRequires a valid reddit post URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "youtube_videos": {"dataset_id": "gd_lk56epmy2i5g7lzu0k", "description": "Quickly read structured YouTube videos data.\nRequires a valid YouTube video URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}}'
-            self.datasets = json.loads(fallback_json)
         self.inputs = {
             "dataset": {
                 "type": "string",
                 "description": "Dataset key",
-                # Provide choices so UI renders a dropdown instead of a long list.
                 "enum": sorted(self.datasets.keys()),
             },
             "url": {
                 "type": "string",
-                "description": "URL for the dataset (required for most datasets)",
                 "nullable": True,
             },
             "keyword": {
                 "type": "string",
-                "description": "Search keyword (for search datasets like amazon_product_search)",
                 "nullable": True,
             },
             "first_name": {
                 "type": "string",
-                "description": "First name (for datasets like linkedin_people_search)",
                 "nullable": True,
             },
             "last_name": {
                 "type": "string",
-                "description": "Last name (for datasets like linkedin_people_search)",
                 "nullable": True,
             },
             "days_limit": {
                 "type": "string",
-                "description": "Days limit (for datasets like google_maps_reviews, default: 3)",
                 "nullable": True,
             },
             "num_of_reviews": {
                 "type": "string",
-                "description": "Number of reviews (for datasets like facebook_company_reviews)",
                 "nullable": True,
             },
             "num_of_comments": {
                 "type": "string",
-                "description": "Number of comments (for datasets like youtube_comments, default: 10)",
                 "nullable": True,
             },
         }
         super().__init__()
-    def _prepare_payload(self, dataset_key: str, params):
-        """Validate required fields, apply defaults, and merge fixed values."""
-        config = self.datasets[dataset_key]
-        payload = {}
-        defaults = config.get("defaults", {})
-        fixed_values = config.get("fixed_values", {})
-        for field in config["inputs"]:
-            if field in params:
-                payload[field] = params[field]
-            elif field in defaults:
-                payload[field] = defaults[field]
-            else:
-                raise ValueError(f"Missing required field '{field}' for dataset '{dataset_key}'")
-        payload.update(fixed_values)
-        return payload
     def forward(
         self,
         dataset: str,
-        url: str = None,
-        keyword: str = None,
-        first_name: str = None,
-        last_name: str = None,
-        days_limit: str = None,
-        num_of_reviews: str = None,
-        num_of_comments: str = None,
     ) -> str:
-        """
-        Trigger a dataset run and poll until results are ready.
-        Args:
-            dataset: The dataset key from DATASETS.
-            url: URL for the dataset (required for most datasets).
-            keyword: Search keyword (for search datasets).
-            first_name: First name (for people search datasets).
-            last_name: Last name (for people search datasets).
-            days_limit: Days limit (for time-based datasets).
-            num_of_reviews: Number of reviews to fetch.
-            num_of_comments: Number of comments to fetch.
-        Returns:
-            JSON string of the snapshot data once ready.
-        """
-        import os
-        import json
-        import time
-        import requests
         api_token = os.getenv("BRIGHT_DATA_API_TOKEN")
         if not api_token:
             raise ValueError("BRIGHT_DATA_API_TOKEN not found in environment variables")
@@ -120,8 +85,36 @@ class BrightDataDatasetTool(Tool):
         if dataset not in self.datasets:
             raise ValueError(f"Unknown dataset '{dataset}'. Valid options: {', '.join(sorted(self.datasets.keys()))}")
-        # Build params dict from provided arguments
-        params = {}
         if url is not None:
             params["url"] = url
         if keyword is not None:
@@ -136,124 +129,129 @@ class BrightDataDatasetTool(Tool):
             params["num_of_reviews"] = num_of_reviews
         if num_of_comments is not None:
             params["num_of_comments"] = num_of_comments
-        payload = self._prepare_payload(dataset, params)
-        dataset_id = self.datasets[dataset]["dataset_id"]
-        trigger_url = "https://api.brightdata.com/datasets/v3/trigger"
-        trigger_headers = {
-            "Authorization": f"Bearer {api_token}",
-            "Content-Type": "application/json",
-        }
-        trigger_response = requests.post(
             trigger_url,
             params={"dataset_id": dataset_id, "include_errors": "true"},
             json=[payload],
-            headers=trigger_headers,
             timeout=60,
         )
-        trigger_response.raise_for_status()
-        snapshot_id = trigger_response.json().get("snapshot_id")
         if not snapshot_id:
             raise RuntimeError("No snapshot ID returned from Bright Data.")
         snapshot_url = f"https://api.brightdata.com/datasets/v3/snapshot/{snapshot_id}"
         max_attempts = 600
         attempts = 0
         while attempts < max_attempts:
-            try:
-                response = requests.get(
-                    snapshot_url,
-                    params={"format": "json"},
-                    headers={"Authorization": f"Bearer {api_token}"},
-                    timeout=30,
-                )
-                if response.status_code == 400:
-                    response.raise_for_status()
-                data = response.json()
-                if isinstance(data, list):
-                    return json.dumps(data, indent=2)
-                status = data.get("status") if isinstance(data, dict) else None
-                if status not in {"running", "building"}:
-                    return json.dumps(data, indent=2)
-                attempts += 1
-                time.sleep(1)
-            except requests.exceptions.RequestException as exc:
-                if getattr(getattr(exc, "response", None), "status_code", None) == 400:
-                    raise
-                attempts += 1
-                time.sleep(1)
-        raise TimeoutError(f"Timeout waiting for snapshot {snapshot_id} after {max_attempts} seconds")
-    def _get_gradio_app_code(self, tool_module_name: str = "tool") -> str:
-            """
-            Override the default app to render a dropdown for dataset selection
-            instead of a long text field. Kept minimal: single-select dropdown,
-            and shows only relevant parameter fields for the chosen dataset.
-            """
-            choices = sorted(self.datasets.keys())
-            dataset_fields = {k: v["inputs"] for k, v in self.datasets.items()}
-            return f"""import gradio as gr
-    import importlib
-    BrightDataDatasetTool = importlib.import_module("tool").BrightDataDatasetTool
-    tool = BrightDataDatasetTool()
-    DATASET_FIELDS = {dataset_fields}
-    CHOICES = {choices}
-    def toggle_fields(selected):
-        inputs = ["url", "keyword", "first_name", "last_name", "days_limit", "num_of_reviews", "num_of_comments"]
-        wanted = set(DATASET_FIELDS.get(selected, []))
-        def vis(name):
-            return gr.update(visible=name in wanted)
-        return tuple(vis(n) for n in inputs)
-    def run(dataset, url, keyword, first_name, last_name, days_limit, num_of_reviews, num_of_comments):
-        return tool(
-            dataset=dataset,
-            url=url,
-            keyword=keyword,
-            first_name=first_name,
-            last_name=last_name,
-            days_limit=days_limit,
-            num_of_reviews=num_of_reviews,
-            num_of_comments=num_of_comments,
-        )
-    with gr.Blocks() as demo:
-        gr.Markdown("### Bright Data dataset fetch")
-        dataset = gr.Dropdown(choices=CHOICES, label="Dataset", multiselect=False, value=CHOICES[0])
-        url = gr.Textbox(label="URL", placeholder="https://...", visible=True)
-        keyword = gr.Textbox(label="Keyword", visible=False)
-        first_name = gr.Textbox(label="First name", visible=False)
-        last_name = gr.Textbox(label="Last name", visible=False)
-        days_limit = gr.Textbox(label="Days limit (e.g. 3)", visible=False)
-        num_of_reviews = gr.Textbox(label="Number of reviews", visible=False)
-        num_of_comments = gr.Textbox(label="Number of comments", visible=False)
-        dataset.change(
-            toggle_fields,
-            inputs=[dataset],
-            outputs=[url, keyword, first_name, last_name, days_limit, num_of_reviews, num_of_comments],
-        )
-        run_btn = gr.Button("Run")
-        output = gr.Textbox(label="Output", lines=12)
-        run_btn.click(
-            run,
-            inputs=[dataset, url, keyword, first_name, last_name, days_limit, num_of_reviews, num_of_comments],
-            outputs=output,
-        )
-    demo.launch()
-    """

+from __future__ import annotations
 import json
 import os
+import time
+from typing import Any, Dict, List, Optional
 import requests
+from smolagents.tools import Tool
+DATASETS_JSON = r'''{"amazon_product": {"dataset_id": "gd_l7q7dkf244hwjntr0", "description": "Quickly read structured amazon product data.\nRequires a valid product URL with /dp/ in it.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "amazon_product_reviews": {"dataset_id": "gd_le8e811kzy4ggddlq", "description": "Quickly read structured amazon product review data.\nRequires a valid product URL with /dp/ in it.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "amazon_product_search": {"dataset_id": "gd_lwdb4vjm1ehb499uxs", "description": "Quickly read structured amazon product search data.\nRequires a valid search keyword and amazon domain URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["keyword", "url"], "fixed_values": {"pages_to_search": "1"}}, "walmart_product": {"dataset_id": "gd_l95fol7l1ru6rlo116", "description": "Quickly read structured walmart product data.\nRequires a valid product URL with /ip/ in it.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "walmart_seller": {"dataset_id": "gd_m7ke48w81ocyu4hhz0", "description": "Quickly read structured walmart seller data.\nRequires a valid walmart seller URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "ebay_product": {"dataset_id": "gd_ltr9mjt81n0zzdk1fb", "description": "Quickly read structured ebay product data.\nRequires a valid ebay product URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "homedepot_products": {"dataset_id": "gd_lmusivh019i7g97q2n", "description": "Quickly read structured homedepot product data.\nRequires a valid homedepot product URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "zara_products": {"dataset_id": "gd_lct4vafw1tgx27d4o0", "description": "Quickly read structured 
zara product data.\nRequires a valid zara product URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "etsy_products": {"dataset_id": "gd_ltppk0jdv1jqz25mz", "description": "Quickly read structured etsy product data.\nRequires a valid etsy product URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "bestbuy_products": {"dataset_id": "gd_ltre1jqe1jfr7cccf", "description": "Quickly read structured bestbuy product data.\nRequires a valid bestbuy product URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "linkedin_person_profile": {"dataset_id": "gd_l1viktl72bvl7bjuj0", "description": "Quickly read structured linkedin people profile data.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "linkedin_company_profile": {"dataset_id": "gd_l1vikfnt1wgvvqz95w", "description": "Quickly read structured linkedin company profile data.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "linkedin_job_listings": {"dataset_id": "gd_lpfll7v5hcqtkxl6l", "description": "Quickly read structured linkedin job listings data.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "linkedin_posts": {"dataset_id": "gd_lyy3tktm25m4avu764", "description": "Quickly read structured linkedin posts data.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "linkedin_people_search": {"dataset_id": "gd_m8d03he47z8nwb5xc", "description": "Quickly read structured linkedin people search data.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url", "first_name", "last_name"]}, "crunchbase_company": {"dataset_id": "gd_l1vijqt9jfj7olije", "description": "Quickly read structured crunchbase company data.\nThis can be a cache lookup, so it can be more reliable than scraping.", 
"inputs": ["url"]}, "zoominfo_company_profile": {"dataset_id": "gd_m0ci4a4ivx3j5l6nx", "description": "Quickly read structured ZoomInfo company profile data.\nRequires a valid ZoomInfo company URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "instagram_profiles": {"dataset_id": "gd_l1vikfch901nx3by4", "description": "Quickly read structured Instagram profile data.\nRequires a valid Instagram URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "instagram_posts": {"dataset_id": "gd_lk5ns7kz21pck8jpis", "description": "Quickly read structured Instagram post data.\nRequires a valid Instagram URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "instagram_reels": {"dataset_id": "gd_lyclm20il4r5helnj", "description": "Quickly read structured Instagram reel data.\nRequires a valid Instagram URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "instagram_comments": {"dataset_id": "gd_ltppn085pokosxh13", "description": "Quickly read structured Instagram comments data.\nRequires a valid Instagram URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "facebook_posts": {"dataset_id": "gd_lyclm1571iy3mv57zw", "description": "Quickly read structured Facebook post data.\nRequires a valid Facebook post URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "facebook_marketplace_listings": {"dataset_id": "gd_lvt9iwuh6fbcwmx1a", "description": "Quickly read structured Facebook marketplace listing data.\nRequires a valid Facebook marketplace listing URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "facebook_company_reviews": {"dataset_id": "gd_m0dtqpiu1mbcyc2g86", "description": "Quickly read structured Facebook company reviews data.\nRequires a valid Facebook company 
URL and number of reviews.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url", "num_of_reviews"]}, "facebook_events": {"dataset_id": "gd_m14sd0to1jz48ppm51", "description": "Quickly read structured Facebook events data.\nRequires a valid Facebook event URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "tiktok_profiles": {"dataset_id": "gd_l1villgoiiidt09ci", "description": "Quickly read structured Tiktok profiles data.\nRequires a valid Tiktok profile URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "tiktok_posts": {"dataset_id": "gd_lu702nij2f790tmv9h", "description": "Quickly read structured Tiktok post data.\nRequires a valid Tiktok post URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "tiktok_shop": {"dataset_id": "gd_m45m1u911dsa4274pi", "description": "Quickly read structured Tiktok shop data.\nRequires a valid Tiktok shop product URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "tiktok_comments": {"dataset_id": "gd_lkf2st302ap89utw5k", "description": "Quickly read structured Tiktok comments data.\nRequires a valid Tiktok video URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "google_maps_reviews": {"dataset_id": "gd_luzfs1dn2oa0teb81", "description": "Quickly read structured Google maps reviews data.\nRequires a valid Google maps URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url", "days_limit"], "defaults": {"days_limit": "3"}}, "google_shopping": {"dataset_id": "gd_ltppk50q18kdw67omz", "description": "Quickly read structured Google shopping data.\nRequires a valid Google shopping product URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "google_play_store": {"dataset_id": 
"gd_lsk382l8xei8vzm4u", "description": "Quickly read structured Google play store data.\nRequires a valid Google play store app URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "apple_app_store": {"dataset_id": "gd_lsk9ki3u2iishmwrui", "description": "Quickly read structured apple app store data.\nRequires a valid apple app store app URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "reuter_news": {"dataset_id": "gd_lyptx9h74wtlvpnfu", "description": "Quickly read structured reuter news data.\nRequires a valid reuter news report URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "github_repository_file": {"dataset_id": "gd_lyrexgxc24b3d4imjt", "description": "Quickly read structured github repository data.\nRequires a valid github repository file URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "yahoo_finance_business": {"dataset_id": "gd_lmrpz3vxmz972ghd7", "description": "Quickly read structured yahoo finance business data.\nRequires a valid yahoo finance business URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "x_posts": {"dataset_id": "gd_lwxkxvnf1cynvib9co", "description": "Quickly read structured X post data.\nRequires a valid X post URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "zillow_properties_listing": {"dataset_id": "gd_lfqkr8wm13ixtbd8f5", "description": "Quickly read structured zillow properties listing data.\nRequires a valid zillow properties listing URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "booking_hotel_listings": {"dataset_id": "gd_m5mbdl081229ln6t4a", "description": "Quickly read structured booking hotel listings data.\nRequires a valid booking hotel listing URL.\nThis can be a cache lookup, so it 
can be more reliable than scraping.", "inputs": ["url"]}, "youtube_profiles": {"dataset_id": "gd_lk538t2k2p1k3oos71", "description": "Quickly read structured youtube profiles data.\nRequires a valid youtube profile URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "youtube_comments": {"dataset_id": "gd_lk9q0ew71spt1mxywf", "description": "Quickly read structured youtube comments data.\nRequires a valid youtube video URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url", "num_of_comments"], "defaults": {"num_of_comments": "10"}}, "reddit_posts": {"dataset_id": "gd_lvz8ah06191smkebj4", "description": "Quickly read structured reddit posts data.\nRequires a valid reddit post URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}, "youtube_videos": {"dataset_id": "gd_lk56epmy2i5g7lzu0k", "description": "Quickly read structured YouTube videos data.\nRequires a valid YouTube video URL.\nThis can be a cache lookup, so it can be more reliable than scraping.", "inputs": ["url"]}}'''
+DATASETS: Dict[str, Any] = json.loads(DATASETS_JSON)
+DATASET_FIELDS: Dict[str, List[str]] = {key: value["inputs"] for key, value in DATASETS.items()}
+DATASET_CHOICES = sorted(DATASETS.keys())
 class BrightDataDatasetTool(Tool):
     """Tool that triggers a Bright Data dataset collection and returns the snapshot.

     NOTE(review): the ``Tool`` base class is defined or imported outside this
     excerpt; the meaning it gives to the attributes below should be confirmed
     against its documentation.
     """
     # Identifier of the tool.
     name = "brightdata_dataset_fetch"
     # Human-readable summary of what the tool does.
+    description = "Trigger a Bright Data dataset collection and poll until the snapshot is ready."
     # ``forward`` returns a JSON-encoded string.
     output_type = "string"
+    def __init__(self, datasets: Optional[Dict[str, Any]] = None) -> None:
+        self.datasets = datasets or DATASETS
         self.inputs = {
             "dataset": {
                 "type": "string",
                 "description": "Dataset key",
                 "enum": sorted(self.datasets.keys()),
             },
             "url": {
                 "type": "string",
+                "description": "URL for the dataset",
                 "nullable": True,
             },
             "keyword": {
                 "type": "string",
+                "description": "Search keyword",
                 "nullable": True,
             },
             "first_name": {
                 "type": "string",
+                "description": "First name",
                 "nullable": True,
             },
             "last_name": {
                 "type": "string",
+                "description": "Last name",
                 "nullable": True,
             },
             "days_limit": {
                 "type": "string",
+                "description": "Days limit",
                 "nullable": True,
             },
             "num_of_reviews": {
                 "type": "string",
+                "description": "Number of reviews",
                 "nullable": True,
             },
             "num_of_comments": {
                 "type": "string",
+                "description": "Number of comments",
                 "nullable": True,
             },
         }
         super().__init__()
     def forward(
         self,
         dataset: str,
+        url: Optional[str] = None,
+        keyword: Optional[str] = None,
+        first_name: Optional[str] = None,
+        last_name: Optional[str] = None,
+        days_limit: Optional[str] = None,
+        num_of_reviews: Optional[str] = None,
+        num_of_comments: Optional[str] = None,
     ) -> str:
         api_token = os.getenv("BRIGHT_DATA_API_TOKEN")
         if not api_token:
             raise ValueError("BRIGHT_DATA_API_TOKEN not found in environment variables")
         if dataset not in self.datasets:
             raise ValueError(f"Unknown dataset '{dataset}'. Valid options: {', '.join(sorted(self.datasets.keys()))}")
+        params = self._build_params(
+            url=url,
+            keyword=keyword,
+            first_name=first_name,
+            last_name=last_name,
+            days_limit=days_limit,
+            num_of_reviews=num_of_reviews,
+            num_of_comments=num_of_comments,
+        )
+        payload = self._prepare_payload(dataset, params)
+        try:
+            snapshot_id = self._trigger_snapshot(dataset, payload, api_token)
+            data = self._poll_snapshot(snapshot_id, api_token)
+            return json.dumps(data, indent=2)
+        except requests.exceptions.RequestException as exc:
+            return json.dumps({"error": str(exc)})
+    def _build_params(
+        self,
+        url: Optional[str],
+        keyword: Optional[str],
+        first_name: Optional[str],
+        last_name: Optional[str],
+        days_limit: Optional[str],
+        num_of_reviews: Optional[str],
+        num_of_comments: Optional[str],
+    ) -> Dict[str, str]:
+        params: Dict[str, str] = {}
         if url is not None:
             params["url"] = url
         if keyword is not None:
             params["num_of_reviews"] = num_of_reviews
         if num_of_comments is not None:
             params["num_of_comments"] = num_of_comments
+        return params
+    def _prepare_payload(self, dataset_key: str, params: Dict[str, str]) -> Dict[str, str]:
+        config = self.datasets[dataset_key]
+        payload: Dict[str, str] = {}
+        defaults = config.get("defaults", {})
+        fixed_values = config.get("fixed_values", {})
+        for field in config["inputs"]:
+            if field in params:
+                payload[field] = params[field]
+            elif field in defaults:
+                payload[field] = defaults[field]
+            else:
+                raise ValueError(f"Missing required field '{field}' for dataset '{dataset_key}'")
+        payload.update(fixed_values)
+        return payload
+    def _trigger_snapshot(self, dataset_key: str, payload: Dict[str, str], api_token: str) -> str:
+        dataset_id = self.datasets[dataset_key]["dataset_id"]
+        trigger_url = "https://api.brightdata.com/datasets/v3/trigger"
+        response = requests.post(
             trigger_url,
             params={"dataset_id": dataset_id, "include_errors": "true"},
             json=[payload],
+            headers={
+                "Authorization": f"Bearer {api_token}",
+                "Content-Type": "application/json",
+            },
             timeout=60,
         )
+        response.raise_for_status()
+        snapshot_id = response.json().get("snapshot_id")
         if not snapshot_id:
             raise RuntimeError("No snapshot ID returned from Bright Data.")
+        return snapshot_id
+    def _poll_snapshot(self, snapshot_id: str, api_token: str) -> Any:
         snapshot_url = f"https://api.brightdata.com/datasets/v3/snapshot/{snapshot_id}"
         max_attempts = 600
         attempts = 0
         while attempts < max_attempts:
+            response = requests.get(
+                snapshot_url,
+                params={"format": "json"},
+                headers={"Authorization": f"Bearer {api_token}"},
+                timeout=30,
+            )
+            if response.status_code == 400:
+                response.raise_for_status()
+            data = response.json()
+            if isinstance(data, list):
+                return data
+            status = data.get("status") if isinstance(data, dict) else None
+            if status not in {"running", "building"}:
+                return data
+            attempts += 1
+            time.sleep(1)
+        raise TimeoutError(f"Timeout waiting for snapshot {snapshot_id} after {max_attempts} seconds")
+    def _get_gradio_app_code(self, tool_module_name: str = "tool") -> str:
+        choices = sorted(self.datasets.keys())
+        dataset_fields = {key: value["inputs"] for key, value in self.datasets.items()}
+        return f"""import gradio as gr
+import importlib
+BrightDataDatasetTool = importlib.import_module("{tool_module_name}").BrightDataDatasetTool
+tool = BrightDataDatasetTool()
+DATASET_FIELDS = {dataset_fields}
+CHOICES = {choices}
+def toggle_fields(selected):
+    inputs = ["url", "keyword", "first_name", "last_name", "days_limit", "num_of_reviews", "num_of_comments"]
+    wanted = set(DATASET_FIELDS.get(selected, []))
+    def vis(name):
+        return gr.update(visible=name in wanted)
+    return tuple(vis(name) for name in inputs)
+def run(dataset, url, keyword, first_name, last_name, days_limit, num_of_reviews, num_of_comments):
+    return tool(
+        dataset=dataset,
+        url=url,
+        keyword=keyword,
+        first_name=first_name,
+        last_name=last_name,
+        days_limit=days_limit,
+        num_of_reviews=num_of_reviews,
+        num_of_comments=num_of_comments,
+    )
+with gr.Blocks() as demo:
+    gr.Markdown("### Bright Data dataset fetch")
+    dataset = gr.Dropdown(choices=CHOICES, label="Dataset", value=CHOICES[0])
+    url = gr.Textbox(label="URL", placeholder="https://...", visible=True)
+    keyword = gr.Textbox(label="Keyword", visible=False)
+    first_name = gr.Textbox(label="First name", visible=False)
+    last_name = gr.Textbox(label="Last name", visible=False)
+    days_limit = gr.Textbox(label="Days limit (e.g. 3)", visible=False)
+    num_of_reviews = gr.Textbox(label="Number of reviews", visible=False)
+    num_of_comments = gr.Textbox(label="Number of comments", visible=False)
+    dataset.change(
+        toggle_fields,
+        inputs=[dataset],
+        outputs=[url, keyword, first_name, last_name, days_limit, num_of_reviews, num_of_comments],
+    )
+    run_btn = gr.Button("Run")
+    output = gr.Textbox(label="Output", lines=12)
+    run_btn.click(
+        run,
+        inputs=[dataset, url, keyword, first_name, last_name, days_limit, num_of_reviews, num_of_comments],
+        outputs=output,
+    )
+demo.launch()
+"""