Spaces:

BrightData
/

brightdata-scraper-tool

Running

Add Bright Data Scraper Tool

fbd126d verified 5 months ago

1.71 kB

	from typing import Any, Optional
	from smolagents.tools import Tool
	import requests
	import os

	class BrightDataScraperTool(Tool):
	    """Tool that fetches a webpage through Bright Data and returns it as Markdown.

	    Configuration is read from the environment on every call:

	    * ``BRIGHT_DATA_API_TOKEN`` -- required bearer token.
	    * ``BRIGHT_DATA_UNLOCKER_ZONE`` -- zone name, defaults to ``"web_unlocker_1"``.
	    """

	    name = "brightdata_web_scraper"
	    # Runtime text: the interior of this string is kept exactly as it appears
	    # in the file, which is why its lines are not indented with the class body.
	    description = """
	Scrape any webpage and return content in Markdown format.
	This tool can bypass bot detection and CAPTCHAs.
	Use this when you need to extract content from websites.
	"""
	    inputs = {'url': {'type': 'string', 'description': 'The URL of the webpage to scrape'}}
	    output_type = "string"

	    def forward(self, url: str) -> str:
	        """Scrape a webpage using Bright Data's API.

	        Args:
	            url: The URL to scrape.

	        Returns:
	            The response body (Markdown is requested from the API) on success,
	            or a string starting with ``"Error scraping URL: "`` when the HTTP
	            request fails, times out, or returns an error status.

	        Raises:
	            ValueError: If ``BRIGHT_DATA_API_TOKEN`` is not set or is empty.
	        """
	        # NOTE(review): the original repeats these imports inside the method
	        # even though the module imports them too -- presumably so the method
	        # stays self-contained when the tool is exported; confirm before
	        # hoisting them.
	        import os
	        import requests

	        api_token = os.getenv("BRIGHT_DATA_API_TOKEN")
	        unlocker_zone = os.getenv("BRIGHT_DATA_UNLOCKER_ZONE", "web_unlocker_1")

	        # Missing configuration is a setup error, not a scraping failure, so it
	        # is raised instead of being returned as text.
	        if not api_token:
	            raise ValueError("BRIGHT_DATA_API_TOKEN not found in environment variables")

	        api_url = "https://api.brightdata.com/request"
	        headers = {
	            "Authorization": f"Bearer {api_token}",
	            "Content-Type": "application/json",
	        }

	        payload = {
	            "url": url,
	            "zone": unlocker_zone,
	            "format": "raw",
	            "data_format": "markdown",
	        }

	        try:
	            # Without a timeout, requests waits forever on a stalled
	            # connection. Values are (connect, read) in seconds; the read
	            # limit is generous because the remote side fetches the target
	            # page before it answers.
	            response = requests.post(
	                api_url,
	                json=payload,
	                headers=headers,
	                timeout=(10, 120),
	            )
	            # Turn 4xx/5xx responses into exceptions so they are reported
	            # through the same error-string path as network failures.
	            response.raise_for_status()
	            return response.text
	        except requests.exceptions.RequestException as e:
	            # Timeouts are RequestException subclasses and land here as well.
	            return f"Error scraping URL: {str(e)}"

	    def __init__(self, *args, **kwargs):
	        """Accept and ignore any constructor arguments; mark the tool uninitialized.

	        The previous signature, ``(self, args, *kwargs)``, made one positional
	        argument mandatory, so ``BrightDataScraperTool()`` raised ``TypeError``.
	        ``*args, **kwargs`` accepts every call the old signature accepted and
	        also allows construction with no arguments.
	        """
	        # NOTE(review): the base-class initializer is not called here, same as
	        # in the original -- confirm that is intended.
	        self.is_initialized = False