# amazon_scraper.py
# This is an Amazon products scraper compatible with Gradio.
# It can be run as a standalone Gradio app or its functions can be loaded as tools.
import httpx
import re
import gradio as gr
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from typing import List, Dict
# --- Helper Functions for Web Scraping ---
async def fetch_amazon_page(url: str) -> str:
    """Download the raw HTML of an Amazon page.

    Sends a GET request with browser-like headers (Amazon serves a
    degraded or blocked page to clients that look like bots) and returns
    the response body as text.

    Args:
        url: Absolute URL of the Amazon page to fetch.

    Returns:
        The page HTML.

    Raises:
        httpx.HTTPStatusError: If the response status is 4xx/5xx.
    """
    # Mimic a desktop Chrome browser; a bare default UA is usually rejected.
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
    }
    async with httpx.AsyncClient() as http_client:
        resp = await http_client.get(url, headers=browser_headers, timeout=15.0)
        resp.raise_for_status()
        return resp.text
def clean_price(price_text: str) -> str:
    """Extract the first price-like token from a string.

    A price is an optional $/£/€ symbol followed by a digit and any run of
    digits, commas, or dots. Returns "Price not available" for empty input
    or when no such token exists.
    """
    fallback = "Price not available"
    if not price_text:
        return fallback
    # Optional currency symbol, one digit, then digits/commas/dots.
    found = re.search(r'([\$\£\€]?\d[\d,.]*)', price_text)
    return found.group(1) if found else fallback
def _select_first(soup, selectors):
    """Return the first element matched by any CSS selector in *selectors*, or None.

    Amazon's markup varies between page layouts, so every field is probed
    with a list of candidate selectors in priority order.
    """
    for selector in selectors:
        elem = soup.select_one(selector)
        if elem is not None:
            return elem
    return None


def extract_product_data(html_content: str, url: str) -> dict:
    """Extract product information from an Amazon product page.

    Args:
        html_content: Raw HTML of the product page.
        url: The page URL, echoed back in the result.

    Returns:
        dict with keys 'name', 'price', 'image_url', 'rating',
        'reviews_count', 'availability', 'description' and 'url'. Fields
        that cannot be located keep a descriptive placeholder string; an
        'error' key is added if parsing raises unexpectedly.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # Placeholder values reported when a field is missing from the page.
    product_data = {
        'name': 'Product name not found',
        'price': 'Price not available',
        'image_url': 'Image not found',
        'rating': 'Rating not available',
        'reviews_count': 'Reviews not available',
        'availability': 'Availability not found',
        'description': 'Description not available',
        'url': url
    }
    try:
        # Product title.
        name_elem = _select_first(soup, [
            '#productTitle',
            'h1.a-size-large',
            '.a-size-large.product-title-word-break',
            'h1[data-automation-id="product-title"]',
        ])
        if name_elem:
            product_data['name'] = name_elem.get_text().strip()

        # Price (different layouts expose it in different wrappers).
        price_elem = _select_first(soup, [
            '.a-price-whole',
            '.a-price .a-offscreen',
            '.a-price-range .a-price-range-min .a-offscreen',
            '.a-price .a-price-symbol + span',
            '[data-a-color="price"] .a-offscreen',
        ])
        if price_elem:
            product_data['price'] = clean_price(price_elem.get_text())

        # Main product image; keep trying selectors until a usable URL is found.
        for selector in ['#landingImage', '#imgBlkFront', '.a-dynamic-image', '[data-old-hires]']:
            img_elem = soup.select_one(selector)
            if not img_elem:
                continue
            img_url = img_elem.get('src') or img_elem.get('data-old-hires')
            if img_url:
                if img_url.startswith('//'):
                    # Protocol-relative URL -> make it absolute.
                    img_url = 'https:' + img_url
                product_data['image_url'] = img_url
                break

        # Star rating, e.g. "4.5 out of 5 stars" -> "4.5 out of 5".
        rating_elem = _select_first(soup, [
            '.a-icon-alt',
            '[data-hook="rating-out-of-text"]',
            '.a-icon-star-small .a-icon-alt',
        ])
        if rating_elem:
            rating_match = re.search(r'(\d+\.?\d*)', rating_elem.get_text())
            if rating_match:
                product_data['rating'] = f"{rating_match.group(1)} out of 5"

        # Review count, e.g. "1,234 ratings" -> "1,234 reviews".
        reviews_elem = _select_first(soup, [
            '#acrCustomerReviewText',
            '[data-hook="total-review-count"]',
            '.a-size-base.s-underline-text',
        ])
        if reviews_elem:
            reviews_match = re.search(r'(\d+(?:,\d+)*)', reviews_elem.get_text())
            if reviews_match:
                product_data['reviews_count'] = f"{reviews_match.group(1)} reviews"

        # Stock / availability message.
        avail_elem = _select_first(soup, [
            '#availability .a-size-medium',
            '#availability span',
            '.a-size-medium.a-color-success',
        ])
        if avail_elem:
            product_data['availability'] = avail_elem.get_text().strip()

        # Short description (falls back to the first feature bullet).
        desc_elem = _select_first(soup, [
            '#productDescription p',
            '#feature-bullets .a-list-item',
            '.a-expander-content p',
        ])
        if desc_elem:
            product_data['description'] = desc_elem.get_text().strip()
    except Exception as e:
        # Never raise to the caller; report the parse failure in the payload.
        product_data['error'] = f"Error parsing product data: {str(e)}"
    return product_data
def extract_search_results(html_content: str, max_results: int) -> list:
    """Parse an Amazon search-results page into a list of product dicts.

    Each dict carries 'name', 'price', 'image_url', 'rating' and 'url',
    with placeholder strings for anything that could not be located.
    At most *max_results* products are returned.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    results = []

    # Amazon tags each organic search result with this data attribute.
    containers = soup.select('[data-component-type="s-search-result"]')
    for item in containers[:max_results]:
        try:
            entry = {
                'name': 'Product name not found',
                'price': 'Price not available',
                'image_url': 'Image not found',
                'rating': 'Rating not available',
                'url': 'URL not found'
            }

            # Title text lives inside the result card's heading link.
            title = item.select_one('a h2 span')
            if title:
                entry['name'] = title.get_text().strip()

            # First anchor in the card links to the product page.
            link = item.select_one('a')
            if link:
                href = link.get('href')
                if href:
                    if href.startswith('/'):
                        # Relative link -> absolute amazon.com URL.
                        href = 'https://www.amazon.com' + href
                    entry['url'] = href

            price_node = item.select_one('.a-price-whole')
            if price_node:
                entry['price'] = clean_price(price_node.get_text())

            thumb = item.select_one('img.s-image')
            if thumb and thumb.get('src'):
                entry['image_url'] = thumb.get('src')

            stars = item.select_one('.a-icon-alt')
            if stars:
                star_match = re.search(r'(\d+\.?\d*)', stars.get_text())
                if star_match:
                    entry['rating'] = f"{star_match.group(1)} out of 5"

            results.append(entry)
        except Exception as e:
            # Skip malformed cards but keep processing the rest.
            print(f"Error extracting product data: {str(e)}")
    return results
# --- Formatting Functions for Display ---
def format_product_details(product: dict) -> str:
    """Format a single product's details as a Markdown string.

    Args:
        product: dict with 'name', 'price', 'image_url' and 'url' keys;
            missing keys fall back to 'N/A' (empty string for the image).

    Returns:
        Markdown with the title, price, an inline product image, and URL.
    """
    # NOTE(review): the image line was garbled in the original source
    # (a stray `f"})"`); reconstructed here as a Markdown image tag.
    return (
        f"## {product.get('name', 'N/A')}\n"
        f"**Price:** {product.get('price', 'N/A')}\n\n"
        f"![Product Image]({product.get('image_url', '')})\n\n"
        f"**URL:** {product.get('url', 'N/A')}"
    )
def format_search_results(products: list, query: str) -> str:
    """Render a list of search-result dicts as one Markdown document.

    Returns a "No products found" message when *products* is empty;
    otherwise a header followed by a ---separated section per product.
    """
    if not products:
        return f"No products found for '{query}'."
    sections = [f"# Search Results for '{query}'\n\n---\n\n"]
    sections.extend(
        f"### {item.get('name', 'N/A')}\n"
        f"**Price:** {item.get('price', 'N/A')}\n"
        f"**URL:** <{item.get('url', 'N/A')}>\n\n---\n\n"
        for item in products
    )
    return "".join(sections)
# --- Gradio Tool Functions ---
async def scrape_product(product_url: str) -> str:
    """
    Scrapes product information from a single Amazon product URL.
    Args:
        product_url: The full URL of the Amazon product page.
    Returns:
        A Markdown formatted string with the product's name, price, image, and URL.
    """
    try:
        # Reject non-Amazon hosts before making any network request.
        if 'amazon' not in urlparse(product_url).netloc:
            return "Error: Please provide a valid Amazon product URL."
        page_html = await fetch_amazon_page(product_url)
        details = extract_product_data(page_html, product_url)
        return format_product_details(details)
    except httpx.HTTPStatusError as exc:
        # Amazon frequently answers automated clients with 4xx/5xx pages.
        return f"HTTP Error: {exc.response.status_code}. Amazon may have blocked the request."
    except Exception as exc:
        return f"An error occurred: {str(exc)}"
async def search_products(query: str, max_results: int = 5) -> str:
    """
    Searches for products on Amazon and returns a list of results.
    Args:
        query: The search term (e.g., "laptop stand").
        max_results: The maximum number of results to return.
    Returns:
        A Markdown formatted string with the search results.
    """
    try:
        # Amazon's search endpoint accepts '+' as the space separator.
        keywords = query.replace(' ', '+')
        page_html = await fetch_amazon_page(f"https://www.amazon.com/s?k={keywords}")
        found = extract_search_results(page_html, max_results)
        return format_search_results(found, query)
    except Exception as exc:
        return f"An error occurred during search: {str(exc)}"
# --- Gradio Interface (for standalone execution) ---
if __name__ == "__main__":
    print("Starting Amazon Scraper Gradio App...")
    with gr.Blocks(theme=gr.themes.Soft(), title="Amazon Scraper") as app:
        gr.Markdown("# 🤖 Amazon Product Scraper")
        gr.Markdown("Use the tools below to search for products or scrape a specific product URL.")
        with gr.Tabs():
            # Tab 1: keyword search across Amazon.
            with gr.TabItem("Search Products"):
                with gr.Row():
                    query_box = gr.Textbox(label="Search Query", placeholder="e.g., mechanical keyboard")
                    result_limit = gr.Number(label="Max Results", value=5, step=1, minimum=1, maximum=20)
                run_search = gr.Button("Search", variant="primary")
                search_markdown = gr.Markdown(label="Search Results")
            # Tab 2: scrape a single product page by URL.
            with gr.TabItem("Scrape Product by URL"):
                product_url_box = gr.Textbox(label="Amazon Product URL", placeholder="Paste a full Amazon URL here...")
                run_scrape = gr.Button("Scrape", variant="primary")
                product_markdown = gr.Markdown(label="Product Details")
        # Wire the buttons to the async tool functions.
        run_search.click(fn=search_products, inputs=[query_box, result_limit], outputs=search_markdown)
        run_scrape.click(fn=scrape_product, inputs=[product_url_box], outputs=product_markdown)
    # mcp_server=True also exposes the functions as MCP tools;
    # share=True creates a public Gradio link.
    app.launch(mcp_server=True, share=True)