# amazon_scraper.py
# This is an Amazon products scraper compatible with Gradio.
# It can be run as a standalone Gradio app or its functions can be loaded as tools.

import httpx
import re
import gradio as gr
from bs4 import BeautifulSoup
from urllib.parse import quote_plus, urlparse
from typing import List, Dict

# --- Helper Functions for Web Scraping ---


async def fetch_amazon_page(url: str) -> str:
    """Fetch an Amazon page and return its raw HTML.

    Browser-like headers are sent because Amazon tends to serve a
    CAPTCHA/denial page to clients that identify as bots.

    Args:
        url: Fully-qualified URL of the page to fetch.

    Returns:
        The response body as text.

    Raises:
        httpx.HTTPStatusError: If Amazon responds with a non-2xx status.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
    }
    async with httpx.AsyncClient() as client:
        response = await client.get(url, headers=headers, timeout=15.0)
        response.raise_for_status()
        return response.text


def clean_price(price_text: str) -> str:
    """Clean and extract the numerical price from a string.

    Args:
        price_text: Raw text scraped from a price element (may be empty).

    Returns:
        The first currency-symbol-plus-digits token found (e.g. "$19.99"),
        or "Price not available" when nothing price-like is present.
    """
    if not price_text:
        return "Price not available"
    # First occurrence of an (optional) currency symbol followed by digits
    # and thousands/decimal separators.
    match = re.search(r'([\$\£\€]?\d[\d,.]*)', price_text)
    if match:
        # FIX: the greedy separator class can capture a dangling ',' or '.'
        # (e.g. "1,234," out of "1,234, was ..."); strip trailing separators.
        return match.group(1).rstrip('.,')
    return "Price not available"


def _first_match(soup: BeautifulSoup, selectors: List[str]):
    """Return the first element matched by any selector in order, else None."""
    for selector in selectors:
        elem = soup.select_one(selector)
        if elem:
            return elem
    return None


def extract_product_data(html_content: str, url: str) -> dict:
    """Extract product information from Amazon product-page HTML.

    Each field is probed with several CSS selectors (Amazon's markup varies
    by category/layout); the first selector that matches wins. Fields whose
    selectors all miss keep a human-readable "not found" placeholder.

    Args:
        html_content: Raw HTML of an Amazon product page.
        url: The page URL, echoed back in the result.

    Returns:
        Dict with keys: name, price, image_url, rating, reviews_count,
        availability, description, url — plus 'error' if parsing blew up.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # Placeholder values returned for any field that cannot be scraped.
    product_data = {
        'name': 'Product name not found',
        'price': 'Price not available',
        'image_url': 'Image not found',
        'rating': 'Rating not available',
        'reviews_count': 'Reviews not available',
        'availability': 'Availability not found',
        'description': 'Description not available',
        'url': url
    }

    try:
        # Extract product name
        name_elem = _first_match(soup, [
            '#productTitle',
            'h1.a-size-large',
            '.a-size-large.product-title-word-break',
            'h1[data-automation-id="product-title"]'
        ])
        if name_elem:
            product_data['name'] = name_elem.get_text().strip()

        # Extract price
        price_elem = _first_match(soup, [
            '.a-price-whole',
            '.a-price .a-offscreen',
            '.a-price-range .a-price-range-min .a-offscreen',
            '.a-price .a-price-symbol + span',
            '[data-a-color="price"] .a-offscreen'
        ])
        if price_elem:
            product_data['price'] = clean_price(price_elem.get_text())

        # Extract image URL
        img_elem = _first_match(soup, [
            '#landingImage',
            '#imgBlkFront',
            '.a-dynamic-image',
            '[data-old-hires]'
        ])
        if img_elem:
            img_url = img_elem.get('src') or img_elem.get('data-old-hires')
            if img_url:
                # Protocol-relative URLs ("//m.media-amazon.com/...") need a scheme.
                if img_url.startswith('//'):
                    img_url = 'https:' + img_url
                product_data['image_url'] = img_url

        # Extract rating (e.g. "4.5 out of 5 stars" -> "4.5 out of 5")
        rating_elem = _first_match(soup, [
            '.a-icon-alt',
            '[data-hook="rating-out-of-text"]',
            '.a-icon-star-small .a-icon-alt'
        ])
        if rating_elem:
            rating_match = re.search(r'(\d+\.?\d*)', rating_elem.get_text())
            if rating_match:
                product_data['rating'] = f"{rating_match.group(1)} out of 5"

        # Extract reviews count (keeps thousands separators, e.g. "1,234")
        reviews_elem = _first_match(soup, [
            '#acrCustomerReviewText',
            '[data-hook="total-review-count"]',
            '.a-size-base.s-underline-text'
        ])
        if reviews_elem:
            reviews_match = re.search(r'(\d+(?:,\d+)*)', reviews_elem.get_text())
            if reviews_match:
                product_data['reviews_count'] = f"{reviews_match.group(1)} reviews"

        # Extract availability
        avail_elem = _first_match(soup, [
            '#availability .a-size-medium',
            '#availability span',
            '.a-size-medium.a-color-success'
        ])
        if avail_elem:
            product_data['availability'] = avail_elem.get_text().strip()

        # Extract description
        desc_elem = _first_match(soup, [
            '#productDescription p',
            '#feature-bullets .a-list-item',
            '.a-expander-content p'
        ])
        if desc_elem:
            product_data['description'] = desc_elem.get_text().strip()

    except Exception as e:
        # Best-effort scraper: report the failure in-band rather than raising,
        # so partially-scraped fields are still returned.
        product_data['error'] = f"Error parsing product data: {str(e)}"

    return product_data


def extract_search_results(html_content: str, max_results: int) -> list:
    """Extract product information from an Amazon search-results page.

    Args:
        html_content: Raw HTML of an Amazon search results page.
        max_results: Maximum number of products to return.

    Returns:
        A list of dicts with keys: name, price, image_url, rating, url.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    products = []

    # FIX: Gradio's gr.Number delivers a float; slicing with a float raises
    # TypeError. Coerce defensively.
    max_results = int(max_results)

    # Each organic result is wrapped in this data-component container.
    product_containers = soup.select('[data-component-type="s-search-result"]')

    for container in product_containers[:max_results]:
        try:
            product = {
                'name': 'Product name not found',
                'price': 'Price not available',
                'image_url': 'Image not found',
                'rating': 'Rating not available',
                'url': 'URL not found'
            }

            # Extract product name
            name_elem = container.select_one('a h2 span')
            if name_elem:
                product['name'] = name_elem.get_text().strip()

            # Extract product URL (result links are site-relative)
            url_elem = container.select_one('a')
            if url_elem:
                product_url = url_elem.get('href')
                if product_url:
                    if product_url.startswith('/'):
                        product_url = 'https://www.amazon.com' + product_url
                    product['url'] = product_url

            # Extract price
            price_elem = container.select_one('.a-price-whole')
            if price_elem:
                product['price'] = clean_price(price_elem.get_text())

            # Extract image
            img_elem = container.select_one('img.s-image')
            if img_elem:
                img_url = img_elem.get('src')
                if img_url:
                    product['image_url'] = img_url

            # Extract rating
            rating_elem = container.select_one('.a-icon-alt')
            if rating_elem:
                rating_match = re.search(r'(\d+\.?\d*)', rating_elem.get_text())
                if rating_match:
                    product['rating'] = f"{rating_match.group(1)} out of 5"

            products.append(product)
        except Exception as e:
            # Skip malformed result cards rather than aborting the whole page.
            print(f"Error extracting product data: {str(e)}")

    return products


# --- Formatting Functions for Display ---


def format_product_details(product: dict) -> str:
    """Format a single product's details into a Markdown string."""
    return (
        f"## {product.get('name', 'N/A')}\n"
        f"**Price:** {product.get('price', 'N/A')}\n\n"
        f"![Product Image]({product.get('image_url', '')})\n\n"
        f"**URL:** {product.get('url', 'N/A')}"
    )


def format_search_results(products: list, query: str) -> str:
    """Format a list of search results into a single Markdown string."""
    if not products:
        return f"No products found for '{query}'."
    result = f"# Search Results for '{query}'\n\n---\n\n"
    for product in products:
        result += (
            f"### {product.get('name', 'N/A')}\n"
            f"**Price:** {product.get('price', 'N/A')}\n"
            f"**URL:** <{product.get('url', 'N/A')}>\n\n---\n\n"
        )
    return result


# --- Gradio Tool Functions ---


async def scrape_product(product_url: str) -> str:
    """
    Scrapes product information from a single Amazon product URL.

    Args:
        product_url: The full URL of the Amazon product page.

    Returns:
        A Markdown formatted string with the product's name, price, image, and URL.
    """
    try:
        parsed_url = urlparse(product_url)
        if 'amazon' not in parsed_url.netloc:
            return "Error: Please provide a valid Amazon product URL."
        html_content = await fetch_amazon_page(product_url)
        product_data = extract_product_data(html_content, product_url)
        return format_product_details(product_data)
    except httpx.HTTPStatusError as e:
        return f"HTTP Error: {e.response.status_code}. Amazon may have blocked the request."
    except Exception as e:
        return f"An error occurred: {str(e)}"


async def search_products(query: str, max_results: int = 5) -> str:
    """
    Searches for products on Amazon and returns a list of results.

    Args:
        query: The search term (e.g., "laptop stand").
        max_results: The maximum number of results to return.

    Returns:
        A Markdown formatted string with the search results.
    """
    try:
        # FIX: properly percent-encode the query — a bare replace(' ', '+')
        # breaks for queries containing '&', '#', '%', etc.
        search_url = f"https://www.amazon.com/s?k={quote_plus(query)}"
        html_content = await fetch_amazon_page(search_url)
        products = extract_search_results(html_content, max_results)
        return format_search_results(products, query)
    except Exception as e:
        return f"An error occurred during search: {str(e)}"


# --- Gradio Interface (for standalone execution) ---

if __name__ == "__main__":
    print("Starting Amazon Scraper Gradio App...")

    with gr.Blocks(theme=gr.themes.Soft(), title="Amazon Scraper") as demo:
        gr.Markdown("# 🤖 Amazon Product Scraper")
        gr.Markdown("Use the tools below to search for products or scrape a specific product URL.")

        with gr.Tabs():
            with gr.TabItem("Search Products"):
                with gr.Row():
                    search_query_input = gr.Textbox(label="Search Query", placeholder="e.g., mechanical keyboard")
                    # precision=0 makes gr.Number deliver an int, matching
                    # the search_products(max_results: int) signature.
                    max_results_input = gr.Number(label="Max Results", value=5, step=1, minimum=1, maximum=20, precision=0)
                search_button = gr.Button("Search", variant="primary")
                search_output = gr.Markdown(label="Search Results")

            with gr.TabItem("Scrape Product by URL"):
                url_input = gr.Textbox(label="Amazon Product URL", placeholder="Paste a full Amazon URL here...")
                scrape_button = gr.Button("Scrape", variant="primary")
                scrape_output = gr.Markdown(label="Product Details")

        # Gradio runs async handlers on its event loop directly.
        search_button.click(
            fn=search_products,
            inputs=[search_query_input, max_results_input],
            outputs=search_output
        )
        scrape_button.click(
            fn=scrape_product,
            inputs=[url_input],
            outputs=scrape_output
        )

    demo.launch(mcp_server=True, share=True)