# amazon_scraper.py
# This is an Amazon products scraper compatible with Gradio.
# It can be run as a standalone Gradio app or its functions can be loaded as tools.
import httpx
import re
import gradio as gr
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from typing import List, Dict
# --- Helper Functions for Web Scraping ---
async def fetch_amazon_page(url: str) -> str:
    """Download the raw HTML of an Amazon page.

    Sends a GET request with browser-like headers (Amazon serves a
    degraded or blocked page to clients that look like bots) and returns
    the response body as text.

    Args:
        url: Absolute URL of the Amazon page to fetch.

    Returns:
        The page HTML.

    Raises:
        httpx.HTTPStatusError: If the response status is 4xx/5xx.
    """
    # Mimic a desktop Chrome browser; a bare default UA is usually rejected.
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
    }
    async with httpx.AsyncClient() as http_client:
        resp = await http_client.get(url, headers=browser_headers, timeout=15.0)
        resp.raise_for_status()
        return resp.text
def clean_price(price_text: str) -> str:
    """Extract the first price-like token from a string.

    A price is an optional $/£/€ symbol followed by a digit and any run of
    digits, commas, or dots. Returns "Price not available" for empty input
    or when no such token exists.
    """
    fallback = "Price not available"
    if not price_text:
        return fallback
    # Optional currency symbol, one digit, then digits/commas/dots.
    found = re.search(r'([\$\£\€]?\d[\d,.]*)', price_text)
    return found.group(1) if found else fallback
def _select_first(soup, selectors):
    """Return the first element matched by any CSS selector in *selectors*, or None.

    Amazon's markup varies between page layouts, so every field is probed
    with a list of candidate selectors in priority order.
    """
    for selector in selectors:
        elem = soup.select_one(selector)
        if elem is not None:
            return elem
    return None


def extract_product_data(html_content: str, url: str) -> dict:
    """Extract product information from an Amazon product page.

    Args:
        html_content: Raw HTML of the product page.
        url: The page URL, echoed back in the result.

    Returns:
        dict with keys 'name', 'price', 'image_url', 'rating',
        'reviews_count', 'availability', 'description' and 'url'. Fields
        that cannot be located keep a descriptive placeholder string; an
        'error' key is added if parsing raises unexpectedly.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # Placeholder values reported when a field is missing from the page.
    product_data = {
        'name': 'Product name not found',
        'price': 'Price not available',
        'image_url': 'Image not found',
        'rating': 'Rating not available',
        'reviews_count': 'Reviews not available',
        'availability': 'Availability not found',
        'description': 'Description not available',
        'url': url
    }
    try:
        # Product title.
        name_elem = _select_first(soup, [
            '#productTitle',
            'h1.a-size-large',
            '.a-size-large.product-title-word-break',
            'h1[data-automation-id="product-title"]',
        ])
        if name_elem:
            product_data['name'] = name_elem.get_text().strip()

        # Price (different layouts expose it in different wrappers).
        price_elem = _select_first(soup, [
            '.a-price-whole',
            '.a-price .a-offscreen',
            '.a-price-range .a-price-range-min .a-offscreen',
            '.a-price .a-price-symbol + span',
            '[data-a-color="price"] .a-offscreen',
        ])
        if price_elem:
            product_data['price'] = clean_price(price_elem.get_text())

        # Main product image; keep trying selectors until a usable URL is found.
        for selector in ['#landingImage', '#imgBlkFront', '.a-dynamic-image', '[data-old-hires]']:
            img_elem = soup.select_one(selector)
            if not img_elem:
                continue
            img_url = img_elem.get('src') or img_elem.get('data-old-hires')
            if img_url:
                if img_url.startswith('//'):
                    # Protocol-relative URL -> make it absolute.
                    img_url = 'https:' + img_url
                product_data['image_url'] = img_url
                break

        # Star rating, e.g. "4.5 out of 5 stars" -> "4.5 out of 5".
        rating_elem = _select_first(soup, [
            '.a-icon-alt',
            '[data-hook="rating-out-of-text"]',
            '.a-icon-star-small .a-icon-alt',
        ])
        if rating_elem:
            rating_match = re.search(r'(\d+\.?\d*)', rating_elem.get_text())
            if rating_match:
                product_data['rating'] = f"{rating_match.group(1)} out of 5"

        # Review count, e.g. "1,234 ratings" -> "1,234 reviews".
        reviews_elem = _select_first(soup, [
            '#acrCustomerReviewText',
            '[data-hook="total-review-count"]',
            '.a-size-base.s-underline-text',
        ])
        if reviews_elem:
            reviews_match = re.search(r'(\d+(?:,\d+)*)', reviews_elem.get_text())
            if reviews_match:
                product_data['reviews_count'] = f"{reviews_match.group(1)} reviews"

        # Stock / availability message.
        avail_elem = _select_first(soup, [
            '#availability .a-size-medium',
            '#availability span',
            '.a-size-medium.a-color-success',
        ])
        if avail_elem:
            product_data['availability'] = avail_elem.get_text().strip()

        # Short description (falls back to the first feature bullet).
        desc_elem = _select_first(soup, [
            '#productDescription p',
            '#feature-bullets .a-list-item',
            '.a-expander-content p',
        ])
        if desc_elem:
            product_data['description'] = desc_elem.get_text().strip()
    except Exception as e:
        # Never raise to the caller; report the parse failure in the payload.
        product_data['error'] = f"Error parsing product data: {str(e)}"
    return product_data
def extract_search_results(html_content: str, max_results: int) -> list:
    """Parse an Amazon search-results page into a list of product dicts.

    Each dict carries 'name', 'price', 'image_url', 'rating' and 'url',
    with placeholder strings for anything that could not be located.
    At most *max_results* products are returned.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    results = []

    # Amazon tags each organic search result with this data attribute.
    containers = soup.select('[data-component-type="s-search-result"]')
    for item in containers[:max_results]:
        try:
            entry = {
                'name': 'Product name not found',
                'price': 'Price not available',
                'image_url': 'Image not found',
                'rating': 'Rating not available',
                'url': 'URL not found'
            }

            # Title text lives inside the result card's heading link.
            title = item.select_one('a h2 span')
            if title:
                entry['name'] = title.get_text().strip()

            # First anchor in the card links to the product page.
            link = item.select_one('a')
            if link:
                href = link.get('href')
                if href:
                    if href.startswith('/'):
                        # Relative link -> absolute amazon.com URL.
                        href = 'https://www.amazon.com' + href
                    entry['url'] = href

            price_node = item.select_one('.a-price-whole')
            if price_node:
                entry['price'] = clean_price(price_node.get_text())

            thumb = item.select_one('img.s-image')
            if thumb and thumb.get('src'):
                entry['image_url'] = thumb.get('src')

            stars = item.select_one('.a-icon-alt')
            if stars:
                star_match = re.search(r'(\d+\.?\d*)', stars.get_text())
                if star_match:
                    entry['rating'] = f"{star_match.group(1)} out of 5"

            results.append(entry)
        except Exception as e:
            # Skip malformed cards but keep processing the rest.
            print(f"Error extracting product data: {str(e)}")
    return results
# --- Formatting Functions for Display ---
def format_product_details(product: dict) -> str:
    """Format a single product's details as a Markdown string.

    Args:
        product: dict with 'name', 'price', 'image_url' and 'url' keys;
            missing keys fall back to 'N/A' (empty string for the image).

    Returns:
        Markdown with the title, price, an inline product image, and URL.
    """
    # NOTE(review): the image line was garbled in the original source
    # (a stray `f"})"`); reconstructed here as a Markdown image tag.
    return (
        f"## {product.get('name', 'N/A')}\n"
        f"**Price:** {product.get('price', 'N/A')}\n\n"
        f"![Product Image]({product.get('image_url', '')})\n\n"
        f"**URL:** {product.get('url', 'N/A')}"
    )
def format_search_results(products: list, query: str) -> str:
    """Render a list of search-result dicts as one Markdown document.

    Returns a "No products found" message when *products* is empty;
    otherwise a header followed by a ---separated section per product.
    """
    if not products:
        return f"No products found for '{query}'."
    sections = [f"# Search Results for '{query}'\n\n---\n\n"]
    sections.extend(
        f"### {item.get('name', 'N/A')}\n"
        f"**Price:** {item.get('price', 'N/A')}\n"
        f"**URL:** <{item.get('url', 'N/A')}>\n\n---\n\n"
        for item in products
    )
    return "".join(sections)
# --- Gradio Tool Functions ---
async def scrape_product(product_url: str) -> str:
    """
    Scrapes product information from a single Amazon product URL.
    Args:
        product_url: The full URL of the Amazon product page.
    Returns:
        A Markdown formatted string with the product's name, price, image, and URL.
    """
    try:
        # Reject non-Amazon hosts before making any network request.
        if 'amazon' not in urlparse(product_url).netloc:
            return "Error: Please provide a valid Amazon product URL."
        page_html = await fetch_amazon_page(product_url)
        details = extract_product_data(page_html, product_url)
        return format_product_details(details)
    except httpx.HTTPStatusError as exc:
        # Amazon frequently answers automated clients with 4xx/5xx pages.
        return f"HTTP Error: {exc.response.status_code}. Amazon may have blocked the request."
    except Exception as exc:
        return f"An error occurred: {str(exc)}"
async def search_products(query: str, max_results: int = 5) -> str:
    """
    Searches for products on Amazon and returns a list of results.
    Args:
        query: The search term (e.g., "laptop stand").
        max_results: The maximum number of results to return.
    Returns:
        A Markdown formatted string with the search results.
    """
    try:
        # Amazon's search endpoint accepts '+' as the space separator.
        keywords = query.replace(' ', '+')
        page_html = await fetch_amazon_page(f"https://www.amazon.com/s?k={keywords}")
        found = extract_search_results(page_html, max_results)
        return format_search_results(found, query)
    except Exception as exc:
        return f"An error occurred during search: {str(exc)}"
# --- Gradio Interface (for standalone execution) ---
if __name__ == "__main__":
    print("Starting Amazon Scraper Gradio App...")
    with gr.Blocks(theme=gr.themes.Soft(), title="Amazon Scraper") as app:
        gr.Markdown("# 🤖 Amazon Product Scraper")
        gr.Markdown("Use the tools below to search for products or scrape a specific product URL.")
        with gr.Tabs():
            # Tab 1: keyword search across Amazon.
            with gr.TabItem("Search Products"):
                with gr.Row():
                    query_box = gr.Textbox(label="Search Query", placeholder="e.g., mechanical keyboard")
                    result_limit = gr.Number(label="Max Results", value=5, step=1, minimum=1, maximum=20)
                run_search = gr.Button("Search", variant="primary")
                search_markdown = gr.Markdown(label="Search Results")
            # Tab 2: scrape a single product page by URL.
            with gr.TabItem("Scrape Product by URL"):
                product_url_box = gr.Textbox(label="Amazon Product URL", placeholder="Paste a full Amazon URL here...")
                run_scrape = gr.Button("Scrape", variant="primary")
                product_markdown = gr.Markdown(label="Product Details")
        # Wire the buttons to the async tool functions.
        run_search.click(fn=search_products, inputs=[query_box, result_limit], outputs=search_markdown)
        run_scrape.click(fn=scrape_product, inputs=[product_url_box], outputs=product_markdown)
    # mcp_server=True also exposes the functions as MCP tools;
    # share=True creates a public Gradio link.
    app.launch(mcp_server=True, share=True)