# amazon_scraper.py
# This is an Amazon products scraper compatible with Gradio.
# It can be run as a standalone Gradio app or its functions can be loaded as tools.

import httpx
import re
import gradio as gr
from bs4 import BeautifulSoup
from urllib.parse import quote_plus, urlparse
from typing import List, Dict

# --- Helper Functions for Web Scraping ---


async def fetch_amazon_page(url: str) -> str:
    """Fetch an Amazon page and return its raw HTML.

    Browser-like headers are sent because Amazon tends to serve a
    CAPTCHA/denial page to clients that identify as bots.

    Args:
        url: Fully-qualified URL of the page to fetch.

    Returns:
        The response body as text.

    Raises:
        httpx.HTTPStatusError: If Amazon responds with a non-2xx status.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
    }
    async with httpx.AsyncClient() as client:
        response = await client.get(url, headers=headers, timeout=15.0)
        response.raise_for_status()
        return response.text


def clean_price(price_text: str) -> str:
    """Clean and extract the numerical price from a string.

    Args:
        price_text: Raw text scraped from a price element (may be empty).

    Returns:
        The first currency-symbol-plus-digits token found (e.g. "$19.99"),
        or "Price not available" when nothing price-like is present.
    """
    if not price_text:
        return "Price not available"
    # First occurrence of an (optional) currency symbol followed by digits
    # and thousands/decimal separators.
    match = re.search(r'([\$\£\€]?\d[\d,.]*)', price_text)
    if match:
        # FIX: the greedy separator class can capture a dangling ',' or '.'
        # (e.g. "1,234," out of "1,234, was ..."); strip trailing separators.
        return match.group(1).rstrip('.,')
    return "Price not available"


def _first_match(soup: BeautifulSoup, selectors: List[str]):
    """Return the first element matched by any selector in order, else None."""
    for selector in selectors:
        elem = soup.select_one(selector)
        if elem:
            return elem
    return None


def extract_product_data(html_content: str, url: str) -> dict:
    """Extract product information from Amazon product-page HTML.

    Each field is probed with several CSS selectors (Amazon's markup varies
    by category/layout); the first selector that matches wins. Fields whose
    selectors all miss keep a human-readable "not found" placeholder.

    Args:
        html_content: Raw HTML of an Amazon product page.
        url: The page URL, echoed back in the result.

    Returns:
        Dict with keys: name, price, image_url, rating, reviews_count,
        availability, description, url — plus 'error' if parsing blew up.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # Placeholder values returned for any field that cannot be scraped.
    product_data = {
        'name': 'Product name not found',
        'price': 'Price not available',
        'image_url': 'Image not found',
        'rating': 'Rating not available',
        'reviews_count': 'Reviews not available',
        'availability': 'Availability not found',
        'description': 'Description not available',
        'url': url
    }

    try:
        # Extract product name
        name_elem = _first_match(soup, [
            '#productTitle',
            'h1.a-size-large',
            '.a-size-large.product-title-word-break',
            'h1[data-automation-id="product-title"]'
        ])
        if name_elem:
            product_data['name'] = name_elem.get_text().strip()

        # Extract price
        price_elem = _first_match(soup, [
            '.a-price-whole',
            '.a-price .a-offscreen',
            '.a-price-range .a-price-range-min .a-offscreen',
            '.a-price .a-price-symbol + span',
            '[data-a-color="price"] .a-offscreen'
        ])
        if price_elem:
            product_data['price'] = clean_price(price_elem.get_text())

        # Extract image URL
        img_elem = _first_match(soup, [
            '#landingImage',
            '#imgBlkFront',
            '.a-dynamic-image',
            '[data-old-hires]'
        ])
        if img_elem:
            img_url = img_elem.get('src') or img_elem.get('data-old-hires')
            if img_url:
                # Protocol-relative URLs ("//m.media-amazon.com/...") need a scheme.
                if img_url.startswith('//'):
                    img_url = 'https:' + img_url
                product_data['image_url'] = img_url

        # Extract rating (e.g. "4.5 out of 5 stars" -> "4.5 out of 5")
        rating_elem = _first_match(soup, [
            '.a-icon-alt',
            '[data-hook="rating-out-of-text"]',
            '.a-icon-star-small .a-icon-alt'
        ])
        if rating_elem:
            rating_match = re.search(r'(\d+\.?\d*)', rating_elem.get_text())
            if rating_match:
                product_data['rating'] = f"{rating_match.group(1)} out of 5"

        # Extract reviews count (keeps thousands separators, e.g. "1,234")
        reviews_elem = _first_match(soup, [
            '#acrCustomerReviewText',
            '[data-hook="total-review-count"]',
            '.a-size-base.s-underline-text'
        ])
        if reviews_elem:
            reviews_match = re.search(r'(\d+(?:,\d+)*)', reviews_elem.get_text())
            if reviews_match:
                product_data['reviews_count'] = f"{reviews_match.group(1)} reviews"

        # Extract availability
        avail_elem = _first_match(soup, [
            '#availability .a-size-medium',
            '#availability span',
            '.a-size-medium.a-color-success'
        ])
        if avail_elem:
            product_data['availability'] = avail_elem.get_text().strip()

        # Extract description
        desc_elem = _first_match(soup, [
            '#productDescription p',
            '#feature-bullets .a-list-item',
            '.a-expander-content p'
        ])
        if desc_elem:
            product_data['description'] = desc_elem.get_text().strip()

    except Exception as e:
        # Best-effort scraper: report the failure in-band rather than raising,
        # so partially-scraped fields are still returned.
        product_data['error'] = f"Error parsing product data: {str(e)}"

    return product_data


def extract_search_results(html_content: str, max_results: int) -> list:
    """Extract product information from an Amazon search-results page.

    Args:
        html_content: Raw HTML of an Amazon search results page.
        max_results: Maximum number of products to return.

    Returns:
        A list of dicts with keys: name, price, image_url, rating, url.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    products = []

    # FIX: Gradio's gr.Number delivers a float; slicing with a float raises
    # TypeError. Coerce defensively.
    max_results = int(max_results)

    # Each organic result is wrapped in this data-component container.
    product_containers = soup.select('[data-component-type="s-search-result"]')

    for container in product_containers[:max_results]:
        try:
            product = {
                'name': 'Product name not found',
                'price': 'Price not available',
                'image_url': 'Image not found',
                'rating': 'Rating not available',
                'url': 'URL not found'
            }

            # Extract product name
            name_elem = container.select_one('a h2 span')
            if name_elem:
                product['name'] = name_elem.get_text().strip()

            # Extract product URL (result links are site-relative)
            url_elem = container.select_one('a')
            if url_elem:
                product_url = url_elem.get('href')
                if product_url:
                    if product_url.startswith('/'):
                        product_url = 'https://www.amazon.com' + product_url
                    product['url'] = product_url

            # Extract price
            price_elem = container.select_one('.a-price-whole')
            if price_elem:
                product['price'] = clean_price(price_elem.get_text())

            # Extract image
            img_elem = container.select_one('img.s-image')
            if img_elem:
                img_url = img_elem.get('src')
                if img_url:
                    product['image_url'] = img_url

            # Extract rating
            rating_elem = container.select_one('.a-icon-alt')
            if rating_elem:
                rating_match = re.search(r'(\d+\.?\d*)', rating_elem.get_text())
                if rating_match:
                    product['rating'] = f"{rating_match.group(1)} out of 5"

            products.append(product)
        except Exception as e:
            # Skip malformed result cards rather than aborting the whole page.
            print(f"Error extracting product data: {str(e)}")

    return products


# --- Formatting Functions for Display ---


def format_product_details(product: dict) -> str:
    """Format a single product's details into a Markdown string."""
    return (
        f"## {product.get('name', 'N/A')}\n"
        f"**Price:** {product.get('price', 'N/A')}\n\n"
        f"![Product Image]({product.get('image_url', '')})\n\n"
        f"**URL:** {product.get('url', 'N/A')}"
    )


def format_search_results(products: list, query: str) -> str:
    """Format a list of search results into a single Markdown string."""
    if not products:
        return f"No products found for '{query}'."
    result = f"# Search Results for '{query}'\n\n---\n\n"
    for product in products:
        result += (
            f"### {product.get('name', 'N/A')}\n"
            f"**Price:** {product.get('price', 'N/A')}\n"
            f"**URL:** <{product.get('url', 'N/A')}>\n\n---\n\n"
        )
    return result


# --- Gradio Tool Functions ---


async def scrape_product(product_url: str) -> str:
    """
    Scrapes product information from a single Amazon product URL.

    Args:
        product_url: The full URL of the Amazon product page.

    Returns:
        A Markdown formatted string with the product's name, price, image, and URL.
    """
    try:
        parsed_url = urlparse(product_url)
        if 'amazon' not in parsed_url.netloc:
            return "Error: Please provide a valid Amazon product URL."
        html_content = await fetch_amazon_page(product_url)
        product_data = extract_product_data(html_content, product_url)
        return format_product_details(product_data)
    except httpx.HTTPStatusError as e:
        return f"HTTP Error: {e.response.status_code}. Amazon may have blocked the request."
    except Exception as e:
        return f"An error occurred: {str(e)}"


async def search_products(query: str, max_results: int = 5) -> str:
    """
    Searches for products on Amazon and returns a list of results.

    Args:
        query: The search term (e.g., "laptop stand").
        max_results: The maximum number of results to return.

    Returns:
        A Markdown formatted string with the search results.
    """
    try:
        # FIX: properly percent-encode the query — a bare replace(' ', '+')
        # breaks for queries containing '&', '#', '%', etc.
        search_url = f"https://www.amazon.com/s?k={quote_plus(query)}"
        html_content = await fetch_amazon_page(search_url)
        products = extract_search_results(html_content, max_results)
        return format_search_results(products, query)
    except Exception as e:
        return f"An error occurred during search: {str(e)}"


# --- Gradio Interface (for standalone execution) ---

if __name__ == "__main__":
    print("Starting Amazon Scraper Gradio App...")

    with gr.Blocks(theme=gr.themes.Soft(), title="Amazon Scraper") as demo:
        gr.Markdown("# 🤖 Amazon Product Scraper")
        gr.Markdown("Use the tools below to search for products or scrape a specific product URL.")

        with gr.Tabs():
            with gr.TabItem("Search Products"):
                with gr.Row():
                    search_query_input = gr.Textbox(label="Search Query", placeholder="e.g., mechanical keyboard")
                    # precision=0 makes gr.Number deliver an int, matching
                    # the search_products(max_results: int) signature.
                    max_results_input = gr.Number(label="Max Results", value=5, step=1, minimum=1, maximum=20, precision=0)
                search_button = gr.Button("Search", variant="primary")
                search_output = gr.Markdown(label="Search Results")

            with gr.TabItem("Scrape Product by URL"):
                url_input = gr.Textbox(label="Amazon Product URL", placeholder="Paste a full Amazon URL here...")
                scrape_button = gr.Button("Scrape", variant="primary")
                scrape_output = gr.Markdown(label="Product Details")

        # Gradio runs async handlers on its event loop directly.
        search_button.click(
            fn=search_products,
            inputs=[search_query_input, max_results_input],
            outputs=search_output
        )
        scrape_button.click(
            fn=scrape_product,
            inputs=[url_input],
            outputs=scrape_output
        )

    demo.launch(mcp_server=True, share=True)