Spaces:

abdallah-03
/

MarketPriceEstimation

Sleeping

File size: 5,015 Bytes

5ff89f7

import random
import re
import time
from difflib import SequenceMatcher
from urllib.parse import quote_plus

import requests
from bs4 import BeautifulSoup


def extract_prefix_and_number(text):
    """Split the first letters-then-digits token found in *text*.

    The text is searched for the first run of ASCII letters that is
    immediately followed by a run of digits (for example ``"A52"`` inside
    ``"Galaxy A52 5G"``).

    Returns:
        A ``(prefix, number)`` pair of strings, or ``(None, None)`` when no
        such token exists in the text.
    """
    found = re.search(r'([A-Za-z]+)(\d+)', text)
    if found is None:
        # Nothing that looks like a model code in this text.
        return None, None
    prefix, number = found.groups()
    return prefix, number


def similarity(a, b):
    """Score how alike two product names are, gated on their model code.

    Both names are lower-cased, then the first letters+digits token of each
    is extracted with ``extract_prefix_and_number``. The names are only
    considered comparable when both tokens exist and are identical.

    Returns:
        ``0`` when either name has no letters+digits token or the tokens
        differ; otherwise the ``SequenceMatcher`` ratio of the two
        lower-cased names.
    """
    left, right = a.lower(), b.lower()
    left_prefix, left_number = extract_prefix_and_number(left)
    right_prefix, right_number = extract_prefix_and_number(right)

    # Every one of the four parts must be present to compare at all.
    if not (left_prefix and right_prefix and left_number and right_number):
        return 0

    # The model code (letters and digits) has to match exactly.
    if (left_prefix, left_number) != (right_prefix, right_number):
        return 0

    if left_number not in right:
        return 0

    return SequenceMatcher(None, left, right).ratio()


def parse_amazon_page(content, product_name, your_cost):
    """Extract candidate listings from an Amazon search-results page.

    Only the first 20 search results are inspected. A listing is kept when
    its whole-number price is strictly greater than the cost and has either
    the same number of digits as the cost or exactly one more.

    Args:
        content: HTML of the search-results page (as passed to
            ``BeautifulSoup``).
        product_name: Name of the product being searched for; compared with
            each listing title via ``similarity``.
        your_cost: The buyer's cost. May be an int, a float, or a numeric
            string; any fractional part is truncated before comparing.

    Returns:
        A list of ``(name, numeric_price, product_link, image_link)`` tuples,
        where ``numeric_price`` is a string of digits. One tuple is produced
        per ``<span>`` inside a listing's title. The list is empty (and a
        warning is printed) when nothing qualifies.

    Raises:
        ValueError: If ``your_cost`` cannot be interpreted as a number.
    """
    soup = BeautifulSoup(content, 'html.parser')

    # Reduce the cost to its integer part once. Counting characters of the
    # raw value would include the decimal point and fraction digits of a
    # float such as 1500.0 and skew the digit-length filter below.
    try:
        cost_floor = int(your_cost)
    except ValueError:
        # Decimal strings such as "1500.50" are not accepted by int().
        cost_floor = int(float(your_cost))
    price_digit_limit = len(str(abs(cost_floor)))
    accepted_lengths = (price_digit_limit, price_digit_limit + 1)

    product_prices = []
    products = soup.find_all("div", attrs={"data-component-type": "s-search-result"})

    for product in products[:20]:
        title = product.find("h2", attrs={"class": "a-size-base-plus"})
        if not title:
            continue

        # Price, link and image belong to the listing, not to an individual
        # title span, so resolve them once per listing.
        price_tag = product.find("span", attrs={"class": "a-price-whole"})
        if not price_tag:
            continue

        # Strip separators and any other non-digit characters.
        numeric_price = re.sub(r"[^\d]", "", price_tag.text.strip())
        if not numeric_price:
            continue

        if len(numeric_price) not in accepted_lengths or int(numeric_price) <= cost_floor:
            continue

        product_link = ""
        link_tag = title.find_parent("a")
        if link_tag and 'href' in link_tag.attrs:
            product_link = "https://www.amazon.eg" + link_tag['href']

        image_link = ""
        img_tag = product.find("img", attrs={"class": "s-image"})
        if img_tag and 'src' in img_tag.attrs:
            image_link = img_tag['src']

        for span in title.find_all("span"):
            name = span.text.strip()
            # NOTE(review): similarity() never returns a negative value, so
            # this threshold currently admits every title. Kept as-is to
            # preserve existing results; raise it to actually filter by name.
            if similarity(product_name, name) >= 0.0:
                product_prices.append((name, numeric_price, product_link, image_link))

    if not product_prices:
        print("Warning: No valid prices found on Amazon.")

    return product_prices


def scrape_amazon(product_name, your_cost, queue, max_retries=3, retry_delay=3):
    """Fetch Amazon.eg search results for a product and publish them to a queue.

    The search page is requested up to ``max_retries`` times. Any request
    error or non-200 response is followed by a ``retry_delay`` second pause
    before the next attempt.

    Exactly one message is put on ``queue`` per call:
    ``("amazon", results)`` on success, or ``[]`` when every attempt failed.

    Args:
        product_name: Search term; URL-encoded before being placed in the
            query string.
        your_cost: Cost forwarded to ``parse_amazon_page`` for price
            filtering.
        queue: Object with a ``put`` method that receives the outcome.
        max_retries: Maximum number of fetch attempts.
        retry_delay: Seconds to wait after a failed attempt.

    Returns:
        The list returned by ``parse_amazon_page`` on success, otherwise
        ``None``.
    """
    # quote_plus escapes '&', '#', '+', non-ASCII text, etc.; replacing only
    # spaces would let such characters corrupt the query string.
    url = f"https://www.amazon.eg/s?k={quote_plus(product_name)}&language=en"
    print(f"Fetching: {url}")

    headers = {
        "User-Agent": random.choice([
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.5481.77 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36"
        ]),
        "Accept-Language": "en-US,en;q=0.9",
        "Referer": "https://www.google.com/",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Connection": "keep-alive",
    }

    for attempt in range(max_retries):
        # Keep the try body to the one call that can raise RequestException.
        try:
            response = requests.get(url, headers=headers, timeout=10)
        except requests.exceptions.RequestException as e:
            print(f"An error occurred: {e}. Retrying in {retry_delay} seconds... (Attempt {attempt + 1}/{max_retries})")
            time.sleep(retry_delay)
            continue

        if response.status_code == 200:
            print("Page fetched successfully with status code: 200")

            results = parse_amazon_page(response.content, product_name, your_cost)
            queue.put(("amazon", results))

            print("Amazon results sent to queue")
            return results

        if response.status_code in (506, 503):
            print(
                f"Error {response.status_code}. Retrying in {retry_delay} seconds... (Attempt {attempt + 1}/{max_retries})")
        else:
            # Do not publish a failure here: the single failure message is
            # sent after the loop, so consumers never see duplicates.
            print(f"Unexpected status code: {response.status_code}")
        time.sleep(retry_delay)

    print("Failed to fetch Amazon data after multiple attempts.")
    queue.put([])  # Ensure the queue gets an empty list if all retries fail
    return None