# MarketPriceEstimation / Amazon_scraper.py
# abdallah-03's picture
# Upload 8 files
# 5ff89f7 verified
import requests
from bs4 import BeautifulSoup
import re
import random
import time
from difflib import SequenceMatcher
def extract_prefix_and_number(text):
    """Split the first letters-then-digits token found in *text*.

    The text is scanned for the first run of ASCII letters that is
    immediately followed by a run of digits (for example ``"rtx3060"``).

    Args:
        text: The string to search.

    Returns:
        A ``(prefix, number)`` tuple of strings holding the letter run and
        the digit run, or ``(None, None)`` when *text* contains no such token.
    """
    found = re.search(r'([A-Za-z]+)(\d+)', text)
    if found is None:
        return None, None  # No valid match
    prefix, number = found.groups()
    return prefix, number
def similarity(a, b):
    """Score how closely product name *b* matches product name *a*.

    Both names are lower-cased, then each is reduced to its first model
    token (letters immediately followed by digits) with
    ``extract_prefix_and_number``. The names are only compared when both
    tokens exist and are identical.

    Args:
        a: Reference product name.
        b: Candidate product name.

    Returns:
        ``0`` when either name has no model token or the two tokens differ;
        otherwise the ``SequenceMatcher`` ratio of the two lower-cased
        names, a float between 0 and 1.
    """
    a_lower, b_lower = a.lower(), b.lower()
    a_prefix, a_number = extract_prefix_and_number(a_lower)
    b_prefix, b_number = extract_prefix_and_number(b_lower)

    # A missing token is reported as (None, None) by the helper.
    if not a_prefix or not b_prefix or not a_number or not b_number:
        return 0

    # Identical tokens already guarantee that a's number occurs in b, so no
    # separate substring check is needed after this point.
    if (a_prefix != b_prefix) or (a_number != b_number):
        return 0

    return SequenceMatcher(None, a_lower, b_lower).ratio()
def parse_amazon_page(content, product_name, your_cost, min_similarity=0.0):
    """Extract candidate competitor prices from an Amazon search-result page.

    Only the first 20 ``s-search-result`` containers are inspected. A result
    is kept when it has a title heading, a whole-number price whose digit
    count equals that of *your_cost* (or is one digit longer), and a price
    strictly greater than *your_cost*. One entry is emitted per ``<span>``
    inside the title heading whose text scores at least *min_similarity*
    against *product_name*.

    Args:
        content: Raw HTML of the search-result page.
        product_name: Name of the product being priced.
        your_cost: Cost to compare against; an int, a float, or a numeric
            string. Any fractional part is discarded.
        min_similarity: Minimum ``similarity`` score a title must reach.
            ``similarity`` never returns less than 0, so the default of
            ``0.0`` accepts every title.

    Returns:
        A list of ``(name, price, product_link, image_link)`` tuples, where
        ``price`` is a string of digits and the links are ``""`` when they
        are not present in the markup. The list is empty (and a warning is
        printed) when nothing qualifies.

    Raises:
        ValueError: If *your_cost* is a string that is not numeric.
    """
    from urllib.parse import urljoin

    soup = BeautifulSoup(content, 'html.parser')

    # Normalise the cost once so that 1500, 1500.0 and "1500.5" all count as
    # four digits; measuring the formatted value directly would also count
    # the decimal point and the fractional digits.
    try:
        cost_value = int(your_cost)
    except ValueError:
        cost_value = int(float(your_cost))
    price_digit_limit = len(str(cost_value))

    product_prices = []
    products = soup.find_all("div", attrs={"data-component-type": "s-search-result"})
    for product in products[:20]:
        title = product.find("h2", attrs={"class": "a-size-base-plus"})
        if title is None:
            continue

        # Price, link and image belong to the whole result, so they are
        # resolved once here rather than once per title span.
        price_tag = product.find("span", attrs={"class": "a-price-whole"})
        if price_tag is None:
            continue

        # Keep digits only (drops thousands separators and the trailing dot).
        numeric_price = re.sub(r"[^\d]", "", price_tag.text.strip())
        if not numeric_price:
            continue

        # Accept prices with as many digits as the cost, or one more.
        if len(numeric_price) not in (price_digit_limit, price_digit_limit + 1):
            continue
        if int(numeric_price) <= cost_value:
            continue

        # Get product link
        product_link = ""
        link_tag = title.find_parent("a")
        if link_tag is not None and 'href' in link_tag.attrs:
            # urljoin copes with both site-relative and absolute hrefs.
            product_link = urljoin("https://www.amazon.eg", link_tag['href'])

        # Get image link
        image_link = ""
        img_tag = product.find("img", attrs={"class": "s-image"})
        if img_tag is not None and 'src' in img_tag.attrs:
            image_link = img_tag['src']

        for span in title.find_all("span"):
            name = span.text.strip()
            if similarity(product_name, name) >= min_similarity:
                product_prices.append((name, numeric_price, product_link, image_link))

    if not product_prices:
        print("Warning: No valid prices found on Amazon.")
    return product_prices
def scrape_amazon(product_name, your_cost, queue, max_retries=3, retry_delay=3):
    """Fetch the amazon.eg search page for a product and queue parsed prices.

    The search page is requested up to *max_retries* times. Network errors
    and HTTP 503/506 responses are retried after *retry_delay* seconds. Any
    other non-200 status ends the scrape immediately. Exactly one item is
    put on *queue* per call.

    Args:
        product_name: Search term; it is URL-encoded before being sent.
        your_cost: Cost forwarded to ``parse_amazon_page``.
        queue: Object with a ``put`` method that receives the outcome.
        max_retries: Maximum number of fetch attempts.
        retry_delay: Seconds to sleep after a retryable failure.

    Returns:
        The list produced by ``parse_amazon_page`` on success, otherwise
        ``None``.
    """
    from urllib.parse import quote_plus

    # quote_plus still turns spaces into '+', and additionally escapes
    # characters such as '&', '#' and '+' that would corrupt the query.
    url = f"https://www.amazon.eg/s?k={quote_plus(product_name)}&language=en"
    print(f"Fetching: {url}")

    headers = {
        "User-Agent": random.choice([
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.5481.77 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36"
        ]),
        "Accept-Language": "en-US,en;q=0.9",
        "Referer": "https://www.google.com/",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Connection": "keep-alive",
    }

    for attempt in range(max_retries):
        try:
            response = requests.get(url, headers=headers, timeout=10)
        except requests.exceptions.RequestException as e:
            print(f"An error occurred: {e}. Retrying in {retry_delay} seconds... (Attempt {attempt + 1}/{max_retries})")
            time.sleep(retry_delay)
            continue

        # NOTE(review): 506 is an unusual status to retry on; possibly 500
        # or 502 was intended -- confirm before changing.
        if response.status_code in [506, 503]:
            print(
                f"Error {response.status_code}. Retrying in {retry_delay} seconds... (Attempt {attempt + 1}/{max_retries})")
            time.sleep(retry_delay)
            continue

        if response.status_code == 200:
            print("Page fetched successfully with status code: 200")
            results = parse_amazon_page(response.content, product_name, your_cost)
            queue.put(("amazon", results))
            print("Amazon results sent to queue")
            return results

        # Any other status is treated as final. Returning here keeps the
        # queue at one item per call instead of one empty list per attempt.
        print(f"Unexpected status code: {response.status_code}")
        queue.put([])
        return None

    print("Failed to fetch Amazon data after multiple attempts.")
    # NOTE(review): failures enqueue a bare list while success enqueues an
    # ("amazon", results) tuple -- confirm the consumer handles both shapes.
    queue.put([])  # Ensure the queue gets an empty list if all retries fail
    return None