Spaces:

abdallah-03
/

MarketPriceEstimation

Sleeping

App Files Files Community

MarketPriceEstimation / Amazon_scraper.py

abdallah-03

Upload 8 files

5ff89f7 verified 10 months ago

raw

history blame contribute delete

5.02 kB

	import requests
	from bs4 import BeautifulSoup
	import re
	import random
	import time
	from difflib import SequenceMatcher


	def extract_prefix_and_number(text):
	    """Return the first letters-then-digits token in *text* as two parts.

	    The text is searched for the first run of ASCII letters that is
	    immediately followed by a run of digits (for example the ``S21`` in
	    ``"Galaxy S21 Ultra"``).

	    Args:
	        text: String to search.

	    Returns:
	        A ``(prefix, number)`` tuple of strings, or ``(None, None)`` when
	        no letters are directly followed by digits anywhere in *text*.
	    """
	    match = re.search(r'([A-Za-z]+)(\d+)', text)
	    if match is None:
	        return None, None  # No valid match
	    return match.group(1), match.group(2)


	def similarity(a, b):
	    """Score how closely two product names match, gated on their model code.

	    Both names are lower-cased and their first letters-then-digits token is
	    extracted with ``extract_prefix_and_number``. Unless both names contain
	    such a token and the tokens are identical (same letter prefix and same
	    number), the score is zero. Otherwise the score is the
	    ``SequenceMatcher`` ratio of the two lower-cased names.

	    Args:
	        a: First product name.
	        b: Second product name.

	    Returns:
	        A float: ``0.0`` when the model tokens are missing or differ,
	        otherwise the ``SequenceMatcher`` ratio of the lower-cased names.
	    """
	    a_lower, b_lower = a.lower(), b.lower()
	    a_prefix, a_number = extract_prefix_and_number(a_lower)
	    b_prefix, b_number = extract_prefix_and_number(b_lower)

	    # The helper returns either two non-empty strings or (None, None), so
	    # checking the prefixes is enough to detect a missing token.
	    if a_prefix is None or b_prefix is None:
	        return 0.0

	    # Equal numbers already imply the number occurs in b_lower, so no
	    # separate substring check is needed.
	    if a_prefix != b_prefix or a_number != b_number:
	        return 0.0

	    return SequenceMatcher(None, a_lower, b_lower).ratio()


	def parse_amazon_page(content, product_name, your_cost, min_similarity=0.0, max_products=20):
	    """Extract candidate competitor prices from an Amazon search results page.

	    Each search-result ``div`` is inspected for a title, a link, an image
	    and a whole-number price. A price is kept only when it is strictly
	    greater than *your_cost* and has the same number of digits as the
	    integer part of *your_cost*, or one digit more.

	    Args:
	        content: HTML of the search results page (passed to BeautifulSoup).
	        product_name: Name the result titles are compared against.
	        your_cost: Cost used as the lower price bound; may be an int, a
	            float or a numeric string. Only its integer part is used.
	        min_similarity: Minimum ``similarity`` score a title must reach.
	            ``similarity`` never returns less than 0, so the default of
	            ``0.0`` accepts every title.
	        max_products: Maximum number of search results to inspect.

	    Returns:
	        A list of ``(name, price, product_link, image_link)`` tuples, where
	        ``price`` is a string of digits and the links are ``""`` when the
	        page does not provide them.
	    """
	    from urllib.parse import urljoin

	    soup = BeautifulSoup(content, 'html.parser')

	    # Use the integer part of the cost for both the lower bound and the
	    # digit count; formatting a float directly would also count the "."
	    # and the fractional digits.
	    cost_floor = int(float(your_cost))
	    cost_digits = len(str(abs(cost_floor)))
	    allowed_lengths = (cost_digits, cost_digits + 1)

	    product_prices = []
	    products = soup.find_all("div", attrs={"data-component-type": "s-search-result"})

	    for product in products[:max_products]:
	        title = product.find("h2", attrs={"class": "a-size-base-plus"})
	        if not title:
	            continue

	        price_tag = product.find("span", attrs={"class": "a-price-whole"})
	        if not price_tag:
	            continue

	        # Keep only the part before the decimal point, then drop thousands
	        # separators and any other non-digit characters.
	        whole_part = price_tag.text.strip().split('.')[0]
	        numeric_price = re.sub(r"[^\d]", "", whole_part)
	        if not numeric_price:
	            continue

	        if len(numeric_price) not in allowed_lengths or int(numeric_price) <= cost_floor:
	            continue

	        # Get product link
	        product_link = ""
	        link_tag = title.find_parent("a")
	        if link_tag and 'href' in link_tag.attrs:
	            # urljoin leaves absolute hrefs intact and resolves relative ones.
	            product_link = urljoin("https://www.amazon.eg", link_tag['href'])

	        # Get image link
	        image_link = ""
	        img_tag = product.find("img", attrs={"class": "s-image"})
	        if img_tag and 'src' in img_tag.attrs:
	            image_link = img_tag['src']

	        # One entry is recorded per <span> inside the title that passes the
	        # similarity threshold.
	        for span in title.find_all("span"):
	            name = span.text.strip()
	            if similarity(product_name, name) >= min_similarity:
	                product_prices.append((name, numeric_price, product_link, image_link))

	    if not product_prices:
	        print("Warning: No valid prices found on Amazon.")

	    return product_prices


	def scrape_amazon(product_name, your_cost, queue, max_retries=3, retry_delay=3):
	    """Fetch the amazon.eg search page for a product and publish parsed prices.

	    The search page is requested up to *max_retries* times. On the first
	    HTTP 200 response the page is parsed with ``parse_amazon_page`` and
	    ``("amazon", results)`` is put on *queue*. If no attempt succeeds, a
	    single empty list is put on *queue* instead. Exactly one item is put on
	    the queue per call.

	    Args:
	        product_name: Search term; also forwarded to ``parse_amazon_page``.
	        your_cost: Cost forwarded to ``parse_amazon_page``.
	        queue: Object with a ``put`` method that receives the outcome.
	        max_retries: Maximum number of HTTP attempts.
	        retry_delay: Seconds to wait between attempts.

	    Returns:
	        The list returned by ``parse_amazon_page`` on success, otherwise an
	        empty list.
	    """
	    from urllib.parse import quote_plus

	    # quote_plus turns spaces into "+" and also escapes characters such as
	    # "&", "#" and "+" that would otherwise corrupt the query string.
	    url = f"https://www.amazon.eg/s?k={quote_plus(product_name)}&language=en"
	    print(f"Fetching: {url}")

	    headers = {
	        "User-Agent": random.choice([
	            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.5481.77 Safari/537.36",
	            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36",
	            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36",
	        ]),
	        "Accept-Language": "en-US,en;q=0.9",
	        "Referer": "https://www.google.com/",
	        # "br" is not advertised: brotli responses are only decoded when an
	        # optional brotli package is installed.
	        "Accept-Encoding": "gzip, deflate",
	        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
	        "Connection": "keep-alive",
	    }

	    for attempt in range(max_retries):
	        try:
	            response = requests.get(url, headers=headers, timeout=10)
	        except requests.exceptions.RequestException as e:
	            print(f"An error occurred: {e}. Retrying in {retry_delay} seconds... (Attempt {attempt + 1}/{max_retries})")
	        else:
	            if response.status_code == 200:
	                print("Page fetched successfully with status code: 200")

	                results = parse_amazon_page(response.content, product_name, your_cost)
	                queue.put(("amazon", results))

	                print("Amazon results sent to queue")
	                return results

	            if response.status_code in (506, 503):
	                print(
	                    f"Error {response.status_code}. Retrying in {retry_delay} seconds... (Attempt {attempt + 1}/{max_retries})")
	            else:
	                # Nothing is queued here: the failure is reported once,
	                # after the loop, so a later successful attempt is not
	                # preceded by a stray empty message.
	                print(f"Unexpected status code: {response.status_code}")

	        # Wait before the next attempt, but not after the final one.
	        if attempt < max_retries - 1:
	            time.sleep(retry_delay)

	    print("Failed to fetch Amazon data after multiple attempts.")
	    # NOTE(review): the failure message is a bare list while the success
	    # message is an ("amazon", results) tuple - confirm the consumer
	    # handles both shapes.
	    queue.put([])  # Ensure the queue gets an empty list if all retries fail
	    return []