File size: 5,015 Bytes
5ff89f7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import requests
from bs4 import BeautifulSoup
import re
import random
import time
from difflib import SequenceMatcher


def extract_prefix_and_number(text):
    """Split out the first letters-then-digits token found in *text*.

    Searches for the leftmost run of ASCII letters that is immediately
    followed by a run of digits (for example ``"abc123"``).

    Returns:
        A ``(letters, digits)`` tuple of strings, or ``(None, None)`` when
        *text* contains no such token.
    """
    found = re.search(r'([A-Za-z]+)(\d+)', text)
    if found is None:
        return None, None
    letters, digits = found.groups()
    return letters, digits


def similarity(a, b):
    """Score how closely two product names match, gated on their model code.

    Both names are lower-cased and their first letters-then-digits token is
    extracted with ``extract_prefix_and_number``. The names are only compared
    when both contain such a token and the two tokens are identical.

    Args:
        a: First product name.
        b: Second product name.

    Returns:
        ``0`` when either name lacks a token or the tokens differ; otherwise
        the ``difflib.SequenceMatcher`` ratio of the two lower-cased names.
    """
    a_lower, b_lower = a.lower(), b.lower()
    a_code = extract_prefix_and_number(a_lower)
    b_code = extract_prefix_and_number(b_lower)

    # A missing token comes back as (None, None). When a token is found both
    # groups are non-empty, so checking the prefix alone is sufficient.
    if a_code[0] is None or b_code[0] is None:
        return 0

    # Equal tokens imply the number also occurs in the other name, so no
    # separate substring check is needed after this comparison.
    if a_code != b_code:
        return 0

    return SequenceMatcher(None, a_lower, b_lower).ratio()


def parse_amazon_page(content, product_name, your_cost, min_similarity=0.0):
    """Extract candidate offers from an Amazon search-results page.

    Looks at the first 20 ``s-search-result`` containers in *content*. A
    product is kept when its whole-unit price has the same number of digits
    as the whole-unit cost (or one more) and is strictly greater than that
    cost. One entry is emitted per ``<span>`` inside the product's title
    heading whose similarity to *product_name* reaches *min_similarity*.

    Args:
        content: HTML of the search-results page (str or bytes).
        product_name: Name the listing titles are compared against.
        your_cost: Own cost; an int, float, or numeric string. Only its
            whole-unit part is used.
        min_similarity: Minimum ``similarity`` score a title must reach.
            The default of ``0.0`` accepts every title, because
            ``similarity`` never returns a negative score.

    Returns:
        A list of ``(name, numeric_price, product_link, image_link)`` tuples.
        ``numeric_price`` is a digits-only string; the link fields are ``""``
        when the corresponding tag or attribute is absent.

    Raises:
        ValueError: If *your_cost* cannot be interpreted as a number.
    """
    # Local import so this block does not depend on a file-level import.
    from urllib.parse import urljoin

    # Use the whole-unit part of the cost for both the magnitude window and
    # the comparison; counting characters of e.g. "99.5" would include the
    # decimal point and fraction digits.
    try:
        cost_floor = int(your_cost)
    except ValueError:
        cost_floor = int(float(your_cost))
    price_digit_limit = len(str(cost_floor))
    allowed_lengths = (price_digit_limit, price_digit_limit + 1)

    soup = BeautifulSoup(content, 'html.parser')
    product_prices = []
    products = soup.find_all("div", attrs={"data-component-type": "s-search-result"})

    for product in products[:20]:
        title = product.find("h2", attrs={"class": "a-size-base-plus"})
        if title is None:
            continue

        # Price, link and image belong to the product, not to an individual
        # title span, so resolve them once per product.
        price_tag = product.find("span", attrs={"class": "a-price-whole"})
        if price_tag is None:
            continue

        # Drop thousands separators and any other non-digit characters.
        numeric_price = re.sub(r"[^\d]", "", price_tag.text.strip())
        if not numeric_price:
            continue

        if len(numeric_price) not in allowed_lengths:
            continue
        if int(numeric_price) <= cost_floor:
            continue

        product_link = ""
        link_tag = title.find_parent("a")
        if link_tag is not None and link_tag.has_attr("href"):
            # urljoin handles relative, absolute and scheme-relative hrefs.
            product_link = urljoin("https://www.amazon.eg", link_tag["href"])

        image_link = ""
        img_tag = product.find("img", attrs={"class": "s-image"})
        if img_tag is not None and img_tag.has_attr("src"):
            image_link = img_tag["src"]

        for span in title.find_all("span"):
            name = span.text.strip()
            if similarity(product_name, name) >= min_similarity:
                product_prices.append((name, numeric_price, product_link, image_link))

    if not product_prices:
        print("Warning: No valid prices found on Amazon.")

    return product_prices


def scrape_amazon(product_name, your_cost, queue, max_retries=3, retry_delay=3):
    """Fetch the Amazon.eg search page for a product and publish parsed offers.

    Requests the search-results page up to *max_retries* times. On the first
    HTTP 200 response the page is parsed with ``parse_amazon_page`` and
    ``("amazon", results)`` is put on *queue*. If every attempt fails, a
    single ``[]`` is put on *queue* instead, so exactly one item is enqueued
    per call.

    Args:
        product_name: Search term; also passed on to ``parse_amazon_page``.
        your_cost: Own cost, passed on to ``parse_amazon_page``.
        queue: Object with a ``put`` method that receives the outcome.
        max_retries: Maximum number of HTTP attempts.
        retry_delay: Seconds to sleep after each failed attempt.

    Returns:
        The list returned by ``parse_amazon_page`` on success, otherwise
        ``None``.
    """
    # Local import so this block does not depend on a file-level import.
    from urllib.parse import quote_plus

    # quote_plus turns spaces into '+' and also escapes characters such as
    # '&' or '#' that would otherwise corrupt the query string.
    url = f"https://www.amazon.eg/s?k={quote_plus(product_name)}&language=en"
    print(f"Fetching: {url}")

    headers = {
        "User-Agent": random.choice([
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.5481.77 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36"
        ]),
        "Accept-Language": "en-US,en;q=0.9",
        "Referer": "https://www.google.com/",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Connection": "keep-alive",
    }

    for attempt in range(max_retries):
        # Keep the try body to the one call that can raise RequestException.
        try:
            response = requests.get(url, headers=headers, timeout=10)
        except requests.exceptions.RequestException as e:
            print(f"An error occurred: {e}. Retrying in {retry_delay} seconds... (Attempt {attempt + 1}/{max_retries})")
            time.sleep(retry_delay)
            continue

        if response.status_code == 200:
            print("Page fetched successfully with status code: 200")

            results = parse_amazon_page(response.content, product_name, your_cost)
            queue.put(("amazon", results))

            print("Amazon results sent to queue")
            return results

        if response.status_code in [506, 503]:
            print(
                f"Error {response.status_code}. Retrying in {retry_delay} seconds... (Attempt {attempt + 1}/{max_retries})")
        else:
            print(f"Unexpected status code: {response.status_code}")

        # Nothing is enqueued here: the failure sentinel is sent once, after
        # the loop, so a later successful attempt is not preceded by a stray
        # empty result on the queue.
        time.sleep(retry_delay)

    print("Failed to fetch Amazon data after multiple attempts.")
    # NOTE(review): the failure payload is a bare list while the success
    # payload is an ("amazon", results) tuple; kept as-is for compatibility,
    # confirm the consumer handles both shapes.
    queue.put([])
    return None