# Source: Hugging Face Space "mumer119131/shops-scraper" (Space status when captured: Runtime error; file size: 13,736 bytes).

import requests
from bs4 import BeautifulSoup
import re
import csv
import json
import time
from undetected_chromedriver import Chrome, ChromeOptions
import psycopg2
import datetime
from httpx_socks import SyncProxyTransport
import httpx

class WallmartScraper:
    def __init__(self):
        self.ac = "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"
        self.headers = {
            'Host': 'www.walmart.ca',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0',
            'Accept': '*/*',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate, br',
            'Content-Type': 'application/json',
            'Connection': 'keep-alive'
        }
        self.cookies = {
            'walmart.shippingPostalCode':'P7B3Z7',
            'defaultNearestStoreId':'3124',
            'zone':"9",
            'deliveryCatchment':"3124",
            'walmart.csrf':'73996cac34766ec995777784',
            'wmt.c':"0",
            'vtc':'ZAUFmHNTbFPrWyLrN8WTXA',
            'userSegment':'50-percent',
            'TBV':"7",
            'rxVisitor':'1590552903550G5KJVCBIUCN3R32E3OSSVIKN9FTMDI5M',
            'dtSa':'-',
            '_ga':'GA1.2.1363574403.1590552905',
            '_gid':'GA1.2.85728116.1590552905',
            'walmart.id':'24be2423-225b-44d0-851c-9f83c8e47dff',
            'usrState':"1",
            'walmart.nearestPostalCode':'P7B3Z7',
            's_ecid':'MCMID%7C17236695788713957075642593017320325404',
            'walmart.locale':'en',
            'AMCVS_C4C6370453309C960A490D44%40AdobeOrg':"1",
            's_visit':"1",
            's_cc':'true',
            'og_session_id':'af0a84f8847311e3b233bc764e1107f2.616221.1590552906',
            'og_session_id_conf':'af0a84f8847311e3b233bc764e1107f2.616221.1590552906',
            '_gcl_au':'1.1.482108716.1590552907',
            '_fbp':'fb.1.1590552907225.702607671',
            'og_autoship':"0",
            'dtCookie':'3$1GS1LRIIKIBM595EBN2HIHIPCU4QVQ3H|5b6d58542e634882|0',
            'walmart.nearestLatLng':"48.4120872,-89.2413988",
            'dtLatC':"3",
            'rxVisitor':'1590552903550G5KJVCBIUCN3R32E3OSSVIKN9FTMDI5M',
            'dtSa':'-',
            'DYN_USER_ID':'23c3e447-cab5-4a76-beec-86d431f09b30',
            'WM_SEC.AUTH_TOKEN':'MTAyOTYyMDE46M9ya4OWOAX9Ycj9G+/EtZZ2rrXYDwJUPMuf8aNPxGq6es3kBtQx/WxiXKAkaKfkoKbMqixeQFrYdB1W0oSN1wIIzkNIxIEmVq7cOUtRuTRSgSwdxAsAWBT8plmFWLKwj8OFN4dileb20bpDLeCIlSFd/Hsc7bnSe4+TLU2zbj06SQbscc1R1tIesXl4ioL4y1NvN1BBj6GkfAZCjCfhDTASAGkrw9upmzYhCz4UwRzb/SoGFgAYL9DGZ8K45WCXb/Ew67/GsLtdlJHpe1JgEG+jVJ7bQ3VTYSMGmHEYCS8c8IAFKTMeYOPXxSWUpSrKtEbQ9hG+J0B2+kHzA8jyKD+vhACQYbIqsOCISVNY3spUIeGCIOmGJLznpUXbYF3gVk3LktwueMY7RuHPZ68PyA==',
            'LT':'1590553091850',
            'BVImplmain_site':"2036",
            'BVBRANDID':'20ae010b-0053-4a9f-902a-9197d72dc542',
            'DYN_USER_ID.ro':'23c3e447-cab5-4a76-beec-86d431f09b30',
            'cartId':'b6eb398f-ed49-46e8-8034-af8da418dd90',
            'NEXT_GEN.ENABLED':"1",
            '_pin_unauth':'NTY4YjUyZDctYzNmOC00NzA5LWExOTYtOWQxOWZlOWVkYjFi',
            'TS011fb5f6':'01c5a4e2f941ffc623122b68eca74f3a27e0c416f7e2a5707b9417a73c048cb4be6507e9fd51df79c8015b3ba420dc6643bb0f8309',
            'TS0175e29f':'01c5a4e2f941ffc623122b68eca74f3a27e0c416f7e2a5707b9417a73c048cb4be6507e9fd51df79c8015b3ba420dc6643bb0f8309',
            'authDuration':'{"lat":"1590555466230000","lt":"1590555466230000"}',
            'headerType':'grocery',
            's_sq':'%5B%5BB%5D%5D',
            'previousBreakpoint':'desktop',
            'wmt.breakpoint':'d',
            'akaau_P1':'1590607795~id=484ae7f711ac9dd38dbda655bd6ca764',
            'TS01f4281b':'01c5a4e2f97a7d51551a734ebe2cb1fc4f7a86c4df28824fb5812f83c96f6df870698b389077cf6f5fd822d05324df82b802c7ad04',
            '_uetsid':'2127b16a-c523-20a6-d801-43923775d65e',
            '_derived_epik':'dj0yJnU9NC1yUFlPMF9IczhrTlFabmZpYWVTQ0NMZFl5blN2eEMmbj1wX2o0OFVpeUZLWjRUcGM3Rl9xaGFnJm09MSZ0PUFBQUFBRjdPdXpJJnJtPTEmcnQ9QUFBQUFGN091ekk',
            'dtPC':'3$6637950_447h-vAKCBSUJVQJIVFIAUKQCIVTJULXFWHTFQ-0',
            'rxvt':'1590610238206|1590608438206',
            's_gnr':'1590609571427-Repeat',
            'AMCV_C4C6370453309C960A490D44%40AdobeOrg':'-408604571%7CMCIDTS%7C18410%7CMCMID%7C17236695788713957075642593017320325404%7CMCAID%7CNONE%7CMCOPTOUT-1590616771s%7CNONE%7CvVersion%7C4.6.0',
            '_4c_':'rVJNbxoxEP0rkQ85sbv%2BXHuRooqkUdWqSZQmVY%2FIeL1gZWGRbdimEf89YyCQNqnUQzmYnfF7M5437wn1M7tAQyIqXOJKKEIIH6AH%2BxjQ8AmZZTrX6Vj5Fg3RLMZlGBZF3%2Fd5r9u59jE3urCLYuo7Y%2F1j0fiViyFb26mNetLasM8U1xlTgqIBMl1toRSpcpULiOMviDKGMXwvfVevTBzHx2XC9HZyEuoHuKjt2hk77l0dZ1syxcfszLrpLKY0Vtv00kOAc5nK925Rd%2F2BSUWJj9kjk%2FH0tonv%2BmAT%2B2Lmu7k9UQSyHYiBfmwZAUJvG%2Bv9FvU%2F9Agubmc90Pc5WAKkIbi5uv82Pr8cXdxcv2rZzRcurrzNQmhf954UIRT93Bm90LVOghak%2BHKX0ZziHGdfR3eqCIxgQZVUmNCSVx9Gt%2Bdn5HTu6jMiKSvLSkilJGHwj6UoORUVw0QyihkVHPPT0e3lWVJmCd5ASeW2M7pNY4CbBujTaPz988etrCUTknPM8mQxISgcLyNdXeww%2F9QSSPfeTafWX9k462og3ntdu%2Bi6hW7T0sHGYIhGr9qYwrRV0%2BoQnKlteIjdEm0G6Ofe61AcWjEJ9otgbFVynH6A8K7emx5Z1kwaymRGqagzXpY001KxTHJdlsLYpqyTCLuailFVYYkrstnpsq0hji1ZxSXH6p2WO9f9nVPxtxyYdA9nb%2BDiHfjiZaijRId3w7OBVQLMvaD0H%2FeCKZXE6feAw4USQv0OTRmArg%2B1aNVII7XMYPgq48aYTBtNM6Mbogzs2AqBjkNgxWGOqtwPQdRuhs3mGQ%3D%3D'
        }
        self.session = self.create_session()
        self.categories = [
            'personal care', 'beauty', 'health'
        ]
        # options = ChromeOptions()
        # options.add_argument("--headless")
        # options.add_argument("--disable-gpu")
        # options.add_argument("--no-sandbox")
        # self.driver = Chrome(options=options)
        # self.generate_session()

    # def generate_session(self):

    #     self.driver.get("https://www.walmart.com/")
    #     print(self.driver.get_cookies())
    #     cookies = self.driver.get_cookies()
    #     [self.session.cookies.set(cookie['name'], cookie['value']) for cookie in cookies]

    def create_session(self, proxy_url=None):
        """Build a new ``httpx.Client`` whose traffic is routed through a proxy.

        Args:
            proxy_url: Optional proxy URL passed to
                ``SyncProxyTransport.from_url``. When omitted, the built-in
                iproyal endpoint is used, which is what every existing caller
                gets.

        Returns:
            A fresh ``httpx.Client`` bound to the proxy transport. The caller
            owns it and is responsible for closing it.
        """
        if proxy_url is None:
            # SECURITY(review): proxy credentials are committed in source.
            # Rotate them and pass the URL in via ``proxy_url`` instead.
            proxy_url = "http://W4a8IruR4dkhNGb6:Hesj0mkBfnJ1n95M_country-us@geo.iproyal.com:12321"
        transport = SyncProxyTransport.from_url(proxy_url)
        return httpx.Client(transport=transport)
    
    def get_product_detail(self, url):
        """Fetch a product page and extract its name, item id and ingredients.

        The data is read from the JSON held in the page's
        ``<script id="__NEXT_DATA__">`` tag. The fetch is tried once on the
        current session and, if anything goes wrong, once more on a fresh
        session from ``create_session``.

        Args:
            url: Absolute URL of the product page.

        Returns:
            ``[product_name, product_id, ingredients]`` where ``ingredients``
            is ``''`` when the page carries none, or ``False`` when the page
            could not be fetched/parsed or has no product name / ``usItemId``.
        """
        for attempt in range(2):
            try:
                if attempt:
                    # Second try: drop the old client (releasing its
                    # connections) and start over on a new proxy session.
                    self.session.close()
                    self.session = self.create_session()
                response = self.session.get(
                    url, headers=self.headers, cookies=self.cookies)
                soup = BeautifulSoup(response.text, 'html.parser')
                # ``find`` returns None when the tag is missing; the resulting
                # AttributeError is treated like any other failed attempt.
                script_tag = soup.find('script', {'id': '__NEXT_DATA__'}).text
                # Malformed JSON is handled here too instead of escaping to
                # the caller.
                script_tag_json = json.loads(script_tag)
            except Exception as e:
                # Best-effort scraping: report and retry, but let
                # KeyboardInterrupt / SystemExit through (no bare ``except``).
                print(e)
            else:
                break
        else:
            return False

        try:
            data = script_tag_json['props']['pageProps']['initialData']['data']
            product_name = data['product']['name']
            product_id = data['product']['usItemId']
        except (KeyError, TypeError, IndexError):
            return False

        try:
            ingridents = data['idml']['ingredients']['ingredients']['value']
        except (KeyError, TypeError, IndexError):
            # Ingredients are optional; an absent path yields an empty string.
            ingridents = ''

        return [product_name, product_id, ingridents]

    def get_no_of_pages_of_sub_category(self, url):
        """Return how many listing pages to crawl for a sub-category.

        The count is currently fixed; ``url`` is accepted but never fetched.
        The earlier approach (requesting the page and reading the
        ``"maxPage"`` number out of the response text) is disabled.
        """
        fixed_page_count = 25
        return fixed_page_count

    def get_category_browse_urls(self, category):
        """Crawl every browse (sub-category) link found on the category page.

        Requests the ``/cp/health/976760`` landing page with ``q=category``,
        collects the anchors whose ``href`` matches the browse-URL pattern and
        hands each one to ``get_product_urls``. A failure inside one
        sub-category is printed and does not stop the others.

        Args:
            category: Search term sent as the ``q`` query parameter.

        Returns:
            ``False`` when the landing page could not be fetched even after a
            retry on a fresh session; otherwise ``None``.
        """
        url = 'https://www.walmart.com/cp/health/976760'
        url_pattern = re.compile(
            r'https://www\.walmart\.com/browse/[^/]+/[^/]+/\d+_\d+\?povid=.*')

        for attempt in range(2):
            try:
                if attempt:
                    # Retry once on a brand-new proxy session, closing the
                    # old client so its connections are not leaked.
                    self.session.close()
                    self.session = self.create_session()
                # ``params`` lets httpx escape the category (spaces, '&', ...)
                # instead of splicing it raw into the query string.
                response = self.session.get(
                    url, params={'q': category},
                    headers=self.headers, cookies=self.cookies)
            except Exception as e:
                print(e)
            else:
                break
        else:
            return False

        soup = BeautifulSoup(response.text, 'html.parser')
        seen = set()  # the same browse link can appear several times on a page
        for link in soup.find_all("a"):
            href = link.get("href")
            if not href or href in seen or not url_pattern.match(href):
                continue
            seen.add(href)
            print(href)
            try:
                total_pages = self.get_no_of_pages_of_sub_category(href)
                self.get_product_urls(href, int(total_pages))
            except Exception as e:
                # Keep going with the remaining sub-categories.
                print(e)
        

    def get_product_urls(self, url, pages):
        """Walk the listing pages of a sub-category and store each product.

        For pages ``1..pages`` the listing is fetched, every
        ``div.b--near-white`` tile's first link is resolved to a product URL,
        the product is scraped with ``get_product_detail`` and, when that
        succeeds, written with ``save_product``. Two seconds are slept after
        each product request.

        Args:
            url: Sub-category browse URL.
            pages: Number of listing pages to visit.

        Returns:
            The list of product URLs visited, or ``False`` as soon as a
            listing page cannot be fetched even after a retry on a fresh
            session.
        """
        urls = []
        # Browse URLs normally already carry a query string; fall back to '?'
        # when they do not so the page parameter is still well-formed.
        separator = '&' if '?' in url else '?'
        for page in range(1, pages + 1):
            print(page)

            request_url = f'{url}{separator}page={page}'
            print(request_url)
            for attempt in range(2):
                try:
                    if attempt:
                        # Retry once on a new proxy session, closing the old
                        # client so its connections are not leaked.
                        self.session.close()
                        self.session = self.create_session()
                    response = self.session.get(
                        request_url, headers=self.headers, cookies=self.cookies)
                    soup = BeautifulSoup(response.text, 'html.parser')
                    items = soup.find_all('div', {'class': 'b--near-white'})
                except Exception as e:
                    print(e)
                else:
                    break
            else:
                return False

            for item in items:
                a_tag = item.find('a')
                # ``.get`` avoids a KeyError on anchors without an href.
                href = a_tag.get('href') if a_tag else None
                if not href:
                    continue
                if href.startswith('/ip'):
                    href = 'https://www.walmart.com' + href
                # Record the URL so the returned list is actually populated.
                urls.append(href)
                product = self.get_product_detail(href)
                time.sleep(2)
                print(product)
                if product:
                    self.save_product({
                        'title': product[0],
                        'product_id': product[1],
                        'ingredients': product[2],
                        'url': href,
                        'store_name': 'Wallmart'
                    })
        print(urls)
        return urls

    def save_product_to_csv(self, product):
        """Append ``product`` to ``wallmart.csv`` unless its id is already there.

        Args:
            product: Sequence whose second element is the product id, e.g.
                ``[title, product_id, ingredients]``. A falsy value (such as
                the ``False`` returned by a failed scrape) is ignored.

        Returns:
            ``False`` when nothing was written (unusable ``product`` or the id
            is already in the file); ``None`` after a row was appended.
        """
        if not product or len(product) < 2:
            return False

        # csv hands back strings, so compare ids as strings.
        product_id = str(product[1])
        with open('wallmart.csv', 'a+', encoding='utf-8', newline='') as file:
            file.seek(0)  # 'a+' opens positioned at the end; rewind to read
            # Blank or one-column rows have no id column and are skipped.
            known_ids = {row[1] for row in csv.reader(file) if len(row) > 1}
            if product_id in known_ids:
                return False

            # Append mode writes at the end of the file regardless of the
            # read position; the ``with`` block closes the file.
            csv.writer(file).writerow(product)
    
    def save_product(self, product):
        """Insert one product row into the ``scraper_product`` table.

        Args:
            product: Mapping with the keys ``title``, ``product_id``,
                ``ingredients``, ``url`` and ``store_name``.

        Returns:
            ``True`` when the row was inserted and committed, ``False`` when
            connecting or inserting failed (the error is printed).
        """
        conn = None
        try:
            # SECURITY(review): database credentials are committed in source;
            # rotate them and load them from configuration instead.
            conn = psycopg2.connect(
                host="ep-rapid-cake-30394055.us-east-2.aws.neon.tech",
                database="ingredients-scraper",
                user="mumer113141",
                password="SFBtp4xnPeA2"
            )
            # The cursor context manager closes the cursor even on errors.
            with conn.cursor() as cur:
                cur.execute(
                    "INSERT INTO scraper_product (title, product_id, ingredients, url, store_name, date_created) VALUES (%s, %s, %s, %s, %s, %s)",
                    (
                        product['title'],
                        product['product_id'],
                        product['ingredients'],
                        product['url'],
                        product['store_name'],
                        datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                    ),
                )
            conn.commit()
            return True
        except Exception as e:
            print(e)
            return False
        finally:
            # Always release the connection, including when execute/commit
            # failed; previously it was leaked on that path.
            if conn is not None:
                conn.close()
    def run(self):
        print('Wallmart scraper started')
        for category in self.categories:
            self.get_category_browse_urls(category)

        return True