Spaces:

mumer119131
/

shops-scraper

Runtime error

App Files Files Community

mumer119131 committed on Oct 22, 2023

Commit

369e8bf

1 Parent(s): 643dbca

Update scraper/utils/WallmartScraper.py

Browse files

Files changed (1) hide show

scraper/utils/WallmartScraper.py +172 -42

scraper/utils/WallmartScraper.py CHANGED Viewed

@@ -4,35 +4,120 @@ import re
 import csv
 import json
 import time
-from .DatabaseDataSaver import save_product
 from undetected_chromedriver import Chrome, ChromeOptions
 class WallmartScraper:
     def __init__(self):
         # Set up the Accept string, the request headers, a requests session
         # and the list of categories, then seed the session with cookies
         # obtained from a real browser via generate_session().
         self.ac = "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"
-        self.headers = {"Referer":"https://www.google.com","Connection":"Keep-Alive","Accept-Language":"en-US,en;q=0.9","Accept-Encoding":"gzip, deflate, br","Accept":self.ac,"User-Agent":"Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0"}
-        self.session = requests.Session()
         # Search terms substituted into the category URL's `q` parameter.
         self.categories = [
-            'health', 'beauty', 'personal care'
         ]
-        self.generate_session()
-    def generate_session(self):
         # Start a headless undetected-chromedriver Chrome, open
         # https://www.walmart.com/ and copy the browser's cookies into
         # self.session so that later self.session.get() calls send them.
         # NOTE(review): the driver is never quit in this method, so the
         # browser process is left running after the cookies are copied.
-        options = ChromeOptions()
-        options.add_argument("--headless")
-        options.add_argument("--disable-gpu")
-        options.add_argument("--no-sandbox")
-        driver = Chrome(options=options)
-        driver.get("https://www.walmart.com/")
         # get_cookies() is called twice: once to print, once to keep.
-        print(driver.get_cookies())
-        cookies = driver.get_cookies()
         # List comprehension used only for its side effect of setting each
         # cookie on the session; the resulting list is discarded.
-        [self.session.cookies.set(cookie['name'], cookie['value']) for cookie in cookies]
     def get_product_detail(self, url):
-        response = self.session.get(
-            url, headers=self.headers)
-        soup = BeautifulSoup(response.text, 'html.parser')
-        script_tag = soup.find('script', {'id': '__NEXT_DATA__'}).text
         # convert scraipt tag json to dict
         script_tag_json = json.loads(script_tag)
         try:
@@ -40,10 +125,14 @@ class WallmartScraper:
                 'data']['idml']['ingredients']['ingredients']['value']
         except:
             ingridents = ''
-        # x.props.pageProps.initialData.data.product.name
-        product_name = script_tag_json['props']['pageProps']['initialData']['data']['product']['name']
-        # x.props.pageProps.initialData.data.product.usItemId
-        product_id = script_tag_json['props']['pageProps']['initialData']['data']['product']['usItemId']
         return [product_name, product_id, ingridents]
     def get_no_of_pages_of_sub_category(self, url):
@@ -63,12 +152,22 @@ class WallmartScraper:
     def get_category_browse_urls(self, category):
         url = f'https://www.walmart.com/cp/health/976760?q={category}'
-        response = self.session.get(
-            url, headers=self.headers)
-        url_pattern = re.compile(
-            r'https://www.walmart.com/browse/[^/]+/[^/]+/\d+_\d+\?povid=.*')
         url_matches = url_pattern.findall(response.text)
         soup = BeautifulSoup(response.text, 'html.parser')
         # Find all anchor (a) tags in the HTML
         all_links = soup.find_all("a")
@@ -77,6 +176,7 @@ class WallmartScraper:
         for link in all_links:
             href = link.get("href")
             if href and url_pattern.match(href):
                 print(href)
                 try:
                     total_pages = self.get_no_of_pages_of_sub_category(href)
@@ -93,11 +193,22 @@ class WallmartScraper:
             request_url = f'{url}&page={i}'
             print(request_url)
-            response = self.session.get(
-                request_url, headers=self.headers)
-            soup = BeautifulSoup(response.text, 'html.parser')
-            items = soup.find_all('div', {'class': 'b--near-white'})
             for item in items:
                 a_tag = item.find('a')
                 if a_tag:
@@ -107,13 +218,14 @@ class WallmartScraper:
                     product = self.get_product_detail(a_tag['href'])
                     time.sleep(2)
                     print(product)
-                    save_product({
-                        'title': product[0],
-                        'product_id': product[1],
-                        'ingredients': product[2],
-                        'url': a_tag['href'],
-                        'store_name': 'Wallmart'
-                    })
                     # self.save_product_to_csv(product)
         print(urls)
         return urls
@@ -130,8 +242,26 @@ class WallmartScraper:
             writer = csv.writer(file)
             writer.writerow(product)
             file.close()
     def run(self):
         for category in self.categories:
             self.get_category_browse_urls(category)

 import csv
 import json
 import time
 from undetected_chromedriver import Chrome, ChromeOptions
+import psycopg2
+import datetime
+from httpx_socks import SyncProxyTransport
+import httpx
 class WallmartScraper:
     def __init__(self):
         # Prepare everything needed for plain HTTP scraping: an Accept string,
         # request headers, a hard-coded cookie dict, a proxied HTTP client and
         # the list of categories to walk.
         # Browser-style Accept value; the headers dict below sends '*/*' and
         # does not reference it.
         self.ac = "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"
         # NOTE(review): 'Host' is www.walmart.ca, but the request URLs built
         # elsewhere in this class point at www.walmart.com - confirm which
         # site is intended, since these headers are sent with those requests.
+        self.headers = {
+            'Host': 'www.walmart.ca',
+            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0',
+            'Accept': '*/*',
+            'Accept-Language': 'en-US,en;q=0.5',
+            'Accept-Encoding': 'gzip, deflate, br',
+            'Content-Type': 'application/json',
+            'Connection': 'keep-alive'
+        }
         # Cookies passed with every request made by this class.
         # NOTE(review): these are literal values (including an auth token)
         # committed to source. Several embed epoch-like timestamps starting
         # with 1590..., so they were presumably copied from an old browser
         # session - verify they are still accepted and move them out of the
         # repository.
+        self.cookies = {
+            'walmart.shippingPostalCode':'P7B3Z7',
+            'defaultNearestStoreId':'3124',
+            'zone':"9",
+            'deliveryCatchment':"3124",
+            'walmart.csrf':'73996cac34766ec995777784',
+            'wmt.c':"0",
+            'vtc':'ZAUFmHNTbFPrWyLrN8WTXA',
+            'userSegment':'50-percent',
+            'TBV':"7",
+            'rxVisitor':'1590552903550G5KJVCBIUCN3R32E3OSSVIKN9FTMDI5M',
+            'dtSa':'-',
+            '_ga':'GA1.2.1363574403.1590552905',
+            '_gid':'GA1.2.85728116.1590552905',
+            'walmart.id':'24be2423-225b-44d0-851c-9f83c8e47dff',
+            'usrState':"1",
+            'walmart.nearestPostalCode':'P7B3Z7',
+            's_ecid':'MCMID%7C17236695788713957075642593017320325404',
+            'walmart.locale':'en',
+            'AMCVS_C4C6370453309C960A490D44%40AdobeOrg':"1",
+            's_visit':"1",
+            's_cc':'true',
+            'og_session_id':'af0a84f8847311e3b233bc764e1107f2.616221.1590552906',
+            'og_session_id_conf':'af0a84f8847311e3b233bc764e1107f2.616221.1590552906',
+            '_gcl_au':'1.1.482108716.1590552907',
+            '_fbp':'fb.1.1590552907225.702607671',
+            'og_autoship':"0",
+            'dtCookie':'3$1GS1LRIIKIBM595EBN2HIHIPCU4QVQ3H|5b6d58542e634882|0',
+            'walmart.nearestLatLng':"48.4120872,-89.2413988",
+            'dtLatC':"3",
             # NOTE(review): 'rxVisitor' and 'dtSa' below repeat keys already
             # set earlier in this dict with the same values; the duplicates
             # are redundant.
+            'rxVisitor':'1590552903550G5KJVCBIUCN3R32E3OSSVIKN9FTMDI5M',
+            'dtSa':'-',
+            'DYN_USER_ID':'23c3e447-cab5-4a76-beec-86d431f09b30',
+            'WM_SEC.AUTH_TOKEN':'MTAyOTYyMDE46M9ya4OWOAX9Ycj9G+/EtZZ2rrXYDwJUPMuf8aNPxGq6es3kBtQx/WxiXKAkaKfkoKbMqixeQFrYdB1W0oSN1wIIzkNIxIEmVq7cOUtRuTRSgSwdxAsAWBT8plmFWLKwj8OFN4dileb20bpDLeCIlSFd/Hsc7bnSe4+TLU2zbj06SQbscc1R1tIesXl4ioL4y1NvN1BBj6GkfAZCjCfhDTASAGkrw9upmzYhCz4UwRzb/SoGFgAYL9DGZ8K45WCXb/Ew67/GsLtdlJHpe1JgEG+jVJ7bQ3VTYSMGmHEYCS8c8IAFKTMeYOPXxSWUpSrKtEbQ9hG+J0B2+kHzA8jyKD+vhACQYbIqsOCISVNY3spUIeGCIOmGJLznpUXbYF3gVk3LktwueMY7RuHPZ68PyA==',
+            'LT':'1590553091850',
+            'BVImplmain_site':"2036",
+            'BVBRANDID':'20ae010b-0053-4a9f-902a-9197d72dc542',
+            'DYN_USER_ID.ro':'23c3e447-cab5-4a76-beec-86d431f09b30',
+            'cartId':'b6eb398f-ed49-46e8-8034-af8da418dd90',
+            'NEXT_GEN.ENABLED':"1",
+            '_pin_unauth':'NTY4YjUyZDctYzNmOC00NzA5LWExOTYtOWQxOWZlOWVkYjFi',
+            'TS011fb5f6':'01c5a4e2f941ffc623122b68eca74f3a27e0c416f7e2a5707b9417a73c048cb4be6507e9fd51df79c8015b3ba420dc6643bb0f8309',
+            'TS0175e29f':'01c5a4e2f941ffc623122b68eca74f3a27e0c416f7e2a5707b9417a73c048cb4be6507e9fd51df79c8015b3ba420dc6643bb0f8309',
+            'authDuration':'{"lat":"1590555466230000","lt":"1590555466230000"}',
+            'headerType':'grocery',
+            's_sq':'%5B%5BB%5D%5D',
+            'previousBreakpoint':'desktop',
+            'wmt.breakpoint':'d',
+            'akaau_P1':'1590607795~id=484ae7f711ac9dd38dbda655bd6ca764',
+            'TS01f4281b':'01c5a4e2f97a7d51551a734ebe2cb1fc4f7a86c4df28824fb5812f83c96f6df870698b389077cf6f5fd822d05324df82b802c7ad04',
+            '_uetsid':'2127b16a-c523-20a6-d801-43923775d65e',
+            '_derived_epik':'dj0yJnU9NC1yUFlPMF9IczhrTlFabmZpYWVTQ0NMZFl5blN2eEMmbj1wX2o0OFVpeUZLWjRUcGM3Rl9xaGFnJm09MSZ0PUFBQUFBRjdPdXpJJnJtPTEmcnQ9QUFBQUFGN091ekk',
+            'dtPC':'3$6637950_447h-vAKCBSUJVQJIVFIAUKQCIVTJULXFWHTFQ-0',
+            'rxvt':'1590610238206|1590608438206',
+            's_gnr':'1590609571427-Repeat',
+            'AMCV_C4C6370453309C960A490D44%40AdobeOrg':'-408604571%7CMCIDTS%7C18410%7CMCMID%7C17236695788713957075642593017320325404%7CMCAID%7CNONE%7CMCOPTOUT-1590616771s%7CNONE%7CvVersion%7C4.6.0',
+            '_4c_':'rVJNbxoxEP0rkQ85sbv%2BXHuRooqkUdWqSZQmVY%2FIeL1gZWGRbdimEf89YyCQNqnUQzmYnfF7M5437wn1M7tAQyIqXOJKKEIIH6AH%2BxjQ8AmZZTrX6Vj5Fg3RLMZlGBZF3%2Fd5r9u59jE3urCLYuo7Y%2F1j0fiViyFb26mNetLasM8U1xlTgqIBMl1toRSpcpULiOMviDKGMXwvfVevTBzHx2XC9HZyEuoHuKjt2hk77l0dZ1syxcfszLrpLKY0Vtv00kOAc5nK925Rd%2F2BSUWJj9kjk%2FH0tonv%2BmAT%2B2Lmu7k9UQSyHYiBfmwZAUJvG%2Bv9FvU%2F9Agubmc90Pc5WAKkIbi5uv82Pr8cXdxcv2rZzRcurrzNQmhf954UIRT93Bm90LVOghak%2BHKX0ZziHGdfR3eqCIxgQZVUmNCSVx9Gt%2Bdn5HTu6jMiKSvLSkilJGHwj6UoORUVw0QyihkVHPPT0e3lWVJmCd5ASeW2M7pNY4CbBujTaPz988etrCUTknPM8mQxISgcLyNdXeww%2F9QSSPfeTafWX9k462og3ntdu%2Bi6hW7T0sHGYIhGr9qYwrRV0%2BoQnKlteIjdEm0G6Ofe61AcWjEJ9otgbFVynH6A8K7emx5Z1kwaymRGqagzXpY001KxTHJdlsLYpqyTCLuailFVYYkrstnpsq0hji1ZxSXH6p2WO9f9nVPxtxyYdA9nb%2BDiHfjiZaijRId3w7OBVQLMvaD0H%2FeCKZXE6feAw4USQv0OTRmArg%2B1aNVII7XMYPgq48aYTBtNM6Mbogzs2AqBjkNgxWGOqtwPQdRuhs3mGQ%3D%3D'
+        }
         # HTTP client used for every request; built by create_session().
+        self.session = self.create_session()
         # Search terms substituted into the category URL's `q` parameter.
         self.categories = [
+            'personal care', 'beauty', 'health'
         ]
+        # options = ChromeOptions()
+        # options.add_argument("--headless")
+        # options.add_argument("--disable-gpu")
+        # options.add_argument("--no-sandbox")
+        # self.driver = Chrome(options=options)
+        # self.generate_session()
+    # def generate_session(self):
+    #     self.driver.get("https://www.walmart.com/")
+    #     print(self.driver.get_cookies())
+    #     cookies = self.driver.get_cookies()
+    #     [self.session.cookies.set(cookie['name'], cookie['value']) for cookie in cookies]
+    def create_session(self):
         # Build and return a new httpx.Client whose traffic goes through the
         # proxy transport created from the URL below. Other methods of this
         # class call it again to replace self.session after a failed request.
         # NOTE(review): the proxy username and password are embedded in the
         # URL and committed to source - load them from configuration or the
         # environment instead, and rotate the exposed credentials.
         # NOTE(review): callers that replace self.session do not close the
         # previous client first - confirm whether that leaks connections.
+        transport = SyncProxyTransport.from_url("http://W4a8IruR4dkhNGb6:Hesj0mkBfnJ1n95M_country-us@geo.iproyal.com:12321")
+        session = httpx.Client(transport=transport)
+        return session
     def get_product_detail(self, url):
+        try:
+            response = self.session.get(
+                url, headers=self.headers, cookies=self.cookies)
+            soup = BeautifulSoup(response.text, 'html.parser')
+            script_tag = soup.find('script', {'id': '__NEXT_DATA__'}).text
+        except:
+            try:
+                self.session = self.create_session()
+                response = self.session.get(
+                    url, headers=self.headers, cookies=self.cookies)
+                soup = BeautifulSoup(response.text, 'html.parser')
+                script_tag = soup.find('script', {'id': '__NEXT_DATA__'}).text
+            except:
+                return False
         # convert scraipt tag json to dict
         script_tag_json = json.loads(script_tag)
         try:
                 'data']['idml']['ingredients']['ingredients']['value']
         except:
             ingridents = ''
+        try:
+            # x.props.pageProps.initialData.data.product.name
+            product_name = script_tag_json['props']['pageProps']['initialData']['data']['product']['name']
+            # x.props.pageProps.initialData.data.product.usItemId
+            product_id = script_tag_json['props']['pageProps']['initialData']['data']['product']['usItemId']
+        except:
+            return False
         return [product_name, product_id, ingridents]
     def get_no_of_pages_of_sub_category(self, url):
     def get_category_browse_urls(self, category):
         url = f'https://www.walmart.com/cp/health/976760?q={category}'
+        try:
+            response = self.session.get(url, headers=self.headers, cookies=self.cookies)
+            print(response.text)
+            url_pattern = re.compile(
+                r'https://www.walmart.com/browse/[^/]+/[^/]+/\d+_\d+\?povid=.*')
+        except:
+            try:
+                self.session = self.create_session()
+                response = self.session.get(url, headers=self.headers, cookies=self.cookies)
+                url_pattern = re.compile(
+                    r'https://www.walmart.com/browse/[^/]+/[^/]+/\d+_\d+\?povid=.*')
+            except:
+                return False
         url_matches = url_pattern.findall(response.text)
         soup = BeautifulSoup(response.text, 'html.parser')
         # Find all anchor (a) tags in the HTML
         all_links = soup.find_all("a")
         for link in all_links:
             href = link.get("href")
             if href and url_pattern.match(href):
+                print(href)
                 print(href)
                 try:
                     total_pages = self.get_no_of_pages_of_sub_category(href)
             request_url = f'{url}&page={i}'
             print(request_url)
+            try:
+                response = self.session.get(
+                    request_url, headers=self.headers, cookies=self.cookies)
+                soup = BeautifulSoup(response.text, 'html.parser')
+                items = soup.find_all('div', {'class': 'b--near-white'})
+            except:
+                try:
+                    self.session = self.create_session()
+                    response = self.session.get(
+                        request_url, headers=self.headers, cookies=self.cookies)
+                    soup = BeautifulSoup(response.text, 'html.parser')
+                    items = soup.find_all('div', {'class': 'b--near-white'})
+                except:
+                    return False
             for item in items:
                 a_tag = item.find('a')
                 if a_tag:
                     product = self.get_product_detail(a_tag['href'])
                     time.sleep(2)
                     print(product)
+                    if product:
+                        self.save_product({
+                            'title': product[0],
+                            'product_id': product[1],
+                            'ingredients': product[2],
+                            'url': a_tag['href'],
+                            'store_name': 'Wallmart'
+                        })
                     # self.save_product_to_csv(product)
         print(urls)
         return urls
             writer = csv.writer(file)
             writer.writerow(product)
             file.close()
+    def save_product(self, product):
         # Insert one scraped product as a row of the scraper_product table.
         # `product` is a dict read with the keys 'title', 'product_id',
         # 'ingredients', 'url' and 'store_name'.
         # Returns True after a successful commit; on any Exception the error
         # is printed and False is returned.
         # NOTE(review): the database host, user and password are hard-coded
         # and committed to source - move them to configuration or the
         # environment and rotate the exposed password.
         # NOTE(review): a new connection is opened for every product, and if
         # execute() or commit() raises, the cursor and connection are not
         # closed because there is no finally block or context manager.
+        try:
+            conn = psycopg2.connect(
+                host="ep-rapid-cake-30394055.us-east-2.aws.neon.tech",
+                database="ingredients-scraper",
+                user="mumer113141",
+                password="SFBtp4xnPeA2"
+            )
+            cur = conn.cursor()
             # Parameterized INSERT; date_created is the current time from
             # datetime.datetime.now() (naive, no timezone) formatted as
             # 'YYYY-MM-DD HH:MM:SS'.
+            cur.execute("INSERT INTO scraper_product (title, product_id, ingredients, url, store_name, date_created) VALUES (%s, %s, %s, %s, %s, %s)", (product['title'], product['product_id'], product['ingredients'], product['url'], product['store_name'], datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
+            conn.commit()
+            cur.close()
+            conn.close()
+            return True
+        except Exception as e:
+            print(e)
+            return False
     def run(self):
+        print('Wallmart scraper started')
         for category in self.categories:
             self.get_category_browse_urls(category)