import requests
from bs4 import BeautifulSoup
import re
import csv
import json
import time
from undetected_chromedriver import Chrome, ChromeOptions
import psycopg2
import datetime
from httpx_socks import SyncProxyTransport
import httpx


class WallmartScraper:
    """Scrape product name, id and ingredient list from walmart.com category
    pages and persist each product to a PostgreSQL ``scraper_product`` table.

    Traffic is routed through a residential proxy (see ``create_session``)
    and replays cookies captured from a real browser session to get past
    bot detection.

    NOTE(review): proxy and database credentials are hard-coded in this
    class — they should be moved to environment variables or a secrets
    store before this code is shared or deployed.
    """

    def __init__(self):
        # Full browser Accept header kept for reference; the requests below
        # actually send the simpler 'Accept: */*' from self.headers.
        self.ac = "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"
        # NOTE(review): Host says www.walmart.ca but the scraper targets
        # www.walmart.com URLs — confirm this header is intentional.
        self.headers = {
            'Host': 'www.walmart.ca',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0',
            'Accept': '*/*',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate, br',
            'Content-Type': 'application/json',
            'Connection': 'keep-alive',
        }
        # Cookies captured from a real browser session (likely stale by now).
        # The original literal repeated 'rxVisitor' and 'dtSa' with identical
        # values; the duplicates were removed (dicts keep only the last one
        # anyway, so the resulting mapping is unchanged).
        self.cookies = {
            'walmart.shippingPostalCode': 'P7B3Z7',
            'defaultNearestStoreId': '3124',
            'zone': "9",
            'deliveryCatchment': "3124",
            'walmart.csrf': '73996cac34766ec995777784',
            'wmt.c': "0",
            'vtc': 'ZAUFmHNTbFPrWyLrN8WTXA',
            'userSegment': '50-percent',
            'TBV': "7",
            'rxVisitor': '1590552903550G5KJVCBIUCN3R32E3OSSVIKN9FTMDI5M',
            'dtSa': '-',
            '_ga': 'GA1.2.1363574403.1590552905',
            '_gid': 'GA1.2.85728116.1590552905',
            'walmart.id': '24be2423-225b-44d0-851c-9f83c8e47dff',
            'usrState': "1",
            'walmart.nearestPostalCode': 'P7B3Z7',
            's_ecid': 'MCMID%7C17236695788713957075642593017320325404',
            'walmart.locale': 'en',
            'AMCVS_C4C6370453309C960A490D44%40AdobeOrg': "1",
            's_visit': "1",
            's_cc': 'true',
            'og_session_id': 'af0a84f8847311e3b233bc764e1107f2.616221.1590552906',
            'og_session_id_conf': 'af0a84f8847311e3b233bc764e1107f2.616221.1590552906',
            '_gcl_au': '1.1.482108716.1590552907',
            '_fbp': 'fb.1.1590552907225.702607671',
            'og_autoship': "0",
            'dtCookie': '3$1GS1LRIIKIBM595EBN2HIHIPCU4QVQ3H|5b6d58542e634882|0',
            'walmart.nearestLatLng': "48.4120872,-89.2413988",
            'dtLatC': "3",
            'DYN_USER_ID': '23c3e447-cab5-4a76-beec-86d431f09b30',
            'WM_SEC.AUTH_TOKEN': 'MTAyOTYyMDE46M9ya4OWOAX9Ycj9G+/EtZZ2rrXYDwJUPMuf8aNPxGq6es3kBtQx/WxiXKAkaKfkoKbMqixeQFrYdB1W0oSN1wIIzkNIxIEmVq7cOUtRuTRSgSwdxAsAWBT8plmFWLKwj8OFN4dileb20bpDLeCIlSFd/Hsc7bnSe4+TLU2zbj06SQbscc1R1tIesXl4ioL4y1NvN1BBj6GkfAZCjCfhDTASAGkrw9upmzYhCz4UwRzb/SoGFgAYL9DGZ8K45WCXb/Ew67/GsLtdlJHpe1JgEG+jVJ7bQ3VTYSMGmHEYCS8c8IAFKTMeYOPXxSWUpSrKtEbQ9hG+J0B2+kHzA8jyKD+vhACQYbIqsOCISVNY3spUIeGCIOmGJLznpUXbYF3gVk3LktwueMY7RuHPZ68PyA==',
            'LT': '1590553091850',
            'BVImplmain_site': "2036",
            'BVBRANDID': '20ae010b-0053-4a9f-902a-9197d72dc542',
            'DYN_USER_ID.ro': '23c3e447-cab5-4a76-beec-86d431f09b30',
            'cartId': 'b6eb398f-ed49-46e8-8034-af8da418dd90',
            'NEXT_GEN.ENABLED': "1",
            '_pin_unauth': 'NTY4YjUyZDctYzNmOC00NzA5LWExOTYtOWQxOWZlOWVkYjFi',
            'TS011fb5f6': '01c5a4e2f941ffc623122b68eca74f3a27e0c416f7e2a5707b9417a73c048cb4be6507e9fd51df79c8015b3ba420dc6643bb0f8309',
            'TS0175e29f': '01c5a4e2f941ffc623122b68eca74f3a27e0c416f7e2a5707b9417a73c048cb4be6507e9fd51df79c8015b3ba420dc6643bb0f8309',
            'authDuration': '{"lat":"1590555466230000","lt":"1590555466230000"}',
            'headerType': 'grocery',
            's_sq': '%5B%5BB%5D%5D',
            'previousBreakpoint': 'desktop',
            'wmt.breakpoint': 'd',
            'akaau_P1': '1590607795~id=484ae7f711ac9dd38dbda655bd6ca764',
            'TS01f4281b': '01c5a4e2f97a7d51551a734ebe2cb1fc4f7a86c4df28824fb5812f83c96f6df870698b389077cf6f5fd822d05324df82b802c7ad04',
            '_uetsid': '2127b16a-c523-20a6-d801-43923775d65e',
            '_derived_epik': 'dj0yJnU9NC1yUFlPMF9IczhrTlFabmZpYWVTQ0NMZFl5blN2eEMmbj1wX2o0OFVpeUZLWjRUcGM3Rl9xaGFnJm09MSZ0PUFBQUFBRjdPdXpJJnJtPTEmcnQ9QUFBQUFGN091ekk',
            'dtPC': '3$6637950_447h-vAKCBSUJVQJIVFIAUKQCIVTJULXFWHTFQ-0',
            'rxvt': '1590610238206|1590608438206',
            's_gnr': '1590609571427-Repeat',
            'AMCV_C4C6370453309C960A490D44%40AdobeOrg': '-408604571%7CMCIDTS%7C18410%7CMCMID%7C17236695788713957075642593017320325404%7CMCAID%7CNONE%7CMCOPTOUT-1590616771s%7CNONE%7CvVersion%7C4.6.0',
            '_4c_': 'rVJNbxoxEP0rkQ85sbv%2BXHuRooqkUdWqSZQmVY%2FIeL1gZWGRbdimEf89YyCQNqnUQzmYnfF7M5437wn1M7tAQyIqXOJKKEIIH6AH%2BxjQ8AmZZTrX6Vj5Fg3RLMZlGBZF3%2Fd5r9u59jE3urCLYuo7Y%2F1j0fiViyFb26mNetLasM8U1xlTgqIBMl1toRSpcpULiOMviDKGMXwvfVevTBzHx2XC9HZyEuoHuKjt2hk77l0dZ1syxcfszLrpLKY0Vtv00kOAc5nK925Rd%2F2BSUWJj9kjk%2FH0tonv%2BmAT%2B2Lmu7k9UQSyHYiBfmwZAUJvG%2Bv9FvU%2F9Agubmc90Pc5WAKkIbi5uv82Pr8cXdxcv2rZzRcurrzNQmhf954UIRT93Bm90LVOghak%2BHKX0ZziHGdfR3eqCIxgQZVUmNCSVx9Gt%2Bdn5HTu6jMiKSvLSkilJGHwj6UoORUVw0QyihkVHPPT0e3lWVJmCd5ASeW2M7pNY4CbBujTaPz988etrCUTknPM8mQxISgcLyNdXeww%2F9QSSPfeTafWX9k462og3ntdu%2Bi6hW7T0sHGYIhGr9qYwrRV0%2BoQnKlteIjdEm0G6Ofe61AcWjEJ9otgbFVynH6A8K7emx5Z1kwaymRGqagzXpY001KxTHJdlsLYpqyTCLuailFVYYkrstnpsq0hji1ZxSXH6p2WO9f9nVPxtxyYdA9nb%2BDiHfjiZaijRId3w7OBVQLMvaD0H%2FeCKZXE6feAw4USQv0OTRmArg%2B1aNVII7XMYPgq48aYTBtNM6Mbogzs2AqBjkNgxWGOqtwPQdRuhs3mGQ%3D%3D',
        }
        self.session = self.create_session()
        self.categories = [
            'personal care',
            'beauty',
            'health',
        ]

    def create_session(self):
        """Return a new httpx client routed through the residential proxy.

        NOTE(review): proxy credentials are hard-coded here — move them to
        configuration.
        """
        transport = SyncProxyTransport.from_url(
            "http://W4a8IruR4dkhNGb6:Hesj0mkBfnJ1n95M_country-us@geo.iproyal.com:12321")
        return httpx.Client(transport=transport)

    def _fetch_next_data(self, url):
        """GET *url* and return the text of its ``__NEXT_DATA__`` script tag.

        Retries once with a freshly created proxy session (the proxy rotates
        exit IPs per session, so a retry often gets past a block).  Returns
        None when both attempts fail or the tag is absent.
        """
        for attempt in range(2):
            try:
                response = self.session.get(
                    url, headers=self.headers, cookies=self.cookies)
                soup = BeautifulSoup(response.text, 'html.parser')
                tag = soup.find('script', {'id': '__NEXT_DATA__'})
                if tag is None:
                    # Treated as a failure so the retry path below runs,
                    # matching the original AttributeError-on-None behavior.
                    raise ValueError('__NEXT_DATA__ script tag not found')
                return tag.text
            except Exception:
                if attempt == 0:
                    self.session = self.create_session()
        return None

    def get_product_detail(self, url):
        """Fetch a product page and return ``[name, usItemId, ingredients]``.

        ``ingredients`` falls back to ``''`` when the product has none.
        Returns False when the page cannot be fetched/parsed or the product
        payload is missing the expected fields.
        """
        script_tag = self._fetch_next_data(url)
        if script_tag is None:
            return False
        script_tag_json = json.loads(script_tag)
        try:
            ingredients = script_tag_json['props']['pageProps']['initialData'][
                'data']['idml']['ingredients']['ingredients']['value']
        except (KeyError, TypeError):
            ingredients = ''
        try:
            product = script_tag_json['props']['pageProps']['initialData']['data']['product']
            return [product['name'], product['usItemId'], ingredients]
        except (KeyError, TypeError):
            return False

    def get_no_of_pages_of_sub_category(self, url):
        """Return the number of listing pages to walk for a browse URL.

        Pagination discovery (scraping the ``"maxPage"`` field out of the
        page source) proved unreliable, so a fixed upper bound of 25 pages
        is used for every sub-category instead.
        """
        return 25

    def get_category_browse_urls(self, category):
        """Find browse (sub-category) links on a category landing page and
        scrape every product listed under each of them.

        Results are persisted as a side effect via ``get_product_urls``.
        Returns False when the category page cannot be fetched even after a
        session retry.
        """
        url = f'https://www.walmart.com/cp/health/976760?q={category}'
        url_pattern = re.compile(
            r'https://www.walmart.com/browse/[^/]+/[^/]+/\d+_\d+\?povid=.*')
        response = None
        for attempt in range(2):
            try:
                response = self.session.get(
                    url, headers=self.headers, cookies=self.cookies)
                break
            except Exception:
                if attempt == 0:
                    self.session = self.create_session()
                else:
                    return False
        soup = BeautifulSoup(response.text, 'html.parser')
        # Walk every anchor and scrape the ones whose href matches a browse URL.
        for link in soup.find_all("a"):
            href = link.get("href")
            if not (href and url_pattern.match(href)):
                continue
            print(href)
            try:
                total_pages = self.get_no_of_pages_of_sub_category(href)
                self.get_product_urls(href, int(total_pages))
            except Exception as e:
                print(e)

    def get_product_urls(self, url, pages):
        """Walk *pages* listing pages under browse URL *url*, scraping and
        saving every product found.

        Returns the list of product-page URLs that were successfully scraped
        (previously this list was never populated — bug fix), or False when
        a listing page cannot be fetched even after a session retry.
        """
        urls = []
        for page in range(1, pages + 1):
            request_url = f'{url}&page={page}'
            print(request_url)
            items = None
            for attempt in range(2):
                try:
                    response = self.session.get(
                        request_url, headers=self.headers, cookies=self.cookies)
                    soup = BeautifulSoup(response.text, 'html.parser')
                    items = soup.find_all('div', {'class': 'b--near-white'})
                    break
                except Exception:
                    if attempt == 0:
                        self.session = self.create_session()
            if items is None:
                return False
            for item in items:
                a_tag = item.find('a')
                if not a_tag:
                    continue
                href = a_tag['href']
                # Relative product links need the domain prepended.
                if href.startswith('/ip'):
                    href = 'https://www.walmart.com' + href
                product = self.get_product_detail(href)
                time.sleep(2)  # throttle to avoid tripping rate limits
                print(product)
                if product:
                    urls.append(href)
                    self.save_product({
                        'title': product[0],
                        'product_id': product[1],
                        'ingredients': product[2],
                        'url': href,
                        'store_name': 'Wallmart',
                    })
        print(urls)
        return urls

    def save_product_to_csv(self, product):
        """Append *product* to ``wallmart.csv`` unless its product id
        (column 1) is already present.

        Currently unused; ``save_product`` writes to PostgreSQL instead.
        Returns False when the product is a duplicate.
        """
        with open('wallmart.csv', 'a+', encoding='utf-8', newline='') as file:
            # Check whether the product is already in the csv: rewind the
            # append-mode handle so the whole file can be read.
            file.seek(0)
            reader = csv.reader(file)
            # Guard against short/blank rows, which would raise IndexError.
            product_ids = [row[1] for row in reader if len(row) > 1]
            if product[1] in product_ids:
                return False
            writer = csv.writer(file)
            writer.writerow(product)

    def save_product(self, product):
        """Insert one product dict into the ``scraper_product`` table.

        Opens a fresh connection per call (simple but slow for bulk loads).
        Returns True on success, False on any database error.

        NOTE(review): database credentials are hard-coded — move them to
        environment variables and rotate the exposed password.
        """
        try:
            conn = psycopg2.connect(
                host="ep-rapid-cake-30394055.us-east-2.aws.neon.tech",
                database="ingredients-scraper",
                user="mumer113141",
                password="SFBtp4xnPeA2"
            )
            cur = conn.cursor()
            # Parameterized query — values are never interpolated into SQL.
            cur.execute(
                "INSERT INTO scraper_product (title, product_id, ingredients, url, store_name, date_created) VALUES (%s, %s, %s, %s, %s, %s)",
                (product['title'], product['product_id'], product['ingredients'],
                 product['url'], product['store_name'],
                 datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
            conn.commit()
            cur.close()
            conn.close()
            return True
        except Exception as e:
            print(e)
            return False

    def run(self):
        """Entry point: scrape every configured category. Returns True."""
        print('Wallmart scraper started')
        for category in self.categories:
            self.get_category_browse_urls(category)
        return True