Spaces:
Runtime error
Runtime error
| import requests | |
| from bs4 import BeautifulSoup | |
| import re | |
| import csv | |
| import json | |
| import time | |
| from undetected_chromedriver import Chrome, ChromeOptions | |
| import psycopg2 | |
| import datetime | |
| from httpx_socks import SyncProxyTransport | |
| import httpx | |
class WallmartScraper:
    """Scrape product names, ids and ingredient lists from walmart.com.

    The crawl goes: category landing page -> "browse" sub-category links ->
    paginated product grids -> individual product pages.  Each product page
    embeds its data as JSON in a ``<script id="__NEXT_DATA__">`` tag, which is
    parsed and stored through :meth:`save_product`.

    All HTTP traffic goes through a proxied ``httpx.Client``.  When a request
    (or the parsing of its response) fails, the client is rebuilt once and the
    request is retried before the operation is reported as failed.

    Methods signal failure by returning ``False`` rather than raising, which
    is the convention the original code used.
    """

    # NOTE(review): these credentials were hard-coded in the original source
    # and are kept only as backward-compatible defaults.  They should be
    # rotated and supplied through the constructor (or a secret store).
    proxy_url = "http://W4a8IruR4dkhNGb6:Hesj0mkBfnJ1n95M_country-us@geo.iproyal.com:12321"
    db_config = {
        'host': "ep-rapid-cake-30394055.us-east-2.aws.neon.tech",
        'database': "ingredients-scraper",
        'user': "mumer113141",
        'password': "SFBtp4xnPeA2",
    }

    # Pause (seconds) after every product-page request.
    request_delay = 2

    # Absolute sub-category ("browse") links found on a category page.
    browse_url_pattern = re.compile(
        r'https://www.walmart.com/browse/[^/]+/[^/]+/\d+_\d+\?povid=.*')

    def __init__(self, proxy_url=None, db_config=None, request_delay=None):
        """Set up request headers, cookies, the proxied session and categories.

        Args:
            proxy_url: Optional proxy URL overriding the class default.
            db_config: Optional mapping of ``psycopg2.connect`` keyword
                arguments overriding the class default.
            request_delay: Optional pause in seconds between product requests.
        """
        if proxy_url is not None:
            self.proxy_url = proxy_url
        if db_config is not None:
            self.db_config = dict(db_config)
        if request_delay is not None:
            self.request_delay = request_delay

        # Not read anywhere in this class; kept because it is a public attribute.
        self.ac = "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"
        # The original also sent 'Host': 'www.walmart.ca', which contradicts
        # the www.walmart.com URLs requested below.  It is omitted so httpx
        # derives the Host header from each request URL.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0',
            'Accept': '*/*',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate, br',
            'Content-Type': 'application/json',
            'Connection': 'keep-alive',
        }
        # Cookie values are copied verbatim from the original source.
        # NOTE(review): they look like a captured browser session (timestamps
        # from 2020) and are presumably stale -- confirm they are still needed.
        # The original literal repeated 'rxVisitor' and 'dtSa' with identical
        # values; the duplicates are dropped, the resulting dict is the same.
        self.cookies = {
            'walmart.shippingPostalCode': 'P7B3Z7',
            'defaultNearestStoreId': '3124',
            'zone': "9",
            'deliveryCatchment': "3124",
            'walmart.csrf': '73996cac34766ec995777784',
            'wmt.c': "0",
            'vtc': 'ZAUFmHNTbFPrWyLrN8WTXA',
            'userSegment': '50-percent',
            'TBV': "7",
            'rxVisitor': '1590552903550G5KJVCBIUCN3R32E3OSSVIKN9FTMDI5M',
            'dtSa': '-',
            '_ga': 'GA1.2.1363574403.1590552905',
            '_gid': 'GA1.2.85728116.1590552905',
            'walmart.id': '24be2423-225b-44d0-851c-9f83c8e47dff',
            'usrState': "1",
            'walmart.nearestPostalCode': 'P7B3Z7',
            's_ecid': 'MCMID%7C17236695788713957075642593017320325404',
            'walmart.locale': 'en',
            'AMCVS_C4C6370453309C960A490D44%40AdobeOrg': "1",
            's_visit': "1",
            's_cc': 'true',
            'og_session_id': 'af0a84f8847311e3b233bc764e1107f2.616221.1590552906',
            'og_session_id_conf': 'af0a84f8847311e3b233bc764e1107f2.616221.1590552906',
            '_gcl_au': '1.1.482108716.1590552907',
            '_fbp': 'fb.1.1590552907225.702607671',
            'og_autoship': "0",
            'dtCookie': '3$1GS1LRIIKIBM595EBN2HIHIPCU4QVQ3H|5b6d58542e634882|0',
            'walmart.nearestLatLng': "48.4120872,-89.2413988",
            'dtLatC': "3",
            'DYN_USER_ID': '23c3e447-cab5-4a76-beec-86d431f09b30',
            'WM_SEC.AUTH_TOKEN': 'MTAyOTYyMDE46M9ya4OWOAX9Ycj9G+/EtZZ2rrXYDwJUPMuf8aNPxGq6es3kBtQx/WxiXKAkaKfkoKbMqixeQFrYdB1W0oSN1wIIzkNIxIEmVq7cOUtRuTRSgSwdxAsAWBT8plmFWLKwj8OFN4dileb20bpDLeCIlSFd/Hsc7bnSe4+TLU2zbj06SQbscc1R1tIesXl4ioL4y1NvN1BBj6GkfAZCjCfhDTASAGkrw9upmzYhCz4UwRzb/SoGFgAYL9DGZ8K45WCXb/Ew67/GsLtdlJHpe1JgEG+jVJ7bQ3VTYSMGmHEYCS8c8IAFKTMeYOPXxSWUpSrKtEbQ9hG+J0B2+kHzA8jyKD+vhACQYbIqsOCISVNY3spUIeGCIOmGJLznpUXbYF3gVk3LktwueMY7RuHPZ68PyA==',
            'LT': '1590553091850',
            'BVImplmain_site': "2036",
            'BVBRANDID': '20ae010b-0053-4a9f-902a-9197d72dc542',
            'DYN_USER_ID.ro': '23c3e447-cab5-4a76-beec-86d431f09b30',
            'cartId': 'b6eb398f-ed49-46e8-8034-af8da418dd90',
            'NEXT_GEN.ENABLED': "1",
            '_pin_unauth': 'NTY4YjUyZDctYzNmOC00NzA5LWExOTYtOWQxOWZlOWVkYjFi',
            'TS011fb5f6': '01c5a4e2f941ffc623122b68eca74f3a27e0c416f7e2a5707b9417a73c048cb4be6507e9fd51df79c8015b3ba420dc6643bb0f8309',
            'TS0175e29f': '01c5a4e2f941ffc623122b68eca74f3a27e0c416f7e2a5707b9417a73c048cb4be6507e9fd51df79c8015b3ba420dc6643bb0f8309',
            'authDuration': '{"lat":"1590555466230000","lt":"1590555466230000"}',
            'headerType': 'grocery',
            's_sq': '%5B%5BB%5D%5D',
            'previousBreakpoint': 'desktop',
            'wmt.breakpoint': 'd',
            'akaau_P1': '1590607795~id=484ae7f711ac9dd38dbda655bd6ca764',
            'TS01f4281b': '01c5a4e2f97a7d51551a734ebe2cb1fc4f7a86c4df28824fb5812f83c96f6df870698b389077cf6f5fd822d05324df82b802c7ad04',
            '_uetsid': '2127b16a-c523-20a6-d801-43923775d65e',
            '_derived_epik': 'dj0yJnU9NC1yUFlPMF9IczhrTlFabmZpYWVTQ0NMZFl5blN2eEMmbj1wX2o0OFVpeUZLWjRUcGM3Rl9xaGFnJm09MSZ0PUFBQUFBRjdPdXpJJnJtPTEmcnQ9QUFBQUFGN091ekk',
            'dtPC': '3$6637950_447h-vAKCBSUJVQJIVFIAUKQCIVTJULXFWHTFQ-0',
            'rxvt': '1590610238206|1590608438206',
            's_gnr': '1590609571427-Repeat',
            'AMCV_C4C6370453309C960A490D44%40AdobeOrg': '-408604571%7CMCIDTS%7C18410%7CMCMID%7C17236695788713957075642593017320325404%7CMCAID%7CNONE%7CMCOPTOUT-1590616771s%7CNONE%7CvVersion%7C4.6.0',
            '_4c_': 'rVJNbxoxEP0rkQ85sbv%2BXHuRooqkUdWqSZQmVY%2FIeL1gZWGRbdimEf89YyCQNqnUQzmYnfF7M5437wn1M7tAQyIqXOJKKEIIH6AH%2BxjQ8AmZZTrX6Vj5Fg3RLMZlGBZF3%2Fd5r9u59jE3urCLYuo7Y%2F1j0fiViyFb26mNetLasM8U1xlTgqIBMl1toRSpcpULiOMviDKGMXwvfVevTBzHx2XC9HZyEuoHuKjt2hk77l0dZ1syxcfszLrpLKY0Vtv00kOAc5nK925Rd%2F2BSUWJj9kjk%2FH0tonv%2BmAT%2B2Lmu7k9UQSyHYiBfmwZAUJvG%2Bv9FvU%2F9Agubmc90Pc5WAKkIbi5uv82Pr8cXdxcv2rZzRcurrzNQmhf954UIRT93Bm90LVOghak%2BHKX0ZziHGdfR3eqCIxgQZVUmNCSVx9Gt%2Bdn5HTu6jMiKSvLSkilJGHwj6UoORUVw0QyihkVHPPT0e3lWVJmCd5ASeW2M7pNY4CbBujTaPz988etrCUTknPM8mQxISgcLyNdXeww%2F9QSSPfeTafWX9k462og3ntdu%2Bi6hW7T0sHGYIhGr9qYwrRV0%2BoQnKlteIjdEm0G6Ofe61AcWjEJ9otgbFVynH6A8K7emx5Z1kwaymRGqagzXpY001KxTHJdlsLYpqyTCLuailFVYYkrstnpsq0hji1ZxSXH6p2WO9f9nVPxtxyYdA9nb%2BDiHfjiZaijRId3w7OBVQLMvaD0H%2FeCKZXE6feAw4USQv0OTRmArg%2B1aNVII7XMYPgq48aYTBtNM6Mbogzs2AqBjkNgxWGOqtwPQdRuhs3mGQ%3D%3D',
        }
        self.session = self.create_session()
        self.categories = [
            'personal care', 'beauty', 'health'
        ]
        # A previous revision bootstrapped cookies from a headless
        # undetected-chromedriver session; that code was commented out and
        # has been removed here.

    def create_session(self):
        """Return a new ``httpx.Client`` that routes traffic via ``self.proxy_url``."""
        transport = SyncProxyTransport.from_url(self.proxy_url)
        return httpx.Client(transport=transport)

    def _reset_session(self):
        """Close the current client (if any) and replace it with a fresh one."""
        stale = getattr(self, 'session', None)
        if stale is not None:
            try:
                stale.close()
            except Exception as exc:
                # Closing is best effort; a broken client must not stop the retry.
                print(f'Could not close previous session: {exc!r}')
        self.session = self.create_session()

    def _fetch_with_retry(self, url, parse):
        """GET *url* and return ``parse(response.text)``.

        The second (and last) attempt runs on a freshly created session.
        Parsing is part of the retried work on purpose: a block page parses
        to nothing useful, and a new proxy session may get a real page.

        Returns:
            Whatever *parse* returns, or ``None`` when both attempts failed.
        """
        for attempt in (1, 2):
            try:
                if attempt == 2:
                    self._reset_session()
                response = self.session.get(
                    url, headers=self.headers, cookies=self.cookies)
                return parse(response.text)
            except Exception as exc:
                # Network boundary: proxy, transport and parse errors are all
                # treated alike.  Unlike a bare ``except`` this still lets
                # KeyboardInterrupt / SystemExit stop the crawl.
                print(f'Request for {url} failed (attempt {attempt}): {exc!r}')
        return None

    @staticmethod
    def _parse_next_data(html):
        """Return the decoded ``__NEXT_DATA__`` JSON embedded in *html*.

        Raises:
            ValueError: if the script tag is missing or its JSON is invalid.
        """
        soup = BeautifulSoup(html, 'html.parser')
        script_tag = soup.find('script', {'id': '__NEXT_DATA__'})
        if script_tag is None:
            raise ValueError('__NEXT_DATA__ script tag not found')
        return json.loads(script_tag.text)

    @staticmethod
    def _extract_product(page_data):
        """Pull ``[name, usItemId, ingredients]`` out of decoded page JSON.

        Returns ``False`` when the name or id is missing; a missing
        ingredient list is reported as ``''``.
        """
        try:
            data = page_data['props']['pageProps']['initialData']['data']
            product = data['product']
            product_name = product['name']
            product_id = product['usItemId']
        except (KeyError, TypeError, IndexError):
            return False
        try:
            ingredients = data['idml']['ingredients']['ingredients']['value']
        except (KeyError, TypeError, IndexError):
            ingredients = ''
        return [product_name, product_id, ingredients]

    def get_product_detail(self, url):
        """Fetch one product page and extract its details.

        Returns:
            ``[product_name, product_id, ingredients]`` on success, ``False``
            when the page could not be fetched or lacks the expected data.
        """
        page_data = self._fetch_with_retry(url, self._parse_next_data)
        if page_data is None:
            return False
        return self._extract_product(page_data)

    def get_no_of_pages_of_sub_category(self, url):
        """Return how many result pages to crawl for a sub-category.

        Currently a fixed 25 regardless of *url*.  An earlier, disabled
        implementation fetched the page and read the count from a
        ``"maxPage": <n>`` fragment in the response.
        """
        return 25

    def _parse_browse_links(self, html):
        """Return the distinct browse links in *html*, in document order."""
        soup = BeautifulSoup(html, 'html.parser')
        hrefs = (link.get('href') for link in soup.find_all('a'))
        matching = (
            href for href in hrefs
            if href and self.browse_url_pattern.match(href)
        )
        # dict.fromkeys de-duplicates while keeping first-seen order.
        return list(dict.fromkeys(matching))

    def get_category_browse_urls(self, category):
        """Crawl every sub-category linked from the page for *category*.

        NOTE(review): the original source lost its indentation, so it is
        unclear whether it meant to crawl every matching link or only the last
        anchor on the page; this version crawls each distinct matching link.

        Returns:
            ``True`` once the category page was processed, ``False`` when it
            could not be fetched.
        """
        url = f'https://www.walmart.com/cp/health/976760?q={category}'
        browse_urls = self._fetch_with_retry(url, self._parse_browse_links)
        if browse_urls is None:
            return False
        for href in browse_urls:
            print(href)
            try:
                total_pages = self.get_no_of_pages_of_sub_category(href)
                self.get_product_urls(href, int(total_pages))
            except Exception as e:
                # One failing sub-category must not abort the others.
                print(e)
        return True

    @staticmethod
    def _parse_item_links(html):
        """Return the href of the first anchor in every product tile of *html*."""
        soup = BeautifulSoup(html, 'html.parser')
        links = []
        for item in soup.find_all('div', {'class': 'b--near-white'}):
            a_tag = item.find('a')
            href = a_tag.get('href') if a_tag is not None else None
            if href:
                links.append(href)
        return links

    def get_product_urls(self, url, pages):
        """Scrape and store every product on pages ``1..pages`` of *url*.

        Args:
            url: Sub-category URL; ``&page=<n>`` is appended to it.
            pages: Number of result pages to visit.

        Returns:
            The list of product URLs that were successfully scraped, or
            ``False`` as soon as a result page cannot be fetched.
        """
        urls = []
        for i in range(1, pages + 1):
            request_url = f'{url}&page={i}'
            print(request_url)
            hrefs = self._fetch_with_retry(request_url, self._parse_item_links)
            if hrefs is None:
                return False
            for href in hrefs:
                if href.startswith('/ip'):
                    # Product links are site-relative; make them absolute.
                    href = 'https://www.walmart.com' + href
                product = self.get_product_detail(href)
                time.sleep(self.request_delay)
                print(product)
                if product:
                    urls.append(href)
                    self.save_product({
                        'title': product[0],
                        'product_id': product[1],
                        'ingredients': product[2],
                        'url': href,
                        'store_name': 'Wallmart'
                    })
        print(urls)
        return urls

    def save_product_to_csv(self, product):
        """Append *product* to ``wallmart.csv`` unless its id is already there.

        Args:
            product: Sequence whose second element is the product id.

        Returns:
            ``True`` when a row was written, ``False`` for a duplicate.
        """
        with open('wallmart.csv', 'a+', encoding='utf-8', newline='') as file:
            # 'a+' opens positioned at the end; rewind to read existing rows.
            file.seek(0)
            # Blank or short rows carry no id and are skipped.
            product_ids = {row[1] for row in csv.reader(file) if len(row) > 1}
            # Ids read back from the file are strings; compare like with like.
            if str(product[1]) in product_ids:
                return False
            csv.writer(file).writerow(product)
        return True

    def save_product(self, product):
        """Insert *product* into the ``scraper_product`` table.

        Args:
            product: Mapping with ``title``, ``product_id``, ``ingredients``,
                ``url`` and ``store_name`` keys.

        Returns:
            ``True`` when the row was committed, ``False`` on any error (the
            error is printed).
        """
        try:
            conn = psycopg2.connect(**self.db_config)
        except Exception as e:
            print(e)
            return False
        try:
            with conn.cursor() as cur:
                cur.execute("INSERT INTO scraper_product (title, product_id, ingredients, url, store_name, date_created) VALUES (%s, %s, %s, %s, %s, %s)", (product['title'], product['product_id'], product['ingredients'], product['url'], product['store_name'], datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
            conn.commit()
            return True
        except Exception as e:
            print(e)
            return False
        finally:
            # Always release the connection, including when the insert fails.
            conn.close()

    def run(self):
        """Crawl every configured category in order; always returns ``True``."""
        print('Wallmart scraper started')
        for category in self.categories:
            self.get_category_browse_urls(category)
        return True