# scraper/utils/WallmartScraper.py
import csv
import datetime
import json
import os  # added: used to read proxy/DB credentials from the environment
import re
import time

import httpx
import psycopg2
from bs4 import BeautifulSoup
from httpx_socks import SyncProxyTransport

# Only needed by the commented-out Selenium session bootstrap below;
# re-import if that code path is revived.
# import requests
# from undetected_chromedriver import Chrome, ChromeOptions

class WallmartScraper:
    def __init__(self):
        # Full browser-style Accept string (kept from the original; currently
        # unused, since the session sends 'Accept: */*').
        self.ac = "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"
        self.headers = {
            # Was 'www.walmart.ca' in the original, but every URL below targets
            # walmart.com; an inconsistent Host header breaks those requests.
            'Host': 'www.walmart.com',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0',
            'Accept': '*/*',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate, br',
            'Content-Type': 'application/json',
            'Connection': 'keep-alive'
        }
        # Cookie jar captured from a browser session; many values are stale,
        # session-bound tokens and will likely need refreshing.
        self.cookies = {
'walmart.shippingPostalCode':'P7B3Z7',
'defaultNearestStoreId':'3124',
'zone':"9",
'deliveryCatchment':"3124",
'walmart.csrf':'73996cac34766ec995777784',
'wmt.c':"0",
'vtc':'ZAUFmHNTbFPrWyLrN8WTXA',
'userSegment':'50-percent',
'TBV':"7",
'rxVisitor':'1590552903550G5KJVCBIUCN3R32E3OSSVIKN9FTMDI5M',
'dtSa':'-',
'_ga':'GA1.2.1363574403.1590552905',
'_gid':'GA1.2.85728116.1590552905',
'walmart.id':'24be2423-225b-44d0-851c-9f83c8e47dff',
'usrState':"1",
'walmart.nearestPostalCode':'P7B3Z7',
's_ecid':'MCMID%7C17236695788713957075642593017320325404',
'walmart.locale':'en',
'AMCVS_C4C6370453309C960A490D44%40AdobeOrg':"1",
's_visit':"1",
's_cc':'true',
'og_session_id':'af0a84f8847311e3b233bc764e1107f2.616221.1590552906',
'og_session_id_conf':'af0a84f8847311e3b233bc764e1107f2.616221.1590552906',
'_gcl_au':'1.1.482108716.1590552907',
'_fbp':'fb.1.1590552907225.702607671',
'og_autoship':"0",
'dtCookie':'3$1GS1LRIIKIBM595EBN2HIHIPCU4QVQ3H|5b6d58542e634882|0',
'walmart.nearestLatLng':"48.4120872,-89.2413988",
'dtLatC':"3",
            # (duplicate 'rxVisitor' and 'dtSa' entries removed; a dict literal
            # keeps only the last value for a repeated key anyway)
'DYN_USER_ID':'23c3e447-cab5-4a76-beec-86d431f09b30',
'WM_SEC.AUTH_TOKEN':'MTAyOTYyMDE46M9ya4OWOAX9Ycj9G+/EtZZ2rrXYDwJUPMuf8aNPxGq6es3kBtQx/WxiXKAkaKfkoKbMqixeQFrYdB1W0oSN1wIIzkNIxIEmVq7cOUtRuTRSgSwdxAsAWBT8plmFWLKwj8OFN4dileb20bpDLeCIlSFd/Hsc7bnSe4+TLU2zbj06SQbscc1R1tIesXl4ioL4y1NvN1BBj6GkfAZCjCfhDTASAGkrw9upmzYhCz4UwRzb/SoGFgAYL9DGZ8K45WCXb/Ew67/GsLtdlJHpe1JgEG+jVJ7bQ3VTYSMGmHEYCS8c8IAFKTMeYOPXxSWUpSrKtEbQ9hG+J0B2+kHzA8jyKD+vhACQYbIqsOCISVNY3spUIeGCIOmGJLznpUXbYF3gVk3LktwueMY7RuHPZ68PyA==',
'LT':'1590553091850',
'BVImplmain_site':"2036",
'BVBRANDID':'20ae010b-0053-4a9f-902a-9197d72dc542',
'DYN_USER_ID.ro':'23c3e447-cab5-4a76-beec-86d431f09b30',
'cartId':'b6eb398f-ed49-46e8-8034-af8da418dd90',
'NEXT_GEN.ENABLED':"1",
'_pin_unauth':'NTY4YjUyZDctYzNmOC00NzA5LWExOTYtOWQxOWZlOWVkYjFi',
'TS011fb5f6':'01c5a4e2f941ffc623122b68eca74f3a27e0c416f7e2a5707b9417a73c048cb4be6507e9fd51df79c8015b3ba420dc6643bb0f8309',
'TS0175e29f':'01c5a4e2f941ffc623122b68eca74f3a27e0c416f7e2a5707b9417a73c048cb4be6507e9fd51df79c8015b3ba420dc6643bb0f8309',
'authDuration':'{"lat":"1590555466230000","lt":"1590555466230000"}',
'headerType':'grocery',
's_sq':'%5B%5BB%5D%5D',
'previousBreakpoint':'desktop',
'wmt.breakpoint':'d',
'akaau_P1':'1590607795~id=484ae7f711ac9dd38dbda655bd6ca764',
'TS01f4281b':'01c5a4e2f97a7d51551a734ebe2cb1fc4f7a86c4df28824fb5812f83c96f6df870698b389077cf6f5fd822d05324df82b802c7ad04',
'_uetsid':'2127b16a-c523-20a6-d801-43923775d65e',
'_derived_epik':'dj0yJnU9NC1yUFlPMF9IczhrTlFabmZpYWVTQ0NMZFl5blN2eEMmbj1wX2o0OFVpeUZLWjRUcGM3Rl9xaGFnJm09MSZ0PUFBQUFBRjdPdXpJJnJtPTEmcnQ9QUFBQUFGN091ekk',
'dtPC':'3$6637950_447h-vAKCBSUJVQJIVFIAUKQCIVTJULXFWHTFQ-0',
'rxvt':'1590610238206|1590608438206',
's_gnr':'1590609571427-Repeat',
'AMCV_C4C6370453309C960A490D44%40AdobeOrg':'-408604571%7CMCIDTS%7C18410%7CMCMID%7C17236695788713957075642593017320325404%7CMCAID%7CNONE%7CMCOPTOUT-1590616771s%7CNONE%7CvVersion%7C4.6.0',
'_4c_':'rVJNbxoxEP0rkQ85sbv%2BXHuRooqkUdWqSZQmVY%2FIeL1gZWGRbdimEf89YyCQNqnUQzmYnfF7M5437wn1M7tAQyIqXOJKKEIIH6AH%2BxjQ8AmZZTrX6Vj5Fg3RLMZlGBZF3%2Fd5r9u59jE3urCLYuo7Y%2F1j0fiViyFb26mNetLasM8U1xlTgqIBMl1toRSpcpULiOMviDKGMXwvfVevTBzHx2XC9HZyEuoHuKjt2hk77l0dZ1syxcfszLrpLKY0Vtv00kOAc5nK925Rd%2F2BSUWJj9kjk%2FH0tonv%2BmAT%2B2Lmu7k9UQSyHYiBfmwZAUJvG%2Bv9FvU%2F9Agubmc90Pc5WAKkIbi5uv82Pr8cXdxcv2rZzRcurrzNQmhf954UIRT93Bm90LVOghak%2BHKX0ZziHGdfR3eqCIxgQZVUmNCSVx9Gt%2Bdn5HTu6jMiKSvLSkilJGHwj6UoORUVw0QyihkVHPPT0e3lWVJmCd5ASeW2M7pNY4CbBujTaPz988etrCUTknPM8mQxISgcLyNdXeww%2F9QSSPfeTafWX9k462og3ntdu%2Bi6hW7T0sHGYIhGr9qYwrRV0%2BoQnKlteIjdEm0G6Ofe61AcWjEJ9otgbFVynH6A8K7emx5Z1kwaymRGqagzXpY001KxTHJdlsLYpqyTCLuailFVYYkrstnpsq0hji1ZxSXH6p2WO9f9nVPxtxyYdA9nb%2BDiHfjiZaijRId3w7OBVQLMvaD0H%2FeCKZXE6feAw4USQv0OTRmArg%2B1aNVII7XMYPgq48aYTBtNM6Mbogzs2AqBjkNgxWGOqtwPQdRuhs3mGQ%3D%3D'
}
self.session = self.create_session()
self.categories = [
'personal care', 'beauty', 'health'
]
# options = ChromeOptions()
# options.add_argument("--headless")
# options.add_argument("--disable-gpu")
# options.add_argument("--no-sandbox")
# self.driver = Chrome(options=options)
# self.generate_session()
# def generate_session(self):
# self.driver.get("https://www.walmart.com/")
# print(self.driver.get_cookies())
# cookies = self.driver.get_cookies()
# [self.session.cookies.set(cookie['name'], cookie['value']) for cookie in cookies]
    def create_session(self):
        # The proxy URL (credentials included) was hardcoded in the original;
        # WALLMART_PROXY_URL is an assumed env var name, falling back to the
        # original value so behavior is unchanged when it is unset.
        proxy_url = os.environ.get(
            "WALLMART_PROXY_URL",
            "http://W4a8IruR4dkhNGb6:Hesj0mkBfnJ1n95M_country-us@geo.iproyal.com:12321",
        )
        transport = SyncProxyTransport.from_url(proxy_url)
        return httpx.Client(transport=transport)
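    # The three fetch methods below each duplicate a "request, and on failure
    # rebuild the session and retry once" pattern. A minimal sketch of that
    # pattern as a helper (not wired into the methods below, to keep their
    # behavior unchanged; the name _get_with_retry is an assumption):
    def _get_with_retry(self, url):
        try:
            return self.session.get(url, headers=self.headers, cookies=self.cookies)
        except httpx.HTTPError:
            # Rebuild the proxied client once and retry; a second failure propagates.
            self.session = self.create_session()
            return self.session.get(url, headers=self.headers, cookies=self.cookies)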
    def get_product_detail(self, url):
        try:
            response = self.session.get(
                url, headers=self.headers, cookies=self.cookies)
            soup = BeautifulSoup(response.text, 'html.parser')
            script_tag = soup.find('script', {'id': '__NEXT_DATA__'}).text
        except Exception:
            # Rebuild the session once and retry before giving up.
            try:
                self.session = self.create_session()
                response = self.session.get(
                    url, headers=self.headers, cookies=self.cookies)
                soup = BeautifulSoup(response.text, 'html.parser')
                script_tag = soup.find('script', {'id': '__NEXT_DATA__'}).text
            except Exception:
                return False
        # Parse the __NEXT_DATA__ script tag JSON into a dict.
        script_tag_json = json.loads(script_tag)
        try:
            ingredients = script_tag_json['props']['pageProps']['initialData'][
                'data']['idml']['ingredients']['ingredients']['value']
        except (KeyError, TypeError):
            ingredients = ''
        try:
            # props.pageProps.initialData.data.product.{name, usItemId}
            product_name = script_tag_json['props']['pageProps']['initialData']['data']['product']['name']
            product_id = script_tag_json['props']['pageProps']['initialData']['data']['product']['usItemId']
        except (KeyError, TypeError):
            return False
        return [product_name, product_id, ingredients]
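    # For reference, the slice of __NEXT_DATA__ the method above relies on,
    # with the shape inferred from the key paths used (field values are
    # illustrative, not from the source):
    # {"props": {"pageProps": {"initialData": {"data": {
    #     "product": {"name": "...", "usItemId": "..."},
    #     "idml": {"ingredients": {"ingredients": {"value": "..."}}}
    # }}}}}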
    def get_no_of_pages_of_sub_category(self, url):
        # Pagination detection (fetching the browse page with requests and a
        # Chrome-like header set, then matching r'"maxPage":\s*(\d+)' in the
        # HTML) is commented out upstream; a hardcoded upper bound of 25 pages
        # is returned instead.
        return 25
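    # A sketch of the disabled detection above, re-enabled as an optional
    # helper (the name _detect_max_pages is an assumption; it reuses the
    # proxied session rather than the plain requests call of the commented-out
    # version, and falls back to the hardcoded bound on any failure):
    def _detect_max_pages(self, url, default=25):
        try:
            response = self.session.get(url, headers=self.headers, cookies=self.cookies)
            match = re.search(r'"maxPage":\s*(\d+)', response.text)
            if match:
                return int(match.group(1))
        except Exception:
            pass
        return default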
    def get_category_browse_urls(self, category):
        url = f'https://www.walmart.com/cp/health/976760?q={category}'
        url_pattern = re.compile(
            r'https://www\.walmart\.com/browse/[^/]+/[^/]+/\d+_\d+\?povid=.*')
        try:
            response = self.session.get(url, headers=self.headers, cookies=self.cookies)
        except Exception:
            # Rebuild the session once and retry before giving up.
            try:
                self.session = self.create_session()
                response = self.session.get(url, headers=self.headers, cookies=self.cookies)
            except Exception:
                return False
        # print(response.text)  # debug dump of the raw category page
        soup = BeautifulSoup(response.text, 'html.parser')
        # Extract the href of every anchor tag matching the browse-URL pattern
        # and scrape each matching sub-category.
        for link in soup.find_all("a"):
            href = link.get("href")
            if href and url_pattern.match(href):
                print(href)
                try:
                    total_pages = self.get_no_of_pages_of_sub_category(href)
                    self.get_product_urls(href, int(total_pages))
                except Exception as e:
                    print(e)
    def get_product_urls(self, url, pages):
        urls = []
        for i in range(1, pages + 1):
            request_url = f'{url}&page={i}'
            print(request_url)
            try:
                response = self.session.get(
                    request_url, headers=self.headers, cookies=self.cookies)
                soup = BeautifulSoup(response.text, 'html.parser')
                items = soup.find_all('div', {'class': 'b--near-white'})
            except Exception:
                # Rebuild the session once and retry before giving up.
                try:
                    self.session = self.create_session()
                    response = self.session.get(
                        request_url, headers=self.headers, cookies=self.cookies)
                    soup = BeautifulSoup(response.text, 'html.parser')
                    items = soup.find_all('div', {'class': 'b--near-white'})
                except Exception:
                    return False
            for item in items:
                a_tag = item.find('a')
                if not a_tag:
                    continue
                if a_tag['href'].startswith('/ip'):
                    a_tag['href'] = 'https://www.walmart.com' + a_tag['href']
                product = self.get_product_detail(a_tag['href'])
                time.sleep(2)
                print(product)
                if product:
                    urls.append(a_tag['href'])  # the original never populated this list
                    self.save_product({
                        'title': product[0],
                        'product_id': product[1],
                        'ingredients': product[2],
                        'url': a_tag['href'],
                        'store_name': 'Wallmart'
                    })
                    # self.save_product_to_csv(product)
        print(urls)
        return urls
    def save_product_to_csv(self, product):
        # The with-block closes the file; the original's explicit file.close()
        # inside it was redundant and has been dropped.
        with open('wallmart.csv', 'a+', encoding='utf-8', newline='') as file:
            # Check whether the product ID is already in the CSV before appending.
            file.seek(0)  # move the file pointer to the beginning of the file
            reader = csv.reader(file)
            product_ids = [row[1] for row in reader if len(row) > 1]
            if product[1] in product_ids:
                return False
            writer = csv.writer(file)
            writer.writerow(product)
    def save_product(self, product):
        try:
            # Connection settings (password included) were hardcoded in the
            # original; the DB_* env var names are assumptions, with the
            # original values as fallbacks so behavior is unchanged.
            conn = psycopg2.connect(
                host=os.environ.get("DB_HOST", "ep-rapid-cake-30394055.us-east-2.aws.neon.tech"),
                database=os.environ.get("DB_NAME", "ingredients-scraper"),
                user=os.environ.get("DB_USER", "mumer113141"),
                password=os.environ.get("DB_PASSWORD", "SFBtp4xnPeA2")
            )
            cur = conn.cursor()
            cur.execute(
                "INSERT INTO scraper_product (title, product_id, ingredients, url, store_name, date_created) "
                "VALUES (%s, %s, %s, %s, %s, %s)",
                (product['title'], product['product_id'], product['ingredients'],
                 product['url'], product['store_name'],
                 datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
            conn.commit()
            cur.close()
            conn.close()
            return True
        except Exception as e:
            print(e)
            return False
def run(self):
print('Wallmart scraper started')
for category in self.categories:
self.get_category_browse_urls(category)
return True
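

# A minimal usage sketch, assuming the scraper is meant to be run directly
# (the original file only defines the class and never instantiates it):
if __name__ == "__main__":
    scraper = WallmartScraper()
    scraper.run()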