# scraper/utils/WallmartScraper.py
import csv
import datetime
import json
import os  # added: used to read proxy/DB credentials from the environment
import re
import time

import httpx
import psycopg2
from bs4 import BeautifulSoup
from httpx_socks import SyncProxyTransport

# Only needed by the commented-out Selenium session bootstrap below;
# re-import if that code path is revived.
# import requests
# from undetected_chromedriver import Chrome, ChromeOptions

class WallmartScraper:
    def __init__(self):
        # Full browser-style Accept string (kept from the original; currently
        # unused, since the session sends 'Accept: */*').
        self.ac = "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"
        self.headers = {
            # Was 'www.walmart.ca' in the original, but every URL below targets
            # walmart.com; an inconsistent Host header breaks those requests.
            'Host': 'www.walmart.com',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0',
            'Accept': '*/*',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate, br',
            'Content-Type': 'application/json',
            'Connection': 'keep-alive'
        }
        # Cookie jar captured from a browser session; many values are stale,
        # session-bound tokens and will likely need refreshing.
        self.cookies = {
'walmart.shippingPostalCode':'P7B3Z7',
'defaultNearestStoreId':'3124',
'zone':"9",
'deliveryCatchment':"3124",
'walmart.csrf':'73996cac34766ec995777784',
'wmt.c':"0",
'vtc':'ZAUFmHNTbFPrWyLrN8WTXA',
'userSegment':'50-percent',
'TBV':"7",
'rxVisitor':'1590552903550G5KJVCBIUCN3R32E3OSSVIKN9FTMDI5M',
'dtSa':'-',
'_ga':'GA1.2.1363574403.1590552905',
'_gid':'GA1.2.85728116.1590552905',
'walmart.id':'24be2423-225b-44d0-851c-9f83c8e47dff',
'usrState':"1",
'walmart.nearestPostalCode':'P7B3Z7',
's_ecid':'MCMID%7C17236695788713957075642593017320325404',
'walmart.locale':'en',
'AMCVS_C4C6370453309C960A490D44%40AdobeOrg':"1",
's_visit':"1",
's_cc':'true',
'og_session_id':'af0a84f8847311e3b233bc764e1107f2.616221.1590552906',
'og_session_id_conf':'af0a84f8847311e3b233bc764e1107f2.616221.1590552906',
'_gcl_au':'1.1.482108716.1590552907',
'_fbp':'fb.1.1590552907225.702607671',
'og_autoship':"0",
'dtCookie':'3$1GS1LRIIKIBM595EBN2HIHIPCU4QVQ3H|5b6d58542e634882|0',
'walmart.nearestLatLng':"48.4120872,-89.2413988",
'dtLatC':"3",
            # (duplicate 'rxVisitor' and 'dtSa' entries removed; a dict literal
            # keeps only the last value for a repeated key anyway)
'DYN_USER_ID':'23c3e447-cab5-4a76-beec-86d431f09b30',
'WM_SEC.AUTH_TOKEN':'MTAyOTYyMDE46M9ya4OWOAX9Ycj9G+/EtZZ2rrXYDwJUPMuf8aNPxGq6es3kBtQx/WxiXKAkaKfkoKbMqixeQFrYdB1W0oSN1wIIzkNIxIEmVq7cOUtRuTRSgSwdxAsAWBT8plmFWLKwj8OFN4dileb20bpDLeCIlSFd/Hsc7bnSe4+TLU2zbj06SQbscc1R1tIesXl4ioL4y1NvN1BBj6GkfAZCjCfhDTASAGkrw9upmzYhCz4UwRzb/SoGFgAYL9DGZ8K45WCXb/Ew67/GsLtdlJHpe1JgEG+jVJ7bQ3VTYSMGmHEYCS8c8IAFKTMeYOPXxSWUpSrKtEbQ9hG+J0B2+kHzA8jyKD+vhACQYbIqsOCISVNY3spUIeGCIOmGJLznpUXbYF3gVk3LktwueMY7RuHPZ68PyA==',
'LT':'1590553091850',
'BVImplmain_site':"2036",
'BVBRANDID':'20ae010b-0053-4a9f-902a-9197d72dc542',
'DYN_USER_ID.ro':'23c3e447-cab5-4a76-beec-86d431f09b30',
'cartId':'b6eb398f-ed49-46e8-8034-af8da418dd90',
'NEXT_GEN.ENABLED':"1",
'_pin_unauth':'NTY4YjUyZDctYzNmOC00NzA5LWExOTYtOWQxOWZlOWVkYjFi',
'TS011fb5f6':'01c5a4e2f941ffc623122b68eca74f3a27e0c416f7e2a5707b9417a73c048cb4be6507e9fd51df79c8015b3ba420dc6643bb0f8309',
'TS0175e29f':'01c5a4e2f941ffc623122b68eca74f3a27e0c416f7e2a5707b9417a73c048cb4be6507e9fd51df79c8015b3ba420dc6643bb0f8309',
'authDuration':'{"lat":"1590555466230000","lt":"1590555466230000"}',
'headerType':'grocery',
's_sq':'%5B%5BB%5D%5D',
'previousBreakpoint':'desktop',
'wmt.breakpoint':'d',
'akaau_P1':'1590607795~id=484ae7f711ac9dd38dbda655bd6ca764',
'TS01f4281b':'01c5a4e2f97a7d51551a734ebe2cb1fc4f7a86c4df28824fb5812f83c96f6df870698b389077cf6f5fd822d05324df82b802c7ad04',
'_uetsid':'2127b16a-c523-20a6-d801-43923775d65e',
'_derived_epik':'dj0yJnU9NC1yUFlPMF9IczhrTlFabmZpYWVTQ0NMZFl5blN2eEMmbj1wX2o0OFVpeUZLWjRUcGM3Rl9xaGFnJm09MSZ0PUFBQUFBRjdPdXpJJnJtPTEmcnQ9QUFBQUFGN091ekk',
'dtPC':'3$6637950_447h-vAKCBSUJVQJIVFIAUKQCIVTJULXFWHTFQ-0',
'rxvt':'1590610238206|1590608438206',
's_gnr':'1590609571427-Repeat',
'AMCV_C4C6370453309C960A490D44%40AdobeOrg':'-408604571%7CMCIDTS%7C18410%7CMCMID%7C17236695788713957075642593017320325404%7CMCAID%7CNONE%7CMCOPTOUT-1590616771s%7CNONE%7CvVersion%7C4.6.0',
'_4c_':'rVJNbxoxEP0rkQ85sbv%2BXHuRooqkUdWqSZQmVY%2FIeL1gZWGRbdimEf89YyCQNqnUQzmYnfF7M5437wn1M7tAQyIqXOJKKEIIH6AH%2BxjQ8AmZZTrX6Vj5Fg3RLMZlGBZF3%2Fd5r9u59jE3urCLYuo7Y%2F1j0fiViyFb26mNetLasM8U1xlTgqIBMl1toRSpcpULiOMviDKGMXwvfVevTBzHx2XC9HZyEuoHuKjt2hk77l0dZ1syxcfszLrpLKY0Vtv00kOAc5nK925Rd%2F2BSUWJj9kjk%2FH0tonv%2BmAT%2B2Lmu7k9UQSyHYiBfmwZAUJvG%2Bv9FvU%2F9Agubmc90Pc5WAKkIbi5uv82Pr8cXdxcv2rZzRcurrzNQmhf954UIRT93Bm90LVOghak%2BHKX0ZziHGdfR3eqCIxgQZVUmNCSVx9Gt%2Bdn5HTu6jMiKSvLSkilJGHwj6UoORUVw0QyihkVHPPT0e3lWVJmCd5ASeW2M7pNY4CbBujTaPz988etrCUTknPM8mQxISgcLyNdXeww%2F9QSSPfeTafWX9k462og3ntdu%2Bi6hW7T0sHGYIhGr9qYwrRV0%2BoQnKlteIjdEm0G6Ofe61AcWjEJ9otgbFVynH6A8K7emx5Z1kwaymRGqagzXpY001KxTHJdlsLYpqyTCLuailFVYYkrstnpsq0hji1ZxSXH6p2WO9f9nVPxtxyYdA9nb%2BDiHfjiZaijRId3w7OBVQLMvaD0H%2FeCKZXE6feAw4USQv0OTRmArg%2B1aNVII7XMYPgq48aYTBtNM6Mbogzs2AqBjkNgxWGOqtwPQdRuhs3mGQ%3D%3D'
}
self.session = self.create_session()
self.categories = [
'personal care', 'beauty', 'health'
]
# options = ChromeOptions()
# options.add_argument("--headless")
# options.add_argument("--disable-gpu")
# options.add_argument("--no-sandbox")
# self.driver = Chrome(options=options)
# self.generate_session()
# def generate_session(self):
# self.driver.get("https://www.walmart.com/")
# print(self.driver.get_cookies())
# cookies = self.driver.get_cookies()
# [self.session.cookies.set(cookie['name'], cookie['value']) for cookie in cookies]
    def create_session(self):
        # The proxy URL (credentials included) was hardcoded in the original;
        # WALLMART_PROXY_URL is an assumed env var name, falling back to the
        # original value so behavior is unchanged when it is unset.
        proxy_url = os.environ.get(
            "WALLMART_PROXY_URL",
            "http://W4a8IruR4dkhNGb6:Hesj0mkBfnJ1n95M_country-us@geo.iproyal.com:12321",
        )
        transport = SyncProxyTransport.from_url(proxy_url)
        return httpx.Client(transport=transport)
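    # The three fetch methods below each duplicate a "request, and on failure
    # rebuild the session and retry once" pattern. A minimal sketch of that
    # pattern as a helper (not wired into the methods below, to keep their
    # behavior unchanged; the name _get_with_retry is an assumption):
    def _get_with_retry(self, url):
        try:
            return self.session.get(url, headers=self.headers, cookies=self.cookies)
        except httpx.HTTPError:
            # Rebuild the proxied client once and retry; a second failure propagates.
            self.session = self.create_session()
            return self.session.get(url, headers=self.headers, cookies=self.cookies)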
    def get_product_detail(self, url):
        try:
            response = self.session.get(
                url, headers=self.headers, cookies=self.cookies)
            soup = BeautifulSoup(response.text, 'html.parser')
            script_tag = soup.find('script', {'id': '__NEXT_DATA__'}).text
        except Exception:
            # Rebuild the session once and retry before giving up.
            try:
                self.session = self.create_session()
                response = self.session.get(
                    url, headers=self.headers, cookies=self.cookies)
                soup = BeautifulSoup(response.text, 'html.parser')
                script_tag = soup.find('script', {'id': '__NEXT_DATA__'}).text
            except Exception:
                return False
        # Parse the __NEXT_DATA__ script tag JSON into a dict.
        script_tag_json = json.loads(script_tag)
        try:
            ingredients = script_tag_json['props']['pageProps']['initialData'][
                'data']['idml']['ingredients']['ingredients']['value']
        except (KeyError, TypeError):
            ingredients = ''
        try:
            # props.pageProps.initialData.data.product.{name, usItemId}
            product_name = script_tag_json['props']['pageProps']['initialData']['data']['product']['name']
            product_id = script_tag_json['props']['pageProps']['initialData']['data']['product']['usItemId']
        except (KeyError, TypeError):
            return False
        return [product_name, product_id, ingredients]
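    # For reference, the slice of __NEXT_DATA__ the method above relies on,
    # with the shape inferred from the key paths used (field values are
    # illustrative, not from the source):
    # {"props": {"pageProps": {"initialData": {"data": {
    #     "product": {"name": "...", "usItemId": "..."},
    #     "idml": {"ingredients": {"ingredients": {"value": "..."}}}
    # }}}}}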
    def get_no_of_pages_of_sub_category(self, url):
        # Pagination detection (fetching the browse page with requests and a
        # Chrome-like header set, then matching r'"maxPage":\s*(\d+)' in the
        # HTML) is commented out upstream; a hardcoded upper bound of 25 pages
        # is returned instead.
        return 25
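    # A sketch of the disabled detection above, re-enabled as an optional
    # helper (the name _detect_max_pages is an assumption; it reuses the
    # proxied session rather than the plain requests call of the commented-out
    # version, and falls back to the hardcoded bound on any failure):
    def _detect_max_pages(self, url, default=25):
        try:
            response = self.session.get(url, headers=self.headers, cookies=self.cookies)
            match = re.search(r'"maxPage":\s*(\d+)', response.text)
            if match:
                return int(match.group(1))
        except Exception:
            pass
        return default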
    def get_category_browse_urls(self, category):
        url = f'https://www.walmart.com/cp/health/976760?q={category}'
        url_pattern = re.compile(
            r'https://www\.walmart\.com/browse/[^/]+/[^/]+/\d+_\d+\?povid=.*')
        try:
            response = self.session.get(url, headers=self.headers, cookies=self.cookies)
        except Exception:
            # Rebuild the session once and retry before giving up.
            try:
                self.session = self.create_session()
                response = self.session.get(url, headers=self.headers, cookies=self.cookies)
            except Exception:
                return False
        # print(response.text)  # debug dump of the raw category page
        soup = BeautifulSoup(response.text, 'html.parser')
        # Extract the href of every anchor tag matching the browse-URL pattern
        # and scrape each matching sub-category.
        for link in soup.find_all("a"):
            href = link.get("href")
            if href and url_pattern.match(href):
                print(href)
                try:
                    total_pages = self.get_no_of_pages_of_sub_category(href)
                    self.get_product_urls(href, int(total_pages))
                except Exception as e:
                    print(e)
    def get_product_urls(self, url, pages):
        urls = []
        for i in range(1, pages + 1):
            request_url = f'{url}&page={i}'
            print(request_url)
            try:
                response = self.session.get(
                    request_url, headers=self.headers, cookies=self.cookies)
                soup = BeautifulSoup(response.text, 'html.parser')
                items = soup.find_all('div', {'class': 'b--near-white'})
            except Exception:
                # Rebuild the session once and retry before giving up.
                try:
                    self.session = self.create_session()
                    response = self.session.get(
                        request_url, headers=self.headers, cookies=self.cookies)
                    soup = BeautifulSoup(response.text, 'html.parser')
                    items = soup.find_all('div', {'class': 'b--near-white'})
                except Exception:
                    return False
            for item in items:
                a_tag = item.find('a')
                if not a_tag:
                    continue
                if a_tag['href'].startswith('/ip'):
                    a_tag['href'] = 'https://www.walmart.com' + a_tag['href']
                product = self.get_product_detail(a_tag['href'])
                time.sleep(2)
                print(product)
                if product:
                    urls.append(a_tag['href'])  # the original never populated this list
                    self.save_product({
                        'title': product[0],
                        'product_id': product[1],
                        'ingredients': product[2],
                        'url': a_tag['href'],
                        'store_name': 'Wallmart'
                    })
                    # self.save_product_to_csv(product)
        print(urls)
        return urls
    def save_product_to_csv(self, product):
        # The with-block closes the file; the original's explicit file.close()
        # inside it was redundant and has been dropped.
        with open('wallmart.csv', 'a+', encoding='utf-8', newline='') as file:
            # Check whether the product ID is already in the CSV before appending.
            file.seek(0)  # move the file pointer to the beginning of the file
            reader = csv.reader(file)
            product_ids = [row[1] for row in reader if len(row) > 1]
            if product[1] in product_ids:
                return False
            writer = csv.writer(file)
            writer.writerow(product)
    def save_product(self, product):
        try:
            # Connection settings (password included) were hardcoded in the
            # original; the DB_* env var names are assumptions, with the
            # original values as fallbacks so behavior is unchanged.
            conn = psycopg2.connect(
                host=os.environ.get("DB_HOST", "ep-rapid-cake-30394055.us-east-2.aws.neon.tech"),
                database=os.environ.get("DB_NAME", "ingredients-scraper"),
                user=os.environ.get("DB_USER", "mumer113141"),
                password=os.environ.get("DB_PASSWORD", "SFBtp4xnPeA2")
            )
            cur = conn.cursor()
            cur.execute(
                "INSERT INTO scraper_product (title, product_id, ingredients, url, store_name, date_created) "
                "VALUES (%s, %s, %s, %s, %s, %s)",
                (product['title'], product['product_id'], product['ingredients'],
                 product['url'], product['store_name'],
                 datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")))
            conn.commit()
            cur.close()
            conn.close()
            return True
        except Exception as e:
            print(e)
            return False
def run(self):
print('Wallmart scraper started')
for category in self.categories:
self.get_category_browse_urls(category)
return True
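

# A minimal usage sketch, assuming the scraper is meant to be run directly
# (the original file only defines the class and never instantiates it):
if __name__ == "__main__":
    scraper = WallmartScraper()
    scraper.run()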