Spaces:
Runtime error
Runtime error
Commit
·
369e8bf
1
Parent(s):
643dbca
Update scraper/utils/WallmartScraper.py
Browse files- scraper/utils/WallmartScraper.py +172 -42
scraper/utils/WallmartScraper.py
CHANGED
|
@@ -4,35 +4,120 @@ import re
|
|
| 4 |
import csv
|
| 5 |
import json
|
| 6 |
import time
|
| 7 |
-
from .DatabaseDataSaver import save_product
|
| 8 |
from undetected_chromedriver import Chrome, ChromeOptions
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
class WallmartScraper:
|
| 10 |
def __init__(self):
|
| 11 |
self.ac = "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"
|
| 12 |
-
self.headers = {
|
| 13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
self.categories = [
|
| 15 |
-
'
|
| 16 |
]
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
options
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
def get_product_detail(self, url):
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
# convert script tag JSON to dict
|
| 37 |
script_tag_json = json.loads(script_tag)
|
| 38 |
try:
|
|
@@ -40,10 +125,14 @@ class WallmartScraper:
|
|
| 40 |
'data']['idml']['ingredients']['ingredients']['value']
|
| 41 |
except:
|
| 42 |
ingridents = ''
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
return [product_name, product_id, ingridents]
|
| 48 |
|
| 49 |
def get_no_of_pages_of_sub_category(self, url):
|
|
@@ -63,12 +152,22 @@ class WallmartScraper:
|
|
| 63 |
|
| 64 |
def get_category_browse_urls(self, category):
|
| 65 |
url = f'https://www.walmart.com/cp/health/976760?q={category}'
|
| 66 |
-
|
| 67 |
-
url, headers=self.headers)
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
url_matches = url_pattern.findall(response.text)
|
|
|
|
| 72 |
soup = BeautifulSoup(response.text, 'html.parser')
|
| 73 |
# Find all anchor (a) tags in the HTML
|
| 74 |
all_links = soup.find_all("a")
|
|
@@ -77,6 +176,7 @@ class WallmartScraper:
|
|
| 77 |
for link in all_links:
|
| 78 |
href = link.get("href")
|
| 79 |
if href and url_pattern.match(href):
|
|
|
|
| 80 |
print(href)
|
| 81 |
try:
|
| 82 |
total_pages = self.get_no_of_pages_of_sub_category(href)
|
|
@@ -93,11 +193,22 @@ class WallmartScraper:
|
|
| 93 |
|
| 94 |
request_url = f'{url}&page={i}'
|
| 95 |
print(request_url)
|
| 96 |
-
|
| 97 |
-
|
|
|
|
| 98 |
|
| 99 |
-
|
| 100 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
for item in items:
|
| 102 |
a_tag = item.find('a')
|
| 103 |
if a_tag:
|
|
@@ -107,13 +218,14 @@ class WallmartScraper:
|
|
| 107 |
product = self.get_product_detail(a_tag['href'])
|
| 108 |
time.sleep(2)
|
| 109 |
print(product)
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
|
|
|
| 117 |
# self.save_product_to_csv(product)
|
| 118 |
print(urls)
|
| 119 |
return urls
|
|
@@ -130,8 +242,26 @@ class WallmartScraper:
|
|
| 130 |
writer = csv.writer(file)
|
| 131 |
writer.writerow(product)
|
| 132 |
file.close()
|
| 133 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 134 |
def run(self):
|
|
|
|
| 135 |
for category in self.categories:
|
| 136 |
self.get_category_browse_urls(category)
|
| 137 |
|
|
|
|
| 4 |
import csv
|
| 5 |
import json
|
| 6 |
import time
|
|
|
|
| 7 |
from undetected_chromedriver import Chrome, ChromeOptions
|
| 8 |
+
import psycopg2
|
| 9 |
+
import datetime
|
| 10 |
+
from httpx_socks import SyncProxyTransport
|
| 11 |
+
import httpx
|
| 12 |
+
|
| 13 |
class WallmartScraper:
|
| 14 |
def __init__(self):
    """Set up request headers, cookies, the HTTP session and the search categories.

    Attributes assigned here:
        ac: an ``Accept`` header value (not referenced elsewhere in this method).
        headers: header dict passed with every ``self.session.get`` call.
        cookies: cookie dict passed with every ``self.session.get`` call.
        session: client returned by ``self.create_session()``.
        categories: search terms iterated by ``run``.
    """
    self.ac = "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"
    # NOTE(review): 'Host' is www.walmart.ca, but the URLs requested in this
    # file point at www.walmart.com -- confirm this mismatch is intended.
    self.headers = {
        'Host': 'www.walmart.ca',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0',
        'Accept': '*/*',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate, br',
        'Content-Type': 'application/json',
        'Connection': 'keep-alive'
    }
    # NOTE(review): these look like values captured from a browser session,
    # including auth tokens; presumably they expire -- consider loading them
    # from configuration instead of source.
    # The original literal listed 'rxVisitor' and 'dtSa' twice with identical
    # values; each key is kept once, which yields the same dict.
    self.cookies = {
        'walmart.shippingPostalCode':'P7B3Z7',
        'defaultNearestStoreId':'3124',
        'zone':"9",
        'deliveryCatchment':"3124",
        'walmart.csrf':'73996cac34766ec995777784',
        'wmt.c':"0",
        'vtc':'ZAUFmHNTbFPrWyLrN8WTXA',
        'userSegment':'50-percent',
        'TBV':"7",
        'rxVisitor':'1590552903550G5KJVCBIUCN3R32E3OSSVIKN9FTMDI5M',
        'dtSa':'-',
        '_ga':'GA1.2.1363574403.1590552905',
        '_gid':'GA1.2.85728116.1590552905',
        'walmart.id':'24be2423-225b-44d0-851c-9f83c8e47dff',
        'usrState':"1",
        'walmart.nearestPostalCode':'P7B3Z7',
        's_ecid':'MCMID%7C17236695788713957075642593017320325404',
        'walmart.locale':'en',
        'AMCVS_C4C6370453309C960A490D44%40AdobeOrg':"1",
        's_visit':"1",
        's_cc':'true',
        'og_session_id':'af0a84f8847311e3b233bc764e1107f2.616221.1590552906',
        'og_session_id_conf':'af0a84f8847311e3b233bc764e1107f2.616221.1590552906',
        '_gcl_au':'1.1.482108716.1590552907',
        '_fbp':'fb.1.1590552907225.702607671',
        'og_autoship':"0",
        'dtCookie':'3$1GS1LRIIKIBM595EBN2HIHIPCU4QVQ3H|5b6d58542e634882|0',
        'walmart.nearestLatLng':"48.4120872,-89.2413988",
        'dtLatC':"3",
        'DYN_USER_ID':'23c3e447-cab5-4a76-beec-86d431f09b30',
        'WM_SEC.AUTH_TOKEN':'MTAyOTYyMDE46M9ya4OWOAX9Ycj9G+/EtZZ2rrXYDwJUPMuf8aNPxGq6es3kBtQx/WxiXKAkaKfkoKbMqixeQFrYdB1W0oSN1wIIzkNIxIEmVq7cOUtRuTRSgSwdxAsAWBT8plmFWLKwj8OFN4dileb20bpDLeCIlSFd/Hsc7bnSe4+TLU2zbj06SQbscc1R1tIesXl4ioL4y1NvN1BBj6GkfAZCjCfhDTASAGkrw9upmzYhCz4UwRzb/SoGFgAYL9DGZ8K45WCXb/Ew67/GsLtdlJHpe1JgEG+jVJ7bQ3VTYSMGmHEYCS8c8IAFKTMeYOPXxSWUpSrKtEbQ9hG+J0B2+kHzA8jyKD+vhACQYbIqsOCISVNY3spUIeGCIOmGJLznpUXbYF3gVk3LktwueMY7RuHPZ68PyA==',
        'LT':'1590553091850',
        'BVImplmain_site':"2036",
        'BVBRANDID':'20ae010b-0053-4a9f-902a-9197d72dc542',
        'DYN_USER_ID.ro':'23c3e447-cab5-4a76-beec-86d431f09b30',
        'cartId':'b6eb398f-ed49-46e8-8034-af8da418dd90',
        'NEXT_GEN.ENABLED':"1",
        '_pin_unauth':'NTY4YjUyZDctYzNmOC00NzA5LWExOTYtOWQxOWZlOWVkYjFi',
        'TS011fb5f6':'01c5a4e2f941ffc623122b68eca74f3a27e0c416f7e2a5707b9417a73c048cb4be6507e9fd51df79c8015b3ba420dc6643bb0f8309',
        'TS0175e29f':'01c5a4e2f941ffc623122b68eca74f3a27e0c416f7e2a5707b9417a73c048cb4be6507e9fd51df79c8015b3ba420dc6643bb0f8309',
        'authDuration':'{"lat":"1590555466230000","lt":"1590555466230000"}',
        'headerType':'grocery',
        's_sq':'%5B%5BB%5D%5D',
        'previousBreakpoint':'desktop',
        'wmt.breakpoint':'d',
        'akaau_P1':'1590607795~id=484ae7f711ac9dd38dbda655bd6ca764',
        'TS01f4281b':'01c5a4e2f97a7d51551a734ebe2cb1fc4f7a86c4df28824fb5812f83c96f6df870698b389077cf6f5fd822d05324df82b802c7ad04',
        '_uetsid':'2127b16a-c523-20a6-d801-43923775d65e',
        '_derived_epik':'dj0yJnU9NC1yUFlPMF9IczhrTlFabmZpYWVTQ0NMZFl5blN2eEMmbj1wX2o0OFVpeUZLWjRUcGM3Rl9xaGFnJm09MSZ0PUFBQUFBRjdPdXpJJnJtPTEmcnQ9QUFBQUFGN091ekk',
        'dtPC':'3$6637950_447h-vAKCBSUJVQJIVFIAUKQCIVTJULXFWHTFQ-0',
        'rxvt':'1590610238206|1590608438206',
        's_gnr':'1590609571427-Repeat',
        'AMCV_C4C6370453309C960A490D44%40AdobeOrg':'-408604571%7CMCIDTS%7C18410%7CMCMID%7C17236695788713957075642593017320325404%7CMCAID%7CNONE%7CMCOPTOUT-1590616771s%7CNONE%7CvVersion%7C4.6.0',
        '_4c_':'rVJNbxoxEP0rkQ85sbv%2BXHuRooqkUdWqSZQmVY%2FIeL1gZWGRbdimEf89YyCQNqnUQzmYnfF7M5437wn1M7tAQyIqXOJKKEIIH6AH%2BxjQ8AmZZTrX6Vj5Fg3RLMZlGBZF3%2Fd5r9u59jE3urCLYuo7Y%2F1j0fiViyFb26mNetLasM8U1xlTgqIBMl1toRSpcpULiOMviDKGMXwvfVevTBzHx2XC9HZyEuoHuKjt2hk77l0dZ1syxcfszLrpLKY0Vtv00kOAc5nK925Rd%2F2BSUWJj9kjk%2FH0tonv%2BmAT%2B2Lmu7k9UQSyHYiBfmwZAUJvG%2Bv9FvU%2F9Agubmc90Pc5WAKkIbi5uv82Pr8cXdxcv2rZzRcurrzNQmhf954UIRT93Bm90LVOghak%2BHKX0ZziHGdfR3eqCIxgQZVUmNCSVx9Gt%2Bdn5HTu6jMiKSvLSkilJGHwj6UoORUVw0QyihkVHPPT0e3lWVJmCd5ASeW2M7pNY4CbBujTaPz988etrCUTknPM8mQxISgcLyNdXeww%2F9QSSPfeTafWX9k462og3ntdu%2Bi6hW7T0sHGYIhGr9qYwrRV0%2BoQnKlteIjdEm0G6Ofe61AcWjEJ9otgbFVynH6A8K7emx5Z1kwaymRGqagzXpY001KxTHJdlsLYpqyTCLuailFVYYkrstnpsq0hji1ZxSXH6p2WO9f9nVPxtxyYdA9nb%2BDiHfjiZaijRId3w7OBVQLMvaD0H%2FeCKZXE6feAw4USQv0OTRmArg%2B1aNVII7XMYPgq48aYTBtNM6Mbogzs2AqBjkNgxWGOqtwPQdRuhs3mGQ%3D%3D'
    }
    self.session = self.create_session()
    self.categories = [
        'personal care', 'beauty', 'health'
    ]
|
| 87 |
+
# options = ChromeOptions()
|
| 88 |
+
# options.add_argument("--headless")
|
| 89 |
+
# options.add_argument("--disable-gpu")
|
| 90 |
+
# options.add_argument("--no-sandbox")
|
| 91 |
+
# self.driver = Chrome(options=options)
|
| 92 |
+
# self.generate_session()
|
| 93 |
+
|
| 94 |
+
# def generate_session(self):
|
| 95 |
+
|
| 96 |
+
# self.driver.get("https://www.walmart.com/")
|
| 97 |
+
# print(self.driver.get_cookies())
|
| 98 |
+
# cookies = self.driver.get_cookies()
|
| 99 |
+
# [self.session.cookies.set(cookie['name'], cookie['value']) for cookie in cookies]
|
| 100 |
|
| 101 |
+
def create_session(self):
    """Return a new ``httpx.Client`` whose transport is built from the proxy URL.

    The transport comes from ``SyncProxyTransport.from_url``; a fresh client
    is created on every call.
    """
    # NOTE(review): the proxy URL embeds a username and password; consider
    # moving it to configuration and rotating the credential.
    proxy_url = "http://W4a8IruR4dkhNGb6:Hesj0mkBfnJ1n95M_country-us@geo.iproyal.com:12321"
    proxy_transport = SyncProxyTransport.from_url(proxy_url)
    return httpx.Client(transport=proxy_transport)
|
| 105 |
+
|
| 106 |
def get_product_detail(self, url):
|
| 107 |
+
try:
|
| 108 |
+
response = self.session.get(
|
| 109 |
+
url, headers=self.headers, cookies=self.cookies)
|
| 110 |
+
soup = BeautifulSoup(response.text, 'html.parser')
|
| 111 |
+
script_tag = soup.find('script', {'id': '__NEXT_DATA__'}).text
|
| 112 |
+
except:
|
| 113 |
+
try:
|
| 114 |
+
self.session = self.create_session()
|
| 115 |
+
response = self.session.get(
|
| 116 |
+
url, headers=self.headers, cookies=self.cookies)
|
| 117 |
+
soup = BeautifulSoup(response.text, 'html.parser')
|
| 118 |
+
script_tag = soup.find('script', {'id': '__NEXT_DATA__'}).text
|
| 119 |
+
except:
|
| 120 |
+
return False
|
| 121 |
# convert script tag JSON to dict
|
| 122 |
script_tag_json = json.loads(script_tag)
|
| 123 |
try:
|
|
|
|
| 125 |
'data']['idml']['ingredients']['ingredients']['value']
|
| 126 |
except:
|
| 127 |
ingridents = ''
|
| 128 |
+
|
| 129 |
+
try:
|
| 130 |
+
# x.props.pageProps.initialData.data.product.name
|
| 131 |
+
product_name = script_tag_json['props']['pageProps']['initialData']['data']['product']['name']
|
| 132 |
+
# x.props.pageProps.initialData.data.product.usItemId
|
| 133 |
+
product_id = script_tag_json['props']['pageProps']['initialData']['data']['product']['usItemId']
|
| 134 |
+
except:
|
| 135 |
+
return False
|
| 136 |
return [product_name, product_id, ingridents]
|
| 137 |
|
| 138 |
def get_no_of_pages_of_sub_category(self, url):
|
|
|
|
| 152 |
|
| 153 |
def get_category_browse_urls(self, category):
|
| 154 |
url = f'https://www.walmart.com/cp/health/976760?q={category}'
|
| 155 |
+
try:
|
| 156 |
+
response = self.session.get(url, headers=self.headers, cookies=self.cookies)
|
| 157 |
+
|
| 158 |
+
print(response.text)
|
| 159 |
+
url_pattern = re.compile(
|
| 160 |
+
r'https://www.walmart.com/browse/[^/]+/[^/]+/\d+_\d+\?povid=.*')
|
| 161 |
+
except:
|
| 162 |
+
try:
|
| 163 |
+
self.session = self.create_session()
|
| 164 |
+
response = self.session.get(url, headers=self.headers, cookies=self.cookies)
|
| 165 |
+
url_pattern = re.compile(
|
| 166 |
+
r'https://www.walmart.com/browse/[^/]+/[^/]+/\d+_\d+\?povid=.*')
|
| 167 |
+
except:
|
| 168 |
+
return False
|
| 169 |
url_matches = url_pattern.findall(response.text)
|
| 170 |
+
|
| 171 |
soup = BeautifulSoup(response.text, 'html.parser')
|
| 172 |
# Find all anchor (a) tags in the HTML
|
| 173 |
all_links = soup.find_all("a")
|
|
|
|
| 176 |
for link in all_links:
|
| 177 |
href = link.get("href")
|
| 178 |
if href and url_pattern.match(href):
|
| 179 |
+
print(href)
|
| 180 |
print(href)
|
| 181 |
try:
|
| 182 |
total_pages = self.get_no_of_pages_of_sub_category(href)
|
|
|
|
| 193 |
|
| 194 |
request_url = f'{url}&page={i}'
|
| 195 |
print(request_url)
|
| 196 |
+
try:
|
| 197 |
+
response = self.session.get(
|
| 198 |
+
request_url, headers=self.headers, cookies=self.cookies)
|
| 199 |
|
| 200 |
+
soup = BeautifulSoup(response.text, 'html.parser')
|
| 201 |
+
items = soup.find_all('div', {'class': 'b--near-white'})
|
| 202 |
+
except:
|
| 203 |
+
try:
|
| 204 |
+
self.session = self.create_session()
|
| 205 |
+
response = self.session.get(
|
| 206 |
+
request_url, headers=self.headers, cookies=self.cookies)
|
| 207 |
+
|
| 208 |
+
soup = BeautifulSoup(response.text, 'html.parser')
|
| 209 |
+
items = soup.find_all('div', {'class': 'b--near-white'})
|
| 210 |
+
except:
|
| 211 |
+
return False
|
| 212 |
for item in items:
|
| 213 |
a_tag = item.find('a')
|
| 214 |
if a_tag:
|
|
|
|
| 218 |
product = self.get_product_detail(a_tag['href'])
|
| 219 |
time.sleep(2)
|
| 220 |
print(product)
|
| 221 |
+
if product:
|
| 222 |
+
self.save_product({
|
| 223 |
+
'title': product[0],
|
| 224 |
+
'product_id': product[1],
|
| 225 |
+
'ingredients': product[2],
|
| 226 |
+
'url': a_tag['href'],
|
| 227 |
+
'store_name': 'Wallmart'
|
| 228 |
+
})
|
| 229 |
# self.save_product_to_csv(product)
|
| 230 |
print(urls)
|
| 231 |
return urls
|
|
|
|
| 242 |
writer = csv.writer(file)
|
| 243 |
writer.writerow(product)
|
| 244 |
file.close()
|
| 245 |
+
|
| 246 |
+
def save_product(self, product):
    """Insert one scraped product into the ``scraper_product`` table.

    Args:
        product: mapping with the keys ``title``, ``product_id``,
            ``ingredients``, ``url`` and ``store_name``.

    Returns:
        True when the row was inserted and committed; False when any
        exception occurred (the exception is printed, not re-raised).
    """
    insert_sql = (
        "INSERT INTO scraper_product "
        "(title, product_id, ingredients, url, store_name, date_created) "
        "VALUES (%s, %s, %s, %s, %s, %s)"
    )
    conn = None
    try:
        # NOTE(review): database credentials are hard-coded here; consider
        # loading them from configuration and rotating the password.
        conn = psycopg2.connect(
            host="ep-rapid-cake-30394055.us-east-2.aws.neon.tech",
            database="ingredients-scraper",
            user="mumer113141",
            password="SFBtp4xnPeA2"
        )
        cur = conn.cursor()
        try:
            cur.execute(
                insert_sql,
                (
                    product['title'],
                    product['product_id'],
                    product['ingredients'],
                    product['url'],
                    product['store_name'],
                    datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                ),
            )
            conn.commit()
        finally:
            # Close the cursor even when execute/commit raises.
            cur.close()
        return True
    except Exception as e:
        print(e)
        return False
    finally:
        # Previously the connection was closed only on the success path, so
        # every failed insert leaked an open connection.
        if conn is not None:
            conn.close()
|
| 263 |
def run(self):
    """Announce start-up, then process every entry of ``self.categories`` in order.

    Each entry is handed to ``self.get_category_browse_urls``; the return
    values of those calls are ignored.
    """
    print('Wallmart scraper started')
    for search_term in self.categories:
        self.get_category_browse_urls(search_term)
|
| 267 |
|