mumer119131 committed on
Commit
369e8bf
·
1 Parent(s): 643dbca

Update scraper/utils/WallmartScraper.py

Browse files
Files changed (1) hide show
  1. scraper/utils/WallmartScraper.py +172 -42
scraper/utils/WallmartScraper.py CHANGED
@@ -4,35 +4,120 @@ import re
4
  import csv
5
  import json
6
  import time
7
- from .DatabaseDataSaver import save_product
8
  from undetected_chromedriver import Chrome, ChromeOptions
 
 
 
 
 
9
  class WallmartScraper:
10
  def __init__(self):
11
  self.ac = "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"
12
- self.headers = {"Referer":"https://www.google.com","Connection":"Keep-Alive","Accept-Language":"en-US,en;q=0.9","Accept-Encoding":"gzip, deflate, br","Accept":self.ac,"User-Agent":"Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0"}
13
- self.session = requests.Session()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  self.categories = [
15
- 'health', 'beauty', 'personal care'
16
  ]
17
- self.generate_session()
18
-
19
- def generate_session(self):
20
- options = ChromeOptions()
21
- options.add_argument("--headless")
22
- options.add_argument("--disable-gpu")
23
- options.add_argument("--no-sandbox")
24
-
25
- driver = Chrome(options=options)
26
- driver.get("https://www.walmart.com/")
27
- print(driver.get_cookies())
28
- cookies = driver.get_cookies()
29
- [self.session.cookies.set(cookie['name'], cookie['value']) for cookie in cookies]
30
 
 
 
 
 
 
31
  def get_product_detail(self, url):
32
- response = self.session.get(
33
- url, headers=self.headers)
34
- soup = BeautifulSoup(response.text, 'html.parser')
35
- script_tag = soup.find('script', {'id': '__NEXT_DATA__'}).text
 
 
 
 
 
 
 
 
 
 
36
  # convert scraipt tag json to dict
37
  script_tag_json = json.loads(script_tag)
38
  try:
@@ -40,10 +125,14 @@ class WallmartScraper:
40
  'data']['idml']['ingredients']['ingredients']['value']
41
  except:
42
  ingridents = ''
43
- # x.props.pageProps.initialData.data.product.name
44
- product_name = script_tag_json['props']['pageProps']['initialData']['data']['product']['name']
45
- # x.props.pageProps.initialData.data.product.usItemId
46
- product_id = script_tag_json['props']['pageProps']['initialData']['data']['product']['usItemId']
 
 
 
 
47
  return [product_name, product_id, ingridents]
48
 
49
  def get_no_of_pages_of_sub_category(self, url):
@@ -63,12 +152,22 @@ class WallmartScraper:
63
 
64
  def get_category_browse_urls(self, category):
65
  url = f'https://www.walmart.com/cp/health/976760?q={category}'
66
- response = self.session.get(
67
- url, headers=self.headers)
68
-
69
- url_pattern = re.compile(
70
- r'https://www.walmart.com/browse/[^/]+/[^/]+/\d+_\d+\?povid=.*')
 
 
 
 
 
 
 
 
 
71
  url_matches = url_pattern.findall(response.text)
 
72
  soup = BeautifulSoup(response.text, 'html.parser')
73
  # Find all anchor (a) tags in the HTML
74
  all_links = soup.find_all("a")
@@ -77,6 +176,7 @@ class WallmartScraper:
77
  for link in all_links:
78
  href = link.get("href")
79
  if href and url_pattern.match(href):
 
80
  print(href)
81
  try:
82
  total_pages = self.get_no_of_pages_of_sub_category(href)
@@ -93,11 +193,22 @@ class WallmartScraper:
93
 
94
  request_url = f'{url}&page={i}'
95
  print(request_url)
96
- response = self.session.get(
97
- request_url, headers=self.headers)
 
98
 
99
- soup = BeautifulSoup(response.text, 'html.parser')
100
- items = soup.find_all('div', {'class': 'b--near-white'})
 
 
 
 
 
 
 
 
 
 
101
  for item in items:
102
  a_tag = item.find('a')
103
  if a_tag:
@@ -107,13 +218,14 @@ class WallmartScraper:
107
  product = self.get_product_detail(a_tag['href'])
108
  time.sleep(2)
109
  print(product)
110
- save_product({
111
- 'title': product[0],
112
- 'product_id': product[1],
113
- 'ingredients': product[2],
114
- 'url': a_tag['href'],
115
- 'store_name': 'Wallmart'
116
- })
 
117
  # self.save_product_to_csv(product)
118
  print(urls)
119
  return urls
@@ -130,8 +242,26 @@ class WallmartScraper:
130
  writer = csv.writer(file)
131
  writer.writerow(product)
132
  file.close()
133
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
  def run(self):
 
135
  for category in self.categories:
136
  self.get_category_browse_urls(category)
137
 
 
4
  import csv
5
  import json
6
  import time
 
7
  from undetected_chromedriver import Chrome, ChromeOptions
8
+ import psycopg2
9
+ import datetime
10
+ from httpx_socks import SyncProxyTransport
11
+ import httpx
12
+
13
  class WallmartScraper:
14
def __init__(self):
    """Set up the scraper: browser-like headers, a captured Walmart cookie
    jar, a proxied HTTP session, and the category list to crawl.

    SECURITY(review): self.cookies embeds live session secrets
    (WM_SEC.AUTH_TOKEN, csrf token, cart id) copied from a real browser
    session. They should be rotated and loaded from configuration, not
    committed to source control.
    """
    self.ac = "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9"
    self.headers = {
        # NOTE(review): Host says walmart.ca but requests target
        # walmart.com — confirm this override is intentional.
        'Host': 'www.walmart.ca',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0',
        'Accept': '*/*',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate, br',
        'Content-Type': 'application/json',
        'Connection': 'keep-alive'
    }
    # Cookie jar captured from a real browser session. Duplicate
    # 'rxVisitor' and 'dtSa' entries (identical values) removed — in a
    # dict literal the later key silently overwrote the earlier one.
    self.cookies = {
        'walmart.shippingPostalCode': 'P7B3Z7',
        'defaultNearestStoreId': '3124',
        'zone': "9",
        'deliveryCatchment': "3124",
        'walmart.csrf': '73996cac34766ec995777784',
        'wmt.c': "0",
        'vtc': 'ZAUFmHNTbFPrWyLrN8WTXA',
        'userSegment': '50-percent',
        'TBV': "7",
        'rxVisitor': '1590552903550G5KJVCBIUCN3R32E3OSSVIKN9FTMDI5M',
        'dtSa': '-',
        '_ga': 'GA1.2.1363574403.1590552905',
        '_gid': 'GA1.2.85728116.1590552905',
        'walmart.id': '24be2423-225b-44d0-851c-9f83c8e47dff',
        'usrState': "1",
        'walmart.nearestPostalCode': 'P7B3Z7',
        's_ecid': 'MCMID%7C17236695788713957075642593017320325404',
        'walmart.locale': 'en',
        'AMCVS_C4C6370453309C960A490D44%40AdobeOrg': "1",
        's_visit': "1",
        's_cc': 'true',
        'og_session_id': 'af0a84f8847311e3b233bc764e1107f2.616221.1590552906',
        'og_session_id_conf': 'af0a84f8847311e3b233bc764e1107f2.616221.1590552906',
        '_gcl_au': '1.1.482108716.1590552907',
        '_fbp': 'fb.1.1590552907225.702607671',
        'og_autoship': "0",
        'dtCookie': '3$1GS1LRIIKIBM595EBN2HIHIPCU4QVQ3H|5b6d58542e634882|0',
        'walmart.nearestLatLng': "48.4120872,-89.2413988",
        'dtLatC': "3",
        'DYN_USER_ID': '23c3e447-cab5-4a76-beec-86d431f09b30',
        'WM_SEC.AUTH_TOKEN': 'MTAyOTYyMDE46M9ya4OWOAX9Ycj9G+/EtZZ2rrXYDwJUPMuf8aNPxGq6es3kBtQx/WxiXKAkaKfkoKbMqixeQFrYdB1W0oSN1wIIzkNIxIEmVq7cOUtRuTRSgSwdxAsAWBT8plmFWLKwj8OFN4dileb20bpDLeCIlSFd/Hsc7bnSe4+TLU2zbj06SQbscc1R1tIesXl4ioL4y1NvN1BBj6GkfAZCjCfhDTASAGkrw9upmzYhCz4UwRzb/SoGFgAYL9DGZ8K45WCXb/Ew67/GsLtdlJHpe1JgEG+jVJ7bQ3VTYSMGmHEYCS8c8IAFKTMeYOPXxSWUpSrKtEbQ9hG+J0B2+kHzA8jyKD+vhACQYbIqsOCISVNY3spUIeGCIOmGJLznpUXbYF3gVk3LktwueMY7RuHPZ68PyA==',
        'LT': '1590553091850',
        'BVImplmain_site': "2036",
        'BVBRANDID': '20ae010b-0053-4a9f-902a-9197d72dc542',
        'DYN_USER_ID.ro': '23c3e447-cab5-4a76-beec-86d431f09b30',
        'cartId': 'b6eb398f-ed49-46e8-8034-af8da418dd90',
        'NEXT_GEN.ENABLED': "1",
        '_pin_unauth': 'NTY4YjUyZDctYzNmOC00NzA5LWExOTYtOWQxOWZlOWVkYjFi',
        'TS011fb5f6': '01c5a4e2f941ffc623122b68eca74f3a27e0c416f7e2a5707b9417a73c048cb4be6507e9fd51df79c8015b3ba420dc6643bb0f8309',
        'TS0175e29f': '01c5a4e2f941ffc623122b68eca74f3a27e0c416f7e2a5707b9417a73c048cb4be6507e9fd51df79c8015b3ba420dc6643bb0f8309',
        'authDuration': '{"lat":"1590555466230000","lt":"1590555466230000"}',
        'headerType': 'grocery',
        's_sq': '%5B%5BB%5D%5D',
        'previousBreakpoint': 'desktop',
        'wmt.breakpoint': 'd',
        'akaau_P1': '1590607795~id=484ae7f711ac9dd38dbda655bd6ca764',
        'TS01f4281b': '01c5a4e2f97a7d51551a734ebe2cb1fc4f7a86c4df28824fb5812f83c96f6df870698b389077cf6f5fd822d05324df82b802c7ad04',
        '_uetsid': '2127b16a-c523-20a6-d801-43923775d65e',
        '_derived_epik': 'dj0yJnU9NC1yUFlPMF9IczhrTlFabmZpYWVTQ0NMZFl5blN2eEMmbj1wX2o0OFVpeUZLWjRUcGM3Rl9xaGFnJm09MSZ0PUFBQUFBRjdPdXpJJnJtPTEmcnQ9QUFBQUFGN091ekk',
        'dtPC': '3$6637950_447h-vAKCBSUJVQJIVFIAUKQCIVTJULXFWHTFQ-0',
        'rxvt': '1590610238206|1590608438206',
        's_gnr': '1590609571427-Repeat',
        'AMCV_C4C6370453309C960A490D44%40AdobeOrg': '-408604571%7CMCIDTS%7C18410%7CMCMID%7C17236695788713957075642593017320325404%7CMCAID%7CNONE%7CMCOPTOUT-1590616771s%7CNONE%7CvVersion%7C4.6.0',
        '_4c_': 'rVJNbxoxEP0rkQ85sbv%2BXHuRooqkUdWqSZQmVY%2FIeL1gZWGRbdimEf89YyCQNqnUQzmYnfF7M5437wn1M7tAQyIqXOJKKEIIH6AH%2BxjQ8AmZZTrX6Vj5Fg3RLMZlGBZF3%2Fd5r9u59jE3urCLYuo7Y%2F1j0fiViyFb26mNetLasM8U1xlTgqIBMl1toRSpcpULiOMviDKGMXwvfVevTBzHx2XC9HZyEuoHuKjt2hk77l0dZ1syxcfszLrpLKY0Vtv00kOAc5nK925Rd%2F2BSUWJj9kjk%2FH0tonv%2BmAT%2B2Lmu7k9UQSyHYiBfmwZAUJvG%2Bv9FvU%2F9Agubmc90Pc5WAKkIbi5uv82Pr8cXdxcv2rZzRcurrzNQmhf954UIRT93Bm90LVOghak%2BHKX0ZziHGdfR3eqCIxgQZVUmNCSVx9Gt%2Bdn5HTu6jMiKSvLSkilJGHwj6UoORUVw0QyihkVHPPT0e3lWVJmCd5ASeW2M7pNY4CbBujTaPz988etrCUTknPM8mQxISgcLyNdXeww%2F9QSSPfeTafWX9k462og3ntdu%2Bi6hW7T0sHGYIhGr9qYwrRV0%2BoQnKlteIjdEm0G6Ofe61AcWjEJ9otgbFVynH6A8K7emx5Z1kwaymRGqagzXpY001KxTHJdlsLYpqyTCLuailFVYYkrstnpsq0hji1ZxSXH6p2WO9f9nVPxtxyYdA9nb%2BDiHfjiZaijRId3w7OBVQLMvaD0H%2FeCKZXE6feAw4USQv0OTRmArg%2B1aNVII7XMYPgq48aYTBtNM6Mbogzs2AqBjkNgxWGOqtwPQdRuhs3mGQ%3D%3D'
    }
    self.session = self.create_session()
    self.categories = [
        'personal care', 'beauty', 'health'
    ]
    # Dead commented-out undetected-chromedriver session bootstrap removed;
    # cookie capture is superseded by the static cookie jar above.
 
101
def create_session(self):
    """Build an httpx client whose traffic is tunnelled through the
    residential proxy, so repeated scraping requests rotate exit IPs.

    Returns:
        httpx.Client: a synchronous client using the SOCKS/HTTP proxy
        transport.

    SECURITY(review): the proxy URL with embedded credentials was
    hard-coded here. It now falls back to that value for backward
    compatibility, but it should be supplied via the WALMART_PROXY_URL
    environment variable and the leaked credential rotated.
    """
    import os
    proxy_url = os.environ.get(
        "WALMART_PROXY_URL",
        "http://W4a8IruR4dkhNGb6:Hesj0mkBfnJ1n95M_country-us@geo.iproyal.com:12321",
    )
    transport = SyncProxyTransport.from_url(proxy_url)
    return httpx.Client(transport=transport)
105
+
106
def get_product_detail(self, url):
    """Fetch a Walmart product page and extract its core fields.

    Args:
        url: absolute product-page URL.

    Returns:
        [product_name, product_id, ingredients] on success, or False when
        the page cannot be fetched/parsed (e.g. a bot-block page with no
        __NEXT_DATA__ script tag) or the product payload is absent.
    """
    script_tag = None
    # Walmart intermittently serves a block page; retry exactly once with
    # a freshly created proxied session (new exit IP). This replaces the
    # previous copy-pasted try/except-inside-except duplication.
    for attempt in range(2):
        try:
            response = self.session.get(
                url, headers=self.headers, cookies=self.cookies)
            soup = BeautifulSoup(response.text, 'html.parser')
            # .find() returns None on a block page → AttributeError → retry.
            script_tag = soup.find('script', {'id': '__NEXT_DATA__'}).text
            break
        except Exception:
            if attempt == 0:
                self.session = self.create_session()
    if script_tag is None:
        return False
    # The __NEXT_DATA__ tag holds the server-rendered page state as JSON.
    script_tag_json = json.loads(script_tag)
    data = script_tag_json['props']['pageProps']['initialData']['data']
    try:
        # Ingredients are optional — many products simply don't list them.
        ingridents = data['idml']['ingredients']['ingredients']['value']
    except (KeyError, TypeError):
        ingridents = ''
    try:
        # x.props.pageProps.initialData.data.product.{name,usItemId}
        product_name = data['product']['name']
        product_id = data['product']['usItemId']
    except (KeyError, TypeError):
        # Without a product payload there is nothing to save.
        return False
    return [product_name, product_id, ingridents]
137
 
138
  def get_no_of_pages_of_sub_category(self, url):
 
152
 
153
  def get_category_browse_urls(self, category):
154
  url = f'https://www.walmart.com/cp/health/976760?q={category}'
155
+ try:
156
+ response = self.session.get(url, headers=self.headers, cookies=self.cookies)
157
+
158
+ print(response.text)
159
+ url_pattern = re.compile(
160
+ r'https://www.walmart.com/browse/[^/]+/[^/]+/\d+_\d+\?povid=.*')
161
+ except:
162
+ try:
163
+ self.session = self.create_session()
164
+ response = self.session.get(url, headers=self.headers, cookies=self.cookies)
165
+ url_pattern = re.compile(
166
+ r'https://www.walmart.com/browse/[^/]+/[^/]+/\d+_\d+\?povid=.*')
167
+ except:
168
+ return False
169
  url_matches = url_pattern.findall(response.text)
170
+
171
  soup = BeautifulSoup(response.text, 'html.parser')
172
  # Find all anchor (a) tags in the HTML
173
  all_links = soup.find_all("a")
 
176
  for link in all_links:
177
  href = link.get("href")
178
  if href and url_pattern.match(href):
179
+ print(href)
180
  print(href)
181
  try:
182
  total_pages = self.get_no_of_pages_of_sub_category(href)
 
193
 
194
  request_url = f'{url}&page={i}'
195
  print(request_url)
196
+ try:
197
+ response = self.session.get(
198
+ request_url, headers=self.headers, cookies=self.cookies)
199
 
200
+ soup = BeautifulSoup(response.text, 'html.parser')
201
+ items = soup.find_all('div', {'class': 'b--near-white'})
202
+ except:
203
+ try:
204
+ self.session = self.create_session()
205
+ response = self.session.get(
206
+ request_url, headers=self.headers, cookies=self.cookies)
207
+
208
+ soup = BeautifulSoup(response.text, 'html.parser')
209
+ items = soup.find_all('div', {'class': 'b--near-white'})
210
+ except:
211
+ return False
212
  for item in items:
213
  a_tag = item.find('a')
214
  if a_tag:
 
218
  product = self.get_product_detail(a_tag['href'])
219
  time.sleep(2)
220
  print(product)
221
+ if product:
222
+ self.save_product({
223
+ 'title': product[0],
224
+ 'product_id': product[1],
225
+ 'ingredients': product[2],
226
+ 'url': a_tag['href'],
227
+ 'store_name': 'Wallmart'
228
+ })
229
  # self.save_product_to_csv(product)
230
  print(urls)
231
  return urls
 
242
  writer = csv.writer(file)
243
  writer.writerow(product)
244
  file.close()
245
+
246
def save_product(self, product):
    """Insert one scraped product row into the scraper_product table.

    Args:
        product: dict with keys 'title', 'product_id', 'ingredients',
            'url', 'store_name'.

    Returns:
        True on successful insert, False on any database error (the
        caller treats persistence as best-effort and only checks
        truthiness, so errors are printed, not raised).

    SECURITY(review): DB host/user/password were hard-coded. They now
    fall back to those values for backward compatibility but should be
    supplied via DB_HOST/DB_NAME/DB_USER/DB_PASSWORD environment
    variables, and the leaked password rotated.
    """
    import os
    import datetime
    try:
        conn = psycopg2.connect(
            host=os.environ.get(
                "DB_HOST", "ep-rapid-cake-30394055.us-east-2.aws.neon.tech"),
            database=os.environ.get("DB_NAME", "ingredients-scraper"),
            user=os.environ.get("DB_USER", "mumer113141"),
            password=os.environ.get("DB_PASSWORD", "SFBtp4xnPeA2"),
        )
        try:
            # `with conn` commits on success / rolls back on error;
            # the cursor context manager closes the cursor. The original
            # leaked both when execute() raised.
            with conn:
                with conn.cursor() as cur:
                    cur.execute(
                        "INSERT INTO scraper_product (title, product_id, ingredients, url, store_name, date_created) VALUES (%s, %s, %s, %s, %s, %s)",
                        (
                            product['title'],
                            product['product_id'],
                            product['ingredients'],
                            product['url'],
                            product['store_name'],
                            datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                        ),
                    )
        finally:
            conn.close()
        return True
    except Exception as e:
        print(e)
        return False
263
def run(self):
    """Entry point: announce start-up, then crawl every configured category."""
    print('Wallmart scraper started')
    for category_name in list(self.categories):
        self.get_category_browse_urls(category_name)
267