mumer119131 committed on
Commit
a6b5498
·
1 Parent(s): cf56e8e

Update scraper/utils/HebScraper.py

Browse files
Files changed (1) hide show
  1. scraper/utils/HebScraper.py +147 -10
scraper/utils/HebScraper.py CHANGED
@@ -15,9 +15,144 @@ class HebScraper:
15
  self.headers = {
16
  'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) ',
17
  }
 
18
  self.categories = [
19
  'health', 'beauty', 'personal care'
20
  ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  self.session = requests.Session()
22
  self.generate_session()
23
 
@@ -72,8 +207,6 @@ class HebScraper:
72
 
73
  def search_category_pages(self, category):
74
  url = f"https://www.heb.com:443/search/?q={category}"
75
- cookies = {"HEB_AMP_DEVICE_ID": "h-9553dc8b-559c-4b78-ac9e-29473ea5147b", "USER_SELECT_STORE": "false", "CURR_SESSION_STORE": "92", "visid_incap_2302070": "vJO0L0njTA29eovhNqUWS1G6KmUAAAAAQUIPAAAAAACceG/mmByIIKugw3IJIqb7", "AMP_MKTG_760524e2ba": "JTdCJTdE", "_gcl_au": "1.1.317767528.1697299041", "_cs_mk": "0.2353851536637448_1697299041315", "AMP_TOKEN": "%24NOT_FOUND", "_gid": "GA1.2.534463307.1697299046", "__pdst": "81f3b640db914df78cc13cef62fcce3d", "_scid": "f840b179-6a37-41f0-98a9-74e3442ab044", "_fbp": "fb.1.1697299048562.502869888", "_pin_unauth": "dWlkPVl6azJNekF4TURJdFlXTXhZUzAwTmpaaUxXRmpNRFV0TURFeFpETm1aRGRrT1dFeQ", "_sctr": "1%7C1697223600000", "incap_ses_7245_2302070": "SpfVMbfbJlWH8UP9qGmLZNO/KmUAAAAAQ6DdBaFAErdn71N1kOCBVQ==", "DYN_USER_ID": "16893314532", "DYN_USER_CONFIRM": "837ca6cfe35db1d6751dc9a494700b94", "sessionContext": "curbside", "JSESSIONID": "db-spVppjEPfOL2HNHGA9xXNNPz5yFqJofTA1P5P", "_uetsid": "5e6a5f306aaa11eeb682912c91164f65", "_uetvid": "5e6a66c06aaa11eebfd5cfd70a56b55d", "_scid_r": "f840b179-6a37-41f0-98a9-74e3442ab044", "_ga": "GA1.2.1176000733.1697299042", "_dc_gtm_UA-26725300-5": "1", "AWSALB": "b1DIZXjB38LKdvzXrptoESB/gX95NzqAqHP+iHYgV2VRlMx39vMmZbjj1+3nehO6PJ261/X1VhyYY4fX/h6X0XCWnwahqqMS4ouwhzHiyT2gGlvNSaLlHr647Jgb", "_ga_WKSH6HYPT4": "GS1.1.1697299041.1.1.1697300936.0.0.0", "AMP_760524e2ba": "JTdCJTIyb3B0T3V0JTIyJTNBZmFsc2UlMkMlMjJkZXZpY2VJZCUyMiUzQSUyMmgtOTU1M2RjOGItNTU5Yy00Yjc4LWFjOWUtMjk0NzNlYTUxNDdiJTIyJTJDJTIybGFzdEV2ZW50VGltZSUyMiUzQTE2OTczMDA5Mzc0NTAlMkMlMjJzZXNzaW9uSWQlMjIlM0ExNjk3Mjk5MDQwMzIyJTdE", "reese84": 
"3:MI9duruS5ddU/GvGLWwcRw==:wdq4U02M5AC5G6t9+QdHIEkgE3bYtn0KTF3BGxo2hot1ZthsfB5dYr+TXfLKq0HRxdj81COMZDeRwmfR/7tKp1BuMeZTbkmq/JqL/+V6lnu5QT5hO/9cqFny1JybiH68Siy0mk8dVFcyLQmiZfAY11gQmhlwGPo7CjnDKCirS0D6MifVE9HoZrSkjkm16VDGRDo1LyHQ1CgFQjeipvK5cuN63tK3E/r6y9GT/hDCp6hq0z5JJqZZ7UTdIvF5NmVdu6EBKpq6x2PJW440TCEDhmDN0JSrL4GRopfOu4Xb4ClPzeokv10RfZ+ZBj/SREDVNiwP1EEn2AJM9WPZE+l8eFWhDE55aBPPFJm3wsg6ggoAMb7JNc3rlSgH8ACVvtfU4u2sSkAYA+Pi4eFx6n8J3xJYJ2jU1AAMFsWyZAEOqTil0wqJMiBlOdrfskgEWNSIL7LjN9yhk4Otmwnhl3hoaHjPDn397nsgEbElrjmKIiXqSo/kuNG5A5AOKRsbnys/:0hMb0LNmXdkecUxwJtVPLkqqBNFhFcidBrSnXtWVWHw="}
76
- headers = {"Pragma": "no-cache", "Cache-Control": "no-cache", "Sec-Ch-Ua": "\"Chromium\";v=\"117\", \"Not;A=Brand\";v=\"8\"", "Sec-Ch-Ua-Mobile": "?0", "Sec-Ch-Ua-Platform": "\"Windows\"", "Upgrade-Insecure-Requests": "1", "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.5938.132 Safari/537.36", "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7", "Sec-Fetch-Site": "same-origin", "Sec-Fetch-Mode": "navigate", "Sec-Fetch-User": "?1", "Sec-Fetch-Dest": "document", "Accept-Encoding": "gzip, deflate, br", "Accept-Language": "en-US,en;q=0.9"}
77
  response = self.session.get(url)
78
  soup = BeautifulSoup(response.text, 'html.parser')
79
  total_pages = soup.find_all('a', {'data-qe-id': 'paginationListNum'})[-1].text
@@ -83,14 +216,18 @@ class HebScraper:
83
  def get_urls_of_category_from_page(self, category, pages):
84
  all_urls = []
85
  for page in range(1, pages + 1):
86
- burp0_cookies = {"HEB_AMP_DEVICE_ID": "h-9553dc8b-559c-4b78-ac9e-29473ea5147b", "USER_SELECT_STORE": "false", "CURR_SESSION_STORE": "92", "visid_incap_2302070": "vJO0L0njTA29eovhNqUWS1G6KmUAAAAAQUIPAAAAAACceG/mmByIIKugw3IJIqb7", "AMP_MKTG_760524e2ba": "JTdCJTdE", "_gcl_au": "1.1.317767528.1697299041", "_cs_mk": "0.2353851536637448_1697299041315", "AMP_TOKEN": "%24NOT_FOUND", "_gid": "GA1.2.534463307.1697299046", "__pdst": "81f3b640db914df78cc13cef62fcce3d", "_scid": "f840b179-6a37-41f0-98a9-74e3442ab044", "_fbp": "fb.1.1697299048562.502869888", "_pin_unauth": "dWlkPVl6azJNekF4TURJdFlXTXhZUzAwTmpaaUxXRmpNRFV0TURFeFpETm1aRGRrT1dFeQ", "_sctr": "1%7C1697223600000", "incap_ses_7245_2302070": "SpfVMbfbJlWH8UP9qGmLZNO/KmUAAAAAQ6DdBaFAErdn71N1kOCBVQ==", "DYN_USER_ID": "16893314532", "DYN_USER_CONFIRM": "837ca6cfe35db1d6751dc9a494700b94", "sessionContext": "curbside", "JSESSIONID": "db-spVppjEPfOL2HNHGA9xXNNPz5yFqJofTA1P5P", "_uetsid": "5e6a5f306aaa11eeb682912c91164f65", "_uetvid": "5e6a66c06aaa11eebfd5cfd70a56b55d", "_scid_r": "f840b179-6a37-41f0-98a9-74e3442ab044", "_ga": "GA1.2.1176000733.1697299042", "_dc_gtm_UA-26725300-5": "1", "AWSALB": "b1DIZXjB38LKdvzXrptoESB/gX95NzqAqHP+iHYgV2VRlMx39vMmZbjj1+3nehO6PJ261/X1VhyYY4fX/h6X0XCWnwahqqMS4ouwhzHiyT2gGlvNSaLlHr647Jgb", "_ga_WKSH6HYPT4": "GS1.1.1697299041.1.1.1697300936.0.0.0", "AMP_760524e2ba": "JTdCJTIyb3B0T3V0JTIyJTNBZmFsc2UlMkMlMjJkZXZpY2VJZCUyMiUzQSUyMmgtOTU1M2RjOGItNTU5Yy00Yjc4LWFjOWUtMjk0NzNlYTUxNDdiJTIyJTJDJTIybGFzdEV2ZW50VGltZSUyMiUzQTE2OTczMDA5Mzc0NTAlMkMlMjJzZXNzaW9uSWQlMjIlM0ExNjk3Mjk5MDQwMzIyJTdE", "reese84": 
"3:MI9duruS5ddU/GvGLWwcRw==:wdq4U02M5AC5G6t9+QdHIEkgE3bYtn0KTF3BGxo2hot1ZthsfB5dYr+TXfLKq0HRxdj81COMZDeRwmfR/7tKp1BuMeZTbkmq/JqL/+V6lnu5QT5hO/9cqFny1JybiH68Siy0mk8dVFcyLQmiZfAY11gQmhlwGPo7CjnDKCirS0D6MifVE9HoZrSkjkm16VDGRDo1LyHQ1CgFQjeipvK5cuN63tK3E/r6y9GT/hDCp6hq0z5JJqZZ7UTdIvF5NmVdu6EBKpq6x2PJW440TCEDhmDN0JSrL4GRopfOu4Xb4ClPzeokv10RfZ+ZBj/SREDVNiwP1EEn2AJM9WPZE+l8eFWhDE55aBPPFJm3wsg6ggoAMb7JNc3rlSgH8ACVvtfU4u2sSkAYA+Pi4eFx6n8J3xJYJ2jU1AAMFsWyZAEOqTil0wqJMiBlOdrfskgEWNSIL7LjN9yhk4Otmwnhl3hoaHjPDn397nsgEbElrjmKIiXqSo/kuNG5A5AOKRsbnys/:0hMb0LNmXdkecUxwJtVPLkqqBNFhFcidBrSnXtWVWHw="}
87
- burp0_headers = {"Pragma": "no-cache", "Cache-Control": "no-cache", "Sec-Ch-Ua": "\"Chromium\";v=\"117\", \"Not;A=Brand\";v=\"8\"", "Sec-Ch-Ua-Mobile": "?0", "Sec-Ch-Ua-Platform": "\"Windows\"", "Upgrade-Insecure-Requests": "1", "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.5938.132 Safari/537.36", "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7", "Sec-Fetch-Site": "same-origin", "Sec-Fetch-Mode": "navigate", "Sec-Fetch-User": "?1", "Sec-Fetch-Dest": "document", "Accept-Encoding": "gzip, deflate, br", "Accept-Language": "en-US,en;q=0.9"}
88
- burp0_url = f"https://www.heb.com:443/search/?q={category}&pageNumber={page}"
89
- response = self.session.get(burp0_url)
90
-
91
- url_pattern = r'href="(/product-detail/[^"]+)"'
92
- matches = re.findall(url_pattern, response.text)
93
- urls = [f'https://www.heb.com{match}' for match in matches]
 
 
 
 
94
  print(urls)
95
  self.get_all_products_from_category_page(urls)
96
 
 
15
  self.headers = {
16
  'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) ',
17
  }
18
+ self.base_url = 'https://www.heb.com'
19
  self.categories = [
20
  'health', 'beauty', 'personal care'
21
  ]
22
+ self.query = {
23
+ "operationName": "InitialSearchProductsV2",
24
+ "variables": {
25
+ "params": {
26
+ "addressAllowAlcohol": False,
27
+ "doNotSuggestPhrase": False,
28
+ "ignoreRules": False,
29
+ "ignoreSynonyms": False,
30
+ "includeFullCategoryHierarchy": False,
31
+ "pageNumber": 0,
32
+ "pageSize": 60,
33
+ "query": "health",
34
+ "rootRequestId": None,
35
+ "segmentIds": [
36
+ "a5da00d0-7087-4655-93e0-b93ec0fc4757",
37
+ "adb33a3c-512e-4d76-8c24-a76d0efb8656",
38
+ "251287aa-b1a1-4bc4-8652-3c4e74a5b756",
39
+ "0df3ce9c-8f14-4d80-bf4c-3b8b66feab37",
40
+ "29e75183-0916-4337-a605-18c34add93d9",
41
+ "81484884-8948-41aa-a6f6-fed59467ceb9",
42
+ "0a8a667b-13e3-444e-999b-02fcd87026aa",
43
+ "8a7194d0-4643-41e3-8775-f56df17a0cb2",
44
+ "8809067c-8fe2-4151-b26e-e67edf814a57",
45
+ "0ad399c5-4ed6-4f7c-ba85-1c0ccb5f1b8c",
46
+ "8bc9ba87-94b1-45de-8737-8a7ed18e94ca",
47
+ "2b44887a-8e32-40c1-aced-3c7fec8790da",
48
+ "c0c96fe5-b029-48cf-beec-42720c4ac40b",
49
+ "7ca58353-5733-485f-aa37-22f4028e2e2a",
50
+ "68c1099e-bb09-4ac4-831f-d2b53948abd2",
51
+ "502dbe33-362e-4caf-a30e-eed0c5db0d15",
52
+ "61e62fb2-ec7b-431d-a6b0-e2e9e4276fbf",
53
+ "c662ef55-68e2-4255-a077-3077fcc52376",
54
+ "211d5eb4-17de-44da-9d6a-055120c8d9d5",
55
+ "354a6a8a-4034-4a8d-a50f-2c9d2bd7f564",
56
+ "54644db9-3e20-458a-b785-a3fb819bf701",
57
+ "e579e6cd-27b0-4b8d-995b-a5a7b5ef59ad",
58
+ "d818c6a2-7494-4a09-9409-4cef916c8303",
59
+ "7b6ecdf6-0461-407d-9750-e2035cd50834",
60
+ "f88be4a7-fb46-41ec-a39e-6371585a3701",
61
+ "2b62d388-480e-4bda-8ae3-c2db70aaa731",
62
+ "77a893e5-086f-4bdf-93dd-b0dda5accbb4",
63
+ "d8b0cd11-4230-4cab-8696-d55630f034df",
64
+ "caf33fa8-9d41-431a-a64b-2fb3499c48e4",
65
+ "3158e985-edf6-4a9c-9d01-8bade1cffd04",
66
+ "14bba8bc-b4c1-48b6-af10-8acca2db82ce",
67
+ "f9ea635d-4081-4a7f-821d-af8eda75f559",
68
+ "0a570dbd-f905-4261-946f-a1e6b3e9a387",
69
+ "4ac8cc6c-a11b-4803-aee0-47bdc1dc0834",
70
+ "61fb168b-2516-4bb9-97c4-804e8869eb8e",
71
+ "37b60cc5-9238-4653-a08f-fb617a878ef7",
72
+ "c2c1676f-a5fd-4c93-a900-76506a656b4c",
73
+ "3b8c0dbe-fadb-41e9-ad6a-40c7c5772d60",
74
+ "410cfec0-e434-4b0b-9cc3-e5cbcb09a0fe",
75
+ "881bf27d-a875-4b05-8732-87be803eeaa5",
76
+ "fc0bf854-631d-4322-bc2c-809992801e14",
77
+ "e3853f48-c3ec-41d1-a8c9-32eb188cf9ce",
78
+ "a968933f-9e39-4321-a6a1-b79caf397736",
79
+ "a18f9694-f14d-41d7-9da7-68934bb3d229",
80
+ "58ea83c3-00fb-49d0-8710-dddd29e15088",
81
+ "6d47d454-edc8-44e2-99e6-d4c65e0871bc",
82
+ "37930f56-3086-44b6-bc44-795e0c78e390",
83
+ "53ccb672-9c35-4516-89ca-d48414818d40",
84
+ "c1b959a6-b285-4825-8e26-387b871e89d9",
85
+ "07e10eba-f057-4789-8949-bb5ffa800d51",
86
+ "77e2aeed-e5e7-4e9d-bd81-cf456d24158c",
87
+ "71b64f9a-4a01-4bdf-81fd-757096f0e7ce",
88
+ "af84966d-abec-4a11-94bd-632a651d1d51",
89
+ "2ba4117a-24ee-458e-84a8-3063d5b5c2c2",
90
+ "2f2c33b7-501d-41bb-9401-89019a13fd38",
91
+ "eedbf364-1ff9-463a-bc1d-7ac0ae015f94",
92
+ "ec8573bb-e6ad-411f-a33a-14addf2d2aa5",
93
+ "e904a3e6-5273-4fcd-809d-dc5b1bf9b2e0",
94
+ "3f8c11f4-3e57-4467-8d2c-50474126200b",
95
+ "7450aa0c-685f-4433-9245-a8bf1c7d40b3",
96
+ "b725ccfa-f350-4d50-afad-dc9a18d68d78",
97
+ "a4578507-90b2-41c8-9918-ecf45e61c540",
98
+ "8449656d-9093-4cd4-8f1e-a9ce9fcafedf",
99
+ "96f95945-a009-495d-a81c-885912998854",
100
+ "d2ac6fb7-5ee1-4174-98c6-9e6bb79081dd",
101
+ "87f7f262-c304-45fb-965a-4c6cef6b2e27",
102
+ "4adf74f0-2c80-4cc7-b772-dc3d49e2632c",
103
+ "24bc8ce6-ef5c-47bf-b03c-03d28a9aa44d",
104
+ "8a98f32b-83cb-4af8-b513-dd34bdb63807",
105
+ "83c87cc0-e31e-4810-a83e-6e57b006c02a",
106
+ "cc63cdb9-1f6b-4511-bc43-b598e6b13787",
107
+ "638d383a-bce4-4fb9-8b7b-bff85e87a364",
108
+ "aee21288-8f82-410d-b28e-0ff5d9b7f5d7",
109
+ "01255be4-73c6-41b1-81c4-64b0de2852cf",
110
+ "5f0cbece-ad6e-4cf8-b2d9-d10dc372878b",
111
+ "46ec5fd1-5d1d-4837-855d-cd5da948544b",
112
+ "31bc4ab4-3c5b-43df-9d93-be34763b40c4",
113
+ "7a87ba80-8c52-4763-8e4c-38c09e384c49",
114
+ "c5a29a56-fbbc-4395-995f-96d382387c79",
115
+ "1102328c-6f15-4aa1-b4ce-c0ee823cbcb6",
116
+ "9e64093a-4f73-479f-8e9e-323b84cd6039",
117
+ "65e887fd-f949-43ad-ac45-aa852de874a6",
118
+ "3ab0ddd6-48e0-48f8-bf32-56b0d4178600",
119
+ "a435ac50-5d92-49be-84b6-9e5e8e4e9248",
120
+ "26486712-b403-4063-9cd2-9f2961a08de2",
121
+ "99bdd20f-6151-4666-bc0b-444037b41712",
122
+ "66f5d39b-be2b-46a9-9de4-abe08377de8d",
123
+ "7a91fbe1-2074-457e-b6dd-454ee8bf8d74",
124
+ "dbdbccf7-8888-4b04-8310-c25edb43a8c1",
125
+ "4219936d-8f63-4ae1-8bf7-ab5b65496c2c",
126
+ "f8a4a5a8-7bac-475a-a546-f3caecb765b2",
127
+ "42ea82db-272e-4a28-8347-ce6a1c4fa4ff",
128
+ "a75fcb9e-2ecf-49f9-b573-8309a6ec7331",
129
+ "326f604f-ba68-4a06-94c1-ca3a9a46d12a",
130
+ "fe0a665a-d5a0-49d3-acc6-23ba157bc4b8",
131
+ "2131006e-556b-4116-b588-caf647a5c799",
132
+ "61c87e69-932b-4635-896c-3ef4f38ac2c4",
133
+ "0dd7f237-453a-468e-af14-601450cc9ddb",
134
+ "55f27fa2-cbdd-4315-949f-54b77c477870",
135
+ "88b4f5c0-0dae-4b37-8103-9b6f2330d0bc",
136
+ "52f600bd-b66c-4e92-94b0-b88c16893828",
137
+ "4b328bfd-0256-4f8a-9a8b-51aed5a9079c",
138
+ "7f6b6177-06e7-4c38-b610-82bd899232c7",
139
+ "25890f8a-5346-47bc-b0b3-69fc6d9e4812",
140
+ "9cdc9da1-e6c0-425b-8d4b-663cd2bc351f"
141
+ ],
142
+ "shoppingContext": "CURBSIDE_PICKUP",
143
+ "sortBy": "SCORE",
144
+ "sortDirection": "DESC",
145
+ "storeId": 92,
146
+ "timeSlotStartTime": None,
147
+ }
148
+ },
149
+ "extensions": {
150
+ "persistedQuery": {
151
+ "version": 1,
152
+ "sha256Hash": "2ed81ec090540231b28f8e6853767c8f03a0099c0112f2173f69cb06b8d2dd29"
153
+ }
154
+ }
155
+ }
156
  self.session = requests.Session()
157
  self.generate_session()
158
 
 
207
 
208
  def search_category_pages(self, category):
209
  url = f"https://www.heb.com:443/search/?q={category}"
 
 
210
  response = self.session.get(url)
211
  soup = BeautifulSoup(response.text, 'html.parser')
212
  total_pages = soup.find_all('a', {'data-qe-id': 'paginationListNum'})[-1].text
 
216
  def get_urls_of_category_from_page(self, category, pages):
217
  all_urls = []
218
  for page in range(1, pages + 1):
219
+ self.query['variables']['params']['pageNumber'] = page
220
+ self.query['variables']['params']['query'] = category
221
+ # burp0_url = f"https://www.heb.com:443/search/?q={category}&pageNumber={page}"
222
+ url = "https://www.heb.com:443/graphql"
223
+ response = self.session.post(url, json=self.query)
224
+ with open('heb.json', 'w+', encoding='utf-8') as file:
225
+ file.write(response.text)
226
+ products = response.json()['data']['productSearchV2']['records']
227
+ urls = []
228
+ for product in products:
229
+ url = f"{self.base_url}{product['product']['productPageURL']}"
230
+ urls.append(url)
231
  print(urls)
232
  self.get_all_products_from_category_page(urls)
233