Spaces:
Runtime error
Runtime error
Commit ·
a6b5498
1
Parent(s): cf56e8e
Update scraper/utils/HebScraper.py
Browse files
- scraper/utils/HebScraper.py +147 -10
scraper/utils/HebScraper.py
CHANGED
|
@@ -15,9 +15,144 @@ class HebScraper:
|
|
| 15 |
self.headers = {
|
| 16 |
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) ',
|
| 17 |
}
|
|
|
|
| 18 |
self.categories = [
|
| 19 |
'health', 'beauty', 'personal care'
|
| 20 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
self.session = requests.Session()
|
| 22 |
self.generate_session()
|
| 23 |
|
|
@@ -72,8 +207,6 @@ class HebScraper:
|
|
| 72 |
|
| 73 |
def search_category_pages(self, category):
|
| 74 |
url = f"https://www.heb.com:443/search/?q={category}"
|
| 75 |
-
cookies = {"HEB_AMP_DEVICE_ID": "h-9553dc8b-559c-4b78-ac9e-29473ea5147b", "USER_SELECT_STORE": "false", "CURR_SESSION_STORE": "92", "visid_incap_2302070": "vJO0L0njTA29eovhNqUWS1G6KmUAAAAAQUIPAAAAAACceG/mmByIIKugw3IJIqb7", "AMP_MKTG_760524e2ba": "JTdCJTdE", "_gcl_au": "1.1.317767528.1697299041", "_cs_mk": "0.2353851536637448_1697299041315", "AMP_TOKEN": "%24NOT_FOUND", "_gid": "GA1.2.534463307.1697299046", "__pdst": "81f3b640db914df78cc13cef62fcce3d", "_scid": "f840b179-6a37-41f0-98a9-74e3442ab044", "_fbp": "fb.1.1697299048562.502869888", "_pin_unauth": "dWlkPVl6azJNekF4TURJdFlXTXhZUzAwTmpaaUxXRmpNRFV0TURFeFpETm1aRGRrT1dFeQ", "_sctr": "1%7C1697223600000", "incap_ses_7245_2302070": "SpfVMbfbJlWH8UP9qGmLZNO/KmUAAAAAQ6DdBaFAErdn71N1kOCBVQ==", "DYN_USER_ID": "16893314532", "DYN_USER_CONFIRM": "837ca6cfe35db1d6751dc9a494700b94", "sessionContext": "curbside", "JSESSIONID": "db-spVppjEPfOL2HNHGA9xXNNPz5yFqJofTA1P5P", "_uetsid": "5e6a5f306aaa11eeb682912c91164f65", "_uetvid": "5e6a66c06aaa11eebfd5cfd70a56b55d", "_scid_r": "f840b179-6a37-41f0-98a9-74e3442ab044", "_ga": "GA1.2.1176000733.1697299042", "_dc_gtm_UA-26725300-5": "1", "AWSALB": "b1DIZXjB38LKdvzXrptoESB/gX95NzqAqHP+iHYgV2VRlMx39vMmZbjj1+3nehO6PJ261/X1VhyYY4fX/h6X0XCWnwahqqMS4ouwhzHiyT2gGlvNSaLlHr647Jgb", "_ga_WKSH6HYPT4": "GS1.1.1697299041.1.1.1697300936.0.0.0", "AMP_760524e2ba": "JTdCJTIyb3B0T3V0JTIyJTNBZmFsc2UlMkMlMjJkZXZpY2VJZCUyMiUzQSUyMmgtOTU1M2RjOGItNTU5Yy00Yjc4LWFjOWUtMjk0NzNlYTUxNDdiJTIyJTJDJTIybGFzdEV2ZW50VGltZSUyMiUzQTE2OTczMDA5Mzc0NTAlMkMlMjJzZXNzaW9uSWQlMjIlM0ExNjk3Mjk5MDQwMzIyJTdE", "reese84": 
"3:MI9duruS5ddU/GvGLWwcRw==:wdq4U02M5AC5G6t9+QdHIEkgE3bYtn0KTF3BGxo2hot1ZthsfB5dYr+TXfLKq0HRxdj81COMZDeRwmfR/7tKp1BuMeZTbkmq/JqL/+V6lnu5QT5hO/9cqFny1JybiH68Siy0mk8dVFcyLQmiZfAY11gQmhlwGPo7CjnDKCirS0D6MifVE9HoZrSkjkm16VDGRDo1LyHQ1CgFQjeipvK5cuN63tK3E/r6y9GT/hDCp6hq0z5JJqZZ7UTdIvF5NmVdu6EBKpq6x2PJW440TCEDhmDN0JSrL4GRopfOu4Xb4ClPzeokv10RfZ+ZBj/SREDVNiwP1EEn2AJM9WPZE+l8eFWhDE55aBPPFJm3wsg6ggoAMb7JNc3rlSgH8ACVvtfU4u2sSkAYA+Pi4eFx6n8J3xJYJ2jU1AAMFsWyZAEOqTil0wqJMiBlOdrfskgEWNSIL7LjN9yhk4Otmwnhl3hoaHjPDn397nsgEbElrjmKIiXqSo/kuNG5A5AOKRsbnys/:0hMb0LNmXdkecUxwJtVPLkqqBNFhFcidBrSnXtWVWHw="}
|
| 76 |
-
headers = {"Pragma": "no-cache", "Cache-Control": "no-cache", "Sec-Ch-Ua": "\"Chromium\";v=\"117\", \"Not;A=Brand\";v=\"8\"", "Sec-Ch-Ua-Mobile": "?0", "Sec-Ch-Ua-Platform": "\"Windows\"", "Upgrade-Insecure-Requests": "1", "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.5938.132 Safari/537.36", "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7", "Sec-Fetch-Site": "same-origin", "Sec-Fetch-Mode": "navigate", "Sec-Fetch-User": "?1", "Sec-Fetch-Dest": "document", "Accept-Encoding": "gzip, deflate, br", "Accept-Language": "en-US,en;q=0.9"}
|
| 77 |
response = self.session.get(url)
|
| 78 |
soup = BeautifulSoup(response.text, 'html.parser')
|
| 79 |
total_pages = soup.find_all('a', {'data-qe-id': 'paginationListNum'})[-1].text
|
|
@@ -83,14 +216,18 @@ class HebScraper:
|
|
| 83 |
def get_urls_of_category_from_page(self, category, pages):
|
| 84 |
all_urls = []
|
| 85 |
for page in range(1, pages + 1):
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
burp0_url = f"https://www.heb.com:443/search/?q={category}&pageNumber={page}"
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
print(urls)
|
| 95 |
self.get_all_products_from_category_page(urls)
|
| 96 |
|
|
|
|
| 15 |
self.headers = {
|
| 16 |
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) ',
|
| 17 |
}
|
| 18 |
+
self.base_url = 'https://www.heb.com'
|
| 19 |
self.categories = [
|
| 20 |
'health', 'beauty', 'personal care'
|
| 21 |
]
|
| 22 |
+
self.query = {
|
| 23 |
+
"operationName": "InitialSearchProductsV2",
|
| 24 |
+
"variables": {
|
| 25 |
+
"params": {
|
| 26 |
+
"addressAllowAlcohol": False,
|
| 27 |
+
"doNotSuggestPhrase": False,
|
| 28 |
+
"ignoreRules": False,
|
| 29 |
+
"ignoreSynonyms": False,
|
| 30 |
+
"includeFullCategoryHierarchy": False,
|
| 31 |
+
"pageNumber": 0,
|
| 32 |
+
"pageSize": 60,
|
| 33 |
+
"query": "health",
|
| 34 |
+
"rootRequestId": None,
|
| 35 |
+
"segmentIds": [
|
| 36 |
+
"a5da00d0-7087-4655-93e0-b93ec0fc4757",
|
| 37 |
+
"adb33a3c-512e-4d76-8c24-a76d0efb8656",
|
| 38 |
+
"251287aa-b1a1-4bc4-8652-3c4e74a5b756",
|
| 39 |
+
"0df3ce9c-8f14-4d80-bf4c-3b8b66feab37",
|
| 40 |
+
"29e75183-0916-4337-a605-18c34add93d9",
|
| 41 |
+
"81484884-8948-41aa-a6f6-fed59467ceb9",
|
| 42 |
+
"0a8a667b-13e3-444e-999b-02fcd87026aa",
|
| 43 |
+
"8a7194d0-4643-41e3-8775-f56df17a0cb2",
|
| 44 |
+
"8809067c-8fe2-4151-b26e-e67edf814a57",
|
| 45 |
+
"0ad399c5-4ed6-4f7c-ba85-1c0ccb5f1b8c",
|
| 46 |
+
"8bc9ba87-94b1-45de-8737-8a7ed18e94ca",
|
| 47 |
+
"2b44887a-8e32-40c1-aced-3c7fec8790da",
|
| 48 |
+
"c0c96fe5-b029-48cf-beec-42720c4ac40b",
|
| 49 |
+
"7ca58353-5733-485f-aa37-22f4028e2e2a",
|
| 50 |
+
"68c1099e-bb09-4ac4-831f-d2b53948abd2",
|
| 51 |
+
"502dbe33-362e-4caf-a30e-eed0c5db0d15",
|
| 52 |
+
"61e62fb2-ec7b-431d-a6b0-e2e9e4276fbf",
|
| 53 |
+
"c662ef55-68e2-4255-a077-3077fcc52376",
|
| 54 |
+
"211d5eb4-17de-44da-9d6a-055120c8d9d5",
|
| 55 |
+
"354a6a8a-4034-4a8d-a50f-2c9d2bd7f564",
|
| 56 |
+
"54644db9-3e20-458a-b785-a3fb819bf701",
|
| 57 |
+
"e579e6cd-27b0-4b8d-995b-a5a7b5ef59ad",
|
| 58 |
+
"d818c6a2-7494-4a09-9409-4cef916c8303",
|
| 59 |
+
"7b6ecdf6-0461-407d-9750-e2035cd50834",
|
| 60 |
+
"f88be4a7-fb46-41ec-a39e-6371585a3701",
|
| 61 |
+
"2b62d388-480e-4bda-8ae3-c2db70aaa731",
|
| 62 |
+
"77a893e5-086f-4bdf-93dd-b0dda5accbb4",
|
| 63 |
+
"d8b0cd11-4230-4cab-8696-d55630f034df",
|
| 64 |
+
"caf33fa8-9d41-431a-a64b-2fb3499c48e4",
|
| 65 |
+
"3158e985-edf6-4a9c-9d01-8bade1cffd04",
|
| 66 |
+
"14bba8bc-b4c1-48b6-af10-8acca2db82ce",
|
| 67 |
+
"f9ea635d-4081-4a7f-821d-af8eda75f559",
|
| 68 |
+
"0a570dbd-f905-4261-946f-a1e6b3e9a387",
|
| 69 |
+
"4ac8cc6c-a11b-4803-aee0-47bdc1dc0834",
|
| 70 |
+
"61fb168b-2516-4bb9-97c4-804e8869eb8e",
|
| 71 |
+
"37b60cc5-9238-4653-a08f-fb617a878ef7",
|
| 72 |
+
"c2c1676f-a5fd-4c93-a900-76506a656b4c",
|
| 73 |
+
"3b8c0dbe-fadb-41e9-ad6a-40c7c5772d60",
|
| 74 |
+
"410cfec0-e434-4b0b-9cc3-e5cbcb09a0fe",
|
| 75 |
+
"881bf27d-a875-4b05-8732-87be803eeaa5",
|
| 76 |
+
"fc0bf854-631d-4322-bc2c-809992801e14",
|
| 77 |
+
"e3853f48-c3ec-41d1-a8c9-32eb188cf9ce",
|
| 78 |
+
"a968933f-9e39-4321-a6a1-b79caf397736",
|
| 79 |
+
"a18f9694-f14d-41d7-9da7-68934bb3d229",
|
| 80 |
+
"58ea83c3-00fb-49d0-8710-dddd29e15088",
|
| 81 |
+
"6d47d454-edc8-44e2-99e6-d4c65e0871bc",
|
| 82 |
+
"37930f56-3086-44b6-bc44-795e0c78e390",
|
| 83 |
+
"53ccb672-9c35-4516-89ca-d48414818d40",
|
| 84 |
+
"c1b959a6-b285-4825-8e26-387b871e89d9",
|
| 85 |
+
"07e10eba-f057-4789-8949-bb5ffa800d51",
|
| 86 |
+
"77e2aeed-e5e7-4e9d-bd81-cf456d24158c",
|
| 87 |
+
"71b64f9a-4a01-4bdf-81fd-757096f0e7ce",
|
| 88 |
+
"af84966d-abec-4a11-94bd-632a651d1d51",
|
| 89 |
+
"2ba4117a-24ee-458e-84a8-3063d5b5c2c2",
|
| 90 |
+
"2f2c33b7-501d-41bb-9401-89019a13fd38",
|
| 91 |
+
"eedbf364-1ff9-463a-bc1d-7ac0ae015f94",
|
| 92 |
+
"ec8573bb-e6ad-411f-a33a-14addf2d2aa5",
|
| 93 |
+
"e904a3e6-5273-4fcd-809d-dc5b1bf9b2e0",
|
| 94 |
+
"3f8c11f4-3e57-4467-8d2c-50474126200b",
|
| 95 |
+
"7450aa0c-685f-4433-9245-a8bf1c7d40b3",
|
| 96 |
+
"b725ccfa-f350-4d50-afad-dc9a18d68d78",
|
| 97 |
+
"a4578507-90b2-41c8-9918-ecf45e61c540",
|
| 98 |
+
"8449656d-9093-4cd4-8f1e-a9ce9fcafedf",
|
| 99 |
+
"96f95945-a009-495d-a81c-885912998854",
|
| 100 |
+
"d2ac6fb7-5ee1-4174-98c6-9e6bb79081dd",
|
| 101 |
+
"87f7f262-c304-45fb-965a-4c6cef6b2e27",
|
| 102 |
+
"4adf74f0-2c80-4cc7-b772-dc3d49e2632c",
|
| 103 |
+
"24bc8ce6-ef5c-47bf-b03c-03d28a9aa44d",
|
| 104 |
+
"8a98f32b-83cb-4af8-b513-dd34bdb63807",
|
| 105 |
+
"83c87cc0-e31e-4810-a83e-6e57b006c02a",
|
| 106 |
+
"cc63cdb9-1f6b-4511-bc43-b598e6b13787",
|
| 107 |
+
"638d383a-bce4-4fb9-8b7b-bff85e87a364",
|
| 108 |
+
"aee21288-8f82-410d-b28e-0ff5d9b7f5d7",
|
| 109 |
+
"01255be4-73c6-41b1-81c4-64b0de2852cf",
|
| 110 |
+
"5f0cbece-ad6e-4cf8-b2d9-d10dc372878b",
|
| 111 |
+
"46ec5fd1-5d1d-4837-855d-cd5da948544b",
|
| 112 |
+
"31bc4ab4-3c5b-43df-9d93-be34763b40c4",
|
| 113 |
+
"7a87ba80-8c52-4763-8e4c-38c09e384c49",
|
| 114 |
+
"c5a29a56-fbbc-4395-995f-96d382387c79",
|
| 115 |
+
"1102328c-6f15-4aa1-b4ce-c0ee823cbcb6",
|
| 116 |
+
"9e64093a-4f73-479f-8e9e-323b84cd6039",
|
| 117 |
+
"65e887fd-f949-43ad-ac45-aa852de874a6",
|
| 118 |
+
"3ab0ddd6-48e0-48f8-bf32-56b0d4178600",
|
| 119 |
+
"a435ac50-5d92-49be-84b6-9e5e8e4e9248",
|
| 120 |
+
"26486712-b403-4063-9cd2-9f2961a08de2",
|
| 121 |
+
"99bdd20f-6151-4666-bc0b-444037b41712",
|
| 122 |
+
"66f5d39b-be2b-46a9-9de4-abe08377de8d",
|
| 123 |
+
"7a91fbe1-2074-457e-b6dd-454ee8bf8d74",
|
| 124 |
+
"dbdbccf7-8888-4b04-8310-c25edb43a8c1",
|
| 125 |
+
"4219936d-8f63-4ae1-8bf7-ab5b65496c2c",
|
| 126 |
+
"f8a4a5a8-7bac-475a-a546-f3caecb765b2",
|
| 127 |
+
"42ea82db-272e-4a28-8347-ce6a1c4fa4ff",
|
| 128 |
+
"a75fcb9e-2ecf-49f9-b573-8309a6ec7331",
|
| 129 |
+
"326f604f-ba68-4a06-94c1-ca3a9a46d12a",
|
| 130 |
+
"fe0a665a-d5a0-49d3-acc6-23ba157bc4b8",
|
| 131 |
+
"2131006e-556b-4116-b588-caf647a5c799",
|
| 132 |
+
"61c87e69-932b-4635-896c-3ef4f38ac2c4",
|
| 133 |
+
"0dd7f237-453a-468e-af14-601450cc9ddb",
|
| 134 |
+
"55f27fa2-cbdd-4315-949f-54b77c477870",
|
| 135 |
+
"88b4f5c0-0dae-4b37-8103-9b6f2330d0bc",
|
| 136 |
+
"52f600bd-b66c-4e92-94b0-b88c16893828",
|
| 137 |
+
"4b328bfd-0256-4f8a-9a8b-51aed5a9079c",
|
| 138 |
+
"7f6b6177-06e7-4c38-b610-82bd899232c7",
|
| 139 |
+
"25890f8a-5346-47bc-b0b3-69fc6d9e4812",
|
| 140 |
+
"9cdc9da1-e6c0-425b-8d4b-663cd2bc351f"
|
| 141 |
+
],
|
| 142 |
+
"shoppingContext": "CURBSIDE_PICKUP",
|
| 143 |
+
"sortBy": "SCORE",
|
| 144 |
+
"sortDirection": "DESC",
|
| 145 |
+
"storeId": 92,
|
| 146 |
+
"timeSlotStartTime": None,
|
| 147 |
+
}
|
| 148 |
+
},
|
| 149 |
+
"extensions": {
|
| 150 |
+
"persistedQuery": {
|
| 151 |
+
"version": 1,
|
| 152 |
+
"sha256Hash": "2ed81ec090540231b28f8e6853767c8f03a0099c0112f2173f69cb06b8d2dd29"
|
| 153 |
+
}
|
| 154 |
+
}
|
| 155 |
+
}
|
| 156 |
self.session = requests.Session()
|
| 157 |
self.generate_session()
|
| 158 |
|
|
|
|
| 207 |
|
| 208 |
def search_category_pages(self, category):
|
| 209 |
url = f"https://www.heb.com:443/search/?q={category}"
|
|
|
|
|
|
|
| 210 |
response = self.session.get(url)
|
| 211 |
soup = BeautifulSoup(response.text, 'html.parser')
|
| 212 |
total_pages = soup.find_all('a', {'data-qe-id': 'paginationListNum'})[-1].text
|
|
|
|
| 216 |
def get_urls_of_category_from_page(self, category, pages):
|
| 217 |
all_urls = []
|
| 218 |
for page in range(1, pages + 1):
|
| 219 |
+
self.query['variables']['params']['pageNumber'] = page
|
| 220 |
+
self.query['variables']['params']['query'] = category
|
| 221 |
+
# burp0_url = f"https://www.heb.com:443/search/?q={category}&pageNumber={page}"
|
| 222 |
+
url = "https://www.heb.com:443/graphql"
|
| 223 |
+
response = self.session.post(url, json=self.query)
|
| 224 |
+
with open('heb.json', 'w+', encoding='utf-8') as file:
|
| 225 |
+
file.write(response.text)
|
| 226 |
+
products = response.json()['data']['productSearchV2']['records']
|
| 227 |
+
urls = []
|
| 228 |
+
for product in products:
|
| 229 |
+
url = f"{self.base_url}{product['product']['productPageURL']}"
|
| 230 |
+
urls.append(url)
|
| 231 |
print(urls)
|
| 232 |
self.get_all_products_from_category_page(urls)
|
| 233 |
|