# Spaces: Runtime error
import csv
import os
import re

import requests
from bs4 import BeautifulSoup
from django.conf import settings
from selenium import webdriver
# from selenium.webdriver.chrome.options import Options as ChromeOptions
from undetected_chromedriver import Chrome, ChromeOptions
from webdriver_manager.chrome import ChromeDriverManager

from .DatabaseDataSaver import save_product
class HebScraper:
    """Scrape product names, ids and ingredient lists from heb.com.

    The scraper seeds a ``requests`` session with cookies collected from a
    headless Chrome visit to the site, pages through the search results of
    each configured category via the site's GraphQL endpoint, downloads every
    product page found there and passes the parsed fields to ``save_product``.
    """

    # A product id is the run of digits that terminates a product page URL.
    _PRODUCT_ID_PATTERN = re.compile(r"/(\d+)$")

    def __init__(self):
        """Build the GraphQL query template and open a cookie-seeded session.

        Note that constructing the scraper launches Chrome (through
        ``generate_session``) to collect cookies.
        """
        # NOTE(review): these headers are stored but never attached to the
        # session or to any request made by this class.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) ',
        }
        self.base_url = 'https://www.heb.com'
        # Search terms that ``run`` iterates over.
        self.categories = [
            'health', 'beauty', 'personal care'
        ]
        # Persisted-query payload for the search endpoint; ``pageNumber`` and
        # ``query`` are overwritten for every request.
        self.query = {
            "operationName": "InitialSearchProductsV2",
            "variables": {
                "params": {
                    "addressAllowAlcohol": False,
                    "doNotSuggestPhrase": False,
                    "ignoreRules": False,
                    "ignoreSynonyms": False,
                    "includeFullCategoryHierarchy": False,
                    "pageNumber": 0,
                    "pageSize": 60,
                    "query": "health",
                    "rootRequestId": None,
                    "segmentIds": [
                        "a5da00d0-7087-4655-93e0-b93ec0fc4757",
                        "adb33a3c-512e-4d76-8c24-a76d0efb8656",
                        "251287aa-b1a1-4bc4-8652-3c4e74a5b756",
                        "0df3ce9c-8f14-4d80-bf4c-3b8b66feab37",
                        "29e75183-0916-4337-a605-18c34add93d9",
                        "81484884-8948-41aa-a6f6-fed59467ceb9",
                        "0a8a667b-13e3-444e-999b-02fcd87026aa",
                        "8a7194d0-4643-41e3-8775-f56df17a0cb2",
                        "8809067c-8fe2-4151-b26e-e67edf814a57",
                        "0ad399c5-4ed6-4f7c-ba85-1c0ccb5f1b8c",
                        "8bc9ba87-94b1-45de-8737-8a7ed18e94ca",
                        "2b44887a-8e32-40c1-aced-3c7fec8790da",
                        "c0c96fe5-b029-48cf-beec-42720c4ac40b",
                        "7ca58353-5733-485f-aa37-22f4028e2e2a",
                        "68c1099e-bb09-4ac4-831f-d2b53948abd2",
                        "502dbe33-362e-4caf-a30e-eed0c5db0d15",
                        "61e62fb2-ec7b-431d-a6b0-e2e9e4276fbf",
                        "c662ef55-68e2-4255-a077-3077fcc52376",
                        "211d5eb4-17de-44da-9d6a-055120c8d9d5",
                        "354a6a8a-4034-4a8d-a50f-2c9d2bd7f564",
                        "54644db9-3e20-458a-b785-a3fb819bf701",
                        "e579e6cd-27b0-4b8d-995b-a5a7b5ef59ad",
                        "d818c6a2-7494-4a09-9409-4cef916c8303",
                        "7b6ecdf6-0461-407d-9750-e2035cd50834",
                        "f88be4a7-fb46-41ec-a39e-6371585a3701",
                        "2b62d388-480e-4bda-8ae3-c2db70aaa731",
                        "77a893e5-086f-4bdf-93dd-b0dda5accbb4",
                        "d8b0cd11-4230-4cab-8696-d55630f034df",
                        "caf33fa8-9d41-431a-a64b-2fb3499c48e4",
                        "3158e985-edf6-4a9c-9d01-8bade1cffd04",
                        "14bba8bc-b4c1-48b6-af10-8acca2db82ce",
                        "f9ea635d-4081-4a7f-821d-af8eda75f559",
                        "0a570dbd-f905-4261-946f-a1e6b3e9a387",
                        "4ac8cc6c-a11b-4803-aee0-47bdc1dc0834",
                        "61fb168b-2516-4bb9-97c4-804e8869eb8e",
                        "37b60cc5-9238-4653-a08f-fb617a878ef7",
                        "c2c1676f-a5fd-4c93-a900-76506a656b4c",
                        "3b8c0dbe-fadb-41e9-ad6a-40c7c5772d60",
                        "410cfec0-e434-4b0b-9cc3-e5cbcb09a0fe",
                        "881bf27d-a875-4b05-8732-87be803eeaa5",
                        "fc0bf854-631d-4322-bc2c-809992801e14",
                        "e3853f48-c3ec-41d1-a8c9-32eb188cf9ce",
                        "a968933f-9e39-4321-a6a1-b79caf397736",
                        "a18f9694-f14d-41d7-9da7-68934bb3d229",
                        "58ea83c3-00fb-49d0-8710-dddd29e15088",
                        "6d47d454-edc8-44e2-99e6-d4c65e0871bc",
                        "37930f56-3086-44b6-bc44-795e0c78e390",
                        "53ccb672-9c35-4516-89ca-d48414818d40",
                        "c1b959a6-b285-4825-8e26-387b871e89d9",
                        "07e10eba-f057-4789-8949-bb5ffa800d51",
                        "77e2aeed-e5e7-4e9d-bd81-cf456d24158c",
                        "71b64f9a-4a01-4bdf-81fd-757096f0e7ce",
                        "af84966d-abec-4a11-94bd-632a651d1d51",
                        "2ba4117a-24ee-458e-84a8-3063d5b5c2c2",
                        "2f2c33b7-501d-41bb-9401-89019a13fd38",
                        "eedbf364-1ff9-463a-bc1d-7ac0ae015f94",
                        "ec8573bb-e6ad-411f-a33a-14addf2d2aa5",
                        "e904a3e6-5273-4fcd-809d-dc5b1bf9b2e0",
                        "3f8c11f4-3e57-4467-8d2c-50474126200b",
                        "7450aa0c-685f-4433-9245-a8bf1c7d40b3",
                        "b725ccfa-f350-4d50-afad-dc9a18d68d78",
                        "a4578507-90b2-41c8-9918-ecf45e61c540",
                        "8449656d-9093-4cd4-8f1e-a9ce9fcafedf",
                        "96f95945-a009-495d-a81c-885912998854",
                        "d2ac6fb7-5ee1-4174-98c6-9e6bb79081dd",
                        "87f7f262-c304-45fb-965a-4c6cef6b2e27",
                        "4adf74f0-2c80-4cc7-b772-dc3d49e2632c",
                        "24bc8ce6-ef5c-47bf-b03c-03d28a9aa44d",
                        "8a98f32b-83cb-4af8-b513-dd34bdb63807",
                        "83c87cc0-e31e-4810-a83e-6e57b006c02a",
                        "cc63cdb9-1f6b-4511-bc43-b598e6b13787",
                        "638d383a-bce4-4fb9-8b7b-bff85e87a364",
                        "aee21288-8f82-410d-b28e-0ff5d9b7f5d7",
                        "01255be4-73c6-41b1-81c4-64b0de2852cf",
                        "5f0cbece-ad6e-4cf8-b2d9-d10dc372878b",
                        "46ec5fd1-5d1d-4837-855d-cd5da948544b",
                        "31bc4ab4-3c5b-43df-9d93-be34763b40c4",
                        "7a87ba80-8c52-4763-8e4c-38c09e384c49",
                        "c5a29a56-fbbc-4395-995f-96d382387c79",
                        "1102328c-6f15-4aa1-b4ce-c0ee823cbcb6",
                        "9e64093a-4f73-479f-8e9e-323b84cd6039",
                        "65e887fd-f949-43ad-ac45-aa852de874a6",
                        "3ab0ddd6-48e0-48f8-bf32-56b0d4178600",
                        "a435ac50-5d92-49be-84b6-9e5e8e4e9248",
                        "26486712-b403-4063-9cd2-9f2961a08de2",
                        "99bdd20f-6151-4666-bc0b-444037b41712",
                        "66f5d39b-be2b-46a9-9de4-abe08377de8d",
                        "7a91fbe1-2074-457e-b6dd-454ee8bf8d74",
                        "dbdbccf7-8888-4b04-8310-c25edb43a8c1",
                        "4219936d-8f63-4ae1-8bf7-ab5b65496c2c",
                        "f8a4a5a8-7bac-475a-a546-f3caecb765b2",
                        "42ea82db-272e-4a28-8347-ce6a1c4fa4ff",
                        "a75fcb9e-2ecf-49f9-b573-8309a6ec7331",
                        "326f604f-ba68-4a06-94c1-ca3a9a46d12a",
                        "fe0a665a-d5a0-49d3-acc6-23ba157bc4b8",
                        "2131006e-556b-4116-b588-caf647a5c799",
                        "61c87e69-932b-4635-896c-3ef4f38ac2c4",
                        "0dd7f237-453a-468e-af14-601450cc9ddb",
                        "55f27fa2-cbdd-4315-949f-54b77c477870",
                        "88b4f5c0-0dae-4b37-8103-9b6f2330d0bc",
                        "52f600bd-b66c-4e92-94b0-b88c16893828",
                        "4b328bfd-0256-4f8a-9a8b-51aed5a9079c",
                        "7f6b6177-06e7-4c38-b610-82bd899232c7",
                        "25890f8a-5346-47bc-b0b3-69fc6d9e4812",
                        "9cdc9da1-e6c0-425b-8d4b-663cd2bc351f"
                    ],
                    "shoppingContext": "CURBSIDE_PICKUP",
                    "sortBy": "SCORE",
                    "sortDirection": "DESC",
                    "storeId": 92,
                    "timeSlotStartTime": None,
                }
            },
            "extensions": {
                "persistedQuery": {
                    "version": 1,
                    "sha256Hash": "2ed81ec090540231b28f8e6853767c8f03a0099c0112f2173f69cb06b8d2dd29"
                }
            }
        }
        self.session = requests.Session()
        self.generate_session()

    def generate_session(self):
        """Copy the cookies of a headless Chrome visit into ``self.session``.

        Raises:
            KeyError: if the ``CHROMEDRIVER_PATH`` environment variable is
                not set.
        """
        options = ChromeOptions()
        options.add_argument("--headless")
        options.add_argument("--disable-gpu")
        options.add_argument("--no-sandbox")
        driver = Chrome(executable_path=os.environ['CHROMEDRIVER_PATH'], options=options)
        try:
            driver.get("https://www.heb.com/")
            for cookie in driver.get_cookies():
                self.session.cookies.set(cookie['name'], cookie['value'])
        finally:
            # Always shut the browser down, even when the page load fails,
            # so a failed refresh does not leave a Chrome process behind.
            driver.quit()

    def get_response(self, url):
        """Return the session's GET response for ``url``."""
        return self.session.get(url)

    @classmethod
    def _extract_product_id(cls, url):
        """Return the trailing digits of ``url``, or ``''`` if there are none."""
        match = cls._PRODUCT_ID_PATTERN.search(url)
        return match.group(1) if match else ''

    def parse_response(self, response, url):
        """Extract the product fields from a product page.

        Args:
            response: object whose ``text`` attribute holds the page HTML.
            url: address the page was fetched from; the product id is taken
                from its trailing digits.

        Returns:
            ``[name, product_id, ingredients]``; any field that cannot be
            found is the empty string.
        """
        soup = BeautifulSoup(response.text, 'html.parser')

        heading = soup.find('h1')
        name = heading.text if heading is not None else ''

        product_id = self._extract_product_id(url)
        print(url)

        # The ingredient text lives in the first <span> that follows the
        # "Ingredients" <h4> heading.
        ingredients = ''
        ingredients_section = soup.find('h4', text='Ingredients')
        if ingredients_section:
            ingredients_span = ingredients_section.find_next('span')
            if ingredients_span:
                ingredients = ingredients_span.get_text()

        return [name, product_id, ingredients]

    def search_category_pages(self, category):
        """Return the number of result pages advertised for ``category``.

        The search page is requested at most twice. ``0`` is returned when
        both requests fail. ``1`` is returned when the page carries no
        usable pagination links.
        """
        url = f"https://www.heb.com:443/search/?q={category}"

        response = None
        for _ in range(2):
            try:
                response = self.session.get(url)
            except Exception as exc:
                print(f'Search request failed for {category}: {exc}')
                continue
            break
        if response is None:
            return 0

        soup = BeautifulSoup(response.text, 'html.parser')
        page_links = soup.find_all('a', {'data-qe-id': 'paginationListNum'})
        if not page_links:
            # NOTE(review): presumably a result set that fits on one page has
            # no pagination links; a blocked/challenge page would look the
            # same from here. Confirm that falling back to one page is right.
            print(f'No pagination found for {category}')
            return 1

        total_pages = page_links[-1].text
        print(total_pages)
        try:
            return int(total_pages)
        except ValueError:
            print(f'Unexpected page count {total_pages!r} for {category}')
            return 1

    def _fetch_search_records(self, url):
        """POST ``self.query`` to ``url`` and return the product records.

        On failure the session cookies are regenerated and the request is
        repeated once; ``None`` is returned if that attempt fails as well.
        """
        try:
            response = self.session.post(url, json=self.query)
            return response.json()['data']['productSearchV2']['records'] or []
        except Exception:
            pass
        try:
            self.generate_session()
            response = self.session.post(url, json=self.query)
            return response.json()['data']['productSearchV2']['records'] or []
        except Exception as exc:
            print(f'Search query failed: {exc}')
            return None

    def get_urls_of_category_from_page(self, category, pages):
        """Scrape every product listed on pages ``1..pages`` of ``category``.

        Each page's product URLs are printed and handed to
        ``get_all_products_from_category_page`` (attempted twice per page).
        Pages whose search query fails are skipped.

        Returns:
            The list of all product URLs that were collected.
        """
        all_urls = []
        graphql_url = "https://www.heb.com:443/graphql"
        params = self.query['variables']['params']
        params['query'] = category

        for page in range(1, pages + 1):
            params['pageNumber'] = page
            products = self._fetch_search_records(graphql_url)
            if products is None:
                continue

            urls = []
            for product in products:
                try:
                    path = product['product']['productPageURL']
                except (KeyError, TypeError):
                    # A record without a page URL cannot be scraped.
                    continue
                urls.append(f"{self.base_url}{path}")
            print(urls)
            all_urls.extend(urls)

            for _ in range(2):
                try:
                    self.get_all_products_from_category_page(urls)
                except Exception:
                    continue
                break
            else:
                print('Error')

        return all_urls

    def get_all_products_from_category_page(self, urls):
        """Fetch, parse and save the product behind every URL in ``urls``.

        Returns:
            ``True`` once all URLs have been processed. Exceptions raised
            while fetching, parsing or saving propagate to the caller.
        """
        for url in urls:
            response = self.get_response(url)
            product = self.parse_response(response, url=url)
            print(product)
            save_product({
                'title': product[0],
                'ingredients': product[2],
                'product_id': product[1],
                'url': url,
                'store_name': 'Heb'
            })
        return True

    def save_product_to_csv(self, product):
        """Append ``product`` to ``heb.csv`` unless its id is already there.

        Args:
            product: row to write; ``product[1]`` is the product id used for
                the duplicate check.

        Returns:
            ``False`` if a row with the same id already exists, ``True`` if
            the row was written.
        """
        with open('heb.csv', 'a+', encoding='utf-8', newline='') as file:
            # 'a+' positions the stream at the end, so rewind to read the
            # existing rows; writes still land at the end of the file.
            file.seek(0)
            # Rows with fewer than two columns carry no product id.
            product_ids = [row[1] for row in csv.reader(file) if len(row) > 1]
            print(product_ids)
            if product[1] in product_ids:
                return False
            csv.writer(file).writerow(product)
        return True

    def run(self):
        """Scrape every configured category, refreshing cookies after each.

        Returns:
            ``True`` when all categories have been processed.
        """
        for category in self.categories:
            pages = self.search_category_pages(category)
            self.get_urls_of_category_from_page(category, pages)
            self.generate_session()
            print(f'Finished {category}')
        return True
# Script entry point: build the scraper (which opens a cookie-seeded session)
# and scrape every configured category.
if __name__ == "__main__":
    HebScraper().run()