import csv
import os
import re

import requests
from bs4 import BeautifulSoup
from django.conf import settings
from selenium import webdriver
# from selenium.webdriver.chrome.options import Options as ChromeOptions
from undetected_chromedriver import Chrome, ChromeOptions
from webdriver_manager.chrome import ChromeDriverManager

from .DatabaseDataSaver import save_product


class HebScraper:
    """Scrape product name, id and ingredients from heb.com search results.

    For each entry in ``self.categories`` the scraper reads the number of
    result pages from the HTML search page, fetches each page of results from
    the site's GraphQL endpoint, downloads every product page and passes the
    parsed fields to ``save_product``.

    Cookies for the ``requests`` session are obtained by opening the home
    page in a headless Chrome instance (see ``generate_session``).
    """

    def __init__(self):
        """Build the query template and create a cookie-primed session.

        Note that constructing the scraper launches a browser, because it
        calls ``generate_session``.
        """
        # NOTE(review): these headers are stored but never attached to
        # ``self.session`` anywhere in this file - confirm whether that is
        # intentional.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) ',
        }
        self.base_url = 'https://www.heb.com'
        self.categories = [
            'health',
            'beauty',
            'personal care'
        ]
        # Request body template for the GraphQL search. ``pageNumber`` and
        # ``query`` are overwritten in place for every request by
        # ``get_urls_of_category_from_page``.
        # NOTE(review): the segment ids, store id and persisted-query hash
        # are opaque values, presumably copied from a captured browser
        # request - verify they are still valid.
        self.query = {
            "operationName": "InitialSearchProductsV2",
            "variables": {
                "params": {
                    "addressAllowAlcohol": False,
                    "doNotSuggestPhrase": False,
                    "ignoreRules": False,
                    "ignoreSynonyms": False,
                    "includeFullCategoryHierarchy": False,
                    "pageNumber": 0,
                    "pageSize": 60,
                    "query": "health",
                    "rootRequestId": None,
                    "segmentIds": [
                        "a5da00d0-7087-4655-93e0-b93ec0fc4757",
                        "adb33a3c-512e-4d76-8c24-a76d0efb8656",
                        "251287aa-b1a1-4bc4-8652-3c4e74a5b756",
                        "0df3ce9c-8f14-4d80-bf4c-3b8b66feab37",
                        "29e75183-0916-4337-a605-18c34add93d9",
                        "81484884-8948-41aa-a6f6-fed59467ceb9",
                        "0a8a667b-13e3-444e-999b-02fcd87026aa",
                        "8a7194d0-4643-41e3-8775-f56df17a0cb2",
                        "8809067c-8fe2-4151-b26e-e67edf814a57",
                        "0ad399c5-4ed6-4f7c-ba85-1c0ccb5f1b8c",
                        "8bc9ba87-94b1-45de-8737-8a7ed18e94ca",
                        "2b44887a-8e32-40c1-aced-3c7fec8790da",
                        "c0c96fe5-b029-48cf-beec-42720c4ac40b",
                        "7ca58353-5733-485f-aa37-22f4028e2e2a",
                        "68c1099e-bb09-4ac4-831f-d2b53948abd2",
                        "502dbe33-362e-4caf-a30e-eed0c5db0d15",
                        "61e62fb2-ec7b-431d-a6b0-e2e9e4276fbf",
                        "c662ef55-68e2-4255-a077-3077fcc52376",
                        "211d5eb4-17de-44da-9d6a-055120c8d9d5",
                        "354a6a8a-4034-4a8d-a50f-2c9d2bd7f564",
                        "54644db9-3e20-458a-b785-a3fb819bf701",
                        "e579e6cd-27b0-4b8d-995b-a5a7b5ef59ad",
                        "d818c6a2-7494-4a09-9409-4cef916c8303",
                        "7b6ecdf6-0461-407d-9750-e2035cd50834",
                        "f88be4a7-fb46-41ec-a39e-6371585a3701",
                        "2b62d388-480e-4bda-8ae3-c2db70aaa731",
                        "77a893e5-086f-4bdf-93dd-b0dda5accbb4",
                        "d8b0cd11-4230-4cab-8696-d55630f034df",
                        "caf33fa8-9d41-431a-a64b-2fb3499c48e4",
                        "3158e985-edf6-4a9c-9d01-8bade1cffd04",
                        "14bba8bc-b4c1-48b6-af10-8acca2db82ce",
                        "f9ea635d-4081-4a7f-821d-af8eda75f559",
                        "0a570dbd-f905-4261-946f-a1e6b3e9a387",
                        "4ac8cc6c-a11b-4803-aee0-47bdc1dc0834",
                        "61fb168b-2516-4bb9-97c4-804e8869eb8e",
                        "37b60cc5-9238-4653-a08f-fb617a878ef7",
                        "c2c1676f-a5fd-4c93-a900-76506a656b4c",
                        "3b8c0dbe-fadb-41e9-ad6a-40c7c5772d60",
                        "410cfec0-e434-4b0b-9cc3-e5cbcb09a0fe",
                        "881bf27d-a875-4b05-8732-87be803eeaa5",
                        "fc0bf854-631d-4322-bc2c-809992801e14",
                        "e3853f48-c3ec-41d1-a8c9-32eb188cf9ce",
                        "a968933f-9e39-4321-a6a1-b79caf397736",
                        "a18f9694-f14d-41d7-9da7-68934bb3d229",
                        "58ea83c3-00fb-49d0-8710-dddd29e15088",
                        "6d47d454-edc8-44e2-99e6-d4c65e0871bc",
                        "37930f56-3086-44b6-bc44-795e0c78e390",
                        "53ccb672-9c35-4516-89ca-d48414818d40",
                        "c1b959a6-b285-4825-8e26-387b871e89d9",
                        "07e10eba-f057-4789-8949-bb5ffa800d51",
                        "77e2aeed-e5e7-4e9d-bd81-cf456d24158c",
                        "71b64f9a-4a01-4bdf-81fd-757096f0e7ce",
                        "af84966d-abec-4a11-94bd-632a651d1d51",
                        "2ba4117a-24ee-458e-84a8-3063d5b5c2c2",
                        "2f2c33b7-501d-41bb-9401-89019a13fd38",
                        "eedbf364-1ff9-463a-bc1d-7ac0ae015f94",
                        "ec8573bb-e6ad-411f-a33a-14addf2d2aa5",
                        "e904a3e6-5273-4fcd-809d-dc5b1bf9b2e0",
                        "3f8c11f4-3e57-4467-8d2c-50474126200b",
                        "7450aa0c-685f-4433-9245-a8bf1c7d40b3",
                        "b725ccfa-f350-4d50-afad-dc9a18d68d78",
                        "a4578507-90b2-41c8-9918-ecf45e61c540",
                        "8449656d-9093-4cd4-8f1e-a9ce9fcafedf",
                        "96f95945-a009-495d-a81c-885912998854",
                        "d2ac6fb7-5ee1-4174-98c6-9e6bb79081dd",
                        "87f7f262-c304-45fb-965a-4c6cef6b2e27",
                        "4adf74f0-2c80-4cc7-b772-dc3d49e2632c",
                        "24bc8ce6-ef5c-47bf-b03c-03d28a9aa44d",
                        "8a98f32b-83cb-4af8-b513-dd34bdb63807",
                        "83c87cc0-e31e-4810-a83e-6e57b006c02a",
                        "cc63cdb9-1f6b-4511-bc43-b598e6b13787",
                        "638d383a-bce4-4fb9-8b7b-bff85e87a364",
                        "aee21288-8f82-410d-b28e-0ff5d9b7f5d7",
                        "01255be4-73c6-41b1-81c4-64b0de2852cf",
                        "5f0cbece-ad6e-4cf8-b2d9-d10dc372878b",
                        "46ec5fd1-5d1d-4837-855d-cd5da948544b",
                        "31bc4ab4-3c5b-43df-9d93-be34763b40c4",
                        "7a87ba80-8c52-4763-8e4c-38c09e384c49",
                        "c5a29a56-fbbc-4395-995f-96d382387c79",
                        "1102328c-6f15-4aa1-b4ce-c0ee823cbcb6",
                        "9e64093a-4f73-479f-8e9e-323b84cd6039",
                        "65e887fd-f949-43ad-ac45-aa852de874a6",
                        "3ab0ddd6-48e0-48f8-bf32-56b0d4178600",
                        "a435ac50-5d92-49be-84b6-9e5e8e4e9248",
                        "26486712-b403-4063-9cd2-9f2961a08de2",
                        "99bdd20f-6151-4666-bc0b-444037b41712",
                        "66f5d39b-be2b-46a9-9de4-abe08377de8d",
                        "7a91fbe1-2074-457e-b6dd-454ee8bf8d74",
                        "dbdbccf7-8888-4b04-8310-c25edb43a8c1",
                        "4219936d-8f63-4ae1-8bf7-ab5b65496c2c",
                        "f8a4a5a8-7bac-475a-a546-f3caecb765b2",
                        "42ea82db-272e-4a28-8347-ce6a1c4fa4ff",
                        "a75fcb9e-2ecf-49f9-b573-8309a6ec7331",
                        "326f604f-ba68-4a06-94c1-ca3a9a46d12a",
                        "fe0a665a-d5a0-49d3-acc6-23ba157bc4b8",
                        "2131006e-556b-4116-b588-caf647a5c799",
                        "61c87e69-932b-4635-896c-3ef4f38ac2c4",
                        "0dd7f237-453a-468e-af14-601450cc9ddb",
                        "55f27fa2-cbdd-4315-949f-54b77c477870",
                        "88b4f5c0-0dae-4b37-8103-9b6f2330d0bc",
                        "52f600bd-b66c-4e92-94b0-b88c16893828",
                        "4b328bfd-0256-4f8a-9a8b-51aed5a9079c",
                        "7f6b6177-06e7-4c38-b610-82bd899232c7",
                        "25890f8a-5346-47bc-b0b3-69fc6d9e4812",
                        "9cdc9da1-e6c0-425b-8d4b-663cd2bc351f"
                    ],
                    "shoppingContext": "CURBSIDE_PICKUP",
                    "sortBy": "SCORE",
                    "sortDirection": "DESC",
                    "storeId": 92,
                    "timeSlotStartTime": None,
                }
            },
            "extensions": {
                "persistedQuery": {
                    "version": 1,
                    "sha256Hash": "2ed81ec090540231b28f8e6853767c8f03a0099c0112f2173f69cb06b8d2dd29"
                }
            }
        }
        self.session = requests.Session()
        self.generate_session()

    def generate_session(self):
        """Copy cookies from a headless Chrome visit into ``self.session``.

        Opens https://www.heb.com/ in a headless browser and sets every
        cookie the browser reports on the existing ``requests`` session.

        Raises:
            KeyError: if the ``CHROMEDRIVER_PATH`` environment variable is
                not set.
        """
        options = ChromeOptions()
        options.add_argument("--headless")
        options.add_argument("--disable-gpu")
        options.add_argument("--no-sandbox")
        # NOTE(review): the ``executable_path`` keyword is kept as written;
        # confirm it matches the installed undetected_chromedriver version.
        driver = Chrome(executable_path=os.environ['CHROMEDRIVER_PATH'], options=options)
        try:
            driver.get("https://www.heb.com/")
            for cookie in driver.get_cookies():
                self.session.cookies.set(cookie['name'], cookie['value'])
        finally:
            # Always shut the browser down, even when navigation fails, so a
            # failed refresh does not leak a Chrome process.
            driver.quit()

    def get_response(self, url):
        """Return the session's GET response for ``url``."""
        return self.session.get(url)

    def parse_response(self, response, url):
        """Extract product fields from a product page.

        Args:
            response: object whose ``text`` attribute holds the page HTML.
            url: the product page URL; its trailing digits are the id.

        Returns:
            ``[name, product_id, ingredients]``. Any field that cannot be
            found is the empty string.
        """
        soup = BeautifulSoup(response.text, 'html.parser')

        heading = soup.find('h1')
        name = heading.text if heading is not None else ''

        # The last run of digits in the URL path is the product id.
        match = re.search(r"/(\d+)$", url)
        product_id = match.group(1) if match else ''
        print(url)

        # The ingredients text is the first <span> following the
        # "Ingredients" <h4> heading.
        ingredients = ''
        ingredients_section = soup.find('h4', text='Ingredients')
        if ingredients_section:
            ingredients_span = ingredients_section.find_next('span')
            if ingredients_span:
                ingredients = ingredients_span.get_text()

        return [name, product_id, ingredients]

    def search_category_pages(self, category):
        """Return the number of search result pages for ``category``.

        The count is read from the last pagination link on the HTML search
        page. The request is retried once.

        Returns:
            The page count as an int, or 0 when the request fails twice,
            the page has no pagination links, or the link text is not a
            number.
        """
        url = f"https://www.heb.com:443/search/?q={category}"
        try:
            response = self.session.get(url)
        except Exception:
            try:
                response = self.session.get(url)
            except Exception:
                return 0

        soup = BeautifulSoup(response.text, 'html.parser')
        page_links = soup.find_all('a', {'data-qe-id': 'paginationListNum'})
        if not page_links:
            # Without this guard the [-1] lookup raised IndexError and
            # aborted the whole run.
            print(f'No pagination links found for {category}')
            return 0

        total_pages = page_links[-1].text
        print(total_pages)
        try:
            return int(total_pages)
        except ValueError:
            return 0

    def _fetch_search_records(self, endpoint):
        """POST ``self.query`` to ``endpoint`` and return the result records."""
        response = self.session.post(endpoint, json=self.query)
        return response.json()['data']['productSearchV2']['records']

    def get_urls_of_category_from_page(self, category, pages):
        """Scrape and save every product on pages 1..``pages`` of a category.

        For each page the GraphQL search is queried (retried once after
        refreshing the session cookies), product page URLs are built from
        the returned records and handed to
        ``get_all_products_from_category_page`` (also retried once). Pages
        whose search request fails twice are skipped.

        Args:
            category: search term placed in the query's ``query`` field.
            pages: number of result pages to walk.
        """
        endpoint = "https://www.heb.com:443/graphql"
        params = self.query['variables']['params']
        for page in range(1, pages + 1):
            # NOTE(review): the template's initial pageNumber is 0 but this
            # loop starts at 1 - confirm the API's page numbering.
            params['pageNumber'] = page
            params['query'] = category
            try:
                products = self._fetch_search_records(endpoint)
            except Exception:
                try:
                    # Cookies may have expired; refresh them and retry once.
                    self.generate_session()
                    products = self._fetch_search_records(endpoint)
                except Exception:
                    continue

            urls = [
                f"{self.base_url}{product['product']['productPageURL']}"
                for product in products
            ]
            print(urls)
            try:
                self.get_all_products_from_category_page(urls)
            except Exception:
                # The retry restarts from the first URL of the page.
                try:
                    self.get_all_products_from_category_page(urls)
                except Exception:
                    print('Error')

    def get_all_products_from_category_page(self, urls):
        """Fetch, parse and save each product page in ``urls``.

        Returns:
            True once every URL has been processed. Any exception from
            fetching, parsing or saving propagates to the caller.
        """
        for url in urls:
            response = self.get_response(url)
            product = self.parse_response(response, url=url)
            print(product)
            save_product({
                'title': product[0],
                'ingredients': product[2],
                'product_id': product[1],
                'url': url,
                'store_name': 'Heb'
            })
        return True

    def save_product_to_csv(self, product):
        """Append ``product`` to ``heb.csv`` unless its id is already there.

        Args:
            product: row to write; ``product[1]`` is compared against the
                second column of the existing rows.

        Returns:
            False when the id is already present; otherwise the row is
            appended and None is returned.
        """
        with open('heb.csv', 'a+', encoding='utf-8', newline='') as file:
            # Append mode opens positioned at the end; rewind to read the
            # existing rows before deciding whether to write.
            file.seek(0)
            # Skip rows too short to hold an id (e.g. blank lines).
            product_ids = [row[1] for row in csv.reader(file) if len(row) > 1]
            print(product_ids)
            if product[1] in product_ids:
                return False
            csv.writer(file).writerow(product)

    def run(self):
        """Scrape every configured category in turn.

        The session cookies are refreshed after each category.

        Returns:
            True after all categories have been processed.
        """
        for category in self.categories:
            pages = self.search_category_pages(category)
            self.get_urls_of_category_from_page(category, pages)
            self.generate_session()
            print(f'Finished {category}')
        return True


if __name__ == "__main__":
    scraper = HebScraper()
    response = scraper.run()