# Spaces: mumer119131 / shops-scraper
# Status: Runtime error
# File size: 13,569 Bytes

import requests
from bs4 import BeautifulSoup
import re
import csv
from .DatabaseDataSaver import save_product
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from django.conf import settings
# from selenium.webdriver.chrome.options import Options as ChromeOptions
from undetected_chromedriver import Chrome, ChromeOptions
import os

class HebScraper:
    def __init__(self):
        """Set up static request data and an HTTP session seeded with cookies.

        Assigns, in order, ``headers``, ``base_url``, ``categories``,
        ``query`` (the JSON body POSTed to the GraphQL endpoint by
        ``get_urls_of_category_from_page``) and ``session``, then calls
        ``generate_session()``. Because that call drives a browser to the
        site, constructing a scraper performs network I/O.
        """
        # NOTE(review): within this class these headers are only stored; they
        # are never attached to ``self.session`` -- confirm whether intended.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) ',
        }
        self.base_url = 'https://www.heb.com'
        # Search terms iterated by ``run()``.
        self.categories = ['health', 'beauty', 'personal care']

        # Opaque identifiers sent verbatim with every search request.
        # NOTE(review): their meaning is not visible in this file.
        segment_ids = [
            "a5da00d0-7087-4655-93e0-b93ec0fc4757",
            "adb33a3c-512e-4d76-8c24-a76d0efb8656",
            "251287aa-b1a1-4bc4-8652-3c4e74a5b756",
            "0df3ce9c-8f14-4d80-bf4c-3b8b66feab37",
            "29e75183-0916-4337-a605-18c34add93d9",
            "81484884-8948-41aa-a6f6-fed59467ceb9",
            "0a8a667b-13e3-444e-999b-02fcd87026aa",
            "8a7194d0-4643-41e3-8775-f56df17a0cb2",
            "8809067c-8fe2-4151-b26e-e67edf814a57",
            "0ad399c5-4ed6-4f7c-ba85-1c0ccb5f1b8c",
            "8bc9ba87-94b1-45de-8737-8a7ed18e94ca",
            "2b44887a-8e32-40c1-aced-3c7fec8790da",
            "c0c96fe5-b029-48cf-beec-42720c4ac40b",
            "7ca58353-5733-485f-aa37-22f4028e2e2a",
            "68c1099e-bb09-4ac4-831f-d2b53948abd2",
            "502dbe33-362e-4caf-a30e-eed0c5db0d15",
            "61e62fb2-ec7b-431d-a6b0-e2e9e4276fbf",
            "c662ef55-68e2-4255-a077-3077fcc52376",
            "211d5eb4-17de-44da-9d6a-055120c8d9d5",
            "354a6a8a-4034-4a8d-a50f-2c9d2bd7f564",
            "54644db9-3e20-458a-b785-a3fb819bf701",
            "e579e6cd-27b0-4b8d-995b-a5a7b5ef59ad",
            "d818c6a2-7494-4a09-9409-4cef916c8303",
            "7b6ecdf6-0461-407d-9750-e2035cd50834",
            "f88be4a7-fb46-41ec-a39e-6371585a3701",
            "2b62d388-480e-4bda-8ae3-c2db70aaa731",
            "77a893e5-086f-4bdf-93dd-b0dda5accbb4",
            "d8b0cd11-4230-4cab-8696-d55630f034df",
            "caf33fa8-9d41-431a-a64b-2fb3499c48e4",
            "3158e985-edf6-4a9c-9d01-8bade1cffd04",
            "14bba8bc-b4c1-48b6-af10-8acca2db82ce",
            "f9ea635d-4081-4a7f-821d-af8eda75f559",
            "0a570dbd-f905-4261-946f-a1e6b3e9a387",
            "4ac8cc6c-a11b-4803-aee0-47bdc1dc0834",
            "61fb168b-2516-4bb9-97c4-804e8869eb8e",
            "37b60cc5-9238-4653-a08f-fb617a878ef7",
            "c2c1676f-a5fd-4c93-a900-76506a656b4c",
            "3b8c0dbe-fadb-41e9-ad6a-40c7c5772d60",
            "410cfec0-e434-4b0b-9cc3-e5cbcb09a0fe",
            "881bf27d-a875-4b05-8732-87be803eeaa5",
            "fc0bf854-631d-4322-bc2c-809992801e14",
            "e3853f48-c3ec-41d1-a8c9-32eb188cf9ce",
            "a968933f-9e39-4321-a6a1-b79caf397736",
            "a18f9694-f14d-41d7-9da7-68934bb3d229",
            "58ea83c3-00fb-49d0-8710-dddd29e15088",
            "6d47d454-edc8-44e2-99e6-d4c65e0871bc",
            "37930f56-3086-44b6-bc44-795e0c78e390",
            "53ccb672-9c35-4516-89ca-d48414818d40",
            "c1b959a6-b285-4825-8e26-387b871e89d9",
            "07e10eba-f057-4789-8949-bb5ffa800d51",
            "77e2aeed-e5e7-4e9d-bd81-cf456d24158c",
            "71b64f9a-4a01-4bdf-81fd-757096f0e7ce",
            "af84966d-abec-4a11-94bd-632a651d1d51",
            "2ba4117a-24ee-458e-84a8-3063d5b5c2c2",
            "2f2c33b7-501d-41bb-9401-89019a13fd38",
            "eedbf364-1ff9-463a-bc1d-7ac0ae015f94",
            "ec8573bb-e6ad-411f-a33a-14addf2d2aa5",
            "e904a3e6-5273-4fcd-809d-dc5b1bf9b2e0",
            "3f8c11f4-3e57-4467-8d2c-50474126200b",
            "7450aa0c-685f-4433-9245-a8bf1c7d40b3",
            "b725ccfa-f350-4d50-afad-dc9a18d68d78",
            "a4578507-90b2-41c8-9918-ecf45e61c540",
            "8449656d-9093-4cd4-8f1e-a9ce9fcafedf",
            "96f95945-a009-495d-a81c-885912998854",
            "d2ac6fb7-5ee1-4174-98c6-9e6bb79081dd",
            "87f7f262-c304-45fb-965a-4c6cef6b2e27",
            "4adf74f0-2c80-4cc7-b772-dc3d49e2632c",
            "24bc8ce6-ef5c-47bf-b03c-03d28a9aa44d",
            "8a98f32b-83cb-4af8-b513-dd34bdb63807",
            "83c87cc0-e31e-4810-a83e-6e57b006c02a",
            "cc63cdb9-1f6b-4511-bc43-b598e6b13787",
            "638d383a-bce4-4fb9-8b7b-bff85e87a364",
            "aee21288-8f82-410d-b28e-0ff5d9b7f5d7",
            "01255be4-73c6-41b1-81c4-64b0de2852cf",
            "5f0cbece-ad6e-4cf8-b2d9-d10dc372878b",
            "46ec5fd1-5d1d-4837-855d-cd5da948544b",
            "31bc4ab4-3c5b-43df-9d93-be34763b40c4",
            "7a87ba80-8c52-4763-8e4c-38c09e384c49",
            "c5a29a56-fbbc-4395-995f-96d382387c79",
            "1102328c-6f15-4aa1-b4ce-c0ee823cbcb6",
            "9e64093a-4f73-479f-8e9e-323b84cd6039",
            "65e887fd-f949-43ad-ac45-aa852de874a6",
            "3ab0ddd6-48e0-48f8-bf32-56b0d4178600",
            "a435ac50-5d92-49be-84b6-9e5e8e4e9248",
            "26486712-b403-4063-9cd2-9f2961a08de2",
            "99bdd20f-6151-4666-bc0b-444037b41712",
            "66f5d39b-be2b-46a9-9de4-abe08377de8d",
            "7a91fbe1-2074-457e-b6dd-454ee8bf8d74",
            "dbdbccf7-8888-4b04-8310-c25edb43a8c1",
            "4219936d-8f63-4ae1-8bf7-ab5b65496c2c",
            "f8a4a5a8-7bac-475a-a546-f3caecb765b2",
            "42ea82db-272e-4a28-8347-ce6a1c4fa4ff",
            "a75fcb9e-2ecf-49f9-b573-8309a6ec7331",
            "326f604f-ba68-4a06-94c1-ca3a9a46d12a",
            "fe0a665a-d5a0-49d3-acc6-23ba157bc4b8",
            "2131006e-556b-4116-b588-caf647a5c799",
            "61c87e69-932b-4635-896c-3ef4f38ac2c4",
            "0dd7f237-453a-468e-af14-601450cc9ddb",
            "55f27fa2-cbdd-4315-949f-54b77c477870",
            "88b4f5c0-0dae-4b37-8103-9b6f2330d0bc",
            "52f600bd-b66c-4e92-94b0-b88c16893828",
            "4b328bfd-0256-4f8a-9a8b-51aed5a9079c",
            "7f6b6177-06e7-4c38-b610-82bd899232c7",
            "25890f8a-5346-47bc-b0b3-69fc6d9e4812",
            "9cdc9da1-e6c0-425b-8d4b-663cd2bc351f",
        ]

        # "pageNumber" and "query" are placeholders here; they are overwritten
        # for every request in get_urls_of_category_from_page().
        search_params = {
            "addressAllowAlcohol": False,
            "doNotSuggestPhrase": False,
            "ignoreRules": False,
            "ignoreSynonyms": False,
            "includeFullCategoryHierarchy": False,
            "pageNumber": 0,
            "pageSize": 60,
            "query": "health",
            "rootRequestId": None,
            "segmentIds": segment_ids,
            "shoppingContext": "CURBSIDE_PICKUP",
            "sortBy": "SCORE",
            "sortDirection": "DESC",
            "storeId": 92,
            "timeSlotStartTime": None,
        }
        persisted_query = {
            "version": 1,
            "sha256Hash": "2ed81ec090540231b28f8e6853767c8f03a0099c0112f2173f69cb06b8d2dd29",
        }
        self.query = {
            "operationName": "InitialSearchProductsV2",
            "variables": {"params": search_params},
            "extensions": {"persistedQuery": persisted_query},
        }

        self.session = requests.Session()
        # Seed the session with cookies from a real browser visit.
        self.generate_session()

    def generate_session(self):
        """Copy cookies from a headless Chrome visit into ``self.session``.

        Starts Chrome using the driver binary named by the
        ``CHROMEDRIVER_PATH`` environment variable, loads the site's home
        page, and stores each browser cookie's name and value on the
        ``requests`` session. The browser is always shut down, even when
        loading the page or reading cookies fails.

        Raises:
            KeyError: If ``CHROMEDRIVER_PATH`` is not set in the environment.
        """
        options = ChromeOptions()
        for flag in ("--headless", "--disable-gpu", "--no-sandbox"):
            options.add_argument(flag)
        driver = Chrome(executable_path=os.environ['CHROMEDRIVER_PATH'], options=options)
        try:
            driver.get("https://www.heb.com/")
            cookies = driver.get_cookies()
        finally:
            # Without this, an exception above would leave the Chrome
            # process running.
            driver.quit()
        for cookie in cookies:
            self.session.cookies.set(cookie['name'], cookie['value'])


    def get_response(self, url):
        response = self.session.get(url)
        return response
    
    def parse_response(self, response, url):
        """Extract the product name, id and ingredients from a product page.

        Args:
            response: Object whose ``text`` attribute holds the page HTML.
            url: Product page URL; the run of digits after its final ``/``
                is used as the product id.

        Returns:
            A list ``[name, product_id, ingredients]``. Any field that
            cannot be found is returned as an empty string.
        """
        soup = BeautifulSoup(response.text, 'html.parser')

        # find() returns None when the page has no h1 element; check for
        # that explicitly instead of hiding every possible error.
        heading = soup.find('h1')
        name = heading.text if heading is not None else ''

        # The last digits of the URL are the product id.
        match = re.search(r"/(\d+)$", url)
        product_id = match.group(1) if match else ''
        print(url)

        # The ingredients text lives in the first span that follows the
        # "Ingredients" h4 heading.
        ingredients = ''
        ingredients_section = soup.find('h4', text='Ingredients')
        if ingredients_section:
            ingredients_span = ingredients_section.find_next('span')
            if ingredients_span:
                ingredients = ingredients_span.get_text()
        return [name, product_id, ingredients]
    
    def search_category_pages(self, category):
        """Return how many result pages the site search shows for ``category``.

        The search page is requested up to two times. The page count is
        read from the text of the last link carrying
        ``data-qe-id="paginationListNum"``.

        Args:
            category: Search term placed in the ``q`` query parameter.

        Returns:
            The page count as an ``int``; ``0`` when both requests fail;
            ``1`` when the page contains no pagination links.

        Raises:
            ValueError: If the last pagination link's text is not an integer.
        """
        url = f"https://www.heb.com:443/search/?q={category}"
        response = None
        for _attempt in range(2):
            try:
                response = self.session.get(url)
                break
            except requests.RequestException:
                continue
        if response is None:
            return 0

        soup = BeautifulSoup(response.text, 'html.parser')
        page_links = soup.find_all('a', {'data-qe-id': 'paginationListNum'})
        if not page_links:
            # Previously this raised IndexError and aborted the whole run.
            # NOTE(review): assumes a page without pagination links is a
            # single page of results -- confirm against the live site.
            return 1
        total_pages = page_links[-1].text
        print(total_pages)
        return int(total_pages)
    
    def get_urls_of_category_from_page(self, category, pages):
        all_urls = []
        for page in range(1, pages + 1):
            self.query['variables']['params']['pageNumber'] = page
            self.query['variables']['params']['query'] = category
            # burp0_url = f"https://www.heb.com:443/search/?q={category}&pageNumber={page}"
            url = "https://www.heb.com:443/graphql"
            try:
                response = self.session.post(url, json=self.query)
                products = response.json()['data']['productSearchV2']['records']  
            except:
                try:
                    self.generate_session()
                    response = self.session.post(url, json=self.query)
                    products = response.json()['data']['productSearchV2']['records']  
                except:
                    continue
            urls = []
            for product in products:
                url = f"{self.base_url}{product['product']['productPageURL']}"      
                urls.append(url)
            print(urls)
            try:
                self.get_all_products_from_category_page(urls)
            except:
                try:        
                    self.get_all_products_from_category_page(urls)
                except:
                    print('Error')
            
    def get_all_products_from_category_page(self, urls):
        """Fetch, parse and persist every product page in ``urls``.

        Each URL is downloaded with ``get_response``, parsed with
        ``parse_response`` and handed to ``save_product`` under the store
        name ``'Heb'``. Exceptions are not handled here; they propagate to
        the caller.

        Returns:
            ``True`` once every URL has been processed.
        """
        for product_url in urls:
            page = self.get_response(product_url)
            product = self.parse_response(page, url=product_url)
            print(product)
            # ``product`` is [name, product_id, ingredients].
            record = {
                'title': product[0],
                'ingredients': product[2],
                'product_id': product[1],
                'url': product_url,
                'store_name': 'Heb',
            }
            save_product(record)
        return True

    def save_product_to_csv(self, product):
        with open('heb.csv', 'a+', encoding='utf-8', newline='') as file:
            #check if the product is already in the csv
            file.seek(0) # move the file pointer to the beginning of the file
            reader = csv.reader(file)
            product_ids = [row[1] for row in reader]
            print(product_ids)
            if product[1] in product_ids:
                return False
            
            writer = csv.writer(file)
            writer.writerow(product)
            file.close()
    
    def run(self):
        for category in self.categories:
            pages = self.search_category_pages(category)
            self.get_urls_of_category_from_page(category, pages)
            self.generate_session()
            print(f'Finished {category}')
        return True
if __name__ == "__main__":
    # Script entry point: build the scraper (the constructor already opens a
    # browser to collect cookies) and walk through every category.
    response = HebScraper().run()