# Source: shops-scraper/scraper/utils/HebScraper.py
# Last commit: "Update scraper/utils/HebScraper.py" (c89416a), author mumer119131
import requests
from bs4 import BeautifulSoup
import re
import csv
from .DatabaseDataSaver import save_product
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from django.conf import settings
# from selenium.webdriver.chrome.options import Options as ChromeOptions
from undetected_chromedriver import Chrome, ChromeOptions
import os
class HebScraper:
    """Scrape product names, IDs and ingredient lists from heb.com.

    The scraper borrows cookies from a headless Chrome visit to the site,
    pages through the site's GraphQL search endpoint for each configured
    category, fetches every product page it finds and hands the extracted
    fields to ``save_product``.
    """

    # Seconds to wait on a single HTTP request. Without a timeout, ``requests``
    # blocks indefinitely on a stalled connection and the whole crawl hangs.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        """Build the search payload and a cookie-primed ``requests`` session.

        Constructing an instance calls ``generate_session``, which launches a
        headless Chrome, so the ``CHROMEDRIVER_PATH`` environment variable
        must be set.
        """
        # NOTE(review): these headers are stored but never attached to
        # ``self.session`` anywhere in this class - confirm whether that is
        # intentional before wiring them in.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) ',
        }
        self.base_url = 'https://www.heb.com'
        # Search terms crawled by ``run``.
        self.categories = [
            'health', 'beauty', 'personal care'
        ]
        # Persisted-query payload for the GraphQL search endpoint. ``query``
        # and ``pageNumber`` are overwritten per request by
        # ``get_urls_of_category_from_page``.
        self.query = {
            "operationName": "InitialSearchProductsV2",
            "variables": {
                "params": {
                    "addressAllowAlcohol": False,
                    "doNotSuggestPhrase": False,
                    "ignoreRules": False,
                    "ignoreSynonyms": False,
                    "includeFullCategoryHierarchy": False,
                    "pageNumber": 0,
                    "pageSize": 60,
                    "query": "health",
                    "rootRequestId": None,
                    "segmentIds": [
                        "a5da00d0-7087-4655-93e0-b93ec0fc4757",
                        "adb33a3c-512e-4d76-8c24-a76d0efb8656",
                        "251287aa-b1a1-4bc4-8652-3c4e74a5b756",
                        "0df3ce9c-8f14-4d80-bf4c-3b8b66feab37",
                        "29e75183-0916-4337-a605-18c34add93d9",
                        "81484884-8948-41aa-a6f6-fed59467ceb9",
                        "0a8a667b-13e3-444e-999b-02fcd87026aa",
                        "8a7194d0-4643-41e3-8775-f56df17a0cb2",
                        "8809067c-8fe2-4151-b26e-e67edf814a57",
                        "0ad399c5-4ed6-4f7c-ba85-1c0ccb5f1b8c",
                        "8bc9ba87-94b1-45de-8737-8a7ed18e94ca",
                        "2b44887a-8e32-40c1-aced-3c7fec8790da",
                        "c0c96fe5-b029-48cf-beec-42720c4ac40b",
                        "7ca58353-5733-485f-aa37-22f4028e2e2a",
                        "68c1099e-bb09-4ac4-831f-d2b53948abd2",
                        "502dbe33-362e-4caf-a30e-eed0c5db0d15",
                        "61e62fb2-ec7b-431d-a6b0-e2e9e4276fbf",
                        "c662ef55-68e2-4255-a077-3077fcc52376",
                        "211d5eb4-17de-44da-9d6a-055120c8d9d5",
                        "354a6a8a-4034-4a8d-a50f-2c9d2bd7f564",
                        "54644db9-3e20-458a-b785-a3fb819bf701",
                        "e579e6cd-27b0-4b8d-995b-a5a7b5ef59ad",
                        "d818c6a2-7494-4a09-9409-4cef916c8303",
                        "7b6ecdf6-0461-407d-9750-e2035cd50834",
                        "f88be4a7-fb46-41ec-a39e-6371585a3701",
                        "2b62d388-480e-4bda-8ae3-c2db70aaa731",
                        "77a893e5-086f-4bdf-93dd-b0dda5accbb4",
                        "d8b0cd11-4230-4cab-8696-d55630f034df",
                        "caf33fa8-9d41-431a-a64b-2fb3499c48e4",
                        "3158e985-edf6-4a9c-9d01-8bade1cffd04",
                        "14bba8bc-b4c1-48b6-af10-8acca2db82ce",
                        "f9ea635d-4081-4a7f-821d-af8eda75f559",
                        "0a570dbd-f905-4261-946f-a1e6b3e9a387",
                        "4ac8cc6c-a11b-4803-aee0-47bdc1dc0834",
                        "61fb168b-2516-4bb9-97c4-804e8869eb8e",
                        "37b60cc5-9238-4653-a08f-fb617a878ef7",
                        "c2c1676f-a5fd-4c93-a900-76506a656b4c",
                        "3b8c0dbe-fadb-41e9-ad6a-40c7c5772d60",
                        "410cfec0-e434-4b0b-9cc3-e5cbcb09a0fe",
                        "881bf27d-a875-4b05-8732-87be803eeaa5",
                        "fc0bf854-631d-4322-bc2c-809992801e14",
                        "e3853f48-c3ec-41d1-a8c9-32eb188cf9ce",
                        "a968933f-9e39-4321-a6a1-b79caf397736",
                        "a18f9694-f14d-41d7-9da7-68934bb3d229",
                        "58ea83c3-00fb-49d0-8710-dddd29e15088",
                        "6d47d454-edc8-44e2-99e6-d4c65e0871bc",
                        "37930f56-3086-44b6-bc44-795e0c78e390",
                        "53ccb672-9c35-4516-89ca-d48414818d40",
                        "c1b959a6-b285-4825-8e26-387b871e89d9",
                        "07e10eba-f057-4789-8949-bb5ffa800d51",
                        "77e2aeed-e5e7-4e9d-bd81-cf456d24158c",
                        "71b64f9a-4a01-4bdf-81fd-757096f0e7ce",
                        "af84966d-abec-4a11-94bd-632a651d1d51",
                        "2ba4117a-24ee-458e-84a8-3063d5b5c2c2",
                        "2f2c33b7-501d-41bb-9401-89019a13fd38",
                        "eedbf364-1ff9-463a-bc1d-7ac0ae015f94",
                        "ec8573bb-e6ad-411f-a33a-14addf2d2aa5",
                        "e904a3e6-5273-4fcd-809d-dc5b1bf9b2e0",
                        "3f8c11f4-3e57-4467-8d2c-50474126200b",
                        "7450aa0c-685f-4433-9245-a8bf1c7d40b3",
                        "b725ccfa-f350-4d50-afad-dc9a18d68d78",
                        "a4578507-90b2-41c8-9918-ecf45e61c540",
                        "8449656d-9093-4cd4-8f1e-a9ce9fcafedf",
                        "96f95945-a009-495d-a81c-885912998854",
                        "d2ac6fb7-5ee1-4174-98c6-9e6bb79081dd",
                        "87f7f262-c304-45fb-965a-4c6cef6b2e27",
                        "4adf74f0-2c80-4cc7-b772-dc3d49e2632c",
                        "24bc8ce6-ef5c-47bf-b03c-03d28a9aa44d",
                        "8a98f32b-83cb-4af8-b513-dd34bdb63807",
                        "83c87cc0-e31e-4810-a83e-6e57b006c02a",
                        "cc63cdb9-1f6b-4511-bc43-b598e6b13787",
                        "638d383a-bce4-4fb9-8b7b-bff85e87a364",
                        "aee21288-8f82-410d-b28e-0ff5d9b7f5d7",
                        "01255be4-73c6-41b1-81c4-64b0de2852cf",
                        "5f0cbece-ad6e-4cf8-b2d9-d10dc372878b",
                        "46ec5fd1-5d1d-4837-855d-cd5da948544b",
                        "31bc4ab4-3c5b-43df-9d93-be34763b40c4",
                        "7a87ba80-8c52-4763-8e4c-38c09e384c49",
                        "c5a29a56-fbbc-4395-995f-96d382387c79",
                        "1102328c-6f15-4aa1-b4ce-c0ee823cbcb6",
                        "9e64093a-4f73-479f-8e9e-323b84cd6039",
                        "65e887fd-f949-43ad-ac45-aa852de874a6",
                        "3ab0ddd6-48e0-48f8-bf32-56b0d4178600",
                        "a435ac50-5d92-49be-84b6-9e5e8e4e9248",
                        "26486712-b403-4063-9cd2-9f2961a08de2",
                        "99bdd20f-6151-4666-bc0b-444037b41712",
                        "66f5d39b-be2b-46a9-9de4-abe08377de8d",
                        "7a91fbe1-2074-457e-b6dd-454ee8bf8d74",
                        "dbdbccf7-8888-4b04-8310-c25edb43a8c1",
                        "4219936d-8f63-4ae1-8bf7-ab5b65496c2c",
                        "f8a4a5a8-7bac-475a-a546-f3caecb765b2",
                        "42ea82db-272e-4a28-8347-ce6a1c4fa4ff",
                        "a75fcb9e-2ecf-49f9-b573-8309a6ec7331",
                        "326f604f-ba68-4a06-94c1-ca3a9a46d12a",
                        "fe0a665a-d5a0-49d3-acc6-23ba157bc4b8",
                        "2131006e-556b-4116-b588-caf647a5c799",
                        "61c87e69-932b-4635-896c-3ef4f38ac2c4",
                        "0dd7f237-453a-468e-af14-601450cc9ddb",
                        "55f27fa2-cbdd-4315-949f-54b77c477870",
                        "88b4f5c0-0dae-4b37-8103-9b6f2330d0bc",
                        "52f600bd-b66c-4e92-94b0-b88c16893828",
                        "4b328bfd-0256-4f8a-9a8b-51aed5a9079c",
                        "7f6b6177-06e7-4c38-b610-82bd899232c7",
                        "25890f8a-5346-47bc-b0b3-69fc6d9e4812",
                        "9cdc9da1-e6c0-425b-8d4b-663cd2bc351f"
                    ],
                    "shoppingContext": "CURBSIDE_PICKUP",
                    "sortBy": "SCORE",
                    "sortDirection": "DESC",
                    "storeId": 92,
                    "timeSlotStartTime": None,
                }
            },
            "extensions": {
                "persistedQuery": {
                    "version": 1,
                    "sha256Hash": "2ed81ec090540231b28f8e6853767c8f03a0099c0112f2173f69cb06b8d2dd29"
                }
            }
        }
        self.session = requests.Session()
        self.generate_session()

    def generate_session(self):
        """Copy the cookies of a headless Chrome visit to heb.com into ``self.session``.

        Raises:
            KeyError: if the ``CHROMEDRIVER_PATH`` environment variable is unset.
        """
        options = ChromeOptions()
        options.add_argument("--headless")
        options.add_argument("--disable-gpu")
        options.add_argument("--no-sandbox")
        driver = Chrome(executable_path=os.environ['CHROMEDRIVER_PATH'], options=options)
        try:
            driver.get("https://www.heb.com/")
            cookies = driver.get_cookies()
        finally:
            # Shut the browser down even when navigation fails, otherwise
            # every failed attempt leaves a Chrome process behind.
            driver.quit()
        for cookie in cookies:
            self.session.cookies.set(cookie['name'], cookie['value'])

    def get_response(self, url):
        """GET ``url`` through the shared session and return the response."""
        return self.session.get(url, timeout=self.REQUEST_TIMEOUT)

    def parse_response(self, response, url):
        """Extract ``[name, product_id, ingredients]`` from a product page.

        Args:
            response: object whose ``.text`` holds the product page HTML.
            url: the product page URL; its trailing digits are the product ID.

        Returns:
            A three-item list. Any field that cannot be found is ``''``.
        """
        soup = BeautifulSoup(response.text, 'html.parser')

        heading = soup.find('h1')
        name = heading.text if heading is not None else ''

        # The last path segment of the URL, when numeric, is the product ID.
        match = re.search(r"/(\d+)$", url)
        product_id = match.group(1) if match else ''
        print(url)

        ingredients = ''
        ingredients_section = soup.find('h4', text='Ingredients')
        if ingredients_section:
            # ``find_next`` returns the first <span> after the heading in
            # document order (not necessarily a sibling).
            ingredients_span = ingredients_section.find_next('span')
            if ingredients_span:
                ingredients = ingredients_span.get_text()
        return [name, product_id, ingredients]

    def search_category_pages(self, category):
        """Return the number of result pages the site reports for ``category``.

        Returns:
            The page count as an int; ``0`` when the search page cannot be
            fetched (after one retry) or the last pagination label is not a
            number; ``1`` when the page shows no pagination links.
        """
        url = f"https://www.heb.com:443/search/?q={category}"
        response = None
        for _ in range(2):  # first attempt plus one retry
            try:
                response = self.session.get(url, timeout=self.REQUEST_TIMEOUT)
                break
            except requests.RequestException:
                continue
        if response is None:
            return 0

        soup = BeautifulSoup(response.text, 'html.parser')
        page_links = soup.find_all('a', {'data-qe-id': 'paginationListNum'})
        if not page_links:
            # Presumably a result set that fits on one page renders no
            # pagination links - TODO confirm against the live site.
            return 1
        total_pages = page_links[-1].text
        print(total_pages)
        try:
            return int(total_pages)
        except ValueError:
            return 0

    def _fetch_search_records(self, url):
        """POST the current ``self.query`` to ``url`` and return its product records."""
        response = self.session.post(url, json=self.query, timeout=self.REQUEST_TIMEOUT)
        return response.json()['data']['productSearchV2']['records']

    def get_urls_of_category_from_page(self, category, pages):
        """Scrape every product listed on result pages ``1..pages`` of ``category``.

        For each page the product URLs are read from the GraphQL search
        endpoint and passed to ``get_all_products_from_category_page``. All
        failures are best-effort: a page that cannot be fetched is skipped.

        Args:
            category: search term placed in the query payload.
            pages: number of result pages to walk.
        """
        api_url = "https://www.heb.com:443/graphql"
        params = self.query['variables']['params']
        # NOTE(review): the payload's initial pageNumber is 0 while this loop
        # starts at 1 - confirm whether the API is 0- or 1-indexed.
        for page in range(1, pages + 1):
            params['pageNumber'] = page
            params['query'] = category
            # ``except Exception`` (not a bare ``except``) keeps the crawl
            # best-effort while still letting Ctrl-C / SystemExit through.
            try:
                products = self._fetch_search_records(api_url)
            except Exception:
                try:
                    # Cookies may have expired; refresh them and retry once.
                    self.generate_session()
                    products = self._fetch_search_records(api_url)
                except Exception:
                    continue

            urls = []
            for product in products or []:
                try:
                    path = product['product']['productPageURL']
                except (KeyError, TypeError):
                    # Skip a malformed record instead of aborting the crawl.
                    continue
                urls.append(f"{self.base_url}{path}")
            print(urls)

            for attempt in range(2):  # first attempt plus one retry
                try:
                    self.get_all_products_from_category_page(urls)
                    break
                except Exception:
                    if attempt == 1:
                        print('Error')

    def get_all_products_from_category_page(self, urls):
        """Fetch, parse and persist each product page in ``urls``.

        Returns:
            ``True`` once every URL has been processed. Any exception from
            fetching, parsing or saving propagates to the caller.
        """
        for url in urls:
            response = self.get_response(url)
            product = self.parse_response(response, url=url)
            print(product)
            save_product({
                'title': product[0],
                'ingredients': product[2],
                'product_id': product[1],
                'url': url,
                'store_name': 'Heb'
            })
        return True

    def save_product_to_csv(self, product):
        """Append ``product`` to ``heb.csv`` unless its ID is already recorded.

        Args:
            product: sequence whose second item (index 1) is the product ID.

        Returns:
            ``True`` if the row was written, ``False`` if a row with the same
            product ID already exists.
        """
        try:
            with open('heb.csv', encoding='utf-8', newline='') as file:
                # Rows shorter than two columns (e.g. blank lines) carry no ID.
                known_ids = {row[1] for row in csv.reader(file) if len(row) > 1}
        except FileNotFoundError:
            known_ids = set()

        if product[1] in known_ids:
            return False
        with open('heb.csv', 'a', encoding='utf-8', newline='') as file:
            csv.writer(file).writerow(product)
        return True

    def run(self):
        """Crawl every configured category, refreshing cookies after each one.

        Returns:
            ``True`` when all categories have been processed.
        """
        for category in self.categories:
            pages = self.search_category_pages(category)
            self.get_urls_of_category_from_page(category, pages)
            self.generate_session()
            print(f'Finished {category}')
        return True
if __name__ == "__main__":
    # Script entry point: constructing the scraper opens a browser session,
    # then ``run`` crawls every configured category.
    scraper = HebScraper()
    response = scraper.run()