# Source: shops-scraper/scraper/utils/SephoraScraper.py
# Last change: "Update scraper/utils/SephoraScraper.py" by mumer119131 (3acf0f3)
import requests
from bs4 import BeautifulSoup
import re
import csv
from .DatabaseDataSaver import save_product
class SephoraScraper:
    """Collect product name, id and ingredient text from Sephora's JSON endpoints.

    The scraper walks every category in ``self.categories``, pages through the
    category listing endpoint to gather product URLs, fetches each product's
    detail JSON and hands the extracted fields to ``save_product``.
    """

    # Number of products requested per category listing page.
    _PAGE_SIZE = 60

    # Seconds to wait for any single HTTP request before giving up; without a
    # timeout ``requests`` may block forever on a stalled connection.
    _REQUEST_TIMEOUT = 30

    # Product ids look like ``-P482303`` at the end of the URL path. The id may
    # be followed by a query string, a fragment, or nothing at all.
    _PRODUCT_ID_RE = re.compile(r'-(P\d+)(?:[?#]|$)')

    # Markup fragments stripped from the ingredient description.
    _INGREDIENT_MARKUP = ('<b>', '</b>', '<br>', '</p>', '<p>', '<br/>')

    _PRODUCT_URL = (
        "https://www.sephora.com:443/api2/catalog/products/{product_id}"
        "?addCurrentSkuToProductChildSkus=true&includeRegionsMap=true"
        "&showContent=true&includeConfigurableSku=true&countryCode=US"
        "&removePersonalizedData=true&includeReviewFilters=true"
        "&includeReviewImages=true&sentiments=6"
    )

    _CATEGORY_URL = (
        "https://www.sephora.com:443/api/v2/catalog/categories/{category}/seo"
        "?targetSearchEngine=NLP&currentPage={page}&pageSize={page_size}"
        "&content=true&includeRegionsMap=true&headers=%5Bobject%20Object%5D"
        "&pickupRampup=true&sddRampup=true&loc=en-US&ch=rwd"
    )

    # Browser-like headers sent with product detail requests.
    _PRODUCT_HEADERS = {
        "Sec-Ch-Ua": '"Chromium";v="117", "Not;A=Brand";v="8"',
        "X-Ufe-Request": "true",
        "Sec-Ch-Ua-Mobile": "?0",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.5938.132 Safari/537.36",
        "X-Dtpc": "5$505172501_268h23vBLLCMFBNGOLFAPFGHKUDBVTGKMEPJULD-0e0",
        "X-Dtreferer": "https://www.sephora.com/shop/makeup-cosmetics?currentPage=2",
        "Exclude_personalized_content": "true",
        "X-Requested-Source": "rwd",
        "Sec-Ch-Ua-Platform": '"Windows"',
        "Accept": "*/*",
        "Sec-Fetch-Site": "same-origin",
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Dest": "empty",
        "Referer": "https://www.sephora.com/product/sephora-collection-total-coverage-blending-sponge-set-60-plant-based-P482303?skuId=2497220&icid2=products%20grid:p482303:product",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-US,en;q=0.9",
    }

    # Browser-like headers sent with category listing requests.
    _CATEGORY_HEADERS = {
        "Sec-Ch-Ua": '"Chromium";v="117", "Not;A=Brand";v="8"',
        "Sec-Ch-Ua-Mobile": "?0",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.5938.132 Safari/537.36",
        "X-Dtpc": "5$505172501_268h16vBLLCMFBNGOLFAPFGHKUDBVTGKMEPJULD-0e0",
        "X-Dtreferer": "https://www.sephora.com/shop/makeup-cosmetics",
        "X-Timestamp": "1697306065014",
        "Exclude_personalized_content": "true",
        "X-Requested-Source": "rwd",
        "Sec-Ch-Ua-Platform": '"Windows"',
        "Accept": "*/*",
        "Sec-Fetch-Site": "same-origin",
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Dest": "empty",
        "Referer": "https://www.sephora.com/shop/makeup-cosmetics?currentPage=2",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-US,en;q=0.9",
    }

    def __init__(self):
        # Not used by the request methods below (they send the class-level
        # header sets); kept because it is part of the instance's public state.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) ',
        }
        # Category slugs visited by run().
        self.categories = [
            "makeup-cosmetics", "skincare", "hair-products", "fragrance",
            "makeup-tools", "bath-body", "travel-size-toiletries", "gifts",
        ]

    @classmethod
    def _extract_product_id(cls, url):
        """Return the ``P<digits>`` product id embedded in *url*, or None."""
        match = cls._PRODUCT_ID_RE.search(url)
        return match.group(1) if match else None

    @classmethod
    def _page_count(cls, total_results):
        """Return how many listing pages are needed to cover *total_results*."""
        # Ceiling division: an exact multiple of the page size must not
        # produce an extra, empty trailing page.
        return (total_results + cls._PAGE_SIZE - 1) // cls._PAGE_SIZE

    def _fetch_category_page(self, category, page):
        """Request one listing page of *category* and return the decoded JSON.

        Network and JSON-decoding errors propagate to the caller.
        """
        listing_url = self._CATEGORY_URL.format(
            category=category, page=page, page_size=self._PAGE_SIZE
        )
        response = requests.get(
            listing_url,
            headers=self._CATEGORY_HEADERS,
            timeout=self._REQUEST_TIMEOUT,
        )
        return response.json()

    def get_response(self, url):
        """Fetch the product detail JSON for a product page URL.

        Args:
            url: Product page URL containing an id such as ``-P482303``.

        Returns:
            The decoded JSON payload, or ``False`` when the URL carries no
            product id, the request fails, or the body is not valid JSON.
        """
        product_id = self._extract_product_id(url)
        if product_id is None:
            return False
        api_url = self._PRODUCT_URL.format(product_id=product_id)
        try:
            response = requests.get(
                api_url,
                headers=self._PRODUCT_HEADERS,
                timeout=self._REQUEST_TIMEOUT,
            )
            return response.json()
        except (requests.RequestException, ValueError) as e:
            # ValueError covers a non-JSON body (e.g. an HTML block page).
            print(e)
            return False

    def parse_response(self, response, url):
        """Extract ``[name, product_id, ingredients]`` from a product payload.

        Args:
            response: Decoded product JSON as returned by ``get_response``.
            url: Product URL; accepted for interface compatibility, unused.

        Returns:
            A three-item list, or ``False`` when the payload lacks the product
            name or id (including when *response* is not a mapping at all).
            ``ingredients`` is an empty string when no description is present.
        """
        try:
            details = response['productDetails']
            name = details['displayName']
            product_id = details['productId']
        except (KeyError, TypeError, IndexError):
            return False

        try:
            ingredients = response['currentSku']['ingredientDesc'].replace('\n', ' ')
        except (KeyError, TypeError, AttributeError):
            # Missing SKU / description, or a null description.
            ingredients = ''
        else:
            for fragment in self._INGREDIENT_MARKUP:
                ingredients = ingredients.replace(fragment, '')
        return [name, product_id, ingredients]

    def search_category_total_results(self, category):
        """Return the ``totalProducts`` count reported for *category*.

        Raises whatever the HTTP request or JSON decoding raises, and
        ``KeyError`` when the payload has no ``totalProducts`` field.
        """
        # The count is read from page 2, as this scraper always has.
        # NOTE(review): presumably the total is page-independent - confirm
        # before switching this probe to page 1.
        data = self._fetch_category_page(category, 2)
        return data['totalProducts']

    def get_urls_of_category_from_all_results(self, category, total_results):
        """Page through *category* and process every product URL found.

        Each listing page's product URLs are passed straight to
        ``get_all_products_from_category_page``, so products are saved page by
        page rather than after the whole category has been listed.

        Args:
            category: Category slug, e.g. ``"skincare"``.
            total_results: Product count from ``search_category_total_results``.
        """
        for page in range(1, self._page_count(total_results) + 1):
            data = self._fetch_category_page(category, page)
            urls = [
                f'https://www.sephora.com{product["targetUrl"]}'
                for product in data.get('products') or []
            ]
            self.get_all_products_from_category_page(urls)

    def get_all_products_from_category_page(self, urls):
        """Fetch, parse and save every product in *urls*.

        Products whose detail request or parsing fails are skipped.

        Returns:
            Always ``True``.
        """
        for url in urls:
            response = self.get_response(url)
            if not response:
                continue
            product = self.parse_response(response, url=url)
            if not product:
                continue
            save_product({
                'title': product[0],
                'product_id': product[1],
                'ingredients': product[2],
                'url': url,
                'store_name': 'Sephora'
            })
        return True

    def save_product_to_csv(self, product, path='sep.csv'):
        """Append *product* to a CSV file unless its id is already present.

        Args:
            product: Row to write; ``product[1]`` is the product id used for
                duplicate detection.
            path: CSV file to append to; created if it does not exist.

        Returns:
            ``True`` when the row was written, ``False`` when a row with the
            same id already exists.
        """
        try:
            with open(path, encoding='utf-8', newline='') as existing:
                # Blank or short rows carry no id and are ignored.
                known_ids = {row[1] for row in csv.reader(existing) if len(row) > 1}
        except FileNotFoundError:
            known_ids = set()

        # Ids read back from the file are always strings.
        if str(product[1]) in known_ids:
            return False

        with open(path, 'a', encoding='utf-8', newline='') as out:
            csv.writer(out).writerow(product)
        return True

    def run(self):
        """Scrape every configured category in order.

        Returns:
            ``True`` once all categories have been processed. Errors raised
            while listing a category propagate and stop the run.
        """
        for category in self.categories:
            total_results = self.search_category_total_results(category)
            self.get_urls_of_category_from_all_results(category, total_results)
            print(f'Finished {category}')
        return True