Spaces:
Runtime error
Runtime error
| import requests | |
| from bs4 import BeautifulSoup | |
| import re | |
| import csv | |
| from .DatabaseDataSaver import save_product | |
class SephoraScraper:
    """Scrape product names, ids and ingredient lists from Sephora's catalog API.

    ``run`` iterates over ``self.categories``; for each category it reads the
    total product count, pages through the category listing, fetches every
    product's detail JSON and passes the parsed record to ``save_product``.
    """

    # Seconds to wait on any single HTTP request before giving up, so that a
    # stalled connection cannot block the whole crawl indefinitely.
    REQUEST_TIMEOUT = 30

    # Number of products requested per category listing page.
    PAGE_SIZE = 60

    # Captures the product id (e.g. "P482303") that follows the last "-" of a
    # product URL slug; the id may be followed by a query string, a fragment,
    # or nothing at all.
    _PRODUCT_ID_RE = re.compile(r'-(P\d+)(?:[?#]|$)')

    # Markup fragments removed from the ingredient description, in order.
    _INGREDIENT_TAGS = ('<b>', '</b>', '<br>', '</p>', '<p>', '<br/>')

    _PRODUCT_URL = (
        "https://www.sephora.com:443/api2/catalog/products/{product_id}"
        "?addCurrentSkuToProductChildSkus=true&includeRegionsMap=true"
        "&showContent=true&includeConfigurableSku=true&countryCode=US"
        "&removePersonalizedData=true&includeReviewFilters=true"
        "&includeReviewImages=true&sentiments=6"
    )

    _CATEGORY_URL = (
        "https://www.sephora.com:443/api/v2/catalog/categories/{category}/seo"
        "?targetSearchEngine=NLP&currentPage={page}&pageSize={page_size}"
        "&content=true&includeRegionsMap=true&headers=%5Bobject%20Object%5D"
        "&pickupRampup=true&sddRampup=true&loc=en-US&ch=rwd"
    )

    # NOTE(review): "Accept-Encoding" advertises "br"; decoding such responses
    # presumably needs a brotli decoder available to requests -- confirm.
    _PRODUCT_HEADERS = {
        "Sec-Ch-Ua": '"Chromium";v="117", "Not;A=Brand";v="8"',
        "X-Ufe-Request": "true",
        "Sec-Ch-Ua-Mobile": "?0",
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/117.0.5938.132 Safari/537.36"
        ),
        "X-Dtpc": "5$505172501_268h23vBLLCMFBNGOLFAPFGHKUDBVTGKMEPJULD-0e0",
        "X-Dtreferer": "https://www.sephora.com/shop/makeup-cosmetics?currentPage=2",
        "Exclude_personalized_content": "true",
        "X-Requested-Source": "rwd",
        "Sec-Ch-Ua-Platform": '"Windows"',
        "Accept": "*/*",
        "Sec-Fetch-Site": "same-origin",
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Dest": "empty",
        "Referer": (
            "https://www.sephora.com/product/"
            "sephora-collection-total-coverage-blending-sponge-set-60-plant-based-P482303"
            "?skuId=2497220&icid2=products%20grid:p482303:product"
        ),
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-US,en;q=0.9",
    }

    _CATEGORY_HEADERS = {
        "Sec-Ch-Ua": '"Chromium";v="117", "Not;A=Brand";v="8"',
        "Sec-Ch-Ua-Mobile": "?0",
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/117.0.5938.132 Safari/537.36"
        ),
        "X-Dtpc": "5$505172501_268h16vBLLCMFBNGOLFAPFGHKUDBVTGKMEPJULD-0e0",
        "X-Dtreferer": "https://www.sephora.com/shop/makeup-cosmetics",
        "X-Timestamp": "1697306065014",
        "Exclude_personalized_content": "true",
        "X-Requested-Source": "rwd",
        "Sec-Ch-Ua-Platform": '"Windows"',
        "Accept": "*/*",
        "Sec-Fetch-Site": "same-origin",
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Dest": "empty",
        "Referer": "https://www.sephora.com/shop/makeup-cosmetics?currentPage=2",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-US,en;q=0.9",
    }

    def __init__(self):
        """Set the default headers and the list of category slugs to crawl."""
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) ',
        }
        self.categories = [
            "makeup-cosmetics",
            "skincare",
            "hair-products",
            "fragrance",
            "makeup-tools",
            "bath-body",
            "travel-size-toiletries",
            "gifts",
        ]

    @classmethod
    def _extract_product_id(cls, url):
        """Return the ``P<digits>`` id embedded in *url*, or None if absent."""
        match = cls._PRODUCT_ID_RE.search(url)
        return match.group(1) if match else None

    @staticmethod
    def _page_count(total_results, page_size):
        """Return how many pages of *page_size* are needed for *total_results*."""
        if total_results <= 0:
            return 0
        # Ceiling division: an exact multiple must not yield an extra page.
        return -(-total_results // page_size)

    @classmethod
    def _strip_markup(cls, text):
        """Replace newlines with spaces and drop the known tags from *text*."""
        cleaned = text.replace('\n', ' ')
        for tag in cls._INGREDIENT_TAGS:
            cleaned = cleaned.replace(tag, '')
        return cleaned

    def _fetch_category_page(self, category, page):
        """Request one listing page of *category* and return the decoded JSON.

        Raises whatever ``requests.get`` or ``response.json()`` raise; the
        callers do not suppress these errors.
        """
        url = self._CATEGORY_URL.format(
            category=category, page=page, page_size=self.PAGE_SIZE
        )
        response = requests.get(
            url, headers=self._CATEGORY_HEADERS, timeout=self.REQUEST_TIMEOUT
        )
        return response.json()

    def get_response(self, url):
        """Fetch the product-detail JSON for the product page at *url*.

        Args:
            url: Product page URL containing a ``-P<digits>`` id.

        Returns:
            The decoded JSON payload, or False when *url* carries no product
            id or when the request / JSON decoding fails.
        """
        product_id = self._extract_product_id(url)
        if product_id is None:
            return False

        api_url = self._PRODUCT_URL.format(product_id=product_id)
        try:
            response = requests.get(
                api_url,
                headers=self._PRODUCT_HEADERS,
                timeout=self.REQUEST_TIMEOUT,
            )
            print(response.text)
            return response.json()
        except (requests.RequestException, ValueError) as e:
            # Best effort: one failing product must not abort the crawl.
            print(e)
            return False

    def parse_response(self, response, url):
        """Extract ``[name, product_id, ingredients]`` from a product payload.

        Args:
            response: Decoded product JSON, or False from a failed fetch.
            url: Unused; kept so existing callers keep working.

        Returns:
            A three-item list, or False when the name or id cannot be read.
            ``ingredients`` is an empty string when the payload has no usable
            ingredient description.
        """
        try:
            details = response['productDetails']
            name = details['displayName']
            product_id = details['productId']
        except (KeyError, TypeError, IndexError):
            # Covers a False/None response as well as a payload missing keys.
            return False

        try:
            ingredients = self._strip_markup(
                response['currentSku']['ingredientDesc']
            )
        except (KeyError, TypeError, AttributeError):
            ingredients = ''

        return [name, product_id, ingredients]

    def search_category_total_results(self, category):
        """Return the ``totalProducts`` count reported for *category*."""
        # Only the total is read from this response; page 2 is requested.
        return self._fetch_category_page(category, 2)['totalProducts']

    def get_urls_of_category_from_all_results(self, category, total_results):
        """Walk every listing page of *category* and process its products.

        For each page the product URLs are built from ``targetUrl``, printed,
        and handed to ``get_all_products_from_category_page``.

        Args:
            category: Category slug used in the listing URL.
            total_results: Product count used to derive the number of pages.
        """
        pages = self._page_count(total_results, self.PAGE_SIZE)
        for page in range(1, pages + 1):
            data = self._fetch_category_page(category, page)
            # A page without a "products" entry is treated as empty.
            urls = [
                f'https://www.sephora.com{product["targetUrl"]}'
                for product in data.get('products') or []
            ]
            print(urls)
            self.get_all_products_from_category_page(urls)

    def get_all_products_from_category_page(self, urls):
        """Fetch, parse and save every product in *urls*; return True.

        URLs whose product cannot be fetched or parsed are skipped.
        """
        for url in urls:
            response = self.get_response(url)
            product = self.parse_response(response, url=url)
            if product:
                save_product({
                    'title': product[0],
                    'product_id': product[1],
                    'ingredients': product[2],
                    'url': url,
                    'store_name': 'Sephora'
                })
        return True

    def save_product_to_csv(self, product, path='sep.csv'):
        """Append *product* to the CSV at *path* unless its id is present.

        Args:
            product: Sequence whose second item is the product id.
            path: CSV file to read and append to; created if missing.

        Returns:
            False when a row with the same id (second column) already exists;
            otherwise None after the row has been written.
        """
        with open(path, 'a+', encoding='utf-8', newline='') as file:
            # Append mode opens at the end; rewind to read the existing rows.
            file.seek(0)
            # Blank or one-column rows carry no id and are ignored.
            product_ids = [row[1] for row in csv.reader(file) if len(row) > 1]
            print(product_ids)
            if product[1] in product_ids:
                return False
            # Make the write position explicit after the read pass.
            file.seek(0, 2)
            csv.writer(file).writerow(product)

    def run(self):
        """Crawl every category in ``self.categories``; return True when done."""
        for category in self.categories:
            total_results = self.search_category_total_results(category)
            self.get_urls_of_category_from_all_results(category, total_results)
            print(f'Finished {category}')
        return True