# Page header captured along with this file (not part of the program):
# Spaces - Runtime error - File size: 8,144 Bytes - revision dbdc4c0
import requests
from bs4 import BeautifulSoup
import re
import csv
import psycopg2
import datetime
import time
class SephoraScraper:
    """Scrape product names and ingredient lists from Sephora's catalog API.

    ``run`` walks every category in ``self.categories``, pages through the
    category listing endpoint, fetches each listed product from the product
    endpoint and inserts one row per product into the ``scraper_product``
    table.
    """

    # Number of products requested per category listing page.
    PAGE_SIZE = 60
    # Total assumed for a category when the listing endpoint cannot be read.
    FALLBACK_TOTAL_RESULTS = 20
    # Seconds before an HTTP request is abandoned, so a stalled connection
    # cannot hang the whole crawl.
    REQUEST_TIMEOUT = 30

    # Product ids look like "-P482303" and sit at the end of the URL path,
    # i.e. right before the query string, a fragment, or the end of the URL.
    PRODUCT_ID_RE = re.compile(r'-(P\d+)(?:[?#]|$)')

    PRODUCT_URL = (
        "https://www.sephora.com:443/api2/catalog/products/{product_id}"
        "?addCurrentSkuToProductChildSkus=true&includeRegionsMap=true"
        "&showContent=true&includeConfigurableSku=true&countryCode=US"
        "&removePersonalizedData=true&includeReviewFilters=true"
        "&includeReviewImages=true&sentiments=6"
    )
    CATEGORY_URL = (
        "https://www.sephora.com:443/api/v2/catalog/categories/{category}/seo"
        "?targetSearchEngine=NLP&currentPage={page}&pageSize={page_size}"
        "&content=true&includeRegionsMap=true"
        "&headers=%5Bobject%20Object%5D&pickupRampup=true&sddRampup=true"
        "&loc=en-US&ch=rwd"
    )

    # Browser-like headers sent with product detail requests.
    PRODUCT_HEADERS = {
        "Sec-Ch-Ua": '"Chromium";v="117", "Not;A=Brand";v="8"',
        "X-Ufe-Request": "true",
        "Sec-Ch-Ua-Mobile": "?0",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.5938.132 Safari/537.36",
        "X-Dtpc": "5$505172501_268h23vBLLCMFBNGOLFAPFGHKUDBVTGKMEPJULD-0e0",
        "X-Dtreferer": "https://www.sephora.com/shop/makeup-cosmetics?currentPage=2",
        "Exclude_personalized_content": "true",
        "X-Requested-Source": "rwd",
        "Sec-Ch-Ua-Platform": '"Windows"',
        "Accept": "*/*",
        "Sec-Fetch-Site": "same-origin",
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Dest": "empty",
        "Referer": "https://www.sephora.com/product/sephora-collection-total-coverage-blending-sponge-set-60-plant-based-P482303?skuId=2497220&icid2=products%20grid:p482303:product",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-US,en;q=0.9",
    }
    # Browser-like headers sent with category listing requests.
    CATEGORY_HEADERS = {
        "Sec-Ch-Ua": '"Chromium";v="117", "Not;A=Brand";v="8"',
        "Sec-Ch-Ua-Mobile": "?0",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.5938.132 Safari/537.36",
        "X-Dtpc": "5$505172501_268h16vBLLCMFBNGOLFAPFGHKUDBVTGKMEPJULD-0e0",
        "X-Dtreferer": "https://www.sephora.com/shop/makeup-cosmetics",
        "X-Timestamp": "1697306065014",
        "Exclude_personalized_content": "true",
        "X-Requested-Source": "rwd",
        "Sec-Ch-Ua-Platform": '"Windows"',
        "Accept": "*/*",
        "Sec-Fetch-Site": "same-origin",
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Dest": "empty",
        "Referer": "https://www.sephora.com/shop/makeup-cosmetics?currentPage=2",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-US,en;q=0.9",
    }

    # HTML fragments stripped from the ingredient description, in order.
    INGREDIENT_MARKUP = ('<b>', '</b>', '<br>', '</p>', '<p>', '<br/>')

    def __init__(self):
        """Set the default headers and the list of category slugs to crawl."""
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) ',
        }
        self.categories = [
            "makeup-cosmetics", "skincare", "hair-products", "fragrance",
            "makeup-tools", "bath-body", "travel-size-toiletries", "gifts",
        ]

    def get_response(self, url):
        """Fetch the product API payload for a product page URL.

        Args:
            url: Product page URL containing an id such as ``-P482303``.

        Returns:
            The decoded JSON payload, or ``False`` when the URL carries no
            product id or the request/decoding fails.
        """
        match = self.PRODUCT_ID_RE.search(url)
        if match is None:
            return False
        api_url = self.PRODUCT_URL.format(product_id=match.group(1))
        try:
            response = requests.get(
                api_url,
                headers=self.PRODUCT_HEADERS,
                timeout=self.REQUEST_TIMEOUT,
            )
            return response.json()
        except (requests.RequestException, ValueError) as e:
            # ValueError covers a body that is not valid JSON.
            print(e)
            return False

    def parse_response(self, response, url):
        """Extract name, id and ingredients from a product API payload.

        Args:
            response: Payload returned by ``get_response`` (``False`` on a
                failed fetch).
            url: Product page URL; accepted for interface compatibility and
                not used.

        Returns:
            ``[name, product_id, ingredients]``, or ``False`` when the
            payload lacks the product details. ``ingredients`` is ``''``
            when no ingredient description is available.
        """
        try:
            details = response['productDetails']
            name = details['displayName']
            product_id = details['productId']
        except (KeyError, TypeError, IndexError):
            # Also reached when ``response`` is False after a failed fetch.
            return False

        try:
            ingredients = response['currentSku']['ingredientDesc'].replace('\n', ' ')
        except (KeyError, TypeError, IndexError, AttributeError):
            ingredients = ''
        else:
            for fragment in self.INGREDIENT_MARKUP:
                ingredients = ingredients.replace(fragment, '')
        return [name, product_id, ingredients]

    def _fetch_category_page(self, category, page):
        """Return the decoded listing payload for one page, or None on failure.

        The request is attempted twice before giving up.
        """
        url = self.CATEGORY_URL.format(
            category=category, page=page, page_size=self.PAGE_SIZE
        )
        for _attempt in range(2):
            try:
                response = requests.get(
                    url,
                    headers=self.CATEGORY_HEADERS,
                    timeout=self.REQUEST_TIMEOUT,
                )
                return response.json()
            except (requests.RequestException, ValueError):
                continue
        return None

    def search_category_total_results(self, category):
        """Return the number of products the API reports for ``category``.

        Falls back to ``FALLBACK_TOTAL_RESULTS`` when the listing cannot be
        fetched or does not contain a usable ``totalProducts`` value.
        """
        data = self._fetch_category_page(category, page=2)
        try:
            return int(data['totalProducts'])
        except (KeyError, TypeError, IndexError, ValueError):
            return self.FALLBACK_TOTAL_RESULTS

    def get_urls_of_category_from_all_results(self, category, total_results):
        """Walk every listing page of ``category`` and store its products.

        Args:
            category: Category slug used in the listing endpoint.
            total_results: Product count used to derive the number of pages.

        Returns:
            None. Pages that cannot be fetched or parsed are reported and
            skipped.
        """
        # Ceiling division: an exact multiple of PAGE_SIZE needs no extra page.
        pages = -(-total_results // self.PAGE_SIZE)
        for page in range(1, pages + 1):
            data = self._fetch_category_page(category, page)
            try:
                products = data['products']
            except (KeyError, TypeError, IndexError):
                print("Error in category: ", category)
                continue
            urls = [
                f'https://www.sephora.com{product["targetUrl"]}'
                for product in products or []
                if isinstance(product, dict) and "targetUrl" in product
            ]
            print(urls)
            self.get_all_products_from_category_page(urls)

    def get_all_products_from_category_page(self, urls):
        """Fetch, parse and store every product in ``urls``.

        Products that cannot be fetched or parsed are skipped.

        Returns:
            True once all URLs have been processed.
        """
        for url in urls:
            product = self.parse_response(self.get_response(url), url=url)
            if not product:
                continue
            name, product_id, ingredients = product
            self.save_product({
                'title': name,
                'product_id': product_id,
                'ingredients': ingredients,
                'url': url,
                'store_name': 'Sephora'
            })
        return True

    def save_product_to_csv(self, product):
        """Append ``product`` to ``sep.csv`` unless its id is already there.

        Args:
            product: Sequence whose second item is the product id, e.g. the
                list returned by ``parse_response``.

        Returns:
            True when a row was written, False when the id was already
            present.
        """
        with open('sep.csv', 'a+', encoding='utf-8', newline='') as file:
            # 'a+' opens positioned at the end; rewind to read existing rows.
            file.seek(0)
            # Blank or short rows carry no id and are ignored.
            known_ids = {row[1] for row in csv.reader(file) if len(row) > 1}
            if product[1] in known_ids:
                return False
            csv.writer(file).writerow(product)
        return True

    def save_product(self, product):
        """Insert one product row into the ``scraper_product`` table.

        Args:
            product: Mapping with ``title``, ``product_id``, ``ingredients``,
                ``url`` and ``store_name`` keys.

        Returns:
            True on success, False when connecting or inserting fails (the
            error is printed).
        """
        import os

        # SECURITY: credentials are hard-coded in source. They remain as
        # fallbacks for backward compatibility, but should be rotated and
        # supplied through the SCRAPER_DB_* environment variables instead.
        settings = {
            'host': os.environ.get(
                "SCRAPER_DB_HOST",
                "ep-rapid-cake-30394055.us-east-2.aws.neon.tech",
            ),
            'database': os.environ.get("SCRAPER_DB_NAME", "ingredients-scraper"),
            'user': os.environ.get("SCRAPER_DB_USER", "mumer113141"),
            'password': os.environ.get("SCRAPER_DB_PASSWORD", "SFBtp4xnPeA2"),
        }
        conn = None
        try:
            conn = psycopg2.connect(**settings)
            with conn.cursor() as cur:
                cur.execute(
                    "INSERT INTO scraper_product (title, product_id, ingredients, url, store_name, date_created) VALUES (%s, %s, %s, %s, %s, %s)",
                    (
                        product['title'],
                        product['product_id'],
                        product['ingredients'],
                        product['url'],
                        product['store_name'],
                        datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                    ),
                )
            conn.commit()
            return True
        except Exception as e:
            # Boundary handler: one failed insert must not stop the crawl.
            print(e)
            return False
        finally:
            # Close the connection even when the insert fails.
            if conn is not None:
                conn.close()

    def run(self):
        """Crawl every configured category and return True when finished."""
        for category in self.categories:
            total_results = self.search_category_total_results(category)
            self.get_urls_of_category_from_all_results(category, total_results)
            print(f'Finished {category}')
        return True
if __name__ == '__main__':
    # Start the crawl only when the file is executed as a script.
    SephoraScraper().run()