Spaces:
Runtime error
Runtime error
Commit ·
dbdc4c0
1
Parent(s): 62b9388
Create SephoraScraper.py
Browse files- SephoraScraper.py +140 -0
SephoraScraper.py
ADDED
|
@@ -0,0 +1,140 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import requests
|
| 2 |
+
from bs4 import BeautifulSoup
|
| 3 |
+
import re
|
| 4 |
+
import csv
|
| 5 |
+
import psycopg2
|
| 6 |
+
import datetime
|
| 7 |
+
import time
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class SephoraScraper:
    """Collect product names and ingredient lists from Sephora's catalog API.

    ``run`` walks every category in ``self.categories``, pages through the
    category listing endpoint, fetches each listed product from the product
    endpoint and stores the parsed result with ``save_product``.
    """

    # Number of products requested per category listing page.
    PAGE_SIZE = 60
    # Seconds to wait for a single HTTP response before giving up.
    REQUEST_TIMEOUT = 30
    # Product count assumed for a category when the real total is unavailable.
    DEFAULT_TOTAL_RESULTS = 20
    # File used by ``save_product_to_csv``.
    CSV_PATH = 'sep.csv'

    # Product ids look like "P482303" and follow the last "-" of the URL
    # path, optionally followed by a query string.
    _PRODUCT_ID_RE = re.compile(r'-(P\d+)(?:\?|$)')

    # Markup removed from the ingredient description, in this order.
    _INGREDIENT_TAGS = ('<b>', '</b>', '<br>', '</p>', '<p>', '<br/>')

    # NOTE(review): both header sets look like they were copied from a
    # recorded browser request (fixed X-Dtpc / X-Timestamp values) -- confirm
    # whether the API actually requires all of them.
    _PRODUCT_HEADERS = {
        "Sec-Ch-Ua": "\"Chromium\";v=\"117\", \"Not;A=Brand\";v=\"8\"",
        "X-Ufe-Request": "true",
        "Sec-Ch-Ua-Mobile": "?0",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.5938.132 Safari/537.36",
        "X-Dtpc": "5$505172501_268h23vBLLCMFBNGOLFAPFGHKUDBVTGKMEPJULD-0e0",
        "X-Dtreferer": "https://www.sephora.com/shop/makeup-cosmetics?currentPage=2",
        "Exclude_personalized_content": "true",
        "X-Requested-Source": "rwd",
        "Sec-Ch-Ua-Platform": "\"Windows\"",
        "Accept": "*/*",
        "Sec-Fetch-Site": "same-origin",
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Dest": "empty",
        "Referer": "https://www.sephora.com/product/sephora-collection-total-coverage-blending-sponge-set-60-plant-based-P482303?skuId=2497220&icid2=products%20grid:p482303:product",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-US,en;q=0.9",
    }

    _CATEGORY_HEADERS = {
        "Sec-Ch-Ua": "\"Chromium\";v=\"117\", \"Not;A=Brand\";v=\"8\"",
        "Sec-Ch-Ua-Mobile": "?0",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.5938.132 Safari/537.36",
        "X-Dtpc": "5$505172501_268h16vBLLCMFBNGOLFAPFGHKUDBVTGKMEPJULD-0e0",
        "X-Dtreferer": "https://www.sephora.com/shop/makeup-cosmetics",
        "X-Timestamp": "1697306065014",
        "Exclude_personalized_content": "true",
        "X-Requested-Source": "rwd",
        "Sec-Ch-Ua-Platform": "\"Windows\"",
        "Accept": "*/*",
        "Sec-Fetch-Site": "same-origin",
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Dest": "empty",
        "Referer": "https://www.sephora.com/shop/makeup-cosmetics?currentPage=2",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-US,en;q=0.9",
    }

    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) ',
        }
        self.categories = [
            "makeup-cosmetics", "skincare", "hair-products", "fragrance",
            "makeup-tools", "bath-body", "travel-size-toiletries", "gifts",
        ]

    def _get_json(self, url, headers, attempts=1):
        """Fetch ``url`` and return its decoded JSON body, or None on failure.

        The request is tried up to ``attempts`` times; network errors and
        bodies that are not valid JSON are printed and count as failures.
        """
        for _ in range(attempts):
            try:
                response = requests.get(
                    url, headers=headers, timeout=self.REQUEST_TIMEOUT
                )
                return response.json()
            except (requests.RequestException, ValueError) as e:
                print(e)
        return None

    def _category_url(self, category, page):
        """Build the listing endpoint URL for ``category`` at ``page``."""
        return (
            f"https://www.sephora.com:443/api/v2/catalog/categories/{category}/seo"
            f"?targetSearchEngine=NLP&currentPage={page}&pageSize={self.PAGE_SIZE}"
            "&content=true&includeRegionsMap=true&headers=%5Bobject%20Object%5D"
            "&pickupRampup=true&sddRampup=true&loc=en-US&ch=rwd"
        )

    def get_response(self, url):
        """Fetch the product API payload for a product page URL.

        Args:
            url: Product page URL containing an id such as ``-P482303``.

        Returns:
            The decoded JSON payload, or False when the URL holds no product
            id or the request / JSON decoding fails.
        """
        match = self._PRODUCT_ID_RE.search(url)
        if match is None:
            return False
        product_id = match.group(1)
        api_url = (
            f"https://www.sephora.com:443/api2/catalog/products/{product_id}"
            "?addCurrentSkuToProductChildSkus=true&includeRegionsMap=true"
            "&showContent=true&includeConfigurableSku=true&countryCode=US"
            "&removePersonalizedData=true&includeReviewFilters=true"
            "&includeReviewImages=true&sentiments=6"
        )
        payload = self._get_json(api_url, self._PRODUCT_HEADERS)
        return False if payload is None else payload

    def parse_response(self, response, url):
        """Extract ``[name, product_id, ingredients]`` from a product payload.

        Args:
            response: Decoded product payload (as returned by
                ``get_response``); anything that is not a dict is rejected.
            url: Product URL; accepted for interface compatibility, unused.

        Returns:
            A three-item list, or False when the payload lacks the product
            name or id. ``ingredients`` is '' when no description is present.
        """
        if not isinstance(response, dict):
            return False
        try:
            details = response['productDetails']
            name = details['displayName']
            product_id = details['productId']
        except (KeyError, TypeError):
            return False
        ingredients = self._clean_ingredients(response.get('currentSku'))
        return [name, product_id, ingredients]

    @classmethod
    def _clean_ingredients(cls, sku):
        """Return the SKU's ingredient text without markup, or '' if absent."""
        try:
            text = sku['ingredientDesc'].replace('\n', ' ')
        except (KeyError, TypeError, AttributeError):
            return ''
        for tag in cls._INGREDIENT_TAGS:
            text = text.replace(tag, '')
        return text

    def search_category_total_results(self, category):
        """Return the number of products the listing API reports for a category.

        Falls back to ``DEFAULT_TOTAL_RESULTS`` when the request fails twice
        or the payload carries no ``totalProducts`` field.
        """
        # Page 2 mirrors the request this call was modelled on; only the
        # total is read from the payload.
        data = self._get_json(
            self._category_url(category, 2), self._CATEGORY_HEADERS, attempts=2
        )
        if not isinstance(data, dict):
            return self.DEFAULT_TOTAL_RESULTS
        return data.get('totalProducts', self.DEFAULT_TOTAL_RESULTS)

    def get_urls_of_category_from_all_results(self, category, total_results):
        """Page through a category and scrape every product listed on it.

        Args:
            category: Category slug used in the listing endpoint URL.
            total_results: Product count used to derive the number of pages.

        Each page's product URLs are handed to
        ``get_all_products_from_category_page`` as soon as the page is read.
        Pages that cannot be fetched are reported and skipped.
        """
        # Ceiling division: an exact multiple of PAGE_SIZE needs no extra page.
        pages = (total_results + self.PAGE_SIZE - 1) // self.PAGE_SIZE
        for page in range(1, pages + 1):
            data = self._get_json(
                self._category_url(category, page),
                self._CATEGORY_HEADERS,
                attempts=2,
            )
            if not isinstance(data, dict):
                print("Error in category: ", category)
                continue
            urls = [
                f'https://www.sephora.com{product["targetUrl"]}'
                for product in data.get('products') or []
                if isinstance(product, dict) and 'targetUrl' in product
            ]
            print(urls)
            self.get_all_products_from_category_page(urls)

    def get_all_products_from_category_page(self, urls):
        """Fetch, parse and store every product in ``urls``; return True.

        URLs whose payload cannot be fetched or parsed are skipped.
        """
        for url in urls:
            response = self.get_response(url)
            if not response:
                continue
            product = self.parse_response(response, url=url)
            if not product:
                continue
            self.save_product({
                'title': product[0],
                'product_id': product[1],
                'ingredients': product[2],
                'url': url,
                'store_name': 'Sephora',
            })
        return True

    def save_product_to_csv(self, product):
        """Append ``product`` to the CSV file unless its id is already there.

        Args:
            product: Sequence whose second item is the product id, e.g. the
                list returned by ``parse_response``.

        Returns:
            True when a row was written, False when the id already exists.
        """
        with open(self.CSV_PATH, 'a+', encoding='utf-8', newline='') as file:
            # 'a+' opens positioned at the end; rewind to read existing rows.
            file.seek(0)
            known_ids = {row[1] for row in csv.reader(file) if len(row) > 1}
            if product[1] in known_ids:
                return False
            # Append mode always writes at the end of the file.
            csv.writer(file).writerow(product)
        return True

    @staticmethod
    def _db_settings():
        """Return the keyword arguments for ``psycopg2.connect``.

        Host, database and user can be overridden with PGHOST, PGDATABASE and
        PGUSER. The password is only ever taken from PGPASSWORD: it must not
        live in source control, and the one previously committed in this file
        should be rotated.
        """
        import os

        settings = {
            'host': os.environ.get(
                'PGHOST', "ep-rapid-cake-30394055.us-east-2.aws.neon.tech"
            ),
            'database': os.environ.get('PGDATABASE', "ingredients-scraper"),
            'user': os.environ.get('PGUSER', "mumer113141"),
        }
        password = os.environ.get('PGPASSWORD')
        if password:
            settings['password'] = password
        return settings

    def save_product(self, product):
        """Insert one product row into the ``scraper_product`` table.

        Args:
            product: Dict with the keys ``title``, ``product_id``,
                ``ingredients``, ``url`` and ``store_name``.

        Returns:
            True when the row was committed, False on any error (the error
            is printed).
        """
        conn = None
        try:
            conn = psycopg2.connect(**self._db_settings())
            with conn.cursor() as cur:
                cur.execute(
                    "INSERT INTO scraper_product (title, product_id, ingredients, url, store_name, date_created) VALUES (%s, %s, %s, %s, %s, %s)",
                    (
                        product['title'],
                        product['product_id'],
                        product['ingredients'],
                        product['url'],
                        product['store_name'],
                        datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                    ),
                )
            conn.commit()
            return True
        except Exception as e:
            # Best-effort boundary: one failed insert must not stop the crawl.
            print(e)
            return False
        finally:
            # Close the connection even when the insert failed.
            if conn is not None:
                conn.close()

    def run(self):
        """Scrape every category in ``self.categories``; return True."""
        for category in self.categories:
            total_results = self.search_category_total_results(category)
            self.get_urls_of_category_from_all_results(category, total_results)
            print(f'Finished {category}')
        return True
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
if __name__ == '__main__':
    # Script entry point: build a scraper and crawl every configured category.
    scraper = SephoraScraper()
    scraper.run()
|