mumer119131 committed on
Commit
dbdc4c0
·
1 Parent(s): 62b9388

Create SephoraScraper.py

Browse files
Files changed (1) hide show
  1. SephoraScraper.py +140 -0
SephoraScraper.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ from bs4 import BeautifulSoup
3
+ import re
4
+ import csv
5
+ import psycopg2
6
+ import datetime
7
+ import time
8
+
9
+
10
class SephoraScraper:
    """Collect product names and ingredient lists from sephora.com.

    The scraper walks the category listing endpoint page by page, turns every
    listed product into a product-detail request, extracts the name, product
    id and ingredient text, and stores one row per product in PostgreSQL
    (``save_product``) or, optionally, in a local CSV (``save_product_to_csv``).
    """

    # Number of products requested per category listing page.
    PAGE_SIZE = 60
    # Seconds to wait on a single HTTP request; without a timeout a stalled
    # connection would block the crawl forever.
    REQUEST_TIMEOUT = 30
    # Product count assumed when a category's total cannot be determined.
    FALLBACK_TOTAL_RESULTS = 20

    # Product ids look like "-P482303" and end the URL path, so they are
    # followed either by the query string or by the end of the URL.
    _PRODUCT_ID_RE = re.compile(r'-(P\d+)(?:\?|$)')

    _PRODUCT_URL = (
        "https://www.sephora.com:443/api2/catalog/products/{product_id}"
        "?addCurrentSkuToProductChildSkus=true&includeRegionsMap=true"
        "&showContent=true&includeConfigurableSku=true&countryCode=US"
        "&removePersonalizedData=true&includeReviewFilters=true"
        "&includeReviewImages=true&sentiments=6"
    )
    _CATEGORY_URL = (
        "https://www.sephora.com:443/api/v2/catalog/categories/{category}/seo"
        "?targetSearchEngine=NLP&currentPage={page}&pageSize={page_size}"
        "&content=true&includeRegionsMap=true&headers=%5Bobject%20Object%5D"
        "&pickupRampup=true&sddRampup=true&loc=en-US&ch=rwd"
    )

    # Browser-like headers sent with product-detail requests.
    _PRODUCT_HEADERS = {
        "Sec-Ch-Ua": '"Chromium";v="117", "Not;A=Brand";v="8"',
        "X-Ufe-Request": "true",
        "Sec-Ch-Ua-Mobile": "?0",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.5938.132 Safari/537.36",
        "X-Dtpc": "5$505172501_268h23vBLLCMFBNGOLFAPFGHKUDBVTGKMEPJULD-0e0",
        "X-Dtreferer": "https://www.sephora.com/shop/makeup-cosmetics?currentPage=2",
        "Exclude_personalized_content": "true",
        "X-Requested-Source": "rwd",
        "Sec-Ch-Ua-Platform": '"Windows"',
        "Accept": "*/*",
        "Sec-Fetch-Site": "same-origin",
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Dest": "empty",
        "Referer": "https://www.sephora.com/product/sephora-collection-total-coverage-blending-sponge-set-60-plant-based-P482303?skuId=2497220&icid2=products%20grid:p482303:product",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-US,en;q=0.9",
    }
    # Browser-like headers sent with category listing requests.
    _CATEGORY_HEADERS = {
        "Sec-Ch-Ua": '"Chromium";v="117", "Not;A=Brand";v="8"',
        "Sec-Ch-Ua-Mobile": "?0",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.5938.132 Safari/537.36",
        "X-Dtpc": "5$505172501_268h16vBLLCMFBNGOLFAPFGHKUDBVTGKMEPJULD-0e0",
        "X-Dtreferer": "https://www.sephora.com/shop/makeup-cosmetics",
        "X-Timestamp": "1697306065014",
        "Exclude_personalized_content": "true",
        "X-Requested-Source": "rwd",
        "Sec-Ch-Ua-Platform": '"Windows"',
        "Accept": "*/*",
        "Sec-Fetch-Site": "same-origin",
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Dest": "empty",
        "Referer": "https://www.sephora.com/shop/makeup-cosmetics?currentPage=2",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-US,en;q=0.9",
    }

    # Markup fragments stripped from the ingredient text, applied in this
    # exact order (the trailing second '<p>' pass is kept from the original
    # replace chain so the output stays identical).
    _INGREDIENT_MARKUP = ('<b>', '</b>', '<br>', '</p>', '<p>', '<br/>', '<p>')

    def __init__(self):
        """Set up the default headers and the list of category slugs to crawl."""
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) ',
        }
        self.categories = [
            "makeup-cosmetics", "skincare", "hair-products", "fragrance",
            "makeup-tools", "bath-body", "travel-size-toiletries", "gifts",
        ]

    @classmethod
    def _extract_product_id(cls, url):
        """Return the ``P<digits>`` product id embedded in *url*, or None."""
        match = cls._PRODUCT_ID_RE.search(url)
        return match.group(1) if match else None

    @staticmethod
    def _page_count(total_results, page_size):
        """Return how many pages of *page_size* are needed for *total_results*."""
        if total_results <= 0:
            return 0
        # Ceiling division: an exact multiple must not yield an extra page.
        return (total_results + page_size - 1) // page_size

    def _get_json(self, url, headers, attempts=1):
        """GET *url* and return the decoded JSON body, or None on failure.

        The request is tried up to *attempts* times; transport errors and
        undecodable bodies are printed and count as a failed attempt.
        """
        for _ in range(attempts):
            try:
                response = requests.get(
                    url, headers=headers, timeout=self.REQUEST_TIMEOUT
                )
                return response.json()
            except (requests.RequestException, ValueError) as exc:
                print(exc)
        return None

    def _fetch_category_page(self, category, page):
        """Return the decoded listing for *page* of *category*, or None."""
        url = self._CATEGORY_URL.format(
            category=category, page=page, page_size=self.PAGE_SIZE
        )
        # Two attempts, matching the single retry of the original code.
        return self._get_json(url, self._CATEGORY_HEADERS, attempts=2)

    def get_response(self, url):
        """Fetch the product-detail JSON for a product page *url*.

        Returns the decoded JSON on success, or False when *url* carries no
        product id or the request/decoding fails.
        """
        product_id = self._extract_product_id(url)
        if product_id is None:
            return False
        data = self._get_json(
            self._PRODUCT_URL.format(product_id=product_id),
            self._PRODUCT_HEADERS,
        )
        return False if data is None else data

    def parse_response(self, response, url):
        """Extract ``[name, product_id, ingredients]`` from a product response.

        *url* is accepted for interface compatibility and is not used.
        Returns False when *response* lacks the product details (including
        when it is the False returned by ``get_response``). A missing or
        non-string ingredient description yields an empty string.
        """
        try:
            details = response['productDetails']
            name = details['displayName']
            product_id = details['productId']
        except (KeyError, TypeError):
            return False

        try:
            ingredients = response['currentSku']['ingredientDesc'].replace('\n', ' ')
        except (KeyError, TypeError, AttributeError):
            ingredients = ''
        else:
            for fragment in self._INGREDIENT_MARKUP:
                ingredients = ingredients.replace(fragment, '')

        return [name, product_id, ingredients]

    def search_category_total_results(self, category):
        """Return the number of products the API reports for *category*.

        Falls back to ``FALLBACK_TOTAL_RESULTS`` when the listing cannot be
        fetched or does not contain ``totalProducts``.
        """
        # Page 2 is what the original captured request asked for; the total is
        # read from that response.
        data = self._fetch_category_page(category, page=2)
        try:
            return data['totalProducts']
        except (KeyError, TypeError):
            return self.FALLBACK_TOTAL_RESULTS

    def get_urls_of_category_from_all_results(self, category, total_results):
        """Walk every listing page of *category* and scrape its products.

        Each page's product URLs are printed and handed straight to
        ``get_all_products_from_category_page``. Pages that cannot be fetched
        or decoded are reported and skipped.
        """
        pages = self._page_count(total_results, self.PAGE_SIZE)
        for page in range(1, pages + 1):
            data = self._fetch_category_page(category, page)
            products = data.get('products') if isinstance(data, dict) else None
            if products is None:
                print("Error in category: ", category)
                continue
            urls = [
                f'https://www.sephora.com{product["targetUrl"]}'
                for product in products
                if 'targetUrl' in product
            ]
            print(urls)
            self.get_all_products_from_category_page(urls)

    def get_all_products_from_category_page(self, urls):
        """Fetch, parse and store every product in *urls*; always returns True.

        Products whose details cannot be fetched or parsed are skipped.
        """
        for url in urls:
            response = self.get_response(url)
            if not response:
                continue
            product = self.parse_response(response, url=url)
            if not product:
                continue
            title, product_id, ingredients = product
            self.save_product({
                'title': title,
                'product_id': product_id,
                'ingredients': ingredients,
                'url': url,
                'store_name': 'Sephora',
            })
        return True

    def save_product_to_csv(self, product):
        """Append *product* (a ``[name, product_id, ...]`` row) to ``sep.csv``.

        Returns False without writing when a row with the same product id
        (second column) already exists, True after the row is written.
        """
        with open('sep.csv', 'a+', encoding='utf-8', newline='') as file:
            # 'a+' opens positioned at the end; rewind to read existing rows.
            file.seek(0)
            # Blank or short rows have no id column and are ignored.
            known_ids = {row[1] for row in csv.reader(file) if len(row) > 1}
            if str(product[1]) in known_ids:
                return False
            # Append mode always writes at the end of the file.
            csv.writer(file).writerow(product)
        return True

    def save_product(self, product):
        """Insert *product* into the ``scraper_product`` table.

        *product* is a dict with the keys ``title``, ``product_id``,
        ``ingredients``, ``url`` and ``store_name``. Returns True on success;
        any failure is printed and reported as False.
        """
        try:
            # NOTE(review): credentials are hard-coded in source; rotate them
            # and load them from the environment or a secret store.
            conn = psycopg2.connect(
                host="ep-rapid-cake-30394055.us-east-2.aws.neon.tech",
                database="ingredients-scraper",
                user="mumer113141",
                password="SFBtp4xnPeA2",
            )
            try:
                with conn.cursor() as cur:
                    cur.execute(
                        "INSERT INTO scraper_product (title, product_id, ingredients, url, store_name, date_created) VALUES (%s, %s, %s, %s, %s, %s)",
                        (
                            product['title'],
                            product['product_id'],
                            product['ingredients'],
                            product['url'],
                            product['store_name'],
                            datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                        ),
                    )
                conn.commit()
            finally:
                # Close even when the insert fails so connections do not leak.
                conn.close()
            return True
        except Exception as e:
            print(e)
            return False

    def run(self):
        """Scrape every category in ``self.categories``; always returns True."""
        for category in self.categories:
            total_results = self.search_category_total_results(category)
            self.get_urls_of_category_from_all_results(category, total_results)
            print(f'Finished {category}')
        return True
137
+
138
+
139
if __name__ == '__main__':
    # Script entry point: build a scraper and start the crawl.
    scraper = SephoraScraper()
    scraper.run()