Spaces:
Runtime error
Runtime error
Commit ·
a6a7b3d
1
Parent(s): 595fcca
Update scraper/utils/TargetScraper.py
Browse files
scraper/utils/TargetScraper.py
CHANGED
|
@@ -37,15 +37,19 @@ class TargetScraper:
|
|
| 37 |
pattern = r'[^/]+/([^/]+)$'
|
| 38 |
match = re.findall(pattern, url)[0]
|
| 39 |
p_id = match.split('-')[1]
|
|
|
|
| 40 |
request_url = f"https://redsky.target.com/redsky_aggregations/v1/web/pdp_client_v1?key=9f36aeafbe60771e321a7cc95a78140772ab3e96&tcin={p_id}&is_bot=false&store_id=146&pricing_store_id=146&has_pricing_store_id=true&has_financing_options=true&visitor_id={self.visitor_id}&has_size_context=true&skip_personalized=true&skip_variation_hierarchy=true&channel=WEB&page=%2Fp%2F{match}"
|
| 41 |
print(request_url)
|
| 42 |
try:
|
| 43 |
-
response = self.session.get(request_url, headers=
|
| 44 |
json_response = response.json()
|
|
|
|
|
|
|
| 45 |
except:
|
| 46 |
try:
|
| 47 |
self.session = self.create_session()
|
| 48 |
response = self.session.get(request_url, headers=self.headers)
|
|
|
|
| 49 |
json_response = response.json()
|
| 50 |
except:
|
| 51 |
return False
|
|
@@ -99,7 +103,7 @@ class TargetScraper:
|
|
| 99 |
self.session = self.create_session()
|
| 100 |
response = self.session.get(url, headers=self.headers)
|
| 101 |
except:
|
| 102 |
-
return
|
| 103 |
soup = BeautifulSoup(response.content, 'html.parser')
|
| 104 |
sub_categories_container = soup.find('div', class_='styles__BubCatNavigationWrapper-sc-2nwvzd-0')
|
| 105 |
lis = sub_categories_container.find_all('li')
|
|
@@ -119,17 +123,24 @@ class TargetScraper:
|
|
| 119 |
def get_sub_category_page(self, url):
|
| 120 |
# sub_category_url = f"https://redsky.target.com/redsky_aggregations/v1/web/product_summary_with_fulfillment_v1?key=9f36aeafbe60771e321a7cc95a78140772ab3e96&tcins=15847564%2C75557589%2C78809748%2C13347903%2C13347898%2C13302603%2C11046774%2C14827710%2C17447006%2C16649805%2C82347297%2C53079917%2C16821449%2C83067937%2C86217754%2C15150353%2C14502619%2C46806870%2C51107346%2C76858885%2C14688252%2C76534688%2C14214405%2C51612206%2C14686468%2C75557589%2C14923389%2C50045818%2C15118350&store_id=146&zip=37250&state=PB&latitude=31.270&longitude=73.320&required_store_id=146&has_required_store_id=true&skip_price_promo=true&channel=WEB&page=%2Fc%2F{url}"
|
| 121 |
offset = 0
|
| 122 |
-
|
| 123 |
while True:
|
| 124 |
self.generate_visitor_id()
|
| 125 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 126 |
print(sub_category_url)
|
| 127 |
try:
|
| 128 |
-
response = self.session.get(sub_category_url, headers=
|
|
|
|
|
|
|
| 129 |
except:
|
| 130 |
try:
|
| 131 |
self.session = self.create_session()
|
| 132 |
response = self.session.get(sub_category_url, headers=self.headers)
|
|
|
|
| 133 |
except:
|
| 134 |
return False
|
| 135 |
print(response.json(), 'Sub Category')
|
|
|
|
| 37 |
pattern = r'[^/]+/([^/]+)$'
|
| 38 |
match = re.findall(pattern, url)[0]
|
| 39 |
p_id = match.split('-')[1]
|
| 40 |
+
burp0_headers = {"Sec-Ch-Ua": "\"Not=A?Brand\";v=\"99\", \"Chromium\";v=\"118\"", "Accept": "application/json", "Sec-Ch-Ua-Mobile": "?0", "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.5993.70 Safari/537.36", "Sec-Ch-Ua-Platform": "\"Windows\"", "Origin": "https://www.target.com", "Sec-Fetch-Site": "same-site", "Sec-Fetch-Mode": "cors", "Sec-Fetch-Dest": "empty", "Referer": "https://www.target.com/c/health/-/N-5xu1n", "Accept-Encoding": "gzip, deflate, br", "Accept-Language": "en-US,en;q=0.9"}
|
| 41 |
request_url = f"https://redsky.target.com/redsky_aggregations/v1/web/pdp_client_v1?key=9f36aeafbe60771e321a7cc95a78140772ab3e96&tcin={p_id}&is_bot=false&store_id=146&pricing_store_id=146&has_pricing_store_id=true&has_financing_options=true&visitor_id={self.visitor_id}&has_size_context=true&skip_personalized=true&skip_variation_hierarchy=true&channel=WEB&page=%2Fp%2F{match}"
|
| 42 |
print(request_url)
|
| 43 |
try:
|
| 44 |
+
response = self.session.get(request_url, headers=burp0_headers)
|
| 45 |
json_response = response.json()
|
| 46 |
+
print(json_response)
|
| 47 |
+
title = json_response['data']['product']['item']['product_description']['title']
|
| 48 |
except:
|
| 49 |
try:
|
| 50 |
self.session = self.create_session()
|
| 51 |
response = self.session.get(request_url, headers=self.headers)
|
| 52 |
+
title = json_response['data']['product']['item']['product_description']['title']
|
| 53 |
json_response = response.json()
|
| 54 |
except:
|
| 55 |
return False
|
|
|
|
| 103 |
self.session = self.create_session()
|
| 104 |
response = self.session.get(url, headers=self.headers)
|
| 105 |
except:
|
| 106 |
+
return []
|
| 107 |
soup = BeautifulSoup(response.content, 'html.parser')
|
| 108 |
sub_categories_container = soup.find('div', class_='styles__BubCatNavigationWrapper-sc-2nwvzd-0')
|
| 109 |
lis = sub_categories_container.find_all('li')
|
|
|
|
| 123 |
def get_sub_category_page(self, url):
|
| 124 |
# sub_category_url = f"https://redsky.target.com/redsky_aggregations/v1/web/product_summary_with_fulfillment_v1?key=9f36aeafbe60771e321a7cc95a78140772ab3e96&tcins=15847564%2C75557589%2C78809748%2C13347903%2C13347898%2C13302603%2C11046774%2C14827710%2C17447006%2C16649805%2C82347297%2C53079917%2C16821449%2C83067937%2C86217754%2C15150353%2C14502619%2C46806870%2C51107346%2C76858885%2C14688252%2C76534688%2C14214405%2C51612206%2C14686468%2C75557589%2C14923389%2C50045818%2C15118350&store_id=146&zip=37250&state=PB&latitude=31.270&longitude=73.320&required_store_id=146&has_required_store_id=true&skip_price_promo=true&channel=WEB&page=%2Fc%2F{url}"
|
| 125 |
offset = 0
|
| 126 |
+
print(url)
|
| 127 |
while True:
|
| 128 |
self.generate_visitor_id()
|
| 129 |
+
import requests
|
| 130 |
+
|
| 131 |
+
# burp0_url = "https://redsky.target.com:443/redsky_aggregations/v1/web/plp_search_v2?key=9f36aeafbe60771e321a7cc95a78140772ab3e96&category=5xu1n&channel=WEB&count=24&default_purchasability_filter=true&include_sponsored=true&new_search=false&offset=0&page=%2Fc%2F5xu1n&platform=desktop&pricing_store_id=146&store_ids=146%2C2240%2C1983%2C695%2C1059&useragent=Mozilla%2F5.0+%28Windows+NT+10.0%3B+Win64%3B+x64%29+AppleWebKit%2F537.36+%28KHTML%2C+like+Gecko%29+Chrome%2F118.0.5993.70+Safari%2F537.36&visitor_id=018B5CE4F4C10201A48A8F31291570E0&zip=37250"
|
| 132 |
+
burp0_headers = {"Sec-Ch-Ua": "\"Not=A?Brand\";v=\"99\", \"Chromium\";v=\"118\"", "Accept": "application/json", "Sec-Ch-Ua-Mobile": "?0", "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.5993.70 Safari/537.36", "Sec-Ch-Ua-Platform": "\"Windows\"", "Origin": "https://www.target.com", "Sec-Fetch-Site": "same-site", "Sec-Fetch-Mode": "cors", "Sec-Fetch-Dest": "empty", "Referer": "https://www.target.com/c/health/-/N-5xu1n", "Accept-Encoding": "gzip, deflate, br", "Accept-Language": "en-US,en;q=0.9"}
|
| 133 |
+
sub_category_url = f"https://redsky.target.com/redsky_aggregations/v1/web/plp_search_v2?key={self.visitor_id}&category={url}&channel=WEB&count=24&default_purchasability_filter=true&include_sponsored=true&new_search=false&offset={offset}&page=%2Fc%2F{url}&platform=desktop&pricing_store_id=146&store_ids=146%2C2240%2C1983%2C695%2C1059&useragent=Mozilla%2F5.0+%28Windows+NT+10.0%3B+Win64%3B+x64%29+AppleWebKit%2F537.36+%28KHTML%2C+like+Gecko%29+Chrome%2F118.0.0.0+Safari%2F537.36&zip=37250&visitor_id={self.visitor_id}"
|
| 134 |
print(sub_category_url)
|
| 135 |
try:
|
| 136 |
+
response = self.session.get(sub_category_url, headers=burp0_headers)
|
| 137 |
+
print(response.json(), 'Sub Category')
|
| 138 |
+
products = response.json()['data']['search']['products']
|
| 139 |
except:
|
| 140 |
try:
|
| 141 |
self.session = self.create_session()
|
| 142 |
response = self.session.get(sub_category_url, headers=self.headers)
|
| 143 |
+
products = response.json()['data']['search']['products']
|
| 144 |
except:
|
| 145 |
return False
|
| 146 |
print(response.json(), 'Sub Category')
|