# NOTE: header carried over from the code-sharing site ("Spaces" / "Runtime error"):
# the original upload reported a runtime error when executed.
| import requests | |
| from bs4 import BeautifulSoup | |
| import re | |
| import json | |
| import time | |
| from .DatabaseDataSaver import save_product | |
| from httpx_socks import SyncProxyTransport | |
| import httpx | |
class TargetScraper:
    """Scrape product titles and ingredient lists from Target.com.

    Walks each configured category page, discovers its sub-categories,
    pages through Target's Redsky product-search API, and persists every
    product found via ``save_product``.
    """

    # Redsky API key used by Target's own web client (observed in browser
    # traffic). Every Redsky endpoint expects this same constant key.
    API_KEY = '9f36aeafbe60771e321a7cc95a78140772ab3e96'

    def __init__(self):
        self.base_url = 'https://www.target.com'
        # Headers for plain HTML page fetches (category pages).
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/92.0.4515.131 Safari/537.36'
        }
        # Category path fragments under /c/ to crawl.
        self.categories = [
            'personal-care/-/N-5xtzq', 'health/-/N-5xu1n', 'beauty/-/N-55r1x'
        ]
        self.visitor_id = ''
        self.session = self.create_session()
        self.generate_visitor_id()

    def create_session(self):
        """Return a new httpx client routed through the residential proxy.

        SECURITY NOTE: the proxy URL below embeds credentials in source.
        Move them to an environment variable or config file before sharing
        or committing this code.
        """
        transport = SyncProxyTransport.from_url(
            "http://W4a8IruR4dkhNGb6:Hesj0mkBfnJ1n95M_country-us@geo.iproyal.com:12321"
        )
        return httpx.Client(transport=transport)

    def generate_visitor_id(self):
        """Fetch target.com once and cache the ``visitorId`` cookie.

        Redsky API calls require a visitor_id query parameter. Uses a plain
        (non-proxied) requests session, as in the original flow. If the
        cookie is absent the previously cached id is kept instead of
        raising KeyError.
        """
        s = requests.Session()
        s.get('https://www.target.com')
        self.visitor_id = s.cookies.get_dict().get('visitorId', self.visitor_id)
        print(self.visitor_id)

    def get_product_info(self, url):
        """Fetch one product's PDP data from Redsky, extract its title and
        ingredients, and persist it.

        :param url: product page URL whose last path segment ends in the
            TCIN, e.g. ``.../-/A-12345678``.
        :return: True when the product was saved, False on any failure.
        """
        # Last path segment of the URL, e.g. "A-12345678".
        found = re.findall(r'[^/]+/([^/]+)$', url)
        if not found:
            return False
        slug = found[0]
        parts = slug.split('-')
        if len(parts) < 2:
            return False
        # TCIN (Target's product id) follows the first dash.
        p_id = parts[1]

        # Headers mimicking Target's web client for Redsky API calls.
        burp0_headers = {"Sec-Ch-Ua": "\"Not=A?Brand\";v=\"99\", \"Chromium\";v=\"118\"", "Accept": "application/json", "Sec-Ch-Ua-Mobile": "?0", "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.5993.70 Safari/537.36", "Sec-Ch-Ua-Platform": "\"Windows\"", "Origin": "https://www.target.com", "Sec-Fetch-Site": "same-site", "Sec-Fetch-Mode": "cors", "Sec-Fetch-Dest": "empty", "Referer": "https://www.target.com/c/health/-/N-5xu1n", "Accept-Encoding": "gzip, deflate, br", "Accept-Language": "en-US,en;q=0.9"}
        request_url = f"https://redsky.target.com/redsky_aggregations/v1/web/pdp_client_v1?key={self.API_KEY}&tcin={p_id}&is_bot=false&store_id=146&pricing_store_id=146&has_pricing_store_id=true&has_financing_options=true&visitor_id={self.visitor_id}&has_size_context=true&skip_personalized=true&skip_variation_hierarchy=true&channel=WEB&page=%2Fp%2F{slug}"
        print(request_url)

        # Fetch with a single session-rebuild retry: proxy sessions drop
        # periodically. (BUGFIX: the original retry path read json_response
        # before assigning it, so the retry could never succeed.)
        try:
            response = self.session.get(request_url, headers=burp0_headers)
            json_response = response.json()
        except Exception:
            try:
                self.session = self.create_session()
                response = self.session.get(request_url, headers=burp0_headers)
                json_response = response.json()
            except Exception:
                return False
        print(json_response)

        try:
            item = json_response['data']['product']['item']
            title = item['product_description']['title']
            description = item['product_description'].get('bullet_descriptions', [])
        except (KeyError, TypeError):
            return False

        ingredients = ''
        drug_facts = item.get('enrichment', {}).get('drug_facts')
        if drug_facts:
            # Preferred path: structured drug-facts data.
            for active in drug_facts.get('active_ingredients', []):
                try:
                    ingredients += active['ingredient'] + ', '
                except (KeyError, TypeError):
                    pass  # skip malformed entries, keep the rest
            for inactive in drug_facts.get('inactive_ingredients', []):
                try:
                    ingredients += inactive + ', '
                except TypeError:
                    pass
        else:
            # Fallback: scan the HTML bullet descriptions for an
            # ingredients line, e.g. "<B>Contains:</B> water, glycerin".
            for desc in description:
                lowered = desc.lower()
                if ('contains' in lowered or 'primary active ingredients' in lowered) and 'consult' not in lowered:
                    # Text following the closing bold tag.
                    # (BUGFIX: the original indexed findall()[0] without a
                    # match guard, raising IndexError on non-matching lines.)
                    tail = re.findall(r'</B>\s*(.*?)$', desc)
                    if tail and tail[0]:
                        print(tail[0])
                        ingredients = tail[0]
                        break

        print(ingredients)
        save_product({
            'title': title,
            'product_id': p_id,
            'ingredients': ingredients,
            'url': url,
            'store_name': 'Target'
        })
        return True

    def get_category_page(self, category):
        """Return the trailing N-codes of every sub-category on a category page.

        :param category: category path fragment, e.g. ``'health/-/N-5xu1n'``.
        :return: list of sub-category code strings (e.g. ``'5xu1n'``); empty
            list when the page cannot be fetched or parsed.
        """
        self.generate_visitor_id()
        url = f'https://www.target.com/c/{category}'
        try:
            response = self.session.get(url, headers=self.headers)
        except Exception:
            try:
                self.session = self.create_session()
                response = self.session.get(url, headers=self.headers)
            except Exception:
                return []

        soup = BeautifulSoup(response.content, 'html.parser')
        container = soup.find('div', class_='styles__BubCatNavigationWrapper-sc-2nwvzd-0')
        if container is None:
            # Layout changed or we were served a bot-block page.
            return []

        urls = []
        for li in container.find_all('li'):
            link = li.find('a')
            if link is not None and link.has_attr('href'):
                urls.append(f"{self.base_url}{link['href']}")

        # The trailing "-<code>" of each sub-category URL identifies it.
        matches = []
        for u in urls:
            code = re.findall(r'-(\w+)$', u)
            if code:
                matches.append(code[0])
        print(matches)
        return matches

    def get_sub_category_page(self, url):
        """Page through the Redsky search API for one sub-category, saving
        every product found.

        :param url: sub-category N-code, e.g. ``'5xu1n'``.
        :return: None when paging completes; False when fetching fails even
            after a session rebuild.
        """
        offset = 0
        print(url)
        # Loop-invariant Redsky headers, hoisted out of the paging loop.
        burp0_headers = {"Sec-Ch-Ua": "\"Not=A?Brand\";v=\"99\", \"Chromium\";v=\"118\"", "Accept": "application/json", "Sec-Ch-Ua-Mobile": "?0", "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.5993.70 Safari/537.36", "Sec-Ch-Ua-Platform": "\"Windows\"", "Origin": "https://www.target.com", "Sec-Fetch-Site": "same-site", "Sec-Fetch-Mode": "cors", "Sec-Fetch-Dest": "empty", "Referer": "https://www.target.com/c/health/-/N-5xu1n", "Accept-Encoding": "gzip, deflate, br", "Accept-Language": "en-US,en;q=0.9"}
        while True:
            self.generate_visitor_id()
            # BUGFIX: the original passed visitor_id as the API key; the
            # key parameter must be the constant Redsky key.
            sub_category_url = f"https://redsky.target.com/redsky_aggregations/v1/web/plp_search_v2?key={self.API_KEY}&category={url}&channel=WEB&count=24&default_purchasability_filter=true&include_sponsored=true&new_search=false&offset={offset}&page=%2Fc%2F{url}&platform=desktop&pricing_store_id=146&store_ids=146%2C2240%2C1983%2C695%2C1059&useragent=Mozilla%2F5.0+%28Windows+NT+10.0%3B+Win64%3B+x64%29+AppleWebKit%2F537.36+%28KHTML%2C+like+Gecko%29+Chrome%2F118.0.0.0+Safari%2F537.36&zip=37250&visitor_id={self.visitor_id}"
            print(sub_category_url)

            try:
                response = self.session.get(sub_category_url, headers=burp0_headers)
            except Exception:
                try:
                    self.session = self.create_session()
                    response = self.session.get(sub_category_url, headers=burp0_headers)
                except Exception:
                    return False

            # Parse once; stop paging when the response has no product list
            # or the list is empty (the original looped forever on an empty
            # page and re-parsed the JSON up to four times).
            try:
                data = response.json()
                print(data, 'Sub Category')
                products = data['data']['search']['products']
            except Exception:
                break
            if not products:
                break

            offset += 24
            for product in products:
                try:
                    product_url = product['item']['enrichment']['buy_url']
                except (KeyError, TypeError):
                    continue  # skip products without a buy_url
                self.get_product_info(product_url)

    def run(self):
        """Crawl every configured category end to end."""
        for category in self.categories:
            for sub_category in self.get_category_page(category):
                self.get_sub_category_page(sub_category)
if __name__ == "__main__":
    # Entry point: build the scraper (opens proxy session, fetches a
    # visitor id) and crawl all configured categories.
    TargetScraper().run()