File size: 9,609 Bytes
1351c41
 
 
 
 
 
517a739
 
1351c41
 
 
 
 
 
 
 
 
 
 
 
 
517a739
1351c41
 
517a739
 
 
 
 
1351c41
c2ded9d
1351c41
 
 
 
 
 
 
 
a6a7b3d
517a739
 
284b289
a6a7b3d
284b289
a6a7b3d
 
284b289
 
517a739
 
a6a7b3d
284b289
 
 
1351c41
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
517a739
1351c41
 
 
 
 
 
 
 
 
 
 
517a739
 
 
 
 
 
 
a6a7b3d
1351c41
 
 
 
 
 
 
 
 
 
 
 
 
 
643dbca
1351c41
 
 
 
a6a7b3d
1351c41
 
a6a7b3d
 
 
 
 
1351c41
517a739
a6a7b3d
 
 
517a739
 
 
 
a6a7b3d
517a739
 
643dbca
1351c41
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
517a739
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
import requests
from bs4 import BeautifulSoup
import re
import json
import time
from .DatabaseDataSaver import save_product
from httpx_socks import SyncProxyTransport
import httpx

class TargetScraper:
    """Scrape product titles and ingredient lists from Target.com.

    Category pages are walked to discover sub-category slugs, the redsky
    search API is paged for product URLs, and each product's PDP API
    response is parsed and persisted via ``save_product``.
    """

    # Redsky API key embedded in Target's own web-client requests.
    API_KEY = '9f36aeafbe60771e321a7cc95a78140772ab3e96'

    # Browser-like headers for redsky API calls (shared by both endpoints;
    # the original duplicated this dict in two methods).
    API_HEADERS = {
        "Sec-Ch-Ua": "\"Not=A?Brand\";v=\"99\", \"Chromium\";v=\"118\"",
        "Accept": "application/json",
        "Sec-Ch-Ua-Mobile": "?0",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.5993.70 Safari/537.36",
        "Sec-Ch-Ua-Platform": "\"Windows\"",
        "Origin": "https://www.target.com",
        "Sec-Fetch-Site": "same-site",
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Dest": "empty",
        "Referer": "https://www.target.com/c/health/-/N-5xu1n",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-US,en;q=0.9",
    }

    def __init__(self):
        self.base_url = 'https://www.target.com'
        # Plain browser headers for HTML category-page fetches.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) '
                          'AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/92.0.4515.131 Safari/537.36'
        }
        # Category path fragments appended to https://www.target.com/c/.
        self.categories = [
            'personal-care/-/N-5xtzq', 'health/-/N-5xu1n', 'beauty/-/N-55r1x'
        ]
        self.visitor_id = ''
        self.session = self.create_session()
        self.generate_visitor_id()

    def create_session(self):
        """Return a new httpx client routed through the residential proxy."""
        # SECURITY: proxy credentials are hard-coded in source; move them to
        # environment variables or a secrets store before shipping.
        transport = SyncProxyTransport.from_url("http://W4a8IruR4dkhNGb6:Hesj0mkBfnJ1n95M_country-us@geo.iproyal.com:12321")
        return httpx.Client(transport=transport)

    def generate_visitor_id(self):
        """Refresh ``self.visitor_id`` from the visitorId cookie Target sets."""
        s = requests.Session()
        s.get('https://www.target.com')
        # .get() instead of [] so a missing cookie keeps the previous id
        # rather than raising KeyError mid-crawl (original crashed here).
        self.visitor_id = s.cookies.get_dict().get('visitorId', self.visitor_id)
        print(self.visitor_id)

    def _get_json(self, request_url, headers):
        """GET ``request_url`` and parse JSON, retrying once on a freshly
        created proxy session. Returns the parsed body, or None on failure.

        Replaces the original per-call retry blocks, one of which read
        ``json_response`` before assigning it (a NameError/stale-data bug).
        """
        try:
            return self.session.get(request_url, headers=headers).json()
        except Exception:
            try:
                # The proxied session may have died; rebuild and retry once.
                self.session = self.create_session()
                return self.session.get(request_url, headers=self.headers).json()
            except Exception:
                return None

    def _extract_ingredients(self, item, description):
        """Build a comma-separated ingredients string for a product.

        Prefers the structured ``drug_facts`` enrichment; falls back to
        scanning the HTML bullet descriptions for a "contains ..." line.
        """
        ingredients = ''
        drug_facts = (item.get('enrichment') or {}).get('drug_facts')
        if drug_facts:
            for active in drug_facts.get('active_ingredients') or []:
                try:
                    ingredients += active['ingredient'] + ', '
                except (KeyError, TypeError):
                    pass
            for inactive in drug_facts.get('inactive_ingredients') or []:
                ingredients += inactive + ', '
            return ingredients
        for desc in description:
            lower = desc.lower()
            if ('contains' in lower or 'primary active ingredients' in lower) \
                    and 'consult' not in lower:
                # Ingredient text follows the closing bold tag in the bullet.
                found = re.findall(r'</B>\s*(.*?)$', desc)
                if found and found[0]:
                    print(found[0])
                    ingredients = found[0]
                break
        return ingredients

    def get_product_info(self, url):
        """Fetch one product's PDP data and persist its title + ingredients.

        ``url`` is a product page URL ending in ``.../<slug>/-/A-<tcin>``.
        Returns False when the product cannot be fetched or parsed.
        """
        match = re.findall(r'[^/]+/([^/]+)$', url)[0]  # e.g. 'A-12345678'
        p_id = match.split('-')[1]                     # numeric tcin
        request_url = (
            "https://redsky.target.com/redsky_aggregations/v1/web/pdp_client_v1"
            f"?key={self.API_KEY}&tcin={p_id}&is_bot=false&store_id=146"
            "&pricing_store_id=146&has_pricing_store_id=true"
            "&has_financing_options=true"
            f"&visitor_id={self.visitor_id}&has_size_context=true"
            "&skip_personalized=true&skip_variation_hierarchy=true"
            f"&channel=WEB&page=%2Fp%2F{match}"
        )
        print(request_url)
        json_response = self._get_json(request_url, self.API_HEADERS)
        if json_response is None:
            return False
        try:
            item = json_response['data']['product']['item']
            title = item['product_description']['title']
            description = item['product_description']['bullet_descriptions']
        except (KeyError, TypeError):
            return False
        ingredients = self._extract_ingredients(item, description)
        print(ingredients)
        save_product({
            'title': title,
            'product_id': p_id,
            'ingredients': ingredients,
            'url': url,
            'store_name': 'Target'
        })

    def get_category_page(self, category):
        """Return the sub-category slugs (e.g. '5xu1n') for a category path.

        Returns [] when the page cannot be fetched or its layout is not
        recognised (the original raised AttributeError on a bot wall).
        """
        self.generate_visitor_id()
        url = f'https://www.target.com/c/{category}'
        try:
            response = self.session.get(url, headers=self.headers)
        except Exception:
            try:
                self.session = self.create_session()
                response = self.session.get(url, headers=self.headers)
            except Exception:
                return []
        soup = BeautifulSoup(response.content, 'html.parser')
        container = soup.find('div', class_='styles__BubCatNavigationWrapper-sc-2nwvzd-0')
        if container is None:
            # Layout changed or we were served an anti-bot page.
            return []
        urls = []
        for li in container.find_all('li'):
            link = li.find('a')
            if link is not None and link.get('href'):
                urls.append(f"{self.base_url}{link['href']}")
        # The trailing '-<id>' token of each sub-category URL is the slug
        # the redsky search API expects.
        matches = [m[0] for m in (re.findall(r'-(\w+)$', u) for u in urls) if m]
        print(matches)
        return matches

    def get_sub_category_page(self, url):
        """Page through the redsky search API for sub-category slug ``url``,
        scraping every product found. Returns False on fetch failure."""
        offset = 0
        print(url)
        while True:
            self.generate_visitor_id()
            # BUG FIX: ``key`` must be the redsky API key — the original
            # passed the visitor id here, which the API rejects.
            sub_category_url = (
                "https://redsky.target.com/redsky_aggregations/v1/web/plp_search_v2"
                f"?key={self.API_KEY}&category={url}&channel=WEB&count=24"
                "&default_purchasability_filter=true&include_sponsored=true"
                f"&new_search=false&offset={offset}&page=%2Fc%2F{url}"
                "&platform=desktop&pricing_store_id=146"
                "&store_ids=146%2C2240%2C1983%2C695%2C1059"
                "&useragent=Mozilla%2F5.0+%28Windows+NT+10.0%3B+Win64%3B+x64%29"
                "+AppleWebKit%2F537.36+%28KHTML%2C+like+Gecko%29"
                "+Chrome%2F118.0.0.0+Safari%2F537.36"
                f"&zip=37250&visitor_id={self.visitor_id}"
            )
            print(sub_category_url)
            json_response = self._get_json(sub_category_url, self.API_HEADERS)
            if json_response is None:
                return False
            try:
                products = json_response['data']['search']['products']
            except (KeyError, TypeError):
                break
            if not products:
                # Past the last page — the original looped forever here,
                # re-requesting ever-larger offsets.
                break
            for product in products:
                self.get_product_info(product['item']['enrichment']['buy_url'])
            offset += 24

    def run(self):
        """Crawl every configured category end to end."""
        for category in self.categories:
            for sub_category in self.get_category_page(category):
                self.get_sub_category_page(sub_category)
       


if __name__ == "__main__":
    # Script entry point: instantiate the scraper and crawl all categories.
    TargetScraper().run()