# Shopee product scraper — drives a headless Chrome browser via Selenium.
import time
from urllib.parse import quote_plus

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
# Module-level WebDriver shared by every scraping helper in this file.
# Stays None until start_driver() is called; close_driver() resets it.
DRIVER = None
# Wrapper to shut down the driver if it has been created
def close_driver():
    """Shut down the global WebDriver, if one is running.

    Safe to call when no driver exists. Uses quit() rather than close():
    close() only closes the current window and leaves the browser process
    and the chromedriver service running, which leaks a process per restart.
    """
    global DRIVER
    if DRIVER is not None:
        DRIVER.quit()  # terminate the whole browser + chromedriver service
        DRIVER = None
# Function to (re)start the global driver
def start_driver(force_restart=False):
    """Create the global headless Chrome WebDriver if needed.

    Args:
        force_restart: (bool) when True, shut down any existing driver and
            create a fresh one. When False, an already-running driver is
            reused instead of silently leaking a second browser process.
    """
    global DRIVER
    if force_restart:
        close_driver()
    elif DRIVER is not None:
        # A driver is already running and no restart was requested:
        # reuse it rather than orphaning the old browser process.
        return
    service = Service()
    options = Options()
    # Headless: no visible Chrome window opens; the browser runs in the background.
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    DRIVER = webdriver.Chrome(service=service, options=options)
### Function to extract product info from the necessary html tags
def get_shopee_product_info_single(product_element, extra_info):
    """
    Extract info from a single product card element.

    All XPath lookups below start with './/' so they are scoped to this
    product card. A bare '//' searches from the DOCUMENT root even when
    called on an element, which would return the first product's price,
    link and image for every card on the page.

    Args:
        product_element: (WebElement) the product card whose info needs to
            be extracted.
        extra_info: (bool) when True, additionally scrape the sales text
            ('sales') and the discount label ('discount').

    Returns:
        info: (dict) product info. Every product has at least: source,
            name, price, product_url, image. Fields that cannot be found
            keep their defaults ('' / -1).
    """
    info = {'source': 'shopee',
            'name': '',
            'price': -1,
            'product_url': '',
            'image': ''}
    # name
    try:
        product_title_element = product_element.find_element(By.CLASS_NAME, "line-clamp-2")
        info['name'] = product_title_element.text
    except NoSuchElementException:
        info['name'] = ""
    # price
    try:
        price_element = product_element.find_element(
            By.XPATH,
            './/div[@class="truncate flex items-baseline"]/span[@class="text-base/5 truncate"]')
        price_text = price_element.text
        # Prices look like '123.456 ...': take the first token and strip the
        # '.' thousands separators to get an integer amount of VND.
        info['price'] = int(price_text.split(" ")[0].replace('.', ''))
    except (NoSuchElementException, ValueError):
        pass
    # link
    try:
        product_link_element = product_element.find_element(By.XPATH, './/a[@class="contents"]')
        info['product_url'] = product_link_element.get_attribute("href")
    except NoSuchElementException:
        pass
    # thumbnail
    try:
        image_element = product_element.find_element(
            By.XPATH,
            './/img[@class="inset-y-0 w-full h-full pointer-events-none object-contain absolute"]')
        info['image'] = image_element.get_attribute("src")
    except NoSuchElementException:
        pass
    # If we decide to get extra information
    if extra_info:
        # sales (kept as the raw display text, e.g. 'Đã bán 1k')
        try:
            sold_element = product_element.find_element(
                By.XPATH,
                './/div[@class="truncate text-shopee-black87 text-xs min-h-4 flex-shrink-1"]')
            info['sales'] = sold_element.text
        except (NoSuchElementException, ValueError):
            info['sales'] = 0
        # discount label (e.g. '-20%')
        try:
            discount_element = product_element.find_element(
                By.XPATH,
                './/div[@class="truncate bg-shopee-voucher-yellow text-white leading-4 text-sp10"]')
            info['discount'] = discount_element.text
        except (NoSuchElementException, ValueError):
            info['discount'] = '0'
    return info
### Function to scrape all products from a page
def get_shopee_product_info_from_page(page_url, extra_info=False):
    """
    Extract info from all products of a specific page_url on the Shopee website.

    Args:
        page_url: (string) url of the search-results page to scrape.
        extra_info: (bool) forwarded to get_shopee_product_info_single().

    Returns:
        data: (list) a list of dictionaries of product info. If no products
            are found, returns an empty list.
    """
    data = []
    DRIVER.get(page_url)  # Use the global driver to load the product page
    time.sleep(3)  # crude wait for Shopee's client-side rendering to finish
    # By.CLASS_NAME accepts a SINGLE class name only; the compound string
    # 'col-xs-2-4 shopee-search-item-result__item' is an invalid selector.
    # Use a CSS selector that requires both classes instead.
    products = DRIVER.find_elements(
        By.CSS_SELECTOR, 'li.col-xs-2-4.shopee-search-item-result__item')
    print(f'Found {len(products)} products')
    for product in products:
        data.append(get_shopee_product_info_single(product, extra_info))
    return data
### Function to get product info for a search keyword, page by page
def get_shopee_product_info_from_category(search_product, max_page=0, extra_info=False):
    '''
    Scrape multiple result pages for a search keyword.
    Uses get_shopee_product_info_from_page().

    Args:
        search_product: (string) the keyword to search for (URL-encoded here,
            so spaces and non-ASCII characters are safe).
        max_page: (int) maximum number of extra pages to scrape after the
            first. Default value is 0 to scrape until an empty page.
        extra_info: (bool) forwarded to the per-page scraper.

    Returns:
        products: a list in which every element is a dictionary of one
            product's information.
    '''
    products = []
    page_n = 0
    # Encode the keyword so it survives the query string intact.
    base_url = 'https://shopee.vn/search?keyword=' + quote_plus(search_product)
    product_list = get_shopee_product_info_from_page(base_url, extra_info=extra_info)
    while len(product_list) > 0:
        products.extend(product_list)
        page_n += 1
        # Stop according to max_page (0 means no limit).
        if max_page > 0 and page_n > max_page:
            break
        # Rebuild the URL from the base every time. Appending to the
        # previous URL (as before) accumulated '&page=1&page=2&page=3...'
        # query parameters, so later requests hit a malformed URL.
        page_url = base_url + f'&page={page_n}'
        product_list = get_shopee_product_info_from_page(page_url, extra_info=extra_info)
    return products
def scrap_shopee(search_product, num_max_page, extra_info):
    """
    Top-level entry point: scrape Shopee search results for a keyword.

    Starts a fresh headless browser, scrapes up to num_max_page pages of
    results, and always shuts the browser down — even if scraping raises —
    so no chromedriver processes are leaked.

    Args:
        search_product: (string) keyword to search for.
        num_max_page: (int) maximum number of pages to scrape (0 = all).
        extra_info: (bool) also collect sales / discount data per product.

    Returns:
        prod_data: (list) one info dictionary per scraped product.
    """
    start_driver(force_restart=True)
    try:
        # STORE PRODUCT INFO DICTIONARIES IN HERE
        prod_data = get_shopee_product_info_from_category(
            search_product, num_max_page, extra_info=extra_info)
    finally:
        close_driver()  # Close driver when we're done, even on error
    return prod_data