# Shopee product scraper — drives a headless Chrome browser via Selenium.
import time
from urllib.parse import quote_plus

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
# Module-level WebDriver shared by every scraping helper in this file.
# Stays None until start_driver() is called; close_driver() resets it.
DRIVER = None
# Wrapper to shut down the driver if it has been created
def close_driver():
    """Shut down the global WebDriver, if one is running.

    Safe to call when no driver exists. Uses quit() rather than close():
    close() only closes the current window and leaves the browser process
    and the chromedriver service running, which leaks a process per restart.
    """
    global DRIVER
    if DRIVER is not None:
        DRIVER.quit()  # terminate the whole browser + chromedriver service
        DRIVER = None
# Function to (re)start the global driver
def start_driver(force_restart=False):
    """Create the global headless Chrome WebDriver if needed.

    Args:
        force_restart: (bool) when True, shut down any existing driver and
            create a fresh one. When False, an already-running driver is
            reused instead of silently leaking a second browser process.
    """
    global DRIVER
    if force_restart:
        close_driver()
    elif DRIVER is not None:
        # A driver is already running and no restart was requested:
        # reuse it rather than orphaning the old browser process.
        return
    service = Service()
    options = Options()
    # Headless: no visible Chrome window opens; the browser runs in the background.
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    DRIVER = webdriver.Chrome(service=service, options=options)
### Function to extract product info from the necessary html tags
def get_shopee_product_info_single(product_element, extra_info):
    """
    Extract info from a single product card element.

    All XPath lookups below start with './/' so they are scoped to this
    product card. A bare '//' searches from the DOCUMENT root even when
    called on an element, which would return the first product's price,
    link and image for every card on the page.

    Args:
        product_element: (WebElement) the product card whose info needs to
            be extracted.
        extra_info: (bool) when True, additionally scrape the sales text
            ('sales') and the discount label ('discount').

    Returns:
        info: (dict) product info. Every product has at least: source,
            name, price, product_url, image. Fields that cannot be found
            keep their defaults ('' / -1).
    """
    info = {'source': 'shopee',
            'name': '',
            'price': -1,
            'product_url': '',
            'image': ''}
    # name
    try:
        product_title_element = product_element.find_element(By.CLASS_NAME, "line-clamp-2")
        info['name'] = product_title_element.text
    except NoSuchElementException:
        info['name'] = ""
    # price
    try:
        price_element = product_element.find_element(
            By.XPATH,
            './/div[@class="truncate flex items-baseline"]/span[@class="text-base/5 truncate"]')
        price_text = price_element.text
        # Prices look like '123.456 ...': take the first token and strip the
        # '.' thousands separators to get an integer amount of VND.
        info['price'] = int(price_text.split(" ")[0].replace('.', ''))
    except (NoSuchElementException, ValueError):
        pass
    # link
    try:
        product_link_element = product_element.find_element(By.XPATH, './/a[@class="contents"]')
        info['product_url'] = product_link_element.get_attribute("href")
    except NoSuchElementException:
        pass
    # thumbnail
    try:
        image_element = product_element.find_element(
            By.XPATH,
            './/img[@class="inset-y-0 w-full h-full pointer-events-none object-contain absolute"]')
        info['image'] = image_element.get_attribute("src")
    except NoSuchElementException:
        pass
    # If we decide to get extra information
    if extra_info:
        # sales (kept as the raw display text, e.g. 'Đã bán 1k')
        try:
            sold_element = product_element.find_element(
                By.XPATH,
                './/div[@class="truncate text-shopee-black87 text-xs min-h-4 flex-shrink-1"]')
            info['sales'] = sold_element.text
        except (NoSuchElementException, ValueError):
            info['sales'] = 0
        # discount label (e.g. '-20%')
        try:
            discount_element = product_element.find_element(
                By.XPATH,
                './/div[@class="truncate bg-shopee-voucher-yellow text-white leading-4 text-sp10"]')
            info['discount'] = discount_element.text
        except (NoSuchElementException, ValueError):
            info['discount'] = '0'
    return info
### Function to scrape all products from a page
def get_shopee_product_info_from_page(page_url, extra_info=False):
    """
    Extract info from all products of a specific page_url on the Shopee website.

    Args:
        page_url: (string) url of the search-results page to scrape.
        extra_info: (bool) forwarded to get_shopee_product_info_single().

    Returns:
        data: (list) a list of dictionaries of product info. If no products
            are found, returns an empty list.
    """
    data = []
    DRIVER.get(page_url)  # Use the global driver to load the product page
    time.sleep(3)  # crude wait for Shopee's client-side rendering to finish
    # By.CLASS_NAME accepts a SINGLE class name only; the compound string
    # 'col-xs-2-4 shopee-search-item-result__item' is an invalid selector.
    # Use a CSS selector that requires both classes instead.
    products = DRIVER.find_elements(
        By.CSS_SELECTOR, 'li.col-xs-2-4.shopee-search-item-result__item')
    print(f'Found {len(products)} products')
    for product in products:
        data.append(get_shopee_product_info_single(product, extra_info))
    return data
### Function to get product info for a search keyword, page by page
def get_shopee_product_info_from_category(search_product, max_page=0, extra_info=False):
    '''
    Scrape multiple result pages for a search keyword.
    Uses get_shopee_product_info_from_page().

    Args:
        search_product: (string) the keyword to search for (URL-encoded here,
            so spaces and non-ASCII characters are safe).
        max_page: (int) maximum number of extra pages to scrape after the
            first. Default value is 0 to scrape until an empty page.
        extra_info: (bool) forwarded to the per-page scraper.

    Returns:
        products: a list in which every element is a dictionary of one
            product's information.
    '''
    products = []
    page_n = 0
    # Encode the keyword so it survives the query string intact.
    base_url = 'https://shopee.vn/search?keyword=' + quote_plus(search_product)
    product_list = get_shopee_product_info_from_page(base_url, extra_info=extra_info)
    while len(product_list) > 0:
        products.extend(product_list)
        page_n += 1
        # Stop according to max_page (0 means no limit).
        if max_page > 0 and page_n > max_page:
            break
        # Rebuild the URL from the base every time. Appending to the
        # previous URL (as before) accumulated '&page=1&page=2&page=3...'
        # query parameters, so later requests hit a malformed URL.
        page_url = base_url + f'&page={page_n}'
        product_list = get_shopee_product_info_from_page(page_url, extra_info=extra_info)
    return products
def scrap_shopee(search_product, num_max_page, extra_info):
    """
    Top-level entry point: scrape Shopee search results for a keyword.

    Starts a fresh headless browser, scrapes up to num_max_page pages of
    results, and always shuts the browser down — even if scraping raises —
    so no chromedriver processes are leaked.

    Args:
        search_product: (string) keyword to search for.
        num_max_page: (int) maximum number of pages to scrape (0 = all).
        extra_info: (bool) also collect sales / discount data per product.

    Returns:
        prod_data: (list) one info dictionary per scraped product.
    """
    start_driver(force_restart=True)
    try:
        # STORE PRODUCT INFO DICTIONARIES IN HERE
        prod_data = get_shopee_product_info_from_category(
            search_product, num_max_page, extra_info=extra_info)
    finally:
        close_driver()  # Close driver when we're done, even on error
    return prod_data