Commit 39a482a · Linh Vuu committed
Parent(s): a088ba6

added files

Files changed:
- README copy.md +12 -0
- app.py +85 -0
- requirements.txt +4 -0
- scraper_lazada.py +208 -0
- scraper_shopee.py +229 -0
- scraper_tiki.py +283 -0
README copy.md
ADDED
@@ -0,0 +1,12 @@
---
title: PriceComparison
emoji: 👀
colorFrom: pink
colorTo: yellow
sdk: streamlit
sdk_version: 1.33.0
app_file: app.py
pinned: false
---

Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py
ADDED
@@ -0,0 +1,85 @@
from scraper_tiki import *
from scraper_lazada import *
from scraper_shopee import *
import pandas as pd
import streamlit as st

# # Test Tiki
# start_driver()
# DRIVER.get('https://tiki.vn/search?sort=price%2Casc&q=megaduo')
# time.sleep(3)
# products = DRIVER.find_elements(By.CLASS_NAME, 'product-item')
# product = products[2]
# info = get_tiki_product_info_single(product, True)
# print(info)

# # Test Lazada
# start_driver()
# DRIVER.get('https://www.lazada.vn/catalog/?page=1&q=megaduo&sort=priceasc')
# time.sleep(3)
# products = DRIVER.find_elements(By.CLASS_NAME, 'Bm3ON')
# product = products[2]
# info = get_lazada_product_info_single(product, True)
# print(info)

def main():

    st.subheader("Price Comparison (So Sánh Giá)")

    with st.form(key="user_input_form"):
        search_product = st.text_input("What would you like to buy? (Bạn muốn mua gì?)")
        submit_button = st.form_submit_button(label="Search")

    if submit_button:
        print('Scraping', search_product)
        # search_product = "megaduo"
        # search_product = input("Search for what? ")
        num_max_page = 1
        extra_info = True
        n_products_to_view = 5  # Change this as you like to check more products
        col_to_display = ['name', 'price', 'product_url', 'image']

        st.subheader("Shopee")
        shopee_data = scrap_shopee(search_product, num_max_page, extra_info)
        if shopee_data:
            df_shopee = pd.DataFrame(data=shopee_data, columns=shopee_data[0].keys())
            print(df_shopee.head())
            st.write(df_shopee[col_to_display].sort_values(by='price').head(n_products_to_view))
        else:
            df_shopee = pd.DataFrame(columns=col_to_display)
            st.write("Not found.")

        st.subheader("Lazada")
        lazada_data = scrap_lazada(search_product, num_max_page, extra_info)
        if lazada_data:
            df_lazada = pd.DataFrame(data=lazada_data, columns=lazada_data[0].keys())
            print(df_lazada.head())
            st.write(df_lazada[col_to_display].sort_values(by='price').head(n_products_to_view))
        else:
            df_lazada = pd.DataFrame(columns=col_to_display)
            st.write("Not found.")

        st.subheader("Tiki")
        tiki_data = scrap_tiki(search_product, num_max_page, extra_info)
        if tiki_data:
            df_tiki = pd.DataFrame(data=tiki_data, columns=tiki_data[0].keys())
            print(df_tiki.head())
            st.write(df_tiki[col_to_display].sort_values(by='price').head(n_products_to_view))
        else:
            df_tiki = pd.DataFrame(columns=col_to_display)
            st.write("Not found.")

        # Merge the three dataframes
        merged_df = pd.concat([df_tiki, df_lazada, df_shopee])

        # Sort the merged dataframe by price
        sorted_merged_df = merged_df.sort_values(by='price')

        print(sorted_merged_df.head(n_products_to_view))
        st.subheader("All sites, sorted by price ascending (Sắp xếp theo giá tăng dần)")
        st.write(sorted_merged_df.head(n_products_to_view))

if __name__ == "__main__":
    main()
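A minimal way to exercise one of these scrapers outside Streamlit, assuming Chrome and a matching chromedriver are installed (scrap_tiki and the keys of the returned dictionaries come from scraper_tiki.py, added below):

from scraper_tiki import scrap_tiki

# Each element is a dict with at least: source, name, price, product_url, image
results = scrap_tiki('megaduo', num_max_page=1, extra_info=False)
for item in results[:3]:
    print(item['name'], item['price'], item['product_url'])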
requirements.txt
ADDED
@@ -0,0 +1,4 @@
selenium
pandas
streamlit==1.13.0
altair==4.1.0
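These dependencies can be installed with pip install -r requirements.txt. Selenium additionally assumes a Chrome binary and a compatible chromedriver are available at runtime, since every scraper module starts the browser via webdriver.Chrome().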
scraper_lazada.py
ADDED
@@ -0,0 +1,208 @@
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
import time

# Global driver to use throughout the script
DRIVER = None

# Wrapper to close the driver if it has been created
def close_driver():
    global DRIVER
    if DRIVER is not None:
        DRIVER.close()
        DRIVER = None

# Function to (re)start the driver
def start_driver(force_restart=False):
    global DRIVER

    if force_restart:
        close_driver()

    # Set up the driver to run Chrome headless, i.e. in the background
    # without opening a browser window
    options = Options()
    options.add_argument('-headless')
    options.add_argument('-no-sandbox')
    options.add_argument('-disable-dev-shm-usage')

    DRIVER = webdriver.Chrome(options=options)

### Function to extract product info from the necessary html and json tags
def get_lazada_product_info_single(product_element, extra_info):
    """
    Extract info from a single product element from the driver.
    Args:
        product_element: (WebDriverElement) the product whose info needs to be
        extracted.
    Returns:
        info: (dict) a dictionary of info of the product. Every product
        should at least have four pieces of information: name, price,
        link to the product page, and link to the product image.
    """
    info = {'source': 'lazada',
            'name': '',
            'price': -1,
            'product_url': '',
            'image': ''}

    # print(product_element.get_attribute('outerHTML'))

    # name
    try:
        # Find the <a> element within the <div class="RfADt">. Note the
        # leading "." so the XPath is scoped to this product element rather
        # than the whole page.
        product_title_element = product_element.find_element(By.XPATH, ".//div[@class='RfADt']/a")

        # Get the text content of the <a> element
        info['name'] = product_title_element.text

    except NoSuchElementException:
        info['name'] = ""

    # price
    try:
        # Find the <span> element with class "ooOxS" within the <div class="aBrP0">
        price_element = product_element.find_element(By.XPATH, ".//div[@class='aBrP0']/span[@class='ooOxS']")

        # Get the text content of the <span> element
        price_text = price_element.text

        # Extract the price value
        info['price'] = int(price_text.split(" ")[0].replace('.', ''))

    except (NoSuchElementException, ValueError):
        pass

    # link
    try:
        # Find the <a> element within the <div class="RfADt">
        product_link_element = product_element.find_element(By.XPATH, ".//div[@class='RfADt']/a")

        # Get the href attribute of the <a> element
        product_link = product_link_element.get_attribute("href")

        # Extract the URL from the href attribute (drop the scheme)
        info['product_url'] = product_link.split("//")[1]

    except NoSuchElementException:
        pass

    # thumbnail
    try:
        # Find the <img> element within the <div class="_95X4G">
        image_element = product_element.find_element(By.XPATH, ".//div[@class='_95X4G']/a/div/img")

        # Get the src attribute of the <img> element
        info['image'] = image_element.get_attribute("src")

    except NoSuchElementException:
        pass

    # If we decide to get extra information
    if extra_info:
        # sales
        try:
            # Find the <span> element within the <div class="_6uN7R">
            sold_element = product_element.find_element(By.XPATH, ".//div[@class='_6uN7R']/span[@class='_1cEkb']/span[1]")

            # Get the text content of the <span> element
            info['sales'] = sold_element.text

        except (NoSuchElementException, ValueError):
            info['sales'] = 0

        # discount
        try:
            # Find the <span> element within the <div class="WNoq3">
            discount_element = product_element.find_element(By.XPATH, ".//div[@class='WNoq3']/span[@class='IcOsH']")

            # Get the text content of the <span> element
            info['discount'] = discount_element.text

        except (NoSuchElementException, ValueError):
            info['discount'] = '0'

    return info

### Function to scrape all products from a page
def get_lazada_product_info_from_page(page_url, extra_info=False):
    """
    Extract info from all products of a specific page_url on the Lazada website
    Args:
        page_url: (string) url of the page to scrape
    Returns:
        data: (list) a list of dictionaries of product info. If no products
        are found, return an empty list.
    """
    global DRIVER

    data = []
    DRIVER.get(page_url)  # Use the driver to get info from the product page
    time.sleep(3)

    try:
        # no_product_found = bool(DRIVER.find_element(By.XPATH, "//div[@class='style__StyledNotFoundProductView-sc-1uz0b49-0']"))
        no_product_found = bool(DRIVER.find_element(By.CLASS_NAME, 'style__StyledNotFoundProductView-sc-1uz0b49-0'))
        print("EMPTY PAGE")
        return data
    except NoSuchElementException:
        no_product_found = False

    # FIND ALL PRODUCT ITEMS
    products = DRIVER.find_elements(By.CLASS_NAME, 'Bm3ON')
    print(f'Found {len(products)} products')

    if (not no_product_found) and len(products) > 0:
        for i in products:
            product_dict = get_lazada_product_info_single(i, extra_info)
            data.append(product_dict)
    return data

### Function to get product info from a main category
def get_lazada_product_info_from_category(cat_url, max_page=0, extra_info=False):
    '''
    Scrape multiple pages of products of a category.
    Uses get_lazada_product_info_from_page().

    Args:
        cat_url: (string) a url string of a category
        max_page: (int) an integer denoting the maximum number of pages to scrape.
        Default value is 0 to scrape all pages.
    Returns:
        products: a list in which every element is a dictionary of one product's information
    '''
    products = []

    page_n = 1
    # cat_url may already contain a query string (e.g. '?q=...'), so append
    # further parameters with '&' in that case
    sep = '&' if '?' in cat_url else '?'
    cat_page_url = cat_url + f'{sep}page={page_n}'
    product_list = get_lazada_product_info_from_page(cat_page_url, extra_info=extra_info)

    while len(product_list) > 0:
        products.extend(product_list)
        page_n += 1

        # stop_flag = False if max_page <= 0 else (page_n > max_page)
        stop_flag = max_page > 0 and page_n > max_page  # For stopping the scrape according to max_page
        if stop_flag:
            break

        cat_page_url = cat_url + f'{sep}page={page_n}'
        product_list = get_lazada_product_info_from_page(cat_page_url, extra_info=extra_info)

    return products

def scrap_lazada(search_product, num_max_page, extra_info):

    start_driver(force_restart=True)

    url = 'https://www.lazada.vn/catalog/?q=' + search_product

    prod_data = []  # STORE YOUR PRODUCT INFO DICTIONARIES IN HERE

    # prod_per_cat = get_product_info_from_category(main_cat['URL'], num_max_page, extra_info=extra_info)
    prod_per_cat = get_lazada_product_info_from_category(url, num_max_page, extra_info=extra_info)
    prod_data.extend(prod_per_cat)

    close_driver()  # Close driver when we're done

    return prod_data
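The price parsing in get_lazada_product_info_single assumes Lazada renders prices with a dot as the thousands separator; a small sketch of that step with a hypothetical price string:

price_text = '120.000 ₫'  # hypothetical sample; the real text comes from the 'ooOxS' span
price = int(price_text.split(' ')[0].replace('.', ''))
print(price)  # 120000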
scraper_shopee.py
ADDED
@@ -0,0 +1,229 @@
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

# Global driver to use throughout the script
DRIVER = None

# Wrapper to close the driver if it has been created
def close_driver():
    global DRIVER
    if DRIVER is not None:
        DRIVER.close()
        DRIVER = None

# Function to (re)start the driver
def start_driver(force_restart=False):
    global DRIVER

    if force_restart:
        close_driver()

    # Set up the driver to run Chrome headless, i.e. in the background
    # without opening a browser window
    options = Options()
    options.add_argument('-headless')
    options.add_argument('-no-sandbox')
    options.add_argument('-disable-dev-shm-usage')

    DRIVER = webdriver.Chrome(options=options)

### Function to extract product info from the necessary html and json tags
def get_shopee_product_info_single(product_element, extra_info):
    """
    Extract info from a single product element from the driver.
    Args:
        product_element: (WebDriverElement) the product whose info needs to be
        extracted.
    Returns:
        info: (dict) a dictionary of info of the product. Every product
        should at least have four pieces of information: name, price,
        link to the product page, and link to the product image.
    """
    info = {'source': 'shopee',
            'name': '',
            'price': -1,
            'product_url': '',
            'image': ''}
    print(product_element.get_attribute('outerHTML'))

    # name
    try:
        # Find the title element within the product card
        product_title_element = product_element.find_element(By.CLASS_NAME, "line-clamp-2")

        # Get the text content of the element
        info['name'] = product_title_element.text
        print(info['name'])

    except NoSuchElementException:
        info['name'] = ""

    # price
    try:
        # Find the price <span>; the leading "." keeps the XPath scoped to
        # this product element rather than the whole page
        price_element = product_element.find_element(By.XPATH, './/div[@class="truncate flex items-baseline"]/span[@class="text-base/5 truncate"]')

        # Get the text content of the <span> element
        price_text = price_element.text

        # Extract the price value
        info['price'] = int(price_text.split(" ")[0].replace('.', ''))
        print(info['price'])

    except (NoSuchElementException, ValueError):
        pass

    # link
    try:
        # Find the <a> element wrapping the product card
        product_link_element = product_element.find_element(By.XPATH, './/a[@class="contents"]')

        # Get the href attribute of the <a> element
        product_link = product_link_element.get_attribute("href")

        info['product_url'] = product_link

    except NoSuchElementException:
        pass

    # thumbnail
    try:
        # Find the <img> element of the product thumbnail
        image_element = product_element.find_element(By.XPATH, './/img[@class="inset-y-0 w-full h-full pointer-events-none object-contain absolute"]')

        # Get the src attribute of the <img> element
        info['image'] = image_element.get_attribute("src")

    except NoSuchElementException:
        pass

    # If we decide to get extra information
    if extra_info:
        # sales
        try:
            sold_element = product_element.find_element(By.XPATH, './/div[@class="truncate text-shopee-black87 text-xs min-h-4 flex-shrink-1"]')

            # Get the text content of the element
            info['sales'] = sold_element.text

        except (NoSuchElementException, ValueError):
            info['sales'] = 0

        # discount
        try:
            discount_element = product_element.find_element(By.XPATH, './/div[@class="truncate bg-shopee-voucher-yellow text-white leading-4 text-sp10"]')

            # Get the text content of the element
            info['discount'] = discount_element.text

        except (NoSuchElementException, ValueError):
            info['discount'] = '0'

    return info

### Function to scrape all products from a page
def get_shopee_product_info_from_page(page_url, extra_info=False):
    """
    Extract info from all products of a specific page_url on the Shopee website
    Args:
        page_url: (string) url of the page to scrape
    Returns:
        data: (list) a list of dictionaries of product info. If no products
        are found, return an empty list.
    """
    global DRIVER

    data = []
    DRIVER.get(page_url)  # Use the driver to get info from the product page
    time.sleep(3)

    try:
        # no_product_found = bool(DRIVER.find_element(By.XPATH, "//div[@class='style__StyledNotFoundProductView-sc-1uz0b49-0']"))
        no_product_found = bool(DRIVER.find_element(By.CLASS_NAME, 'style__StyledNotFoundProductView-sc-1uz0b49-0'))
        print("EMPTY PAGE")
        return data
    except NoSuchElementException:
        no_product_found = False

    # FIND ALL PRODUCT ITEMS
    # products = DRIVER.find_elements(By.XPATH, "//a[@class='product-item']")
    # By.CLASS_NAME cannot match the compound class
    # 'col-xs-2-4 shopee-search-item-result__item', so use a CSS selector
    products = DRIVER.find_elements(By.CSS_SELECTOR, '.col-xs-2-4.shopee-search-item-result__item')
    print(f'Found {len(products)} products')
    print(products)

    if (not no_product_found) and len(products) > 0:
        for i in products:
            product_dict = get_shopee_product_info_single(i, extra_info)
            print(i)
            print(product_dict)
            data.append(product_dict)
    return data

### Function to get product info from a main category
def get_shopee_product_info_from_category(cat_url, max_page=0, extra_info=False):
    '''
    Scrape multiple pages of products of a category.
    Uses get_shopee_product_info_from_page().

    Args:
        cat_url: (string) a url string of a category
        max_page: (int) an integer denoting the maximum number of pages to scrape.
        Default value is 0 to scrape all pages.
    Returns:
        products: a list in which every element is a dictionary of one product's information
    '''
    products = []

    page_n = 1
    # cat_url may already contain a query string (e.g. '?keyword=...'),
    # so append further parameters with '&' in that case
    sep = '&' if '?' in cat_url else '?'
    cat_page_url = cat_url + f'{sep}page={page_n}'
    product_list = get_shopee_product_info_from_page(cat_page_url, extra_info=extra_info)

    while len(product_list) > 0:
        products.extend(product_list)
        page_n += 1

        # stop_flag = False if max_page <= 0 else (page_n > max_page)
        stop_flag = max_page > 0 and page_n > max_page  # For stopping the scrape according to max_page
        if stop_flag:
            break

        cat_page_url = cat_url + f'{sep}page={page_n}'
        product_list = get_shopee_product_info_from_page(cat_page_url, extra_info=extra_info)

    return products

def scrap_shopee(search_product, num_max_page, extra_info):

    # # Test Shopee
    # start_driver()
    # URL = 'https://shopee.vn/search?keyword=megaduo&page=0&sortBy=relevancy'
    # DRIVER.get(URL)
    # time.sleep(3)
    # print(URL)
    # products = DRIVER.find_elements(By.CLASS_NAME, 'shopee-search-item-result')
    # # products = DRIVER.find_element("css selector", 'li[class="col-xs-2-4 shopee-search-item-result__item"]')
    # product = products[0]
    # # Wait for the element to be present on the page
    # info = get_shopee_product_info_single(product, True)
    # print(info)

    start_driver(force_restart=True)

    url = 'https://shopee.vn/search?keyword=' + search_product

    prod_data = []  # STORE YOUR PRODUCT INFO DICTIONARIES IN HERE

    # prod_per_cat = get_product_info_from_category(main_cat['URL'], num_max_page, extra_info=extra_info)
    prod_per_cat = get_shopee_product_info_from_category(url, num_max_page, extra_info=extra_info)
    prod_data.extend(prod_per_cat)

    close_driver()  # Close driver when we're done

    return prod_data
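The pagination in get_shopee_product_info_from_category (and its Lazada and Tiki twins) keeps requesting pages until one comes back empty or the max_page cap is exceeded, with max_page <= 0 meaning no cap. A tiny sketch of that stop condition:

def should_stop(page_n, max_page):
    # Mirrors the loop's stop_flag above
    return max_page > 0 and page_n > max_page

print(should_stop(2, 1))   # True: the one-page cap has been reached
print(should_stop(50, 0))  # False: max_page=0 scrapes until an empty page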
scraper_tiki.py
ADDED
@@ -0,0 +1,283 @@
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
import time

# Global driver to use throughout the script
DRIVER = None

# Wrapper to close the driver if it has been created
def close_driver():
    global DRIVER
    if DRIVER is not None:
        DRIVER.close()
        DRIVER = None

# Function to (re)start the driver
def start_driver(force_restart=False):
    global DRIVER

    if force_restart:
        close_driver()

    # Set up the driver to run Chrome headless, i.e. in the background
    # without opening a browser window
    options = Options()
    options.add_argument('-headless')
    options.add_argument('-no-sandbox')
    options.add_argument('-disable-dev-shm-usage')

    DRIVER = webdriver.Chrome(options=options)

### Function to extract product info from the necessary html and json tags
def get_tiki_product_info_single(product_element, extra_info):
    """
    Extract info from a single product element from the driver.
    Args:
        product_element: (WebDriverElement) the product whose info needs to be
        extracted.
    Returns:
        info: (dict) a dictionary of info of the product. Every product
        should at least have four pieces of information: name, price,
        link to the product page, and link to the product image.
    """
    info = {'source': 'tiki',
            'name': '',
            'price': -1,
            'product_url': '',
            'image': ''}
    # print(product_element.get_attribute('outerHTML'))

    # name
    try:
        # name = product_element.find_element(By.XPATH, ".//div[@class='name']/h3")
        # name = product_element.find_element(By.CLASS_NAME, 'style__NameStyled-sc-139nb47-8 ibOlar').find_element(By.TAG_NAME, 'h3')

        name = product_element.find_element(By.CLASS_NAME, 'name').find_element(By.TAG_NAME, 'h3')

        info['name'] = name.get_attribute('innerHTML').strip()

    except NoSuchElementException:

        # Find the <h3> element by class name
        name = product_element.find_element(By.CLASS_NAME, 'style__NameStyled-sc-139nb47-8')

        # Get the text content of the element
        info['name'] = name.text

    # price = product_element.find_element(By.CLASS_NAME, 'price-discount__price').get_attribute('innerHTML')
    # print(price)

    # price
    try:
        # price = product_element.find_element(By.CLASS_NAME, 'price-discount__price').get_attribute('innerHTML').strip()
        price = product_element.find_element(By.CLASS_NAME, 'price-discount__price').get_attribute('innerHTML')
        # price = product_element.find_element(By.XPATH, ".//div[@class='price-discount__price']").get_attribute('innerHTML')

        info['price'] = int(price.replace('<sup>₫</sup>', '').replace('.', ''))
        # info['price'] = int(re.sub(r'[\.\s₫]', '', price))  # With regex
        # info['price'] = int(''.join([c for c in price if c not in '.₫ ']))  # Without regex
    except (NoSuchElementException, ValueError):
        pass

    # link
    try:
        product_link = product_element.get_attribute('href')
        info['product_url'] = product_link
    except NoSuchElementException:
        pass

    # thumbnail
    try:
        # thumbnail = product_element.find_elements(By.XPATH, ".//div[@class='thumbnail']//child::img")[-1]

        # thumbnail = product_element.find_element(By.CLASS_NAME, 'thumbnail').find_element(By.TAG_NAME, 'img')
        # info['image'] = thumbnail.get_attribute('src')

        # Find the <div> element with class "image-wrapper"
        image_div = product_element.find_element(By.CLASS_NAME, 'image-wrapper')

        # Find the <img> element within the <div> element
        img_element = image_div.find_element(By.TAG_NAME, 'img')

        # Get the value of the "srcset" attribute
        srcset_value = img_element.get_attribute('srcset')

        # Extract the link of the image from the srcset value
        image_link = srcset_value.split(',')[0].split(' ')[0]
        info['image'] = image_link

    except NoSuchElementException:
        pass

    # If we decide to get extra information
    if extra_info:
        # sales
        try:
            # sales_elem = product_element.find_element(By.XPATH, ".//div[@class='styles__StyledQtySold-sc-732h27-2']")
            # sales_elem = product_element.find_element(By.CLASS_NAME, 'quantity has-border')
            # info['sales'] = sales_elem
            # info['sales'] = int(re.sub(r'\D', '', sales_elem.get_attribute('innerHTML')))

            # Find the <span> element with class "quantity"
            quantity_span = product_element.find_element(By.CLASS_NAME, 'quantity')

            # Get the text content of the element
            info['sales'] = quantity_span.text

        except (NoSuchElementException, ValueError):
            info['sales'] = 0

        # # rating
        # try:
        #     # rating = product_element.find_element(By.XPATH, ".//div[@class='average']").get_attribute('style')
        #     rating = product_element.find_element(By.CLASS_NAME, 'average').get_attribute('style')
        #     # info['rating'] = float(re.sub(r'\D','', rating))/100*5  # With regex
        #     info['rating'] = float(''.join([c for c in rating if c.isdigit()]))/100*5  # Without regex
        # except NoSuchElementException:
        #     info['rating'] = 0

        # discount
        try:
            # Try to get the discount using the class name
            discount = product_element.find_element(By.CLASS_NAME, 'price-discount__discount').get_attribute('innerHTML')
            info['discount'] = discount.replace('-', '')  # Remove any dashes

        except (NoSuchElementException, ValueError):
            try:
                # Try to get the discount using another method
                discount_div = product_element.find_element(By.CLASS_NAME, 'style__DiscountPercentStyled-sc-e9h7mj-1')
                info['discount'] = discount_div.text.replace('-', '')  # Remove any dashes

            except NoSuchElementException:
                # If both attempts fail, set the discount to 0
                info['discount'] = '0'

        # # tiki now
        # try:
        #     info['tiki_now'] = bool(product_element.find_element(By.CLASS_NAME, 'badge-service').find_element(By.CLASS_NAME, 'item'))
        # except NoSuchElementException:
        #     info['tiki_now'] = False

        # # freeship, official seller, and/or trusted seller
        # try:
        #     info['freeship'] = False
        #     info['official'] = False
        #     info['trusted'] = False
        #     thumbnail_tag = product_element.find_element(By.CLASS_NAME, 'thumbnail')
        #     list_img = thumbnail_tag.find_elements(By.TAG_NAME, 'img')
        #     # list_img = product_element.find_elements(By.XPATH, ".//div[@class='thumbnail']/img")
        #     for img in list_img:
        #         if img.get_attribute('src') == 'https://salt.tikicdn.com/ts/upload/dc/0d/49/3251737db2de83b74eba8a9ad6d03338.png':
        #             info['freeship'] = True
        #         elif img.get_attribute('src') == 'https://salt.tikicdn.com/ts/upload/b9/1f/4b/557eac9c67a4466ccebfa74cde854215.png':
        #             info['official'] = True
        #         elif img.get_attribute('src') == 'https://salt.tikicdn.com/ts/upload/e0/41/da/bb0fc684a838eff5e264ce0534a148f0.png':
        #             info['trusted'] = True
        # except NoSuchElementException:
        #     pass

        # # under price
        # try:
        #     # info['under_price'] = bool(product_element.find_element(By.XPATH, ".//div[@class='badge-under-price']/child::div[@class='item']"))
        #     info['under_price'] = bool(product_element.find_element(By.CLASS_NAME, 'badge-under-price').find_element(By.CLASS_NAME, 'item'))
        # except NoSuchElementException:
        #     info['under_price'] = False

        # # installment
        # try:
        #     # info['installment'] = bool(product_element.find_element(By.XPATH, ".//div[@class='badge-benefits']//child::img[1]"))
        #     info['installment'] = bool(product_element.find_element(By.CLASS_NAME, 'badge-benefits').find_element(By.TAG_NAME, 'img'))
        # except NoSuchElementException:
        #     info['installment'] = False

        # # gift
        # try:
        #     # info['gift'] = bool(product_element.find_element(By.XPATH, ".//div[@class='freegift-list']"))
        #     info['gift'] = bool(product_element.find_element(By.CLASS_NAME, 'freegift-list'))
        # except NoSuchElementException:
        #     info['gift'] = False

    return info


### Function to scrape all products from a page
def get_tiki_product_info_from_page(page_url, extra_info=False):
    """
    Extract info from all products of a specific page_url on the Tiki website
    Args:
        page_url: (string) url of the page to scrape
    Returns:
        data: (list) a list of dictionaries of product info. If no products
        are found, return an empty list.
    """
    global DRIVER

    data = []
    DRIVER.get(page_url)  # Use the driver to get info from the product page
    time.sleep(3)

    try:
        # no_product_found = bool(DRIVER.find_element(By.XPATH, "//div[@class='style__StyledNotFoundProductView-sc-1uz0b49-0']"))
        no_product_found = bool(DRIVER.find_element(By.CLASS_NAME, 'style__StyledNotFoundProductView-sc-1uz0b49-0'))
        print("EMPTY PAGE")
        return data
    except NoSuchElementException:
        no_product_found = False

    # FIND ALL PRODUCT ITEMS
    # products = DRIVER.find_elements(By.XPATH, "//a[@class='product-item']")
    products = DRIVER.find_elements(By.CLASS_NAME, 'product-item')
    print(f'Found {len(products)} products')

    if (not no_product_found) and len(products) > 0:
        for i in products:
            product_dict = get_tiki_product_info_single(i, extra_info)
            data.append(product_dict)
    return data

### Function to get product info from a main category
def get_tiki_product_info_from_category(cat_url, max_page=0, extra_info=False):
    '''
    Scrape multiple pages of products of a category.
    Uses get_tiki_product_info_from_page().

    Args:
        cat_url: (string) a url string of a category
        max_page: (int) an integer denoting the maximum number of pages to scrape.
        Default value is 0 to scrape all pages.
    Returns:
        products: a list in which every element is a dictionary of one product's information
    '''
    products = []

    page_n = 1
    # cat_url may already contain a query string (e.g. '?q=...'), so append
    # further parameters with '&' in that case
    sep = '&' if '?' in cat_url else '?'
    cat_page_url = cat_url + f'{sep}page={page_n}'
    product_list = get_tiki_product_info_from_page(cat_page_url, extra_info=extra_info)

    while len(product_list) > 0:
        products.extend(product_list)
        page_n += 1

        # stop_flag = False if max_page <= 0 else (page_n > max_page)
        stop_flag = max_page > 0 and page_n > max_page  # For stopping the scrape according to max_page
        if stop_flag:
            break

        cat_page_url = cat_url + f'{sep}page={page_n}'
        product_list = get_tiki_product_info_from_page(cat_page_url, extra_info=extra_info)

    return products

def scrap_tiki(search_product, num_max_page, extra_info):

    start_driver(force_restart=True)

    url = 'https://tiki.vn/search?sort=default&q="' + search_product + '"'

    prod_data = []  # STORE YOUR PRODUCT INFO DICTIONARIES IN HERE

    # prod_per_cat = get_product_info_from_category(main_cat['URL'], num_max_page, extra_info=extra_info)
    prod_per_cat = get_tiki_product_info_from_category(url, num_max_page, extra_info=extra_info)
    prod_data.extend(prod_per_cat)
    close_driver()  # Close driver when we're done

    return prod_data
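The Tiki thumbnail handling takes the first URL listed in the image's srcset attribute; a short sketch of that step with a hypothetical srcset value:

srcset_value = 'https://example.com/img-280.jpg 1x, https://example.com/img-560.jpg 2x'  # hypothetical
image_link = srcset_value.split(',')[0].split(' ')[0]
print(image_link)  # https://example.com/img-280.jpg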