Spaces:
Sleeping
Sleeping
Linh Vuu
committed on
Commit
·
6d0cb99
1
Parent(s):
3f1beca
updated files
Browse files- __pycache__/scraper_lazada.cpython-39.pyc +0 -0
- __pycache__/scraper_shopee.cpython-39.pyc +0 -0
- __pycache__/scraper_tiki.cpython-39.pyc +0 -0
- app.py +2 -2
- requirements.txt +3 -4
- scraper_lazada.py +20 -30
- scraper_shopee.py +15 -37
- scraper_tiki.py +7 -3
__pycache__/scraper_lazada.cpython-39.pyc
ADDED
|
Binary file (4.45 kB). View file
|
|
|
__pycache__/scraper_shopee.cpython-39.pyc
ADDED
|
Binary file (4.85 kB). View file
|
|
|
__pycache__/scraper_tiki.cpython-39.pyc
ADDED
|
Binary file (4.62 kB). View file
|
|
|
app.py
CHANGED
|
@@ -60,7 +60,6 @@ def main():
|
|
| 60 |
df_lazada = pd.DataFrame(columns = col_to_display)
|
| 61 |
st.write("Not found.")
|
| 62 |
|
| 63 |
-
|
| 64 |
st.subheader("Tiki")
|
| 65 |
tiki_data = scrap_tiki(search_product, num_max_page, extra_info)
|
| 66 |
if tiki_data:
|
|
@@ -72,7 +71,8 @@ def main():
|
|
| 72 |
st.write("Not found.")
|
| 73 |
|
| 74 |
# Merge the two dataframes
|
| 75 |
-
merged_df = pd.concat([df_tiki, df_lazada, df_shopee])
|
|
|
|
| 76 |
|
| 77 |
# Sort the merged dataframe by price
|
| 78 |
sorted_merged_df = merged_df.sort_values(by='price')
|
|
|
|
| 60 |
df_lazada = pd.DataFrame(columns = col_to_display)
|
| 61 |
st.write("Not found.")
|
| 62 |
|
|
|
|
| 63 |
st.subheader("Tiki")
|
| 64 |
tiki_data = scrap_tiki(search_product, num_max_page, extra_info)
|
| 65 |
if tiki_data:
|
|
|
|
| 71 |
st.write("Not found.")
|
| 72 |
|
| 73 |
# Merge the two dataframes
|
| 74 |
+
# merged_df = pd.concat([df_tiki, df_lazada, df_shopee])
|
| 75 |
+
merged_df = pd.concat([df_lazada])
|
| 76 |
|
| 77 |
# Sort the merged dataframe by price
|
| 78 |
sorted_merged_df = merged_df.sort_values(by='price')
|
requirements.txt
CHANGED
|
@@ -1,5 +1,4 @@
|
|
| 1 |
-
selenium
|
| 2 |
-
pandas
|
| 3 |
streamlit==1.13.0
|
| 4 |
-
altair==4.2.0
|
| 5 |
-
webdriver-manager==3.7.1
|
|
|
|
| 1 |
+
selenium
|
| 2 |
+
pandas
|
| 3 |
streamlit==1.13.0
|
| 4 |
+
altair==4.2.0
|
|
|
scraper_lazada.py
CHANGED
|
@@ -2,6 +2,7 @@ from selenium import webdriver
|
|
| 2 |
from selenium.webdriver.chrome.options import Options
|
| 3 |
from selenium.common.exceptions import NoSuchElementException
|
| 4 |
from selenium.webdriver.common.by import By
|
|
|
|
| 5 |
import time
|
| 6 |
|
| 7 |
# Global driver to use throughout the script
|
|
@@ -22,12 +23,13 @@ def start_driver(force_restart=False):
|
|
| 22 |
close_driver()
|
| 23 |
|
| 24 |
# Setting up the driver
|
|
|
|
| 25 |
options = Options()
|
| 26 |
options.add_argument('-headless') # we don't want a chrome browser opens, so it will run in the background
|
| 27 |
options.add_argument('-no-sandbox')
|
| 28 |
options.add_argument('-disable-dev-shm-usage')
|
| 29 |
|
| 30 |
-
DRIVER = webdriver.Chrome(options=options)
|
| 31 |
|
| 32 |
### Function to extract product info from the necessary html and json tags
|
| 33 |
def get_lazada_product_info_single(product_element, extra_info):
|
|
@@ -51,7 +53,7 @@ def get_lazada_product_info_single(product_element, extra_info):
|
|
| 51 |
try:
|
| 52 |
# Find the <a> element within the <div class="RfADt">
|
| 53 |
product_title_element = product_element.find_element(By.XPATH, "//div[@class='RfADt']/a")
|
| 54 |
-
|
| 55 |
# Get the text content of the <a> element
|
| 56 |
info['name'] = product_title_element.text
|
| 57 |
|
|
@@ -63,7 +65,7 @@ def get_lazada_product_info_single(product_element, extra_info):
|
|
| 63 |
try:
|
| 64 |
# Find the <span> element with class "ooOxS" within the <div class="aBrP0">
|
| 65 |
price_element = product_element.find_element(By.XPATH, "//div[@class='aBrP0']/span[@class='ooOxS']")
|
| 66 |
-
|
| 67 |
# Get the text content of the <span> element
|
| 68 |
price_text = price_element.text
|
| 69 |
|
|
@@ -77,10 +79,10 @@ def get_lazada_product_info_single(product_element, extra_info):
|
|
| 77 |
try:
|
| 78 |
# Find the <a> element within the <div class="RfADt">
|
| 79 |
product_link_element = product_element.find_element(By.XPATH, "//div[@class='RfADt']/a")
|
| 80 |
-
|
| 81 |
# Get the href attribute of the <a> element
|
| 82 |
product_link = product_link_element.get_attribute("href")
|
| 83 |
-
|
| 84 |
# Extract the URL from the href attribute
|
| 85 |
info['product_url'] = product_link.split("//")[1]
|
| 86 |
|
|
@@ -91,7 +93,7 @@ def get_lazada_product_info_single(product_element, extra_info):
|
|
| 91 |
try:
|
| 92 |
# Find the <img> element within the <div class="_95X4G">
|
| 93 |
image_element = product_element.find_element(By.XPATH, "//div[@class='_95X4G']/a/div/img")
|
| 94 |
-
|
| 95 |
# Get the src attribute of the <img> element
|
| 96 |
info['image'] = image_element.get_attribute("src")
|
| 97 |
|
|
@@ -104,7 +106,7 @@ def get_lazada_product_info_single(product_element, extra_info):
|
|
| 104 |
try:
|
| 105 |
# Find the <span> element within the <div class="_6uN7R">
|
| 106 |
sold_element = product_element.find_element(By.XPATH, "//div[@class='_6uN7R']/span[@class='_1cEkb']/span[1]")
|
| 107 |
-
|
| 108 |
# Get the text content of the <span> element
|
| 109 |
info['sales'] = sold_element.text
|
| 110 |
|
|
@@ -114,7 +116,7 @@ def get_lazada_product_info_single(product_element, extra_info):
|
|
| 114 |
try:
|
| 115 |
# Find the <span> element within the <div class="WNoq3">
|
| 116 |
discount_element = product_element.find_element(By.XPATH, "//div[@class='WNoq3']/span[@class='IcOsH']")
|
| 117 |
-
|
| 118 |
# Get the text content of the <span> element
|
| 119 |
info['discount'] = discount_element.text
|
| 120 |
|
|
@@ -134,32 +136,24 @@ def get_lazada_product_info_from_page(page_url, extra_info=False):
|
|
| 134 |
found, return empty list.
|
| 135 |
"""
|
| 136 |
global DRIVER
|
| 137 |
-
|
| 138 |
data = []
|
| 139 |
DRIVER.get(page_url) # Use the driver to get info from the product page
|
| 140 |
time.sleep(3)
|
| 141 |
|
| 142 |
-
try:
|
| 143 |
-
# no_product_found = bool(DRIVER.find_element(By.XPATH, "//div[@class='style__StyledNotFoundProductView-sc-1uz0b49-0']"))
|
| 144 |
-
no_product_found = bool(DRIVER.find_element(By.CLASS_NAME, 'style__StyledNotFoundProductView-sc-1uz0b49-0'))
|
| 145 |
-
print("EMPTY PAGE")
|
| 146 |
-
return data
|
| 147 |
-
except NoSuchElementException:
|
| 148 |
-
no_product_found = False
|
| 149 |
-
|
| 150 |
-
|
| 151 |
# FIND ALL PRODUCT ITEMS
|
| 152 |
products = DRIVER.find_elements(By.CLASS_NAME, 'Bm3ON')
|
|
|
|
| 153 |
print(f'Found {len(products)} products')
|
| 154 |
|
| 155 |
-
if
|
| 156 |
for i in products:
|
| 157 |
product_dict = get_lazada_product_info_single(i, extra_info)
|
| 158 |
data.append(product_dict)
|
| 159 |
return data
|
| 160 |
|
| 161 |
### Function to get product info from a main category
|
| 162 |
-
def get_lazada_product_info_from_category(
|
| 163 |
'''
|
| 164 |
Scrape for multiple pages of products of a category.
|
| 165 |
Uses get_product_info_from_page().
|
|
@@ -172,10 +166,10 @@ def get_lazada_product_info_from_category(cat_url, max_page=0, extra_info=False)
|
|
| 172 |
products: a list in which every element is a dictionary of one product's information
|
| 173 |
'''
|
| 174 |
products = []
|
| 175 |
-
|
| 176 |
page_n = 1
|
| 177 |
-
|
| 178 |
-
|
|
|
|
| 179 |
|
| 180 |
while len(product_list)>0:
|
| 181 |
products.extend(product_list)
|
|
@@ -185,9 +179,8 @@ def get_lazada_product_info_from_category(cat_url, max_page=0, extra_info=False)
|
|
| 185 |
stop_flag = max_page>0 and page_n>max_page # For stopping the scrape according to max_page
|
| 186 |
if stop_flag:
|
| 187 |
break
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
product_list = get_lazada_product_info_from_page(cat_page_url, extra_info=extra_info)
|
| 191 |
|
| 192 |
return products
|
| 193 |
|
|
@@ -195,12 +188,9 @@ def scrap_lazada(search_product, num_max_page, extra_info):
|
|
| 195 |
|
| 196 |
start_driver(force_restart=True)
|
| 197 |
|
| 198 |
-
url = 'https://www.lazada.vn/catalog/?q=' + search_product
|
| 199 |
-
|
| 200 |
prod_data = [] # STORE YOUR PRODUCT INFO DICTIONARIES IN HERE
|
| 201 |
|
| 202 |
-
|
| 203 |
-
prod_per_cat = get_lazada_product_info_from_category(url, num_max_page, extra_info=extra_info)
|
| 204 |
prod_data.extend(prod_per_cat)
|
| 205 |
|
| 206 |
close_driver() # Close driver when we're done
|
|
|
|
| 2 |
from selenium.webdriver.chrome.options import Options
|
| 3 |
from selenium.common.exceptions import NoSuchElementException
|
| 4 |
from selenium.webdriver.common.by import By
|
| 5 |
+
from selenium.webdriver.chrome.service import Service
|
| 6 |
import time
|
| 7 |
|
| 8 |
# Global driver to use throughout the script
|
|
|
|
| 23 |
close_driver()
|
| 24 |
|
| 25 |
# Setting up the driver
|
| 26 |
+
service = Service()
|
| 27 |
options = Options()
|
| 28 |
options.add_argument('-headless') # we don't want a chrome browser opens, so it will run in the background
|
| 29 |
options.add_argument('-no-sandbox')
|
| 30 |
options.add_argument('-disable-dev-shm-usage')
|
| 31 |
|
| 32 |
+
DRIVER = webdriver.Chrome(service=service, options=options)
|
| 33 |
|
| 34 |
### Function to extract product info from the necessary html and json tags
|
| 35 |
def get_lazada_product_info_single(product_element, extra_info):
|
|
|
|
| 53 |
try:
|
| 54 |
# Find the <a> element within the <div class="RfADt">
|
| 55 |
product_title_element = product_element.find_element(By.XPATH, "//div[@class='RfADt']/a")
|
| 56 |
+
|
| 57 |
# Get the text content of the <a> element
|
| 58 |
info['name'] = product_title_element.text
|
| 59 |
|
|
|
|
| 65 |
try:
|
| 66 |
# Find the <span> element with class "ooOxS" within the <div class="aBrP0">
|
| 67 |
price_element = product_element.find_element(By.XPATH, "//div[@class='aBrP0']/span[@class='ooOxS']")
|
| 68 |
+
|
| 69 |
# Get the text content of the <span> element
|
| 70 |
price_text = price_element.text
|
| 71 |
|
|
|
|
| 79 |
try:
|
| 80 |
# Find the <a> element within the <div class="RfADt">
|
| 81 |
product_link_element = product_element.find_element(By.XPATH, "//div[@class='RfADt']/a")
|
| 82 |
+
|
| 83 |
# Get the href attribute of the <a> element
|
| 84 |
product_link = product_link_element.get_attribute("href")
|
| 85 |
+
|
| 86 |
# Extract the URL from the href attribute
|
| 87 |
info['product_url'] = product_link.split("//")[1]
|
| 88 |
|
|
|
|
| 93 |
try:
|
| 94 |
# Find the <img> element within the <div class="_95X4G">
|
| 95 |
image_element = product_element.find_element(By.XPATH, "//div[@class='_95X4G']/a/div/img")
|
| 96 |
+
|
| 97 |
# Get the src attribute of the <img> element
|
| 98 |
info['image'] = image_element.get_attribute("src")
|
| 99 |
|
|
|
|
| 106 |
try:
|
| 107 |
# Find the <span> element within the <div class="_6uN7R">
|
| 108 |
sold_element = product_element.find_element(By.XPATH, "//div[@class='_6uN7R']/span[@class='_1cEkb']/span[1]")
|
| 109 |
+
|
| 110 |
# Get the text content of the <span> element
|
| 111 |
info['sales'] = sold_element.text
|
| 112 |
|
|
|
|
| 116 |
try:
|
| 117 |
# Find the <span> element within the <div class="WNoq3">
|
| 118 |
discount_element = product_element.find_element(By.XPATH, "//div[@class='WNoq3']/span[@class='IcOsH']")
|
| 119 |
+
|
| 120 |
# Get the text content of the <span> element
|
| 121 |
info['discount'] = discount_element.text
|
| 122 |
|
|
|
|
| 136 |
found, return empty list.
|
| 137 |
"""
|
| 138 |
global DRIVER
|
| 139 |
+
|
| 140 |
data = []
|
| 141 |
DRIVER.get(page_url) # Use the driver to get info from the product page
|
| 142 |
time.sleep(3)
|
| 143 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 144 |
# FIND ALL PRODUCT ITEMS
|
| 145 |
products = DRIVER.find_elements(By.CLASS_NAME, 'Bm3ON')
|
| 146 |
+
print(products)
|
| 147 |
print(f'Found {len(products)} products')
|
| 148 |
|
| 149 |
+
if len(products)>0:
|
| 150 |
for i in products:
|
| 151 |
product_dict = get_lazada_product_info_single(i, extra_info)
|
| 152 |
data.append(product_dict)
|
| 153 |
return data
|
| 154 |
|
| 155 |
### Function to get product info from a main category
|
| 156 |
+
def get_lazada_product_info_from_category(search_product, max_page=0, extra_info=False):
|
| 157 |
'''
|
| 158 |
Scrape for multiple pages of products of a category.
|
| 159 |
Uses get_product_info_from_page().
|
|
|
|
| 166 |
products: a list in which every element is a dictionary of one product's information
|
| 167 |
'''
|
| 168 |
products = []
|
|
|
|
| 169 |
page_n = 1
|
| 170 |
+
cat_url = 'https://www.lazada.vn/catalog/?q=' + search_product
|
| 171 |
+
|
| 172 |
+
product_list = get_lazada_product_info_from_page(cat_url, extra_info=extra_info)
|
| 173 |
|
| 174 |
while len(product_list)>0:
|
| 175 |
products.extend(product_list)
|
|
|
|
| 179 |
stop_flag = max_page>0 and page_n>max_page # For stopping the scrape according to max_page
|
| 180 |
if stop_flag:
|
| 181 |
break
|
| 182 |
+
cat_url = 'https://www.lazada.vn/catalog/?page=' + page_n + '&q=' + search_product
|
| 183 |
+
product_list = get_lazada_product_info_from_page(cat_url, extra_info=extra_info)
|
|
|
|
| 184 |
|
| 185 |
return products
|
| 186 |
|
|
|
|
| 188 |
|
| 189 |
start_driver(force_restart=True)
|
| 190 |
|
|
|
|
|
|
|
| 191 |
prod_data = [] # STORE YOUR PRODUCT INFO DICTIONARIES IN HERE
|
| 192 |
|
| 193 |
+
prod_per_cat = get_lazada_product_info_from_category(search_product, num_max_page, extra_info=extra_info)
|
|
|
|
| 194 |
prod_data.extend(prod_per_cat)
|
| 195 |
|
| 196 |
close_driver() # Close driver when we're done
|
scraper_shopee.py
CHANGED
|
@@ -1,9 +1,9 @@
|
|
| 1 |
from selenium import webdriver
|
| 2 |
-
from
|
| 3 |
-
# from selenium.webdriver.chrome.options import Options
|
| 4 |
from selenium.common.exceptions import NoSuchElementException
|
| 5 |
from selenium.webdriver.common.by import By
|
| 6 |
from selenium.webdriver.support import expected_conditions as EC
|
|
|
|
| 7 |
import time
|
| 8 |
|
| 9 |
# Global driver to use throughout the script
|
|
@@ -24,19 +24,13 @@ def start_driver(force_restart=False):
|
|
| 24 |
close_driver()
|
| 25 |
|
| 26 |
# Setting up the driver
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
|
|
|
| 31 |
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
gChromeOptions = webdriver.ChromeOptions()
|
| 35 |
-
gChromeOptions.add_argument("window-size=1920x1480")
|
| 36 |
-
gChromeOptions.add_argument("disable-dev-shm-usage")
|
| 37 |
-
DRIVER = webdriver.Chrome(
|
| 38 |
-
chrome_options=gChromeOptions, executable_path=ChromeDriverManager().install()
|
| 39 |
-
)
|
| 40 |
|
| 41 |
### Function to extract product info from the necessary html and json tags
|
| 42 |
def get_shopee_product_info_single(product_element, extra_info):
|
|
@@ -149,32 +143,19 @@ def get_shopee_product_info_from_page(page_url, extra_info=False):
|
|
| 149 |
DRIVER.get(page_url) # Use the driver to get info from the product page
|
| 150 |
time.sleep(3)
|
| 151 |
|
| 152 |
-
|
| 153 |
-
try:
|
| 154 |
-
# no_product_found = bool(DRIVER.find_element(By.XPATH, "//div[@class='style__StyledNotFoundProductView-sc-1uz0b49-0']"))
|
| 155 |
-
no_product_found = bool(DRIVER.find_element(By.CLASS_NAME, 'style__StyledNotFoundProductView-sc-1uz0b49-0'))
|
| 156 |
-
print("EMPTY PAGE")
|
| 157 |
-
return data
|
| 158 |
-
except NoSuchElementException:
|
| 159 |
-
no_product_found = False
|
| 160 |
-
|
| 161 |
-
|
| 162 |
# FIND ALL PRODUCT ITEMS
|
| 163 |
-
# products = DRIVER.find_elements(By.XPATH, "//a[@class='product-item']")
|
| 164 |
products = DRIVER.find_elements(By.CLASS_NAME, 'col-xs-2-4 shopee-search-item-result__item')
|
| 165 |
print(f'Found {len(products)} products')
|
| 166 |
print(products)
|
| 167 |
|
| 168 |
-
if
|
| 169 |
for i in products:
|
| 170 |
product_dict = get_shopee_product_info_single(i, extra_info)
|
| 171 |
-
print(i)
|
| 172 |
-
print(product_dict)
|
| 173 |
data.append(product_dict)
|
| 174 |
return data
|
| 175 |
|
| 176 |
### Function to get product info from a main category
|
| 177 |
-
def get_shopee_product_info_from_category(
|
| 178 |
'''
|
| 179 |
Scrape for multiple pages of products of a category.
|
| 180 |
Uses get_product_info_from_page().
|
|
@@ -188,8 +169,8 @@ def get_shopee_product_info_from_category(cat_url, max_page=0, extra_info=False)
|
|
| 188 |
'''
|
| 189 |
products = []
|
| 190 |
|
| 191 |
-
page_n =
|
| 192 |
-
cat_page_url =
|
| 193 |
product_list = get_shopee_product_info_from_page(cat_page_url, extra_info=extra_info)
|
| 194 |
|
| 195 |
while len(product_list)>0:
|
|
@@ -201,7 +182,7 @@ def get_shopee_product_info_from_category(cat_url, max_page=0, extra_info=False)
|
|
| 201 |
if stop_flag:
|
| 202 |
break
|
| 203 |
|
| 204 |
-
cat_page_url =
|
| 205 |
product_list = get_shopee_product_info_from_page(cat_page_url, extra_info=extra_info)
|
| 206 |
|
| 207 |
return products
|
|
@@ -221,14 +202,11 @@ def scrap_shopee(search_product, num_max_page, extra_info):
|
|
| 221 |
# info = get_shopee_product_info_single(product, True)
|
| 222 |
# print(info)
|
| 223 |
|
| 224 |
-
start_driver(force_restart=True)
|
| 225 |
-
|
| 226 |
-
url = 'https://shopee.vn/search?keyword=' + search_product
|
| 227 |
|
| 228 |
prod_data = [] # STORE YOUR PRODUCT INFO DICTIONARIES IN HERE
|
| 229 |
|
| 230 |
-
|
| 231 |
-
prod_per_cat = get_shopee_product_info_from_category(url, num_max_page, extra_info=extra_info)
|
| 232 |
prod_data.extend(prod_per_cat)
|
| 233 |
|
| 234 |
close_driver() # Close driver when we're done
|
|
|
|
| 1 |
from selenium import webdriver
|
| 2 |
+
from selenium.webdriver.chrome.options import Options
|
|
|
|
| 3 |
from selenium.common.exceptions import NoSuchElementException
|
| 4 |
from selenium.webdriver.common.by import By
|
| 5 |
from selenium.webdriver.support import expected_conditions as EC
|
| 6 |
+
from selenium.webdriver.chrome.service import Service
|
| 7 |
import time
|
| 8 |
|
| 9 |
# Global driver to use throughout the script
|
|
|
|
| 24 |
close_driver()
|
| 25 |
|
| 26 |
# Setting up the driver
|
| 27 |
+
service = Service()
|
| 28 |
+
options = Options()
|
| 29 |
+
options.add_argument('-headless') # we don't want a chrome browser opens, so it will run in the background
|
| 30 |
+
options.add_argument('-no-sandbox')
|
| 31 |
+
options.add_argument('-disable-dev-shm-usage')
|
| 32 |
|
| 33 |
+
DRIVER = webdriver.Chrome(service=service, options=options)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
|
| 35 |
### Function to extract product info from the necessary html and json tags
|
| 36 |
def get_shopee_product_info_single(product_element, extra_info):
|
|
|
|
| 143 |
DRIVER.get(page_url) # Use the driver to get info from the product page
|
| 144 |
time.sleep(3)
|
| 145 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 146 |
# FIND ALL PRODUCT ITEMS
|
|
|
|
| 147 |
products = DRIVER.find_elements(By.CLASS_NAME, 'col-xs-2-4 shopee-search-item-result__item')
|
| 148 |
print(f'Found {len(products)} products')
|
| 149 |
print(products)
|
| 150 |
|
| 151 |
+
if len(products)>0:
|
| 152 |
for i in products:
|
| 153 |
product_dict = get_shopee_product_info_single(i, extra_info)
|
|
|
|
|
|
|
| 154 |
data.append(product_dict)
|
| 155 |
return data
|
| 156 |
|
| 157 |
### Function to get product info from a main category
|
| 158 |
+
def get_shopee_product_info_from_category(search_product, max_page=0, extra_info=False):
|
| 159 |
'''
|
| 160 |
Scrape for multiple pages of products of a category.
|
| 161 |
Uses get_product_info_from_page().
|
|
|
|
| 169 |
'''
|
| 170 |
products = []
|
| 171 |
|
| 172 |
+
page_n = 0
|
| 173 |
+
cat_page_url = 'https://shopee.vn/search?keyword=' + search_product
|
| 174 |
product_list = get_shopee_product_info_from_page(cat_page_url, extra_info=extra_info)
|
| 175 |
|
| 176 |
while len(product_list)>0:
|
|
|
|
| 182 |
if stop_flag:
|
| 183 |
break
|
| 184 |
|
| 185 |
+
cat_page_url = cat_page_url + f'&page={page_n}'
|
| 186 |
product_list = get_shopee_product_info_from_page(cat_page_url, extra_info=extra_info)
|
| 187 |
|
| 188 |
return products
|
|
|
|
| 202 |
# info = get_shopee_product_info_single(product, True)
|
| 203 |
# print(info)
|
| 204 |
|
| 205 |
+
start_driver(force_restart=True)
|
|
|
|
|
|
|
| 206 |
|
| 207 |
prod_data = [] # STORE YOUR PRODUCT INFO DICTIONARIES IN HERE
|
| 208 |
|
| 209 |
+
prod_per_cat = get_shopee_product_info_from_category(search_product, num_max_page, extra_info=extra_info)
|
|
|
|
| 210 |
prod_data.extend(prod_per_cat)
|
| 211 |
|
| 212 |
close_driver() # Close driver when we're done
|
scraper_tiki.py
CHANGED
|
@@ -2,6 +2,7 @@ from selenium import webdriver
|
|
| 2 |
from selenium.webdriver.chrome.options import Options
|
| 3 |
from selenium.common.exceptions import NoSuchElementException
|
| 4 |
from selenium.webdriver.common.by import By
|
|
|
|
| 5 |
import time
|
| 6 |
|
| 7 |
# Global driver to use throughout the script
|
|
@@ -22,12 +23,13 @@ def start_driver(force_restart=False):
|
|
| 22 |
close_driver()
|
| 23 |
|
| 24 |
# Setting up the driver
|
|
|
|
| 25 |
options = Options()
|
| 26 |
options.add_argument('-headless') # we don't want a chrome browser opens, so it will run in the background
|
| 27 |
options.add_argument('-no-sandbox')
|
| 28 |
options.add_argument('-disable-dev-shm-usage')
|
| 29 |
|
| 30 |
-
DRIVER = webdriver.Chrome(options=options)
|
| 31 |
|
| 32 |
### Function to extract product info from the necessary html and json tags
|
| 33 |
def get_tiki_product_info_single(product_element, extra_info):
|
|
@@ -250,7 +252,8 @@ def get_tiki_product_info_from_category(cat_url, max_page=0, extra_info=False):
|
|
| 250 |
products = []
|
| 251 |
|
| 252 |
page_n = 1
|
| 253 |
-
cat_page_url = cat_url + f'
|
|
|
|
| 254 |
product_list = get_tiki_product_info_from_page(cat_page_url, extra_info=extra_info)
|
| 255 |
|
| 256 |
while len(product_list)>0:
|
|
@@ -262,7 +265,8 @@ def get_tiki_product_info_from_category(cat_url, max_page=0, extra_info=False):
|
|
| 262 |
if stop_flag:
|
| 263 |
break
|
| 264 |
|
| 265 |
-
cat_page_url = cat_url + f'
|
|
|
|
| 266 |
product_list = get_tiki_product_info_from_page(cat_page_url, extra_info=extra_info)
|
| 267 |
|
| 268 |
return products
|
|
|
|
| 2 |
from selenium.webdriver.chrome.options import Options
|
| 3 |
from selenium.common.exceptions import NoSuchElementException
|
| 4 |
from selenium.webdriver.common.by import By
|
| 5 |
+
from selenium.webdriver.chrome.service import Service
|
| 6 |
import time
|
| 7 |
|
| 8 |
# Global driver to use throughout the script
|
|
|
|
| 23 |
close_driver()
|
| 24 |
|
| 25 |
# Setting up the driver
|
| 26 |
+
service = Service()
|
| 27 |
options = Options()
|
| 28 |
options.add_argument('-headless') # we don't want a chrome browser opens, so it will run in the background
|
| 29 |
options.add_argument('-no-sandbox')
|
| 30 |
options.add_argument('-disable-dev-shm-usage')
|
| 31 |
|
| 32 |
+
DRIVER = webdriver.Chrome(service=service, options=options)
|
| 33 |
|
| 34 |
### Function to extract product info from the necessary html and json tags
|
| 35 |
def get_tiki_product_info_single(product_element, extra_info):
|
|
|
|
| 252 |
products = []
|
| 253 |
|
| 254 |
page_n = 1
|
| 255 |
+
cat_page_url = cat_url + f'&page={page_n}'
|
| 256 |
+
print(cat_page_url)
|
| 257 |
product_list = get_tiki_product_info_from_page(cat_page_url, extra_info=extra_info)
|
| 258 |
|
| 259 |
while len(product_list)>0:
|
|
|
|
| 265 |
if stop_flag:
|
| 266 |
break
|
| 267 |
|
| 268 |
+
cat_page_url = cat_url + f'&page={page_n}'
|
| 269 |
+
print(cat_page_url)
|
| 270 |
product_list = get_tiki_product_info_from_page(cat_page_url, extra_info=extra_info)
|
| 271 |
|
| 272 |
return products
|