Spaces:

LinhVuu
/

price-comparison

Sleeping

App Files Files Community

Linh Vuu committed on Apr 21, 2024

Commit

6d0cb99

1 Parent(s): 3f1beca

updated files

Browse files

Files changed (8) hide show

__pycache__/scraper_lazada.cpython-39.pyc +0 -0
__pycache__/scraper_shopee.cpython-39.pyc +0 -0
__pycache__/scraper_tiki.cpython-39.pyc +0 -0
app.py +2 -2
requirements.txt +3 -4
scraper_lazada.py +20 -30
scraper_shopee.py +15 -37
scraper_tiki.py +7 -3

__pycache__/scraper_lazada.cpython-39.pyc ADDED Viewed

Binary file (4.45 kB). View file

__pycache__/scraper_shopee.cpython-39.pyc ADDED Viewed

Binary file (4.85 kB). View file

__pycache__/scraper_tiki.cpython-39.pyc ADDED Viewed

Binary file (4.62 kB). View file

app.py CHANGED Viewed

@@ -60,7 +60,6 @@ def main():
                 df_lazada = pd.DataFrame(columns = col_to_display)
                 st.write("Not found.")
             st.subheader("Tiki")
             tiki_data = scrap_tiki(search_product, num_max_page, extra_info)
             if tiki_data:
@@ -72,7 +71,8 @@ def main():
                 st.write("Not found.")
             # Merge the two dataframes
-            merged_df = pd.concat([df_tiki, df_lazada, df_shopee])
             # Sort the merged dataframe by price
             sorted_merged_df = merged_df.sort_values(by='price')

                 df_lazada = pd.DataFrame(columns = col_to_display)
                 st.write("Not found.")
             st.subheader("Tiki")
             tiki_data = scrap_tiki(search_product, num_max_page, extra_info)
             if tiki_data:
                 st.write("Not found.")
             # Merge the two dataframes
+            # merged_df = pd.concat([df_tiki, df_lazada, df_shopee])
+            merged_df = pd.concat([df_lazada])
             # Sort the merged dataframe by price
             sorted_merged_df = merged_df.sort_values(by='price')

requirements.txt CHANGED Viewed

@@ -1,5 +1,4 @@
-selenium==4.3.0
-pandas==1.2
 streamlit==1.13.0
-altair==4.2.0
-webdriver-manager==3.7.1

+selenium
+pandas
 streamlit==1.13.0
+altair==4.2.0

scraper_lazada.py CHANGED Viewed

@@ -2,6 +2,7 @@ from selenium import webdriver
 from selenium.webdriver.chrome.options import Options
 from selenium.common.exceptions import NoSuchElementException
 from selenium.webdriver.common.by import By
 import time
 # Global driver to use throughout the script
@@ -22,12 +23,13 @@ def start_driver(force_restart=False):
         close_driver()
     # Setting up the driver
     options = Options()
     options.add_argument('-headless') # we don't want a chrome browser opens, so it will run in the background
     options.add_argument('-no-sandbox')
     options.add_argument('-disable-dev-shm-usage')
-    DRIVER = webdriver.Chrome(options=options)
 ### Function to extract product info from the necessary html and json tags
 def get_lazada_product_info_single(product_element, extra_info):
@@ -51,7 +53,7 @@ def get_lazada_product_info_single(product_element, extra_info):
     try:
         # Find the <a> element within the <div class="RfADt">
         product_title_element = product_element.find_element(By.XPATH, "//div[@class='RfADt']/a")
         # Get the text content of the <a> element
         info['name'] = product_title_element.text
@@ -63,7 +65,7 @@ def get_lazada_product_info_single(product_element, extra_info):
     try:
         # Find the <span> element with class "ooOxS" within the <div class="aBrP0">
         price_element = product_element.find_element(By.XPATH, "//div[@class='aBrP0']/span[@class='ooOxS']")
         # Get the text content of the <span> element
         price_text = price_element.text
@@ -77,10 +79,10 @@ def get_lazada_product_info_single(product_element, extra_info):
     try:
         # Find the <a> element within the <div class="RfADt">
         product_link_element = product_element.find_element(By.XPATH, "//div[@class='RfADt']/a")
         # Get the href attribute of the <a> element
         product_link = product_link_element.get_attribute("href")
         # Extract the URL from the href attribute
         info['product_url'] = product_link.split("//")[1]
@@ -91,7 +93,7 @@ def get_lazada_product_info_single(product_element, extra_info):
     try:
         # Find the <img> element within the <div class="_95X4G">
         image_element = product_element.find_element(By.XPATH, "//div[@class='_95X4G']/a/div/img")
         # Get the src attribute of the <img> element
         info['image'] = image_element.get_attribute("src")
@@ -104,7 +106,7 @@ def get_lazada_product_info_single(product_element, extra_info):
         try:
             # Find the <span> element within the <div class="_6uN7R">
             sold_element = product_element.find_element(By.XPATH, "//div[@class='_6uN7R']/span[@class='_1cEkb']/span[1]")
             # Get the text content of the <span> element
             info['sales'] = sold_element.text
@@ -114,7 +116,7 @@ def get_lazada_product_info_single(product_element, extra_info):
         try:
             # Find the <span> element within the <div class="WNoq3">
             discount_element = product_element.find_element(By.XPATH, "//div[@class='WNoq3']/span[@class='IcOsH']")
             # Get the text content of the <span> element
             info['discount'] = discount_element.text
@@ -134,32 +136,24 @@ def get_lazada_product_info_from_page(page_url, extra_info=False):
                 found, return empty list.
     """
     global DRIVER
     data = []
     DRIVER.get(page_url) # Use the driver to get info from the product page
     time.sleep(3)
-    try:
-        # no_product_found = bool(DRIVER.find_element(By.XPATH, "//div[@class='style__StyledNotFoundProductView-sc-1uz0b49-0']"))
-        no_product_found = bool(DRIVER.find_element(By.CLASS_NAME, 'style__StyledNotFoundProductView-sc-1uz0b49-0'))
-        print("EMPTY PAGE")
-        return data
-    except NoSuchElementException:
-        no_product_found = False
     # FIND ALL PRODUCT ITEMS
     products = DRIVER.find_elements(By.CLASS_NAME, 'Bm3ON')
     print(f'Found {len(products)} products')
-    if (not no_product_found) and len(products)>0:
         for i in products:
             product_dict = get_lazada_product_info_single(i, extra_info)
             data.append(product_dict)
     return data
 ### Function to get product info from a main category
-def get_lazada_product_info_from_category(cat_url, max_page=0, extra_info=False):
     '''
     Scrape for multiple pages of products of a category.
     Uses get_product_info_from_page().
@@ -172,10 +166,10 @@ def get_lazada_product_info_from_category(cat_url, max_page=0, extra_info=False)
         products: a list in which every element is a dictionary of one product's information
     '''
     products = []
     page_n = 1
-    cat_page_url = cat_url + f'?page={page_n}'
-    product_list = get_lazada_product_info_from_page(cat_page_url, extra_info=extra_info)
     while len(product_list)>0:
         products.extend(product_list)
@@ -185,9 +179,8 @@ def get_lazada_product_info_from_category(cat_url, max_page=0, extra_info=False)
         stop_flag = max_page>0 and page_n>max_page # For stopping the scrape according to max_page
         if stop_flag:
             break
-        cat_page_url = cat_url + f'?page={page_n}'
-        product_list = get_lazada_product_info_from_page(cat_page_url, extra_info=extra_info)
     return products
@@ -195,12 +188,9 @@ def scrap_lazada(search_product, num_max_page, extra_info):
     start_driver(force_restart=True)
-    url = 'https://www.lazada.vn/catalog/?q=' + search_product
     prod_data = [] # STORE YOUR PRODUCT INFO DICTIONARIES IN HERE
-    # prod_per_cat = get_product_info_from_category(main_cat['URL'], num_max_page, extra_info=extra_info)
-    prod_per_cat = get_lazada_product_info_from_category(url, num_max_page, extra_info=extra_info)
     prod_data.extend(prod_per_cat)
     close_driver() # Close driver when we're done

 from selenium.webdriver.chrome.options import Options
 from selenium.common.exceptions import NoSuchElementException
 from selenium.webdriver.common.by import By
+from selenium.webdriver.chrome.service import Service
 import time
 # Global driver to use throughout the script
         close_driver()
     # Setting up the driver
+    service = Service()
     options = Options()
     options.add_argument('-headless') # we don't want a chrome browser opens, so it will run in the background
     options.add_argument('-no-sandbox')
     options.add_argument('-disable-dev-shm-usage')
+    DRIVER = webdriver.Chrome(service=service, options=options)
 ### Function to extract product info from the necessary html and json tags
 def get_lazada_product_info_single(product_element, extra_info):
     try:
         # Find the <a> element within the <div class="RfADt">
         product_title_element = product_element.find_element(By.XPATH, "//div[@class='RfADt']/a")
         # Get the text content of the <a> element
         info['name'] = product_title_element.text
     try:
         # Find the <span> element with class "ooOxS" within the <div class="aBrP0">
         price_element = product_element.find_element(By.XPATH, "//div[@class='aBrP0']/span[@class='ooOxS']")
         # Get the text content of the <span> element
         price_text = price_element.text
     try:
         # Find the <a> element within the <div class="RfADt">
         product_link_element = product_element.find_element(By.XPATH, "//div[@class='RfADt']/a")
         # Get the href attribute of the <a> element
         product_link = product_link_element.get_attribute("href")
         # Extract the URL from the href attribute
         info['product_url'] = product_link.split("//")[1]
     try:
         # Find the <img> element within the <div class="_95X4G">
         image_element = product_element.find_element(By.XPATH, "//div[@class='_95X4G']/a/div/img")
         # Get the src attribute of the <img> element
         info['image'] = image_element.get_attribute("src")
         try:
             # Find the <span> element within the <div class="_6uN7R">
             sold_element = product_element.find_element(By.XPATH, "//div[@class='_6uN7R']/span[@class='_1cEkb']/span[1]")
             # Get the text content of the <span> element
             info['sales'] = sold_element.text
         try:
             # Find the <span> element within the <div class="WNoq3">
             discount_element = product_element.find_element(By.XPATH, "//div[@class='WNoq3']/span[@class='IcOsH']")
             # Get the text content of the <span> element
             info['discount'] = discount_element.text
                 found, return empty list.
     """
     global DRIVER
     data = []
     DRIVER.get(page_url) # Use the driver to get info from the product page
     time.sleep(3)
     # FIND ALL PRODUCT ITEMS
     products = DRIVER.find_elements(By.CLASS_NAME, 'Bm3ON')
+    print(products)
     print(f'Found {len(products)} products')
+    if len(products)>0:
         for i in products:
             product_dict = get_lazada_product_info_single(i, extra_info)
             data.append(product_dict)
     return data
 ### Function to get product info from a main category
+def get_lazada_product_info_from_category(search_product, max_page=0, extra_info=False):
     '''
     Scrape for multiple pages of products of a category.
     Uses get_product_info_from_page().
         products: a list in which every element is a dictionary of one product's information
     '''
     products = []
     page_n = 1
+    cat_url = 'https://www.lazada.vn/catalog/?q=' + search_product
+    product_list = get_lazada_product_info_from_page(cat_url, extra_info=extra_info)
     while len(product_list)>0:
         products.extend(product_list)
         stop_flag = max_page>0 and page_n>max_page # For stopping the scrape according to max_page
         if stop_flag:
             break
+        cat_url = 'https://www.lazada.vn/catalog/?page=' + page_n + '&q=' + search_product
+        product_list = get_lazada_product_info_from_page(cat_url, extra_info=extra_info)
     return products
     start_driver(force_restart=True)
     prod_data = [] # STORE YOUR PRODUCT INFO DICTIONARIES IN HERE
+    prod_per_cat = get_lazada_product_info_from_category(search_product, num_max_page, extra_info=extra_info)
     prod_data.extend(prod_per_cat)
     close_driver() # Close driver when we're done

scraper_shopee.py CHANGED Viewed

@@ -1,9 +1,9 @@
 from selenium import webdriver
-from webdriver_manager.chrome import ChromeDriverManager
-# from selenium.webdriver.chrome.options import Options
 from selenium.common.exceptions import NoSuchElementException
 from selenium.webdriver.common.by import By
 from selenium.webdriver.support import expected_conditions as EC
 import time
 # Global driver to use throughout the script
@@ -24,19 +24,13 @@ def start_driver(force_restart=False):
         close_driver()
     # Setting up the driver
-    # options = Options()
-    # options.add_argument('-headless') # we don't want a chrome browser opens, so it will run in the background
-    # options.add_argument('-no-sandbox')
-    # options.add_argument('-disable-dev-shm-usage')
-    # DRIVER = webdriver.Chrome(options=options)
-    gChromeOptions = webdriver.ChromeOptions()
-    gChromeOptions.add_argument("window-size=1920x1480")
-    gChromeOptions.add_argument("disable-dev-shm-usage")
-    DRIVER = webdriver.Chrome(
-        chrome_options=gChromeOptions, executable_path=ChromeDriverManager().install()
-    )
 ### Function to extract product info from the necessary html and json tags
 def get_shopee_product_info_single(product_element, extra_info):
@@ -149,32 +143,19 @@ def get_shopee_product_info_from_page(page_url, extra_info=False):
     DRIVER.get(page_url) # Use the driver to get info from the product page
     time.sleep(3)
-    try:
-        # no_product_found = bool(DRIVER.find_element(By.XPATH, "//div[@class='style__StyledNotFoundProductView-sc-1uz0b49-0']"))
-        no_product_found = bool(DRIVER.find_element(By.CLASS_NAME, 'style__StyledNotFoundProductView-sc-1uz0b49-0'))
-        print("EMPTY PAGE")
-        return data
-    except NoSuchElementException:
-        no_product_found = False
     # FIND ALL PRODUCT ITEMS
-    # products = DRIVER.find_elements(By.XPATH, "//a[@class='product-item']")
     products = DRIVER.find_elements(By.CLASS_NAME, 'col-xs-2-4 shopee-search-item-result__item')
     print(f'Found {len(products)} products')
     print(products)
-    if (not no_product_found) and len(products)>0:
         for i in products:
             product_dict = get_shopee_product_info_single(i, extra_info)
-            print(i)
-            print(product_dict)
             data.append(product_dict)
     return data
 ### Function to get product info from a main category
-def get_shopee_product_info_from_category(cat_url, max_page=0, extra_info=False):
     '''
     Scrape for multiple pages of products of a category.
     Uses get_product_info_from_page().
@@ -188,8 +169,8 @@ def get_shopee_product_info_from_category(cat_url, max_page=0, extra_info=False)
     '''
     products = []
-    page_n = 1
-    cat_page_url = cat_url + f'?page={page_n}'
     product_list = get_shopee_product_info_from_page(cat_page_url, extra_info=extra_info)
     while len(product_list)>0:
@@ -201,7 +182,7 @@ def get_shopee_product_info_from_category(cat_url, max_page=0, extra_info=False)
         if stop_flag:
             break
-        cat_page_url = cat_url + f'?page={page_n}'
         product_list = get_shopee_product_info_from_page(cat_page_url, extra_info=extra_info)
     return products
@@ -221,14 +202,11 @@ def scrap_shopee(search_product, num_max_page, extra_info):
     # info = get_shopee_product_info_single(product, True)
     # print(info)
-    start_driver(force_restart=True)
-    url = 'https://shopee.vn/search?keyword=' + search_product
     prod_data = [] # STORE YOUR PRODUCT INFO DICTIONARIES IN HERE
-    # prod_per_cat = get_product_info_from_category(main_cat['URL'], num_max_page, extra_info=extra_info)
-    prod_per_cat = get_shopee_product_info_from_category(url, num_max_page, extra_info=extra_info)
     prod_data.extend(prod_per_cat)
     close_driver() # Close driver when we're done

 from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
 from selenium.common.exceptions import NoSuchElementException
 from selenium.webdriver.common.by import By
 from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.chrome.service import Service
 import time
 # Global driver to use throughout the script
         close_driver()
     # Setting up the driver
+    service = Service()
+    options = Options()
+    options.add_argument('-headless') # we don't want a chrome browser opens, so it will run in the background
+    options.add_argument('-no-sandbox')
+    options.add_argument('-disable-dev-shm-usage')
+    DRIVER = webdriver.Chrome(service=service, options=options)
 ### Function to extract product info from the necessary html and json tags
 def get_shopee_product_info_single(product_element, extra_info):
     DRIVER.get(page_url) # Use the driver to get info from the product page
     time.sleep(3)
     # FIND ALL PRODUCT ITEMS
     products = DRIVER.find_elements(By.CLASS_NAME, 'col-xs-2-4 shopee-search-item-result__item')
     print(f'Found {len(products)} products')
     print(products)
+    if len(products)>0:
         for i in products:
             product_dict = get_shopee_product_info_single(i, extra_info)
             data.append(product_dict)
     return data
 ### Function to get product info from a main category
+def get_shopee_product_info_from_category(search_product, max_page=0, extra_info=False):
     '''
     Scrape for multiple pages of products of a category.
     Uses get_product_info_from_page().
     '''
     products = []
+    page_n = 0
+    cat_page_url = 'https://shopee.vn/search?keyword=' + search_product
     product_list = get_shopee_product_info_from_page(cat_page_url, extra_info=extra_info)
     while len(product_list)>0:
         if stop_flag:
             break
+        cat_page_url = cat_page_url + f'&page={page_n}'
         product_list = get_shopee_product_info_from_page(cat_page_url, extra_info=extra_info)
     return products
     # info = get_shopee_product_info_single(product, True)
     # print(info)
+    start_driver(force_restart=True)
     prod_data = [] # STORE YOUR PRODUCT INFO DICTIONARIES IN HERE
+    prod_per_cat = get_shopee_product_info_from_category(search_product, num_max_page, extra_info=extra_info)
     prod_data.extend(prod_per_cat)
     close_driver() # Close driver when we're done

scraper_tiki.py CHANGED Viewed

@@ -2,6 +2,7 @@ from selenium import webdriver
 from selenium.webdriver.chrome.options import Options
 from selenium.common.exceptions import NoSuchElementException
 from selenium.webdriver.common.by import By
 import time
 # Global driver to use throughout the script
@@ -22,12 +23,13 @@ def start_driver(force_restart=False):
         close_driver()
     # Setting up the driver
     options = Options()
     options.add_argument('-headless') # we don't want a chrome browser opens, so it will run in the background
     options.add_argument('-no-sandbox')
     options.add_argument('-disable-dev-shm-usage')
-    DRIVER = webdriver.Chrome(options=options)
 ### Function to extract product info from the necessary html and json tags
 def get_tiki_product_info_single(product_element, extra_info):
@@ -250,7 +252,8 @@ def get_tiki_product_info_from_category(cat_url, max_page=0, extra_info=False):
     products = []
     page_n = 1
-    cat_page_url = cat_url + f'?page={page_n}'
     product_list = get_tiki_product_info_from_page(cat_page_url, extra_info=extra_info)
     while len(product_list)>0:
@@ -262,7 +265,8 @@ def get_tiki_product_info_from_category(cat_url, max_page=0, extra_info=False):
         if stop_flag:
             break
-        cat_page_url = cat_url + f'?page={page_n}'
         product_list = get_tiki_product_info_from_page(cat_page_url, extra_info=extra_info)
     return products

 from selenium.webdriver.chrome.options import Options
 from selenium.common.exceptions import NoSuchElementException
 from selenium.webdriver.common.by import By
+from selenium.webdriver.chrome.service import Service
 import time
 # Global driver to use throughout the script
         close_driver()
     # Setting up the driver
+    service = Service()
     options = Options()
     options.add_argument('-headless') # we don't want a chrome browser opens, so it will run in the background
     options.add_argument('-no-sandbox')
     options.add_argument('-disable-dev-shm-usage')
+    DRIVER = webdriver.Chrome(service=service, options=options)
 ### Function to extract product info from the necessary html and json tags
 def get_tiki_product_info_single(product_element, extra_info):
     products = []
     page_n = 1
+    cat_page_url = cat_url + f'&page={page_n}'
+    print(cat_page_url)
     product_list = get_tiki_product_info_from_page(cat_page_url, extra_info=extra_info)
     while len(product_list)>0:
         if stop_flag:
             break
+        cat_page_url = cat_url + f'&page={page_n}'
+        print(cat_page_url)
         product_list = get_tiki_product_info_from_page(cat_page_url, extra_info=extra_info)
     return products