Spaces:

webshop
/

amazon_shop

Runtime error

App Files Files Community

John Yang committed on Jul 4, 2022

Commit

5d16b15

1 Parent(s): a8fcac2

eBay parsing working commit

Browse files

Files changed (2) hide show

app.py +17 -0
predict_help.py +123 -3

app.py CHANGED Viewed

@@ -7,6 +7,7 @@ from predict_help import (
     Page, convert_dict_to_actions, convert_html_to_text,
     parse_results_amz, parse_item_page_amz,
     parse_results_ws, parse_item_page_ws,
     WEBSHOP_URL, WEBSHOP_SESSION
 )
@@ -181,6 +182,12 @@ def run_episode(goal, verbose=True, env='amazon'):
                         f'{asin}/{query_str}/{page_num}/{options_str}'
                     )
                     return_value = "Product URL: " + asin_url
                 return return_value
             elif prev_page_type == Page.ITEM_PAGE:
@@ -210,6 +217,8 @@ def run_episode(goal, verbose=True, env='amazon'):
                     data = parse_results_amz(search_terms, page_num)
                 if env == 'webshop':
                     data = parse_results_ws(search_terms, page_num)
                 end = time.time()
                 print("Parsing search results took", end-begin, "seconds")
@@ -227,6 +236,8 @@ def run_episode(goal, verbose=True, env='amazon'):
                     data = parse_item_page_amz(asin)
                 if env == 'webshop':
                     data = parse_item_page_ws(asin, search_terms, page_num, options)
                 end = time.time()
                 print("Parsing item page took", end-begin, "seconds")
                 product_map[asin] = data
@@ -269,6 +280,12 @@ def run_episode(goal, verbose=True, env='amazon'):
                     f'{asin}/{query_str}/{page_num}/{options_str}'
                 )
                 return_value = "Product URL: " + asin_url
             return return_value
 gr.Interface(fn=run_episode,\

     Page, convert_dict_to_actions, convert_html_to_text,
     parse_results_amz, parse_item_page_amz,
     parse_results_ws, parse_item_page_ws,
+    parse_results_ebay, parse_item_page_ebay,
     WEBSHOP_URL, WEBSHOP_SESSION
 )
                         f'{asin}/{query_str}/{page_num}/{options_str}'
                     )
                     return_value = "Product URL: " + asin_url
+                if env == 'ebay':
+                    asin_url = f"https:///www.ebay.com/itm/{asin}"
+                    return_value = "Product URL: " + asin_url
+                    if len(clicked_options) > 0:
+                        options_str = ', '.join(list(clicked_options))
+                        return_value += "\nSelected Options: " + options_str
                 return return_value
             elif prev_page_type == Page.ITEM_PAGE:
                     data = parse_results_amz(search_terms, page_num)
                 if env == 'webshop':
                     data = parse_results_ws(search_terms, page_num)
+                if env == 'ebay':
+                    data = parse_results_ebay(search_terms, page_num)
                 end = time.time()
                 print("Parsing search results took", end-begin, "seconds")
                     data = parse_item_page_amz(asin)
                 if env == 'webshop':
                     data = parse_item_page_ws(asin, search_terms, page_num, options)
+                if env == 'ebay':
+                    data = parse_item_page_ebay(asin)
                 end = time.time()
                 print("Parsing item page took", end-begin, "seconds")
                 product_map[asin] = data
                     f'{asin}/{query_str}/{page_num}/{options_str}'
                 )
                 return_value = "Product URL: " + asin_url
+            if env == 'ebay':
+                asin_url = f"https:///www.ebay.com/itm/{asin}"
+                return_value = "Product URL: " + asin_url
+                if len(clicked_options) > 0:
+                    options_str = ', '.join(list(clicked_options))
+                    return_value += "\nSelected Options: " + options_str
             return return_value
 gr.Interface(fn=run_episode,\

predict_help.py CHANGED Viewed

@@ -1,7 +1,7 @@
 from bs4 import BeautifulSoup
 from bs4.element import Comment
 from enum import Enum
-import time
 from urllib.parse import urlencode
 import json, requests, torch
@@ -19,6 +19,7 @@ HEADER_ = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (K
 DEBUG_HTML = "temp.html"
 VERBOSE = True
 NUM_PROD_LIMIT = 10
 WEBSHOP_URL = "http://3.83.245.205:3000"
 WEBSHOP_SESSION = "abc"
@@ -28,6 +29,125 @@ def get_url(url):
     proxy_url = 'http://api.scraperapi.com/?' + urlencode(payload)
     return proxy_url
 def parse_results_ws(query, page_num=None):
     query_string = '+'.join(query.split())
     page_num = 1 if page_num is None else page_num
@@ -181,7 +301,8 @@ def parse_item_page_amz(asin):
     begin = time.time()
     webpage = requests.get(url, headers={'User-Agent': HEADER_, 'Accept-Language': 'en-US, en;q=0.5'})
     end = time.time()
-    print("Item page scraping took", end-begin, "seconds")
     soup = BeautifulSoup(webpage.content, "html.parser")
     # Title
@@ -225,7 +346,6 @@ def parse_item_page_amz(asin):
         desc_div = desc_body.find(name="div", attrs={"id": "productDescription"})
         desc_ps = desc_div.findAll(name="p")
         desc = " ".join([p.text for p in desc_ps])
     except AttributeError:
         desc = "N/A"
     product_dict["Description"] = desc.strip()

 from bs4 import BeautifulSoup
 from bs4.element import Comment
 from enum import Enum
+import re, time
 from urllib.parse import urlencode
 import json, requests, torch
 DEBUG_HTML = "temp.html"
 VERBOSE = True
 NUM_PROD_LIMIT = 10
 WEBSHOP_URL = "http://3.83.245.205:3000"
 WEBSHOP_SESSION = "abc"
     proxy_url = 'http://api.scraperapi.com/?' + urlencode(payload)
     return proxy_url
+def parse_results_ebay(query, page_num=None):
+    """Scrape one page of eBay search results for `query`.
+
+    Args:
+        query: Free-text search string; runs of whitespace are collapsed.
+        page_num: Results page to request; `None` is treated as page 1.
+
+    Returns:
+        A list of dicts with the keys "asin" (the part of the listing link
+        after "https://www.ebay.com/itm/" and before any "?"), "Title" and
+        "Price". "Price" is the displayed price text, a list of
+        "$"-stripped strings when the text is a "low to high" range, or
+        `None` when the listing card has no price element. At most
+        NUM_PROD_LIMIT listing cards are examined.
+    """
+    page_num = 1 if page_num is None else page_num
+    # urlencode() joins words with "+" exactly like the hand-built query
+    # string used to, and additionally escapes characters such as "&" or
+    # "#" that would otherwise corrupt the URL.
+    params = urlencode({"_nkw": " ".join(query.split()), "_pgn": page_num})
+    url = f'https://www.ebay.com/sch/i.html?{params}'
+    if VERBOSE:
+        print(f"Search Results URL: {url}")
+    webpage = requests.get(url, headers={'User-Agent': HEADER_, 'Accept-Language': 'en-US, en;q=0.5'})
+    soup = BeautifulSoup(webpage.text, 'html.parser')
+    products = soup.select('.s-item__wrapper.clearfix')
+    item_url_prefix = "https://www.ebay.com/itm/"
+    results = []
+    for item in products[:NUM_PROD_LIMIT]:
+        title_tag = item.select_one('.s-item__title')
+        link_tag = item.select_one('.s-item__link')
+        if title_tag is None or link_tag is None or not link_tag.get('href'):
+            # Malformed card: skip it instead of aborting the whole search.
+            continue
+        title = title_tag.text.strip()
+        if "shop on ebay" in title.lower():
+            # Skip "Shop on ebay" product title
+            continue
+        link = link_tag['href']
+        asin = link.split("?")[0][len(item_url_prefix):]
+        price_tag = item.select_one('.s-item__price')
+        price = None if price_tag is None else price_tag.text
+        # Test for the same " to " separator that is used for splitting, so
+        # text that merely contains the letters "to" is left untouched.
+        if price is not None and " to " in price:
+            price = [p.strip("$") for p in price.split(" to ")]
+        results.append({
+            "asin": asin,
+            "Title": title,
+            "Price": price,
+        })
+    if VERBOSE:
+        print(f"Scraped {len(results)} products")
+    return results
+def parse_item_page_ebay(asin):
+    """Scrape the eBay item page for `asin` into a product dictionary.
+
+    Every field is extracted on a best-effort basis: when the expected
+    element is missing from the page, the field falls back to a placeholder
+    instead of raising.
+
+    Args:
+        asin: Item identifier appended to "https://www.ebay.com/itm/".
+
+    Returns:
+        A dict with the keys "asin", "Title", "Price", "MainImage", "Rating",
+        "options", "option_to_image", "Description" and "BulletPoints".
+        Fallbacks are "N/A" for Title, Price, Description and BulletPoints,
+        "" for MainImage, `None` for Rating and `{}` for options.
+        "option_to_image" is always an empty dict.
+    """
+    # Errors raised when an expected tag, attribute or match is absent:
+    # attribute access on None, subscripting None, missing tag attribute,
+    # empty match list.
+    missing = (AttributeError, TypeError, KeyError, IndexError)
+    headers = {'User-Agent': HEADER_, 'Accept-Language': 'en-US, en;q=0.5'}
+    product_dict = {}
+    product_dict["asin"] = asin
+    url = f"https://www.ebay.com/itm/{asin}"
+    if VERBOSE:
+        print(f"Item Page URL: {url}")
+    begin = time.time()
+    webpage = requests.get(url, headers=headers)
+    end = time.time()
+    if VERBOSE:
+        print(f"Item page scraping took {end-begin} seconds")
+    soup = BeautifulSoup(webpage.content, "html.parser")
+    # Title
+    try:
+        product_dict["Title"] = soup.find('h1', {'class': 'x-item-title__mainTitle'}).text.strip()
+    except missing:
+        product_dict["Title"] = "N/A"
+    # Price: Get price string, extract decimal numbers from string
+    try:
+        price_str = soup.find('div', {'class': 'mainPrice'}).text
+        # Raw string so "\d" and "\." reach the regex engine unchanged.
+        prices = re.findall(r'\d*\.?\d+', price_str)
+        product_dict["Price"] = prices[0]
+    except missing:
+        product_dict["Price"] = "N/A"
+    # Main Image
+    try:
+        img_div = soup.find('div', {'id': 'mainImgHldr'})
+        img_link = img_div.find('img', {'id': 'icImg'})["src"]
+        product_dict["MainImage"] = img_link
+    except missing:
+        product_dict["MainImage"] = ""
+    # Rating: first whitespace-separated token of the star widget's title
+    try:
+        rating = soup.find('span', {'class': 'reviews-star-rating'})["title"].split()[0]
+    except missing:
+        rating = None
+    product_dict["Rating"] = rating
+    # Options
+    options, options_to_images = {}, {} # TODO: options_to_images possible?
+    try:
+        option_blocks = soup.findAll('select', {'class': 'msku-sel'})
+        for block in option_blocks:
+            name = block["name"].strip().strip(":")
+            # Do not include "- select -" (aka `not selected`) choice
+            options[name] = [
+                option_tag.text
+                for option_tag in block.findAll("option")
+                if "select" not in option_tag.text.lower()
+            ]
+    except missing:
+        # Discard partially collected options, as before.
+        options = {}
+    product_dict["options"], product_dict["option_to_image"] = options, options_to_images
+    # Description
+    try:
+        # Ebay descriptions are shown in `iframe`s
+        desc_link = soup.find('iframe', {'id': 'desc_ifr'})["src"]
+        desc_webpage = requests.get(desc_link, headers=headers)
+        desc_soup = BeautifulSoup(desc_webpage.content, "html.parser")
+        desc = ' '.join(desc_soup.text.split())
+    except missing + (requests.exceptions.RequestException,):
+        # A failed or malformed iframe request must not lose the other fields.
+        desc = "N/A"
+    product_dict["Description"] = desc
+    # Features
+    try:
+        features = soup.find('div', {'class': 'x-about-this-item'}).text
+    except missing:
+        features = "N/A"
+    product_dict["BulletPoints"] = features
+    return product_dict
 def parse_results_ws(query, page_num=None):
     query_string = '+'.join(query.split())
     page_num = 1 if page_num is None else page_num
     begin = time.time()
     webpage = requests.get(url, headers={'User-Agent': HEADER_, 'Accept-Language': 'en-US, en;q=0.5'})
     end = time.time()
+    if VERBOSE:
+        print(f"Item page scraping took {end-begin} seconds")
     soup = BeautifulSoup(webpage.content, "html.parser")
     # Title
         desc_div = desc_body.find(name="div", attrs={"id": "productDescription"})
         desc_ps = desc_div.findAll(name="p")
         desc = " ".join([p.text for p in desc_ps])
     except AttributeError:
         desc = "N/A"
     product_dict["Description"] = desc.strip()