| | import re
|
| | import json
|
| | import requests
|
| | import pandas as pd
|
| | import datetime as dt
|
| | import time
|
| |
|
class ShopeeCrawler:
    """Crawl product reviews and shop information from Shopee.vn.

    Reviews are accumulated in ``self.data``, a dictionary of parallel lists
    (one entry per stored review) keyed by ``itemid``, ``shopid``,
    ``username``, ``rating``, ``time``, ``source`` and ``comment``.
    """

    # Endpoints queried by the crawler.
    RATINGS_URL = "https://shopee.vn/api/v2/item/get_ratings"
    CATEGORY_URL = "https://shopee.vn/api/v4/recommend/recommend"
    SHOP_INFO_URL = "https://shopee.vn/api/v4/product/get_shop_info"

    # Number of reviews requested per page; also the step applied to the offset.
    PAGE_SIZE = 20

    # Seconds to wait for an HTTP response, so a stalled connection cannot
    # block a crawl forever.
    REQUEST_TIMEOUT = 30

    # Fields copied from the shop-info payload by GetShopInfo, in output order.
    SHOP_INFO_FIELDS = (
        "name",
        "ctime",
        "is_shopee_verified",
        "is_preferred_plus_seller",
        "is_official_shop",
        "shop_location",
        "item_count",
        "rating_star",
        "response_rate",
        "response_time",
        "rating_bad",
        "rating_good",
        "rating_normal",
    )

    def __init__(self):
        self.data = {
            "itemid": [],
            "shopid": [],
            "username": [],
            "rating": [],
            "time": [],
            "source": [],
            "comment": [],
        }
        self.shop_id, self.item_id = None, None

    def get_ids_from_link(self, base_url):
        """
        Extract the two numeric ids embedded in a product link.

        Parameters
        ----------
        base_url : str
            Product link containing a segment of the form ``i.<digits>.<digits>``

        Returns
        ----------
        tuple
            the two ids as strings, in the order they appear in the link

        Raises
        ----------
        ValueError
            if the link does not contain an ``i.<digits>.<digits>`` segment

        Notes
        ----------
        NOTE(review): earlier documentation described the result as
        (product id, shop id). Shopee links are commonly written as
        ``i.<shopid>.<itemid>`` -- confirm the order against a real link
        before passing the values on to ``Crawl``.
        """
        match = re.search(r"i\.(\d+)\.(\d+)", base_url)
        if match is None:
            raise ValueError(f"No 'i.<id>.<id>' segment found in link: {base_url!r}")
        return (match[1], match[2])

    @staticmethod
    def _is_recent(ctime, now=None):
        """Return True when the unix timestamp ``ctime`` is at most one day before ``now``."""
        if now is None:
            now = dt.datetime.now()
        return now - dt.datetime.fromtimestamp(ctime) <= dt.timedelta(days=1)

    def _store_rating(self, rating, item_id, shop_id):
        """Append one review record from the ratings payload to ``self.data``."""
        self.data["username"].append(rating["author_username"])
        self.data["rating"].append(rating["rating_star"])
        self.data["comment"].append(rating["comment"])
        self.data["time"].append(rating["ctime"])
        self.data["itemid"].append(item_id)
        self.data["shopid"].append(shop_id)
        self.data["source"].append("Shopee")

    @staticmethod
    def _extract_products(payload):
        """Return the ``itemid``/``shopid`` pairs listed in the first section of a category payload."""
        sections = ((payload or {}).get("data") or {}).get("sections") or []
        if not sections:
            return []
        items = (sections[0].get("data") or {}).get("item") or []
        return [{"itemid": item["itemid"], "shopid": item["shopid"]} for item in items]

    def Crawl(self, item_id, shop_id, display=False, most_recent=False, verbose=100):
        """
        Fetch every review page of one product and store the reviews.

        Pages of ``PAGE_SIZE`` reviews are requested with an increasing offset
        until a page comes back without ratings.

        Parameters
        ----------
        item_id : int
            Product ID

        shop_id : int
            Shop ID

        display : bool
            Print each fetched review (username, rating, comment), a separator
            line and the current page offset

        most_recent : bool
            only save reviews that are at most 1 day old

        verbose : int
            Currently unused; kept so existing callers keep working

        Returns
        ----------
        dict
            ``self.data``: parallel lists holding username, rating, comment,
            time (unix time), itemid, shopid and source for every review
            stored by this object so far

        Raises
        ----------
        requests.RequestException
            on timeouts, connection failures or HTTP error statuses
        """
        offset = 0
        while True:
            params = {
                "filter": 0,
                "flag": 1,
                "itemid": item_id,
                "limit": self.PAGE_SIZE,
                "offset": offset,
                "shopid": shop_id,
                "type": 0,
            }
            response = requests.get(self.RATINGS_URL, params=params, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()

            # A null or missing "data" object is treated like an empty page.
            ratings = (response.json().get("data") or {}).get("ratings")
            if not ratings:
                break

            for rating in ratings:
                if not most_recent or self._is_recent(rating["ctime"]):
                    self._store_rating(rating, item_id, shop_id)
                if display:
                    print(rating["author_username"])
                    print(rating["rating_star"])
                    print(rating["comment"])
                    print("-" * 100)
                    print(offset)

            offset += self.PAGE_SIZE

        return self.data

    def get_data(self):
        """
        Get all data crawled within the object

        Returns
        ----------
        dict
            ``self.data``: parallel lists holding username, rating, comment,
            time (unix time), itemid, shopid and source
        """
        return self.data

    def CrawlByCat(self, catid, cat_level=2, limit=None):
        """
        Crawl reviews for the products listed on a category landing page.

        Only the first page of the category listing (offset 0) is requested.
        Reviews are added to the data already held by this object.

        Parameters
        ----------
        catid : int
            category ID

        cat_level : int
            1 for category, 2 for subcategory

        limit : int
            maximum number of listed products to crawl (None to crawl every
            product returned by the listing request)

        Returns
        ----------
        dict
            ``self.data``: parallel lists holding username, rating, comment,
            time (unix time), itemid, shopid and source

        Raises
        ----------
        requests.RequestException
            on timeouts, connection failures or HTTP error statuses
        """
        params = {
            "bundle": "category_landing_page",
            "cat_level": cat_level,
            "catid": catid,
            "offset": 0,
        }
        response = requests.get(self.CATEGORY_URL, params=params, timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()

        products = self._extract_products(response.json())
        if limit is not None:
            # Clamp so a negative limit means "no products", not "all but the last".
            products = products[:max(limit, 0)]

        for product in products:
            self.Crawl(product["itemid"], product["shopid"])
        return self.data

    def GetShopInfo(self):
        """
        Get information about every distinct shop present in the crawled data.

        Returns
        ----------
        dict
            parallel lists keyed by ``shopid`` followed by the names in
            ``SHOP_INFO_FIELDS``; a field missing from a shop's payload is
            recorded as None so the lists stay aligned

        Raises
        ----------
        requests.RequestException
            on timeouts, connection failures or HTTP error statuses
        """
        output = {"shopid": []}
        output.update((field, []) for field in self.SHOP_INFO_FIELDS)

        # dict.fromkeys removes duplicates while keeping first-seen order.
        for shop_id in dict.fromkeys(self.data["shopid"]):
            response = requests.get(
                self.SHOP_INFO_URL,
                params={"shopid": shop_id},
                timeout=self.REQUEST_TIMEOUT,
            )
            response.raise_for_status()
            info = response.json().get("data") or {}

            output["shopid"].append(shop_id)
            for field in self.SHOP_INFO_FIELDS:
                output[field].append(info.get(field))
        return output