NTDuy committed on
Commit
6fa2f32
·
verified ·
1 Parent(s): 72d9d17

add comments

Browse files
Files changed (1) hide show
  1. data_crawler/Tiki_Crawl.py +35 -6
data_crawler/Tiki_Crawl.py CHANGED
@@ -64,6 +64,19 @@ params = {
64
  }
65
 
66
  def comment_parser(json):
 
 
 
 
 
 
 
 
 
 
 
 
 
67
  d = dict()
68
  d['id'] = json.get(id)
69
  d['title'] = json.get('title')
@@ -78,11 +91,27 @@ def comment_parser(json):
78
 
79
 
80
  def crawl_tiki(link, pages = 1):
81
- pid = re.search(r"p(\d+)", link).group(1)
82
- result = []
83
- params['product_id'] = pid
84
- print('Crawl comment for product {}'.format(pid))
85
- for i in range(pages):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
  params['page'] = i
87
  response = requests.get('https://tiki.vn/api/v2/reviews', headers=headers, params=params, cookies=cookies)
88
  if response.status_code == 200:
@@ -91,4 +120,4 @@ def crawl_tiki(link, pages = 1):
91
  result.append(comment_parser(comment))
92
  else:
93
  break
94
- return pd.DataFrame(result)
 
64
  }
65
 
66
  def comment_parser(json):
67
+ """
68
+ A function to extract information from json produced by Tiki API
69
+
70
+ Parameters
71
+ ----------
72
+ json : dictionary
73
+ The json format produced by Tiki API.
74
+
75
+ Returns
76
+ ----------
77
+ dict
78
+ a dictionary containing relevant information from the reviews
79
+ """
80
  d = dict()
81
  d['id'] = json.get(id)
82
  d['title'] = json.get('title')
 
91
 
92
 
93
  def crawl_tiki(link, pages = 1):
94
+ """
95
+ A function to request and retrieve information from Tiki API
96
+
97
+ Parameters
98
+ ----------
99
+ link : string
100
+ Product link from Tiki website
101
+
102
+ pages: int
103
+ Number of pages from the product page to crawl
104
+
105
+ Returns
106
+ ----------
107
+ pandas.DataFrame
108
+ a DataFrame containing relevant information from the reviews
109
+ """
110
+ pid = re.search(r"p(\d+)", link).group(1)
111
+ result = []
112
+ params['product_id'] = pid
113
+ print('Crawl comment for product {}'.format(pid))
114
+ for i in range(pages):
115
  params['page'] = i
116
  response = requests.get('https://tiki.vn/api/v2/reviews', headers=headers, params=params, cookies=cookies)
117
  if response.status_code == 200:
 
120
  result.append(comment_parser(comment))
121
  else:
122
  break
123
+ return pd.DataFrame(result)