NTDuy commited on
Commit
3c7015e
·
verified ·
1 Parent(s): 6fa2f32

add comments

Browse files
Files changed (1) hide show
  1. supervised_model/phobert.py +102 -3
supervised_model/phobert.py CHANGED
@@ -15,6 +15,19 @@ MODEL_PATH = "D:\\Thesis Topic modelling\\Phobert-base-v2-shopee"
15
  TOKENIZE_PATH = "./vncorenlp/VnCoreNLP-1.1.1.jar"
16
 
17
  def get_prediction(predictions, threshold=0.5):
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
19
  sigmoid = torch.nn.Sigmoid()
20
  probs = sigmoid(torch.Tensor(predictions))
@@ -26,6 +39,18 @@ def get_prediction(predictions, threshold=0.5):
26
 
27
  class InferencePhobert:
28
  def __init__(self, tokenize_model = "underthesea", classification_model = MODEL_PATH):
 
 
 
 
 
 
 
 
 
 
 
 
29
  labels = ["Quality", "Serve", "Pack", "Shipping", "Price", "Other"]
30
  id2label = {idx:label for idx, label in enumerate(labels)}
31
  label2id = {label:idx for idx, label in enumerate(labels)}
@@ -39,11 +64,40 @@ class InferencePhobert:
39
  self.segmenter_path = tokenize_model
40
 
41
  def rdrsegment(self, text):
42
- text = self.rdrsegmenter.tokenize(text)
43
- text = ' '.join([' '.join(x) for x in text])
44
- return text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
 
46
  def preprocess(self, data):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
  text_list = []
48
  if self.segmenter_path == "underthesea":
49
  for text in data:
@@ -59,6 +113,23 @@ class InferencePhobert:
59
  return encoding
60
 
61
  def generate_dataset(self, processed_data, batch_size = 10):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
  inputs = torch.tensor(processed_data["input_ids"])
63
  masks = torch.tensor(processed_data["attention_mask"])
64
  dataset = TensorDataset(inputs, masks)
@@ -67,6 +138,20 @@ class InferencePhobert:
67
  return data_loader
68
 
69
  def predict(self, dataset):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
70
  predictions = []
71
  for step, batch in stqdm(enumerate(dataset), total = len(dataset)):
72
  b_input_ids, b_input_mask = batch
@@ -83,6 +168,20 @@ class InferencePhobert:
83
  return res
84
 
85
  def predict_sentence(self, text):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
  if self.segmenter_path == "underthesea":
87
  text = word_tokenize(text, format="text")
88
  else:
 
15
  TOKENIZE_PATH = "./vncorenlp/VnCoreNLP-1.1.1.jar"
16
 
17
  def get_prediction(predictions, threshold=0.5):
18
+ """
19
+ Produce per-label probabilities from the classification model's raw outputs
20
+
21
+ Parameters
22
+ ----------
23
+ predictions : torch.tensor
24
+ output from the last linear layer
25
+
26
+ Returns
27
+ ----------
28
+ numpy.array
29
+ an array containing probabilities for each label
30
+ """
31
  # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
32
  sigmoid = torch.nn.Sigmoid()
33
  probs = sigmoid(torch.Tensor(predictions))
 
39
 
40
  class InferencePhobert:
41
  def __init__(self, tokenize_model = "underthesea", classification_model = MODEL_PATH):
42
+ """
43
+ A class for running inference with the PhoBERT model
44
+
45
+ Parameters
46
+ ----------
47
+ tokenize_model : string
48
+ choosing which model to use to tokenize text (underthesea or rdrsegmenter)
49
+
50
+ classification_model: string
51
+ path to model weight
52
+
53
+ """
54
  labels = ["Quality", "Serve", "Pack", "Shipping", "Price", "Other"]
55
  id2label = {idx:label for idx, label in enumerate(labels)}
56
  label2id = {label:idx for idx, label in enumerate(labels)}
 
64
  self.segmenter_path = tokenize_model
65
 
66
  def rdrsegment(self, text):
67
+ """
68
+ Tokenize text using rdrsegmenter
69
+
70
+ Parameters
71
+ ----------
72
+ text : string
73
+ input text
74
+
75
+ Returns
76
+ ----------
77
+ string
78
+ tokenized text (For example, "san pham tot" to "san_pham tot")
79
+
80
+ """
81
+ text = self.rdrsegmenter.tokenize(text)
82
+ text = ' '.join([' '.join(x) for x in text])
83
+ return text
84
 
85
  def preprocess(self, data):
86
+ """
87
+ Reformat text to fit the PhoBERT model. This process includes tokenizing, byte-pair encoding and padding
88
+
89
+ Parameters
90
+ ----------
91
+ data : list
92
+ input text data
93
+
94
+ Returns
95
+ ----------
96
+ dictionary
97
+ Containing encoded input ids and attention masks.
98
+
99
+ """
100
+
101
  text_list = []
102
  if self.segmenter_path == "underthesea":
103
  for text in data:
 
113
  return encoding
114
 
115
  def generate_dataset(self, processed_data, batch_size = 10):
116
+ """
117
+ Generate a torch dataset from data
118
+
119
+ Parameters
120
+ ----------
121
+ processed_data : dictionary
122
+ output from preprocess function
123
+
124
+ batch_size: int
125
+ How many reviews to be included for each iteration
126
+
127
+ Returns
128
+ ----------
129
+ torch.utils.data.DataLoader
130
+ DataLoader yielding batches of input ids and attention masks
131
+
132
+ """
133
  inputs = torch.tensor(processed_data["input_ids"])
134
  masks = torch.tensor(processed_data["attention_mask"])
135
  dataset = TensorDataset(inputs, masks)
 
138
  return data_loader
139
 
140
  def predict(self, dataset):
141
+ """
142
+ Get prediction from PhoBERT model
143
+
144
+ Parameters
145
+ ----------
146
+ dataset : torch.utils.data.DataLoader
147
+ output from generate_dataset function
148
+
149
+ Returns
150
+ ----------
151
+ numpy.array
152
+ containing probabilities for each label
153
+
154
+ """
155
  predictions = []
156
  for step, batch in stqdm(enumerate(dataset), total = len(dataset)):
157
  b_input_ids, b_input_mask = batch
 
168
  return res
169
 
170
  def predict_sentence(self, text):
171
+ """
172
+ Get prediction from PhoBERT model for a single review
173
+
174
+ Parameters
175
+ ----------
176
+ text : string
177
+ a single input review text
178
+
179
+ Returns
180
+ ----------
181
+ numpy.array
182
+ containing probabilities for each label
183
+
184
+ """
185
  if self.segmenter_path == "underthesea":
186
  text = word_tokenize(text, format="text")
187
  else: