Spaces:

NTDuy
/

vietnamese_ecommerce_topic_classification

Paused

App Files Files Community

NTDuy committed on Jun 24, 2024

Commit

3c7015e

verified ·

1 Parent(s): 6fa2f32

add comments

Browse files

Files changed (1) hide show

supervised_model/phobert.py +102 -3

supervised_model/phobert.py CHANGED Viewed

@@ -15,6 +15,19 @@ MODEL_PATH = "D:\\Thesis Topic modelling\\Phobert-base-v2-shopee"
 TOKENIZE_PATH = "./vncorenlp/VnCoreNLP-1.1.1.jar"
 def get_prediction(predictions, threshold=0.5):
     # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
     sigmoid = torch.nn.Sigmoid()
     probs = sigmoid(torch.Tensor(predictions))
@@ -26,6 +39,18 @@ def get_prediction(predictions, threshold=0.5):
 class InferencePhobert:
   def __init__(self, tokenize_model = "underthesea", classification_model = MODEL_PATH):
     labels = ["Quality",	"Serve",	"Pack",	"Shipping", "Price", "Other"]
     id2label = {idx:label for idx, label in enumerate(labels)}
     label2id = {label:idx for idx, label in enumerate(labels)}
@@ -39,11 +64,40 @@ class InferencePhobert:
     self.segmenter_path = tokenize_model
   def rdrsegment(self, text):
-      text = self.rdrsegmenter.tokenize(text)
-      text = ' '.join([' '.join(x) for x in text])
-      return text
   def preprocess(self, data):
     text_list = []
     if self.segmenter_path == "underthesea":
       for text in data:
@@ -59,6 +113,23 @@ class InferencePhobert:
     return encoding
   def generate_dataset(self, processed_data, batch_size = 10):
     inputs = torch.tensor(processed_data["input_ids"])
     masks = torch.tensor(processed_data["attention_mask"])
     dataset = TensorDataset(inputs, masks)
@@ -67,6 +138,20 @@ class InferencePhobert:
     return data_loader
   def predict(self, dataset):
     predictions = []
     for step, batch in stqdm(enumerate(dataset), total = len(dataset)):
       b_input_ids, b_input_mask = batch
@@ -83,6 +168,20 @@ class InferencePhobert:
     return res
   def predict_sentence(self, text):
     if self.segmenter_path == "underthesea":
       text = word_tokenize(text, format="text")
     else:

 TOKENIZE_PATH = "./vncorenlp/VnCoreNLP-1.1.1.jar"
 def get_prediction(predictions, threshold=0.5):
+    """
+    Convert the classification model's raw outputs into per-label probabilities
+    Parameters
+    ----------
+    predictions : torch.tensor
+        output from the last linear layer
+    Returns
+    ----------
+    numpy.array
+        an array containing probabilities for each label
+    """
     # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
     sigmoid = torch.nn.Sigmoid()
     probs = sigmoid(torch.Tensor(predictions))
 class InferencePhobert:
   def __init__(self, tokenize_model = "underthesea", classification_model = MODEL_PATH):
+    """
+    A class for running inference with a PhoBERT model
+    Parameters
+    ----------
+    tokenize_model : string
+        which word segmenter to use for tokenizing text (underthesea or rdrsegmenter)
+    classification_model: string
+        path to model weight
+    """
     labels = ["Quality",	"Serve",	"Pack",	"Shipping", "Price", "Other"]
     id2label = {idx:label for idx, label in enumerate(labels)}
     label2id = {label:idx for idx, label in enumerate(labels)}
     self.segmenter_path = tokenize_model
   def rdrsegment(self, text):
+    """
+    Tokenize text using rdrsegmenter
+    Parameters
+    ----------
+    text : string
+        input text
+    Returns
+    ----------
+    string
+        tokenized text (For example, "san pham tot" to "san_pham tot")
+    """
+    text = self.rdrsegmenter.tokenize(text)
+    text = ' '.join([' '.join(x) for x in text])
+    return text
   def preprocess(self, data):
+    """
+    Reformat text to fit the PhoBERT model. This process includes tokenizing, byte-pair encoding and padding
+    Parameters
+    ----------
+    data : list
+        input text data
+    Returns
+    ----------
+    dictionary
+       Containing the encoded input ids and the attention mask.
+    """
     text_list = []
     if self.segmenter_path == "underthesea":
       for text in data:
     return encoding
   def generate_dataset(self, processed_data, batch_size = 10):
+    """
+    Generate a torch dataset from the preprocessed data
+    Parameters
+    ----------
+    processed_data : dictionary
+        output from preprocess function
+    batch_size: int
+        How many reviews to be included for each iteration
+    Returns
+    ----------
+    torch.dataset
+        Batched data built from the reviews' input ids and attention masks (no labels are included)
+    """
     inputs = torch.tensor(processed_data["input_ids"])
     masks = torch.tensor(processed_data["attention_mask"])
     dataset = TensorDataset(inputs, masks)
     return data_loader
   def predict(self, dataset):
+    """
+    Get prediction from PhoBERT model
+    Parameters
+    ----------
+    dataset : torch.dataset
+        output from generate_dataset function
+    Returns
+    ----------
+    numpy.array
+        containing probabilities for each label
+    """
     predictions = []
     for step, batch in stqdm(enumerate(dataset), total = len(dataset)):
       b_input_ids, b_input_mask = batch
     return res
   def predict_sentence(self, text):
+    """
+    Get prediction from PhoBERT model for a single review
+    Parameters
+    ----------
+    text : string
+        a single raw review to classify
+    Returns
+    ----------
+    numpy.array
+        containing probabilities for each label
+    """
     if self.segmenter_path == "underthesea":
       text = word_tokenize(text, format="text")
     else: