NTDuy committed on
Commit
48cee12
·
verified ·
1 Parent(s): 00165bd

fixed preprocessing function

Browse files
Files changed (1) hide show
  1. supervised_model/phobert.py +100 -100
supervised_model/phobert.py CHANGED
@@ -1,101 +1,101 @@
1
- from transformers import AutoTokenizer
2
- from transformers import AutoModelForSequenceClassification
3
- from distutils.dir_util import copy_tree
4
- from underthesea import word_tokenize
5
- from utils.data_preprocessing import *
6
- from vncorenlp import VnCoreNLP
7
- from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
8
- import torch
9
- import pandas as pd
10
- import numpy as np
11
- from optimum.bettertransformer import BetterTransformer
12
- from stqdm import stqdm
13
-
14
- MODEL_PATH = "D:\\Thesis Topic modelling\\Phobert-base-v2-shopee"
15
- TOKENIZE_PATH = "./vncorenlp/VnCoreNLP-1.1.1.jar"
16
-
17
- def get_prediction(predictions, threshold=0.5):
18
- # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
19
- sigmoid = torch.nn.Sigmoid()
20
- probs = sigmoid(torch.Tensor(predictions))
21
- # next, use threshold to turn them into integer predictions
22
- y_pred = np.zeros(probs.shape)
23
- y_pred[np.where(probs >= threshold)] = 1
24
- return y_pred
25
-
26
-
27
- class InferencePhobert:
28
- def __init__(self, tokenize_model = "underthesea", classification_model = MODEL_PATH):
29
- labels = ["Quality", "Serve", "Pack", "Shipping", "Price", "Other"]
30
- id2label = {idx:label for idx, label in enumerate(labels)}
31
- label2id = {label:idx for idx, label in enumerate(labels)}
32
- model = AutoModelForSequenceClassification.from_pretrained(classification_model, problem_type="multi_label_classification",
33
- num_labels=len(labels),
34
- id2label=id2label,
35
- label2id=label2id)
36
- model.eval()
37
- self.model = BetterTransformer.transform(model, keep_original_model=True)
38
- self.tokenizer = AutoTokenizer.from_pretrained(classification_model)
39
- self.segmenter_path = tokenize_model
40
-
41
- def rdrsegment(self, text):
42
- text = self.rdrsegmenter.tokenize(text)
43
- text = ' '.join([' '.join(x) for x in text])
44
- return text
45
-
46
- def preprocess(self, data):
47
- text_list = []
48
- if self.segmenter_path == "underthesea":
49
- for text in data:
50
- text = word_tokenize(text, format="text")
51
- text_list.append(text)
52
- else:
53
- self.rdrsegmenter = VnCoreNLP(self.segmenter_path, annotators="wseg", max_heap_size='-Xmx500m')
54
- for text in data:
55
- text = self.segmenter.tokenize(text)
56
- text = ' '.join([' '.join(x) for x in text])
57
- text_list.append(text)
58
- encoding = self.tokenizer(text_list, padding = "max_length", truncation = True, max_length = 125)
59
- return encoding
60
-
61
- def generate_dataset(self, processed_data, batch_size = 10):
62
- inputs = torch.tensor(processed_data["input_ids"])
63
- masks = torch.tensor(processed_data["attention_mask"])
64
- dataset = TensorDataset(inputs, masks)
65
- dataset_sampler = SequentialSampler(dataset)
66
- data_loader = DataLoader(dataset, sampler=dataset_sampler, batch_size=batch_size)
67
- return data_loader
68
-
69
- def predict(self, dataset):
70
- predictions = []
71
- for step, batch in stqdm(enumerate(dataset), total = len(dataset)):
72
- b_input_ids, b_input_mask = batch
73
- with torch.no_grad():
74
- self.model.eval()
75
- input_ids = torch.tensor(b_input_ids)
76
- attention_mask = torch.tensor(b_input_mask)
77
- outputs = self.model(input_ids,
78
- token_type_ids=None,
79
- attention_mask=attention_mask)
80
- prediction = get_prediction(outputs[0], threshold=0.5)
81
- predictions.append(prediction)
82
- res = np.concatenate(predictions)
83
- return res
84
-
85
- def predict_sentence(self, text):
86
- if self.segmenter_path == "underthesea":
87
- text = word_tokenize(text, format="text")
88
- else:
89
- self.rdrsegmenter = VnCoreNLP(self.segmenter_path, annotators="wseg", max_heap_size='-Xmx500m')
90
- text = self.rdrsegment(text)
91
- encoding = self.tokenizer([text], padding = "max_length", truncation = True, max_length = 125)
92
- inputs = torch.tensor(encoding["input_ids"])
93
- masks = torch.tensor(encoding["attention_mask"])
94
- with torch.no_grad():
95
- self.model.eval()
96
- output = self.model(inputs,
97
- token_type_ids=None,
98
- attention_mask=masks)
99
- sigmoid = torch.nn.Sigmoid()
100
- probs = sigmoid(torch.Tensor(output[0]))
101
  return probs
 
1
+ from transformers import AutoTokenizer
2
+ from transformers import AutoModelForSequenceClassification
3
+ from distutils.dir_util import copy_tree
4
+ from underthesea import word_tokenize
5
+ from utils.data_preprocessing import *
6
+ from vncorenlp import VnCoreNLP
7
+ from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
8
+ import torch
9
+ import pandas as pd
10
+ import numpy as np
11
+ from optimum.bettertransformer import BetterTransformer
12
+ from stqdm import stqdm
13
+
14
+ MODEL_PATH = "D:\\Thesis Topic modelling\\Phobert-base-v2-shopee"
15
+ TOKENIZE_PATH = "./vncorenlp/VnCoreNLP-1.1.1.jar"
16
+
17
+ def get_prediction(predictions, threshold=0.5):
18
+ # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
19
+ sigmoid = torch.nn.Sigmoid()
20
+ probs = sigmoid(torch.Tensor(predictions))
21
+ # next, use threshold to turn them into integer predictions
22
+ y_pred = np.zeros(probs.shape)
23
+ y_pred[np.where(probs >= threshold)] = 1
24
+ return y_pred
25
+
26
+
27
+ class InferencePhobert:
28
+ def __init__(self, tokenize_model = "underthesea", classification_model = MODEL_PATH):
29
+ labels = ["Quality", "Serve", "Pack", "Shipping", "Price", "Other"]
30
+ id2label = {idx:label for idx, label in enumerate(labels)}
31
+ label2id = {label:idx for idx, label in enumerate(labels)}
32
+ model = AutoModelForSequenceClassification.from_pretrained(classification_model, problem_type="multi_label_classification",
33
+ num_labels=len(labels),
34
+ id2label=id2label,
35
+ label2id=label2id)
36
+ model.eval()
37
+ self.model = BetterTransformer.transform(model, keep_original_model=True)
38
+ self.tokenizer = AutoTokenizer.from_pretrained(classification_model)
39
+ self.segmenter_path = tokenize_model
40
+
41
+ def rdrsegment(self, text):
42
+ text = self.rdrsegmenter.tokenize(text)
43
+ text = ' '.join([' '.join(x) for x in text])
44
+ return text
45
+
46
+ def preprocess(self, data):
47
+ text_list = []
48
+ if self.segmenter_path == "underthesea":
49
+ for text in data:
50
+ text = word_tokenize(text, format="text")
51
+ text_list.append(text)
52
+ else:
53
+ self.rdrsegmenter = VnCoreNLP(self.segmenter_path, annotators="wseg", max_heap_size='-Xmx500m')
54
+ for text in data:
55
+ text = self.rdrsegmenter.tokenize(text)
56
+ text = ' '.join([' '.join(x) for x in text])
57
+ text_list.append(text)
58
+ encoding = self.tokenizer(text_list, padding = "max_length", truncation = True, max_length = 125)
59
+ return encoding
60
+
61
+ def generate_dataset(self, processed_data, batch_size = 10):
62
+ inputs = torch.tensor(processed_data["input_ids"])
63
+ masks = torch.tensor(processed_data["attention_mask"])
64
+ dataset = TensorDataset(inputs, masks)
65
+ dataset_sampler = SequentialSampler(dataset)
66
+ data_loader = DataLoader(dataset, sampler=dataset_sampler, batch_size=batch_size)
67
+ return data_loader
68
+
69
+ def predict(self, dataset):
70
+ predictions = []
71
+ for step, batch in stqdm(enumerate(dataset), total = len(dataset)):
72
+ b_input_ids, b_input_mask = batch
73
+ with torch.no_grad():
74
+ self.model.eval()
75
+ input_ids = torch.tensor(b_input_ids)
76
+ attention_mask = torch.tensor(b_input_mask)
77
+ outputs = self.model(input_ids,
78
+ token_type_ids=None,
79
+ attention_mask=attention_mask)
80
+ prediction = get_prediction(outputs[0], threshold=0.5)
81
+ predictions.append(prediction)
82
+ res = np.concatenate(predictions)
83
+ return res
84
+
85
+ def predict_sentence(self, text):
86
+ if self.segmenter_path == "underthesea":
87
+ text = word_tokenize(text, format="text")
88
+ else:
89
+ self.rdrsegmenter = VnCoreNLP(self.segmenter_path, annotators="wseg", max_heap_size='-Xmx500m')
90
+ text = self.rdrsegment(text)
91
+ encoding = self.tokenizer([text], padding = "max_length", truncation = True, max_length = 125)
92
+ inputs = torch.tensor(encoding["input_ids"])
93
+ masks = torch.tensor(encoding["attention_mask"])
94
+ with torch.no_grad():
95
+ self.model.eval()
96
+ output = self.model(inputs,
97
+ token_type_ids=None,
98
+ attention_mask=masks)
99
+ sigmoid = torch.nn.Sigmoid()
100
+ probs = sigmoid(torch.Tensor(output[0]))
101
  return probs