Spaces:
Sleeping
Sleeping
| import torch | |
| from transformers import AutoModel, AutoTokenizer | |
| from underthesea import word_tokenize | |
| import __main__ | |
| import time | |
| #phobert = AutoModel.from_pretrained("vinai/phobert-base") | |
# Tokenizer files are read from a local directory, so importing this module
# does not fetch the vocabulary from the Hugging Face Hub.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="./bert/bert_tokenizer",
)
class PhoBertModel(torch.nn.Module):
    """PhoBERT encoder topped with a small classification head.

    The head is ``Linear -> Tanh -> Dropout(0.1) -> Linear(6) -> Sigmoid``,
    so every input row yields six independent scores in ``(0, 1)``.
    """

    def __init__(self, bert=None):
        """Build the classifier around an encoder.

        Args:
            bert: Encoder module exposing ``config.hidden_size`` and returning
                a 2-tuple when called with ``return_dict=False``. When
                omitted, the ``vinai/phobert-base`` checkpoint is loaded.
        """
        super().__init__()
        if bert is None:
            # No module-level encoder is defined in this file, so load one
            # lazily here instead of failing with a NameError. Unpickling a
            # saved model bypasses __init__, so this never runs on load.
            bert = AutoModel.from_pretrained("vinai/phobert-base")
        self.bert = bert
        hidden_size = self.bert.config.hidden_size
        self.pre_classifier = torch.nn.Linear(hidden_size, hidden_size)
        self.dropout = torch.nn.Dropout(0.1)
        self.classifier = torch.nn.Linear(hidden_size, 6)

    def forward(self, input_ids, attention_mask, token_type_ids=None):
        """Return sigmoid scores for a batch of encoded texts.

        Args:
            input_ids: Token-id tensor passed to the encoder.
            attention_mask: Attention-mask tensor passed to the encoder.
            token_type_ids: Accepted for interface compatibility only; it is
                not forwarded to the encoder.

        Returns:
            Tensor whose last dimension has size 6, with values in ``(0, 1)``.
        """
        # return_dict=False makes the encoder return a plain tuple. Only the
        # second element is used (presumably the pooled sentence vector, per
        # the Hugging Face BERT/RoBERTa convention -- confirm for the encoder
        # actually stored in the checkpoint).
        _, pooled = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=False,
        )
        hidden = torch.tanh(self.pre_classifier(pooled))
        logits = self.classifier(self.dropout(hidden))
        return torch.sigmoid(logits)
# Pickle resolves classes by "<module>.<name>". Exposing the class on __main__
# presumably lets a checkpoint that was saved from a script run as __main__ be
# unpickled when this file is imported as a module -- confirm against how the
# checkpoint was produced.
__main__.PhoBertModel = PhoBertModel
def getModel():
    """Load the saved classifier from ``./bert/phoBertModel.pth`` onto the CPU.

    Returns:
        The deserialized model object, switched to evaluation mode.
    """
    checkpoint_path = './bert/phoBertModel.pth'
    cpu = torch.device('cpu')
    # NOTE(security): the loaded object is used directly as a module, i.e. the
    # file is a full pickle. Unpickling can execute arbitrary code, so only
    # load checkpoints from a trusted source.
    try:
        # Newer PyTorch releases default to weights_only=True, which refuses
        # to unpickle whole module objects; request the legacy behaviour.
        model = torch.load(checkpoint_path, map_location=cpu, weights_only=False)
    except TypeError:
        # Older PyTorch releases do not know the weights_only keyword.
        model = torch.load(checkpoint_path, map_location=cpu)
    model.eval()
    return model
# Loaded once at import time so every prediction reuses the same instance.
model: torch.nn.Module = getModel()
def tokenize(data, max_length=200):
    """Encode the last text in *data* into fixed-length model inputs.

    Args:
        data: Iterable of texts. Only the final element is encoded; this is
            the same value the function returned when it encoded every
            element and kept the last result.
        max_length: Length each sequence is padded or truncated to.

    Returns:
        Dict with keys ``'ids'``, ``'mask'`` and ``'token_type_ids'``; each
        value is a tensor with a leading batch dimension of one.

    Raises:
        ValueError: If *data* contains no texts.
    """
    lines = list(data)
    if not lines:
        raise ValueError("tokenize() requires at least one text")

    encoded = tokenizer.encode_plus(
        lines[-1],
        max_length=max_length,
        add_special_tokens=False,
        # Explicit replacement for the deprecated pad_to_max_length=True,
        # which also enabled longest-first truncation implicitly.
        padding='max_length',
        truncation=True,
    )
    return {
        'ids': torch.tensor([encoded['input_ids']]),
        'mask': torch.tensor([encoded['attention_mask']]),
        'token_type_ids': torch.tensor([encoded['token_type_ids']]),
    }
def BERT_predict(text):
    """Score one text with the module-level classifier.

    Args:
        text: The input text to classify.

    Returns:
        The first row of the model output converted to a plain Python list.

    Side effects:
        Prints the elapsed wall-clock time of tokenization plus inference.
    """
    started = time.perf_counter()
    encoded = tokenize([text])
    # Inference only: skip autograd bookkeeping to save time and memory.
    with torch.no_grad():
        scores = model(encoded['ids'], encoded['mask'], encoded['token_type_ids'])
    elapsed = time.perf_counter() - started
    print(f'phoBERT: {elapsed}s')
    return scores.tolist()[0]
| # print(BERT_predict("xin chaof")) | |
| # print(BERT_predict("con chó")) | |
| # print(BERT_predict("đồ chó")) | |
| # print(BERT_predict("đồ ngu")) | |
| # print(BERT_predict("cái lồn")) | |
| # print(BERT_predict("óc chó")) | |
| # print(BERT_predict("đồ chó đẻ")) | |
| # print(BERT_predict("con đĩ")) |