File size: 3,681 Bytes
f948662 c24bb0e e8f2766 4bfc56c a2db1d8 4bfc56c 38be73f 4bfc56c 73337c1 4bfc56c 73337c1 27dc83c 4bfc56c 803b81a 4bfc56c f539e87 4bfc56c f948662 f539e87 f948662 f539e87 f948662 f539e87 59bc195 f948662 3e2a410 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 |
import streamlit as st
from scipy.special import softmax
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
import os
from pytorch_transformers import BertForTokenClassification
import torch
from transformers import BertConfig, BertForSequenceClassification, BertTokenizer
# Maximum sequence length (including [CLS] and [SEP]) fed to the model.
max_len = 60

# Directory holding the fine-tuned model weights, config and vocab files.
bert_out_address = 'model/'

# Run on GPU when available, otherwise fall back to CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load the fine-tuned token-classification model (5 tags: O / ASP / X /
# [CLS] / [SEP]).
# NOTE(review): the original code first constructed a
# BertForSequenceClassification from config.json and loaded its state dict,
# then immediately discarded it by rebinding `model` to the line below —
# that redundant (and slow) load has been removed.
model = BertForTokenClassification.from_pretrained(bert_out_address, num_labels=5)
model.to(device)  # move once at startup rather than on every request

# Tokenizer matching the fine-tuned model's vocabulary.
tokenizer = BertTokenizer.from_pretrained(bert_out_address)

# Inference only: disable dropout etc.
model.eval()
def predict(test_query):
    """Extract aspect tokens from *test_query*.

    Tokenizes the query with the module-level BERT tokenizer, wraps it in
    ``[CLS] ... [SEP]``, pads/truncates to ``max_len``, runs the
    token-classification model, and returns the list of word-piece tokens
    whose predicted tag is ``"ASP"``.

    Parameters
    ----------
    test_query : str
        Raw user text to analyze.

    Returns
    -------
    list[str]
        Word-piece tokens predicted as aspects (may be empty).
    """
    # Build [CLS] + wordpieces, truncating so [SEP] still fits in max_len.
    tokens = ['[CLS]'] + tokenizer.tokenize(test_query)
    if len(tokens) > max_len - 1:
        tokens = tokens[:max_len - 1]
    tokens.append('[SEP]')

    # Convert tokens to ids and right-pad up to max_len.
    input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(tokens)],
                              maxlen=max_len, dtype="long",
                              truncating="post", padding="post")
    # 1 for real tokens, 0 for padding positions.
    attention_masks = [[int(i > 0) for i in ii] for ii in input_ids]
    # NOTE(review): the original also built `segment_ids` but passed
    # token_type_ids=None to the model, so that dead code was removed.

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)  # ensure model and inputs share a device
    input_ids_t = torch.tensor(input_ids).to(device)
    attention_masks_t = torch.tensor(attention_masks).to(device)

    with torch.no_grad():
        # BUGFIX: the original computed the attention mask but called the
        # model with attention_mask=None, letting it attend to padding.
        outputs = model(input_ids_t, token_type_ids=None,
                        attention_mask=attention_masks_t)
    logits = outputs[0]

    # Per-token tag probabilities -> most likely tag index per position.
    # (Redundant in-function `import torch` / `from scipy.special import
    # softmax` removed — both are already imported at module level.)
    probs = softmax(logits.detach().cpu().numpy()[0])
    result_list = np.argmax(probs, axis=-1)

    tag2name = {0: 'O', 1: 'ASP', 2: 'X', 3: '[CLS]', 4: '[SEP]'}
    # Collect word-piece tokens predicted as aspects, skipping padding.
    asp = []
    for i, mark in enumerate(attention_masks[0]):
        if mark > 0 and tag2name[result_list[i]] == "ASP":
            asp.append(tokens[i])
    return asp
# ---- Streamlit UI ----
st.title("Aspect Prediction App")

# Free-text input area for the query to analyze.
# BUGFIX: fixed user-facing typo "Prection" -> "Prediction".
user_input = st.text_area("Enter the text for Aspect Prediction:", "")

# Only run the model once the user has entered some text.
if user_input:
    with st.spinner("Analyzing..."):
        aspects = predict(user_input)
    # Display the extracted aspects explicitly instead of relying on
    # Streamlit's "magic" rendering of a bare expression.
    st.write(f"**Aspects** : {aspects}")