File size: 3,681 Bytes
f948662 c24bb0e e8f2766 4bfc56c a2db1d8 4bfc56c 38be73f 4bfc56c 73337c1 4bfc56c 73337c1 27dc83c 4bfc56c 803b81a 4bfc56c f539e87 4bfc56c f948662 f539e87 f948662 f539e87 f948662 f539e87 59bc195 f948662 3e2a410 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 |
import streamlit as st
from scipy.special import softmax
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
import os
from pytorch_transformers import BertForTokenClassification
import torch
from transformers import BertConfig, BertForSequenceClassification, BertTokenizer
# Maximum sequence length (including [CLS] and [SEP]) fed to the model.
max_len = 60

# Directory holding the fine-tuned model weights, config and vocab files.
bert_out_address = 'model/'

# Run on GPU when available, otherwise fall back to CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load the fine-tuned token-classification model (5 tags: O / ASP / X /
# [CLS] / [SEP]).
# NOTE(review): the original code first constructed a
# BertForSequenceClassification from config.json and loaded its state dict,
# then immediately discarded it by rebinding `model` to the line below —
# that redundant (and slow) load has been removed.
model = BertForTokenClassification.from_pretrained(bert_out_address, num_labels=5)
model.to(device)  # move once at startup rather than on every request

# Tokenizer matching the fine-tuned model's vocabulary.
tokenizer = BertTokenizer.from_pretrained(bert_out_address)

# Inference only: disable dropout etc.
model.eval()
def predict(test_query):
    """Extract aspect tokens from *test_query*.

    Tokenizes the query with the module-level BERT tokenizer, wraps it in
    ``[CLS] ... [SEP]``, pads/truncates to ``max_len``, runs the
    token-classification model, and returns the list of word-piece tokens
    whose predicted tag is ``"ASP"``.

    Parameters
    ----------
    test_query : str
        Raw user text to analyze.

    Returns
    -------
    list[str]
        Word-piece tokens predicted as aspects (may be empty).
    """
    # Build [CLS] + wordpieces, truncating so [SEP] still fits in max_len.
    tokens = ['[CLS]'] + tokenizer.tokenize(test_query)
    if len(tokens) > max_len - 1:
        tokens = tokens[:max_len - 1]
    tokens.append('[SEP]')

    # Convert tokens to ids and right-pad up to max_len.
    input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(tokens)],
                              maxlen=max_len, dtype="long",
                              truncating="post", padding="post")
    # 1 for real tokens, 0 for padding positions.
    attention_masks = [[int(i > 0) for i in ii] for ii in input_ids]
    # NOTE(review): the original also built `segment_ids` but passed
    # token_type_ids=None to the model, so that dead code was removed.

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)  # ensure model and inputs share a device
    input_ids_t = torch.tensor(input_ids).to(device)
    attention_masks_t = torch.tensor(attention_masks).to(device)

    with torch.no_grad():
        # BUGFIX: the original computed the attention mask but called the
        # model with attention_mask=None, letting it attend to padding.
        outputs = model(input_ids_t, token_type_ids=None,
                        attention_mask=attention_masks_t)
    logits = outputs[0]

    # Per-token tag probabilities -> most likely tag index per position.
    # (Redundant in-function `import torch` / `from scipy.special import
    # softmax` removed — both are already imported at module level.)
    probs = softmax(logits.detach().cpu().numpy()[0])
    result_list = np.argmax(probs, axis=-1)

    tag2name = {0: 'O', 1: 'ASP', 2: 'X', 3: '[CLS]', 4: '[SEP]'}
    # Collect word-piece tokens predicted as aspects, skipping padding.
    asp = []
    for i, mark in enumerate(attention_masks[0]):
        if mark > 0 and tag2name[result_list[i]] == "ASP":
            asp.append(tokens[i])
    return asp
# ---- Streamlit UI ----
st.title("Aspect Prediction App")

# Free-text input area for the query to analyze.
# BUGFIX: fixed user-facing typo "Prection" -> "Prediction".
user_input = st.text_area("Enter the text for Aspect Prediction:", "")

# Only run the model once the user has entered some text.
if user_input:
    with st.spinner("Analyzing..."):
        aspects = predict(user_input)
    # Display the extracted aspects explicitly instead of relying on
    # Streamlit's "magic" rendering of a bare expression.
    st.write(f"**Aspects** : {aspects}")