File size: 4,111 Bytes
2ceb2ca
 
 
 
 
 
 
 
 
 
31e3f30
2ceb2ca
 
 
 
 
 
 
 
 
 
8554332
e254324
2ceb2ca
 
 
 
 
 
 
 
 
8554332
 
 
 
2ceb2ca
 
e254324
1245f95
 
 
 
e254324
1245f95
e254324
1245f95
e254324
 
 
 
876aef6
93cb5d2
1245f95
93cb5d2
31e3f30
93cb5d2
31e3f30
93cb5d2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e254324
128044f
31e3f30
 
 
2ceb2ca
b3cdb53
 
 
 
 
 
553d160
b3cdb53
8fdae23
9124f5b
2ceb2ca
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import gradio as gr
from transformers import pipeline
import torch
import pandas as pd
from openprompt.plms import load_plm
from openprompt import PromptDataLoader
from openprompt.prompts import ManualVerbalizer
from openprompt.prompts import ManualTemplate
from openprompt.data_utils import InputExample
from openprompt import PromptForClassification
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

def readLMwords():
    """Load the Loughran-McDonald 2020 sentiment word lists.

    Reads ``LoughranMcDonald_MasterDictionary_2020.csv`` from the working
    directory and returns three lists of lower-cased words:
    (positive, negative, uncertainty).
    """
    lm = pd.read_csv("LoughranMcDonald_MasterDictionary_2020.csv")

    def _words(column):
        # A non-zero entry in a sentiment column marks dictionary membership.
        return lm.loc[lm[column] != 0, "Word"].str.lower().tolist()

    return _words("Positive"), _words("Negative"), _words("Uncertainty")


def sentiment_analysis(sentence, model_name):
    """Classify newline-separated sentences as positive/neutral/negative.

    Uses prompt learning: each sentence is wrapped in a manual template
    ('... Shares are {mask}.') and a verbalizer maps the predicted mask
    token onto the Loughran-McDonald word lists.

    Parameters
    ----------
    sentence : str
        One or more sentences separated by newlines.
    model_name : str
        Short model id (the "HYCCC/" hub prefix is added here); Chinese
        models trigger a zh->en translation step first.

    Returns
    -------
    str
        One line per input sentence: "<predicted class>, <original sentence>".
    """
    model_name = "HYCCC/" + model_name
    raw_sentences = sentence.strip().split('\n')
    template = '{"placeholder":"text_a"} Shares are {"mask"}.'
    classes = ['positive', 'neutral', 'negative']
    # NOTE(review): the third list returned by readLMwords() is the
    # *Uncertainty* word list, bound here to the "neutral" class — looks
    # deliberate, but confirm this mapping is intended.
    positive, negative, neutral = readLMwords()
    label_words = {
        "positive": positive,
        "neutral": neutral,
        "negative": negative,
    }
    # Hub id -> PLM family name expected by openprompt's load_plm().
    type_dic = {
        "HYCCC/RoBERTa_Chinese_AnnualReport_tuned": "roberta",
        "HYCCC/RoBERTa_Chinese_FinancialNews_tuned": "roberta",
        "HYCCC/RoBERTa_English_AnnualReport_tuned": "roberta",
        "HYCCC/RoBERTa_English_FinancialNews_tuned": "roberta",
    }

    if 'Chinese' in model_name:
        # Chinese input is machine-translated to English before prompting,
        # since the prompt template/verbalizer are English.
        mt_tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-zh-en")
        mt_model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-zh-en")
        with torch.no_grad():  # inference only — skip autograd bookkeeping
            translated_tokens = mt_model.generate(
                **mt_tokenizer(raw_sentences, return_tensors="pt", padding=True)
            )
        sentences = [
            mt_tokenizer.decode(t, skip_special_tokens=True)
            for t in translated_tokens
        ]
    else:
        sentences = raw_sentences

    # Fixed: loop variable renamed so it no longer shadows the `sentence`
    # parameter. Label 0 is a placeholder — inputs are unlabeled at inference.
    testdata = [
        InputExample(guid=i, text_a=text, label=0)
        for i, text in enumerate(sentences)
    ]

    plm, tokenizer, model_config, WrapperClass = load_plm(type_dic[model_name], model_name)

    promptTemplate = ManualTemplate(
        text=template,
        tokenizer=tokenizer,
    )
    promptVerbalizer = ManualVerbalizer(
        classes=classes,
        label_words=label_words,
        tokenizer=tokenizer,
    )
    test_dataloader = PromptDataLoader(
        dataset=testdata,
        tokenizer=tokenizer,
        template=promptTemplate,
        tokenizer_wrapper_class=WrapperClass,
        batch_size=4,
        max_seq_length=512,
    )
    prompt_model = PromptForClassification(
        plm=plm,
        template=promptTemplate,
        verbalizer=promptVerbalizer,
        freeze_plm=True,
    )

    result = []
    # Fixed: wrap inference in no_grad (the PLM is frozen; gradients were
    # never used) and convert argmax tensors to plain ints before using
    # them as list indices.
    with torch.no_grad():
        for inputs in test_dataloader:
            logits = prompt_model(inputs)
            result.extend(torch.argmax(logits, dim=-1).tolist())

    # Report predictions against the ORIGINAL (untranslated) sentences.
    return '\n'.join(
        f"{classes[res]}, {raw_sentences[i]}" for i, res in enumerate(result)
    )



# Build and launch the Gradio UI: a multi-line text box plus a model
# selector feeding sentiment_analysis, with a copyable text output.
_model_choices = [
    "RoBERTa_Chinese_AnnualReport_tuned",
    "RoBERTa_Chinese_FinancialNews_tuned",
    "RoBERTa_English_AnnualReport_tuned",
    "RoBERTa_English_FinancialNews_tuned",
]
_sentence_input = gr.TextArea(
    placeholder="Enter sentence here. If you have multiple sentences, separate them with '\\n'.",
    label="Sentence",
    lines=5,
    max_lines=10,
)
_model_input = gr.Radio(choices=_model_choices, label="Model Selection")
_sentiment_output = gr.TextArea(
    label="Sentiment",
    lines=5,
    show_copy_button=True,
    max_lines=10,
)

demo = gr.Interface(
    fn=sentiment_analysis,
    inputs=[_sentence_input, _model_input],
    outputs=_sentiment_output,
    title="Prompt Learning-Based Disclosure Sentiment Detection",
)

demo.launch()