Spaces:
Sleeping
Sleeping
File size: 4,111 Bytes
2ceb2ca 31e3f30 2ceb2ca 8554332 e254324 2ceb2ca 8554332 2ceb2ca e254324 1245f95 e254324 1245f95 e254324 1245f95 e254324 876aef6 93cb5d2 1245f95 93cb5d2 31e3f30 93cb5d2 31e3f30 93cb5d2 e254324 128044f 31e3f30 2ceb2ca b3cdb53 553d160 b3cdb53 8fdae23 9124f5b 2ceb2ca |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 |
import gradio as gr
from transformers import pipeline
import torch
import pandas as pd
from openprompt.plms import load_plm
from openprompt import PromptDataLoader
from openprompt.prompts import ManualVerbalizer
from openprompt.prompts import ManualTemplate
from openprompt.data_utils import InputExample
from openprompt import PromptForClassification
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
def readLMwords(path="LoughranMcDonald_MasterDictionary_2020.csv"):
    """Load the sentiment word lists from a Loughran-McDonald master dictionary CSV.

    The CSV must provide a ``Word`` column plus ``Positive``, ``Negative`` and
    ``Uncertainty`` flag columns; a word belongs to a category when its flag in
    that column is non-zero.

    Args:
        path: Location of the dictionary CSV. Defaults to the file name the
            app has always read from the working directory.

    Returns:
        A ``(positive, negative, uncertainty)`` tuple of lists of lower-cased
        words.
    """
    alldata = pd.read_csv(path)

    def flagged_words(column):
        # pandas' default NA parsing turns tokens such as "NULL" into missing
        # values; drop those so every returned entry is a real string.
        words = alldata.loc[alldata[column] != 0, "Word"].dropna()
        return words.astype(str).str.lower().tolist()

    return (
        flagged_words("Positive"),
        flagged_words("Negative"),
        flagged_words("Uncertainty"),
    )
def _translate_zh_to_en(sentences):
    """Translate a list of Chinese sentences to English with Helsinki-NLP/opus-mt-zh-en."""
    mt_name = "Helsinki-NLP/opus-mt-zh-en"
    mt_tokenizer = AutoTokenizer.from_pretrained(mt_name)
    mt_model = AutoModelForSeq2SeqLM.from_pretrained(mt_name)
    # truncation=True keeps over-long lines within the translation model's
    # maximum input length instead of failing inside generate().
    encoded = mt_tokenizer(
        sentences, return_tensors="pt", padding=True, truncation=True
    )
    generated = mt_model.generate(**encoded)
    return [mt_tokenizer.decode(ids, skip_special_tokens=True) for ids in generated]


def sentiment_analysis(sentence, model_name):
    """Classify each line of ``sentence`` as positive, neutral or negative.

    Every non-blank line is treated as one sentence. For checkpoints whose
    name contains ``"Chinese"`` the lines are first translated to English;
    the output always echoes the original (untranslated) lines.

    Args:
        sentence: Text with one sentence per line.
        model_name: Checkpoint name without the ``"HYCCC/"`` prefix, one of
            the names offered by the UI.

    Returns:
        One ``"<label>, <sentence>"`` line per input sentence joined with
        newlines, or an empty string when the input has no non-blank line.

    Raises:
        gr.Error: If ``model_name`` is missing or not a known checkpoint.
    """
    # Blank lines carry nothing to classify; skip them and avoid loading any
    # model when the whole input is empty.
    raw_sentences = [
        line for line in (sentence or "").strip().split('\n') if line.strip()
    ]
    if not raw_sentences:
        return ""

    type_dic = {
        "HYCCC/RoBERTa_Chinese_AnnualReport_tuned": "roberta",
        "HYCCC/RoBERTa_Chinese_FinancialNews_tuned": "roberta",
        "HYCCC/RoBERTa_English_AnnualReport_tuned": "roberta",
        "HYCCC/RoBERTa_English_FinancialNews_tuned": "roberta",
    }
    model_name = f"HYCCC/{model_name}"
    if model_name not in type_dic:
        # Covers both "nothing selected" (None) and unknown checkpoint names.
        raise gr.Error("Please select one of the available models.")

    template = '{"placeholder":"text_a"} Shares are {"mask"}.'
    classes = ['positive', 'neutral', 'negative']
    # The dictionary's "Uncertainty" list serves as the label words of the
    # "neutral" class.
    positive, negative, uncertainty = readLMwords()
    label_words = {
        "positive": positive,
        "neutral": uncertainty,
        "negative": negative,
    }

    if 'Chinese' in model_name:
        sentences = _translate_zh_to_en(raw_sentences)
    else:
        sentences = raw_sentences

    # Labels are placeholders: the examples are only used for prediction.
    testdata = [
        InputExample(guid=i, text_a=text, label=0)
        for i, text in enumerate(sentences)
    ]

    # NOTE: the checkpoint is loaded on every call.
    plm, tokenizer, _model_config, WrapperClass = load_plm(
        type_dic[model_name], model_name
    )
    promptTemplate = ManualTemplate(
        text=template,
        tokenizer=tokenizer,
    )
    promptVerbalizer = ManualVerbalizer(
        classes=classes,
        label_words=label_words,
        tokenizer=tokenizer,
    )
    test_dataloader = PromptDataLoader(
        dataset=testdata,
        tokenizer=tokenizer,
        template=promptTemplate,
        tokenizer_wrapper_class=WrapperClass,
        batch_size=4,
        max_seq_length=512,
    )
    prompt_model = PromptForClassification(
        plm=plm,
        template=promptTemplate,
        verbalizer=promptVerbalizer,
        freeze_plm=True,
    )

    # Inference only: switch to eval mode and skip autograd bookkeeping.
    prompt_model.eval()
    predictions = []
    with torch.no_grad():
        for inputs in test_dataloader:
            logits = prompt_model(inputs)
            # tolist() yields plain ints that index `classes` directly.
            predictions.extend(torch.argmax(logits, dim=-1).tolist())

    return '\n'.join(
        f"{classes[label]}, {text}"
        for label, text in zip(predictions, raw_sentences)
    )
# Input widgets: a multi-line text box (one sentence per line) and a radio
# selector for the fine-tuned checkpoint to use.
sentence_box = gr.TextArea(
    placeholder="Enter sentence here. If you have multiple sentences, separate them with '\\n'.",
    label="Sentence",
    lines=5,
    max_lines=10,
)
model_picker = gr.Radio(
    choices=[
        "RoBERTa_Chinese_AnnualReport_tuned",
        "RoBERTa_Chinese_FinancialNews_tuned",
        "RoBERTa_English_AnnualReport_tuned",
        "RoBERTa_English_FinancialNews_tuned",
    ],
    label="Model Selection",
)

# Output widget: shows the text returned by sentiment_analysis.
sentiment_box = gr.TextArea(
    label="Sentiment",
    lines=5,
    show_copy_button=True,
    max_lines=10,
)

# Wire the widgets to sentiment_analysis and start the app.
demo = gr.Interface(
    fn=sentiment_analysis,
    inputs=[sentence_box, model_picker],
    outputs=sentiment_box,
    title="Prompt Learning-Based Disclosure Sentiment Detection",
)
demo.launch()
|