# Spaces: Sleeping  (page-status text captured with the source; not part of the program)
| import gradio as gr | |
| from transformers import pipeline | |
| import torch | |
| import pandas as pd | |
| from openprompt.plms import load_plm | |
| from openprompt import PromptDataLoader | |
| from openprompt.prompts import ManualVerbalizer | |
| from openprompt.prompts import ManualTemplate | |
| from openprompt.data_utils import InputExample | |
| from openprompt import PromptForClassification | |
| from transformers import AutoModelForSeq2SeqLM, AutoTokenizer | |
def readLMwords(path="LoughranMcDonald_MasterDictionary_2020.csv"):
    """Load the sentiment word lists from a Loughran-McDonald dictionary CSV.

    Args:
        path: CSV file to read. It must contain a ``Word`` column plus the
            numeric ``Positive``, ``Negative`` and ``Uncertainty`` columns.
            Defaults to the file name the app has always used.

    Returns:
        A ``(positive, negative, uncertainty)`` tuple of lists holding the
        lower-cased words whose respective column is non-zero.
    """
    # pandas' default NA markers would silently turn literal dictionary
    # entries such as "NULL" or "NA" into NaN; treat only empty cells as
    # missing so those words stay strings.
    alldata = pd.read_csv(path, keep_default_na=False, na_values=[""])

    def flagged_words(column):
        # NOTE(review): `!= 0` also keeps rows with negative values in the
        # column - confirm that is intended for this dictionary.
        flagged = alldata.loc[alldata[column] != 0, "Word"]
        # Rows without a word cannot be used as label words.
        return flagged.dropna().astype(str).str.lower().tolist()

    return (
        flagged_words("Positive"),
        flagged_words("Negative"),
        flagged_words("Uncertainty"),
    )
def _translate_zh_to_en(sentences):
    """Translate sentences with the Helsinki-NLP/opus-mt-zh-en model; returns a list of strings."""
    translator_name = "Helsinki-NLP/opus-mt-zh-en"
    mt_tokenizer = AutoTokenizer.from_pretrained(translator_name)
    mt_model = AutoModelForSeq2SeqLM.from_pretrained(translator_name)
    # Truncate so that a very long line cannot exceed the translator's
    # maximum input length.
    encoded = mt_tokenizer(
        sentences, return_tensors="pt", padding=True, truncation=True
    )
    generated = mt_model.generate(**encoded)
    return [mt_tokenizer.decode(ids, skip_special_tokens=True) for ids in generated]


def sentiment_analysis(sentence: str, model_name: str) -> str:
    """Label every non-blank line of ``sentence`` as positive, neutral or negative.

    Each line is wrapped in the prompt ``<text> Shares are <mask>.`` and
    scored by the selected ``HYCCC/<model_name>`` checkpoint through
    OpenPrompt. The verbalizer's label words come from ``readLMwords()``;
    its uncertainty list is used for the "neutral" class. For the Chinese
    checkpoints the lines are translated to English before classification.

    Args:
        sentence: Input text, one sentence per line. Blank lines are ignored.
        model_name: Checkpoint name without the ``HYCCC/`` prefix.

    Returns:
        One ``"<label>, <original line>"`` entry per input line, joined with
        newlines. An empty string when there is nothing to classify.

    Raises:
        gr.Error: If no model, or an unknown model, was selected.
    """
    type_dic = {
        "HYCCC/RoBERTa_Chinese_AnnualReport_tuned": "roberta",
        "HYCCC/RoBERTa_Chinese_FinancialNews_tuned": "roberta",
        "HYCCC/RoBERTa_English_AnnualReport_tuned": "roberta",
        "HYCCC/RoBERTa_English_FinancialNews_tuned": "roberta",
    }

    # Validate the selection before any model is loaded so that a missing or
    # unknown choice fails fast with a readable message.
    if not model_name:
        raise gr.Error("Please select a model.")
    model_name = "HYCCC/" + model_name
    if model_name not in type_dic:
        raise gr.Error(f"Unknown model: {model_name}")

    # Blank lines carry no sentence to classify; drop them up front.
    raw_sentences = [
        line.strip() for line in (sentence or "").splitlines() if line.strip()
    ]
    if not raw_sentences:
        return ""

    template = '{"placeholder":"text_a"} Shares are {"mask"}.'
    classes = ['positive', 'neutral', 'negative']
    positive, negative, uncertainty = readLMwords()
    label_words = {
        "positive": positive,
        "neutral": uncertainty,
        "negative": negative,
    }

    if 'Chinese' in model_name:
        sentences = _translate_zh_to_en(raw_sentences)
    else:
        sentences = raw_sentences

    # The label is a dummy value; only the predictions are used.
    testdata = [
        InputExample(guid=i, text_a=text, label=0)
        for i, text in enumerate(sentences)
    ]

    plm, tokenizer, _model_config, WrapperClass = load_plm(
        type_dic[model_name], model_name
    )
    promptTemplate = ManualTemplate(
        text=template,
        tokenizer=tokenizer,
    )
    promptVerbalizer = ManualVerbalizer(
        classes=classes,
        label_words=label_words,
        tokenizer=tokenizer,
    )
    test_dataloader = PromptDataLoader(
        dataset=testdata,
        tokenizer=tokenizer,
        template=promptTemplate,
        tokenizer_wrapper_class=WrapperClass,
        batch_size=4,
        max_seq_length=512,
    )
    prompt_model = PromptForClassification(
        plm=plm,
        template=promptTemplate,
        verbalizer=promptVerbalizer,
        freeze_plm=True,
    )

    # Pure inference: switch to eval mode and skip autograd bookkeeping.
    prompt_model.eval()
    predictions = []
    with torch.no_grad():
        for inputs in test_dataloader:
            logits = prompt_model(inputs)
            predictions.extend(torch.argmax(logits, dim=-1).tolist())

    # Echo the original (untranslated) line next to its predicted label.
    return '\n'.join(
        f"{classes[label]}, {raw_sentences[i]}"
        for i, label in enumerate(predictions)
    )
# Input widgets: the sentences to score and the checkpoint to score them with.
sentence_box = gr.TextArea(
    placeholder="Enter sentence here. If you have multiple sentences, separate them with '\\n'.",
    label="Sentence",
    lines=5,
    max_lines=10,
)
model_picker = gr.Radio(
    choices=[
        "RoBERTa_Chinese_AnnualReport_tuned",
        "RoBERTa_Chinese_FinancialNews_tuned",
        "RoBERTa_English_AnnualReport_tuned",
        "RoBERTa_English_FinancialNews_tuned",
    ],
    label="Model Selection",
)

# Output widget: the text returned by sentiment_analysis.
sentiment_box = gr.TextArea(
    label="Sentiment",
    lines=5,
    show_copy_button=True,
    max_lines=10,
)

# Wire the widgets to sentiment_analysis and start the app.
demo = gr.Interface(
    fn=sentiment_analysis,
    inputs=[sentence_box, model_picker],
    outputs=sentiment_box,
    title="Prompt Learning-Based Disclosure Sentiment Detection",
)
demo.launch()