Commit
·
dba2b44
1
Parent(s):
2c0c269
Create app.py
Browse files
app.py
ADDED
|
@@ -0,0 +1,140 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import pickle
|
| 4 |
+
|
| 5 |
+
import gradio as gr
|
| 6 |
+
|
| 7 |
+
import torch
|
| 8 |
+
|
| 9 |
+
import math
|
| 10 |
+
|
| 11 |
+
from transformers import AutoTokenizer, AutoModel
|
| 12 |
+
|
| 13 |
+
import transformers
|
| 14 |
+
|
| 15 |
+
import re
|
| 16 |
+
|
| 17 |
+
# Classifier that is later called as ``mlp.predict`` on [NUM] token embeddings.
# SECURITY: unpickling executes arbitrary code embedded in the file, so
# "MLP_over_embeddings.pickle" must only ever come from a trusted source.
# The context manager closes the file handle, which the bare
# ``pickle.load(open(...))`` form leaked.
with open("MLP_over_embeddings.pickle", "rb") as mlp_file:
    mlp = pickle.load(mlp_file)

# Tokenizer and encoder are loaded from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained("nlpaueb/sec-bert-num")
model = AutoModel.from_pretrained('nlpaueb/sec-bert-num')

# Input here
|
| 23 |
+
|
| 24 |
+
def convert_actual_to_num(text, number, offset):
    """Replace the numeral located at ``offset`` in ``text`` with ``[NUM]``.

    The removed span starts at ``int(offset)`` and is as long as
    ``str(number)``. The marker is inserted with one space on each side,
    regardless of the whitespace already surrounding the numeral.

    Args:
        text: The full input string.
        number: The numeral as it appears in ``text`` (only its string
            length is used).
        offset: Start index of the numeral; anything accepted by ``int()``.

    Returns:
        The text with the numeral swapped for ``" [NUM] "``.
    """
    start = int(offset)
    end = start + len(str(number))
    return "".join((text[:start], " [NUM] ", text[end:]))
|
| 29 |
+
|
| 30 |
+
def num_detector_highlighter_adv(text):
    """Split ``text`` into numeral and non-numeral chunks.

    The text is scanned character by character with a two-character
    look-ahead. A numeral is a run of digits that may contain dots, as long
    as each dot is directly followed by a digit (``30``, ``13.5``, ``.5``).
    Everything else is collected into space-separated chunks.

    Args:
        text: Raw input string. ``None`` or ``""`` yields an empty list.

    Returns:
        A list of ``(chunk, label)`` tuples in reading order. ``label`` is
        ``"@POSITION <start index>"`` for numerals and ``""`` for every
        other chunk.
    """
    num_posn = []
    if not text:
        return num_posn

    # Two padding characters keep the two-character look-ahead in range for
    # every real character. Padding with only one would leave the last
    # character of the input unscanned.
    padded = text + "  "

    num = ""      # numeral currently being assembled
    posn = -1     # start index of that numeral, -1 while none is open
    others = ""   # non-numeral chunk currently being assembled

    for i, char in enumerate(text):
        following = padded[i + 1]
        after_following = padded[i + 2]

        if char.isdigit():
            if posn == -1:
                posn = i
            num += char
            # A numeral starts here, so the pending word chunk is complete.
            if others:
                num_posn.append((others, ""))
                others = ""
            continues = following.isdigit() or (
                following == "." and after_following.isdigit()
            )
            if not continues:
                num_posn.append((num, "@POSITION " + str(posn)))
                num = ""
                posn = -1
        elif char == ".":
            if following.isdigit():
                # The dot belongs to a numeral. When it opens the numeral
                # (".5") the position must point at the dot so that the
                # reported offset matches the reported numeral's length.
                if posn == -1:
                    posn = i
                num += char
            else:
                others += char
        elif char != " ":
            others += char
        elif others:
            # A space closes the current word chunk.
            num_posn.append((others, ""))
            others = ""

    if others:
        num_posn.append((others, ""))
    return num_posn
|
| 69 |
+
|
| 70 |
+
def _keep_one_decimal(number):
    """Return ``str(number)`` cut off after the first digit behind the first dot."""
    number_text = str(number)
    if "." in number_text:
        return number_text[: number_text.index(".") + 2]
    return number_text


def _order_of_magnitude(number_text):
    """Return ``int(log10(value)) + 1`` capped at 6; 1 for non-positive values."""
    value = float(number_text)
    if value <= 0:
        # log10 is undefined for 0, which appears whenever the numeral is
        # "0" or is truncated to "0.0"; treat it like any other value
        # below 1 instead of raising a math domain error.
        return 1
    return min(6, int(math.log10(value)) + 1)


def exnum_evaluator(df):
    """Label every numeral in ``df`` as "Exaggerated" or "Non-Exaggerated".

    For each row the numeral is replaced by ``[NUM]`` in the text, the text
    is tokenized and encoded with the module-level ``tokenizer``/``model``,
    and the hidden state of the ``[NUM]`` token is passed to ``mlp.predict``.
    A numeral is "Exaggerated" when that prediction differs from the
    magnitude computed from the numeral itself.

    Args:
        df: DataFrame with the columns ``text``, ``number`` and ``position``.
            For a non-empty frame the helper columns ``preprocessed_text``,
            ``number_processed``, ``pred``, ``calculated_magnitude`` and
            ``prediction`` are added to it in place.

    Returns:
        A DataFrame with the columns ``number``, ``position`` and
        ``prediction``; empty when ``df`` has no rows.
    """
    result_columns = ["number", "position", "prediction"]
    if df.empty:
        # A row-wise apply on an empty frame yields a DataFrame, which
        # cannot be assigned to a single column, so return early.
        return pd.DataFrame(columns=result_columns)

    df['preprocessed_text'] = [
        convert_actual_to_num(text, number, position)
        for text, number, position in zip(df['text'], df['number'], df['position'])
    ]
    # Only one digit after the decimal point is kept.
    df['number_processed'] = df['number'].apply(_keep_one_decimal)

    model.eval()  # inference mode; set once rather than per row
    all_preds = []
    for preprocessed_text in df['preprocessed_text'].values:
        tokenized_text = tokenizer.tokenize(preprocessed_text)
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        index = tokenized_text.index('[NUM]')
        tokens_tensor = torch.tensor([indexed_tokens])

        with torch.no_grad():
            last_hidden_states = model(tokens_tensor)[0]

        # Hidden state of the [NUM] token for the single sequence in the batch.
        embedding_of_num = last_hidden_states[:, index, :]
        embedding_of_num_use = list(embedding_of_num[0].cpu().detach().numpy())
        all_preds.append(mlp.predict([embedding_of_num_use])[0])

    df['pred'] = all_preds
    df['calculated_magnitude'] = df['number_processed'].apply(_order_of_magnitude)
    df['prediction'] = np.where(
        df['calculated_magnitude'] != df['pred'], "Exaggerated", "Non-Exaggerated"
    )
    return df[result_columns]
|
| 94 |
+
|
| 95 |
+
def change_checkbox_group(text2):
    """Fill the numeral checkbox group from the detector output.

    Args:
        text2: String form of a list of ``(chunk, label)`` tuples. Entries
            with an empty label are ignored. An empty or blank string is
            treated as "no numerals".

    Returns:
        A ``gr.CheckboxGroup.update`` that lists every numeral as
        ``"<number> <label>"`` and pre-selects all of them.
    """
    import ast  # function-scope import so the block is self-contained

    # SECURITY: ``text2`` comes back from a textbox, so a client can send
    # arbitrary content. ``eval`` would execute it; ``literal_eval`` only
    # accepts Python literals, which is all the list of tuples needs.
    chunks = ast.literal_eval(text2) if text2 and text2.strip() else []
    labels = [f"{num} {posn}" for num, posn in chunks if posn != ""]
    return gr.CheckboxGroup.update(choices=labels, label="Numerals", visible=True, value=labels)
|
| 99 |
+
|
| 100 |
+
def combined_fns(text, text2, choices=None):
    """Build the numeral table for ``text`` and run the evaluator on it.

    Args:
        text: The original input text.
        text2: String form of a list of ``(chunk, label)`` tuples. Entries
            with an empty label are ignored. An empty or blank string is
            treated as "no numerals".
        choices: Optional collection of ``"<number> <label>"`` strings.
            When non-empty, only the matching numerals are evaluated;
            ``None`` or an empty collection evaluates all of them.

    Returns:
        Whatever ``exnum_evaluator`` returns for the assembled DataFrame.
    """
    import ast  # function-scope import so the block is self-contained

    # SECURITY: ``text2`` comes back from a textbox, so a client can send
    # arbitrary content. ``eval`` would execute it; ``literal_eval`` only
    # accepts Python literals, which is all the list of tuples needs.
    chunks = ast.literal_eval(text2) if text2 and text2.strip() else []
    numerals = [(num, posn) for num, posn in chunks if posn != ""]

    df = pd.DataFrame({
        "text": [text] * len(numerals),
        "number": [num for num, _ in numerals],
        # Strip the label prefix so only the index remains.
        "position": [posn.replace("@POSITION ", "") for _, posn in numerals],
    })
    df['num_position'] = [f"{num} {posn}" for num, posn in numerals]

    if choices:
        # Copy the filtered rows: the evaluator adds columns to the frame,
        # which would otherwise be written to a slice of ``df``.
        df = df[df['num_position'].isin(choices)].copy()
    return exnum_evaluator(df)
|
| 108 |
+
|
| 109 |
+
#examples
|
| 110 |
+
def set_example_text(example_text):
    """Return a textbox update whose value is the first item of ``example_text``."""
    chosen_sample = example_text[0]
    return gr.Textbox.update(value=chosen_sample)
|
| 112 |
+
|
| 113 |
+
# User interface: text entry, numeral detection, and two prediction tabs.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# **Financial Exaggerated Numeral ClassifiEr (FENCE)**")

    with gr.Row():
        with gr.Column():
            text = gr.components.Textbox(
                label="Enter financial text here",
                lines=2,
                placeholder="Enter Financial Text here...",
            )
            b1 = gr.Button("Get numerals present in the entered text")
            # First handler: show the detected chunks as highlighted text.
            b1.click(
                num_detector_highlighter_adv,
                inputs=text,
                outputs=gr.HighlightedText(label='Numerals present in the text'),
            )
            # Second handler: store the same detector output in a hidden
            # textbox, which the prediction callbacks read back.
            text2 = gr.components.Textbox(visible=False)
            b1.click(num_detector_highlighter_adv, inputs=text, outputs=text2)

    with gr.Row():
        with gr.Tabs():
            with gr.TabItem("All numerals"):
                b2 = gr.Button("Predict for all numerals")
                b2.click(combined_fns, inputs=[text, text2], outputs=gr.DataFrame())

            with gr.TabItem("Specific numerals"):
                b3 = gr.Button("Get option to select numerals")
                num_posn_inp_ckbx = gr.CheckboxGroup(
                    choices=[],
                    interactive=True,
                    label='Specific Numerals',
                )
                b3.click(change_checkbox_group, inputs=text2, outputs=num_posn_inp_ckbx)
                b4 = gr.Button("Predict for specific numerals")
                b4.click(
                    combined_fns,
                    inputs=[text, text2, num_posn_inp_ckbx],
                    outputs=gr.DataFrame(),
                )

    # Clickable sample inputs; clicking one copies it into the text box.
    example_text = gr.Dataset(
        components=[text],
        samples=[
            ["Get 30% off Gap denim whilst recycling your old denim for communities in need"],
            [" Matthew Perry puts Malibu mansion on the market for $13.5 million"],
            ["Anton Art Center in Mt. Clemens hosts 19th Annual ArtParty Fundraiser - Twilight in the Tropics"],
            ["Black Friday Sales! - Vegas hotel packages 50% savings from Southwest Vacations"],
        ],
    )
    example_text.click(
        fn=set_example_text,
        inputs=example_text,
        outputs=example_text.components,
    )
    gr.Markdown("<sub><sup>How to use? [link](https://github.com/sohomghosh/FENCE_Financial_Exaggerated_Numeral_ClassifiEr/blob/main/README.md), Warning: User discretion is advised.</sup></sub>")

demo.launch(share=True)
|