# Source: Hugging Face Space by puji4ml — app.py (commit d038b75, verified)
import tensorflow as tf
import gradio as gr
from webScraping import parseURL
import spacy
def get_nlp():
    """Load and cache the spaCy pipeline (loading ``en_core_web_lg`` is
    expensive, so do it once per process instead of once per request).

    A sentencizer is added to the pipeline, responsible for splitting a
    paragraph into individual sentences exposed via ``doc.sents``.
    """
    if not hasattr(get_nlp, "_nlp"):
        nlp = spacy.load('en_core_web_lg')
        nlp.add_pipe("sentencizer")
        get_nlp._nlp = nlp
    return get_nlp._nlp


def build_sentence_features(sentences):
    """Return one feature dict per sentence.

    Each dict carries ``line_number`` (zero-based index), ``text``,
    ``total_lines`` (index of the last sentence, i.e. ``len - 1`` because
    indexing starts from 0), and ``line_position`` ("i_of_total") —
    mirroring the encoding presumably used for the training data.
    """
    total_line_in_abstract = len(sentences) - 1  # zero-based last index
    return [
        {
            'line_number': index,
            'text': sentence,
            'total_lines': total_line_in_abstract,
            'line_position': str(index) + "_of_" + str(total_line_in_abstract),
        }
        for index, sentence in enumerate(sentences)
    ]


def preprocess_abstract_data(url):
    """Fetch the abstract at *url* and build the four model inputs.

    Returns a tuple of (sentence strings, space-separated character
    strings, one-hot line numbers, one-hot total-line counts).
    """
    abstract_paragraph = parseURL(url)
    doc = get_nlp()(abstract_paragraph)
    abstract_sentences = [str(sentence) for sentence in doc.sents]
    features = build_sentence_features(abstract_sentences)
    # One-hot depths (15 and 20) must match the depths used for the
    # training data so the model accepts the right input shape.
    line_numbers = [feat['line_number'] for feat in features]
    abstract_line_numbers_one_hot = tf.one_hot(line_numbers, depth=15)
    total_lines = [feat['total_lines'] for feat in features]
    abstract_total_lines_one_hot = tf.one_hot(total_lines, depth=20)
    # list(sentence) splits a sentence into single characters; joining with
    # spaces yields the character-level input the model expects.
    abstract_characters = [' '.join(list(sentence)) for sentence in abstract_sentences]
    return (abstract_sentences, abstract_characters,
            abstract_line_numbers_one_hot, abstract_total_lines_one_hot)
# Label order must match the encoding used when the model was trained.
CLASS_NAMES = ['BACKGROUND', 'CONCLUSIONS', 'METHODS', 'OBJECTIVE', 'RESULTS']


def load_skimlit_model():
    """Load and cache the saved Keras model (deserialising it from disk is
    expensive, so do it once per process instead of once per request)."""
    if not hasattr(load_skimlit_model, "_model"):
        load_skimlit_model._model = tf.keras.models.load_model(
            'tribid_token_char_lineNo_totalLine_embedded_model')
    return load_skimlit_model._model


def group_sentences_by_label(sentences, labels):
    """Concatenate the sentences predicted for each abstract section.

    Returns the five section texts in the Gradio display order:
    OBJECTIVE, BACKGROUND, METHODS, RESULTS, CONCLUSIONS.
    """
    sections = {name: [] for name in CLASS_NAMES}
    for text, label in zip(sentences, labels):
        sections[label].append(text)
    display_order = ('OBJECTIVE', 'BACKGROUND', 'METHODS', 'RESULTS', 'CONCLUSIONS')
    # Join with a space so consecutive sentences do not run together.
    return [' '.join(sections[name]) for name in display_order]


def make_prediction(url):
    """Classify each sentence of the abstract at *url* into its role and
    return the five section texts in [OBJECTIVE, BACKGROUND, METHODS,
    RESULTS, CONCLUSIONS] order — one string per Gradio output box."""
    (abstract_sentences, abstract_characters,
     line_numbers_one_hot, total_lines_one_hot) = preprocess_abstract_data(url)
    model = load_skimlit_model()
    pred_probs = model.predict(x=(tf.constant(abstract_sentences),
                                  tf.constant(abstract_characters),
                                  line_numbers_one_hot,
                                  total_lines_one_hot))
    predictions = tf.argmax(pred_probs, axis=1)
    predicted_labels = [CLASS_NAMES[pred] for pred in predictions]
    return group_sentences_by_label(abstract_sentences, predicted_labels)
# Gradio front end: one URL textbox in, five labelled section boxes out.
# Output order mirrors the list returned by make_prediction.
app = gr.Interface(
    fn=make_prediction,
    inputs=gr.Textbox(placeholder="Paste your PubMed article URL here",
                      interactive=True, label="URL"),
    outputs=[gr.Textbox(label=section)
             for section in ("OBJECTIVE", "BACKGROUND", "METHODS",
                             "RESULTS", "CONCLUSIONS")],
    title="PubMed Article Abstract Skimming Tool",
    description="Classifies abstract sentences of your PubMed article into the role they play (e.g. objective, methods, results, etc) to get the overview of literature within seconds.",
    theme="soft",
    examples=[["https://pubmed.ncbi.nlm.nih.gov/20232240/"],
              ["https://pubmed.ncbi.nlm.nih.gov/22244707/"]],
)
app.launch()