dreji18 committed on
Commit
946703c
·
1 Parent(s): 38537ba

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +185 -0
app.py ADDED
@@ -0,0 +1,185 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """
3
+ Created on Mon Jul 4 08:43:02 2022
4
+
5
+ @author: dreji18
6
+ """
7
+
8
+ import streamlit as st
9
+ import hydralit_components as hc
10
+ import datetime
11
+ import time
12
+ from Bio_Epidemiology_NER.bio_recognizer import ner_prediction
13
+ from Bio_Epidemiology_NER.bio_recognizer import pdf_annotate_streamlit
14
+ from functionforDownloadButtons import download_button
15
+ import fitz
16
+ import pandas as pd
17
+ import base64
18
+
19
# Page-level Streamlit settings: full-width layout, sidebar collapsed on load.
st.set_page_config(layout='wide', initial_sidebar_state='collapsed',)

# Colour overrides handed to HydraApp below as ``navbar_theme``.
over_theme = {'txc_inactive': '#FFFFFF','menu_background':'#696969','txc_active':'black'}

# app page setup
import hydralit as hy

# The multi-page application object; pages register themselves on it through
# the ``@app.addapp`` decorator and ``app.run()`` renders the selected page.
app = hy.HydraApp(
    title='Biomedical Epidemiology NER App',
    nav_container=None,
    # An actual boolean: passing the ``bool`` type itself only worked because
    # a class object happens to be truthy.
    nav_horizontal=True,
    layout='wide',
    #favicon = "🧊",
    use_navbar=True,
    navbar_theme=over_theme,
    navbar_sticky=True,
    navbar_mode='pinned',
    use_loader=True,
    use_cookie_cache=True,
    sidebar_state='auto',
    navbar_animation=True,
    allow_url_nav=False,
    hide_streamlit_markers=True,
    #use_banner_images=["./background.png",None,{'header':"<h1 style='text-align:center;padding: 10px 10px;color:black;font-size:200%;'>Biomedical Epidemiology Entity Recognizer</h1><br>"},None,"./background.png"],
    #banner_spacing=[5,30,60,30,5],
    clear_cross_app_sessions=True,
    session_params=None,
)
45
+
46
+
47
+ # individual pages
48
@app.addapp(is_home=True)
def my_home():
    """Render the home page: a centred heading, the project abstract and an image."""
    hy.markdown("<h3 style='text-align: center; color: black;'>Biomedical Epidemiology Named Entity Recognition System </h3>", unsafe_allow_html=True)

    # Hyphenated words are kept whole on one line: a line break directly after
    # the hyphen is rendered as a space, e.g. "non- medical".
    st.write("""There are a few challenges related to the task of biomedical named
entity recognition, which are: the existing methods consider a fewer
number of biomedical entities (e.g., disease, symptom, proteins,
genes); and these methods do not consider the social determinants
of health (age, gender, employment, race), which are the non-medical
factors related to patients’ health. We propose a machine
learning pipeline that improves on previous efforts in the following
ways: first, it recognizes many biomedical entity types other than
the standard ones; second, it considers non-clinical factors related
to patient’s health. This pipeline also consists of stages, such as pre-processing,
tokenization, mapping embedding lookup and named
entity recognition task to extract biomedical named entities from
the free texts. We present a new dataset that we prepare by curating
the COVID-19 case reports. The proposed approach outperforms
the baseline methods on five benchmark datasets with macro-and
micro-average F1 scores around 90, as well as our dataset with a
macro-and micro-average F1 score of 95.25 and 93.18 respectively""")
    hy.image("Epidemiologist.jpeg")
70
+
71
def _clip_to_word_limit(text, max_words):
    """Count the words in *text* and cut it after the first *max_words* words.

    A word is a maximal run of regex word characters, the same definition the
    word counter uses. Returns ``(word_count, clipped_text)``; the text is
    returned unchanged when it is within the limit.
    """
    import re  # local import: ``re`` is not in the module-level import block

    word_ends = [match.end() for match in re.finditer(r"\w+", text)]
    if len(word_ends) > max_words:
        # Slice at the end of the last allowed word so the limit applies to
        # words, not to characters.
        text = text[:word_ends[max_words - 1]]
    return len(word_ends), text


def _annotate_pdf_entities(document):
    """Run NER on every page of *document* and outline the accepted entities.

    Each page's text is passed to ``ner_prediction``. Every distinct value
    with a score above 0.5 and more than two characters is recorded, and each
    occurrence found on that page receives a rectangle annotation whose title
    is the entity group. *document* is modified in place.

    Returns a DataFrame with the columns Page (1-based), Entity Group, Value
    and Score.
    """
    final_df = pd.DataFrame(columns=["Page", "Entity Group", "Value", "Score"])

    # len(document) is the page count; it avoids the legacy camelCase
    # ``pageCount`` attribute, which newer PyMuPDF releases no longer provide.
    for page_no in range(len(document)):
        page_text = document.get_page_text(page_no)
        if not page_text.strip():
            # Nothing to recognise on a page without extractable text.
            continue

        # NOTE(review): compute='gpu' is hard-coded - confirm the deployment
        # target actually has a GPU.
        out = ner_prediction(corpus=page_text, compute='gpu')
        if out.empty:
            # An empty result may lack the "value" column used below.
            continue
        output = out.drop_duplicates(subset=["value"], keep='first')

        current_page = document[page_no]
        for _, row in output.iterrows():
            text = row['value']
            # Keep scores above 0.5 only, and skip values shorter than three
            # characters to avoid false positives.
            if not (row["score"] > 0.5 and len(text) > 2):
                continue

            final_df.loc[len(final_df.index)] = [page_no + 1, row['entity_group'], row['value'], row['score']]

            text_instances = current_page.search_for(text)
            if text_instances is None:
                continue
            # Mark every occurrence of the value on this page.
            for inst in text_instances:
                x0, y0, x1, y1 = inst  # coordinates of the hit on the page
                annot = current_page.add_rect_annot((x0, y0, x1, y1))
                info = annot.info
                info["title"] = row['entity_group']
                annot.set_info(info)
                annot.update()

    return final_df


@app.addapp(title='Entity Recognizer', icon="far fa-copy",)
def app2():
    """Render the 'Entity Recognizer' page.

    The page has two forms: one runs NER on pasted text (limited to the first
    500 words), the other runs NER on an uploaded PDF, lists the entities and
    shows the PDF with the entities outlined. Both offer a CSV download.
    """
    MAX_WORDS = 500

    hy.subheader("NER from text corpus")
    with hy.form(key="text_form"):
        # The two narrow columns are spacers only.
        _, c1, _, c2, c3 = hy.columns([0.07, 1, 0.07, 4, 1.5])
        with c1:
            hy.write("You can paste your biomedical data here. The Named Entity Recognition model will identify the required entities")
            hy.image("medical care logo template social media.png")

        with c2:
            doc = st.text_area(
                "Paste your text below (max 500 words)",
                height=310,
            )

            word_count, doc = _clip_to_word_limit(doc, MAX_WORDS)
            if word_count > MAX_WORDS:
                st.warning(
                    "⚠️ Your text contains "
                    + str(word_count)
                    + " words."
                    + " Only the first 500 words will be reviewed. Stay tuned as increased allowance is coming! 😊"
                )

            st.form_submit_button(label="🍃 Get me the data!")

        if doc:
            # NOTE(review): compute='gpu' is hard-coded - confirm the
            # deployment target actually has a GPU.
            pred_df = ner_prediction(corpus=doc, compute='gpu')
            with c3:
                st.dataframe(pred_df)
                download_button(pred_df, "key-value-content.csv", "📥 Download (.csv)")

    hy.markdown(" ")
    hy.markdown(" ")
    hy.markdown(" ")

    hy.subheader("NER from Pdf Reports")
    with hy.form(key="pdf_form"):
        _, c1, _, c2, _ = hy.columns([0.07, 1, 0.07, 4, 1.5])
        with c1:
            hy.write("You can upload your biomedical report here. The Named Entity Recognition model will identify the required entities")
            hy.image("medical care logo template social media.png")

        with c2:
            uploaded_file = st.file_uploader('Choose your .pdf file', type=["pdf"])
            st.form_submit_button(label="🍃 Get me the data!")

        if uploaded_file is not None:
            try:
                document = fitz.open(stream=uploaded_file.read(), filetype="pdf")
                try:
                    final_df = _annotate_pdf_entities(document)

                    if len(final_df) != 0:
                        final_df['Pdf File'] = uploaded_file.name
                        final_df = final_df[['Entity Group', 'Value', 'Score', 'Page', 'Pdf File']]
                        with c2:
                            st.dataframe(final_df)
                            download_button(final_df, "key-value-pdf.csv", "📥 Download (.csv)")
                    else:
                        print("No Entities Extracted!!!")

                    # The annotated copy is written next to the app and read
                    # back below for the inline preview.
                    annotated_path = uploaded_file.name.replace(".pdf", "_annot.pdf")
                    document.save(annotated_path)
                finally:
                    # Release the PyMuPDF document even if annotation fails.
                    document.close()

                with c2:
                    with open(annotated_path, "rb") as f:
                        base64_pdf = base64.b64encode(f.read()).decode('utf-8')
                    # Embed the annotated PDF as a base64 data URI in an iframe.
                    pdf_display = f'<iframe src="data:application/pdf;base64,{base64_pdf}" width="800" height="800" type="application/pdf"></iframe>'
                    st.markdown(pdf_display, unsafe_allow_html=True)

            except Exception as e:
                print("Error occured: {}".format(e))
                # Bare raise re-raises the same exception with its traceback.
                raise
181
+
182
+
183
+
184
# Hand control to Hydralit, which renders the navbar and the selected page.
app.run()
185
+