Spaces:

d4data
/

Biomedical-Epidemiology-NER-App

Sleeping

App Files Files Community

dreji18 committed on Jul 6, 2022

Commit

946703c

1 Parent(s): 38537ba

Upload app.py

Browse files

Files changed (1) hide show

app.py +185 -0

app.py ADDED Viewed

	@@ -0,0 +1,185 @@

+# -*- coding: utf-8 -*-
+"""
+Created on Mon Jul  4 08:43:02 2022
+@author: dreji18
+"""
+import streamlit as st
+import hydralit_components as hc
+import datetime
+import time
+from Bio_Epidemiology_NER.bio_recognizer import ner_prediction
+from Bio_Epidemiology_NER.bio_recognizer import pdf_annotate_streamlit
+from functionforDownloadButtons import download_button
+import fitz
+import pandas as pd
+import base64
+# Page-level Streamlit configuration: wide layout, sidebar collapsed on load.
+st.set_page_config(layout='wide', initial_sidebar_state='collapsed',)
+# Colour overrides handed to HydraApp below as `navbar_theme`.
+over_theme = {'txc_inactive': '#FFFFFF','menu_background':'#696969','txc_active':'black'}
+# app page setup
+import hydralit as hy
+# Single HydraApp instance; pages register themselves on it via @app.addapp
+# and it is started by the app.run() call at the end of the file.
+app = hy.HydraApp(title='Biomedical Epidemiology NER App',
+                  nav_container= None,
+                  # Must be a boolean value; passing the `bool` type itself only
+                  # worked by accident because a class object is truthy.
+                  nav_horizontal=True,
+                  layout='wide',
+                  #favicon = "🧊",
+                  use_navbar=True,
+                  navbar_theme=over_theme,
+                  navbar_sticky=True,
+                  navbar_mode='pinned',
+                  use_loader=True,
+                  use_cookie_cache=True,
+                  sidebar_state = 'auto',
+                  navbar_animation=True,
+                  allow_url_nav=False,
+                  hide_streamlit_markers = True,
+                  #use_banner_images=["./background.png",None,{'header':"<h1 style='text-align:center;padding: 10px 10px;color:black;font-size:200%;'>Biomedical Epidemiology Entity Recognizer</h1><br>"},None,"./background.png"],
+                  #banner_spacing=[5,30,60,30,5],
+                  clear_cross_app_sessions=True,
+                  session_params=None
+                  )
+# individual pages
+@app.addapp(is_home=True)
+def my_home():
+    """Render the home page: a centred heading, the project abstract and an image."""
+    heading_html = "<h3 style='text-align: center; color: black;'>Biomedical Epidemiology Named Entity Recognition System </h3>"
+    # Abstract shown verbatim below the heading.
+    abstract = """There are a few challenges related to the task of biomedical named
+    entity recognition, which are: the existing methods consider a fewer
+    number of biomedical entities (e.g., disease, symptom, proteins,
+    genes); and these methods do not consider the social determinants
+    of health (age, gender, employment, race), which are the non-
+    medical factors related to patients’ health. We propose a machine
+    learning pipeline that improves on previous efforts in the following
+    ways: first, it recognizes many biomedical entity types other than
+    the standard ones; second, it considers non-clinical factors related
+    to patient’s health. This pipeline also consists of stages, such as pre-
+    processing, tokenization, mapping embedding lookup and named
+    entity recognition task to extract biomedical named entities from
+    the free texts. We present a new dataset that we prepare by curating
+    the COVID-19 case reports. The proposed approach outperforms
+    the baseline methods on five benchmark datasets with macro-and
+    micro-average F1 scores around 90, as well as our dataset with a
+    macro-and micro-average F1 score of 95.25 and 93.18 respectively"""
+    hy.markdown(heading_html, unsafe_allow_html=True)
+    st.write(abstract)
+    hy.image("Epidemiologist.jpeg")
+@app.addapp(title='Entity Recognizer', icon="far fa-copy",)
+def app2():
+    """Render the 'Entity Recognizer' page.
+
+    The page holds two independent forms:
+
+    * Text form: runs ``ner_prediction`` on the pasted text (truncated to the
+      first ``MAX_WORDS`` words) and shows the resulting dataframe together
+      with a CSV download button.
+    * PDF form: runs ``ner_prediction`` on every non-blank page of the
+      uploaded PDF, keeps entities above the score/length thresholds, outlines
+      each occurrence with a rectangle annotation titled with the entity
+      group, then shows the entity table, a CSV download button and the
+      annotated PDF in an inline viewer.
+
+    The annotated PDF is serialised in memory; nothing is written to disk.
+
+    Raises:
+        Exception: any error raised while processing the uploaded PDF is
+            printed and re-raised unchanged.
+    """
+    import re  # local import: only needed for the word counting below
+
+    MAX_WORDS = 500
+    SCORE_THRESHOLD = 0.5    # keep entities scoring strictly above this
+    MIN_ENTITY_LENGTH = 3    # shorter matches are dropped to limit false positives
+
+    # ------------------------------------------------------------------ text
+    hy.subheader("NER from text corpus")
+    with hy.form(key="text_form"):
+        _, c1, _, c2, c3 = hy.columns([0.07, 1, 0.07, 4, 1.5])
+        with c1:
+            hy.write("You can paste your biomedical data here. The Named Entity Recognition model will identify the required entities")
+            hy.image("medical care logo template social media.png")
+        # Kept inside the form block, mirroring the layout of the PDF form.
+        with c2:
+            doc = st.text_area(
+                "Paste your text below (max 500 words)",
+                height=310,
+            )
+            words = list(re.finditer(r"\w+", doc))
+            if len(words) > MAX_WORDS:
+                st.warning(
+                    "⚠️ Your text contains "
+                    + str(len(words))
+                    + " words."
+                    + " Only the first 500 words will be reviewed. Stay tuned as increased allowance is coming! 😊"
+                )
+                # Cut right after the MAX_WORDS-th word. Slicing with
+                # doc[:MAX_WORDS] would keep 500 *characters*, not 500 words.
+                doc = doc[:words[MAX_WORDS - 1].end()]
+            st.form_submit_button(label="🍃 Get me the data!")
+    # Skip the model call for empty or whitespace-only input.
+    if doc.strip():
+        # NOTE(review): compute='gpu' assumes the host has a GPU — confirm for
+        # the deployment target.
+        pred_df = ner_prediction(corpus=doc, compute='gpu')
+        with c3:
+            st.dataframe(pred_df)
+            download_button(pred_df, "key-value-content.csv", "📥 Download (.csv)")
+    hy.markdown(" ")
+    hy.markdown(" ")
+    hy.markdown(" ")
+    # ------------------------------------------------------------------- pdf
+    hy.subheader("NER from Pdf Reports")
+    with hy.form(key="pdf_form"):
+        _, c1, _, c2, _ = hy.columns([0.07, 1, 0.07, 4, 1.5])
+        with c1:
+            hy.write("You can upload your biomedical report here. The Named Entity Recognition model will identify the required entities")
+            hy.image("medical care logo template social media.png")
+        with c2:
+            uploaded_file = st.file_uploader('Choose your .pdf file', type=["pdf"])
+            st.form_submit_button(label="🍃 Get me the data!")
+        if uploaded_file is not None:
+            try:
+                entity_rows = []
+                # The context manager closes the document even if a page fails.
+                with fitz.open(stream=uploaded_file.read(), filetype="pdf") as document:
+                    for page_number in range(document.page_count):
+                        page_text = document.get_page_text(page_number)
+                        # Pages without extractable text cannot yield entities.
+                        if not page_text.strip():
+                            continue
+                        out = ner_prediction(corpus=page_text, compute='gpu')
+                        output = out.drop_duplicates(subset=["value"], keep='first')
+                        current_page = document[page_number]
+                        for _, row in output.iterrows():
+                            text = row['value']
+                            if row["score"] > SCORE_THRESHOLD and len(text) >= MIN_ENTITY_LENGTH:
+                                # Page numbers are reported 1-based.
+                                entity_rows.append(
+                                    (row['entity_group'], row['value'], row['score'], page_number + 1)
+                                )
+                                # Outline every occurrence of the entity text on
+                                # this page and label it with the entity group.
+                                for inst in current_page.search_for(text):
+                                    annot = current_page.add_rect_annot(inst)
+                                    info = annot.info
+                                    info["title"] = row['entity_group']
+                                    annot.set_info(info)
+                                    annot.update()
+                    # Serialise the annotated document while it is still open.
+                    annotated_pdf = document.tobytes()
+                with c2:
+                    if entity_rows:
+                        final_df = pd.DataFrame(
+                            entity_rows,
+                            columns=['Entity Group', 'Value', 'Score', 'Page'],
+                        )
+                        final_df['Pdf File'] = uploaded_file.name
+                        st.dataframe(final_df)
+                        download_button(final_df, "key-value-pdf.csv", "📥 Download (.csv)")
+                    else:
+                        # Show the message in the page rather than only on stdout.
+                        st.warning("No Entities Extracted!!!")
+                    # Embed the annotated PDF as a base64 data URI in an iframe.
+                    base64_pdf = base64.b64encode(annotated_pdf).decode('utf-8')
+                    pdf_display = f'<iframe src="data:application/pdf;base64,{base64_pdf}" width="800" height="800" type="application/pdf"></iframe>'
+                    st.markdown(pdf_display, unsafe_allow_html=True)
+            except Exception as e:
+                print("Error occured: {}".format(e))
+                # Bare raise keeps the original traceback intact.
+                raise
+# Start the HydraApp instance created above; the pages registered through
+# @app.addapp (my_home and app2) are served by it.
+app.run()