Spaces:
Sleeping
Sleeping
firqaaa
committed on
Commit
·
e04865c
1
Parent(s):
dea570f
add app.py
Browse files
app.py
ADDED
|
@@ -0,0 +1,239 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import time
|
| 3 |
+
import json
|
| 4 |
+
import glob
|
| 5 |
+
import shutil
|
| 6 |
+
import textwrap
|
| 7 |
+
import pdf2image
|
| 8 |
+
import pytesseract
|
| 9 |
+
|
| 10 |
+
import nltk
|
| 11 |
+
import openai
|
| 12 |
+
import pandas as pd
|
| 13 |
+
from langchain.chat_models import ChatOpenAI
|
| 14 |
+
from langchain.prompts import ChatPromptTemplate
|
| 15 |
+
from langchain.chains import create_extraction_chain
|
| 16 |
+
|
| 17 |
+
import re
|
| 18 |
+
from Bio import Entrez
|
| 19 |
+
from tqdm.auto import tqdm
|
| 20 |
+
|
| 21 |
+
import streamlit as st
|
| 22 |
+
from ast import literal_eval
|
| 23 |
+
|
| 24 |
+
# One-time download of the NLTK sentence-tokenizer model (used by sent_tokenize).
nltk.download('punkt')

# SECURITY FIX: OpenAI and Entrez API keys were previously hard-coded here and
# committed to the repository. Those keys are compromised and must be revoked.
# Read all credentials from the environment instead.
os.environ['OPENAI_API_KEY'] = os.environ.get('OPENAI_API_KEY', '')
Entrez.email = os.environ.get('ENTREZ_EMAIL', 'firqaaa@gmail.com')
Entrez.api_key = os.environ.get('ENTREZ_API_KEY', '')

# Slice bound applied to the chunk list below: -1 drops the final chunk
# (typically references/acknowledgements OCR noise).
fold = -1
# chunk_size = 8000
|
| 33 |
+
@st.cache_data
def convert_df(df):
    """Serialize *df* to UTF-8 encoded CSV bytes for st.download_button.

    BUG FIX: the original code called ``st.cache_data()`` as a bare statement
    on the line above the def, which cached nothing; it must be applied as a
    decorator so repeated downloads of the same frame reuse the cached bytes.
    """
    return df.to_csv().encode('utf-8')
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def replace_quotes(text):
    """Rewrite double quotes to single quotes inside double-quoted spans.

    NOTE(review): the matched span `[^"]*` can never contain a double quote,
    so the replacement is effectively a no-op; behavior is preserved as-is
    pending clarification of the original intent.
    """
    span = r'(?<=")[^"]*(?=")'

    def _single_quoted(m):
        return m.group(0).replace('"', "'")

    return re.sub(span, _single_quoted, text)
|
| 41 |
+
|
| 42 |
+
|
| 43 |
+
def clean_text(text):
    """Remove section titles and figure descriptions from text.

    Keeps only rows with more than three space-separated tokens that do not
    start with "(a)" or "Figure", then strips all punctuation (any character
    that is not a word character or whitespace).
    """
    kept = []
    for row in text.split("\n"):
        if len(row.split(" ")) <= 3:
            continue
        if row.startswith("(a)") or row.startswith("Figure"):
            continue
        kept.append(row)
    return re.sub(r'[^\w\s]', '', "\n".join(kept))
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def truncate_text(text, max_tokens):
    """Return the first textwrap chunk of *text*, or "" when there is none.

    Despite the name, *max_tokens* is a character width handed to textwrap,
    not a token count; wrapping never splits mid-word.
    """
    pieces = textwrap.wrap(text, width=max_tokens)
    return pieces[0] if pieces else ""
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
def split_text(text, chunk_size):
    """Split *text* into consecutive chunks of at most *chunk_size* characters.

    The final chunk may be shorter; an empty string yields an empty list.
    """
    return [text[pos:pos + chunk_size] for pos in range(0, len(text), chunk_size)]
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
def extract_gene_name(text):
    """Pull the first <NAME>...</NAME> value out of raw Entrez XML bytes.

    Returns the tag's content as a string, or None when no tag is present.
    """
    decoded = text.decode("utf-8")
    # The payload arrives with literal escape sequences embedded; strip them.
    decoded = decoded.replace("\\n", "").replace("\\t", "").replace("\\'", "'")
    match = re.search(r"<NAME>(.*?)</NAME>", decoded)
    return match.group(1) if match else None
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
def get_geneName(rsid):
    """Resolve an rsID to its gene name via the NCBI Entrez SNP database.

    Performs a network call through Bio.Entrez.efetch and parses the raw XML
    bytes with extract_gene_name; returns None when no gene name is found.
    """
    payload = Entrez.efetch(db="snp", id=rsid, retmode='xml').read()
    return extract_gene_name(payload)
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
def split_text_into_sentences(text, num_sentences):
    """Tokenize *text* into sentences grouped *num_sentences* at a time.

    Returns a list of lists; the final group may hold fewer sentences.
    """
    sentences = nltk.sent_tokenize(text)
    step = num_sentences
    return [sentences[start:start + step] for start in range(0, len(sentences), step)]
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
def flatten_list(nested_list):
    """Recursively flatten arbitrarily nested lists into a single flat list."""
    flat = []
    for element in nested_list:
        if isinstance(element, list):
            flat += flatten_list(element)
        else:
            flat.append(element)
    return flat
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
def move_file(source_path, destination_path):
    """Best-effort move of *source_path* into the *destination_path* folder.

    The destination directory is created when missing. Failures are printed
    rather than raised so one bad file does not halt the pipeline.
    """
    # Make sure the destination folder exists before moving the file.
    os.makedirs(destination_path, exist_ok=True)
    try:
        shutil.move(source_path, destination_path)
    except Exception as err:
        print(f"Error: {err}")
    else:
        print(f"File moved successfully from '{source_path}' to '{destination_path}'.")
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
# Chat model used for structured extraction; temperature 0 for determinism.
llm = ChatOpenAI(temperature=0, model="gpt-3.5-turbo-16k-0613")

# Every extraction field is a free-form string; only the title is mandatory.
schema = {
    "properties": {
        field: {"type": "string"}
        for field in (
            "title",
            "author",
            "publisher",
            "publication_year",
            "gene_codes",
            "population_race",
            "phenotypes_or_diseases",
            "sample_size",
            "SNPs",
            "Study_Methodology",
            "Study_Level",
            "Outcome/Recommendation/Conclusion",
        )
    },
    "required": ["title"],
}

chain = create_extraction_chain(schema, llm)
# Collects paths of files that failed processing (populated elsewhere).
err_path = []
|
| 139 |
+
|
| 140 |
+
# Page title
st.set_page_config(page_title="PubMed Paper Extraction")
st.title("PubMed Paper Extraction")

uploaded_file = st.file_uploader('Upload Paper Here : ', type="pdf")
if uploaded_file:
    st.write(f"{uploaded_file.name} successfully uploaded")

    # Characters per extraction chunk (the UI label says tokens; split_text
    # slices by characters).
    chunk_size = st.selectbox(
        'Tokens amounts per process :',
        (16000, 12000, 10000, 8000, 5000)
    )

    parseButton = st.button("Extract Text")

    if parseButton:
        with st.spinner(text='Extraction in progress ...'):
            try:
                # OCR every page except the last (usually references).
                images = pdf2image.convert_from_bytes(uploaded_file.getvalue())
                extracted_text = ""
                for image in images[:-1]:
                    text = pytesseract.image_to_string(image)
                    text = clean_text(text)
                    extracted_text += text + " "

                text = replace_quotes(extracted_text)
                # fold == -1 also drops the trailing chunk.
                text_chunk = split_text(text, chunk_size)[:fold]

                chunkdf = []
                for i, chunk in enumerate(text_chunk):
                    # Round-trip through json/literal_eval to coerce the chain
                    # output into a plain dict with double-quoted keys.
                    row = literal_eval(str(json.dumps(chain.run(chunk)[0])).replace("\'", "\""))
                    chunkdf.append(pd.DataFrame(row, index=[0]).fillna(''))

                concat = pd.concat(chunkdf, axis=0).reset_index().drop('index', axis=1).fillna('')
                # Bibliographic fields are taken from the first chunk only.
                for col in ('title', 'author', 'publisher', 'publication_year'):
                    concat[col] = concat[col][0]
                # Keep only well-formed rsIDs.
                concat['SNPs'] = concat['SNPs'].apply(lambda x: x if x.startswith('rs') else '')
                # Blank out the model's various "unknown" placeholders.
                for col in list(concat.columns):
                    concat[col] = concat[col].apply(lambda x: x if x not in ['N/A', 'not mentioned', 'Not mentioned', 'Unknown'] else '')

                L = []
                for i in range(len(concat)):
                    # NOTE: ''.split(',') == [''] so len(...) >= 1 is always
                    # true; the branch choice reduces to "SNP column empty?".
                    if (len(concat['gene_codes'][i].split(',')) >= 1) and concat['SNPs'][i] == '':
                        for g in concat['gene_codes'][i].split(','):
                            L.append({
                                'Title' : concat['title'][0],
                                'Author' : concat['author'][0],
                                'Publisher' : concat['publisher'][0],
                                'Publication Year' : concat['publication_year'][0],
                                'Genes' : g.upper(),
                                'Population' : concat['population_race'][i],
                                'Phenotype' : concat['phenotypes_or_diseases'][i].title(),
                                'Sample Size' : concat['sample_size'][i],
                                'SNPs' : concat['SNPs'][i],
                                'Study Methodology' : concat['Study_Methodology'][i].title(),
                                'Study Level' : concat['Study_Level'][i].title(),
                                'Outcomes' : concat['Outcome/Recommendation/Conclusion'][i].capitalize()
                            })
                    elif (len(concat['SNPs'][i].split(',')) >= 1):
                        for s in concat['SNPs'][i].split(','):
                            try:
                                L.append({
                                    'Title' : concat['title'][0],
                                    'Author' : concat['author'][0],
                                    'Publisher' : concat['publisher'][0],
                                    'Publication Year' : concat['publication_year'][0],
                                    'Genes' : get_geneName(s.strip()).upper(),
                                    # NOTE(review): row 0 here, unlike the gene
                                    # branch which uses row i — confirm intent.
                                    'Population' : concat['population_race'][0],
                                    'Phenotype' : concat['phenotypes_or_diseases'][i].title(),
                                    'Sample Size' : concat['sample_size'][i],
                                    'SNPs' : s,
                                    'Study Methodology' : concat['Study_Methodology'][i],
                                    'Study Level' : concat['Study_Level'][i].title(),
                                    'Outcomes' : concat['Outcome/Recommendation/Conclusion'][i].capitalize()
                                })
                            except Exception as e:
                                # get_geneName may return None (upper() fails)
                                # or the Entrez call may error; skip the row.
                                print(e)

                result = pd.DataFrame(L)
                st.dataframe(result)
                csv = convert_df(result)

                st.download_button(
                    label="Save Result",
                    data=csv,
                    # BUG FIX: append '.csv' so the saved file matches its
                    # text/csv MIME type (original produced no extension).
                    file_name=str(uploaded_file.name).replace('.pdf', '') + '.csv',
                    mime='text/csv'
                )

            except Exception as e:
                # BUG FIX: the original `e == json.JSONDecodeError` compared an
                # exception INSTANCE to a CLASS and was always False, making
                # the first message unreachable; use isinstance instead.
                if isinstance(e, json.JSONDecodeError):
                    st.write("Sorry, we are experiencing difficulties in extracting the information. Please try again with different context length.")
                else:
                    st.write("Sorry, we are experiencing difficulties in extracting the information. Please ensure that you input an uncorrupted file.")
                # move_file(pdf, "./unprocessed")
|