Spaces:

NBayer
/

Streamlit_app_paper

Runtime error

App Files Files Community

Streamlit_app_paper / app.py

NBayer

Update app.py

1fb5180 almost 3 years ago

raw

history blame contribute delete

4.09 kB

	import os
	import tempfile

	import PyPDF2
	import requests
	import streamlit as st
	from streamlit.components.v1 import html
	from transformers import pipeline

	def get_pdf_text(pdf_path):
	    """Return the text of every page of a PDF, joined by single spaces.

	    Args:
	        pdf_path: Path of the PDF file to read.

	    Returns:
	        The extracted text of all pages in page order, separated by " ".
	        An empty string if the document has no pages.

	    Raises:
	        OSError: If the file cannot be opened (e.g. it does not exist).
	        Errors raised by PyPDF2 while parsing propagate unchanged.
	    """
	    # The context manager closes the file even when parsing fails part-way,
	    # which a manual open()/close() pair does not guarantee.
	    with open(pdf_path, "rb") as pdf_file:
	        pdf_reader = PyPDF2.PdfReader(pdf_file)
	        # Extraction must finish while the file is still open, because the
	        # reader pulls page data from the underlying stream.
	        return " ".join(page.extract_text() for page in pdf_reader.pages)

	# Disabled: would build a summarization pipeline through `transformers`
	# instead of calling the hosted model over HTTP.
	# sum_model = pipeline("text2text-generation", model="yasminesarraj/flan-t5-small-samsum")

	# Authorization header passed to every requests.post() call in this file.
	# The bearer token is read from the Streamlit secret named "HF_AUTH"; the
	# lookup happens at import time, so the secret must be configured.
	headers = {"Authorization": f"Bearer {st.secrets['HF_AUTH']}"}

	def create_tags(payload):
	    """Request topic tags for a text from the hosted tag-generation model.

	    Args:
	        payload: JSON-serialisable request body. The app sends
	            ``{"inputs": <text>}``.

	    Returns:
	        The decoded JSON body of the response, whatever its shape. The
	        caller in this file reads ``result[0]["generated_text"]``.

	    Raises:
	        requests.exceptions.RequestException: On connection problems, or
	            when the server does not answer within the timeout.
	        ValueError: If the response body is not valid JSON.
	    """
	    API_URL_TAGS = "https://api-inference.huggingface.co/models/fabiochiu/t5-base-tag-generation"
	    # Without an explicit timeout requests waits indefinitely, so a stalled
	    # connection would hang the whole script run.
	    request_timeout_seconds = 60

	    response = requests.post(
	        API_URL_TAGS,
	        headers=headers,
	        json=payload,
	        timeout=request_timeout_seconds,
	    )
	    return response.json()

	def summarize_text(payload):
	    """Request a summary of a text from the hosted summarization model.

	    Args:
	        payload: JSON-serialisable request body. The app sends
	            ``{"inputs": "Summarize: " + <text>}``.

	    Returns:
	        The decoded JSON body of the response, whatever its shape. The
	        caller in this file reads ``result[0]["summary_text"]``.

	    Raises:
	        requests.exceptions.RequestException: On connection problems, or
	            when the server does not answer within the timeout.
	        ValueError: If the response body is not valid JSON.
	    """
	    API_URL = "https://api-inference.huggingface.co/models/yasminesarraj/flan-t5-small-samsum"
	    # Without an explicit timeout requests waits indefinitely, so a stalled
	    # connection would hang the whole script run.
	    request_timeout_seconds = 60

	    response = requests.post(
	        API_URL,
	        headers=headers,
	        json=payload,
	        timeout=request_timeout_seconds,
	    )
	    return response.json()


	# Start of the app code


	def _read_uploaded_pdf(uploaded_pdf):
	    """Return the extracted text of one uploaded PDF via a private temp file."""
	    # The uploaded file name is user-controlled, so it is never used as a
	    # path: writing it into the working directory could overwrite existing
	    # files and collide between concurrent sessions.
	    # delete=False plus closing the handle first lets get_pdf_text() reopen
	    # the path on every platform.
	    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp_file:
	        tmp_file.write(uploaded_pdf.getvalue())
	        tmp_path = tmp_file.name
	    try:
	        return get_pdf_text(tmp_path)
	    finally:
	        # Clean up even when the PDF cannot be parsed.
	        os.remove(tmp_path)


	def _first_result_field(api_call, payload, field):
	    """Return (True, api_call(payload)[0][field]), or (False, None) on failure."""
	    try:
	        return True, api_call(payload)[0][field]
	    except (
	        requests.exceptions.RequestException,  # network failure / timeout
	        ValueError,  # response body is not JSON
	        KeyError,  # JSON object (e.g. an error message) or missing field
	        IndexError,  # empty result list
	        TypeError,  # result is not indexable as expected
	    ):
	        # Best effort: the UI reports the result as unavailable instead of
	        # failing the whole page.
	        return False, None


	tab_your_paper, tab_general_topics = st.tabs(["Summarize your paper(s)", "Research topics"])

	with tab_your_paper:
	    # Empty HTML component of height 10; presumably a vertical spacer.
	    html("", height=10)

	    st.markdown("""
	    ### Simply upload one or multiple PDFs and we summarize the content for you!
	    """)

	    pdf_files = st.file_uploader(
	        "Upload your paper as a pdf",
	        type=[".pdf"],
	        accept_multiple_files=True,
	        help="You can summarize one or also multiple papers at once. The file format needs to be a pdf.",
	    )
	    if pdf_files:
	        # Text of all uploaded papers, in upload order, separated by spaces.
	        all_text_together = " ".join(_read_uploaded_pdf(pdf) for pdf in pdf_files)

	        # Tags are requested before the summary; each call fails independently.
	        tags_available, tags = _first_result_field(
	            create_tags,
	            {"inputs": all_text_together},
	            "generated_text",
	        )
	        sum_available, summary = _first_result_field(
	            summarize_text,
	            {"inputs": "Summarize: " + all_text_together},
	            "summary_text",
	        )

	        # Headings use the plural form when more than one file was uploaded.
	        paper_noun = "paper(s)" if len(pdf_files) > 1 else "paper"

	        col1, col2 = st.columns(2)
	        with col1:
	            if sum_available:
	                st.markdown(f"#### Summary of your {paper_noun}:")
	                st.write(summary)
	            else:
	                st.markdown("#### Summary currently unavailable.")

	        with col2:
	            if tags_available:
	                st.markdown(f"#### Identified topics of your {paper_noun}:")
	                st.write(tags)
	            else:
	                st.markdown("#### Topics currently unavailable")

	        with st.expander("See your total text"):
	            st.write(all_text_together)

	# "Research topics" tab: a heading and a topic picker. The selected value
	# is not read anywhere in the visible code.
	with tab_general_topics:
	    # Empty HTML component of height 10; presumably a vertical spacer.
	    html("", height=10)

	    st.header(
	        "See the status of a research topic through a summary of the most cited papers"
	    )

	    research_topics = ["Artificial Intelligence", "Sustainability", "Cooking"]
	    st.selectbox("Select a research topic", research_topics)