Spaces:
Sleeping
Sleeping
Commit
·
20b1f3c
1
Parent(s):
c766880
docubot-v1.5.3
Browse files- .streamlit/config.toml +3 -0
- __pycache__/utilities.cpython-310.pyc +0 -0
- app.py +46 -0
- logo.png +0 -0
- requirements.txt +3 -0
- utilities.py +37 -0
.streamlit/config.toml
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[server]
|
| 2 |
+
enableXsrfProtection = false
|
| 3 |
+
enableCORS = false
|
__pycache__/utilities.cpython-310.pyc
ADDED
|
Binary file (1.39 kB). View file
|
|
|
app.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import base64
|
| 3 |
+
import os
|
| 4 |
+
import tempfile
|
| 5 |
+
import time
|
| 6 |
+
import utilities as util
|
| 7 |
+
|
| 8 |
+
def main():
    """Render the DocuBot page: upload a PDF, preview it, and chat about it.

    The chat history lives in ``st.session_state.messages`` as a list of
    ``{'role': ..., 'content': ...}`` dicts. The chat input and the
    "summarize" button are disabled until a PDF has been uploaded, because
    both depend on the extracted document text.
    """
    st.set_page_config(page_title="PDF Viewer", layout="wide", page_icon='./logo.png')
    st.title(":blue[DocuBot]", anchor=False)
    st.write("View and chat with your PDF")

    if 'messages' not in st.session_state:
        st.session_state.messages = [{'role': 'assistant', "content": "Hello! Upload a document and let's get started."}]

    uploaded_file = st.sidebar.file_uploader("Upload your PDF File", type="pdf")
    has_document = uploaded_file is not None

    # Always bound, so the summarize branch can never hit an undefined name.
    pdf_text = ""
    if has_document:
        with tempfile.TemporaryDirectory() as tmp_dir:
            # Keep only the final path component so a crafted upload name
            # cannot point outside the temporary directory.
            safe_name = os.path.basename(uploaded_file.name) or "upload.pdf"
            file_path = os.path.join(tmp_dir, safe_name)
            with open(file_path, "wb") as f:
                f.write(uploaded_file.getvalue())  # Write the PDF content
            pdf_text = util.get_pdf_text(file_path)
            pdf_frame = util.display_pdf(file_path)
        # The iframe markup is built by util.display_pdf from base64 data,
        # so raw HTML rendering is needed (and limited to) this call.
        st.sidebar.markdown(pdf_frame, unsafe_allow_html=True)

    user_prompt = st.chat_input("What do you wanna know about the document?", disabled=not has_document)

    if st.sidebar.button(label="summarize", disabled=not has_document):
        st.session_state.messages.append({'role': 'user', "content": "Summarize the document"})
        with st.spinner("..."):
            summary = util.summarize(pdf_text, max_length=200)
        # A Markdown paragraph break replaces the former "<br>" because chat
        # messages are no longer rendered as raw HTML.
        st.session_state.messages.append({'role': 'assistant', "content": "Summary of " + uploaded_file.name + ":\n\n" + summary})

    if user_prompt:
        st.session_state.messages.append({'role': 'user', "content": user_prompt})
        # NOTE(review): placeholder answer - the prompt is echoed back after
        # a fixed delay; no model is consulted here.
        response = "You asked: " + user_prompt
        with st.spinner("..."):
            time.sleep(2)
        st.session_state.messages.append({'role': 'assistant', "content": response})

    for message in st.session_state.messages:
        with st.chat_message(message['role']):
            # Messages contain user input and PDF-derived text, so they are
            # rendered as plain Markdown rather than trusted HTML.
            st.markdown(message['content'])


if __name__ == "__main__":
    main()
|
logo.png
ADDED
|
requirements.txt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
streamlit
|
| 2 |
+
transformers
|
| 3 |
+
pdfplumber
|
utilities.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import base64
|
| 2 |
+
import pdfplumber
|
| 3 |
+
from transformers import pipeline
|
| 4 |
+
# Function to extract the text of every page in a PDF
|
| 5 |
+
def get_pdf_text(pdf_file):
    """Return the text of every page in a PDF, one page per line group.

    Args:
        pdf_file: Path (or other object accepted by ``pdfplumber.open``)
            identifying the PDF to read.

    Returns:
        The extracted page texts joined with newlines. Pages that yield no
        text contribute an empty string.
    """
    with pdfplumber.open(pdf_file) as pdf:
        # ``or ""`` guards against extract_text() returning a falsy value
        # (e.g. None) for a page with no text layer, which would otherwise
        # make string concatenation raise TypeError.
        page_texts = [page.extract_text() or "" for page in pdf.pages]
    # Joining with a newline keeps the last word of one page from fusing
    # with the first word of the next.
    return "\n".join(page_texts)
|
| 12 |
+
|
| 13 |
+
def display_pdf(file_path, width="100%", height="600px"):
    """Build an HTML ``<iframe>`` that embeds a PDF as a base64 data URI.

    Args:
        file_path: Path of the PDF file to embed.
        width: Value for the iframe ``width`` attribute.
        height: Value for the iframe ``height`` attribute.

    Returns:
        The iframe markup as a string. With the default ``width`` and
        ``height`` the output is identical to the previous fixed-size form.
    """
    with open(file_path, "rb") as f:
        data = f.read()
    # Base64 output is plain ASCII, so it can be inlined in the src attribute.
    base64_pdf = base64.b64encode(data).decode("utf-8")
    return (
        f'<iframe src="data:application/pdf;base64,{base64_pdf}" '
        f'width="{width}" height="{height}"></iframe>'
    )
|
| 22 |
+
|
| 23 |
+
def split_text(text, max_length):
    """Split text into chunks of at most ``max_length`` whitespace-separated words.

    Args:
        text: The text to split. Runs of whitespace are collapsed because
            words are re-joined with single spaces.
        max_length: Maximum number of words per chunk; must be at least 1.

    Returns:
        A list of chunk strings in original order; empty when ``text``
        contains no words.

    Raises:
        ValueError: If ``max_length`` is less than 1. A negative value would
            otherwise produce an empty range and silently drop all text.
    """
    if max_length < 1:
        raise ValueError("max_length must be a positive number of words")
    words = text.split()
    return [
        ' '.join(words[start:start + max_length])
        for start in range(0, len(words), max_length)
    ]
|
| 28 |
+
|
| 29 |
+
# Lazily created summarization pipeline, shared by all summarize() calls.
_SUMMARIZER = None


def _get_summarizer():
    """Return the shared summarization pipeline, creating it on first use."""
    global _SUMMARIZER
    if _SUMMARIZER is None:
        _SUMMARIZER = pipeline(task="summarization", model='facebook/bart-large-cnn')
    return _SUMMARIZER


def summarize(text, max_length):
    """Summarize ``text`` chunk by chunk and join the partial summaries.

    Args:
        text: The text to summarize.
        max_length: Number of words per chunk handed to the summarizer
            (this is the chunk size, not a limit on the summary length).

    Returns:
        The per-chunk summaries joined with single spaces, or an empty
        string when ``text`` contains no words.
    """
    # Nothing to summarize: skip creating the pipeline altogether.
    if not text or not text.strip():
        return ""
    # Reuse one pipeline instead of constructing a new one on every call.
    summarizer = _get_summarizer()
    text_chunks = split_text(text, max_length=max_length)
    # Summarize each chunk and combine the results
    summaries = [summarizer(chunk)[0]['summary_text'] for chunk in text_chunks]
    return ' '.join(summaries)
|