Spaces:

Mohamed-BC
/

DocuBot

Sleeping

App Files Files Community

Mohamed-BC committed on May 3, 2024

Commit

20b1f3c

1 Parent(s): c766880

docubot-v1.5.3

Browse files

Files changed (6) hide show

.streamlit/config.toml +3 -0
__pycache__/utilities.cpython-310.pyc +0 -0
app.py +46 -0
logo.png +0 -0
requirements.txt +3 -0
utilities.py +37 -0

.streamlit/config.toml ADDED Viewed

	@@ -0,0 +1,3 @@

+# Streamlit server settings for this app.
+[server]
+# Disables Streamlit's XSRF (cross-site request forgery) protection.
+# NOTE(review): presumably a workaround for upload failures when the app is
+# served from a hosted/embedded page - confirm it is still required.
+enableXsrfProtection = false
+# Disables Streamlit's CORS protection.
+# NOTE(review): confirm this is intended for the deployment target.
+enableCORS = false

__pycache__/utilities.cpython-310.pyc ADDED Viewed

Binary file (1.39 kB). View file

app.py ADDED Viewed

	@@ -0,0 +1,46 @@

+import streamlit as st
+import base64
+import os
+import tempfile
+import time
+import utilities as util
+def main():
+    st.set_page_config(page_title="PDF Viewer", layout="wide", page_icon='./logo.png')
+    # st.image('./logo.png', width=60)
+    st.title(":blue[DocuBot]",anchor=False)
+    st.write("View and chat with your PDF")
+    if 'messages' not in st.session_state:
+        st.session_state.messages = [{'role': 'assistant', "content": "Hello! Upload a document and let's get started."}]
+    state = True
+    # with st.sidebar:
+    uploaded_file = st.sidebar.file_uploader("Upload your PDF File", type="pdf")
+    if uploaded_file:
+        state = False
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            file_path = os.path.join(tmp_dir, uploaded_file.name)
+            with open(file_path, "wb") as f:
+                f.write(uploaded_file.getvalue())  # Write the PDF content
+            pdf_text = util.get_pdf_text(file_path)
+            pdf_frame = util.display_pdf(file_path)
+            st.sidebar.markdown(pdf_frame, unsafe_allow_html=True)
+    user_prompt = st.chat_input("What do you wanna know about the document?", disabled=state)
+    if st.sidebar.button(label="summarize"):
+        st.session_state.messages.append({'role': 'user', "content": "Summarize the document"})
+        with st.spinner("..."):
+            summary = util.summarize(pdf_text, max_length=200)
+            st.session_state.messages.append({'role': 'assistant', "content": "Summary of "+uploaded_file.name+": <br>"+summary})
+    if user_prompt:
+        st.session_state.messages.append({'role': 'user', "content": user_prompt})
+        response = "You asked: "+user_prompt
+        with st.spinner("..."):
+            time.sleep(2)
+            st.session_state.messages.append({'role': 'assistant', "content": response})
+    for message in st.session_state.messages:
+        with st.chat_message(message['role']):
+            st.markdown(message['content'], unsafe_allow_html=True)
+if __name__ == "__main__":
+    main()

logo.png ADDED Viewed

requirements.txt ADDED Viewed

	@@ -0,0 +1,3 @@

+streamlit
+transformers
+pdfplumber

utilities.py ADDED Viewed

	@@ -0,0 +1,37 @@

+import base64
+import pdfplumber
+from transformers import pipeline
+# Function to extract text from a PDF and summarize it
+def get_pdf_text(pdf_file):
+    text = ""
+    # Open the PDF file and extract text
+    with pdfplumber.open(pdf_file) as pdf:
+        for page in pdf.pages:
+            text += page.extract_text()  # Extract text from each page
+    return text
+def display_pdf(file_path):
+  # Read the PDF file
+  with open(file_path, "rb") as f:
+      data = f.read()
+  # Convert PDF content to base64
+  base64_pdf = base64.b64encode(data).decode("utf-8")
+  # Create an iframe to display the PDF
+  pdf_display = f'<iframe src="data:application/pdf;base64,{base64_pdf}" width="100%" height="600px"></iframe>'
+  return pdf_display
+def split_text(text, max_length):
+  """Split text into smaller chunks based on a specified length."""
+  words = text.split()
+  chunks = [' '.join(words[i:i+max_length]) for i in range(0, len(words), max_length)]
+  return chunks
+def summarize(text,max_length):
+  summarizer = pipeline(task="summarization", model='facebook/bart-large-cnn')
+  text_chunks = split_text(text, max_length=max_length)  # Split into chunks of 500 words
+  # Summarize each chunk and combine the results
+  summaries = [summarizer(chunk)[0]['summary_text'] for chunk in text_chunks]
+  # Combine the summaries into a final summary
+  final_summary = ' '.join(summaries)
+  return final_summary
+  # return text_chunks[0]