Spaces:

datascientist22
/

rag-pdf-chatbot

Build error

App Files Files Community

datascientist22 committed on Aug 25, 2024

Commit

2fb99d1

verified ·

1 Parent(s): 47b3b73

Update app.py

Browse files

Files changed (1) hide show

app.py +51 -34

app.py CHANGED Viewed

@@ -4,10 +4,14 @@ import PyPDF2
 import torch
 from transformers import AutoTokenizer, AutoModel
 from langchain.text_splitter import RecursiveCharacterTextSplitter
-# Set up the title
-st.title("Engr. Hamesh Raj's PDF Chunking & Embedding Viewer")
-st.markdown("[LinkedIn](https://www.linkedin.com/in/datascientisthameshraj/)")
 # Load the pre-trained model and tokenizer
 @st.cache_resource
@@ -40,38 +44,51 @@ def get_embeddings(texts):
     embeddings = outputs.last_hidden_state.mean(dim=1)
     return embeddings
-# Sidebar for file upload
-st.sidebar.title("Upload PDF")
-uploaded_files = st.sidebar.file_uploader("Choose a PDF file(s)", type="pdf", accept_multiple_files=True)
-if uploaded_files:
-    pdf_chunks_embeddings = {}
-    for uploaded_file in uploaded_files:
-        pdf_name = uploaded_file.name
-        st.write(f"### Processing `{pdf_name}`...")
-        # Extract text from the uploaded PDF
-        text = extract_text_from_pdf(uploaded_file)
-        # Chunkize the extracted text
-        chunks = chunkize_text(text)
-        # Generate embeddings for each chunk
-        embeddings = get_embeddings(chunks)
-        # Store the chunks and embeddings
-        pdf_chunks_embeddings[pdf_name] = {
-            'chunks': chunks,
-            'embeddings': embeddings
-        }
-        # Display chunks and embeddings
-        st.write(f"#### Chunks and Embeddings for `{pdf_name}`")
-        for i, (chunk, embedding) in enumerate(zip(chunks, embeddings)):
-            st.write(f"**Chunk {i+1}:**\n{chunk}")
-            st.write(f"**Embedding {i+1}:**\n{embedding}\n{'-'*50}")
-    st.success("Processing completed!")
 else:
-    st.write("Upload a PDF file to get started.")

 import torch
 from transformers import AutoTokenizer, AutoModel
 from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.chains import ConversationChain
+from langchain.llms import OpenAI
+from langchain.embeddings import HuggingFaceEmbeddings
+# Set up the title and LinkedIn link
+st.markdown("### Engr. Hamesh Raj")
+st.markdown("[Engr. Hamesh Raj](https://www.linkedin.com/in/datascientisthameshraj/)")
+st.title("PDF Query Chatbot")
 # Load the pre-trained model and tokenizer
 @st.cache_resource
     embeddings = outputs.last_hidden_state.mean(dim=1)
     return embeddings
+# Sidebar for file upload and link input
+st.sidebar.title("Load PDF")
+pdf_url = st.sidebar.text_input("Paste PDF link here:")
+uploaded_files = st.sidebar.file_uploader("Or upload PDF file(s)", type="pdf", accept_multiple_files=True)
+submit_button = st.sidebar.button("Submit")
+# Initialize an empty dictionary for storing processed PDFs
+pdf_chunks_embeddings = {}
+if submit_button:
+    if pdf_url:
+        try:
+            response = requests.get(pdf_url)
+            response.raise_for_status()
+            pdf_file = BytesIO(response.content)
+            st.write(f"Processing document from URL: {pdf_url}")
+            text = extract_text_from_pdf(pdf_file)
+            chunks = chunkize_text(text)
+            embeddings = get_embeddings(chunks)
+            pdf_chunks_embeddings[pdf_url] = {'chunks': chunks, 'embeddings': embeddings}
+            st.success("PDF processed successfully!")
+        except requests.exceptions.RequestException as e:
+            st.error(f"Error loading PDF from URL: {e}")
+    if uploaded_files:
+        for uploaded_file in uploaded_files:
+            pdf_name = uploaded_file.name
+            st.write(f"Processing `{pdf_name}`...")
+            text = extract_text_from_pdf(uploaded_file)
+            chunks = chunkize_text(text)
+            embeddings = get_embeddings(chunks)
+            pdf_chunks_embeddings[pdf_name] = {'chunks': chunks, 'embeddings': embeddings}
+        st.success("PDF(s) processed successfully!")
+# Chatbot section for querying the PDF content
+st.write("### PDF Query Chatbot")
+if pdf_chunks_embeddings:
+    chatbot = ConversationChain(llm=OpenAI(), embedding_model=HuggingFaceEmbeddings())
+    query = st.text_input("Enter your query here:")
+    if query:
+        # Generate a response from the chatbot based on the processed PDFs
+        for pdf_name, data in pdf_chunks_embeddings.items():
+            chatbot.add_documents(data['chunks'])
+            response = chatbot.run(query)
+            st.write(f"**Response from `{pdf_name}`:**\n{response}\n{'-'*50}")
 else:
+    st.write("No PDFs processed yet. Please submit a PDF to get started.")