Spaces:

yousifalishah
/

chatWithMultiplePDF1

Sleeping

App Files Files Community

yousifalishah committed on Feb 26, 2025

Commit

df29bbf

verified ·

1 Parent(s): b456574

Update app.py

Browse files

Files changed (1) hide show

app.py +25 -21

app.py CHANGED Viewed

@@ -4,8 +4,8 @@ from dotenv import load_dotenv
 import streamlit as st
 from PyPDF2 import PdfReader
 from langchain.text_splitter import CharacterTextSplitter
-from sentence_transformers import SentenceTransformer
-from langchain.vectorstores import FAISS
 from langchain.memory import ConversationBufferMemory
 from langchain.chains import ConversationalRetrievalChain
 from groq import Groq
@@ -19,35 +19,39 @@ logging.basicConfig(
     format='%(asctime)s - %(levelname)s - %(message)s'
 )
-# Function to extract text from PDF files
 def get_pdf_text(pdf_docs):
     text = ""
     for pdf in pdf_docs:
         pdf_reader = PdfReader(pdf)
         for page in pdf_reader.pages:
-            text += page.extract_text()
     return text
-# Function to split the extracted text into chunks
 def get_text_chunks(text):
     text_splitter = CharacterTextSplitter(
         separator="\n",
         chunk_size=1000,
         chunk_overlap=200,
         length_function=len
     )
-    chunks = text_splitter.split_text(text)
-    return chunks
-# Function to create a FAISS vectorstore
 def get_vectorstore(text_chunks):
-    model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
-    embeddings = model.encode(text_chunks, convert_to_tensor=True)
-    vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embeddings)
-    return vectorstore
-# Function to set up the conversational retrieval chain
 def get_conversation_chain(vectorstore):
     try:
         client = Groq(api_key=os.getenv("GROQ_API_KEY"))
         conversation_chain = ConversationalRetrievalChain.from_llm(
@@ -60,12 +64,13 @@ def get_conversation_chain(vectorstore):
     except Exception as e:
         logging.error(f"Error creating conversation chain: {e}")
         st.error("An error occurred while setting up the conversation chain.")
-# Handle user input
 def handle_userinput(user_question):
     if st.session_state.conversation is not None:
         response = st.session_state.conversation({'question': user_question})
-        st.session_state.chat_history = response['chat_history']
         for i, message in enumerate(st.session_state.chat_history):
             if i % 2 == 0:
@@ -75,15 +80,15 @@ def handle_userinput(user_question):
     else:
         st.warning("Please process the documents first.")
-# Main function to run the Streamlit app
 def main():
     load_dotenv()
     st.set_page_config(page_title="Chat with multiple PDFs", page_icon=":books:")
     if "conversation" not in st.session_state:
         st.session_state.conversation = None
     if "chat_history" not in st.session_state:
-        st.session_state.chat_history = None
     st.header("Chat with multiple PDFs :books:")
     user_question = st.text_input("Ask a question about your documents:")
@@ -92,15 +97,14 @@ def main():
     with st.sidebar:
         st.subheader("Your documents")
-        pdf_docs = st.file_uploader(
-            "Upload your PDFs here and click on 'Process'", accept_multiple_files=True
-        )
         if st.button("Process"):
             with st.spinner("Processing..."):
                 raw_text = get_pdf_text(pdf_docs)
                 text_chunks = get_text_chunks(raw_text)
                 vectorstore = get_vectorstore(text_chunks)
-                st.session_state.conversation = get_conversation_chain(vectorstore)
 if __name__ == '__main__':
     main()

 import streamlit as st
 from PyPDF2 import PdfReader
 from langchain.text_splitter import CharacterTextSplitter
+from langchain_community.vectorstores import FAISS
+from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
 from langchain.memory import ConversationBufferMemory
 from langchain.chains import ConversationalRetrievalChain
 from groq import Groq
     format='%(asctime)s - %(levelname)s - %(message)s'
 )
 def get_pdf_text(pdf_docs):
+    """Extract text from uploaded PDF files."""
     text = ""
     for pdf in pdf_docs:
         pdf_reader = PdfReader(pdf)
         for page in pdf_reader.pages:
+            text += page.extract_text() or ""
     return text
 def get_text_chunks(text):
+    """Split the extracted text into manageable chunks."""
     text_splitter = CharacterTextSplitter(
         separator="\n",
         chunk_size=1000,
         chunk_overlap=200,
         length_function=len
     )
+    return text_splitter.split_text(text)
 def get_vectorstore(text_chunks):
+    """Create a FAISS vectorstore from text chunks."""
+    try:
+        embedding_function = SentenceTransformerEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
+        vectorstore = FAISS.from_texts(texts=text_chunks, embedding=embedding_function)
+        logging.info("Vectorstore created successfully.")
+        return vectorstore
+    except Exception as e:
+        logging.error(f"Error creating vectorstore: {e}")
+        st.error("An error occurred while creating the vectorstore.")
+        return None
 def get_conversation_chain(vectorstore):
+    """Set up the conversational retrieval chain."""
     try:
         client = Groq(api_key=os.getenv("GROQ_API_KEY"))
         conversation_chain = ConversationalRetrievalChain.from_llm(
     except Exception as e:
         logging.error(f"Error creating conversation chain: {e}")
         st.error("An error occurred while setting up the conversation chain.")
+        return None
 def handle_userinput(user_question):
+    """Handle user input and generate a response."""
     if st.session_state.conversation is not None:
         response = st.session_state.conversation({'question': user_question})
+        st.session_state.chat_history = response.get('chat_history', [])
         for i, message in enumerate(st.session_state.chat_history):
             if i % 2 == 0:
     else:
         st.warning("Please process the documents first.")
 def main():
+    """Run the Streamlit app."""
     load_dotenv()
     st.set_page_config(page_title="Chat with multiple PDFs", page_icon=":books:")
     if "conversation" not in st.session_state:
         st.session_state.conversation = None
     if "chat_history" not in st.session_state:
+        st.session_state.chat_history = []
     st.header("Chat with multiple PDFs :books:")
     user_question = st.text_input("Ask a question about your documents:")
     with st.sidebar:
         st.subheader("Your documents")
+        pdf_docs = st.file_uploader("Upload your PDFs here and click on 'Process'", accept_multiple_files=True)
         if st.button("Process"):
             with st.spinner("Processing..."):
                 raw_text = get_pdf_text(pdf_docs)
                 text_chunks = get_text_chunks(raw_text)
                 vectorstore = get_vectorstore(text_chunks)
+                if vectorstore:
+                    st.session_state.conversation = get_conversation_chain(vectorstore)
 if __name__ == '__main__':
     main()