Spaces:

Arxived
/

quick-spin

Sleeping

App Files Community

DrishtiSharma committed on Dec 20, 2024

Commit

0b3783b

verified ·

1 Parent(s): f1054fc

Update app.py

Browse files

Files changed (1) hide show

app.py +64 -28

app.py CHANGED Viewed

@@ -65,13 +65,11 @@ def load_docs(document_path):
         text_splitter = NLTKTextSplitter(chunk_size=1000)
         split_docs = text_splitter.split_documents(documents)
-        # Filter metadata to only include str, int, float, or bool
-        for doc in split_docs:
-            if hasattr(doc, "metadata") and isinstance(doc.metadata, dict):
-                doc.metadata = {
-                    k: v for k, v in doc.metadata.items()
-                    if isinstance(v, (str, int, float, bool))
-                }
         return split_docs
     except Exception as e:
         st.error(f"Failed to load and process PDF: {e}")
@@ -86,32 +84,68 @@ def already_indexed(vectordb, file_name):
 def load_chain(file_name=None):
     loaded_patent = st.session_state.get("LOADED_PATENT")
     vectordb = Chroma(
         persist_directory=PERSISTED_DIRECTORY,
         embedding_function=HuggingFaceEmbeddings(),
     )
     if loaded_patent == file_name or already_indexed(vectordb, file_name):
         st.write("✅ Already indexed.")
     else:
         vectordb.delete_collection()
         docs = load_docs(file_name)
-        st.write("🔍 Number of Documents: ", len(docs))
         vectordb = Chroma.from_documents(
             docs, HuggingFaceEmbeddings(), persist_directory=PERSISTED_DIRECTORY
         )
         vectordb.persist()
         st.session_state["LOADED_PATENT"] = file_name
     memory = ConversationBufferMemory(
         memory_key="chat_history",
         return_messages=True,
         input_key="question",
         output_key="answer",
     )
     return ConversationalRetrievalChain.from_llm(
         OpenAI(temperature=0, openai_api_key=OPENAI_API_KEY),
-        vectordb.as_retriever(search_kwargs={"k": 3}),
         return_source_documents=False,
         memory=memory,
     )
@@ -160,14 +194,9 @@ if __name__ == "__main__":
     )
     # Initialize session state
-    if "LOADED_PATENT" not in st.session_state:
-        st.session_state.LOADED_PATENT = None
-    if "pdf_preview" not in st.session_state:
-        st.session_state.pdf_preview = None
-    if "loaded_pdf_path" not in st.session_state:
-        st.session_state.loaded_pdf_path = None
-    if "chain" not in st.session_state:
-        st.session_state.chain = None
     # Button to load and process patent
     if st.button("Load and Process Patent"):
@@ -187,8 +216,12 @@ if __name__ == "__main__":
         pdf_path = os.path.join(tempfile.gettempdir(), f"{patent_number}.pdf")
         if not os.path.isfile(pdf_path):
             st.write("📥 Downloading patent file...")
-            pdf_path = download_pdf(patent_number)
-            st.write(f"✅ File downloaded: {pdf_path}")
         else:
             st.write("✅ File already downloaded.")
@@ -204,20 +237,22 @@ if __name__ == "__main__":
         # Load the document into the system
         st.write("🔄 Loading document into the system...")
-        st.session_state.chain = load_chain(pdf_path)
-        st.session_state.LOADED_PATENT = patent_number
-        st.session_state.loaded_pdf_path = pdf_path
-        # Initialize messages AFTER processing
-        st.session_state.messages = [{"role": "assistant", "content": "Hello! How can I assist you with this patent?"}]
-        st.success("🚀 Document successfully loaded! You can now start asking questions.")
     # Display the PDF preview if available
     if st.session_state.pdf_preview:
         st.image(st.session_state.pdf_preview, caption="First Page Preview", use_container_width=True)
     # Display previous chat messages
-    if "messages" in st.session_state:
         for message in st.session_state.messages:
             with st.chat_message(message["role"]):
                 st.markdown(message["content"])
@@ -237,8 +272,9 @@ if __name__ == "__main__":
                 with st.spinner("Generating response..."):
                     try:
                         assistant_response = st.session_state.chain({"question": user_input})
-                        full_response = assistant_response.get("answer", "I couldn't process that question.")
                     except Exception as e:
                         full_response = f"An error occurred: {e}"

         text_splitter = NLTKTextSplitter(chunk_size=1000)
         split_docs = text_splitter.split_documents(documents)
+        # Debug: Check text chunking
+        st.write(f"🔍 Loaded Documents: {len(split_docs)}")
+        for i, doc in enumerate(split_docs[:5]):  # Show first 5 chunks
+            st.write(f"Chunk {i + 1}: {doc.page_content[:200]}...")
         return split_docs
     except Exception as e:
         st.error(f"Failed to load and process PDF: {e}")
 def load_chain(file_name=None):
     loaded_patent = st.session_state.get("LOADED_PATENT")
+    # Debug: Check PERSISTED_DIRECTORY
+    st.write(f"Using Persisted Directory: {PERSISTED_DIRECTORY}")
     vectordb = Chroma(
         persist_directory=PERSISTED_DIRECTORY,
         embedding_function=HuggingFaceEmbeddings(),
     )
+    # Debug: Confirm already indexed
     if loaded_patent == file_name or already_indexed(vectordb, file_name):
         st.write("✅ Already indexed.")
     else:
+        st.write("🔄 Starting document processing and vectorstore update...")
+        # Remove existing collection and load new docs
         vectordb.delete_collection()
         docs = load_docs(file_name)
+        # Debug: Verify text chunking
+        st.write(f"🔍 Number of Documents Loaded: {len(docs)}")
+        for i, doc in enumerate(docs[:5]):  # Show first 5 chunks for debugging
+            st.write(f"Chunk {i + 1}: {doc.page_content[:200]}...")
+        # Update vectorstore
         vectordb = Chroma.from_documents(
             docs, HuggingFaceEmbeddings(), persist_directory=PERSISTED_DIRECTORY
         )
         vectordb.persist()
+        st.write("✅ Vectorstore successfully updated and persisted.")
+        # Save loaded patent in session state
         st.session_state["LOADED_PATENT"] = file_name
+    # Debug: Check vectorstore indexing
+    indexed_docs = vectordb.get(include=["documents"])
+    st.write(f"✅ Indexed Documents in Vectorstore: {len(indexed_docs['documents'])}")
+    for i, doc in enumerate(indexed_docs["documents"][:3]):  # Show first 3 indexed docs
+        st.write(f"Indexed Doc {i + 1}: {doc[:200]}...")
+    # Test retrieval with a sample query
+    retriever = vectordb.as_retriever(search_kwargs={"k": 3})
+    test_query = "What is this document about?"
+    results = retriever.get_relevant_documents(test_query)
+    # Debug: Verify document retrieval
+    st.write("🔍 Test Retrieval Results for Query:")
+    if results:
+        for i, res in enumerate(results):
+            st.write(f"Retrieved Doc {i + 1}: {res.page_content[:200]}...")
+    else:
+        st.warning("No documents retrieved for test query.")
+    # Configure memory for conversation
     memory = ConversationBufferMemory(
         memory_key="chat_history",
         return_messages=True,
         input_key="question",
         output_key="answer",
     )
     return ConversationalRetrievalChain.from_llm(
         OpenAI(temperature=0, openai_api_key=OPENAI_API_KEY),
+        retriever,
         return_source_documents=False,
         memory=memory,
     )
     )
     # Initialize session state
+    for key in ["LOADED_PATENT", "pdf_preview", "loaded_pdf_path", "chain", "messages"]:
+        if key not in st.session_state:
+            st.session_state[key] = None
     # Button to load and process patent
     if st.button("Load and Process Patent"):
         pdf_path = os.path.join(tempfile.gettempdir(), f"{patent_number}.pdf")
         if not os.path.isfile(pdf_path):
             st.write("📥 Downloading patent file...")
+            try:
+                pdf_path = download_pdf(patent_number)
+                st.write(f"✅ File downloaded: {pdf_path}")
+            except Exception as e:
+                st.error(f"Failed to download patent: {e}")
+                st.stop()
         else:
             st.write("✅ File already downloaded.")
         # Load the document into the system
         st.write("🔄 Loading document into the system...")
+        try:
+            st.session_state.chain = load_chain(pdf_path)
+            st.session_state.LOADED_PATENT = patent_number
+            st.session_state.loaded_pdf_path = pdf_path
+            st.session_state.messages = [{"role": "assistant", "content": "Hello! How can I assist you with this patent?"}]
+            st.success("🚀 Document successfully loaded! You can now start asking questions.")
+        except Exception as e:
+            st.error(f"Failed to load the document: {e}")
+            st.stop()
     # Display the PDF preview if available
     if st.session_state.pdf_preview:
         st.image(st.session_state.pdf_preview, caption="First Page Preview", use_container_width=True)
     # Display previous chat messages
+    if st.session_state.messages:
         for message in st.session_state.messages:
             with st.chat_message(message["role"]):
                 st.markdown(message["content"])
                 with st.spinner("Generating response..."):
                     try:
+                        # Generate response using the chain
                         assistant_response = st.session_state.chain({"question": user_input})
+                        full_response = assistant_response.get("answer", "I'm sorry, I couldn't process that question.")
                     except Exception as e:
                         full_response = f"An error occurred: {e}"