Spaces:

Arxived
/

quick-spin

Sleeping

App Files Files Community

DrishtiSharma committed on Dec 20, 2024

Commit

8e22da0

verified ·

1 Parent(s): acd48bc

Update app.py

Browse files

Files changed (1) hide show

app.py +20 -24

app.py CHANGED Viewed

@@ -56,6 +56,9 @@ def check_poppler_installed():
 check_poppler_installed()
 def load_docs(document_path):
     try:
         import fitz  # PyMuPDF for text extraction
@@ -71,11 +74,11 @@ def load_docs(document_path):
         doc.close()
-        # Step 2: Combine cleaned text
         full_text = "\n".join(extracted_text)
         st.write(f"📄 Total Cleaned Text Length: {len(full_text)} characters")
-        # Step 3: Chunk the cleaned text
         text_splitter = RecursiveCharacterTextSplitter(
             chunk_size=1000,
             chunk_overlap=100,
@@ -83,9 +86,9 @@ def load_docs(document_path):
         )
         split_docs = text_splitter.create_documents([full_text])
-        # Debug: Show filtered chunks
         st.write(f"🔍 Total Chunks After Splitting: {len(split_docs)}")
-        for i, doc in enumerate(split_docs[:5]):  # Show first 5 chunks
             st.write(f"Chunk {i + 1}: {doc.page_content[:300]}...")
         return split_docs
@@ -126,30 +129,28 @@ def already_indexed(vectordb, file_name):
     return file_name in indexed_sources
 def load_chain(file_name=None):
     loaded_patent = st.session_state.get("LOADED_PATENT")
-    # Debug: Check PERSISTED_DIRECTORY
-    st.write(f"Using Persisted Directory: {PERSISTED_DIRECTORY}")
     vectordb = Chroma(
         persist_directory=PERSISTED_DIRECTORY,
         embedding_function=HuggingFaceEmbeddings(),
     )
-    # Debug: Confirm already indexed
     if loaded_patent == file_name or already_indexed(vectordb, file_name):
         st.write("✅ Already indexed.")
     else:
         st.write("🔄 Starting document processing and vectorstore update...")
         # Remove existing collection and load new docs
         vectordb.delete_collection()
         docs = load_docs(file_name)
-        # Debug: Verify text chunking
-        st.write(f"🔍 Number of Documents Loaded: {len(docs)}")
-        for i, doc in enumerate(docs[:5]):  # Show first 5 chunks for debugging
-            st.write(f"Chunk {i + 1}: {doc.page_content[:200]}...")
         # Update vectorstore
         vectordb = Chroma.from_documents(
             docs, HuggingFaceEmbeddings(), persist_directory=PERSISTED_DIRECTORY
@@ -160,18 +161,15 @@ def load_chain(file_name=None):
         # Save loaded patent in session state
         st.session_state["LOADED_PATENT"] = file_name
-    # Debug: Check vectorstore indexing
     indexed_docs = vectordb.get(include=["documents"])
-    st.write(f"✅ Indexed Documents in Vectorstore: {len(indexed_docs['documents'])}")
-    for i, doc in enumerate(indexed_docs["documents"][:3]):  # Show first 3 indexed docs
-        st.write(f"Indexed Doc {i + 1}: {doc[:200]}...")
-    # Test retrieval with a sample query
     retriever = vectordb.as_retriever(search_kwargs={"k": 3})
     test_query = "What is this document about?"
     results = retriever.get_relevant_documents(test_query)
-    # Debug: Verify document retrieval
     st.write("🔍 Test Retrieval Results for Query:")
     if results:
         for i, res in enumerate(results):
@@ -182,18 +180,16 @@ def load_chain(file_name=None):
     # Configure memory for conversation
     memory = ConversationBufferMemory(
         memory_key="chat_history",
-        return_messages=True,
-        input_key="question",
-        output_key="answer",
     )
     return ConversationalRetrievalChain.from_llm(
         OpenAI(temperature=0, openai_api_key=OPENAI_API_KEY),
         retriever,
-        return_source_documents=False,
-        memory=memory,
     )
 def extract_patent_number(url):
     pattern = r"/patent/([A-Z]{2}\d+)"
     match = re.search(pattern, url)

 check_poppler_installed()
 def load_docs(document_path):
+    """
+    Load and clean the PDF content, then split into chunks.
+    """
     try:
         import fitz  # PyMuPDF for text extraction
         doc.close()
+        # Combine all pages into one text
         full_text = "\n".join(extracted_text)
         st.write(f"📄 Total Cleaned Text Length: {len(full_text)} characters")
+        # Step 2: Chunk the cleaned text
         text_splitter = RecursiveCharacterTextSplitter(
             chunk_size=1000,
             chunk_overlap=100,
         )
         split_docs = text_splitter.create_documents([full_text])
+        # Debug: Show total chunks count and first 3 chunks for verification
         st.write(f"🔍 Total Chunks After Splitting: {len(split_docs)}")
+        for i, doc in enumerate(split_docs[:3]):  # Show first 3 chunks only
             st.write(f"Chunk {i + 1}: {doc.page_content[:300]}...")
         return split_docs
     return file_name in indexed_sources
 def load_chain(file_name=None):
+    """
+    Load cleaned PDF text, split into chunks, and update the vectorstore.
+    """
     loaded_patent = st.session_state.get("LOADED_PATENT")
+    # Debug: Show persist directory
+    st.write(f"🗂 Using Persisted Directory: {PERSISTED_DIRECTORY}")
     vectordb = Chroma(
         persist_directory=PERSISTED_DIRECTORY,
         embedding_function=HuggingFaceEmbeddings(),
     )
     if loaded_patent == file_name or already_indexed(vectordb, file_name):
         st.write("✅ Already indexed.")
     else:
         st.write("🔄 Starting document processing and vectorstore update...")
         # Remove existing collection and load new docs
         vectordb.delete_collection()
         docs = load_docs(file_name)
         # Update vectorstore
         vectordb = Chroma.from_documents(
             docs, HuggingFaceEmbeddings(), persist_directory=PERSISTED_DIRECTORY
         # Save loaded patent in session state
         st.session_state["LOADED_PATENT"] = file_name
+    # Debug: Check vectorstore indexing summary
     indexed_docs = vectordb.get(include=["documents"])
+    st.write(f"✅ Total Indexed Documents: {len(indexed_docs['documents'])}")
+    # Test retrieval with a simple query
     retriever = vectordb.as_retriever(search_kwargs={"k": 3})
     test_query = "What is this document about?"
     results = retriever.get_relevant_documents(test_query)
     st.write("🔍 Test Retrieval Results for Query:")
     if results:
         for i, res in enumerate(results):
     # Configure memory for conversation
     memory = ConversationBufferMemory(
         memory_key="chat_history",
+        return_messages=True
     )
     return ConversationalRetrievalChain.from_llm(
         OpenAI(temperature=0, openai_api_key=OPENAI_API_KEY),
         retriever,
+        memory=memory
     )
 def extract_patent_number(url):
     pattern = r"/patent/([A-Z]{2}\d+)"
     match = re.search(pattern, url)