Spaces:

ash2203
/

DocumentAnalyzer

Sleeping

App Files Community

ash2203 committed on Jan 7, 2025

Commit

65e37fd

verified ·

1 Parent(s): 061c1bf

Update app.py

Browse files

Handling deletion of indexes

Files changed (1) hide show

app.py +59 -37

app.py CHANGED Viewed

@@ -58,6 +58,16 @@ def get_session_index_name():
     # Combine base name with unique ID, ensuring total length is under 45 chars
     return f"{base_name}-{unique_id}"  # This will be like "docdb-12345678"
 if not st.session_state.initialized:
     # Clear everything only on first run or page refresh
     if os.path.exists("data"):
@@ -69,14 +79,8 @@ if not st.session_state.initialized:
     st.session_state.retriever = None
     st.session_state.initialized = True
-    # Delete any existing index for this session (in case of page refresh)
-    try:
-        pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
-        index_name = get_session_index_name()
-        if index_name in pc.list_indexes().names():
-            pc.delete_index(index_name)
-    except Exception as e:
-        st.error(f"Error cleaning up old index: {str(e)}")
 def save_uploaded_file(uploaded_file):
     """Save uploaded file to the data directory"""
@@ -93,11 +97,11 @@ def save_uploaded_file(uploaded_file):
         if os.path.exists(file_path):
             return file_path
         else:
-            st.error(f"File not saved: {file_path}")
             return None
     except Exception as e:
-        st.error(f"Error saving file: {str(e)}")
         return None
 def process_documents(uploaded_files_dict):
@@ -108,13 +112,16 @@ def process_documents(uploaded_files_dict):
     try:
         with st.spinner('Processing documents...'):
             docs = []
             # Process each file
             for filename, file_info in uploaded_files_dict.items():
                 file_path = file_info["path"]
                 if not os.path.exists(file_path):
-                    st.error(f"File not found: {file_path}")
                     continue
                 if filename.endswith(".pdf"):
@@ -131,7 +138,7 @@ def process_documents(uploaded_files_dict):
                     docs.extend(file_doc)
             if not docs:
-                st.error("No documents were successfully processed")
                 return False
             # Split documents
@@ -150,10 +157,6 @@ def process_documents(uploaded_files_dict):
             index_name = get_session_index_name()
             try:
-                # Recreate the index
-                if index_name in pc.list_indexes().names():
-                    pc.delete_index(index_name)
                 pc.create_index(
                     name=index_name,
                     dimension=512,
@@ -178,12 +181,14 @@ def process_documents(uploaded_files_dict):
                 return True
             except PineconeApiException as e:
-                st.error("File upload failed! Avoid interrupting document processing by uploading or removing files. Kindly refresh the app to continue.")
                 st.session_state.chat_enabled = False
                 return False
     except Exception as e:
-        st.error(f"An error occurred during processing: {str(e)}")
         st.session_state.chat_enabled = False
         return False
     finally:
@@ -194,24 +199,35 @@ def doc2str(docs):
 def format_reranked_docs(pc, retriever, question):
     """Rerank documents using Pinecone's reranking model"""
-    relevant_docs = [doc.page_content for doc in retriever.invoke(question) if len(doc.page_content)>5]
-    reranked_docs = pc.inference.rerank(
-        model="pinecone-rerank-v0",
-        query=question,
-        documents=relevant_docs,
-        top_n=3,
-        return_documents=True
-    )
-    final_docs = [d.document.text for d in reranked_docs.data]
-    context = doc2str(final_docs)
-    return context
 def run_chatbot(retriever, pc, llm):
     """Run the chatbot with the given components"""
-    # st.markdown("<h4>💬 Chat with your Documents</h4>", unsafe_allow_html=True)
     # Initialize chat prompt
     prompt = ChatPromptTemplate.from_template("""
     You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know.
@@ -220,8 +236,11 @@ def run_chatbot(retriever, pc, llm):
     {context}
     </context>
-    Important: Don't start revealing context in your responses until its asked. First look at the question and then think if the context is needed to answer this or its a normal question, once you have judged then only answer the question.
     Answer the following question:
     {question}""")
@@ -262,10 +281,11 @@ def run_chatbot(retriever, pc, llm):
                     # Add assistant response to chat history
                     st.session_state.messages.append({"role": "assistant", "content": response})
             except Exception as e:
-                error_msg = f"An error occurred while processing your question: {str(e)}"
                 with st.chat_message("assistant"):
-                    st.error(error_msg)
-                    st.session_state.messages.append({"role": "assistant", "content": f"❌ {error_msg}"})
 def process_and_chat():
     """Process documents and handle chat interface"""
@@ -285,6 +305,8 @@ def process_and_chat():
     # Check for removed files
     files_to_remove = set(st.session_state.uploaded_files.keys()) - current_uploaded_filenames
     if files_to_remove:
         for file_name in files_to_remove:
             # Remove file from session state
             if file_name in st.session_state.uploaded_files:

     # Combine base name with unique ID, ensuring total length is under 45 chars
     return f"{base_name}-{unique_id}"  # This will be like "docdb-12345678"
+def cleanup_pinecone_index():
+    """Clean up existing Pinecone index for the current session"""
+    try:
+        pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
+        index_name = get_session_index_name()
+        if index_name in pc.list_indexes().names():
+            pc.delete_index(index_name)
+    except Exception as e:
+        print(f"Error cleaning up index: {str(e)}")  # Log error internally
 if not st.session_state.initialized:
     # Clear everything only on first run or page refresh
     if os.path.exists("data"):
     st.session_state.retriever = None
     st.session_state.initialized = True
+    # Clean up any existing index
+    cleanup_pinecone_index()
 def save_uploaded_file(uploaded_file):
     """Save uploaded file to the data directory"""
         if os.path.exists(file_path):
             return file_path
         else:
+            print(f"File not saved: {file_path}")  # Log error internally
             return None
     except Exception as e:
+        print(f"Error saving file: {str(e)}")  # Log error internally
         return None
 def process_documents(uploaded_files_dict):
     try:
         with st.spinner('Processing documents...'):
+            # Clean up existing index before processing
+            cleanup_pinecone_index()
             docs = []
             # Process each file
             for filename, file_info in uploaded_files_dict.items():
                 file_path = file_info["path"]
                 if not os.path.exists(file_path):
+                    print(f"File not found: {file_path}")  # Log error internally
                     continue
                 if filename.endswith(".pdf"):
                     docs.extend(file_doc)
             if not docs:
+                st.warning("Unable to process the documents. Please try again.")
                 return False
             # Split documents
             index_name = get_session_index_name()
             try:
                 pc.create_index(
                     name=index_name,
                     dimension=512,
                 return True
             except PineconeApiException as e:
+                print(f"Pinecone API error: {str(e)}")  # Log error internally
+                st.warning("Unable to process documents at the moment. Please try again.")
                 st.session_state.chat_enabled = False
                 return False
     except Exception as e:
+        print(f"Processing error: {str(e)}")  # Log error internally
+        st.warning("Unable to process documents at the moment. Please try again.")
         st.session_state.chat_enabled = False
         return False
     finally:
 def format_reranked_docs(pc, retriever, question):
     """Rerank documents using Pinecone's reranking model"""
+    # Get relevant docs and ensure they're not empty
+    relevant_docs = [doc.page_content for doc in retriever.invoke(question) if doc.page_content.strip()]
+    if not relevant_docs:
+        return "I don't have enough context to answer this question."
+    try:
+        # Format documents for reranking
+        formatted_docs = [{"text": doc} for doc in relevant_docs]
+        reranked_docs = pc.inference.rerank(
+            model="pinecone-rerank-v0",
+            query=question,
+            documents=formatted_docs,
+            top_n=3,
+            return_documents=True
+        )
+        # Extract text from reranked documents
+        final_docs = [d.document["text"] for d in reranked_docs.data]
+        context = "\n\n".join(final_docs)
+        return context
+    except Exception as e:
+        print(f"Error during reranking: {str(e)}")  # Log error internally
+        # Fallback to using retrieved docs without reranking
+        return "\n\n".join(relevant_docs[:3])
 def run_chatbot(retriever, pc, llm):
     """Run the chatbot with the given components"""
     # Initialize chat prompt
     prompt = ChatPromptTemplate.from_template("""
     You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know.
     {context}
     </context>
+    <important>
+    Don't start revealing context in your responses until its asked. First look at the question and then think if the context is needed to answer this or its a normal question, once you have judged then only answer the question.
+    When there is no context, just respond on your own knowledge as a normal assistant.
+    </important>
     Answer the following question:
     {question}""")
                     # Add assistant response to chat history
                     st.session_state.messages.append({"role": "assistant", "content": response})
             except Exception as e:
+                print(f"Chat error: {str(e)}")  # Log error internally
                 with st.chat_message("assistant"):
+                    error_msg = "I'm having trouble processing your question. Please try asking something else."
+                    st.markdown(error_msg)
+                    st.session_state.messages.append({"role": "assistant", "content": error_msg})
 def process_and_chat():
     """Process documents and handle chat interface"""
     # Check for removed files
     files_to_remove = set(st.session_state.uploaded_files.keys()) - current_uploaded_filenames
     if files_to_remove:
+        # Clean up index when files are removed
+        cleanup_pinecone_index()
         for file_name in files_to_remove:
             # Remove file from session state
             if file_name in st.session_state.uploaded_files: