Spaces:
Sleeping
Sleeping
Update app.py
Browse files
Handling deletion of indexes
app.py
CHANGED
|
@@ -58,6 +58,16 @@ def get_session_index_name():
|
|
| 58 |
# Combine base name with unique ID, ensuring total length is under 45 chars
|
| 59 |
return f"{base_name}-{unique_id}" # This will be like "docdb-12345678"
|
| 60 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
if not st.session_state.initialized:
|
| 62 |
# Clear everything only on first run or page refresh
|
| 63 |
if os.path.exists("data"):
|
|
@@ -69,14 +79,8 @@ if not st.session_state.initialized:
|
|
| 69 |
st.session_state.retriever = None
|
| 70 |
st.session_state.initialized = True
|
| 71 |
|
| 72 |
-
#
|
| 73 |
-
|
| 74 |
-
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
|
| 75 |
-
index_name = get_session_index_name()
|
| 76 |
-
if index_name in pc.list_indexes().names():
|
| 77 |
-
pc.delete_index(index_name)
|
| 78 |
-
except Exception as e:
|
| 79 |
-
st.error(f"Error cleaning up old index: {str(e)}")
|
| 80 |
|
| 81 |
def save_uploaded_file(uploaded_file):
|
| 82 |
"""Save uploaded file to the data directory"""
|
|
@@ -93,11 +97,11 @@ def save_uploaded_file(uploaded_file):
|
|
| 93 |
if os.path.exists(file_path):
|
| 94 |
return file_path
|
| 95 |
else:
|
| 96 |
-
|
| 97 |
return None
|
| 98 |
|
| 99 |
except Exception as e:
|
| 100 |
-
|
| 101 |
return None
|
| 102 |
|
| 103 |
def process_documents(uploaded_files_dict):
|
|
@@ -108,13 +112,16 @@ def process_documents(uploaded_files_dict):
|
|
| 108 |
|
| 109 |
try:
|
| 110 |
with st.spinner('Processing documents...'):
|
|
|
|
|
|
|
|
|
|
| 111 |
docs = []
|
| 112 |
# Process each file
|
| 113 |
for filename, file_info in uploaded_files_dict.items():
|
| 114 |
file_path = file_info["path"]
|
| 115 |
|
| 116 |
if not os.path.exists(file_path):
|
| 117 |
-
|
| 118 |
continue
|
| 119 |
|
| 120 |
if filename.endswith(".pdf"):
|
|
@@ -131,7 +138,7 @@ def process_documents(uploaded_files_dict):
|
|
| 131 |
docs.extend(file_doc)
|
| 132 |
|
| 133 |
if not docs:
|
| 134 |
-
st.
|
| 135 |
return False
|
| 136 |
|
| 137 |
# Split documents
|
|
@@ -150,10 +157,6 @@ def process_documents(uploaded_files_dict):
|
|
| 150 |
index_name = get_session_index_name()
|
| 151 |
|
| 152 |
try:
|
| 153 |
-
# Recreate the index
|
| 154 |
-
if index_name in pc.list_indexes().names():
|
| 155 |
-
pc.delete_index(index_name)
|
| 156 |
-
|
| 157 |
pc.create_index(
|
| 158 |
name=index_name,
|
| 159 |
dimension=512,
|
|
@@ -178,12 +181,14 @@ def process_documents(uploaded_files_dict):
|
|
| 178 |
return True
|
| 179 |
|
| 180 |
except PineconeApiException as e:
|
| 181 |
-
|
|
|
|
| 182 |
st.session_state.chat_enabled = False
|
| 183 |
return False
|
| 184 |
|
| 185 |
except Exception as e:
|
| 186 |
-
|
|
|
|
| 187 |
st.session_state.chat_enabled = False
|
| 188 |
return False
|
| 189 |
finally:
|
|
@@ -194,24 +199,35 @@ def doc2str(docs):
|
|
| 194 |
|
| 195 |
def format_reranked_docs(pc, retriever, question):
|
| 196 |
"""Rerank documents using Pinecone's reranking model"""
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 206 |
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 210 |
|
| 211 |
def run_chatbot(retriever, pc, llm):
|
| 212 |
"""Run the chatbot with the given components"""
|
| 213 |
-
# st.markdown("<h4>💬 Chat with your Documents</h4>", unsafe_allow_html=True)
|
| 214 |
-
|
| 215 |
# Initialize chat prompt
|
| 216 |
prompt = ChatPromptTemplate.from_template("""
|
| 217 |
You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know.
|
|
@@ -220,8 +236,11 @@ def run_chatbot(retriever, pc, llm):
|
|
| 220 |
{context}
|
| 221 |
</context>
|
| 222 |
|
| 223 |
-
|
| 224 |
-
|
|
|
|
|
|
|
|
|
|
| 225 |
Answer the following question:
|
| 226 |
|
| 227 |
{question}""")
|
|
@@ -262,10 +281,11 @@ def run_chatbot(retriever, pc, llm):
|
|
| 262 |
# Add assistant response to chat history
|
| 263 |
st.session_state.messages.append({"role": "assistant", "content": response})
|
| 264 |
except Exception as e:
|
| 265 |
-
|
| 266 |
with st.chat_message("assistant"):
|
| 267 |
-
|
| 268 |
-
st.
|
|
|
|
| 269 |
|
| 270 |
def process_and_chat():
|
| 271 |
"""Process documents and handle chat interface"""
|
|
@@ -285,6 +305,8 @@ def process_and_chat():
|
|
| 285 |
# Check for removed files
|
| 286 |
files_to_remove = set(st.session_state.uploaded_files.keys()) - current_uploaded_filenames
|
| 287 |
if files_to_remove:
|
|
|
|
|
|
|
| 288 |
for file_name in files_to_remove:
|
| 289 |
# Remove file from session state
|
| 290 |
if file_name in st.session_state.uploaded_files:
|
|
|
|
| 58 |
# Combine base name with unique ID, ensuring total length is under 45 chars
|
| 59 |
return f"{base_name}-{unique_id}" # This will be like "docdb-12345678"
|
| 60 |
|
| 61 |
+
def cleanup_pinecone_index():
    """Delete the current session's Pinecone index when it is present.

    The session index name comes from ``get_session_index_name()``. This is a
    best-effort operation: any exception raised while connecting, listing or
    deleting is printed and swallowed, so the function always returns ``None``.
    """
    try:
        client = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
        session_index = get_session_index_name()
        existing_names = client.list_indexes().names()
        # Nothing to delete when the session never created an index.
        if session_index not in existing_names:
            return
        client.delete_index(session_index)
    except Exception as err:
        # Report internally only; a failed cleanup must not surface to the caller.
        print(f"Error cleaning up index: {str(err)}")  # Log error internally
|
| 70 |
+
|
| 71 |
if not st.session_state.initialized:
|
| 72 |
# Clear everything only on first run or page refresh
|
| 73 |
if os.path.exists("data"):
|
|
|
|
| 79 |
st.session_state.retriever = None
|
| 80 |
st.session_state.initialized = True
|
| 81 |
|
| 82 |
+
# Clean up any existing index
|
| 83 |
+
cleanup_pinecone_index()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
|
| 85 |
def save_uploaded_file(uploaded_file):
|
| 86 |
"""Save uploaded file to the data directory"""
|
|
|
|
| 97 |
if os.path.exists(file_path):
|
| 98 |
return file_path
|
| 99 |
else:
|
| 100 |
+
print(f"File not saved: {file_path}") # Log error internally
|
| 101 |
return None
|
| 102 |
|
| 103 |
except Exception as e:
|
| 104 |
+
print(f"Error saving file: {str(e)}") # Log error internally
|
| 105 |
return None
|
| 106 |
|
| 107 |
def process_documents(uploaded_files_dict):
|
|
|
|
| 112 |
|
| 113 |
try:
|
| 114 |
with st.spinner('Processing documents...'):
|
| 115 |
+
# Clean up existing index before processing
|
| 116 |
+
cleanup_pinecone_index()
|
| 117 |
+
|
| 118 |
docs = []
|
| 119 |
# Process each file
|
| 120 |
for filename, file_info in uploaded_files_dict.items():
|
| 121 |
file_path = file_info["path"]
|
| 122 |
|
| 123 |
if not os.path.exists(file_path):
|
| 124 |
+
print(f"File not found: {file_path}") # Log error internally
|
| 125 |
continue
|
| 126 |
|
| 127 |
if filename.endswith(".pdf"):
|
|
|
|
| 138 |
docs.extend(file_doc)
|
| 139 |
|
| 140 |
if not docs:
|
| 141 |
+
st.warning("Unable to process the documents. Please try again.")
|
| 142 |
return False
|
| 143 |
|
| 144 |
# Split documents
|
|
|
|
| 157 |
index_name = get_session_index_name()
|
| 158 |
|
| 159 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 160 |
pc.create_index(
|
| 161 |
name=index_name,
|
| 162 |
dimension=512,
|
|
|
|
| 181 |
return True
|
| 182 |
|
| 183 |
except PineconeApiException as e:
|
| 184 |
+
print(f"Pinecone API error: {str(e)}") # Log error internally
|
| 185 |
+
st.warning("Unable to process documents at the moment. Please try again.")
|
| 186 |
st.session_state.chat_enabled = False
|
| 187 |
return False
|
| 188 |
|
| 189 |
except Exception as e:
|
| 190 |
+
print(f"Processing error: {str(e)}") # Log error internally
|
| 191 |
+
st.warning("Unable to process documents at the moment. Please try again.")
|
| 192 |
st.session_state.chat_enabled = False
|
| 193 |
return False
|
| 194 |
finally:
|
|
|
|
| 199 |
|
| 200 |
def format_reranked_docs(pc, retriever, question, top_n=3, model="pinecone-rerank-v0"):
    """Build the context string for ``question`` from retrieved, reranked passages.

    Args:
        pc: Client object exposing ``inference.rerank(...)``.
        retriever: Object whose ``invoke(question)`` returns documents that
            carry a ``page_content`` string.
        question: The user query, used both for retrieval and as the rerank query.
        top_n: Maximum number of passages to keep. Defaults to 3, the value
            that was previously hard-coded in both the rerank call and the
            fallback slice; keeping it in one place stops the two from drifting.
        model: Name of the reranking model passed to ``pc.inference.rerank``.
            Defaults to the previously hard-coded ``"pinecone-rerank-v0"``.

    Returns:
        str: The kept passages joined by blank lines. If retrieval yields no
        non-blank passage, a fixed "not enough context" sentence is returned
        instead. If reranking raises, the first ``top_n`` retrieved passages
        are returned in retrieval order.
    """
    # Keep only passages that contain something other than whitespace.
    relevant_docs = [
        doc.page_content
        for doc in retriever.invoke(question)
        if doc.page_content.strip()
    ]

    if not relevant_docs:
        return "I don't have enough context to answer this question."

    try:
        # The rerank call takes each document as a {"text": ...} mapping.
        formatted_docs = [{"text": text} for text in relevant_docs]

        reranked_docs = pc.inference.rerank(
            model=model,
            query=question,
            documents=formatted_docs,
            top_n=top_n,
            return_documents=True,
        )

        # Extract text from the reranked results, preserving their order.
        final_docs = [d.document["text"] for d in reranked_docs.data]
        return "\n\n".join(final_docs)
    except Exception as e:
        print(f"Error during reranking: {str(e)}")  # Log error internally
        # Fall back to the retrieved passages, honouring the same limit.
        return "\n\n".join(relevant_docs[:top_n])
|
| 228 |
|
| 229 |
def run_chatbot(retriever, pc, llm):
|
| 230 |
"""Run the chatbot with the given components"""
|
|
|
|
|
|
|
| 231 |
# Initialize chat prompt
|
| 232 |
prompt = ChatPromptTemplate.from_template("""
|
| 233 |
You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know.
|
|
|
|
| 236 |
{context}
|
| 237 |
</context>
|
| 238 |
|
| 239 |
+
<important>
|
| 240 |
+
Don't start revealing context in your responses until its asked. First look at the question and then think if the context is needed to answer this or its a normal question, once you have judged then only answer the question.
|
| 241 |
+
When there is no context, just respond on your own knowledge as a normal assistant.
|
| 242 |
+
</important>
|
| 243 |
+
|
| 244 |
Answer the following question:
|
| 245 |
|
| 246 |
{question}""")
|
|
|
|
| 281 |
# Add assistant response to chat history
|
| 282 |
st.session_state.messages.append({"role": "assistant", "content": response})
|
| 283 |
except Exception as e:
|
| 284 |
+
print(f"Chat error: {str(e)}") # Log error internally
|
| 285 |
with st.chat_message("assistant"):
|
| 286 |
+
error_msg = "I'm having trouble processing your question. Please try asking something else."
|
| 287 |
+
st.markdown(error_msg)
|
| 288 |
+
st.session_state.messages.append({"role": "assistant", "content": error_msg})
|
| 289 |
|
| 290 |
def process_and_chat():
|
| 291 |
"""Process documents and handle chat interface"""
|
|
|
|
| 305 |
# Check for removed files
|
| 306 |
files_to_remove = set(st.session_state.uploaded_files.keys()) - current_uploaded_filenames
|
| 307 |
if files_to_remove:
|
| 308 |
+
# Clean up index when files are removed
|
| 309 |
+
cleanup_pinecone_index()
|
| 310 |
for file_name in files_to_remove:
|
| 311 |
# Remove file from session state
|
| 312 |
if file_name in st.session_state.uploaded_files:
|