# NOTE: removed non-code web-scrape artifacts that preceded the source
# (Hugging Face Spaces page chrome, commit-hash gutter, line-number gutter).
import os
import time
import streamlit as st
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.document_loaders import TextLoader, PyMuPDFLoader, Docx2txtLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from typing import List
from langchain_core.documents import Document
from langchain_openai import OpenAIEmbeddings
from langchain_core.runnables import RunnablePassthrough
from langchain_community.retrievers import BM25Retriever
from langchain.retrievers import EnsembleRetriever
from langchain_chroma import Chroma
import shutil
import uuid
from dotenv import load_dotenv
load_dotenv()
# ---- Page configuration and header ----
st.set_page_config(page_title="Document Analyzer", layout="wide")
st.title("📚 Document Analyzer")

# Usage notes, collapsed by default so they don't crowd the UI.
with st.expander("ℹ️ Click here to view instructions"):
    st.markdown("""
- Upload files by clicking on "Browse Files"
- Avoid interrupting when file/files are under processing, this interrupts the execution and you would have to refresh the page to run the webapp again
- You can add more files anytime, just avoid adding/removing files when it's processing the uploaded documents
- The processing will trigger whenever you make any changes to the files
""")

# ---- Session-state bootstrap ----
# Seed flags only when absent so values survive Streamlit reruns.
for _key, _default in (
    ("initialized", False),
    ("processing", False),
    ("chat_enabled", False),
):
    if _key not in st.session_state:
        st.session_state[_key] = _default
if 'session_id' not in st.session_state:
    # Short unique ID so each browser session gets its own vector store dir.
    st.session_state.session_id = str(uuid.uuid4())[:8]
def get_chroma_directory():
    """Return the per-session ChromaDB directory path, creating the parent.

    Uses the ``session_id`` stored in ``st.session_state`` so each browser
    session gets an isolated vector store under ``vectorstores/``.

    Returns:
        str: path like ``vectorstores/chroma_db_<session_id>`` (not created
        itself — only the parent directory is ensured).
    """
    base_dir = "vectorstores"
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs
    # when several reruns/sessions hit this concurrently.
    os.makedirs(base_dir, exist_ok=True)
    return os.path.join(base_dir, f"chroma_db_{st.session_state.session_id}")
def cleanup_chroma_db():
    """Delete the current session's ChromaDB directory, ignoring failures."""
    try:
        target = get_chroma_directory()
        if os.path.exists(target):
            shutil.rmtree(target)
    except Exception as e:
        # Best-effort cleanup: never surface this to the UI.
        print(f"Error cleaning up ChromaDB: {str(e)}")  # Log error internally
def cleanup_old_vectorstores():
    """Remove per-session vector store directories idle for over 24 hours.

    Scans ``vectorstores/`` and deletes every subdirectory whose last
    modification time is older than the 24-hour cutoff. Errors are logged
    and swallowed so startup never fails on cleanup.
    """
    try:
        base_dir = "vectorstores"
        if not os.path.exists(base_dir):
            return
        # Anything last touched before this timestamp is considered stale.
        cutoff = time.time() - 24 * 60 * 60
        for entry in os.listdir(base_dir):
            path = os.path.join(base_dir, entry)
            # Only directories are session stores; plain files are ignored.
            if os.path.isdir(path) and os.path.getmtime(path) < cutoff:
                shutil.rmtree(path)
    except Exception as e:
        print(f"Error cleaning up old vector stores: {str(e)}")  # Log error internally
if not st.session_state.initialized:
    # First run (or page refresh): purge stale stores and reset directories.
    cleanup_old_vectorstores()
    # Clear everything only on first run or page refresh
    if os.path.exists("data"):
        shutil.rmtree("data")
    os.makedirs("data")
    # BUG FIX: previously makedirs was nested under an os.path.exists()
    # check, so the directory was only (re-)"created" when it already
    # existed and never created when missing. Ensure it unconditionally.
    os.makedirs("vectorstores", exist_ok=True)
    # Per-session bookkeeping: filename -> {"path", "type"} for uploads,
    # and the previously-seen filename set used to detect changes.
    st.session_state.uploaded_files = {}
    st.session_state.previous_files = set()
    st.session_state.initialized = True
def save_uploaded_file(uploaded_file):
    """Persist a Streamlit upload into the ``data`` directory.

    Args:
        uploaded_file: object exposing ``.name`` and ``.getvalue()``
            (Streamlit's UploadedFile).

    Returns:
        The saved file path on success, ``None`` on any failure.
    """
    try:
        destination = os.path.join("data", uploaded_file.name)
        with open(destination, "wb") as out:
            out.write(uploaded_file.getvalue())
        # Confirm the write actually landed on disk before reporting success.
        if not os.path.exists(destination):
            print(f"File not saved: {destination}")  # Log error internally
            return None
        return destination
    except Exception as e:
        print(f"Error saving file: {str(e)}")  # Log error internally
        return None
def process_documents(uploaded_files_dict):
    """Load, split, embed, and index the uploaded documents in ChromaDB.

    Args:
        uploaded_files_dict: mapping of filename -> {"path": ..., "type": ...}.

    Returns:
        True when the vector store was (re)built successfully, else False.
    """
    warning_placeholder = st.empty()
    warning_placeholder.warning("⚠️ Document processing in progress. Please wait before adding or removing files.")
    success_placeholder = st.empty()
    # Extension -> langchain loader class dispatch table.
    loaders = {
        ".pdf": PyMuPDFLoader,
        ".txt": TextLoader,
        ".docx": Docx2txtLoader,
    }
    try:
        with st.spinner('Processing documents...'):
            # Drop any previous per-session index before rebuilding from scratch.
            cleanup_chroma_db()
            docs = []
            for filename, file_info in uploaded_files_dict.items():
                path = file_info["path"]
                if not os.path.exists(path):
                    print(f"File not found: {path}")  # Log error internally
                    continue
                for suffix, loader_cls in loaders.items():
                    if filename.endswith(suffix):
                        docs.extend(loader_cls(path).load())
                        break
            if not docs:
                st.warning("Unable to process the documents. Please try again.")
                return False
            # Chunk the documents for embedding.
            splitter = RecursiveCharacterTextSplitter(
                chunk_size=1500,
                chunk_overlap=400,
                length_function=len,
            )
            chunks = splitter.split_documents(docs)
            embed_func = OpenAIEmbeddings(model='text-embedding-3-small', dimensions=512)
            try:
                # Build the persistent per-session vector store.
                Chroma.from_documents(
                    collection_name="collection",
                    documents=chunks,
                    embedding=embed_func,
                    persist_directory=get_chroma_directory(),
                )
                st.session_state.chat_enabled = True
                success_placeholder.success('Documents processed successfully!')
                time.sleep(2)  # Show success message for 2 seconds
                success_placeholder.empty()  # Clear the success message
                return True
            except Exception as e:
                print(f"ChromaDB error: {str(e)}")  # Log error internally
                st.warning("Unable to process documents at the moment. Please try again.")
                st.session_state.chat_enabled = False
                return False
    except Exception as e:
        print(f"Processing error: {str(e)}")  # Log error internally
        st.warning("Unable to process documents at the moment. Please try again.")
        st.session_state.chat_enabled = False
        return False
    finally:
        # Always clear the in-progress banner, whatever the outcome.
        warning_placeholder.empty()
def doc2str(docs):
    """Concatenate each document's page_content, separated by blank lines."""
    parts = [doc.page_content for doc in docs]
    return "\n\n".join(parts)
def run_chatbot(retriever, llm):
    """Render the chat UI and answer questions via the retrieval chain.

    Args:
        retriever: langchain retriever used to fetch supporting context.
        llm: chat model invoked with the prompt + context.
    """
    prompt = ChatPromptTemplate.from_template("""
You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know.
<context>
{context}
</context>
<important>
Don't start revealing context in your responses until its asked. First look at the question and then think if the context is needed to answer this or its a normal question, once you have judged then only answer the question.
When there is no context, just respond on your own knowledge as a normal assistant.
</important>
Answer the following question:
{question}""")

    def _fetch_context(payload):
        # Retrieve supporting documents and flatten them to one string.
        return doc2str(retriever.invoke(payload["question"]))

    chain = (
        RunnablePassthrough.assign(context=_fetch_context)
        | prompt
        | llm
        | StrOutputParser()
    )
    # Conversation history lives in session state across reruns.
    if "messages" not in st.session_state:
        st.session_state.messages = []
    # Replay the conversation so far.
    for entry in st.session_state.messages:
        with st.chat_message(entry["role"]):
            st.markdown(entry["content"])
    question = st.chat_input("Ask a question about your documents")
    if not question:
        return
    # Record and echo the user's turn.
    st.session_state.messages.append({"role": "user", "content": question})
    with st.chat_message("user"):
        st.markdown(question)
    # Spinner sits outside the chat message so it doesn't render inside it.
    with st.spinner("Thinking..."):
        try:
            answer = chain.invoke({"question": question})
            with st.chat_message("assistant"):
                st.markdown(answer)
            st.session_state.messages.append({"role": "assistant", "content": answer})
        except Exception as e:
            print(f"Chat error: {str(e)}")  # Log error internally
            fallback = "I'm having trouble processing your question. Please try asking something else."
            with st.chat_message("assistant"):
                st.markdown(fallback)
            st.session_state.messages.append({"role": "assistant", "content": fallback})
def process_and_chat():
    """Process documents and handle chat interface.

    Top-level controller, re-executed on every Streamlit rerun:
    1. render the uploader and diff current uploads against session state;
    2. delete files the user removed (plus the session's ChromaDB);
    3. save newly added files to disk;
    4. rebuild the vector store when the file set changed;
    5. when a store exists and chat is enabled, assemble the retrievers
       and hand off to run_chatbot().
    """
    # File uploader section
    with st.container():
        uploaded_files = st.file_uploader(
            "Upload your documents",
            type=["pdf", "txt", "docx"],
            accept_multiple_files=True,
            key="file_uploader",
            label_visibility="collapsed" if st.session_state.processing else "visible"
        )
    # Get current uploaded filenames
    current_uploaded_filenames = {file.name for file in uploaded_files} if uploaded_files else set()
    # Check for removed files (tracked in state but gone from the uploader)
    files_to_remove = set(st.session_state.uploaded_files.keys()) - current_uploaded_filenames
    if files_to_remove:
        # Set processing state immediately; also reset chat and history so
        # stale answers can't reference documents that no longer exist.
        st.session_state.processing = True
        st.session_state.chat_enabled = False
        if "messages" in st.session_state:
            del st.session_state.messages
        # Clean up ChromaDB when files are removed
        cleanup_chroma_db()
        for file_name in files_to_remove:
            # Remove file from session state
            if file_name in st.session_state.uploaded_files:
                # Delete the file from data directory
                file_path = st.session_state.uploaded_files[file_name]["path"]
                if os.path.exists(file_path):
                    os.remove(file_path)
                # Remove from session state
                del st.session_state.uploaded_files[file_name]
    # Process newly uploaded files
    if uploaded_files:
        files_added = False
        for file in uploaded_files:
            # Only process files that haven't been uploaded before
            if file.name not in st.session_state.uploaded_files:
                # Set processing state immediately when new file is detected
                st.session_state.processing = True
                st.session_state.chat_enabled = False
                if "messages" in st.session_state:
                    del st.session_state.messages
                file_path = save_uploaded_file(file)
                if file_path:  # Only add to session state if file was saved successfully
                    st.session_state.uploaded_files[file.name] = {
                        "path": file_path,
                        "type": file.type
                    }
                    files_added = True
    # Check for changes in files
    current_files = set(st.session_state.uploaded_files.keys())
    # If files have changed (added or removed), reset chat and process documents
    if current_files != st.session_state.previous_files or files_to_remove:
        st.session_state.previous_files = current_files
        if current_files:
            # Process documents and enable chat if successful
            if process_documents(st.session_state.uploaded_files):
                st.session_state.chat_enabled = True
            st.session_state.processing = False
        else:
            st.warning('Please upload a file to continue')
            st.session_state.processing = False
    # If files exist and chat is enabled, show chat interface
    if current_files and st.session_state.chat_enabled:
        try:
            # Initialize components for chat
            llm = ChatGroq(temperature=0, model_name="llama-3.3-70b-versatile", groq_api_key=os.getenv("GROQ_API_KEY"), max_tokens=8000)
            # Create vectorstore handle over the persisted per-session index
            embed_func = OpenAIEmbeddings(model='text-embedding-3-small', dimensions=512)
            vectorstore = Chroma(
                collection_name="collection",
                embedding_function=embed_func,
                persist_directory=get_chroma_directory()
            )
            # Create retrievers (dense, top-3)
            vectorstore_retriever = vectorstore.as_retriever(
                search_kwargs={"k": 3}
            )
            # Create keyword retriever.
            # NOTE(review): the BM25 retriever is rebuilt from the raw files
            # on every rerun — acceptable for small corpora, but a candidate
            # for caching if uploads grow large.
            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=1500,
                chunk_overlap=400,
                length_function=len
            )
            docs = []
            for file_info in st.session_state.uploaded_files.values():
                if file_info["path"].endswith(".pdf"):
                    docs.extend(PyMuPDFLoader(file_info["path"]).load())
                elif file_info["path"].endswith(".txt"):
                    docs.extend(TextLoader(file_info["path"]).load())
                elif file_info["path"].endswith(".docx"):
                    docs.extend(Docx2txtLoader(file_info["path"]).load())
            chunks = text_splitter.split_documents(docs)
            keyword_retriever = BM25Retriever.from_documents(chunks)
            keyword_retriever.k = 3
            # Combine retrievers: equal-weight hybrid of dense + keyword search
            ensemble_retriever = EnsembleRetriever(
                retrievers=[vectorstore_retriever, keyword_retriever],
                weights=[0.5, 0.5]
            )
            # Run chatbot with fresh components
            run_chatbot(ensemble_retriever, llm)
        except Exception as e:
            print(f"Chat interface error: {str(e)}")  # Log error internally
            st.warning("Please try uploading your documents again.")
            st.session_state.chat_enabled = False
            # Clear the previous files to force reprocessing on next rerun
            st.session_state.previous_files = set()
            if "messages" in st.session_state:
                del st.session_state.messages
# Entry point — Streamlit re-executes the whole script on every interaction.
process_and_chat()