Create app.py
app.py
ADDED
@@ -0,0 +1,325 @@
import os
import time
import shutil

import streamlit as st
from dotenv import load_dotenv
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_community.document_loaders import TextLoader, PyMuPDFLoader, Docx2txtLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone, ServerlessSpec
from pinecone import PineconeApiException, NotFoundException

load_dotenv()
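
# Environment variables read by this app (typically supplied via a .env file):
#   PINECONE_API_KEY  - Pinecone client authentication
#   VECTORDB_NAME     - name of the Pinecone index to (re)create
#   GROQ_API_KEY      - ChatGroq LLM authentication
#   OPENAI_API_KEY    - read implicitly by OpenAIEmbeddings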

# Set page configuration
st.set_page_config(page_title="Document Analyzer", layout="wide")

st.title("📚 Document Analyzer")

# Add instructions in an expander
with st.expander("ℹ️ Click here to view instructions"):
    st.markdown("""
- Upload files by clicking "Browse files".
- Avoid interrupting while files are being processed; this stops execution and you will have to refresh the page to restart the app.
- You can add more files at any time; just avoid adding or removing files while the uploaded documents are being processed.
- Processing is triggered whenever the set of uploaded files changes.
""")

# Initialize session state flags
if 'initialized' not in st.session_state:
    st.session_state.initialized = False
if 'processing' not in st.session_state:
    st.session_state.processing = False
if 'last_processed_files' not in st.session_state:
    st.session_state.last_processed_files = set()
if 'chat_history' not in st.session_state:
    st.session_state.chat_history = []
if 'chat_enabled' not in st.session_state:
    st.session_state.chat_enabled = False

if not st.session_state.initialized:
    # Clear everything only on first run or page refresh
    if os.path.exists("data"):
        shutil.rmtree("data")
    os.makedirs("data")
    st.session_state.uploaded_files = {}
    st.session_state.previous_files = set()
    st.session_state.vectorstore = None
    st.session_state.retriever = None
    st.session_state.initialized = True

def save_uploaded_file(uploaded_file):
    """Save an uploaded file to the data directory and return its path."""
    try:
        file_path = os.path.join("data", uploaded_file.name)

        # Write the raw bytes to disk
        with open(file_path, "wb") as f:
            f.write(uploaded_file.getvalue())

        # Verify the file was saved
        if os.path.exists(file_path):
            return file_path

        st.error(f"File not saved: {file_path}")
        return None

    except Exception as e:
        st.error(f"Error saving file: {str(e)}")
        return None

def process_documents(uploaded_files_dict):
    """Load, chunk, embed, and store the uploaded documents in Pinecone."""
    warning_placeholder = st.empty()
    warning_placeholder.warning("⚠️ Document processing in progress. Please wait before adding or removing files.")
    success_placeholder = st.empty()

    try:
        with st.spinner('Processing documents...'):
            docs = []
            # Load each file with a loader matching its extension
            for filename, file_info in uploaded_files_dict.items():
                file_path = file_info["path"]

                if not os.path.exists(file_path):
                    st.error(f"File not found: {file_path}")
                    continue

                if filename.endswith(".pdf"):
                    docs.extend(PyMuPDFLoader(file_path).load())
                elif filename.endswith(".txt"):
                    docs.extend(TextLoader(file_path).load())
                elif filename.endswith(".docx"):
                    docs.extend(Docx2txtLoader(file_path).load())

            if not docs:
                st.error("No documents were successfully processed")
                return False

            # Split documents into overlapping chunks
            text_splitter = RecursiveCharacterTextSplitter(
                chunk_size=2000,
                chunk_overlap=400,
                length_function=len
            )
            chunks = text_splitter.split_documents(docs)

            # Initialize embeddings; 512 dimensions to match the Pinecone index below
            embed_func = OpenAIEmbeddings(model='text-embedding-3-small', dimensions=512)

            # Initialize Pinecone
            pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
            index_name = os.getenv("VECTORDB_NAME")

            try:
                # Recreate the index from scratch on every processing run
                if index_name in pc.list_indexes().names():
                    pc.delete_index(index_name)

                pc.create_index(
                    name=index_name,
                    dimension=512,
                    metric='cosine',
                    spec=ServerlessSpec(cloud='aws', region='us-east-1')
                )

                # Wait for the index to be ready
                while not pc.describe_index(index_name).status['ready']:
                    time.sleep(1)

                pc_index = pc.Index(index_name)

                # Create the vectorstore and add the chunked documents
                vectorstore = PineconeVectorStore(index=pc_index, embedding=embed_func)
                vectorstore.add_documents(documents=chunks)

                st.session_state.chat_enabled = True
                success_placeholder.success('Documents processed successfully!')
                time.sleep(2)  # Show the success message briefly
                success_placeholder.empty()
                return True

            except PineconeApiException:
                st.error("File upload failed! Avoid interrupting document processing by uploading or removing files. Please refresh the app to continue.")
                st.session_state.chat_enabled = False
                return False

    except Exception as e:
        st.error(f"An error occurred during processing: {str(e)}")
        st.session_state.chat_enabled = False
        return False
    finally:
        warning_placeholder.empty()

def doc2str(docs):
    """Join a list of document strings into a single context string."""
    return "\n\n".join(docs)

def format_reranked_docs(pc, retriever, question):
    """Retrieve candidate chunks, then rerank them with Pinecone's reranking model."""
    relevant_docs = [doc.page_content for doc in retriever.invoke(question) if len(doc.page_content) > 5]
    if not relevant_docs:
        return ""  # Nothing retrieved above the similarity threshold

    reranked_docs = pc.inference.rerank(
        model="pinecone-rerank-v0",
        query=question,
        documents=relevant_docs,
        top_n=3,
        return_documents=True
    )

    final_docs = [d.document.text for d in reranked_docs.data]
    return doc2str(final_docs)

def run_chatbot(retriever, pc, llm):
    """Run the chatbot with the given retriever, Pinecone client, and LLM."""
    # Prompt that injects the reranked context ahead of the user's question
    prompt = ChatPromptTemplate.from_template("""
You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know.

<context>
{context}
</context>

Important: You cannot quote the context in the responses. If you do that, there will be a strict penalty for it.

Answer the following question:

{question}""")

    # Create the QA chain with reranking
    qa_chain = (
        RunnablePassthrough.assign(context=lambda input: format_reranked_docs(pc, retriever, input["question"]))
        | prompt
        | llm
        | StrOutputParser()
    )

    # Initialize the message history if it does not exist yet
    if "messages" not in st.session_state:
        st.session_state.messages = []

    # Replay the chat history
    for message in st.session_state.messages:
        with st.chat_message(message["role"]):
            st.markdown(message["content"])

    # Chat input
    if question := st.chat_input("Ask a question about your documents"):
        # Add the user message to the chat history
        st.session_state.messages.append({"role": "user", "content": question})
        with st.chat_message("user"):
            st.markdown(question)

        # Generate the response outside the chat message so the spinner stays visible
        with st.spinner("Thinking..."):
            try:
                response = qa_chain.invoke({"question": question})

                # Display the response and record it in the history
                with st.chat_message("assistant"):
                    st.markdown(response)
                st.session_state.messages.append({"role": "assistant", "content": response})
            except Exception as e:
                error_msg = f"An error occurred while processing your question: {str(e)}"
                with st.chat_message("assistant"):
                    st.error(error_msg)
                st.session_state.messages.append({"role": "assistant", "content": f"❌ {error_msg}"})

def process_and_chat():
    """Handle file uploads, document processing, and the chat interface."""
    # File uploader section
    with st.container():
        uploaded_files = st.file_uploader(
            "Upload your documents",
            type=["pdf", "txt", "docx"],
            accept_multiple_files=True,
            key="file_uploader",
            label_visibility="collapsed" if st.session_state.processing else "visible"
        )

    # Save any newly uploaded files
    if uploaded_files:
        for file in uploaded_files:
            # Only handle files that haven't been uploaded before
            if file.name not in st.session_state.uploaded_files:
                file_path = save_uploaded_file(file)
                if file_path:  # Only track the file if it was saved successfully
                    st.session_state.uploaded_files[file.name] = {
                        "path": file_path,
                        "type": file.type
                    }

    # Reprocess only when the set of files has changed
    current_files = set(st.session_state.uploaded_files.keys())
    if current_files != st.session_state.previous_files:
        st.session_state.previous_files = current_files

        if current_files:
            st.session_state.processing = True
            # Process documents and enable chat if successful
            if process_documents(st.session_state.uploaded_files):
                st.session_state.chat_enabled = True
            st.session_state.processing = False
        else:
            st.warning('Please upload a file to continue')
            st.session_state.chat_enabled = False

    # If files exist and chat is enabled, show the chat interface
    if current_files and st.session_state.chat_enabled:
        try:
            # Initialize fresh components for the chat
            llm = ChatGroq(temperature=0, model_name="mixtral-8x7b-32768", groq_api_key=os.getenv("GROQ_API_KEY"))
            pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
            index_name = os.getenv("VECTORDB_NAME")
            pc_index = pc.Index(index_name)

            # Recreate the vectorstore handle
            embed_func = OpenAIEmbeddings(model='text-embedding-3-small', dimensions=512)
            vectorstore = PineconeVectorStore(index=pc_index, embedding=embed_func)

            # Retriever that keeps only chunks above a similarity threshold
            vectorstore_retriever = vectorstore.as_retriever(
                search_type="similarity_score_threshold",
                search_kwargs={"k": 5, "score_threshold": 0.6},
            )

            run_chatbot(vectorstore_retriever, pc, llm)
        except NotFoundException:
            st.error("Vector database not found. Please try uploading your documents again.")
            st.session_state.chat_enabled = False
            # Clear the previous files to force reprocessing
            st.session_state.previous_files = set()

# Run the app
process_and_chat()
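
For readers tracing the retrieval logic, here is a minimal standalone sketch of the retrieve-then-rerank data flow that format_reranked_docs implements. StubDoc, StubRetriever, and stub_rerank are hypothetical stand-ins for the LangChain retriever and Pinecone's rerank call, so the flow can be run without any API keys:

# Hypothetical stand-ins; not part of app.py.
class StubDoc:
    def __init__(self, page_content):
        self.page_content = page_content

class StubRetriever:
    def invoke(self, question):
        # A real retriever returns chunks scored against the question.
        return [StubDoc("Refunds are issued within 14 days."),
                StubDoc("Invoices are sent monthly."),
                StubDoc("ok")]  # too short, filtered out below

def stub_rerank(query, documents, top_n):
    # Pinecone's reranker reorders documents by relevance; truncating to
    # top_n here mimics only the shape of the result, not the scoring.
    return documents[:top_n]

retriever = StubRetriever()
question = "How are refunds handled?"
relevant = [d.page_content for d in retriever.invoke(question) if len(d.page_content) > 5]
context = "\n\n".join(stub_rerank(question, relevant, top_n=3))
print(context)

With credentials for OpenAI, Groq, and Pinecone in place, the app itself is started the usual Streamlit way: streamlit run app.py.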