Upload 3 files
- README_app.md +28 -0
- app.py +663 -0
- requirements.txt +20 -0
README_app.md
ADDED
@@ -0,0 +1,28 @@
# LangGraph Document Q&A Assistant

This repository showcases a Document Question & Answering (Q&A) Assistant built using [LangGraph](https://gritholdings.gitbook.io/docs/langgraph) and the [DeepSeek-V3](https://huggingface.co/deepseek-ai/DeepSeek-V3) model. The assistant allows users to upload documents and receive AI-generated answers to their queries based on the content of those documents.

## Features

- **Document Upload**: Users can upload various document formats for analysis.
- **Intelligent Q&A**: Utilizes the DeepSeek-V3 model to provide accurate answers based on the uploaded document's content.
- **Scalable Architecture**: Built with LangGraph to ensure modularity and scalability.

## Getting Started

Follow these instructions to set up and run the project locally.

### Prerequisites

- Python 3.8 or higher
- [LangGraph](https://gritholdings.gitbook.io/docs/langgraph)
- [DeepSeek-V3 model weights](https://huggingface.co/deepseek-ai/DeepSeek-V3)

### Installation

1. **Clone the Repository**:

   ```bash
   git clone https://huggingface.co/pragatheeswaran/langgraph-document-qa-assistant
   cd langgraph-document-qa-assistant
   ```
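The README stops after the clone step. Based on the app.py and requirements.txt added in this same commit, the remaining setup is presumably: install the Python dependencies with `pip install -r requirements.txt`, provide a Together API key in a `.env` file (app.py calls `load_dotenv()` and the `Together` LLM wrapper reads `TOGETHER_API_KEY`), and launch the interface with `streamlit run app.py`. These steps are inferred from the code, not spelled out in the README itself.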
app.py
ADDED
@@ -0,0 +1,663 @@
import os
import tempfile
import streamlit as st
from PIL import Image
import pytesseract
from pdf2image import convert_from_path
import pypdf
from dotenv import load_dotenv
import time

from langchain_core.messages import HumanMessage, AIMessage, SystemMessage
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.output_parsers import StrOutputParser
from langchain_together import Together
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings

import langgraph
from langgraph.graph import END
from typing import List, Dict, Any, TypedDict, Optional

# Load environment variables
load_dotenv()

# Set page configuration
st.set_page_config(
    page_title="Document Q&A",
    page_icon="📚",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Custom CSS for better UI
st.markdown("""
<style>
/* Base styles */
.main {
    background-color: #f8fafc;
    color: #333;
    padding: 1rem;
}

/* Sidebar styling */
[data-testid="stSidebar"] {
    background-color: #1e293b;
    color: #f8fafc;
    padding: 1rem;
}

/* Example questions */
.example-button {
    background-color: #7c3aed;
    color: white;
    border: none;
    border-radius: 0.5rem;
    padding: 0.75rem 1rem;
    margin-bottom: 0.75rem;
    cursor: pointer;
    text-align: left;
    display: block;
    width: 100%;
    font-size: 0.9rem;
}

/* Chat container */
.chat-container {
    min-height: 60vh;
    overflow-y: auto;
    padding: 1rem;
    background-color: white;
    border-radius: 0.5rem;
    border: 1px solid #e2e8f0;
    margin-bottom: 1rem;
}

/* Sidebar title */
.sidebar-title {
    color: #f8fafc;
    font-size: 1.2rem;
    font-weight: 600;
    margin-bottom: 1rem;
    padding-bottom: 0.5rem;
    border-bottom: 1px solid #475569;
}

/* File list */
.file-item {
    padding: 0.5rem;
    background-color: #334155;
    border-radius: 0.25rem;
    margin-bottom: 0.5rem;
    color: #f8fafc;
}
.file-name {
    font-weight: 500;
}
.file-type {
    font-size: 0.75rem;
    color: #cbd5e1;
}

/* Instructions */
.instructions {
    color: #cbd5e1;
}
.instructions ol {
    margin-left: 1.5rem;
    padding-left: 0;
}
.instructions li {
    margin-bottom: 0.5rem;
}

/* Divider */
.divider {
    height: 1px;
    background-color: #475569;
    margin: 1.5rem 0;
}

/* Override Streamlit button styles */
.stButton > button {
    background-color: #7c3aed;
    color: white;
}

/* Override Streamlit file uploader */
.stFileUploader > div > div {
    background-color: #334155;
    color: #f8fafc;
    border: 1px dashed #7c3aed;
    border-radius: 0.5rem;
    padding: 1rem;
}

/* Controls section */
.controls-section {
    margin-top: 1rem;
}

/* Control buttons */
.control-button {
    background-color: #7c3aed;
    color: white;
    border: none;
    border-radius: 0.25rem;
    padding: 0.5rem 1rem;
    margin-right: 0.5rem;
    margin-bottom: 0.5rem;
    cursor: pointer;
}

/* How to use section */
.how-to-use {
    margin-bottom: 1.5rem;
}
.how-to-use ol {
    margin-left: 1.5rem;
    padding-left: 0;
}
.how-to-use li {
    margin-bottom: 0.5rem;
    color: #f8fafc;
}

/* Input field */
.stTextInput > div > div > input {
    border: 1px solid #e2e8f0;
    border-radius: 0.5rem;
    padding: 0.75rem;
    font-size: 1rem;
}

/* Form styling */
[data-testid="stForm"] {
    border: none;
    padding: 0;
}

/* Hide Streamlit branding */
#MainMenu {visibility: hidden;}
footer {visibility: hidden;}

/* Chat messages */
.user-message {
    background-color: #f3f4f6;
    padding: 0.75rem;
    border-radius: 0.5rem;
    margin-bottom: 0.75rem;
    color: #1e293b;
}

.assistant-message {
    background-color: #f8fafc;
    padding: 0.75rem;
    border-radius: 0.5rem;
    margin-bottom: 0.75rem;
    border: 1px solid #e2e8f0;
    color: #1e293b;
}

/* Chat input container */
.chat-input-container {
    display: flex;
    align-items: center;
    background-color: white;
    border-radius: 0.5rem;
    padding: 0.5rem;
    border: 1px solid #e2e8f0;
}

/* Document status */
.document-status {
    padding: 0.5rem;
    border-radius: 0.5rem;
    margin-top: 0.5rem;
    font-size: 0.9rem;
}

.status-success {
    background-color: #dcfce7;
    color: #166534;
}

.status-waiting {
    background-color: #f3f4f6;
    color: #4b5563;
}

/* Tabs styling */
.stTabs [data-baseweb="tab-list"] {
    gap: 8px;
}

.stTabs [data-baseweb="tab"] {
    background-color: #f1f5f9;
    border-radius: 4px 4px 0 0;
    padding: 8px 16px;
    height: auto;
}

.stTabs [aria-selected="true"] {
    background-color: white !important;
    border-bottom: 2px solid #7c3aed !important;
}

/* Sidebar section headers */
.sidebar-section-header {
    color: #f8fafc;
    font-size: 1rem;
    font-weight: 600;
    margin-top: 1rem;
    margin-bottom: 0.5rem;
}

/* Sidebar file uploader label */
.sidebar-uploader-label {
    color: #f8fafc;
    font-size: 0.9rem;
    margin-bottom: 0.5rem;
}
</style>
""", unsafe_allow_html=True)

# Example questions
EXAMPLE_QUESTIONS = [
    "How do the different topics in these documents relate to each other?",
    "What is the structure of this document?",
    "Can you analyze the writing style of this text?",
    "Extract all dates and events mentioned in the document",
    "What are the main arguments presented in this document?"
]

# Initialize the LLM
@st.cache_resource
def get_llm():
    return Together(
        model="deepseek-ai/DeepSeek-V3",
        temperature=0.7,
        max_tokens=1024
    )

# Initialize embeddings
@st.cache_resource
def get_embeddings():
    return HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

# Initialize text splitter
@st.cache_resource
def get_text_splitter():
    return RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200
    )

# Function to extract text from PDF
def extract_text_from_pdf(pdf_file):
    pdf_reader = pypdf.PdfReader(pdf_file)
    text = ""
    for page in pdf_reader.pages:
        text += page.extract_text() or ""
    return text

# Function to extract text from image using OCR
def extract_text_from_image(image_file):
    image = Image.open(image_file)
    text = pytesseract.image_to_string(image)
    return text

# Function to process PDF with OCR if needed
def process_pdf_with_ocr(pdf_file):
    # First try normal text extraction
    text = extract_text_from_pdf(pdf_file)

    # If little or no text was extracted, try OCR
    if len(text.strip()) < 100:
        images = convert_from_path(pdf_file)
        text = ""
        for image in images:
            text += pytesseract.image_to_string(image)

    return text

# Function to process uploaded files
def process_uploaded_files(uploaded_files):
    all_texts = []
    file_info = []

    for file in uploaded_files:
        # Create a temporary file
        with tempfile.NamedTemporaryFile(delete=False) as temp_file:
            temp_file.write(file.getvalue())
            temp_file_path = temp_file.name

        # Process based on file type
        if file.name.lower().endswith('.pdf'):
            text = process_pdf_with_ocr(temp_file_path)
            file_type = "PDF"
        elif file.name.lower().endswith(('.png', '.jpg', '.jpeg')):
            text = extract_text_from_image(temp_file_path)
            file_type = "Image"
        elif file.name.lower().endswith(('.txt', '.md')):
            text = file.getvalue().decode('utf-8')
            file_type = "Text"
        else:
            text = f"Unsupported file format: {file.name}"
            file_type = "Unknown"

        all_texts.append(f"--- Content from {file.name} ---\n{text}")
        file_info.append({"name": file.name, "type": file_type})

        # Clean up the temporary file
        os.unlink(temp_file_path)

    return "\n\n".join(all_texts), file_info

# Function to create vector store from text
def create_vectorstore(text):
    text_splitter = get_text_splitter()
    chunks = text_splitter.split_text(text)

    # Use FAISS instead of Chroma to avoid SQLite dependency
    return FAISS.from_texts(
        texts=chunks,
        embedding=get_embeddings()
    )

# Define the state schema for the graph using TypedDict
class GraphState(TypedDict):
    messages: List
    documents: List
    thinking: str

# Define the RAG agent using LangGraph
def create_rag_agent(vectorstore):
    # Define the retrieval component
    def retrieve(state: GraphState) -> GraphState:
        query = state["messages"][-1].content
        docs = vectorstore.similarity_search(query, k=5)
        return {"documents": docs, "messages": state["messages"], "thinking": state.get("thinking", "")}

    # Define the generation component with thinking step
    def generate(state: GraphState) -> GraphState:
        messages = state["messages"]
        documents = state["documents"]

        # Extract relevant context from documents
        context = "\n\n".join([f"Document {i+1}:\n{doc.page_content}" for i, doc in enumerate(documents)])

        # First, have the model think about the query
        thinking_prompt = ChatPromptTemplate.from_messages([
            SystemMessage(content="You are an assistant that thinks step by step before answering."),
            MessagesPlaceholder(variable_name="messages"),
            SystemMessage(content=f"Here is relevant context from the knowledge base:\n{context}\n\nThink step by step about how to answer the query using this context.")
        ])

        thinking = thinking_prompt | get_llm() | StrOutputParser()
        thinking_result = thinking.invoke({"messages": messages})

        # Then generate the final answer
        answer_prompt = ChatPromptTemplate.from_messages([
            SystemMessage(content="You are a helpful assistant that provides accurate information based on the given context."),
            MessagesPlaceholder(variable_name="messages"),
            SystemMessage(content=f"Here is relevant context from the knowledge base:\n{context}\n\nHere is your thinking process:\n{thinking_result}\n\nNow provide a clear and helpful answer based on this context and thinking.")
        ])

        answer = answer_prompt | get_llm() | StrOutputParser()
        response = answer.invoke({"messages": messages})

        return {
            "messages": messages + [AIMessage(content=response)],
            "thinking": thinking_result,
            "documents": documents
        }

    # Create the graph
    from langgraph.graph import StateGraph
    workflow = StateGraph(GraphState)

    workflow.add_node("retrieve", retrieve)
    workflow.add_node("generate", generate)

    workflow.set_entry_point("retrieve")
    workflow.add_edge("retrieve", "generate")
    workflow.add_edge("generate", END)

    # Compile the graph
    app = workflow.compile()

    return app

# Function to clear all session state
def clear_session_state():
    for key in list(st.session_state.keys()):
        del st.session_state[key]

# Main app layout
def main():
    # Initialize session state for showing examples
    if "show_examples" not in st.session_state:
        st.session_state.show_examples = True

    # Initialize messages if not exists
    if "messages" not in st.session_state:
        st.session_state.messages = []

    # Initialize thinking history if not exists
    if "thinking_history" not in st.session_state:
        st.session_state.thinking_history = []

    # Sidebar for document upload and controls
    with st.sidebar:
        st.markdown('<div class="sidebar-title">📚 Document Q&A</div>', unsafe_allow_html=True)

        st.markdown("""
        <div class="how-to-use">
        <ol>
            <li>Upload your documents using the form below</li>
            <li>Process the documents</li>
            <li>Ask questions about your documents</li>
            <li>View the AI's answers and thinking process</li>
        </ol>
        </div>
        """, unsafe_allow_html=True)

        # Document upload section
        st.markdown('<div class="sidebar-section-header">📄 Upload Documents</div>', unsafe_allow_html=True)
        st.markdown('<div class="sidebar-uploader-label">Select files to upload:</div>', unsafe_allow_html=True)

        # File uploader
        uploaded_files = st.file_uploader("Upload documents",
                                          type=["pdf", "txt", "png", "jpg", "jpeg"],
                                          accept_multiple_files=True,
                                          label_visibility="collapsed")

        # Process button
        if uploaded_files:
            if st.button("Process Documents"):
                with st.spinner("Processing documents..."):
                    # Process progress bar
                    progress_bar = st.progress(0)
                    for i in range(100):
                        time.sleep(0.01)
                        progress_bar.progress(i + 1)

                    # Process the files
                    text, file_info = process_uploaded_files(uploaded_files)
                    st.session_state.vectorstore = create_vectorstore(text)
                    st.session_state.documents_processed = True
                    st.session_state.file_info = file_info

                    # Display success message
                    st.success(f"✅ Processed {len(uploaded_files)} documents successfully!")

        # Document info section
        if "file_info" in st.session_state and st.session_state.file_info:
            st.markdown('<div class="divider"></div>', unsafe_allow_html=True)
            st.markdown('<div class="sidebar-section-header">📋 Document Information</div>', unsafe_allow_html=True)

            # Display file list
            for i, file in enumerate(st.session_state.file_info):
                st.markdown(f"""
                <div class="file-item">
                    <div class="file-name">{file['name']}</div>
                    <div class="file-type">{file['type']} file</div>
                </div>
                """, unsafe_allow_html=True)

            # Remove documents button
            if st.button("Remove All Documents"):
                if "vectorstore" in st.session_state:
                    del st.session_state.vectorstore
                if "file_info" in st.session_state:
                    del st.session_state.file_info
                if "documents_processed" in st.session_state:
                    del st.session_state.documents_processed
                st.success("All documents removed!")
                st.rerun()

        # Controls section
        st.markdown('<div class="divider"></div>', unsafe_allow_html=True)
        st.markdown('<div class="sidebar-section-header">⚙️ Controls</div>', unsafe_allow_html=True)

        # Clear chat button
        if st.button("Clear Chat"):
            if "messages" in st.session_state:
                st.session_state.messages = []
            if "thinking_history" in st.session_state:
                st.session_state.thinking_history = []
            st.rerun()

        # Reset all button
        if st.button("Reset All"):
            clear_session_state()
            st.rerun()

        # Hide/Show examples button
        if st.button("Hide Examples" if st.session_state.show_examples else "Show Examples"):
            st.session_state.show_examples = not st.session_state.show_examples
            st.rerun()

    # Main content area
    st.title("Document Q&A Assistant")

    # Example questions section - only show if flag is True
    if st.session_state.show_examples:
        st.markdown("### Example Questions")
        cols = st.columns(len(EXAMPLE_QUESTIONS))
        for i, question in enumerate(EXAMPLE_QUESTIONS):
            with cols[i]:
                if st.button(question, key=f"example_{hash(question)}"):
                    st.session_state.messages.append(HumanMessage(content=question))

                    # Generate response if vectorstore exists
                    if "vectorstore" in st.session_state:
                        with st.spinner("Thinking..."):
                            # Create RAG agent
                            rag_agent = create_rag_agent(st.session_state.vectorstore)

                            # Run the agent
                            result = rag_agent.invoke({
                                "messages": [HumanMessage(content=question)],
                                "documents": [],
                                "thinking": ""
                            })

                            # Store thinking process
                            st.session_state.thinking_history.append(result["thinking"])

                            # Add AI message to chat history
                            st.session_state.messages.append(result["messages"][-1])
                    else:
                        # Add AI message to chat history
                        st.session_state.messages.append(AIMessage(content="Please upload and process documents first."))
                    st.rerun()

    # Chat container
    st.markdown("### 💬 Chat")
    chat_container = st.container()

    with chat_container:
        # Display chat messages
        if st.session_state.messages:
            for i, message in enumerate(st.session_state.messages):
                if isinstance(message, HumanMessage):
                    st.markdown(f"""
                    <div class="user-message">
                        <strong>User:</strong> {message.content}
                    </div>
                    """, unsafe_allow_html=True)
                else:
                    st.markdown(f"""
                    <div class="assistant-message">
                        <strong>Assistant:</strong> {message.content}
                    </div>
                    """, unsafe_allow_html=True)

                    # Show thinking process if available
                    if "thinking_history" in st.session_state and i//2 < len(st.session_state.thinking_history):
                        thinking = st.session_state.thinking_history[i//2]

                        # Create a unique key for this thinking process
                        thinking_key = f"thinking_{i//2}"

                        # Store the visibility state in session_state if not already there
                        if thinking_key not in st.session_state:
                            st.session_state[thinking_key] = False

                        # Toggle button for thinking process
                        toggle_text = "Show thinking" if not st.session_state[thinking_key] else "Hide thinking"

                        # Create the toggle button
                        if st.button(toggle_text, key=f"toggle_{thinking_key}"):
                            st.session_state[thinking_key] = not st.session_state[thinking_key]
                            st.rerun()

                        # Show thinking process if toggled on
                        if st.session_state[thinking_key]:
                            with st.expander("Thinking Process", expanded=True):
                                st.write(thinking)
        else:
            st.info("Upload documents and start asking questions!")

    # Chat input
    st.markdown("### Ask a question about your documents")
    with st.form(key="chat_form", clear_on_submit=True):
        user_input = st.text_input("Type your question here...", key="user_question", label_visibility="collapsed")
        cols = st.columns([6, 1])
        with cols[0]:
            submit_button = st.form_submit_button("Ask", use_container_width=True)

    if submit_button and user_input:
        # Add user message to chat history
        st.session_state.messages.append(HumanMessage(content=user_input))

        # Generate response if vectorstore exists
        if "vectorstore" in st.session_state:
            with st.spinner("Thinking..."):
                # Create RAG agent
                rag_agent = create_rag_agent(st.session_state.vectorstore)

                # Run the agent
                result = rag_agent.invoke({
                    "messages": [HumanMessage(content=user_input)],
                    "documents": [],
                    "thinking": ""
                })

                # Store thinking process
                st.session_state.thinking_history.append(result["thinking"])

                # Add AI message to chat history
                st.session_state.messages.append(result["messages"][-1])
        else:
            # Add AI message to chat history
            st.session_state.messages.append(AIMessage(content="Please upload and process documents first."))

        # Rerun to update the UI
        st.rerun()

if __name__ == "__main__":
    main()
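The `retrieve → generate` graph built in `create_rag_agent()` is the core of this app; the rest is Streamlit UI. For orientation, here is a rough self-contained sketch of that same pattern outside Streamlit. The sample text, question, `k`, and `max_tokens` values are placeholders, and it assumes `TOGETHER_API_KEY` is set and that `faiss-cpu` and `sentence-transformers` are installed alongside the packages in requirements.txt.

```python
# Standalone sketch of the retrieve -> generate graph that create_rag_agent() builds.
# Placeholder document/question; not part of the committed app.
from typing import List, TypedDict

from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.messages import AIMessage, HumanMessage
from langchain_together import Together
from langgraph.graph import END, StateGraph


class State(TypedDict):
    messages: List
    documents: List


# Tiny in-memory index standing in for the uploaded-document vector store.
store = FAISS.from_texts(
    ["LangGraph composes LLM calls into a graph of nodes that share state."],
    embedding=HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2"),
)
llm = Together(model="deepseek-ai/DeepSeek-V3", max_tokens=256)


def retrieve(state: State) -> State:
    # Look up chunks relevant to the latest user message.
    docs = store.similarity_search(state["messages"][-1].content, k=2)
    return {"messages": state["messages"], "documents": docs}


def generate(state: State) -> State:
    # Answer using only the retrieved context.
    context = "\n".join(doc.page_content for doc in state["documents"])
    answer = llm.invoke(
        f"Answer from this context:\n{context}\n\nQuestion: {state['messages'][-1].content}"
    )
    return {"messages": state["messages"] + [AIMessage(content=answer)], "documents": state["documents"]}


graph = StateGraph(State)
graph.add_node("retrieve", retrieve)
graph.add_node("generate", generate)
graph.set_entry_point("retrieve")
graph.add_edge("retrieve", "generate")
graph.add_edge("generate", END)
rag = graph.compile()

result = rag.invoke({"messages": [HumanMessage(content="What does LangGraph do?")], "documents": []})
print(result["messages"][-1].content)
```

The sketch keeps only the two-node pipeline; the committed app additionally threads a `thinking` field through the state so the UI can expose the intermediate reasoning pass.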
requirements.txt
ADDED
@@ -0,0 +1,20 @@
langchain>=0.1.0
langchain-community>=0.0.13
langchain-together>=0.0.2
langchain-core>=0.1.10
langchain-text-splitters>=0.0.1
langchain-openai>=0.0.2
langchain-chroma>=0.0.1
langchain-experimental>=0.0.37
langchain-groq>=0.1.1
langsmith>=0.0.69
chromadb>=0.4.22
pydantic>=2.5.2
streamlit>=1.29.0
streamlit-chat>=0.1.1
python-dotenv>=1.0.0
pypdf>=3.17.1
pillow>=10.1.0
pytesseract>=0.3.10
pdf2image>=1.16.3
langgraph>=0.0.19
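Note: app.py builds its index with langchain_community's FAISS wrapper and embeds text with HuggingFaceEmbeddings, which at runtime also need the faiss library and sentence-transformers; neither is pinned above, so the app would likely fail on first use until something like `faiss-cpu>=1.7.4` and `sentence-transformers>=2.2.2` (versions are suggestions, assuming a CPU-only host) is added to requirements.txt. pytesseract and pdf2image additionally expect the system-level Tesseract and Poppler binaries to be installed.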