MOHITRAJDEO12345 committed
Commit · b3f1583
0 Parent(s)

Fresh start: Clean repository without binary files
- .gitattributes +35 -0
- .gitignore +164 -0
- .streamlit/config.toml +2 -0
- Dockerfile +21 -0
- README.md +20 -0
- requirements.txt +9 -0
- src/ingestor.py +102 -0
- src/pipeline.py +124 -0
- src/streamlit_app.py +126 -0
.gitattributes
ADDED
@@ -0,0 +1,35 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,164 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# ChromaDB vector database
+data/
+chroma/
+*.db
+*.sqlite3
+
+# Streamlit
+.streamlit/secrets.toml
+.streamlit/config.toml.backup
+
+# IDEs
+.vscode/
+.idea/
+*.swp
+*.swo
+*~
+
+# OS generated files
+.DS_Store
+.DS_Store?
+._*
+.Spotlight-V100
+.Trashes
+ehthumbs.db
+Thumbs.db
+
+# Logs
+*.log
+logs/
+
+# Temporary files
+*.tmp
+*.temp
+.cache/
.streamlit/config.toml
ADDED
@@ -0,0 +1,2 @@
+# Streamlit configuration
+# Data directory will be set automatically
Dockerfile
ADDED
@@ -0,0 +1,21 @@
+FROM python:3.13.5-slim
+
+WORKDIR /app
+
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    curl \
+    git \
+    && rm -rf /var/lib/apt/lists/*
+
+COPY requirements.txt ./
+COPY src/ ./src/
+COPY .streamlit/ .streamlit/
+
+RUN pip3 install -r requirements.txt
+
+EXPOSE 8501
+
+HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
+
+ENTRYPOINT ["streamlit", "run", "src/streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]
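
The HEALTHCHECK above polls Streamlit's built-in `/_stcore/health` endpoint with curl. As a rough equivalent for local debugging, here is a minimal Python sketch of the same probe; the script name and the localhost:8501 address are assumptions matching the EXPOSE line above, not part of this commit:

```python
# health_probe.py: hypothetical helper, not part of this commit.
# Probes the same endpoint the Dockerfile HEALTHCHECK uses.
import urllib.request

def is_healthy(url: str = "http://localhost:8501/_stcore/health") -> bool:
    """Return True if the Streamlit health endpoint answers HTTP 200."""
    try:
        with urllib.request.urlopen(url, timeout=5) as resp:
            return resp.status == 200
    except OSError:
        return False

if __name__ == "__main__":
    print("healthy" if is_healthy() else "unhealthy")
```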
README.md
ADDED
@@ -0,0 +1,20 @@
+---
+title: DocuMind
+emoji: π
+colorFrom: red
+colorTo: red
+sdk: docker
+app_port: 8501
+tags:
+- streamlit
+pinned: false
+short_description: The DocuMind system, as outlined and implemented in this rep
+license: mit
+---
+
+# Welcome to Streamlit!
+
+Edit `/src/streamlit_app.py` to customize this app to your heart's desire. :heart:
+
+If you have any questions, check out our [documentation](https://docs.streamlit.io) and [community
+forums](https://discuss.streamlit.io).
requirements.txt
ADDED
@@ -0,0 +1,9 @@
+streamlit
+langchain-core
+langchain
+langchain-community
+langchain-google-genai
+chromadb
+pypdf
+pymupdf
+python-dotenv
src/ingestor.py
ADDED
@@ -0,0 +1,102 @@
+import os
+import streamlit as st
+import fitz  # PyMuPDF
+from typing import List
+from langchain_google_genai import GoogleGenerativeAIEmbeddings
+import asyncio
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_community.vectorstores import Chroma
+from langchain_core.documents import Document
+import hashlib
+import json
+
+class Ingestor:
+    def __init__(self, api_key: str):
+        self.api_key = api_key
+        # Ensure an event loop is available for GoogleGenerativeAIEmbeddings
+        try:
+            asyncio.get_running_loop()
+        except RuntimeError:
+            asyncio.set_event_loop(asyncio.new_event_loop())
+
+        # Initialize the embedding model
+        self.embeddings = GoogleGenerativeAIEmbeddings(
+            model="models/embedding-001",
+            google_api_key=self.api_key,
+        )
+
+    def load_and_chunk_pdfs(self, file_paths: List[str]) -> List:
+        """Loads PDFs and splits them into chunks with metadata."""
+        all_chunks = []
+        text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=1000,
+            chunk_overlap=100,
+            separators=["\n\n", "\n", " ", ""],
+            length_function=len
+        )
+
+        for file_path in file_paths:
+            try:
+                # Use PyMuPDF to open and extract text from the PDF
+                doc = fitz.open(file_path)
+
+                # Extract text page by page with metadata
+                for page_num, page in enumerate(doc):
+                    text = page.get_text()
+
+                    # Create LangChain Document object with metadata
+                    langchain_doc = Document(
+                        page_content=text,
+                        metadata={
+                            "source": os.path.basename(file_path),
+                            "page": page_num + 1,
+                        }
+                    )
+
+                    # Split the page text into chunks
+                    chunks = text_splitter.split_documents([langchain_doc])
+                    all_chunks.extend(chunks)
+
+                doc.close()
+
+            except Exception as e:
+                print(f"Error processing {file_path}: {e}")
+
+        return all_chunks
+
+    def ingest_documents(self, file_paths: List[str]):
+        """Ingests documents, creates embeddings, and initializes a ChromaDB vector store."""
+
+        # Check if vector store cache exists, and load if it does.
+        # The cache key is a hash of the file names, so it is unique per set of docs
+        # (basenames, not full paths, so temp-directory paths don't break the cache)
+        cache_key = hashlib.sha256(json.dumps(sorted(os.path.basename(p) for p in file_paths)).encode()).hexdigest()
+
+        # Persist under a directory keyed by the document set, so a new
+        # set of uploads never reuses a stale cached store
+        persist_directory = f"./data/db/{cache_key[:16]}"
+
+        # Check if the vector store has been created and cached before
+        if os.path.exists(persist_directory):
+            print("Loading existing vector store from cache...")
+            vector_store = Chroma(
+                persist_directory=persist_directory,
+                embedding_function=self.embeddings,
+            )
+            # A simple check to ensure the vector store is not empty
+            if vector_store.get()['documents']:
+                return vector_store
+
+        print("Creating new vector store from documents...")
+        # Load and chunk documents
+        chunks = self.load_and_chunk_pdfs(file_paths)
+        if not chunks:
+            raise ValueError("No valid document chunks could be created.")
+
+        # Create the ChromaDB vector store from the chunks and embeddings
+        vector_store = Chroma.from_documents(
+            documents=chunks,
+            embedding=self.embeddings,
+            persist_directory=persist_directory,
+        )
+        # Persist the vector store to disk
+        vector_store.persist()
+        return vector_store
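
Each chunk produced by `load_and_chunk_pdfs` carries `source` and `page` metadata that the pipeline later cites. A quick way to inspect what the splitter produces, assuming a `GEMINI_API_KEY` environment variable is set, `src/` is on the import path, and the PDF path is illustrative:

```python
# Hypothetical inspection snippet, not part of this commit.
import os
from ingestor import Ingestor  # assumes src/ is on sys.path

ingestor = Ingestor(api_key=os.environ["GEMINI_API_KEY"])
chunks = ingestor.load_and_chunk_pdfs(["report.pdf"])  # illustrative path
for chunk in chunks[:3]:
    # Print the citation metadata and chunk size for a sanity check
    print(chunk.metadata["source"], chunk.metadata["page"], len(chunk.page_content))
```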
src/pipeline.py
ADDED
@@ -0,0 +1,124 @@
+from langchain_google_genai import ChatGoogleGenerativeAI
+from langchain.chains import RetrievalQAWithSourcesChain
+from langchain_community.vectorstores import Chroma
+from langchain_core.prompts import PromptTemplate
+from langchain_core.documents import Document
+from typing import List
+
+class RAGPipeline:
+    def __init__(self, vector_store: Chroma, api_key: str):
+        self.vector_store = vector_store
+        self.llm = ChatGoogleGenerativeAI(
+            model="gemini-2.0-flash",
+            google_api_key=api_key,
+            temperature=0.2,
+        )
+        self.retriever = self.vector_store.as_retriever(
+            search_type="similarity",
+            search_kwargs={"k": 5}
+        )
+
+        # Define the prompt template for the LLM
+        # This template instructs the model to answer based on the provided context
+        # and to include source citations.
+        template = """
+You are a helpful assistant. Use the following context to answer the question at the end.
+If you don't know the answer, just say that you don't know, don't try to make up an answer.
+
+Context:
+{context}
+
+Question:
+{question}
+
+Instructions:
+1. Provide a detailed and accurate answer based ONLY on the provided context.
+2. When referencing information, mention which source and page it comes from.
+3. If the context doesn't contain enough information, say so clearly.
+4. Keep your answer concise but comprehensive.
+
+Answer:
+"""
+        self.prompt = PromptTemplate(
+            template=template,
+            input_variables=["context", "question"]
+        )
+
+    def format_documents_with_citations(self, documents: List) -> str:
+        """
+        Formats the retrieved documents into a single string, including metadata for citations.
+        """
+        formatted_text = []
+        for i, doc in enumerate(documents, 1):
+            content = doc.page_content
+            source = doc.metadata.get("source", "unknown")
+            page = doc.metadata.get("page", "unknown")
+            formatted_text.append(f"Source {i}:\nFile: {source}\nPage: {page}\nContent:\n{content}\n")
+        return "\n---\n".join(formatted_text)
+
+    def get_source_info_with_scores(self, documents: List) -> str:
+        """
+        Gets source information with confidence scores for the retrieved documents.
+        """
+        source_info = []
+        for i, doc in enumerate(documents, 1):
+            source = doc.metadata.get("source", "unknown")
+            page = doc.metadata.get("page", "unknown")
+
+            # Calculate confidence score based on multiple factors:
+            # 1. Retrieval order (higher for top results)
+            # 2. Content length (longer content might be more relevant)
+            # 3. Position in document (earlier pages might be more important)
+            base_score = 1.0 - (i - 1) * 0.15  # Order factor
+            length_factor = min(1.0, len(doc.page_content) / 1000)  # Length factor
+            page_factor = max(0.8, 1.0 - (page - 1) * 0.05) if isinstance(page, int) else 1.0
+
+            confidence_score = base_score * length_factor * page_factor
+            confidence_score = max(0.1, min(1.0, confidence_score))  # Clamp between 0.1 and 1.0
+            confidence_percent = int(confidence_score * 100)
+
+            # Determine confidence level
+            if confidence_percent >= 90:
+                level = "Very High"
+            elif confidence_percent >= 75:
+                level = "High"
+            elif confidence_percent >= 60:
+                level = "Medium"
+            elif confidence_percent >= 40:
+                level = "Low"
+            else:
+                level = "Very Low"
+
+            source_info.append(f"• **Source {i}**: {source}")
+            source_info.append(f"  - **Page**: {page}")
+            source_info.append(f"  - **Confidence**: {confidence_percent}% ({level})")
+            source_info.append(f"  - **Content Preview**: {doc.page_content[:200]}...")
+
+        return "\n".join(source_info)
+
+    def answer_question(self, question: str) -> str:
+        """
+        Executes the RAG pipeline: retrieves documents and generates a response.
+        """
+        # Step 1: Retrieve relevant documents with scores
+        retrieved_docs = self.retriever.get_relevant_documents(question)
+
+        if not retrieved_docs:
+            return "I am sorry, I could not find any relevant information in the documents to answer your question."
+
+        # Step 2: Format the retrieved documents for the prompt
+        formatted_context = self.format_documents_with_citations(retrieved_docs)
+
+        # Step 3: Create the final prompt
+        final_prompt = self.prompt.format(context=formatted_context, question=question)
+
+        # Step 4: Call the LLM to generate the answer
+        response = self.llm.invoke(final_prompt).content
+
+        # Step 5: Add source information and confidence scores to the response
+        source_info = self.get_source_info_with_scores(retrieved_docs)
+
+        # Combine the response with source information
+        full_response = f"{response}\n\n**Sources and Context:**\n{source_info}"
+
+        return full_response
src/streamlit_app.py
ADDED
@@ -0,0 +1,126 @@
+import asyncio
+import altair as alt
+import numpy as np
+import pandas as pd
+
+import streamlit as st
+
+from dotenv import load_dotenv
+import os
+from ingestor import Ingestor
+from pipeline import RAGPipeline
+import tempfile
+
+# Set the event loop policy for Windows (if available)
+try:
+    asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
+except AttributeError:
+    # WindowsSelectorEventLoopPolicy not available, use default
+    pass
+
+# 1. Set up the Streamlit page configuration and title
+st.set_page_config(page_title="π DocuMind: Your Document AI", page_icon="π")
+st.title("π DocuMind: Document QA with Gemini")
+
+# 2. Add a sidebar for API key and instructions
+with st.sidebar:
+    st.header("Configuration")
+    st.info("To get started, please upload your PDF document(s).")
+    gemini_api_key = st.text_input("Gemini API Key", type="password")
+
+# Check for API key and load from .env if available
+if not gemini_api_key:
+    load_dotenv()
+    gemini_api_key = os.getenv("GEMINI_API_KEY")
+
+if not gemini_api_key:
+    st.warning("Please enter a valid Gemini API key!")
+    st.stop()
+
+# Store API key in session state for reuse
+st.session_state["gemini_api_key"] = gemini_api_key
+
+# 3. Handle file uploads
+uploaded_files = st.file_uploader(
+    "Upload your PDF documents",
+    type="pdf",
+    accept_multiple_files=True,
+)
+
+# Use st.session_state to handle RAG state persistence across reruns
+if "rag_pipeline" not in st.session_state:
+    st.session_state["rag_pipeline"] = None
+    st.session_state["ingested_docs"] = []
+
+# 4. Ingest documents and set up the RAG pipeline
+if uploaded_files and st.session_state["rag_pipeline"] is None:
+    with st.spinner("Processing documents... This may take a moment."):
+        # Create a temporary directory to save uploaded files
+        with tempfile.TemporaryDirectory() as temp_dir:
+            file_paths = []
+            for uploaded_file in uploaded_files:
+                file_path = os.path.join(temp_dir, uploaded_file.name)
+                with open(file_path, "wb") as f:
+                    f.write(uploaded_file.getbuffer())
+                file_paths.append(file_path)
+
+            try:
+                # Ingest documents and create the ChromaDB vector store
+                ingestor = Ingestor(api_key=gemini_api_key)
+                vector_store = ingestor.ingest_documents(file_paths)
+
+                # Initialize the RAG pipeline with the vector store
+                st.session_state["rag_pipeline"] = RAGPipeline(
+                    vector_store=vector_store,
+                    api_key=gemini_api_key,
+                )
+
+                # Store the names of the ingested documents for display
+                st.session_state["ingested_docs"] = [f.name for f in uploaded_files]
+
+                st.success("Documents processed successfully!")
+
+            except Exception as e:
+                st.error(f"An error occurred during document ingestion: {e}")
+                st.session_state["rag_pipeline"] = None
+
+# 5. Display a list of ingested documents
+if st.session_state["ingested_docs"]:
+    with st.expander("Documents in Knowledge Base"):
+        st.write("The following documents have been successfully ingested:")
+        for doc_name in st.session_state["ingested_docs"]:
+            st.markdown(f"- {doc_name}")
+
+# 6. Set up the chat interface
+if "messages" not in st.session_state:
+    st.session_state.messages = []
+
+# Display chat messages from history
+for message in st.session_state.messages:
+    with st.chat_message(message["role"]):
+        st.markdown(message["content"])
+
+# Process user question if RAG pipeline is ready
+if st.session_state["rag_pipeline"]:
+    question = st.chat_input("Ask a question about the documents...")
+
+    if question:
+        # Display user message
+        with st.chat_message("user"):
+            st.markdown(question)
+        st.session_state.messages.append({"role": "user", "content": question})
+
+        with st.chat_message("assistant"):
+            with st.spinner("Thinking..."):
+                try:
+                    # Get the answer from the RAG pipeline
+                    response = st.session_state["rag_pipeline"].answer_question(question)
+
+                    # Display the response using st.markdown
+                    st.markdown(response)
+
+                    # Add assistant response to chat history
+                    st.session_state.messages.append({"role": "assistant", "content": response})
+
+                except Exception as e:
+                    st.error(f"An error occurred during response generation: {e}")
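
For debugging outside the Streamlit UI, the two classes can be wired together directly. A minimal end-to-end sketch, assuming `GEMINI_API_KEY` is set, `src/` is on the import path, and the PDF path is illustrative:

```python
# Hypothetical smoke test, not part of this commit.
import os
from ingestor import Ingestor    # assumes src/ is on sys.path
from pipeline import RAGPipeline

api_key = os.environ["GEMINI_API_KEY"]
# Ingest one PDF and build the QA pipeline over the resulting store
store = Ingestor(api_key=api_key).ingest_documents(["sample.pdf"])  # illustrative path
rag = RAGPipeline(vector_store=store, api_key=api_key)
print(rag.answer_question("What is this document about?"))
```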