Spaces:

Ronochieng
/

DocMindAI

Build error

App Files Files Community

Adeptschneider committed on Apr 25, 2025

Commit

18a68e7

1 Parent(s): 3c81ee5

Feat: DocMindAI

Browse files

Files changed (9) hide show

Dockerfile +30 -0
Ingestion/__init__.py +0 -0
Ingestion/__pycache__/__init__.cpython-312.pyc +0 -0
Ingestion/__pycache__/ingest.cpython-312.pyc +0 -0
Ingestion/ingest.py +264 -0
README.md +104 -12
app.py +625 -0
docker-compose.yml +9 -0
requirements.txt +18 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,30 @@

+FROM python:3.10-slim
+WORKDIR /app
+# Install system dependencies.
+# poppler-utils supplies the pdftoppm/pdfinfo binaries that PDF rasterisation
+# relies on, and pandoc is what unstructured uses to convert EPUB/RTF/ODT
+# files; libpoppler-dev on its own only ships headers and the shared library.
+RUN apt-get update && apt-get install -y \
+    build-essential \
+    curl \
+    git \
+    software-properties-common \
+    libpoppler-dev \
+    poppler-utils \
+    pandoc \
+    libmagic1 \
+    tesseract-ocr \
+    libreoffice \
+    && rm -rf /var/lib/apt/lists/*
+# Copy requirements first for better caching
+COPY requirements.txt .
+# Install Python dependencies
+RUN pip install --no-cache-dir -r requirements.txt
+# Copy the rest of the application
+COPY . .
+# Expose the port Streamlit runs on
+EXPOSE 7860
+# Command to run the application
+CMD ["streamlit", "run", "app.py", "--server.port=7860", "--server.address=0.0.0.0"]

Ingestion/__init__.py ADDED Viewed

File without changes

Ingestion/__pycache__/__init__.cpython-312.pyc ADDED Viewed

Binary file (139 Bytes). View file

Ingestion/__pycache__/ingest.cpython-312.pyc ADDED Viewed

Binary file (9.71 kB). View file

Ingestion/ingest.py ADDED Viewed

	@@ -0,0 +1,264 @@

+import os
+import pymupdf4llm
+import pandas as pd
+import tempfile
+from typing import Dict, Any, Optional, List
+# Import unstructured components for different file types
+from unstructured.partition.auto import partition
+from unstructured.partition.pdf import partition_pdf
+from unstructured.partition.docx import partition_docx
+from unstructured.partition.pptx import partition_pptx
+from unstructured.partition.xlsx import partition_xlsx
+from unstructured.partition.md import partition_md
+from unstructured.partition.html import partition_html
+from unstructured.partition.xml import partition_xml
+from unstructured.partition.email import partition_email
+from unstructured.partition.text import partition_text
+from unstructured.partition.epub import partition_epub
+def get_processor_for_file(file_path: str) -> Optional[callable]:
+    """
+    Determine the appropriate processor function for the given file type
+    """
+    file_extension = os.path.splitext(file_path)[1].lower()
+    # Map file extensions to specific partition functions
+    processors = {
+        ".pdf": process_pdf,
+        ".docx": process_docx,
+        ".doc": process_docx,
+        ".pptx": process_pptx,
+        ".ppt": process_pptx,
+        ".xlsx": process_xlsx,
+        ".xls": process_xlsx,
+        ".md": process_markdown,
+        ".html": process_html,
+        ".htm": process_html,
+        ".xml": process_xml,
+        ".msg": process_email,
+        ".eml": process_email,
+        ".epub": process_epub,
+        ".txt": process_text,
+        ".csv": process_text,
+        ".rtf": process_text,
+        # Code files
+        ".py": process_text,
+        ".js": process_text,
+        ".java": process_text,
+        ".ts": process_text,
+        ".tsx": process_text,
+        ".jsx": process_text,
+        ".c": process_text,
+        ".cpp": process_text,
+        ".h": process_text,
+        ".cs": process_text,
+        ".rb": process_text,
+        ".go": process_text,
+        ".rs": process_text,
+        ".php": process_text,
+        ".sql": process_text,
+        ".css": process_text,
+    }
+    return processors.get(file_extension, process_generic)
+def process_document(file_path: str) -> Optional[str]:
+    """
+    Process a document using the appropriate processor based on file type
+    """
+    processor = get_processor_for_file(file_path)
+    if processor:
+        return processor(file_path)
+    return None
+def process_pdf(file_path: str) -> str:
+    """
+    Process PDF documents using unstructured
+    """
+    temp_dir = tempfile.mkdtemp()
+    try:
+        # Try hi_res mode first with OCR capabilities
+        elements = partition_pdf(
+            filename=file_path,
+            strategy="hi_res",
+            extract_images_in_pdf=True,
+            extract_image_block_types=["Image", "Table"],
+            extract_image_block_to_payload=False,
+            extract_image_block_output_dir=temp_dir,
+            hi_res_model_name="yolox",
+            infer_table_structure=True,
+            chunking_strategy="by_title",
+            max_characters=4000,
+            new_after_n_chars=3800,
+            combine_text_under_n_chars=2000,
+        )
+    except Exception as e:
+        # Fall back to fast mode if hi_res fails
+        elements = partition_pdf(
+            filename=file_path,
+            strategy="fast",
+            chunking_strategy="by_title",
+            max_characters=4000,
+            new_after_n_chars=3800,
+            combine_text_under_n_chars=2000,
+        )
+    # Extract text from elements
+    texts = [element.text for element in elements if hasattr(element, 'text') and element.text]
+    combined_text = "\n\n".join(texts)
+    return combined_text
+def process_docx(file_path: str) -> str:
+    """
+    Process DOCX documents using unstructured
+    """
+    elements = partition_docx(
+        filename=file_path,
+        chunking_strategy="by_title",
+        max_characters=4000,
+        new_after_n_chars=3800,
+        combine_text_under_n_chars=2000,
+    )
+    texts = [element.text for element in elements if hasattr(element, 'text') and element.text]
+    combined_text = "\n\n".join(texts)
+    return combined_text
+def process_pptx(file_path: str) -> str:
+    """
+    Process PPTX documents using unstructured
+    """
+    elements = partition_pptx(
+        filename=file_path,
+    )
+    texts = [element.text for element in elements if hasattr(element, 'text') and element.text]
+    combined_text = "\n\n".join(texts)
+    return combined_text
+def process_xlsx(file_path: str) -> str:
+    """
+    Process XLSX documents using unstructured
+    """
+    elements = partition_xlsx(
+        filename=file_path,
+    )
+    texts = [element.text for element in elements if hasattr(element, 'text') and element.text]
+    combined_text = "\n\n".join(texts)
+    return combined_text
+def process_markdown(file_path: str) -> str:
+    """
+    Process Markdown documents using unstructured
+    """
+    elements = partition_md(
+        filename=file_path,
+    )
+    texts = [element.text for element in elements if hasattr(element, 'text') and element.text]
+    combined_text = "\n\n".join(texts)
+    return combined_text
+def process_html(file_path: str) -> str:
+    """
+    Process HTML documents using unstructured
+    """
+    elements = partition_html(
+        filename=file_path,
+    )
+    texts = [element.text for element in elements if hasattr(element, 'text') and element.text]
+    combined_text = "\n\n".join(texts)
+    return combined_text
+def process_xml(file_path: str) -> str:
+    """
+    Process XML documents using unstructured
+    """
+    elements = partition_xml(
+        filename=file_path,
+    )
+    texts = [element.text for element in elements if hasattr(element, 'text') and element.text]
+    combined_text = "\n\n".join(texts)
+    return combined_text
+def process_email(file_path: str) -> str:
+    """
+    Process email documents using unstructured
+    """
+    elements = partition_email(
+        filename=file_path,
+    )
+    texts = [element.text for element in elements if hasattr(element, 'text') and element.text]
+    combined_text = "\n\n".join(texts)
+    return combined_text
+def process_text(file_path: str) -> str:
+    """
+    Process text documents using unstructured
+    """
+    elements = partition_text(
+        filename=file_path,
+        chunking_strategy="by_title",
+        max_characters=4000,
+        new_after_n_chars=3800,
+        combine_text_under_n_chars=2000,
+    )
+    texts = [element.text for element in elements if hasattr(element, 'text') and element.text]
+    combined_text = "\n\n".join(texts)
+    return combined_text
+def process_epub(file_path: str) -> str:
+    """
+    Process EPUB documents using unstructured
+    """
+    elements = partition_epub(
+        filename=file_path,
+    )
+    texts = [element.text for element in elements if hasattr(element, 'text') and element.text]
+    combined_text = "\n\n".join(texts)
+    return combined_text
+def process_generic(file_path: str) -> str:
+    """
+    Generic document processor using unstructured's auto partitioning
+    """
+    try:
+        elements = partition(
+            filename=file_path,
+        )
+        texts = [element.text for element in elements if hasattr(element, 'text') and element.text]
+        combined_text = "\n\n".join(texts)
+        return combined_text
+    except Exception as e:
+        # Fall back to basic text processing if auto-partition fails
+        try:
+            with open(file_path, 'r', encoding='utf-8') as f:
+                return f.read()
+        except Exception:
+            # Try with a different encoding if utf-8 fails
+            try:
+                with open(file_path, 'r', encoding='latin-1') as f:
+                    return f.read()
+            except Exception as e2:
+                raise Exception(f"Could not process file: {str(e)} / {str(e2)}")

README.md CHANGED Viewed

@@ -1,12 +1,104 @@
----
-title: DocMindAI
-emoji: 🌖
-colorFrom: yellow
-colorTo: pink
-sdk: docker
-pinned: false
-license: apache-2.0
-short_description: DocMindAI
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

+# DocMind AI Installation Guide
+This guide will help you set up and run DocMind AI, an open-source LLM-powered document analysis application.
+## Prerequisites
+1. [Python 3.8+](https://www.python.org/downloads/)
+2. [Ollama](https://ollama.com/) - For running local LLMs
+3. (Optional) [Docker](https://www.docker.com/) and [Docker Compose](https://docs.docker.com/compose/install/) for containerized deployment
+## Option 1: Local Installation
+1. **Clone the repository:**
+```bash
+git clone https://huggingface.co/spaces/davisandshirtliff/DocMindAI
+cd DocMindAI
+```
+2. **Create a virtual environment:**
+```bash
+python -m venv venv
+source venv/bin/activate  # On Windows, use: venv\Scripts\activate
+```
+3. **Install dependencies:**
+```bash
+pip install -r requirements.txt
+```
+4. **Run Ollama:**
+Make sure Ollama is installed and running locally. Pull a model to use with the application:
+```bash
+ollama pull gemma3:1b
+```
+5. **Run the application:**
+```bash
+streamlit run app.py
+```
+The application will be accessible at `http://localhost:8501` in your web browser.
+## Option 2: Docker Deployment
+1. **Clone the repository:**
+```bash
+git clone https://huggingface.co/spaces/davisandshirtliff/DocMindAI
+cd DocMindAI
+```
+2. **Run with Docker Compose:**
+Make sure Ollama is running on your host machine, then:
+```bash
+docker-compose up --build
+```
+The application will be accessible at `http://localhost:8501` in your web browser.
+## Usage
+1. Enter your Ollama Base URL (default: `http://localhost:11434`)
+2. Select an Ollama model from the dropdown
+3. Upload documents for analysis
+4. Choose your analysis settings:
+   - Select a prompt type
+   - Choose a tone
+   - Select instructions
+   - Set the desired length/detail
+   - Choose the analysis mode
+5. Click "Extract and Analyze"
+6. Once analysis is complete, you can chat with your documents in the chat interface
+## Supported File Types
+DocMind AI supports a wide range of file formats including:
+- PDF
+- DOCX, DOC
+- TXT
+- XLSX, XLS
+- MD (Markdown)
+- JSON
+- XML
+- RTF
+- CSV
+- MSG, EML (Email)
+- PPTX, PPT (PowerPoint)
+- ODT (OpenDocument Text)
+- EPUB (E-book)
+- Code files (PY, JS, JAVA, TS, TSX, C, CPP, H, and many more)
+## Troubleshooting
+- If you encounter issues connecting to Ollama, make sure it's running and the URL is correct.
+- For Docker deployment, ensure that your Docker configuration allows access to the host network.
+- For document processing issues, check that you have the necessary dependencies installed.

app.py ADDED Viewed

	@@ -0,0 +1,625 @@

+import streamlit as st
+import pandas as pd
+import os
+import tempfile
+from typing import List, Optional, Dict, Any, Union
+import json
+from datetime import datetime
+from llama_cpp import Llama
+from langchain.output_parsers import PydanticOutputParser
+from langchain.prompts import ChatPromptTemplate
+from langchain.schema import HumanMessage, SystemMessage
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain.schema.runnable import RunnablePassthrough
+from langchain.prompts.prompt import PromptTemplate
+from langchain.chains import ConversationalRetrievalChain
+from langchain.chains import LLMChain
+from langchain.memory import ConversationBufferMemory
+from langchain.vectorstores import Chroma
+from pydantic import BaseModel, Field
+from Ingestion.ingest import process_document, get_processor_for_file
+import warnings
+# Silence RuntimeWarning messages for the whole process
+warnings.filterwarnings("ignore", category=RuntimeWarning)
+# Set page configuration (title, icon, wide layout, sidebar expanded on load)
+st.set_page_config(
+    page_title="DocMind AI: AI-Powered Document Analysis",
+    page_icon="🧠",
+    layout="wide",
+    initial_sidebar_state="expanded",
+)
+# Custom CSS for better dark/light mode compatibility.
+# Defines the classes used by the HTML snippets emitted later in this script:
+# .card, .highlight-container, .chat-*, .status-success/.status-error, .doc-list
+st.markdown("""
+<style>
+    /* Common styles for both modes */
+    .stApp {
+        max-width: 1200px;
+        margin: 0 auto;
+    }
+    /* Card styling for results */
+    .card {
+        border-radius: 5px;
+        padding: 1.5rem;
+        margin-bottom: 1rem;
+        border: 1px solid rgba(128, 128, 128, 0.2);
+    }
+    /* Dark mode specific */
+    @media (prefers-color-scheme: dark) {
+        .card {
+            background-color: rgba(255, 255, 255, 0.05);
+        }
+        .highlight-container {
+            background-color: rgba(255, 255, 255, 0.05);
+            border-left: 3px solid #4CAF50;
+        }
+        .chat-user {
+            background-color: rgba(0, 0, 0, 0.2);
+        }
+        .chat-ai {
+            background-color: rgba(76, 175, 80, 0.1);
+        }
+    }
+    /* Light mode specific */
+    @media (prefers-color-scheme: light) {
+        .card {
+            background-color: rgba(0, 0, 0, 0.02);
+        }
+        .highlight-container {
+            background-color: rgba(0, 0, 0, 0.03);
+            border-left: 3px solid #4CAF50;
+        }
+        .chat-user {
+            background-color: rgba(240, 240, 240, 0.7);
+        }
+        .chat-ai {
+            background-color: rgba(76, 175, 80, 0.05);
+        }
+    }
+    /* Chat message styling */
+    .chat-container {
+        margin-bottom: 1rem;
+    }
+    .chat-message {
+        padding: 1rem;
+        border-radius: 5px;
+        margin-bottom: 0.5rem;
+    }
+    /* Highlight sections */
+    .highlight-container {
+        padding: 1rem;
+        margin: 1rem 0;
+        border-radius: 4px;
+    }
+    /* Status indicators */
+    .status-success {
+        color: #4CAF50;
+    }
+    .status-error {
+        color: #F44336;
+    }
+    /* Document list */
+    .doc-list {
+        list-style-type: none;
+        padding-left: 0;
+    }
+    .doc-list li {
+        padding: 0.5rem 0;
+        border-bottom: 1px solid rgba(128, 128, 128, 0.2);
+    }
+</style>
+""", unsafe_allow_html=True)
+# Define the output structures using Pydantic.
+# DocumentAnalysis is the schema handed to PydanticOutputParser: the field
+# descriptions below are what the parser turns into format instructions for
+# the model, and parsed model output is validated against these fields.
+# summary and key_insights are required; action_items and open_questions
+# default to None when the model omits them.
+class DocumentAnalysis(BaseModel):
+    summary: str = Field(description="A concise summary of the document")
+    key_insights: List[str] = Field(description="A list of key insights from the document")
+    action_items: Optional[List[str]] = Field(None, description="A list of action items derived from the document")
+    open_questions: Optional[List[str]] = Field(None, description="A list of open questions or areas needing clarification")
+# Function to clean up LLM responses for better parsing
+def clean_llm_response(response):
+    """Clean up the LLM response to extract JSON content from potential markdown code blocks."""
+    # Extract content from the response
+    if isinstance(response, dict) and 'choices' in response:
+        content = response['choices'][0]['message']['content']
+    else:
+        content = str(response)
+    # Remove markdown code block formatting if present
+    if '```' in content:
+        # Handle ```json format
+        parts = content.split('```')
+        if len(parts) >= 3:  # Has opening and closing backticks
+            # Take the content between first pair of backticks
+            content = parts[1]
+            # Remove json language specifier if present
+            if content.startswith('json') or content.startswith('JSON'):
+                content = content[4:].lstrip()
+    elif '`json' in content:
+        # Handle `json format
+        parts = content.split('`json')
+        if len(parts) >= 2:
+            content = parts[1]
+            if '`' in content:
+                content = content.split('`')[0]
+    # Strip any leading/trailing whitespace
+    content = content.strip()
+    return content
+# Initialize LLM and Model Cache
+@st.cache_resource(experimental_allow_widgets=True)
+def load_model():
+    with st.spinner("Loading model..."):
+        try:
+            llm = Llama.from_pretrained(
+                repo_id="stduhpf/google-gemma-3-1b-it-qat-q4_0-gguf-small",
+                filename="gemma-3-1b-it-q4_0_s.gguf",
+            )
+            return llm
+        except Exception as e:
+            st.error(f"Error loading model: {str(e)}")
+            return None
+# Initialize embeddings - but only when needed to avoid torch inspection issues
+@st.cache_resource(experimental_allow_widgets=True)
+def load_embeddings():
+    from langchain_community.embeddings import HuggingFaceEmbeddings
+    with st.spinner("Loading embeddings..."):
+        embeddings = HuggingFaceEmbeddings(
+            model_name="sentence-transformers/all-MiniLM-L6-v2",
+            model_kwargs={'device': 'cpu'}
+        )
+        return embeddings
+# Sidebar Configuration with improved styling: centered title, tagline, divider
+st.sidebar.markdown("<div style='text-align: center;'><h1>🧠 DocMind AI</h1></div>", unsafe_allow_html=True)
+st.sidebar.markdown("<div style='text-align: center;'>AI-Powered Document Analysis</div>", unsafe_allow_html=True)
+st.sidebar.markdown("---")
+# Load LLM (inside the sidebar context so its status output is rendered there)
+with st.sidebar:
+    llm = load_model()
+    if llm is not None:
+        st.markdown("<div class='status-success'>✅ Model loaded successfully!</div>", unsafe_allow_html=True)
+    else:
+        st.markdown("<div class='status-error'>❌ Error loading model. Check logs for details.</div>", unsafe_allow_html=True)
+        # Nothing below can work without a model, so end this script run here
+        st.stop()
+# Mode Selection: one analysis per document, or a single combined analysis
+with st.sidebar:
+    st.markdown("### Analysis Configuration")
+    analysis_mode = st.radio(
+        "Analysis Mode",
+        ["Analyze each document separately", "Combine analysis for all documents"]
+    )
+# Prompt Selection: display label -> prompt text sent to the model.
+# "Custom Prompt" is a sentinel entry; its text comes from the text area below.
+prompt_options = {
+    "Comprehensive Document Analysis": "Analyze the provided document comprehensively. Generate a summary, extract key insights, identify action items, and list open questions.",
+    "Extract Key Insights and Action Items": "Extract key insights and action items from the provided document.",
+    "Summarize and Identify Open Questions": "Summarize the provided document and identify any open questions that need clarification.",
+    "Custom Prompt": "Enter a custom prompt below:"
+}
+with st.sidebar:
+    st.markdown("### Prompt Settings")
+    selected_prompt_option = st.selectbox("Select Prompt", list(prompt_options.keys()))
+    custom_prompt = ""
+    if selected_prompt_option == "Custom Prompt":
+        custom_prompt = st.text_area("Enter Custom Prompt", height=100)
+# Tone Selection
+tone_options = [
+    "Professional", "Academic", "Informal", "Creative", "Neutral",
+    "Direct", "Empathetic", "Humorous", "Authoritative", "Inquisitive"
+]
+with st.sidebar:
+    selected_tone = st.selectbox("Select Tone", tone_options)
+# Instructions Selection: display label -> persona instruction.
+# "Custom Instructions" is a sentinel entry; its text comes from the text area below.
+instruction_options = {
+    "General Assistant": "Act as a helpful assistant.",
+    "Researcher": "Act as a researcher providing in-depth analysis.",
+    "Software Engineer": "Act as a software engineer focusing on code and technical details.",
+    "Product Manager": "Act as a product manager considering strategy and user experience.",
+    "Data Scientist": "Act as a data scientist emphasizing data analysis.",
+    "Business Analyst": "Act as a business analyst considering strategic aspects.",
+    "Technical Writer": "Act as a technical writer creating clear documentation.",
+    "Marketing Specialist": "Act as a marketing specialist focusing on branding.",
+    "HR Manager": "Act as an HR manager considering people aspects.",
+    "Legal Advisor": "Act as a legal advisor providing legal perspective.",
+    "Custom Instructions": "Enter custom instructions below:"
+}
+with st.sidebar:
+    st.markdown("### Assistant Behavior")
+    selected_instruction = st.selectbox("Select Instructions", list(instruction_options.keys()))
+    custom_instruction = ""
+    if selected_instruction == "Custom Instructions":
+        custom_instruction = st.text_area("Enter Custom Instructions", height=100)
+# Length/Detail Selection
+length_options = ["Concise", "Detailed", "Comprehensive", "Bullet Points"]
+with st.sidebar:
+    st.markdown("### Response Format")
+    selected_length = st.selectbox("Select Length/Detail", length_options)
+# Main Area
+st.markdown("<h1 style='text-align: center;'>📄 DocMind AI: Document Analysis</h1>", unsafe_allow_html=True)
+st.markdown("<p style='text-align: center;'>Upload documents and analyze them using the Gemma model</p>", unsafe_allow_html=True)
+# File Upload with improved UI
+uploaded_files = st.file_uploader(
+    "Upload Documents",
+    accept_multiple_files=True,
+    type=["pdf", "docx", "txt", "xlsx", "md", "json", "xml", "rtf", "csv", "msg", "pptx", "odt", "epub",
+          "py", "js", "java", "ts", "tsx", "c", "cpp", "h", "html", "css", "sql", "rb", "go", "rs", "php"]
+)
+# Display uploaded files with better visual indication
+if uploaded_files:
+    st.markdown("<div class='highlight-container'>", unsafe_allow_html=True)
+    st.markdown("### Uploaded Documents")
+    st.markdown("<ul class='doc-list'>", unsafe_allow_html=True)
+    for file in uploaded_files:
+        st.markdown(f"<li>📄 {file.name}</li>", unsafe_allow_html=True)
+    st.markdown("</ul>", unsafe_allow_html=True)
+    st.markdown("</div>", unsafe_allow_html=True)
+# Function to process the documents and run analysis
+def run_analysis():
+    if not uploaded_files:
+        st.error("Please upload at least one document.")
+        return
+    # Save uploaded files to temporary directory
+    temp_dir = tempfile.mkdtemp()
+    file_paths = []
+    for uploaded_file in uploaded_files:
+        file_path = os.path.join(temp_dir, uploaded_file.name)
+        with open(file_path, "wb") as f:
+            f.write(uploaded_file.getbuffer())
+        file_paths.append(file_path)
+    # Process documents
+    with st.spinner("Processing documents..."):
+        all_texts = []
+        processed_docs = []
+        progress_bar = st.progress(0)
+        for i, file_path in enumerate(file_paths):
+            processor = get_processor_for_file(file_path)
+            if processor:
+                try:
+                    doc_data = process_document(file_path)
+                    if doc_data is not None and len(doc_data.strip()) > 0:  # Ensure we have content
+                        all_texts.append(doc_data)
+                        processed_docs.append({"name": os.path.basename(file_path), "data": doc_data})
+                except Exception as e:
+                    st.error(f"Error processing {os.path.basename(file_path)}: {str(e)}")
+            progress_bar.progress((i + 1) / len(file_paths))
+    if not all_texts:
+        st.error("No documents could be processed. Please check the file formats and try again.")
+        return
+    # Build the prompt
+    if selected_prompt_option == "Custom Prompt":
+        prompt_text = custom_prompt
+    else:
+        prompt_text = prompt_options[selected_prompt_option]
+    if selected_instruction == "Custom Instructions":
+        instruction_text = custom_instruction
+    else:
+        instruction_text = instruction_options[selected_instruction]
+    # Add tone guidance
+    tone_guidance = f"Use a {selected_tone.lower()} tone in your response."
+    # Add length guidance
+    length_guidance = ""
+    if selected_length == "Concise":
+        length_guidance = "Keep your response brief and to the point."
+    elif selected_length == "Detailed":
+        length_guidance = "Provide a detailed response with thorough explanations."
+    elif selected_length == "Comprehensive":
+        length_guidance = "Provide a comprehensive in-depth analysis covering all aspects."
+    elif selected_length == "Bullet Points":
+        length_guidance = "Format your response primarily using bullet points for clarity."
+    # Set up the output parser
+    output_parser = PydanticOutputParser(pydantic_object=DocumentAnalysis)
+    format_instructions = output_parser.get_format_instructions()
+    if analysis_mode == "Analyze each document separately":
+        results = []
+        for doc in processed_docs:
+            with st.spinner(f"Analyzing {doc['name']}..."):
+                # Create system message with combined instructions
+                system_message = f"{instruction_text} {tone_guidance} {length_guidance} Format your response according to these instructions: {format_instructions}"
+                prompt = f"""
+                {prompt_text}
+                Document: {doc['name']}
+                Content: {doc['data']}
+                """
+                # Get response from LLM
+                try:
+                    response = llm.create_chat_completion(
+                        messages = [
+                            {
+                                "role": "system",
+                                "content": system_message
+                            },
+                            {
+                                "role": "user",
+                                "content": prompt
+                            }
+                        ]
+                    )
+                    # Try to parse the response into the pydantic model
+                    try:
+                        # Clean the response before parsing
+                        cleaned_response = clean_llm_response(response)
+                        parsed_response = output_parser.parse(cleaned_response)
+                        results.append({
+                            "document_name": doc['name'],
+                            "analysis": parsed_response.dict()
+                        })
+                    except Exception as e:
+                        # If parsing fails, include the raw response
+                        if isinstance(response, dict) and 'choices' in response:
+                            raw_response = response['choices'][0]['message']['content']
+                        else:
+                            raw_response = str(response)
+                        results.append({
+                            "document_name": doc['name'],
+                            "analysis": raw_response,
+                            "parsing_error": str(e)
+                        })
+                except Exception as e:
+                    st.error(f"Error analyzing {doc['name']}: {str(e)}")
+        # Display results with card-based UI
+        for result in results:
+            st.markdown(f"<div class='card'>", unsafe_allow_html=True)
+            st.markdown(f"<h3>Analysis for: {result['document_name']}</h3>", unsafe_allow_html=True)
+            if isinstance(result['analysis'], dict) and 'parsing_error' not in result:
+                # Structured output
+                st.markdown("<div class='highlight-container'>", unsafe_allow_html=True)
+                st.markdown("### Summary")
+                st.write(result['analysis']['summary'])
+                st.markdown("</div>", unsafe_allow_html=True)
+                st.markdown("### Key Insights")
+                for insight in result['analysis']['key_insights']:
+                    st.markdown(f"- {insight}")
+                if result['analysis'].get('action_items'):
+                    st.markdown("<div class='highlight-container'>", unsafe_allow_html=True)
+                    st.markdown("### Action Items")
+                    for item in result['analysis']['action_items']:
+                        st.markdown(f"- {item}")
+                    st.markdown("</div>", unsafe_allow_html=True)
+                if result['analysis'].get('open_questions'):
+                    st.markdown("### Open Questions")
+                    for question in result['analysis']['open_questions']:
+                        st.markdown(f"- {question}")
+            else:
+                # Raw output
+                st.markdown(result['analysis'])
+                if 'parsing_error' in result:
+                    st.info(f"Note: The response could not be parsed into the expected format. Error: {result['parsing_error']}")
+            st.markdown("</div>", unsafe_allow_html=True)
+    else:
+        with st.spinner("Analyzing all documents together..."):
+            # Combine all documents
+            combined_content = "\n\n".join([f"Document: {doc['name']}\n\nContent: {doc['data']}" for doc in processed_docs])
+            # Create system message with combined instructions
+            system_message = f"{instruction_text} {tone_guidance} {length_guidance} Format your response according to these instructions: {format_instructions}"
+            # Create the prompt template for HuggingFace models
+            prompt = f"""
+            {prompt_text}
+            {combined_content}
+            """
+            # Get response from LLM
+            try:
+                response = llm.create_chat_completion(
+                    messages = [
+                        {
+                            "role": "system",
+                            "content": system_message
+                        },
+                        {
+                            "role": "user",
+                            "content": prompt
+                        }
+                    ]
+                )
+                # Try to parse the response into the pydantic model
+                try:
+                    # Clean the response before parsing
+                    cleaned_response = clean_llm_response(response)
+                    parsed_response = output_parser.parse(cleaned_response)
+                    st.markdown("<div class='card'>", unsafe_allow_html=True)
+                    st.markdown("<h3>Combined Analysis for All Documents</h3>", unsafe_allow_html=True)
+                    st.markdown("<div class='highlight-container'>", unsafe_allow_html=True)
+                    st.markdown("### Summary")
+                    st.write(parsed_response.summary)
+                    st.markdown("</div>", unsafe_allow_html=True)
+                    st.markdown("### Key Insights")
+                    for insight in parsed_response.key_insights:
+                        st.markdown(f"- {insight}")
+                    if parsed_response.action_items:
+                        st.markdown("<div class='highlight-container'>", unsafe_allow_html=True)
+                        st.markdown("### Action Items")
+                        for item in parsed_response.action_items:
+                            st.markdown(f"- {item}")
+                        st.markdown("</div>", unsafe_allow_html=True)
+                    if parsed_response.open_questions:
+                        st.markdown("### Open Questions")
+                        for question in parsed_response.open_questions:
+                            st.markdown(f"- {question}")
+                    st.markdown("</div>", unsafe_allow_html=True)
+                except Exception as e:
+                    # If parsing fails, return the raw response
+                    st.markdown("<div class='card'>", unsafe_allow_html=True)
+                    st.markdown("<h3>Combined Analysis for All Documents</h3>", unsafe_allow_html=True)
+                    # Get raw content from response
+                    if isinstance(response, dict) and 'choices' in response:
+                        raw_response = response['choices'][0]['message']['content']
+                    else:
+                        raw_response = str(response)
+                    st.markdown(raw_response)
+                    st.info(f"Note: The response could not be parsed into the expected format. Error: {str(e)}")
+                    st.markdown("</div>", unsafe_allow_html=True)
+            except Exception as e:
+                st.error(f"Error analyzing documents: {str(e)}")
+    # Create text chunks for embeddings
+    with st.spinner("Setting up document chat..."):
+        try:
+            text_splitter = RecursiveCharacterTextSplitter(
+                chunk_size=1000,
+                chunk_overlap=200
+            )
+            all_chunks = []
+            for doc in processed_docs:
+                if doc['data'] and len(doc['data'].strip()) > 0:  # Verify data exists and is not empty
+                    chunks = text_splitter.split_text(doc['data'])
+                    all_chunks.extend(chunks)
+            # Only create embeddings if we have chunks
+            if all_chunks and len(all_chunks) > 0:
+                # Load embeddings
+                embeddings = load_embeddings()
+                # Build the Chroma collection under a fixed name (no persist_directory is passed); the creation time is recorded as collection metadata
+                vectorstore = Chroma.from_texts(
+                    texts=all_chunks,
+                    embedding=embeddings,
+                    collection_name="docmind_collection",
+                    collection_metadata={"timestamp": datetime.now().isoformat()}
+                )
+                retriever = vectorstore.as_retriever()
+                # Set up conversation memory
+                memory = ConversationBufferMemory(
+                    memory_key="chat_history",
+                    return_messages=True
+                )
+                # Create conversational chain
+                qa_chain = ConversationalRetrievalChain.from_llm(
+                    llm=llm,
+                    retriever=retriever,
+                    memory=memory
+                )
+                st.session_state['qa_chain'] = qa_chain
+                st.session_state['chat_history'] = []
+                st.success("Document chat is ready! Ask questions about your documents below.")
+            else:
+                st.warning("No text chunks were created from the documents. Chat functionality is unavailable.")
+        except Exception as e:
+            st.error(f"Error setting up document chat: {str(e)}")
+            # For debugging purposes
+            st.exception(e)
+# Initialize chat history
+if 'chat_history' not in st.session_state:
+    st.session_state['chat_history'] = []
+# Chat Interface with improved styling
+st.markdown("---")
+st.markdown("<h2 style='text-align: center;'>💬 Chat with your Documents</h2>", unsafe_allow_html=True)
+st.markdown("<p style='text-align: center;'>Ask follow-up questions about the analyzed documents.</p>", unsafe_allow_html=True)
+# Process the analysis if button is clicked
+col1, col2, col3 = st.columns([1, 2, 1])
+with col2:
+    if st.button("Extract and Analyze", use_container_width=True):
+        run_analysis()
+# Chat input and display
+if 'qa_chain' in st.session_state:
+    st.markdown("<div class='card'>", unsafe_allow_html=True)
+    user_question = st.text_input("Ask a question about your documents:")
+    if user_question:
+        with st.spinner("Generating response..."):
+            try:
+                response = st.session_state['qa_chain'].invoke({"question": user_question})
+                st.session_state['chat_history'].append({"question": user_question, "answer": response['answer']})
+            except Exception as e:
+                st.error(f"Error generating response: {str(e)}")
+    # Display chat history with improved styling
+    for exchange in st.session_state['chat_history']:
+        st.markdown("<div class='chat-container'>", unsafe_allow_html=True)
+        st.markdown(f"<div class='chat-message chat-user'><strong>You:</strong> {exchange['question']}</div>", unsafe_allow_html=True)
+        st.markdown(f"<div class='chat-message chat-ai'><strong>DocMind AI:</strong> {exchange['answer']}</div>", unsafe_allow_html=True)
+        st.markdown("</div>", unsafe_allow_html=True)
+    st.markdown("</div>", unsafe_allow_html=True)
+# Footer
+st.markdown("---")
+st.markdown(
+    """
+    <div style="text-align: center">
+    <p>Built with ❤️ using Streamlit, LangChain, and Gemma model</p>
+    <p>DocMind AI - AI-Powered Document Analysis</p>
+    </div>
+    """,
+    unsafe_allow_html=True
+)

docker-compose.yml ADDED Viewed

	@@ -0,0 +1,9 @@

+version: '3'
+services:
+  docmind:
+    # Build the image from the Dockerfile in this directory.
+    build: .
+    ports:
+      # HOST:CONTAINER. The container side must be 7860 because the Dockerfile
+      # starts Streamlit with --server.port=7860; the host side stays 8501 so
+      # the app is still reached at http://localhost:8501.
+      - "8501:7860"
+    volumes:
+      # Bind-mount the project directory over /app (the image's WORKDIR).
+      - .:/app

requirements.txt ADDED Viewed

	@@ -0,0 +1,18 @@

+streamlit
+pydantic
+langchain
+langchain-community
+unstructured
+unstructured-inference
+pdf2image
+pytesseract
+pandas
+chromadb
+sentence-transformers
+python-docx
+pymupdf4llm
+llama-cpp-python
+lxml
+python-pptx
+pdfminer.six
+pillow