Ryanfafa committed on
Commit 477ca04 · verified · 1 parent: 1e4325c

Upload 7 files

Files changed (7)
  1. Dockerfile +45 -0
  2. README (1).md +113 -0
  3. app.py +339 -0
  4. data_downloader.py +293 -0
  5. packages.txt +3 -0
  6. rag_engine.py +200 -0
  7. requirements.txt +31 -0
Dockerfile ADDED
@@ -0,0 +1,45 @@
+ FROM python:3.10-slim
+
+ # System dependencies
+ RUN apt-get update && apt-get install -y \
+     build-essential \
+     curl \
+     git \
+     libgl1 \
+     libglib2.0-0 \
+     poppler-utils \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Create non-root user (required by HuggingFace Spaces)
+ RUN useradd -m -u 1000 appuser
+
+ WORKDIR /app
+
+ # Copy and install Python dependencies
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir --upgrade pip && \
+     pip install --no-cache-dir -r requirements.txt
+
+ # Copy app files
+ COPY --chown=appuser:appuser . .
+
+ # Create writable directories for ChromaDB and sample docs
+ RUN mkdir -p /app/chroma_db /app/sample_docs && \
+     chown -R appuser:appuser /app/chroma_db /app/sample_docs
+
+ USER appuser
+
+ # Expose Streamlit port
+ EXPOSE 7860
+
+ # Health check
+ HEALTHCHECK CMD curl --fail http://localhost:7860/_stcore/health || exit 1
+
+ # Run Streamlit on port 7860 (required by HuggingFace Spaces)
+ CMD ["streamlit", "run", "app.py", \
+     "--server.port=7860", \
+     "--server.address=0.0.0.0", \
+     "--server.headless=true", \
+     "--server.enableCORS=false", \
+     "--server.enableXsrfProtection=false", \
+     "--browser.gatherUsageStats=false"]
README (1).md ADDED
@@ -0,0 +1,113 @@
+ ---
+ title: DocMind AI – RAG Document Q&A
+ emoji: 🧠
+ colorFrom: purple
+ colorTo: indigo
+ sdk: docker
+ app_port: 7860
+ pinned: true
+ license: mit
+ short_description: Chat with any PDF using RAG + ChromaDB
+ ---
+
+ # 🧠 DocMind AI — RAG-Powered Document Q&A
+
+ > Upload any PDF or text document and ask questions — answers are grounded in your content using Retrieval-Augmented Generation.
+
+ ## 🚀 Live Demo
+
+ Upload a PDF or TXT, or click **"Load Sample: AI Report"** to instantly demo with a preloaded AI research document.
+
+ ---
+
+ ## 🏗️ Architecture
+
+ ```
+ User Query
+      │
+      ▼
+ ┌─────────────────────────────────────────┐
+ │           RETRIEVAL PIPELINE            │
+ │                                         │
+ │  Document → Chunking → Embedding        │
+ │  (RecursiveCharacterSplitter)           │
+ │  (all-MiniLM-L6-v2, 384 dims)           │
+ │                   │                     │
+ │                   ▼                     │
+ │                ChromaDB                 │
+ │     (local vector store, MMR)           │
+ │                   │                     │
+ │          Top-4 relevant chunks          │
+ └─────────────────────────────────────────┘
+                     │
+                     ▼
+ ┌─────────────────────────────────────────┐
+ │          GENERATION PIPELINE            │
+ │                                         │
+ │  Context + Question → Prompt Template   │
+ │                   │                     │
+ │      HuggingFace Inference API          │
+ │          (zephyr-7b-beta)               │
+ │                   │                     │
+ │              Final Answer               │
+ └─────────────────────────────────────────┘
+ ```
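The retrieval stage above uses MMR. As a rough illustration (plain Python with toy 2-D vectors, not the ChromaDB implementation), Maximal Marginal Relevance greedily picks chunks that are relevant to the query but dissimilar to chunks already selected:

```python
def cosine(a, b):
    # Cosine similarity between two equal-length vectors.
    dot = sum(x * y for x, y in zip(a, b))
    na = sum(x * x for x in a) ** 0.5
    nb = sum(x * x for x in b) ** 0.5
    return dot / (na * nb)

def mmr(query, docs, k=2, lam=0.7):
    """Greedy MMR: score = lam * relevance - (1 - lam) * redundancy."""
    selected, candidates = [], list(range(len(docs)))
    while candidates and len(selected) < k:
        def score(i):
            rel = cosine(query, docs[i])
            red = max((cosine(docs[i], docs[j]) for j in selected), default=0.0)
            return lam * rel - (1 - lam) * red
        best = max(candidates, key=score)
        selected.append(best)
        candidates.remove(best)
    return selected

# Docs 0 and 1 are near-duplicates; with diversity weighted heavily,
# MMR skips the duplicate and picks the less relevant but distinct doc 2.
docs = [[1.0, 0.0], [0.99, 0.1], [0.6, 0.8]]
print(mmr([1.0, 0.0], docs, k=2, lam=0.3))  # → [0, 2]
```

With `lam` closer to 1 the same call degenerates to plain top-k relevance, which is the trade-off the MMR retriever tunes.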
+
+ ## 🛠️ Tech Stack
+
+ | Component | Technology |
+ |-----------|-----------|
+ | **Framework** | LangChain 0.2 |
+ | **Vector DB** | ChromaDB |
+ | **Embeddings** | sentence-transformers/all-MiniLM-L6-v2 |
+ | **LLM** | HuggingFaceH4/zephyr-7b-beta |
+ | **UI** | Streamlit |
+ | **Deployment** | HuggingFace Spaces |
+
+ ## ⚙️ Key RAG Concepts Demonstrated
+
+ - **Recursive Character Splitting** — smart chunking with 800-character windows and 150-character overlap
+ - **Dense Embeddings** — semantic vector representations, not keyword matching
+ - **MMR Retrieval** — Maximal Marginal Relevance reduces redundancy among retrieved chunks
+ - **Prompt Engineering** — structured system/user/assistant prompt for grounded answers
+ - **Source Attribution** — every answer shows which document chunks were used
+
75
+ ## 🔧 Local Setup
76
+
77
+ ```bash
78
+ git clone https://huggingface.co/spaces/YOUR_USERNAME/docmind-ai
79
+ cd docmind-ai
80
+ pip install -r requirements.txt
81
+ streamlit run app.py
82
+ ```
83
+
84
+ Optional — add a HuggingFace token for higher API rate limits:
85
+ ```bash
86
+ export HF_TOKEN=hf_your_token_here
87
+ ```
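On the Python side the exported token would typically be read from the environment and turned into a Bearer header; a hedged sketch (the exact variable name the app reads is an assumption here):

```python
import os

def auth_headers(token):
    """Build HF Inference API Bearer headers; an empty dict means anonymous access."""
    return {"Authorization": f"Bearer {token}"} if token else {}

# HF_TOKEN is the variable exported in the snippet above (assumed name).
hf_token = os.environ.get("HF_TOKEN")
print(auth_headers(hf_token) if hf_token else "anonymous access")
```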
+
+ ## 📁 Project Structure
+
+ ```
+ docmind-ai/
+ ├── app.py              # Streamlit UI
+ ├── rag_engine.py       # Core RAG pipeline (embed, store, retrieve, generate)
+ ├── data_downloader.py  # Auto-downloads sample documents
+ ├── requirements.txt    # Dependencies
+ └── README.md           # This file
+ ```
+
+ ## 💡 How It Works
+
+ 1. **Upload** a PDF or TXT file (or use the sample)
+ 2. The app **splits** the document into overlapping chunks
+ 3. Each chunk is **embedded** into a 384-dimensional vector
+ 4. Vectors are **stored** in ChromaDB (local vector database)
+ 5. Your question is **embedded** and matched against stored vectors via MMR
+ 6. The top-4 relevant chunks are **retrieved**
+ 7. Chunks + question are sent to **Zephyr-7B** via HuggingFace Inference API
+ 8. A grounded **answer** is returned with source attribution
+
111
+ ---
112
+
113
+ *Built as a portfolio project demonstrating end-to-end RAG engineering.*
app.py ADDED
@@ -0,0 +1,339 @@
+ import streamlit as st
+ import os
+ import time
+ import hashlib
+ from pathlib import Path
+
+ # ─── Page Config ───────────────────────────────────────────────────────────────
+ st.set_page_config(
+     page_title="DocMind AI – RAG Document Q&A",
+     page_icon="🧠",
+     layout="wide",
+     initial_sidebar_state="expanded",
+ )
+
+ # ─── Custom CSS ────────────────────────────────────────────────────────────────
+ st.markdown("""
+ <style>
+ @import url('https://fonts.googleapis.com/css2?family=Syne:wght@400;600;700;800&family=DM+Sans:wght@300;400;500&display=swap');
+
+ html, body, [class*="css"] {
+     font-family: 'DM Sans', sans-serif;
+ }
+
+ .stApp {
+     background: #0f0f13;
+     color: #e8e8f0;
+ }
+
+ /* Sidebar */
+ [data-testid="stSidebar"] {
+     background: #16161d !important;
+     border-right: 1px solid #2a2a3a;
+ }
+
+ /* Hero header */
+ .hero-title {
+     font-family: 'Syne', sans-serif;
+     font-size: 2.8rem;
+     font-weight: 800;
+     background: linear-gradient(135deg, #7c6af7 0%, #a78bfa 40%, #38bdf8 100%);
+     -webkit-background-clip: text;
+     -webkit-text-fill-color: transparent;
+     background-clip: text;
+     line-height: 1.1;
+     margin-bottom: 0.2rem;
+ }
+
+ .hero-sub {
+     color: #6b6b8a;
+     font-size: 1rem;
+     font-weight: 300;
+     letter-spacing: 0.04em;
+     margin-bottom: 2rem;
+ }
+
+ /* Stat cards */
+ .stat-card {
+     background: #1c1c26;
+     border: 1px solid #2a2a3a;
+     border-radius: 12px;
+     padding: 1rem 1.2rem;
+     text-align: center;
+ }
+ .stat-number {
+     font-family: 'Syne', sans-serif;
+     font-size: 1.6rem;
+     font-weight: 700;
+     color: #a78bfa;
+ }
+ .stat-label {
+     font-size: 0.75rem;
+     color: #6b6b8a;
+     text-transform: uppercase;
+     letter-spacing: 0.08em;
+ }
+
+ /* Chat messages */
+ .chat-user {
+     background: #1e1e2e;
+     border: 1px solid #2a2a3a;
+     border-radius: 12px 12px 4px 12px;
+     padding: 0.9rem 1.1rem;
+     margin: 0.5rem 0;
+     color: #e8e8f0;
+ }
+ .chat-assistant {
+     background: linear-gradient(135deg, #1a1a2e 0%, #16213e 100%);
+     border: 1px solid #312e81;
+     border-radius: 12px 12px 12px 4px;
+     padding: 0.9rem 1.1rem;
+     margin: 0.5rem 0;
+     color: #e8e8f0;
+ }
+ .chat-label {
+     font-size: 0.7rem;
+     font-weight: 600;
+     text-transform: uppercase;
+     letter-spacing: 0.1em;
+     margin-bottom: 0.4rem;
+ }
+ .label-user { color: #38bdf8; }
+ .label-ai { color: #a78bfa; }
+
+ /* Source pills */
+ .source-pill {
+     display: inline-block;
+     background: #1f1f2e;
+     border: 1px solid #3730a3;
+     border-radius: 20px;
+     padding: 0.2rem 0.7rem;
+     font-size: 0.72rem;
+     color: #818cf8;
+     margin: 0.2rem 0.15rem;
+ }
+
+ /* Upload area */
+ [data-testid="stFileUploader"] {
+     background: #1c1c26 !important;
+     border: 2px dashed #2a2a3a !important;
+     border-radius: 12px !important;
+ }
+
+ /* Buttons */
+ .stButton > button {
+     background: linear-gradient(135deg, #7c3aed, #4f46e5) !important;
+     color: white !important;
+     border: none !important;
+     border-radius: 8px !important;
+     font-family: 'DM Sans', sans-serif !important;
+     font-weight: 500 !important;
+     transition: all 0.2s ease !important;
+ }
+ .stButton > button:hover {
+     transform: translateY(-1px) !important;
+     box-shadow: 0 4px 20px rgba(124, 58, 237, 0.4) !important;
+ }
+
+ /* Input */
+ .stTextInput > div > div > input,
+ [data-testid="stChatInputTextArea"] {
+     background: #1c1c26 !important;
+     border: 1px solid #2a2a3a !important;
+     color: #e8e8f0 !important;
+     border-radius: 10px !important;
+ }
+
+ /* Status badges */
+ .badge-ready { background:#14532d; color:#86efac; padding:3px 10px; border-radius:20px; font-size:0.75rem; }
+ .badge-empty { background:#1c1917; color:#a8a29e; padding:3px 10px; border-radius:20px; font-size:0.75rem; }
+ .badge-loading{ background:#1e3a5f; color:#7dd3fc; padding:3px 10px; border-radius:20px; font-size:0.75rem; }
+
+ /* Divider */
+ hr { border-color: #2a2a3a !important; }
+
+ /* Scrollbar */
+ ::-webkit-scrollbar { width: 6px; }
+ ::-webkit-scrollbar-track { background: #0f0f13; }
+ ::-webkit-scrollbar-thumb { background: #2a2a3a; border-radius: 3px; }
+ </style>
+ """, unsafe_allow_html=True)
+
+ # ─── Lazy imports (avoids reload cost) ────────────────────────────────────────
+ @st.cache_resource(show_spinner=False)
+ def load_rag_engine():
+     from rag_engine import RAGEngine
+     return RAGEngine()
+
+ # ─── Session state init ────────────────────────────────────────────────────────
+ if "messages" not in st.session_state: st.session_state.messages = []
+ if "doc_loaded" not in st.session_state: st.session_state.doc_loaded = False
+ if "doc_name" not in st.session_state: st.session_state.doc_name = ""
+ if "chunk_count" not in st.session_state: st.session_state.chunk_count = 0
+ if "processed_hash" not in st.session_state: st.session_state.processed_hash = ""
+
+ # ─── Sidebar ───────────────────────────────────────────────────────────────────
+ with st.sidebar:
+     st.markdown('<p style="font-family:Syne,sans-serif;font-size:1.3rem;font-weight:700;color:#a78bfa;">🧠 DocMind AI</p>', unsafe_allow_html=True)
+     st.markdown('<p style="color:#6b6b8a;font-size:0.8rem;">RAG-Powered Document Intelligence</p>', unsafe_allow_html=True)
+     st.markdown("---")
+
+     # Status
+     if st.session_state.doc_loaded:
+         st.markdown('<span class="badge-ready">✓ Ready</span>', unsafe_allow_html=True)
+         st.markdown(f'<p style="color:#e8e8f0;font-size:0.85rem;margin-top:0.5rem;">📄 <b>{st.session_state.doc_name}</b></p>', unsafe_allow_html=True)
+         st.markdown(f'<p style="color:#6b6b8a;font-size:0.78rem;">{st.session_state.chunk_count} chunks indexed</p>', unsafe_allow_html=True)
+     else:
+         st.markdown('<span class="badge-empty">○ No document loaded</span>', unsafe_allow_html=True)
+
+     st.markdown("---")
+     st.markdown('<p style="color:#6b6b8a;font-size:0.78rem;font-weight:600;text-transform:uppercase;letter-spacing:0.08em;">Upload Document</p>', unsafe_allow_html=True)
+
+     uploaded_file = st.file_uploader(
+         "PDF or TXT",
+         type=["pdf", "txt"],
+         label_visibility="collapsed"
+     )
+
+     if uploaded_file:
+         file_hash = hashlib.md5(uploaded_file.read()).hexdigest()
+         uploaded_file.seek(0)
+
+         if file_hash != st.session_state.processed_hash:
+             with st.spinner("🔍 Processing document..."):
+                 rag = load_rag_engine()
+                 chunks = rag.ingest_file(uploaded_file)
+                 st.session_state.doc_loaded = True
+                 st.session_state.doc_name = uploaded_file.name
+                 st.session_state.chunk_count = chunks
+                 st.session_state.processed_hash = file_hash
+                 st.session_state.messages = []
+             st.success(f"✓ Indexed {chunks} chunks!")
+             st.rerun()
+
+     st.markdown("---")
+
+     # Try sample doc
+     st.markdown('<p style="color:#6b6b8a;font-size:0.78rem;font-weight:600;text-transform:uppercase;letter-spacing:0.08em;">Or try a sample</p>', unsafe_allow_html=True)
+     if st.button("📥 Load Sample: AI Report", use_container_width=True):
+         with st.spinner("Downloading sample document..."):
+             from data_downloader import download_sample_doc
+             path, name = download_sample_doc()
+             rag = load_rag_engine()
+             chunks = rag.ingest_path(path, name)
+             st.session_state.doc_loaded = True
+             st.session_state.doc_name = name
+             st.session_state.chunk_count = chunks
+             st.session_state.processed_hash = "sample"
+             st.session_state.messages = []
+         st.success(f"✓ Sample loaded! {chunks} chunks")
+         st.rerun()
+
+     st.markdown("---")
+     if st.button("🗑️ Clear Chat", use_container_width=True):
+         st.session_state.messages = []
+         st.rerun()
+
+     st.markdown("---")
+     st.markdown("""
+     <p style="color:#6b6b8a;font-size:0.72rem;line-height:1.6;">
+     <b style="color:#a78bfa;">Stack</b><br>
+     🔗 LangChain · ChromaDB<br>
+     🤗 HuggingFace Embeddings<br>
+     🦙 Zephyr-7B (Inference API)<br>
+     🌊 Streamlit
+     </p>
+     """, unsafe_allow_html=True)
+
+ # ─── Main Area ─────────────────────────────────────────────────────────────────
+ st.markdown('<h1 class="hero-title">DocMind AI</h1>', unsafe_allow_html=True)
+ st.markdown('<p class="hero-sub">Upload any document · Ask anything · Get answers grounded in your content</p>', unsafe_allow_html=True)
+
+ # Stats row
+ col1, col2, col3, col4 = st.columns(4)
+ with col1:
+     st.markdown(f"""
+     <div class="stat-card">
+         <div class="stat-number">{st.session_state.chunk_count or "—"}</div>
+         <div class="stat-label">Chunks Indexed</div>
+     </div>""", unsafe_allow_html=True)
+ with col2:
+     st.markdown(f"""
+     <div class="stat-card">
+         <div class="stat-number">{len(st.session_state.messages) // 2}</div>
+         <div class="stat-label">Questions Asked</div>
+     </div>""", unsafe_allow_html=True)
+ with col3:
+     st.markdown("""
+     <div class="stat-card">
+         <div class="stat-number">384</div>
+         <div class="stat-label">Embedding Dims</div>
+     </div>""", unsafe_allow_html=True)
+ with col4:
+     st.markdown("""
+     <div class="stat-card">
+         <div class="stat-number">Top-4</div>
+         <div class="stat-label">Retrieval K</div>
+     </div>""", unsafe_allow_html=True)
+
+ st.markdown("<br>", unsafe_allow_html=True)
+
+ # ─── Chat History ──────────────────────────────────────────────────────────────
+ chat_container = st.container()
+ with chat_container:
+     if not st.session_state.messages:
+         if st.session_state.doc_loaded:
+             st.markdown(f"""
+             <div style="text-align:center;padding:3rem;color:#6b6b8a;">
+                 <div style="font-size:2.5rem;margin-bottom:1rem;">💬</div>
+                 <p style="font-size:1rem;color:#a78bfa;">Document ready!</p>
+                 <p style="font-size:0.85rem;">Ask anything about <b style="color:#e8e8f0;">{st.session_state.doc_name}</b></p>
+             </div>
+             """, unsafe_allow_html=True)
+         else:
+             st.markdown("""
+             <div style="text-align:center;padding:4rem 2rem;color:#6b6b8a;">
+                 <div style="font-size:3rem;margin-bottom:1rem;">📄</div>
+                 <p style="font-size:1.1rem;color:#a78bfa;font-family:'Syne',sans-serif;font-weight:600;">No document loaded yet</p>
+                 <p style="font-size:0.85rem;">Upload a PDF or TXT file in the sidebar,<br>or load the sample AI report to get started.</p>
+             </div>
+             """, unsafe_allow_html=True)
+     else:
+         for msg in st.session_state.messages:
+             if msg["role"] == "user":
+                 st.markdown(f"""
+                 <div class="chat-user">
+                     <div class="chat-label label-user">You</div>
+                     {msg["content"]}
+                 </div>""", unsafe_allow_html=True)
+             else:
+                 sources_html = ""
+                 if msg.get("sources"):
+                     pills = "".join(f'<span class="source-pill">📄 {s}</span>' for s in msg["sources"])
+                     sources_html = f'<div style="margin-top:0.7rem;">{pills}</div>'
+                 st.markdown(f"""
+                 <div class="chat-assistant">
+                     <div class="chat-label label-ai">DocMind AI</div>
+                     {msg["content"]}
+                     {sources_html}
+                 </div>""", unsafe_allow_html=True)
+
+ # ─── Chat Input ────────────────────────────────────────────────────────────────
+ st.markdown("<br>", unsafe_allow_html=True)
+
+ if not st.session_state.doc_loaded:
+     st.chat_input("Upload a document first...", disabled=True)
+ else:
+     if prompt := st.chat_input("Ask anything about your document..."):
+         st.session_state.messages.append({"role": "user", "content": prompt})
+
+         with st.spinner("🔍 Retrieving & generating answer..."):
+             rag = load_rag_engine()
+             answer, sources = rag.query(prompt)
+
+         st.session_state.messages.append({
+             "role": "assistant",
+             "content": answer,
+             "sources": sources
+         })
+         st.rerun()
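The upload handler in app.py avoids re-indexing the same file on every Streamlit rerun by hashing the uploaded bytes and comparing against the last processed hash. The pattern in isolation:

```python
import hashlib

def file_fingerprint(data: bytes) -> str:
    """Content fingerprint used to detect re-uploads of the same file.
    MD5 is fine here: this is deduplication, not security."""
    return hashlib.md5(data).hexdigest()

last_hash = ""
for upload in [b"report v1", b"report v1", b"report v2"]:
    h = file_fingerprint(upload)
    if h != last_hash:
        print("indexing new document")
        last_hash = h
    else:
        print("skipping duplicate upload")
```

Note that `uploaded_file.read()` consumes the stream, which is why the app calls `uploaded_file.seek(0)` before passing the file on to ingestion.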
data_downloader.py ADDED
@@ -0,0 +1,293 @@
1
+ """
2
+ data_downloader.py
3
+ ──────────────────
4
+ Downloads a free, publicly available AI research report to use as a
5
+ demo document — no manual steps needed.
6
+
7
+ Primary : Stanford AI Index Report 2024 (summary chapter, public PDF)
8
+ Fallback 1: Our World in Data – AI progress summary (txt)
9
+ Fallback 2: Generate a synthetic AI overview document locally
10
+ """
11
+
12
+ import os
13
+ import time
14
+ import textwrap
15
+ import urllib.request
16
+ from pathlib import Path
17
+
18
+ CACHE_DIR = Path("./sample_docs")
19
+ SAMPLE_PDF = CACHE_DIR / "ai_report_sample.pdf"
20
+ SAMPLE_TXT = CACHE_DIR / "ai_overview.txt"
21
+
22
+ # Public, stable, lightweight PDFs (< 5 MB each)
23
+ PDF_SOURCES = [
24
+ (
25
+ "https://arxiv.org/pdf/2310.07064", # "Levels of AGI" Google DeepMind paper
26
+ "Levels_of_AGI_DeepMind.pdf",
27
+ ),
28
+ (
29
+ "https://arxiv.org/pdf/2303.12528", # "Sparks of AGI" Microsoft Research
30
+ "Sparks_of_AGI_Microsoft.pdf",
31
+ ),
32
+ (
33
+ "https://arxiv.org/pdf/2304.15004", # "AutoGPT for Online Dec. Making"
34
+ "AutoGPT_Decision_Making.pdf",
35
+ ),
36
+ ]
37
+
38
+
39
+ def download_sample_doc() -> tuple[str, str]:
40
+ """
41
+ Returns (local_path, display_name).
42
+ Tries PDF sources first; falls back to a generated TXT file.
43
+ """
44
+ CACHE_DIR.mkdir(exist_ok=True)
45
+
46
+ # ── Try each PDF source ────────────────────────────────────────────────────
47
+ for url, fname in PDF_SOURCES:
48
+ dest = CACHE_DIR / fname
49
+ if dest.exists():
50
+ return str(dest), fname # already cached
51
+
52
+ try:
53
+ print(f"Attempting download: {url}")
54
+ req = urllib.request.Request(
55
+ url,
56
+ headers={
57
+ "User-Agent": (
58
+ "Mozilla/5.0 (X11; Linux x86_64) "
59
+ "AppleWebKit/537.36 (KHTML, like Gecko) "
60
+ "Chrome/120.0 Safari/537.36"
61
+ )
62
+ },
63
+ )
64
+ with urllib.request.urlopen(req, timeout=20) as resp:
65
+ data = resp.read()
66
+
67
+ # Sanity-check: must look like a PDF
68
+ if data[:4] == b"%PDF" and len(data) > 10_000:
69
+ dest.write_bytes(data)
70
+ print(f"✓ Downloaded {fname} ({len(data)//1024} KB)")
71
+ return str(dest), fname
72
+
73
+ except Exception as ex:
74
+ print(f" ✗ Failed: {ex}")
75
+ time.sleep(1)
76
+
77
+ # ── Fallback: generate a rich synthetic TXT document ──────────────────────
78
+ print("All PDF downloads failed – generating synthetic document.")
79
+ return _generate_synthetic_doc()
80
+
81
+
82
+ def _generate_synthetic_doc() -> tuple[str, str]:
83
+ """Creates a comprehensive synthetic AI overview document locally."""
84
+ fname = "AI_Technology_Overview_2024.txt"
85
+ dest = CACHE_DIR / fname
86
+
87
+ content = textwrap.dedent("""
88
+ ═══════════════════════════════════════════════════════════════
89
+ ARTIFICIAL INTELLIGENCE: STATE OF THE FIELD — 2024 OVERVIEW
90
+ A Comprehensive Technical Reference Document
91
+ ═══════════════════════════════════════════════════════════════
92
+
93
+ ── SECTION 1: LARGE LANGUAGE MODELS ──────────────────────────
94
+
95
+ Large Language Models (LLMs) are neural networks trained on vast corpora
96
+ of text data using the Transformer architecture introduced by Vaswani et
97
+ al. in 2017. Modern LLMs such as GPT-4, Claude 3, Gemini Ultra, and
98
+ LLaMA-3 contain hundreds of billions of parameters.
99
+
100
+ Training involves two primary phases:
101
+ 1. Pre-training: Self-supervised learning on internet-scale text data
102
+ (Common Crawl, Wikipedia, Books, GitHub code). The model learns to
103
+ predict the next token in a sequence.
104
+ 2. Fine-tuning / RLHF: Reinforcement Learning from Human Feedback aligns
105
+ the model with human preferences, improving helpfulness, harmlessness,
106
+ and honesty.
107
+
108
+ Key capabilities: text generation, translation, summarization, question
109
+ answering, code generation, reasoning, and multimodal understanding.
110
+
111
+ Limitations: hallucinations (generating plausible but false information),
112
+ knowledge cutoff dates, context-window constraints, and sensitivity to
113
+ prompt phrasing (prompt brittleness).
114
+
115
+ ── SECTION 2: RETRIEVAL-AUGMENTED GENERATION (RAG) ──────────
116
+
117
+ RAG is an architectural pattern that enhances LLM accuracy by grounding
118
+ generation in retrieved factual documents. It was introduced in a 2020
119
+ paper by Lewis et al. at Facebook AI Research.
120
+
121
+ RAG Pipeline Architecture:
122
+ 1. Document Ingestion: PDFs, text files, or web pages are loaded.
123
+ 2. Chunking: Documents are split into smaller overlapping segments
124
+ (typically 256–1024 tokens) to fit the model's context window.
125
+ 3. Embedding: Each chunk is converted to a dense vector using a sentence
126
+ transformer model (e.g., all-MiniLM-L6-v2, text-embedding-ada-002).
127
+ 4. Vector Storage: Embeddings are stored in a vector database such as
128
+ ChromaDB, Pinecone, Weaviate, or Qdrant for fast similarity search.
129
+ 5. Query Processing: A user query is embedded and compared against stored
130
+ vectors using cosine similarity or ANN algorithms (HNSW, IVF).
131
+ 6. Context Injection: The top-k most relevant chunks are retrieved and
132
+ injected into the LLM prompt as grounding context.
133
+ 7. Generation: The LLM generates an answer informed by retrieved context.
134
+
135
+ Advantages over pure LLMs:
136
+ - Up-to-date information (no knowledge cutoff)
137
+ - Reduced hallucination (grounded in real documents)
138
+ - Source attribution and transparency
139
+ - Domain-specific knowledge without expensive fine-tuning
140
+
141
+ ── SECTION 3: VECTOR DATABASES ───────────────────────────────
142
+
143
+ Vector databases are specialized systems optimized for storing and
144
+ querying high-dimensional embedding vectors.
145
+
146
+ ChromaDB: Open-source, runs locally in Python. Ideal for development
147
+ and small-to-medium scale projects. Supports persistent and in-memory
148
+ storage. Integrates seamlessly with LangChain.
149
+
150
+ Pinecone: Managed cloud vector database. Scales to billions of vectors.
151
+ Supports metadata filtering, sparse-dense hybrid search.
152
+
153
+ Qdrant: Open-source with cloud option. Supports payload filtering,
154
+ multi-vector collections, and quantization for memory efficiency.
155
+
156
+ Weaviate: GraphQL-native vector search with modular ML integrations.
157
+
158
+ FAISS (Facebook AI Similarity Search): Library (not a database) for
159
+ efficient similarity search. Excellent for research and batch processing.
160
+
161
+ Approximate Nearest Neighbor (ANN) algorithms used by these systems
162
+ include HNSW (Hierarchical Navigable Small World graphs), which provides
163
+ O(log n) search complexity with high recall.
164
+
165
+ ── SECTION 4: EMBEDDING MODELS ───────────────────────────────
166
+
167
+ Embedding models convert text into dense numerical vectors that capture
168
+ semantic meaning. Similar texts produce vectors that are close in the
169
+ embedding space (measured by cosine similarity or dot product).
170
+
171
+ Popular models:
172
+ - all-MiniLM-L6-v2: 22M parameters, 384 dimensions, very fast, good
173
+ quality. Best for real-time applications.
174
+ - all-mpnet-base-v2: 110M parameters, 768 dimensions, higher quality.
175
+ - text-embedding-3-small (OpenAI): 1536 dims, strong general performance.
176
+ - text-embedding-3-large (OpenAI): 3072 dims, state-of-the-art quality.
177
+ - UAE-Large-V1 (WhereIsAI): Top performer on MTEB benchmark as of 2024.
178
+
179
+ The MTEB (Massive Text Embedding Benchmark) is the standard evaluation
180
+ suite for embedding models, covering retrieval, clustering, classification,
181
+ and semantic similarity tasks across 56 datasets.
182
+
183
+ ── SECTION 5: AI AGENTS & AGENTIC SYSTEMS ────────────────────
184
+
185
+ AI agents are LLM-powered systems that can take actions in the world—
186
+ browsing the web, executing code, calling APIs, and managing files—in
187
+ pursuit of a goal.
188
+
189
+ ReAct (Reason + Act) Framework: The model alternates between reasoning
190
+ steps (Thought) and actions (Act), observing results after each action.
191
+
192
+ LangGraph: A framework for building stateful, graph-based agent workflows.
193
+ Supports cycles, branching, parallel execution, and human-in-the-loop
194
+ interrupts.
195
+
196
+ CrewAI: Multi-agent framework where specialized agents collaborate on
197
+ complex tasks. Agents have roles, goals, tools, and can delegate to peers.
198
+
199
+ AutoGen (Microsoft): Framework for multi-agent conversation and code
200
+ execution. Supports human-agent collaboration workflows.
201
+
202
+ Key challenges in agent development:
203
+ - Long-horizon planning and task decomposition
204
+ - Reliable tool use and API integration
205
+ - Memory management (short-term, long-term, episodic)
206
+ - Error recovery and graceful degradation
207
+ - Safety and sandboxing of code execution
208
+
209
+ ── SECTION 6: FINE-TUNING & PEFT METHODS ─────────────────────
210
+
211
+ Full fine-tuning of LLMs is computationally expensive. Parameter-Efficient
212
+ Fine-Tuning (PEFT) methods adapt pre-trained models with minimal resources.
213
+
214
+ LoRA (Low-Rank Adaptation): Adds small trainable rank-decomposition matrices
215
+ to attention layers while freezing the base model. Reduces trainable
216
+ parameters by 10,000x while achieving near-full fine-tune quality.
217
+
218
+ QLoRA: Quantizes the base model to 4-bit precision (NF4), then applies
219
+ LoRA adapters. Enables fine-tuning of 70B models on a single consumer GPU.
220
+
221
+ Instruction tuning: Fine-tuning on (instruction, response) pairs to
222
+ improve the model's ability to follow natural language directions.
223
+
224
+ Popular open-source base models for fine-tuning:
225
+ - LLaMA-3 (Meta AI): 8B and 70B versions, strong multilingual support.
226
+ - Mistral-7B: Efficient 7B model with sliding window attention.
227
+ - Phi-3 (Microsoft): Small but surprisingly capable models (3.8B–14B).
228
+ - Gemma-2 (Google): 2B and 9B versions, optimized for efficiency.
229
+
230
+ ── SECTION 7: MLOPS AND MODEL DEPLOYMENT ─────────────────────
231
+
232
+ MLOps (Machine Learning Operations) covers the practices of deploying,
233
+ monitoring, and maintaining ML models in production.
234
+
235
+ Key components:
236
+ - Experiment Tracking: MLflow, Weights & Biases (W&B) track metrics,
237
+ hyperparameters, and model artifacts across training runs.
238
+ - Model Registry: Central repository for versioned model artifacts.
239
+ - Serving Infrastructure: FastAPI, TorchServe, Triton Inference Server,
240
+ or vLLM for high-throughput LLM serving.
241
+ - Containerization: Docker packages models with all dependencies.
242
+ Kubernetes orchestrates containers at scale.
243
+ - CI/CD: GitHub Actions or GitLab CI automates testing, building,
244
+ and deployment pipelines.
245
+ - Monitoring: Track data drift, concept drift, latency, and error rates
+ in production. Tools: Evidently AI, Arize, WhyLabs.
+
+ Deployment platforms:
+ - HuggingFace Spaces: Free hosting for Gradio/Streamlit ML demos.
+ - AWS SageMaker: Enterprise ML deployment on AWS infrastructure.
+ - Google Vertex AI: Managed ML platform on Google Cloud.
+ - Replicate: API-first model deployment, pay-per-prediction.
+ - Modal: Serverless GPU compute for ML inference.
+
+ ── SECTION 8: RESPONSIBLE AI & SAFETY ────────────────────────
+
+ As AI systems become more capable, ensuring they are safe, fair, and
+ aligned with human values is a critical research and engineering challenge.
+
+ Key principles:
+ - Helpfulness: The system should assist users effectively.
+ - Harmlessness: Avoid generating content that could cause real-world harm.
+ - Honesty: Acknowledge uncertainty; do not hallucinate or deceive.
+
+ Techniques:
+ - RLHF (Reinforcement Learning from Human Feedback): Trains reward models
+ from human preferences to guide LLM behavior.
+ - Constitutional AI (Anthropic): Models self-critique and revise outputs
+ against a set of principles.
+ - Red Teaming: Adversarial testing to discover model failure modes.
+ - Interpretability Research: Understanding internal model representations
+ (mechanistic interpretability, probing classifiers, attention analysis).
+
+ Regulatory landscape (2024):
+ - EU AI Act: First comprehensive AI regulation, risk-based tiered approach.
+ - US Executive Order on AI (Oct. 2023): Safety testing requirements for
+ large AI models.
+ - China AI Regulations: Content moderation and algorithmic transparency
+ requirements for generative AI services.
+
+ ═══════════════════════════════════════════════════════════════
+ END OF DOCUMENT
+ ═══════════════════════════════════════════════════════════════
+ """).strip()
+
+     dest.write_text(content, encoding="utf-8")
+     print(f"✓ Generated synthetic document ({len(content)} chars)")
+     return str(dest), fname
+
+
+ if __name__ == "__main__":
+     path, name = download_sample_doc()
+     print(f"\nReady: {path} ({name})")
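The data-drift monitoring mentioned in the generated document's MLOps section can be made concrete with a toy check. This sketch is illustrative only: the function name and the 1.0-standard-deviation threshold are assumptions for the example, and production tools such as Evidently or Arize use far richer statistics than a mean shift.

```python
from statistics import mean, stdev

def drift_score(reference: list, live: list) -> float:
    """Standardized shift of the live window's mean vs. a reference window."""
    sd = stdev(reference)
    if sd == 0:
        return 0.0
    return abs(mean(live) - mean(reference)) / sd

ref = [0.50, 0.52, 0.48, 0.51, 0.49]                # feature values at training time
print(drift_score(ref, [0.50, 0.51, 0.49]) > 1.0)   # stable traffic → False
print(drift_score(ref, [0.80, 0.82, 0.79]) > 1.0)   # shifted traffic → True
```

A real monitor would compute this per feature on sliding windows and alert when the score stays above the threshold.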
packages.txt ADDED
@@ -0,0 +1,3 @@
+ libgl1
+ libglib2.0-0
+ poppler-utils
rag_engine.py ADDED
@@ -0,0 +1,200 @@
+ """
+ RAG Engine
+ ──────────
+ - Embeddings : sentence-transformers/all-MiniLM-L6-v2 (HuggingFace, free)
+ - Vector DB  : ChromaDB (local, in-memory / persistent)
+ - LLM        : HuggingFace Inference API (zephyr-7b-beta, free tier)
+ - Chunking   : Recursive character splitter with overlap
+ """
+
+ import os
+ import re
+ import tempfile
+ from typing import Tuple, List
+
+ from chromadb.config import Settings
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain_community.embeddings import HuggingFaceEmbeddings
+ from langchain_community.vectorstores import Chroma
+ from langchain_community.document_loaders import PyPDFLoader, TextLoader
+
+ # ─── Configuration ─────────────────────────────────────────────────────────────
+ EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
+ HF_MODEL_ID = "HuggingFaceH4/zephyr-7b-beta"  # free inference API
+ CHUNK_SIZE = 800
+ CHUNK_OVERLAP = 150
+ TOP_K = 4
+ COLLECTION_NAME = "docmind_collection"
+ CHROMA_DIR = "./chroma_db"
+
+
+ class RAGEngine:
+     """Full RAG pipeline: ingest → embed → store → retrieve → generate."""
+
+     def __init__(self):
+         self._embeddings = None
+         self._vectorstore = None
+         self._splitter = RecursiveCharacterTextSplitter(
+             chunk_size=CHUNK_SIZE,
+             chunk_overlap=CHUNK_OVERLAP,
+             separators=["\n\n", "\n", ". ", " ", ""],
+         )
+
+     # ── Lazy-load embeddings ───────────────────────────────────────────────────
+     @property
+     def embeddings(self):
+         if self._embeddings is None:
+             self._embeddings = HuggingFaceEmbeddings(
+                 model_name=EMBED_MODEL,
+                 model_kwargs={"device": "cpu"},
+                 encode_kwargs={"normalize_embeddings": True},
+             )
+         return self._embeddings
+
+     # ── Ingest an uploaded Streamlit file object ───────────────────────────────
+     def ingest_file(self, uploaded_file) -> int:
+         suffix = _path_suffix(uploaded_file.name)
+         with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
+             tmp.write(uploaded_file.read())
+             tmp_path = tmp.name
+         try:
+             return self.ingest_path(tmp_path, uploaded_file.name)
+         finally:
+             os.unlink(tmp_path)  # remove the temp copy once ingested
+
+     # ── Ingest from a file path ────────────────────────────────────────────────
+     def ingest_path(self, path: str, name: str = "") -> int:
+         suffix = _path_suffix(name or path)
+
+         if suffix == ".pdf":
+             loader = PyPDFLoader(path)
+         else:
+             loader = TextLoader(path, encoding="utf-8")
+
+         raw_docs = loader.load()
+
+         # Add source metadata
+         for doc in raw_docs:
+             doc.metadata["source"] = name or os.path.basename(path)
+
+         chunks = self._splitter.split_documents(raw_docs)
+
+         # Reset & recreate vectorstore for the new document
+         self._vectorstore = Chroma.from_documents(
+             documents=chunks,
+             embedding=self.embeddings,
+             collection_name=COLLECTION_NAME,
+             persist_directory=CHROMA_DIR,
+             client_settings=Settings(anonymized_telemetry=False),
+         )
+
+         return len(chunks)
+
+     # ── Query: retrieve + generate ─────────────────────────────────────────────
+     def query(self, question: str) -> Tuple[str, List[str]]:
+         if self._vectorstore is None:
+             return "⚠️ Please upload a document first.", []
+
+         # 1. Retrieve top-k relevant chunks
+         retriever = self._vectorstore.as_retriever(
+             search_type="mmr",  # Maximal Marginal Relevance
+             search_kwargs={"k": TOP_K, "fetch_k": TOP_K * 3},
+         )
+         docs = retriever.invoke(question)
+
+         # 2. Build context
+         context = "\n\n---\n\n".join(
+             f"[Chunk {i+1}]\n{d.page_content}" for i, d in enumerate(docs)
+         )
+
+         # 3. Unique source names for display
+         sources = list({d.metadata.get("source", "Document") for d in docs})
+
+         # 4. Generate answer
+         answer = self._generate(question, context)
+
+         return answer, sources
+
+     # ── LLM call via HuggingFace Inference API ─────────────────────────────────
+     def _generate(self, question: str, context: str) -> str:
+         try:
+             from huggingface_hub import InferenceClient
+
+             prompt = _build_prompt(question, context)
+
+             hf_token = os.environ.get("HF_TOKEN", "")  # optional but unlocks higher rate limits
+             client = InferenceClient(model=HF_MODEL_ID, token=hf_token or None)
+
+             response = client.text_generation(
+                 prompt,
+                 max_new_tokens=512,
+                 temperature=0.2,
+                 repetition_penalty=1.15,
+                 do_sample=True,
+                 stop_sequences=["</s>", "[INST]", "Human:", "User:"],
+             )
+
+             # Strip any echoed prompt
+             return _clean_response(response, question)
+
+         except Exception as e:
+             # Fallback: context-extraction mode (no LLM needed)
+             return _fallback_answer(question, context, str(e))
+
+
+ # ─── Prompt Builder ────────────────────────────────────────────────────────────
+ def _build_prompt(question: str, context: str) -> str:
+     system = (
+         "You are DocMind, an expert document analyst. "
+         "Answer the user's question using ONLY the provided document context. "
+         "Be concise, accurate, and cite specific details from the context. "
+         "If the answer is not in the context, say so clearly."
+     )
+     return (
+         f"<|system|>\n{system}</s>\n"
+         f"<|user|>\n"
+         f"Document context:\n{context}\n\n"
+         f"Question: {question}</s>\n"
+         f"<|assistant|>\n"
+     )
+
+
+ # ─── Response Cleaner ──────────────────────────────────────────────────────────
+ def _clean_response(text: str, question: str) -> str:
+     # Remove any re-echoed prompt fragments
+     for marker in ["<|assistant|>", "<|user|>", "<|system|>", "</s>"]:
+         text = text.replace(marker, "")
+     text = text.strip()
+
+     # Drop a leading echo of the question (slicing by the question's length
+     # is only safe when the whole question was actually repeated)
+     if text.lower().startswith(question.lower()):
+         text = text[len(question):].strip()
+
+     return text or "I could not generate a response. Please try rephrasing your question."
+
+
+ # ─── Fallback (no LLM) ─────────────────────────────────────────────────────────
+ def _fallback_answer(question: str, context: str, error: str) -> str:
+     """Simple extractive answer when the LLM is unavailable."""
+     keywords = set(re.findall(r'\b\w{4,}\b', question.lower()))
+     best_chunk, best_score = "", 0
+
+     for chunk in context.split("---"):
+         words = set(re.findall(r'\b\w{4,}\b', chunk.lower()))
+         score = len(keywords & words)
+         if score > best_score:
+             best_score = score
+             best_chunk = chunk.strip()
+
+     if best_chunk:
+         excerpt = best_chunk[:600] + ("..." if len(best_chunk) > 600 else "")
+         return (
+             f"*(LLM unavailable – showing most relevant excerpt)*\n\n{excerpt}\n\n"
+             f"<small>Error: {error}</small>"
+         )
+     return f"⚠️ Could not generate answer. Error: {error}"
+
+
+ # ─── Helper ────────────────────────────────────────────────────────────────────
+ def _path_suffix(name: str) -> str:
+     """File extension (lowercased); defaults to .txt for extension-less names."""
+     return os.path.splitext(name)[-1].lower() or ".txt"
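The extractive fallback above can be exercised in isolation. This standalone sketch reproduces `_fallback_answer`'s keyword-overlap scoring; the function name and sample chunks here are illustrative, not part of the module.

```python
import re

def best_chunk_for(question: str, chunks: list) -> str:
    """Pick the chunk sharing the most 4+-letter words with the question
    (the same heuristic as _fallback_answer's scoring loop)."""
    keywords = set(re.findall(r"\b\w{4,}\b", question.lower()))
    best, best_score = "", 0
    for chunk in chunks:
        words = set(re.findall(r"\b\w{4,}\b", chunk.lower()))
        score = len(keywords & words)
        if score > best_score:
            best, best_score = chunk.strip(), score
    return best

chunks = [
    "Transformers use self-attention over token embeddings.",
    "ChromaDB stores vector embeddings in a local database.",
]
print(best_chunk_for("Which database stores vector embeddings?", chunks))
# → "ChromaDB stores vector embeddings in a local database."
```

Because it only counts exact word overlap, it misses inflected matches ("stores" vs. "stored"), which is why the module treats it strictly as a last resort.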
requirements.txt ADDED
@@ -0,0 +1,31 @@
+ # ── Core RAG Stack ─────────────────────────────────────────────────────────────
+ langchain==0.2.16
+ langchain-community==0.2.16
+ langchain-core==0.2.38
+
+ # ── Vector DB ──────────────────────────────────────────────────────────────────
+ chromadb==0.5.5
+
+ # ── Embeddings ─────────────────────────────────────────────────────────────────
+ sentence-transformers==3.0.1
+ huggingface-hub==0.24.6
+ transformers==4.44.2
+ tokenizers==0.19.1
+
+ # ── PDF Loading ────────────────────────────────────────────────────────────────
+ pypdf==4.3.1
+ pymupdf==1.24.9
+
+ # ── UI ─────────────────────────────────────────────────────────────────────────
+ streamlit==1.38.0
+
+ # ── ML Dependencies ────────────────────────────────────────────────────────────
+ torch==2.4.0
+ numpy==1.26.4
+ scipy==1.13.1
+ scikit-learn==1.5.1
+
+ # ── Utilities ──────────────────────────────────────────────────────────────────
+ python-dotenv==1.0.1
+ requests==2.32.3
+ tqdm==4.66.5