Bhaskar Ram committed
Commit · a465955
Parent(s): 3381167

feat: apply all 15 upgrades — BGE embeddings, cosine FAISS, streaming LLM, tenacity retry, dotenv, Dockerfile, tests
Files changed:
- .env.example          +11  -0
- .gitignore            +32  -0
- Dockerfile            +27  -0
- README.md             +14 -14
- app.py                +23 -14
- rag/chain.py          +61 -21
- rag/embedder.py        +5  -2
- rag/retriever.py       +7  -4
- requirements-dev.txt   +4  -0
- requirements.txt       +6  -4
- tests/smoke_test.py   +48  -0
.env.example
ADDED
@@ -0,0 +1,11 @@
+# Environment variable template — copy to .env and fill in your values
+
+# Required: Your Hugging Face API token (get one at https://huggingface.co/settings/tokens)
+HF_TOKEN=hf_...
+
+# Optional: Override the default LLM model
+# LLM_MODEL=meta-llama/Llama-3.1-8B-Instruct
+
+# Optional: Gradio server settings
+# GRADIO_SERVER_PORT=7860
+# GRADIO_SERVER_NAME=0.0.0.0
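A minimal sketch (not part of the commit) of how these variables are picked up at runtime: app.py's new load_dotenv() call does the actual loading; reading LLM_MODEL and the Gradio port here is illustrative only, since the committed chain.py keeps the model name as a constant.

import os
from dotenv import load_dotenv

load_dotenv()  # reads .env from the working directory, if present

hf_token = os.getenv("HF_TOKEN")  # required for the HF Inference API
llm_model = os.getenv("LLM_MODEL", "meta-llama/Llama-3.1-8B-Instruct")  # illustrative override
port = int(os.getenv("GRADIO_SERVER_PORT", "7860"))  # illustrative Gradio setting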
.gitignore
ADDED
@@ -0,0 +1,32 @@
+# Python
+__pycache__/
+*.py[cod]
+*.pyo
+.mypy_cache/
+.ruff_cache/
+
+# Environment
+.env
+*.env
+
+# Virtual environments
+.venv/
+venv/
+env/
+
+# Gradio cache / uploads
+gradio_cached_examples/
+flagged/
+
+# Test artefacts
+.pytest_cache/
+htmlcov/
+.coverage
+
+# Editors
+.vscode/
+.idea/
+
+# OS
+.DS_Store
+Thumbs.db
Dockerfile
ADDED
@@ -0,0 +1,27 @@
+# Kerdos AI — Custom LLM Chat
+# Multi-stage Docker build for a lean production image
+
+FROM python:3.11-slim AS base
+
+# System dependencies for PyMuPDF and FAISS
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential \
+    libgomp1 \
+    && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /app
+
+# Install Python dependencies first (layer-cached)
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy source
+COPY . .
+
+# Gradio listens on 7860 by default
+EXPOSE 7860
+
+ENV GRADIO_SERVER_NAME=0.0.0.0
+ENV GRADIO_SERVER_PORT=7860
+
+CMD ["python", "app.py"]
README.md
CHANGED
@@ -2,7 +2,7 @@
 title: Kerdos AI — Custom LLM Chat (Demo)
 emoji: 🤖
 colorFrom: blue
-colorTo:
+colorTo: indigo
 sdk: gradio
 sdk_version: "6.6.0"
 app_file: app.py
@@ -62,15 +62,15 @@ We are actively **seeking investment and strategic partnerships** to build the **
 
 ## ✨ Features (Demo)
 
-| Feature | Details
-| ----------------------------- | ---------------------------------------------------------- |
-| 📄 **Multi-format ingestion** | PDF, DOCX, TXT, MD, CSV
-| 🧠 **Open-source LLM** | `meta-llama/Llama-3.1-8B-Instruct` via HF Inference API
-| 🔒 **Strictly grounded** | Answers only from your uploaded documents
-| 📦 **Multi-document** | Upload and query across multiple files simultaneously
-| 💬 **Multi-turn chat** | Maintains conversation context across questions
-| ⚡ **Fast** | CPU-friendly embeddings (`
-| 🔑 **Secure** | Files processed in-session only — never stored permanently
+| Feature | Details |
+| ----------------------------- | ----------------------------------------------------------------- |
+| 📄 **Multi-format ingestion** | PDF, DOCX, TXT, MD, CSV |
+| 🧠 **Open-source LLM** | `meta-llama/Llama-3.1-8B-Instruct` via HF Inference API |
+| 🔒 **Strictly grounded** | Answers only from your uploaded documents |
+| 📦 **Multi-document** | Upload and query across multiple files simultaneously |
+| 💬 **Multi-turn chat** | Maintains conversation context across questions |
+| ⚡ **Fast** | CPU-friendly embeddings (`BAAI/bge-small-en-v1.5` + FAISS cosine) |
+| 🔑 **Secure** | Files processed in-session only — never stored permanently |
 
 ---
 
@@ -103,9 +103,9 @@ Document Parser (PDF / DOCX / TXT)
 ↓
 Text Chunking (512 chars, 64 overlap)
 ↓
-Embeddings (
+Embeddings (BAAI/bge-small-en-v1.5)
 ↓
-FAISS Vector Index (in-memory)
+FAISS Vector Index (cosine similarity, in-memory)
 ↓
 User Question → Similarity Search → Top-K Chunks
 ↓
@@ -120,7 +120,7 @@ Response + Source Citations
 
 - **UI**: [Gradio](https://gradio.app)
 - **LLM**: `meta-llama/Llama-3.1-8B-Instruct`
-- **Embeddings**: `
+- **Embeddings**: `BAAI/bge-small-en-v1.5` (cosine similarity via FAISS)
 - **Vector Store**: [FAISS](https://github.com/facebookresearch/faiss)
 - **Document Parsing**: PyMuPDF, python-docx
 
@@ -148,4 +148,4 @@ MIT License — free for commercial and private use.
 
 ---
 
-_© 2024–
+_© 2024–2026 Kerdos Infrasoft Private Limited | Bengaluru, Karnataka, India_
app.py
CHANGED
@@ -6,11 +6,14 @@ Website: https://kerdos.in
 """
 
 import os
+from dotenv import load_dotenv
 import gradio as gr
 from rag.document_loader import load_documents
 from rag.embedder import build_index, add_to_index
 from rag.retriever import retrieve
-from rag.chain import
+from rag.chain import answer_stream
+
+load_dotenv()  # Load HF_TOKEN etc. from .env when running locally
 
 # ─────────────────────────────────────────────
 # State helpers
@@ -55,9 +58,10 @@ def process_files(files, current_index, status_box):
 
 
 def chat(user_message, history, vector_index, hf_token_input, top_k):
-    """
+    """Streaming chat handler — yields progressively-updated history for real-time response."""
     if not user_message.strip():
-        …
+        yield history, ""
+        return
 
     hf_token = get_hf_token(hf_token_input)
     if not hf_token:
@@ -65,26 +69,30 @@ def chat(user_message, history, vector_index, hf_token_input, top_k):
             {"role": "user", "content": user_message},
             {"role": "assistant", "content": "⚠️ Please provide a Hugging Face API token to use the chat."},
         ]
-        …
+        yield history, ""
+        return
 
     if vector_index is None:
        history = history + [
            {"role": "user", "content": user_message},
            {"role": "assistant", "content": "⚠️ Please upload at least one document first."},
        ]
-        …
+        yield history, ""
+        return
 
     try:
         chunks = retrieve(user_message, vector_index, top_k=int(top_k))
-        …
+        # Append placeholder so user sees their message immediately
+        history = history + [
+            {"role": "user", "content": user_message},
+            {"role": "assistant", "content": ""},
+        ]
+        for partial in answer_stream(user_message, chunks, hf_token, chat_history=history[:-2]):
+            history[-1]["content"] = partial
+            yield history, ""
     except Exception as e:
-        …
-        …
-        history = history + [
-            {"role": "user", "content": user_message},
-            {"role": "assistant", "content": bot_reply},
-        ]
-        return history, ""
+        history[-1]["content"] = f"❌ Error: {e}"
+        yield history, ""
 
 
 def reset_all():
@@ -299,7 +307,7 @@ with gr.Blocks(title="Kerdos AI — Custom LLM Chat | Document Q&A Demo") as demo:
     # ── Kerdos Footer ─────────────────────────
     gr.HTML("""
     <div id="kerdos-footer">
-        © 2024–
+        © 2024–2026 <strong>Kerdos Infrasoft Private Limited</strong> |
        CIN: U62099KA2023PTC182869 | Bengaluru, Karnataka, India<br/>
        🌐 <a href="https://kerdos.in" target="_blank" style="color:#0055FF;">kerdos.in</a>
 
@@ -311,4 +319,5 @@ with gr.Blocks(title="Kerdos AI — Custom LLM Chat | Document Q&A Demo") as demo:
     """)
 
 if __name__ == "__main__":
+    demo.queue()  # Required for streaming generators
     demo.launch(css=CSS, theme=gr.themes.Soft())
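The chat handler is now a generator (it yields partial histories), which is why demo.queue() is added before launch. Below is a stripped-down sketch of that pattern using a hypothetical echo-style handler rather than the project's real one, and assuming the messages-format Chatbot API of recent Gradio releases:

import time
import gradio as gr

def stream_reply(message, history):
    # Append the user turn plus an empty assistant turn, then grow it yield by yield
    history = history + [
        {"role": "user", "content": message},
        {"role": "assistant", "content": ""},
    ]
    for word in ("streamed", "reply", "to:", message):
        history[-1]["content"] += word + " "
        time.sleep(0.1)
        yield history, ""  # each yield re-renders the Chatbot and clears the textbox

with gr.Blocks() as demo:
    chatbot = gr.Chatbot(type="messages")
    box = gr.Textbox()
    box.submit(stream_reply, [box, chatbot], [chatbot, box])

demo.queue()   # required so Gradio streams generator outputs
demo.launch()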
rag/chain.py
CHANGED
@@ -2,10 +2,18 @@
 chain.py
 Calls the LLM via HF Inference API with a strict RAG prompt.
 Only answers from the retrieved context — never from general knowledge.
+
+Upgrades vs original:
+  • answer_stream() — yields token-by-token for real-time Gradio streaming
+  • tenacity retry (3 attempts, exponential back-off) on transient API errors
+  • Hard input length guard (query ≤ 2000 chars, history capped at 6 messages)
 """
 
 from __future__ import annotations
+from typing import Generator
+
 from huggingface_hub import InferenceClient
+from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type
 
 SYSTEM_PROMPT = """You are an enterprise document assistant. Your ONLY job is to answer questions using the provided document context below.
 
@@ -25,6 +33,8 @@ Context from uploaded documents:
 LLM_MODEL = "meta-llama/Llama-3.1-8B-Instruct"
 MAX_NEW_TOKENS = 1024
 TEMPERATURE = 0.1  # Low temperature for factual, grounded responses
+MAX_QUERY_CHARS = 2000
+MAX_HISTORY_TURNS = 6  # Keep last N messages (each turn = 1 user + 1 assistant)
 
 
 def build_context(chunks: list[dict]) -> str:
@@ -35,37 +45,67 @@ def build_context(chunks: list[dict]) -> str:
     return "\n\n".join(parts)
 
 
-def
-    …
-    context_chunks: list[dict],
-    hf_token: str,
-    chat_history: list[dict] | None = None,
-) -> str:
-    """
-    Call Llama 3 via HF Inference API to answer the query
-    grounded strictly in context_chunks.
-    """
-    if not context_chunks:
-        return "I don't have that information in the uploaded documents."
-    …
+def _build_messages(query: str, context_chunks: list[dict], chat_history: list[dict] | None) -> list[dict]:
+    """Assemble the full message list for the LLM call."""
     context = build_context(context_chunks)
     system_msg = SYSTEM_PROMPT.format(context=context)
 
-    …
-    # chat_history is now a flat list of {"role": ..., "content": ...} dicts (Gradio 6.x)
-    messages = [{"role": "system", "content": system_msg}]
+    messages: list[dict] = [{"role": "system", "content": system_msg}]
     if chat_history:
-        #
-        for msg in chat_history[-
+        # Cap history to avoid overflow
+        for msg in chat_history[-MAX_HISTORY_TURNS:]:
             if msg.get("role") in ("user", "assistant") and msg.get("content"):
                 messages.append({"role": msg["role"], "content": msg["content"]})
+
+    # Guard: truncate excessively long queries
+    query = query[:MAX_QUERY_CHARS]
     messages.append({"role": "user", "content": query})
+    return messages
 
-    …
-    …
+
+@retry(
+    stop=stop_after_attempt(3),
+    wait=wait_exponential(multiplier=1, min=2, max=10),
+    retry=retry_if_exception_type(Exception),
+    reraise=True,
+)
+def _call_llm_stream(client: InferenceClient, messages: list[dict]):
+    """Streaming call to the LLM; decorated with retry logic."""
+    return client.chat_completion(
         model=LLM_MODEL,
         messages=messages,
         max_tokens=MAX_NEW_TOKENS,
         temperature=TEMPERATURE,
+        stream=True,
     )
-    …
+
+
+def answer_stream(
+    query: str,
+    context_chunks: list[dict],
+    hf_token: str,
+    chat_history: list[dict] | None = None,
+) -> Generator[str, None, None]:
+    """
+    Stream the LLM answer token-by-token.
+    Yields the progressively-growing reply string so Gradio can update in real time.
+    """
+    if not context_chunks:
+        yield "I don't have that information in the uploaded documents."
+        return
+
+    messages = _build_messages(query, context_chunks, chat_history)
+    client = InferenceClient(token=hf_token)
+
+    try:
+        stream = _call_llm_stream(client, messages)
+    except Exception as e:
+        yield f"❌ LLM error after retries: {e}"
+        return
+
+    accumulated = ""
+    for chunk in stream:
+        delta = chunk.choices[0].delta.content
+        if delta:
+            accumulated += delta
+            yield accumulated
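A self-contained sketch of the retry policy used above (three attempts, exponential back-off clamped to 2-10 s), using a hypothetical flaky_call() in place of the real chat-completion request; callers of answer_stream() itself simply iterate it and render each progressively longer string, as app.py does:

from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception_type

attempts = {"n": 0}

@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=2, max=10),
    retry=retry_if_exception_type(Exception),
    reraise=True,
)
def flaky_call() -> str:
    # Fails twice, then succeeds; the decorator sleeps between attempts
    attempts["n"] += 1
    if attempts["n"] < 3:
        raise ConnectionError("transient API error")
    return "ok"

print(flaky_call(), "after", attempts["n"], "attempts")  # -> ok after 3 attempts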
rag/embedder.py
CHANGED
@@ -9,7 +9,7 @@ from dataclasses import dataclass, field
 
 CHUNK_SIZE = 512     # characters
 CHUNK_OVERLAP = 64   # characters
-EMBEDDING_MODEL = "
+EMBEDDING_MODEL = "BAAI/bge-small-en-v1.5"  # Upgraded: state-of-the-art small retrieval model
 
 
 @dataclass
@@ -56,7 +56,9 @@ def build_index(docs: list[dict]) -> VectorIndex:
     embeddings = np.array(embeddings, dtype="float32")
 
     dim = embeddings.shape[1]
-    …
+    # Use Inner Product index (cosine similarity after L2 normalisation)
+    faiss.normalize_L2(embeddings)
+    index = faiss.IndexFlatIP(dim)
     index.add(embeddings)
 
     print(f"[Embedder] Index built: {index.ntotal} vectors, dim={dim}")
@@ -75,6 +77,7 @@ def add_to_index(vector_index: VectorIndex, docs: list[dict]) -> VectorIndex:
     texts = [c["text"] for c in new_chunks]
     embeddings = vector_index.embedder.encode(texts, show_progress_bar=False, batch_size=32)
     embeddings = np.array(embeddings, dtype="float32")
+    faiss.normalize_L2(embeddings)  # Keep consistent with cosine index
 
     vector_index.index.add(embeddings)
     vector_index.chunks.extend(new_chunks)
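The switch to IndexFlatIP plus normalize_L2 works because the inner product of L2-normalised vectors equals their cosine similarity. A small sketch with toy random vectors (not the real BGE embeddings) that checks the equivalence:

import numpy as np
import faiss

rng = np.random.default_rng(0)
vecs = rng.normal(size=(4, 8)).astype("float32")    # toy "chunk" embeddings
query = rng.normal(size=(1, 8)).astype("float32")

# Reference cosine similarities computed directly with numpy
cos = (query @ vecs.T) / (np.linalg.norm(query) * np.linalg.norm(vecs, axis=1))

# FAISS path: normalise in place, index with inner product, search
faiss.normalize_L2(vecs)
faiss.normalize_L2(query)
index = faiss.IndexFlatIP(vecs.shape[1])
index.add(vecs)
scores, ids = index.search(query, 4)

print(np.allclose(np.sort(cos[0])[::-1], scores[0], atol=1e-5))  # -> True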
rag/retriever.py
CHANGED
@@ -1,10 +1,11 @@
 """
 retriever.py
-Performs similarity search against the FAISS index.
+Performs cosine-similarity search against the FAISS index.
 """
 
 from __future__ import annotations
 import numpy as np
+import faiss
 from rag.embedder import VectorIndex
 
 DEFAULT_TOP_K = 5
@@ -14,24 +15,26 @@ def retrieve(query: str, vector_index: VectorIndex, top_k: int = DEFAULT_TOP_K)
     """
     Embed the query and return top_k most similar chunks.
     Each result: {"source": str, "text": str, "score": float}
+    Scores are cosine similarities (higher = more relevant).
     """
     if vector_index is None or vector_index.index is None:
         return []
 
     query_embedding = vector_index.embedder.encode([query], show_progress_bar=False)
     query_embedding = np.array(query_embedding, dtype="float32")
+    faiss.normalize_L2(query_embedding)  # Must match IndexFlatIP cosine index
 
     n_results = min(top_k, vector_index.index.ntotal)
-    …
+    scores, indices = vector_index.index.search(query_embedding, n_results)
 
     results = []
-    for
+    for score, idx in zip(scores[0], indices[0]):
         if idx == -1:
             continue
         chunk = vector_index.chunks[idx]
         results.append({
             "source": chunk["source"],
             "text": chunk["text"],
-            "score": float(
+            "score": float(score),  # cosine similarity (0–1 range)
         })
     return results
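An illustrative round-trip with the repo's own helpers, mirroring the smoke test added below; exact scores depend on the downloaded BGE weights, so treat the printed number as indicative only:

from rag.embedder import build_index
from rag.retriever import retrieve

docs = [{"source": "policy.txt", "text": "Returns are accepted within 30 days of purchase."}]
index = build_index(docs)

for hit in retrieve("What is the return window?", index, top_k=1):
    # Each hit: {"source": str, "text": str, "score": float}, score ≈ cosine similarity
    print(f'{hit["score"]:.3f}  {hit["source"]}  {hit["text"]}')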
requirements-dev.txt
ADDED
@@ -0,0 +1,4 @@
+# Development dependencies (not needed in production)
+pytest>=8.0.0
+black>=24.0.0
+ruff>=0.4.0
requirements.txt
CHANGED
@@ -1,7 +1,9 @@
 gradio>=6.6.0
-sentence-transformers>=
-faiss-cpu>=1.
+sentence-transformers>=5.0.0
+faiss-cpu>=1.9.0
 PyMuPDF>=1.24.0
 python-docx>=1.1.0
-huggingface-hub>=0.
-numpy>=1.
+huggingface-hub>=0.28.0
+numpy>=1.26.0,<3
+python-dotenv>=1.0.0
+tenacity>=8.2.0
tests/smoke_test.py
ADDED
@@ -0,0 +1,48 @@
+"""
+tests/smoke_test.py
+Quick sanity check — verifies imports and a basic FAISS index round-trip.
+Run with: python -m pytest tests/smoke_test.py -v
+"""
+
+import sys
+import os
+
+# Make sure the project root is on the path
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
+
+
+def test_imports():
+    """All RAG modules should import without error."""
+    from rag import document_loader, embedder, retriever, chain  # noqa: F401
+
+
+def test_index_and_retrieve():
+    """Build a tiny FAISS index and assert we get a result back."""
+    from rag.embedder import build_index
+    from rag.retriever import retrieve
+
+    docs = [
+        {"source": "test.txt", "text": "The refund policy allows returns within 30 days of purchase."},
+        {"source": "test.txt", "text": "Contact support at support@example.com for assistance."},
+    ]
+
+    idx = build_index(docs)
+    assert idx.index.ntotal > 0, "Index should have at least one vector"
+
+    results = retrieve("What is the refund policy?", idx, top_k=2)
+    assert len(results) > 0, "Should return at least one result"
+
+    # Cosine similarity scores should be in (0, 1] range
+    for r in results:
+        assert 0.0 <= r["score"] <= 1.01, f"Unexpected score: {r['score']}"
+        assert "source" in r and "text" in r
+
+
+def test_chunk_not_empty():
+    """Chunker should produce non-empty chunks."""
+    from rag.embedder import _chunk_text
+
+    chunks = _chunk_text("doc.txt", "Hello world. " * 100)
+    assert len(chunks) > 0
+    for c in chunks:
+        assert c["text"].strip()