Bhaskar Ram committed on
Commit · 2623b17
1 Parent(s): a465955
fix: sentence-aware chunking, score threshold, DOCX tables, streaming error handling, LLM_MODEL env var
- embedder.py: replace character slicer with sentence-boundary-aware chunker
  (regex split on [.!?] + capital / paragraph breaks, sentence-level overlap)
- retriever.py: add MIN_SCORE=0.30 cosine-similarity threshold to drop
  irrelevant chunks before they reach the LLM
- document_loader.py: extend _load_docx() to extract table cell text
  (previously tables were silently skipped)
- chain.py: split retry logic (connection phase only) from mid-stream error
  handling; partial responses now surfaced on stream interruption
- chain.py + .env.example: LLM_MODEL now read from env var with Llama 3.1 8B
  as fallback (was hardcoded, env override was broken)
Files changed:
- .env.example +3 -1
- rag/chain.py +26 -10
- rag/document_loader.py +16 -2
- rag/embedder.py +62 -11
- rag/retriever.py +9 -1
.env.example CHANGED

@@ -3,8 +3,10 @@
 # Required: Your Hugging Face API token (get one at https://huggingface.co/settings/tokens)
 HF_TOKEN=hf_...

-# Optional: Override the default LLM model
+# Optional: Override the default LLM model (defaults to Llama 3.1 8B if not set)
 # LLM_MODEL=meta-llama/Llama-3.1-8B-Instruct
+# LLM_MODEL=mistralai/Mistral-7B-Instruct-v0.3
+# LLM_MODEL=mistralai/Mixtral-8x7B-Instruct-v0.1

 # Optional: Gradio server settings
 # GRADIO_SERVER_PORT=7860
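For reference, the lookup added in rag/chain.py resolves the override like this (minimal sketch; how the values in .env reach the process environment is outside this diff):

    import os

    model = os.environ.get("LLM_MODEL", "meta-llama/Llama-3.1-8B-Instruct")
    print(model)
    # Unset → meta-llama/Llama-3.1-8B-Instruct (the fallback)
    # Set to e.g. mistralai/Mistral-7B-Instruct-v0.3 → that model is used instead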
rag/chain.py CHANGED

@@ -10,6 +10,7 @@ Upgrades vs original:
 """

 from __future__ import annotations
+import os
 from typing import Generator

 from huggingface_hub import InferenceClient
@@ -30,7 +31,7 @@ Context from uploaded documents:
 ---
 """

-LLM_MODEL = "meta-llama/Llama-3.1-8B-Instruct"
+LLM_MODEL = os.environ.get("LLM_MODEL", "meta-llama/Llama-3.1-8B-Instruct")
 MAX_NEW_TOKENS = 1024
 TEMPERATURE = 0.1  # Low temperature for factual, grounded responses
 MAX_QUERY_CHARS = 2000
@@ -69,8 +70,12 @@ def _build_messages(query: str, context_chunks: list[dict], chat_history: list[d
     retry=retry_if_exception_type(Exception),
     reraise=True,
 )
-def
-    """
+def _open_stream(client: InferenceClient, messages: list[dict]):
+    """
+    Open a streaming connection to the LLM.
+    The @retry decorator governs ONLY this connection phase (handshake / auth /
+    transient 5xx). Mid-stream token errors are handled separately in answer_stream().
+    """
     return client.chat_completion(
         model=LLM_MODEL,
         messages=messages,
@@ -89,6 +94,10 @@ def answer_stream(
     """
     Stream the LLM answer token-by-token.
     Yields the progressively-growing reply string so Gradio can update in real time.
+
+    Error handling:
+      • Connection failures → retried up to 3× before yielding an error message.
+      • Mid-stream failures → partial response is preserved; error notice appended.
     """
     if not context_chunks:
         yield "I don't have that information in the uploaded documents."
@@ -97,15 +106,22 @@ def answer_stream(
     messages = _build_messages(query, context_chunks, chat_history)
     client = InferenceClient(token=hf_token)

+    # Phase 1: open stream (retried automatically by _open_stream)
     try:
-        stream =
+        stream = _open_stream(client, messages)
     except Exception as e:
-        yield f"❌
+        yield f"❌ Could not reach the LLM after 3 attempts: {e}"
         return

+    # Phase 2: consume the stream token-by-token
     accumulated = ""
-
-
-
-
-
+    try:
+        for chunk in stream:
+            delta = chunk.choices[0].delta.content
+            if delta:
+                accumulated += delta
+                yield accumulated
+    except Exception as e:
+        # Surface whatever was streamed so far alongside the error.
+        error_notice = f"\n\n⚠️ *Streaming interrupted: {e}*"
+        yield (accumulated + error_notice) if accumulated else f"❌ Streaming error: {e}"
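A minimal sketch of how a caller sees the two-phase error handling (the argument order of answer_stream() is inferred from the body above; the respond() handler, vector_index, and hf_token wiring are illustrative, not part of this commit):

    from rag.chain import answer_stream
    from rag.retriever import retrieve

    def respond(query, chat_history, vector_index, hf_token):
        chunks = retrieve(query, vector_index)   # already MIN_SCORE-filtered
        for partial in answer_stream(query, chunks, chat_history, hf_token):
            yield partial   # the progressively-growing reply string
        # If the connection cannot be opened, the single yielded value is the
        # "❌ Could not reach the LLM ..." message; if the stream breaks midway,
        # the last yielded value ends with the "⚠️ Streaming interrupted" notice.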
rag/document_loader.py CHANGED

@@ -53,8 +53,22 @@ def _load_pdf(path: str) -> str:
 def _load_docx(path: str) -> str:
     from docx import Document
     doc = Document(path)
-
-
+
+    parts: list[str] = []
+
+    # Body paragraphs (existing)
+    for p in doc.paragraphs:
+        if p.text.strip():
+            parts.append(p.text.strip())
+
+    # Tables — previously skipped entirely
+    for table in doc.tables:
+        for row in table.rows:
+            cells = [cell.text.strip() for cell in row.cells if cell.text.strip()]
+            if cells:
+                parts.append("\t".join(cells))
+
+    return "\n".join(parts)


 def _load_text(path: str) -> str:
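A small round-trip check of the table handling using the python-docx API (the file name and cell values are invented; _load_docx() is the private helper changed above):

    from docx import Document
    from rag.document_loader import _load_docx

    doc = Document()
    doc.add_paragraph("Quarterly summary")
    table = doc.add_table(rows=1, cols=2)
    table.rows[0].cells[0].text = "Revenue"
    table.rows[0].cells[1].text = "1.2M"
    doc.save("sample.docx")

    print(_load_docx("sample.docx"))
    # Before this commit only "Quarterly summary" came back; now the table row
    # is appended as a tab-joined line:
    #   Quarterly summary
    #   Revenue\t1.2M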
rag/embedder.py CHANGED

@@ -7,29 +7,80 @@ from __future__ import annotations
 import numpy as np
 from dataclasses import dataclass, field

-CHUNK_SIZE = 512  # characters
-CHUNK_OVERLAP = 64  # characters
-EMBEDDING_MODEL = "BAAI/bge-small-en-v1.5"  #
+CHUNK_SIZE = 512    # characters — max chars per chunk
+CHUNK_OVERLAP = 64  # characters — approx overlap between consecutive chunks
+EMBEDDING_MODEL = "BAAI/bge-small-en-v1.5"  # State-of-the-art small retrieval model
+
+# Regex: split on sentence-ending punctuation followed by whitespace + capital letter,
+# or on paragraph / line breaks.
+import re as _re
+_SENTENCE_SPLIT = _re.compile(r'(?<=[.!?])\s+(?=[A-Z])|(?<=\n)\s*\n+')


 @dataclass
 class VectorIndex:
     """Holds chunks, their embeddings, and the FAISS index."""
     chunks: list[dict] = field(default_factory=list)  # {"source", "text"}
-    index: object = None  # faiss.
+    index: object = None  # faiss.IndexFlatIP
     embedder: object = None  # SentenceTransformer


 def _chunk_text(source: str, text: str) -> list[dict]:
-    """
-    chunks
-
-
-
-
+    """
+    Split text into overlapping chunks that respect sentence boundaries.
+
+    Instead of slicing at a fixed character offset (which cuts mid-sentence),
+    we:
+      1. Split the document into sentences / paragraphs.
+      2. Greedily accumulate sentences until CHUNK_SIZE is reached.
+      3. For the next chunk, back up by ~CHUNK_OVERLAP chars worth of sentences
+         so consecutive chunks share context at their boundaries.
+    """
+    # Normalise excessive whitespace while preserving paragraph breaks
+    text = _re.sub(r'[ \t]+', ' ', text).strip()
+    sentences = [s.strip() for s in _SENTENCE_SPLIT.split(text) if s.strip()]
+
+    chunks: list[dict] = []
+    i = 0
+
+    while i < len(sentences):
+        # Accumulate sentences until we hit the size limit
+        parts: list[str] = []
+        total = 0
+        j = i
+        while j < len(sentences):
+            slen = len(sentences[j])
+            if total + slen > CHUNK_SIZE and parts:
+                break
+            parts.append(sentences[j])
+            total += slen + 1  # +1 for the space we'll join with
+            j += 1
+
+        chunk_text = " ".join(parts)
         if chunk_text.strip():
             chunks.append({"source": source, "text": chunk_text})
-
+
+        if j == i:
+            # Single sentence longer than CHUNK_SIZE — hard-split it
+            sent = sentences[i]
+            for k in range(0, len(sent), CHUNK_SIZE - CHUNK_OVERLAP):
+                part = sent[k: k + CHUNK_SIZE]
+                if part.strip():
+                    chunks.append({"source": source, "text": part})
+            i += 1
+            continue
+
+        # Slide forward, but overlap by backtracking ~CHUNK_OVERLAP chars
+        overlap_chars = 0
+        next_i = j
+        for k in range(j - 1, i, -1):
+            overlap_chars += len(sentences[k]) + 1
+            if overlap_chars >= CHUNK_OVERLAP:
+                next_i = k
+                break
+
+        i = max(i + 1, next_i)  # always advance at least one sentence
+
     return chunks

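A quick sanity check of the new splitter and chunker (the sample text is invented; the real inputs are the loaded documents):

    from rag.embedder import _chunk_text, _SENTENCE_SPLIT

    sample = (
        "FAISS stores the normalised vectors. Retrieval scores are cosine similarities. "
        "Each chunk keeps its source file name.\n\n"
        "Tables extracted from DOCX files are indexed the same way."
    )

    print(_SENTENCE_SPLIT.split(sample))
    # → four separate sentences: the [.!?] + capital rule makes these splits, and a
    #   paragraph break alone would also split even without a following capital.

    for c in _chunk_text("notes.txt", sample):
        print(c["source"], len(c["text"]))
    # Chunks end on sentence boundaries (a lone sentence longer than CHUNK_SIZE is
    # hard-split), and consecutive chunks overlap by roughly CHUNK_OVERLAP characters.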
rag/retriever.py CHANGED

@@ -10,10 +10,16 @@ from rag.embedder import VectorIndex

 DEFAULT_TOP_K = 5

+# Chunks with a cosine similarity below this threshold are considered
+# too dissimilar to the query and are dropped before reaching the LLM.
+# This prevents low-quality context from polluting the answer.
+# Range: 0.0 (no filtering) → 1.0 (exact match only). 0.30 is a safe default.
+MIN_SCORE = 0.30
+

 def retrieve(query: str, vector_index: VectorIndex, top_k: int = DEFAULT_TOP_K) -> list[dict]:
     """
-    Embed the query and return top_k most similar chunks.
+    Embed the query and return top_k most similar chunks above MIN_SCORE.
     Each result: {"source": str, "text": str, "score": float}
     Scores are cosine similarities (higher = more relevant).
     """
@@ -31,6 +37,8 @@ def retrieve(query: str, vector_index: VectorIndex, top_k: int = DEFAULT_TOP_K)
     for score, idx in zip(scores[0], indices[0]):
         if idx == -1:
             continue
+        if float(score) < MIN_SCORE:
+            continue  # Drop chunks below relevance threshold
         chunk = vector_index.chunks[idx]
         results.append({
             "source": chunk["source"],
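The practical effect of the threshold, with made-up similarity scores for illustration (retrieve() applies the same comparison internally):

    candidates = [
        {"source": "handbook.pdf", "text": "Vacation policy ...",   "score": 0.71},
        {"source": "handbook.pdf", "text": "Expense reports ...",   "score": 0.34},
        {"source": "notes.docx",   "text": "Unrelated meeting ...", "score": 0.22},
    ]

    MIN_SCORE = 0.30
    kept = [c for c in candidates if c["score"] >= MIN_SCORE]
    # kept → the 0.71 and 0.34 chunks; the 0.22 chunk never reaches the prompt.
    # If nothing clears MIN_SCORE, context_chunks is empty and answer_stream()
    # replies "I don't have that information in the uploaded documents."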