"""
embedder.py
Chunks raw text documents and builds an in-memory FAISS vector index.
"""
from __future__ import annotations
import re as _re
import numpy as np
from dataclasses import dataclass, field
from typing import Optional
CHUNK_SIZE = 512  # max characters per chunk of text
CHUNK_OVERLAP = 64  # approximate character overlap between consecutive chunks
EMBEDDING_MODEL = "BAAI/bge-small-en-v1.5"  # SentenceTransformer model id (small English retrieval model)

# Sentence splitter: break after ./!/? followed by whitespace and a capital
# letter, or at blank-line (paragraph) boundaries.
_SENTENCE_SPLIT = _re.compile(r'(?<=[.!?])\s+(?=[A-Z])|(?<=\n)\s*\n+')

# ── Model singleton ───────────────────────────────────────────────────────────
# SentenceTransformer takes 5–15s to load from disk. We load it exactly once
# per process and reuse across all build_index / add_to_index calls.
_MODEL: Optional[object] = None  # cached SentenceTransformer; populated lazily by _get_model()
def _get_model():
    """Lazily create and cache the process-wide SentenceTransformer.

    Loading the model is expensive (several seconds of disk I/O), so the
    instance is stored in the module-level ``_MODEL`` slot and reused by
    every subsequent call.
    """
    global _MODEL
    if _MODEL is not None:
        return _MODEL
    # Deferred import keeps module import cheap when embedding is never used.
    from sentence_transformers import SentenceTransformer
    _MODEL = SentenceTransformer(EMBEDDING_MODEL)
    return _MODEL
# ─────────────────────────────────────────────────────────────────────────────
@dataclass
class VectorIndex:
    """Container pairing text chunks with their embeddings' FAISS index.

    Produced by build_index() and mutated in place by add_to_index(); the
    position of each chunk in `chunks` matches its row in `index`.
    """
    chunks: list[dict] = field(default_factory=list)  # each entry: {"source": ..., "text": ...}
    index: object = None  # faiss.IndexFlatIP over L2-normalised vectors (see build_index)
    embedder: object = None  # the shared SentenceTransformer used to encode chunks
def _chunk_text(source: str, text: str) -> list[dict]:
    """
    Split *text* into overlapping chunks that respect sentence boundaries.

    Strategy:
      1. Split the document into sentences / paragraphs via _SENTENCE_SPLIT.
      2. Greedily accumulate sentences until CHUNK_SIZE characters is reached.
      3. Start the next chunk ~CHUNK_OVERLAP characters worth of sentences
         earlier so consecutive chunks share context at their boundaries.
      4. A single sentence longer than CHUNK_SIZE is hard-split at fixed
         offsets, keeping CHUNK_OVERLAP characters of overlap between pieces.

    Returns a list of {"source": source, "text": chunk} dicts (possibly empty
    for whitespace-only input).
    """
    # Normalise runs of spaces/tabs while preserving paragraph breaks.
    text = _re.sub(r'[ \t]+', ' ', text).strip()
    sentences = [s.strip() for s in _SENTENCE_SPLIT.split(text) if s.strip()]
    chunks: list[dict] = []
    i = 0
    while i < len(sentences):
        # Accumulate sentences until adding one more would exceed the limit.
        parts: list[str] = []
        total = 0
        j = i
        while j < len(sentences):
            slen = len(sentences[j])
            # BUGFIX: this break previously required `parts` to be non-empty,
            # so an oversized *first* sentence was appended whole — `j == i`
            # could never hold, the hard-split branch below was unreachable,
            # and chunks could silently exceed CHUNK_SIZE.
            if total + slen > CHUNK_SIZE:
                break
            parts.append(sentences[j])
            total += slen + 1  # +1 for the space we'll join with
            j += 1
        chunk_text = " ".join(parts)
        if chunk_text.strip():
            chunks.append({"source": source, "text": chunk_text})
        if j == i:
            # Single sentence longer than CHUNK_SIZE — hard-split it.
            sent = sentences[i]
            step = max(1, CHUNK_SIZE - CHUNK_OVERLAP)  # guard against a non-positive stride
            for k in range(0, len(sent), step):
                piece = sent[k: k + CHUNK_SIZE]
                if piece.strip():
                    chunks.append({"source": source, "text": piece})
            i += 1
            continue
        # Slide forward, but overlap by backtracking ~CHUNK_OVERLAP chars.
        overlap_chars = 0
        next_i = j
        for k in range(j - 1, i, -1):
            overlap_chars += len(sentences[k]) + 1
            if overlap_chars >= CHUNK_OVERLAP:
                next_i = k
                break
        i = max(i + 1, next_i)  # always advance at least one sentence
    return chunks
def build_index(docs: list[dict]) -> VectorIndex:
    """Chunk, embed, and index a batch of documents.

    Args:
        docs: list of {"source", "text"} dicts.

    Returns:
        A VectorIndex whose FAISS IndexFlatIP holds one L2-normalised vector
        per chunk (inner product == cosine similarity).

    Raises:
        ValueError: if no non-empty chunks could be produced from `docs`.
    """
    import faiss
    model = _get_model()  # cached singleton — avoids the multi-second reload
    # Flatten every document into its chunk dicts.
    all_chunks = [chunk
                  for doc in docs
                  for chunk in _chunk_text(doc["source"], doc["text"])]
    if not all_chunks:
        raise ValueError("No text chunks could be extracted from the uploaded files.")
    print(f"[Embedder] Embedding {len(all_chunks)} chunks...")
    vectors = np.array(
        model.encode([c["text"] for c in all_chunks],
                     show_progress_bar=False, batch_size=32),
        dtype="float32",
    )
    dim = vectors.shape[1]
    # Normalise so the inner-product index behaves as cosine similarity.
    faiss.normalize_L2(vectors)
    index = faiss.IndexFlatIP(dim)
    index.add(vectors)
    print(f"[Embedder] Index built: {index.ntotal} vectors, dim={dim}")
    return VectorIndex(chunks=all_chunks, index=index, embedder=model)
def add_to_index(vector_index: VectorIndex, docs: list[dict]) -> VectorIndex:
    """Incrementally add new docs to an existing index.

    Args:
        vector_index: a VectorIndex previously produced by build_index().
        docs: list of {"source", "text"} dicts to chunk, embed, and append.

    Returns:
        The same VectorIndex, mutated in place with the new chunks/vectors.
    """
    new_chunks = []
    for doc in docs:
        new_chunks.extend(_chunk_text(doc["source"], doc["text"]))
    if not new_chunks:
        # BUGFIX: without this guard, encoding an empty batch yields an array
        # faiss.normalize_L2 cannot handle and the call crashes. build_index
        # raises for empty input, but for incremental adds a no-op is the
        # friendlier contract — return the index unchanged.
        return vector_index
    import faiss  # deferred: the empty-docs fast path needs no faiss at all
    texts = [c["text"] for c in new_chunks]
    embeddings = vector_index.embedder.encode(texts, show_progress_bar=False, batch_size=32)
    embeddings = np.array(embeddings, dtype="float32")
    # L2-normalise so the appended vectors stay consistent with the cosine
    # (inner-product) index built by build_index.
    faiss.normalize_L2(embeddings)
    vector_index.index.add(embeddings)
    vector_index.chunks.extend(new_chunks)
    return vector_index