Spaces:

ChatbotNova
/

Chatbot-Backend

Sleeping

App Files Files Community

srilakshu012456 committed on Dec 23, 2025

Commit

6f1a758

verified ·

1 Parent(s): 97a9e9a

Create kb_creation.py

Browse files

Files changed (1) hide show

kb_creation.py +433 -0

kb_creation.py ADDED Viewed

	@@ -0,0 +1,433 @@

+import os
+import re
+import pickle
+from typing import List, Dict, Any, Tuple, Optional
+from docx import Document
+from sentence_transformers import SentenceTransformer
+import chromadb
+# ------------------------- ChromaDB setup -------------------------
+# The persistent store lives in "chroma_db" under the current working directory,
+# so its location depends on where the process is started from.
+CHROMA_PATH = os.path.join(os.getcwd(), "chroma_db")
+client = chromadb.PersistentClient(path=CHROMA_PATH)
+collection = client.get_or_create_collection(name="knowledge_base")
+# ------------------------- Embedding model ------------------------
+# You can swap to a multilingual model if you expect mixed language queries:
+# model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
+# The model is loaded from a local directory when this module is imported.
+MODEL_PATH = './models/all-MiniLM-L6-v2'
+model = SentenceTransformer(MODEL_PATH)
+# ------------------------- BM25 (lightweight) ---------------------
+# The BM25 index is pickled into the same directory as the Chroma data.
+BM25_INDEX_FILE = os.path.join(CHROMA_PATH, "bm25_index.pkl")
+# In-memory BM25 state; rebuilt by ingest_documents() and restored by _load_bm25_index().
+bm25_docs: List[Dict[str, Any]] = []      # each: {id, text, tokens, tf, length, meta}
+bm25_inverted: Dict[str, List[int]] = {}  # term -> list of doc indices in bm25_docs
+bm25_df: Dict[str, int] = {}              # term -> document frequency
+bm25_avgdl: float = 0.0                   # average document length in tokens
+bm25_ready: bool = False                  # True once the index holds at least one document
+# BM25 free parameters: K1 is used in the term-frequency part of the score,
+# B in the document-length normalisation (see _bm25_score_for_doc).
+BM25_K1 = 1.5
+BM25_B = 0.75
+# ------------------------- Utilities ------------------------------
+def _tokenize(text: str) -> List[str]:
+    """
+    Simple tokenizer: lowercase alphanumeric words; removes most punctuation.
+    Keeps stopwords (BM25 can work with them), but normalizes whitespace.
+    """
+    if not text:
+        return []
+    text = text.lower()
+    tokens = re.findall(r"[a-z0-9]+", text)
+    return tokens
+def _normalize_query(q: str) -> str:
+    """
+    Language-agnostic normalization for user queries (no hardcoded domain synonyms).
+    Removes filler verbs, collapses whitespace, lowercases, keeps key terms.
+    """
+    q = (q or "").strip().lower()
+    q = re.sub(r"[^\w\s]", " ", q)
+    # remove generic filler verbs/common noise words across English variants
+    q = re.sub(r"\b(facing|get|getting|got|seeing|receiving|encountered|having|observing|issue|problem)\b", " ", q)
+    q = re.sub(r"\s+", " ", q).strip()
+    return q
+# ------------------------- DOCX parsing & chunking ----------------
+def _split_by_sections(doc: Document) -> List[Tuple[str, List[str]]]:
+    """
+    Split DOCX into (section_title, paragraphs_in_section).
+    Uses paragraph style names: 'Heading 1', 'Heading 2', etc.
+    Falls back to document-level when no headings are present.
+    """
+    sections: List[Tuple[str, List[str]]] = []
+    current_title = None
+    current_paras: List[str] = []
+    for para in doc.paragraphs:
+        text = (para.text or "").strip()
+        style_name = (para.style.name if para.style else "") or ""
+        is_heading = bool(re.match(r"Heading\s*\d+", style_name, flags=re.IGNORECASE))
+        if is_heading and text:
+            # commit previous section
+            if current_title or current_paras:
+                sections.append((current_title or "Untitled Section", current_paras))
+            current_title = text
+            current_paras = []
+        else:
+            if text:
+                current_paras.append(text)
+    # final section
+    if current_title or current_paras:
+        sections.append((current_title or "Untitled Section", current_paras))
+    # in case no headings at all, make one pseudo-section with all text
+    if not sections:
+        all_text = [p.text.strip() for p in doc.paragraphs if p.text and p.text.strip()]
+        sections = [("Document", all_text)]
+    return sections
+def _chunk_text_with_context(doc_title: str, section_title: str, paragraphs: List[str], max_words: int = 900) -> List[str]:
+    """
+    Build chunks that keep: Document Title + Section Title + paragraphs,
+    so short bullets like 'Putaway error: ...' stay with their header.
+    """
+    # Join paras for chunking
+    body = "\n".join(paragraphs)
+    words = body.split()
+    chunks: List[str] = []
+    for i in range(0, len(words), max_words):
+        chunk_body = ' '.join(words[i:i + max_words])
+        chunk_text = f"{doc_title}\n{section_title}\n\n{chunk_body}".strip()
+        chunks.append(chunk_text)
+    if not chunks and body:
+        chunks = [f"{doc_title}\n{section_title}\n\n{body}"]
+    return chunks
+# ------------------------- Ingestion ------------------------------
+def ingest_documents(folder_path: str) -> None:
+    """
+    Read .docx files, section-aware chunking, generate embeddings, store in ChromaDB,
+    and build BM25 inverted index with persistence.
+    """
+    print(f"📂 Checking folder: {folder_path}")
+    files = [f for f in os.listdir(folder_path) if f.lower().endswith('.docx')]
+    print(f"Found {len(files)} Word files: {files}")
+    if not files:
+        print("⚠️ No .docx files found. Please check the folder path.")
+        return
+    # Reset BM25 memory structures
+    global bm25_docs, bm25_inverted, bm25_df, bm25_avgdl, bm25_ready
+    bm25_docs = []
+    bm25_inverted = {}
+    bm25_df = {}
+    bm25_avgdl = 0.0
+    bm25_ready = False
+    for file in files:
+        file_path = os.path.join(folder_path, file)
+        doc_title = os.path.splitext(file)[0]
+        doc = Document(file_path)
+        sections = _split_by_sections(doc)
+        total_chunks = 0
+        for s_idx, (section_title, paras) in enumerate(sections):
+            chunks = _chunk_text_with_context(doc_title, section_title, paras, max_words=900)
+            total_chunks += len(chunks)
+            for c_idx, chunk in enumerate(chunks):
+                # Embedding & Chroma
+                embedding = model.encode(chunk).tolist()
+                doc_id = f"{file}:{s_idx}:{c_idx}"  # stable unique id
+                meta = {"filename": file, "section": section_title, "chunk_index": c_idx, "title": doc_title, "collection": "SOP"}
+                try:
+                    collection.add(
+                        ids=[doc_id],
+                        embeddings=[embedding],
+                        documents=[chunk],
+                        metadatas=[meta],
+                    )
+                except Exception as e:
+                    # upsert on duplicate
+                    try:
+                        collection.delete(ids=[doc_id])
+                        collection.add(
+                            ids=[doc_id],
+                            embeddings=[embedding],
+                            documents=[chunk],
+                            metadatas=[meta],
+                        )
+                    except Exception as e2:
+                        print(f"❌ Upsert failed for {doc_id}: {e2}")
+                # BM25 indexing
+                tokens = _tokenize(chunk)
+                tf: Dict[str, int] = {}
+                for t in tokens:
+                    tf[t] = tf.get(t, 0) + 1
+                idx = len(bm25_docs)
+                bm25_docs.append({"id": doc_id, "text": chunk, "tokens": tokens, "tf": tf, "length": len(tokens), "meta": meta})
+                # update inverted index & df
+                seen_terms = set()
+                for term in tf.keys():
+                    bm25_inverted.setdefault(term, []).append(idx)
+                    if term not in seen_terms:
+                        bm25_df[term] = bm25_df.get(term, 0) + 1
+                        seen_terms.add(term)
+        print(f"📄 Ingested {file} → {total_chunks} chunks")
+    # finalize BM25 stats
+    N = len(bm25_docs)
+    if N > 0:
+        bm25_avgdl = sum(d["length"] for d in bm25_docs) / float(N)
+        bm25_ready = True
+        # persist BM25 index
+        payload = {
+            "bm25_docs": bm25_docs,
+            "bm25_inverted": bm25_inverted,
+            "bm25_df": bm25_df,
+            "bm25_avgdl": bm25_avgdl,
+            "BM25_K1": BM25_K1,
+            "BM25_B": BM25_B,
+        }
+        os.makedirs(CHROMA_PATH, exist_ok=True)
+        with open(BM25_INDEX_FILE, "wb") as f:
+            pickle.dump(payload, f)
+        print(f"✅ BM25 index saved: {BM25_INDEX_FILE}")
+    print(f"✅ Documents ingested. Total entries in Chroma: {collection.count()}")
+def _load_bm25_index() -> None:
+    """
+    Load persisted BM25 index if available.
+    """
+    global bm25_docs, bm25_inverted, bm25_df, bm25_avgdl, bm25_ready
+    if not os.path.exists(BM25_INDEX_FILE):
+        return
+    try:
+        with open(BM25_INDEX_FILE, "rb") as f:
+            payload = pickle.load(f)
+        bm25_docs = payload.get("bm25_docs", [])
+        bm25_inverted = payload.get("bm25_inverted", {})
+        bm25_df = payload.get("bm25_df", {})
+        bm25_avgdl = payload.get("bm25_avgdl", 0.0)
+        # params retained but we keep module-level constants
+        bm25_ready = len(bm25_docs) > 0
+        if bm25_ready:
+            print(f"✅ BM25 index loaded: {BM25_INDEX_FILE} (docs={len(bm25_docs)})")
+    except Exception as e:
+        print(f"⚠️ Could not load BM25 index: {e}")
+# Auto-load on import: restores a previously persisted BM25 index (if the file
+# exists) so bm25_search() can be used without calling ingest_documents() first.
+_load_bm25_index()
+# ------------------------- BM25 search ----------------------------------------
+def _bm25_score_for_doc(query_terms: List[str], doc_idx: int) -> float:
+    """
+    Okapi BM25 score for a given doc.
+    """
+    if not bm25_ready or doc_idx < 0 or doc_idx >= len(bm25_docs):
+        return 0.0
+    doc = bm25_docs[doc_idx]
+    score = 0.0
+    dl = doc["length"] or 1
+    for term in query_terms:
+        df = bm25_df.get(term, 0)
+        if df == 0:
+            continue
+        tf = doc["tf"].get(term, 0)
+        if tf == 0:
+            continue
+        # BM25 idf
+        N = len(bm25_docs)
+        idf = max(0.0, ( (N - df + 0.5) / (df + 0.5) ))
+        idf = (idf if idf > 0 else 1.0)
+        idf = 1.0 * ( (N - df + 0.5) / (df + 0.5) )  # raw ratio
+        # typical log form
+        try:
+            import math
+            idf = math.log(idf + 1.0)
+        except Exception:
+            pass
+        denom = tf + BM25_K1 * (1 - BM25_B + BM25_B * (dl / (bm25_avgdl or 1.0)))
+        score += idf * ( (tf * (BM25_K1 + 1)) / (denom or 1.0) )
+    return score
+def bm25_search(query: str, top_k: int = 50) -> List[Tuple[int, float]]:
+    """
+    Returns a list of (doc_idx, score) sorted by score desc.
+    """
+    if not bm25_ready:
+        return []
+    norm = _normalize_query(query)
+    q_terms = _tokenize(norm)
+    if not q_terms:
+        return []
+    # collect candidate doc indices via inverted index
+    candidates = set()
+    for t in q_terms:
+        for idx in bm25_inverted.get(t, []):
+            candidates.add(idx)
+    if not candidates:
+        # fallback to brute force if no inverted match
+        candidates = set(range(len(bm25_docs)))
+    scored = []
+    for idx in candidates:
+        s = _bm25_score_for_doc(q_terms, idx)
+        if s > 0:
+            scored.append((idx, s))
+    scored.sort(key=lambda x: x[1], reverse=True)
+    return scored[:top_k]
+# ------------------------- Semantic-only (legacy) ------------------------------
+def search_knowledge_base(query: str, top_k: int = 10) -> dict:
+    """
+    Semantic-only search (Chroma). We DO NOT ask for 'ids' in include
+    because some Chroma clients reject it; if 'ids' is present in the
+    response we will use it, otherwise we synthesize stable IDs from metadata.
+    """
+    query_embedding = model.encode(query).tolist()
+    res = collection.query(
+        query_embeddings=[query_embedding],
+        n_results=top_k,
+        include=['documents', 'metadatas', 'distances']  # ← no 'ids' here
+    )
+    # Flatten lists-per-query
+    docs_ll  = res.get("documents", [[]]) or [[]]
+    metas_ll = res.get("metadatas", [[]]) or [[]]
+    dists_ll = res.get("distances", [[]]) or [[]]
+    ids_ll   = res.get("ids", [[]]) or [[]]  # some clients still return 'ids' anyway
+    documents = docs_ll[0]  if docs_ll  else []
+    metadatas = metas_ll[0] if metas_ll else []
+    distances = dists_ll[0] if dists_ll else []
+    ids       = ids_ll[0]   if ids_ll   else []
+    # If 'ids' is missing, synthesize stable IDs from metadata
+    if not ids and documents:
+        synthesized = []
+        for i, m in enumerate(metadatas):
+            fn   = (m or {}).get("filename", "unknown")
+            sec  = (m or {}).get("section", "section")
+            idx  = (m or {}).get("chunk_index", i)
+            synthesized.append(f"{fn}:{sec}:{idx}")
+        ids = synthesized
+    print(f"🔎 KB search → {len(documents)} docs (top_k={top_k}); "
+          f"first distance: {distances[0] if distances else 'n/a'}; ids={len(ids)}")
+    return {
+        "documents": documents,
+        "metadatas": metadatas,
+        "distances": distances,
+        "ids": ids,
+    }
+# ------------------------- Hybrid (BM25 + Embeddings) -------------------------
+def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6, beta: float = 0.4) -> dict:
+    """
+    Hybrid retrieval:
+      - Semantic (Chroma/embeddings) → distances (lower = better) → convert to similarity
+      - BM25 keyword → score (higher = better)
+      - Re-rank union of candidates by: final = alpha * semantic_sim + beta * bm25_norm
+    Returns a dict compatible with the extractor but also includes:
+      - 'ids': list[str]
+      - 'combined_scores': list[float] (0..1)
+      - 'distances': list[float] from semantic (may be missing if fetched from BM25-only)
+    """
+    # 1) Normalize query (language-agnostic, no domain synonyms)
+    norm_query = _normalize_query(query)
+    # 2) Semantic candidates (Chroma)
+    sem_res = search_knowledge_base(norm_query, top_k=max(top_k, 30))
+    sem_docs = sem_res.get("documents", [])
+    sem_metas = sem_res.get("metadatas", [])
+    sem_dists = sem_res.get("distances", [])
+    sem_ids = sem_res.get("ids", [])
+    # Convert distances to 0..1 similarity (simple monotonic mapping)
+    def dist_to_sim(d: Optional[float]) -> float:
+        if d is None:
+            return 0.0
+        try:
+            return 1.0 / (1.0 + float(d))  # lower distance -> higher sim
+        except Exception:
+            return 0.0
+    sem_sims = [dist_to_sim(d) for d in sem_dists]
+    # 3) BM25 candidates
+    bm25_hits = bm25_search(norm_query, top_k=max(50, top_k * 5))
+    # normalize BM25 scores to 0..1
+    bm25_max = max([s for _, s in bm25_hits], default=1.0)
+    bm25_norm_pairs = [(idx, (score / bm25_max) if bm25_max > 0 else 0.0) for idx, score in bm25_hits]
+    # 4) Merge candidates by doc_id
+    # For BM25 doc_idx → get doc info
+    bm25_id_to_norm: Dict[str, float] = {}
+    bm25_id_to_text: Dict[str, str] = {}
+    bm25_id_to_meta: Dict[str, Dict[str, Any]] = {}
+    for idx, nscore in bm25_norm_pairs:
+        d = bm25_docs[idx]
+        bm25_id_to_norm[d["id"]] = nscore
+        bm25_id_to_text[d["id"]] = d["text"]
+        bm25_id_to_meta[d["id"]] = d["meta"]
+    # Build union
+    union_ids = set(sem_ids) | set(bm25_id_to_norm.keys())
+    # 5) For each candidate id, compute combined score and collect fields
+    combined_records: List[Tuple[str, float, float, str, Dict[str, Any]]] = []
+    for cid in union_ids:
+        # semantic part
+        if cid in sem_ids:
+            pos = sem_ids.index(cid)
+            sem_sim = sem_sims[pos] if pos < len(sem_sims) else 0.0
+            sem_dist = sem_dists[pos] if pos < len(sem_dists) else None
+            sem_text = sem_docs[pos] if pos < len(sem_docs) else ""
+            sem_meta = sem_metas[pos] if pos < len(sem_metas) else {}
+        else:
+            sem_sim, sem_dist, sem_text, sem_meta = 0.0, None, "", {}
+        # bm25 part
+        bm25_sim = bm25_id_to_norm.get(cid, 0.0)
+        bm25_text = bm25_id_to_text.get(cid, "")
+        bm25_meta = bm25_id_to_meta.get(cid, {})
+        # prefer non-empty text/meta
+        text = sem_text if sem_text else bm25_text
+        meta = sem_meta if sem_meta else bm25_meta
+        # final combined score
+        final_score = alpha * sem_sim + beta * bm25_sim
+        combined_records.append((cid, final_score, (sem_dist if sem_dist is not None else 999.0), text, meta))
+    # 6) Sort by combined score desc and take top_k
+    combined_records.sort(key=lambda x: x[1], reverse=True)
+    top = combined_records[:top_k]
+    documents = [t[3] for t in top]
+    metadatas = [t[4] for t in top]
+    distances = [t[2] for t in top]              # keep semantic distance (999 if BM25-only)
+    ids        = [t[0] for t in top]
+    combined_scores = [t[1] for t in top]
+    return {
+        "documents": documents,
+        "metadatas": metadatas,
+        "distances": distances,
+        "ids": ids,
+        "combined_scores": combined_scores,
+    }