Spaces:

ChatbotNova
/

Chatbot-Backend

Sleeping

App Files Files Community

srilakshu012456 committed on Dec 23, 2025

Commit

f89383d

verified ·

1 Parent(s): 18ab6d6

Update services/kb_creation.py

Browse files

Files changed (1) hide show

services/kb_creation.py +126 -72

services/kb_creation.py CHANGED Viewed

@@ -6,31 +6,30 @@ from typing import List, Dict, Any, Tuple, Optional
 from docx import Document
 from sentence_transformers import SentenceTransformer
 import chromadb
-# ------------------------- ChromaDB setup -------------------------
 CHROMA_PATH = os.path.join(os.getcwd(), "chroma_db")
 client = chromadb.PersistentClient(path=CHROMA_PATH)
 collection = client.get_or_create_collection(name="knowledge_base")
-# ------------------------- Embedding model ------------------------
 # You can swap to a multilingual model if you expect mixed language queries:
 # model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
-#MODEL_PATH = './models/all-MiniLM-L6-v2'
-#model = SentenceTransformer(MODEL_PATH)
 model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
-# ------------------------- BM25 (lightweight) ---------------------
 BM25_INDEX_FILE = os.path.join(CHROMA_PATH, "bm25_index.pkl")
-bm25_docs: List[Dict[str, Any]] = []      # each: {id, text, tokens, tf, length, meta}
 bm25_inverted: Dict[str, List[int]] = {}  # term -> list of doc indices in bm25_docs
-bm25_df: Dict[str, int] = {}              # term -> document frequency
 bm25_avgdl: float = 0.0
 bm25_ready: bool = False
 BM25_K1 = 1.5
 BM25_B = 0.75
-# ------------------------- Utilities ------------------------------
 def _tokenize(text: str) -> List[str]:
     """
     Simple tokenizer: lowercase alphanumeric words; removes most punctuation.
@@ -50,11 +49,20 @@ def _normalize_query(q: str) -> str:
     q = (q or "").strip().lower()
     q = re.sub(r"[^\w\s]", " ", q)
     # remove generic filler verbs/common noise words across English variants
-    q = re.sub(r"\b(facing|get|getting|got|seeing|receiving|encountered|having|observing|issue|problem)\b", " ", q)
     q = re.sub(r"\s+", " ", q).strip()
     return q
-# ------------------------- DOCX parsing & chunking ----------------
 def _split_by_sections(doc: Document) -> List[Tuple[str, List[str]]]:
     """
     Split DOCX into (section_title, paragraphs_in_section).
@@ -64,12 +72,10 @@ def _split_by_sections(doc: Document) -> List[Tuple[str, List[str]]]:
     sections: List[Tuple[str, List[str]]] = []
     current_title = None
     current_paras: List[str] = []
     for para in doc.paragraphs:
         text = (para.text or "").strip()
         style_name = (para.style.name if para.style else "") or ""
         is_heading = bool(re.match(r"Heading\s*\d+", style_name, flags=re.IGNORECASE))
         if is_heading and text:
             # commit previous section
             if current_title or current_paras:
@@ -79,16 +85,13 @@ def _split_by_sections(doc: Document) -> List[Tuple[str, List[str]]]:
         else:
             if text:
                 current_paras.append(text)
     # final section
     if current_title or current_paras:
         sections.append((current_title or "Untitled Section", current_paras))
     # in case no headings at all, make one pseudo-section with all text
     if not sections:
         all_text = [p.text.strip() for p in doc.paragraphs if p.text and p.text.strip()]
         sections = [("Document", all_text)]
     return sections
 def _chunk_text_with_context(doc_title: str, section_title: str, paragraphs: List[str], max_words: int = 900) -> List[str]:
@@ -109,7 +112,7 @@ def _chunk_text_with_context(doc_title: str, section_title: str, paragraphs: Lis
         chunks = [body]
     return chunks
-# ------------------------- Ingestion ------------------------------
 def ingest_documents(folder_path: str) -> None:
     """
     Read .docx files, section-aware chunking, generate embeddings, store in ChromaDB,
@@ -140,13 +143,11 @@ def ingest_documents(folder_path: str) -> None:
         for s_idx, (section_title, paras) in enumerate(sections):
             chunks = _chunk_text_with_context(doc_title, section_title, paras, max_words=900)
             total_chunks += len(chunks)
             for c_idx, chunk in enumerate(chunks):
                 # Embedding & Chroma
                 embedding = model.encode(chunk).tolist()
                 doc_id = f"{file}:{s_idx}:{c_idx}"  # stable unique id
                 meta = {"filename": file, "section": section_title, "chunk_index": c_idx, "title": doc_title, "collection": "SOP"}
                 try:
                     collection.add(
                         ids=[doc_id],
@@ -154,7 +155,7 @@ def ingest_documents(folder_path: str) -> None:
                         documents=[chunk],
                         metadatas=[meta],
                     )
-                except Exception as e:
                     # upsert on duplicate
                     try:
                         collection.delete(ids=[doc_id])
@@ -190,20 +191,19 @@ def ingest_documents(folder_path: str) -> None:
         bm25_avgdl = sum(d["length"] for d in bm25_docs) / float(N)
         bm25_ready = True
-        # persist BM25 index
-        payload = {
-            "bm25_docs": bm25_docs,
-            "bm25_inverted": bm25_inverted,
-            "bm25_df": bm25_df,
-            "bm25_avgdl": bm25_avgdl,
-            "BM25_K1": BM25_K1,
-            "BM25_B": BM25_B,
-        }
-        os.makedirs(CHROMA_PATH, exist_ok=True)
-        with open(BM25_INDEX_FILE, "wb") as f:
-            pickle.dump(payload, f)
-        print(f"✅ BM25 index saved: {BM25_INDEX_FILE}")
     print(f"✅ Documents ingested. Total entries in Chroma: {collection.count()}")
 def _load_bm25_index() -> None:
@@ -230,7 +230,7 @@ def _load_bm25_index() -> None:
 # auto-load on import
 _load_bm25_index()
-# ------------------------- BM25 search ----------------------------------------
 def _bm25_score_for_doc(query_terms: List[str], doc_idx: int) -> float:
     """
     Okapi BM25 score for a given doc.
@@ -249,18 +249,14 @@ def _bm25_score_for_doc(query_terms: List[str], doc_idx: int) -> float:
             continue
         # BM25 idf
         N = len(bm25_docs)
-        idf = max(0.0, ( (N - df + 0.5) / (df + 0.5) ))
-        idf = (idf if idf > 0 else 1.0)
-        idf = 1.0 * ( (N - df + 0.5) / (df + 0.5) )  # raw ratio
-        # typical log form
         try:
             import math
-            idf = math.log(idf + 1.0)
         except Exception:
-            pass
         denom = tf + BM25_K1 * (1 - BM25_B + BM25_B * (dl / (bm25_avgdl or 1.0)))
-        score += idf * ( (tf * (BM25_K1 + 1)) / (denom or 1.0) )
     return score
 def bm25_search(query: str, top_k: int = 50) -> List[Tuple[int, float]]:
@@ -273,6 +269,7 @@ def bm25_search(query: str, top_k: int = 50) -> List[Tuple[int, float]]:
     q_terms = _tokenize(norm)
     if not q_terms:
         return []
     # collect candidate doc indices via inverted index
     candidates = set()
     for t in q_terms:
@@ -290,8 +287,7 @@ def bm25_search(query: str, top_k: int = 50) -> List[Tuple[int, float]]:
     scored.sort(key=lambda x: x[1], reverse=True)
     return scored[:top_k]
-# ------------------------- Semantic-only (legacy) ------------------------------
 def search_knowledge_base(query: str, top_k: int = 10) -> dict:
     """
     Semantic-only search (Chroma). We DO NOT ask for 'ids' in include
@@ -306,29 +302,28 @@ def search_knowledge_base(query: str, top_k: int = 10) -> dict:
     )
     # Flatten lists-per-query
-    docs_ll  = res.get("documents", [[]]) or [[]]
     metas_ll = res.get("metadatas", [[]]) or [[]]
     dists_ll = res.get("distances", [[]]) or [[]]
-    ids_ll   = res.get("ids", [[]]) or [[]]  # some clients still return 'ids' anyway
-    documents = docs_ll[0]  if docs_ll  else []
     metadatas = metas_ll[0] if metas_ll else []
     distances = dists_ll[0] if dists_ll else []
-    ids       = ids_ll[0]   if ids_ll   else []
     # If 'ids' is missing, synthesize stable IDs from metadata
     if not ids and documents:
         synthesized = []
         for i, m in enumerate(metadatas):
-            fn   = (m or {}).get("filename", "unknown")
-            sec  = (m or {}).get("section", "section")
-            idx  = (m or {}).get("chunk_index", i)
             synthesized.append(f"{fn}:{sec}:{idx}")
         ids = synthesized
     print(f"🔎 KB search → {len(documents)} docs (top_k={top_k}); "
           f"first distance: {distances[0] if distances else 'n/a'}; ids={len(ids)}")
     return {
         "documents": documents,
         "metadatas": metadatas,
@@ -336,21 +331,39 @@ def search_knowledge_base(query: str, top_k: int = 10) -> dict:
         "ids": ids,
     }
-# ------------------------- Hybrid (BM25 + Embeddings) -------------------------
 def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6, beta: float = 0.4) -> dict:
     """
     Hybrid retrieval:
       - Semantic (Chroma/embeddings) → distances (lower = better) → convert to similarity
       - BM25 keyword → score (higher = better)
-      - Re-rank union of candidates by: final = alpha * semantic_sim + beta * bm25_norm
-    Returns a dict compatible with the extractor but also includes:
       - 'ids': list[str]
-      - 'combined_scores': list[float] (0..1)
-      - 'distances': list[float] from semantic (may be missing if fetched from BM25-only)
     """
-    # 1) Normalize query (language-agnostic, no domain synonyms)
     norm_query = _normalize_query(query)
     # 2) Semantic candidates (Chroma)
     sem_res = search_knowledge_base(norm_query, top_k=max(top_k, 30))
@@ -377,21 +390,22 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
     bm25_norm_pairs = [(idx, (score / bm25_max) if bm25_max > 0 else 0.0) for idx, score in bm25_hits]
     # 4) Merge candidates by doc_id
-    # For BM25 doc_idx → get doc info
     bm25_id_to_norm: Dict[str, float] = {}
     bm25_id_to_text: Dict[str, str] = {}
     bm25_id_to_meta: Dict[str, Dict[str, Any]] = {}
     for idx, nscore in bm25_norm_pairs:
         d = bm25_docs[idx]
         bm25_id_to_norm[d["id"]] = nscore
         bm25_id_to_text[d["id"]] = d["text"]
         bm25_id_to_meta[d["id"]] = d["meta"]
-    # Build union
     union_ids = set(sem_ids) | set(bm25_id_to_norm.keys())
-    # 5) For each candidate id, compute combined score and collect fields
-    combined_records: List[Tuple[str, float, float, str, Dict[str, Any]]] = []
     for cid in union_ids:
         # semantic part
         if cid in sem_ids:
@@ -412,18 +426,56 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
         text = sem_text if sem_text else bm25_text
         meta = sem_meta if sem_meta else bm25_meta
         # final combined score
-        final_score = alpha * sem_sim + beta * bm25_sim
-        combined_records.append((cid, final_score, (sem_dist if sem_dist is not None else 999.0), text, meta))
-    # 6) Sort by combined score desc and take top_k
-    combined_records.sort(key=lambda x: x[1], reverse=True)
-    top = combined_records[:top_k]
     documents = [t[3] for t in top]
     metadatas = [t[4] for t in top]
-    distances = [t[2] for t in top]              # keep semantic distance (999 if BM25-only)
-    ids        = [t[0] for t in top]
     combined_scores = [t[1] for t in top]
     return {
@@ -432,4 +484,6 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
         "distances": distances,
         "ids": ids,
         "combined_scores": combined_scores,
     }

 from docx import Document
 from sentence_transformers import SentenceTransformer
 import chromadb
+#updated
+# --------------------------- ChromaDB setup ---------------------------
 CHROMA_PATH = os.path.join(os.getcwd(), "chroma_db")
 client = chromadb.PersistentClient(path=CHROMA_PATH)
 collection = client.get_or_create_collection(name="knowledge_base")
+# --------------------------- Embedding model ---------------------------
 # You can swap to a multilingual model if you expect mixed language queries:
 # model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
+# MODEL_PATH = './models/all-MiniLM-L6-v2'
+# model = SentenceTransformer(MODEL_PATH)
 model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
+# --------------------------- BM25 (lightweight) ---------------------------
 BM25_INDEX_FILE = os.path.join(CHROMA_PATH, "bm25_index.pkl")
+bm25_docs: List[Dict[str, Any]] = []  # each: {id, text, tokens, tf, length, meta}
 bm25_inverted: Dict[str, List[int]] = {}  # term -> list of doc indices in bm25_docs
+bm25_df: Dict[str, int] = {}  # term -> document frequency
 bm25_avgdl: float = 0.0
 bm25_ready: bool = False
 BM25_K1 = 1.5
 BM25_B = 0.75
+# --------------------------- Utilities ---------------------------
 def _tokenize(text: str) -> List[str]:
     """
     Simple tokenizer: lowercase alphanumeric words; removes most punctuation.
     q = (q or "").strip().lower()
     q = re.sub(r"[^\w\s]", " ", q)
     # remove generic filler verbs/common noise words across English variants
+    q = re.sub(
+        r"\b(facing|get|getting|got|seeing|receiving|encountered|having|observing|issue|problem)\b",
+        " ",
+        q,
+    )
     q = re.sub(r"\s+", " ", q).strip()
     return q
+def _tokenize_meta_value(val: Optional[str]) -> List[str]:
+    if not val:
+        return []
+    return _tokenize(val)
+# --------------------------- DOCX parsing & chunking ---------------------------
 def _split_by_sections(doc: Document) -> List[Tuple[str, List[str]]]:
     """
     Split DOCX into (section_title, paragraphs_in_section).
     sections: List[Tuple[str, List[str]]] = []
     current_title = None
     current_paras: List[str] = []
     for para in doc.paragraphs:
         text = (para.text or "").strip()
         style_name = (para.style.name if para.style else "") or ""
         is_heading = bool(re.match(r"Heading\s*\d+", style_name, flags=re.IGNORECASE))
         if is_heading and text:
             # commit previous section
             if current_title or current_paras:
         else:
             if text:
                 current_paras.append(text)
     # final section
     if current_title or current_paras:
         sections.append((current_title or "Untitled Section", current_paras))
     # in case no headings at all, make one pseudo-section with all text
     if not sections:
         all_text = [p.text.strip() for p in doc.paragraphs if p.text and p.text.strip()]
         sections = [("Document", all_text)]
     return sections
 def _chunk_text_with_context(doc_title: str, section_title: str, paragraphs: List[str], max_words: int = 900) -> List[str]:
         chunks = [body]
     return chunks
+# --------------------------- Ingestion ---------------------------
 def ingest_documents(folder_path: str) -> None:
     """
     Read .docx files, section-aware chunking, generate embeddings, store in ChromaDB,
         for s_idx, (section_title, paras) in enumerate(sections):
             chunks = _chunk_text_with_context(doc_title, section_title, paras, max_words=900)
             total_chunks += len(chunks)
             for c_idx, chunk in enumerate(chunks):
                 # Embedding & Chroma
                 embedding = model.encode(chunk).tolist()
                 doc_id = f"{file}:{s_idx}:{c_idx}"  # stable unique id
                 meta = {"filename": file, "section": section_title, "chunk_index": c_idx, "title": doc_title, "collection": "SOP"}
                 try:
                     collection.add(
                         ids=[doc_id],
                         documents=[chunk],
                         metadatas=[meta],
                     )
+                except Exception:
                     # upsert on duplicate
                     try:
                         collection.delete(ids=[doc_id])
         bm25_avgdl = sum(d["length"] for d in bm25_docs) / float(N)
         bm25_ready = True
+    # persist BM25 index
+    payload = {
+        "bm25_docs": bm25_docs,
+        "bm25_inverted": bm25_inverted,
+        "bm25_df": bm25_df,
+        "bm25_avgdl": bm25_avgdl,
+        "BM25_K1": BM25_K1,
+        "BM25_B": BM25_B,
+    }
+    os.makedirs(CHROMA_PATH, exist_ok=True)
+    with open(BM25_INDEX_FILE, "wb") as f:
+        pickle.dump(payload, f)
+    print(f"✅ BM25 index saved: {BM25_INDEX_FILE}")
     print(f"✅ Documents ingested. Total entries in Chroma: {collection.count()}")
 def _load_bm25_index() -> None:
 # auto-load on import
 _load_bm25_index()
+# --------------------------- BM25 search ---------------------------
 def _bm25_score_for_doc(query_terms: List[str], doc_idx: int) -> float:
     """
     Okapi BM25 score for a given doc.
             continue
         # BM25 idf
         N = len(bm25_docs)
+        idf_ratio = ( (N - df + 0.5) / (df + 0.5) )
         try:
             import math
+            idf = math.log(idf_ratio + 1.0)
         except Exception:
+            idf = 1.0
         denom = tf + BM25_K1 * (1 - BM25_B + BM25_B * (dl / (bm25_avgdl or 1.0)))
+        score += idf * ((tf * (BM25_K1 + 1)) / (denom or 1.0))
     return score
 def bm25_search(query: str, top_k: int = 50) -> List[Tuple[int, float]]:
     q_terms = _tokenize(norm)
     if not q_terms:
         return []
     # collect candidate doc indices via inverted index
     candidates = set()
     for t in q_terms:
     scored.sort(key=lambda x: x[1], reverse=True)
     return scored[:top_k]
+# --------------------------- Semantic-only (legacy) ---------------------------
 def search_knowledge_base(query: str, top_k: int = 10) -> dict:
     """
     Semantic-only search (Chroma). We DO NOT ask for 'ids' in include
     )
     # Flatten lists-per-query
+    docs_ll = res.get("documents", [[]]) or [[]]
     metas_ll = res.get("metadatas", [[]]) or [[]]
     dists_ll = res.get("distances", [[]]) or [[]]
+    ids_ll = res.get("ids", [[]]) or [[]]  # some clients still return 'ids' anyway
+    documents = docs_ll[0] if docs_ll else []
     metadatas = metas_ll[0] if metas_ll else []
     distances = dists_ll[0] if dists_ll else []
+    ids = ids_ll[0] if ids_ll else []
     # If 'ids' is missing, synthesize stable IDs from metadata
     if not ids and documents:
         synthesized = []
         for i, m in enumerate(metadatas):
+            fn = (m or {}).get("filename", "unknown")
+            sec = (m or {}).get("section", "section")
+            idx = (m or {}).get("chunk_index", i)
             synthesized.append(f"{fn}:{sec}:{idx}")
         ids = synthesized
     print(f"🔎 KB search → {len(documents)} docs (top_k={top_k}); "
           f"first distance: {distances[0] if distances else 'n/a'}; ids={len(ids)}")
     return {
         "documents": documents,
         "metadatas": metadatas,
         "ids": ids,
     }
+# --------------------------- Hybrid (BM25 + Embeddings) ---------------------------
+def _meta_overlap(meta: Dict[str, Any], q_terms: List[str]) -> float:
+    """
+    Automatic metadata overlap score (no manual module list).
+    Uses filename, title, and section tokens. Range ~0..1.
+    """
+    if not meta:
+        return 0.0
+    fn_tokens = _tokenize_meta_value(meta.get("filename"))
+    title_tokens = _tokenize_meta_value(meta.get("title"))
+    section_tokens = _tokenize_meta_value(meta.get("section"))
+    meta_tokens = set(fn_tokens + title_tokens + section_tokens)
+    if not meta_tokens or not q_terms:
+        return 0.0
+    qset = set(q_terms)
+    inter = len(meta_tokens & qset)
+    return inter / max(1, len(qset))
 def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6, beta: float = 0.4) -> dict:
     """
     Hybrid retrieval:
       - Semantic (Chroma/embeddings) → distances (lower = better) → convert to similarity
       - BM25 keyword → score (higher = better)
+      - Re-rank union of candidates by:
+          final = alpha * semantic_sim + beta * bm25_norm + gamma * meta_overlap
+      - Document-level voting prior: aggregate scores by 'filename' and prefer the best document first.
+    Returns a dict compatible with the extractor and includes:
       - 'ids': list[str]
+      - 'combined_scores': list[float] (0..1ish)
     """
+    # 1) Normalize query (language-agnostic)
     norm_query = _normalize_query(query)
+    q_terms = _tokenize(norm_query)
     # 2) Semantic candidates (Chroma)
     sem_res = search_knowledge_base(norm_query, top_k=max(top_k, 30))
     bm25_norm_pairs = [(idx, (score / bm25_max) if bm25_max > 0 else 0.0) for idx, score in bm25_hits]
     # 4) Merge candidates by doc_id
     bm25_id_to_norm: Dict[str, float] = {}
     bm25_id_to_text: Dict[str, str] = {}
     bm25_id_to_meta: Dict[str, Dict[str, Any]] = {}
     for idx, nscore in bm25_norm_pairs:
         d = bm25_docs[idx]
         bm25_id_to_norm[d["id"]] = nscore
         bm25_id_to_text[d["id"]] = d["text"]
         bm25_id_to_meta[d["id"]] = d["meta"]
     union_ids = set(sem_ids) | set(bm25_id_to_norm.keys())
+    gamma = 0.25  # metadata boost weight (tunable)
+    combined_records_ext: List[Tuple[str, float, float, str, Dict[str, Any], float]] = []  # include meta_overlap
     for cid in union_ids:
         # semantic part
         if cid in sem_ids:
         text = sem_text if sem_text else bm25_text
         meta = sem_meta if sem_meta else bm25_meta
+        # NEW: automatic metadata overlap (no manual lists)
+        m_overlap = _meta_overlap(meta, q_terms)
         # final combined score
+        final_score = alpha * sem_sim + beta * bm25_sim + gamma * m_overlap
+        combined_records_ext.append(
+            (cid, final_score, (sem_dist if sem_dist is not None else 999.0), text, meta, m_overlap)
+        )
+    # ---------------- Document-level voting prior ----------------
+    # Group by filename and compute aggregate doc score → prefer best doc first
+    from collections import defaultdict
+    doc_groups: Dict[str, List[Tuple[str, float, float, str, Dict[str, Any], float]]] = defaultdict(list)
+    for rec in combined_records_ext:
+        meta = rec[4] or {}
+        fn = meta.get("filename", "unknown")
+        doc_groups[fn].append(rec)
+    # Compute doc_prior = sum(final_score) + small bonus for metadata overlap sum
+    def doc_prior(recs: List[Tuple[str, float, float, str, Dict[str, Any], float]]) -> float:
+        total_score = sum(r[1] for r in recs)
+        total_meta = sum(r[5] for r in recs)
+        return total_score + 0.4 * total_meta  # 0.4 is tunable
+    # Pick best document
+    best_doc = None
+    best_doc_prior = -1.0
+    for fn, recs in doc_groups.items():
+        p = doc_prior(recs)
+        if p > best_doc_prior:
+            best_doc_prior = p
+            best_doc = fn
+    # Reorder: take items from best_doc first (sorted by score), then others
+    best_recs = sorted(doc_groups.get(best_doc, []), key=lambda x: x[1], reverse=True)
+    other_recs = []
+    for fn, recs in doc_groups.items():
+        if fn == best_doc:
+            continue
+        other_recs.extend(recs)
+    other_recs.sort(key=lambda x: x[1], reverse=True)
+    reordered = best_recs + other_recs
+    top = reordered[:top_k]
     documents = [t[3] for t in top]
     metadatas = [t[4] for t in top]
+    distances = [t[2] for t in top]
+    ids = [t[0] for t in top]
     combined_scores = [t[1] for t in top]
     return {
         "distances": distances,
         "ids": ids,
         "combined_scores": combined_scores,
+        "best_doc": best_doc,              # helpful for debugging
+        "best_doc_prior": best_doc_prior,  # helpful for debugging
     }