Spaces:

ChatbotNova
/

Chatbot-Backend

Sleeping

App Files Files Community

srilakshu012456 committed on Dec 23, 2025

Commit

60c3916

verified ·

1 Parent(s): f89383d

Update services/kb_creation.py

Browse files

Files changed (1) hide show

services/kb_creation.py +73 -22

services/kb_creation.py CHANGED Viewed

@@ -6,7 +6,7 @@ from typing import List, Dict, Any, Tuple, Optional
 from docx import Document
 from sentence_transformers import SentenceTransformer
 import chromadb
-#updated
 # --------------------------- ChromaDB setup ---------------------------
 CHROMA_PATH = os.path.join(os.getcwd(), "chroma_db")
 client = chromadb.PersistentClient(path=CHROMA_PATH)
@@ -112,6 +112,22 @@ def _chunk_text_with_context(doc_title: str, section_title: str, paragraphs: Lis
         chunks = [body]
     return chunks
 # --------------------------- Ingestion ---------------------------
 def ingest_documents(folder_path: str) -> None:
     """
@@ -143,11 +159,19 @@ def ingest_documents(folder_path: str) -> None:
         for s_idx, (section_title, paras) in enumerate(sections):
             chunks = _chunk_text_with_context(doc_title, section_title, paras, max_words=900)
             total_chunks += len(chunks)
             for c_idx, chunk in enumerate(chunks):
                 # Embedding & Chroma
                 embedding = model.encode(chunk).tolist()
                 doc_id = f"{file}:{s_idx}:{c_idx}"  # stable unique id
-                meta = {"filename": file, "section": section_title, "chunk_index": c_idx, "title": doc_title, "collection": "SOP"}
                 try:
                     collection.add(
                         ids=[doc_id],
@@ -220,7 +244,6 @@ def _load_bm25_index() -> None:
         bm25_inverted = payload.get("bm25_inverted", {})
         bm25_df = payload.get("bm25_df", {})
         bm25_avgdl = payload.get("bm25_avgdl", 0.0)
-        # params retained but we keep module-level constants
         bm25_ready = len(bm25_docs) > 0
         if bm25_ready:
             print(f"✅ BM25 index loaded: {BM25_INDEX_FILE} (docs={len(bm25_docs)})")
@@ -249,7 +272,7 @@ def _bm25_score_for_doc(query_terms: List[str], doc_idx: int) -> float:
             continue
         # BM25 idf
         N = len(bm25_docs)
-        idf_ratio = ( (N - df + 0.5) / (df + 0.5) )
         try:
             import math
             idf = math.log(idf_ratio + 1.0)
@@ -334,7 +357,7 @@ def search_knowledge_base(query: str, top_k: int = 10) -> dict:
 # --------------------------- Hybrid (BM25 + Embeddings) ---------------------------
 def _meta_overlap(meta: Dict[str, Any], q_terms: List[str]) -> float:
     """
-    Automatic metadata overlap score (no manual module list).
     Uses filename, title, and section tokens. Range ~0..1.
     """
     if not meta:
@@ -349,21 +372,45 @@ def _meta_overlap(meta: Dict[str, Any], q_terms: List[str]) -> float:
     inter = len(meta_tokens & qset)
     return inter / max(1, len(qset))
 def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6, beta: float = 0.4) -> dict:
     """
     Hybrid retrieval:
       - Semantic (Chroma/embeddings) → distances (lower = better) → convert to similarity
       - BM25 keyword → score (higher = better)
       - Re-rank union of candidates by:
-          final = alpha * semantic_sim + beta * bm25_norm + gamma * meta_overlap
       - Document-level voting prior: aggregate scores by 'filename' and prefer the best document first.
     Returns a dict compatible with the extractor and includes:
       - 'ids': list[str]
-      - 'combined_scores': list[float] (0..1ish)
     """
     # 1) Normalize query (language-agnostic)
     norm_query = _normalize_query(query)
     q_terms = _tokenize(norm_query)
     # 2) Semantic candidates (Chroma)
     sem_res = search_knowledge_base(norm_query, top_k=max(top_k, 30))
@@ -372,12 +419,12 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
     sem_dists = sem_res.get("distances", [])
     sem_ids = sem_res.get("ids", [])
-    # Convert distances to 0..1 similarity (simple monotonic mapping)
     def dist_to_sim(d: Optional[float]) -> float:
         if d is None:
             return 0.0
         try:
-            return 1.0 / (1.0 + float(d))  # lower distance -> higher sim
         except Exception:
             return 0.0
@@ -385,11 +432,10 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
     # 3) BM25 candidates
     bm25_hits = bm25_search(norm_query, top_k=max(50, top_k * 5))
-    # normalize BM25 scores to 0..1
     bm25_max = max([s for _, s in bm25_hits], default=1.0)
     bm25_norm_pairs = [(idx, (score / bm25_max) if bm25_max > 0 else 0.0) for idx, score in bm25_hits]
-    # 4) Merge candidates by doc_id
     bm25_id_to_norm: Dict[str, float] = {}
     bm25_id_to_text: Dict[str, str] = {}
     bm25_id_to_meta: Dict[str, Dict[str, Any]] = {}
@@ -400,11 +446,13 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
         bm25_id_to_text[d["id"]] = d["text"]
         bm25_id_to_meta[d["id"]] = d["meta"]
     union_ids = set(sem_ids) | set(bm25_id_to_norm.keys())
-    gamma = 0.25  # metadata boost weight (tunable)
-    combined_records_ext: List[Tuple[str, float, float, str, Dict[str, Any], float]] = []  # include meta_overlap
     for cid in union_ids:
         # semantic part
@@ -426,30 +474,32 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
         text = sem_text if sem_text else bm25_text
         meta = sem_meta if sem_meta else bm25_meta
-        # NEW: automatic metadata overlap (no manual lists)
         m_overlap = _meta_overlap(meta, q_terms)
         # final combined score
-        final_score = alpha * sem_sim + beta * bm25_sim + gamma * m_overlap
         combined_records_ext.append(
-            (cid, final_score, (sem_dist if sem_dist is not None else 999.0), text, meta, m_overlap)
         )
     # ---------------- Document-level voting prior ----------------
-    # Group by filename and compute aggregate doc score → prefer best doc first
     from collections import defaultdict
-    doc_groups: Dict[str, List[Tuple[str, float, float, str, Dict[str, Any], float]]] = defaultdict(list)
     for rec in combined_records_ext:
         meta = rec[4] or {}
         fn = meta.get("filename", "unknown")
         doc_groups[fn].append(rec)
-    # Compute doc_prior = sum(final_score) + small bonus for metadata overlap sum
-    def doc_prior(recs: List[Tuple[str, float, float, str, Dict[str, Any], float]]) -> float:
         total_score = sum(r[1] for r in recs)
-        total_meta = sum(r[5] for r in recs)
-        return total_score + 0.4 * total_meta  # 0.4 is tunable
     # Pick best document
     best_doc = None
@@ -486,4 +536,5 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
         "combined_scores": combined_scores,
         "best_doc": best_doc,              # helpful for debugging
         "best_doc_prior": best_doc_prior,  # helpful for debugging
     }

 from docx import Document
 from sentence_transformers import SentenceTransformer
 import chromadb
 # --------------------------- ChromaDB setup ---------------------------
 CHROMA_PATH = os.path.join(os.getcwd(), "chroma_db")
 client = chromadb.PersistentClient(path=CHROMA_PATH)
         chunks = [body]
     return chunks
+# --------------------------- Intent tagging (auto) ---------------------------
+def _infer_intent_tag(section_title: str) -> str:
+    """
+    Infer coarse intent from section title—no manual curation.
+    """
+    st = (section_title or "").lower()
+    if any(k in st for k in ["process steps", "procedure", "how to", "workflow", "instructions"]):
+        return "steps"
+    if any(k in st for k in ["common errors", "resolution", "troubleshooting"]):
+        return "errors"
+    if any(k in st for k in ["pre-requisites", "prerequisites"]):
+        return "prereqs"
+    if any(k in st for k in ["purpose", "overview", "introduction"]):
+        return "purpose"
+    return "neutral"
 # --------------------------- Ingestion ---------------------------
 def ingest_documents(folder_path: str) -> None:
     """
         for s_idx, (section_title, paras) in enumerate(sections):
             chunks = _chunk_text_with_context(doc_title, section_title, paras, max_words=900)
             total_chunks += len(chunks)
+            intent_tag = _infer_intent_tag(section_title)
             for c_idx, chunk in enumerate(chunks):
                 # Embedding & Chroma
                 embedding = model.encode(chunk).tolist()
                 doc_id = f"{file}:{s_idx}:{c_idx}"  # stable unique id
+                meta = {
+                    "filename": file,
+                    "section": section_title,
+                    "chunk_index": c_idx,
+                    "title": doc_title,
+                    "collection": "SOP",
+                    "intent_tag": intent_tag,  # NEW
+                }
                 try:
                     collection.add(
                         ids=[doc_id],
         bm25_inverted = payload.get("bm25_inverted", {})
         bm25_df = payload.get("bm25_df", {})
         bm25_avgdl = payload.get("bm25_avgdl", 0.0)
         bm25_ready = len(bm25_docs) > 0
         if bm25_ready:
             print(f"✅ BM25 index loaded: {BM25_INDEX_FILE} (docs={len(bm25_docs)})")
             continue
         # BM25 idf
         N = len(bm25_docs)
+        idf_ratio = ((N - df + 0.5) / (df + 0.5))
         try:
             import math
             idf = math.log(idf_ratio + 1.0)
 # --------------------------- Hybrid (BM25 + Embeddings) ---------------------------
 def _meta_overlap(meta: Dict[str, Any], q_terms: List[str]) -> float:
     """
+    Automatic metadata overlap score (no manual per-SOP lists).
     Uses filename, title, and section tokens. Range ~0..1.
     """
     if not meta:
     inter = len(meta_tokens & qset)
     return inter / max(1, len(qset))
+def _detect_user_intent(query: str) -> str:
+    q = (query or "").lower()
+    if any(k in q for k in ["steps", "procedure", "how to", "navigate", "perform", "do", "process"]):
+        return "steps"
+    if any(k in q for k in ["error", "issue", "fail", "not working", "resolution", "fix"]):
+        return "errors"
+    if any(k in q for k in ["pre-requisite", "prerequisites", "requirement", "requirements"]):
+        return "prereqs"
+    if any(k in q for k in ["purpose", "overview", "introduction"]):
+        return "purpose"
+    return "neutral"
+def _intent_weight(meta: dict, user_intent: str) -> float:
+    tag = (meta or {}).get("intent_tag", "neutral")
+    if user_intent == "neutral":
+        return 0.0
+    if tag == user_intent:
+        return 1.0     # strong boost when intent matches
+    if tag in ["purpose", "prereqs"] and user_intent in ["steps", "errors"]:
+        return -0.6     # penalize overview/prereqs for steps/errors queries
+    return -0.2         # small penalty for other mismatches
 def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6, beta: float = 0.4) -> dict:
     """
     Hybrid retrieval:
       - Semantic (Chroma/embeddings) → distances (lower = better) → convert to similarity
       - BM25 keyword → score (higher = better)
       - Re-rank union of candidates by:
+          final = alpha * semantic_sim + beta * bm25_norm + gamma * meta_overlap + delta * intent_boost
       - Document-level voting prior: aggregate scores by 'filename' and prefer the best document first.
     Returns a dict compatible with the extractor and includes:
       - 'ids': list[str]
+      - 'combined_scores': list[float]
+      - 'best_doc', 'best_doc_prior', 'user_intent'
     """
     # 1) Normalize query (language-agnostic)
     norm_query = _normalize_query(query)
     q_terms = _tokenize(norm_query)
+    user_intent = _detect_user_intent(query)
     # 2) Semantic candidates (Chroma)
     sem_res = search_knowledge_base(norm_query, top_k=max(top_k, 30))
     sem_dists = sem_res.get("distances", [])
     sem_ids = sem_res.get("ids", [])
+    # Convert distances to 0..1 similarity
     def dist_to_sim(d: Optional[float]) -> float:
         if d is None:
             return 0.0
         try:
+            return 1.0 / (1.0 + float(d))
         except Exception:
             return 0.0
     # 3) BM25 candidates
     bm25_hits = bm25_search(norm_query, top_k=max(50, top_k * 5))
     bm25_max = max([s for _, s in bm25_hits], default=1.0)
     bm25_norm_pairs = [(idx, (score / bm25_max) if bm25_max > 0 else 0.0) for idx, score in bm25_hits]
+    # 4) Prepare BM25 maps
     bm25_id_to_norm: Dict[str, float] = {}
     bm25_id_to_text: Dict[str, str] = {}
     bm25_id_to_meta: Dict[str, Dict[str, Any]] = {}
         bm25_id_to_text[d["id"]] = d["text"]
         bm25_id_to_meta[d["id"]] = d["meta"]
+    # 5) Union of candidates
     union_ids = set(sem_ids) | set(bm25_id_to_norm.keys())
+    gamma = 0.25  # metadata overlap weight
+    delta = 0.35  # intent-aware weight
+    combined_records_ext: List[Tuple[str, float, float, str, Dict[str, Any], float, float]] = []  # include overlap+intent
     for cid in union_ids:
         # semantic part
         text = sem_text if sem_text else bm25_text
         meta = sem_meta if sem_meta else bm25_meta
+        # NEW: automatic metadata overlap + intent-aware boost
         m_overlap = _meta_overlap(meta, q_terms)
+        intent_boost = _intent_weight(meta, user_intent)
         # final combined score
+        final_score = alpha * sem_sim + beta * bm25_sim + gamma * m_overlap + delta * intent_boost
         combined_records_ext.append(
+            (cid, final_score, (sem_dist if sem_dist is not None else 999.0), text, meta, m_overlap, intent_boost)
         )
     # ---------------- Document-level voting prior ----------------
     from collections import defaultdict
+    doc_groups: Dict[str, List[Tuple[str, float, float, str, Dict[str, Any], float, float]]] = defaultdict(list)
     for rec in combined_records_ext:
         meta = rec[4] or {}
         fn = meta.get("filename", "unknown")
         doc_groups[fn].append(rec)
+    # Compute doc_prior = sum(final_score) + bonuses for overlap+intent
+    def doc_prior(recs: List[Tuple[str, float, float, str, Dict[str, Any], float, float]]) -> float:
         total_score = sum(r[1] for r in recs)
+        total_overlap = sum(r[5] for r in recs)
+        total_intent = sum(max(0.0, r[6]) for r in recs)  # positive intent boosts
+        total_penalty = sum(min(0.0, r[6]) for r in recs)  # penalties
+        return total_score + 0.4 * total_overlap + 0.6 * total_intent + 0.3 * total_penalty
     # Pick best document
     best_doc = None
         "combined_scores": combined_scores,
         "best_doc": best_doc,              # helpful for debugging
         "best_doc_prior": best_doc_prior,  # helpful for debugging
+        "user_intent": user_intent,        # helpful for debugging
     }