srilakshu012456 committed on
Commit
08c1dac
·
verified ·
1 Parent(s): 5dd6a63

Update services/kb_creation.py

Browse files
Files changed (1) hide show
  1. services/kb_creation.py +75 -93
services/kb_creation.py CHANGED
@@ -2,6 +2,7 @@
2
  import os
3
  import re
4
  import pickle
 
5
  from typing import List, Dict, Any, Tuple, Optional
6
  from docx import Document
7
  from sentence_transformers import SentenceTransformer
@@ -46,6 +47,39 @@ def _normalize_query(q: str) -> str:
46
  def _tokenize_meta_value(val: Optional[str]) -> List[str]:
47
  return _tokenize(val or "")
48
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  # --------------------------- DOCX parsing & chunking ---------------------------
50
  def _split_by_sections(doc: Document) -> List[Tuple[str, List[str]]]:
51
  sections: List[Tuple[str, List[str]]] = []
@@ -71,6 +105,7 @@ def _split_by_sections(doc: Document) -> List[Tuple[str, List[str]]]:
71
  return sections
72
 
73
  def _chunk_text_with_context(doc_title: str, section_title: str, paragraphs: List[str], max_words: int = 900) -> List[str]:
 
74
  body = "\n".join(paragraphs).strip()
75
  if not body:
76
  return []
@@ -79,24 +114,11 @@ def _chunk_text_with_context(doc_title: str, section_title: str, paragraphs: Lis
79
  for i in range(0, len(words), max_words):
80
  chunk_body = ' '.join(words[i:i + max_words]).strip()
81
  if chunk_body:
82
- chunks.append(chunk_body) # no doc/section headers inside text
83
  if not chunks:
84
  chunks = [body]
85
  return chunks
86
 
87
- # --------------------------- Intent tagging (auto) ---------------------------
88
- def _infer_intent_tag(section_title: str) -> str:
89
- st = (section_title or "").lower()
90
- if any(k in st for k in ["process steps", "procedure", "how to", "workflow", "instructions"]):
91
- return "steps"
92
- if any(k in st for k in ["common errors", "resolution", "troubleshooting"]):
93
- return "errors"
94
- if any(k in st for k in ["pre-requisites", "prerequisites"]):
95
- return "prereqs"
96
- if any(k in st for k in ["purpose", "overview", "introduction"]):
97
- return "purpose"
98
- return "neutral"
99
-
100
  # --------------------------- Ingestion ---------------------------
101
  def ingest_documents(folder_path: str) -> None:
102
  print(f"📂 Checking folder: {folder_path}")
@@ -120,7 +142,16 @@ def ingest_documents(folder_path: str) -> None:
120
  for s_idx, (section_title, paras) in enumerate(sections):
121
  chunks = _chunk_text_with_context(doc_title, section_title, paras, max_words=900)
122
  total_chunks += len(chunks)
123
- intent_tag = _infer_intent_tag(section_title)
 
 
 
 
 
 
 
 
 
124
  for c_idx, chunk in enumerate(chunks):
125
  embedding = model.encode(chunk).tolist()
126
  doc_id = f"{file}:{s_idx}:{c_idx}"
@@ -130,7 +161,8 @@ def ingest_documents(folder_path: str) -> None:
130
  "chunk_index": c_idx,
131
  "title": doc_title,
132
  "collection": "SOP",
133
- "intent_tag": intent_tag, # NEW
 
134
  }
135
  try:
136
  collection.add(ids=[doc_id], embeddings=[embedding], documents=[chunk], metadatas=[meta])
@@ -141,6 +173,7 @@ def ingest_documents(folder_path: str) -> None:
141
  except Exception as e2:
142
  print(f"❌ Upsert failed for {doc_id}: {e2}")
143
 
 
144
  tokens = _tokenize(chunk)
145
  tf: Dict[str, int] = {}
146
  for t in tokens:
@@ -212,7 +245,6 @@ def _bm25_score_for_doc(query_terms: List[str], doc_idx: int) -> float:
212
  N = len(bm25_docs)
213
  idf_ratio = ((N - df + 0.5) / (df + 0.5))
214
  try:
215
- import math
216
  idf = math.log(idf_ratio + 1.0)
217
  except Exception:
218
  idf = 1.0
@@ -241,7 +273,7 @@ def bm25_search(query: str, top_k: int = 50) -> List[Tuple[int, float]]:
241
  scored.sort(key=lambda x: x[1], reverse=True)
242
  return scored[:top_k]
243
 
244
- # --------------------------- Semantic-only ---------------------------
245
  def search_knowledge_base(query: str, top_k: int = 10) -> dict:
246
  query_embedding = model.encode(query).tolist()
247
  res = collection.query(
@@ -277,45 +309,7 @@ def search_knowledge_base(query: str, top_k: int = 10) -> dict:
277
  "ids": ids,
278
  }
279
 
280
- # --------------------------- Hybrid (BM25 + Embeddings + Intent + Action) ---------------------------
281
- ACTION_SYNONYMS = {
282
- "create": ["create", "creation", "add", "new", "generate"],
283
- "update": ["update", "modify", "change", "edit"],
284
- "delete": ["delete", "remove"],
285
- "navigate": ["navigate", "go to", "open"],
286
- # NOTE: 'perform' REMOVED to avoid wrong boosts like Appointment "performed..."
287
- }
288
-
289
- def _detect_user_intent(query: str) -> str:
290
- q = (query or "").lower()
291
- if any(k in q for k in ["steps", "procedure", "how to", "navigate", "perform", "do", "process"]):
292
- return "steps"
293
- if any(k in q for k in ["error", "issue", "fail", "not working", "resolution", "fix"]):
294
- return "errors"
295
- if any(k in q for k in ["pre-requisite", "prerequisites", "requirement", "requirements"]):
296
- return "prereqs"
297
- if any(k in q for k in ["purpose", "overview", "introduction"]):
298
- return "purpose"
299
- return "neutral"
300
-
301
- def _extract_actions(query: str) -> List[str]:
302
- q = (query or "").lower()
303
- found = []
304
- for act, syns in ACTION_SYNONYMS.items():
305
- if any(s in q for s in syns):
306
- found.append(act)
307
- return found or []
308
-
309
- def _intent_weight(meta: dict, user_intent: str) -> float:
310
- tag = (meta or {}).get("intent_tag", "neutral")
311
- if user_intent == "neutral":
312
- return 0.0
313
- if tag == user_intent:
314
- return 1.0
315
- if tag in ["purpose", "prereqs"] and user_intent in ["steps", "errors"]:
316
- return -0.6
317
- return -0.2
318
-
319
  def _meta_overlap(meta: Dict[str, Any], q_terms: List[str]) -> float:
320
  fn_tokens = _tokenize_meta_value(meta.get("filename"))
321
  title_tokens = _tokenize_meta_value(meta.get("title"))
@@ -327,28 +321,10 @@ def _meta_overlap(meta: Dict[str, Any], q_terms: List[str]) -> float:
327
  inter = len(meta_tokens & qset)
328
  return inter / max(1, len(qset))
329
 
330
- def _action_weight(text: str, actions: List[str]) -> float:
331
- if not actions:
332
- return 0.0
333
- t = (text or "").lower()
334
- score = 0.0
335
- for act in actions:
336
- for syn in ACTION_SYNONYMS.get(act, [act]):
337
- if syn in t:
338
- score += 1.0
339
- conflicts = {"create": ["delete"], "delete": ["create"], "update": ["delete"], "navigate": []}
340
- for act in actions:
341
- for bad in conflicts.get(act, []):
342
- for syn in ACTION_SYNONYMS.get(bad, [bad]):
343
- if syn in t:
344
- score -= 0.8
345
- return score
346
-
347
  def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6, beta: float = 0.4) -> dict:
348
  norm_query = _normalize_query(query)
349
  q_terms = _tokenize(norm_query)
350
- user_intent = _detect_user_intent(query)
351
- actions = _extract_actions(query)
352
 
353
  sem_res = search_knowledge_base(norm_query, top_k=max(top_k, 30))
354
  sem_docs = sem_res.get("documents", [])
@@ -379,11 +355,9 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
379
 
380
  union_ids = set(sem_ids) | set(bm25_id_to_norm.keys())
381
 
382
- gamma = 0.25 # meta overlap
383
- delta = 0.35 # intent boost
384
- epsilon = 0.30 # action weight
385
 
386
- combined_records_ext: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float]] = []
387
  for cid in union_ids:
388
  if cid in sem_ids:
389
  pos = sem_ids.index(cid)
@@ -402,29 +376,37 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
402
  meta = sem_meta if sem_meta else bm25_meta
403
 
404
  m_overlap = _meta_overlap(meta, q_terms)
405
- intent_boost = _intent_weight(meta, user_intent)
406
- act_wt = _action_weight(text, actions)
 
 
 
 
 
 
 
 
407
 
408
- final_score = alpha * sem_sim + beta * bm25_sim + gamma * m_overlap + delta * intent_boost + epsilon * act_wt
409
 
410
  combined_records_ext.append(
411
- (cid, final_score, (sem_dist if sem_dist is not None else 999.0), text, meta, m_overlap, intent_boost, act_wt)
412
  )
413
 
 
414
  from collections import defaultdict
415
- doc_groups: Dict[str, List[Tuple[str, float, float, str, Dict[str, Any], float, float, float]]] = defaultdict(list)
416
  for rec in combined_records_ext:
417
  meta = rec[4] or {}
418
  fn = meta.get("filename", "unknown")
419
  doc_groups[fn].append(rec)
420
 
421
- def doc_prior(recs: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float]]) -> float:
422
  total_score = sum(r[1] for r in recs)
423
  total_overlap = sum(r[5] for r in recs)
424
- total_intent = sum(max(0.0, r[6]) for r in recs)
425
- total_action = sum(max(0.0, r[7]) for r in recs)
426
- total_penalty = sum(min(0.0, r[6]) for r in recs) + sum(min(0.0, r[7]) for r in recs)
427
- return total_score + 0.4 * total_overlap + 0.6 * total_intent + 0.5 * total_action + 0.3 * total_penalty
428
 
429
  best_doc, best_doc_prior = None, -1.0
430
  for fn, recs in doc_groups.items():
@@ -458,10 +440,10 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
458
  "best_doc": best_doc,
459
  "best_doc_prior": best_doc_prior,
460
  "user_intent": user_intent,
461
- "actions": actions,
462
  }
463
 
464
- # --------------------------- Section fetch helpers (for full output) ---------------------------
465
  def get_section_text(filename: str, section: str) -> str:
466
  """Concatenate all chunk texts for a given filename+section."""
467
  texts: List[str] = []
@@ -484,7 +466,7 @@ def get_best_steps_section_text(filename: str) -> str:
484
  texts.append(t)
485
  return "\n\n".join(texts).strip()
486
 
487
- # --- Admin helpers (optional; unchanged) ---
488
  def get_kb_runtime_info() -> Dict[str, Any]:
489
  return {
490
  "chroma_path": CHROMA_PATH,
 
2
  import os
3
  import re
4
  import pickle
5
+ import math
6
  from typing import List, Dict, Any, Tuple, Optional
7
  from docx import Document
8
  from sentence_transformers import SentenceTransformer
 
47
def _tokenize_meta_value(val: Optional[str]) -> List[str]:
    """Tokenize a metadata field value, treating None as the empty string."""
    return _tokenize(val or "")
49
 
50
# --------------------------- Semantic intent prototypes ---------------------------
# One natural-language prototype sentence per intent label.  Queries and SOP
# sections are tagged by nearest-prototype cosine similarity, so supporting a
# new intent only requires adding an entry here (no keyword lists to maintain).
INTENT_PROTOTYPES: Dict[str, str] = {
    "steps": "Step-by-step procedure with actions the user must perform",
    "navigation": "Menu paths and locations in WMS, for example Navigate to Inbound > Receiving",
    "errors": "Common errors and resolution tips or troubleshooting guidance",
    "prereqs": "Pre-requisites, authorization, requirements before executing steps",
    "purpose": "Purpose, overview, introduction that explains why something is done",
    "escalation": "Escalation path or who to contact if the issue cannot be resolved",
    "permission": "User lacks authorization or access denied and needs role access check",
}

# Precompute prototype embeddings once at import time so per-query intent
# detection only pays for embedding the query itself.
PROTO_EMBS: Dict[str, List[float]] = {label: model.encode(text).tolist() for label, text in INTENT_PROTOTYPES.items()}
63
+
64
def _embed(txt: str) -> List[float]:
    """Encode text with the shared SentenceTransformer model; None is treated as ''."""
    return model.encode((txt or "").strip()).tolist()
66
+
67
+ def _cos_sim(a: List[float], b: List[float]) -> float:
68
+ # pure-python cosine similarity
69
+ dot = sum(x * y for x, y in zip(a, b))
70
+ na = math.sqrt(sum(x * x for x in a)) + 1e-9
71
+ nb = math.sqrt(sum(y * y for y in b)) + 1e-9
72
+ return float(dot / (na * nb))
73
+
74
def detect_user_intent(query: str, min_confidence: float = 0.35) -> Tuple[str, float]:
    """Classify the query's intent by nearest prototype embedding.

    Embeds the query and compares it against every precomputed prototype in
    PROTO_EMBS via cosine similarity.

    Bug fix: the previous version could only return "neutral" when every
    similarity was <= 0.0, which real sentence embeddings essentially never
    produce -- so the "neutral" branch that hybrid search relies on was
    effectively dead.  A ``min_confidence`` threshold (default 0.35, a
    reasonable floor for unrelated sentence-transformer pairs -- tune against
    real queries) now maps low-confidence matches back to "neutral".

    Returns:
        (intent_label, confidence) where confidence is the best cosine
        similarity found (approximately 0..1), reported even when the label
        falls back to "neutral".
    """
    q_vec = _embed(query or "")
    best_label, best_score = "neutral", 0.0
    for label, proto_vec in PROTO_EMBS.items():
        score = _cos_sim(q_vec, proto_vec)
        if score > best_score:
            best_label, best_score = label, score
    if best_score < min_confidence:
        return "neutral", best_score
    return best_label, best_score
82
+
83
  # --------------------------- DOCX parsing & chunking ---------------------------
84
  def _split_by_sections(doc: Document) -> List[Tuple[str, List[str]]]:
85
  sections: List[Tuple[str, List[str]]] = []
 
105
  return sections
106
 
107
  def _chunk_text_with_context(doc_title: str, section_title: str, paragraphs: List[str], max_words: int = 900) -> List[str]:
108
+ # Store only body text (no titles/headers in chunk) so users never see SOP headers
109
  body = "\n".join(paragraphs).strip()
110
  if not body:
111
  return []
 
114
  for i in range(0, len(words), max_words):
115
  chunk_body = ' '.join(words[i:i + max_words]).strip()
116
  if chunk_body:
117
+ chunks.append(chunk_body)
118
  if not chunks:
119
  chunks = [body]
120
  return chunks
121
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122
  # --------------------------- Ingestion ---------------------------
123
  def ingest_documents(folder_path: str) -> None:
124
  print(f"📂 Checking folder: {folder_path}")
 
142
  for s_idx, (section_title, paras) in enumerate(sections):
143
  chunks = _chunk_text_with_context(doc_title, section_title, paras, max_words=900)
144
  total_chunks += len(chunks)
145
+
146
+ # --- Semantic section intent tagging (no keywords to maintain) ---
147
+ section_text_for_tag = (section_title or "") + "\n" + ("\n".join(paras[:6]) if paras else "")
148
+ sec_vec = _embed(section_text_for_tag)
149
+ best_intent, best_score = "neutral", 0.0
150
+ for label, proto_vec in PROTO_EMBS.items():
151
+ s = _cos_sim(sec_vec, proto_vec)
152
+ if s > best_score:
153
+ best_intent, best_score = label, s
154
+
155
  for c_idx, chunk in enumerate(chunks):
156
  embedding = model.encode(chunk).tolist()
157
  doc_id = f"{file}:{s_idx}:{c_idx}"
 
161
  "chunk_index": c_idx,
162
  "title": doc_title,
163
  "collection": "SOP",
164
+ "intent_tag": best_intent,
165
+ "intent_score": best_score,
166
  }
167
  try:
168
  collection.add(ids=[doc_id], embeddings=[embedding], documents=[chunk], metadatas=[meta])
 
173
  except Exception as e2:
174
  print(f"❌ Upsert failed for {doc_id}: {e2}")
175
 
176
+ # BM25 indexing
177
  tokens = _tokenize(chunk)
178
  tf: Dict[str, int] = {}
179
  for t in tokens:
 
245
  N = len(bm25_docs)
246
  idf_ratio = ((N - df + 0.5) / (df + 0.5))
247
  try:
 
248
  idf = math.log(idf_ratio + 1.0)
249
  except Exception:
250
  idf = 1.0
 
273
  scored.sort(key=lambda x: x[1], reverse=True)
274
  return scored[:top_k]
275
 
276
+ # --------------------------- Semantic-only (Chroma) ---------------------------
277
  def search_knowledge_base(query: str, top_k: int = 10) -> dict:
278
  query_embedding = model.encode(query).tolist()
279
  res = collection.query(
 
309
  "ids": ids,
310
  }
311
 
312
+ # --------------------------- Hybrid (BM25 + Embeddings + Semantic Intent) ---------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
313
  def _meta_overlap(meta: Dict[str, Any], q_terms: List[str]) -> float:
314
  fn_tokens = _tokenize_meta_value(meta.get("filename"))
315
  title_tokens = _tokenize_meta_value(meta.get("title"))
 
321
  inter = len(meta_tokens & qset)
322
  return inter / max(1, len(qset))
323
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
324
  def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6, beta: float = 0.4) -> dict:
325
  norm_query = _normalize_query(query)
326
  q_terms = _tokenize(norm_query)
327
+ user_intent, intent_conf = detect_user_intent(query) # semantic
 
328
 
329
  sem_res = search_knowledge_base(norm_query, top_k=max(top_k, 30))
330
  sem_docs = sem_res.get("documents", [])
 
355
 
356
  union_ids = set(sem_ids) | set(bm25_id_to_norm.keys())
357
 
358
+ gamma = 0.25 # metadata overlap weight
359
+ combined_records_ext: List[Tuple[str, float, float, str, Dict[str, Any], float, float]] = [] # id, score, dist, text, meta, overlap, intentBoost
 
360
 
 
361
  for cid in union_ids:
362
  if cid in sem_ids:
363
  pos = sem_ids.index(cid)
 
376
  meta = sem_meta if sem_meta else bm25_meta
377
 
378
  m_overlap = _meta_overlap(meta, q_terms)
379
+ tag = (meta or {}).get("intent_tag", "neutral")
380
+ tag_conf = float((meta or {}).get("intent_score", 0.0))
381
+
382
+ # Semantic intent boost (no keyword list)
383
+ intent_boost = 0.0
384
+ if user_intent != "neutral":
385
+ if tag == user_intent:
386
+ intent_boost = 0.7 * (0.5 + 0.5 * tag_conf) # stronger if section is confidently tagged
387
+ elif tag_conf > 0.4:
388
+ intent_boost = -0.3 * tag_conf # soft penalty if clearly different and confident
389
 
390
+ final_score = alpha * sem_sim + beta * bm25_sim + gamma * m_overlap + intent_boost
391
 
392
  combined_records_ext.append(
393
+ (cid, final_score, (sem_dist if sem_dist is not None else 999.0), text, meta, m_overlap, intent_boost)
394
  )
395
 
396
+ # ---------------- Document-level voting prior ----------------
397
  from collections import defaultdict
398
+ doc_groups: Dict[str, List[Tuple[str, float, float, str, Dict[str, Any], float, float]]] = defaultdict(list)
399
  for rec in combined_records_ext:
400
  meta = rec[4] or {}
401
  fn = meta.get("filename", "unknown")
402
  doc_groups[fn].append(rec)
403
 
404
def doc_prior(recs: List[Tuple[str, float, float, str, Dict[str, Any], float, float]]) -> float:
    """Aggregate a document's chunk records into a single voting prior.

    Record layout: (id, score, dist, text, meta, overlap, intent_boost).
    Positive intent boosts and negative penalties are weighted separately.
    """
    total_score = 0.0
    total_overlap = 0.0
    pos_intent = 0.0
    neg_intent = 0.0
    for rec in recs:
        total_score += rec[1]
        total_overlap += rec[5]
        boost = rec[6]
        if boost > 0.0:
            pos_intent += boost
        else:
            neg_intent += boost
    return total_score + 0.4 * total_overlap + 0.6 * pos_intent + 0.3 * neg_intent
 
410
 
411
  best_doc, best_doc_prior = None, -1.0
412
  for fn, recs in doc_groups.items():
 
440
  "best_doc": best_doc,
441
  "best_doc_prior": best_doc_prior,
442
  "user_intent": user_intent,
443
+ "user_intent_conf": intent_conf,
444
  }
445
 
446
+ # --------------------------- Section fetch helpers ---------------------------
447
  def get_section_text(filename: str, section: str) -> str:
448
  """Concatenate all chunk texts for a given filename+section."""
449
  texts: List[str] = []
 
466
  texts.append(t)
467
  return "\n\n".join(texts).strip()
468
 
469
+ # --------------------------- Admin helpers ---------------------------
470
  def get_kb_runtime_info() -> Dict[str, Any]:
471
  return {
472
  "chroma_path": CHROMA_PATH,