Spaces:

ChatbotNova
/

Chatbot-Backend

Sleeping

App Files Community

srilakshu012456 committed on Dec 24, 2025

Commit

073fd3d

verified ·

1 Parent(s): 19b804e

Update services/kb_creation.py

Browse files

Files changed (1) hide show

services/kb_creation.py +26 -16

services/kb_creation.py CHANGED Viewed

@@ -92,7 +92,6 @@ PERMISSION_TERMS = [
 ERROR_TERMS = ["error", "issue", "fail", "failure", "not working", "cannot", "can't"]
 STEP_VERBS = ["navigate", "select", "scan", "verify", "confirm", "print", "move", "complete", "click", "open", "choose", "enter", "update", "save", "delete", "create", "attach", "assign"]
-# Expanded module vocabulary: split Receiving vs Appointments (generic, non-hardcoded)
 MODULE_VOCAB = {
     "receiving": [
         "receive", "receiving", "inbound receiving", "inbound", "goods receipt", "grn",
@@ -119,7 +118,6 @@ def _infer_intent_tag(section_title: str) -> str:
         return "prereqs"
     if any(k in st for k in ["purpose", "overview", "introduction"]):
         return "purpose"
-    # Heading hints (generic)
     if any(k in st for k in ["inbound receiving", "receiving", "goods receipt", "grn"]):
         return "steps"
     if any(k in st for k in ["appointment", "appointments", "schedule", "scheduling"]):
@@ -327,11 +325,10 @@ def bm25_search(query: str, top_k: int = 50) -> List[Tuple[int, float]]:
 # ---------------------------- Semantic-only ----------------------------
 def search_knowledge_base(query: str, top_k: int = 10) -> dict:
     query_embedding = model.encode(query).tolist()
-    # Request supported fields only; synthesize ids later
     res = collection.query(
         query_embeddings=[query_embedding],
         n_results=top_k,
-        include=['documents', 'metadatas', 'distances']
     )
     documents = (res.get("documents", [[]]) or [[]])[0]
     metadatas = (res.get("metadatas", [[]]) or [[]])[0]
@@ -356,7 +353,7 @@ def search_knowledge_base(query: str, top_k: int = 10) -> dict:
         "ids": ids,
     }
-# ---------------------------- Hybrid search (intent + module + action + phrases) ----------------------------
 ACTION_SYNONYMS = {
     "create": ["create", "creation", "add", "new", "generate"],
     "update": ["update", "modify", "change", "edit"],
@@ -388,14 +385,23 @@ def _extract_actions(query: str) -> List[str]:
     for act, syns in ACTION_SYNONYMS.items():
         if any(s in q for s in syns):
             found.append(act)
-    # receiving verbs hint (generic)
     if any(w in q for w in ["receive", "receiving", "grn", "goods receipt", "inbound"]):
         found.append("navigate")
-    found = list(sorted(set(found)))
-    return found or []
 def _action_weight(text: str, actions: List[str]) -> float:
-    """Score based on presence of action synonyms in the text."""
     if not actions:
         return 0.0
     t = (text or "").lower()
@@ -419,7 +425,7 @@ def _module_weight(meta: Dict[str, Any], user_modules: List[str]) -> float:
     doc_modules = [m.strip() for m in raw.split(",") if m.strip()] if isinstance(raw, str) else (raw or [])
     overlap = len(set(user_modules) & set(doc_modules))
     if overlap == 0:
-        return -0.8  # stronger penalty to avoid wrong SOP
     return 0.7 * overlap
 def _intent_weight(meta: dict, user_intent: str) -> float:
@@ -462,7 +468,6 @@ def _make_ngrams(tokens: List[str], n: int) -> List[str]:
     return [" ".join(tokens[i:i+n]) for i in range(len(tokens) - n + 1)]
 def _phrase_boost_score(text: str, q_terms: List[str]) -> float:
-    """Phrase-level scoring: boosts exact bigram/trigram matches."""
     if not text or not q_terms:
         return 0.0
     low = (text or "").lower()
@@ -481,8 +486,16 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
     norm_query = _normalize_query(query)
     q_terms = _tokenize(norm_query)
     user_intent = _detect_user_intent(query)
-    actions = _extract_actions(query)
-    user_modules = _extract_modules_from_query(query)
     sem_res = search_knowledge_base(norm_query, top_k=max(top_k, 30))
     sem_docs = sem_res.get("documents", [])
@@ -510,10 +523,8 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
         bm25_id_to_text[d["id"]] = d["text"]
         bm25_id_to_meta[d["id"]] = d["meta"]
-    # Union of IDs from semantic and BM25
     union_ids = set(sem_ids) | set(bm25_id_to_norm.keys())
-    # Weights
     gamma = 0.30   # meta overlap
     delta = 0.50   # intent boost
     epsilon = 0.30 # action weight
@@ -545,7 +556,6 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
         mod_wt = _module_weight(meta, user_modules)
         phrase_wt = _phrase_boost_score(text, q_terms)
-        # Heading alignment bonus / demotion
         sec_low = ((meta or {}).get("section", "") or "").lower()
         title_low = ((meta or {}).get("title", "") or "").lower()
         heading_bonus = 0.0

 ERROR_TERMS = ["error", "issue", "fail", "failure", "not working", "cannot", "can't"]
 STEP_VERBS = ["navigate", "select", "scan", "verify", "confirm", "print", "move", "complete", "click", "open", "choose", "enter", "update", "save", "delete", "create", "attach", "assign"]
 MODULE_VOCAB = {
     "receiving": [
         "receive", "receiving", "inbound receiving", "inbound", "goods receipt", "grn",
         return "prereqs"
     if any(k in st for k in ["purpose", "overview", "introduction"]):
         return "purpose"
     if any(k in st for k in ["inbound receiving", "receiving", "goods receipt", "grn"]):
         return "steps"
     if any(k in st for k in ["appointment", "appointments", "schedule", "scheduling"]):
 # ---------------------------- Semantic-only ----------------------------
 def search_knowledge_base(query: str, top_k: int = 10) -> dict:
     query_embedding = model.encode(query).tolist()
     res = collection.query(
         query_embeddings=[query_embedding],
         n_results=top_k,
+        include=['documents', 'metadatas', 'distances']  # no 'ids'
     )
     documents = (res.get("documents", [[]]) or [[]])[0]
     metadatas = (res.get("metadatas", [[]]) or [[]])[0]
         "ids": ids,
     }
+# ---------------------------- Hybrid search (robust) ----------------------------
 ACTION_SYNONYMS = {
     "create": ["create", "creation", "add", "new", "generate"],
     "update": ["update", "modify", "change", "edit"],
     for act, syns in ACTION_SYNONYMS.items():
         if any(s in q for s in syns):
             found.append(act)
     if any(w in q for w in ["receive", "receiving", "grn", "goods receipt", "inbound"]):
         found.append("navigate")
+    return list(sorted(set(found))) or []
+def _extract_modules_from_query(query: str) -> List[str]:
+    q = (query or "").lower()
+    found = []
+    for mod, syns in MODULE_VOCAB.items():
+        if any(s in q for s in syns):
+            found.append(mod)
+    if not found and any(w in q for w in ["receive", "receiving", "grn", "goods receipt", "inbound"]):
+        found = ["receiving"]
+    if "receiving" in found and "appointments" in found:
+        return ["receiving"]
+    return list(sorted(set(found)))
 def _action_weight(text: str, actions: List[str]) -> float:
     if not actions:
         return 0.0
     t = (text or "").lower()
     doc_modules = [m.strip() for m in raw.split(",") if m.strip()] if isinstance(raw, str) else (raw or [])
     overlap = len(set(user_modules) & set(doc_modules))
     if overlap == 0:
+        return -0.8
     return 0.7 * overlap
 def _intent_weight(meta: dict, user_intent: str) -> float:
     return [" ".join(tokens[i:i+n]) for i in range(len(tokens) - n + 1)]
 def _phrase_boost_score(text: str, q_terms: List[str]) -> float:
     if not text or not q_terms:
         return 0.0
     low = (text or "").lower()
     norm_query = _normalize_query(query)
     q_terms = _tokenize(norm_query)
     user_intent = _detect_user_intent(query)
+    # Robust guards so missing helpers can’t crash
+    try:
+        actions = _extract_actions(query)
+    except Exception:
+        actions = []
+    try:
+        user_modules = _extract_modules_from_query(query)
+    except Exception:
+        user_modules = []
     sem_res = search_knowledge_base(norm_query, top_k=max(top_k, 30))
     sem_docs = sem_res.get("documents", [])
         bm25_id_to_text[d["id"]] = d["text"]
         bm25_id_to_meta[d["id"]] = d["meta"]
     union_ids = set(sem_ids) | set(bm25_id_to_norm.keys())
     gamma = 0.30   # meta overlap
     delta = 0.50   # intent boost
     epsilon = 0.30 # action weight
         mod_wt = _module_weight(meta, user_modules)
         phrase_wt = _phrase_boost_score(text, q_terms)
         sec_low = ((meta or {}).get("section", "") or "").lower()
         title_low = ((meta or {}).get("title", "") or "").lower()
         heading_bonus = 0.0