Spaces:

ChatbotNova
/

Chatbot-Backend

Sleeping

App Files Community

srilakshu012456 committed on Dec 24, 2025

Commit

32e50bb

verified ·

1 Parent(s): afd54bd

Update services/kb_creation.py

Browse files

Files changed (1) hide show

services/kb_creation.py +117 -40

services/kb_creation.py CHANGED Viewed

@@ -1,4 +1,3 @@
-#updated
 # services/kb_creation.py
 import os
@@ -96,12 +95,19 @@ PERMISSION_TERMS = [
 ERROR_TERMS = ["error", "issue", "fail", "failure", "not working", "cannot", "can't"]
 STEP_VERBS = ["navigate", "select", "scan", "verify", "confirm", "print", "move", "complete", "click", "open", "choose", "enter", "update", "save", "delete", "create", "attach", "assign"]
 MODULE_VOCAB = {
     "picking": ["pick", "picking", "pick release", "wave", "allocation"],
-    "receiving": ["receive", "receiving", "inbound", "asn", "appointment"],
-    "inventory": ["inventory", "adjustment", "cycle count", "count", "uom"],
     "putaway": ["putaway", "staging", "put away", "location assignment"],
     "shipping": ["shipping", "ship confirm", "outbound", "load", "trailer"],
     "replenishment": ["replenishment", "replenish"],
 }
@@ -116,6 +122,11 @@ def _infer_intent_tag(section_title: str) -> str:
         return "prereqs"
     if any(k in st for k in ["purpose", "overview", "introduction"]):
         return "purpose"
     return "neutral"
@@ -145,9 +156,14 @@ def _derive_module_tags(text: str, filename: str, section_title: str) -> List[st
     for mod, syns in MODULE_VOCAB.items():
         if any(s in tokens for s in syns):
             found.append(mod)
     if not found:
         if "inventory" in tokens or "adjust" in tokens or "uom" in tokens or "cycle" in tokens:
             found = ["inventory"]
     return list(sorted(set(found)))
 # ---------------------------- Ingestion ----------------------------
@@ -193,13 +209,13 @@ def ingest_documents(folder_path: str) -> None:
                     "chunk_index": c_idx,
                     "title": doc_title,
                     "collection": "SOP",
-                    "intent_tag": final_intent,  # str
-                    "topic_tags": ", ".join(topic_tags) if topic_tags else "",  # str (NOT list)
-                    "module_tags": ", ".join(module_tags) if module_tags else "",  # str (NOT list)
                 }
                 try:
                     collection.add(ids=[doc_id], embeddings=[embedding], documents=[chunk], metadatas=[meta])
-                except Exception as e1:
                     try:
                         collection.delete(ids=[doc_id])
                         collection.add(ids=[doc_id], embeddings=[embedding], documents=[chunk], metadatas=[meta])
@@ -319,11 +335,11 @@ def bm25_search(query: str, top_k: int = 50) -> List[Tuple[int, float]]:
 # ---------------------------- Semantic-only ----------------------------
 def search_knowledge_base(query: str, top_k: int = 10) -> dict:
     query_embedding = model.encode(query).tolist()
-    # Some Chroma client versions do not support "ids" in include.
     res = collection.query(
         query_embeddings=[query_embedding],
         n_results=top_k,
-        include=['documents', 'metadatas', 'distances']  # no 'ids' here
     )
     documents = (res.get("documents", [[]]) or [[]])[0]
     metadatas = (res.get("metadatas", [[]]) or [[]])[0]
@@ -348,7 +364,7 @@ def search_knowledge_base(query: str, top_k: int = 10) -> dict:
         "ids": ids,
     }
-# ---------------------------- Hybrid search (intent + module + action) ----------------------------
 ACTION_SYNONYMS = {
     "create": ["create", "creation", "add", "new", "generate"],
     "update": ["update", "modify", "change", "edit"],
@@ -367,7 +383,7 @@ def _detect_user_intent(query: str) -> str:
     q = (query or "").lower()
     if any(k in q for k in ERROR_INTENT_TERMS):
         return "errors"
-    if any(k in q for k in ["steps", "procedure", "how to", "navigate", "process", "do", "perform"]):
         return "steps"
     if any(k in q for k in ["pre-requisite", "prerequisites", "requirement", "requirements"]):
         return "prereqs"
@@ -382,7 +398,10 @@ def _extract_actions(query: str) -> List[str]:
     for act, syns in ACTION_SYNONYMS.items():
         if any(s in q for s in syns):
             found.append(act)
-    return found or []
 def _extract_modules_from_query(query: str) -> List[str]:
@@ -391,8 +410,13 @@ def _extract_modules_from_query(query: str) -> List[str]:
     for mod, syns in MODULE_VOCAB.items():
         if any(s in q for s in syns):
             found.append(mod)
-    if not found and ("inventory" in q or "adjust" in q):
-        found = ["inventory"]
     return list(sorted(set(found)))
@@ -410,6 +434,9 @@ def _intent_weight(meta: dict, user_intent: str) -> float:
     # Strongly prefer errors/escalation/permissions when the user intent is errors
     if user_intent == "errors" and (any(k in st for k in ["escalation", "permissions", "access", "known issues", "common issues"]) or ("permissions" in topic_list)):
         return 0.95
     return -0.2
@@ -420,8 +447,10 @@ def _module_weight(meta: Dict[str, Any], user_modules: List[str]) -> float:
     doc_modules = [m.strip() for m in raw.split(",") if m.strip()] if isinstance(raw, str) else (raw or [])
     overlap = len(set(user_modules) & set(doc_modules))
     if overlap == 0:
-        return -0.4  # demote different modules to avoid wrong SOP
-    return 0.6 * overlap
 def _meta_overlap(meta: Dict[str, Any], q_terms: List[str]) -> float:
@@ -438,22 +467,29 @@ def _meta_overlap(meta: Dict[str, Any], q_terms: List[str]) -> float:
     return inter / max(1, len(qset))
-def _action_weight(text: str, actions: List[str]) -> float:
-    if not actions:
         return 0.0
-    t = (text or "").lower()
     score = 0.0
-    for act in actions:
-        for syn in ACTION_SYNONYMS.get(act, [act]):
-            if syn in t:
-                score += 1.0
-    conflicts = {"create": ["delete"], "delete": ["create"], "update": ["delete"], "navigate": []}
-    for act in actions:
-        for bad in conflicts.get(act, []):
-            for syn in ACTION_SYNONYMS.get(bad, [bad]):
-                if syn in t:
-                    score -= 0.8
-    return score
 def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6, beta: float = 0.4) -> dict:
@@ -492,12 +528,18 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
     # Union of IDs from semantic and BM25
     union_ids = set(sem_ids) | set(bm25_id_to_norm.keys())
     gamma = 0.30  # meta overlap
-    delta = 0.45  # intent boost (stronger for errors)
     epsilon = 0.30  # action weight
-    zeta = 0.50  # module weight (new)
-    combined_records_ext: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float]] = []
     for cid in union_ids:
         if cid in sem_ids:
             pos = sem_ids.index(cid)
@@ -518,30 +560,65 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
         m_overlap = _meta_overlap(meta, q_terms)
         intent_boost = _intent_weight(meta, user_intent)
         act_wt = _action_weight(text, actions)
-        mod_wt = _module_weight(meta, user_modules)
-        final_score = alpha * sem_sim + beta * bm25_sim + gamma * m_overlap + delta * intent_boost + epsilon * act_wt + zeta * mod_wt
         combined_records_ext.append(
-            (cid, final_score, (sem_dist if sem_dist is not None else 999.0), text, meta, m_overlap, intent_boost, act_wt, mod_wt)
         )
     from collections import defaultdict
-    doc_groups: Dict[str, List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float]]] = defaultdict(list)
     for rec in combined_records_ext:
         meta = rec[4] or {}
         fn = meta.get("filename", "unknown")
         doc_groups[fn].append(rec)
-    def doc_prior(recs: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float]]) -> float:
         total_score = sum(r[1] for r in recs)
         total_overlap = sum(r[5] for r in recs)
         total_intent = sum(max(0.0, r[6]) for r in recs)
         total_action = sum(max(0.0, r[7]) for r in recs)
         total_module = sum(r[8] for r in recs)
         total_penalty = sum(min(0.0, r[6]) for r in recs) + sum(min(0.0, r[7]) for r in recs)
         esc_weight = 0.3 if any("escalation" in ((r[4] or {}).get("section", "")).lower() for r in recs) else 0.0
         perm_weight = 0.3 if any("permissions" in (((r[4] or {}).get("topic_tags") or [])) for r in recs) else 0.0
-        return total_score + 0.4 * total_overlap + 0.7 * total_intent + 0.5 * total_action + 0.6 * total_module + 0.3 * total_penalty + esc_weight + perm_weight
     best_doc, best_doc_prior = None, -1.0
     for fn, recs in doc_groups.items():
@@ -550,7 +627,7 @@ def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6
             best_doc_prior, best_doc = p, fn
     best_recs = sorted(doc_groups.get(best_doc, []), key=lambda x: x[1], reverse=True)
-    other_recs: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float]] = []
     for fn, recs in doc_groups.items():
         if fn == best_doc:
             continue

 # services/kb_creation.py
 import os
 ERROR_TERMS = ["error", "issue", "fail", "failure", "not working", "cannot", "can't"]
 STEP_VERBS = ["navigate", "select", "scan", "verify", "confirm", "print", "move", "complete", "click", "open", "choose", "enter", "update", "save", "delete", "create", "attach", "assign"]
+# Expanded module vocabulary: split Receiving vs Appointments
 MODULE_VOCAB = {
+    "receiving": [
+        "receive", "receiving", "inbound receiving", "inbound", "goods receipt", "grn",
+        "asn receiving", "unload", "check-in", "dock check-in"
+    ],
+    "appointments": [
+        "appointment", "appointments", "schedule", "scheduling", "slot", "dock door", "appointment creation", "appointment details"
+    ],
     "picking": ["pick", "picking", "pick release", "wave", "allocation"],
     "putaway": ["putaway", "staging", "put away", "location assignment"],
     "shipping": ["shipping", "ship confirm", "outbound", "load", "trailer"],
+    "inventory": ["inventory", "adjustment", "cycle count", "count", "uom"],
     "replenishment": ["replenishment", "replenish"],
 }
         return "prereqs"
     if any(k in st for k in ["purpose", "overview", "introduction"]):
         return "purpose"
+    # Heading hints (e.g., "Inbound Receiving", "Appointment Creation")
+    if any(k in st for k in ["inbound receiving", "receiving", "goods receipt", "grn"]):
+        return "steps"
+    if any(k in st for k in ["appointment", "appointments", "schedule", "scheduling"]):
+        return "steps"
     return "neutral"
     for mod, syns in MODULE_VOCAB.items():
         if any(s in tokens for s in syns):
             found.append(mod)
+    # defaulting rule if none found
     if not found:
         if "inventory" in tokens or "adjust" in tokens or "uom" in tokens or "cycle" in tokens:
             found = ["inventory"]
+        elif "receive" in tokens or "inbound" in tokens or "goods receipt" in tokens or "grn" in tokens:
+            found = ["receiving"]
+        elif "appointment" in tokens or "schedule" in tokens or "dock" in tokens:
+            found = ["appointments"]
     return list(sorted(set(found)))
 # ---------------------------- Ingestion ----------------------------
                     "chunk_index": c_idx,
                     "title": doc_title,
                     "collection": "SOP",
+                    "intent_tag": final_intent,
+                    "topic_tags": ", ".join(topic_tags) if topic_tags else "",
+                    "module_tags": ", ".join(module_tags) if module_tags else "",
                 }
                 try:
                     collection.add(ids=[doc_id], embeddings=[embedding], documents=[chunk], metadatas=[meta])
+                except Exception:
                     try:
                         collection.delete(ids=[doc_id])
                         collection.add(ids=[doc_id], embeddings=[embedding], documents=[chunk], metadatas=[meta])
 # ---------------------------- Semantic-only ----------------------------
 def search_knowledge_base(query: str, top_k: int = 10) -> dict:
     query_embedding = model.encode(query).tolist()
+    # Request supported fields only, synthesize ids
     res = collection.query(
         query_embeddings=[query_embedding],
         n_results=top_k,
+        include=['documents', 'metadatas', 'distances']
     )
     documents = (res.get("documents", [[]]) or [[]])[0]
     metadatas = (res.get("metadatas", [[]]) or [[]])[0]
         "ids": ids,
     }
+# ---------------------------- Hybrid search (intent + module + action + phrases) ----------------------------
 ACTION_SYNONYMS = {
     "create": ["create", "creation", "add", "new", "generate"],
     "update": ["update", "modify", "change", "edit"],
     q = (query or "").lower()
     if any(k in q for k in ERROR_INTENT_TERMS):
         return "errors"
+    if any(k in q for k in ["steps", "procedure", "how to", "navigate", "process", "do", "perform", "receiving"]):
         return "steps"
     if any(k in q for k in ["pre-requisite", "prerequisites", "requirement", "requirements"]):
         return "prereqs"
     for act, syns in ACTION_SYNONYMS.items():
         if any(s in q for s in syns):
             found.append(act)
+    # receiving verbs hint
+    if any(w in q for w in ["receive", "receiving", "grn", "goods receipt"]):
+        found.append("navigate")  # safe generic
+    return list(sorted(set(found))) or []
 def _extract_modules_from_query(query: str) -> List[str]:
     for mod, syns in MODULE_VOCAB.items():
         if any(s in q for s in syns):
             found.append(mod)
+    # Default if none found
+    if not found:
+        if "receive" in q or "receiving" in q or "grn" in q or "goods receipt" in q or "inbound" in q:
+            found = ["receiving"]
+    # Prefer 'receiving' over 'appointments' when both present (generic rule)
+    if "receiving" in found and "appointments" in found:
+        return ["receiving"]
     return list(sorted(set(found)))
     # Strongly prefer errors/escalation/permissions when the user intent is errors
     if user_intent == "errors" and (any(k in st for k in ["escalation", "permissions", "access", "known issues", "common issues"]) or ("permissions" in topic_list)):
         return 0.95
+    # Prefer receiving headings for receiving queries
+    if user_intent == "steps" and any(k in st for k in ["inbound receiving", "receiving", "goods receipt", "grn"]):
+        return 0.75
     return -0.2
     doc_modules = [m.strip() for m in raw.split(",") if m.strip()] if isinstance(raw, str) else (raw or [])
     overlap = len(set(user_modules) & set(doc_modules))
     if overlap == 0:
+        # Stronger generic penalty for mismatched modules
+        return -0.8
+    # Slight boost per overlapping module
+    return 0.7 * overlap
 def _meta_overlap(meta: Dict[str, Any], q_terms: List[str]) -> float:
     return inter / max(1, len(qset))
+def _make_ngrams(tokens: List[str], n: int) -> List[str]:
+    return [" ".join(tokens[i:i+n]) for i in range(len(tokens) - n + 1)]
+def _phrase_boost_score(text: str, q_terms: List[str]) -> float:
+    """
+    Phrase-level scoring: boosts exact bigram/trigram matches.
+    Generic, no hardcoding to doc names.
+    """
+    if not text or not q_terms:
         return 0.0
+    low = (text or "").lower()
+    bigrams = _make_ngrams(q_terms, 2)
+    trigrams = _make_ngrams(q_terms, 3)
     score = 0.0
+    for bg in bigrams:
+        if bg and bg in low:
+            score += 0.35
+    for tg in trigrams:
+        if tg and tg in low:
+            score += 0.60
+    # cap to avoid over-weighting
+    return min(score, 1.5)
 def hybrid_search_knowledge_base(query: str, top_k: int = 10, alpha: float = 0.6, beta: float = 0.4) -> dict:
     # Union of IDs from semantic and BM25
     union_ids = set(sem_ids) | set(bm25_id_to_norm.keys())
+    # Optional light gating: if we know user's primary module, keep union but strengthen penalties later.
+    primary_user_modules = user_modules if user_modules else []
+    # Weights
     gamma = 0.30  # meta overlap
+    delta = 0.50  # intent boost (stronger for steps/errors now)
     epsilon = 0.30  # action weight
+    zeta = 0.65  # module weight (stronger to avoid wrong SOP)
+    eta = 0.45   # phrase-level boost
+    theta = 0.40 # heading alignment bonus
+    combined_records_ext: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float, float, float]] = []
     for cid in union_ids:
         if cid in sem_ids:
             pos = sem_ids.index(cid)
         m_overlap = _meta_overlap(meta, q_terms)
         intent_boost = _intent_weight(meta, user_intent)
         act_wt = _action_weight(text, actions)
+        mod_wt = _module_weight(meta, primary_user_modules)
+        phrase_wt = _phrase_boost_score(text, q_terms)
+        # Heading alignment: bonus if section/title contains key query term roots
+        sec_low = ((meta or {}).get("section", "") or "").lower()
+        title_low = ((meta or {}).get("title", "") or "").lower()
+        heading_bonus = 0.0
+        if any(root in sec_low for root in ["receiving", "inbound receiving", "goods receipt", "grn"]) and any(w in norm_query for w in ["receive", "receiving", "inbound", "grn", "goods receipt"]):
+            heading_bonus += 0.40
+        if any(root in title_low for root in ["receiving", "inbound receiving", "goods receipt", "grn"]) and any(w in norm_query for w in ["receive", "receiving", "inbound", "grn", "goods receipt"]):
+            heading_bonus += 0.40
+        if any(root in sec_low for root in ["appointment", "appointments", "schedule"]) and "receiv" in norm_query:
+            # mild demotion for appointment sections when user asks receiving
+            heading_bonus -= 0.35
+        final_score = (
+            alpha * sem_sim
+            + beta * bm25_sim
+            + gamma * m_overlap
+            + delta * intent_boost
+            + epsilon * act_wt
+            + zeta * mod_wt
+            + eta * phrase_wt
+            + theta * heading_bonus
+        )
         combined_records_ext.append(
+            (cid, final_score, (sem_dist if sem_dist is not None else 999.0), text, meta, m_overlap, intent_boost, act_wt, mod_wt, phrase_wt, heading_bonus)
         )
     from collections import defaultdict
+    doc_groups: Dict[str, List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float, float, float]]] = defaultdict(list)
     for rec in combined_records_ext:
         meta = rec[4] or {}
         fn = meta.get("filename", "unknown")
         doc_groups[fn].append(rec)
+    def doc_prior(recs: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float, float, float]]) -> float:
         total_score = sum(r[1] for r in recs)
         total_overlap = sum(r[5] for r in recs)
         total_intent = sum(max(0.0, r[6]) for r in recs)
         total_action = sum(max(0.0, r[7]) for r in recs)
         total_module = sum(r[8] for r in recs)
+        total_phrase = sum(r[9] for r in recs)
+        total_heading = sum(r[10] for r in recs)
         total_penalty = sum(min(0.0, r[6]) for r in recs) + sum(min(0.0, r[7]) for r in recs)
         esc_weight = 0.3 if any("escalation" in ((r[4] or {}).get("section", "")).lower() for r in recs) else 0.0
         perm_weight = 0.3 if any("permissions" in (((r[4] or {}).get("topic_tags") or [])) for r in recs) else 0.0
+        return (
+            total_score
+            + 0.4 * total_overlap
+            + 0.7 * total_intent
+            + 0.5 * total_action
+            + 0.8 * total_module   # stronger module prior
+            + 0.6 * total_phrase   # phrase prior
+            + 0.6 * total_heading  # heading prior
+            + 0.3 * total_penalty
+            + esc_weight + perm_weight
+        )
     best_doc, best_doc_prior = None, -1.0
     for fn, recs in doc_groups.items():
             best_doc_prior, best_doc = p, fn
     best_recs = sorted(doc_groups.get(best_doc, []), key=lambda x: x[1], reverse=True)
+    other_recs: List[Tuple[str, float, float, str, Dict[str, Any], float, float, float, float, float, float]] = []
     for fn, recs in doc_groups.items():
         if fn == best_doc:
             continue